diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,195494 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999820939352159, + "eval_steps": 500, + "global_step": 27923, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.581212956828478e-05, + "grad_norm": 266.45928955078125, + "learning_rate": 2.386634844868735e-07, + "loss": 8.763, + "step": 1 + }, + { + "epoch": 7.162425913656956e-05, + "grad_norm": 253.94284057617188, + "learning_rate": 4.77326968973747e-07, + "loss": 9.2062, + "step": 2 + }, + { + "epoch": 0.00010743638870485433, + "grad_norm": 354.7848815917969, + "learning_rate": 7.159904534606206e-07, + "loss": 6.9487, + "step": 3 + }, + { + "epoch": 0.00014324851827313912, + "grad_norm": 144.5425262451172, + "learning_rate": 9.54653937947494e-07, + "loss": 6.5165, + "step": 4 + }, + { + "epoch": 0.00017906064784142388, + "grad_norm": 220.5895538330078, + "learning_rate": 1.1933174224343676e-06, + "loss": 5.5709, + "step": 5 + }, + { + "epoch": 0.00021487277740970867, + "grad_norm": 167.82615661621094, + "learning_rate": 1.4319809069212413e-06, + "loss": 8.7909, + "step": 6 + }, + { + "epoch": 0.00025068490697799346, + "grad_norm": 395.8609619140625, + "learning_rate": 1.6706443914081146e-06, + "loss": 7.7716, + "step": 7 + }, + { + "epoch": 0.00028649703654627824, + "grad_norm": 247.2559051513672, + "learning_rate": 1.909307875894988e-06, + "loss": 5.9296, + "step": 8 + }, + { + "epoch": 0.000322309166114563, + "grad_norm": 175.59742736816406, + "learning_rate": 2.1479713603818614e-06, + "loss": 7.2554, + "step": 9 + }, + { + "epoch": 0.00035812129568284776, + "grad_norm": 245.14215087890625, + "learning_rate": 2.386634844868735e-06, + "loss": 6.6004, + "step": 10 + }, + { + "epoch": 0.00039393342525113255, + "grad_norm": 187.2186737060547, + "learning_rate": 2.625298329355609e-06, + "loss": 5.9285, + "step": 11 + }, + { + "epoch": 0.00042974555481941734, + "grad_norm": 132.06202697753906, + "learning_rate": 2.8639618138424826e-06, + "loss": 5.628, + "step": 12 + }, + { + "epoch": 0.0004655576843877021, + "grad_norm": 234.14048767089844, + "learning_rate": 3.1026252983293554e-06, + "loss": 6.124, + "step": 13 + }, + { + "epoch": 0.0005013698139559869, + "grad_norm": 138.61692810058594, + "learning_rate": 3.341288782816229e-06, + "loss": 5.9886, + "step": 14 + }, + { + "epoch": 0.0005371819435242717, + "grad_norm": 56.17988204956055, + "learning_rate": 3.579952267303103e-06, + "loss": 4.8213, + "step": 15 + }, + { + "epoch": 0.0005729940730925565, + "grad_norm": 56.36459732055664, + "learning_rate": 3.818615751789976e-06, + "loss": 4.7952, + "step": 16 + }, + { + "epoch": 0.0006088062026608413, + "grad_norm": 74.7711181640625, + "learning_rate": 4.05727923627685e-06, + "loss": 5.8223, + "step": 17 + }, + { + "epoch": 0.000644618332229126, + "grad_norm": 58.69936752319336, + "learning_rate": 4.295942720763723e-06, + "loss": 4.8434, + "step": 18 + }, + { + "epoch": 0.0006804304617974107, + "grad_norm": 43.7608757019043, + "learning_rate": 4.5346062052505965e-06, + "loss": 5.6254, + "step": 19 + }, + { + "epoch": 0.0007162425913656955, + "grad_norm": 44.504364013671875, + "learning_rate": 4.77326968973747e-06, + "loss": 4.2283, + "step": 20 + }, + { + "epoch": 0.0007520547209339803, + "grad_norm": 44.918399810791016, + "learning_rate": 5.011933174224344e-06, + "loss": 4.5779, + "step": 21 + }, + { + "epoch": 0.0007878668505022651, + "grad_norm": 62.90056228637695, + "learning_rate": 5.250596658711218e-06, + "loss": 5.0793, + "step": 22 + }, + { + "epoch": 0.0008236789800705499, + "grad_norm": 48.883602142333984, + "learning_rate": 5.489260143198091e-06, + "loss": 4.4507, + "step": 23 + }, + { + "epoch": 0.0008594911096388347, + "grad_norm": 52.969486236572266, + "learning_rate": 5.727923627684965e-06, + "loss": 3.9912, + "step": 24 + }, + { + "epoch": 0.0008953032392071195, + "grad_norm": 71.30107116699219, + "learning_rate": 5.966587112171838e-06, + "loss": 3.867, + "step": 25 + }, + { + "epoch": 0.0009311153687754042, + "grad_norm": 42.344852447509766, + "learning_rate": 6.205250596658711e-06, + "loss": 3.5093, + "step": 26 + }, + { + "epoch": 0.000966927498343689, + "grad_norm": 46.08336639404297, + "learning_rate": 6.4439140811455855e-06, + "loss": 4.1226, + "step": 27 + }, + { + "epoch": 0.0010027396279119738, + "grad_norm": 22.715564727783203, + "learning_rate": 6.682577565632458e-06, + "loss": 2.9302, + "step": 28 + }, + { + "epoch": 0.0010385517574802586, + "grad_norm": 30.1304874420166, + "learning_rate": 6.921241050119331e-06, + "loss": 3.8326, + "step": 29 + }, + { + "epoch": 0.0010743638870485434, + "grad_norm": 26.537921905517578, + "learning_rate": 7.159904534606206e-06, + "loss": 3.4354, + "step": 30 + }, + { + "epoch": 0.0011101760166168282, + "grad_norm": 30.003963470458984, + "learning_rate": 7.398568019093079e-06, + "loss": 3.9301, + "step": 31 + }, + { + "epoch": 0.001145988146185113, + "grad_norm": 23.912275314331055, + "learning_rate": 7.637231503579952e-06, + "loss": 3.569, + "step": 32 + }, + { + "epoch": 0.0011818002757533978, + "grad_norm": 28.667634963989258, + "learning_rate": 7.875894988066825e-06, + "loss": 3.3183, + "step": 33 + }, + { + "epoch": 0.0012176124053216825, + "grad_norm": 24.40900230407715, + "learning_rate": 8.1145584725537e-06, + "loss": 3.6607, + "step": 34 + }, + { + "epoch": 0.0012534245348899673, + "grad_norm": 19.25312614440918, + "learning_rate": 8.353221957040573e-06, + "loss": 3.6169, + "step": 35 + }, + { + "epoch": 0.001289236664458252, + "grad_norm": 19.610139846801758, + "learning_rate": 8.591885441527446e-06, + "loss": 3.3279, + "step": 36 + }, + { + "epoch": 0.0013250487940265367, + "grad_norm": 18.048316955566406, + "learning_rate": 8.83054892601432e-06, + "loss": 3.3814, + "step": 37 + }, + { + "epoch": 0.0013608609235948215, + "grad_norm": 20.420162200927734, + "learning_rate": 9.069212410501193e-06, + "loss": 2.9829, + "step": 38 + }, + { + "epoch": 0.0013966730531631063, + "grad_norm": 15.9304838180542, + "learning_rate": 9.307875894988068e-06, + "loss": 2.898, + "step": 39 + }, + { + "epoch": 0.001432485182731391, + "grad_norm": 23.585050582885742, + "learning_rate": 9.54653937947494e-06, + "loss": 3.5377, + "step": 40 + }, + { + "epoch": 0.0014682973122996758, + "grad_norm": 20.667461395263672, + "learning_rate": 9.785202863961815e-06, + "loss": 3.6565, + "step": 41 + }, + { + "epoch": 0.0015041094418679606, + "grad_norm": 19.172515869140625, + "learning_rate": 1.0023866348448688e-05, + "loss": 3.3047, + "step": 42 + }, + { + "epoch": 0.0015399215714362454, + "grad_norm": 17.925945281982422, + "learning_rate": 1.026252983293556e-05, + "loss": 3.1411, + "step": 43 + }, + { + "epoch": 0.0015757337010045302, + "grad_norm": 12.201303482055664, + "learning_rate": 1.0501193317422435e-05, + "loss": 2.6465, + "step": 44 + }, + { + "epoch": 0.001611545830572815, + "grad_norm": 18.059865951538086, + "learning_rate": 1.0739856801909308e-05, + "loss": 3.6168, + "step": 45 + }, + { + "epoch": 0.0016473579601410998, + "grad_norm": 21.842723846435547, + "learning_rate": 1.0978520286396183e-05, + "loss": 3.1488, + "step": 46 + }, + { + "epoch": 0.0016831700897093846, + "grad_norm": 17.55625343322754, + "learning_rate": 1.1217183770883056e-05, + "loss": 3.0756, + "step": 47 + }, + { + "epoch": 0.0017189822192776693, + "grad_norm": 12.256664276123047, + "learning_rate": 1.145584725536993e-05, + "loss": 2.7213, + "step": 48 + }, + { + "epoch": 0.0017547943488459541, + "grad_norm": 11.992024421691895, + "learning_rate": 1.1694510739856803e-05, + "loss": 2.829, + "step": 49 + }, + { + "epoch": 0.001790606478414239, + "grad_norm": 11.272849082946777, + "learning_rate": 1.1933174224343676e-05, + "loss": 2.8738, + "step": 50 + }, + { + "epoch": 0.0018264186079825237, + "grad_norm": 9.471379280090332, + "learning_rate": 1.2171837708830549e-05, + "loss": 2.664, + "step": 51 + }, + { + "epoch": 0.0018622307375508085, + "grad_norm": 8.199422836303711, + "learning_rate": 1.2410501193317422e-05, + "loss": 2.3583, + "step": 52 + }, + { + "epoch": 0.0018980428671190933, + "grad_norm": 7.3866729736328125, + "learning_rate": 1.2649164677804295e-05, + "loss": 2.4145, + "step": 53 + }, + { + "epoch": 0.001933854996687378, + "grad_norm": 12.23798656463623, + "learning_rate": 1.2887828162291171e-05, + "loss": 2.7636, + "step": 54 + }, + { + "epoch": 0.0019696671262556626, + "grad_norm": 14.02424430847168, + "learning_rate": 1.3126491646778044e-05, + "loss": 3.3066, + "step": 55 + }, + { + "epoch": 0.0020054792558239476, + "grad_norm": 24.34091567993164, + "learning_rate": 1.3365155131264917e-05, + "loss": 3.8594, + "step": 56 + }, + { + "epoch": 0.002041291385392232, + "grad_norm": 9.424992561340332, + "learning_rate": 1.360381861575179e-05, + "loss": 2.7613, + "step": 57 + }, + { + "epoch": 0.0020771035149605172, + "grad_norm": 4.725447177886963, + "learning_rate": 1.3842482100238662e-05, + "loss": 1.973, + "step": 58 + }, + { + "epoch": 0.002112915644528802, + "grad_norm": 6.388454437255859, + "learning_rate": 1.4081145584725539e-05, + "loss": 2.3938, + "step": 59 + }, + { + "epoch": 0.002148727774097087, + "grad_norm": 11.523760795593262, + "learning_rate": 1.4319809069212412e-05, + "loss": 2.5413, + "step": 60 + }, + { + "epoch": 0.0021845399036653714, + "grad_norm": 12.98708724975586, + "learning_rate": 1.4558472553699284e-05, + "loss": 3.0063, + "step": 61 + }, + { + "epoch": 0.0022203520332336564, + "grad_norm": 8.39086627960205, + "learning_rate": 1.4797136038186157e-05, + "loss": 2.452, + "step": 62 + }, + { + "epoch": 0.002256164162801941, + "grad_norm": 5.200684070587158, + "learning_rate": 1.5035799522673034e-05, + "loss": 2.0954, + "step": 63 + }, + { + "epoch": 0.002291976292370226, + "grad_norm": 6.227000713348389, + "learning_rate": 1.5274463007159905e-05, + "loss": 2.3347, + "step": 64 + }, + { + "epoch": 0.0023277884219385105, + "grad_norm": 4.207845211029053, + "learning_rate": 1.551312649164678e-05, + "loss": 2.0379, + "step": 65 + }, + { + "epoch": 0.0023636005515067955, + "grad_norm": 7.577055931091309, + "learning_rate": 1.575178997613365e-05, + "loss": 2.3339, + "step": 66 + }, + { + "epoch": 0.00239941268107508, + "grad_norm": 9.227364540100098, + "learning_rate": 1.5990453460620525e-05, + "loss": 2.6596, + "step": 67 + }, + { + "epoch": 0.002435224810643365, + "grad_norm": 7.107662677764893, + "learning_rate": 1.62291169451074e-05, + "loss": 2.4291, + "step": 68 + }, + { + "epoch": 0.0024710369402116497, + "grad_norm": 8.73476791381836, + "learning_rate": 1.6467780429594274e-05, + "loss": 2.6181, + "step": 69 + }, + { + "epoch": 0.0025068490697799347, + "grad_norm": 4.854764461517334, + "learning_rate": 1.6706443914081145e-05, + "loss": 2.0247, + "step": 70 + }, + { + "epoch": 0.0025426611993482192, + "grad_norm": 3.9345383644104004, + "learning_rate": 1.694510739856802e-05, + "loss": 2.1223, + "step": 71 + }, + { + "epoch": 0.002578473328916504, + "grad_norm": 5.067500114440918, + "learning_rate": 1.718377088305489e-05, + "loss": 2.0448, + "step": 72 + }, + { + "epoch": 0.002614285458484789, + "grad_norm": 13.264959335327148, + "learning_rate": 1.742243436754177e-05, + "loss": 2.7075, + "step": 73 + }, + { + "epoch": 0.0026500975880530734, + "grad_norm": 5.611446380615234, + "learning_rate": 1.766109785202864e-05, + "loss": 2.3618, + "step": 74 + }, + { + "epoch": 0.0026859097176213584, + "grad_norm": 5.168875694274902, + "learning_rate": 1.7899761336515515e-05, + "loss": 2.3011, + "step": 75 + }, + { + "epoch": 0.002721721847189643, + "grad_norm": 6.040040969848633, + "learning_rate": 1.8138424821002386e-05, + "loss": 2.339, + "step": 76 + }, + { + "epoch": 0.002757533976757928, + "grad_norm": 9.418103218078613, + "learning_rate": 1.837708830548926e-05, + "loss": 2.3076, + "step": 77 + }, + { + "epoch": 0.0027933461063262125, + "grad_norm": 4.1048383712768555, + "learning_rate": 1.8615751789976135e-05, + "loss": 1.9941, + "step": 78 + }, + { + "epoch": 0.0028291582358944975, + "grad_norm": 3.6161460876464844, + "learning_rate": 1.885441527446301e-05, + "loss": 2.0778, + "step": 79 + }, + { + "epoch": 0.002864970365462782, + "grad_norm": 5.113503932952881, + "learning_rate": 1.909307875894988e-05, + "loss": 2.0094, + "step": 80 + }, + { + "epoch": 0.002900782495031067, + "grad_norm": 7.152892112731934, + "learning_rate": 1.9331742243436756e-05, + "loss": 2.3829, + "step": 81 + }, + { + "epoch": 0.0029365946245993517, + "grad_norm": 6.205478668212891, + "learning_rate": 1.957040572792363e-05, + "loss": 2.1036, + "step": 82 + }, + { + "epoch": 0.0029724067541676367, + "grad_norm": 14.791426658630371, + "learning_rate": 1.98090692124105e-05, + "loss": 2.2735, + "step": 83 + }, + { + "epoch": 0.0030082188837359213, + "grad_norm": 6.165616035461426, + "learning_rate": 2.0047732696897376e-05, + "loss": 2.251, + "step": 84 + }, + { + "epoch": 0.0030440310133042063, + "grad_norm": 11.363359451293945, + "learning_rate": 2.0286396181384247e-05, + "loss": 2.302, + "step": 85 + }, + { + "epoch": 0.003079843142872491, + "grad_norm": 5.888169288635254, + "learning_rate": 2.052505966587112e-05, + "loss": 2.0928, + "step": 86 + }, + { + "epoch": 0.003115655272440776, + "grad_norm": 10.855888366699219, + "learning_rate": 2.0763723150357996e-05, + "loss": 2.7463, + "step": 87 + }, + { + "epoch": 0.0031514674020090604, + "grad_norm": 8.183070182800293, + "learning_rate": 2.100238663484487e-05, + "loss": 2.6879, + "step": 88 + }, + { + "epoch": 0.0031872795315773454, + "grad_norm": 4.4423723220825195, + "learning_rate": 2.1241050119331742e-05, + "loss": 1.7398, + "step": 89 + }, + { + "epoch": 0.00322309166114563, + "grad_norm": 4.584626197814941, + "learning_rate": 2.1479713603818617e-05, + "loss": 1.895, + "step": 90 + }, + { + "epoch": 0.003258903790713915, + "grad_norm": 5.959711074829102, + "learning_rate": 2.171837708830549e-05, + "loss": 2.3659, + "step": 91 + }, + { + "epoch": 0.0032947159202821995, + "grad_norm": 3.2629969120025635, + "learning_rate": 2.1957040572792366e-05, + "loss": 1.9619, + "step": 92 + }, + { + "epoch": 0.0033305280498504846, + "grad_norm": 4.400030612945557, + "learning_rate": 2.2195704057279237e-05, + "loss": 1.9871, + "step": 93 + }, + { + "epoch": 0.003366340179418769, + "grad_norm": 5.728138446807861, + "learning_rate": 2.243436754176611e-05, + "loss": 1.9259, + "step": 94 + }, + { + "epoch": 0.003402152308987054, + "grad_norm": 4.519594192504883, + "learning_rate": 2.2673031026252983e-05, + "loss": 1.8022, + "step": 95 + }, + { + "epoch": 0.0034379644385553387, + "grad_norm": 2.7000021934509277, + "learning_rate": 2.291169451073986e-05, + "loss": 1.6802, + "step": 96 + }, + { + "epoch": 0.0034737765681236233, + "grad_norm": 10.363709449768066, + "learning_rate": 2.3150357995226732e-05, + "loss": 2.2133, + "step": 97 + }, + { + "epoch": 0.0035095886976919083, + "grad_norm": 5.193636417388916, + "learning_rate": 2.3389021479713606e-05, + "loss": 1.9587, + "step": 98 + }, + { + "epoch": 0.003545400827260193, + "grad_norm": 2.796421527862549, + "learning_rate": 2.3627684964200477e-05, + "loss": 1.7176, + "step": 99 + }, + { + "epoch": 0.003581212956828478, + "grad_norm": 3.223402261734009, + "learning_rate": 2.3866348448687352e-05, + "loss": 2.0404, + "step": 100 + }, + { + "epoch": 0.0036170250863967624, + "grad_norm": 4.542501449584961, + "learning_rate": 2.4105011933174227e-05, + "loss": 2.203, + "step": 101 + }, + { + "epoch": 0.0036528372159650474, + "grad_norm": 3.740020990371704, + "learning_rate": 2.4343675417661098e-05, + "loss": 2.2128, + "step": 102 + }, + { + "epoch": 0.003688649345533332, + "grad_norm": 2.7657647132873535, + "learning_rate": 2.4582338902147972e-05, + "loss": 1.9226, + "step": 103 + }, + { + "epoch": 0.003724461475101617, + "grad_norm": 5.928515434265137, + "learning_rate": 2.4821002386634844e-05, + "loss": 2.2464, + "step": 104 + }, + { + "epoch": 0.0037602736046699016, + "grad_norm": 3.2757768630981445, + "learning_rate": 2.5059665871121718e-05, + "loss": 1.8416, + "step": 105 + }, + { + "epoch": 0.0037960857342381866, + "grad_norm": 3.6708478927612305, + "learning_rate": 2.529832935560859e-05, + "loss": 2.0104, + "step": 106 + }, + { + "epoch": 0.003831897863806471, + "grad_norm": 2.5940299034118652, + "learning_rate": 2.5536992840095464e-05, + "loss": 1.5897, + "step": 107 + }, + { + "epoch": 0.003867709993374756, + "grad_norm": 2.8654122352600098, + "learning_rate": 2.5775656324582342e-05, + "loss": 1.9403, + "step": 108 + }, + { + "epoch": 0.0039035221229430407, + "grad_norm": 2.952970266342163, + "learning_rate": 2.6014319809069216e-05, + "loss": 1.885, + "step": 109 + }, + { + "epoch": 0.003939334252511325, + "grad_norm": 4.362255096435547, + "learning_rate": 2.6252983293556088e-05, + "loss": 2.0423, + "step": 110 + }, + { + "epoch": 0.003975146382079611, + "grad_norm": 4.102433204650879, + "learning_rate": 2.6491646778042962e-05, + "loss": 1.9887, + "step": 111 + }, + { + "epoch": 0.004010958511647895, + "grad_norm": 2.9979641437530518, + "learning_rate": 2.6730310262529833e-05, + "loss": 1.8572, + "step": 112 + }, + { + "epoch": 0.00404677064121618, + "grad_norm": 3.637627124786377, + "learning_rate": 2.6968973747016708e-05, + "loss": 1.9665, + "step": 113 + }, + { + "epoch": 0.004082582770784464, + "grad_norm": 3.132642984390259, + "learning_rate": 2.720763723150358e-05, + "loss": 1.7278, + "step": 114 + }, + { + "epoch": 0.00411839490035275, + "grad_norm": 5.542552471160889, + "learning_rate": 2.7446300715990454e-05, + "loss": 1.8081, + "step": 115 + }, + { + "epoch": 0.0041542070299210344, + "grad_norm": 2.0010836124420166, + "learning_rate": 2.7684964200477325e-05, + "loss": 1.451, + "step": 116 + }, + { + "epoch": 0.004190019159489319, + "grad_norm": 3.096308469772339, + "learning_rate": 2.7923627684964203e-05, + "loss": 1.813, + "step": 117 + }, + { + "epoch": 0.004225831289057604, + "grad_norm": 2.9095513820648193, + "learning_rate": 2.8162291169451077e-05, + "loss": 1.6167, + "step": 118 + }, + { + "epoch": 0.004261643418625889, + "grad_norm": 2.3704395294189453, + "learning_rate": 2.840095465393795e-05, + "loss": 1.6941, + "step": 119 + }, + { + "epoch": 0.004297455548194174, + "grad_norm": 3.150348663330078, + "learning_rate": 2.8639618138424823e-05, + "loss": 1.9385, + "step": 120 + }, + { + "epoch": 0.004333267677762458, + "grad_norm": 3.3327701091766357, + "learning_rate": 2.8878281622911694e-05, + "loss": 1.7891, + "step": 121 + }, + { + "epoch": 0.004369079807330743, + "grad_norm": 4.004137992858887, + "learning_rate": 2.911694510739857e-05, + "loss": 1.703, + "step": 122 + }, + { + "epoch": 0.004404891936899027, + "grad_norm": 4.375809669494629, + "learning_rate": 2.935560859188544e-05, + "loss": 1.7924, + "step": 123 + }, + { + "epoch": 0.004440704066467313, + "grad_norm": 2.075587511062622, + "learning_rate": 2.9594272076372315e-05, + "loss": 2.058, + "step": 124 + }, + { + "epoch": 0.004476516196035597, + "grad_norm": 3.968283176422119, + "learning_rate": 2.983293556085919e-05, + "loss": 2.0578, + "step": 125 + }, + { + "epoch": 0.004512328325603882, + "grad_norm": 1.9869202375411987, + "learning_rate": 3.0071599045346067e-05, + "loss": 1.692, + "step": 126 + }, + { + "epoch": 0.0045481404551721664, + "grad_norm": 2.9801366329193115, + "learning_rate": 3.031026252983294e-05, + "loss": 1.6555, + "step": 127 + }, + { + "epoch": 0.004583952584740452, + "grad_norm": 3.602670907974243, + "learning_rate": 3.054892601431981e-05, + "loss": 1.7811, + "step": 128 + }, + { + "epoch": 0.0046197647143087365, + "grad_norm": 2.0646204948425293, + "learning_rate": 3.0787589498806684e-05, + "loss": 1.6711, + "step": 129 + }, + { + "epoch": 0.004655576843877021, + "grad_norm": 3.786837577819824, + "learning_rate": 3.102625298329356e-05, + "loss": 2.1889, + "step": 130 + }, + { + "epoch": 0.004691388973445306, + "grad_norm": 2.5075771808624268, + "learning_rate": 3.126491646778043e-05, + "loss": 1.9497, + "step": 131 + }, + { + "epoch": 0.004727201103013591, + "grad_norm": 2.487145185470581, + "learning_rate": 3.15035799522673e-05, + "loss": 1.7222, + "step": 132 + }, + { + "epoch": 0.004763013232581876, + "grad_norm": 4.664052486419678, + "learning_rate": 3.1742243436754176e-05, + "loss": 1.8125, + "step": 133 + }, + { + "epoch": 0.00479882536215016, + "grad_norm": 2.533400774002075, + "learning_rate": 3.198090692124105e-05, + "loss": 1.8583, + "step": 134 + }, + { + "epoch": 0.004834637491718445, + "grad_norm": 2.8833436965942383, + "learning_rate": 3.2219570405727925e-05, + "loss": 1.8781, + "step": 135 + }, + { + "epoch": 0.00487044962128673, + "grad_norm": 2.816530227661133, + "learning_rate": 3.24582338902148e-05, + "loss": 1.7998, + "step": 136 + }, + { + "epoch": 0.004906261750855015, + "grad_norm": 3.2313337326049805, + "learning_rate": 3.2696897374701674e-05, + "loss": 1.7115, + "step": 137 + }, + { + "epoch": 0.004942073880423299, + "grad_norm": 2.415179491043091, + "learning_rate": 3.293556085918855e-05, + "loss": 1.803, + "step": 138 + }, + { + "epoch": 0.004977886009991584, + "grad_norm": 2.6969661712646484, + "learning_rate": 3.3174224343675416e-05, + "loss": 1.6731, + "step": 139 + }, + { + "epoch": 0.005013698139559869, + "grad_norm": 3.4127819538116455, + "learning_rate": 3.341288782816229e-05, + "loss": 1.9913, + "step": 140 + }, + { + "epoch": 0.005049510269128154, + "grad_norm": 2.1067464351654053, + "learning_rate": 3.3651551312649165e-05, + "loss": 1.5501, + "step": 141 + }, + { + "epoch": 0.0050853223986964385, + "grad_norm": 2.5211939811706543, + "learning_rate": 3.389021479713604e-05, + "loss": 1.7098, + "step": 142 + }, + { + "epoch": 0.005121134528264723, + "grad_norm": 2.674433946609497, + "learning_rate": 3.4128878281622915e-05, + "loss": 1.9032, + "step": 143 + }, + { + "epoch": 0.005156946657833008, + "grad_norm": 2.814911127090454, + "learning_rate": 3.436754176610978e-05, + "loss": 1.8304, + "step": 144 + }, + { + "epoch": 0.005192758787401293, + "grad_norm": 2.6798105239868164, + "learning_rate": 3.4606205250596664e-05, + "loss": 1.9212, + "step": 145 + }, + { + "epoch": 0.005228570916969578, + "grad_norm": 2.255305528640747, + "learning_rate": 3.484486873508354e-05, + "loss": 1.7516, + "step": 146 + }, + { + "epoch": 0.005264383046537862, + "grad_norm": 2.5120773315429688, + "learning_rate": 3.5083532219570406e-05, + "loss": 1.857, + "step": 147 + }, + { + "epoch": 0.005300195176106147, + "grad_norm": 3.6451504230499268, + "learning_rate": 3.532219570405728e-05, + "loss": 1.9054, + "step": 148 + }, + { + "epoch": 0.005336007305674432, + "grad_norm": 2.6806640625, + "learning_rate": 3.5560859188544155e-05, + "loss": 1.4721, + "step": 149 + }, + { + "epoch": 0.005371819435242717, + "grad_norm": 1.8750903606414795, + "learning_rate": 3.579952267303103e-05, + "loss": 1.5406, + "step": 150 + }, + { + "epoch": 0.005407631564811001, + "grad_norm": 2.424189805984497, + "learning_rate": 3.60381861575179e-05, + "loss": 1.7863, + "step": 151 + }, + { + "epoch": 0.005443443694379286, + "grad_norm": 2.935016632080078, + "learning_rate": 3.627684964200477e-05, + "loss": 1.7224, + "step": 152 + }, + { + "epoch": 0.005479255823947571, + "grad_norm": 2.003542184829712, + "learning_rate": 3.651551312649165e-05, + "loss": 1.7105, + "step": 153 + }, + { + "epoch": 0.005515067953515856, + "grad_norm": 3.495239496231079, + "learning_rate": 3.675417661097852e-05, + "loss": 1.9181, + "step": 154 + }, + { + "epoch": 0.0055508800830841405, + "grad_norm": 1.9848411083221436, + "learning_rate": 3.6992840095465396e-05, + "loss": 1.5987, + "step": 155 + }, + { + "epoch": 0.005586692212652425, + "grad_norm": 2.2125675678253174, + "learning_rate": 3.723150357995227e-05, + "loss": 1.6504, + "step": 156 + }, + { + "epoch": 0.0056225043422207105, + "grad_norm": 2.676414728164673, + "learning_rate": 3.7470167064439145e-05, + "loss": 1.7868, + "step": 157 + }, + { + "epoch": 0.005658316471788995, + "grad_norm": 2.1446855068206787, + "learning_rate": 3.770883054892602e-05, + "loss": 1.6534, + "step": 158 + }, + { + "epoch": 0.00569412860135728, + "grad_norm": 3.0930895805358887, + "learning_rate": 3.794749403341289e-05, + "loss": 1.743, + "step": 159 + }, + { + "epoch": 0.005729940730925564, + "grad_norm": 3.054556131362915, + "learning_rate": 3.818615751789976e-05, + "loss": 1.7359, + "step": 160 + }, + { + "epoch": 0.00576575286049385, + "grad_norm": 3.361989736557007, + "learning_rate": 3.8424821002386637e-05, + "loss": 1.8632, + "step": 161 + }, + { + "epoch": 0.005801564990062134, + "grad_norm": 3.6084907054901123, + "learning_rate": 3.866348448687351e-05, + "loss": 1.9003, + "step": 162 + }, + { + "epoch": 0.005837377119630419, + "grad_norm": 1.9387969970703125, + "learning_rate": 3.8902147971360386e-05, + "loss": 1.573, + "step": 163 + }, + { + "epoch": 0.005873189249198703, + "grad_norm": 2.152981996536255, + "learning_rate": 3.914081145584726e-05, + "loss": 1.6964, + "step": 164 + }, + { + "epoch": 0.005909001378766989, + "grad_norm": 1.969617247581482, + "learning_rate": 3.9379474940334135e-05, + "loss": 1.5307, + "step": 165 + }, + { + "epoch": 0.005944813508335273, + "grad_norm": 2.4235682487487793, + "learning_rate": 3.9618138424821e-05, + "loss": 1.5868, + "step": 166 + }, + { + "epoch": 0.005980625637903558, + "grad_norm": 2.6382267475128174, + "learning_rate": 3.985680190930788e-05, + "loss": 1.4746, + "step": 167 + }, + { + "epoch": 0.0060164377674718425, + "grad_norm": 2.3006324768066406, + "learning_rate": 4.009546539379475e-05, + "loss": 1.6643, + "step": 168 + }, + { + "epoch": 0.006052249897040127, + "grad_norm": 3.0609922409057617, + "learning_rate": 4.0334128878281626e-05, + "loss": 1.91, + "step": 169 + }, + { + "epoch": 0.0060880620266084125, + "grad_norm": 1.9908839464187622, + "learning_rate": 4.0572792362768494e-05, + "loss": 1.9362, + "step": 170 + }, + { + "epoch": 0.006123874156176697, + "grad_norm": 1.8651454448699951, + "learning_rate": 4.081145584725537e-05, + "loss": 1.6055, + "step": 171 + }, + { + "epoch": 0.006159686285744982, + "grad_norm": 2.0197460651397705, + "learning_rate": 4.105011933174224e-05, + "loss": 1.6162, + "step": 172 + }, + { + "epoch": 0.006195498415313266, + "grad_norm": 2.119907855987549, + "learning_rate": 4.1288782816229125e-05, + "loss": 1.7101, + "step": 173 + }, + { + "epoch": 0.006231310544881552, + "grad_norm": 2.264143705368042, + "learning_rate": 4.152744630071599e-05, + "loss": 1.5838, + "step": 174 + }, + { + "epoch": 0.006267122674449836, + "grad_norm": 2.26053786277771, + "learning_rate": 4.176610978520287e-05, + "loss": 1.6075, + "step": 175 + }, + { + "epoch": 0.006302934804018121, + "grad_norm": 2.495673179626465, + "learning_rate": 4.200477326968974e-05, + "loss": 1.7947, + "step": 176 + }, + { + "epoch": 0.006338746933586405, + "grad_norm": 3.5133705139160156, + "learning_rate": 4.2243436754176616e-05, + "loss": 1.8883, + "step": 177 + }, + { + "epoch": 0.006374559063154691, + "grad_norm": 1.7048587799072266, + "learning_rate": 4.2482100238663484e-05, + "loss": 1.6943, + "step": 178 + }, + { + "epoch": 0.006410371192722975, + "grad_norm": 1.9267873764038086, + "learning_rate": 4.272076372315036e-05, + "loss": 1.5593, + "step": 179 + }, + { + "epoch": 0.00644618332229126, + "grad_norm": 2.540088653564453, + "learning_rate": 4.295942720763723e-05, + "loss": 1.7066, + "step": 180 + }, + { + "epoch": 0.0064819954518595445, + "grad_norm": 2.216285228729248, + "learning_rate": 4.319809069212411e-05, + "loss": 1.4369, + "step": 181 + }, + { + "epoch": 0.00651780758142783, + "grad_norm": 2.7366178035736084, + "learning_rate": 4.343675417661098e-05, + "loss": 1.77, + "step": 182 + }, + { + "epoch": 0.0065536197109961145, + "grad_norm": 2.9457297325134277, + "learning_rate": 4.367541766109786e-05, + "loss": 1.8139, + "step": 183 + }, + { + "epoch": 0.006589431840564399, + "grad_norm": 3.530334234237671, + "learning_rate": 4.391408114558473e-05, + "loss": 1.8187, + "step": 184 + }, + { + "epoch": 0.006625243970132684, + "grad_norm": 3.357346534729004, + "learning_rate": 4.41527446300716e-05, + "loss": 1.6704, + "step": 185 + }, + { + "epoch": 0.006661056099700969, + "grad_norm": 3.4373438358306885, + "learning_rate": 4.4391408114558474e-05, + "loss": 1.6934, + "step": 186 + }, + { + "epoch": 0.006696868229269254, + "grad_norm": 2.24269700050354, + "learning_rate": 4.463007159904535e-05, + "loss": 1.5898, + "step": 187 + }, + { + "epoch": 0.006732680358837538, + "grad_norm": 3.069500684738159, + "learning_rate": 4.486873508353222e-05, + "loss": 1.5757, + "step": 188 + }, + { + "epoch": 0.006768492488405823, + "grad_norm": 3.091425895690918, + "learning_rate": 4.510739856801909e-05, + "loss": 1.7151, + "step": 189 + }, + { + "epoch": 0.006804304617974108, + "grad_norm": 2.1302952766418457, + "learning_rate": 4.5346062052505965e-05, + "loss": 1.7047, + "step": 190 + }, + { + "epoch": 0.006840116747542393, + "grad_norm": 2.542729377746582, + "learning_rate": 4.5584725536992847e-05, + "loss": 1.7612, + "step": 191 + }, + { + "epoch": 0.006875928877110677, + "grad_norm": 2.7235147953033447, + "learning_rate": 4.582338902147972e-05, + "loss": 1.8554, + "step": 192 + }, + { + "epoch": 0.006911741006678962, + "grad_norm": 2.558994770050049, + "learning_rate": 4.606205250596659e-05, + "loss": 1.5257, + "step": 193 + }, + { + "epoch": 0.0069475531362472465, + "grad_norm": 2.686633825302124, + "learning_rate": 4.6300715990453463e-05, + "loss": 1.8072, + "step": 194 + }, + { + "epoch": 0.006983365265815532, + "grad_norm": 1.691375970840454, + "learning_rate": 4.653937947494034e-05, + "loss": 1.4815, + "step": 195 + }, + { + "epoch": 0.0070191773953838165, + "grad_norm": 1.907771348953247, + "learning_rate": 4.677804295942721e-05, + "loss": 1.5085, + "step": 196 + }, + { + "epoch": 0.007054989524952101, + "grad_norm": 3.1421594619750977, + "learning_rate": 4.701670644391408e-05, + "loss": 1.6389, + "step": 197 + }, + { + "epoch": 0.007090801654520386, + "grad_norm": 1.9183869361877441, + "learning_rate": 4.7255369928400955e-05, + "loss": 1.5199, + "step": 198 + }, + { + "epoch": 0.007126613784088671, + "grad_norm": 3.203547716140747, + "learning_rate": 4.749403341288783e-05, + "loss": 1.7751, + "step": 199 + }, + { + "epoch": 0.007162425913656956, + "grad_norm": 2.4169490337371826, + "learning_rate": 4.7732696897374704e-05, + "loss": 1.6014, + "step": 200 + }, + { + "epoch": 0.00719823804322524, + "grad_norm": 3.712132215499878, + "learning_rate": 4.797136038186158e-05, + "loss": 1.7816, + "step": 201 + }, + { + "epoch": 0.007234050172793525, + "grad_norm": 2.4880003929138184, + "learning_rate": 4.821002386634845e-05, + "loss": 1.614, + "step": 202 + }, + { + "epoch": 0.00726986230236181, + "grad_norm": 1.5979989767074585, + "learning_rate": 4.844868735083533e-05, + "loss": 1.3836, + "step": 203 + }, + { + "epoch": 0.007305674431930095, + "grad_norm": 2.17463755607605, + "learning_rate": 4.8687350835322196e-05, + "loss": 1.6439, + "step": 204 + }, + { + "epoch": 0.007341486561498379, + "grad_norm": 2.0307490825653076, + "learning_rate": 4.892601431980907e-05, + "loss": 1.7467, + "step": 205 + }, + { + "epoch": 0.007377298691066664, + "grad_norm": 2.1891257762908936, + "learning_rate": 4.9164677804295945e-05, + "loss": 1.6726, + "step": 206 + }, + { + "epoch": 0.007413110820634949, + "grad_norm": 2.227116823196411, + "learning_rate": 4.940334128878282e-05, + "loss": 1.7044, + "step": 207 + }, + { + "epoch": 0.007448922950203234, + "grad_norm": 3.0024282932281494, + "learning_rate": 4.964200477326969e-05, + "loss": 1.7156, + "step": 208 + }, + { + "epoch": 0.0074847350797715186, + "grad_norm": 2.97361421585083, + "learning_rate": 4.988066825775656e-05, + "loss": 1.5675, + "step": 209 + }, + { + "epoch": 0.007520547209339803, + "grad_norm": 3.295827865600586, + "learning_rate": 5.0119331742243436e-05, + "loss": 1.9186, + "step": 210 + }, + { + "epoch": 0.0075563593389080886, + "grad_norm": 2.144331693649292, + "learning_rate": 5.035799522673032e-05, + "loss": 1.5187, + "step": 211 + }, + { + "epoch": 0.007592171468476373, + "grad_norm": 3.32950496673584, + "learning_rate": 5.059665871121718e-05, + "loss": 1.6884, + "step": 212 + }, + { + "epoch": 0.007627983598044658, + "grad_norm": 1.79600989818573, + "learning_rate": 5.083532219570406e-05, + "loss": 1.6021, + "step": 213 + }, + { + "epoch": 0.007663795727612942, + "grad_norm": 2.3453643321990967, + "learning_rate": 5.107398568019093e-05, + "loss": 1.445, + "step": 214 + }, + { + "epoch": 0.007699607857181228, + "grad_norm": 2.727658987045288, + "learning_rate": 5.131264916467781e-05, + "loss": 1.567, + "step": 215 + }, + { + "epoch": 0.007735419986749512, + "grad_norm": 1.7330642938613892, + "learning_rate": 5.1551312649164684e-05, + "loss": 1.5268, + "step": 216 + }, + { + "epoch": 0.007771232116317797, + "grad_norm": 2.3396408557891846, + "learning_rate": 5.178997613365155e-05, + "loss": 1.5886, + "step": 217 + }, + { + "epoch": 0.007807044245886081, + "grad_norm": 2.689847946166992, + "learning_rate": 5.202863961813843e-05, + "loss": 1.6337, + "step": 218 + }, + { + "epoch": 0.007842856375454366, + "grad_norm": 2.9377753734588623, + "learning_rate": 5.22673031026253e-05, + "loss": 1.6259, + "step": 219 + }, + { + "epoch": 0.00787866850502265, + "grad_norm": 3.8337669372558594, + "learning_rate": 5.2505966587112175e-05, + "loss": 2.0595, + "step": 220 + }, + { + "epoch": 0.007914480634590935, + "grad_norm": 2.1507017612457275, + "learning_rate": 5.274463007159904e-05, + "loss": 1.6002, + "step": 221 + }, + { + "epoch": 0.007950292764159221, + "grad_norm": 3.378340005874634, + "learning_rate": 5.2983293556085924e-05, + "loss": 1.6729, + "step": 222 + }, + { + "epoch": 0.007986104893727506, + "grad_norm": 2.899789571762085, + "learning_rate": 5.322195704057279e-05, + "loss": 1.7742, + "step": 223 + }, + { + "epoch": 0.00802191702329579, + "grad_norm": 2.566958427429199, + "learning_rate": 5.346062052505967e-05, + "loss": 1.6819, + "step": 224 + }, + { + "epoch": 0.008057729152864075, + "grad_norm": 2.1302566528320312, + "learning_rate": 5.369928400954655e-05, + "loss": 1.5001, + "step": 225 + }, + { + "epoch": 0.00809354128243236, + "grad_norm": 2.246300458908081, + "learning_rate": 5.3937947494033416e-05, + "loss": 1.4505, + "step": 226 + }, + { + "epoch": 0.008129353412000644, + "grad_norm": 4.428207874298096, + "learning_rate": 5.417661097852029e-05, + "loss": 1.6956, + "step": 227 + }, + { + "epoch": 0.008165165541568929, + "grad_norm": 2.463998556137085, + "learning_rate": 5.441527446300716e-05, + "loss": 1.4438, + "step": 228 + }, + { + "epoch": 0.008200977671137213, + "grad_norm": 2.458064079284668, + "learning_rate": 5.465393794749404e-05, + "loss": 1.627, + "step": 229 + }, + { + "epoch": 0.0082367898007055, + "grad_norm": 2.793182134628296, + "learning_rate": 5.489260143198091e-05, + "loss": 1.6909, + "step": 230 + }, + { + "epoch": 0.008272601930273784, + "grad_norm": 3.4207935333251953, + "learning_rate": 5.513126491646778e-05, + "loss": 1.6793, + "step": 231 + }, + { + "epoch": 0.008308414059842069, + "grad_norm": 2.2470600605010986, + "learning_rate": 5.536992840095465e-05, + "loss": 1.623, + "step": 232 + }, + { + "epoch": 0.008344226189410353, + "grad_norm": 2.106220006942749, + "learning_rate": 5.560859188544153e-05, + "loss": 1.49, + "step": 233 + }, + { + "epoch": 0.008380038318978638, + "grad_norm": 3.599937915802002, + "learning_rate": 5.5847255369928406e-05, + "loss": 1.7566, + "step": 234 + }, + { + "epoch": 0.008415850448546923, + "grad_norm": 2.1725025177001953, + "learning_rate": 5.6085918854415273e-05, + "loss": 1.6674, + "step": 235 + }, + { + "epoch": 0.008451662578115207, + "grad_norm": 4.666144847869873, + "learning_rate": 5.6324582338902155e-05, + "loss": 1.9162, + "step": 236 + }, + { + "epoch": 0.008487474707683492, + "grad_norm": 1.8423166275024414, + "learning_rate": 5.656324582338902e-05, + "loss": 1.4833, + "step": 237 + }, + { + "epoch": 0.008523286837251778, + "grad_norm": 2.7874555587768555, + "learning_rate": 5.68019093078759e-05, + "loss": 1.7252, + "step": 238 + }, + { + "epoch": 0.008559098966820063, + "grad_norm": 1.7881454229354858, + "learning_rate": 5.7040572792362765e-05, + "loss": 1.4533, + "step": 239 + }, + { + "epoch": 0.008594911096388347, + "grad_norm": 2.1257472038269043, + "learning_rate": 5.7279236276849646e-05, + "loss": 1.4879, + "step": 240 + }, + { + "epoch": 0.008630723225956632, + "grad_norm": 3.0346462726593018, + "learning_rate": 5.7517899761336514e-05, + "loss": 1.6895, + "step": 241 + }, + { + "epoch": 0.008666535355524916, + "grad_norm": 2.270362377166748, + "learning_rate": 5.775656324582339e-05, + "loss": 1.6339, + "step": 242 + }, + { + "epoch": 0.008702347485093201, + "grad_norm": 2.9611315727233887, + "learning_rate": 5.799522673031027e-05, + "loss": 1.6146, + "step": 243 + }, + { + "epoch": 0.008738159614661485, + "grad_norm": 2.667623519897461, + "learning_rate": 5.823389021479714e-05, + "loss": 1.7676, + "step": 244 + }, + { + "epoch": 0.00877397174422977, + "grad_norm": 2.829141139984131, + "learning_rate": 5.847255369928402e-05, + "loss": 1.4516, + "step": 245 + }, + { + "epoch": 0.008809783873798055, + "grad_norm": 2.124843120574951, + "learning_rate": 5.871121718377088e-05, + "loss": 1.6966, + "step": 246 + }, + { + "epoch": 0.008845596003366341, + "grad_norm": 2.788170337677002, + "learning_rate": 5.894988066825776e-05, + "loss": 1.7343, + "step": 247 + }, + { + "epoch": 0.008881408132934625, + "grad_norm": 2.5599524974823, + "learning_rate": 5.918854415274463e-05, + "loss": 1.4274, + "step": 248 + }, + { + "epoch": 0.00891722026250291, + "grad_norm": 2.3799476623535156, + "learning_rate": 5.942720763723151e-05, + "loss": 1.2899, + "step": 249 + }, + { + "epoch": 0.008953032392071195, + "grad_norm": 2.313469171524048, + "learning_rate": 5.966587112171838e-05, + "loss": 1.4864, + "step": 250 + }, + { + "epoch": 0.00898884452163948, + "grad_norm": 1.8909716606140137, + "learning_rate": 5.990453460620525e-05, + "loss": 1.3952, + "step": 251 + }, + { + "epoch": 0.009024656651207764, + "grad_norm": 3.236157178878784, + "learning_rate": 6.0143198090692134e-05, + "loss": 1.7445, + "step": 252 + }, + { + "epoch": 0.009060468780776048, + "grad_norm": 1.8800829648971558, + "learning_rate": 6.0381861575179e-05, + "loss": 1.429, + "step": 253 + }, + { + "epoch": 0.009096280910344333, + "grad_norm": 2.1391427516937256, + "learning_rate": 6.062052505966588e-05, + "loss": 1.5344, + "step": 254 + }, + { + "epoch": 0.00913209303991262, + "grad_norm": 2.2850732803344727, + "learning_rate": 6.0859188544152745e-05, + "loss": 1.5958, + "step": 255 + }, + { + "epoch": 0.009167905169480904, + "grad_norm": 3.0357441902160645, + "learning_rate": 6.109785202863962e-05, + "loss": 1.5378, + "step": 256 + }, + { + "epoch": 0.009203717299049188, + "grad_norm": 2.880872964859009, + "learning_rate": 6.133651551312649e-05, + "loss": 1.5779, + "step": 257 + }, + { + "epoch": 0.009239529428617473, + "grad_norm": 2.8689186573028564, + "learning_rate": 6.157517899761337e-05, + "loss": 1.5466, + "step": 258 + }, + { + "epoch": 0.009275341558185757, + "grad_norm": 2.46167254447937, + "learning_rate": 6.181384248210024e-05, + "loss": 1.5887, + "step": 259 + }, + { + "epoch": 0.009311153687754042, + "grad_norm": 3.1274564266204834, + "learning_rate": 6.205250596658712e-05, + "loss": 1.5613, + "step": 260 + }, + { + "epoch": 0.009346965817322327, + "grad_norm": 2.283450126647949, + "learning_rate": 6.2291169451074e-05, + "loss": 1.4544, + "step": 261 + }, + { + "epoch": 0.009382777946890611, + "grad_norm": 2.0386641025543213, + "learning_rate": 6.252983293556087e-05, + "loss": 1.5232, + "step": 262 + }, + { + "epoch": 0.009418590076458897, + "grad_norm": 3.481228828430176, + "learning_rate": 6.276849642004773e-05, + "loss": 1.6494, + "step": 263 + }, + { + "epoch": 0.009454402206027182, + "grad_norm": 5.3388800621032715, + "learning_rate": 6.30071599045346e-05, + "loss": 1.7463, + "step": 264 + }, + { + "epoch": 0.009490214335595467, + "grad_norm": 2.0809738636016846, + "learning_rate": 6.324582338902148e-05, + "loss": 1.4703, + "step": 265 + }, + { + "epoch": 0.009526026465163751, + "grad_norm": 2.7124922275543213, + "learning_rate": 6.348448687350835e-05, + "loss": 1.5647, + "step": 266 + }, + { + "epoch": 0.009561838594732036, + "grad_norm": 2.6489923000335693, + "learning_rate": 6.372315035799523e-05, + "loss": 1.4582, + "step": 267 + }, + { + "epoch": 0.00959765072430032, + "grad_norm": 2.7069008350372314, + "learning_rate": 6.39618138424821e-05, + "loss": 1.6169, + "step": 268 + }, + { + "epoch": 0.009633462853868605, + "grad_norm": 2.5912561416625977, + "learning_rate": 6.420047732696898e-05, + "loss": 1.5278, + "step": 269 + }, + { + "epoch": 0.00966927498343689, + "grad_norm": 2.7277116775512695, + "learning_rate": 6.443914081145585e-05, + "loss": 1.6742, + "step": 270 + }, + { + "epoch": 0.009705087113005174, + "grad_norm": 2.103933334350586, + "learning_rate": 6.467780429594272e-05, + "loss": 1.4998, + "step": 271 + }, + { + "epoch": 0.00974089924257346, + "grad_norm": 1.9468450546264648, + "learning_rate": 6.49164677804296e-05, + "loss": 1.4758, + "step": 272 + }, + { + "epoch": 0.009776711372141745, + "grad_norm": 2.0153558254241943, + "learning_rate": 6.515513126491647e-05, + "loss": 1.4574, + "step": 273 + }, + { + "epoch": 0.00981252350171003, + "grad_norm": 2.9895577430725098, + "learning_rate": 6.539379474940335e-05, + "loss": 1.3721, + "step": 274 + }, + { + "epoch": 0.009848335631278314, + "grad_norm": 2.6503305435180664, + "learning_rate": 6.563245823389022e-05, + "loss": 1.6039, + "step": 275 + }, + { + "epoch": 0.009884147760846599, + "grad_norm": 2.5690181255340576, + "learning_rate": 6.58711217183771e-05, + "loss": 1.6495, + "step": 276 + }, + { + "epoch": 0.009919959890414883, + "grad_norm": 2.7271571159362793, + "learning_rate": 6.610978520286396e-05, + "loss": 1.584, + "step": 277 + }, + { + "epoch": 0.009955772019983168, + "grad_norm": 2.022071123123169, + "learning_rate": 6.634844868735083e-05, + "loss": 1.708, + "step": 278 + }, + { + "epoch": 0.009991584149551452, + "grad_norm": 1.8691020011901855, + "learning_rate": 6.65871121718377e-05, + "loss": 1.262, + "step": 279 + }, + { + "epoch": 0.010027396279119739, + "grad_norm": 1.8203330039978027, + "learning_rate": 6.682577565632458e-05, + "loss": 1.4293, + "step": 280 + }, + { + "epoch": 0.010063208408688023, + "grad_norm": 2.833688497543335, + "learning_rate": 6.706443914081146e-05, + "loss": 1.6313, + "step": 281 + }, + { + "epoch": 0.010099020538256308, + "grad_norm": 3.0585758686065674, + "learning_rate": 6.730310262529833e-05, + "loss": 1.459, + "step": 282 + }, + { + "epoch": 0.010134832667824592, + "grad_norm": 2.3719539642333984, + "learning_rate": 6.754176610978521e-05, + "loss": 1.8621, + "step": 283 + }, + { + "epoch": 0.010170644797392877, + "grad_norm": 2.9056437015533447, + "learning_rate": 6.778042959427208e-05, + "loss": 1.6852, + "step": 284 + }, + { + "epoch": 0.010206456926961162, + "grad_norm": 2.3873486518859863, + "learning_rate": 6.801909307875896e-05, + "loss": 1.551, + "step": 285 + }, + { + "epoch": 0.010242269056529446, + "grad_norm": 3.5705528259277344, + "learning_rate": 6.825775656324583e-05, + "loss": 1.9528, + "step": 286 + }, + { + "epoch": 0.01027808118609773, + "grad_norm": 2.3103690147399902, + "learning_rate": 6.84964200477327e-05, + "loss": 1.7863, + "step": 287 + }, + { + "epoch": 0.010313893315666015, + "grad_norm": 2.794950485229492, + "learning_rate": 6.873508353221956e-05, + "loss": 1.5539, + "step": 288 + }, + { + "epoch": 0.010349705445234302, + "grad_norm": 3.5903635025024414, + "learning_rate": 6.897374701670645e-05, + "loss": 1.4581, + "step": 289 + }, + { + "epoch": 0.010385517574802586, + "grad_norm": 3.4575657844543457, + "learning_rate": 6.921241050119333e-05, + "loss": 1.9184, + "step": 290 + }, + { + "epoch": 0.01042132970437087, + "grad_norm": 2.38521409034729, + "learning_rate": 6.94510739856802e-05, + "loss": 1.5218, + "step": 291 + }, + { + "epoch": 0.010457141833939155, + "grad_norm": 2.6045734882354736, + "learning_rate": 6.968973747016708e-05, + "loss": 1.524, + "step": 292 + }, + { + "epoch": 0.01049295396350744, + "grad_norm": 2.0054783821105957, + "learning_rate": 6.992840095465394e-05, + "loss": 1.5243, + "step": 293 + }, + { + "epoch": 0.010528766093075724, + "grad_norm": 2.1140778064727783, + "learning_rate": 7.016706443914081e-05, + "loss": 1.5203, + "step": 294 + }, + { + "epoch": 0.010564578222644009, + "grad_norm": 3.731011152267456, + "learning_rate": 7.040572792362768e-05, + "loss": 1.3458, + "step": 295 + }, + { + "epoch": 0.010600390352212294, + "grad_norm": 3.0818264484405518, + "learning_rate": 7.064439140811456e-05, + "loss": 1.6117, + "step": 296 + }, + { + "epoch": 0.01063620248178058, + "grad_norm": 1.7704260349273682, + "learning_rate": 7.088305489260143e-05, + "loss": 1.4489, + "step": 297 + }, + { + "epoch": 0.010672014611348864, + "grad_norm": 4.21128511428833, + "learning_rate": 7.112171837708831e-05, + "loss": 1.5717, + "step": 298 + }, + { + "epoch": 0.010707826740917149, + "grad_norm": 3.0868494510650635, + "learning_rate": 7.136038186157519e-05, + "loss": 1.6225, + "step": 299 + }, + { + "epoch": 0.010743638870485434, + "grad_norm": 2.5721237659454346, + "learning_rate": 7.159904534606206e-05, + "loss": 1.4213, + "step": 300 + }, + { + "epoch": 0.010779451000053718, + "grad_norm": 2.313586711883545, + "learning_rate": 7.183770883054893e-05, + "loss": 1.5104, + "step": 301 + }, + { + "epoch": 0.010815263129622003, + "grad_norm": 4.64439058303833, + "learning_rate": 7.20763723150358e-05, + "loss": 1.5411, + "step": 302 + }, + { + "epoch": 0.010851075259190287, + "grad_norm": 2.5180857181549072, + "learning_rate": 7.231503579952268e-05, + "loss": 1.5587, + "step": 303 + }, + { + "epoch": 0.010886887388758572, + "grad_norm": 3.869720458984375, + "learning_rate": 7.255369928400954e-05, + "loss": 1.5974, + "step": 304 + }, + { + "epoch": 0.010922699518326858, + "grad_norm": 2.4489381313323975, + "learning_rate": 7.279236276849643e-05, + "loss": 1.5586, + "step": 305 + }, + { + "epoch": 0.010958511647895143, + "grad_norm": 2.266218423843384, + "learning_rate": 7.30310262529833e-05, + "loss": 1.4492, + "step": 306 + }, + { + "epoch": 0.010994323777463427, + "grad_norm": 2.6831986904144287, + "learning_rate": 7.326968973747017e-05, + "loss": 1.5739, + "step": 307 + }, + { + "epoch": 0.011030135907031712, + "grad_norm": 2.0091495513916016, + "learning_rate": 7.350835322195704e-05, + "loss": 1.4518, + "step": 308 + }, + { + "epoch": 0.011065948036599996, + "grad_norm": 2.2850751876831055, + "learning_rate": 7.374701670644391e-05, + "loss": 1.5194, + "step": 309 + }, + { + "epoch": 0.011101760166168281, + "grad_norm": 2.1731743812561035, + "learning_rate": 7.398568019093079e-05, + "loss": 1.4046, + "step": 310 + }, + { + "epoch": 0.011137572295736566, + "grad_norm": 1.953567624092102, + "learning_rate": 7.422434367541766e-05, + "loss": 1.4523, + "step": 311 + }, + { + "epoch": 0.01117338442530485, + "grad_norm": 2.4975109100341797, + "learning_rate": 7.446300715990454e-05, + "loss": 1.7057, + "step": 312 + }, + { + "epoch": 0.011209196554873135, + "grad_norm": 2.3047521114349365, + "learning_rate": 7.470167064439141e-05, + "loss": 1.4276, + "step": 313 + }, + { + "epoch": 0.011245008684441421, + "grad_norm": 2.1207351684570312, + "learning_rate": 7.494033412887829e-05, + "loss": 1.5282, + "step": 314 + }, + { + "epoch": 0.011280820814009706, + "grad_norm": 2.6506083011627197, + "learning_rate": 7.517899761336516e-05, + "loss": 1.5721, + "step": 315 + }, + { + "epoch": 0.01131663294357799, + "grad_norm": 1.7507563829421997, + "learning_rate": 7.541766109785204e-05, + "loss": 1.6187, + "step": 316 + }, + { + "epoch": 0.011352445073146275, + "grad_norm": 2.510202407836914, + "learning_rate": 7.565632458233891e-05, + "loss": 1.6641, + "step": 317 + }, + { + "epoch": 0.01138825720271456, + "grad_norm": 2.121964931488037, + "learning_rate": 7.589498806682577e-05, + "loss": 1.5175, + "step": 318 + }, + { + "epoch": 0.011424069332282844, + "grad_norm": 1.9674251079559326, + "learning_rate": 7.613365155131266e-05, + "loss": 1.5351, + "step": 319 + }, + { + "epoch": 0.011459881461851128, + "grad_norm": 3.4989013671875, + "learning_rate": 7.637231503579952e-05, + "loss": 1.5804, + "step": 320 + }, + { + "epoch": 0.011495693591419413, + "grad_norm": 1.9454535245895386, + "learning_rate": 7.66109785202864e-05, + "loss": 1.6218, + "step": 321 + }, + { + "epoch": 0.0115315057209877, + "grad_norm": 2.2531652450561523, + "learning_rate": 7.684964200477327e-05, + "loss": 1.4871, + "step": 322 + }, + { + "epoch": 0.011567317850555984, + "grad_norm": 2.7230684757232666, + "learning_rate": 7.708830548926015e-05, + "loss": 1.7892, + "step": 323 + }, + { + "epoch": 0.011603129980124268, + "grad_norm": 1.7765686511993408, + "learning_rate": 7.732696897374702e-05, + "loss": 1.3985, + "step": 324 + }, + { + "epoch": 0.011638942109692553, + "grad_norm": 2.5176761150360107, + "learning_rate": 7.756563245823389e-05, + "loss": 1.5598, + "step": 325 + }, + { + "epoch": 0.011674754239260838, + "grad_norm": 2.064635753631592, + "learning_rate": 7.780429594272077e-05, + "loss": 1.5401, + "step": 326 + }, + { + "epoch": 0.011710566368829122, + "grad_norm": 2.688495635986328, + "learning_rate": 7.804295942720764e-05, + "loss": 1.656, + "step": 327 + }, + { + "epoch": 0.011746378498397407, + "grad_norm": 2.9895081520080566, + "learning_rate": 7.828162291169452e-05, + "loss": 1.5079, + "step": 328 + }, + { + "epoch": 0.011782190627965691, + "grad_norm": 2.745944023132324, + "learning_rate": 7.852028639618139e-05, + "loss": 1.5691, + "step": 329 + }, + { + "epoch": 0.011818002757533978, + "grad_norm": 2.3113796710968018, + "learning_rate": 7.875894988066827e-05, + "loss": 1.6125, + "step": 330 + }, + { + "epoch": 0.011853814887102262, + "grad_norm": 2.8892931938171387, + "learning_rate": 7.899761336515514e-05, + "loss": 1.6385, + "step": 331 + }, + { + "epoch": 0.011889627016670547, + "grad_norm": 2.405374050140381, + "learning_rate": 7.9236276849642e-05, + "loss": 1.3467, + "step": 332 + }, + { + "epoch": 0.011925439146238831, + "grad_norm": 2.8202621936798096, + "learning_rate": 7.947494033412887e-05, + "loss": 1.6252, + "step": 333 + }, + { + "epoch": 0.011961251275807116, + "grad_norm": 3.1517958641052246, + "learning_rate": 7.971360381861575e-05, + "loss": 1.6875, + "step": 334 + }, + { + "epoch": 0.0119970634053754, + "grad_norm": 3.480652093887329, + "learning_rate": 7.995226730310262e-05, + "loss": 1.717, + "step": 335 + }, + { + "epoch": 0.012032875534943685, + "grad_norm": 2.413255214691162, + "learning_rate": 8.01909307875895e-05, + "loss": 1.6398, + "step": 336 + }, + { + "epoch": 0.01206868766451197, + "grad_norm": 1.8622037172317505, + "learning_rate": 8.042959427207638e-05, + "loss": 1.4522, + "step": 337 + }, + { + "epoch": 0.012104499794080254, + "grad_norm": 2.7918858528137207, + "learning_rate": 8.066825775656325e-05, + "loss": 1.7565, + "step": 338 + }, + { + "epoch": 0.01214031192364854, + "grad_norm": 4.46349573135376, + "learning_rate": 8.090692124105012e-05, + "loss": 1.5644, + "step": 339 + }, + { + "epoch": 0.012176124053216825, + "grad_norm": 2.5053446292877197, + "learning_rate": 8.114558472553699e-05, + "loss": 1.6558, + "step": 340 + }, + { + "epoch": 0.01221193618278511, + "grad_norm": 3.422001838684082, + "learning_rate": 8.138424821002387e-05, + "loss": 1.5828, + "step": 341 + }, + { + "epoch": 0.012247748312353394, + "grad_norm": 2.465543508529663, + "learning_rate": 8.162291169451074e-05, + "loss": 1.6544, + "step": 342 + }, + { + "epoch": 0.012283560441921679, + "grad_norm": 1.9110023975372314, + "learning_rate": 8.186157517899762e-05, + "loss": 1.4342, + "step": 343 + }, + { + "epoch": 0.012319372571489963, + "grad_norm": 3.2773234844207764, + "learning_rate": 8.210023866348449e-05, + "loss": 1.7891, + "step": 344 + }, + { + "epoch": 0.012355184701058248, + "grad_norm": 3.242049217224121, + "learning_rate": 8.233890214797137e-05, + "loss": 1.3956, + "step": 345 + }, + { + "epoch": 0.012390996830626532, + "grad_norm": 3.2116689682006836, + "learning_rate": 8.257756563245825e-05, + "loss": 1.6721, + "step": 346 + }, + { + "epoch": 0.012426808960194819, + "grad_norm": 2.917055606842041, + "learning_rate": 8.28162291169451e-05, + "loss": 1.4671, + "step": 347 + }, + { + "epoch": 0.012462621089763103, + "grad_norm": 3.3686275482177734, + "learning_rate": 8.305489260143198e-05, + "loss": 1.6433, + "step": 348 + }, + { + "epoch": 0.012498433219331388, + "grad_norm": 2.5646297931671143, + "learning_rate": 8.329355608591885e-05, + "loss": 1.6772, + "step": 349 + }, + { + "epoch": 0.012534245348899672, + "grad_norm": 2.865633964538574, + "learning_rate": 8.353221957040573e-05, + "loss": 1.7133, + "step": 350 + }, + { + "epoch": 0.012570057478467957, + "grad_norm": 2.5325567722320557, + "learning_rate": 8.37708830548926e-05, + "loss": 1.4873, + "step": 351 + }, + { + "epoch": 0.012605869608036242, + "grad_norm": 2.647639036178589, + "learning_rate": 8.400954653937948e-05, + "loss": 1.4405, + "step": 352 + }, + { + "epoch": 0.012641681737604526, + "grad_norm": 2.211080551147461, + "learning_rate": 8.424821002386635e-05, + "loss": 1.548, + "step": 353 + }, + { + "epoch": 0.01267749386717281, + "grad_norm": 2.5468978881835938, + "learning_rate": 8.448687350835323e-05, + "loss": 1.4381, + "step": 354 + }, + { + "epoch": 0.012713305996741097, + "grad_norm": 3.32773756980896, + "learning_rate": 8.47255369928401e-05, + "loss": 1.5501, + "step": 355 + }, + { + "epoch": 0.012749118126309382, + "grad_norm": 2.5647685527801514, + "learning_rate": 8.496420047732697e-05, + "loss": 1.6415, + "step": 356 + }, + { + "epoch": 0.012784930255877666, + "grad_norm": 2.0625, + "learning_rate": 8.520286396181385e-05, + "loss": 1.3179, + "step": 357 + }, + { + "epoch": 0.01282074238544595, + "grad_norm": 2.5888986587524414, + "learning_rate": 8.544152744630072e-05, + "loss": 1.6903, + "step": 358 + }, + { + "epoch": 0.012856554515014235, + "grad_norm": 1.8542308807373047, + "learning_rate": 8.56801909307876e-05, + "loss": 1.4017, + "step": 359 + }, + { + "epoch": 0.01289236664458252, + "grad_norm": 2.4340898990631104, + "learning_rate": 8.591885441527447e-05, + "loss": 1.6514, + "step": 360 + }, + { + "epoch": 0.012928178774150804, + "grad_norm": 2.002131938934326, + "learning_rate": 8.615751789976135e-05, + "loss": 1.5071, + "step": 361 + }, + { + "epoch": 0.012963990903719089, + "grad_norm": 3.116835355758667, + "learning_rate": 8.639618138424822e-05, + "loss": 1.6753, + "step": 362 + }, + { + "epoch": 0.012999803033287374, + "grad_norm": 2.75567364692688, + "learning_rate": 8.663484486873508e-05, + "loss": 1.6025, + "step": 363 + }, + { + "epoch": 0.01303561516285566, + "grad_norm": 2.3767430782318115, + "learning_rate": 8.687350835322196e-05, + "loss": 1.3671, + "step": 364 + }, + { + "epoch": 0.013071427292423944, + "grad_norm": 2.2214763164520264, + "learning_rate": 8.711217183770883e-05, + "loss": 1.6114, + "step": 365 + }, + { + "epoch": 0.013107239421992229, + "grad_norm": 3.0783498287200928, + "learning_rate": 8.735083532219571e-05, + "loss": 1.4932, + "step": 366 + }, + { + "epoch": 0.013143051551560514, + "grad_norm": 2.9228477478027344, + "learning_rate": 8.758949880668258e-05, + "loss": 1.3618, + "step": 367 + }, + { + "epoch": 0.013178863681128798, + "grad_norm": 2.6385319232940674, + "learning_rate": 8.782816229116946e-05, + "loss": 1.4264, + "step": 368 + }, + { + "epoch": 0.013214675810697083, + "grad_norm": 1.7489711046218872, + "learning_rate": 8.806682577565633e-05, + "loss": 1.38, + "step": 369 + }, + { + "epoch": 0.013250487940265367, + "grad_norm": 2.224829912185669, + "learning_rate": 8.83054892601432e-05, + "loss": 1.5456, + "step": 370 + }, + { + "epoch": 0.013286300069833652, + "grad_norm": 2.071970224380493, + "learning_rate": 8.854415274463007e-05, + "loss": 1.5634, + "step": 371 + }, + { + "epoch": 0.013322112199401938, + "grad_norm": 3.9384520053863525, + "learning_rate": 8.878281622911695e-05, + "loss": 1.4102, + "step": 372 + }, + { + "epoch": 0.013357924328970223, + "grad_norm": 3.641294479370117, + "learning_rate": 8.902147971360383e-05, + "loss": 1.5808, + "step": 373 + }, + { + "epoch": 0.013393736458538507, + "grad_norm": 2.6139872074127197, + "learning_rate": 8.92601431980907e-05, + "loss": 1.4484, + "step": 374 + }, + { + "epoch": 0.013429548588106792, + "grad_norm": 3.1963746547698975, + "learning_rate": 8.949880668257758e-05, + "loss": 1.8896, + "step": 375 + }, + { + "epoch": 0.013465360717675076, + "grad_norm": 2.3136579990386963, + "learning_rate": 8.973747016706445e-05, + "loss": 1.7105, + "step": 376 + }, + { + "epoch": 0.013501172847243361, + "grad_norm": 2.85591983795166, + "learning_rate": 8.997613365155131e-05, + "loss": 1.5592, + "step": 377 + }, + { + "epoch": 0.013536984976811646, + "grad_norm": 3.3353285789489746, + "learning_rate": 9.021479713603818e-05, + "loss": 1.6256, + "step": 378 + }, + { + "epoch": 0.01357279710637993, + "grad_norm": 2.0619935989379883, + "learning_rate": 9.045346062052506e-05, + "loss": 1.2515, + "step": 379 + }, + { + "epoch": 0.013608609235948217, + "grad_norm": 2.125812292098999, + "learning_rate": 9.069212410501193e-05, + "loss": 1.492, + "step": 380 + }, + { + "epoch": 0.013644421365516501, + "grad_norm": 2.689272165298462, + "learning_rate": 9.093078758949881e-05, + "loss": 1.4702, + "step": 381 + }, + { + "epoch": 0.013680233495084786, + "grad_norm": 2.815014123916626, + "learning_rate": 9.116945107398569e-05, + "loss": 1.4324, + "step": 382 + }, + { + "epoch": 0.01371604562465307, + "grad_norm": 1.9601852893829346, + "learning_rate": 9.140811455847256e-05, + "loss": 1.648, + "step": 383 + }, + { + "epoch": 0.013751857754221355, + "grad_norm": 2.6593594551086426, + "learning_rate": 9.164677804295944e-05, + "loss": 1.564, + "step": 384 + }, + { + "epoch": 0.01378766988378964, + "grad_norm": 3.2223029136657715, + "learning_rate": 9.18854415274463e-05, + "loss": 1.6724, + "step": 385 + }, + { + "epoch": 0.013823482013357924, + "grad_norm": 2.2675719261169434, + "learning_rate": 9.212410501193318e-05, + "loss": 1.4491, + "step": 386 + }, + { + "epoch": 0.013859294142926208, + "grad_norm": 1.9350119829177856, + "learning_rate": 9.236276849642005e-05, + "loss": 1.7217, + "step": 387 + }, + { + "epoch": 0.013895106272494493, + "grad_norm": 1.6141301393508911, + "learning_rate": 9.260143198090693e-05, + "loss": 1.5277, + "step": 388 + }, + { + "epoch": 0.01393091840206278, + "grad_norm": 2.6942622661590576, + "learning_rate": 9.28400954653938e-05, + "loss": 1.4942, + "step": 389 + }, + { + "epoch": 0.013966730531631064, + "grad_norm": 2.0238726139068604, + "learning_rate": 9.307875894988068e-05, + "loss": 1.6766, + "step": 390 + }, + { + "epoch": 0.014002542661199349, + "grad_norm": 1.7464032173156738, + "learning_rate": 9.331742243436754e-05, + "loss": 1.4111, + "step": 391 + }, + { + "epoch": 0.014038354790767633, + "grad_norm": 2.241011381149292, + "learning_rate": 9.355608591885443e-05, + "loss": 1.6471, + "step": 392 + }, + { + "epoch": 0.014074166920335918, + "grad_norm": 2.3051598072052, + "learning_rate": 9.379474940334129e-05, + "loss": 1.5918, + "step": 393 + }, + { + "epoch": 0.014109979049904202, + "grad_norm": 1.7830252647399902, + "learning_rate": 9.403341288782816e-05, + "loss": 1.3958, + "step": 394 + }, + { + "epoch": 0.014145791179472487, + "grad_norm": 2.398982524871826, + "learning_rate": 9.427207637231504e-05, + "loss": 1.509, + "step": 395 + }, + { + "epoch": 0.014181603309040771, + "grad_norm": 1.6258431673049927, + "learning_rate": 9.451073985680191e-05, + "loss": 1.4408, + "step": 396 + }, + { + "epoch": 0.014217415438609058, + "grad_norm": 1.8768000602722168, + "learning_rate": 9.474940334128879e-05, + "loss": 1.3374, + "step": 397 + }, + { + "epoch": 0.014253227568177342, + "grad_norm": 3.2014665603637695, + "learning_rate": 9.498806682577566e-05, + "loss": 1.6725, + "step": 398 + }, + { + "epoch": 0.014289039697745627, + "grad_norm": 3.334557294845581, + "learning_rate": 9.522673031026254e-05, + "loss": 1.7284, + "step": 399 + }, + { + "epoch": 0.014324851827313911, + "grad_norm": 2.247483730316162, + "learning_rate": 9.546539379474941e-05, + "loss": 1.4581, + "step": 400 + }, + { + "epoch": 0.014360663956882196, + "grad_norm": 2.5330018997192383, + "learning_rate": 9.570405727923628e-05, + "loss": 1.6383, + "step": 401 + }, + { + "epoch": 0.01439647608645048, + "grad_norm": 3.856920003890991, + "learning_rate": 9.594272076372316e-05, + "loss": 1.6659, + "step": 402 + }, + { + "epoch": 0.014432288216018765, + "grad_norm": 1.8212999105453491, + "learning_rate": 9.618138424821003e-05, + "loss": 1.4602, + "step": 403 + }, + { + "epoch": 0.01446810034558705, + "grad_norm": 2.0022270679473877, + "learning_rate": 9.64200477326969e-05, + "loss": 1.4875, + "step": 404 + }, + { + "epoch": 0.014503912475155336, + "grad_norm": 1.913409948348999, + "learning_rate": 9.665871121718377e-05, + "loss": 1.3684, + "step": 405 + }, + { + "epoch": 0.01453972460472362, + "grad_norm": 2.898055076599121, + "learning_rate": 9.689737470167066e-05, + "loss": 1.6825, + "step": 406 + }, + { + "epoch": 0.014575536734291905, + "grad_norm": 2.658048629760742, + "learning_rate": 9.713603818615752e-05, + "loss": 1.6149, + "step": 407 + }, + { + "epoch": 0.01461134886386019, + "grad_norm": 1.7398033142089844, + "learning_rate": 9.737470167064439e-05, + "loss": 1.2893, + "step": 408 + }, + { + "epoch": 0.014647160993428474, + "grad_norm": 2.224679470062256, + "learning_rate": 9.761336515513126e-05, + "loss": 1.6499, + "step": 409 + }, + { + "epoch": 0.014682973122996759, + "grad_norm": 2.1326613426208496, + "learning_rate": 9.785202863961814e-05, + "loss": 1.3466, + "step": 410 + }, + { + "epoch": 0.014718785252565043, + "grad_norm": 2.226675510406494, + "learning_rate": 9.809069212410502e-05, + "loss": 1.4604, + "step": 411 + }, + { + "epoch": 0.014754597382133328, + "grad_norm": 2.5426831245422363, + "learning_rate": 9.832935560859189e-05, + "loss": 1.722, + "step": 412 + }, + { + "epoch": 0.014790409511701613, + "grad_norm": 1.3666105270385742, + "learning_rate": 9.856801909307877e-05, + "loss": 1.2438, + "step": 413 + }, + { + "epoch": 0.014826221641269899, + "grad_norm": 2.8758585453033447, + "learning_rate": 9.880668257756564e-05, + "loss": 1.662, + "step": 414 + }, + { + "epoch": 0.014862033770838183, + "grad_norm": 1.7606292963027954, + "learning_rate": 9.90453460620525e-05, + "loss": 1.4071, + "step": 415 + }, + { + "epoch": 0.014897845900406468, + "grad_norm": 2.3375020027160645, + "learning_rate": 9.928400954653937e-05, + "loss": 1.4146, + "step": 416 + }, + { + "epoch": 0.014933658029974753, + "grad_norm": 2.8114795684814453, + "learning_rate": 9.952267303102626e-05, + "loss": 1.6179, + "step": 417 + }, + { + "epoch": 0.014969470159543037, + "grad_norm": 2.596499443054199, + "learning_rate": 9.976133651551312e-05, + "loss": 1.526, + "step": 418 + }, + { + "epoch": 0.015005282289111322, + "grad_norm": 2.122304677963257, + "learning_rate": 0.0001, + "loss": 1.5291, + "step": 419 + }, + { + "epoch": 0.015041094418679606, + "grad_norm": 2.2158541679382324, + "learning_rate": 0.00010023866348448687, + "loss": 1.6054, + "step": 420 + }, + { + "epoch": 0.01507690654824789, + "grad_norm": 1.8862284421920776, + "learning_rate": 0.00010047732696897377, + "loss": 1.4248, + "step": 421 + }, + { + "epoch": 0.015112718677816177, + "grad_norm": 2.669682502746582, + "learning_rate": 0.00010071599045346064, + "loss": 1.6673, + "step": 422 + }, + { + "epoch": 0.015148530807384462, + "grad_norm": 2.158485174179077, + "learning_rate": 0.0001009546539379475, + "loss": 1.4613, + "step": 423 + }, + { + "epoch": 0.015184342936952746, + "grad_norm": 2.587890148162842, + "learning_rate": 0.00010119331742243436, + "loss": 1.482, + "step": 424 + }, + { + "epoch": 0.01522015506652103, + "grad_norm": 2.7657506465911865, + "learning_rate": 0.00010143198090692125, + "loss": 1.6588, + "step": 425 + }, + { + "epoch": 0.015255967196089315, + "grad_norm": 2.52256441116333, + "learning_rate": 0.00010167064439140812, + "loss": 1.6202, + "step": 426 + }, + { + "epoch": 0.0152917793256576, + "grad_norm": 1.8821640014648438, + "learning_rate": 0.00010190930787589499, + "loss": 1.5808, + "step": 427 + }, + { + "epoch": 0.015327591455225885, + "grad_norm": 2.9007699489593506, + "learning_rate": 0.00010214797136038186, + "loss": 1.2874, + "step": 428 + }, + { + "epoch": 0.015363403584794169, + "grad_norm": 1.9984593391418457, + "learning_rate": 0.00010238663484486875, + "loss": 1.4416, + "step": 429 + }, + { + "epoch": 0.015399215714362455, + "grad_norm": 3.3654909133911133, + "learning_rate": 0.00010262529832935562, + "loss": 1.5851, + "step": 430 + }, + { + "epoch": 0.01543502784393074, + "grad_norm": 2.6032309532165527, + "learning_rate": 0.00010286396181384249, + "loss": 1.5924, + "step": 431 + }, + { + "epoch": 0.015470839973499025, + "grad_norm": 3.3429181575775146, + "learning_rate": 0.00010310262529832937, + "loss": 1.6539, + "step": 432 + }, + { + "epoch": 0.01550665210306731, + "grad_norm": 3.3925251960754395, + "learning_rate": 0.00010334128878281624, + "loss": 1.9181, + "step": 433 + }, + { + "epoch": 0.015542464232635594, + "grad_norm": 2.270540952682495, + "learning_rate": 0.0001035799522673031, + "loss": 1.5382, + "step": 434 + }, + { + "epoch": 0.015578276362203878, + "grad_norm": 2.245727777481079, + "learning_rate": 0.00010381861575178997, + "loss": 1.3325, + "step": 435 + }, + { + "epoch": 0.015614088491772163, + "grad_norm": 2.1484389305114746, + "learning_rate": 0.00010405727923627687, + "loss": 1.454, + "step": 436 + }, + { + "epoch": 0.01564990062134045, + "grad_norm": 2.4563543796539307, + "learning_rate": 0.00010429594272076373, + "loss": 1.5659, + "step": 437 + }, + { + "epoch": 0.015685712750908732, + "grad_norm": 1.9042775630950928, + "learning_rate": 0.0001045346062052506, + "loss": 1.5145, + "step": 438 + }, + { + "epoch": 0.01572152488047702, + "grad_norm": 2.2876698970794678, + "learning_rate": 0.00010477326968973748, + "loss": 1.7526, + "step": 439 + }, + { + "epoch": 0.0157573370100453, + "grad_norm": 2.390381336212158, + "learning_rate": 0.00010501193317422435, + "loss": 1.6119, + "step": 440 + }, + { + "epoch": 0.015793149139613587, + "grad_norm": 2.185748338699341, + "learning_rate": 0.00010525059665871122, + "loss": 1.5182, + "step": 441 + }, + { + "epoch": 0.01582896126918187, + "grad_norm": 2.2949841022491455, + "learning_rate": 0.00010548926014319809, + "loss": 1.5194, + "step": 442 + }, + { + "epoch": 0.015864773398750157, + "grad_norm": 2.234761953353882, + "learning_rate": 0.00010572792362768498, + "loss": 1.2936, + "step": 443 + }, + { + "epoch": 0.015900585528318443, + "grad_norm": 2.609597682952881, + "learning_rate": 0.00010596658711217185, + "loss": 1.7512, + "step": 444 + }, + { + "epoch": 0.015936397657886726, + "grad_norm": 2.6631908416748047, + "learning_rate": 0.00010620525059665872, + "loss": 1.5374, + "step": 445 + }, + { + "epoch": 0.015972209787455012, + "grad_norm": 2.149907350540161, + "learning_rate": 0.00010644391408114558, + "loss": 1.5469, + "step": 446 + }, + { + "epoch": 0.016008021917023295, + "grad_norm": 1.7066127061843872, + "learning_rate": 0.00010668257756563247, + "loss": 1.5709, + "step": 447 + }, + { + "epoch": 0.01604383404659158, + "grad_norm": 2.1762490272521973, + "learning_rate": 0.00010692124105011933, + "loss": 1.3379, + "step": 448 + }, + { + "epoch": 0.016079646176159864, + "grad_norm": 2.8034191131591797, + "learning_rate": 0.0001071599045346062, + "loss": 1.8192, + "step": 449 + }, + { + "epoch": 0.01611545830572815, + "grad_norm": 2.088667392730713, + "learning_rate": 0.0001073985680190931, + "loss": 1.5966, + "step": 450 + }, + { + "epoch": 0.016151270435296437, + "grad_norm": 1.815684199333191, + "learning_rate": 0.00010763723150357996, + "loss": 1.4084, + "step": 451 + }, + { + "epoch": 0.01618708256486472, + "grad_norm": 2.5688586235046387, + "learning_rate": 0.00010787589498806683, + "loss": 1.5571, + "step": 452 + }, + { + "epoch": 0.016222894694433006, + "grad_norm": 2.6634678840637207, + "learning_rate": 0.0001081145584725537, + "loss": 1.4437, + "step": 453 + }, + { + "epoch": 0.01625870682400129, + "grad_norm": 1.97719407081604, + "learning_rate": 0.00010835322195704058, + "loss": 1.4032, + "step": 454 + }, + { + "epoch": 0.016294518953569575, + "grad_norm": 2.735307216644287, + "learning_rate": 0.00010859188544152745, + "loss": 1.4813, + "step": 455 + }, + { + "epoch": 0.016330331083137858, + "grad_norm": 2.0602328777313232, + "learning_rate": 0.00010883054892601432, + "loss": 1.5677, + "step": 456 + }, + { + "epoch": 0.016366143212706144, + "grad_norm": 2.0698113441467285, + "learning_rate": 0.00010906921241050121, + "loss": 1.4229, + "step": 457 + }, + { + "epoch": 0.016401955342274427, + "grad_norm": 2.866910696029663, + "learning_rate": 0.00010930787589498808, + "loss": 1.4976, + "step": 458 + }, + { + "epoch": 0.016437767471842713, + "grad_norm": 1.467098593711853, + "learning_rate": 0.00010954653937947495, + "loss": 1.2618, + "step": 459 + }, + { + "epoch": 0.016473579601411, + "grad_norm": 2.4938628673553467, + "learning_rate": 0.00010978520286396181, + "loss": 1.707, + "step": 460 + }, + { + "epoch": 0.016509391730979282, + "grad_norm": 1.890442132949829, + "learning_rate": 0.0001100238663484487, + "loss": 1.3069, + "step": 461 + }, + { + "epoch": 0.01654520386054757, + "grad_norm": 2.0892975330352783, + "learning_rate": 0.00011026252983293556, + "loss": 1.4264, + "step": 462 + }, + { + "epoch": 0.01658101599011585, + "grad_norm": 2.0770015716552734, + "learning_rate": 0.00011050119331742243, + "loss": 1.5281, + "step": 463 + }, + { + "epoch": 0.016616828119684138, + "grad_norm": 3.5818850994110107, + "learning_rate": 0.0001107398568019093, + "loss": 1.8764, + "step": 464 + }, + { + "epoch": 0.01665264024925242, + "grad_norm": 2.12496018409729, + "learning_rate": 0.0001109785202863962, + "loss": 1.404, + "step": 465 + }, + { + "epoch": 0.016688452378820707, + "grad_norm": 2.270075559616089, + "learning_rate": 0.00011121718377088306, + "loss": 1.3983, + "step": 466 + }, + { + "epoch": 0.01672426450838899, + "grad_norm": 1.6868388652801514, + "learning_rate": 0.00011145584725536993, + "loss": 1.3629, + "step": 467 + }, + { + "epoch": 0.016760076637957276, + "grad_norm": 2.2169179916381836, + "learning_rate": 0.00011169451073985681, + "loss": 1.6407, + "step": 468 + }, + { + "epoch": 0.016795888767525562, + "grad_norm": 2.054626703262329, + "learning_rate": 0.00011193317422434368, + "loss": 1.4835, + "step": 469 + }, + { + "epoch": 0.016831700897093845, + "grad_norm": 1.835919976234436, + "learning_rate": 0.00011217183770883055, + "loss": 1.5081, + "step": 470 + }, + { + "epoch": 0.01686751302666213, + "grad_norm": 2.0308279991149902, + "learning_rate": 0.00011241050119331741, + "loss": 1.5131, + "step": 471 + }, + { + "epoch": 0.016903325156230414, + "grad_norm": 2.4710946083068848, + "learning_rate": 0.00011264916467780431, + "loss": 1.4718, + "step": 472 + }, + { + "epoch": 0.0169391372857987, + "grad_norm": 1.5181260108947754, + "learning_rate": 0.00011288782816229118, + "loss": 1.4645, + "step": 473 + }, + { + "epoch": 0.016974949415366983, + "grad_norm": 2.6623988151550293, + "learning_rate": 0.00011312649164677805, + "loss": 1.6559, + "step": 474 + }, + { + "epoch": 0.01701076154493527, + "grad_norm": 2.294326066970825, + "learning_rate": 0.00011336515513126493, + "loss": 1.5614, + "step": 475 + }, + { + "epoch": 0.017046573674503556, + "grad_norm": 2.280698299407959, + "learning_rate": 0.0001136038186157518, + "loss": 1.7461, + "step": 476 + }, + { + "epoch": 0.01708238580407184, + "grad_norm": 4.556243896484375, + "learning_rate": 0.00011384248210023866, + "loss": 1.9445, + "step": 477 + }, + { + "epoch": 0.017118197933640125, + "grad_norm": 2.7702457904815674, + "learning_rate": 0.00011408114558472553, + "loss": 1.5847, + "step": 478 + }, + { + "epoch": 0.017154010063208408, + "grad_norm": 3.4149343967437744, + "learning_rate": 0.00011431980906921242, + "loss": 1.6731, + "step": 479 + }, + { + "epoch": 0.017189822192776694, + "grad_norm": 2.6306471824645996, + "learning_rate": 0.00011455847255369929, + "loss": 1.6897, + "step": 480 + }, + { + "epoch": 0.017225634322344977, + "grad_norm": 2.003208637237549, + "learning_rate": 0.00011479713603818616, + "loss": 1.5142, + "step": 481 + }, + { + "epoch": 0.017261446451913263, + "grad_norm": 2.253591775894165, + "learning_rate": 0.00011503579952267303, + "loss": 1.5199, + "step": 482 + }, + { + "epoch": 0.017297258581481546, + "grad_norm": 2.561537504196167, + "learning_rate": 0.00011527446300715991, + "loss": 1.3543, + "step": 483 + }, + { + "epoch": 0.017333070711049833, + "grad_norm": 2.88240647315979, + "learning_rate": 0.00011551312649164678, + "loss": 1.3692, + "step": 484 + }, + { + "epoch": 0.01736888284061812, + "grad_norm": 1.805859923362732, + "learning_rate": 0.00011575178997613365, + "loss": 1.5633, + "step": 485 + }, + { + "epoch": 0.017404694970186402, + "grad_norm": 2.282787799835205, + "learning_rate": 0.00011599045346062054, + "loss": 1.5185, + "step": 486 + }, + { + "epoch": 0.017440507099754688, + "grad_norm": 2.162992000579834, + "learning_rate": 0.00011622911694510741, + "loss": 1.5942, + "step": 487 + }, + { + "epoch": 0.01747631922932297, + "grad_norm": 2.0853872299194336, + "learning_rate": 0.00011646778042959428, + "loss": 1.4874, + "step": 488 + }, + { + "epoch": 0.017512131358891257, + "grad_norm": 1.9102565050125122, + "learning_rate": 0.00011670644391408114, + "loss": 1.4809, + "step": 489 + }, + { + "epoch": 0.01754794348845954, + "grad_norm": 2.676542282104492, + "learning_rate": 0.00011694510739856804, + "loss": 1.7596, + "step": 490 + }, + { + "epoch": 0.017583755618027826, + "grad_norm": 2.1234824657440186, + "learning_rate": 0.0001171837708830549, + "loss": 1.4975, + "step": 491 + }, + { + "epoch": 0.01761956774759611, + "grad_norm": 2.2232038974761963, + "learning_rate": 0.00011742243436754176, + "loss": 1.445, + "step": 492 + }, + { + "epoch": 0.017655379877164395, + "grad_norm": 1.653510332107544, + "learning_rate": 0.00011766109785202863, + "loss": 1.5766, + "step": 493 + }, + { + "epoch": 0.017691192006732682, + "grad_norm": 4.879425525665283, + "learning_rate": 0.00011789976133651552, + "loss": 1.7899, + "step": 494 + }, + { + "epoch": 0.017727004136300965, + "grad_norm": 1.8105230331420898, + "learning_rate": 0.00011813842482100239, + "loss": 1.4313, + "step": 495 + }, + { + "epoch": 0.01776281626586925, + "grad_norm": 2.2959015369415283, + "learning_rate": 0.00011837708830548926, + "loss": 1.5273, + "step": 496 + }, + { + "epoch": 0.017798628395437534, + "grad_norm": 2.696749687194824, + "learning_rate": 0.00011861575178997615, + "loss": 1.5946, + "step": 497 + }, + { + "epoch": 0.01783444052500582, + "grad_norm": 3.0255179405212402, + "learning_rate": 0.00011885441527446302, + "loss": 1.8037, + "step": 498 + }, + { + "epoch": 0.017870252654574103, + "grad_norm": 2.070873260498047, + "learning_rate": 0.00011909307875894989, + "loss": 1.3847, + "step": 499 + }, + { + "epoch": 0.01790606478414239, + "grad_norm": 1.8404755592346191, + "learning_rate": 0.00011933174224343676, + "loss": 1.4182, + "step": 500 + }, + { + "epoch": 0.017941876913710676, + "grad_norm": 2.6692428588867188, + "learning_rate": 0.00011957040572792364, + "loss": 1.6379, + "step": 501 + }, + { + "epoch": 0.01797768904327896, + "grad_norm": 2.721924066543579, + "learning_rate": 0.0001198090692124105, + "loss": 1.7721, + "step": 502 + }, + { + "epoch": 0.018013501172847245, + "grad_norm": 1.856130599975586, + "learning_rate": 0.00012004773269689737, + "loss": 1.3399, + "step": 503 + }, + { + "epoch": 0.018049313302415528, + "grad_norm": 3.4332196712493896, + "learning_rate": 0.00012028639618138427, + "loss": 1.5406, + "step": 504 + }, + { + "epoch": 0.018085125431983814, + "grad_norm": 1.510866403579712, + "learning_rate": 0.00012052505966587114, + "loss": 1.4494, + "step": 505 + }, + { + "epoch": 0.018120937561552097, + "grad_norm": 2.149031400680542, + "learning_rate": 0.000120763723150358, + "loss": 1.5588, + "step": 506 + }, + { + "epoch": 0.018156749691120383, + "grad_norm": 3.3539419174194336, + "learning_rate": 0.00012100238663484487, + "loss": 1.3412, + "step": 507 + }, + { + "epoch": 0.018192561820688666, + "grad_norm": 2.411203622817993, + "learning_rate": 0.00012124105011933175, + "loss": 1.6064, + "step": 508 + }, + { + "epoch": 0.018228373950256952, + "grad_norm": 2.345850706100464, + "learning_rate": 0.00012147971360381862, + "loss": 1.45, + "step": 509 + }, + { + "epoch": 0.01826418607982524, + "grad_norm": 1.6317548751831055, + "learning_rate": 0.00012171837708830549, + "loss": 1.4779, + "step": 510 + }, + { + "epoch": 0.01829999820939352, + "grad_norm": 2.12003493309021, + "learning_rate": 0.00012195704057279236, + "loss": 1.5397, + "step": 511 + }, + { + "epoch": 0.018335810338961808, + "grad_norm": 2.1677746772766113, + "learning_rate": 0.00012219570405727924, + "loss": 1.4041, + "step": 512 + }, + { + "epoch": 0.01837162246853009, + "grad_norm": 2.0251317024230957, + "learning_rate": 0.0001224343675417661, + "loss": 1.5153, + "step": 513 + }, + { + "epoch": 0.018407434598098377, + "grad_norm": 2.02629017829895, + "learning_rate": 0.00012267303102625297, + "loss": 1.4137, + "step": 514 + }, + { + "epoch": 0.01844324672766666, + "grad_norm": 1.6484004259109497, + "learning_rate": 0.00012291169451073987, + "loss": 1.4443, + "step": 515 + }, + { + "epoch": 0.018479058857234946, + "grad_norm": 2.1281087398529053, + "learning_rate": 0.00012315035799522674, + "loss": 1.4706, + "step": 516 + }, + { + "epoch": 0.01851487098680323, + "grad_norm": 2.4120399951934814, + "learning_rate": 0.0001233890214797136, + "loss": 1.2966, + "step": 517 + }, + { + "epoch": 0.018550683116371515, + "grad_norm": 4.100740909576416, + "learning_rate": 0.00012362768496420047, + "loss": 1.8292, + "step": 518 + }, + { + "epoch": 0.0185864952459398, + "grad_norm": 2.3660569190979004, + "learning_rate": 0.00012386634844868737, + "loss": 1.6374, + "step": 519 + }, + { + "epoch": 0.018622307375508084, + "grad_norm": 1.7659571170806885, + "learning_rate": 0.00012410501193317423, + "loss": 1.3706, + "step": 520 + }, + { + "epoch": 0.01865811950507637, + "grad_norm": 2.330357551574707, + "learning_rate": 0.0001243436754176611, + "loss": 1.3862, + "step": 521 + }, + { + "epoch": 0.018693931634644653, + "grad_norm": 2.310359477996826, + "learning_rate": 0.000124582338902148, + "loss": 1.4126, + "step": 522 + }, + { + "epoch": 0.01872974376421294, + "grad_norm": 2.0201125144958496, + "learning_rate": 0.00012482100238663487, + "loss": 1.5457, + "step": 523 + }, + { + "epoch": 0.018765555893781222, + "grad_norm": 2.607203483581543, + "learning_rate": 0.00012505966587112173, + "loss": 1.3196, + "step": 524 + }, + { + "epoch": 0.01880136802334951, + "grad_norm": 1.8595284223556519, + "learning_rate": 0.0001252983293556086, + "loss": 1.4802, + "step": 525 + }, + { + "epoch": 0.018837180152917795, + "grad_norm": 1.9876042604446411, + "learning_rate": 0.00012553699284009547, + "loss": 1.6771, + "step": 526 + }, + { + "epoch": 0.018872992282486078, + "grad_norm": 2.028693437576294, + "learning_rate": 0.00012577565632458234, + "loss": 1.4664, + "step": 527 + }, + { + "epoch": 0.018908804412054364, + "grad_norm": 2.0126700401306152, + "learning_rate": 0.0001260143198090692, + "loss": 1.3732, + "step": 528 + }, + { + "epoch": 0.018944616541622647, + "grad_norm": 2.4646711349487305, + "learning_rate": 0.00012625298329355607, + "loss": 1.806, + "step": 529 + }, + { + "epoch": 0.018980428671190933, + "grad_norm": 2.2601797580718994, + "learning_rate": 0.00012649164677804297, + "loss": 1.5969, + "step": 530 + }, + { + "epoch": 0.019016240800759216, + "grad_norm": 2.5840182304382324, + "learning_rate": 0.00012673031026252983, + "loss": 1.5037, + "step": 531 + }, + { + "epoch": 0.019052052930327502, + "grad_norm": 1.7634466886520386, + "learning_rate": 0.0001269689737470167, + "loss": 1.3574, + "step": 532 + }, + { + "epoch": 0.019087865059895785, + "grad_norm": 1.8872238397598267, + "learning_rate": 0.0001272076372315036, + "loss": 1.6497, + "step": 533 + }, + { + "epoch": 0.01912367718946407, + "grad_norm": 2.4874236583709717, + "learning_rate": 0.00012744630071599047, + "loss": 1.7061, + "step": 534 + }, + { + "epoch": 0.019159489319032358, + "grad_norm": 2.9422991275787354, + "learning_rate": 0.00012768496420047733, + "loss": 1.3664, + "step": 535 + }, + { + "epoch": 0.01919530144860064, + "grad_norm": 1.798109769821167, + "learning_rate": 0.0001279236276849642, + "loss": 1.4006, + "step": 536 + }, + { + "epoch": 0.019231113578168927, + "grad_norm": 2.2389039993286133, + "learning_rate": 0.0001281622911694511, + "loss": 1.6001, + "step": 537 + }, + { + "epoch": 0.01926692570773721, + "grad_norm": 1.9805078506469727, + "learning_rate": 0.00012840095465393796, + "loss": 1.5406, + "step": 538 + }, + { + "epoch": 0.019302737837305496, + "grad_norm": 1.676135778427124, + "learning_rate": 0.00012863961813842483, + "loss": 1.6067, + "step": 539 + }, + { + "epoch": 0.01933854996687378, + "grad_norm": 2.1805307865142822, + "learning_rate": 0.0001288782816229117, + "loss": 1.5176, + "step": 540 + }, + { + "epoch": 0.019374362096442065, + "grad_norm": 1.6874924898147583, + "learning_rate": 0.00012911694510739857, + "loss": 1.3764, + "step": 541 + }, + { + "epoch": 0.019410174226010348, + "grad_norm": 2.5747992992401123, + "learning_rate": 0.00012935560859188543, + "loss": 1.6511, + "step": 542 + }, + { + "epoch": 0.019445986355578634, + "grad_norm": 1.771819829940796, + "learning_rate": 0.0001295942720763723, + "loss": 1.2664, + "step": 543 + }, + { + "epoch": 0.01948179848514692, + "grad_norm": 2.090907096862793, + "learning_rate": 0.0001298329355608592, + "loss": 1.7495, + "step": 544 + }, + { + "epoch": 0.019517610614715204, + "grad_norm": 1.4134457111358643, + "learning_rate": 0.00013007159904534607, + "loss": 1.4309, + "step": 545 + }, + { + "epoch": 0.01955342274428349, + "grad_norm": 2.0431406497955322, + "learning_rate": 0.00013031026252983293, + "loss": 1.6351, + "step": 546 + }, + { + "epoch": 0.019589234873851773, + "grad_norm": 1.3423436880111694, + "learning_rate": 0.0001305489260143198, + "loss": 1.276, + "step": 547 + }, + { + "epoch": 0.01962504700342006, + "grad_norm": 1.6442458629608154, + "learning_rate": 0.0001307875894988067, + "loss": 1.4488, + "step": 548 + }, + { + "epoch": 0.019660859132988342, + "grad_norm": 1.785022497177124, + "learning_rate": 0.00013102625298329356, + "loss": 1.2787, + "step": 549 + }, + { + "epoch": 0.019696671262556628, + "grad_norm": 1.57109797000885, + "learning_rate": 0.00013126491646778043, + "loss": 1.5193, + "step": 550 + }, + { + "epoch": 0.01973248339212491, + "grad_norm": 2.120318651199341, + "learning_rate": 0.00013150357995226733, + "loss": 1.5145, + "step": 551 + }, + { + "epoch": 0.019768295521693197, + "grad_norm": 1.9597117900848389, + "learning_rate": 0.0001317422434367542, + "loss": 1.3981, + "step": 552 + }, + { + "epoch": 0.019804107651261484, + "grad_norm": 2.4275288581848145, + "learning_rate": 0.00013198090692124106, + "loss": 1.6049, + "step": 553 + }, + { + "epoch": 0.019839919780829766, + "grad_norm": 2.1035890579223633, + "learning_rate": 0.00013221957040572793, + "loss": 1.4694, + "step": 554 + }, + { + "epoch": 0.019875731910398053, + "grad_norm": 2.2771594524383545, + "learning_rate": 0.00013245823389021482, + "loss": 1.7156, + "step": 555 + }, + { + "epoch": 0.019911544039966336, + "grad_norm": 2.0216922760009766, + "learning_rate": 0.00013269689737470167, + "loss": 1.4243, + "step": 556 + }, + { + "epoch": 0.019947356169534622, + "grad_norm": 2.6359267234802246, + "learning_rate": 0.00013293556085918853, + "loss": 1.5066, + "step": 557 + }, + { + "epoch": 0.019983168299102905, + "grad_norm": 2.2418434619903564, + "learning_rate": 0.0001331742243436754, + "loss": 1.652, + "step": 558 + }, + { + "epoch": 0.02001898042867119, + "grad_norm": 1.6685441732406616, + "learning_rate": 0.0001334128878281623, + "loss": 1.5786, + "step": 559 + }, + { + "epoch": 0.020054792558239477, + "grad_norm": 1.7990673780441284, + "learning_rate": 0.00013365155131264916, + "loss": 1.7492, + "step": 560 + }, + { + "epoch": 0.02009060468780776, + "grad_norm": 3.1365246772766113, + "learning_rate": 0.00013389021479713603, + "loss": 1.4642, + "step": 561 + }, + { + "epoch": 0.020126416817376046, + "grad_norm": 1.7074172496795654, + "learning_rate": 0.00013412887828162293, + "loss": 1.4806, + "step": 562 + }, + { + "epoch": 0.02016222894694433, + "grad_norm": 2.618900775909424, + "learning_rate": 0.0001343675417661098, + "loss": 1.5925, + "step": 563 + }, + { + "epoch": 0.020198041076512616, + "grad_norm": 1.8773434162139893, + "learning_rate": 0.00013460620525059666, + "loss": 1.6463, + "step": 564 + }, + { + "epoch": 0.0202338532060809, + "grad_norm": 2.0296530723571777, + "learning_rate": 0.00013484486873508353, + "loss": 1.4843, + "step": 565 + }, + { + "epoch": 0.020269665335649185, + "grad_norm": 1.5701045989990234, + "learning_rate": 0.00013508353221957042, + "loss": 1.336, + "step": 566 + }, + { + "epoch": 0.020305477465217468, + "grad_norm": 2.7090563774108887, + "learning_rate": 0.0001353221957040573, + "loss": 1.3759, + "step": 567 + }, + { + "epoch": 0.020341289594785754, + "grad_norm": 3.502772808074951, + "learning_rate": 0.00013556085918854416, + "loss": 1.4918, + "step": 568 + }, + { + "epoch": 0.02037710172435404, + "grad_norm": 1.492253303527832, + "learning_rate": 0.00013579952267303105, + "loss": 1.397, + "step": 569 + }, + { + "epoch": 0.020412913853922323, + "grad_norm": 2.087813377380371, + "learning_rate": 0.00013603818615751792, + "loss": 1.4519, + "step": 570 + }, + { + "epoch": 0.02044872598349061, + "grad_norm": 2.9932732582092285, + "learning_rate": 0.0001362768496420048, + "loss": 1.5685, + "step": 571 + }, + { + "epoch": 0.020484538113058892, + "grad_norm": 1.847307562828064, + "learning_rate": 0.00013651551312649166, + "loss": 1.5718, + "step": 572 + }, + { + "epoch": 0.02052035024262718, + "grad_norm": 2.114513397216797, + "learning_rate": 0.00013675417661097853, + "loss": 1.6751, + "step": 573 + }, + { + "epoch": 0.02055616237219546, + "grad_norm": 2.1023786067962646, + "learning_rate": 0.0001369928400954654, + "loss": 1.4972, + "step": 574 + }, + { + "epoch": 0.020591974501763748, + "grad_norm": 2.4452288150787354, + "learning_rate": 0.00013723150357995226, + "loss": 1.3711, + "step": 575 + }, + { + "epoch": 0.02062778663133203, + "grad_norm": 1.66960871219635, + "learning_rate": 0.00013747016706443913, + "loss": 1.5393, + "step": 576 + }, + { + "epoch": 0.020663598760900317, + "grad_norm": 1.7724089622497559, + "learning_rate": 0.00013770883054892602, + "loss": 1.4416, + "step": 577 + }, + { + "epoch": 0.020699410890468603, + "grad_norm": 2.385772705078125, + "learning_rate": 0.0001379474940334129, + "loss": 1.7429, + "step": 578 + }, + { + "epoch": 0.020735223020036886, + "grad_norm": 4.203891754150391, + "learning_rate": 0.00013818615751789976, + "loss": 1.5037, + "step": 579 + }, + { + "epoch": 0.020771035149605172, + "grad_norm": 2.31666898727417, + "learning_rate": 0.00013842482100238665, + "loss": 1.5759, + "step": 580 + }, + { + "epoch": 0.020806847279173455, + "grad_norm": 2.6279335021972656, + "learning_rate": 0.00013866348448687352, + "loss": 1.6, + "step": 581 + }, + { + "epoch": 0.02084265940874174, + "grad_norm": 2.5068516731262207, + "learning_rate": 0.0001389021479713604, + "loss": 1.6855, + "step": 582 + }, + { + "epoch": 0.020878471538310024, + "grad_norm": 2.1207051277160645, + "learning_rate": 0.00013914081145584726, + "loss": 1.3105, + "step": 583 + }, + { + "epoch": 0.02091428366787831, + "grad_norm": 2.8799822330474854, + "learning_rate": 0.00013937947494033415, + "loss": 1.5562, + "step": 584 + }, + { + "epoch": 0.020950095797446597, + "grad_norm": 2.6221089363098145, + "learning_rate": 0.00013961813842482102, + "loss": 1.7223, + "step": 585 + }, + { + "epoch": 0.02098590792701488, + "grad_norm": 1.9271230697631836, + "learning_rate": 0.0001398568019093079, + "loss": 1.661, + "step": 586 + }, + { + "epoch": 0.021021720056583166, + "grad_norm": 2.0012738704681396, + "learning_rate": 0.00014009546539379476, + "loss": 1.4425, + "step": 587 + }, + { + "epoch": 0.02105753218615145, + "grad_norm": 3.31992506980896, + "learning_rate": 0.00014033412887828162, + "loss": 1.713, + "step": 588 + }, + { + "epoch": 0.021093344315719735, + "grad_norm": 1.5446326732635498, + "learning_rate": 0.0001405727923627685, + "loss": 1.5133, + "step": 589 + }, + { + "epoch": 0.021129156445288018, + "grad_norm": 1.6269475221633911, + "learning_rate": 0.00014081145584725536, + "loss": 1.3636, + "step": 590 + }, + { + "epoch": 0.021164968574856304, + "grad_norm": 2.511564016342163, + "learning_rate": 0.00014105011933174225, + "loss": 1.4981, + "step": 591 + }, + { + "epoch": 0.021200780704424587, + "grad_norm": 2.521416187286377, + "learning_rate": 0.00014128878281622912, + "loss": 1.7319, + "step": 592 + }, + { + "epoch": 0.021236592833992873, + "grad_norm": 1.8882571458816528, + "learning_rate": 0.000141527446300716, + "loss": 1.4489, + "step": 593 + }, + { + "epoch": 0.02127240496356116, + "grad_norm": 2.174049139022827, + "learning_rate": 0.00014176610978520286, + "loss": 1.3963, + "step": 594 + }, + { + "epoch": 0.021308217093129442, + "grad_norm": 2.4185924530029297, + "learning_rate": 0.00014200477326968975, + "loss": 1.6373, + "step": 595 + }, + { + "epoch": 0.02134402922269773, + "grad_norm": 2.1939618587493896, + "learning_rate": 0.00014224343675417662, + "loss": 1.5498, + "step": 596 + }, + { + "epoch": 0.02137984135226601, + "grad_norm": 2.4913125038146973, + "learning_rate": 0.0001424821002386635, + "loss": 1.8692, + "step": 597 + }, + { + "epoch": 0.021415653481834298, + "grad_norm": 1.9227137565612793, + "learning_rate": 0.00014272076372315038, + "loss": 1.6009, + "step": 598 + }, + { + "epoch": 0.02145146561140258, + "grad_norm": 2.1582274436950684, + "learning_rate": 0.00014295942720763725, + "loss": 1.6387, + "step": 599 + }, + { + "epoch": 0.021487277740970867, + "grad_norm": 2.5576729774475098, + "learning_rate": 0.00014319809069212412, + "loss": 1.2952, + "step": 600 + }, + { + "epoch": 0.02152308987053915, + "grad_norm": 1.9126322269439697, + "learning_rate": 0.000143436754176611, + "loss": 1.4009, + "step": 601 + }, + { + "epoch": 0.021558902000107436, + "grad_norm": 2.6814358234405518, + "learning_rate": 0.00014367541766109785, + "loss": 1.408, + "step": 602 + }, + { + "epoch": 0.021594714129675723, + "grad_norm": 1.689543604850769, + "learning_rate": 0.00014391408114558472, + "loss": 1.5057, + "step": 603 + }, + { + "epoch": 0.021630526259244005, + "grad_norm": 1.5119539499282837, + "learning_rate": 0.0001441527446300716, + "loss": 1.3634, + "step": 604 + }, + { + "epoch": 0.02166633838881229, + "grad_norm": 1.6267719268798828, + "learning_rate": 0.00014439140811455846, + "loss": 1.4943, + "step": 605 + }, + { + "epoch": 0.021702150518380574, + "grad_norm": 1.9156160354614258, + "learning_rate": 0.00014463007159904535, + "loss": 1.5274, + "step": 606 + }, + { + "epoch": 0.02173796264794886, + "grad_norm": 1.9828640222549438, + "learning_rate": 0.00014486873508353222, + "loss": 1.4492, + "step": 607 + }, + { + "epoch": 0.021773774777517144, + "grad_norm": 2.049508571624756, + "learning_rate": 0.0001451073985680191, + "loss": 1.7436, + "step": 608 + }, + { + "epoch": 0.02180958690708543, + "grad_norm": 2.167750120162964, + "learning_rate": 0.00014534606205250598, + "loss": 1.492, + "step": 609 + }, + { + "epoch": 0.021845399036653716, + "grad_norm": 1.9610743522644043, + "learning_rate": 0.00014558472553699285, + "loss": 1.3095, + "step": 610 + }, + { + "epoch": 0.021881211166222, + "grad_norm": 1.864098310470581, + "learning_rate": 0.00014582338902147972, + "loss": 1.5824, + "step": 611 + }, + { + "epoch": 0.021917023295790285, + "grad_norm": 2.4633240699768066, + "learning_rate": 0.0001460620525059666, + "loss": 1.5223, + "step": 612 + }, + { + "epoch": 0.021952835425358568, + "grad_norm": 1.8569053411483765, + "learning_rate": 0.00014630071599045348, + "loss": 1.5965, + "step": 613 + }, + { + "epoch": 0.021988647554926855, + "grad_norm": 1.4310226440429688, + "learning_rate": 0.00014653937947494035, + "loss": 1.3882, + "step": 614 + }, + { + "epoch": 0.022024459684495137, + "grad_norm": 1.8998956680297852, + "learning_rate": 0.00014677804295942722, + "loss": 1.5781, + "step": 615 + }, + { + "epoch": 0.022060271814063424, + "grad_norm": 1.602454662322998, + "learning_rate": 0.00014701670644391409, + "loss": 1.2836, + "step": 616 + }, + { + "epoch": 0.022096083943631706, + "grad_norm": 1.8213428258895874, + "learning_rate": 0.00014725536992840095, + "loss": 1.6076, + "step": 617 + }, + { + "epoch": 0.022131896073199993, + "grad_norm": 1.9968669414520264, + "learning_rate": 0.00014749403341288782, + "loss": 1.4507, + "step": 618 + }, + { + "epoch": 0.02216770820276828, + "grad_norm": 2.010653018951416, + "learning_rate": 0.0001477326968973747, + "loss": 1.5399, + "step": 619 + }, + { + "epoch": 0.022203520332336562, + "grad_norm": 1.9982693195343018, + "learning_rate": 0.00014797136038186158, + "loss": 1.4877, + "step": 620 + }, + { + "epoch": 0.022239332461904848, + "grad_norm": 2.66056489944458, + "learning_rate": 0.00014821002386634845, + "loss": 1.5331, + "step": 621 + }, + { + "epoch": 0.02227514459147313, + "grad_norm": 2.598308801651001, + "learning_rate": 0.00014844868735083532, + "loss": 1.6922, + "step": 622 + }, + { + "epoch": 0.022310956721041417, + "grad_norm": 2.060718297958374, + "learning_rate": 0.0001486873508353222, + "loss": 1.2368, + "step": 623 + }, + { + "epoch": 0.0223467688506097, + "grad_norm": 3.3509955406188965, + "learning_rate": 0.00014892601431980908, + "loss": 1.5786, + "step": 624 + }, + { + "epoch": 0.022382580980177987, + "grad_norm": 2.1646339893341064, + "learning_rate": 0.00014916467780429595, + "loss": 1.2934, + "step": 625 + }, + { + "epoch": 0.02241839310974627, + "grad_norm": 1.9818437099456787, + "learning_rate": 0.00014940334128878282, + "loss": 1.4907, + "step": 626 + }, + { + "epoch": 0.022454205239314556, + "grad_norm": 2.5283091068267822, + "learning_rate": 0.0001496420047732697, + "loss": 1.4884, + "step": 627 + }, + { + "epoch": 0.022490017368882842, + "grad_norm": 3.418313980102539, + "learning_rate": 0.00014988066825775658, + "loss": 1.4712, + "step": 628 + }, + { + "epoch": 0.022525829498451125, + "grad_norm": 3.563206434249878, + "learning_rate": 0.00015011933174224345, + "loss": 1.578, + "step": 629 + }, + { + "epoch": 0.02256164162801941, + "grad_norm": 2.4981441497802734, + "learning_rate": 0.00015035799522673032, + "loss": 1.5151, + "step": 630 + }, + { + "epoch": 0.022597453757587694, + "grad_norm": 2.353191614151001, + "learning_rate": 0.0001505966587112172, + "loss": 1.301, + "step": 631 + }, + { + "epoch": 0.02263326588715598, + "grad_norm": 2.0780320167541504, + "learning_rate": 0.00015083532219570408, + "loss": 1.7056, + "step": 632 + }, + { + "epoch": 0.022669078016724263, + "grad_norm": 2.961005687713623, + "learning_rate": 0.00015107398568019092, + "loss": 1.5987, + "step": 633 + }, + { + "epoch": 0.02270489014629255, + "grad_norm": 2.7877278327941895, + "learning_rate": 0.00015131264916467781, + "loss": 1.3884, + "step": 634 + }, + { + "epoch": 0.022740702275860836, + "grad_norm": 1.9565117359161377, + "learning_rate": 0.00015155131264916468, + "loss": 1.5739, + "step": 635 + }, + { + "epoch": 0.02277651440542912, + "grad_norm": 1.853684663772583, + "learning_rate": 0.00015178997613365155, + "loss": 1.3897, + "step": 636 + }, + { + "epoch": 0.022812326534997405, + "grad_norm": 1.895973563194275, + "learning_rate": 0.00015202863961813842, + "loss": 1.4722, + "step": 637 + }, + { + "epoch": 0.022848138664565688, + "grad_norm": 2.927177906036377, + "learning_rate": 0.0001522673031026253, + "loss": 1.4169, + "step": 638 + }, + { + "epoch": 0.022883950794133974, + "grad_norm": 2.1195576190948486, + "learning_rate": 0.00015250596658711218, + "loss": 1.2965, + "step": 639 + }, + { + "epoch": 0.022919762923702257, + "grad_norm": 2.678638219833374, + "learning_rate": 0.00015274463007159905, + "loss": 1.4658, + "step": 640 + }, + { + "epoch": 0.022955575053270543, + "grad_norm": 1.9623972177505493, + "learning_rate": 0.00015298329355608592, + "loss": 1.5578, + "step": 641 + }, + { + "epoch": 0.022991387182838826, + "grad_norm": 2.223118782043457, + "learning_rate": 0.0001532219570405728, + "loss": 1.5751, + "step": 642 + }, + { + "epoch": 0.023027199312407112, + "grad_norm": 1.7825822830200195, + "learning_rate": 0.00015346062052505968, + "loss": 1.3153, + "step": 643 + }, + { + "epoch": 0.0230630114419754, + "grad_norm": 3.0335352420806885, + "learning_rate": 0.00015369928400954655, + "loss": 1.7608, + "step": 644 + }, + { + "epoch": 0.02309882357154368, + "grad_norm": 1.9853134155273438, + "learning_rate": 0.00015393794749403344, + "loss": 1.4849, + "step": 645 + }, + { + "epoch": 0.023134635701111968, + "grad_norm": 1.9020859003067017, + "learning_rate": 0.0001541766109785203, + "loss": 1.568, + "step": 646 + }, + { + "epoch": 0.02317044783068025, + "grad_norm": 1.6567760705947876, + "learning_rate": 0.00015441527446300718, + "loss": 1.4153, + "step": 647 + }, + { + "epoch": 0.023206259960248537, + "grad_norm": 1.6762211322784424, + "learning_rate": 0.00015465393794749404, + "loss": 1.4104, + "step": 648 + }, + { + "epoch": 0.02324207208981682, + "grad_norm": 2.0650994777679443, + "learning_rate": 0.0001548926014319809, + "loss": 1.6498, + "step": 649 + }, + { + "epoch": 0.023277884219385106, + "grad_norm": 2.1684529781341553, + "learning_rate": 0.00015513126491646778, + "loss": 1.6503, + "step": 650 + }, + { + "epoch": 0.02331369634895339, + "grad_norm": 1.4035128355026245, + "learning_rate": 0.00015536992840095465, + "loss": 1.5015, + "step": 651 + }, + { + "epoch": 0.023349508478521675, + "grad_norm": 1.7270311117172241, + "learning_rate": 0.00015560859188544154, + "loss": 1.6038, + "step": 652 + }, + { + "epoch": 0.02338532060808996, + "grad_norm": 2.0642249584198, + "learning_rate": 0.0001558472553699284, + "loss": 1.5229, + "step": 653 + }, + { + "epoch": 0.023421132737658244, + "grad_norm": 2.0907576084136963, + "learning_rate": 0.00015608591885441528, + "loss": 1.3946, + "step": 654 + }, + { + "epoch": 0.02345694486722653, + "grad_norm": 1.6790848970413208, + "learning_rate": 0.00015632458233890215, + "loss": 1.4842, + "step": 655 + }, + { + "epoch": 0.023492756996794813, + "grad_norm": 1.9100477695465088, + "learning_rate": 0.00015656324582338904, + "loss": 1.5578, + "step": 656 + }, + { + "epoch": 0.0235285691263631, + "grad_norm": 2.7048964500427246, + "learning_rate": 0.0001568019093078759, + "loss": 1.7734, + "step": 657 + }, + { + "epoch": 0.023564381255931383, + "grad_norm": 2.3780438899993896, + "learning_rate": 0.00015704057279236278, + "loss": 1.6938, + "step": 658 + }, + { + "epoch": 0.02360019338549967, + "grad_norm": 1.7933615446090698, + "learning_rate": 0.00015727923627684964, + "loss": 1.2613, + "step": 659 + }, + { + "epoch": 0.023636005515067955, + "grad_norm": 1.2775394916534424, + "learning_rate": 0.00015751789976133654, + "loss": 1.3772, + "step": 660 + }, + { + "epoch": 0.023671817644636238, + "grad_norm": 1.7792919874191284, + "learning_rate": 0.0001577565632458234, + "loss": 1.601, + "step": 661 + }, + { + "epoch": 0.023707629774204524, + "grad_norm": 2.0483450889587402, + "learning_rate": 0.00015799522673031027, + "loss": 1.4633, + "step": 662 + }, + { + "epoch": 0.023743441903772807, + "grad_norm": 1.8090413808822632, + "learning_rate": 0.00015823389021479714, + "loss": 1.5488, + "step": 663 + }, + { + "epoch": 0.023779254033341093, + "grad_norm": 1.5232633352279663, + "learning_rate": 0.000158472553699284, + "loss": 1.5515, + "step": 664 + }, + { + "epoch": 0.023815066162909376, + "grad_norm": 1.8411604166030884, + "learning_rate": 0.00015871121718377088, + "loss": 1.484, + "step": 665 + }, + { + "epoch": 0.023850878292477663, + "grad_norm": 1.5994863510131836, + "learning_rate": 0.00015894988066825775, + "loss": 1.2893, + "step": 666 + }, + { + "epoch": 0.023886690422045945, + "grad_norm": 2.1134698390960693, + "learning_rate": 0.00015918854415274464, + "loss": 1.6296, + "step": 667 + }, + { + "epoch": 0.02392250255161423, + "grad_norm": 1.4447811841964722, + "learning_rate": 0.0001594272076372315, + "loss": 1.5032, + "step": 668 + }, + { + "epoch": 0.023958314681182518, + "grad_norm": 1.7641936540603638, + "learning_rate": 0.00015966587112171838, + "loss": 1.3263, + "step": 669 + }, + { + "epoch": 0.0239941268107508, + "grad_norm": 2.2972166538238525, + "learning_rate": 0.00015990453460620524, + "loss": 1.6751, + "step": 670 + }, + { + "epoch": 0.024029938940319087, + "grad_norm": 1.7759915590286255, + "learning_rate": 0.00016014319809069214, + "loss": 1.3086, + "step": 671 + }, + { + "epoch": 0.02406575106988737, + "grad_norm": 1.2277716398239136, + "learning_rate": 0.000160381861575179, + "loss": 1.3404, + "step": 672 + }, + { + "epoch": 0.024101563199455656, + "grad_norm": 1.8086826801300049, + "learning_rate": 0.00016062052505966587, + "loss": 1.3971, + "step": 673 + }, + { + "epoch": 0.02413737532902394, + "grad_norm": 2.4142329692840576, + "learning_rate": 0.00016085918854415277, + "loss": 1.6087, + "step": 674 + }, + { + "epoch": 0.024173187458592225, + "grad_norm": 1.6419340372085571, + "learning_rate": 0.00016109785202863964, + "loss": 1.4247, + "step": 675 + }, + { + "epoch": 0.02420899958816051, + "grad_norm": 1.8007947206497192, + "learning_rate": 0.0001613365155131265, + "loss": 1.577, + "step": 676 + }, + { + "epoch": 0.024244811717728795, + "grad_norm": 1.502155065536499, + "learning_rate": 0.00016157517899761337, + "loss": 1.2915, + "step": 677 + }, + { + "epoch": 0.02428062384729708, + "grad_norm": 3.514531373977661, + "learning_rate": 0.00016181384248210024, + "loss": 2.0749, + "step": 678 + }, + { + "epoch": 0.024316435976865364, + "grad_norm": 1.7119113206863403, + "learning_rate": 0.0001620525059665871, + "loss": 1.3846, + "step": 679 + }, + { + "epoch": 0.02435224810643365, + "grad_norm": 1.8698878288269043, + "learning_rate": 0.00016229116945107398, + "loss": 1.4897, + "step": 680 + }, + { + "epoch": 0.024388060236001933, + "grad_norm": 2.292787790298462, + "learning_rate": 0.00016252983293556087, + "loss": 1.6294, + "step": 681 + }, + { + "epoch": 0.02442387236557022, + "grad_norm": 1.5224436521530151, + "learning_rate": 0.00016276849642004774, + "loss": 1.3606, + "step": 682 + }, + { + "epoch": 0.024459684495138502, + "grad_norm": 2.2999541759490967, + "learning_rate": 0.0001630071599045346, + "loss": 1.6978, + "step": 683 + }, + { + "epoch": 0.02449549662470679, + "grad_norm": 1.8322933912277222, + "learning_rate": 0.00016324582338902147, + "loss": 1.5394, + "step": 684 + }, + { + "epoch": 0.024531308754275075, + "grad_norm": 2.6339821815490723, + "learning_rate": 0.00016348448687350837, + "loss": 1.4105, + "step": 685 + }, + { + "epoch": 0.024567120883843357, + "grad_norm": 1.660827875137329, + "learning_rate": 0.00016372315035799524, + "loss": 1.3912, + "step": 686 + }, + { + "epoch": 0.024602933013411644, + "grad_norm": 1.5298078060150146, + "learning_rate": 0.0001639618138424821, + "loss": 1.3918, + "step": 687 + }, + { + "epoch": 0.024638745142979927, + "grad_norm": 1.7153851985931396, + "learning_rate": 0.00016420047732696897, + "loss": 1.5179, + "step": 688 + }, + { + "epoch": 0.024674557272548213, + "grad_norm": 1.2843354940414429, + "learning_rate": 0.00016443914081145587, + "loss": 1.4719, + "step": 689 + }, + { + "epoch": 0.024710369402116496, + "grad_norm": 1.647976279258728, + "learning_rate": 0.00016467780429594274, + "loss": 1.5815, + "step": 690 + }, + { + "epoch": 0.024746181531684782, + "grad_norm": 2.423780918121338, + "learning_rate": 0.0001649164677804296, + "loss": 1.7812, + "step": 691 + }, + { + "epoch": 0.024781993661253065, + "grad_norm": 2.153710126876831, + "learning_rate": 0.0001651551312649165, + "loss": 1.5844, + "step": 692 + }, + { + "epoch": 0.02481780579082135, + "grad_norm": 1.7690187692642212, + "learning_rate": 0.00016539379474940334, + "loss": 1.2681, + "step": 693 + }, + { + "epoch": 0.024853617920389637, + "grad_norm": 2.3822097778320312, + "learning_rate": 0.0001656324582338902, + "loss": 1.6041, + "step": 694 + }, + { + "epoch": 0.02488943004995792, + "grad_norm": 1.7092225551605225, + "learning_rate": 0.00016587112171837707, + "loss": 1.2328, + "step": 695 + }, + { + "epoch": 0.024925242179526207, + "grad_norm": 1.6043064594268799, + "learning_rate": 0.00016610978520286397, + "loss": 1.3735, + "step": 696 + }, + { + "epoch": 0.02496105430909449, + "grad_norm": 1.990520715713501, + "learning_rate": 0.00016634844868735084, + "loss": 1.3684, + "step": 697 + }, + { + "epoch": 0.024996866438662776, + "grad_norm": 4.514682292938232, + "learning_rate": 0.0001665871121718377, + "loss": 1.9956, + "step": 698 + }, + { + "epoch": 0.02503267856823106, + "grad_norm": 1.6009727716445923, + "learning_rate": 0.0001668257756563246, + "loss": 1.472, + "step": 699 + }, + { + "epoch": 0.025068490697799345, + "grad_norm": 2.4647836685180664, + "learning_rate": 0.00016706443914081147, + "loss": 1.5021, + "step": 700 + }, + { + "epoch": 0.025104302827367628, + "grad_norm": 2.4203274250030518, + "learning_rate": 0.00016730310262529834, + "loss": 1.721, + "step": 701 + }, + { + "epoch": 0.025140114956935914, + "grad_norm": 2.447371006011963, + "learning_rate": 0.0001675417661097852, + "loss": 1.6775, + "step": 702 + }, + { + "epoch": 0.0251759270865042, + "grad_norm": 2.295504093170166, + "learning_rate": 0.0001677804295942721, + "loss": 1.4216, + "step": 703 + }, + { + "epoch": 0.025211739216072483, + "grad_norm": 1.9491688013076782, + "learning_rate": 0.00016801909307875897, + "loss": 1.2632, + "step": 704 + }, + { + "epoch": 0.02524755134564077, + "grad_norm": 1.4649652242660522, + "learning_rate": 0.00016825775656324583, + "loss": 1.5811, + "step": 705 + }, + { + "epoch": 0.025283363475209052, + "grad_norm": 2.4039993286132812, + "learning_rate": 0.0001684964200477327, + "loss": 1.6793, + "step": 706 + }, + { + "epoch": 0.02531917560477734, + "grad_norm": 1.4853521585464478, + "learning_rate": 0.0001687350835322196, + "loss": 1.3945, + "step": 707 + }, + { + "epoch": 0.02535498773434562, + "grad_norm": 1.7133545875549316, + "learning_rate": 0.00016897374701670646, + "loss": 1.5144, + "step": 708 + }, + { + "epoch": 0.025390799863913908, + "grad_norm": 2.1684529781341553, + "learning_rate": 0.00016921241050119333, + "loss": 1.6288, + "step": 709 + }, + { + "epoch": 0.025426611993482194, + "grad_norm": 1.6406062841415405, + "learning_rate": 0.0001694510739856802, + "loss": 1.6772, + "step": 710 + }, + { + "epoch": 0.025462424123050477, + "grad_norm": 1.9215788841247559, + "learning_rate": 0.00016968973747016707, + "loss": 1.7083, + "step": 711 + }, + { + "epoch": 0.025498236252618763, + "grad_norm": 1.6396383047103882, + "learning_rate": 0.00016992840095465394, + "loss": 1.4036, + "step": 712 + }, + { + "epoch": 0.025534048382187046, + "grad_norm": 1.959446907043457, + "learning_rate": 0.0001701670644391408, + "loss": 1.3906, + "step": 713 + }, + { + "epoch": 0.025569860511755332, + "grad_norm": 1.883048176765442, + "learning_rate": 0.0001704057279236277, + "loss": 1.5238, + "step": 714 + }, + { + "epoch": 0.025605672641323615, + "grad_norm": 1.6886613368988037, + "learning_rate": 0.00017064439140811457, + "loss": 1.2999, + "step": 715 + }, + { + "epoch": 0.0256414847708919, + "grad_norm": 1.5378632545471191, + "learning_rate": 0.00017088305489260143, + "loss": 1.5146, + "step": 716 + }, + { + "epoch": 0.025677296900460184, + "grad_norm": 1.893235683441162, + "learning_rate": 0.00017112171837708833, + "loss": 1.5693, + "step": 717 + }, + { + "epoch": 0.02571310903002847, + "grad_norm": 1.9049478769302368, + "learning_rate": 0.0001713603818615752, + "loss": 1.4909, + "step": 718 + }, + { + "epoch": 0.025748921159596757, + "grad_norm": 1.622850775718689, + "learning_rate": 0.00017159904534606206, + "loss": 1.4286, + "step": 719 + }, + { + "epoch": 0.02578473328916504, + "grad_norm": 1.6297088861465454, + "learning_rate": 0.00017183770883054893, + "loss": 1.456, + "step": 720 + }, + { + "epoch": 0.025820545418733326, + "grad_norm": 1.8837428092956543, + "learning_rate": 0.00017207637231503583, + "loss": 1.6866, + "step": 721 + }, + { + "epoch": 0.02585635754830161, + "grad_norm": 1.3682360649108887, + "learning_rate": 0.0001723150357995227, + "loss": 1.4156, + "step": 722 + }, + { + "epoch": 0.025892169677869895, + "grad_norm": 1.9026600122451782, + "learning_rate": 0.00017255369928400956, + "loss": 1.4751, + "step": 723 + }, + { + "epoch": 0.025927981807438178, + "grad_norm": 1.929057240486145, + "learning_rate": 0.00017279236276849643, + "loss": 1.6288, + "step": 724 + }, + { + "epoch": 0.025963793937006464, + "grad_norm": 2.201592445373535, + "learning_rate": 0.0001730310262529833, + "loss": 1.6388, + "step": 725 + }, + { + "epoch": 0.025999606066574747, + "grad_norm": 1.5440824031829834, + "learning_rate": 0.00017326968973747017, + "loss": 1.3585, + "step": 726 + }, + { + "epoch": 0.026035418196143034, + "grad_norm": 1.8354631662368774, + "learning_rate": 0.00017350835322195703, + "loss": 1.6254, + "step": 727 + }, + { + "epoch": 0.02607123032571132, + "grad_norm": 2.6769959926605225, + "learning_rate": 0.00017374701670644393, + "loss": 1.7456, + "step": 728 + }, + { + "epoch": 0.026107042455279603, + "grad_norm": 2.4423978328704834, + "learning_rate": 0.0001739856801909308, + "loss": 1.6399, + "step": 729 + }, + { + "epoch": 0.02614285458484789, + "grad_norm": 2.5522501468658447, + "learning_rate": 0.00017422434367541766, + "loss": 1.7678, + "step": 730 + }, + { + "epoch": 0.026178666714416172, + "grad_norm": 2.196291446685791, + "learning_rate": 0.00017446300715990453, + "loss": 1.4841, + "step": 731 + }, + { + "epoch": 0.026214478843984458, + "grad_norm": 1.977138876914978, + "learning_rate": 0.00017470167064439143, + "loss": 1.2913, + "step": 732 + }, + { + "epoch": 0.02625029097355274, + "grad_norm": 1.4526777267456055, + "learning_rate": 0.0001749403341288783, + "loss": 1.4228, + "step": 733 + }, + { + "epoch": 0.026286103103121027, + "grad_norm": 2.256680965423584, + "learning_rate": 0.00017517899761336516, + "loss": 1.6329, + "step": 734 + }, + { + "epoch": 0.026321915232689314, + "grad_norm": 2.406615972518921, + "learning_rate": 0.00017541766109785203, + "loss": 1.4376, + "step": 735 + }, + { + "epoch": 0.026357727362257596, + "grad_norm": 1.6776665449142456, + "learning_rate": 0.00017565632458233893, + "loss": 1.6527, + "step": 736 + }, + { + "epoch": 0.026393539491825883, + "grad_norm": 1.9995125532150269, + "learning_rate": 0.0001758949880668258, + "loss": 1.4871, + "step": 737 + }, + { + "epoch": 0.026429351621394166, + "grad_norm": 1.4872878789901733, + "learning_rate": 0.00017613365155131266, + "loss": 1.4856, + "step": 738 + }, + { + "epoch": 0.026465163750962452, + "grad_norm": 2.111163854598999, + "learning_rate": 0.00017637231503579953, + "loss": 1.3824, + "step": 739 + }, + { + "epoch": 0.026500975880530735, + "grad_norm": 1.478540062904358, + "learning_rate": 0.0001766109785202864, + "loss": 1.4685, + "step": 740 + }, + { + "epoch": 0.02653678801009902, + "grad_norm": 2.021808385848999, + "learning_rate": 0.00017684964200477326, + "loss": 1.485, + "step": 741 + }, + { + "epoch": 0.026572600139667304, + "grad_norm": 1.5773123502731323, + "learning_rate": 0.00017708830548926013, + "loss": 1.5509, + "step": 742 + }, + { + "epoch": 0.02660841226923559, + "grad_norm": 1.6470612287521362, + "learning_rate": 0.00017732696897374703, + "loss": 1.4433, + "step": 743 + }, + { + "epoch": 0.026644224398803876, + "grad_norm": 2.136145830154419, + "learning_rate": 0.0001775656324582339, + "loss": 1.3873, + "step": 744 + }, + { + "epoch": 0.02668003652837216, + "grad_norm": 1.6097244024276733, + "learning_rate": 0.00017780429594272076, + "loss": 1.3893, + "step": 745 + }, + { + "epoch": 0.026715848657940446, + "grad_norm": 1.758933424949646, + "learning_rate": 0.00017804295942720766, + "loss": 1.3918, + "step": 746 + }, + { + "epoch": 0.02675166078750873, + "grad_norm": 2.294854164123535, + "learning_rate": 0.00017828162291169453, + "loss": 1.4343, + "step": 747 + }, + { + "epoch": 0.026787472917077015, + "grad_norm": 1.7386239767074585, + "learning_rate": 0.0001785202863961814, + "loss": 1.6053, + "step": 748 + }, + { + "epoch": 0.026823285046645298, + "grad_norm": 2.6452503204345703, + "learning_rate": 0.00017875894988066826, + "loss": 1.6661, + "step": 749 + }, + { + "epoch": 0.026859097176213584, + "grad_norm": 1.4374903440475464, + "learning_rate": 0.00017899761336515516, + "loss": 1.3552, + "step": 750 + }, + { + "epoch": 0.026894909305781867, + "grad_norm": 1.7318204641342163, + "learning_rate": 0.00017923627684964202, + "loss": 1.5204, + "step": 751 + }, + { + "epoch": 0.026930721435350153, + "grad_norm": 2.0017101764678955, + "learning_rate": 0.0001794749403341289, + "loss": 1.7253, + "step": 752 + }, + { + "epoch": 0.02696653356491844, + "grad_norm": 2.1962034702301025, + "learning_rate": 0.00017971360381861576, + "loss": 1.5905, + "step": 753 + }, + { + "epoch": 0.027002345694486722, + "grad_norm": 1.7802438735961914, + "learning_rate": 0.00017995226730310263, + "loss": 1.4616, + "step": 754 + }, + { + "epoch": 0.02703815782405501, + "grad_norm": 1.5497382879257202, + "learning_rate": 0.0001801909307875895, + "loss": 1.5067, + "step": 755 + }, + { + "epoch": 0.02707396995362329, + "grad_norm": 2.2650210857391357, + "learning_rate": 0.00018042959427207636, + "loss": 1.572, + "step": 756 + }, + { + "epoch": 0.027109782083191578, + "grad_norm": 1.8235764503479004, + "learning_rate": 0.00018066825775656326, + "loss": 1.6846, + "step": 757 + }, + { + "epoch": 0.02714559421275986, + "grad_norm": 2.3579845428466797, + "learning_rate": 0.00018090692124105013, + "loss": 1.6, + "step": 758 + }, + { + "epoch": 0.027181406342328147, + "grad_norm": 1.5049972534179688, + "learning_rate": 0.000181145584725537, + "loss": 1.4298, + "step": 759 + }, + { + "epoch": 0.027217218471896433, + "grad_norm": 1.9720600843429565, + "learning_rate": 0.00018138424821002386, + "loss": 1.5995, + "step": 760 + }, + { + "epoch": 0.027253030601464716, + "grad_norm": 1.2860896587371826, + "learning_rate": 0.00018162291169451076, + "loss": 1.4422, + "step": 761 + }, + { + "epoch": 0.027288842731033002, + "grad_norm": 1.852426528930664, + "learning_rate": 0.00018186157517899762, + "loss": 1.7004, + "step": 762 + }, + { + "epoch": 0.027324654860601285, + "grad_norm": 1.7979011535644531, + "learning_rate": 0.0001821002386634845, + "loss": 1.4845, + "step": 763 + }, + { + "epoch": 0.02736046699016957, + "grad_norm": 1.5481610298156738, + "learning_rate": 0.00018233890214797139, + "loss": 1.4179, + "step": 764 + }, + { + "epoch": 0.027396279119737854, + "grad_norm": 2.205472946166992, + "learning_rate": 0.00018257756563245825, + "loss": 1.4621, + "step": 765 + }, + { + "epoch": 0.02743209124930614, + "grad_norm": 1.9159303903579712, + "learning_rate": 0.00018281622911694512, + "loss": 1.4516, + "step": 766 + }, + { + "epoch": 0.027467903378874423, + "grad_norm": 1.6566940546035767, + "learning_rate": 0.000183054892601432, + "loss": 1.4411, + "step": 767 + }, + { + "epoch": 0.02750371550844271, + "grad_norm": 1.3315765857696533, + "learning_rate": 0.00018329355608591888, + "loss": 1.6464, + "step": 768 + }, + { + "epoch": 0.027539527638010996, + "grad_norm": 1.7418568134307861, + "learning_rate": 0.00018353221957040575, + "loss": 1.5169, + "step": 769 + }, + { + "epoch": 0.02757533976757928, + "grad_norm": 1.4596407413482666, + "learning_rate": 0.0001837708830548926, + "loss": 1.3066, + "step": 770 + }, + { + "epoch": 0.027611151897147565, + "grad_norm": 1.5729186534881592, + "learning_rate": 0.00018400954653937946, + "loss": 1.4194, + "step": 771 + }, + { + "epoch": 0.027646964026715848, + "grad_norm": 1.3924620151519775, + "learning_rate": 0.00018424821002386636, + "loss": 1.3641, + "step": 772 + }, + { + "epoch": 0.027682776156284134, + "grad_norm": 2.2335948944091797, + "learning_rate": 0.00018448687350835322, + "loss": 1.5088, + "step": 773 + }, + { + "epoch": 0.027718588285852417, + "grad_norm": 1.5323538780212402, + "learning_rate": 0.0001847255369928401, + "loss": 1.5291, + "step": 774 + }, + { + "epoch": 0.027754400415420703, + "grad_norm": 1.83860445022583, + "learning_rate": 0.00018496420047732699, + "loss": 1.7284, + "step": 775 + }, + { + "epoch": 0.027790212544988986, + "grad_norm": 1.4397697448730469, + "learning_rate": 0.00018520286396181385, + "loss": 1.4679, + "step": 776 + }, + { + "epoch": 0.027826024674557272, + "grad_norm": 1.4892014265060425, + "learning_rate": 0.00018544152744630072, + "loss": 1.4999, + "step": 777 + }, + { + "epoch": 0.02786183680412556, + "grad_norm": 1.835747480392456, + "learning_rate": 0.0001856801909307876, + "loss": 1.4269, + "step": 778 + }, + { + "epoch": 0.02789764893369384, + "grad_norm": 2.1149823665618896, + "learning_rate": 0.00018591885441527448, + "loss": 1.604, + "step": 779 + }, + { + "epoch": 0.027933461063262128, + "grad_norm": 1.5769168138504028, + "learning_rate": 0.00018615751789976135, + "loss": 1.4824, + "step": 780 + }, + { + "epoch": 0.02796927319283041, + "grad_norm": 1.957750916481018, + "learning_rate": 0.00018639618138424822, + "loss": 1.5468, + "step": 781 + }, + { + "epoch": 0.028005085322398697, + "grad_norm": 2.7326269149780273, + "learning_rate": 0.0001866348448687351, + "loss": 1.4224, + "step": 782 + }, + { + "epoch": 0.02804089745196698, + "grad_norm": 1.4467041492462158, + "learning_rate": 0.00018687350835322198, + "loss": 1.4311, + "step": 783 + }, + { + "epoch": 0.028076709581535266, + "grad_norm": 2.53226637840271, + "learning_rate": 0.00018711217183770885, + "loss": 1.4572, + "step": 784 + }, + { + "epoch": 0.028112521711103552, + "grad_norm": 1.5374526977539062, + "learning_rate": 0.00018735083532219572, + "loss": 1.6605, + "step": 785 + }, + { + "epoch": 0.028148333840671835, + "grad_norm": 1.7808685302734375, + "learning_rate": 0.00018758949880668259, + "loss": 1.5757, + "step": 786 + }, + { + "epoch": 0.02818414597024012, + "grad_norm": 2.249417543411255, + "learning_rate": 0.00018782816229116945, + "loss": 1.754, + "step": 787 + }, + { + "epoch": 0.028219958099808404, + "grad_norm": 2.189558744430542, + "learning_rate": 0.00018806682577565632, + "loss": 1.4268, + "step": 788 + }, + { + "epoch": 0.02825577022937669, + "grad_norm": 1.5054012537002563, + "learning_rate": 0.0001883054892601432, + "loss": 1.4221, + "step": 789 + }, + { + "epoch": 0.028291582358944974, + "grad_norm": 1.8405632972717285, + "learning_rate": 0.00018854415274463008, + "loss": 1.4885, + "step": 790 + }, + { + "epoch": 0.02832739448851326, + "grad_norm": 1.692240834236145, + "learning_rate": 0.00018878281622911695, + "loss": 1.5223, + "step": 791 + }, + { + "epoch": 0.028363206618081543, + "grad_norm": 2.1384613513946533, + "learning_rate": 0.00018902147971360382, + "loss": 1.6371, + "step": 792 + }, + { + "epoch": 0.02839901874764983, + "grad_norm": 1.4599164724349976, + "learning_rate": 0.00018926014319809071, + "loss": 1.391, + "step": 793 + }, + { + "epoch": 0.028434830877218115, + "grad_norm": 1.59282648563385, + "learning_rate": 0.00018949880668257758, + "loss": 1.5908, + "step": 794 + }, + { + "epoch": 0.028470643006786398, + "grad_norm": 1.3749408721923828, + "learning_rate": 0.00018973747016706445, + "loss": 1.5841, + "step": 795 + }, + { + "epoch": 0.028506455136354684, + "grad_norm": 1.9376039505004883, + "learning_rate": 0.00018997613365155132, + "loss": 1.4579, + "step": 796 + }, + { + "epoch": 0.028542267265922967, + "grad_norm": 1.7510720491409302, + "learning_rate": 0.0001902147971360382, + "loss": 1.537, + "step": 797 + }, + { + "epoch": 0.028578079395491254, + "grad_norm": 2.0348446369171143, + "learning_rate": 0.00019045346062052508, + "loss": 1.3949, + "step": 798 + }, + { + "epoch": 0.028613891525059536, + "grad_norm": 2.175197124481201, + "learning_rate": 0.00019069212410501195, + "loss": 1.6778, + "step": 799 + }, + { + "epoch": 0.028649703654627823, + "grad_norm": 2.048635482788086, + "learning_rate": 0.00019093078758949882, + "loss": 1.4223, + "step": 800 + }, + { + "epoch": 0.028685515784196106, + "grad_norm": 2.723456382751465, + "learning_rate": 0.00019116945107398568, + "loss": 1.4424, + "step": 801 + }, + { + "epoch": 0.028721327913764392, + "grad_norm": 1.29623544216156, + "learning_rate": 0.00019140811455847255, + "loss": 1.3854, + "step": 802 + }, + { + "epoch": 0.028757140043332678, + "grad_norm": 1.4079904556274414, + "learning_rate": 0.00019164677804295942, + "loss": 1.2864, + "step": 803 + }, + { + "epoch": 0.02879295217290096, + "grad_norm": 1.7057297229766846, + "learning_rate": 0.00019188544152744631, + "loss": 1.7421, + "step": 804 + }, + { + "epoch": 0.028828764302469247, + "grad_norm": 1.7163022756576538, + "learning_rate": 0.00019212410501193318, + "loss": 1.4201, + "step": 805 + }, + { + "epoch": 0.02886457643203753, + "grad_norm": 2.2443811893463135, + "learning_rate": 0.00019236276849642005, + "loss": 1.6298, + "step": 806 + }, + { + "epoch": 0.028900388561605816, + "grad_norm": 1.7485485076904297, + "learning_rate": 0.00019260143198090692, + "loss": 1.541, + "step": 807 + }, + { + "epoch": 0.0289362006911741, + "grad_norm": 2.3779067993164062, + "learning_rate": 0.0001928400954653938, + "loss": 1.5413, + "step": 808 + }, + { + "epoch": 0.028972012820742386, + "grad_norm": 2.3681983947753906, + "learning_rate": 0.00019307875894988068, + "loss": 1.7147, + "step": 809 + }, + { + "epoch": 0.029007824950310672, + "grad_norm": 1.521950602531433, + "learning_rate": 0.00019331742243436755, + "loss": 1.3338, + "step": 810 + }, + { + "epoch": 0.029043637079878955, + "grad_norm": 3.0837557315826416, + "learning_rate": 0.00019355608591885444, + "loss": 1.4374, + "step": 811 + }, + { + "epoch": 0.02907944920944724, + "grad_norm": 2.050992965698242, + "learning_rate": 0.0001937947494033413, + "loss": 1.3627, + "step": 812 + }, + { + "epoch": 0.029115261339015524, + "grad_norm": 1.5406626462936401, + "learning_rate": 0.00019403341288782818, + "loss": 1.5549, + "step": 813 + }, + { + "epoch": 0.02915107346858381, + "grad_norm": 1.5034905672073364, + "learning_rate": 0.00019427207637231505, + "loss": 1.3835, + "step": 814 + }, + { + "epoch": 0.029186885598152093, + "grad_norm": 1.7369670867919922, + "learning_rate": 0.00019451073985680191, + "loss": 1.5491, + "step": 815 + }, + { + "epoch": 0.02922269772772038, + "grad_norm": 1.7397220134735107, + "learning_rate": 0.00019474940334128878, + "loss": 1.4288, + "step": 816 + }, + { + "epoch": 0.029258509857288662, + "grad_norm": 1.4150187969207764, + "learning_rate": 0.00019498806682577565, + "loss": 1.3875, + "step": 817 + }, + { + "epoch": 0.02929432198685695, + "grad_norm": 1.6811386346817017, + "learning_rate": 0.00019522673031026252, + "loss": 1.601, + "step": 818 + }, + { + "epoch": 0.029330134116425235, + "grad_norm": 2.223656177520752, + "learning_rate": 0.0001954653937947494, + "loss": 1.6315, + "step": 819 + }, + { + "epoch": 0.029365946245993518, + "grad_norm": 1.4075154066085815, + "learning_rate": 0.00019570405727923628, + "loss": 1.4508, + "step": 820 + }, + { + "epoch": 0.029401758375561804, + "grad_norm": 2.015578031539917, + "learning_rate": 0.00019594272076372315, + "loss": 1.6791, + "step": 821 + }, + { + "epoch": 0.029437570505130087, + "grad_norm": 2.8869681358337402, + "learning_rate": 0.00019618138424821004, + "loss": 1.7354, + "step": 822 + }, + { + "epoch": 0.029473382634698373, + "grad_norm": 1.5231835842132568, + "learning_rate": 0.0001964200477326969, + "loss": 1.3458, + "step": 823 + }, + { + "epoch": 0.029509194764266656, + "grad_norm": 1.5858217477798462, + "learning_rate": 0.00019665871121718378, + "loss": 1.2866, + "step": 824 + }, + { + "epoch": 0.029545006893834942, + "grad_norm": 2.5863804817199707, + "learning_rate": 0.00019689737470167065, + "loss": 1.5297, + "step": 825 + }, + { + "epoch": 0.029580819023403225, + "grad_norm": 1.8091540336608887, + "learning_rate": 0.00019713603818615754, + "loss": 1.4346, + "step": 826 + }, + { + "epoch": 0.02961663115297151, + "grad_norm": 2.9086296558380127, + "learning_rate": 0.0001973747016706444, + "loss": 1.5333, + "step": 827 + }, + { + "epoch": 0.029652443282539798, + "grad_norm": 1.8545640707015991, + "learning_rate": 0.00019761336515513128, + "loss": 1.4658, + "step": 828 + }, + { + "epoch": 0.02968825541210808, + "grad_norm": 2.5969512462615967, + "learning_rate": 0.00019785202863961817, + "loss": 1.543, + "step": 829 + }, + { + "epoch": 0.029724067541676367, + "grad_norm": 1.6143041849136353, + "learning_rate": 0.000198090692124105, + "loss": 1.4382, + "step": 830 + }, + { + "epoch": 0.02975987967124465, + "grad_norm": 1.9123790264129639, + "learning_rate": 0.00019832935560859188, + "loss": 1.5413, + "step": 831 + }, + { + "epoch": 0.029795691800812936, + "grad_norm": 1.4040930271148682, + "learning_rate": 0.00019856801909307875, + "loss": 1.4746, + "step": 832 + }, + { + "epoch": 0.02983150393038122, + "grad_norm": 1.5856297016143799, + "learning_rate": 0.00019880668257756564, + "loss": 1.483, + "step": 833 + }, + { + "epoch": 0.029867316059949505, + "grad_norm": 1.5469998121261597, + "learning_rate": 0.0001990453460620525, + "loss": 1.4081, + "step": 834 + }, + { + "epoch": 0.02990312818951779, + "grad_norm": 2.020606756210327, + "learning_rate": 0.00019928400954653938, + "loss": 1.4982, + "step": 835 + }, + { + "epoch": 0.029938940319086074, + "grad_norm": 1.9376537799835205, + "learning_rate": 0.00019952267303102625, + "loss": 1.4423, + "step": 836 + }, + { + "epoch": 0.02997475244865436, + "grad_norm": 1.8903049230575562, + "learning_rate": 0.00019976133651551314, + "loss": 1.7031, + "step": 837 + }, + { + "epoch": 0.030010564578222643, + "grad_norm": 2.1008646488189697, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 838 + }, + { + "epoch": 0.03004637670779093, + "grad_norm": 1.3041636943817139, + "learning_rate": 0.0001999999993273145, + "loss": 1.3056, + "step": 839 + }, + { + "epoch": 0.030082188837359213, + "grad_norm": 1.4034329652786255, + "learning_rate": 0.000199999997309258, + "loss": 1.5072, + "step": 840 + }, + { + "epoch": 0.0301180009669275, + "grad_norm": 1.6098731756210327, + "learning_rate": 0.00019999999394583053, + "loss": 1.6392, + "step": 841 + }, + { + "epoch": 0.03015381309649578, + "grad_norm": 1.7606528997421265, + "learning_rate": 0.00019999998923703213, + "loss": 1.4203, + "step": 842 + }, + { + "epoch": 0.030189625226064068, + "grad_norm": 1.714280128479004, + "learning_rate": 0.00019999998318286286, + "loss": 1.4908, + "step": 843 + }, + { + "epoch": 0.030225437355632354, + "grad_norm": 3.193429708480835, + "learning_rate": 0.0001999999757833228, + "loss": 1.5114, + "step": 844 + }, + { + "epoch": 0.030261249485200637, + "grad_norm": 2.2431693077087402, + "learning_rate": 0.00019999996703841207, + "loss": 1.4047, + "step": 845 + }, + { + "epoch": 0.030297061614768923, + "grad_norm": 1.7106622457504272, + "learning_rate": 0.00019999995694813073, + "loss": 1.3734, + "step": 846 + }, + { + "epoch": 0.030332873744337206, + "grad_norm": 1.4019434452056885, + "learning_rate": 0.00019999994551247901, + "loss": 1.3089, + "step": 847 + }, + { + "epoch": 0.030368685873905493, + "grad_norm": 1.4563263654708862, + "learning_rate": 0.000199999932731457, + "loss": 1.691, + "step": 848 + }, + { + "epoch": 0.030404498003473775, + "grad_norm": 1.398736834526062, + "learning_rate": 0.00019999991860506492, + "loss": 1.4807, + "step": 849 + }, + { + "epoch": 0.03044031013304206, + "grad_norm": 1.3401799201965332, + "learning_rate": 0.00019999990313330286, + "loss": 1.4868, + "step": 850 + }, + { + "epoch": 0.030476122262610345, + "grad_norm": 1.781977891921997, + "learning_rate": 0.00019999988631617114, + "loss": 1.1983, + "step": 851 + }, + { + "epoch": 0.03051193439217863, + "grad_norm": 1.517777919769287, + "learning_rate": 0.00019999986815366993, + "loss": 1.4623, + "step": 852 + }, + { + "epoch": 0.030547746521746917, + "grad_norm": 1.4744325876235962, + "learning_rate": 0.0001999998486457995, + "loss": 1.5216, + "step": 853 + }, + { + "epoch": 0.0305835586513152, + "grad_norm": 1.4800273180007935, + "learning_rate": 0.00019999982779256005, + "loss": 1.367, + "step": 854 + }, + { + "epoch": 0.030619370780883486, + "grad_norm": 1.7968140840530396, + "learning_rate": 0.00019999980559395195, + "loss": 1.5347, + "step": 855 + }, + { + "epoch": 0.03065518291045177, + "grad_norm": 1.3774813413619995, + "learning_rate": 0.00019999978204997545, + "loss": 1.4071, + "step": 856 + }, + { + "epoch": 0.030690995040020055, + "grad_norm": 1.8599704504013062, + "learning_rate": 0.00019999975716063087, + "loss": 1.5372, + "step": 857 + }, + { + "epoch": 0.030726807169588338, + "grad_norm": 1.9149694442749023, + "learning_rate": 0.0001999997309259185, + "loss": 1.5942, + "step": 858 + }, + { + "epoch": 0.030762619299156625, + "grad_norm": 1.5339001417160034, + "learning_rate": 0.0001999997033458388, + "loss": 1.5526, + "step": 859 + }, + { + "epoch": 0.03079843142872491, + "grad_norm": 1.4732356071472168, + "learning_rate": 0.00019999967442039206, + "loss": 1.4618, + "step": 860 + }, + { + "epoch": 0.030834243558293194, + "grad_norm": 1.3672585487365723, + "learning_rate": 0.0001999996441495787, + "loss": 1.2246, + "step": 861 + }, + { + "epoch": 0.03087005568786148, + "grad_norm": 1.619768500328064, + "learning_rate": 0.0001999996125333991, + "loss": 1.5323, + "step": 862 + }, + { + "epoch": 0.030905867817429763, + "grad_norm": 1.3358949422836304, + "learning_rate": 0.00019999957957185375, + "loss": 1.4442, + "step": 863 + }, + { + "epoch": 0.03094167994699805, + "grad_norm": 1.9378008842468262, + "learning_rate": 0.000199999545264943, + "loss": 1.6725, + "step": 864 + }, + { + "epoch": 0.030977492076566332, + "grad_norm": 1.379014253616333, + "learning_rate": 0.00019999950961266738, + "loss": 1.3987, + "step": 865 + }, + { + "epoch": 0.03101330420613462, + "grad_norm": 1.4479185342788696, + "learning_rate": 0.00019999947261502735, + "loss": 1.3602, + "step": 866 + }, + { + "epoch": 0.0310491163357029, + "grad_norm": 1.6457114219665527, + "learning_rate": 0.0001999994342720234, + "loss": 1.3278, + "step": 867 + }, + { + "epoch": 0.031084928465271187, + "grad_norm": 1.5082173347473145, + "learning_rate": 0.00019999939458365605, + "loss": 1.2639, + "step": 868 + }, + { + "epoch": 0.031120740594839474, + "grad_norm": 1.6259551048278809, + "learning_rate": 0.00019999935354992582, + "loss": 1.4369, + "step": 869 + }, + { + "epoch": 0.031156552724407757, + "grad_norm": 2.3184916973114014, + "learning_rate": 0.0001999993111708333, + "loss": 1.6269, + "step": 870 + }, + { + "epoch": 0.031192364853976043, + "grad_norm": 1.5446441173553467, + "learning_rate": 0.00019999926744637903, + "loss": 1.4667, + "step": 871 + }, + { + "epoch": 0.031228176983544326, + "grad_norm": 1.7728568315505981, + "learning_rate": 0.0001999992223765636, + "loss": 1.4731, + "step": 872 + }, + { + "epoch": 0.03126398911311261, + "grad_norm": 2.0149970054626465, + "learning_rate": 0.00019999917596138765, + "loss": 1.713, + "step": 873 + }, + { + "epoch": 0.0312998012426809, + "grad_norm": 3.3689420223236084, + "learning_rate": 0.00019999912820085176, + "loss": 1.7099, + "step": 874 + }, + { + "epoch": 0.03133561337224918, + "grad_norm": 1.8272250890731812, + "learning_rate": 0.0001999990790949566, + "loss": 1.4747, + "step": 875 + }, + { + "epoch": 0.031371425501817464, + "grad_norm": 1.5982478857040405, + "learning_rate": 0.0001999990286437028, + "loss": 1.5219, + "step": 876 + }, + { + "epoch": 0.031407237631385754, + "grad_norm": 1.3071531057357788, + "learning_rate": 0.00019999897684709104, + "loss": 1.4441, + "step": 877 + }, + { + "epoch": 0.03144304976095404, + "grad_norm": 2.705305337905884, + "learning_rate": 0.00019999892370512208, + "loss": 1.6229, + "step": 878 + }, + { + "epoch": 0.03147886189052232, + "grad_norm": 1.943268895149231, + "learning_rate": 0.00019999886921779657, + "loss": 1.5105, + "step": 879 + }, + { + "epoch": 0.0315146740200906, + "grad_norm": 1.4625228643417358, + "learning_rate": 0.00019999881338511526, + "loss": 1.3756, + "step": 880 + }, + { + "epoch": 0.03155048614965889, + "grad_norm": 1.4387773275375366, + "learning_rate": 0.0001999987562070789, + "loss": 1.4314, + "step": 881 + }, + { + "epoch": 0.031586298279227175, + "grad_norm": 1.3481978178024292, + "learning_rate": 0.00019999869768368828, + "loss": 1.3846, + "step": 882 + }, + { + "epoch": 0.03162211040879546, + "grad_norm": 1.4743783473968506, + "learning_rate": 0.0001999986378149442, + "loss": 1.5548, + "step": 883 + }, + { + "epoch": 0.03165792253836374, + "grad_norm": 2.110506772994995, + "learning_rate": 0.00019999857660084737, + "loss": 1.3561, + "step": 884 + }, + { + "epoch": 0.03169373466793203, + "grad_norm": 1.566511631011963, + "learning_rate": 0.00019999851404139873, + "loss": 1.4323, + "step": 885 + }, + { + "epoch": 0.03172954679750031, + "grad_norm": 2.6302683353424072, + "learning_rate": 0.00019999845013659906, + "loss": 1.4866, + "step": 886 + }, + { + "epoch": 0.031765358927068596, + "grad_norm": 1.4151936769485474, + "learning_rate": 0.00019999838488644924, + "loss": 1.4898, + "step": 887 + }, + { + "epoch": 0.031801171056636886, + "grad_norm": 1.5072022676467896, + "learning_rate": 0.00019999831829095013, + "loss": 1.6192, + "step": 888 + }, + { + "epoch": 0.03183698318620517, + "grad_norm": 1.7588951587677002, + "learning_rate": 0.00019999825035010263, + "loss": 1.4268, + "step": 889 + }, + { + "epoch": 0.03187279531577345, + "grad_norm": 2.475877523422241, + "learning_rate": 0.00019999818106390766, + "loss": 1.4271, + "step": 890 + }, + { + "epoch": 0.031908607445341734, + "grad_norm": 1.8262286186218262, + "learning_rate": 0.0001999981104323662, + "loss": 1.4576, + "step": 891 + }, + { + "epoch": 0.031944419574910024, + "grad_norm": 1.3965345621109009, + "learning_rate": 0.00019999803845547907, + "loss": 1.3823, + "step": 892 + }, + { + "epoch": 0.03198023170447831, + "grad_norm": 1.487810730934143, + "learning_rate": 0.00019999796513324735, + "loss": 1.4678, + "step": 893 + }, + { + "epoch": 0.03201604383404659, + "grad_norm": 1.7437043190002441, + "learning_rate": 0.00019999789046567203, + "loss": 1.4166, + "step": 894 + }, + { + "epoch": 0.03205185596361488, + "grad_norm": 2.20723032951355, + "learning_rate": 0.00019999781445275406, + "loss": 1.3949, + "step": 895 + }, + { + "epoch": 0.03208766809318316, + "grad_norm": 1.2976787090301514, + "learning_rate": 0.0001999977370944945, + "loss": 1.8005, + "step": 896 + }, + { + "epoch": 0.032123480222751445, + "grad_norm": 1.988075613975525, + "learning_rate": 0.00019999765839089434, + "loss": 1.2636, + "step": 897 + }, + { + "epoch": 0.03215929235231973, + "grad_norm": 2.2271056175231934, + "learning_rate": 0.00019999757834195472, + "loss": 1.502, + "step": 898 + }, + { + "epoch": 0.03219510448188802, + "grad_norm": 1.4957082271575928, + "learning_rate": 0.00019999749694767666, + "loss": 1.297, + "step": 899 + }, + { + "epoch": 0.0322309166114563, + "grad_norm": 2.0688042640686035, + "learning_rate": 0.0001999974142080612, + "loss": 1.7097, + "step": 900 + }, + { + "epoch": 0.03226672874102458, + "grad_norm": 1.6941030025482178, + "learning_rate": 0.00019999733012310958, + "loss": 1.5371, + "step": 901 + }, + { + "epoch": 0.03230254087059287, + "grad_norm": 1.7772458791732788, + "learning_rate": 0.00019999724469282288, + "loss": 1.4186, + "step": 902 + }, + { + "epoch": 0.032338353000161156, + "grad_norm": 2.323482036590576, + "learning_rate": 0.00019999715791720223, + "loss": 1.4788, + "step": 903 + }, + { + "epoch": 0.03237416512972944, + "grad_norm": 1.852022647857666, + "learning_rate": 0.00019999706979624877, + "loss": 1.515, + "step": 904 + }, + { + "epoch": 0.03240997725929772, + "grad_norm": 2.5033960342407227, + "learning_rate": 0.00019999698032996377, + "loss": 1.6154, + "step": 905 + }, + { + "epoch": 0.03244578938886601, + "grad_norm": 2.098865032196045, + "learning_rate": 0.00019999688951834836, + "loss": 1.3432, + "step": 906 + }, + { + "epoch": 0.032481601518434294, + "grad_norm": 2.3131368160247803, + "learning_rate": 0.0001999967973614038, + "loss": 1.4624, + "step": 907 + }, + { + "epoch": 0.03251741364800258, + "grad_norm": 2.036224603652954, + "learning_rate": 0.00019999670385913133, + "loss": 1.627, + "step": 908 + }, + { + "epoch": 0.03255322577757086, + "grad_norm": 1.6072630882263184, + "learning_rate": 0.00019999660901153218, + "loss": 1.3712, + "step": 909 + }, + { + "epoch": 0.03258903790713915, + "grad_norm": 1.4021480083465576, + "learning_rate": 0.00019999651281860762, + "loss": 1.4597, + "step": 910 + }, + { + "epoch": 0.03262485003670743, + "grad_norm": 1.5628105401992798, + "learning_rate": 0.00019999641528035898, + "loss": 1.5476, + "step": 911 + }, + { + "epoch": 0.032660662166275715, + "grad_norm": 1.9513826370239258, + "learning_rate": 0.0001999963163967876, + "loss": 1.7792, + "step": 912 + }, + { + "epoch": 0.032696474295844005, + "grad_norm": 1.3145549297332764, + "learning_rate": 0.00019999621616789473, + "loss": 1.4328, + "step": 913 + }, + { + "epoch": 0.03273228642541229, + "grad_norm": 1.357667326927185, + "learning_rate": 0.00019999611459368174, + "loss": 1.3749, + "step": 914 + }, + { + "epoch": 0.03276809855498057, + "grad_norm": 1.1454778909683228, + "learning_rate": 0.00019999601167415006, + "loss": 1.4278, + "step": 915 + }, + { + "epoch": 0.032803910684548854, + "grad_norm": 2.274576425552368, + "learning_rate": 0.000199995907409301, + "loss": 1.7974, + "step": 916 + }, + { + "epoch": 0.032839722814117144, + "grad_norm": 1.5799471139907837, + "learning_rate": 0.000199995801799136, + "loss": 1.502, + "step": 917 + }, + { + "epoch": 0.032875534943685426, + "grad_norm": 1.8833181858062744, + "learning_rate": 0.00019999569484365645, + "loss": 1.4466, + "step": 918 + }, + { + "epoch": 0.03291134707325371, + "grad_norm": 1.5610058307647705, + "learning_rate": 0.00019999558654286385, + "loss": 1.2844, + "step": 919 + }, + { + "epoch": 0.032947159202822, + "grad_norm": 3.0029516220092773, + "learning_rate": 0.0001999954768967596, + "loss": 1.6285, + "step": 920 + }, + { + "epoch": 0.03298297133239028, + "grad_norm": 2.5965607166290283, + "learning_rate": 0.0001999953659053452, + "loss": 1.4753, + "step": 921 + }, + { + "epoch": 0.033018783461958565, + "grad_norm": 1.5474822521209717, + "learning_rate": 0.0001999952535686221, + "loss": 1.5764, + "step": 922 + }, + { + "epoch": 0.03305459559152685, + "grad_norm": 1.2778196334838867, + "learning_rate": 0.00019999513988659188, + "loss": 1.2869, + "step": 923 + }, + { + "epoch": 0.03309040772109514, + "grad_norm": 2.07330584526062, + "learning_rate": 0.00019999502485925605, + "loss": 1.7764, + "step": 924 + }, + { + "epoch": 0.03312621985066342, + "grad_norm": 1.5719765424728394, + "learning_rate": 0.00019999490848661612, + "loss": 1.2576, + "step": 925 + }, + { + "epoch": 0.0331620319802317, + "grad_norm": 1.4487565755844116, + "learning_rate": 0.00019999479076867368, + "loss": 1.4508, + "step": 926 + }, + { + "epoch": 0.03319784410979999, + "grad_norm": 1.7214974164962769, + "learning_rate": 0.00019999467170543031, + "loss": 1.4211, + "step": 927 + }, + { + "epoch": 0.033233656239368276, + "grad_norm": 1.7321597337722778, + "learning_rate": 0.00019999455129688764, + "loss": 1.4859, + "step": 928 + }, + { + "epoch": 0.03326946836893656, + "grad_norm": 1.806490182876587, + "learning_rate": 0.00019999442954304729, + "loss": 1.4404, + "step": 929 + }, + { + "epoch": 0.03330528049850484, + "grad_norm": 2.1547493934631348, + "learning_rate": 0.00019999430644391082, + "loss": 1.5034, + "step": 930 + }, + { + "epoch": 0.03334109262807313, + "grad_norm": 1.5717108249664307, + "learning_rate": 0.00019999418199947994, + "loss": 1.4316, + "step": 931 + }, + { + "epoch": 0.033376904757641414, + "grad_norm": 1.6698131561279297, + "learning_rate": 0.00019999405620975636, + "loss": 1.566, + "step": 932 + }, + { + "epoch": 0.0334127168872097, + "grad_norm": 2.323439359664917, + "learning_rate": 0.00019999392907474174, + "loss": 1.5717, + "step": 933 + }, + { + "epoch": 0.03344852901677798, + "grad_norm": 1.6666685342788696, + "learning_rate": 0.00019999380059443773, + "loss": 1.4758, + "step": 934 + }, + { + "epoch": 0.03348434114634627, + "grad_norm": 1.8131999969482422, + "learning_rate": 0.00019999367076884616, + "loss": 1.3535, + "step": 935 + }, + { + "epoch": 0.03352015327591455, + "grad_norm": 1.6577718257904053, + "learning_rate": 0.00019999353959796872, + "loss": 1.4692, + "step": 936 + }, + { + "epoch": 0.033555965405482835, + "grad_norm": 1.5016111135482788, + "learning_rate": 0.0001999934070818072, + "loss": 1.495, + "step": 937 + }, + { + "epoch": 0.033591777535051125, + "grad_norm": 1.4556392431259155, + "learning_rate": 0.00019999327322036336, + "loss": 1.419, + "step": 938 + }, + { + "epoch": 0.03362758966461941, + "grad_norm": 1.8193069696426392, + "learning_rate": 0.00019999313801363902, + "loss": 1.4469, + "step": 939 + }, + { + "epoch": 0.03366340179418769, + "grad_norm": 1.6074687242507935, + "learning_rate": 0.00019999300146163597, + "loss": 1.2841, + "step": 940 + }, + { + "epoch": 0.03369921392375597, + "grad_norm": 2.2323546409606934, + "learning_rate": 0.00019999286356435608, + "loss": 1.4817, + "step": 941 + }, + { + "epoch": 0.03373502605332426, + "grad_norm": 1.984520435333252, + "learning_rate": 0.0001999927243218012, + "loss": 1.4635, + "step": 942 + }, + { + "epoch": 0.033770838182892546, + "grad_norm": 1.8351210355758667, + "learning_rate": 0.0001999925837339732, + "loss": 1.3997, + "step": 943 + }, + { + "epoch": 0.03380665031246083, + "grad_norm": 1.4362258911132812, + "learning_rate": 0.00019999244180087395, + "loss": 1.4409, + "step": 944 + }, + { + "epoch": 0.03384246244202912, + "grad_norm": 1.8241633176803589, + "learning_rate": 0.00019999229852250537, + "loss": 1.4493, + "step": 945 + }, + { + "epoch": 0.0338782745715974, + "grad_norm": 2.1113150119781494, + "learning_rate": 0.00019999215389886942, + "loss": 1.5159, + "step": 946 + }, + { + "epoch": 0.033914086701165684, + "grad_norm": 1.58767569065094, + "learning_rate": 0.000199992007929968, + "loss": 1.4788, + "step": 947 + }, + { + "epoch": 0.03394989883073397, + "grad_norm": 1.3424389362335205, + "learning_rate": 0.0001999918606158031, + "loss": 1.3983, + "step": 948 + }, + { + "epoch": 0.03398571096030226, + "grad_norm": 1.4824097156524658, + "learning_rate": 0.0001999917119563767, + "loss": 1.5126, + "step": 949 + }, + { + "epoch": 0.03402152308987054, + "grad_norm": 1.6257961988449097, + "learning_rate": 0.00019999156195169078, + "loss": 1.4896, + "step": 950 + }, + { + "epoch": 0.03405733521943882, + "grad_norm": 2.238340377807617, + "learning_rate": 0.0001999914106017474, + "loss": 1.6174, + "step": 951 + }, + { + "epoch": 0.03409314734900711, + "grad_norm": 1.798643946647644, + "learning_rate": 0.00019999125790654855, + "loss": 1.3671, + "step": 952 + }, + { + "epoch": 0.034128959478575395, + "grad_norm": 1.363549828529358, + "learning_rate": 0.0001999911038660963, + "loss": 1.4834, + "step": 953 + }, + { + "epoch": 0.03416477160814368, + "grad_norm": 1.5384442806243896, + "learning_rate": 0.00019999094848039274, + "loss": 1.3308, + "step": 954 + }, + { + "epoch": 0.03420058373771196, + "grad_norm": 1.469741702079773, + "learning_rate": 0.00019999079174943995, + "loss": 1.262, + "step": 955 + }, + { + "epoch": 0.03423639586728025, + "grad_norm": 2.0101306438446045, + "learning_rate": 0.00019999063367324003, + "loss": 1.4749, + "step": 956 + }, + { + "epoch": 0.03427220799684853, + "grad_norm": 2.279151439666748, + "learning_rate": 0.0001999904742517951, + "loss": 1.5362, + "step": 957 + }, + { + "epoch": 0.034308020126416816, + "grad_norm": 1.4416332244873047, + "learning_rate": 0.00019999031348510733, + "loss": 1.5498, + "step": 958 + }, + { + "epoch": 0.0343438322559851, + "grad_norm": 2.6101648807525635, + "learning_rate": 0.00019999015137317887, + "loss": 1.6863, + "step": 959 + }, + { + "epoch": 0.03437964438555339, + "grad_norm": 2.9115045070648193, + "learning_rate": 0.0001999899879160119, + "loss": 1.5648, + "step": 960 + }, + { + "epoch": 0.03441545651512167, + "grad_norm": 1.4219692945480347, + "learning_rate": 0.00019998982311360863, + "loss": 1.4884, + "step": 961 + }, + { + "epoch": 0.034451268644689954, + "grad_norm": 2.0500988960266113, + "learning_rate": 0.00019998965696597126, + "loss": 1.6169, + "step": 962 + }, + { + "epoch": 0.034487080774258244, + "grad_norm": 1.3502780199050903, + "learning_rate": 0.00019998948947310202, + "loss": 1.3931, + "step": 963 + }, + { + "epoch": 0.03452289290382653, + "grad_norm": 1.604957938194275, + "learning_rate": 0.0001999893206350032, + "loss": 1.4413, + "step": 964 + }, + { + "epoch": 0.03455870503339481, + "grad_norm": 1.2663164138793945, + "learning_rate": 0.00019998915045167702, + "loss": 1.2993, + "step": 965 + }, + { + "epoch": 0.03459451716296309, + "grad_norm": 1.3506345748901367, + "learning_rate": 0.0001999889789231258, + "loss": 1.641, + "step": 966 + }, + { + "epoch": 0.03463032929253138, + "grad_norm": 1.6702309846878052, + "learning_rate": 0.00019998880604935187, + "loss": 1.4805, + "step": 967 + }, + { + "epoch": 0.034666141422099665, + "grad_norm": 1.2872469425201416, + "learning_rate": 0.00019998863183035752, + "loss": 1.1865, + "step": 968 + }, + { + "epoch": 0.03470195355166795, + "grad_norm": 1.9474170207977295, + "learning_rate": 0.0001999884562661451, + "loss": 1.753, + "step": 969 + }, + { + "epoch": 0.03473776568123624, + "grad_norm": 1.7872248888015747, + "learning_rate": 0.00019998827935671697, + "loss": 1.2922, + "step": 970 + }, + { + "epoch": 0.03477357781080452, + "grad_norm": 1.3373949527740479, + "learning_rate": 0.00019998810110207553, + "loss": 1.4248, + "step": 971 + }, + { + "epoch": 0.034809389940372804, + "grad_norm": 1.5269542932510376, + "learning_rate": 0.00019998792150222316, + "loss": 1.5091, + "step": 972 + }, + { + "epoch": 0.034845202069941086, + "grad_norm": 1.6823294162750244, + "learning_rate": 0.0001999877405571623, + "loss": 1.4233, + "step": 973 + }, + { + "epoch": 0.034881014199509376, + "grad_norm": 3.096691608428955, + "learning_rate": 0.00019998755826689535, + "loss": 1.6861, + "step": 974 + }, + { + "epoch": 0.03491682632907766, + "grad_norm": 1.4043667316436768, + "learning_rate": 0.00019998737463142478, + "loss": 1.3464, + "step": 975 + }, + { + "epoch": 0.03495263845864594, + "grad_norm": 1.9794062376022339, + "learning_rate": 0.00019998718965075305, + "loss": 1.5323, + "step": 976 + }, + { + "epoch": 0.03498845058821423, + "grad_norm": 1.2814193964004517, + "learning_rate": 0.00019998700332488265, + "loss": 1.3148, + "step": 977 + }, + { + "epoch": 0.035024262717782514, + "grad_norm": 1.2218396663665771, + "learning_rate": 0.00019998681565381611, + "loss": 1.202, + "step": 978 + }, + { + "epoch": 0.0350600748473508, + "grad_norm": 1.4193840026855469, + "learning_rate": 0.00019998662663755595, + "loss": 1.3797, + "step": 979 + }, + { + "epoch": 0.03509588697691908, + "grad_norm": 1.6496566534042358, + "learning_rate": 0.00019998643627610466, + "loss": 1.4797, + "step": 980 + }, + { + "epoch": 0.03513169910648737, + "grad_norm": 1.5155152082443237, + "learning_rate": 0.00019998624456946492, + "loss": 1.3057, + "step": 981 + }, + { + "epoch": 0.03516751123605565, + "grad_norm": 1.9658448696136475, + "learning_rate": 0.00019998605151763917, + "loss": 1.4364, + "step": 982 + }, + { + "epoch": 0.035203323365623936, + "grad_norm": 1.679531455039978, + "learning_rate": 0.00019998585712063008, + "loss": 1.4085, + "step": 983 + }, + { + "epoch": 0.03523913549519222, + "grad_norm": 2.0714447498321533, + "learning_rate": 0.00019998566137844026, + "loss": 1.6612, + "step": 984 + }, + { + "epoch": 0.03527494762476051, + "grad_norm": 2.2905819416046143, + "learning_rate": 0.00019998546429107235, + "loss": 1.561, + "step": 985 + }, + { + "epoch": 0.03531075975432879, + "grad_norm": 1.8010838031768799, + "learning_rate": 0.00019998526585852898, + "loss": 1.5131, + "step": 986 + }, + { + "epoch": 0.035346571883897074, + "grad_norm": 1.9583847522735596, + "learning_rate": 0.00019998506608081282, + "loss": 1.5696, + "step": 987 + }, + { + "epoch": 0.035382384013465364, + "grad_norm": 2.4268510341644287, + "learning_rate": 0.00019998486495792657, + "loss": 1.5412, + "step": 988 + }, + { + "epoch": 0.035418196143033646, + "grad_norm": 1.588919758796692, + "learning_rate": 0.00019998466248987294, + "loss": 1.4979, + "step": 989 + }, + { + "epoch": 0.03545400827260193, + "grad_norm": 1.408884048461914, + "learning_rate": 0.00019998445867665463, + "loss": 1.5643, + "step": 990 + }, + { + "epoch": 0.03548982040217021, + "grad_norm": 1.870570182800293, + "learning_rate": 0.0001999842535182744, + "loss": 1.5466, + "step": 991 + }, + { + "epoch": 0.0355256325317385, + "grad_norm": 1.5196517705917358, + "learning_rate": 0.00019998404701473504, + "loss": 1.5673, + "step": 992 + }, + { + "epoch": 0.035561444661306785, + "grad_norm": 1.3067079782485962, + "learning_rate": 0.00019998383916603927, + "loss": 1.4743, + "step": 993 + }, + { + "epoch": 0.03559725679087507, + "grad_norm": 2.165579080581665, + "learning_rate": 0.00019998362997218993, + "loss": 1.6714, + "step": 994 + }, + { + "epoch": 0.03563306892044336, + "grad_norm": 1.5383042097091675, + "learning_rate": 0.0001999834194331898, + "loss": 1.5408, + "step": 995 + }, + { + "epoch": 0.03566888105001164, + "grad_norm": 2.973935604095459, + "learning_rate": 0.00019998320754904177, + "loss": 1.9292, + "step": 996 + }, + { + "epoch": 0.03570469317957992, + "grad_norm": 1.762618899345398, + "learning_rate": 0.0001999829943197486, + "loss": 1.4326, + "step": 997 + }, + { + "epoch": 0.035740505309148206, + "grad_norm": 1.547913670539856, + "learning_rate": 0.00019998277974531326, + "loss": 1.4737, + "step": 998 + }, + { + "epoch": 0.035776317438716496, + "grad_norm": 1.5939230918884277, + "learning_rate": 0.00019998256382573856, + "loss": 1.6581, + "step": 999 + }, + { + "epoch": 0.03581212956828478, + "grad_norm": 1.6708388328552246, + "learning_rate": 0.0001999823465610274, + "loss": 1.6453, + "step": 1000 + }, + { + "epoch": 0.03584794169785306, + "grad_norm": 1.6699614524841309, + "learning_rate": 0.0001999821279511828, + "loss": 1.3845, + "step": 1001 + }, + { + "epoch": 0.03588375382742135, + "grad_norm": 1.9064433574676514, + "learning_rate": 0.0001999819079962076, + "loss": 1.3685, + "step": 1002 + }, + { + "epoch": 0.035919565956989634, + "grad_norm": 1.8299814462661743, + "learning_rate": 0.0001999816866961048, + "loss": 1.6348, + "step": 1003 + }, + { + "epoch": 0.03595537808655792, + "grad_norm": 1.4278016090393066, + "learning_rate": 0.00019998146405087738, + "loss": 1.3809, + "step": 1004 + }, + { + "epoch": 0.0359911902161262, + "grad_norm": 1.5477863550186157, + "learning_rate": 0.00019998124006052832, + "loss": 1.5504, + "step": 1005 + }, + { + "epoch": 0.03602700234569449, + "grad_norm": 1.6185246706008911, + "learning_rate": 0.00019998101472506064, + "loss": 1.6143, + "step": 1006 + }, + { + "epoch": 0.03606281447526277, + "grad_norm": 1.909199595451355, + "learning_rate": 0.00019998078804447738, + "loss": 1.5645, + "step": 1007 + }, + { + "epoch": 0.036098626604831055, + "grad_norm": 1.690483808517456, + "learning_rate": 0.00019998056001878158, + "loss": 1.5469, + "step": 1008 + }, + { + "epoch": 0.03613443873439934, + "grad_norm": 1.6344071626663208, + "learning_rate": 0.0001999803306479763, + "loss": 1.5041, + "step": 1009 + }, + { + "epoch": 0.03617025086396763, + "grad_norm": 1.3170932531356812, + "learning_rate": 0.00019998009993206462, + "loss": 1.3699, + "step": 1010 + }, + { + "epoch": 0.03620606299353591, + "grad_norm": 1.5705807209014893, + "learning_rate": 0.0001999798678710497, + "loss": 1.3974, + "step": 1011 + }, + { + "epoch": 0.03624187512310419, + "grad_norm": 1.4980460405349731, + "learning_rate": 0.00019997963446493461, + "loss": 1.7144, + "step": 1012 + }, + { + "epoch": 0.03627768725267248, + "grad_norm": 1.308166742324829, + "learning_rate": 0.00019997939971372252, + "loss": 1.3958, + "step": 1013 + }, + { + "epoch": 0.036313499382240766, + "grad_norm": 1.9236700534820557, + "learning_rate": 0.00019997916361741655, + "loss": 1.6718, + "step": 1014 + }, + { + "epoch": 0.03634931151180905, + "grad_norm": 1.392545461654663, + "learning_rate": 0.0001999789261760199, + "loss": 1.5869, + "step": 1015 + }, + { + "epoch": 0.03638512364137733, + "grad_norm": 1.362557053565979, + "learning_rate": 0.00019997868738953577, + "loss": 1.3355, + "step": 1016 + }, + { + "epoch": 0.03642093577094562, + "grad_norm": 2.0461792945861816, + "learning_rate": 0.00019997844725796733, + "loss": 1.5764, + "step": 1017 + }, + { + "epoch": 0.036456747900513904, + "grad_norm": 1.3404486179351807, + "learning_rate": 0.0001999782057813179, + "loss": 1.426, + "step": 1018 + }, + { + "epoch": 0.03649256003008219, + "grad_norm": 1.2448232173919678, + "learning_rate": 0.00019997796295959065, + "loss": 1.4655, + "step": 1019 + }, + { + "epoch": 0.03652837215965048, + "grad_norm": 1.4044294357299805, + "learning_rate": 0.00019997771879278883, + "loss": 1.4369, + "step": 1020 + }, + { + "epoch": 0.03656418428921876, + "grad_norm": 1.2719827890396118, + "learning_rate": 0.00019997747328091584, + "loss": 1.3422, + "step": 1021 + }, + { + "epoch": 0.03659999641878704, + "grad_norm": 1.4074218273162842, + "learning_rate": 0.00019997722642397484, + "loss": 1.512, + "step": 1022 + }, + { + "epoch": 0.036635808548355325, + "grad_norm": 1.6591966152191162, + "learning_rate": 0.00019997697822196926, + "loss": 1.528, + "step": 1023 + }, + { + "epoch": 0.036671620677923615, + "grad_norm": 1.7976876497268677, + "learning_rate": 0.00019997672867490238, + "loss": 1.3187, + "step": 1024 + }, + { + "epoch": 0.0367074328074919, + "grad_norm": 1.9082822799682617, + "learning_rate": 0.0001999764777827776, + "loss": 1.5728, + "step": 1025 + }, + { + "epoch": 0.03674324493706018, + "grad_norm": 1.7940855026245117, + "learning_rate": 0.00019997622554559824, + "loss": 1.4795, + "step": 1026 + }, + { + "epoch": 0.03677905706662847, + "grad_norm": 1.3792778253555298, + "learning_rate": 0.00019997597196336775, + "loss": 1.5757, + "step": 1027 + }, + { + "epoch": 0.03681486919619675, + "grad_norm": 2.0111641883850098, + "learning_rate": 0.00019997571703608952, + "loss": 1.5035, + "step": 1028 + }, + { + "epoch": 0.036850681325765036, + "grad_norm": 1.7198044061660767, + "learning_rate": 0.00019997546076376695, + "loss": 1.4769, + "step": 1029 + }, + { + "epoch": 0.03688649345533332, + "grad_norm": 2.127777099609375, + "learning_rate": 0.00019997520314640356, + "loss": 1.4955, + "step": 1030 + }, + { + "epoch": 0.03692230558490161, + "grad_norm": 2.164621353149414, + "learning_rate": 0.00019997494418400272, + "loss": 1.3236, + "step": 1031 + }, + { + "epoch": 0.03695811771446989, + "grad_norm": 1.6736340522766113, + "learning_rate": 0.00019997468387656796, + "loss": 1.4311, + "step": 1032 + }, + { + "epoch": 0.036993929844038174, + "grad_norm": 2.7354702949523926, + "learning_rate": 0.00019997442222410283, + "loss": 1.7514, + "step": 1033 + }, + { + "epoch": 0.03702974197360646, + "grad_norm": 1.5382907390594482, + "learning_rate": 0.0001999741592266108, + "loss": 1.4526, + "step": 1034 + }, + { + "epoch": 0.03706555410317475, + "grad_norm": 1.9577051401138306, + "learning_rate": 0.0001999738948840954, + "loss": 1.558, + "step": 1035 + }, + { + "epoch": 0.03710136623274303, + "grad_norm": 2.2414846420288086, + "learning_rate": 0.0001999736291965602, + "loss": 1.7898, + "step": 1036 + }, + { + "epoch": 0.03713717836231131, + "grad_norm": 1.825390100479126, + "learning_rate": 0.00019997336216400876, + "loss": 1.4528, + "step": 1037 + }, + { + "epoch": 0.0371729904918796, + "grad_norm": 2.3134326934814453, + "learning_rate": 0.00019997309378644472, + "loss": 1.6724, + "step": 1038 + }, + { + "epoch": 0.037208802621447885, + "grad_norm": 1.2938429117202759, + "learning_rate": 0.00019997282406387167, + "loss": 1.252, + "step": 1039 + }, + { + "epoch": 0.03724461475101617, + "grad_norm": 1.3428452014923096, + "learning_rate": 0.00019997255299629318, + "loss": 1.1838, + "step": 1040 + }, + { + "epoch": 0.03728042688058445, + "grad_norm": 2.10680890083313, + "learning_rate": 0.00019997228058371298, + "loss": 1.447, + "step": 1041 + }, + { + "epoch": 0.03731623901015274, + "grad_norm": 1.9465391635894775, + "learning_rate": 0.00019997200682613468, + "loss": 1.7122, + "step": 1042 + }, + { + "epoch": 0.037352051139721024, + "grad_norm": 1.6791930198669434, + "learning_rate": 0.00019997173172356202, + "loss": 1.559, + "step": 1043 + }, + { + "epoch": 0.037387863269289306, + "grad_norm": 1.4740599393844604, + "learning_rate": 0.00019997145527599864, + "loss": 1.4965, + "step": 1044 + }, + { + "epoch": 0.037423675398857596, + "grad_norm": 2.3971099853515625, + "learning_rate": 0.00019997117748344825, + "loss": 1.5722, + "step": 1045 + }, + { + "epoch": 0.03745948752842588, + "grad_norm": 1.365066409111023, + "learning_rate": 0.00019997089834591466, + "loss": 1.4264, + "step": 1046 + }, + { + "epoch": 0.03749529965799416, + "grad_norm": 1.3679434061050415, + "learning_rate": 0.00019997061786340158, + "loss": 1.4525, + "step": 1047 + }, + { + "epoch": 0.037531111787562445, + "grad_norm": 2.656130075454712, + "learning_rate": 0.00019997033603591277, + "loss": 1.5885, + "step": 1048 + }, + { + "epoch": 0.037566923917130735, + "grad_norm": 1.437775731086731, + "learning_rate": 0.00019997005286345208, + "loss": 1.2686, + "step": 1049 + }, + { + "epoch": 0.03760273604669902, + "grad_norm": 1.6287206411361694, + "learning_rate": 0.00019996976834602324, + "loss": 1.6452, + "step": 1050 + }, + { + "epoch": 0.0376385481762673, + "grad_norm": 2.255815029144287, + "learning_rate": 0.00019996948248363015, + "loss": 1.5305, + "step": 1051 + }, + { + "epoch": 0.03767436030583559, + "grad_norm": 1.552004337310791, + "learning_rate": 0.0001999691952762766, + "loss": 1.4169, + "step": 1052 + }, + { + "epoch": 0.03771017243540387, + "grad_norm": 1.4061479568481445, + "learning_rate": 0.00019996890672396652, + "loss": 1.387, + "step": 1053 + }, + { + "epoch": 0.037745984564972156, + "grad_norm": 2.2884581089019775, + "learning_rate": 0.0001999686168267037, + "loss": 1.5847, + "step": 1054 + }, + { + "epoch": 0.03778179669454044, + "grad_norm": 1.9982807636260986, + "learning_rate": 0.0001999683255844921, + "loss": 1.4064, + "step": 1055 + }, + { + "epoch": 0.03781760882410873, + "grad_norm": 1.5581481456756592, + "learning_rate": 0.00019996803299733565, + "loss": 1.3602, + "step": 1056 + }, + { + "epoch": 0.03785342095367701, + "grad_norm": 1.9598037004470825, + "learning_rate": 0.00019996773906523827, + "loss": 1.3578, + "step": 1057 + }, + { + "epoch": 0.037889233083245294, + "grad_norm": 2.215311050415039, + "learning_rate": 0.0001999674437882039, + "loss": 1.6319, + "step": 1058 + }, + { + "epoch": 0.03792504521281358, + "grad_norm": 1.5261871814727783, + "learning_rate": 0.0001999671471662365, + "loss": 1.3313, + "step": 1059 + }, + { + "epoch": 0.03796085734238187, + "grad_norm": 2.731802463531494, + "learning_rate": 0.0001999668491993401, + "loss": 1.8969, + "step": 1060 + }, + { + "epoch": 0.03799666947195015, + "grad_norm": 1.7532490491867065, + "learning_rate": 0.00019996654988751867, + "loss": 1.4605, + "step": 1061 + }, + { + "epoch": 0.03803248160151843, + "grad_norm": 2.646721363067627, + "learning_rate": 0.0001999662492307763, + "loss": 1.6436, + "step": 1062 + }, + { + "epoch": 0.03806829373108672, + "grad_norm": 1.6740275621414185, + "learning_rate": 0.000199965947229117, + "loss": 1.3922, + "step": 1063 + }, + { + "epoch": 0.038104105860655005, + "grad_norm": 2.542412757873535, + "learning_rate": 0.0001999656438825448, + "loss": 1.435, + "step": 1064 + }, + { + "epoch": 0.03813991799022329, + "grad_norm": 1.8878990411758423, + "learning_rate": 0.0001999653391910638, + "loss": 1.4056, + "step": 1065 + }, + { + "epoch": 0.03817573011979157, + "grad_norm": 1.506419062614441, + "learning_rate": 0.00019996503315467811, + "loss": 1.4344, + "step": 1066 + }, + { + "epoch": 0.03821154224935986, + "grad_norm": 1.671621322631836, + "learning_rate": 0.00019996472577339186, + "loss": 1.6708, + "step": 1067 + }, + { + "epoch": 0.03824735437892814, + "grad_norm": 1.476319670677185, + "learning_rate": 0.00019996441704720917, + "loss": 1.4119, + "step": 1068 + }, + { + "epoch": 0.038283166508496426, + "grad_norm": 2.513124704360962, + "learning_rate": 0.00019996410697613418, + "loss": 1.4294, + "step": 1069 + }, + { + "epoch": 0.038318978638064716, + "grad_norm": 1.4571789503097534, + "learning_rate": 0.0001999637955601711, + "loss": 1.5553, + "step": 1070 + }, + { + "epoch": 0.038354790767633, + "grad_norm": 1.3184473514556885, + "learning_rate": 0.00019996348279932406, + "loss": 1.2058, + "step": 1071 + }, + { + "epoch": 0.03839060289720128, + "grad_norm": 1.4630950689315796, + "learning_rate": 0.0001999631686935973, + "loss": 1.6243, + "step": 1072 + }, + { + "epoch": 0.038426415026769564, + "grad_norm": 1.939126968383789, + "learning_rate": 0.0001999628532429951, + "loss": 1.6721, + "step": 1073 + }, + { + "epoch": 0.038462227156337854, + "grad_norm": 1.3629194498062134, + "learning_rate": 0.00019996253644752158, + "loss": 1.393, + "step": 1074 + }, + { + "epoch": 0.03849803928590614, + "grad_norm": 1.4944639205932617, + "learning_rate": 0.00019996221830718115, + "loss": 1.4614, + "step": 1075 + }, + { + "epoch": 0.03853385141547442, + "grad_norm": 1.696588158607483, + "learning_rate": 0.00019996189882197797, + "loss": 1.4728, + "step": 1076 + }, + { + "epoch": 0.0385696635450427, + "grad_norm": 1.9315510988235474, + "learning_rate": 0.0001999615779919164, + "loss": 1.4455, + "step": 1077 + }, + { + "epoch": 0.03860547567461099, + "grad_norm": 1.7573798894882202, + "learning_rate": 0.0001999612558170007, + "loss": 1.655, + "step": 1078 + }, + { + "epoch": 0.038641287804179275, + "grad_norm": 1.5539077520370483, + "learning_rate": 0.0001999609322972353, + "loss": 1.4476, + "step": 1079 + }, + { + "epoch": 0.03867709993374756, + "grad_norm": 1.3783748149871826, + "learning_rate": 0.00019996060743262447, + "loss": 1.4464, + "step": 1080 + }, + { + "epoch": 0.03871291206331585, + "grad_norm": 1.586984634399414, + "learning_rate": 0.00019996028122317257, + "loss": 1.5326, + "step": 1081 + }, + { + "epoch": 0.03874872419288413, + "grad_norm": 1.9972493648529053, + "learning_rate": 0.00019995995366888408, + "loss": 1.6113, + "step": 1082 + }, + { + "epoch": 0.03878453632245241, + "grad_norm": 1.880059003829956, + "learning_rate": 0.00019995962476976336, + "loss": 1.3787, + "step": 1083 + }, + { + "epoch": 0.038820348452020696, + "grad_norm": 1.229698896408081, + "learning_rate": 0.00019995929452581478, + "loss": 1.485, + "step": 1084 + }, + { + "epoch": 0.038856160581588986, + "grad_norm": 2.01167368888855, + "learning_rate": 0.00019995896293704285, + "loss": 1.3634, + "step": 1085 + }, + { + "epoch": 0.03889197271115727, + "grad_norm": 1.7610790729522705, + "learning_rate": 0.00019995863000345202, + "loss": 1.4145, + "step": 1086 + }, + { + "epoch": 0.03892778484072555, + "grad_norm": 1.7878234386444092, + "learning_rate": 0.00019995829572504677, + "loss": 1.4432, + "step": 1087 + }, + { + "epoch": 0.03896359697029384, + "grad_norm": 1.4261016845703125, + "learning_rate": 0.00019995796010183157, + "loss": 1.4494, + "step": 1088 + }, + { + "epoch": 0.038999409099862124, + "grad_norm": 1.474094033241272, + "learning_rate": 0.00019995762313381095, + "loss": 1.3981, + "step": 1089 + }, + { + "epoch": 0.03903522122943041, + "grad_norm": 1.332926869392395, + "learning_rate": 0.00019995728482098945, + "loss": 1.3828, + "step": 1090 + }, + { + "epoch": 0.03907103335899869, + "grad_norm": 1.4062614440917969, + "learning_rate": 0.00019995694516337164, + "loss": 1.5379, + "step": 1091 + }, + { + "epoch": 0.03910684548856698, + "grad_norm": 1.4429371356964111, + "learning_rate": 0.00019995660416096206, + "loss": 1.4505, + "step": 1092 + }, + { + "epoch": 0.03914265761813526, + "grad_norm": 1.5757626295089722, + "learning_rate": 0.00019995626181376527, + "loss": 1.3789, + "step": 1093 + }, + { + "epoch": 0.039178469747703545, + "grad_norm": 1.591906189918518, + "learning_rate": 0.00019995591812178596, + "loss": 1.6422, + "step": 1094 + }, + { + "epoch": 0.039214281877271835, + "grad_norm": 1.6903294324874878, + "learning_rate": 0.00019995557308502866, + "loss": 1.4112, + "step": 1095 + }, + { + "epoch": 0.03925009400684012, + "grad_norm": 2.4346542358398438, + "learning_rate": 0.00019995522670349808, + "loss": 1.7512, + "step": 1096 + }, + { + "epoch": 0.0392859061364084, + "grad_norm": 1.5546154975891113, + "learning_rate": 0.00019995487897719888, + "loss": 1.5722, + "step": 1097 + }, + { + "epoch": 0.039321718265976684, + "grad_norm": 1.4588133096694946, + "learning_rate": 0.00019995452990613567, + "loss": 1.2975, + "step": 1098 + }, + { + "epoch": 0.03935753039554497, + "grad_norm": 2.08721661567688, + "learning_rate": 0.00019995417949031323, + "loss": 1.3574, + "step": 1099 + }, + { + "epoch": 0.039393342525113256, + "grad_norm": 3.0603203773498535, + "learning_rate": 0.00019995382772973623, + "loss": 1.7274, + "step": 1100 + }, + { + "epoch": 0.03942915465468154, + "grad_norm": 2.275730609893799, + "learning_rate": 0.00019995347462440938, + "loss": 1.4623, + "step": 1101 + }, + { + "epoch": 0.03946496678424982, + "grad_norm": 2.2733709812164307, + "learning_rate": 0.0001999531201743375, + "loss": 1.5988, + "step": 1102 + }, + { + "epoch": 0.03950077891381811, + "grad_norm": 1.6787751913070679, + "learning_rate": 0.0001999527643795253, + "loss": 1.5566, + "step": 1103 + }, + { + "epoch": 0.039536591043386395, + "grad_norm": 1.6165419816970825, + "learning_rate": 0.00019995240723997757, + "loss": 1.7015, + "step": 1104 + }, + { + "epoch": 0.03957240317295468, + "grad_norm": 1.5689753293991089, + "learning_rate": 0.00019995204875569914, + "loss": 1.3875, + "step": 1105 + }, + { + "epoch": 0.03960821530252297, + "grad_norm": 1.597456455230713, + "learning_rate": 0.00019995168892669485, + "loss": 1.3463, + "step": 1106 + }, + { + "epoch": 0.03964402743209125, + "grad_norm": 1.440782070159912, + "learning_rate": 0.00019995132775296948, + "loss": 1.5513, + "step": 1107 + }, + { + "epoch": 0.03967983956165953, + "grad_norm": 1.2676715850830078, + "learning_rate": 0.00019995096523452795, + "loss": 1.3005, + "step": 1108 + }, + { + "epoch": 0.039715651691227816, + "grad_norm": 1.8268694877624512, + "learning_rate": 0.0001999506013713751, + "loss": 1.4002, + "step": 1109 + }, + { + "epoch": 0.039751463820796105, + "grad_norm": 1.2606676816940308, + "learning_rate": 0.0001999502361635158, + "loss": 1.537, + "step": 1110 + }, + { + "epoch": 0.03978727595036439, + "grad_norm": 1.5977914333343506, + "learning_rate": 0.00019994986961095504, + "loss": 1.5126, + "step": 1111 + }, + { + "epoch": 0.03982308807993267, + "grad_norm": 1.3860092163085938, + "learning_rate": 0.0001999495017136977, + "loss": 1.3644, + "step": 1112 + }, + { + "epoch": 0.03985890020950096, + "grad_norm": 1.9814512729644775, + "learning_rate": 0.00019994913247174876, + "loss": 1.5814, + "step": 1113 + }, + { + "epoch": 0.039894712339069244, + "grad_norm": 1.651170015335083, + "learning_rate": 0.00019994876188511314, + "loss": 1.6396, + "step": 1114 + }, + { + "epoch": 0.03993052446863753, + "grad_norm": 1.5217385292053223, + "learning_rate": 0.00019994838995379585, + "loss": 1.4555, + "step": 1115 + }, + { + "epoch": 0.03996633659820581, + "grad_norm": 1.4478552341461182, + "learning_rate": 0.0001999480166778019, + "loss": 1.3104, + "step": 1116 + }, + { + "epoch": 0.0400021487277741, + "grad_norm": 1.10916006565094, + "learning_rate": 0.00019994764205713631, + "loss": 1.472, + "step": 1117 + }, + { + "epoch": 0.04003796085734238, + "grad_norm": 1.4906243085861206, + "learning_rate": 0.00019994726609180415, + "loss": 1.4938, + "step": 1118 + }, + { + "epoch": 0.040073772986910665, + "grad_norm": 1.5674965381622314, + "learning_rate": 0.00019994688878181044, + "loss": 1.5039, + "step": 1119 + }, + { + "epoch": 0.040109585116478955, + "grad_norm": 1.253129005432129, + "learning_rate": 0.0001999465101271602, + "loss": 1.3392, + "step": 1120 + }, + { + "epoch": 0.04014539724604724, + "grad_norm": 1.6247708797454834, + "learning_rate": 0.00019994613012785868, + "loss": 1.5268, + "step": 1121 + }, + { + "epoch": 0.04018120937561552, + "grad_norm": 1.87022864818573, + "learning_rate": 0.00019994574878391084, + "loss": 1.6066, + "step": 1122 + }, + { + "epoch": 0.0402170215051838, + "grad_norm": 1.4516187906265259, + "learning_rate": 0.00019994536609532187, + "loss": 1.1498, + "step": 1123 + }, + { + "epoch": 0.04025283363475209, + "grad_norm": 1.8140724897384644, + "learning_rate": 0.00019994498206209695, + "loss": 1.1673, + "step": 1124 + }, + { + "epoch": 0.040288645764320376, + "grad_norm": 1.4093788862228394, + "learning_rate": 0.0001999445966842412, + "loss": 1.459, + "step": 1125 + }, + { + "epoch": 0.04032445789388866, + "grad_norm": 1.3681179285049438, + "learning_rate": 0.00019994420996175983, + "loss": 1.4823, + "step": 1126 + }, + { + "epoch": 0.04036027002345694, + "grad_norm": 2.078141689300537, + "learning_rate": 0.00019994382189465802, + "loss": 1.5087, + "step": 1127 + }, + { + "epoch": 0.04039608215302523, + "grad_norm": 1.8450558185577393, + "learning_rate": 0.000199943432482941, + "loss": 1.2845, + "step": 1128 + }, + { + "epoch": 0.040431894282593514, + "grad_norm": 2.135521173477173, + "learning_rate": 0.00019994304172661403, + "loss": 1.6951, + "step": 1129 + }, + { + "epoch": 0.0404677064121618, + "grad_norm": 1.7708842754364014, + "learning_rate": 0.00019994264962568234, + "loss": 1.4989, + "step": 1130 + }, + { + "epoch": 0.04050351854173009, + "grad_norm": 2.986258029937744, + "learning_rate": 0.00019994225618015125, + "loss": 1.7324, + "step": 1131 + }, + { + "epoch": 0.04053933067129837, + "grad_norm": 1.8298053741455078, + "learning_rate": 0.000199941861390026, + "loss": 1.5479, + "step": 1132 + }, + { + "epoch": 0.04057514280086665, + "grad_norm": 1.6138114929199219, + "learning_rate": 0.0001999414652553119, + "loss": 1.4326, + "step": 1133 + }, + { + "epoch": 0.040610954930434935, + "grad_norm": 1.399336338043213, + "learning_rate": 0.00019994106777601432, + "loss": 1.3561, + "step": 1134 + }, + { + "epoch": 0.040646767060003225, + "grad_norm": 1.574440836906433, + "learning_rate": 0.00019994066895213857, + "loss": 1.3445, + "step": 1135 + }, + { + "epoch": 0.04068257918957151, + "grad_norm": 1.4134585857391357, + "learning_rate": 0.00019994026878369003, + "loss": 1.4753, + "step": 1136 + }, + { + "epoch": 0.04071839131913979, + "grad_norm": 1.7624750137329102, + "learning_rate": 0.00019993986727067414, + "loss": 1.402, + "step": 1137 + }, + { + "epoch": 0.04075420344870808, + "grad_norm": 2.275402545928955, + "learning_rate": 0.0001999394644130962, + "loss": 1.6707, + "step": 1138 + }, + { + "epoch": 0.04079001557827636, + "grad_norm": 1.6068122386932373, + "learning_rate": 0.00019993906021096168, + "loss": 1.4047, + "step": 1139 + }, + { + "epoch": 0.040825827707844646, + "grad_norm": 1.6516836881637573, + "learning_rate": 0.00019993865466427603, + "loss": 1.3506, + "step": 1140 + }, + { + "epoch": 0.04086163983741293, + "grad_norm": 1.8428994417190552, + "learning_rate": 0.00019993824777304469, + "loss": 1.3272, + "step": 1141 + }, + { + "epoch": 0.04089745196698122, + "grad_norm": 2.057762622833252, + "learning_rate": 0.0001999378395372731, + "loss": 1.2738, + "step": 1142 + }, + { + "epoch": 0.0409332640965495, + "grad_norm": 2.258199453353882, + "learning_rate": 0.00019993742995696686, + "loss": 1.6893, + "step": 1143 + }, + { + "epoch": 0.040969076226117784, + "grad_norm": 1.5244778394699097, + "learning_rate": 0.0001999370190321314, + "loss": 1.4347, + "step": 1144 + }, + { + "epoch": 0.041004888355686074, + "grad_norm": 1.6868587732315063, + "learning_rate": 0.0001999366067627722, + "loss": 1.2556, + "step": 1145 + }, + { + "epoch": 0.04104070048525436, + "grad_norm": 1.5356584787368774, + "learning_rate": 0.0001999361931488949, + "loss": 1.5246, + "step": 1146 + }, + { + "epoch": 0.04107651261482264, + "grad_norm": 1.9768896102905273, + "learning_rate": 0.00019993577819050505, + "loss": 1.5304, + "step": 1147 + }, + { + "epoch": 0.04111232474439092, + "grad_norm": 1.4518694877624512, + "learning_rate": 0.00019993536188760817, + "loss": 1.4272, + "step": 1148 + }, + { + "epoch": 0.04114813687395921, + "grad_norm": 1.896575689315796, + "learning_rate": 0.00019993494424020992, + "loss": 1.3426, + "step": 1149 + }, + { + "epoch": 0.041183949003527495, + "grad_norm": 2.2670738697052, + "learning_rate": 0.00019993452524831592, + "loss": 1.4688, + "step": 1150 + }, + { + "epoch": 0.04121976113309578, + "grad_norm": 1.6102263927459717, + "learning_rate": 0.0001999341049119318, + "loss": 1.2701, + "step": 1151 + }, + { + "epoch": 0.04125557326266406, + "grad_norm": 1.7803646326065063, + "learning_rate": 0.00019993368323106315, + "loss": 1.5149, + "step": 1152 + }, + { + "epoch": 0.04129138539223235, + "grad_norm": 1.5448225736618042, + "learning_rate": 0.0001999332602057157, + "loss": 1.3117, + "step": 1153 + }, + { + "epoch": 0.041327197521800633, + "grad_norm": 1.9659695625305176, + "learning_rate": 0.0001999328358358952, + "loss": 1.3991, + "step": 1154 + }, + { + "epoch": 0.041363009651368916, + "grad_norm": 1.776111125946045, + "learning_rate": 0.00019993241012160727, + "loss": 1.5546, + "step": 1155 + }, + { + "epoch": 0.041398821780937206, + "grad_norm": 1.8674408197402954, + "learning_rate": 0.00019993198306285766, + "loss": 1.4645, + "step": 1156 + }, + { + "epoch": 0.04143463391050549, + "grad_norm": 1.429282546043396, + "learning_rate": 0.0001999315546596521, + "loss": 1.3502, + "step": 1157 + }, + { + "epoch": 0.04147044604007377, + "grad_norm": 1.3156461715698242, + "learning_rate": 0.0001999311249119964, + "loss": 1.366, + "step": 1158 + }, + { + "epoch": 0.041506258169642055, + "grad_norm": 1.300122618675232, + "learning_rate": 0.0001999306938198963, + "loss": 1.4342, + "step": 1159 + }, + { + "epoch": 0.041542070299210344, + "grad_norm": 2.006173849105835, + "learning_rate": 0.00019993026138335763, + "loss": 1.7274, + "step": 1160 + }, + { + "epoch": 0.04157788242877863, + "grad_norm": 1.7549760341644287, + "learning_rate": 0.0001999298276023862, + "loss": 1.5654, + "step": 1161 + }, + { + "epoch": 0.04161369455834691, + "grad_norm": 1.3473520278930664, + "learning_rate": 0.00019992939247698784, + "loss": 1.4244, + "step": 1162 + }, + { + "epoch": 0.0416495066879152, + "grad_norm": 1.8357698917388916, + "learning_rate": 0.00019992895600716838, + "loss": 1.4489, + "step": 1163 + }, + { + "epoch": 0.04168531881748348, + "grad_norm": 1.483633041381836, + "learning_rate": 0.00019992851819293373, + "loss": 1.479, + "step": 1164 + }, + { + "epoch": 0.041721130947051766, + "grad_norm": 1.5790929794311523, + "learning_rate": 0.00019992807903428976, + "loss": 1.5225, + "step": 1165 + }, + { + "epoch": 0.04175694307662005, + "grad_norm": 1.2782658338546753, + "learning_rate": 0.0001999276385312424, + "loss": 1.4501, + "step": 1166 + }, + { + "epoch": 0.04179275520618834, + "grad_norm": 1.3096636533737183, + "learning_rate": 0.00019992719668379753, + "loss": 1.5303, + "step": 1167 + }, + { + "epoch": 0.04182856733575662, + "grad_norm": 2.1858155727386475, + "learning_rate": 0.00019992675349196114, + "loss": 1.8806, + "step": 1168 + }, + { + "epoch": 0.041864379465324904, + "grad_norm": 2.4594807624816895, + "learning_rate": 0.0001999263089557392, + "loss": 1.6464, + "step": 1169 + }, + { + "epoch": 0.041900191594893194, + "grad_norm": 1.6330476999282837, + "learning_rate": 0.00019992586307513767, + "loss": 1.5955, + "step": 1170 + }, + { + "epoch": 0.041936003724461476, + "grad_norm": 1.6838473081588745, + "learning_rate": 0.00019992541585016254, + "loss": 1.2994, + "step": 1171 + }, + { + "epoch": 0.04197181585402976, + "grad_norm": 1.6568214893341064, + "learning_rate": 0.0001999249672808198, + "loss": 1.4319, + "step": 1172 + }, + { + "epoch": 0.04200762798359804, + "grad_norm": 1.5975526571273804, + "learning_rate": 0.00019992451736711554, + "loss": 1.6214, + "step": 1173 + }, + { + "epoch": 0.04204344011316633, + "grad_norm": 1.6331744194030762, + "learning_rate": 0.00019992406610905582, + "loss": 1.2457, + "step": 1174 + }, + { + "epoch": 0.042079252242734615, + "grad_norm": 1.519890308380127, + "learning_rate": 0.00019992361350664663, + "loss": 1.454, + "step": 1175 + }, + { + "epoch": 0.0421150643723029, + "grad_norm": 1.7362298965454102, + "learning_rate": 0.00019992315955989415, + "loss": 1.5963, + "step": 1176 + }, + { + "epoch": 0.04215087650187118, + "grad_norm": 2.0130434036254883, + "learning_rate": 0.00019992270426880446, + "loss": 1.3899, + "step": 1177 + }, + { + "epoch": 0.04218668863143947, + "grad_norm": 1.7651981115341187, + "learning_rate": 0.00019992224763338366, + "loss": 1.3297, + "step": 1178 + }, + { + "epoch": 0.04222250076100775, + "grad_norm": 1.9180587530136108, + "learning_rate": 0.00019992178965363787, + "loss": 1.3354, + "step": 1179 + }, + { + "epoch": 0.042258312890576036, + "grad_norm": 1.2645833492279053, + "learning_rate": 0.00019992133032957336, + "loss": 1.4028, + "step": 1180 + }, + { + "epoch": 0.042294125020144326, + "grad_norm": 1.9809558391571045, + "learning_rate": 0.0001999208696611962, + "loss": 1.4035, + "step": 1181 + }, + { + "epoch": 0.04232993714971261, + "grad_norm": 1.5047763586044312, + "learning_rate": 0.00019992040764851263, + "loss": 1.2952, + "step": 1182 + }, + { + "epoch": 0.04236574927928089, + "grad_norm": 1.80900239944458, + "learning_rate": 0.00019991994429152888, + "loss": 1.2897, + "step": 1183 + }, + { + "epoch": 0.042401561408849174, + "grad_norm": 2.6405677795410156, + "learning_rate": 0.00019991947959025112, + "loss": 1.7736, + "step": 1184 + }, + { + "epoch": 0.042437373538417464, + "grad_norm": 1.794243335723877, + "learning_rate": 0.0001999190135446857, + "loss": 1.5376, + "step": 1185 + }, + { + "epoch": 0.04247318566798575, + "grad_norm": 2.0358126163482666, + "learning_rate": 0.00019991854615483882, + "loss": 1.7785, + "step": 1186 + }, + { + "epoch": 0.04250899779755403, + "grad_norm": 1.5745772123336792, + "learning_rate": 0.00019991807742071678, + "loss": 1.472, + "step": 1187 + }, + { + "epoch": 0.04254480992712232, + "grad_norm": 2.0657858848571777, + "learning_rate": 0.0001999176073423259, + "loss": 1.3748, + "step": 1188 + }, + { + "epoch": 0.0425806220566906, + "grad_norm": 2.0125019550323486, + "learning_rate": 0.00019991713591967252, + "loss": 1.439, + "step": 1189 + }, + { + "epoch": 0.042616434186258885, + "grad_norm": 2.6329033374786377, + "learning_rate": 0.00019991666315276292, + "loss": 1.5178, + "step": 1190 + }, + { + "epoch": 0.04265224631582717, + "grad_norm": 1.9475456476211548, + "learning_rate": 0.0001999161890416035, + "loss": 1.3971, + "step": 1191 + }, + { + "epoch": 0.04268805844539546, + "grad_norm": 2.1330173015594482, + "learning_rate": 0.00019991571358620068, + "loss": 1.6397, + "step": 1192 + }, + { + "epoch": 0.04272387057496374, + "grad_norm": 1.4532243013381958, + "learning_rate": 0.0001999152367865608, + "loss": 1.4241, + "step": 1193 + }, + { + "epoch": 0.04275968270453202, + "grad_norm": 1.5068355798721313, + "learning_rate": 0.0001999147586426903, + "loss": 1.5051, + "step": 1194 + }, + { + "epoch": 0.04279549483410031, + "grad_norm": 1.7935154438018799, + "learning_rate": 0.00019991427915459558, + "loss": 1.4388, + "step": 1195 + }, + { + "epoch": 0.042831306963668596, + "grad_norm": 1.2878973484039307, + "learning_rate": 0.0001999137983222831, + "loss": 1.4936, + "step": 1196 + }, + { + "epoch": 0.04286711909323688, + "grad_norm": 2.004239559173584, + "learning_rate": 0.0001999133161457594, + "loss": 1.3932, + "step": 1197 + }, + { + "epoch": 0.04290293122280516, + "grad_norm": 1.419323444366455, + "learning_rate": 0.00019991283262503083, + "loss": 1.5137, + "step": 1198 + }, + { + "epoch": 0.04293874335237345, + "grad_norm": 1.513864517211914, + "learning_rate": 0.00019991234776010406, + "loss": 1.4821, + "step": 1199 + }, + { + "epoch": 0.042974555481941734, + "grad_norm": 1.5998741388320923, + "learning_rate": 0.0001999118615509855, + "loss": 1.3224, + "step": 1200 + }, + { + "epoch": 0.04301036761151002, + "grad_norm": 1.3384796380996704, + "learning_rate": 0.00019991137399768166, + "loss": 1.4225, + "step": 1201 + }, + { + "epoch": 0.0430461797410783, + "grad_norm": 1.9338958263397217, + "learning_rate": 0.00019991088510019924, + "loss": 1.4323, + "step": 1202 + }, + { + "epoch": 0.04308199187064659, + "grad_norm": 1.3594177961349487, + "learning_rate": 0.0001999103948585447, + "loss": 1.2117, + "step": 1203 + }, + { + "epoch": 0.04311780400021487, + "grad_norm": 1.848148226737976, + "learning_rate": 0.00019990990327272467, + "loss": 1.5261, + "step": 1204 + }, + { + "epoch": 0.043153616129783155, + "grad_norm": 1.7783968448638916, + "learning_rate": 0.00019990941034274577, + "loss": 1.5028, + "step": 1205 + }, + { + "epoch": 0.043189428259351445, + "grad_norm": 2.062638521194458, + "learning_rate": 0.00019990891606861463, + "loss": 1.2844, + "step": 1206 + }, + { + "epoch": 0.04322524038891973, + "grad_norm": 1.6558737754821777, + "learning_rate": 0.0001999084204503379, + "loss": 1.3452, + "step": 1207 + }, + { + "epoch": 0.04326105251848801, + "grad_norm": 1.7440272569656372, + "learning_rate": 0.00019990792348792224, + "loss": 1.4935, + "step": 1208 + }, + { + "epoch": 0.043296864648056294, + "grad_norm": 1.7240418195724487, + "learning_rate": 0.00019990742518137436, + "loss": 1.5027, + "step": 1209 + }, + { + "epoch": 0.04333267677762458, + "grad_norm": 1.8956997394561768, + "learning_rate": 0.00019990692553070093, + "loss": 1.5143, + "step": 1210 + }, + { + "epoch": 0.043368488907192866, + "grad_norm": 1.3143163919448853, + "learning_rate": 0.0001999064245359087, + "loss": 1.2864, + "step": 1211 + }, + { + "epoch": 0.04340430103676115, + "grad_norm": 1.8588268756866455, + "learning_rate": 0.00019990592219700437, + "loss": 1.3838, + "step": 1212 + }, + { + "epoch": 0.04344011316632944, + "grad_norm": 2.3667871952056885, + "learning_rate": 0.00019990541851399476, + "loss": 1.4789, + "step": 1213 + }, + { + "epoch": 0.04347592529589772, + "grad_norm": 2.9504268169403076, + "learning_rate": 0.00019990491348688657, + "loss": 1.6944, + "step": 1214 + }, + { + "epoch": 0.043511737425466004, + "grad_norm": 1.4357681274414062, + "learning_rate": 0.00019990440711568666, + "loss": 1.4376, + "step": 1215 + }, + { + "epoch": 0.04354754955503429, + "grad_norm": 2.110166549682617, + "learning_rate": 0.00019990389940040184, + "loss": 1.581, + "step": 1216 + }, + { + "epoch": 0.04358336168460258, + "grad_norm": 2.6701440811157227, + "learning_rate": 0.0001999033903410389, + "loss": 1.5738, + "step": 1217 + }, + { + "epoch": 0.04361917381417086, + "grad_norm": 1.9290897846221924, + "learning_rate": 0.00019990287993760473, + "loss": 1.3921, + "step": 1218 + }, + { + "epoch": 0.04365498594373914, + "grad_norm": 1.4818124771118164, + "learning_rate": 0.00019990236819010615, + "loss": 1.3486, + "step": 1219 + }, + { + "epoch": 0.04369079807330743, + "grad_norm": 1.81833016872406, + "learning_rate": 0.0001999018550985501, + "loss": 1.5111, + "step": 1220 + }, + { + "epoch": 0.043726610202875715, + "grad_norm": 2.4536550045013428, + "learning_rate": 0.00019990134066294338, + "loss": 1.7698, + "step": 1221 + }, + { + "epoch": 0.043762422332444, + "grad_norm": 1.8574427366256714, + "learning_rate": 0.00019990082488329308, + "loss": 1.5545, + "step": 1222 + }, + { + "epoch": 0.04379823446201228, + "grad_norm": 1.5017127990722656, + "learning_rate": 0.000199900307759606, + "loss": 1.7069, + "step": 1223 + }, + { + "epoch": 0.04383404659158057, + "grad_norm": 1.364561915397644, + "learning_rate": 0.00019989978929188914, + "loss": 1.4853, + "step": 1224 + }, + { + "epoch": 0.043869858721148854, + "grad_norm": 1.9201189279556274, + "learning_rate": 0.00019989926948014945, + "loss": 1.404, + "step": 1225 + }, + { + "epoch": 0.043905670850717136, + "grad_norm": 2.295900583267212, + "learning_rate": 0.000199898748324394, + "loss": 1.646, + "step": 1226 + }, + { + "epoch": 0.04394148298028542, + "grad_norm": 1.678598403930664, + "learning_rate": 0.00019989822582462972, + "loss": 1.5118, + "step": 1227 + }, + { + "epoch": 0.04397729510985371, + "grad_norm": 1.5834888219833374, + "learning_rate": 0.00019989770198086367, + "loss": 1.3215, + "step": 1228 + }, + { + "epoch": 0.04401310723942199, + "grad_norm": 1.5055537223815918, + "learning_rate": 0.0001998971767931029, + "loss": 1.2811, + "step": 1229 + }, + { + "epoch": 0.044048919368990275, + "grad_norm": 1.8679454326629639, + "learning_rate": 0.0001998966502613545, + "loss": 1.6109, + "step": 1230 + }, + { + "epoch": 0.044084731498558564, + "grad_norm": 1.3446649312973022, + "learning_rate": 0.0001998961223856255, + "loss": 1.4506, + "step": 1231 + }, + { + "epoch": 0.04412054362812685, + "grad_norm": 1.6353193521499634, + "learning_rate": 0.00019989559316592305, + "loss": 1.3533, + "step": 1232 + }, + { + "epoch": 0.04415635575769513, + "grad_norm": 1.398842692375183, + "learning_rate": 0.00019989506260225426, + "loss": 1.5886, + "step": 1233 + }, + { + "epoch": 0.04419216788726341, + "grad_norm": 2.129598617553711, + "learning_rate": 0.00019989453069462623, + "loss": 1.3462, + "step": 1234 + }, + { + "epoch": 0.0442279800168317, + "grad_norm": 1.5416369438171387, + "learning_rate": 0.00019989399744304616, + "loss": 1.5439, + "step": 1235 + }, + { + "epoch": 0.044263792146399986, + "grad_norm": 1.2661333084106445, + "learning_rate": 0.0001998934628475212, + "loss": 1.1931, + "step": 1236 + }, + { + "epoch": 0.04429960427596827, + "grad_norm": 2.8211653232574463, + "learning_rate": 0.00019989292690805854, + "loss": 1.479, + "step": 1237 + }, + { + "epoch": 0.04433541640553656, + "grad_norm": 1.438339114189148, + "learning_rate": 0.00019989238962466542, + "loss": 1.3477, + "step": 1238 + }, + { + "epoch": 0.04437122853510484, + "grad_norm": 1.7372568845748901, + "learning_rate": 0.00019989185099734903, + "loss": 1.4687, + "step": 1239 + }, + { + "epoch": 0.044407040664673124, + "grad_norm": 1.7015050649642944, + "learning_rate": 0.00019989131102611667, + "loss": 1.3895, + "step": 1240 + }, + { + "epoch": 0.04444285279424141, + "grad_norm": 2.8231887817382812, + "learning_rate": 0.00019989076971097555, + "loss": 1.5416, + "step": 1241 + }, + { + "epoch": 0.044478664923809696, + "grad_norm": 2.2279324531555176, + "learning_rate": 0.00019989022705193299, + "loss": 1.6203, + "step": 1242 + }, + { + "epoch": 0.04451447705337798, + "grad_norm": 2.1303787231445312, + "learning_rate": 0.00019988968304899624, + "loss": 1.6371, + "step": 1243 + }, + { + "epoch": 0.04455028918294626, + "grad_norm": 1.8646928071975708, + "learning_rate": 0.00019988913770217269, + "loss": 1.5134, + "step": 1244 + }, + { + "epoch": 0.04458610131251455, + "grad_norm": 1.9565250873565674, + "learning_rate": 0.00019988859101146962, + "loss": 1.5844, + "step": 1245 + }, + { + "epoch": 0.044621913442082835, + "grad_norm": 2.1006932258605957, + "learning_rate": 0.00019988804297689438, + "loss": 1.5162, + "step": 1246 + }, + { + "epoch": 0.04465772557165112, + "grad_norm": 1.8043700456619263, + "learning_rate": 0.0001998874935984544, + "loss": 1.4673, + "step": 1247 + }, + { + "epoch": 0.0446935377012194, + "grad_norm": 1.5370553731918335, + "learning_rate": 0.00019988694287615704, + "loss": 1.3761, + "step": 1248 + }, + { + "epoch": 0.04472934983078769, + "grad_norm": 1.5100451707839966, + "learning_rate": 0.0001998863908100097, + "loss": 1.2889, + "step": 1249 + }, + { + "epoch": 0.04476516196035597, + "grad_norm": 1.8959665298461914, + "learning_rate": 0.00019988583740001984, + "loss": 1.3636, + "step": 1250 + }, + { + "epoch": 0.044800974089924256, + "grad_norm": 1.8653393983840942, + "learning_rate": 0.00019988528264619485, + "loss": 1.483, + "step": 1251 + }, + { + "epoch": 0.04483678621949254, + "grad_norm": 1.3323075771331787, + "learning_rate": 0.00019988472654854222, + "loss": 1.4831, + "step": 1252 + }, + { + "epoch": 0.04487259834906083, + "grad_norm": 2.5798585414886475, + "learning_rate": 0.00019988416910706947, + "loss": 1.775, + "step": 1253 + }, + { + "epoch": 0.04490841047862911, + "grad_norm": 1.493026614189148, + "learning_rate": 0.00019988361032178403, + "loss": 1.4592, + "step": 1254 + }, + { + "epoch": 0.044944222608197394, + "grad_norm": 1.7472642660140991, + "learning_rate": 0.00019988305019269346, + "loss": 1.3939, + "step": 1255 + }, + { + "epoch": 0.044980034737765684, + "grad_norm": 1.5011616945266724, + "learning_rate": 0.00019988248871980532, + "loss": 1.3149, + "step": 1256 + }, + { + "epoch": 0.04501584686733397, + "grad_norm": 1.457779884338379, + "learning_rate": 0.0001998819259031271, + "loss": 1.3753, + "step": 1257 + }, + { + "epoch": 0.04505165899690225, + "grad_norm": 1.2864006757736206, + "learning_rate": 0.00019988136174266643, + "loss": 1.4325, + "step": 1258 + }, + { + "epoch": 0.04508747112647053, + "grad_norm": 2.2285261154174805, + "learning_rate": 0.00019988079623843087, + "loss": 1.5131, + "step": 1259 + }, + { + "epoch": 0.04512328325603882, + "grad_norm": 3.087982416152954, + "learning_rate": 0.000199880229390428, + "loss": 1.5016, + "step": 1260 + }, + { + "epoch": 0.045159095385607105, + "grad_norm": 2.164334297180176, + "learning_rate": 0.00019987966119866554, + "loss": 1.2712, + "step": 1261 + }, + { + "epoch": 0.04519490751517539, + "grad_norm": 2.111055850982666, + "learning_rate": 0.00019987909166315103, + "loss": 1.3199, + "step": 1262 + }, + { + "epoch": 0.04523071964474368, + "grad_norm": 2.1053884029388428, + "learning_rate": 0.00019987852078389218, + "loss": 1.3359, + "step": 1263 + }, + { + "epoch": 0.04526653177431196, + "grad_norm": 2.871025800704956, + "learning_rate": 0.00019987794856089668, + "loss": 1.7435, + "step": 1264 + }, + { + "epoch": 0.04530234390388024, + "grad_norm": 1.6512757539749146, + "learning_rate": 0.0001998773749941722, + "loss": 1.5057, + "step": 1265 + }, + { + "epoch": 0.045338156033448526, + "grad_norm": 2.161500930786133, + "learning_rate": 0.00019987680008372647, + "loss": 1.5045, + "step": 1266 + }, + { + "epoch": 0.045373968163016816, + "grad_norm": 1.8172154426574707, + "learning_rate": 0.00019987622382956722, + "loss": 1.4201, + "step": 1267 + }, + { + "epoch": 0.0454097802925851, + "grad_norm": 1.5449347496032715, + "learning_rate": 0.00019987564623170226, + "loss": 1.5727, + "step": 1268 + }, + { + "epoch": 0.04544559242215338, + "grad_norm": 1.8584249019622803, + "learning_rate": 0.00019987506729013927, + "loss": 1.3892, + "step": 1269 + }, + { + "epoch": 0.04548140455172167, + "grad_norm": 2.092036247253418, + "learning_rate": 0.0001998744870048861, + "loss": 1.3643, + "step": 1270 + }, + { + "epoch": 0.045517216681289954, + "grad_norm": 1.2212584018707275, + "learning_rate": 0.0001998739053759505, + "loss": 1.302, + "step": 1271 + }, + { + "epoch": 0.04555302881085824, + "grad_norm": 1.6147592067718506, + "learning_rate": 0.00019987332240334037, + "loss": 1.4268, + "step": 1272 + }, + { + "epoch": 0.04558884094042652, + "grad_norm": 1.8850232362747192, + "learning_rate": 0.00019987273808706347, + "loss": 1.3941, + "step": 1273 + }, + { + "epoch": 0.04562465306999481, + "grad_norm": 1.5302989482879639, + "learning_rate": 0.00019987215242712775, + "loss": 1.4053, + "step": 1274 + }, + { + "epoch": 0.04566046519956309, + "grad_norm": 1.8344345092773438, + "learning_rate": 0.00019987156542354103, + "loss": 1.2924, + "step": 1275 + }, + { + "epoch": 0.045696277329131375, + "grad_norm": 2.099655866622925, + "learning_rate": 0.00019987097707631124, + "loss": 1.5467, + "step": 1276 + }, + { + "epoch": 0.04573208945869966, + "grad_norm": 1.8967297077178955, + "learning_rate": 0.00019987038738544625, + "loss": 1.1452, + "step": 1277 + }, + { + "epoch": 0.04576790158826795, + "grad_norm": 1.6808077096939087, + "learning_rate": 0.00019986979635095402, + "loss": 1.4229, + "step": 1278 + }, + { + "epoch": 0.04580371371783623, + "grad_norm": 1.4301230907440186, + "learning_rate": 0.00019986920397284253, + "loss": 1.422, + "step": 1279 + }, + { + "epoch": 0.045839525847404514, + "grad_norm": 1.884761095046997, + "learning_rate": 0.0001998686102511197, + "loss": 1.5797, + "step": 1280 + }, + { + "epoch": 0.0458753379769728, + "grad_norm": 1.508285641670227, + "learning_rate": 0.00019986801518579353, + "loss": 1.3558, + "step": 1281 + }, + { + "epoch": 0.045911150106541086, + "grad_norm": 1.8701648712158203, + "learning_rate": 0.00019986741877687207, + "loss": 1.3567, + "step": 1282 + }, + { + "epoch": 0.04594696223610937, + "grad_norm": 3.852509021759033, + "learning_rate": 0.00019986682102436328, + "loss": 1.4741, + "step": 1283 + }, + { + "epoch": 0.04598277436567765, + "grad_norm": 1.64828360080719, + "learning_rate": 0.00019986622192827525, + "loss": 1.6874, + "step": 1284 + }, + { + "epoch": 0.04601858649524594, + "grad_norm": 1.689936637878418, + "learning_rate": 0.000199865621488616, + "loss": 1.4717, + "step": 1285 + }, + { + "epoch": 0.046054398624814225, + "grad_norm": 1.7634226083755493, + "learning_rate": 0.00019986501970539367, + "loss": 1.5095, + "step": 1286 + }, + { + "epoch": 0.04609021075438251, + "grad_norm": 1.374416470527649, + "learning_rate": 0.0001998644165786163, + "loss": 1.423, + "step": 1287 + }, + { + "epoch": 0.0461260228839508, + "grad_norm": 1.7775685787200928, + "learning_rate": 0.00019986381210829199, + "loss": 1.6092, + "step": 1288 + }, + { + "epoch": 0.04616183501351908, + "grad_norm": 1.454088568687439, + "learning_rate": 0.00019986320629442893, + "loss": 1.1969, + "step": 1289 + }, + { + "epoch": 0.04619764714308736, + "grad_norm": 1.3438547849655151, + "learning_rate": 0.00019986259913703526, + "loss": 1.3504, + "step": 1290 + }, + { + "epoch": 0.046233459272655646, + "grad_norm": 1.6785513162612915, + "learning_rate": 0.00019986199063611913, + "loss": 1.3287, + "step": 1291 + }, + { + "epoch": 0.046269271402223935, + "grad_norm": 1.603790044784546, + "learning_rate": 0.0001998613807916887, + "loss": 1.534, + "step": 1292 + }, + { + "epoch": 0.04630508353179222, + "grad_norm": 1.7548747062683105, + "learning_rate": 0.00019986076960375223, + "loss": 1.4362, + "step": 1293 + }, + { + "epoch": 0.0463408956613605, + "grad_norm": 1.441945195198059, + "learning_rate": 0.00019986015707231788, + "loss": 1.5559, + "step": 1294 + }, + { + "epoch": 0.04637670779092879, + "grad_norm": 1.472014307975769, + "learning_rate": 0.00019985954319739392, + "loss": 1.4201, + "step": 1295 + }, + { + "epoch": 0.046412519920497074, + "grad_norm": 1.4826260805130005, + "learning_rate": 0.00019985892797898865, + "loss": 1.4118, + "step": 1296 + }, + { + "epoch": 0.04644833205006536, + "grad_norm": 1.2948439121246338, + "learning_rate": 0.00019985831141711033, + "loss": 1.4884, + "step": 1297 + }, + { + "epoch": 0.04648414417963364, + "grad_norm": 1.226728916168213, + "learning_rate": 0.00019985769351176723, + "loss": 1.3462, + "step": 1298 + }, + { + "epoch": 0.04651995630920193, + "grad_norm": 1.7520865201950073, + "learning_rate": 0.00019985707426296764, + "loss": 1.4718, + "step": 1299 + }, + { + "epoch": 0.04655576843877021, + "grad_norm": 1.6688320636749268, + "learning_rate": 0.00019985645367071993, + "loss": 1.3724, + "step": 1300 + }, + { + "epoch": 0.046591580568338495, + "grad_norm": 1.880753517150879, + "learning_rate": 0.00019985583173503244, + "loss": 1.4602, + "step": 1301 + }, + { + "epoch": 0.04662739269790678, + "grad_norm": 1.4261505603790283, + "learning_rate": 0.00019985520845591356, + "loss": 1.3079, + "step": 1302 + }, + { + "epoch": 0.04666320482747507, + "grad_norm": 1.4113595485687256, + "learning_rate": 0.00019985458383337164, + "loss": 1.1708, + "step": 1303 + }, + { + "epoch": 0.04669901695704335, + "grad_norm": 1.9098893404006958, + "learning_rate": 0.0001998539578674151, + "loss": 1.3169, + "step": 1304 + }, + { + "epoch": 0.04673482908661163, + "grad_norm": 1.5236866474151611, + "learning_rate": 0.00019985333055805236, + "loss": 1.2852, + "step": 1305 + }, + { + "epoch": 0.04677064121617992, + "grad_norm": 1.3962947130203247, + "learning_rate": 0.00019985270190529187, + "loss": 1.2872, + "step": 1306 + }, + { + "epoch": 0.046806453345748206, + "grad_norm": 1.7441189289093018, + "learning_rate": 0.00019985207190914206, + "loss": 1.1982, + "step": 1307 + }, + { + "epoch": 0.04684226547531649, + "grad_norm": 2.027651786804199, + "learning_rate": 0.00019985144056961141, + "loss": 1.6402, + "step": 1308 + }, + { + "epoch": 0.04687807760488477, + "grad_norm": 1.659663438796997, + "learning_rate": 0.00019985080788670847, + "loss": 1.4858, + "step": 1309 + }, + { + "epoch": 0.04691388973445306, + "grad_norm": 2.0031638145446777, + "learning_rate": 0.00019985017386044167, + "loss": 1.2505, + "step": 1310 + }, + { + "epoch": 0.046949701864021344, + "grad_norm": 2.1318984031677246, + "learning_rate": 0.00019984953849081958, + "loss": 1.4806, + "step": 1311 + }, + { + "epoch": 0.04698551399358963, + "grad_norm": 2.2908027172088623, + "learning_rate": 0.00019984890177785077, + "loss": 1.4863, + "step": 1312 + }, + { + "epoch": 0.04702132612315792, + "grad_norm": 2.8177430629730225, + "learning_rate": 0.00019984826372154374, + "loss": 1.4086, + "step": 1313 + }, + { + "epoch": 0.0470571382527262, + "grad_norm": 1.7907696962356567, + "learning_rate": 0.00019984762432190717, + "loss": 1.3646, + "step": 1314 + }, + { + "epoch": 0.04709295038229448, + "grad_norm": 1.9206277132034302, + "learning_rate": 0.00019984698357894957, + "loss": 1.5383, + "step": 1315 + }, + { + "epoch": 0.047128762511862765, + "grad_norm": 1.8232088088989258, + "learning_rate": 0.00019984634149267962, + "loss": 1.4255, + "step": 1316 + }, + { + "epoch": 0.047164574641431055, + "grad_norm": 1.6920350790023804, + "learning_rate": 0.00019984569806310592, + "loss": 1.2995, + "step": 1317 + }, + { + "epoch": 0.04720038677099934, + "grad_norm": 1.4382717609405518, + "learning_rate": 0.00019984505329023717, + "loss": 1.3052, + "step": 1318 + }, + { + "epoch": 0.04723619890056762, + "grad_norm": 1.9669774770736694, + "learning_rate": 0.000199844407174082, + "loss": 1.4389, + "step": 1319 + }, + { + "epoch": 0.04727201103013591, + "grad_norm": 1.3826605081558228, + "learning_rate": 0.00019984375971464913, + "loss": 1.1932, + "step": 1320 + }, + { + "epoch": 0.04730782315970419, + "grad_norm": 2.213021993637085, + "learning_rate": 0.00019984311091194725, + "loss": 1.5005, + "step": 1321 + }, + { + "epoch": 0.047343635289272476, + "grad_norm": 1.9886411428451538, + "learning_rate": 0.0001998424607659851, + "loss": 1.3568, + "step": 1322 + }, + { + "epoch": 0.04737944741884076, + "grad_norm": 1.4361685514450073, + "learning_rate": 0.00019984180927677146, + "loss": 1.3089, + "step": 1323 + }, + { + "epoch": 0.04741525954840905, + "grad_norm": 1.3951201438903809, + "learning_rate": 0.00019984115644431502, + "loss": 1.3843, + "step": 1324 + }, + { + "epoch": 0.04745107167797733, + "grad_norm": 2.2167773246765137, + "learning_rate": 0.00019984050226862462, + "loss": 1.5036, + "step": 1325 + }, + { + "epoch": 0.047486883807545614, + "grad_norm": 1.3972649574279785, + "learning_rate": 0.00019983984674970905, + "loss": 1.3155, + "step": 1326 + }, + { + "epoch": 0.0475226959371139, + "grad_norm": 1.969992995262146, + "learning_rate": 0.00019983918988757715, + "loss": 1.5514, + "step": 1327 + }, + { + "epoch": 0.04755850806668219, + "grad_norm": 1.7050504684448242, + "learning_rate": 0.0001998385316822377, + "loss": 1.5325, + "step": 1328 + }, + { + "epoch": 0.04759432019625047, + "grad_norm": 1.8824882507324219, + "learning_rate": 0.0001998378721336996, + "loss": 1.6378, + "step": 1329 + }, + { + "epoch": 0.04763013232581875, + "grad_norm": 2.4197707176208496, + "learning_rate": 0.0001998372112419717, + "loss": 1.7341, + "step": 1330 + }, + { + "epoch": 0.04766594445538704, + "grad_norm": 1.7752982378005981, + "learning_rate": 0.00019983654900706293, + "loss": 1.3252, + "step": 1331 + }, + { + "epoch": 0.047701756584955325, + "grad_norm": 1.3934015035629272, + "learning_rate": 0.00019983588542898218, + "loss": 1.1727, + "step": 1332 + }, + { + "epoch": 0.04773756871452361, + "grad_norm": 1.8567872047424316, + "learning_rate": 0.00019983522050773833, + "loss": 1.6274, + "step": 1333 + }, + { + "epoch": 0.04777338084409189, + "grad_norm": 2.21305513381958, + "learning_rate": 0.00019983455424334038, + "loss": 1.5326, + "step": 1334 + }, + { + "epoch": 0.04780919297366018, + "grad_norm": 1.4270364046096802, + "learning_rate": 0.0001998338866357973, + "loss": 1.3806, + "step": 1335 + }, + { + "epoch": 0.04784500510322846, + "grad_norm": 1.3837236166000366, + "learning_rate": 0.00019983321768511801, + "loss": 1.3601, + "step": 1336 + }, + { + "epoch": 0.047880817232796746, + "grad_norm": 2.017913579940796, + "learning_rate": 0.00019983254739131158, + "loss": 1.5439, + "step": 1337 + }, + { + "epoch": 0.047916629362365036, + "grad_norm": 1.3964064121246338, + "learning_rate": 0.000199831875754387, + "loss": 1.3749, + "step": 1338 + }, + { + "epoch": 0.04795244149193332, + "grad_norm": 1.600164771080017, + "learning_rate": 0.00019983120277435333, + "loss": 1.2359, + "step": 1339 + }, + { + "epoch": 0.0479882536215016, + "grad_norm": 1.2762521505355835, + "learning_rate": 0.00019983052845121954, + "loss": 1.5799, + "step": 1340 + }, + { + "epoch": 0.048024065751069885, + "grad_norm": 1.6331149339675903, + "learning_rate": 0.00019982985278499483, + "loss": 1.4083, + "step": 1341 + }, + { + "epoch": 0.048059877880638174, + "grad_norm": 1.7234724760055542, + "learning_rate": 0.0001998291757756882, + "loss": 1.4863, + "step": 1342 + }, + { + "epoch": 0.04809569001020646, + "grad_norm": 1.5676544904708862, + "learning_rate": 0.00019982849742330875, + "loss": 1.3437, + "step": 1343 + }, + { + "epoch": 0.04813150213977474, + "grad_norm": 1.5481882095336914, + "learning_rate": 0.00019982781772786564, + "loss": 1.488, + "step": 1344 + }, + { + "epoch": 0.04816731426934303, + "grad_norm": 2.0719680786132812, + "learning_rate": 0.00019982713668936805, + "loss": 1.4093, + "step": 1345 + }, + { + "epoch": 0.04820312639891131, + "grad_norm": 2.177197217941284, + "learning_rate": 0.00019982645430782506, + "loss": 1.7078, + "step": 1346 + }, + { + "epoch": 0.048238938528479595, + "grad_norm": 2.0544865131378174, + "learning_rate": 0.00019982577058324589, + "loss": 1.5949, + "step": 1347 + }, + { + "epoch": 0.04827475065804788, + "grad_norm": 2.0218639373779297, + "learning_rate": 0.00019982508551563978, + "loss": 1.3712, + "step": 1348 + }, + { + "epoch": 0.04831056278761617, + "grad_norm": 1.5448157787322998, + "learning_rate": 0.00019982439910501588, + "loss": 1.2387, + "step": 1349 + }, + { + "epoch": 0.04834637491718445, + "grad_norm": 1.5750921964645386, + "learning_rate": 0.0001998237113513835, + "loss": 1.2566, + "step": 1350 + }, + { + "epoch": 0.048382187046752734, + "grad_norm": 1.923832893371582, + "learning_rate": 0.00019982302225475182, + "loss": 1.5544, + "step": 1351 + }, + { + "epoch": 0.04841799917632102, + "grad_norm": 1.4625027179718018, + "learning_rate": 0.0001998223318151301, + "loss": 1.2574, + "step": 1352 + }, + { + "epoch": 0.048453811305889306, + "grad_norm": 2.0278592109680176, + "learning_rate": 0.00019982164003252772, + "loss": 1.3595, + "step": 1353 + }, + { + "epoch": 0.04848962343545759, + "grad_norm": 2.486910104751587, + "learning_rate": 0.0001998209469069539, + "loss": 1.5441, + "step": 1354 + }, + { + "epoch": 0.04852543556502587, + "grad_norm": 3.0132133960723877, + "learning_rate": 0.00019982025243841804, + "loss": 1.6033, + "step": 1355 + }, + { + "epoch": 0.04856124769459416, + "grad_norm": 2.6714718341827393, + "learning_rate": 0.00019981955662692942, + "loss": 1.868, + "step": 1356 + }, + { + "epoch": 0.048597059824162445, + "grad_norm": 1.7286485433578491, + "learning_rate": 0.00019981885947249742, + "loss": 1.3763, + "step": 1357 + }, + { + "epoch": 0.04863287195373073, + "grad_norm": 1.44480562210083, + "learning_rate": 0.0001998181609751314, + "loss": 1.1186, + "step": 1358 + }, + { + "epoch": 0.04866868408329901, + "grad_norm": 1.4445734024047852, + "learning_rate": 0.00019981746113484082, + "loss": 1.3722, + "step": 1359 + }, + { + "epoch": 0.0487044962128673, + "grad_norm": 1.7481951713562012, + "learning_rate": 0.00019981675995163505, + "loss": 1.4145, + "step": 1360 + }, + { + "epoch": 0.04874030834243558, + "grad_norm": 1.455772876739502, + "learning_rate": 0.00019981605742552352, + "loss": 1.3282, + "step": 1361 + }, + { + "epoch": 0.048776120472003866, + "grad_norm": 2.5179476737976074, + "learning_rate": 0.00019981535355651569, + "loss": 1.5755, + "step": 1362 + }, + { + "epoch": 0.048811932601572156, + "grad_norm": 2.2599635124206543, + "learning_rate": 0.00019981464834462103, + "loss": 1.644, + "step": 1363 + }, + { + "epoch": 0.04884774473114044, + "grad_norm": 1.6359597444534302, + "learning_rate": 0.00019981394178984903, + "loss": 1.4032, + "step": 1364 + }, + { + "epoch": 0.04888355686070872, + "grad_norm": 1.9022599458694458, + "learning_rate": 0.0001998132338922092, + "loss": 1.5543, + "step": 1365 + }, + { + "epoch": 0.048919368990277004, + "grad_norm": 1.497995376586914, + "learning_rate": 0.00019981252465171102, + "loss": 1.3419, + "step": 1366 + }, + { + "epoch": 0.048955181119845294, + "grad_norm": 2.097982406616211, + "learning_rate": 0.0001998118140683641, + "loss": 1.4275, + "step": 1367 + }, + { + "epoch": 0.04899099324941358, + "grad_norm": 1.5748658180236816, + "learning_rate": 0.00019981110214217798, + "loss": 1.441, + "step": 1368 + }, + { + "epoch": 0.04902680537898186, + "grad_norm": 1.5817651748657227, + "learning_rate": 0.00019981038887316221, + "loss": 1.3293, + "step": 1369 + }, + { + "epoch": 0.04906261750855015, + "grad_norm": 1.6390124559402466, + "learning_rate": 0.00019980967426132642, + "loss": 1.5155, + "step": 1370 + }, + { + "epoch": 0.04909842963811843, + "grad_norm": 1.7995632886886597, + "learning_rate": 0.0001998089583066802, + "loss": 1.3979, + "step": 1371 + }, + { + "epoch": 0.049134241767686715, + "grad_norm": 1.9470847845077515, + "learning_rate": 0.00019980824100923318, + "loss": 1.3865, + "step": 1372 + }, + { + "epoch": 0.049170053897255, + "grad_norm": 1.746004343032837, + "learning_rate": 0.00019980752236899502, + "loss": 1.437, + "step": 1373 + }, + { + "epoch": 0.04920586602682329, + "grad_norm": 1.1984601020812988, + "learning_rate": 0.00019980680238597542, + "loss": 1.3527, + "step": 1374 + }, + { + "epoch": 0.04924167815639157, + "grad_norm": 1.496687889099121, + "learning_rate": 0.000199806081060184, + "loss": 1.4792, + "step": 1375 + }, + { + "epoch": 0.04927749028595985, + "grad_norm": 1.7111642360687256, + "learning_rate": 0.00019980535839163053, + "loss": 1.3594, + "step": 1376 + }, + { + "epoch": 0.049313302415528136, + "grad_norm": 2.3670859336853027, + "learning_rate": 0.00019980463438032468, + "loss": 1.4673, + "step": 1377 + }, + { + "epoch": 0.049349114545096426, + "grad_norm": 1.5302133560180664, + "learning_rate": 0.0001998039090262762, + "loss": 1.467, + "step": 1378 + }, + { + "epoch": 0.04938492667466471, + "grad_norm": 1.5830029249191284, + "learning_rate": 0.0001998031823294949, + "loss": 1.4597, + "step": 1379 + }, + { + "epoch": 0.04942073880423299, + "grad_norm": 1.403914213180542, + "learning_rate": 0.0001998024542899905, + "loss": 1.3898, + "step": 1380 + }, + { + "epoch": 0.04945655093380128, + "grad_norm": 1.8254815340042114, + "learning_rate": 0.00019980172490777283, + "loss": 1.5457, + "step": 1381 + }, + { + "epoch": 0.049492363063369564, + "grad_norm": 1.4082062244415283, + "learning_rate": 0.00019980099418285166, + "loss": 1.2542, + "step": 1382 + }, + { + "epoch": 0.04952817519293785, + "grad_norm": 1.5617414712905884, + "learning_rate": 0.00019980026211523686, + "loss": 1.1955, + "step": 1383 + }, + { + "epoch": 0.04956398732250613, + "grad_norm": 1.6213651895523071, + "learning_rate": 0.00019979952870493824, + "loss": 1.2707, + "step": 1384 + }, + { + "epoch": 0.04959979945207442, + "grad_norm": 1.6262832880020142, + "learning_rate": 0.00019979879395196575, + "loss": 1.413, + "step": 1385 + }, + { + "epoch": 0.0496356115816427, + "grad_norm": 1.4470893144607544, + "learning_rate": 0.00019979805785632916, + "loss": 1.4932, + "step": 1386 + }, + { + "epoch": 0.049671423711210985, + "grad_norm": 2.6812584400177, + "learning_rate": 0.00019979732041803847, + "loss": 1.6918, + "step": 1387 + }, + { + "epoch": 0.049707235840779275, + "grad_norm": 1.5389777421951294, + "learning_rate": 0.00019979658163710355, + "loss": 1.3421, + "step": 1388 + }, + { + "epoch": 0.04974304797034756, + "grad_norm": 1.9210649728775024, + "learning_rate": 0.00019979584151353437, + "loss": 1.4623, + "step": 1389 + }, + { + "epoch": 0.04977886009991584, + "grad_norm": 1.7657535076141357, + "learning_rate": 0.00019979510004734083, + "loss": 1.309, + "step": 1390 + }, + { + "epoch": 0.049814672229484123, + "grad_norm": 1.9781041145324707, + "learning_rate": 0.00019979435723853296, + "loss": 1.4723, + "step": 1391 + }, + { + "epoch": 0.04985048435905241, + "grad_norm": 1.454453706741333, + "learning_rate": 0.00019979361308712073, + "loss": 1.3771, + "step": 1392 + }, + { + "epoch": 0.049886296488620696, + "grad_norm": 1.890052318572998, + "learning_rate": 0.00019979286759311423, + "loss": 1.3924, + "step": 1393 + }, + { + "epoch": 0.04992210861818898, + "grad_norm": 2.095083475112915, + "learning_rate": 0.00019979212075652334, + "loss": 1.52, + "step": 1394 + }, + { + "epoch": 0.04995792074775727, + "grad_norm": 1.528676152229309, + "learning_rate": 0.00019979137257735823, + "loss": 1.4201, + "step": 1395 + }, + { + "epoch": 0.04999373287732555, + "grad_norm": 1.5879507064819336, + "learning_rate": 0.0001997906230556289, + "loss": 1.3265, + "step": 1396 + }, + { + "epoch": 0.050029545006893834, + "grad_norm": 2.454916477203369, + "learning_rate": 0.00019978987219134545, + "loss": 1.4582, + "step": 1397 + }, + { + "epoch": 0.05006535713646212, + "grad_norm": 1.5155335664749146, + "learning_rate": 0.000199789119984518, + "loss": 1.5691, + "step": 1398 + }, + { + "epoch": 0.05010116926603041, + "grad_norm": 1.9991159439086914, + "learning_rate": 0.0001997883664351567, + "loss": 1.4268, + "step": 1399 + }, + { + "epoch": 0.05013698139559869, + "grad_norm": 2.857806921005249, + "learning_rate": 0.00019978761154327158, + "loss": 1.5886, + "step": 1400 + }, + { + "epoch": 0.05017279352516697, + "grad_norm": 1.7471346855163574, + "learning_rate": 0.0001997868553088729, + "loss": 1.5287, + "step": 1401 + }, + { + "epoch": 0.050208605654735255, + "grad_norm": 2.7446141242980957, + "learning_rate": 0.00019978609773197082, + "loss": 1.6915, + "step": 1402 + }, + { + "epoch": 0.050244417784303545, + "grad_norm": 1.4717384576797485, + "learning_rate": 0.00019978533881257547, + "loss": 1.3406, + "step": 1403 + }, + { + "epoch": 0.05028022991387183, + "grad_norm": 1.3535538911819458, + "learning_rate": 0.0001997845785506971, + "loss": 1.3253, + "step": 1404 + }, + { + "epoch": 0.05031604204344011, + "grad_norm": 2.0726115703582764, + "learning_rate": 0.00019978381694634595, + "loss": 1.5582, + "step": 1405 + }, + { + "epoch": 0.0503518541730084, + "grad_norm": 1.6643153429031372, + "learning_rate": 0.00019978305399953228, + "loss": 1.3728, + "step": 1406 + }, + { + "epoch": 0.050387666302576684, + "grad_norm": 1.5568426847457886, + "learning_rate": 0.0001997822897102663, + "loss": 1.3549, + "step": 1407 + }, + { + "epoch": 0.050423478432144966, + "grad_norm": 1.7432804107666016, + "learning_rate": 0.00019978152407855833, + "loss": 1.4755, + "step": 1408 + }, + { + "epoch": 0.05045929056171325, + "grad_norm": 1.63367760181427, + "learning_rate": 0.00019978075710441867, + "loss": 1.4167, + "step": 1409 + }, + { + "epoch": 0.05049510269128154, + "grad_norm": 1.8527686595916748, + "learning_rate": 0.0001997799887878576, + "loss": 1.4546, + "step": 1410 + }, + { + "epoch": 0.05053091482084982, + "grad_norm": 2.559190511703491, + "learning_rate": 0.0001997792191288855, + "loss": 1.4692, + "step": 1411 + }, + { + "epoch": 0.050566726950418105, + "grad_norm": 1.734200358390808, + "learning_rate": 0.00019977844812751273, + "loss": 1.453, + "step": 1412 + }, + { + "epoch": 0.050602539079986394, + "grad_norm": 1.731003999710083, + "learning_rate": 0.00019977767578374965, + "loss": 1.4739, + "step": 1413 + }, + { + "epoch": 0.05063835120955468, + "grad_norm": 1.3181266784667969, + "learning_rate": 0.0001997769020976066, + "loss": 1.4294, + "step": 1414 + }, + { + "epoch": 0.05067416333912296, + "grad_norm": 2.109467029571533, + "learning_rate": 0.0001997761270690941, + "loss": 1.6018, + "step": 1415 + }, + { + "epoch": 0.05070997546869124, + "grad_norm": 1.9596514701843262, + "learning_rate": 0.00019977535069822246, + "loss": 1.4498, + "step": 1416 + }, + { + "epoch": 0.05074578759825953, + "grad_norm": 1.8794978857040405, + "learning_rate": 0.0001997745729850022, + "loss": 1.4085, + "step": 1417 + }, + { + "epoch": 0.050781599727827816, + "grad_norm": 1.711435079574585, + "learning_rate": 0.00019977379392944377, + "loss": 1.4408, + "step": 1418 + }, + { + "epoch": 0.0508174118573961, + "grad_norm": 1.942873477935791, + "learning_rate": 0.00019977301353155764, + "loss": 1.7777, + "step": 1419 + }, + { + "epoch": 0.05085322398696439, + "grad_norm": 1.5506237745285034, + "learning_rate": 0.00019977223179135428, + "loss": 1.5035, + "step": 1420 + }, + { + "epoch": 0.05088903611653267, + "grad_norm": 1.8519601821899414, + "learning_rate": 0.0001997714487088443, + "loss": 1.467, + "step": 1421 + }, + { + "epoch": 0.050924848246100954, + "grad_norm": 1.40318763256073, + "learning_rate": 0.0001997706642840381, + "loss": 1.5006, + "step": 1422 + }, + { + "epoch": 0.05096066037566924, + "grad_norm": 1.3447288274765015, + "learning_rate": 0.00019976987851694634, + "loss": 1.3751, + "step": 1423 + }, + { + "epoch": 0.050996472505237526, + "grad_norm": 2.5955729484558105, + "learning_rate": 0.00019976909140757956, + "loss": 1.4611, + "step": 1424 + }, + { + "epoch": 0.05103228463480581, + "grad_norm": 2.6938729286193848, + "learning_rate": 0.00019976830295594832, + "loss": 1.6404, + "step": 1425 + }, + { + "epoch": 0.05106809676437409, + "grad_norm": 1.9191757440567017, + "learning_rate": 0.0001997675131620633, + "loss": 1.52, + "step": 1426 + }, + { + "epoch": 0.051103908893942375, + "grad_norm": 2.1226022243499756, + "learning_rate": 0.00019976672202593506, + "loss": 1.4559, + "step": 1427 + }, + { + "epoch": 0.051139721023510665, + "grad_norm": 1.388185739517212, + "learning_rate": 0.00019976592954757427, + "loss": 1.1824, + "step": 1428 + }, + { + "epoch": 0.05117553315307895, + "grad_norm": 1.2671204805374146, + "learning_rate": 0.00019976513572699157, + "loss": 1.3952, + "step": 1429 + }, + { + "epoch": 0.05121134528264723, + "grad_norm": 1.7696503400802612, + "learning_rate": 0.00019976434056419767, + "loss": 1.348, + "step": 1430 + }, + { + "epoch": 0.05124715741221552, + "grad_norm": 2.1478967666625977, + "learning_rate": 0.00019976354405920328, + "loss": 1.4344, + "step": 1431 + }, + { + "epoch": 0.0512829695417838, + "grad_norm": 1.8936184644699097, + "learning_rate": 0.00019976274621201907, + "loss": 1.5287, + "step": 1432 + }, + { + "epoch": 0.051318781671352086, + "grad_norm": 1.6035274267196655, + "learning_rate": 0.00019976194702265578, + "loss": 1.4476, + "step": 1433 + }, + { + "epoch": 0.05135459380092037, + "grad_norm": 2.222153902053833, + "learning_rate": 0.00019976114649112418, + "loss": 1.448, + "step": 1434 + }, + { + "epoch": 0.05139040593048866, + "grad_norm": 3.0058703422546387, + "learning_rate": 0.00019976034461743504, + "loss": 1.8932, + "step": 1435 + }, + { + "epoch": 0.05142621806005694, + "grad_norm": 1.7579532861709595, + "learning_rate": 0.00019975954140159915, + "loss": 1.4487, + "step": 1436 + }, + { + "epoch": 0.051462030189625224, + "grad_norm": 1.780436396598816, + "learning_rate": 0.0001997587368436273, + "loss": 1.3189, + "step": 1437 + }, + { + "epoch": 0.051497842319193514, + "grad_norm": 1.6639810800552368, + "learning_rate": 0.00019975793094353036, + "loss": 1.5179, + "step": 1438 + }, + { + "epoch": 0.0515336544487618, + "grad_norm": 1.6446113586425781, + "learning_rate": 0.0001997571237013191, + "loss": 1.1324, + "step": 1439 + }, + { + "epoch": 0.05156946657833008, + "grad_norm": 1.6378356218338013, + "learning_rate": 0.00019975631511700442, + "loss": 1.2469, + "step": 1440 + }, + { + "epoch": 0.05160527870789836, + "grad_norm": 1.4360932111740112, + "learning_rate": 0.00019975550519059723, + "loss": 1.5105, + "step": 1441 + }, + { + "epoch": 0.05164109083746665, + "grad_norm": 1.5820597410202026, + "learning_rate": 0.00019975469392210834, + "loss": 1.4328, + "step": 1442 + }, + { + "epoch": 0.051676902967034935, + "grad_norm": 2.2156031131744385, + "learning_rate": 0.00019975388131154875, + "loss": 1.8128, + "step": 1443 + }, + { + "epoch": 0.05171271509660322, + "grad_norm": 2.4092953205108643, + "learning_rate": 0.00019975306735892936, + "loss": 1.6637, + "step": 1444 + }, + { + "epoch": 0.05174852722617151, + "grad_norm": 1.626210331916809, + "learning_rate": 0.00019975225206426113, + "loss": 1.2969, + "step": 1445 + }, + { + "epoch": 0.05178433935573979, + "grad_norm": 1.4275579452514648, + "learning_rate": 0.000199751435427555, + "loss": 1.2932, + "step": 1446 + }, + { + "epoch": 0.05182015148530807, + "grad_norm": 2.1349308490753174, + "learning_rate": 0.000199750617448822, + "loss": 1.4887, + "step": 1447 + }, + { + "epoch": 0.051855963614876356, + "grad_norm": 1.6612941026687622, + "learning_rate": 0.0001997497981280731, + "loss": 1.397, + "step": 1448 + }, + { + "epoch": 0.051891775744444646, + "grad_norm": 2.0382704734802246, + "learning_rate": 0.0001997489774653193, + "loss": 1.5802, + "step": 1449 + }, + { + "epoch": 0.05192758787401293, + "grad_norm": 2.9342634677886963, + "learning_rate": 0.00019974815546057172, + "loss": 1.5078, + "step": 1450 + }, + { + "epoch": 0.05196340000358121, + "grad_norm": 1.9757568836212158, + "learning_rate": 0.00019974733211384135, + "loss": 1.3051, + "step": 1451 + }, + { + "epoch": 0.051999212133149494, + "grad_norm": 1.6100764274597168, + "learning_rate": 0.0001997465074251393, + "loss": 1.3788, + "step": 1452 + }, + { + "epoch": 0.052035024262717784, + "grad_norm": 1.4608722925186157, + "learning_rate": 0.00019974568139447666, + "loss": 1.3146, + "step": 1453 + }, + { + "epoch": 0.05207083639228607, + "grad_norm": 1.7355380058288574, + "learning_rate": 0.00019974485402186453, + "loss": 1.298, + "step": 1454 + }, + { + "epoch": 0.05210664852185435, + "grad_norm": 2.425987482070923, + "learning_rate": 0.00019974402530731407, + "loss": 1.5463, + "step": 1455 + }, + { + "epoch": 0.05214246065142264, + "grad_norm": 2.4318230152130127, + "learning_rate": 0.0001997431952508364, + "loss": 1.2418, + "step": 1456 + }, + { + "epoch": 0.05217827278099092, + "grad_norm": 3.1731202602386475, + "learning_rate": 0.00019974236385244268, + "loss": 1.5013, + "step": 1457 + }, + { + "epoch": 0.052214084910559205, + "grad_norm": 3.1651902198791504, + "learning_rate": 0.00019974153111214414, + "loss": 2.0463, + "step": 1458 + }, + { + "epoch": 0.05224989704012749, + "grad_norm": 1.5732263326644897, + "learning_rate": 0.00019974069702995194, + "loss": 1.3797, + "step": 1459 + }, + { + "epoch": 0.05228570916969578, + "grad_norm": 1.900349736213684, + "learning_rate": 0.00019973986160587732, + "loss": 1.6395, + "step": 1460 + }, + { + "epoch": 0.05232152129926406, + "grad_norm": 1.4797172546386719, + "learning_rate": 0.0001997390248399315, + "loss": 1.3467, + "step": 1461 + }, + { + "epoch": 0.052357333428832344, + "grad_norm": 2.1090891361236572, + "learning_rate": 0.00019973818673212578, + "loss": 1.1894, + "step": 1462 + }, + { + "epoch": 0.05239314555840063, + "grad_norm": 1.7037320137023926, + "learning_rate": 0.00019973734728247143, + "loss": 1.2724, + "step": 1463 + }, + { + "epoch": 0.052428957687968916, + "grad_norm": 1.7839444875717163, + "learning_rate": 0.0001997365064909797, + "loss": 1.3163, + "step": 1464 + }, + { + "epoch": 0.0524647698175372, + "grad_norm": 1.1938468217849731, + "learning_rate": 0.0001997356643576619, + "loss": 1.4742, + "step": 1465 + }, + { + "epoch": 0.05250058194710548, + "grad_norm": 1.5941988229751587, + "learning_rate": 0.00019973482088252943, + "loss": 1.2742, + "step": 1466 + }, + { + "epoch": 0.05253639407667377, + "grad_norm": 2.4387192726135254, + "learning_rate": 0.00019973397606559354, + "loss": 1.4547, + "step": 1467 + }, + { + "epoch": 0.052572206206242054, + "grad_norm": 1.5240455865859985, + "learning_rate": 0.0001997331299068657, + "loss": 1.4979, + "step": 1468 + }, + { + "epoch": 0.05260801833581034, + "grad_norm": 1.6664941310882568, + "learning_rate": 0.00019973228240635722, + "loss": 1.4168, + "step": 1469 + }, + { + "epoch": 0.05264383046537863, + "grad_norm": 1.5344760417938232, + "learning_rate": 0.00019973143356407952, + "loss": 1.3579, + "step": 1470 + }, + { + "epoch": 0.05267964259494691, + "grad_norm": 1.7656360864639282, + "learning_rate": 0.00019973058338004407, + "loss": 1.4579, + "step": 1471 + }, + { + "epoch": 0.05271545472451519, + "grad_norm": 2.614107847213745, + "learning_rate": 0.00019972973185426222, + "loss": 1.6142, + "step": 1472 + }, + { + "epoch": 0.052751266854083476, + "grad_norm": 1.6414899826049805, + "learning_rate": 0.0001997288789867455, + "loss": 1.5189, + "step": 1473 + }, + { + "epoch": 0.052787078983651765, + "grad_norm": 2.1286725997924805, + "learning_rate": 0.0001997280247775053, + "loss": 1.2716, + "step": 1474 + }, + { + "epoch": 0.05282289111322005, + "grad_norm": 1.6111173629760742, + "learning_rate": 0.0001997271692265532, + "loss": 1.1772, + "step": 1475 + }, + { + "epoch": 0.05285870324278833, + "grad_norm": 1.7480913400650024, + "learning_rate": 0.0001997263123339007, + "loss": 1.2478, + "step": 1476 + }, + { + "epoch": 0.052894515372356614, + "grad_norm": 1.7318025827407837, + "learning_rate": 0.00019972545409955927, + "loss": 1.2256, + "step": 1477 + }, + { + "epoch": 0.052930327501924904, + "grad_norm": 1.7494148015975952, + "learning_rate": 0.0001997245945235405, + "loss": 1.538, + "step": 1478 + }, + { + "epoch": 0.052966139631493186, + "grad_norm": 1.8142074346542358, + "learning_rate": 0.00019972373360585598, + "loss": 1.439, + "step": 1479 + }, + { + "epoch": 0.05300195176106147, + "grad_norm": 1.8369333744049072, + "learning_rate": 0.0001997228713465172, + "loss": 1.3177, + "step": 1480 + }, + { + "epoch": 0.05303776389062976, + "grad_norm": 2.408365249633789, + "learning_rate": 0.00019972200774553587, + "loss": 1.5234, + "step": 1481 + }, + { + "epoch": 0.05307357602019804, + "grad_norm": 1.9613398313522339, + "learning_rate": 0.00019972114280292355, + "loss": 1.3924, + "step": 1482 + }, + { + "epoch": 0.053109388149766325, + "grad_norm": 1.7257936000823975, + "learning_rate": 0.00019972027651869186, + "loss": 1.279, + "step": 1483 + }, + { + "epoch": 0.05314520027933461, + "grad_norm": 1.3719819784164429, + "learning_rate": 0.0001997194088928525, + "loss": 1.3079, + "step": 1484 + }, + { + "epoch": 0.0531810124089029, + "grad_norm": 1.8057360649108887, + "learning_rate": 0.0001997185399254171, + "loss": 1.4278, + "step": 1485 + }, + { + "epoch": 0.05321682453847118, + "grad_norm": 1.7611794471740723, + "learning_rate": 0.00019971766961639738, + "loss": 1.2786, + "step": 1486 + }, + { + "epoch": 0.05325263666803946, + "grad_norm": 1.8661282062530518, + "learning_rate": 0.00019971679796580504, + "loss": 1.6222, + "step": 1487 + }, + { + "epoch": 0.05328844879760775, + "grad_norm": 1.4145103693008423, + "learning_rate": 0.00019971592497365184, + "loss": 1.3425, + "step": 1488 + }, + { + "epoch": 0.053324260927176036, + "grad_norm": 1.6042143106460571, + "learning_rate": 0.00019971505063994948, + "loss": 1.3664, + "step": 1489 + }, + { + "epoch": 0.05336007305674432, + "grad_norm": 1.935230016708374, + "learning_rate": 0.0001997141749647097, + "loss": 1.522, + "step": 1490 + }, + { + "epoch": 0.0533958851863126, + "grad_norm": 1.6494944095611572, + "learning_rate": 0.00019971329794794436, + "loss": 1.4951, + "step": 1491 + }, + { + "epoch": 0.05343169731588089, + "grad_norm": 1.7416815757751465, + "learning_rate": 0.0001997124195896652, + "loss": 1.4567, + "step": 1492 + }, + { + "epoch": 0.053467509445449174, + "grad_norm": 2.184161901473999, + "learning_rate": 0.00019971153988988406, + "loss": 1.4458, + "step": 1493 + }, + { + "epoch": 0.05350332157501746, + "grad_norm": 1.5230019092559814, + "learning_rate": 0.00019971065884861276, + "loss": 1.332, + "step": 1494 + }, + { + "epoch": 0.05353913370458575, + "grad_norm": 1.621939778327942, + "learning_rate": 0.00019970977646586319, + "loss": 1.6698, + "step": 1495 + }, + { + "epoch": 0.05357494583415403, + "grad_norm": 2.400557518005371, + "learning_rate": 0.00019970889274164715, + "loss": 1.349, + "step": 1496 + }, + { + "epoch": 0.05361075796372231, + "grad_norm": 2.2382516860961914, + "learning_rate": 0.00019970800767597663, + "loss": 1.5401, + "step": 1497 + }, + { + "epoch": 0.053646570093290595, + "grad_norm": 1.460777997970581, + "learning_rate": 0.00019970712126886342, + "loss": 1.589, + "step": 1498 + }, + { + "epoch": 0.053682382222858885, + "grad_norm": 1.8513052463531494, + "learning_rate": 0.00019970623352031952, + "loss": 1.4803, + "step": 1499 + }, + { + "epoch": 0.05371819435242717, + "grad_norm": 1.9679930210113525, + "learning_rate": 0.00019970534443035688, + "loss": 1.4708, + "step": 1500 + }, + { + "epoch": 0.05375400648199545, + "grad_norm": 2.2773420810699463, + "learning_rate": 0.00019970445399898745, + "loss": 1.5525, + "step": 1501 + }, + { + "epoch": 0.05378981861156373, + "grad_norm": 1.8354737758636475, + "learning_rate": 0.0001997035622262232, + "loss": 1.3531, + "step": 1502 + }, + { + "epoch": 0.05382563074113202, + "grad_norm": 2.3149704933166504, + "learning_rate": 0.00019970266911207608, + "loss": 1.3924, + "step": 1503 + }, + { + "epoch": 0.053861442870700306, + "grad_norm": 1.993866205215454, + "learning_rate": 0.00019970177465655818, + "loss": 1.3996, + "step": 1504 + }, + { + "epoch": 0.05389725500026859, + "grad_norm": 2.8436038494110107, + "learning_rate": 0.00019970087885968154, + "loss": 1.7131, + "step": 1505 + }, + { + "epoch": 0.05393306712983688, + "grad_norm": 1.5381276607513428, + "learning_rate": 0.00019969998172145815, + "loss": 1.4003, + "step": 1506 + }, + { + "epoch": 0.05396887925940516, + "grad_norm": 1.5098704099655151, + "learning_rate": 0.00019969908324190012, + "loss": 1.4529, + "step": 1507 + }, + { + "epoch": 0.054004691388973444, + "grad_norm": 1.3422155380249023, + "learning_rate": 0.0001996981834210195, + "loss": 1.3458, + "step": 1508 + }, + { + "epoch": 0.05404050351854173, + "grad_norm": 2.3785884380340576, + "learning_rate": 0.00019969728225882846, + "loss": 1.3476, + "step": 1509 + }, + { + "epoch": 0.05407631564811002, + "grad_norm": 1.5885443687438965, + "learning_rate": 0.0001996963797553391, + "loss": 1.3341, + "step": 1510 + }, + { + "epoch": 0.0541121277776783, + "grad_norm": 1.4273877143859863, + "learning_rate": 0.0001996954759105635, + "loss": 1.5208, + "step": 1511 + }, + { + "epoch": 0.05414793990724658, + "grad_norm": 1.5592671632766724, + "learning_rate": 0.00019969457072451392, + "loss": 1.262, + "step": 1512 + }, + { + "epoch": 0.05418375203681487, + "grad_norm": 2.2370784282684326, + "learning_rate": 0.00019969366419720245, + "loss": 1.6428, + "step": 1513 + }, + { + "epoch": 0.054219564166383155, + "grad_norm": 2.92452073097229, + "learning_rate": 0.00019969275632864133, + "loss": 1.5795, + "step": 1514 + }, + { + "epoch": 0.05425537629595144, + "grad_norm": 1.4909108877182007, + "learning_rate": 0.0001996918471188428, + "loss": 1.2836, + "step": 1515 + }, + { + "epoch": 0.05429118842551972, + "grad_norm": 2.0291028022766113, + "learning_rate": 0.00019969093656781902, + "loss": 1.5238, + "step": 1516 + }, + { + "epoch": 0.05432700055508801, + "grad_norm": 1.3353036642074585, + "learning_rate": 0.00019969002467558228, + "loss": 1.4439, + "step": 1517 + }, + { + "epoch": 0.05436281268465629, + "grad_norm": 1.7566834688186646, + "learning_rate": 0.00019968911144214486, + "loss": 1.5731, + "step": 1518 + }, + { + "epoch": 0.054398624814224576, + "grad_norm": 1.9222277402877808, + "learning_rate": 0.00019968819686751906, + "loss": 1.599, + "step": 1519 + }, + { + "epoch": 0.054434436943792866, + "grad_norm": 1.759495735168457, + "learning_rate": 0.00019968728095171715, + "loss": 1.6218, + "step": 1520 + }, + { + "epoch": 0.05447024907336115, + "grad_norm": 1.3349992036819458, + "learning_rate": 0.00019968636369475142, + "loss": 1.2848, + "step": 1521 + }, + { + "epoch": 0.05450606120292943, + "grad_norm": 1.804618239402771, + "learning_rate": 0.00019968544509663428, + "loss": 1.537, + "step": 1522 + }, + { + "epoch": 0.054541873332497715, + "grad_norm": 1.5982606410980225, + "learning_rate": 0.00019968452515737805, + "loss": 1.4058, + "step": 1523 + }, + { + "epoch": 0.054577685462066004, + "grad_norm": 2.0629003047943115, + "learning_rate": 0.00019968360387699513, + "loss": 1.4786, + "step": 1524 + }, + { + "epoch": 0.05461349759163429, + "grad_norm": 2.2649857997894287, + "learning_rate": 0.00019968268125549794, + "loss": 1.554, + "step": 1525 + }, + { + "epoch": 0.05464930972120257, + "grad_norm": 1.8842236995697021, + "learning_rate": 0.0001996817572928988, + "loss": 1.4718, + "step": 1526 + }, + { + "epoch": 0.05468512185077085, + "grad_norm": 1.3660839796066284, + "learning_rate": 0.0001996808319892102, + "loss": 1.5073, + "step": 1527 + }, + { + "epoch": 0.05472093398033914, + "grad_norm": 1.5514891147613525, + "learning_rate": 0.00019967990534444462, + "loss": 1.323, + "step": 1528 + }, + { + "epoch": 0.054756746109907425, + "grad_norm": 1.4323750734329224, + "learning_rate": 0.00019967897735861446, + "loss": 1.2033, + "step": 1529 + }, + { + "epoch": 0.05479255823947571, + "grad_norm": 2.2659714221954346, + "learning_rate": 0.00019967804803173227, + "loss": 1.158, + "step": 1530 + }, + { + "epoch": 0.054828370369044, + "grad_norm": 1.7140097618103027, + "learning_rate": 0.00019967711736381048, + "loss": 1.4148, + "step": 1531 + }, + { + "epoch": 0.05486418249861228, + "grad_norm": 2.178936243057251, + "learning_rate": 0.00019967618535486164, + "loss": 1.642, + "step": 1532 + }, + { + "epoch": 0.054899994628180564, + "grad_norm": 2.0612549781799316, + "learning_rate": 0.00019967525200489833, + "loss": 1.4074, + "step": 1533 + }, + { + "epoch": 0.054935806757748847, + "grad_norm": 1.6325958967208862, + "learning_rate": 0.00019967431731393308, + "loss": 1.5363, + "step": 1534 + }, + { + "epoch": 0.054971618887317136, + "grad_norm": 2.499089241027832, + "learning_rate": 0.00019967338128197847, + "loss": 1.425, + "step": 1535 + }, + { + "epoch": 0.05500743101688542, + "grad_norm": 1.7674121856689453, + "learning_rate": 0.00019967244390904708, + "loss": 1.4064, + "step": 1536 + }, + { + "epoch": 0.0550432431464537, + "grad_norm": 1.8227787017822266, + "learning_rate": 0.0001996715051951515, + "loss": 1.3681, + "step": 1537 + }, + { + "epoch": 0.05507905527602199, + "grad_norm": 1.8739690780639648, + "learning_rate": 0.0001996705651403044, + "loss": 1.5185, + "step": 1538 + }, + { + "epoch": 0.055114867405590275, + "grad_norm": 1.4769046306610107, + "learning_rate": 0.0001996696237445184, + "loss": 1.3099, + "step": 1539 + }, + { + "epoch": 0.05515067953515856, + "grad_norm": 1.559848427772522, + "learning_rate": 0.0001996686810078062, + "loss": 1.495, + "step": 1540 + }, + { + "epoch": 0.05518649166472684, + "grad_norm": 2.026592254638672, + "learning_rate": 0.00019966773693018045, + "loss": 1.5999, + "step": 1541 + }, + { + "epoch": 0.05522230379429513, + "grad_norm": 1.5739831924438477, + "learning_rate": 0.00019966679151165384, + "loss": 1.5584, + "step": 1542 + }, + { + "epoch": 0.05525811592386341, + "grad_norm": 1.8507097959518433, + "learning_rate": 0.00019966584475223913, + "loss": 1.4538, + "step": 1543 + }, + { + "epoch": 0.055293928053431696, + "grad_norm": 1.5205029249191284, + "learning_rate": 0.00019966489665194904, + "loss": 1.3381, + "step": 1544 + }, + { + "epoch": 0.055329740182999985, + "grad_norm": 2.643441915512085, + "learning_rate": 0.0001996639472107963, + "loss": 1.692, + "step": 1545 + }, + { + "epoch": 0.05536555231256827, + "grad_norm": 1.731264591217041, + "learning_rate": 0.00019966299642879375, + "loss": 1.3536, + "step": 1546 + }, + { + "epoch": 0.05540136444213655, + "grad_norm": 1.4582899808883667, + "learning_rate": 0.00019966204430595412, + "loss": 1.4753, + "step": 1547 + }, + { + "epoch": 0.055437176571704834, + "grad_norm": 3.8145768642425537, + "learning_rate": 0.00019966109084229024, + "loss": 1.7164, + "step": 1548 + }, + { + "epoch": 0.055472988701273124, + "grad_norm": 2.201963424682617, + "learning_rate": 0.00019966013603781493, + "loss": 1.4434, + "step": 1549 + }, + { + "epoch": 0.05550880083084141, + "grad_norm": 1.4924578666687012, + "learning_rate": 0.00019965917989254103, + "loss": 1.4503, + "step": 1550 + }, + { + "epoch": 0.05554461296040969, + "grad_norm": 1.8884055614471436, + "learning_rate": 0.00019965822240648143, + "loss": 1.5276, + "step": 1551 + }, + { + "epoch": 0.05558042508997797, + "grad_norm": 1.6700050830841064, + "learning_rate": 0.00019965726357964902, + "loss": 1.3491, + "step": 1552 + }, + { + "epoch": 0.05561623721954626, + "grad_norm": 3.0052330493927, + "learning_rate": 0.00019965630341205664, + "loss": 1.4055, + "step": 1553 + }, + { + "epoch": 0.055652049349114545, + "grad_norm": 1.6322606801986694, + "learning_rate": 0.00019965534190371725, + "loss": 1.6034, + "step": 1554 + }, + { + "epoch": 0.05568786147868283, + "grad_norm": 1.7401280403137207, + "learning_rate": 0.0001996543790546438, + "loss": 1.6304, + "step": 1555 + }, + { + "epoch": 0.05572367360825112, + "grad_norm": 2.131551742553711, + "learning_rate": 0.00019965341486484923, + "loss": 1.4361, + "step": 1556 + }, + { + "epoch": 0.0557594857378194, + "grad_norm": 2.3153162002563477, + "learning_rate": 0.00019965244933434648, + "loss": 1.5724, + "step": 1557 + }, + { + "epoch": 0.05579529786738768, + "grad_norm": 1.6900177001953125, + "learning_rate": 0.00019965148246314858, + "loss": 1.5303, + "step": 1558 + }, + { + "epoch": 0.055831109996955966, + "grad_norm": 1.7291017770767212, + "learning_rate": 0.00019965051425126852, + "loss": 1.4306, + "step": 1559 + }, + { + "epoch": 0.055866922126524256, + "grad_norm": 1.4189715385437012, + "learning_rate": 0.00019964954469871936, + "loss": 1.2716, + "step": 1560 + }, + { + "epoch": 0.05590273425609254, + "grad_norm": 1.6454137563705444, + "learning_rate": 0.0001996485738055141, + "loss": 1.4657, + "step": 1561 + }, + { + "epoch": 0.05593854638566082, + "grad_norm": 1.5903674364089966, + "learning_rate": 0.00019964760157166578, + "loss": 1.5768, + "step": 1562 + }, + { + "epoch": 0.05597435851522911, + "grad_norm": 2.067021369934082, + "learning_rate": 0.00019964662799718753, + "loss": 1.54, + "step": 1563 + }, + { + "epoch": 0.056010170644797394, + "grad_norm": 2.1511857509613037, + "learning_rate": 0.00019964565308209248, + "loss": 1.6766, + "step": 1564 + }, + { + "epoch": 0.05604598277436568, + "grad_norm": 1.4878629446029663, + "learning_rate": 0.00019964467682639364, + "loss": 1.2078, + "step": 1565 + }, + { + "epoch": 0.05608179490393396, + "grad_norm": 2.0658607482910156, + "learning_rate": 0.00019964369923010424, + "loss": 1.5672, + "step": 1566 + }, + { + "epoch": 0.05611760703350225, + "grad_norm": 2.2423901557922363, + "learning_rate": 0.00019964272029323742, + "loss": 1.5595, + "step": 1567 + }, + { + "epoch": 0.05615341916307053, + "grad_norm": 1.8644993305206299, + "learning_rate": 0.00019964174001580628, + "loss": 1.7236, + "step": 1568 + }, + { + "epoch": 0.056189231292638815, + "grad_norm": 1.5625858306884766, + "learning_rate": 0.00019964075839782407, + "loss": 1.2699, + "step": 1569 + }, + { + "epoch": 0.056225043422207105, + "grad_norm": 2.3152015209198, + "learning_rate": 0.000199639775439304, + "loss": 1.6359, + "step": 1570 + }, + { + "epoch": 0.05626085555177539, + "grad_norm": 2.3881824016571045, + "learning_rate": 0.00019963879114025926, + "loss": 1.4564, + "step": 1571 + }, + { + "epoch": 0.05629666768134367, + "grad_norm": 1.6417608261108398, + "learning_rate": 0.00019963780550070315, + "loss": 1.4733, + "step": 1572 + }, + { + "epoch": 0.05633247981091195, + "grad_norm": 2.299020767211914, + "learning_rate": 0.00019963681852064883, + "loss": 1.404, + "step": 1573 + }, + { + "epoch": 0.05636829194048024, + "grad_norm": 2.1051032543182373, + "learning_rate": 0.0001996358302001097, + "loss": 1.3025, + "step": 1574 + }, + { + "epoch": 0.056404104070048526, + "grad_norm": 2.371185064315796, + "learning_rate": 0.00019963484053909896, + "loss": 1.5089, + "step": 1575 + }, + { + "epoch": 0.05643991619961681, + "grad_norm": 2.136993408203125, + "learning_rate": 0.00019963384953762995, + "loss": 1.4968, + "step": 1576 + }, + { + "epoch": 0.05647572832918509, + "grad_norm": 1.5512198209762573, + "learning_rate": 0.00019963285719571604, + "loss": 1.2465, + "step": 1577 + }, + { + "epoch": 0.05651154045875338, + "grad_norm": 2.116680145263672, + "learning_rate": 0.00019963186351337054, + "loss": 1.3507, + "step": 1578 + }, + { + "epoch": 0.056547352588321664, + "grad_norm": 1.8616209030151367, + "learning_rate": 0.00019963086849060684, + "loss": 1.3857, + "step": 1579 + }, + { + "epoch": 0.05658316471788995, + "grad_norm": 2.1783945560455322, + "learning_rate": 0.0001996298721274383, + "loss": 1.5243, + "step": 1580 + }, + { + "epoch": 0.05661897684745824, + "grad_norm": 2.161471128463745, + "learning_rate": 0.00019962887442387834, + "loss": 1.4471, + "step": 1581 + }, + { + "epoch": 0.05665478897702652, + "grad_norm": 2.7426064014434814, + "learning_rate": 0.0001996278753799404, + "loss": 1.4933, + "step": 1582 + }, + { + "epoch": 0.0566906011065948, + "grad_norm": 1.790327548980713, + "learning_rate": 0.00019962687499563793, + "loss": 1.434, + "step": 1583 + }, + { + "epoch": 0.056726413236163085, + "grad_norm": 2.1728994846343994, + "learning_rate": 0.00019962587327098435, + "loss": 1.4164, + "step": 1584 + }, + { + "epoch": 0.056762225365731375, + "grad_norm": 2.193121910095215, + "learning_rate": 0.00019962487020599315, + "loss": 1.0878, + "step": 1585 + }, + { + "epoch": 0.05679803749529966, + "grad_norm": 2.0306010246276855, + "learning_rate": 0.00019962386580067782, + "loss": 1.3304, + "step": 1586 + }, + { + "epoch": 0.05683384962486794, + "grad_norm": 1.6169382333755493, + "learning_rate": 0.00019962286005505188, + "loss": 1.5178, + "step": 1587 + }, + { + "epoch": 0.05686966175443623, + "grad_norm": 1.703232765197754, + "learning_rate": 0.00019962185296912887, + "loss": 1.4524, + "step": 1588 + }, + { + "epoch": 0.056905473884004514, + "grad_norm": 1.681854486465454, + "learning_rate": 0.00019962084454292235, + "loss": 1.3862, + "step": 1589 + }, + { + "epoch": 0.056941286013572796, + "grad_norm": 2.4113099575042725, + "learning_rate": 0.00019961983477644583, + "loss": 1.5046, + "step": 1590 + }, + { + "epoch": 0.05697709814314108, + "grad_norm": 1.7152905464172363, + "learning_rate": 0.00019961882366971296, + "loss": 1.2981, + "step": 1591 + }, + { + "epoch": 0.05701291027270937, + "grad_norm": 1.8014583587646484, + "learning_rate": 0.00019961781122273734, + "loss": 1.5353, + "step": 1592 + }, + { + "epoch": 0.05704872240227765, + "grad_norm": 3.599487066268921, + "learning_rate": 0.00019961679743553252, + "loss": 1.4818, + "step": 1593 + }, + { + "epoch": 0.057084534531845935, + "grad_norm": 1.7749799489974976, + "learning_rate": 0.0001996157823081122, + "loss": 1.4132, + "step": 1594 + }, + { + "epoch": 0.057120346661414224, + "grad_norm": 1.7546943426132202, + "learning_rate": 0.00019961476584049004, + "loss": 1.3643, + "step": 1595 + }, + { + "epoch": 0.05715615879098251, + "grad_norm": 1.7761893272399902, + "learning_rate": 0.00019961374803267968, + "loss": 1.3766, + "step": 1596 + }, + { + "epoch": 0.05719197092055079, + "grad_norm": 1.617936134338379, + "learning_rate": 0.00019961272888469484, + "loss": 1.4874, + "step": 1597 + }, + { + "epoch": 0.05722778305011907, + "grad_norm": 2.464980125427246, + "learning_rate": 0.00019961170839654922, + "loss": 1.6781, + "step": 1598 + }, + { + "epoch": 0.05726359517968736, + "grad_norm": 1.8678034543991089, + "learning_rate": 0.00019961068656825656, + "loss": 1.3305, + "step": 1599 + }, + { + "epoch": 0.057299407309255646, + "grad_norm": 1.8178790807724, + "learning_rate": 0.0001996096633998306, + "loss": 1.5522, + "step": 1600 + }, + { + "epoch": 0.05733521943882393, + "grad_norm": 1.582253098487854, + "learning_rate": 0.0001996086388912851, + "loss": 1.3154, + "step": 1601 + }, + { + "epoch": 0.05737103156839221, + "grad_norm": 2.2057530879974365, + "learning_rate": 0.00019960761304263386, + "loss": 1.5848, + "step": 1602 + }, + { + "epoch": 0.0574068436979605, + "grad_norm": 2.922191858291626, + "learning_rate": 0.0001996065858538907, + "loss": 1.457, + "step": 1603 + }, + { + "epoch": 0.057442655827528784, + "grad_norm": 1.799712061882019, + "learning_rate": 0.00019960555732506937, + "loss": 1.286, + "step": 1604 + }, + { + "epoch": 0.05747846795709707, + "grad_norm": 1.6574140787124634, + "learning_rate": 0.00019960452745618375, + "loss": 1.2488, + "step": 1605 + }, + { + "epoch": 0.057514280086665356, + "grad_norm": 1.3182849884033203, + "learning_rate": 0.0001996034962472477, + "loss": 1.436, + "step": 1606 + }, + { + "epoch": 0.05755009221623364, + "grad_norm": 1.9132752418518066, + "learning_rate": 0.0001996024636982751, + "loss": 1.3262, + "step": 1607 + }, + { + "epoch": 0.05758590434580192, + "grad_norm": 1.718218207359314, + "learning_rate": 0.0001996014298092798, + "loss": 1.4048, + "step": 1608 + }, + { + "epoch": 0.057621716475370205, + "grad_norm": 1.6665230989456177, + "learning_rate": 0.00019960039458027576, + "loss": 1.3081, + "step": 1609 + }, + { + "epoch": 0.057657528604938495, + "grad_norm": 1.568872094154358, + "learning_rate": 0.00019959935801127686, + "loss": 1.4126, + "step": 1610 + }, + { + "epoch": 0.05769334073450678, + "grad_norm": 2.119325876235962, + "learning_rate": 0.00019959832010229712, + "loss": 1.2247, + "step": 1611 + }, + { + "epoch": 0.05772915286407506, + "grad_norm": 1.6527888774871826, + "learning_rate": 0.0001995972808533504, + "loss": 1.5354, + "step": 1612 + }, + { + "epoch": 0.05776496499364335, + "grad_norm": 1.9083839654922485, + "learning_rate": 0.00019959624026445077, + "loss": 1.2521, + "step": 1613 + }, + { + "epoch": 0.05780077712321163, + "grad_norm": 1.9666969776153564, + "learning_rate": 0.0001995951983356122, + "loss": 1.4249, + "step": 1614 + }, + { + "epoch": 0.057836589252779916, + "grad_norm": 2.463040828704834, + "learning_rate": 0.0001995941550668487, + "loss": 1.4069, + "step": 1615 + }, + { + "epoch": 0.0578724013823482, + "grad_norm": 2.7806801795959473, + "learning_rate": 0.00019959311045817432, + "loss": 1.4268, + "step": 1616 + }, + { + "epoch": 0.05790821351191649, + "grad_norm": 1.570820927619934, + "learning_rate": 0.00019959206450960307, + "loss": 1.3131, + "step": 1617 + }, + { + "epoch": 0.05794402564148477, + "grad_norm": 2.785691022872925, + "learning_rate": 0.0001995910172211491, + "loss": 1.5463, + "step": 1618 + }, + { + "epoch": 0.057979837771053054, + "grad_norm": 1.6891225576400757, + "learning_rate": 0.0001995899685928264, + "loss": 1.4897, + "step": 1619 + }, + { + "epoch": 0.058015649900621344, + "grad_norm": 2.355483055114746, + "learning_rate": 0.0001995889186246492, + "loss": 1.2146, + "step": 1620 + }, + { + "epoch": 0.05805146203018963, + "grad_norm": 2.4746861457824707, + "learning_rate": 0.0001995878673166315, + "loss": 1.267, + "step": 1621 + }, + { + "epoch": 0.05808727415975791, + "grad_norm": 2.928194999694824, + "learning_rate": 0.00019958681466878756, + "loss": 1.4772, + "step": 1622 + }, + { + "epoch": 0.05812308628932619, + "grad_norm": 2.5144028663635254, + "learning_rate": 0.00019958576068113145, + "loss": 1.3158, + "step": 1623 + }, + { + "epoch": 0.05815889841889448, + "grad_norm": 2.7138662338256836, + "learning_rate": 0.00019958470535367742, + "loss": 1.393, + "step": 1624 + }, + { + "epoch": 0.058194710548462765, + "grad_norm": 2.387232542037964, + "learning_rate": 0.00019958364868643958, + "loss": 1.353, + "step": 1625 + }, + { + "epoch": 0.05823052267803105, + "grad_norm": 1.8160326480865479, + "learning_rate": 0.00019958259067943225, + "loss": 1.5637, + "step": 1626 + }, + { + "epoch": 0.05826633480759933, + "grad_norm": 1.9738333225250244, + "learning_rate": 0.0001995815313326696, + "loss": 1.4303, + "step": 1627 + }, + { + "epoch": 0.05830214693716762, + "grad_norm": 2.3734872341156006, + "learning_rate": 0.0001995804706461659, + "loss": 1.3816, + "step": 1628 + }, + { + "epoch": 0.0583379590667359, + "grad_norm": 1.615475058555603, + "learning_rate": 0.0001995794086199354, + "loss": 1.2034, + "step": 1629 + }, + { + "epoch": 0.058373771196304186, + "grad_norm": 1.938766360282898, + "learning_rate": 0.00019957834525399242, + "loss": 1.4121, + "step": 1630 + }, + { + "epoch": 0.058409583325872476, + "grad_norm": 1.9363996982574463, + "learning_rate": 0.00019957728054835125, + "loss": 1.3915, + "step": 1631 + }, + { + "epoch": 0.05844539545544076, + "grad_norm": 2.381605386734009, + "learning_rate": 0.0001995762145030262, + "loss": 1.4157, + "step": 1632 + }, + { + "epoch": 0.05848120758500904, + "grad_norm": 4.692550182342529, + "learning_rate": 0.00019957514711803164, + "loss": 1.307, + "step": 1633 + }, + { + "epoch": 0.058517019714577324, + "grad_norm": 2.1087148189544678, + "learning_rate": 0.00019957407839338193, + "loss": 1.3633, + "step": 1634 + }, + { + "epoch": 0.058552831844145614, + "grad_norm": 2.8628838062286377, + "learning_rate": 0.00019957300832909144, + "loss": 1.4594, + "step": 1635 + }, + { + "epoch": 0.0585886439737139, + "grad_norm": 1.9057936668395996, + "learning_rate": 0.00019957193692517455, + "loss": 1.3708, + "step": 1636 + }, + { + "epoch": 0.05862445610328218, + "grad_norm": 1.8152791261672974, + "learning_rate": 0.00019957086418164567, + "loss": 1.3022, + "step": 1637 + }, + { + "epoch": 0.05866026823285047, + "grad_norm": 2.8561363220214844, + "learning_rate": 0.00019956979009851927, + "loss": 1.3652, + "step": 1638 + }, + { + "epoch": 0.05869608036241875, + "grad_norm": 2.2516229152679443, + "learning_rate": 0.00019956871467580978, + "loss": 1.4476, + "step": 1639 + }, + { + "epoch": 0.058731892491987035, + "grad_norm": 1.5324504375457764, + "learning_rate": 0.00019956763791353165, + "loss": 1.2212, + "step": 1640 + }, + { + "epoch": 0.05876770462155532, + "grad_norm": 1.5676912069320679, + "learning_rate": 0.00019956655981169942, + "loss": 1.3289, + "step": 1641 + }, + { + "epoch": 0.05880351675112361, + "grad_norm": 2.588806629180908, + "learning_rate": 0.00019956548037032752, + "loss": 1.5114, + "step": 1642 + }, + { + "epoch": 0.05883932888069189, + "grad_norm": 1.613540768623352, + "learning_rate": 0.00019956439958943053, + "loss": 1.4308, + "step": 1643 + }, + { + "epoch": 0.058875141010260174, + "grad_norm": 2.1007156372070312, + "learning_rate": 0.00019956331746902298, + "loss": 1.4119, + "step": 1644 + }, + { + "epoch": 0.05891095313982846, + "grad_norm": 2.333542823791504, + "learning_rate": 0.00019956223400911943, + "loss": 1.3637, + "step": 1645 + }, + { + "epoch": 0.058946765269396746, + "grad_norm": 2.7426371574401855, + "learning_rate": 0.00019956114920973442, + "loss": 1.5109, + "step": 1646 + }, + { + "epoch": 0.05898257739896503, + "grad_norm": 2.834852457046509, + "learning_rate": 0.00019956006307088258, + "loss": 1.5092, + "step": 1647 + }, + { + "epoch": 0.05901838952853331, + "grad_norm": 2.003948926925659, + "learning_rate": 0.00019955897559257853, + "loss": 1.4041, + "step": 1648 + }, + { + "epoch": 0.0590542016581016, + "grad_norm": 3.2072465419769287, + "learning_rate": 0.00019955788677483686, + "loss": 1.4767, + "step": 1649 + }, + { + "epoch": 0.059090013787669884, + "grad_norm": 2.1094000339508057, + "learning_rate": 0.00019955679661767226, + "loss": 1.1447, + "step": 1650 + }, + { + "epoch": 0.05912582591723817, + "grad_norm": 2.902888536453247, + "learning_rate": 0.0001995557051210994, + "loss": 1.3931, + "step": 1651 + }, + { + "epoch": 0.05916163804680645, + "grad_norm": 2.082526683807373, + "learning_rate": 0.0001995546122851329, + "loss": 1.4949, + "step": 1652 + }, + { + "epoch": 0.05919745017637474, + "grad_norm": 2.3193819522857666, + "learning_rate": 0.00019955351810978754, + "loss": 1.5591, + "step": 1653 + }, + { + "epoch": 0.05923326230594302, + "grad_norm": 1.9501256942749023, + "learning_rate": 0.000199552422595078, + "loss": 1.4431, + "step": 1654 + }, + { + "epoch": 0.059269074435511306, + "grad_norm": 2.163543939590454, + "learning_rate": 0.000199551325741019, + "loss": 1.3501, + "step": 1655 + }, + { + "epoch": 0.059304886565079595, + "grad_norm": 2.4549899101257324, + "learning_rate": 0.00019955022754762535, + "loss": 1.5294, + "step": 1656 + }, + { + "epoch": 0.05934069869464788, + "grad_norm": 2.2489686012268066, + "learning_rate": 0.0001995491280149118, + "loss": 1.5501, + "step": 1657 + }, + { + "epoch": 0.05937651082421616, + "grad_norm": 1.50262451171875, + "learning_rate": 0.00019954802714289315, + "loss": 1.2383, + "step": 1658 + }, + { + "epoch": 0.059412322953784444, + "grad_norm": 1.6799941062927246, + "learning_rate": 0.0001995469249315842, + "loss": 1.2692, + "step": 1659 + }, + { + "epoch": 0.059448135083352734, + "grad_norm": 1.9055615663528442, + "learning_rate": 0.00019954582138099978, + "loss": 1.3707, + "step": 1660 + }, + { + "epoch": 0.059483947212921016, + "grad_norm": 1.6121039390563965, + "learning_rate": 0.00019954471649115475, + "loss": 1.309, + "step": 1661 + }, + { + "epoch": 0.0595197593424893, + "grad_norm": 1.9664522409439087, + "learning_rate": 0.00019954361026206394, + "loss": 1.2735, + "step": 1662 + }, + { + "epoch": 0.05955557147205759, + "grad_norm": 1.8227052688598633, + "learning_rate": 0.00019954250269374227, + "loss": 1.1805, + "step": 1663 + }, + { + "epoch": 0.05959138360162587, + "grad_norm": 3.8803529739379883, + "learning_rate": 0.0001995413937862046, + "loss": 1.4904, + "step": 1664 + }, + { + "epoch": 0.059627195731194155, + "grad_norm": 1.8025954961776733, + "learning_rate": 0.0001995402835394659, + "loss": 1.3367, + "step": 1665 + }, + { + "epoch": 0.05966300786076244, + "grad_norm": 3.405710220336914, + "learning_rate": 0.00019953917195354105, + "loss": 1.2965, + "step": 1666 + }, + { + "epoch": 0.05969881999033073, + "grad_norm": 1.7282323837280273, + "learning_rate": 0.00019953805902844508, + "loss": 1.4377, + "step": 1667 + }, + { + "epoch": 0.05973463211989901, + "grad_norm": 1.8791756629943848, + "learning_rate": 0.00019953694476419293, + "loss": 1.3791, + "step": 1668 + }, + { + "epoch": 0.05977044424946729, + "grad_norm": 2.240302801132202, + "learning_rate": 0.00019953582916079957, + "loss": 1.2613, + "step": 1669 + }, + { + "epoch": 0.05980625637903558, + "grad_norm": 1.98931086063385, + "learning_rate": 0.00019953471221827998, + "loss": 1.3293, + "step": 1670 + }, + { + "epoch": 0.059842068508603866, + "grad_norm": 1.7607539892196655, + "learning_rate": 0.00019953359393664927, + "loss": 1.2593, + "step": 1671 + }, + { + "epoch": 0.05987788063817215, + "grad_norm": 2.1042211055755615, + "learning_rate": 0.0001995324743159224, + "loss": 1.2322, + "step": 1672 + }, + { + "epoch": 0.05991369276774043, + "grad_norm": 2.5219650268554688, + "learning_rate": 0.00019953135335611452, + "loss": 1.3647, + "step": 1673 + }, + { + "epoch": 0.05994950489730872, + "grad_norm": 1.9913355112075806, + "learning_rate": 0.00019953023105724068, + "loss": 1.4718, + "step": 1674 + }, + { + "epoch": 0.059985317026877004, + "grad_norm": 2.610821008682251, + "learning_rate": 0.00019952910741931592, + "loss": 1.2751, + "step": 1675 + }, + { + "epoch": 0.06002112915644529, + "grad_norm": 3.024325132369995, + "learning_rate": 0.0001995279824423554, + "loss": 1.457, + "step": 1676 + }, + { + "epoch": 0.06005694128601357, + "grad_norm": 2.664989709854126, + "learning_rate": 0.0001995268561263743, + "loss": 1.4608, + "step": 1677 + }, + { + "epoch": 0.06009275341558186, + "grad_norm": 1.955292820930481, + "learning_rate": 0.00019952572847138772, + "loss": 1.3915, + "step": 1678 + }, + { + "epoch": 0.06012856554515014, + "grad_norm": 3.129894971847534, + "learning_rate": 0.00019952459947741082, + "loss": 1.4683, + "step": 1679 + }, + { + "epoch": 0.060164377674718425, + "grad_norm": 3.5218544006347656, + "learning_rate": 0.00019952346914445883, + "loss": 1.5732, + "step": 1680 + }, + { + "epoch": 0.060200189804286715, + "grad_norm": 2.427905559539795, + "learning_rate": 0.00019952233747254691, + "loss": 1.5065, + "step": 1681 + }, + { + "epoch": 0.060236001933855, + "grad_norm": 1.9392086267471313, + "learning_rate": 0.00019952120446169037, + "loss": 1.3542, + "step": 1682 + }, + { + "epoch": 0.06027181406342328, + "grad_norm": 2.4843955039978027, + "learning_rate": 0.00019952007011190433, + "loss": 1.5314, + "step": 1683 + }, + { + "epoch": 0.06030762619299156, + "grad_norm": 1.439486026763916, + "learning_rate": 0.00019951893442320416, + "loss": 1.2593, + "step": 1684 + }, + { + "epoch": 0.06034343832255985, + "grad_norm": 1.392245888710022, + "learning_rate": 0.0001995177973956051, + "loss": 1.3899, + "step": 1685 + }, + { + "epoch": 0.060379250452128136, + "grad_norm": 2.7904052734375, + "learning_rate": 0.00019951665902912243, + "loss": 1.5495, + "step": 1686 + }, + { + "epoch": 0.06041506258169642, + "grad_norm": 2.5147457122802734, + "learning_rate": 0.00019951551932377148, + "loss": 1.4241, + "step": 1687 + }, + { + "epoch": 0.06045087471126471, + "grad_norm": 2.3864707946777344, + "learning_rate": 0.00019951437827956758, + "loss": 1.2438, + "step": 1688 + }, + { + "epoch": 0.06048668684083299, + "grad_norm": 1.7693225145339966, + "learning_rate": 0.0001995132358965261, + "loss": 1.2994, + "step": 1689 + }, + { + "epoch": 0.060522498970401274, + "grad_norm": 3.1155409812927246, + "learning_rate": 0.00019951209217466238, + "loss": 1.4702, + "step": 1690 + }, + { + "epoch": 0.06055831109996956, + "grad_norm": 2.318721294403076, + "learning_rate": 0.0001995109471139918, + "loss": 1.2983, + "step": 1691 + }, + { + "epoch": 0.06059412322953785, + "grad_norm": 2.7146527767181396, + "learning_rate": 0.0001995098007145298, + "loss": 1.4681, + "step": 1692 + }, + { + "epoch": 0.06062993535910613, + "grad_norm": 1.726438283920288, + "learning_rate": 0.00019950865297629184, + "loss": 1.4081, + "step": 1693 + }, + { + "epoch": 0.06066574748867441, + "grad_norm": 3.862142324447632, + "learning_rate": 0.00019950750389929328, + "loss": 1.5838, + "step": 1694 + }, + { + "epoch": 0.0607015596182427, + "grad_norm": 2.0526976585388184, + "learning_rate": 0.0001995063534835496, + "loss": 1.496, + "step": 1695 + }, + { + "epoch": 0.060737371747810985, + "grad_norm": 2.6479671001434326, + "learning_rate": 0.0001995052017290763, + "loss": 1.3878, + "step": 1696 + }, + { + "epoch": 0.06077318387737927, + "grad_norm": 2.8062009811401367, + "learning_rate": 0.00019950404863588883, + "loss": 1.3466, + "step": 1697 + }, + { + "epoch": 0.06080899600694755, + "grad_norm": 2.3253183364868164, + "learning_rate": 0.00019950289420400278, + "loss": 1.6695, + "step": 1698 + }, + { + "epoch": 0.06084480813651584, + "grad_norm": 2.0314323902130127, + "learning_rate": 0.00019950173843343364, + "loss": 1.4798, + "step": 1699 + }, + { + "epoch": 0.06088062026608412, + "grad_norm": 1.6757292747497559, + "learning_rate": 0.00019950058132419692, + "loss": 1.2681, + "step": 1700 + }, + { + "epoch": 0.060916432395652406, + "grad_norm": 3.5771052837371826, + "learning_rate": 0.00019949942287630825, + "loss": 1.4612, + "step": 1701 + }, + { + "epoch": 0.06095224452522069, + "grad_norm": 2.3594746589660645, + "learning_rate": 0.00019949826308978316, + "loss": 1.317, + "step": 1702 + }, + { + "epoch": 0.06098805665478898, + "grad_norm": 2.568232297897339, + "learning_rate": 0.0001994971019646373, + "loss": 1.3415, + "step": 1703 + }, + { + "epoch": 0.06102386878435726, + "grad_norm": 3.044278144836426, + "learning_rate": 0.0001994959395008863, + "loss": 1.3984, + "step": 1704 + }, + { + "epoch": 0.061059680913925544, + "grad_norm": 2.20088267326355, + "learning_rate": 0.00019949477569854575, + "loss": 1.3466, + "step": 1705 + }, + { + "epoch": 0.061095493043493834, + "grad_norm": 4.025981426239014, + "learning_rate": 0.00019949361055763133, + "loss": 1.4401, + "step": 1706 + }, + { + "epoch": 0.06113130517306212, + "grad_norm": 2.8041889667510986, + "learning_rate": 0.00019949244407815875, + "loss": 1.2805, + "step": 1707 + }, + { + "epoch": 0.0611671173026304, + "grad_norm": 1.372471570968628, + "learning_rate": 0.00019949127626014363, + "loss": 1.1324, + "step": 1708 + }, + { + "epoch": 0.06120292943219868, + "grad_norm": 1.3831158876419067, + "learning_rate": 0.00019949010710360173, + "loss": 1.2349, + "step": 1709 + }, + { + "epoch": 0.06123874156176697, + "grad_norm": 3.1152708530426025, + "learning_rate": 0.0001994889366085488, + "loss": 1.609, + "step": 1710 + }, + { + "epoch": 0.061274553691335255, + "grad_norm": 1.8958512544631958, + "learning_rate": 0.00019948776477500053, + "loss": 1.4437, + "step": 1711 + }, + { + "epoch": 0.06131036582090354, + "grad_norm": 2.2268476486206055, + "learning_rate": 0.0001994865916029727, + "loss": 1.3688, + "step": 1712 + }, + { + "epoch": 0.06134617795047183, + "grad_norm": 2.038757801055908, + "learning_rate": 0.00019948541709248116, + "loss": 1.3829, + "step": 1713 + }, + { + "epoch": 0.06138199008004011, + "grad_norm": 1.7927558422088623, + "learning_rate": 0.00019948424124354163, + "loss": 1.2832, + "step": 1714 + }, + { + "epoch": 0.061417802209608394, + "grad_norm": 2.741076707839966, + "learning_rate": 0.00019948306405616996, + "loss": 1.561, + "step": 1715 + }, + { + "epoch": 0.061453614339176676, + "grad_norm": 1.6701358556747437, + "learning_rate": 0.00019948188553038198, + "loss": 1.5272, + "step": 1716 + }, + { + "epoch": 0.061489426468744966, + "grad_norm": 1.5481854677200317, + "learning_rate": 0.0001994807056661936, + "loss": 1.2778, + "step": 1717 + }, + { + "epoch": 0.06152523859831325, + "grad_norm": 1.6461554765701294, + "learning_rate": 0.00019947952446362058, + "loss": 1.4263, + "step": 1718 + }, + { + "epoch": 0.06156105072788153, + "grad_norm": 3.7510669231414795, + "learning_rate": 0.00019947834192267892, + "loss": 1.3581, + "step": 1719 + }, + { + "epoch": 0.06159686285744982, + "grad_norm": 2.7205379009246826, + "learning_rate": 0.00019947715804338447, + "loss": 1.4558, + "step": 1720 + }, + { + "epoch": 0.061632674987018105, + "grad_norm": 3.2680747509002686, + "learning_rate": 0.00019947597282575318, + "loss": 1.179, + "step": 1721 + }, + { + "epoch": 0.06166848711658639, + "grad_norm": 2.0326108932495117, + "learning_rate": 0.00019947478626980097, + "loss": 1.4138, + "step": 1722 + }, + { + "epoch": 0.06170429924615467, + "grad_norm": 2.2116291522979736, + "learning_rate": 0.00019947359837554384, + "loss": 1.3844, + "step": 1723 + }, + { + "epoch": 0.06174011137572296, + "grad_norm": 2.861783742904663, + "learning_rate": 0.00019947240914299776, + "loss": 1.3158, + "step": 1724 + }, + { + "epoch": 0.06177592350529124, + "grad_norm": 3.434288740158081, + "learning_rate": 0.00019947121857217875, + "loss": 1.5056, + "step": 1725 + }, + { + "epoch": 0.061811735634859526, + "grad_norm": 3.402985095977783, + "learning_rate": 0.00019947002666310276, + "loss": 1.3463, + "step": 1726 + }, + { + "epoch": 0.06184754776442781, + "grad_norm": 2.4446206092834473, + "learning_rate": 0.00019946883341578588, + "loss": 1.3569, + "step": 1727 + }, + { + "epoch": 0.0618833598939961, + "grad_norm": 1.9790385961532593, + "learning_rate": 0.00019946763883024415, + "loss": 1.3018, + "step": 1728 + }, + { + "epoch": 0.06191917202356438, + "grad_norm": 3.1593196392059326, + "learning_rate": 0.00019946644290649368, + "loss": 1.2847, + "step": 1729 + }, + { + "epoch": 0.061954984153132664, + "grad_norm": 2.6451570987701416, + "learning_rate": 0.00019946524564455048, + "loss": 1.5531, + "step": 1730 + }, + { + "epoch": 0.061990796282700954, + "grad_norm": 2.7912583351135254, + "learning_rate": 0.00019946404704443072, + "loss": 1.3767, + "step": 1731 + }, + { + "epoch": 0.06202660841226924, + "grad_norm": 2.196303367614746, + "learning_rate": 0.00019946284710615052, + "loss": 1.2287, + "step": 1732 + }, + { + "epoch": 0.06206242054183752, + "grad_norm": 2.200678586959839, + "learning_rate": 0.00019946164582972594, + "loss": 1.3445, + "step": 1733 + }, + { + "epoch": 0.0620982326714058, + "grad_norm": 1.9862322807312012, + "learning_rate": 0.0001994604432151733, + "loss": 1.2102, + "step": 1734 + }, + { + "epoch": 0.06213404480097409, + "grad_norm": 1.857621192932129, + "learning_rate": 0.00019945923926250865, + "loss": 1.5108, + "step": 1735 + }, + { + "epoch": 0.062169856930542375, + "grad_norm": 1.641928791999817, + "learning_rate": 0.00019945803397174823, + "loss": 1.269, + "step": 1736 + }, + { + "epoch": 0.06220566906011066, + "grad_norm": 2.928422689437866, + "learning_rate": 0.00019945682734290825, + "loss": 1.455, + "step": 1737 + }, + { + "epoch": 0.06224148118967895, + "grad_norm": 1.6951353549957275, + "learning_rate": 0.00019945561937600496, + "loss": 1.2616, + "step": 1738 + }, + { + "epoch": 0.06227729331924723, + "grad_norm": 3.561145067214966, + "learning_rate": 0.0001994544100710546, + "loss": 1.3729, + "step": 1739 + }, + { + "epoch": 0.06231310544881551, + "grad_norm": 2.1473782062530518, + "learning_rate": 0.0001994531994280734, + "loss": 1.4473, + "step": 1740 + }, + { + "epoch": 0.062348917578383796, + "grad_norm": 1.962131381034851, + "learning_rate": 0.00019945198744707776, + "loss": 1.4704, + "step": 1741 + }, + { + "epoch": 0.062384729707952086, + "grad_norm": 3.0134944915771484, + "learning_rate": 0.00019945077412808387, + "loss": 1.4653, + "step": 1742 + }, + { + "epoch": 0.06242054183752037, + "grad_norm": 4.290970802307129, + "learning_rate": 0.0001994495594711081, + "loss": 1.2092, + "step": 1743 + }, + { + "epoch": 0.06245635396708865, + "grad_norm": 1.7935841083526611, + "learning_rate": 0.0001994483434761668, + "loss": 1.2245, + "step": 1744 + }, + { + "epoch": 0.06249216609665694, + "grad_norm": 1.4945508241653442, + "learning_rate": 0.0001994471261432763, + "loss": 1.4163, + "step": 1745 + }, + { + "epoch": 0.06252797822622522, + "grad_norm": 1.590043067932129, + "learning_rate": 0.00019944590747245298, + "loss": 1.2712, + "step": 1746 + }, + { + "epoch": 0.06256379035579351, + "grad_norm": 1.966668963432312, + "learning_rate": 0.0001994446874637133, + "loss": 1.3772, + "step": 1747 + }, + { + "epoch": 0.0625996024853618, + "grad_norm": 2.8510682582855225, + "learning_rate": 0.00019944346611707356, + "loss": 1.2144, + "step": 1748 + }, + { + "epoch": 0.06263541461493008, + "grad_norm": 2.2802271842956543, + "learning_rate": 0.0001994422434325503, + "loss": 1.3733, + "step": 1749 + }, + { + "epoch": 0.06267122674449836, + "grad_norm": 2.3561666011810303, + "learning_rate": 0.00019944101941015994, + "loss": 1.2756, + "step": 1750 + }, + { + "epoch": 0.06270703887406665, + "grad_norm": 3.2297258377075195, + "learning_rate": 0.0001994397940499189, + "loss": 1.3327, + "step": 1751 + }, + { + "epoch": 0.06274285100363493, + "grad_norm": 2.0564303398132324, + "learning_rate": 0.00019943856735184373, + "loss": 1.4281, + "step": 1752 + }, + { + "epoch": 0.06277866313320321, + "grad_norm": 3.5143702030181885, + "learning_rate": 0.00019943733931595086, + "loss": 1.2871, + "step": 1753 + }, + { + "epoch": 0.06281447526277151, + "grad_norm": 1.860155463218689, + "learning_rate": 0.0001994361099422569, + "loss": 1.3741, + "step": 1754 + }, + { + "epoch": 0.06285028739233979, + "grad_norm": 1.3469141721725464, + "learning_rate": 0.00019943487923077831, + "loss": 1.1843, + "step": 1755 + }, + { + "epoch": 0.06288609952190807, + "grad_norm": 3.2248470783233643, + "learning_rate": 0.0001994336471815317, + "loss": 1.3702, + "step": 1756 + }, + { + "epoch": 0.06292191165147636, + "grad_norm": 2.6396191120147705, + "learning_rate": 0.00019943241379453364, + "loss": 1.306, + "step": 1757 + }, + { + "epoch": 0.06295772378104464, + "grad_norm": 2.068875312805176, + "learning_rate": 0.00019943117906980068, + "loss": 1.2079, + "step": 1758 + }, + { + "epoch": 0.06299353591061292, + "grad_norm": 1.4099628925323486, + "learning_rate": 0.00019942994300734947, + "loss": 1.3903, + "step": 1759 + }, + { + "epoch": 0.0630293480401812, + "grad_norm": 2.5286054611206055, + "learning_rate": 0.00019942870560719664, + "loss": 1.3587, + "step": 1760 + }, + { + "epoch": 0.06306516016974949, + "grad_norm": 2.9809410572052, + "learning_rate": 0.00019942746686935883, + "loss": 1.3102, + "step": 1761 + }, + { + "epoch": 0.06310097229931778, + "grad_norm": 3.1360323429107666, + "learning_rate": 0.0001994262267938527, + "loss": 1.3395, + "step": 1762 + }, + { + "epoch": 0.06313678442888607, + "grad_norm": 1.4061132669448853, + "learning_rate": 0.00019942498538069495, + "loss": 1.239, + "step": 1763 + }, + { + "epoch": 0.06317259655845435, + "grad_norm": 2.6097867488861084, + "learning_rate": 0.00019942374262990224, + "loss": 1.3259, + "step": 1764 + }, + { + "epoch": 0.06320840868802263, + "grad_norm": 2.261996269226074, + "learning_rate": 0.00019942249854149136, + "loss": 1.3192, + "step": 1765 + }, + { + "epoch": 0.06324422081759092, + "grad_norm": 3.006229877471924, + "learning_rate": 0.000199421253115479, + "loss": 1.4596, + "step": 1766 + }, + { + "epoch": 0.0632800329471592, + "grad_norm": 1.9742592573165894, + "learning_rate": 0.0001994200063518819, + "loss": 1.3949, + "step": 1767 + }, + { + "epoch": 0.06331584507672748, + "grad_norm": 1.9795572757720947, + "learning_rate": 0.0001994187582507169, + "loss": 1.3702, + "step": 1768 + }, + { + "epoch": 0.06335165720629578, + "grad_norm": 2.826500177383423, + "learning_rate": 0.0001994175088120007, + "loss": 1.2052, + "step": 1769 + }, + { + "epoch": 0.06338746933586406, + "grad_norm": 3.846290111541748, + "learning_rate": 0.00019941625803575019, + "loss": 1.2505, + "step": 1770 + }, + { + "epoch": 0.06342328146543234, + "grad_norm": 1.8145561218261719, + "learning_rate": 0.00019941500592198216, + "loss": 1.2854, + "step": 1771 + }, + { + "epoch": 0.06345909359500063, + "grad_norm": 1.7956809997558594, + "learning_rate": 0.00019941375247071346, + "loss": 1.2571, + "step": 1772 + }, + { + "epoch": 0.06349490572456891, + "grad_norm": 1.5560632944107056, + "learning_rate": 0.00019941249768196093, + "loss": 1.2756, + "step": 1773 + }, + { + "epoch": 0.06353071785413719, + "grad_norm": 2.089228630065918, + "learning_rate": 0.0001994112415557415, + "loss": 1.3952, + "step": 1774 + }, + { + "epoch": 0.06356652998370547, + "grad_norm": 2.7770891189575195, + "learning_rate": 0.00019940998409207205, + "loss": 1.4608, + "step": 1775 + }, + { + "epoch": 0.06360234211327377, + "grad_norm": 1.7245399951934814, + "learning_rate": 0.00019940872529096947, + "loss": 1.0825, + "step": 1776 + }, + { + "epoch": 0.06363815424284205, + "grad_norm": 1.946861982345581, + "learning_rate": 0.00019940746515245073, + "loss": 1.3932, + "step": 1777 + }, + { + "epoch": 0.06367396637241034, + "grad_norm": 2.2084126472473145, + "learning_rate": 0.0001994062036765328, + "loss": 1.4542, + "step": 1778 + }, + { + "epoch": 0.06370977850197862, + "grad_norm": 3.416480302810669, + "learning_rate": 0.0001994049408632326, + "loss": 1.4047, + "step": 1779 + }, + { + "epoch": 0.0637455906315469, + "grad_norm": 2.729379415512085, + "learning_rate": 0.00019940367671256716, + "loss": 1.3744, + "step": 1780 + }, + { + "epoch": 0.06378140276111519, + "grad_norm": 1.4750922918319702, + "learning_rate": 0.00019940241122455346, + "loss": 1.4502, + "step": 1781 + }, + { + "epoch": 0.06381721489068347, + "grad_norm": 1.9762709140777588, + "learning_rate": 0.00019940114439920853, + "loss": 1.4151, + "step": 1782 + }, + { + "epoch": 0.06385302702025177, + "grad_norm": 2.103109359741211, + "learning_rate": 0.00019939987623654944, + "loss": 1.3623, + "step": 1783 + }, + { + "epoch": 0.06388883914982005, + "grad_norm": 1.637137532234192, + "learning_rate": 0.0001993986067365932, + "loss": 1.2898, + "step": 1784 + }, + { + "epoch": 0.06392465127938833, + "grad_norm": 1.82806396484375, + "learning_rate": 0.00019939733589935694, + "loss": 1.3147, + "step": 1785 + }, + { + "epoch": 0.06396046340895661, + "grad_norm": 2.5866501331329346, + "learning_rate": 0.00019939606372485776, + "loss": 1.3877, + "step": 1786 + }, + { + "epoch": 0.0639962755385249, + "grad_norm": 1.6654565334320068, + "learning_rate": 0.00019939479021311273, + "loss": 1.42, + "step": 1787 + }, + { + "epoch": 0.06403208766809318, + "grad_norm": 1.6601476669311523, + "learning_rate": 0.000199393515364139, + "loss": 1.2956, + "step": 1788 + }, + { + "epoch": 0.06406789979766146, + "grad_norm": 2.9706051349639893, + "learning_rate": 0.00019939223917795373, + "loss": 1.3093, + "step": 1789 + }, + { + "epoch": 0.06410371192722976, + "grad_norm": 2.1636710166931152, + "learning_rate": 0.00019939096165457411, + "loss": 1.4575, + "step": 1790 + }, + { + "epoch": 0.06413952405679804, + "grad_norm": 1.9196995496749878, + "learning_rate": 0.00019938968279401727, + "loss": 1.4767, + "step": 1791 + }, + { + "epoch": 0.06417533618636632, + "grad_norm": 1.9088143110275269, + "learning_rate": 0.0001993884025963005, + "loss": 1.4325, + "step": 1792 + }, + { + "epoch": 0.06421114831593461, + "grad_norm": 1.8384257555007935, + "learning_rate": 0.00019938712106144096, + "loss": 1.3633, + "step": 1793 + }, + { + "epoch": 0.06424696044550289, + "grad_norm": 1.6444787979125977, + "learning_rate": 0.0001993858381894559, + "loss": 1.4382, + "step": 1794 + }, + { + "epoch": 0.06428277257507117, + "grad_norm": 1.9841870069503784, + "learning_rate": 0.00019938455398036257, + "loss": 1.5296, + "step": 1795 + }, + { + "epoch": 0.06431858470463946, + "grad_norm": 2.3778414726257324, + "learning_rate": 0.00019938326843417826, + "loss": 1.174, + "step": 1796 + }, + { + "epoch": 0.06435439683420775, + "grad_norm": 2.7771689891815186, + "learning_rate": 0.0001993819815509203, + "loss": 1.3473, + "step": 1797 + }, + { + "epoch": 0.06439020896377604, + "grad_norm": 2.6617166996002197, + "learning_rate": 0.00019938069333060593, + "loss": 1.6592, + "step": 1798 + }, + { + "epoch": 0.06442602109334432, + "grad_norm": 1.4965850114822388, + "learning_rate": 0.00019937940377325256, + "loss": 1.2715, + "step": 1799 + }, + { + "epoch": 0.0644618332229126, + "grad_norm": 2.1468217372894287, + "learning_rate": 0.0001993781128788775, + "loss": 1.2041, + "step": 1800 + }, + { + "epoch": 0.06449764535248088, + "grad_norm": 2.064967632293701, + "learning_rate": 0.00019937682064749811, + "loss": 1.397, + "step": 1801 + }, + { + "epoch": 0.06453345748204917, + "grad_norm": 2.4523086547851562, + "learning_rate": 0.0001993755270791318, + "loss": 1.433, + "step": 1802 + }, + { + "epoch": 0.06456926961161745, + "grad_norm": 2.1007041931152344, + "learning_rate": 0.00019937423217379594, + "loss": 1.2113, + "step": 1803 + }, + { + "epoch": 0.06460508174118575, + "grad_norm": 1.6223993301391602, + "learning_rate": 0.00019937293593150796, + "loss": 1.0873, + "step": 1804 + }, + { + "epoch": 0.06464089387075403, + "grad_norm": 2.1523027420043945, + "learning_rate": 0.00019937163835228534, + "loss": 1.2272, + "step": 1805 + }, + { + "epoch": 0.06467670600032231, + "grad_norm": 2.2254951000213623, + "learning_rate": 0.0001993703394361455, + "loss": 1.2048, + "step": 1806 + }, + { + "epoch": 0.0647125181298906, + "grad_norm": 2.378000259399414, + "learning_rate": 0.0001993690391831059, + "loss": 1.1492, + "step": 1807 + }, + { + "epoch": 0.06474833025945888, + "grad_norm": 3.3398659229278564, + "learning_rate": 0.00019936773759318408, + "loss": 1.3352, + "step": 1808 + }, + { + "epoch": 0.06478414238902716, + "grad_norm": 1.9013479948043823, + "learning_rate": 0.0001993664346663975, + "loss": 1.3274, + "step": 1809 + }, + { + "epoch": 0.06481995451859544, + "grad_norm": 1.6717497110366821, + "learning_rate": 0.00019936513040276371, + "loss": 1.1788, + "step": 1810 + }, + { + "epoch": 0.06485576664816373, + "grad_norm": 2.7759976387023926, + "learning_rate": 0.00019936382480230028, + "loss": 1.265, + "step": 1811 + }, + { + "epoch": 0.06489157877773202, + "grad_norm": 2.438318967819214, + "learning_rate": 0.00019936251786502478, + "loss": 1.1034, + "step": 1812 + }, + { + "epoch": 0.0649273909073003, + "grad_norm": 2.7479939460754395, + "learning_rate": 0.00019936120959095473, + "loss": 1.1831, + "step": 1813 + }, + { + "epoch": 0.06496320303686859, + "grad_norm": 2.7253530025482178, + "learning_rate": 0.0001993598999801078, + "loss": 1.4349, + "step": 1814 + }, + { + "epoch": 0.06499901516643687, + "grad_norm": 2.8577640056610107, + "learning_rate": 0.00019935858903250155, + "loss": 1.3445, + "step": 1815 + }, + { + "epoch": 0.06503482729600515, + "grad_norm": 2.584819793701172, + "learning_rate": 0.00019935727674815369, + "loss": 1.2765, + "step": 1816 + }, + { + "epoch": 0.06507063942557344, + "grad_norm": 1.9658708572387695, + "learning_rate": 0.0001993559631270818, + "loss": 1.3347, + "step": 1817 + }, + { + "epoch": 0.06510645155514172, + "grad_norm": 2.665996789932251, + "learning_rate": 0.0001993546481693036, + "loss": 1.2473, + "step": 1818 + }, + { + "epoch": 0.06514226368471002, + "grad_norm": 1.8468258380889893, + "learning_rate": 0.00019935333187483676, + "loss": 1.2363, + "step": 1819 + }, + { + "epoch": 0.0651780758142783, + "grad_norm": 1.4670778512954712, + "learning_rate": 0.000199352014243699, + "loss": 1.3772, + "step": 1820 + }, + { + "epoch": 0.06521388794384658, + "grad_norm": 2.734312057495117, + "learning_rate": 0.00019935069527590805, + "loss": 1.2752, + "step": 1821 + }, + { + "epoch": 0.06524970007341487, + "grad_norm": 1.4494972229003906, + "learning_rate": 0.00019934937497148163, + "loss": 1.2663, + "step": 1822 + }, + { + "epoch": 0.06528551220298315, + "grad_norm": 1.539746642112732, + "learning_rate": 0.00019934805333043752, + "loss": 1.2921, + "step": 1823 + }, + { + "epoch": 0.06532132433255143, + "grad_norm": 2.063230514526367, + "learning_rate": 0.00019934673035279353, + "loss": 1.4273, + "step": 1824 + }, + { + "epoch": 0.06535713646211971, + "grad_norm": 1.5459074974060059, + "learning_rate": 0.00019934540603856743, + "loss": 1.2106, + "step": 1825 + }, + { + "epoch": 0.06539294859168801, + "grad_norm": 2.1695163249969482, + "learning_rate": 0.000199344080387777, + "loss": 1.4844, + "step": 1826 + }, + { + "epoch": 0.0654287607212563, + "grad_norm": 2.3677072525024414, + "learning_rate": 0.00019934275340044013, + "loss": 1.3635, + "step": 1827 + }, + { + "epoch": 0.06546457285082458, + "grad_norm": 2.6817123889923096, + "learning_rate": 0.0001993414250765747, + "loss": 1.1781, + "step": 1828 + }, + { + "epoch": 0.06550038498039286, + "grad_norm": 2.4693644046783447, + "learning_rate": 0.0001993400954161985, + "loss": 1.338, + "step": 1829 + }, + { + "epoch": 0.06553619710996114, + "grad_norm": 2.130228042602539, + "learning_rate": 0.00019933876441932943, + "loss": 1.4064, + "step": 1830 + }, + { + "epoch": 0.06557200923952942, + "grad_norm": 3.2981514930725098, + "learning_rate": 0.00019933743208598546, + "loss": 1.3394, + "step": 1831 + }, + { + "epoch": 0.06560782136909771, + "grad_norm": 1.951635718345642, + "learning_rate": 0.00019933609841618445, + "loss": 1.3546, + "step": 1832 + }, + { + "epoch": 0.065643633498666, + "grad_norm": 2.478203773498535, + "learning_rate": 0.0001993347634099444, + "loss": 1.0806, + "step": 1833 + }, + { + "epoch": 0.06567944562823429, + "grad_norm": 2.365917682647705, + "learning_rate": 0.00019933342706728323, + "loss": 1.5002, + "step": 1834 + }, + { + "epoch": 0.06571525775780257, + "grad_norm": 1.44174063205719, + "learning_rate": 0.0001993320893882189, + "loss": 0.992, + "step": 1835 + }, + { + "epoch": 0.06575106988737085, + "grad_norm": 1.56092369556427, + "learning_rate": 0.00019933075037276949, + "loss": 1.1927, + "step": 1836 + }, + { + "epoch": 0.06578688201693914, + "grad_norm": 1.741999626159668, + "learning_rate": 0.00019932941002095294, + "loss": 1.5581, + "step": 1837 + }, + { + "epoch": 0.06582269414650742, + "grad_norm": 1.8748806715011597, + "learning_rate": 0.00019932806833278726, + "loss": 1.1103, + "step": 1838 + }, + { + "epoch": 0.0658585062760757, + "grad_norm": 3.595700979232788, + "learning_rate": 0.0001993267253082906, + "loss": 1.4809, + "step": 1839 + }, + { + "epoch": 0.065894318405644, + "grad_norm": 1.471718668937683, + "learning_rate": 0.00019932538094748098, + "loss": 1.2121, + "step": 1840 + }, + { + "epoch": 0.06593013053521228, + "grad_norm": 1.2877733707427979, + "learning_rate": 0.00019932403525037642, + "loss": 1.2865, + "step": 1841 + }, + { + "epoch": 0.06596594266478056, + "grad_norm": 2.4598395824432373, + "learning_rate": 0.00019932268821699513, + "loss": 1.2647, + "step": 1842 + }, + { + "epoch": 0.06600175479434885, + "grad_norm": 2.765678882598877, + "learning_rate": 0.0001993213398473552, + "loss": 1.0358, + "step": 1843 + }, + { + "epoch": 0.06603756692391713, + "grad_norm": 2.3092706203460693, + "learning_rate": 0.00019931999014147472, + "loss": 1.4005, + "step": 1844 + }, + { + "epoch": 0.06607337905348541, + "grad_norm": 2.189741373062134, + "learning_rate": 0.0001993186390993719, + "loss": 1.429, + "step": 1845 + }, + { + "epoch": 0.0661091911830537, + "grad_norm": 1.724527359008789, + "learning_rate": 0.0001993172867210649, + "loss": 1.3467, + "step": 1846 + }, + { + "epoch": 0.06614500331262199, + "grad_norm": 3.145249128341675, + "learning_rate": 0.00019931593300657192, + "loss": 1.4602, + "step": 1847 + }, + { + "epoch": 0.06618081544219027, + "grad_norm": 2.141531467437744, + "learning_rate": 0.00019931457795591118, + "loss": 1.1014, + "step": 1848 + }, + { + "epoch": 0.06621662757175856, + "grad_norm": 2.144491195678711, + "learning_rate": 0.00019931322156910088, + "loss": 1.3191, + "step": 1849 + }, + { + "epoch": 0.06625243970132684, + "grad_norm": 1.6876111030578613, + "learning_rate": 0.0001993118638461593, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 0.06628825183089512, + "grad_norm": 2.622537612915039, + "learning_rate": 0.00019931050478710468, + "loss": 1.3732, + "step": 1851 + }, + { + "epoch": 0.0663240639604634, + "grad_norm": 3.1374261379241943, + "learning_rate": 0.00019930914439195534, + "loss": 1.4169, + "step": 1852 + }, + { + "epoch": 0.06635987609003169, + "grad_norm": 3.1396164894104004, + "learning_rate": 0.00019930778266072957, + "loss": 1.1324, + "step": 1853 + }, + { + "epoch": 0.06639568821959999, + "grad_norm": 2.316835880279541, + "learning_rate": 0.00019930641959344566, + "loss": 0.9832, + "step": 1854 + }, + { + "epoch": 0.06643150034916827, + "grad_norm": 2.0262467861175537, + "learning_rate": 0.00019930505519012197, + "loss": 1.2495, + "step": 1855 + }, + { + "epoch": 0.06646731247873655, + "grad_norm": 2.490090847015381, + "learning_rate": 0.0001993036894507769, + "loss": 1.2223, + "step": 1856 + }, + { + "epoch": 0.06650312460830483, + "grad_norm": 3.033574342727661, + "learning_rate": 0.00019930232237542873, + "loss": 1.2912, + "step": 1857 + }, + { + "epoch": 0.06653893673787312, + "grad_norm": 2.0524678230285645, + "learning_rate": 0.0001993009539640959, + "loss": 1.1771, + "step": 1858 + }, + { + "epoch": 0.0665747488674414, + "grad_norm": 1.508873701095581, + "learning_rate": 0.00019929958421679685, + "loss": 1.3, + "step": 1859 + }, + { + "epoch": 0.06661056099700968, + "grad_norm": 1.8693373203277588, + "learning_rate": 0.00019929821313354997, + "loss": 1.387, + "step": 1860 + }, + { + "epoch": 0.06664637312657797, + "grad_norm": 1.6913589239120483, + "learning_rate": 0.00019929684071437373, + "loss": 1.4242, + "step": 1861 + }, + { + "epoch": 0.06668218525614626, + "grad_norm": 1.6671918630599976, + "learning_rate": 0.00019929546695928658, + "loss": 1.3008, + "step": 1862 + }, + { + "epoch": 0.06671799738571454, + "grad_norm": 2.1088640689849854, + "learning_rate": 0.000199294091868307, + "loss": 1.2585, + "step": 1863 + }, + { + "epoch": 0.06675380951528283, + "grad_norm": 1.6261720657348633, + "learning_rate": 0.0001992927154414535, + "loss": 1.2697, + "step": 1864 + }, + { + "epoch": 0.06678962164485111, + "grad_norm": 1.3044513463974, + "learning_rate": 0.00019929133767874454, + "loss": 1.2233, + "step": 1865 + }, + { + "epoch": 0.0668254337744194, + "grad_norm": 1.7452906370162964, + "learning_rate": 0.0001992899585801988, + "loss": 1.388, + "step": 1866 + }, + { + "epoch": 0.06686124590398768, + "grad_norm": 2.365966796875, + "learning_rate": 0.0001992885781458347, + "loss": 1.333, + "step": 1867 + }, + { + "epoch": 0.06689705803355596, + "grad_norm": 1.7576044797897339, + "learning_rate": 0.0001992871963756708, + "loss": 1.183, + "step": 1868 + }, + { + "epoch": 0.06693287016312426, + "grad_norm": 1.3601731061935425, + "learning_rate": 0.00019928581326972582, + "loss": 1.3011, + "step": 1869 + }, + { + "epoch": 0.06696868229269254, + "grad_norm": 1.6038122177124023, + "learning_rate": 0.00019928442882801825, + "loss": 1.2117, + "step": 1870 + }, + { + "epoch": 0.06700449442226082, + "grad_norm": 1.6926742792129517, + "learning_rate": 0.00019928304305056677, + "loss": 1.3079, + "step": 1871 + }, + { + "epoch": 0.0670403065518291, + "grad_norm": 1.8389264345169067, + "learning_rate": 0.00019928165593739, + "loss": 1.4661, + "step": 1872 + }, + { + "epoch": 0.06707611868139739, + "grad_norm": 1.9929006099700928, + "learning_rate": 0.00019928026748850663, + "loss": 1.4411, + "step": 1873 + }, + { + "epoch": 0.06711193081096567, + "grad_norm": 1.6055303812026978, + "learning_rate": 0.00019927887770393533, + "loss": 1.3848, + "step": 1874 + }, + { + "epoch": 0.06714774294053395, + "grad_norm": 2.0053083896636963, + "learning_rate": 0.0001992774865836948, + "loss": 1.2988, + "step": 1875 + }, + { + "epoch": 0.06718355507010225, + "grad_norm": 2.2032880783081055, + "learning_rate": 0.0001992760941278037, + "loss": 1.2309, + "step": 1876 + }, + { + "epoch": 0.06721936719967053, + "grad_norm": 1.7815675735473633, + "learning_rate": 0.00019927470033628087, + "loss": 1.1323, + "step": 1877 + }, + { + "epoch": 0.06725517932923882, + "grad_norm": 2.873889684677124, + "learning_rate": 0.00019927330520914496, + "loss": 1.1899, + "step": 1878 + }, + { + "epoch": 0.0672909914588071, + "grad_norm": 2.373534917831421, + "learning_rate": 0.00019927190874641478, + "loss": 1.3652, + "step": 1879 + }, + { + "epoch": 0.06732680358837538, + "grad_norm": 1.8598734140396118, + "learning_rate": 0.00019927051094810913, + "loss": 1.4614, + "step": 1880 + }, + { + "epoch": 0.06736261571794366, + "grad_norm": 1.5694297552108765, + "learning_rate": 0.00019926911181424682, + "loss": 1.3818, + "step": 1881 + }, + { + "epoch": 0.06739842784751195, + "grad_norm": 2.1247200965881348, + "learning_rate": 0.00019926771134484662, + "loss": 1.2481, + "step": 1882 + }, + { + "epoch": 0.06743423997708024, + "grad_norm": 3.0379204750061035, + "learning_rate": 0.00019926630953992746, + "loss": 1.5188, + "step": 1883 + }, + { + "epoch": 0.06747005210664853, + "grad_norm": 3.5097203254699707, + "learning_rate": 0.00019926490639950812, + "loss": 1.3795, + "step": 1884 + }, + { + "epoch": 0.06750586423621681, + "grad_norm": 2.3128249645233154, + "learning_rate": 0.00019926350192360753, + "loss": 1.1251, + "step": 1885 + }, + { + "epoch": 0.06754167636578509, + "grad_norm": 2.567695379257202, + "learning_rate": 0.00019926209611224454, + "loss": 1.1275, + "step": 1886 + }, + { + "epoch": 0.06757748849535337, + "grad_norm": 1.6130295991897583, + "learning_rate": 0.00019926068896543807, + "loss": 1.227, + "step": 1887 + }, + { + "epoch": 0.06761330062492166, + "grad_norm": 2.1720094680786133, + "learning_rate": 0.0001992592804832071, + "loss": 1.1972, + "step": 1888 + }, + { + "epoch": 0.06764911275448994, + "grad_norm": 1.5900112390518188, + "learning_rate": 0.00019925787066557053, + "loss": 1.2269, + "step": 1889 + }, + { + "epoch": 0.06768492488405824, + "grad_norm": 1.8722728490829468, + "learning_rate": 0.00019925645951254735, + "loss": 1.2684, + "step": 1890 + }, + { + "epoch": 0.06772073701362652, + "grad_norm": 1.6864386796951294, + "learning_rate": 0.00019925504702415653, + "loss": 1.3382, + "step": 1891 + }, + { + "epoch": 0.0677565491431948, + "grad_norm": 1.4951366186141968, + "learning_rate": 0.00019925363320041708, + "loss": 1.2358, + "step": 1892 + }, + { + "epoch": 0.06779236127276309, + "grad_norm": 1.7040454149246216, + "learning_rate": 0.00019925221804134805, + "loss": 1.2712, + "step": 1893 + }, + { + "epoch": 0.06782817340233137, + "grad_norm": 2.360562801361084, + "learning_rate": 0.0001992508015469684, + "loss": 1.235, + "step": 1894 + }, + { + "epoch": 0.06786398553189965, + "grad_norm": 3.5623931884765625, + "learning_rate": 0.00019924938371729728, + "loss": 1.2961, + "step": 1895 + }, + { + "epoch": 0.06789979766146793, + "grad_norm": 2.402339458465576, + "learning_rate": 0.00019924796455235373, + "loss": 1.4394, + "step": 1896 + }, + { + "epoch": 0.06793560979103623, + "grad_norm": 2.3797309398651123, + "learning_rate": 0.00019924654405215682, + "loss": 1.0961, + "step": 1897 + }, + { + "epoch": 0.06797142192060451, + "grad_norm": 3.111851692199707, + "learning_rate": 0.00019924512221672572, + "loss": 1.2563, + "step": 1898 + }, + { + "epoch": 0.0680072340501728, + "grad_norm": 1.8839884996414185, + "learning_rate": 0.00019924369904607945, + "loss": 1.3622, + "step": 1899 + }, + { + "epoch": 0.06804304617974108, + "grad_norm": 2.046614170074463, + "learning_rate": 0.00019924227454023728, + "loss": 1.2442, + "step": 1900 + }, + { + "epoch": 0.06807885830930936, + "grad_norm": 1.5697423219680786, + "learning_rate": 0.0001992408486992183, + "loss": 1.3403, + "step": 1901 + }, + { + "epoch": 0.06811467043887764, + "grad_norm": 2.84218430519104, + "learning_rate": 0.00019923942152304169, + "loss": 1.0773, + "step": 1902 + }, + { + "epoch": 0.06815048256844593, + "grad_norm": 2.1838200092315674, + "learning_rate": 0.0001992379930117267, + "loss": 1.3059, + "step": 1903 + }, + { + "epoch": 0.06818629469801422, + "grad_norm": 2.103097438812256, + "learning_rate": 0.00019923656316529252, + "loss": 1.2302, + "step": 1904 + }, + { + "epoch": 0.06822210682758251, + "grad_norm": 1.6536786556243896, + "learning_rate": 0.00019923513198375837, + "loss": 1.1768, + "step": 1905 + }, + { + "epoch": 0.06825791895715079, + "grad_norm": 1.6784751415252686, + "learning_rate": 0.00019923369946714354, + "loss": 1.4313, + "step": 1906 + }, + { + "epoch": 0.06829373108671907, + "grad_norm": 2.526791572570801, + "learning_rate": 0.00019923226561546726, + "loss": 1.2232, + "step": 1907 + }, + { + "epoch": 0.06832954321628736, + "grad_norm": 2.5111734867095947, + "learning_rate": 0.00019923083042874885, + "loss": 1.5979, + "step": 1908 + }, + { + "epoch": 0.06836535534585564, + "grad_norm": 2.464913845062256, + "learning_rate": 0.00019922939390700767, + "loss": 1.2301, + "step": 1909 + }, + { + "epoch": 0.06840116747542392, + "grad_norm": 2.7396857738494873, + "learning_rate": 0.00019922795605026295, + "loss": 1.252, + "step": 1910 + }, + { + "epoch": 0.0684369796049922, + "grad_norm": 1.9723228216171265, + "learning_rate": 0.00019922651685853407, + "loss": 1.2742, + "step": 1911 + }, + { + "epoch": 0.0684727917345605, + "grad_norm": 2.0593149662017822, + "learning_rate": 0.0001992250763318404, + "loss": 1.269, + "step": 1912 + }, + { + "epoch": 0.06850860386412878, + "grad_norm": 2.046044111251831, + "learning_rate": 0.00019922363447020134, + "loss": 1.4646, + "step": 1913 + }, + { + "epoch": 0.06854441599369707, + "grad_norm": 1.5367751121520996, + "learning_rate": 0.00019922219127363624, + "loss": 1.2559, + "step": 1914 + }, + { + "epoch": 0.06858022812326535, + "grad_norm": 2.2486135959625244, + "learning_rate": 0.00019922074674216456, + "loss": 1.2368, + "step": 1915 + }, + { + "epoch": 0.06861604025283363, + "grad_norm": 2.5762939453125, + "learning_rate": 0.00019921930087580573, + "loss": 1.439, + "step": 1916 + }, + { + "epoch": 0.06865185238240192, + "grad_norm": 2.1230320930480957, + "learning_rate": 0.00019921785367457917, + "loss": 1.2997, + "step": 1917 + }, + { + "epoch": 0.0686876645119702, + "grad_norm": 1.8331669569015503, + "learning_rate": 0.00019921640513850437, + "loss": 1.1791, + "step": 1918 + }, + { + "epoch": 0.0687234766415385, + "grad_norm": 1.8676713705062866, + "learning_rate": 0.00019921495526760083, + "loss": 1.1513, + "step": 1919 + }, + { + "epoch": 0.06875928877110678, + "grad_norm": 1.7079428434371948, + "learning_rate": 0.00019921350406188805, + "loss": 1.2338, + "step": 1920 + }, + { + "epoch": 0.06879510090067506, + "grad_norm": 2.3351263999938965, + "learning_rate": 0.00019921205152138556, + "loss": 1.2731, + "step": 1921 + }, + { + "epoch": 0.06883091303024334, + "grad_norm": 1.8161206245422363, + "learning_rate": 0.00019921059764611284, + "loss": 1.3689, + "step": 1922 + }, + { + "epoch": 0.06886672515981163, + "grad_norm": 1.9403820037841797, + "learning_rate": 0.00019920914243608956, + "loss": 1.3833, + "step": 1923 + }, + { + "epoch": 0.06890253728937991, + "grad_norm": 1.6856886148452759, + "learning_rate": 0.0001992076858913352, + "loss": 1.0631, + "step": 1924 + }, + { + "epoch": 0.06893834941894819, + "grad_norm": 2.447157144546509, + "learning_rate": 0.0001992062280118694, + "loss": 1.4411, + "step": 1925 + }, + { + "epoch": 0.06897416154851649, + "grad_norm": 1.8093321323394775, + "learning_rate": 0.0001992047687977118, + "loss": 1.2057, + "step": 1926 + }, + { + "epoch": 0.06900997367808477, + "grad_norm": 2.5393967628479004, + "learning_rate": 0.00019920330824888197, + "loss": 1.2984, + "step": 1927 + }, + { + "epoch": 0.06904578580765305, + "grad_norm": 2.004560708999634, + "learning_rate": 0.0001992018463653996, + "loss": 1.3407, + "step": 1928 + }, + { + "epoch": 0.06908159793722134, + "grad_norm": 2.0735762119293213, + "learning_rate": 0.00019920038314728434, + "loss": 1.2124, + "step": 1929 + }, + { + "epoch": 0.06911741006678962, + "grad_norm": 2.269886016845703, + "learning_rate": 0.00019919891859455588, + "loss": 1.3729, + "step": 1930 + }, + { + "epoch": 0.0691532221963579, + "grad_norm": 2.5301473140716553, + "learning_rate": 0.00019919745270723395, + "loss": 1.3013, + "step": 1931 + }, + { + "epoch": 0.06918903432592619, + "grad_norm": 1.8918542861938477, + "learning_rate": 0.00019919598548533824, + "loss": 1.4201, + "step": 1932 + }, + { + "epoch": 0.06922484645549448, + "grad_norm": 2.884833812713623, + "learning_rate": 0.00019919451692888848, + "loss": 1.492, + "step": 1933 + }, + { + "epoch": 0.06926065858506276, + "grad_norm": 2.305687665939331, + "learning_rate": 0.00019919304703790446, + "loss": 1.2934, + "step": 1934 + }, + { + "epoch": 0.06929647071463105, + "grad_norm": 1.9472438097000122, + "learning_rate": 0.00019919157581240596, + "loss": 1.3217, + "step": 1935 + }, + { + "epoch": 0.06933228284419933, + "grad_norm": 2.579467535018921, + "learning_rate": 0.00019919010325241275, + "loss": 1.0332, + "step": 1936 + }, + { + "epoch": 0.06936809497376761, + "grad_norm": 1.8814679384231567, + "learning_rate": 0.00019918862935794463, + "loss": 1.5177, + "step": 1937 + }, + { + "epoch": 0.0694039071033359, + "grad_norm": 2.352630853652954, + "learning_rate": 0.00019918715412902142, + "loss": 1.5251, + "step": 1938 + }, + { + "epoch": 0.06943971923290418, + "grad_norm": 2.665059804916382, + "learning_rate": 0.00019918567756566305, + "loss": 1.2844, + "step": 1939 + }, + { + "epoch": 0.06947553136247248, + "grad_norm": 2.4653594493865967, + "learning_rate": 0.0001991841996678893, + "loss": 1.0531, + "step": 1940 + }, + { + "epoch": 0.06951134349204076, + "grad_norm": 2.000418186187744, + "learning_rate": 0.0001991827204357201, + "loss": 1.0557, + "step": 1941 + }, + { + "epoch": 0.06954715562160904, + "grad_norm": 2.0178256034851074, + "learning_rate": 0.0001991812398691753, + "loss": 1.3832, + "step": 1942 + }, + { + "epoch": 0.06958296775117732, + "grad_norm": 1.705024003982544, + "learning_rate": 0.00019917975796827488, + "loss": 1.1844, + "step": 1943 + }, + { + "epoch": 0.06961877988074561, + "grad_norm": 1.985310435295105, + "learning_rate": 0.00019917827473303875, + "loss": 1.2603, + "step": 1944 + }, + { + "epoch": 0.06965459201031389, + "grad_norm": 1.3679518699645996, + "learning_rate": 0.00019917679016348685, + "loss": 1.2391, + "step": 1945 + }, + { + "epoch": 0.06969040413988217, + "grad_norm": 1.5849460363388062, + "learning_rate": 0.00019917530425963916, + "loss": 1.375, + "step": 1946 + }, + { + "epoch": 0.06972621626945047, + "grad_norm": 1.8979302644729614, + "learning_rate": 0.0001991738170215157, + "loss": 1.2143, + "step": 1947 + }, + { + "epoch": 0.06976202839901875, + "grad_norm": 2.177504777908325, + "learning_rate": 0.00019917232844913644, + "loss": 1.2633, + "step": 1948 + }, + { + "epoch": 0.06979784052858704, + "grad_norm": 2.839989185333252, + "learning_rate": 0.00019917083854252142, + "loss": 1.3764, + "step": 1949 + }, + { + "epoch": 0.06983365265815532, + "grad_norm": 1.4288948774337769, + "learning_rate": 0.00019916934730169073, + "loss": 1.0868, + "step": 1950 + }, + { + "epoch": 0.0698694647877236, + "grad_norm": 1.7917519807815552, + "learning_rate": 0.00019916785472666435, + "loss": 1.203, + "step": 1951 + }, + { + "epoch": 0.06990527691729188, + "grad_norm": 2.556344747543335, + "learning_rate": 0.0001991663608174624, + "loss": 1.0605, + "step": 1952 + }, + { + "epoch": 0.06994108904686017, + "grad_norm": 1.9086716175079346, + "learning_rate": 0.000199164865574105, + "loss": 1.4255, + "step": 1953 + }, + { + "epoch": 0.06997690117642846, + "grad_norm": 2.1118621826171875, + "learning_rate": 0.00019916336899661224, + "loss": 1.3383, + "step": 1954 + }, + { + "epoch": 0.07001271330599675, + "grad_norm": 2.11946702003479, + "learning_rate": 0.00019916187108500428, + "loss": 1.3333, + "step": 1955 + }, + { + "epoch": 0.07004852543556503, + "grad_norm": 1.6812269687652588, + "learning_rate": 0.00019916037183930122, + "loss": 1.3621, + "step": 1956 + }, + { + "epoch": 0.07008433756513331, + "grad_norm": 2.5103578567504883, + "learning_rate": 0.00019915887125952327, + "loss": 1.2158, + "step": 1957 + }, + { + "epoch": 0.0701201496947016, + "grad_norm": 2.275507926940918, + "learning_rate": 0.00019915736934569066, + "loss": 1.3039, + "step": 1958 + }, + { + "epoch": 0.07015596182426988, + "grad_norm": 2.3579611778259277, + "learning_rate": 0.0001991558660978235, + "loss": 1.2597, + "step": 1959 + }, + { + "epoch": 0.07019177395383816, + "grad_norm": 1.4225990772247314, + "learning_rate": 0.0001991543615159421, + "loss": 1.2385, + "step": 1960 + }, + { + "epoch": 0.07022758608340644, + "grad_norm": 1.8940180540084839, + "learning_rate": 0.00019915285560006662, + "loss": 1.2541, + "step": 1961 + }, + { + "epoch": 0.07026339821297474, + "grad_norm": 2.106736660003662, + "learning_rate": 0.00019915134835021738, + "loss": 1.3036, + "step": 1962 + }, + { + "epoch": 0.07029921034254302, + "grad_norm": 2.247349262237549, + "learning_rate": 0.00019914983976641466, + "loss": 1.0663, + "step": 1963 + }, + { + "epoch": 0.0703350224721113, + "grad_norm": 1.627952218055725, + "learning_rate": 0.00019914832984867874, + "loss": 1.1688, + "step": 1964 + }, + { + "epoch": 0.07037083460167959, + "grad_norm": 2.174522876739502, + "learning_rate": 0.0001991468185970299, + "loss": 1.3044, + "step": 1965 + }, + { + "epoch": 0.07040664673124787, + "grad_norm": 1.8678032159805298, + "learning_rate": 0.00019914530601148855, + "loss": 1.3291, + "step": 1966 + }, + { + "epoch": 0.07044245886081615, + "grad_norm": 1.3292083740234375, + "learning_rate": 0.000199143792092075, + "loss": 1.2695, + "step": 1967 + }, + { + "epoch": 0.07047827099038444, + "grad_norm": 1.9795500040054321, + "learning_rate": 0.00019914227683880958, + "loss": 1.3629, + "step": 1968 + }, + { + "epoch": 0.07051408311995273, + "grad_norm": 2.115719795227051, + "learning_rate": 0.0001991407602517127, + "loss": 1.2676, + "step": 1969 + }, + { + "epoch": 0.07054989524952102, + "grad_norm": 3.1825754642486572, + "learning_rate": 0.00019913924233080482, + "loss": 1.2259, + "step": 1970 + }, + { + "epoch": 0.0705857073790893, + "grad_norm": 2.2283177375793457, + "learning_rate": 0.0001991377230761063, + "loss": 1.3535, + "step": 1971 + }, + { + "epoch": 0.07062151950865758, + "grad_norm": 1.9531421661376953, + "learning_rate": 0.00019913620248763756, + "loss": 1.2691, + "step": 1972 + }, + { + "epoch": 0.07065733163822586, + "grad_norm": 2.104449510574341, + "learning_rate": 0.0001991346805654191, + "loss": 1.376, + "step": 1973 + }, + { + "epoch": 0.07069314376779415, + "grad_norm": 3.042935848236084, + "learning_rate": 0.00019913315730947143, + "loss": 1.2787, + "step": 1974 + }, + { + "epoch": 0.07072895589736243, + "grad_norm": 1.7879581451416016, + "learning_rate": 0.00019913163271981495, + "loss": 1.074, + "step": 1975 + }, + { + "epoch": 0.07076476802693073, + "grad_norm": 2.5352659225463867, + "learning_rate": 0.00019913010679647027, + "loss": 1.1536, + "step": 1976 + }, + { + "epoch": 0.07080058015649901, + "grad_norm": 2.9801621437072754, + "learning_rate": 0.00019912857953945784, + "loss": 1.2406, + "step": 1977 + }, + { + "epoch": 0.07083639228606729, + "grad_norm": 1.5875250101089478, + "learning_rate": 0.00019912705094879827, + "loss": 1.1307, + "step": 1978 + }, + { + "epoch": 0.07087220441563558, + "grad_norm": 2.4479966163635254, + "learning_rate": 0.00019912552102451206, + "loss": 1.3472, + "step": 1979 + }, + { + "epoch": 0.07090801654520386, + "grad_norm": 1.8649067878723145, + "learning_rate": 0.00019912398976661984, + "loss": 1.3041, + "step": 1980 + }, + { + "epoch": 0.07094382867477214, + "grad_norm": 1.999961256980896, + "learning_rate": 0.0001991224571751422, + "loss": 1.237, + "step": 1981 + }, + { + "epoch": 0.07097964080434042, + "grad_norm": 2.665929079055786, + "learning_rate": 0.00019912092325009975, + "loss": 1.1681, + "step": 1982 + }, + { + "epoch": 0.07101545293390872, + "grad_norm": 2.6833972930908203, + "learning_rate": 0.00019911938799151315, + "loss": 1.3508, + "step": 1983 + }, + { + "epoch": 0.071051265063477, + "grad_norm": 1.571185827255249, + "learning_rate": 0.00019911785139940303, + "loss": 1.3329, + "step": 1984 + }, + { + "epoch": 0.07108707719304529, + "grad_norm": 1.307006597518921, + "learning_rate": 0.00019911631347379008, + "loss": 1.1741, + "step": 1985 + }, + { + "epoch": 0.07112288932261357, + "grad_norm": 1.9748865365982056, + "learning_rate": 0.00019911477421469495, + "loss": 1.2169, + "step": 1986 + }, + { + "epoch": 0.07115870145218185, + "grad_norm": 2.4716310501098633, + "learning_rate": 0.0001991132336221384, + "loss": 1.1611, + "step": 1987 + }, + { + "epoch": 0.07119451358175014, + "grad_norm": 1.6306709051132202, + "learning_rate": 0.00019911169169614117, + "loss": 1.2925, + "step": 1988 + }, + { + "epoch": 0.07123032571131842, + "grad_norm": 1.931722640991211, + "learning_rate": 0.00019911014843672394, + "loss": 1.0968, + "step": 1989 + }, + { + "epoch": 0.07126613784088671, + "grad_norm": 1.5384414196014404, + "learning_rate": 0.0001991086038439075, + "loss": 1.226, + "step": 1990 + }, + { + "epoch": 0.071301949970455, + "grad_norm": 1.515378713607788, + "learning_rate": 0.00019910705791771263, + "loss": 1.1909, + "step": 1991 + }, + { + "epoch": 0.07133776210002328, + "grad_norm": 1.4133131504058838, + "learning_rate": 0.00019910551065816017, + "loss": 1.3478, + "step": 1992 + }, + { + "epoch": 0.07137357422959156, + "grad_norm": 2.3653459548950195, + "learning_rate": 0.0001991039620652709, + "loss": 1.3486, + "step": 1993 + }, + { + "epoch": 0.07140938635915985, + "grad_norm": 1.7128169536590576, + "learning_rate": 0.00019910241213906565, + "loss": 1.2935, + "step": 1994 + }, + { + "epoch": 0.07144519848872813, + "grad_norm": 1.9026917219161987, + "learning_rate": 0.00019910086087956527, + "loss": 1.294, + "step": 1995 + }, + { + "epoch": 0.07148101061829641, + "grad_norm": 1.97882878780365, + "learning_rate": 0.00019909930828679063, + "loss": 0.9956, + "step": 1996 + }, + { + "epoch": 0.07151682274786471, + "grad_norm": 2.62198543548584, + "learning_rate": 0.0001990977543607626, + "loss": 1.338, + "step": 1997 + }, + { + "epoch": 0.07155263487743299, + "grad_norm": 2.338611364364624, + "learning_rate": 0.00019909619910150216, + "loss": 1.3291, + "step": 1998 + }, + { + "epoch": 0.07158844700700127, + "grad_norm": 1.6687880754470825, + "learning_rate": 0.0001990946425090302, + "loss": 1.2951, + "step": 1999 + }, + { + "epoch": 0.07162425913656956, + "grad_norm": 1.5782464742660522, + "learning_rate": 0.00019909308458336759, + "loss": 1.2815, + "step": 2000 + }, + { + "epoch": 0.07166007126613784, + "grad_norm": 1.7959821224212646, + "learning_rate": 0.0001990915253245354, + "loss": 1.2866, + "step": 2001 + }, + { + "epoch": 0.07169588339570612, + "grad_norm": 1.948669672012329, + "learning_rate": 0.0001990899647325545, + "loss": 1.2827, + "step": 2002 + }, + { + "epoch": 0.0717316955252744, + "grad_norm": 1.7995362281799316, + "learning_rate": 0.000199088402807446, + "loss": 1.1826, + "step": 2003 + }, + { + "epoch": 0.0717675076548427, + "grad_norm": 3.1132681369781494, + "learning_rate": 0.00019908683954923082, + "loss": 1.4282, + "step": 2004 + }, + { + "epoch": 0.07180331978441098, + "grad_norm": 2.772125244140625, + "learning_rate": 0.00019908527495793004, + "loss": 1.1657, + "step": 2005 + }, + { + "epoch": 0.07183913191397927, + "grad_norm": 1.8464893102645874, + "learning_rate": 0.00019908370903356468, + "loss": 1.3746, + "step": 2006 + }, + { + "epoch": 0.07187494404354755, + "grad_norm": 1.7727564573287964, + "learning_rate": 0.00019908214177615584, + "loss": 1.404, + "step": 2007 + }, + { + "epoch": 0.07191075617311583, + "grad_norm": 2.2775380611419678, + "learning_rate": 0.00019908057318572458, + "loss": 1.0135, + "step": 2008 + }, + { + "epoch": 0.07194656830268412, + "grad_norm": 1.4529800415039062, + "learning_rate": 0.000199079003262292, + "loss": 1.2185, + "step": 2009 + }, + { + "epoch": 0.0719823804322524, + "grad_norm": 1.602707862854004, + "learning_rate": 0.00019907743200587926, + "loss": 1.1262, + "step": 2010 + }, + { + "epoch": 0.07201819256182068, + "grad_norm": 1.3723883628845215, + "learning_rate": 0.00019907585941650747, + "loss": 1.3091, + "step": 2011 + }, + { + "epoch": 0.07205400469138898, + "grad_norm": 1.7062188386917114, + "learning_rate": 0.00019907428549419777, + "loss": 1.2622, + "step": 2012 + }, + { + "epoch": 0.07208981682095726, + "grad_norm": 1.9840492010116577, + "learning_rate": 0.00019907271023897138, + "loss": 1.374, + "step": 2013 + }, + { + "epoch": 0.07212562895052554, + "grad_norm": 1.6043531894683838, + "learning_rate": 0.00019907113365084947, + "loss": 1.3099, + "step": 2014 + }, + { + "epoch": 0.07216144108009383, + "grad_norm": 1.644025206565857, + "learning_rate": 0.0001990695557298532, + "loss": 1.1785, + "step": 2015 + }, + { + "epoch": 0.07219725320966211, + "grad_norm": 1.7883378267288208, + "learning_rate": 0.0001990679764760039, + "loss": 1.4413, + "step": 2016 + }, + { + "epoch": 0.07223306533923039, + "grad_norm": 1.4924200773239136, + "learning_rate": 0.00019906639588932276, + "loss": 1.3008, + "step": 2017 + }, + { + "epoch": 0.07226887746879868, + "grad_norm": 1.5466560125350952, + "learning_rate": 0.00019906481396983103, + "loss": 1.2409, + "step": 2018 + }, + { + "epoch": 0.07230468959836697, + "grad_norm": 1.6234745979309082, + "learning_rate": 0.00019906323071755005, + "loss": 1.3129, + "step": 2019 + }, + { + "epoch": 0.07234050172793526, + "grad_norm": 2.240438222885132, + "learning_rate": 0.00019906164613250104, + "loss": 1.353, + "step": 2020 + }, + { + "epoch": 0.07237631385750354, + "grad_norm": 1.6415594816207886, + "learning_rate": 0.00019906006021470538, + "loss": 1.2048, + "step": 2021 + }, + { + "epoch": 0.07241212598707182, + "grad_norm": 1.678546667098999, + "learning_rate": 0.00019905847296418437, + "loss": 1.3644, + "step": 2022 + }, + { + "epoch": 0.0724479381166401, + "grad_norm": 1.8753067255020142, + "learning_rate": 0.0001990568843809594, + "loss": 1.1755, + "step": 2023 + }, + { + "epoch": 0.07248375024620839, + "grad_norm": 1.5533781051635742, + "learning_rate": 0.00019905529446505183, + "loss": 1.2115, + "step": 2024 + }, + { + "epoch": 0.07251956237577667, + "grad_norm": 1.86460542678833, + "learning_rate": 0.00019905370321648302, + "loss": 1.2966, + "step": 2025 + }, + { + "epoch": 0.07255537450534497, + "grad_norm": 2.561873197555542, + "learning_rate": 0.00019905211063527442, + "loss": 1.2575, + "step": 2026 + }, + { + "epoch": 0.07259118663491325, + "grad_norm": 1.707633376121521, + "learning_rate": 0.00019905051672144746, + "loss": 1.2657, + "step": 2027 + }, + { + "epoch": 0.07262699876448153, + "grad_norm": 1.6143914461135864, + "learning_rate": 0.00019904892147502352, + "loss": 1.3721, + "step": 2028 + }, + { + "epoch": 0.07266281089404981, + "grad_norm": 1.3289211988449097, + "learning_rate": 0.00019904732489602417, + "loss": 1.1626, + "step": 2029 + }, + { + "epoch": 0.0726986230236181, + "grad_norm": 1.9849467277526855, + "learning_rate": 0.00019904572698447077, + "loss": 1.227, + "step": 2030 + }, + { + "epoch": 0.07273443515318638, + "grad_norm": 1.9812272787094116, + "learning_rate": 0.00019904412774038487, + "loss": 1.3166, + "step": 2031 + }, + { + "epoch": 0.07277024728275466, + "grad_norm": 1.5771857500076294, + "learning_rate": 0.000199042527163788, + "loss": 1.3125, + "step": 2032 + }, + { + "epoch": 0.07280605941232296, + "grad_norm": 1.6684128046035767, + "learning_rate": 0.0001990409252547017, + "loss": 1.3279, + "step": 2033 + }, + { + "epoch": 0.07284187154189124, + "grad_norm": 1.418333649635315, + "learning_rate": 0.0001990393220131475, + "loss": 1.4108, + "step": 2034 + }, + { + "epoch": 0.07287768367145953, + "grad_norm": 1.944291114807129, + "learning_rate": 0.00019903771743914696, + "loss": 1.1122, + "step": 2035 + }, + { + "epoch": 0.07291349580102781, + "grad_norm": 1.6062556505203247, + "learning_rate": 0.00019903611153272168, + "loss": 1.0291, + "step": 2036 + }, + { + "epoch": 0.07294930793059609, + "grad_norm": 2.0736122131347656, + "learning_rate": 0.0001990345042938933, + "loss": 1.2323, + "step": 2037 + }, + { + "epoch": 0.07298512006016437, + "grad_norm": 1.4617313146591187, + "learning_rate": 0.00019903289572268336, + "loss": 1.2515, + "step": 2038 + }, + { + "epoch": 0.07302093218973266, + "grad_norm": 1.8874763250350952, + "learning_rate": 0.0001990312858191136, + "loss": 1.2239, + "step": 2039 + }, + { + "epoch": 0.07305674431930095, + "grad_norm": 1.7421174049377441, + "learning_rate": 0.0001990296745832056, + "loss": 1.3303, + "step": 2040 + }, + { + "epoch": 0.07309255644886924, + "grad_norm": 1.4223439693450928, + "learning_rate": 0.00019902806201498106, + "loss": 1.3432, + "step": 2041 + }, + { + "epoch": 0.07312836857843752, + "grad_norm": 1.6503028869628906, + "learning_rate": 0.0001990264481144617, + "loss": 1.2924, + "step": 2042 + }, + { + "epoch": 0.0731641807080058, + "grad_norm": 1.746628999710083, + "learning_rate": 0.00019902483288166922, + "loss": 1.1676, + "step": 2043 + }, + { + "epoch": 0.07319999283757408, + "grad_norm": 1.586828589439392, + "learning_rate": 0.00019902321631662533, + "loss": 1.263, + "step": 2044 + }, + { + "epoch": 0.07323580496714237, + "grad_norm": 2.0258798599243164, + "learning_rate": 0.0001990215984193518, + "loss": 1.1332, + "step": 2045 + }, + { + "epoch": 0.07327161709671065, + "grad_norm": 1.488634467124939, + "learning_rate": 0.00019901997918987042, + "loss": 1.2692, + "step": 2046 + }, + { + "epoch": 0.07330742922627895, + "grad_norm": 1.294352412223816, + "learning_rate": 0.0001990183586282029, + "loss": 1.1934, + "step": 2047 + }, + { + "epoch": 0.07334324135584723, + "grad_norm": 1.8666986227035522, + "learning_rate": 0.00019901673673437112, + "loss": 1.2525, + "step": 2048 + }, + { + "epoch": 0.07337905348541551, + "grad_norm": 1.9994785785675049, + "learning_rate": 0.00019901511350839686, + "loss": 1.58, + "step": 2049 + }, + { + "epoch": 0.0734148656149838, + "grad_norm": 1.3441909551620483, + "learning_rate": 0.00019901348895030196, + "loss": 1.3585, + "step": 2050 + }, + { + "epoch": 0.07345067774455208, + "grad_norm": 1.488390326499939, + "learning_rate": 0.0001990118630601083, + "loss": 1.3868, + "step": 2051 + }, + { + "epoch": 0.07348648987412036, + "grad_norm": 2.257399797439575, + "learning_rate": 0.00019901023583783776, + "loss": 1.3902, + "step": 2052 + }, + { + "epoch": 0.07352230200368864, + "grad_norm": 2.926664113998413, + "learning_rate": 0.00019900860728351216, + "loss": 1.1383, + "step": 2053 + }, + { + "epoch": 0.07355811413325694, + "grad_norm": 1.570981502532959, + "learning_rate": 0.00019900697739715347, + "loss": 1.3279, + "step": 2054 + }, + { + "epoch": 0.07359392626282522, + "grad_norm": 1.584573745727539, + "learning_rate": 0.00019900534617878365, + "loss": 1.368, + "step": 2055 + }, + { + "epoch": 0.0736297383923935, + "grad_norm": 2.707232713699341, + "learning_rate": 0.0001990037136284246, + "loss": 1.2387, + "step": 2056 + }, + { + "epoch": 0.07366555052196179, + "grad_norm": 3.268656015396118, + "learning_rate": 0.00019900207974609822, + "loss": 1.2246, + "step": 2057 + }, + { + "epoch": 0.07370136265153007, + "grad_norm": 1.9314301013946533, + "learning_rate": 0.00019900044453182662, + "loss": 1.2714, + "step": 2058 + }, + { + "epoch": 0.07373717478109836, + "grad_norm": 1.6391894817352295, + "learning_rate": 0.00019899880798563172, + "loss": 1.2328, + "step": 2059 + }, + { + "epoch": 0.07377298691066664, + "grad_norm": 2.02402400970459, + "learning_rate": 0.00019899717010753558, + "loss": 1.2199, + "step": 2060 + }, + { + "epoch": 0.07380879904023492, + "grad_norm": 1.645845651626587, + "learning_rate": 0.0001989955308975602, + "loss": 1.3254, + "step": 2061 + }, + { + "epoch": 0.07384461116980322, + "grad_norm": 2.0286896228790283, + "learning_rate": 0.00019899389035572763, + "loss": 1.3374, + "step": 2062 + }, + { + "epoch": 0.0738804232993715, + "grad_norm": 2.1522035598754883, + "learning_rate": 0.00019899224848205998, + "loss": 1.2975, + "step": 2063 + }, + { + "epoch": 0.07391623542893978, + "grad_norm": 1.3679964542388916, + "learning_rate": 0.0001989906052765793, + "loss": 1.2229, + "step": 2064 + }, + { + "epoch": 0.07395204755850807, + "grad_norm": 1.4863783121109009, + "learning_rate": 0.00019898896073930776, + "loss": 1.4065, + "step": 2065 + }, + { + "epoch": 0.07398785968807635, + "grad_norm": 1.5703037977218628, + "learning_rate": 0.00019898731487026742, + "loss": 1.2534, + "step": 2066 + }, + { + "epoch": 0.07402367181764463, + "grad_norm": 1.9061179161071777, + "learning_rate": 0.00019898566766948038, + "loss": 1.1332, + "step": 2067 + }, + { + "epoch": 0.07405948394721291, + "grad_norm": 1.8289556503295898, + "learning_rate": 0.00019898401913696892, + "loss": 1.1392, + "step": 2068 + }, + { + "epoch": 0.07409529607678121, + "grad_norm": 1.8268420696258545, + "learning_rate": 0.00019898236927275517, + "loss": 1.4434, + "step": 2069 + }, + { + "epoch": 0.0741311082063495, + "grad_norm": 2.4548845291137695, + "learning_rate": 0.0001989807180768613, + "loss": 1.3928, + "step": 2070 + }, + { + "epoch": 0.07416692033591778, + "grad_norm": 1.8150840997695923, + "learning_rate": 0.00019897906554930956, + "loss": 1.3024, + "step": 2071 + }, + { + "epoch": 0.07420273246548606, + "grad_norm": 2.154608964920044, + "learning_rate": 0.00019897741169012213, + "loss": 1.3789, + "step": 2072 + }, + { + "epoch": 0.07423854459505434, + "grad_norm": 1.565441370010376, + "learning_rate": 0.00019897575649932135, + "loss": 1.2498, + "step": 2073 + }, + { + "epoch": 0.07427435672462263, + "grad_norm": 1.7668839693069458, + "learning_rate": 0.0001989740999769294, + "loss": 1.2098, + "step": 2074 + }, + { + "epoch": 0.07431016885419091, + "grad_norm": 1.6833107471466064, + "learning_rate": 0.0001989724421229686, + "loss": 1.3478, + "step": 2075 + }, + { + "epoch": 0.0743459809837592, + "grad_norm": 1.6366325616836548, + "learning_rate": 0.00019897078293746128, + "loss": 1.3107, + "step": 2076 + }, + { + "epoch": 0.07438179311332749, + "grad_norm": 2.904273509979248, + "learning_rate": 0.0001989691224204297, + "loss": 1.2828, + "step": 2077 + }, + { + "epoch": 0.07441760524289577, + "grad_norm": 1.3599668741226196, + "learning_rate": 0.0001989674605718963, + "loss": 1.2691, + "step": 2078 + }, + { + "epoch": 0.07445341737246405, + "grad_norm": 2.5588557720184326, + "learning_rate": 0.00019896579739188335, + "loss": 1.221, + "step": 2079 + }, + { + "epoch": 0.07448922950203234, + "grad_norm": 1.6055052280426025, + "learning_rate": 0.00019896413288041323, + "loss": 1.2256, + "step": 2080 + }, + { + "epoch": 0.07452504163160062, + "grad_norm": 2.26396107673645, + "learning_rate": 0.00019896246703750837, + "loss": 1.2588, + "step": 2081 + }, + { + "epoch": 0.0745608537611689, + "grad_norm": 2.043041706085205, + "learning_rate": 0.00019896079986319118, + "loss": 1.3785, + "step": 2082 + }, + { + "epoch": 0.0745966658907372, + "grad_norm": 2.3250250816345215, + "learning_rate": 0.00019895913135748407, + "loss": 1.2568, + "step": 2083 + }, + { + "epoch": 0.07463247802030548, + "grad_norm": 1.5247762203216553, + "learning_rate": 0.0001989574615204095, + "loss": 1.1584, + "step": 2084 + }, + { + "epoch": 0.07466829014987376, + "grad_norm": 2.1054582595825195, + "learning_rate": 0.0001989557903519899, + "loss": 1.1477, + "step": 2085 + }, + { + "epoch": 0.07470410227944205, + "grad_norm": 2.2872462272644043, + "learning_rate": 0.0001989541178522478, + "loss": 1.2441, + "step": 2086 + }, + { + "epoch": 0.07473991440901033, + "grad_norm": 1.493675708770752, + "learning_rate": 0.0001989524440212057, + "loss": 1.2686, + "step": 2087 + }, + { + "epoch": 0.07477572653857861, + "grad_norm": 1.7111494541168213, + "learning_rate": 0.00019895076885888613, + "loss": 1.1444, + "step": 2088 + }, + { + "epoch": 0.0748115386681469, + "grad_norm": 1.8571577072143555, + "learning_rate": 0.00019894909236531158, + "loss": 1.0266, + "step": 2089 + }, + { + "epoch": 0.07484735079771519, + "grad_norm": 1.657449722290039, + "learning_rate": 0.0001989474145405046, + "loss": 1.2243, + "step": 2090 + }, + { + "epoch": 0.07488316292728348, + "grad_norm": 2.069841146469116, + "learning_rate": 0.00019894573538448783, + "loss": 1.3753, + "step": 2091 + }, + { + "epoch": 0.07491897505685176, + "grad_norm": 1.59548819065094, + "learning_rate": 0.0001989440548972838, + "loss": 1.1281, + "step": 2092 + }, + { + "epoch": 0.07495478718642004, + "grad_norm": 1.5364599227905273, + "learning_rate": 0.0001989423730789151, + "loss": 1.1043, + "step": 2093 + }, + { + "epoch": 0.07499059931598832, + "grad_norm": 1.4748101234436035, + "learning_rate": 0.00019894068992940448, + "loss": 1.2969, + "step": 2094 + }, + { + "epoch": 0.0750264114455566, + "grad_norm": 2.6572177410125732, + "learning_rate": 0.00019893900544877443, + "loss": 1.1581, + "step": 2095 + }, + { + "epoch": 0.07506222357512489, + "grad_norm": 1.952217936515808, + "learning_rate": 0.00019893731963704773, + "loss": 1.1786, + "step": 2096 + }, + { + "epoch": 0.07509803570469319, + "grad_norm": 1.988777995109558, + "learning_rate": 0.000198935632494247, + "loss": 1.3503, + "step": 2097 + }, + { + "epoch": 0.07513384783426147, + "grad_norm": 1.9163336753845215, + "learning_rate": 0.00019893394402039496, + "loss": 1.3707, + "step": 2098 + }, + { + "epoch": 0.07516965996382975, + "grad_norm": 1.7808533906936646, + "learning_rate": 0.00019893225421551428, + "loss": 1.2433, + "step": 2099 + }, + { + "epoch": 0.07520547209339803, + "grad_norm": 2.9461441040039062, + "learning_rate": 0.0001989305630796278, + "loss": 1.3144, + "step": 2100 + }, + { + "epoch": 0.07524128422296632, + "grad_norm": 2.063166856765747, + "learning_rate": 0.00019892887061275815, + "loss": 1.3814, + "step": 2101 + }, + { + "epoch": 0.0752770963525346, + "grad_norm": 2.2165305614471436, + "learning_rate": 0.00019892717681492815, + "loss": 1.1067, + "step": 2102 + }, + { + "epoch": 0.07531290848210288, + "grad_norm": 1.796379566192627, + "learning_rate": 0.00019892548168616063, + "loss": 1.2612, + "step": 2103 + }, + { + "epoch": 0.07534872061167118, + "grad_norm": 2.149048089981079, + "learning_rate": 0.00019892378522647834, + "loss": 1.1289, + "step": 2104 + }, + { + "epoch": 0.07538453274123946, + "grad_norm": 1.4595372676849365, + "learning_rate": 0.00019892208743590412, + "loss": 1.1108, + "step": 2105 + }, + { + "epoch": 0.07542034487080775, + "grad_norm": 1.7695733308792114, + "learning_rate": 0.00019892038831446085, + "loss": 1.0504, + "step": 2106 + }, + { + "epoch": 0.07545615700037603, + "grad_norm": 1.918438196182251, + "learning_rate": 0.0001989186878621713, + "loss": 1.3616, + "step": 2107 + }, + { + "epoch": 0.07549196912994431, + "grad_norm": 1.8203580379486084, + "learning_rate": 0.00019891698607905843, + "loss": 1.1657, + "step": 2108 + }, + { + "epoch": 0.0755277812595126, + "grad_norm": 1.760979175567627, + "learning_rate": 0.0001989152829651451, + "loss": 1.3923, + "step": 2109 + }, + { + "epoch": 0.07556359338908088, + "grad_norm": 2.237201690673828, + "learning_rate": 0.00019891357852045422, + "loss": 1.2541, + "step": 2110 + }, + { + "epoch": 0.07559940551864916, + "grad_norm": 2.5647056102752686, + "learning_rate": 0.00019891187274500874, + "loss": 1.496, + "step": 2111 + }, + { + "epoch": 0.07563521764821746, + "grad_norm": 1.8292862176895142, + "learning_rate": 0.0001989101656388316, + "loss": 1.1818, + "step": 2112 + }, + { + "epoch": 0.07567102977778574, + "grad_norm": 1.3684004545211792, + "learning_rate": 0.00019890845720194576, + "loss": 1.1203, + "step": 2113 + }, + { + "epoch": 0.07570684190735402, + "grad_norm": 1.7333906888961792, + "learning_rate": 0.00019890674743437424, + "loss": 1.4294, + "step": 2114 + }, + { + "epoch": 0.0757426540369223, + "grad_norm": 2.459545135498047, + "learning_rate": 0.00019890503633614, + "loss": 1.1647, + "step": 2115 + }, + { + "epoch": 0.07577846616649059, + "grad_norm": 1.420395851135254, + "learning_rate": 0.00019890332390726606, + "loss": 1.2162, + "step": 2116 + }, + { + "epoch": 0.07581427829605887, + "grad_norm": 1.7598942518234253, + "learning_rate": 0.00019890161014777546, + "loss": 1.1469, + "step": 2117 + }, + { + "epoch": 0.07585009042562715, + "grad_norm": 3.286714792251587, + "learning_rate": 0.0001988998950576913, + "loss": 1.1944, + "step": 2118 + }, + { + "epoch": 0.07588590255519545, + "grad_norm": 2.105367422103882, + "learning_rate": 0.00019889817863703662, + "loss": 1.2769, + "step": 2119 + }, + { + "epoch": 0.07592171468476373, + "grad_norm": 1.970828890800476, + "learning_rate": 0.0001988964608858345, + "loss": 1.4416, + "step": 2120 + }, + { + "epoch": 0.07595752681433202, + "grad_norm": 1.8442121744155884, + "learning_rate": 0.00019889474180410805, + "loss": 1.1471, + "step": 2121 + }, + { + "epoch": 0.0759933389439003, + "grad_norm": 1.6940690279006958, + "learning_rate": 0.00019889302139188044, + "loss": 1.2394, + "step": 2122 + }, + { + "epoch": 0.07602915107346858, + "grad_norm": 1.5662800073623657, + "learning_rate": 0.00019889129964917478, + "loss": 1.2791, + "step": 2123 + }, + { + "epoch": 0.07606496320303686, + "grad_norm": 1.8618627786636353, + "learning_rate": 0.00019888957657601425, + "loss": 1.1988, + "step": 2124 + }, + { + "epoch": 0.07610077533260515, + "grad_norm": 2.1864240169525146, + "learning_rate": 0.00019888785217242206, + "loss": 1.1689, + "step": 2125 + }, + { + "epoch": 0.07613658746217344, + "grad_norm": 1.56419038772583, + "learning_rate": 0.00019888612643842132, + "loss": 1.2334, + "step": 2126 + }, + { + "epoch": 0.07617239959174173, + "grad_norm": 1.6954307556152344, + "learning_rate": 0.00019888439937403534, + "loss": 1.2679, + "step": 2127 + }, + { + "epoch": 0.07620821172131001, + "grad_norm": 1.8372013568878174, + "learning_rate": 0.0001988826709792873, + "loss": 1.3649, + "step": 2128 + }, + { + "epoch": 0.07624402385087829, + "grad_norm": 2.0111031532287598, + "learning_rate": 0.00019888094125420044, + "loss": 1.2946, + "step": 2129 + }, + { + "epoch": 0.07627983598044658, + "grad_norm": 2.9673349857330322, + "learning_rate": 0.00019887921019879812, + "loss": 1.4922, + "step": 2130 + }, + { + "epoch": 0.07631564811001486, + "grad_norm": 2.415799617767334, + "learning_rate": 0.00019887747781310356, + "loss": 1.2228, + "step": 2131 + }, + { + "epoch": 0.07635146023958314, + "grad_norm": 2.5152480602264404, + "learning_rate": 0.00019887574409714005, + "loss": 1.2624, + "step": 2132 + }, + { + "epoch": 0.07638727236915144, + "grad_norm": 2.2168073654174805, + "learning_rate": 0.00019887400905093096, + "loss": 1.3787, + "step": 2133 + }, + { + "epoch": 0.07642308449871972, + "grad_norm": 1.8609753847122192, + "learning_rate": 0.00019887227267449963, + "loss": 1.1907, + "step": 2134 + }, + { + "epoch": 0.076458896628288, + "grad_norm": 2.5447468757629395, + "learning_rate": 0.00019887053496786937, + "loss": 1.4328, + "step": 2135 + }, + { + "epoch": 0.07649470875785629, + "grad_norm": 1.145759105682373, + "learning_rate": 0.00019886879593106365, + "loss": 1.0972, + "step": 2136 + }, + { + "epoch": 0.07653052088742457, + "grad_norm": 1.7336676120758057, + "learning_rate": 0.00019886705556410576, + "loss": 1.0728, + "step": 2137 + }, + { + "epoch": 0.07656633301699285, + "grad_norm": 1.7241723537445068, + "learning_rate": 0.0001988653138670192, + "loss": 1.2875, + "step": 2138 + }, + { + "epoch": 0.07660214514656113, + "grad_norm": 1.8672105073928833, + "learning_rate": 0.00019886357083982734, + "loss": 1.1354, + "step": 2139 + }, + { + "epoch": 0.07663795727612943, + "grad_norm": 1.708608865737915, + "learning_rate": 0.0001988618264825537, + "loss": 1.2231, + "step": 2140 + }, + { + "epoch": 0.07667376940569771, + "grad_norm": 1.782958745956421, + "learning_rate": 0.00019886008079522167, + "loss": 1.2697, + "step": 2141 + }, + { + "epoch": 0.076709581535266, + "grad_norm": 1.5069931745529175, + "learning_rate": 0.0001988583337778548, + "loss": 1.3029, + "step": 2142 + }, + { + "epoch": 0.07674539366483428, + "grad_norm": 2.082395553588867, + "learning_rate": 0.00019885658543047655, + "loss": 1.299, + "step": 2143 + }, + { + "epoch": 0.07678120579440256, + "grad_norm": 2.174405574798584, + "learning_rate": 0.00019885483575311045, + "loss": 1.5323, + "step": 2144 + }, + { + "epoch": 0.07681701792397085, + "grad_norm": 1.7014076709747314, + "learning_rate": 0.00019885308474578008, + "loss": 1.1602, + "step": 2145 + }, + { + "epoch": 0.07685283005353913, + "grad_norm": 2.0998787879943848, + "learning_rate": 0.00019885133240850892, + "loss": 1.4347, + "step": 2146 + }, + { + "epoch": 0.07688864218310743, + "grad_norm": 2.0645933151245117, + "learning_rate": 0.00019884957874132065, + "loss": 1.3778, + "step": 2147 + }, + { + "epoch": 0.07692445431267571, + "grad_norm": 1.3650110960006714, + "learning_rate": 0.00019884782374423877, + "loss": 1.2928, + "step": 2148 + }, + { + "epoch": 0.07696026644224399, + "grad_norm": 1.84894597530365, + "learning_rate": 0.00019884606741728692, + "loss": 1.2301, + "step": 2149 + }, + { + "epoch": 0.07699607857181227, + "grad_norm": 1.4769166707992554, + "learning_rate": 0.00019884430976048877, + "loss": 1.3945, + "step": 2150 + }, + { + "epoch": 0.07703189070138056, + "grad_norm": 1.5552608966827393, + "learning_rate": 0.00019884255077386788, + "loss": 1.1433, + "step": 2151 + }, + { + "epoch": 0.07706770283094884, + "grad_norm": 1.8629188537597656, + "learning_rate": 0.000198840790457448, + "loss": 1.0847, + "step": 2152 + }, + { + "epoch": 0.07710351496051712, + "grad_norm": 1.4549256563186646, + "learning_rate": 0.00019883902881125278, + "loss": 1.2948, + "step": 2153 + }, + { + "epoch": 0.0771393270900854, + "grad_norm": 1.3869496583938599, + "learning_rate": 0.00019883726583530594, + "loss": 1.2201, + "step": 2154 + }, + { + "epoch": 0.0771751392196537, + "grad_norm": 2.018880605697632, + "learning_rate": 0.00019883550152963113, + "loss": 1.299, + "step": 2155 + }, + { + "epoch": 0.07721095134922198, + "grad_norm": 2.250189781188965, + "learning_rate": 0.00019883373589425215, + "loss": 1.3398, + "step": 2156 + }, + { + "epoch": 0.07724676347879027, + "grad_norm": 2.2821149826049805, + "learning_rate": 0.00019883196892919275, + "loss": 1.2394, + "step": 2157 + }, + { + "epoch": 0.07728257560835855, + "grad_norm": 2.6380081176757812, + "learning_rate": 0.00019883020063447672, + "loss": 1.3576, + "step": 2158 + }, + { + "epoch": 0.07731838773792683, + "grad_norm": 2.115006685256958, + "learning_rate": 0.00019882843101012778, + "loss": 1.3837, + "step": 2159 + }, + { + "epoch": 0.07735419986749512, + "grad_norm": 2.5147488117218018, + "learning_rate": 0.00019882666005616978, + "loss": 1.3208, + "step": 2160 + }, + { + "epoch": 0.0773900119970634, + "grad_norm": 2.6637942790985107, + "learning_rate": 0.00019882488777262655, + "loss": 1.0941, + "step": 2161 + }, + { + "epoch": 0.0774258241266317, + "grad_norm": 1.3328169584274292, + "learning_rate": 0.00019882311415952194, + "loss": 1.0114, + "step": 2162 + }, + { + "epoch": 0.07746163625619998, + "grad_norm": 2.6416330337524414, + "learning_rate": 0.00019882133921687983, + "loss": 1.347, + "step": 2163 + }, + { + "epoch": 0.07749744838576826, + "grad_norm": 2.0462071895599365, + "learning_rate": 0.00019881956294472405, + "loss": 1.2858, + "step": 2164 + }, + { + "epoch": 0.07753326051533654, + "grad_norm": 1.8037712574005127, + "learning_rate": 0.00019881778534307852, + "loss": 1.1469, + "step": 2165 + }, + { + "epoch": 0.07756907264490483, + "grad_norm": 2.6074557304382324, + "learning_rate": 0.0001988160064119671, + "loss": 1.2984, + "step": 2166 + }, + { + "epoch": 0.07760488477447311, + "grad_norm": 2.2006988525390625, + "learning_rate": 0.00019881422615141385, + "loss": 1.2495, + "step": 2167 + }, + { + "epoch": 0.07764069690404139, + "grad_norm": 1.88552725315094, + "learning_rate": 0.00019881244456144262, + "loss": 1.3117, + "step": 2168 + }, + { + "epoch": 0.07767650903360969, + "grad_norm": 1.73715341091156, + "learning_rate": 0.00019881066164207742, + "loss": 1.3053, + "step": 2169 + }, + { + "epoch": 0.07771232116317797, + "grad_norm": 2.16194486618042, + "learning_rate": 0.0001988088773933422, + "loss": 1.2992, + "step": 2170 + }, + { + "epoch": 0.07774813329274625, + "grad_norm": 3.1696643829345703, + "learning_rate": 0.000198807091815261, + "loss": 1.2498, + "step": 2171 + }, + { + "epoch": 0.07778394542231454, + "grad_norm": 1.5327074527740479, + "learning_rate": 0.00019880530490785784, + "loss": 1.1474, + "step": 2172 + }, + { + "epoch": 0.07781975755188282, + "grad_norm": 1.773242473602295, + "learning_rate": 0.00019880351667115673, + "loss": 1.0505, + "step": 2173 + }, + { + "epoch": 0.0778555696814511, + "grad_norm": 1.7599456310272217, + "learning_rate": 0.00019880172710518178, + "loss": 1.371, + "step": 2174 + }, + { + "epoch": 0.07789138181101939, + "grad_norm": 1.7046805620193481, + "learning_rate": 0.00019879993620995702, + "loss": 1.2176, + "step": 2175 + }, + { + "epoch": 0.07792719394058768, + "grad_norm": 1.8652616739273071, + "learning_rate": 0.00019879814398550657, + "loss": 1.5138, + "step": 2176 + }, + { + "epoch": 0.07796300607015597, + "grad_norm": 2.3456504344940186, + "learning_rate": 0.00019879635043185454, + "loss": 1.2659, + "step": 2177 + }, + { + "epoch": 0.07799881819972425, + "grad_norm": 1.7449759244918823, + "learning_rate": 0.00019879455554902502, + "loss": 1.3267, + "step": 2178 + }, + { + "epoch": 0.07803463032929253, + "grad_norm": 1.451749324798584, + "learning_rate": 0.00019879275933704224, + "loss": 1.3672, + "step": 2179 + }, + { + "epoch": 0.07807044245886081, + "grad_norm": 1.7698118686676025, + "learning_rate": 0.00019879096179593027, + "loss": 1.2368, + "step": 2180 + }, + { + "epoch": 0.0781062545884291, + "grad_norm": 2.3607378005981445, + "learning_rate": 0.00019878916292571334, + "loss": 1.4818, + "step": 2181 + }, + { + "epoch": 0.07814206671799738, + "grad_norm": 2.3076746463775635, + "learning_rate": 0.00019878736272641568, + "loss": 1.1782, + "step": 2182 + }, + { + "epoch": 0.07817787884756568, + "grad_norm": 1.8036227226257324, + "learning_rate": 0.00019878556119806148, + "loss": 0.9375, + "step": 2183 + }, + { + "epoch": 0.07821369097713396, + "grad_norm": 1.9721133708953857, + "learning_rate": 0.00019878375834067496, + "loss": 1.1175, + "step": 2184 + }, + { + "epoch": 0.07824950310670224, + "grad_norm": 1.6147727966308594, + "learning_rate": 0.0001987819541542804, + "loss": 1.4744, + "step": 2185 + }, + { + "epoch": 0.07828531523627053, + "grad_norm": 1.7876302003860474, + "learning_rate": 0.0001987801486389021, + "loss": 1.4231, + "step": 2186 + }, + { + "epoch": 0.07832112736583881, + "grad_norm": 3.1724371910095215, + "learning_rate": 0.00019877834179456424, + "loss": 1.1617, + "step": 2187 + }, + { + "epoch": 0.07835693949540709, + "grad_norm": 1.5645091533660889, + "learning_rate": 0.00019877653362129126, + "loss": 1.124, + "step": 2188 + }, + { + "epoch": 0.07839275162497537, + "grad_norm": 1.5567268133163452, + "learning_rate": 0.00019877472411910745, + "loss": 1.2211, + "step": 2189 + }, + { + "epoch": 0.07842856375454367, + "grad_norm": 2.1779510974884033, + "learning_rate": 0.0001987729132880371, + "loss": 1.3263, + "step": 2190 + }, + { + "epoch": 0.07846437588411195, + "grad_norm": 1.4350003004074097, + "learning_rate": 0.00019877110112810463, + "loss": 1.2551, + "step": 2191 + }, + { + "epoch": 0.07850018801368024, + "grad_norm": 1.65862238407135, + "learning_rate": 0.00019876928763933437, + "loss": 1.1966, + "step": 2192 + }, + { + "epoch": 0.07853600014324852, + "grad_norm": 1.9010182619094849, + "learning_rate": 0.00019876747282175078, + "loss": 1.289, + "step": 2193 + }, + { + "epoch": 0.0785718122728168, + "grad_norm": 2.3489441871643066, + "learning_rate": 0.00019876565667537824, + "loss": 1.1147, + "step": 2194 + }, + { + "epoch": 0.07860762440238508, + "grad_norm": 1.7325738668441772, + "learning_rate": 0.00019876383920024117, + "loss": 1.1888, + "step": 2195 + }, + { + "epoch": 0.07864343653195337, + "grad_norm": 1.8538991212844849, + "learning_rate": 0.00019876202039636405, + "loss": 1.3845, + "step": 2196 + }, + { + "epoch": 0.07867924866152166, + "grad_norm": 2.1359007358551025, + "learning_rate": 0.00019876020026377136, + "loss": 0.9924, + "step": 2197 + }, + { + "epoch": 0.07871506079108995, + "grad_norm": 1.4323121309280396, + "learning_rate": 0.00019875837880248756, + "loss": 1.2378, + "step": 2198 + }, + { + "epoch": 0.07875087292065823, + "grad_norm": 1.7778947353363037, + "learning_rate": 0.00019875655601253714, + "loss": 1.3128, + "step": 2199 + }, + { + "epoch": 0.07878668505022651, + "grad_norm": 1.4624927043914795, + "learning_rate": 0.00019875473189394463, + "loss": 1.2056, + "step": 2200 + }, + { + "epoch": 0.0788224971797948, + "grad_norm": 1.584619164466858, + "learning_rate": 0.00019875290644673463, + "loss": 1.2606, + "step": 2201 + }, + { + "epoch": 0.07885830930936308, + "grad_norm": 1.2608320713043213, + "learning_rate": 0.00019875107967093163, + "loss": 1.3664, + "step": 2202 + }, + { + "epoch": 0.07889412143893136, + "grad_norm": 1.7200841903686523, + "learning_rate": 0.00019874925156656024, + "loss": 1.2297, + "step": 2203 + }, + { + "epoch": 0.07892993356849964, + "grad_norm": 1.9174078702926636, + "learning_rate": 0.00019874742213364506, + "loss": 1.4705, + "step": 2204 + }, + { + "epoch": 0.07896574569806794, + "grad_norm": 1.4854531288146973, + "learning_rate": 0.00019874559137221068, + "loss": 1.1689, + "step": 2205 + }, + { + "epoch": 0.07900155782763622, + "grad_norm": 2.91534423828125, + "learning_rate": 0.00019874375928228175, + "loss": 1.1248, + "step": 2206 + }, + { + "epoch": 0.0790373699572045, + "grad_norm": 1.9227732419967651, + "learning_rate": 0.00019874192586388288, + "loss": 1.1647, + "step": 2207 + }, + { + "epoch": 0.07907318208677279, + "grad_norm": 2.3028197288513184, + "learning_rate": 0.00019874009111703878, + "loss": 1.2571, + "step": 2208 + }, + { + "epoch": 0.07910899421634107, + "grad_norm": 2.044182300567627, + "learning_rate": 0.00019873825504177414, + "loss": 1.2511, + "step": 2209 + }, + { + "epoch": 0.07914480634590935, + "grad_norm": 1.9803178310394287, + "learning_rate": 0.0001987364176381136, + "loss": 1.1837, + "step": 2210 + }, + { + "epoch": 0.07918061847547764, + "grad_norm": 1.6979317665100098, + "learning_rate": 0.00019873457890608198, + "loss": 1.3123, + "step": 2211 + }, + { + "epoch": 0.07921643060504593, + "grad_norm": 2.071597099304199, + "learning_rate": 0.0001987327388457039, + "loss": 1.309, + "step": 2212 + }, + { + "epoch": 0.07925224273461422, + "grad_norm": 2.4048233032226562, + "learning_rate": 0.0001987308974570042, + "loss": 1.2641, + "step": 2213 + }, + { + "epoch": 0.0792880548641825, + "grad_norm": 2.134793281555176, + "learning_rate": 0.0001987290547400076, + "loss": 1.1237, + "step": 2214 + }, + { + "epoch": 0.07932386699375078, + "grad_norm": 1.7549666166305542, + "learning_rate": 0.000198727210694739, + "loss": 1.0571, + "step": 2215 + }, + { + "epoch": 0.07935967912331907, + "grad_norm": 1.5631967782974243, + "learning_rate": 0.00019872536532122305, + "loss": 1.1482, + "step": 2216 + }, + { + "epoch": 0.07939549125288735, + "grad_norm": 1.7073184251785278, + "learning_rate": 0.0001987235186194847, + "loss": 1.3588, + "step": 2217 + }, + { + "epoch": 0.07943130338245563, + "grad_norm": 1.8737924098968506, + "learning_rate": 0.00019872167058954874, + "loss": 1.0766, + "step": 2218 + }, + { + "epoch": 0.07946711551202393, + "grad_norm": 1.6593842506408691, + "learning_rate": 0.00019871982123144004, + "loss": 1.2958, + "step": 2219 + }, + { + "epoch": 0.07950292764159221, + "grad_norm": 1.578924298286438, + "learning_rate": 0.00019871797054518347, + "loss": 1.2249, + "step": 2220 + }, + { + "epoch": 0.0795387397711605, + "grad_norm": 2.441600799560547, + "learning_rate": 0.00019871611853080397, + "loss": 1.3822, + "step": 2221 + }, + { + "epoch": 0.07957455190072878, + "grad_norm": 1.5261644124984741, + "learning_rate": 0.00019871426518832644, + "loss": 1.2855, + "step": 2222 + }, + { + "epoch": 0.07961036403029706, + "grad_norm": 1.5515786409378052, + "learning_rate": 0.00019871241051777576, + "loss": 1.3723, + "step": 2223 + }, + { + "epoch": 0.07964617615986534, + "grad_norm": 2.8248307704925537, + "learning_rate": 0.00019871055451917694, + "loss": 1.1453, + "step": 2224 + }, + { + "epoch": 0.07968198828943363, + "grad_norm": 2.118605375289917, + "learning_rate": 0.00019870869719255496, + "loss": 1.2027, + "step": 2225 + }, + { + "epoch": 0.07971780041900192, + "grad_norm": 1.6965337991714478, + "learning_rate": 0.00019870683853793474, + "loss": 1.1039, + "step": 2226 + }, + { + "epoch": 0.0797536125485702, + "grad_norm": 1.7572786808013916, + "learning_rate": 0.00019870497855534137, + "loss": 1.1944, + "step": 2227 + }, + { + "epoch": 0.07978942467813849, + "grad_norm": 1.602038860321045, + "learning_rate": 0.00019870311724479983, + "loss": 1.3999, + "step": 2228 + }, + { + "epoch": 0.07982523680770677, + "grad_norm": 1.7710261344909668, + "learning_rate": 0.00019870125460633514, + "loss": 1.1714, + "step": 2229 + }, + { + "epoch": 0.07986104893727505, + "grad_norm": 1.6082310676574707, + "learning_rate": 0.00019869939063997243, + "loss": 1.2834, + "step": 2230 + }, + { + "epoch": 0.07989686106684334, + "grad_norm": 1.8927005529403687, + "learning_rate": 0.00019869752534573668, + "loss": 1.1545, + "step": 2231 + }, + { + "epoch": 0.07993267319641162, + "grad_norm": 3.285923957824707, + "learning_rate": 0.00019869565872365308, + "loss": 1.3768, + "step": 2232 + }, + { + "epoch": 0.07996848532597992, + "grad_norm": 1.4672666788101196, + "learning_rate": 0.00019869379077374667, + "loss": 1.3261, + "step": 2233 + }, + { + "epoch": 0.0800042974555482, + "grad_norm": 2.9584760665893555, + "learning_rate": 0.00019869192149604264, + "loss": 1.281, + "step": 2234 + }, + { + "epoch": 0.08004010958511648, + "grad_norm": 1.956342101097107, + "learning_rate": 0.0001986900508905661, + "loss": 1.1831, + "step": 2235 + }, + { + "epoch": 0.08007592171468476, + "grad_norm": 1.798429012298584, + "learning_rate": 0.00019868817895734222, + "loss": 1.3503, + "step": 2236 + }, + { + "epoch": 0.08011173384425305, + "grad_norm": 1.7543531656265259, + "learning_rate": 0.00019868630569639618, + "loss": 1.1833, + "step": 2237 + }, + { + "epoch": 0.08014754597382133, + "grad_norm": 2.7258148193359375, + "learning_rate": 0.0001986844311077532, + "loss": 1.2613, + "step": 2238 + }, + { + "epoch": 0.08018335810338961, + "grad_norm": 2.373257637023926, + "learning_rate": 0.0001986825551914385, + "loss": 1.1466, + "step": 2239 + }, + { + "epoch": 0.08021917023295791, + "grad_norm": 2.700920820236206, + "learning_rate": 0.00019868067794747728, + "loss": 1.2861, + "step": 2240 + }, + { + "epoch": 0.08025498236252619, + "grad_norm": 1.4433202743530273, + "learning_rate": 0.00019867879937589486, + "loss": 1.274, + "step": 2241 + }, + { + "epoch": 0.08029079449209447, + "grad_norm": 1.9030646085739136, + "learning_rate": 0.0001986769194767165, + "loss": 1.3091, + "step": 2242 + }, + { + "epoch": 0.08032660662166276, + "grad_norm": 1.5451335906982422, + "learning_rate": 0.00019867503824996745, + "loss": 1.2031, + "step": 2243 + }, + { + "epoch": 0.08036241875123104, + "grad_norm": 2.482590436935425, + "learning_rate": 0.00019867315569567303, + "loss": 1.2489, + "step": 2244 + }, + { + "epoch": 0.08039823088079932, + "grad_norm": 2.196448564529419, + "learning_rate": 0.0001986712718138586, + "loss": 1.2171, + "step": 2245 + }, + { + "epoch": 0.0804340430103676, + "grad_norm": 1.4637300968170166, + "learning_rate": 0.00019866938660454949, + "loss": 1.1464, + "step": 2246 + }, + { + "epoch": 0.0804698551399359, + "grad_norm": 1.9722788333892822, + "learning_rate": 0.00019866750006777102, + "loss": 1.3253, + "step": 2247 + }, + { + "epoch": 0.08050566726950419, + "grad_norm": 1.6236249208450317, + "learning_rate": 0.00019866561220354862, + "loss": 1.3618, + "step": 2248 + }, + { + "epoch": 0.08054147939907247, + "grad_norm": 1.927834153175354, + "learning_rate": 0.0001986637230119077, + "loss": 1.2094, + "step": 2249 + }, + { + "epoch": 0.08057729152864075, + "grad_norm": 1.6344574689865112, + "learning_rate": 0.00019866183249287364, + "loss": 1.2512, + "step": 2250 + }, + { + "epoch": 0.08061310365820903, + "grad_norm": 1.4837630987167358, + "learning_rate": 0.00019865994064647188, + "loss": 1.1401, + "step": 2251 + }, + { + "epoch": 0.08064891578777732, + "grad_norm": 2.1783385276794434, + "learning_rate": 0.0001986580474727279, + "loss": 1.1713, + "step": 2252 + }, + { + "epoch": 0.0806847279173456, + "grad_norm": 1.479164481163025, + "learning_rate": 0.00019865615297166714, + "loss": 1.3354, + "step": 2253 + }, + { + "epoch": 0.08072054004691388, + "grad_norm": 1.570946216583252, + "learning_rate": 0.0001986542571433151, + "loss": 1.4412, + "step": 2254 + }, + { + "epoch": 0.08075635217648218, + "grad_norm": 1.6707414388656616, + "learning_rate": 0.00019865235998769727, + "loss": 1.2768, + "step": 2255 + }, + { + "epoch": 0.08079216430605046, + "grad_norm": 2.2426750659942627, + "learning_rate": 0.0001986504615048392, + "loss": 1.2294, + "step": 2256 + }, + { + "epoch": 0.08082797643561875, + "grad_norm": 1.6538944244384766, + "learning_rate": 0.0001986485616947664, + "loss": 1.3371, + "step": 2257 + }, + { + "epoch": 0.08086378856518703, + "grad_norm": 2.9099326133728027, + "learning_rate": 0.00019864666055750452, + "loss": 1.341, + "step": 2258 + }, + { + "epoch": 0.08089960069475531, + "grad_norm": 1.5786765813827515, + "learning_rate": 0.000198644758093079, + "loss": 1.3463, + "step": 2259 + }, + { + "epoch": 0.0809354128243236, + "grad_norm": 1.830469012260437, + "learning_rate": 0.00019864285430151553, + "loss": 1.2522, + "step": 2260 + }, + { + "epoch": 0.08097122495389188, + "grad_norm": 1.506913661956787, + "learning_rate": 0.00019864094918283968, + "loss": 1.1944, + "step": 2261 + }, + { + "epoch": 0.08100703708346017, + "grad_norm": 1.6646087169647217, + "learning_rate": 0.0001986390427370771, + "loss": 1.1371, + "step": 2262 + }, + { + "epoch": 0.08104284921302846, + "grad_norm": 1.6114869117736816, + "learning_rate": 0.00019863713496425347, + "loss": 1.0509, + "step": 2263 + }, + { + "epoch": 0.08107866134259674, + "grad_norm": 2.321381092071533, + "learning_rate": 0.0001986352258643944, + "loss": 1.1052, + "step": 2264 + }, + { + "epoch": 0.08111447347216502, + "grad_norm": 1.887017011642456, + "learning_rate": 0.00019863331543752558, + "loss": 1.2598, + "step": 2265 + }, + { + "epoch": 0.0811502856017333, + "grad_norm": 1.854819893836975, + "learning_rate": 0.00019863140368367273, + "loss": 1.2719, + "step": 2266 + }, + { + "epoch": 0.08118609773130159, + "grad_norm": 1.6957839727401733, + "learning_rate": 0.00019862949060286158, + "loss": 1.3397, + "step": 2267 + }, + { + "epoch": 0.08122190986086987, + "grad_norm": 1.895107388496399, + "learning_rate": 0.00019862757619511784, + "loss": 1.1924, + "step": 2268 + }, + { + "epoch": 0.08125772199043817, + "grad_norm": 1.8858052492141724, + "learning_rate": 0.0001986256604604673, + "loss": 1.1281, + "step": 2269 + }, + { + "epoch": 0.08129353412000645, + "grad_norm": 1.521772861480713, + "learning_rate": 0.0001986237433989357, + "loss": 1.2148, + "step": 2270 + }, + { + "epoch": 0.08132934624957473, + "grad_norm": 1.4424426555633545, + "learning_rate": 0.0001986218250105489, + "loss": 1.3578, + "step": 2271 + }, + { + "epoch": 0.08136515837914302, + "grad_norm": 2.472515106201172, + "learning_rate": 0.0001986199052953326, + "loss": 1.0731, + "step": 2272 + }, + { + "epoch": 0.0814009705087113, + "grad_norm": 1.9614804983139038, + "learning_rate": 0.0001986179842533127, + "loss": 1.1551, + "step": 2273 + }, + { + "epoch": 0.08143678263827958, + "grad_norm": 1.6230548620224, + "learning_rate": 0.00019861606188451502, + "loss": 1.1166, + "step": 2274 + }, + { + "epoch": 0.08147259476784786, + "grad_norm": 1.5320172309875488, + "learning_rate": 0.00019861413818896546, + "loss": 1.2109, + "step": 2275 + }, + { + "epoch": 0.08150840689741616, + "grad_norm": 1.536584734916687, + "learning_rate": 0.00019861221316668984, + "loss": 1.1733, + "step": 2276 + }, + { + "epoch": 0.08154421902698444, + "grad_norm": 1.7733384370803833, + "learning_rate": 0.0001986102868177141, + "loss": 1.3234, + "step": 2277 + }, + { + "epoch": 0.08158003115655273, + "grad_norm": 1.7809674739837646, + "learning_rate": 0.0001986083591420642, + "loss": 1.4444, + "step": 2278 + }, + { + "epoch": 0.08161584328612101, + "grad_norm": 2.1068077087402344, + "learning_rate": 0.00019860643013976597, + "loss": 1.089, + "step": 2279 + }, + { + "epoch": 0.08165165541568929, + "grad_norm": 2.510042905807495, + "learning_rate": 0.00019860449981084545, + "loss": 1.2397, + "step": 2280 + }, + { + "epoch": 0.08168746754525757, + "grad_norm": 2.3564796447753906, + "learning_rate": 0.00019860256815532854, + "loss": 1.0045, + "step": 2281 + }, + { + "epoch": 0.08172327967482586, + "grad_norm": 1.7568950653076172, + "learning_rate": 0.0001986006351732413, + "loss": 1.0917, + "step": 2282 + }, + { + "epoch": 0.08175909180439415, + "grad_norm": 1.7482914924621582, + "learning_rate": 0.00019859870086460965, + "loss": 1.2972, + "step": 2283 + }, + { + "epoch": 0.08179490393396244, + "grad_norm": 2.1293070316314697, + "learning_rate": 0.0001985967652294597, + "loss": 1.2758, + "step": 2284 + }, + { + "epoch": 0.08183071606353072, + "grad_norm": 1.570251703262329, + "learning_rate": 0.00019859482826781744, + "loss": 1.2448, + "step": 2285 + }, + { + "epoch": 0.081866528193099, + "grad_norm": 2.051706075668335, + "learning_rate": 0.00019859288997970895, + "loss": 1.2921, + "step": 2286 + }, + { + "epoch": 0.08190234032266729, + "grad_norm": 2.4298250675201416, + "learning_rate": 0.0001985909503651603, + "loss": 1.0661, + "step": 2287 + }, + { + "epoch": 0.08193815245223557, + "grad_norm": 2.10935640335083, + "learning_rate": 0.0001985890094241976, + "loss": 1.3509, + "step": 2288 + }, + { + "epoch": 0.08197396458180385, + "grad_norm": 1.8487489223480225, + "learning_rate": 0.0001985870671568469, + "loss": 1.3039, + "step": 2289 + }, + { + "epoch": 0.08200977671137215, + "grad_norm": 1.906097412109375, + "learning_rate": 0.00019858512356313445, + "loss": 1.1486, + "step": 2290 + }, + { + "epoch": 0.08204558884094043, + "grad_norm": 1.8171021938323975, + "learning_rate": 0.00019858317864308628, + "loss": 1.3082, + "step": 2291 + }, + { + "epoch": 0.08208140097050871, + "grad_norm": 1.6094624996185303, + "learning_rate": 0.0001985812323967286, + "loss": 1.1872, + "step": 2292 + }, + { + "epoch": 0.082117213100077, + "grad_norm": 1.579187273979187, + "learning_rate": 0.00019857928482408763, + "loss": 1.3162, + "step": 2293 + }, + { + "epoch": 0.08215302522964528, + "grad_norm": 1.4825934171676636, + "learning_rate": 0.00019857733592518954, + "loss": 1.0409, + "step": 2294 + }, + { + "epoch": 0.08218883735921356, + "grad_norm": 1.930694818496704, + "learning_rate": 0.00019857538570006053, + "loss": 1.3084, + "step": 2295 + }, + { + "epoch": 0.08222464948878185, + "grad_norm": 2.129685163497925, + "learning_rate": 0.00019857343414872685, + "loss": 1.4241, + "step": 2296 + }, + { + "epoch": 0.08226046161835014, + "grad_norm": 2.7485013008117676, + "learning_rate": 0.0001985714812712148, + "loss": 1.3674, + "step": 2297 + }, + { + "epoch": 0.08229627374791842, + "grad_norm": 1.8966022729873657, + "learning_rate": 0.0001985695270675506, + "loss": 1.1231, + "step": 2298 + }, + { + "epoch": 0.08233208587748671, + "grad_norm": 1.814265251159668, + "learning_rate": 0.00019856757153776058, + "loss": 1.2649, + "step": 2299 + }, + { + "epoch": 0.08236789800705499, + "grad_norm": 2.242050886154175, + "learning_rate": 0.000198565614681871, + "loss": 1.1719, + "step": 2300 + }, + { + "epoch": 0.08240371013662327, + "grad_norm": 2.327198028564453, + "learning_rate": 0.0001985636564999082, + "loss": 1.2903, + "step": 2301 + }, + { + "epoch": 0.08243952226619156, + "grad_norm": 1.4549129009246826, + "learning_rate": 0.00019856169699189856, + "loss": 1.2225, + "step": 2302 + }, + { + "epoch": 0.08247533439575984, + "grad_norm": 2.0548324584960938, + "learning_rate": 0.00019855973615786842, + "loss": 1.315, + "step": 2303 + }, + { + "epoch": 0.08251114652532812, + "grad_norm": 2.2001495361328125, + "learning_rate": 0.0001985577739978442, + "loss": 0.9885, + "step": 2304 + }, + { + "epoch": 0.08254695865489642, + "grad_norm": 1.8399901390075684, + "learning_rate": 0.0001985558105118522, + "loss": 1.2013, + "step": 2305 + }, + { + "epoch": 0.0825827707844647, + "grad_norm": 1.5085740089416504, + "learning_rate": 0.00019855384569991892, + "loss": 1.2183, + "step": 2306 + }, + { + "epoch": 0.08261858291403298, + "grad_norm": 1.3861792087554932, + "learning_rate": 0.0001985518795620708, + "loss": 1.2884, + "step": 2307 + }, + { + "epoch": 0.08265439504360127, + "grad_norm": 1.7399684190750122, + "learning_rate": 0.0001985499120983342, + "loss": 1.3765, + "step": 2308 + }, + { + "epoch": 0.08269020717316955, + "grad_norm": 1.4683369398117065, + "learning_rate": 0.00019854794330873568, + "loss": 1.2938, + "step": 2309 + }, + { + "epoch": 0.08272601930273783, + "grad_norm": 2.01338267326355, + "learning_rate": 0.00019854597319330175, + "loss": 1.2249, + "step": 2310 + }, + { + "epoch": 0.08276183143230612, + "grad_norm": 1.7849838733673096, + "learning_rate": 0.00019854400175205883, + "loss": 1.3607, + "step": 2311 + }, + { + "epoch": 0.08279764356187441, + "grad_norm": 1.626592993736267, + "learning_rate": 0.00019854202898503346, + "loss": 1.1697, + "step": 2312 + }, + { + "epoch": 0.0828334556914427, + "grad_norm": 1.3716307878494263, + "learning_rate": 0.00019854005489225224, + "loss": 1.2715, + "step": 2313 + }, + { + "epoch": 0.08286926782101098, + "grad_norm": 1.5707690715789795, + "learning_rate": 0.00019853807947374166, + "loss": 1.4525, + "step": 2314 + }, + { + "epoch": 0.08290507995057926, + "grad_norm": 1.6518725156784058, + "learning_rate": 0.0001985361027295283, + "loss": 1.2353, + "step": 2315 + }, + { + "epoch": 0.08294089208014754, + "grad_norm": 2.2260186672210693, + "learning_rate": 0.00019853412465963883, + "loss": 1.2309, + "step": 2316 + }, + { + "epoch": 0.08297670420971583, + "grad_norm": 1.7736612558364868, + "learning_rate": 0.0001985321452640998, + "loss": 1.192, + "step": 2317 + }, + { + "epoch": 0.08301251633928411, + "grad_norm": 1.8787245750427246, + "learning_rate": 0.00019853016454293785, + "loss": 1.2705, + "step": 2318 + }, + { + "epoch": 0.0830483284688524, + "grad_norm": 1.8033031225204468, + "learning_rate": 0.00019852818249617963, + "loss": 1.3123, + "step": 2319 + }, + { + "epoch": 0.08308414059842069, + "grad_norm": 1.9029414653778076, + "learning_rate": 0.0001985261991238518, + "loss": 1.0653, + "step": 2320 + }, + { + "epoch": 0.08311995272798897, + "grad_norm": 2.131553888320923, + "learning_rate": 0.00019852421442598107, + "loss": 1.3074, + "step": 2321 + }, + { + "epoch": 0.08315576485755725, + "grad_norm": 2.156146764755249, + "learning_rate": 0.0001985222284025941, + "loss": 1.3328, + "step": 2322 + }, + { + "epoch": 0.08319157698712554, + "grad_norm": 1.6656368970870972, + "learning_rate": 0.00019852024105371764, + "loss": 1.3592, + "step": 2323 + }, + { + "epoch": 0.08322738911669382, + "grad_norm": 1.43528413772583, + "learning_rate": 0.0001985182523793784, + "loss": 1.0481, + "step": 2324 + }, + { + "epoch": 0.0832632012462621, + "grad_norm": 2.035396099090576, + "learning_rate": 0.00019851626237960316, + "loss": 1.3468, + "step": 2325 + }, + { + "epoch": 0.0832990133758304, + "grad_norm": 1.8966528177261353, + "learning_rate": 0.00019851427105441874, + "loss": 1.2934, + "step": 2326 + }, + { + "epoch": 0.08333482550539868, + "grad_norm": 2.0910067558288574, + "learning_rate": 0.00019851227840385184, + "loss": 1.1767, + "step": 2327 + }, + { + "epoch": 0.08337063763496697, + "grad_norm": 1.7176417112350464, + "learning_rate": 0.00019851028442792928, + "loss": 1.3137, + "step": 2328 + }, + { + "epoch": 0.08340644976453525, + "grad_norm": 1.8292415142059326, + "learning_rate": 0.00019850828912667794, + "loss": 1.3275, + "step": 2329 + }, + { + "epoch": 0.08344226189410353, + "grad_norm": 1.8728505373001099, + "learning_rate": 0.0001985062925001246, + "loss": 1.1779, + "step": 2330 + }, + { + "epoch": 0.08347807402367181, + "grad_norm": 1.7803834676742554, + "learning_rate": 0.0001985042945482962, + "loss": 1.1901, + "step": 2331 + }, + { + "epoch": 0.0835138861532401, + "grad_norm": 2.176401138305664, + "learning_rate": 0.00019850229527121956, + "loss": 1.2975, + "step": 2332 + }, + { + "epoch": 0.0835496982828084, + "grad_norm": 1.5966345071792603, + "learning_rate": 0.00019850029466892161, + "loss": 1.2255, + "step": 2333 + }, + { + "epoch": 0.08358551041237668, + "grad_norm": 2.2270944118499756, + "learning_rate": 0.00019849829274142924, + "loss": 1.1086, + "step": 2334 + }, + { + "epoch": 0.08362132254194496, + "grad_norm": 2.546152353286743, + "learning_rate": 0.00019849628948876943, + "loss": 1.427, + "step": 2335 + }, + { + "epoch": 0.08365713467151324, + "grad_norm": 1.6494839191436768, + "learning_rate": 0.00019849428491096904, + "loss": 1.2255, + "step": 2336 + }, + { + "epoch": 0.08369294680108152, + "grad_norm": 2.1273000240325928, + "learning_rate": 0.0001984922790080551, + "loss": 1.2824, + "step": 2337 + }, + { + "epoch": 0.08372875893064981, + "grad_norm": 2.0132811069488525, + "learning_rate": 0.0001984902717800546, + "loss": 1.2845, + "step": 2338 + }, + { + "epoch": 0.08376457106021809, + "grad_norm": 1.6470410823822021, + "learning_rate": 0.00019848826322699456, + "loss": 1.1477, + "step": 2339 + }, + { + "epoch": 0.08380038318978639, + "grad_norm": 1.7385661602020264, + "learning_rate": 0.000198486253348902, + "loss": 1.073, + "step": 2340 + }, + { + "epoch": 0.08383619531935467, + "grad_norm": 3.6454696655273438, + "learning_rate": 0.0001984842421458039, + "loss": 1.5019, + "step": 2341 + }, + { + "epoch": 0.08387200744892295, + "grad_norm": 1.7512617111206055, + "learning_rate": 0.00019848222961772733, + "loss": 1.2816, + "step": 2342 + }, + { + "epoch": 0.08390781957849124, + "grad_norm": 1.7721397876739502, + "learning_rate": 0.00019848021576469944, + "loss": 1.3251, + "step": 2343 + }, + { + "epoch": 0.08394363170805952, + "grad_norm": 1.4651826620101929, + "learning_rate": 0.00019847820058674728, + "loss": 1.196, + "step": 2344 + }, + { + "epoch": 0.0839794438376278, + "grad_norm": 2.761131525039673, + "learning_rate": 0.00019847618408389792, + "loss": 1.3375, + "step": 2345 + }, + { + "epoch": 0.08401525596719608, + "grad_norm": 1.9501272439956665, + "learning_rate": 0.00019847416625617855, + "loss": 1.2637, + "step": 2346 + }, + { + "epoch": 0.08405106809676438, + "grad_norm": 1.3517154455184937, + "learning_rate": 0.0001984721471036163, + "loss": 1.2389, + "step": 2347 + }, + { + "epoch": 0.08408688022633266, + "grad_norm": 2.421243190765381, + "learning_rate": 0.00019847012662623832, + "loss": 1.266, + "step": 2348 + }, + { + "epoch": 0.08412269235590095, + "grad_norm": 1.4377175569534302, + "learning_rate": 0.00019846810482407182, + "loss": 1.3297, + "step": 2349 + }, + { + "epoch": 0.08415850448546923, + "grad_norm": 1.707340955734253, + "learning_rate": 0.00019846608169714398, + "loss": 1.0904, + "step": 2350 + }, + { + "epoch": 0.08419431661503751, + "grad_norm": 1.9402178525924683, + "learning_rate": 0.00019846405724548204, + "loss": 1.1524, + "step": 2351 + }, + { + "epoch": 0.0842301287446058, + "grad_norm": 1.9950048923492432, + "learning_rate": 0.00019846203146911318, + "loss": 1.2942, + "step": 2352 + }, + { + "epoch": 0.08426594087417408, + "grad_norm": 2.280423879623413, + "learning_rate": 0.00019846000436806471, + "loss": 1.4213, + "step": 2353 + }, + { + "epoch": 0.08430175300374236, + "grad_norm": 1.7909777164459229, + "learning_rate": 0.00019845797594236387, + "loss": 1.2672, + "step": 2354 + }, + { + "epoch": 0.08433756513331066, + "grad_norm": 1.8605928421020508, + "learning_rate": 0.00019845594619203797, + "loss": 1.3115, + "step": 2355 + }, + { + "epoch": 0.08437337726287894, + "grad_norm": 1.9631696939468384, + "learning_rate": 0.00019845391511711435, + "loss": 1.1933, + "step": 2356 + }, + { + "epoch": 0.08440918939244722, + "grad_norm": 2.056398868560791, + "learning_rate": 0.00019845188271762029, + "loss": 1.327, + "step": 2357 + }, + { + "epoch": 0.0844450015220155, + "grad_norm": 2.0564894676208496, + "learning_rate": 0.0001984498489935831, + "loss": 1.1624, + "step": 2358 + }, + { + "epoch": 0.08448081365158379, + "grad_norm": 1.4168022871017456, + "learning_rate": 0.00019844781394503022, + "loss": 1.0852, + "step": 2359 + }, + { + "epoch": 0.08451662578115207, + "grad_norm": 2.891514778137207, + "learning_rate": 0.00019844577757198898, + "loss": 1.2104, + "step": 2360 + }, + { + "epoch": 0.08455243791072035, + "grad_norm": 1.9204707145690918, + "learning_rate": 0.00019844373987448676, + "loss": 1.2878, + "step": 2361 + }, + { + "epoch": 0.08458825004028865, + "grad_norm": 1.8043303489685059, + "learning_rate": 0.00019844170085255104, + "loss": 1.0681, + "step": 2362 + }, + { + "epoch": 0.08462406216985693, + "grad_norm": 1.2573543787002563, + "learning_rate": 0.0001984396605062092, + "loss": 1.264, + "step": 2363 + }, + { + "epoch": 0.08465987429942522, + "grad_norm": 2.476543664932251, + "learning_rate": 0.00019843761883548872, + "loss": 1.1245, + "step": 2364 + }, + { + "epoch": 0.0846956864289935, + "grad_norm": 1.3079078197479248, + "learning_rate": 0.00019843557584041705, + "loss": 1.1042, + "step": 2365 + }, + { + "epoch": 0.08473149855856178, + "grad_norm": 1.886830449104309, + "learning_rate": 0.0001984335315210217, + "loss": 1.3674, + "step": 2366 + }, + { + "epoch": 0.08476731068813007, + "grad_norm": 1.5205780267715454, + "learning_rate": 0.00019843148587733012, + "loss": 1.3754, + "step": 2367 + }, + { + "epoch": 0.08480312281769835, + "grad_norm": 1.7739698886871338, + "learning_rate": 0.00019842943890936986, + "loss": 1.1774, + "step": 2368 + }, + { + "epoch": 0.08483893494726664, + "grad_norm": 1.955454707145691, + "learning_rate": 0.00019842739061716848, + "loss": 1.3946, + "step": 2369 + }, + { + "epoch": 0.08487474707683493, + "grad_norm": 1.8560142517089844, + "learning_rate": 0.00019842534100075355, + "loss": 1.4762, + "step": 2370 + }, + { + "epoch": 0.08491055920640321, + "grad_norm": 2.0216503143310547, + "learning_rate": 0.00019842329006015255, + "loss": 1.1028, + "step": 2371 + }, + { + "epoch": 0.0849463713359715, + "grad_norm": 1.673707365989685, + "learning_rate": 0.0001984212377953932, + "loss": 1.4345, + "step": 2372 + }, + { + "epoch": 0.08498218346553978, + "grad_norm": 1.4391093254089355, + "learning_rate": 0.00019841918420650302, + "loss": 1.2694, + "step": 2373 + }, + { + "epoch": 0.08501799559510806, + "grad_norm": 1.864411473274231, + "learning_rate": 0.00019841712929350965, + "loss": 1.3437, + "step": 2374 + }, + { + "epoch": 0.08505380772467634, + "grad_norm": 1.252284049987793, + "learning_rate": 0.0001984150730564408, + "loss": 1.2637, + "step": 2375 + }, + { + "epoch": 0.08508961985424464, + "grad_norm": 1.724997639656067, + "learning_rate": 0.00019841301549532409, + "loss": 1.3285, + "step": 2376 + }, + { + "epoch": 0.08512543198381292, + "grad_norm": 1.5740399360656738, + "learning_rate": 0.00019841095661018716, + "loss": 1.2668, + "step": 2377 + }, + { + "epoch": 0.0851612441133812, + "grad_norm": 1.799936294555664, + "learning_rate": 0.00019840889640105775, + "loss": 1.3721, + "step": 2378 + }, + { + "epoch": 0.08519705624294949, + "grad_norm": 2.7609660625457764, + "learning_rate": 0.00019840683486796362, + "loss": 1.2343, + "step": 2379 + }, + { + "epoch": 0.08523286837251777, + "grad_norm": 1.435662865638733, + "learning_rate": 0.00019840477201093243, + "loss": 1.2739, + "step": 2380 + }, + { + "epoch": 0.08526868050208605, + "grad_norm": 1.8176209926605225, + "learning_rate": 0.00019840270782999197, + "loss": 1.3107, + "step": 2381 + }, + { + "epoch": 0.08530449263165434, + "grad_norm": 1.5667589902877808, + "learning_rate": 0.00019840064232517, + "loss": 1.3154, + "step": 2382 + }, + { + "epoch": 0.08534030476122263, + "grad_norm": 1.7615090608596802, + "learning_rate": 0.0001983985754964943, + "loss": 1.2057, + "step": 2383 + }, + { + "epoch": 0.08537611689079092, + "grad_norm": 1.70809006690979, + "learning_rate": 0.00019839650734399276, + "loss": 1.3998, + "step": 2384 + }, + { + "epoch": 0.0854119290203592, + "grad_norm": 1.63711678981781, + "learning_rate": 0.0001983944378676931, + "loss": 1.1298, + "step": 2385 + }, + { + "epoch": 0.08544774114992748, + "grad_norm": 2.066554069519043, + "learning_rate": 0.00019839236706762318, + "loss": 1.502, + "step": 2386 + }, + { + "epoch": 0.08548355327949576, + "grad_norm": 2.4078714847564697, + "learning_rate": 0.00019839029494381086, + "loss": 1.0648, + "step": 2387 + }, + { + "epoch": 0.08551936540906405, + "grad_norm": 1.872382640838623, + "learning_rate": 0.0001983882214962841, + "loss": 1.4721, + "step": 2388 + }, + { + "epoch": 0.08555517753863233, + "grad_norm": 1.3164470195770264, + "learning_rate": 0.00019838614672507067, + "loss": 1.2237, + "step": 2389 + }, + { + "epoch": 0.08559098966820063, + "grad_norm": 1.6047462224960327, + "learning_rate": 0.00019838407063019857, + "loss": 1.3521, + "step": 2390 + }, + { + "epoch": 0.08562680179776891, + "grad_norm": 1.5142557621002197, + "learning_rate": 0.0001983819932116957, + "loss": 1.315, + "step": 2391 + }, + { + "epoch": 0.08566261392733719, + "grad_norm": 2.0190627574920654, + "learning_rate": 0.00019837991446959005, + "loss": 1.3708, + "step": 2392 + }, + { + "epoch": 0.08569842605690547, + "grad_norm": 1.3233726024627686, + "learning_rate": 0.0001983778344039095, + "loss": 1.1472, + "step": 2393 + }, + { + "epoch": 0.08573423818647376, + "grad_norm": 1.4750229120254517, + "learning_rate": 0.00019837575301468211, + "loss": 1.4179, + "step": 2394 + }, + { + "epoch": 0.08577005031604204, + "grad_norm": 1.7948247194290161, + "learning_rate": 0.00019837367030193587, + "loss": 1.3233, + "step": 2395 + }, + { + "epoch": 0.08580586244561032, + "grad_norm": 2.477534294128418, + "learning_rate": 0.00019837158626569878, + "loss": 1.4971, + "step": 2396 + }, + { + "epoch": 0.08584167457517862, + "grad_norm": 1.5161799192428589, + "learning_rate": 0.0001983695009059989, + "loss": 1.3052, + "step": 2397 + }, + { + "epoch": 0.0858774867047469, + "grad_norm": 1.7656577825546265, + "learning_rate": 0.00019836741422286425, + "loss": 1.1998, + "step": 2398 + }, + { + "epoch": 0.08591329883431519, + "grad_norm": 1.5675132274627686, + "learning_rate": 0.00019836532621632293, + "loss": 1.2724, + "step": 2399 + }, + { + "epoch": 0.08594911096388347, + "grad_norm": 1.5494904518127441, + "learning_rate": 0.000198363236886403, + "loss": 1.1049, + "step": 2400 + }, + { + "epoch": 0.08598492309345175, + "grad_norm": 1.4611680507659912, + "learning_rate": 0.00019836114623313265, + "loss": 1.2636, + "step": 2401 + }, + { + "epoch": 0.08602073522302003, + "grad_norm": 2.229396104812622, + "learning_rate": 0.00019835905425653994, + "loss": 1.1226, + "step": 2402 + }, + { + "epoch": 0.08605654735258832, + "grad_norm": 2.290069103240967, + "learning_rate": 0.00019835696095665302, + "loss": 1.3572, + "step": 2403 + }, + { + "epoch": 0.0860923594821566, + "grad_norm": 2.8780853748321533, + "learning_rate": 0.00019835486633350006, + "loss": 1.3467, + "step": 2404 + }, + { + "epoch": 0.0861281716117249, + "grad_norm": 2.0858635902404785, + "learning_rate": 0.00019835277038710928, + "loss": 1.0961, + "step": 2405 + }, + { + "epoch": 0.08616398374129318, + "grad_norm": 1.266880750656128, + "learning_rate": 0.00019835067311750878, + "loss": 1.1982, + "step": 2406 + }, + { + "epoch": 0.08619979587086146, + "grad_norm": 1.9196956157684326, + "learning_rate": 0.00019834857452472686, + "loss": 1.2647, + "step": 2407 + }, + { + "epoch": 0.08623560800042974, + "grad_norm": 1.6960420608520508, + "learning_rate": 0.00019834647460879174, + "loss": 1.1247, + "step": 2408 + }, + { + "epoch": 0.08627142012999803, + "grad_norm": 1.5700180530548096, + "learning_rate": 0.00019834437336973165, + "loss": 1.2902, + "step": 2409 + }, + { + "epoch": 0.08630723225956631, + "grad_norm": 2.4546005725860596, + "learning_rate": 0.00019834227080757488, + "loss": 1.139, + "step": 2410 + }, + { + "epoch": 0.0863430443891346, + "grad_norm": 1.757914423942566, + "learning_rate": 0.0001983401669223497, + "loss": 1.252, + "step": 2411 + }, + { + "epoch": 0.08637885651870289, + "grad_norm": 1.9489264488220215, + "learning_rate": 0.00019833806171408442, + "loss": 1.3696, + "step": 2412 + }, + { + "epoch": 0.08641466864827117, + "grad_norm": 1.876599669456482, + "learning_rate": 0.0001983359551828074, + "loss": 1.2686, + "step": 2413 + }, + { + "epoch": 0.08645048077783946, + "grad_norm": 2.2416419982910156, + "learning_rate": 0.0001983338473285469, + "loss": 1.2587, + "step": 2414 + }, + { + "epoch": 0.08648629290740774, + "grad_norm": 1.659842848777771, + "learning_rate": 0.00019833173815133134, + "loss": 1.1096, + "step": 2415 + }, + { + "epoch": 0.08652210503697602, + "grad_norm": 1.4732718467712402, + "learning_rate": 0.0001983296276511891, + "loss": 1.2355, + "step": 2416 + }, + { + "epoch": 0.0865579171665443, + "grad_norm": 1.8822187185287476, + "learning_rate": 0.00019832751582814855, + "loss": 1.3329, + "step": 2417 + }, + { + "epoch": 0.08659372929611259, + "grad_norm": 2.0818538665771484, + "learning_rate": 0.0001983254026822381, + "loss": 1.1504, + "step": 2418 + }, + { + "epoch": 0.08662954142568088, + "grad_norm": 2.7689993381500244, + "learning_rate": 0.0001983232882134862, + "loss": 1.0647, + "step": 2419 + }, + { + "epoch": 0.08666535355524917, + "grad_norm": 1.6569437980651855, + "learning_rate": 0.00019832117242192128, + "loss": 1.0235, + "step": 2420 + }, + { + "epoch": 0.08670116568481745, + "grad_norm": 1.6566797494888306, + "learning_rate": 0.0001983190553075718, + "loss": 1.0755, + "step": 2421 + }, + { + "epoch": 0.08673697781438573, + "grad_norm": 2.063586711883545, + "learning_rate": 0.00019831693687046627, + "loss": 1.383, + "step": 2422 + }, + { + "epoch": 0.08677278994395402, + "grad_norm": 2.8876283168792725, + "learning_rate": 0.00019831481711063314, + "loss": 1.3543, + "step": 2423 + }, + { + "epoch": 0.0868086020735223, + "grad_norm": 1.4753950834274292, + "learning_rate": 0.000198312696028101, + "loss": 1.2556, + "step": 2424 + }, + { + "epoch": 0.08684441420309058, + "grad_norm": 1.4171035289764404, + "learning_rate": 0.00019831057362289833, + "loss": 1.1773, + "step": 2425 + }, + { + "epoch": 0.08688022633265888, + "grad_norm": 1.7456287145614624, + "learning_rate": 0.00019830844989505373, + "loss": 1.2908, + "step": 2426 + }, + { + "epoch": 0.08691603846222716, + "grad_norm": 1.9069461822509766, + "learning_rate": 0.00019830632484459573, + "loss": 1.1277, + "step": 2427 + }, + { + "epoch": 0.08695185059179544, + "grad_norm": 1.5885815620422363, + "learning_rate": 0.00019830419847155292, + "loss": 1.2329, + "step": 2428 + }, + { + "epoch": 0.08698766272136373, + "grad_norm": 2.469407796859741, + "learning_rate": 0.00019830207077595392, + "loss": 1.4347, + "step": 2429 + }, + { + "epoch": 0.08702347485093201, + "grad_norm": 2.1676623821258545, + "learning_rate": 0.00019829994175782738, + "loss": 1.289, + "step": 2430 + }, + { + "epoch": 0.08705928698050029, + "grad_norm": 1.7327021360397339, + "learning_rate": 0.0001982978114172019, + "loss": 1.3375, + "step": 2431 + }, + { + "epoch": 0.08709509911006857, + "grad_norm": 1.6366701126098633, + "learning_rate": 0.0001982956797541062, + "loss": 1.2208, + "step": 2432 + }, + { + "epoch": 0.08713091123963687, + "grad_norm": 1.6913028955459595, + "learning_rate": 0.0001982935467685689, + "loss": 1.3156, + "step": 2433 + }, + { + "epoch": 0.08716672336920515, + "grad_norm": 1.4835008382797241, + "learning_rate": 0.0001982914124606187, + "loss": 1.0368, + "step": 2434 + }, + { + "epoch": 0.08720253549877344, + "grad_norm": 1.266835331916809, + "learning_rate": 0.00019828927683028435, + "loss": 1.19, + "step": 2435 + }, + { + "epoch": 0.08723834762834172, + "grad_norm": 1.3322949409484863, + "learning_rate": 0.00019828713987759454, + "loss": 1.0602, + "step": 2436 + }, + { + "epoch": 0.08727415975791, + "grad_norm": 1.3694932460784912, + "learning_rate": 0.00019828500160257807, + "loss": 1.2852, + "step": 2437 + }, + { + "epoch": 0.08730997188747829, + "grad_norm": 1.8115922212600708, + "learning_rate": 0.0001982828620052637, + "loss": 1.1393, + "step": 2438 + }, + { + "epoch": 0.08734578401704657, + "grad_norm": 1.9488227367401123, + "learning_rate": 0.00019828072108568016, + "loss": 1.3816, + "step": 2439 + }, + { + "epoch": 0.08738159614661486, + "grad_norm": 1.461976408958435, + "learning_rate": 0.0001982785788438563, + "loss": 1.1953, + "step": 2440 + }, + { + "epoch": 0.08741740827618315, + "grad_norm": 2.491894245147705, + "learning_rate": 0.00019827643527982095, + "loss": 1.4784, + "step": 2441 + }, + { + "epoch": 0.08745322040575143, + "grad_norm": 1.7341541051864624, + "learning_rate": 0.00019827429039360293, + "loss": 1.2375, + "step": 2442 + }, + { + "epoch": 0.08748903253531971, + "grad_norm": 1.2579996585845947, + "learning_rate": 0.00019827214418523107, + "loss": 1.3576, + "step": 2443 + }, + { + "epoch": 0.087524844664888, + "grad_norm": 1.526706576347351, + "learning_rate": 0.0001982699966547343, + "loss": 1.1865, + "step": 2444 + }, + { + "epoch": 0.08756065679445628, + "grad_norm": 1.981634259223938, + "learning_rate": 0.00019826784780214147, + "loss": 1.1645, + "step": 2445 + }, + { + "epoch": 0.08759646892402456, + "grad_norm": 1.5954060554504395, + "learning_rate": 0.0001982656976274815, + "loss": 0.9844, + "step": 2446 + }, + { + "epoch": 0.08763228105359286, + "grad_norm": 2.2307093143463135, + "learning_rate": 0.00019826354613078332, + "loss": 1.4323, + "step": 2447 + }, + { + "epoch": 0.08766809318316114, + "grad_norm": 1.4159826040267944, + "learning_rate": 0.0001982613933120759, + "loss": 1.2999, + "step": 2448 + }, + { + "epoch": 0.08770390531272942, + "grad_norm": 2.2944040298461914, + "learning_rate": 0.00019825923917138818, + "loss": 1.4139, + "step": 2449 + }, + { + "epoch": 0.08773971744229771, + "grad_norm": 1.5072072744369507, + "learning_rate": 0.0001982570837087491, + "loss": 1.0664, + "step": 2450 + }, + { + "epoch": 0.08777552957186599, + "grad_norm": 1.4167371988296509, + "learning_rate": 0.00019825492692418774, + "loss": 1.0605, + "step": 2451 + }, + { + "epoch": 0.08781134170143427, + "grad_norm": 1.255202054977417, + "learning_rate": 0.00019825276881773308, + "loss": 1.2455, + "step": 2452 + }, + { + "epoch": 0.08784715383100256, + "grad_norm": 1.5871864557266235, + "learning_rate": 0.00019825060938941414, + "loss": 1.3449, + "step": 2453 + }, + { + "epoch": 0.08788296596057084, + "grad_norm": 1.425524115562439, + "learning_rate": 0.00019824844863925998, + "loss": 1.2763, + "step": 2454 + }, + { + "epoch": 0.08791877809013914, + "grad_norm": 2.5663552284240723, + "learning_rate": 0.0001982462865672997, + "loss": 1.2698, + "step": 2455 + }, + { + "epoch": 0.08795459021970742, + "grad_norm": 1.911181926727295, + "learning_rate": 0.00019824412317356234, + "loss": 1.0884, + "step": 2456 + }, + { + "epoch": 0.0879904023492757, + "grad_norm": 1.9564883708953857, + "learning_rate": 0.00019824195845807703, + "loss": 1.1802, + "step": 2457 + }, + { + "epoch": 0.08802621447884398, + "grad_norm": 1.6693605184555054, + "learning_rate": 0.00019823979242087288, + "loss": 1.0276, + "step": 2458 + }, + { + "epoch": 0.08806202660841227, + "grad_norm": 1.627231478691101, + "learning_rate": 0.00019823762506197907, + "loss": 1.2705, + "step": 2459 + }, + { + "epoch": 0.08809783873798055, + "grad_norm": 1.272868275642395, + "learning_rate": 0.0001982354563814247, + "loss": 1.1568, + "step": 2460 + }, + { + "epoch": 0.08813365086754883, + "grad_norm": 2.939390182495117, + "learning_rate": 0.000198233286379239, + "loss": 1.4173, + "step": 2461 + }, + { + "epoch": 0.08816946299711713, + "grad_norm": 1.7284799814224243, + "learning_rate": 0.00019823111505545114, + "loss": 1.2101, + "step": 2462 + }, + { + "epoch": 0.08820527512668541, + "grad_norm": 2.4829795360565186, + "learning_rate": 0.00019822894241009037, + "loss": 1.1436, + "step": 2463 + }, + { + "epoch": 0.0882410872562537, + "grad_norm": 1.5615227222442627, + "learning_rate": 0.00019822676844318582, + "loss": 1.2064, + "step": 2464 + }, + { + "epoch": 0.08827689938582198, + "grad_norm": 1.468366265296936, + "learning_rate": 0.00019822459315476686, + "loss": 1.1954, + "step": 2465 + }, + { + "epoch": 0.08831271151539026, + "grad_norm": 2.0006120204925537, + "learning_rate": 0.00019822241654486266, + "loss": 1.4558, + "step": 2466 + }, + { + "epoch": 0.08834852364495854, + "grad_norm": 2.1439406871795654, + "learning_rate": 0.00019822023861350256, + "loss": 1.3309, + "step": 2467 + }, + { + "epoch": 0.08838433577452683, + "grad_norm": 1.5273778438568115, + "learning_rate": 0.00019821805936071584, + "loss": 1.2198, + "step": 2468 + }, + { + "epoch": 0.08842014790409512, + "grad_norm": 1.6349472999572754, + "learning_rate": 0.00019821587878653184, + "loss": 1.4042, + "step": 2469 + }, + { + "epoch": 0.0884559600336634, + "grad_norm": 1.7885419130325317, + "learning_rate": 0.00019821369689097988, + "loss": 1.2052, + "step": 2470 + }, + { + "epoch": 0.08849177216323169, + "grad_norm": 2.249192476272583, + "learning_rate": 0.00019821151367408927, + "loss": 1.0536, + "step": 2471 + }, + { + "epoch": 0.08852758429279997, + "grad_norm": 2.7184510231018066, + "learning_rate": 0.00019820932913588947, + "loss": 1.3667, + "step": 2472 + }, + { + "epoch": 0.08856339642236825, + "grad_norm": 1.427022099494934, + "learning_rate": 0.00019820714327640983, + "loss": 0.9444, + "step": 2473 + }, + { + "epoch": 0.08859920855193654, + "grad_norm": 1.3194931745529175, + "learning_rate": 0.00019820495609567976, + "loss": 1.236, + "step": 2474 + }, + { + "epoch": 0.08863502068150482, + "grad_norm": 1.4764775037765503, + "learning_rate": 0.00019820276759372867, + "loss": 1.4027, + "step": 2475 + }, + { + "epoch": 0.08867083281107312, + "grad_norm": 1.7052007913589478, + "learning_rate": 0.00019820057777058598, + "loss": 1.1051, + "step": 2476 + }, + { + "epoch": 0.0887066449406414, + "grad_norm": 1.5675374269485474, + "learning_rate": 0.00019819838662628122, + "loss": 1.1926, + "step": 2477 + }, + { + "epoch": 0.08874245707020968, + "grad_norm": 1.8070189952850342, + "learning_rate": 0.00019819619416084385, + "loss": 1.3045, + "step": 2478 + }, + { + "epoch": 0.08877826919977796, + "grad_norm": 1.3729710578918457, + "learning_rate": 0.00019819400037430332, + "loss": 1.1678, + "step": 2479 + }, + { + "epoch": 0.08881408132934625, + "grad_norm": 1.6976642608642578, + "learning_rate": 0.0001981918052666892, + "loss": 1.1031, + "step": 2480 + }, + { + "epoch": 0.08884989345891453, + "grad_norm": 2.019322633743286, + "learning_rate": 0.00019818960883803097, + "loss": 1.1823, + "step": 2481 + }, + { + "epoch": 0.08888570558848281, + "grad_norm": 1.4746259450912476, + "learning_rate": 0.00019818741108835824, + "loss": 1.0075, + "step": 2482 + }, + { + "epoch": 0.08892151771805111, + "grad_norm": 1.9513500928878784, + "learning_rate": 0.00019818521201770052, + "loss": 1.3522, + "step": 2483 + }, + { + "epoch": 0.08895732984761939, + "grad_norm": 1.9852702617645264, + "learning_rate": 0.00019818301162608743, + "loss": 1.341, + "step": 2484 + }, + { + "epoch": 0.08899314197718768, + "grad_norm": 1.6168525218963623, + "learning_rate": 0.00019818080991354858, + "loss": 1.1072, + "step": 2485 + }, + { + "epoch": 0.08902895410675596, + "grad_norm": 1.558984637260437, + "learning_rate": 0.00019817860688011357, + "loss": 1.168, + "step": 2486 + }, + { + "epoch": 0.08906476623632424, + "grad_norm": 1.6035948991775513, + "learning_rate": 0.00019817640252581202, + "loss": 1.0752, + "step": 2487 + }, + { + "epoch": 0.08910057836589252, + "grad_norm": 1.9977848529815674, + "learning_rate": 0.00019817419685067364, + "loss": 1.4674, + "step": 2488 + }, + { + "epoch": 0.08913639049546081, + "grad_norm": 1.911453366279602, + "learning_rate": 0.00019817198985472807, + "loss": 1.2143, + "step": 2489 + }, + { + "epoch": 0.0891722026250291, + "grad_norm": 1.6470348834991455, + "learning_rate": 0.00019816978153800504, + "loss": 1.3346, + "step": 2490 + }, + { + "epoch": 0.08920801475459739, + "grad_norm": 1.6858807802200317, + "learning_rate": 0.00019816757190053416, + "loss": 1.1327, + "step": 2491 + }, + { + "epoch": 0.08924382688416567, + "grad_norm": 1.5868382453918457, + "learning_rate": 0.00019816536094234528, + "loss": 1.3183, + "step": 2492 + }, + { + "epoch": 0.08927963901373395, + "grad_norm": 1.412445068359375, + "learning_rate": 0.00019816314866346807, + "loss": 1.1148, + "step": 2493 + }, + { + "epoch": 0.08931545114330224, + "grad_norm": 1.3705092668533325, + "learning_rate": 0.00019816093506393233, + "loss": 1.243, + "step": 2494 + }, + { + "epoch": 0.08935126327287052, + "grad_norm": 2.049978017807007, + "learning_rate": 0.00019815872014376784, + "loss": 1.2245, + "step": 2495 + }, + { + "epoch": 0.0893870754024388, + "grad_norm": 2.216442346572876, + "learning_rate": 0.00019815650390300434, + "loss": 1.3974, + "step": 2496 + }, + { + "epoch": 0.0894228875320071, + "grad_norm": 2.012739896774292, + "learning_rate": 0.00019815428634167176, + "loss": 1.0135, + "step": 2497 + }, + { + "epoch": 0.08945869966157538, + "grad_norm": 1.234686017036438, + "learning_rate": 0.00019815206745979981, + "loss": 1.1098, + "step": 2498 + }, + { + "epoch": 0.08949451179114366, + "grad_norm": 1.5963572263717651, + "learning_rate": 0.00019814984725741842, + "loss": 1.1577, + "step": 2499 + }, + { + "epoch": 0.08953032392071195, + "grad_norm": 1.77279794216156, + "learning_rate": 0.00019814762573455743, + "loss": 1.352, + "step": 2500 + }, + { + "epoch": 0.08956613605028023, + "grad_norm": 1.7832560539245605, + "learning_rate": 0.00019814540289124675, + "loss": 1.1449, + "step": 2501 + }, + { + "epoch": 0.08960194817984851, + "grad_norm": 2.10224986076355, + "learning_rate": 0.00019814317872751626, + "loss": 1.4708, + "step": 2502 + }, + { + "epoch": 0.0896377603094168, + "grad_norm": 1.3671101331710815, + "learning_rate": 0.0001981409532433959, + "loss": 1.2745, + "step": 2503 + }, + { + "epoch": 0.08967357243898508, + "grad_norm": 1.5577925443649292, + "learning_rate": 0.00019813872643891563, + "loss": 1.2718, + "step": 2504 + }, + { + "epoch": 0.08970938456855337, + "grad_norm": 1.9618436098098755, + "learning_rate": 0.00019813649831410535, + "loss": 1.2224, + "step": 2505 + }, + { + "epoch": 0.08974519669812166, + "grad_norm": 1.338390588760376, + "learning_rate": 0.00019813426886899509, + "loss": 0.9465, + "step": 2506 + }, + { + "epoch": 0.08978100882768994, + "grad_norm": 2.1171529293060303, + "learning_rate": 0.00019813203810361483, + "loss": 0.993, + "step": 2507 + }, + { + "epoch": 0.08981682095725822, + "grad_norm": 1.9397727251052856, + "learning_rate": 0.00019812980601799458, + "loss": 1.2231, + "step": 2508 + }, + { + "epoch": 0.0898526330868265, + "grad_norm": 1.4052525758743286, + "learning_rate": 0.00019812757261216435, + "loss": 1.3194, + "step": 2509 + }, + { + "epoch": 0.08988844521639479, + "grad_norm": 1.4940789937973022, + "learning_rate": 0.0001981253378861542, + "loss": 1.3126, + "step": 2510 + }, + { + "epoch": 0.08992425734596307, + "grad_norm": 3.31784987449646, + "learning_rate": 0.00019812310183999423, + "loss": 1.5061, + "step": 2511 + }, + { + "epoch": 0.08996006947553137, + "grad_norm": 1.3873320817947388, + "learning_rate": 0.00019812086447371446, + "loss": 1.3021, + "step": 2512 + }, + { + "epoch": 0.08999588160509965, + "grad_norm": 1.6774553060531616, + "learning_rate": 0.00019811862578734507, + "loss": 1.3777, + "step": 2513 + }, + { + "epoch": 0.09003169373466793, + "grad_norm": 2.1920454502105713, + "learning_rate": 0.0001981163857809161, + "loss": 1.3493, + "step": 2514 + }, + { + "epoch": 0.09006750586423622, + "grad_norm": 1.9448683261871338, + "learning_rate": 0.00019811414445445772, + "loss": 1.1725, + "step": 2515 + }, + { + "epoch": 0.0901033179938045, + "grad_norm": 2.145928144454956, + "learning_rate": 0.00019811190180800013, + "loss": 1.421, + "step": 2516 + }, + { + "epoch": 0.09013913012337278, + "grad_norm": 2.314532518386841, + "learning_rate": 0.0001981096578415734, + "loss": 1.2235, + "step": 2517 + }, + { + "epoch": 0.09017494225294106, + "grad_norm": 1.6365941762924194, + "learning_rate": 0.00019810741255520782, + "loss": 1.1035, + "step": 2518 + }, + { + "epoch": 0.09021075438250936, + "grad_norm": 1.7646455764770508, + "learning_rate": 0.0001981051659489335, + "loss": 1.3637, + "step": 2519 + }, + { + "epoch": 0.09024656651207764, + "grad_norm": 1.7795156240463257, + "learning_rate": 0.00019810291802278078, + "loss": 1.0427, + "step": 2520 + }, + { + "epoch": 0.09028237864164593, + "grad_norm": 1.4121487140655518, + "learning_rate": 0.00019810066877677982, + "loss": 1.4409, + "step": 2521 + }, + { + "epoch": 0.09031819077121421, + "grad_norm": 1.835250973701477, + "learning_rate": 0.00019809841821096086, + "loss": 1.1235, + "step": 2522 + }, + { + "epoch": 0.09035400290078249, + "grad_norm": 2.179297685623169, + "learning_rate": 0.00019809616632535427, + "loss": 1.2821, + "step": 2523 + }, + { + "epoch": 0.09038981503035078, + "grad_norm": 1.4266953468322754, + "learning_rate": 0.00019809391311999028, + "loss": 1.2781, + "step": 2524 + }, + { + "epoch": 0.09042562715991906, + "grad_norm": 2.18051815032959, + "learning_rate": 0.00019809165859489922, + "loss": 1.2392, + "step": 2525 + }, + { + "epoch": 0.09046143928948736, + "grad_norm": 1.9522027969360352, + "learning_rate": 0.00019808940275011145, + "loss": 1.3285, + "step": 2526 + }, + { + "epoch": 0.09049725141905564, + "grad_norm": 1.5964711904525757, + "learning_rate": 0.00019808714558565727, + "loss": 1.0207, + "step": 2527 + }, + { + "epoch": 0.09053306354862392, + "grad_norm": 2.4223287105560303, + "learning_rate": 0.00019808488710156707, + "loss": 1.3693, + "step": 2528 + }, + { + "epoch": 0.0905688756781922, + "grad_norm": 2.6624538898468018, + "learning_rate": 0.0001980826272978712, + "loss": 1.1464, + "step": 2529 + }, + { + "epoch": 0.09060468780776049, + "grad_norm": 1.6438015699386597, + "learning_rate": 0.00019808036617460016, + "loss": 1.1927, + "step": 2530 + }, + { + "epoch": 0.09064049993732877, + "grad_norm": 1.4575196504592896, + "learning_rate": 0.00019807810373178425, + "loss": 1.1861, + "step": 2531 + }, + { + "epoch": 0.09067631206689705, + "grad_norm": 1.7573736906051636, + "learning_rate": 0.000198075839969454, + "loss": 1.2542, + "step": 2532 + }, + { + "epoch": 0.09071212419646535, + "grad_norm": 1.9816999435424805, + "learning_rate": 0.00019807357488763985, + "loss": 1.3359, + "step": 2533 + }, + { + "epoch": 0.09074793632603363, + "grad_norm": 1.3986352682113647, + "learning_rate": 0.00019807130848637224, + "loss": 1.2631, + "step": 2534 + }, + { + "epoch": 0.09078374845560191, + "grad_norm": 2.0683939456939697, + "learning_rate": 0.00019806904076568165, + "loss": 1.2131, + "step": 2535 + }, + { + "epoch": 0.0908195605851702, + "grad_norm": 2.0810468196868896, + "learning_rate": 0.00019806677172559865, + "loss": 1.0968, + "step": 2536 + }, + { + "epoch": 0.09085537271473848, + "grad_norm": 1.7151142358779907, + "learning_rate": 0.00019806450136615372, + "loss": 1.1913, + "step": 2537 + }, + { + "epoch": 0.09089118484430676, + "grad_norm": 1.635122537612915, + "learning_rate": 0.0001980622296873774, + "loss": 1.3088, + "step": 2538 + }, + { + "epoch": 0.09092699697387505, + "grad_norm": 1.7170606851577759, + "learning_rate": 0.0001980599566893003, + "loss": 1.5381, + "step": 2539 + }, + { + "epoch": 0.09096280910344334, + "grad_norm": 1.4985008239746094, + "learning_rate": 0.00019805768237195296, + "loss": 1.2324, + "step": 2540 + }, + { + "epoch": 0.09099862123301163, + "grad_norm": 1.9743802547454834, + "learning_rate": 0.00019805540673536597, + "loss": 1.3017, + "step": 2541 + }, + { + "epoch": 0.09103443336257991, + "grad_norm": 2.033809185028076, + "learning_rate": 0.00019805312977956997, + "loss": 1.1144, + "step": 2542 + }, + { + "epoch": 0.09107024549214819, + "grad_norm": 1.7084975242614746, + "learning_rate": 0.0001980508515045956, + "loss": 1.1758, + "step": 2543 + }, + { + "epoch": 0.09110605762171647, + "grad_norm": 1.6796820163726807, + "learning_rate": 0.00019804857191047353, + "loss": 1.4867, + "step": 2544 + }, + { + "epoch": 0.09114186975128476, + "grad_norm": 2.544792890548706, + "learning_rate": 0.00019804629099723435, + "loss": 1.3496, + "step": 2545 + }, + { + "epoch": 0.09117768188085304, + "grad_norm": 1.7478322982788086, + "learning_rate": 0.00019804400876490883, + "loss": 1.3918, + "step": 2546 + }, + { + "epoch": 0.09121349401042134, + "grad_norm": 2.805248975753784, + "learning_rate": 0.00019804172521352761, + "loss": 1.1882, + "step": 2547 + }, + { + "epoch": 0.09124930613998962, + "grad_norm": 2.0942447185516357, + "learning_rate": 0.00019803944034312148, + "loss": 1.3624, + "step": 2548 + }, + { + "epoch": 0.0912851182695579, + "grad_norm": 2.0264437198638916, + "learning_rate": 0.0001980371541537211, + "loss": 1.1408, + "step": 2549 + }, + { + "epoch": 0.09132093039912619, + "grad_norm": 1.5914636850357056, + "learning_rate": 0.0001980348666453573, + "loss": 1.2745, + "step": 2550 + }, + { + "epoch": 0.09135674252869447, + "grad_norm": 1.6150305271148682, + "learning_rate": 0.00019803257781806082, + "loss": 1.3122, + "step": 2551 + }, + { + "epoch": 0.09139255465826275, + "grad_norm": 1.6927369832992554, + "learning_rate": 0.00019803028767186246, + "loss": 1.3103, + "step": 2552 + }, + { + "epoch": 0.09142836678783103, + "grad_norm": 1.587031364440918, + "learning_rate": 0.000198027996206793, + "loss": 1.2019, + "step": 2553 + }, + { + "epoch": 0.09146417891739932, + "grad_norm": 1.6645021438598633, + "learning_rate": 0.0001980257034228833, + "loss": 1.3498, + "step": 2554 + }, + { + "epoch": 0.09149999104696761, + "grad_norm": 2.279149055480957, + "learning_rate": 0.00019802340932016424, + "loss": 1.3358, + "step": 2555 + }, + { + "epoch": 0.0915358031765359, + "grad_norm": 2.0482683181762695, + "learning_rate": 0.00019802111389866664, + "loss": 1.2837, + "step": 2556 + }, + { + "epoch": 0.09157161530610418, + "grad_norm": 1.453730821609497, + "learning_rate": 0.00019801881715842136, + "loss": 1.1378, + "step": 2557 + }, + { + "epoch": 0.09160742743567246, + "grad_norm": 1.369621753692627, + "learning_rate": 0.00019801651909945935, + "loss": 1.2166, + "step": 2558 + }, + { + "epoch": 0.09164323956524074, + "grad_norm": 1.4688940048217773, + "learning_rate": 0.0001980142197218115, + "loss": 1.1996, + "step": 2559 + }, + { + "epoch": 0.09167905169480903, + "grad_norm": 1.7568817138671875, + "learning_rate": 0.0001980119190255088, + "loss": 1.1591, + "step": 2560 + }, + { + "epoch": 0.09171486382437731, + "grad_norm": 2.6373610496520996, + "learning_rate": 0.0001980096170105821, + "loss": 1.214, + "step": 2561 + }, + { + "epoch": 0.0917506759539456, + "grad_norm": 1.4568703174591064, + "learning_rate": 0.00019800731367706248, + "loss": 1.2938, + "step": 2562 + }, + { + "epoch": 0.09178648808351389, + "grad_norm": 1.7917625904083252, + "learning_rate": 0.0001980050090249808, + "loss": 1.2461, + "step": 2563 + }, + { + "epoch": 0.09182230021308217, + "grad_norm": 1.8370487689971924, + "learning_rate": 0.0001980027030543682, + "loss": 1.2011, + "step": 2564 + }, + { + "epoch": 0.09185811234265046, + "grad_norm": 1.6005456447601318, + "learning_rate": 0.00019800039576525562, + "loss": 1.3767, + "step": 2565 + }, + { + "epoch": 0.09189392447221874, + "grad_norm": 1.755397915840149, + "learning_rate": 0.00019799808715767413, + "loss": 1.3565, + "step": 2566 + }, + { + "epoch": 0.09192973660178702, + "grad_norm": 1.5920376777648926, + "learning_rate": 0.00019799577723165479, + "loss": 1.3074, + "step": 2567 + }, + { + "epoch": 0.0919655487313553, + "grad_norm": 2.8187429904937744, + "learning_rate": 0.0001979934659872287, + "loss": 1.3943, + "step": 2568 + }, + { + "epoch": 0.0920013608609236, + "grad_norm": 1.5627751350402832, + "learning_rate": 0.00019799115342442687, + "loss": 1.3226, + "step": 2569 + }, + { + "epoch": 0.09203717299049188, + "grad_norm": 1.5544811487197876, + "learning_rate": 0.0001979888395432805, + "loss": 1.2807, + "step": 2570 + }, + { + "epoch": 0.09207298512006017, + "grad_norm": 1.7239794731140137, + "learning_rate": 0.00019798652434382068, + "loss": 1.2988, + "step": 2571 + }, + { + "epoch": 0.09210879724962845, + "grad_norm": 1.4671499729156494, + "learning_rate": 0.0001979842078260786, + "loss": 1.1, + "step": 2572 + }, + { + "epoch": 0.09214460937919673, + "grad_norm": 1.1498817205429077, + "learning_rate": 0.00019798188999008536, + "loss": 1.037, + "step": 2573 + }, + { + "epoch": 0.09218042150876501, + "grad_norm": 1.5970933437347412, + "learning_rate": 0.00019797957083587218, + "loss": 1.143, + "step": 2574 + }, + { + "epoch": 0.0922162336383333, + "grad_norm": 2.1910548210144043, + "learning_rate": 0.00019797725036347025, + "loss": 1.2094, + "step": 2575 + }, + { + "epoch": 0.0922520457679016, + "grad_norm": 1.6814172267913818, + "learning_rate": 0.00019797492857291085, + "loss": 1.1409, + "step": 2576 + }, + { + "epoch": 0.09228785789746988, + "grad_norm": 1.7379227876663208, + "learning_rate": 0.00019797260546422512, + "loss": 1.3843, + "step": 2577 + }, + { + "epoch": 0.09232367002703816, + "grad_norm": 2.3140296936035156, + "learning_rate": 0.00019797028103744438, + "loss": 1.2397, + "step": 2578 + }, + { + "epoch": 0.09235948215660644, + "grad_norm": 1.338460087776184, + "learning_rate": 0.00019796795529259986, + "loss": 1.2539, + "step": 2579 + }, + { + "epoch": 0.09239529428617473, + "grad_norm": 1.4784398078918457, + "learning_rate": 0.0001979656282297229, + "loss": 1.0928, + "step": 2580 + }, + { + "epoch": 0.09243110641574301, + "grad_norm": 2.12530779838562, + "learning_rate": 0.00019796329984884473, + "loss": 1.3456, + "step": 2581 + }, + { + "epoch": 0.09246691854531129, + "grad_norm": 1.666161298751831, + "learning_rate": 0.00019796097014999678, + "loss": 1.3005, + "step": 2582 + }, + { + "epoch": 0.09250273067487959, + "grad_norm": 2.2467494010925293, + "learning_rate": 0.0001979586391332103, + "loss": 1.302, + "step": 2583 + }, + { + "epoch": 0.09253854280444787, + "grad_norm": 1.2508139610290527, + "learning_rate": 0.0001979563067985167, + "loss": 1.2483, + "step": 2584 + }, + { + "epoch": 0.09257435493401615, + "grad_norm": 1.4863941669464111, + "learning_rate": 0.00019795397314594735, + "loss": 1.2147, + "step": 2585 + }, + { + "epoch": 0.09261016706358444, + "grad_norm": 1.6971149444580078, + "learning_rate": 0.00019795163817553363, + "loss": 1.0991, + "step": 2586 + }, + { + "epoch": 0.09264597919315272, + "grad_norm": 2.2937474250793457, + "learning_rate": 0.000197949301887307, + "loss": 1.1238, + "step": 2587 + }, + { + "epoch": 0.092681791322721, + "grad_norm": 1.7298513650894165, + "learning_rate": 0.00019794696428129883, + "loss": 1.0024, + "step": 2588 + }, + { + "epoch": 0.09271760345228929, + "grad_norm": 1.237941026687622, + "learning_rate": 0.0001979446253575406, + "loss": 1.162, + "step": 2589 + }, + { + "epoch": 0.09275341558185758, + "grad_norm": 2.200965642929077, + "learning_rate": 0.00019794228511606376, + "loss": 1.2136, + "step": 2590 + }, + { + "epoch": 0.09278922771142586, + "grad_norm": 2.342883825302124, + "learning_rate": 0.00019793994355689985, + "loss": 1.2068, + "step": 2591 + }, + { + "epoch": 0.09282503984099415, + "grad_norm": 2.221367597579956, + "learning_rate": 0.0001979376006800803, + "loss": 1.2873, + "step": 2592 + }, + { + "epoch": 0.09286085197056243, + "grad_norm": 2.0996158123016357, + "learning_rate": 0.00019793525648563668, + "loss": 1.159, + "step": 2593 + }, + { + "epoch": 0.09289666410013071, + "grad_norm": 1.4878617525100708, + "learning_rate": 0.0001979329109736005, + "loss": 1.3516, + "step": 2594 + }, + { + "epoch": 0.092932476229699, + "grad_norm": 2.101564645767212, + "learning_rate": 0.00019793056414400332, + "loss": 1.4136, + "step": 2595 + }, + { + "epoch": 0.09296828835926728, + "grad_norm": 1.8938322067260742, + "learning_rate": 0.00019792821599687676, + "loss": 1.1894, + "step": 2596 + }, + { + "epoch": 0.09300410048883558, + "grad_norm": 1.7714389562606812, + "learning_rate": 0.00019792586653225237, + "loss": 1.1614, + "step": 2597 + }, + { + "epoch": 0.09303991261840386, + "grad_norm": 1.798006534576416, + "learning_rate": 0.00019792351575016173, + "loss": 1.1396, + "step": 2598 + }, + { + "epoch": 0.09307572474797214, + "grad_norm": 2.04874324798584, + "learning_rate": 0.0001979211636506365, + "loss": 1.2285, + "step": 2599 + }, + { + "epoch": 0.09311153687754042, + "grad_norm": 2.34702205657959, + "learning_rate": 0.0001979188102337083, + "loss": 1.1943, + "step": 2600 + }, + { + "epoch": 0.0931473490071087, + "grad_norm": 1.637847661972046, + "learning_rate": 0.00019791645549940886, + "loss": 1.2746, + "step": 2601 + }, + { + "epoch": 0.09318316113667699, + "grad_norm": 1.5934511423110962, + "learning_rate": 0.0001979140994477698, + "loss": 1.2883, + "step": 2602 + }, + { + "epoch": 0.09321897326624527, + "grad_norm": 1.7709951400756836, + "learning_rate": 0.00019791174207882284, + "loss": 1.0891, + "step": 2603 + }, + { + "epoch": 0.09325478539581356, + "grad_norm": 1.3033725023269653, + "learning_rate": 0.00019790938339259967, + "loss": 1.2345, + "step": 2604 + }, + { + "epoch": 0.09329059752538185, + "grad_norm": 1.9779174327850342, + "learning_rate": 0.00019790702338913204, + "loss": 1.1779, + "step": 2605 + }, + { + "epoch": 0.09332640965495013, + "grad_norm": 1.7167448997497559, + "learning_rate": 0.0001979046620684517, + "loss": 1.1378, + "step": 2606 + }, + { + "epoch": 0.09336222178451842, + "grad_norm": 1.8844178915023804, + "learning_rate": 0.00019790229943059045, + "loss": 1.2586, + "step": 2607 + }, + { + "epoch": 0.0933980339140867, + "grad_norm": 1.4839868545532227, + "learning_rate": 0.00019789993547558, + "loss": 1.3679, + "step": 2608 + }, + { + "epoch": 0.09343384604365498, + "grad_norm": 1.5786889791488647, + "learning_rate": 0.00019789757020345224, + "loss": 1.2769, + "step": 2609 + }, + { + "epoch": 0.09346965817322327, + "grad_norm": 2.193488597869873, + "learning_rate": 0.00019789520361423893, + "loss": 1.3902, + "step": 2610 + }, + { + "epoch": 0.09350547030279155, + "grad_norm": 1.3334327936172485, + "learning_rate": 0.00019789283570797192, + "loss": 1.3656, + "step": 2611 + }, + { + "epoch": 0.09354128243235985, + "grad_norm": 1.8428736925125122, + "learning_rate": 0.0001978904664846831, + "loss": 1.197, + "step": 2612 + }, + { + "epoch": 0.09357709456192813, + "grad_norm": 2.075793743133545, + "learning_rate": 0.00019788809594440432, + "loss": 1.2026, + "step": 2613 + }, + { + "epoch": 0.09361290669149641, + "grad_norm": 1.8888881206512451, + "learning_rate": 0.00019788572408716747, + "loss": 1.3667, + "step": 2614 + }, + { + "epoch": 0.0936487188210647, + "grad_norm": 2.3414604663848877, + "learning_rate": 0.00019788335091300448, + "loss": 1.3274, + "step": 2615 + }, + { + "epoch": 0.09368453095063298, + "grad_norm": 1.8494720458984375, + "learning_rate": 0.00019788097642194725, + "loss": 1.3228, + "step": 2616 + }, + { + "epoch": 0.09372034308020126, + "grad_norm": 1.716713309288025, + "learning_rate": 0.00019787860061402774, + "loss": 1.2005, + "step": 2617 + }, + { + "epoch": 0.09375615520976954, + "grad_norm": 2.7576706409454346, + "learning_rate": 0.00019787622348927793, + "loss": 1.217, + "step": 2618 + }, + { + "epoch": 0.09379196733933784, + "grad_norm": 2.180371046066284, + "learning_rate": 0.00019787384504772976, + "loss": 1.1111, + "step": 2619 + }, + { + "epoch": 0.09382777946890612, + "grad_norm": 3.5557303428649902, + "learning_rate": 0.00019787146528941528, + "loss": 1.5635, + "step": 2620 + }, + { + "epoch": 0.0938635915984744, + "grad_norm": 1.4034497737884521, + "learning_rate": 0.00019786908421436645, + "loss": 1.293, + "step": 2621 + }, + { + "epoch": 0.09389940372804269, + "grad_norm": 1.4488781690597534, + "learning_rate": 0.00019786670182261534, + "loss": 1.2427, + "step": 2622 + }, + { + "epoch": 0.09393521585761097, + "grad_norm": 1.6248698234558105, + "learning_rate": 0.00019786431811419402, + "loss": 1.0954, + "step": 2623 + }, + { + "epoch": 0.09397102798717925, + "grad_norm": 1.71968412399292, + "learning_rate": 0.0001978619330891345, + "loss": 1.4266, + "step": 2624 + }, + { + "epoch": 0.09400684011674754, + "grad_norm": 3.3918917179107666, + "learning_rate": 0.0001978595467474689, + "loss": 1.2429, + "step": 2625 + }, + { + "epoch": 0.09404265224631583, + "grad_norm": 1.9212273359298706, + "learning_rate": 0.00019785715908922938, + "loss": 1.06, + "step": 2626 + }, + { + "epoch": 0.09407846437588412, + "grad_norm": 1.8142613172531128, + "learning_rate": 0.00019785477011444798, + "loss": 1.0878, + "step": 2627 + }, + { + "epoch": 0.0941142765054524, + "grad_norm": 1.5745428800582886, + "learning_rate": 0.00019785237982315686, + "loss": 1.2516, + "step": 2628 + }, + { + "epoch": 0.09415008863502068, + "grad_norm": 1.5213465690612793, + "learning_rate": 0.0001978499882153882, + "loss": 1.2853, + "step": 2629 + }, + { + "epoch": 0.09418590076458896, + "grad_norm": 1.3543118238449097, + "learning_rate": 0.00019784759529117415, + "loss": 1.0821, + "step": 2630 + }, + { + "epoch": 0.09422171289415725, + "grad_norm": 2.0561363697052, + "learning_rate": 0.000197845201050547, + "loss": 1.0568, + "step": 2631 + }, + { + "epoch": 0.09425752502372553, + "grad_norm": 1.6196810007095337, + "learning_rate": 0.0001978428054935388, + "loss": 1.1456, + "step": 2632 + }, + { + "epoch": 0.09429333715329383, + "grad_norm": 2.5416171550750732, + "learning_rate": 0.00019784040862018184, + "loss": 1.1566, + "step": 2633 + }, + { + "epoch": 0.09432914928286211, + "grad_norm": 1.5292414426803589, + "learning_rate": 0.00019783801043050844, + "loss": 1.1419, + "step": 2634 + }, + { + "epoch": 0.09436496141243039, + "grad_norm": 1.5518923997879028, + "learning_rate": 0.0001978356109245508, + "loss": 1.1964, + "step": 2635 + }, + { + "epoch": 0.09440077354199868, + "grad_norm": 1.5205786228179932, + "learning_rate": 0.00019783321010234122, + "loss": 1.3056, + "step": 2636 + }, + { + "epoch": 0.09443658567156696, + "grad_norm": 2.4911389350891113, + "learning_rate": 0.000197830807963912, + "loss": 1.152, + "step": 2637 + }, + { + "epoch": 0.09447239780113524, + "grad_norm": 1.195717692375183, + "learning_rate": 0.00019782840450929543, + "loss": 1.2686, + "step": 2638 + }, + { + "epoch": 0.09450820993070352, + "grad_norm": 1.5056260824203491, + "learning_rate": 0.00019782599973852387, + "loss": 1.3082, + "step": 2639 + }, + { + "epoch": 0.09454402206027182, + "grad_norm": 1.4531906843185425, + "learning_rate": 0.0001978235936516297, + "loss": 1.0194, + "step": 2640 + }, + { + "epoch": 0.0945798341898401, + "grad_norm": 1.573920488357544, + "learning_rate": 0.0001978211862486452, + "loss": 1.211, + "step": 2641 + }, + { + "epoch": 0.09461564631940839, + "grad_norm": 1.4735695123672485, + "learning_rate": 0.00019781877752960285, + "loss": 1.2038, + "step": 2642 + }, + { + "epoch": 0.09465145844897667, + "grad_norm": 1.426507592201233, + "learning_rate": 0.00019781636749453504, + "loss": 1.2299, + "step": 2643 + }, + { + "epoch": 0.09468727057854495, + "grad_norm": 2.4257867336273193, + "learning_rate": 0.00019781395614347415, + "loss": 1.5164, + "step": 2644 + }, + { + "epoch": 0.09472308270811323, + "grad_norm": 1.80821692943573, + "learning_rate": 0.0001978115434764527, + "loss": 1.2361, + "step": 2645 + }, + { + "epoch": 0.09475889483768152, + "grad_norm": 1.766276240348816, + "learning_rate": 0.00019780912949350307, + "loss": 1.2207, + "step": 2646 + }, + { + "epoch": 0.09479470696724981, + "grad_norm": 1.3676961660385132, + "learning_rate": 0.00019780671419465776, + "loss": 1.0804, + "step": 2647 + }, + { + "epoch": 0.0948305190968181, + "grad_norm": 2.823322057723999, + "learning_rate": 0.00019780429757994928, + "loss": 1.1325, + "step": 2648 + }, + { + "epoch": 0.09486633122638638, + "grad_norm": 1.289958119392395, + "learning_rate": 0.00019780187964941011, + "loss": 1.18, + "step": 2649 + }, + { + "epoch": 0.09490214335595466, + "grad_norm": 1.6708669662475586, + "learning_rate": 0.00019779946040307284, + "loss": 1.1428, + "step": 2650 + }, + { + "epoch": 0.09493795548552295, + "grad_norm": 2.4587326049804688, + "learning_rate": 0.00019779703984096998, + "loss": 1.2531, + "step": 2651 + }, + { + "epoch": 0.09497376761509123, + "grad_norm": 2.262280225753784, + "learning_rate": 0.00019779461796313408, + "loss": 1.1606, + "step": 2652 + }, + { + "epoch": 0.09500957974465951, + "grad_norm": 1.784002661705017, + "learning_rate": 0.00019779219476959777, + "loss": 1.0607, + "step": 2653 + }, + { + "epoch": 0.0950453918742278, + "grad_norm": 2.69956111907959, + "learning_rate": 0.0001977897702603936, + "loss": 1.4824, + "step": 2654 + }, + { + "epoch": 0.09508120400379609, + "grad_norm": 2.2476463317871094, + "learning_rate": 0.0001977873444355542, + "loss": 1.2725, + "step": 2655 + }, + { + "epoch": 0.09511701613336437, + "grad_norm": 3.5810654163360596, + "learning_rate": 0.00019778491729511224, + "loss": 1.2319, + "step": 2656 + }, + { + "epoch": 0.09515282826293266, + "grad_norm": 1.456548810005188, + "learning_rate": 0.00019778248883910035, + "loss": 1.2805, + "step": 2657 + }, + { + "epoch": 0.09518864039250094, + "grad_norm": 1.3787672519683838, + "learning_rate": 0.0001977800590675512, + "loss": 1.1876, + "step": 2658 + }, + { + "epoch": 0.09522445252206922, + "grad_norm": 1.7239669561386108, + "learning_rate": 0.0001977776279804975, + "loss": 1.1883, + "step": 2659 + }, + { + "epoch": 0.0952602646516375, + "grad_norm": 1.770757794380188, + "learning_rate": 0.0001977751955779719, + "loss": 0.9123, + "step": 2660 + }, + { + "epoch": 0.09529607678120579, + "grad_norm": 2.3640689849853516, + "learning_rate": 0.00019777276186000716, + "loss": 1.1777, + "step": 2661 + }, + { + "epoch": 0.09533188891077408, + "grad_norm": 2.264645576477051, + "learning_rate": 0.00019777032682663607, + "loss": 1.3429, + "step": 2662 + }, + { + "epoch": 0.09536770104034237, + "grad_norm": 1.5059412717819214, + "learning_rate": 0.00019776789047789133, + "loss": 1.3737, + "step": 2663 + }, + { + "epoch": 0.09540351316991065, + "grad_norm": 1.8164383172988892, + "learning_rate": 0.0001977654528138057, + "loss": 1.2688, + "step": 2664 + }, + { + "epoch": 0.09543932529947893, + "grad_norm": 1.5609298944473267, + "learning_rate": 0.00019776301383441207, + "loss": 1.1716, + "step": 2665 + }, + { + "epoch": 0.09547513742904722, + "grad_norm": 1.8076540231704712, + "learning_rate": 0.00019776057353974315, + "loss": 1.2619, + "step": 2666 + }, + { + "epoch": 0.0955109495586155, + "grad_norm": 1.9957425594329834, + "learning_rate": 0.00019775813192983183, + "loss": 0.8561, + "step": 2667 + }, + { + "epoch": 0.09554676168818378, + "grad_norm": 2.2655372619628906, + "learning_rate": 0.00019775568900471096, + "loss": 1.1281, + "step": 2668 + }, + { + "epoch": 0.09558257381775208, + "grad_norm": 1.8234918117523193, + "learning_rate": 0.00019775324476441336, + "loss": 1.0729, + "step": 2669 + }, + { + "epoch": 0.09561838594732036, + "grad_norm": 2.3984649181365967, + "learning_rate": 0.00019775079920897196, + "loss": 1.3076, + "step": 2670 + }, + { + "epoch": 0.09565419807688864, + "grad_norm": 1.4292970895767212, + "learning_rate": 0.00019774835233841965, + "loss": 1.3064, + "step": 2671 + }, + { + "epoch": 0.09569001020645693, + "grad_norm": 1.9665076732635498, + "learning_rate": 0.00019774590415278933, + "loss": 1.0777, + "step": 2672 + }, + { + "epoch": 0.09572582233602521, + "grad_norm": 1.6812001466751099, + "learning_rate": 0.00019774345465211398, + "loss": 1.5112, + "step": 2673 + }, + { + "epoch": 0.09576163446559349, + "grad_norm": 1.8586666584014893, + "learning_rate": 0.00019774100383642651, + "loss": 1.0958, + "step": 2674 + }, + { + "epoch": 0.09579744659516178, + "grad_norm": 2.505612850189209, + "learning_rate": 0.0001977385517057599, + "loss": 1.2393, + "step": 2675 + }, + { + "epoch": 0.09583325872473007, + "grad_norm": 2.0924320220947266, + "learning_rate": 0.00019773609826014718, + "loss": 1.3262, + "step": 2676 + }, + { + "epoch": 0.09586907085429835, + "grad_norm": 2.0002737045288086, + "learning_rate": 0.0001977336434996213, + "loss": 1.0588, + "step": 2677 + }, + { + "epoch": 0.09590488298386664, + "grad_norm": 1.740734577178955, + "learning_rate": 0.00019773118742421532, + "loss": 1.0606, + "step": 2678 + }, + { + "epoch": 0.09594069511343492, + "grad_norm": 2.3396644592285156, + "learning_rate": 0.00019772873003396228, + "loss": 1.2466, + "step": 2679 + }, + { + "epoch": 0.0959765072430032, + "grad_norm": 1.775567889213562, + "learning_rate": 0.00019772627132889526, + "loss": 1.2349, + "step": 2680 + }, + { + "epoch": 0.09601231937257149, + "grad_norm": 1.7995229959487915, + "learning_rate": 0.00019772381130904728, + "loss": 1.2253, + "step": 2681 + }, + { + "epoch": 0.09604813150213977, + "grad_norm": 1.5477373600006104, + "learning_rate": 0.0001977213499744515, + "loss": 1.3125, + "step": 2682 + }, + { + "epoch": 0.09608394363170807, + "grad_norm": 3.076563835144043, + "learning_rate": 0.00019771888732514098, + "loss": 1.1768, + "step": 2683 + }, + { + "epoch": 0.09611975576127635, + "grad_norm": 1.3191670179367065, + "learning_rate": 0.00019771642336114892, + "loss": 0.8956, + "step": 2684 + }, + { + "epoch": 0.09615556789084463, + "grad_norm": 1.519983172416687, + "learning_rate": 0.0001977139580825084, + "loss": 1.0926, + "step": 2685 + }, + { + "epoch": 0.09619138002041291, + "grad_norm": 2.9822170734405518, + "learning_rate": 0.0001977114914892526, + "loss": 1.4398, + "step": 2686 + }, + { + "epoch": 0.0962271921499812, + "grad_norm": 2.7970845699310303, + "learning_rate": 0.00019770902358141478, + "loss": 1.0794, + "step": 2687 + }, + { + "epoch": 0.09626300427954948, + "grad_norm": 1.70359468460083, + "learning_rate": 0.00019770655435902805, + "loss": 1.1794, + "step": 2688 + }, + { + "epoch": 0.09629881640911776, + "grad_norm": 1.8265082836151123, + "learning_rate": 0.00019770408382212564, + "loss": 1.0276, + "step": 2689 + }, + { + "epoch": 0.09633462853868606, + "grad_norm": 1.494787573814392, + "learning_rate": 0.00019770161197074084, + "loss": 1.213, + "step": 2690 + }, + { + "epoch": 0.09637044066825434, + "grad_norm": 1.353816032409668, + "learning_rate": 0.00019769913880490688, + "loss": 1.1916, + "step": 2691 + }, + { + "epoch": 0.09640625279782263, + "grad_norm": 2.241114377975464, + "learning_rate": 0.000197696664324657, + "loss": 1.4315, + "step": 2692 + }, + { + "epoch": 0.09644206492739091, + "grad_norm": 1.6433030366897583, + "learning_rate": 0.00019769418853002454, + "loss": 1.0081, + "step": 2693 + }, + { + "epoch": 0.09647787705695919, + "grad_norm": 2.041844367980957, + "learning_rate": 0.0001976917114210428, + "loss": 1.1764, + "step": 2694 + }, + { + "epoch": 0.09651368918652747, + "grad_norm": 1.817533254623413, + "learning_rate": 0.00019768923299774506, + "loss": 1.1923, + "step": 2695 + }, + { + "epoch": 0.09654950131609576, + "grad_norm": 1.5782170295715332, + "learning_rate": 0.00019768675326016475, + "loss": 1.1088, + "step": 2696 + }, + { + "epoch": 0.09658531344566405, + "grad_norm": 1.2971657514572144, + "learning_rate": 0.00019768427220833514, + "loss": 1.0796, + "step": 2697 + }, + { + "epoch": 0.09662112557523234, + "grad_norm": 1.775241732597351, + "learning_rate": 0.00019768178984228967, + "loss": 1.4373, + "step": 2698 + }, + { + "epoch": 0.09665693770480062, + "grad_norm": 1.5123405456542969, + "learning_rate": 0.00019767930616206174, + "loss": 1.2161, + "step": 2699 + }, + { + "epoch": 0.0966927498343689, + "grad_norm": 2.102839231491089, + "learning_rate": 0.00019767682116768472, + "loss": 1.1452, + "step": 2700 + }, + { + "epoch": 0.09672856196393718, + "grad_norm": 1.4372493028640747, + "learning_rate": 0.00019767433485919206, + "loss": 1.1937, + "step": 2701 + }, + { + "epoch": 0.09676437409350547, + "grad_norm": 1.7801436185836792, + "learning_rate": 0.0001976718472366172, + "loss": 1.185, + "step": 2702 + }, + { + "epoch": 0.09680018622307375, + "grad_norm": 1.9375643730163574, + "learning_rate": 0.00019766935829999363, + "loss": 1.299, + "step": 2703 + }, + { + "epoch": 0.09683599835264203, + "grad_norm": 1.8871551752090454, + "learning_rate": 0.00019766686804935488, + "loss": 1.2408, + "step": 2704 + }, + { + "epoch": 0.09687181048221033, + "grad_norm": 1.58482825756073, + "learning_rate": 0.00019766437648473435, + "loss": 1.3431, + "step": 2705 + }, + { + "epoch": 0.09690762261177861, + "grad_norm": 1.44480562210083, + "learning_rate": 0.00019766188360616563, + "loss": 1.3167, + "step": 2706 + }, + { + "epoch": 0.0969434347413469, + "grad_norm": 1.6056838035583496, + "learning_rate": 0.00019765938941368222, + "loss": 1.3, + "step": 2707 + }, + { + "epoch": 0.09697924687091518, + "grad_norm": 1.7867616415023804, + "learning_rate": 0.00019765689390731773, + "loss": 1.3497, + "step": 2708 + }, + { + "epoch": 0.09701505900048346, + "grad_norm": 1.3780345916748047, + "learning_rate": 0.0001976543970871057, + "loss": 1.1642, + "step": 2709 + }, + { + "epoch": 0.09705087113005174, + "grad_norm": 1.6554670333862305, + "learning_rate": 0.0001976518989530797, + "loss": 1.3166, + "step": 2710 + }, + { + "epoch": 0.09708668325962003, + "grad_norm": 1.786044955253601, + "learning_rate": 0.00019764939950527336, + "loss": 1.2461, + "step": 2711 + }, + { + "epoch": 0.09712249538918832, + "grad_norm": 1.88020658493042, + "learning_rate": 0.0001976468987437203, + "loss": 1.2655, + "step": 2712 + }, + { + "epoch": 0.0971583075187566, + "grad_norm": 3.4603817462921143, + "learning_rate": 0.0001976443966684542, + "loss": 1.3273, + "step": 2713 + }, + { + "epoch": 0.09719411964832489, + "grad_norm": 1.4980496168136597, + "learning_rate": 0.00019764189327950869, + "loss": 1.073, + "step": 2714 + }, + { + "epoch": 0.09722993177789317, + "grad_norm": 2.287594795227051, + "learning_rate": 0.00019763938857691744, + "loss": 1.1231, + "step": 2715 + }, + { + "epoch": 0.09726574390746145, + "grad_norm": 1.4906936883926392, + "learning_rate": 0.00019763688256071418, + "loss": 1.3828, + "step": 2716 + }, + { + "epoch": 0.09730155603702974, + "grad_norm": 1.4275223016738892, + "learning_rate": 0.0001976343752309326, + "loss": 1.1636, + "step": 2717 + }, + { + "epoch": 0.09733736816659802, + "grad_norm": 1.9999109506607056, + "learning_rate": 0.00019763186658760645, + "loss": 1.2459, + "step": 2718 + }, + { + "epoch": 0.09737318029616632, + "grad_norm": 1.5791215896606445, + "learning_rate": 0.00019762935663076946, + "loss": 1.3512, + "step": 2719 + }, + { + "epoch": 0.0974089924257346, + "grad_norm": 1.4464889764785767, + "learning_rate": 0.00019762684536045542, + "loss": 1.0915, + "step": 2720 + }, + { + "epoch": 0.09744480455530288, + "grad_norm": 2.066521167755127, + "learning_rate": 0.00019762433277669807, + "loss": 1.444, + "step": 2721 + }, + { + "epoch": 0.09748061668487117, + "grad_norm": 3.526792526245117, + "learning_rate": 0.00019762181887953128, + "loss": 1.4108, + "step": 2722 + }, + { + "epoch": 0.09751642881443945, + "grad_norm": 1.8401739597320557, + "learning_rate": 0.00019761930366898883, + "loss": 1.2702, + "step": 2723 + }, + { + "epoch": 0.09755224094400773, + "grad_norm": 1.5853358507156372, + "learning_rate": 0.0001976167871451046, + "loss": 1.2104, + "step": 2724 + }, + { + "epoch": 0.09758805307357601, + "grad_norm": 2.0425379276275635, + "learning_rate": 0.00019761426930791238, + "loss": 1.5372, + "step": 2725 + }, + { + "epoch": 0.09762386520314431, + "grad_norm": 2.4989335536956787, + "learning_rate": 0.00019761175015744605, + "loss": 1.2876, + "step": 2726 + }, + { + "epoch": 0.0976596773327126, + "grad_norm": 1.742287039756775, + "learning_rate": 0.0001976092296937396, + "loss": 1.5573, + "step": 2727 + }, + { + "epoch": 0.09769548946228088, + "grad_norm": 1.3378328084945679, + "learning_rate": 0.00019760670791682685, + "loss": 1.2851, + "step": 2728 + }, + { + "epoch": 0.09773130159184916, + "grad_norm": 1.9125341176986694, + "learning_rate": 0.00019760418482674173, + "loss": 1.2456, + "step": 2729 + }, + { + "epoch": 0.09776711372141744, + "grad_norm": 1.6320644617080688, + "learning_rate": 0.0001976016604235182, + "loss": 1.1988, + "step": 2730 + }, + { + "epoch": 0.09780292585098573, + "grad_norm": 2.1355559825897217, + "learning_rate": 0.00019759913470719024, + "loss": 1.2261, + "step": 2731 + }, + { + "epoch": 0.09783873798055401, + "grad_norm": 1.5571328401565552, + "learning_rate": 0.00019759660767779184, + "loss": 1.2361, + "step": 2732 + }, + { + "epoch": 0.0978745501101223, + "grad_norm": 1.3427413702011108, + "learning_rate": 0.00019759407933535693, + "loss": 1.2868, + "step": 2733 + }, + { + "epoch": 0.09791036223969059, + "grad_norm": 1.8516055345535278, + "learning_rate": 0.0001975915496799196, + "loss": 1.4475, + "step": 2734 + }, + { + "epoch": 0.09794617436925887, + "grad_norm": 2.0296127796173096, + "learning_rate": 0.00019758901871151383, + "loss": 1.0903, + "step": 2735 + }, + { + "epoch": 0.09798198649882715, + "grad_norm": 1.5894348621368408, + "learning_rate": 0.00019758648643017373, + "loss": 0.948, + "step": 2736 + }, + { + "epoch": 0.09801779862839544, + "grad_norm": 1.9072284698486328, + "learning_rate": 0.0001975839528359333, + "loss": 1.0333, + "step": 2737 + }, + { + "epoch": 0.09805361075796372, + "grad_norm": 2.0503852367401123, + "learning_rate": 0.00019758141792882667, + "loss": 1.1416, + "step": 2738 + }, + { + "epoch": 0.098089422887532, + "grad_norm": 1.7128944396972656, + "learning_rate": 0.00019757888170888793, + "loss": 1.0493, + "step": 2739 + }, + { + "epoch": 0.0981252350171003, + "grad_norm": 2.3619141578674316, + "learning_rate": 0.0001975763441761512, + "loss": 1.2776, + "step": 2740 + }, + { + "epoch": 0.09816104714666858, + "grad_norm": 1.9806737899780273, + "learning_rate": 0.00019757380533065065, + "loss": 1.2865, + "step": 2741 + }, + { + "epoch": 0.09819685927623686, + "grad_norm": 1.6258699893951416, + "learning_rate": 0.00019757126517242038, + "loss": 1.2956, + "step": 2742 + }, + { + "epoch": 0.09823267140580515, + "grad_norm": 1.5297434329986572, + "learning_rate": 0.0001975687237014946, + "loss": 1.2865, + "step": 2743 + }, + { + "epoch": 0.09826848353537343, + "grad_norm": 1.7148276567459106, + "learning_rate": 0.0001975661809179075, + "loss": 1.2711, + "step": 2744 + }, + { + "epoch": 0.09830429566494171, + "grad_norm": 1.4782956838607788, + "learning_rate": 0.0001975636368216933, + "loss": 1.1206, + "step": 2745 + }, + { + "epoch": 0.09834010779451, + "grad_norm": 1.6929378509521484, + "learning_rate": 0.0001975610914128862, + "loss": 1.3415, + "step": 2746 + }, + { + "epoch": 0.09837591992407829, + "grad_norm": 2.2116494178771973, + "learning_rate": 0.00019755854469152045, + "loss": 1.0951, + "step": 2747 + }, + { + "epoch": 0.09841173205364658, + "grad_norm": 1.7289624214172363, + "learning_rate": 0.00019755599665763037, + "loss": 1.0937, + "step": 2748 + }, + { + "epoch": 0.09844754418321486, + "grad_norm": 2.076066493988037, + "learning_rate": 0.00019755344731125013, + "loss": 1.3214, + "step": 2749 + }, + { + "epoch": 0.09848335631278314, + "grad_norm": 1.3917784690856934, + "learning_rate": 0.00019755089665241413, + "loss": 1.241, + "step": 2750 + }, + { + "epoch": 0.09851916844235142, + "grad_norm": 1.859021782875061, + "learning_rate": 0.00019754834468115664, + "loss": 1.2763, + "step": 2751 + }, + { + "epoch": 0.0985549805719197, + "grad_norm": 1.7205578088760376, + "learning_rate": 0.00019754579139751198, + "loss": 1.4124, + "step": 2752 + }, + { + "epoch": 0.09859079270148799, + "grad_norm": 1.8846122026443481, + "learning_rate": 0.00019754323680151457, + "loss": 1.2077, + "step": 2753 + }, + { + "epoch": 0.09862660483105627, + "grad_norm": 2.449166774749756, + "learning_rate": 0.00019754068089319869, + "loss": 1.2007, + "step": 2754 + }, + { + "epoch": 0.09866241696062457, + "grad_norm": 2.0212907791137695, + "learning_rate": 0.00019753812367259878, + "loss": 1.1109, + "step": 2755 + }, + { + "epoch": 0.09869822909019285, + "grad_norm": 2.26348614692688, + "learning_rate": 0.00019753556513974922, + "loss": 1.1999, + "step": 2756 + }, + { + "epoch": 0.09873404121976113, + "grad_norm": 1.546715259552002, + "learning_rate": 0.00019753300529468446, + "loss": 1.2587, + "step": 2757 + }, + { + "epoch": 0.09876985334932942, + "grad_norm": 1.552450180053711, + "learning_rate": 0.00019753044413743892, + "loss": 1.38, + "step": 2758 + }, + { + "epoch": 0.0988056654788977, + "grad_norm": 2.4945931434631348, + "learning_rate": 0.00019752788166804702, + "loss": 1.4127, + "step": 2759 + }, + { + "epoch": 0.09884147760846598, + "grad_norm": 1.6542692184448242, + "learning_rate": 0.0001975253178865433, + "loss": 1.1861, + "step": 2760 + }, + { + "epoch": 0.09887728973803427, + "grad_norm": 1.3813605308532715, + "learning_rate": 0.00019752275279296227, + "loss": 1.2214, + "step": 2761 + }, + { + "epoch": 0.09891310186760256, + "grad_norm": 1.3887815475463867, + "learning_rate": 0.00019752018638733836, + "loss": 1.2305, + "step": 2762 + }, + { + "epoch": 0.09894891399717085, + "grad_norm": 1.8205652236938477, + "learning_rate": 0.00019751761866970612, + "loss": 1.319, + "step": 2763 + }, + { + "epoch": 0.09898472612673913, + "grad_norm": 1.4133182764053345, + "learning_rate": 0.00019751504964010016, + "loss": 1.3114, + "step": 2764 + }, + { + "epoch": 0.09902053825630741, + "grad_norm": 1.6347718238830566, + "learning_rate": 0.00019751247929855495, + "loss": 1.0341, + "step": 2765 + }, + { + "epoch": 0.0990563503858757, + "grad_norm": 1.7313061952590942, + "learning_rate": 0.0001975099076451051, + "loss": 1.1846, + "step": 2766 + }, + { + "epoch": 0.09909216251544398, + "grad_norm": 2.3311643600463867, + "learning_rate": 0.00019750733467978525, + "loss": 1.2142, + "step": 2767 + }, + { + "epoch": 0.09912797464501226, + "grad_norm": 2.191312313079834, + "learning_rate": 0.00019750476040262998, + "loss": 1.4953, + "step": 2768 + }, + { + "epoch": 0.09916378677458056, + "grad_norm": 1.4628626108169556, + "learning_rate": 0.00019750218481367392, + "loss": 1.2615, + "step": 2769 + }, + { + "epoch": 0.09919959890414884, + "grad_norm": 1.3801952600479126, + "learning_rate": 0.00019749960791295174, + "loss": 1.3479, + "step": 2770 + }, + { + "epoch": 0.09923541103371712, + "grad_norm": 1.832008957862854, + "learning_rate": 0.0001974970297004981, + "loss": 1.3381, + "step": 2771 + }, + { + "epoch": 0.0992712231632854, + "grad_norm": 1.5091166496276855, + "learning_rate": 0.0001974944501763477, + "loss": 1.2105, + "step": 2772 + }, + { + "epoch": 0.09930703529285369, + "grad_norm": 1.777061939239502, + "learning_rate": 0.0001974918693405352, + "loss": 1.2228, + "step": 2773 + }, + { + "epoch": 0.09934284742242197, + "grad_norm": 1.5049452781677246, + "learning_rate": 0.0001974892871930954, + "loss": 1.2135, + "step": 2774 + }, + { + "epoch": 0.09937865955199025, + "grad_norm": 1.883262276649475, + "learning_rate": 0.00019748670373406294, + "loss": 1.2675, + "step": 2775 + }, + { + "epoch": 0.09941447168155855, + "grad_norm": 2.051835775375366, + "learning_rate": 0.00019748411896347267, + "loss": 1.1559, + "step": 2776 + }, + { + "epoch": 0.09945028381112683, + "grad_norm": 1.5758163928985596, + "learning_rate": 0.00019748153288135932, + "loss": 1.3276, + "step": 2777 + }, + { + "epoch": 0.09948609594069512, + "grad_norm": 1.3111058473587036, + "learning_rate": 0.0001974789454877577, + "loss": 1.0387, + "step": 2778 + }, + { + "epoch": 0.0995219080702634, + "grad_norm": 1.5280368328094482, + "learning_rate": 0.0001974763567827026, + "loss": 1.2331, + "step": 2779 + }, + { + "epoch": 0.09955772019983168, + "grad_norm": 1.5093393325805664, + "learning_rate": 0.00019747376676622878, + "loss": 1.2003, + "step": 2780 + }, + { + "epoch": 0.09959353232939996, + "grad_norm": 1.441476821899414, + "learning_rate": 0.00019747117543837125, + "loss": 1.4101, + "step": 2781 + }, + { + "epoch": 0.09962934445896825, + "grad_norm": 1.6688133478164673, + "learning_rate": 0.00019746858279916476, + "loss": 1.0501, + "step": 2782 + }, + { + "epoch": 0.09966515658853654, + "grad_norm": 1.8359642028808594, + "learning_rate": 0.0001974659888486442, + "loss": 1.3588, + "step": 2783 + }, + { + "epoch": 0.09970096871810483, + "grad_norm": 1.478863000869751, + "learning_rate": 0.0001974633935868445, + "loss": 1.0925, + "step": 2784 + }, + { + "epoch": 0.09973678084767311, + "grad_norm": 1.2663161754608154, + "learning_rate": 0.00019746079701380055, + "loss": 1.1842, + "step": 2785 + }, + { + "epoch": 0.09977259297724139, + "grad_norm": 2.351972818374634, + "learning_rate": 0.00019745819912954732, + "loss": 1.2673, + "step": 2786 + }, + { + "epoch": 0.09980840510680968, + "grad_norm": 1.575207233428955, + "learning_rate": 0.00019745559993411966, + "loss": 1.22, + "step": 2787 + }, + { + "epoch": 0.09984421723637796, + "grad_norm": 1.8584083318710327, + "learning_rate": 0.00019745299942755266, + "loss": 1.3126, + "step": 2788 + }, + { + "epoch": 0.09988002936594624, + "grad_norm": 1.5573323965072632, + "learning_rate": 0.00019745039760988127, + "loss": 1.2176, + "step": 2789 + }, + { + "epoch": 0.09991584149551454, + "grad_norm": 2.6952779293060303, + "learning_rate": 0.00019744779448114047, + "loss": 1.0878, + "step": 2790 + }, + { + "epoch": 0.09995165362508282, + "grad_norm": 1.5208580493927002, + "learning_rate": 0.00019744519004136527, + "loss": 1.0481, + "step": 2791 + }, + { + "epoch": 0.0999874657546511, + "grad_norm": 1.7516252994537354, + "learning_rate": 0.00019744258429059075, + "loss": 1.2346, + "step": 2792 + }, + { + "epoch": 0.10002327788421939, + "grad_norm": 1.2741330862045288, + "learning_rate": 0.00019743997722885198, + "loss": 1.1849, + "step": 2793 + }, + { + "epoch": 0.10005909001378767, + "grad_norm": 1.2291839122772217, + "learning_rate": 0.00019743736885618395, + "loss": 1.0402, + "step": 2794 + }, + { + "epoch": 0.10009490214335595, + "grad_norm": 2.318486452102661, + "learning_rate": 0.00019743475917262187, + "loss": 1.1358, + "step": 2795 + }, + { + "epoch": 0.10013071427292423, + "grad_norm": 1.6438391208648682, + "learning_rate": 0.00019743214817820074, + "loss": 1.2459, + "step": 2796 + }, + { + "epoch": 0.10016652640249253, + "grad_norm": 1.6616102457046509, + "learning_rate": 0.00019742953587295573, + "loss": 1.1208, + "step": 2797 + }, + { + "epoch": 0.10020233853206081, + "grad_norm": 1.6711572408676147, + "learning_rate": 0.000197426922256922, + "loss": 1.3163, + "step": 2798 + }, + { + "epoch": 0.1002381506616291, + "grad_norm": 1.974973440170288, + "learning_rate": 0.00019742430733013473, + "loss": 1.0508, + "step": 2799 + }, + { + "epoch": 0.10027396279119738, + "grad_norm": 1.487565517425537, + "learning_rate": 0.00019742169109262904, + "loss": 1.2878, + "step": 2800 + }, + { + "epoch": 0.10030977492076566, + "grad_norm": 1.8867043256759644, + "learning_rate": 0.00019741907354444018, + "loss": 1.2929, + "step": 2801 + }, + { + "epoch": 0.10034558705033395, + "grad_norm": 1.6791906356811523, + "learning_rate": 0.00019741645468560336, + "loss": 1.5003, + "step": 2802 + }, + { + "epoch": 0.10038139917990223, + "grad_norm": 2.335331439971924, + "learning_rate": 0.00019741383451615376, + "loss": 1.2684, + "step": 2803 + }, + { + "epoch": 0.10041721130947051, + "grad_norm": 1.9871655702590942, + "learning_rate": 0.0001974112130361267, + "loss": 1.2666, + "step": 2804 + }, + { + "epoch": 0.10045302343903881, + "grad_norm": 2.215463161468506, + "learning_rate": 0.0001974085902455574, + "loss": 1.2706, + "step": 2805 + }, + { + "epoch": 0.10048883556860709, + "grad_norm": 1.3567991256713867, + "learning_rate": 0.0001974059661444812, + "loss": 1.2428, + "step": 2806 + }, + { + "epoch": 0.10052464769817537, + "grad_norm": 2.400167226791382, + "learning_rate": 0.00019740334073293334, + "loss": 1.2365, + "step": 2807 + }, + { + "epoch": 0.10056045982774366, + "grad_norm": 1.6953130960464478, + "learning_rate": 0.0001974007140109492, + "loss": 1.2545, + "step": 2808 + }, + { + "epoch": 0.10059627195731194, + "grad_norm": 2.081000566482544, + "learning_rate": 0.00019739808597856405, + "loss": 1.3837, + "step": 2809 + }, + { + "epoch": 0.10063208408688022, + "grad_norm": 1.4785358905792236, + "learning_rate": 0.0001973954566358133, + "loss": 1.2518, + "step": 2810 + }, + { + "epoch": 0.1006678962164485, + "grad_norm": 1.3100039958953857, + "learning_rate": 0.0001973928259827323, + "loss": 1.1746, + "step": 2811 + }, + { + "epoch": 0.1007037083460168, + "grad_norm": 2.414891004562378, + "learning_rate": 0.0001973901940193565, + "loss": 1.1338, + "step": 2812 + }, + { + "epoch": 0.10073952047558508, + "grad_norm": 2.733438730239868, + "learning_rate": 0.00019738756074572127, + "loss": 1.2269, + "step": 2813 + }, + { + "epoch": 0.10077533260515337, + "grad_norm": 1.7628306150436401, + "learning_rate": 0.00019738492616186198, + "loss": 1.3408, + "step": 2814 + }, + { + "epoch": 0.10081114473472165, + "grad_norm": 2.15142822265625, + "learning_rate": 0.00019738229026781414, + "loss": 1.1897, + "step": 2815 + }, + { + "epoch": 0.10084695686428993, + "grad_norm": 1.449340581893921, + "learning_rate": 0.00019737965306361322, + "loss": 1.142, + "step": 2816 + }, + { + "epoch": 0.10088276899385822, + "grad_norm": 2.136775016784668, + "learning_rate": 0.00019737701454929468, + "loss": 1.347, + "step": 2817 + }, + { + "epoch": 0.1009185811234265, + "grad_norm": 1.3219484090805054, + "learning_rate": 0.000197374374724894, + "loss": 1.2456, + "step": 2818 + }, + { + "epoch": 0.1009543932529948, + "grad_norm": 1.6696664094924927, + "learning_rate": 0.0001973717335904467, + "loss": 1.1578, + "step": 2819 + }, + { + "epoch": 0.10099020538256308, + "grad_norm": 3.222177028656006, + "learning_rate": 0.00019736909114598833, + "loss": 1.383, + "step": 2820 + }, + { + "epoch": 0.10102601751213136, + "grad_norm": 1.4379724264144897, + "learning_rate": 0.00019736644739155445, + "loss": 1.0716, + "step": 2821 + }, + { + "epoch": 0.10106182964169964, + "grad_norm": 1.5540097951889038, + "learning_rate": 0.00019736380232718062, + "loss": 1.2117, + "step": 2822 + }, + { + "epoch": 0.10109764177126793, + "grad_norm": 1.9226773977279663, + "learning_rate": 0.00019736115595290238, + "loss": 1.3544, + "step": 2823 + }, + { + "epoch": 0.10113345390083621, + "grad_norm": 2.037550449371338, + "learning_rate": 0.00019735850826875542, + "loss": 1.185, + "step": 2824 + }, + { + "epoch": 0.10116926603040449, + "grad_norm": 1.3267382383346558, + "learning_rate": 0.0001973558592747753, + "loss": 1.1205, + "step": 2825 + }, + { + "epoch": 0.10120507815997279, + "grad_norm": 1.6238415241241455, + "learning_rate": 0.00019735320897099764, + "loss": 1.3865, + "step": 2826 + }, + { + "epoch": 0.10124089028954107, + "grad_norm": 1.4545915126800537, + "learning_rate": 0.00019735055735745817, + "loss": 1.3055, + "step": 2827 + }, + { + "epoch": 0.10127670241910935, + "grad_norm": 2.045335531234741, + "learning_rate": 0.0001973479044341925, + "loss": 1.1648, + "step": 2828 + }, + { + "epoch": 0.10131251454867764, + "grad_norm": 2.0693957805633545, + "learning_rate": 0.00019734525020123639, + "loss": 1.3954, + "step": 2829 + }, + { + "epoch": 0.10134832667824592, + "grad_norm": 2.137756109237671, + "learning_rate": 0.00019734259465862546, + "loss": 1.2467, + "step": 2830 + }, + { + "epoch": 0.1013841388078142, + "grad_norm": 2.1946370601654053, + "learning_rate": 0.0001973399378063955, + "loss": 0.9996, + "step": 2831 + }, + { + "epoch": 0.10141995093738249, + "grad_norm": 1.2792820930480957, + "learning_rate": 0.00019733727964458221, + "loss": 1.13, + "step": 2832 + }, + { + "epoch": 0.10145576306695078, + "grad_norm": 1.7665941715240479, + "learning_rate": 0.00019733462017322142, + "loss": 1.375, + "step": 2833 + }, + { + "epoch": 0.10149157519651907, + "grad_norm": 1.6662721633911133, + "learning_rate": 0.00019733195939234882, + "loss": 1.3381, + "step": 2834 + }, + { + "epoch": 0.10152738732608735, + "grad_norm": 1.1893996000289917, + "learning_rate": 0.00019732929730200031, + "loss": 1.2574, + "step": 2835 + }, + { + "epoch": 0.10156319945565563, + "grad_norm": 1.7696233987808228, + "learning_rate": 0.00019732663390221162, + "loss": 1.4527, + "step": 2836 + }, + { + "epoch": 0.10159901158522391, + "grad_norm": 1.4410967826843262, + "learning_rate": 0.0001973239691930186, + "loss": 1.2679, + "step": 2837 + }, + { + "epoch": 0.1016348237147922, + "grad_norm": 2.1522061824798584, + "learning_rate": 0.00019732130317445714, + "loss": 1.2893, + "step": 2838 + }, + { + "epoch": 0.10167063584436048, + "grad_norm": 1.4316394329071045, + "learning_rate": 0.00019731863584656308, + "loss": 1.1733, + "step": 2839 + }, + { + "epoch": 0.10170644797392878, + "grad_norm": 1.6126518249511719, + "learning_rate": 0.0001973159672093723, + "loss": 1.4514, + "step": 2840 + }, + { + "epoch": 0.10174226010349706, + "grad_norm": 1.6837921142578125, + "learning_rate": 0.00019731329726292073, + "loss": 1.1612, + "step": 2841 + }, + { + "epoch": 0.10177807223306534, + "grad_norm": 1.563593864440918, + "learning_rate": 0.00019731062600724424, + "loss": 1.1631, + "step": 2842 + }, + { + "epoch": 0.10181388436263362, + "grad_norm": 1.4960051774978638, + "learning_rate": 0.0001973079534423788, + "loss": 1.3559, + "step": 2843 + }, + { + "epoch": 0.10184969649220191, + "grad_norm": 1.3920435905456543, + "learning_rate": 0.00019730527956836035, + "loss": 1.1998, + "step": 2844 + }, + { + "epoch": 0.10188550862177019, + "grad_norm": 1.8852595090866089, + "learning_rate": 0.00019730260438522492, + "loss": 1.1751, + "step": 2845 + }, + { + "epoch": 0.10192132075133847, + "grad_norm": 2.772995948791504, + "learning_rate": 0.00019729992789300845, + "loss": 1.3397, + "step": 2846 + }, + { + "epoch": 0.10195713288090676, + "grad_norm": 1.2261860370635986, + "learning_rate": 0.00019729725009174693, + "loss": 1.1942, + "step": 2847 + }, + { + "epoch": 0.10199294501047505, + "grad_norm": 1.501965045928955, + "learning_rate": 0.00019729457098147647, + "loss": 1.1119, + "step": 2848 + }, + { + "epoch": 0.10202875714004334, + "grad_norm": 1.8482106924057007, + "learning_rate": 0.000197291890562233, + "loss": 1.4838, + "step": 2849 + }, + { + "epoch": 0.10206456926961162, + "grad_norm": 2.072953701019287, + "learning_rate": 0.00019728920883405263, + "loss": 1.1302, + "step": 2850 + }, + { + "epoch": 0.1021003813991799, + "grad_norm": 1.6478341817855835, + "learning_rate": 0.00019728652579697152, + "loss": 1.1936, + "step": 2851 + }, + { + "epoch": 0.10213619352874818, + "grad_norm": 3.084569215774536, + "learning_rate": 0.00019728384145102564, + "loss": 1.3032, + "step": 2852 + }, + { + "epoch": 0.10217200565831647, + "grad_norm": 1.5524585247039795, + "learning_rate": 0.00019728115579625117, + "loss": 1.1013, + "step": 2853 + }, + { + "epoch": 0.10220781778788475, + "grad_norm": 1.7695257663726807, + "learning_rate": 0.00019727846883268427, + "loss": 1.3988, + "step": 2854 + }, + { + "epoch": 0.10224362991745305, + "grad_norm": 1.8248103857040405, + "learning_rate": 0.00019727578056036101, + "loss": 1.2365, + "step": 2855 + }, + { + "epoch": 0.10227944204702133, + "grad_norm": 1.6586451530456543, + "learning_rate": 0.0001972730909793176, + "loss": 1.1591, + "step": 2856 + }, + { + "epoch": 0.10231525417658961, + "grad_norm": 1.6264903545379639, + "learning_rate": 0.00019727040008959024, + "loss": 1.2197, + "step": 2857 + }, + { + "epoch": 0.1023510663061579, + "grad_norm": 1.5559415817260742, + "learning_rate": 0.00019726770789121512, + "loss": 1.2361, + "step": 2858 + }, + { + "epoch": 0.10238687843572618, + "grad_norm": 1.3919328451156616, + "learning_rate": 0.00019726501438422842, + "loss": 1.2648, + "step": 2859 + }, + { + "epoch": 0.10242269056529446, + "grad_norm": 1.8672287464141846, + "learning_rate": 0.00019726231956866645, + "loss": 1.1205, + "step": 2860 + }, + { + "epoch": 0.10245850269486274, + "grad_norm": 2.054304838180542, + "learning_rate": 0.00019725962344456543, + "loss": 1.221, + "step": 2861 + }, + { + "epoch": 0.10249431482443104, + "grad_norm": 2.0919957160949707, + "learning_rate": 0.00019725692601196162, + "loss": 1.2964, + "step": 2862 + }, + { + "epoch": 0.10253012695399932, + "grad_norm": 2.7299301624298096, + "learning_rate": 0.00019725422727089132, + "loss": 1.1746, + "step": 2863 + }, + { + "epoch": 0.1025659390835676, + "grad_norm": 1.9412825107574463, + "learning_rate": 0.00019725152722139085, + "loss": 1.3378, + "step": 2864 + }, + { + "epoch": 0.10260175121313589, + "grad_norm": 1.8513420820236206, + "learning_rate": 0.00019724882586349653, + "loss": 1.3273, + "step": 2865 + }, + { + "epoch": 0.10263756334270417, + "grad_norm": 2.151501178741455, + "learning_rate": 0.00019724612319724468, + "loss": 1.1883, + "step": 2866 + }, + { + "epoch": 0.10267337547227245, + "grad_norm": 1.4179731607437134, + "learning_rate": 0.0001972434192226717, + "loss": 1.2133, + "step": 2867 + }, + { + "epoch": 0.10270918760184074, + "grad_norm": 1.8383610248565674, + "learning_rate": 0.00019724071393981393, + "loss": 1.0081, + "step": 2868 + }, + { + "epoch": 0.10274499973140903, + "grad_norm": 2.4638688564300537, + "learning_rate": 0.00019723800734870782, + "loss": 1.2557, + "step": 2869 + }, + { + "epoch": 0.10278081186097732, + "grad_norm": 1.5408779382705688, + "learning_rate": 0.00019723529944938974, + "loss": 1.3425, + "step": 2870 + }, + { + "epoch": 0.1028166239905456, + "grad_norm": 1.294501781463623, + "learning_rate": 0.00019723259024189612, + "loss": 1.1561, + "step": 2871 + }, + { + "epoch": 0.10285243612011388, + "grad_norm": 1.2273716926574707, + "learning_rate": 0.0001972298797262634, + "loss": 1.2079, + "step": 2872 + }, + { + "epoch": 0.10288824824968217, + "grad_norm": 1.8919804096221924, + "learning_rate": 0.0001972271679025281, + "loss": 1.3604, + "step": 2873 + }, + { + "epoch": 0.10292406037925045, + "grad_norm": 1.4485993385314941, + "learning_rate": 0.00019722445477072666, + "loss": 1.1569, + "step": 2874 + }, + { + "epoch": 0.10295987250881873, + "grad_norm": 1.389143705368042, + "learning_rate": 0.00019722174033089557, + "loss": 1.2678, + "step": 2875 + }, + { + "epoch": 0.10299568463838703, + "grad_norm": 1.5176033973693848, + "learning_rate": 0.0001972190245830714, + "loss": 1.1675, + "step": 2876 + }, + { + "epoch": 0.10303149676795531, + "grad_norm": 1.5315461158752441, + "learning_rate": 0.00019721630752729064, + "loss": 1.2772, + "step": 2877 + }, + { + "epoch": 0.1030673088975236, + "grad_norm": 1.650718331336975, + "learning_rate": 0.00019721358916358986, + "loss": 1.1519, + "step": 2878 + }, + { + "epoch": 0.10310312102709188, + "grad_norm": 1.401171088218689, + "learning_rate": 0.00019721086949200565, + "loss": 1.2022, + "step": 2879 + }, + { + "epoch": 0.10313893315666016, + "grad_norm": 1.3243207931518555, + "learning_rate": 0.00019720814851257457, + "loss": 1.1614, + "step": 2880 + }, + { + "epoch": 0.10317474528622844, + "grad_norm": 1.4702694416046143, + "learning_rate": 0.00019720542622533323, + "loss": 1.3095, + "step": 2881 + }, + { + "epoch": 0.10321055741579672, + "grad_norm": 1.422116756439209, + "learning_rate": 0.0001972027026303183, + "loss": 1.1001, + "step": 2882 + }, + { + "epoch": 0.10324636954536502, + "grad_norm": 1.365236759185791, + "learning_rate": 0.00019719997772756637, + "loss": 1.1538, + "step": 2883 + }, + { + "epoch": 0.1032821816749333, + "grad_norm": 1.555049180984497, + "learning_rate": 0.00019719725151711413, + "loss": 1.5723, + "step": 2884 + }, + { + "epoch": 0.10331799380450159, + "grad_norm": 1.218267798423767, + "learning_rate": 0.00019719452399899823, + "loss": 1.0974, + "step": 2885 + }, + { + "epoch": 0.10335380593406987, + "grad_norm": 1.7734495401382446, + "learning_rate": 0.00019719179517325538, + "loss": 1.0935, + "step": 2886 + }, + { + "epoch": 0.10338961806363815, + "grad_norm": 2.6993298530578613, + "learning_rate": 0.00019718906503992233, + "loss": 1.4088, + "step": 2887 + }, + { + "epoch": 0.10342543019320644, + "grad_norm": 2.249967098236084, + "learning_rate": 0.00019718633359903573, + "loss": 1.0693, + "step": 2888 + }, + { + "epoch": 0.10346124232277472, + "grad_norm": 2.6331679821014404, + "learning_rate": 0.00019718360085063238, + "loss": 1.2431, + "step": 2889 + }, + { + "epoch": 0.10349705445234302, + "grad_norm": 1.3112658262252808, + "learning_rate": 0.00019718086679474905, + "loss": 1.1188, + "step": 2890 + }, + { + "epoch": 0.1035328665819113, + "grad_norm": 1.6268188953399658, + "learning_rate": 0.0001971781314314225, + "loss": 1.118, + "step": 2891 + }, + { + "epoch": 0.10356867871147958, + "grad_norm": 2.1066718101501465, + "learning_rate": 0.00019717539476068959, + "loss": 1.2642, + "step": 2892 + }, + { + "epoch": 0.10360449084104786, + "grad_norm": 1.2997286319732666, + "learning_rate": 0.00019717265678258702, + "loss": 1.2351, + "step": 2893 + }, + { + "epoch": 0.10364030297061615, + "grad_norm": 2.1634719371795654, + "learning_rate": 0.00019716991749715174, + "loss": 1.2442, + "step": 2894 + }, + { + "epoch": 0.10367611510018443, + "grad_norm": 1.3998935222625732, + "learning_rate": 0.00019716717690442055, + "loss": 1.2415, + "step": 2895 + }, + { + "epoch": 0.10371192722975271, + "grad_norm": 1.2987253665924072, + "learning_rate": 0.00019716443500443034, + "loss": 1.0673, + "step": 2896 + }, + { + "epoch": 0.103747739359321, + "grad_norm": 1.5240329504013062, + "learning_rate": 0.00019716169179721799, + "loss": 1.3045, + "step": 2897 + }, + { + "epoch": 0.10378355148888929, + "grad_norm": 1.45256769657135, + "learning_rate": 0.00019715894728282037, + "loss": 1.2156, + "step": 2898 + }, + { + "epoch": 0.10381936361845757, + "grad_norm": 1.5277578830718994, + "learning_rate": 0.00019715620146127448, + "loss": 1.2158, + "step": 2899 + }, + { + "epoch": 0.10385517574802586, + "grad_norm": 1.8070653676986694, + "learning_rate": 0.0001971534543326172, + "loss": 1.2826, + "step": 2900 + }, + { + "epoch": 0.10389098787759414, + "grad_norm": 1.3623030185699463, + "learning_rate": 0.0001971507058968855, + "loss": 1.3254, + "step": 2901 + }, + { + "epoch": 0.10392680000716242, + "grad_norm": 1.886695384979248, + "learning_rate": 0.00019714795615411644, + "loss": 1.1528, + "step": 2902 + }, + { + "epoch": 0.1039626121367307, + "grad_norm": 1.3821895122528076, + "learning_rate": 0.00019714520510434686, + "loss": 1.1643, + "step": 2903 + }, + { + "epoch": 0.10399842426629899, + "grad_norm": 1.7165051698684692, + "learning_rate": 0.0001971424527476139, + "loss": 1.1094, + "step": 2904 + }, + { + "epoch": 0.10403423639586729, + "grad_norm": 2.0601422786712646, + "learning_rate": 0.0001971396990839545, + "loss": 1.2257, + "step": 2905 + }, + { + "epoch": 0.10407004852543557, + "grad_norm": 1.5391780138015747, + "learning_rate": 0.0001971369441134058, + "loss": 1.1226, + "step": 2906 + }, + { + "epoch": 0.10410586065500385, + "grad_norm": 1.7990038394927979, + "learning_rate": 0.00019713418783600477, + "loss": 1.1559, + "step": 2907 + }, + { + "epoch": 0.10414167278457213, + "grad_norm": 1.7674298286437988, + "learning_rate": 0.00019713143025178856, + "loss": 1.2958, + "step": 2908 + }, + { + "epoch": 0.10417748491414042, + "grad_norm": 1.7681318521499634, + "learning_rate": 0.00019712867136079427, + "loss": 1.205, + "step": 2909 + }, + { + "epoch": 0.1042132970437087, + "grad_norm": 1.7704535722732544, + "learning_rate": 0.00019712591116305896, + "loss": 1.3479, + "step": 2910 + }, + { + "epoch": 0.10424910917327698, + "grad_norm": 1.6087754964828491, + "learning_rate": 0.0001971231496586198, + "loss": 1.2297, + "step": 2911 + }, + { + "epoch": 0.10428492130284528, + "grad_norm": 2.1142168045043945, + "learning_rate": 0.00019712038684751394, + "loss": 1.2021, + "step": 2912 + }, + { + "epoch": 0.10432073343241356, + "grad_norm": 1.3196550607681274, + "learning_rate": 0.0001971176227297786, + "loss": 1.0718, + "step": 2913 + }, + { + "epoch": 0.10435654556198184, + "grad_norm": 1.554134726524353, + "learning_rate": 0.00019711485730545086, + "loss": 1.0556, + "step": 2914 + }, + { + "epoch": 0.10439235769155013, + "grad_norm": 2.4596567153930664, + "learning_rate": 0.000197112090574568, + "loss": 1.1844, + "step": 2915 + }, + { + "epoch": 0.10442816982111841, + "grad_norm": 1.4490718841552734, + "learning_rate": 0.00019710932253716722, + "loss": 0.9362, + "step": 2916 + }, + { + "epoch": 0.1044639819506867, + "grad_norm": 1.60019850730896, + "learning_rate": 0.00019710655319328578, + "loss": 1.3177, + "step": 2917 + }, + { + "epoch": 0.10449979408025498, + "grad_norm": 1.406620740890503, + "learning_rate": 0.00019710378254296092, + "loss": 1.4073, + "step": 2918 + }, + { + "epoch": 0.10453560620982327, + "grad_norm": 1.5672017335891724, + "learning_rate": 0.00019710101058622993, + "loss": 1.0107, + "step": 2919 + }, + { + "epoch": 0.10457141833939156, + "grad_norm": 1.5221198797225952, + "learning_rate": 0.00019709823732313008, + "loss": 1.2694, + "step": 2920 + }, + { + "epoch": 0.10460723046895984, + "grad_norm": 2.7940123081207275, + "learning_rate": 0.0001970954627536987, + "loss": 1.4576, + "step": 2921 + }, + { + "epoch": 0.10464304259852812, + "grad_norm": 1.7199044227600098, + "learning_rate": 0.00019709268687797312, + "loss": 1.1099, + "step": 2922 + }, + { + "epoch": 0.1046788547280964, + "grad_norm": 1.4984066486358643, + "learning_rate": 0.00019708990969599067, + "loss": 1.0562, + "step": 2923 + }, + { + "epoch": 0.10471466685766469, + "grad_norm": 1.5207016468048096, + "learning_rate": 0.00019708713120778873, + "loss": 1.3382, + "step": 2924 + }, + { + "epoch": 0.10475047898723297, + "grad_norm": 1.4612205028533936, + "learning_rate": 0.00019708435141340465, + "loss": 1.1134, + "step": 2925 + }, + { + "epoch": 0.10478629111680127, + "grad_norm": 1.3351777791976929, + "learning_rate": 0.00019708157031287588, + "loss": 1.3582, + "step": 2926 + }, + { + "epoch": 0.10482210324636955, + "grad_norm": 2.0488803386688232, + "learning_rate": 0.0001970787879062398, + "loss": 1.0667, + "step": 2927 + }, + { + "epoch": 0.10485791537593783, + "grad_norm": 1.454836130142212, + "learning_rate": 0.00019707600419353383, + "loss": 1.2546, + "step": 2928 + }, + { + "epoch": 0.10489372750550612, + "grad_norm": 1.9217573404312134, + "learning_rate": 0.00019707321917479547, + "loss": 1.1711, + "step": 2929 + }, + { + "epoch": 0.1049295396350744, + "grad_norm": 1.7918416261672974, + "learning_rate": 0.00019707043285006214, + "loss": 1.1954, + "step": 2930 + }, + { + "epoch": 0.10496535176464268, + "grad_norm": 1.809128761291504, + "learning_rate": 0.00019706764521937138, + "loss": 1.2274, + "step": 2931 + }, + { + "epoch": 0.10500116389421096, + "grad_norm": 1.8199971914291382, + "learning_rate": 0.00019706485628276062, + "loss": 1.206, + "step": 2932 + }, + { + "epoch": 0.10503697602377926, + "grad_norm": 2.2863569259643555, + "learning_rate": 0.00019706206604026746, + "loss": 1.1672, + "step": 2933 + }, + { + "epoch": 0.10507278815334754, + "grad_norm": 1.1524385213851929, + "learning_rate": 0.00019705927449192937, + "loss": 1.1369, + "step": 2934 + }, + { + "epoch": 0.10510860028291583, + "grad_norm": 1.5789809226989746, + "learning_rate": 0.00019705648163778397, + "loss": 1.1472, + "step": 2935 + }, + { + "epoch": 0.10514441241248411, + "grad_norm": 1.8875539302825928, + "learning_rate": 0.00019705368747786878, + "loss": 1.4614, + "step": 2936 + }, + { + "epoch": 0.10518022454205239, + "grad_norm": 1.754919171333313, + "learning_rate": 0.00019705089201222143, + "loss": 0.9844, + "step": 2937 + }, + { + "epoch": 0.10521603667162067, + "grad_norm": 1.8902390003204346, + "learning_rate": 0.00019704809524087952, + "loss": 1.371, + "step": 2938 + }, + { + "epoch": 0.10525184880118896, + "grad_norm": 1.4184372425079346, + "learning_rate": 0.00019704529716388068, + "loss": 1.1739, + "step": 2939 + }, + { + "epoch": 0.10528766093075725, + "grad_norm": 1.6919022798538208, + "learning_rate": 0.00019704249778126253, + "loss": 1.2468, + "step": 2940 + }, + { + "epoch": 0.10532347306032554, + "grad_norm": 1.9043033123016357, + "learning_rate": 0.00019703969709306273, + "loss": 1.2449, + "step": 2941 + }, + { + "epoch": 0.10535928518989382, + "grad_norm": 1.477023720741272, + "learning_rate": 0.000197036895099319, + "loss": 1.4135, + "step": 2942 + }, + { + "epoch": 0.1053950973194621, + "grad_norm": 1.4040751457214355, + "learning_rate": 0.000197034091800069, + "loss": 1.528, + "step": 2943 + }, + { + "epoch": 0.10543090944903039, + "grad_norm": 1.605771780014038, + "learning_rate": 0.00019703128719535047, + "loss": 1.2581, + "step": 2944 + }, + { + "epoch": 0.10546672157859867, + "grad_norm": 1.5101817846298218, + "learning_rate": 0.00019702848128520112, + "loss": 1.1218, + "step": 2945 + }, + { + "epoch": 0.10550253370816695, + "grad_norm": 1.9862970113754272, + "learning_rate": 0.00019702567406965874, + "loss": 1.2702, + "step": 2946 + }, + { + "epoch": 0.10553834583773523, + "grad_norm": 2.014716386795044, + "learning_rate": 0.00019702286554876107, + "loss": 1.3278, + "step": 2947 + }, + { + "epoch": 0.10557415796730353, + "grad_norm": 1.7099248170852661, + "learning_rate": 0.00019702005572254586, + "loss": 1.2185, + "step": 2948 + }, + { + "epoch": 0.10560997009687181, + "grad_norm": 2.1188113689422607, + "learning_rate": 0.00019701724459105096, + "loss": 1.2237, + "step": 2949 + }, + { + "epoch": 0.1056457822264401, + "grad_norm": 1.7354111671447754, + "learning_rate": 0.0001970144321543142, + "loss": 1.2883, + "step": 2950 + }, + { + "epoch": 0.10568159435600838, + "grad_norm": 1.9206128120422363, + "learning_rate": 0.00019701161841237337, + "loss": 1.1903, + "step": 2951 + }, + { + "epoch": 0.10571740648557666, + "grad_norm": 2.0514702796936035, + "learning_rate": 0.00019700880336526635, + "loss": 1.4205, + "step": 2952 + }, + { + "epoch": 0.10575321861514494, + "grad_norm": 1.4189434051513672, + "learning_rate": 0.000197005987013031, + "loss": 1.386, + "step": 2953 + }, + { + "epoch": 0.10578903074471323, + "grad_norm": 1.517785906791687, + "learning_rate": 0.00019700316935570525, + "loss": 1.295, + "step": 2954 + }, + { + "epoch": 0.10582484287428152, + "grad_norm": 1.5241918563842773, + "learning_rate": 0.00019700035039332697, + "loss": 1.2021, + "step": 2955 + }, + { + "epoch": 0.10586065500384981, + "grad_norm": 1.9306671619415283, + "learning_rate": 0.00019699753012593412, + "loss": 1.2411, + "step": 2956 + }, + { + "epoch": 0.10589646713341809, + "grad_norm": 1.4975332021713257, + "learning_rate": 0.0001969947085535646, + "loss": 0.8931, + "step": 2957 + }, + { + "epoch": 0.10593227926298637, + "grad_norm": 2.3054635524749756, + "learning_rate": 0.00019699188567625639, + "loss": 1.2953, + "step": 2958 + }, + { + "epoch": 0.10596809139255466, + "grad_norm": 1.3508740663528442, + "learning_rate": 0.00019698906149404746, + "loss": 1.2511, + "step": 2959 + }, + { + "epoch": 0.10600390352212294, + "grad_norm": 1.4820530414581299, + "learning_rate": 0.00019698623600697583, + "loss": 1.2013, + "step": 2960 + }, + { + "epoch": 0.10603971565169122, + "grad_norm": 1.623093843460083, + "learning_rate": 0.0001969834092150795, + "loss": 1.4329, + "step": 2961 + }, + { + "epoch": 0.10607552778125952, + "grad_norm": 1.2928494215011597, + "learning_rate": 0.0001969805811183965, + "loss": 1.2772, + "step": 2962 + }, + { + "epoch": 0.1061113399108278, + "grad_norm": 1.6468377113342285, + "learning_rate": 0.00019697775171696486, + "loss": 1.2309, + "step": 2963 + }, + { + "epoch": 0.10614715204039608, + "grad_norm": 2.4976494312286377, + "learning_rate": 0.00019697492101082266, + "loss": 1.2883, + "step": 2964 + }, + { + "epoch": 0.10618296416996437, + "grad_norm": 1.529433012008667, + "learning_rate": 0.000196972089000008, + "loss": 1.1443, + "step": 2965 + }, + { + "epoch": 0.10621877629953265, + "grad_norm": 1.4264572858810425, + "learning_rate": 0.00019696925568455894, + "loss": 1.1765, + "step": 2966 + }, + { + "epoch": 0.10625458842910093, + "grad_norm": 1.5921289920806885, + "learning_rate": 0.00019696642106451368, + "loss": 1.3436, + "step": 2967 + }, + { + "epoch": 0.10629040055866922, + "grad_norm": 1.7052264213562012, + "learning_rate": 0.00019696358513991027, + "loss": 1.3809, + "step": 2968 + }, + { + "epoch": 0.10632621268823751, + "grad_norm": 1.6285935640335083, + "learning_rate": 0.0001969607479107869, + "loss": 1.2425, + "step": 2969 + }, + { + "epoch": 0.1063620248178058, + "grad_norm": 1.891503095626831, + "learning_rate": 0.00019695790937718176, + "loss": 1.3572, + "step": 2970 + }, + { + "epoch": 0.10639783694737408, + "grad_norm": 2.336024045944214, + "learning_rate": 0.00019695506953913298, + "loss": 1.5907, + "step": 2971 + }, + { + "epoch": 0.10643364907694236, + "grad_norm": 2.2468955516815186, + "learning_rate": 0.0001969522283966788, + "loss": 1.3982, + "step": 2972 + }, + { + "epoch": 0.10646946120651064, + "grad_norm": 2.137949228286743, + "learning_rate": 0.00019694938594985747, + "loss": 1.1292, + "step": 2973 + }, + { + "epoch": 0.10650527333607893, + "grad_norm": 1.6585091352462769, + "learning_rate": 0.00019694654219870722, + "loss": 1.2834, + "step": 2974 + }, + { + "epoch": 0.10654108546564721, + "grad_norm": 1.5191633701324463, + "learning_rate": 0.00019694369714326625, + "loss": 1.2328, + "step": 2975 + }, + { + "epoch": 0.1065768975952155, + "grad_norm": 2.144946813583374, + "learning_rate": 0.00019694085078357293, + "loss": 1.4051, + "step": 2976 + }, + { + "epoch": 0.10661270972478379, + "grad_norm": 1.3682231903076172, + "learning_rate": 0.00019693800311966549, + "loss": 1.2365, + "step": 2977 + }, + { + "epoch": 0.10664852185435207, + "grad_norm": 1.8461191654205322, + "learning_rate": 0.00019693515415158223, + "loss": 1.2553, + "step": 2978 + }, + { + "epoch": 0.10668433398392035, + "grad_norm": 2.0821239948272705, + "learning_rate": 0.00019693230387936154, + "loss": 1.3508, + "step": 2979 + }, + { + "epoch": 0.10672014611348864, + "grad_norm": 1.5840544700622559, + "learning_rate": 0.00019692945230304174, + "loss": 1.3536, + "step": 2980 + }, + { + "epoch": 0.10675595824305692, + "grad_norm": 2.537936210632324, + "learning_rate": 0.00019692659942266118, + "loss": 1.6208, + "step": 2981 + }, + { + "epoch": 0.1067917703726252, + "grad_norm": 2.223097562789917, + "learning_rate": 0.00019692374523825823, + "loss": 1.2775, + "step": 2982 + }, + { + "epoch": 0.1068275825021935, + "grad_norm": 1.739273190498352, + "learning_rate": 0.00019692088974987133, + "loss": 1.3093, + "step": 2983 + }, + { + "epoch": 0.10686339463176178, + "grad_norm": 1.5586793422698975, + "learning_rate": 0.0001969180329575389, + "loss": 1.2186, + "step": 2984 + }, + { + "epoch": 0.10689920676133007, + "grad_norm": 2.3937196731567383, + "learning_rate": 0.0001969151748612993, + "loss": 1.2744, + "step": 2985 + }, + { + "epoch": 0.10693501889089835, + "grad_norm": 1.3938322067260742, + "learning_rate": 0.00019691231546119107, + "loss": 1.0562, + "step": 2986 + }, + { + "epoch": 0.10697083102046663, + "grad_norm": 2.260335922241211, + "learning_rate": 0.00019690945475725266, + "loss": 1.1491, + "step": 2987 + }, + { + "epoch": 0.10700664315003491, + "grad_norm": 1.7987792491912842, + "learning_rate": 0.0001969065927495225, + "loss": 1.2606, + "step": 2988 + }, + { + "epoch": 0.1070424552796032, + "grad_norm": 1.560793161392212, + "learning_rate": 0.00019690372943803914, + "loss": 1.3281, + "step": 2989 + }, + { + "epoch": 0.1070782674091715, + "grad_norm": 2.322397232055664, + "learning_rate": 0.00019690086482284112, + "loss": 1.2372, + "step": 2990 + }, + { + "epoch": 0.10711407953873978, + "grad_norm": 1.4384677410125732, + "learning_rate": 0.00019689799890396694, + "loss": 1.1588, + "step": 2991 + }, + { + "epoch": 0.10714989166830806, + "grad_norm": 2.32120943069458, + "learning_rate": 0.0001968951316814552, + "loss": 1.2706, + "step": 2992 + }, + { + "epoch": 0.10718570379787634, + "grad_norm": 2.0052738189697266, + "learning_rate": 0.0001968922631553444, + "loss": 1.2177, + "step": 2993 + }, + { + "epoch": 0.10722151592744462, + "grad_norm": 1.827478051185608, + "learning_rate": 0.00019688939332567325, + "loss": 1.2246, + "step": 2994 + }, + { + "epoch": 0.10725732805701291, + "grad_norm": 1.441378116607666, + "learning_rate": 0.00019688652219248021, + "loss": 0.9822, + "step": 2995 + }, + { + "epoch": 0.10729314018658119, + "grad_norm": 1.2350471019744873, + "learning_rate": 0.00019688364975580406, + "loss": 1.1546, + "step": 2996 + }, + { + "epoch": 0.10732895231614947, + "grad_norm": 1.4823802709579468, + "learning_rate": 0.00019688077601568332, + "loss": 1.1138, + "step": 2997 + }, + { + "epoch": 0.10736476444571777, + "grad_norm": 1.7603669166564941, + "learning_rate": 0.00019687790097215675, + "loss": 1.2636, + "step": 2998 + }, + { + "epoch": 0.10740057657528605, + "grad_norm": 1.6044714450836182, + "learning_rate": 0.00019687502462526296, + "loss": 1.3196, + "step": 2999 + }, + { + "epoch": 0.10743638870485434, + "grad_norm": 1.8668309450149536, + "learning_rate": 0.00019687214697504068, + "loss": 1.2658, + "step": 3000 + }, + { + "epoch": 0.10747220083442262, + "grad_norm": 1.6136945486068726, + "learning_rate": 0.00019686926802152862, + "loss": 1.2255, + "step": 3001 + }, + { + "epoch": 0.1075080129639909, + "grad_norm": 1.616620421409607, + "learning_rate": 0.0001968663877647655, + "loss": 1.3056, + "step": 3002 + }, + { + "epoch": 0.10754382509355918, + "grad_norm": 1.735804796218872, + "learning_rate": 0.0001968635062047901, + "loss": 1.375, + "step": 3003 + }, + { + "epoch": 0.10757963722312747, + "grad_norm": 1.615212082862854, + "learning_rate": 0.00019686062334164114, + "loss": 1.1485, + "step": 3004 + }, + { + "epoch": 0.10761544935269576, + "grad_norm": 1.9854201078414917, + "learning_rate": 0.00019685773917535747, + "loss": 1.1495, + "step": 3005 + }, + { + "epoch": 0.10765126148226405, + "grad_norm": 1.466868281364441, + "learning_rate": 0.00019685485370597781, + "loss": 1.2744, + "step": 3006 + }, + { + "epoch": 0.10768707361183233, + "grad_norm": 2.15297794342041, + "learning_rate": 0.00019685196693354108, + "loss": 1.3823, + "step": 3007 + }, + { + "epoch": 0.10772288574140061, + "grad_norm": 1.5681333541870117, + "learning_rate": 0.00019684907885808602, + "loss": 1.2936, + "step": 3008 + }, + { + "epoch": 0.1077586978709689, + "grad_norm": 1.681711196899414, + "learning_rate": 0.00019684618947965157, + "loss": 1.1964, + "step": 3009 + }, + { + "epoch": 0.10779451000053718, + "grad_norm": 1.5451533794403076, + "learning_rate": 0.00019684329879827655, + "loss": 1.2738, + "step": 3010 + }, + { + "epoch": 0.10783032213010546, + "grad_norm": 2.106849431991577, + "learning_rate": 0.00019684040681399988, + "loss": 1.3479, + "step": 3011 + }, + { + "epoch": 0.10786613425967376, + "grad_norm": 1.7660176753997803, + "learning_rate": 0.0001968375135268604, + "loss": 1.3691, + "step": 3012 + }, + { + "epoch": 0.10790194638924204, + "grad_norm": 2.3100552558898926, + "learning_rate": 0.00019683461893689713, + "loss": 1.0709, + "step": 3013 + }, + { + "epoch": 0.10793775851881032, + "grad_norm": 1.5367406606674194, + "learning_rate": 0.00019683172304414895, + "loss": 1.401, + "step": 3014 + }, + { + "epoch": 0.1079735706483786, + "grad_norm": 1.459265947341919, + "learning_rate": 0.00019682882584865486, + "loss": 1.1972, + "step": 3015 + }, + { + "epoch": 0.10800938277794689, + "grad_norm": 1.5106781721115112, + "learning_rate": 0.0001968259273504538, + "loss": 0.9435, + "step": 3016 + }, + { + "epoch": 0.10804519490751517, + "grad_norm": 1.8236584663391113, + "learning_rate": 0.0001968230275495848, + "loss": 1.1157, + "step": 3017 + }, + { + "epoch": 0.10808100703708345, + "grad_norm": 1.9020802974700928, + "learning_rate": 0.00019682012644608684, + "loss": 1.4409, + "step": 3018 + }, + { + "epoch": 0.10811681916665175, + "grad_norm": 1.539783239364624, + "learning_rate": 0.000196817224039999, + "loss": 1.2199, + "step": 3019 + }, + { + "epoch": 0.10815263129622003, + "grad_norm": 1.5008376836776733, + "learning_rate": 0.00019681432033136025, + "loss": 1.3365, + "step": 3020 + }, + { + "epoch": 0.10818844342578832, + "grad_norm": 1.7610019445419312, + "learning_rate": 0.00019681141532020973, + "loss": 1.2036, + "step": 3021 + }, + { + "epoch": 0.1082242555553566, + "grad_norm": 1.5654178857803345, + "learning_rate": 0.00019680850900658648, + "loss": 1.3435, + "step": 3022 + }, + { + "epoch": 0.10826006768492488, + "grad_norm": 1.9277440309524536, + "learning_rate": 0.00019680560139052962, + "loss": 1.1342, + "step": 3023 + }, + { + "epoch": 0.10829587981449317, + "grad_norm": 1.7767225503921509, + "learning_rate": 0.00019680269247207826, + "loss": 1.2604, + "step": 3024 + }, + { + "epoch": 0.10833169194406145, + "grad_norm": 1.8000718355178833, + "learning_rate": 0.00019679978225127154, + "loss": 1.1581, + "step": 3025 + }, + { + "epoch": 0.10836750407362974, + "grad_norm": 1.954991340637207, + "learning_rate": 0.00019679687072814863, + "loss": 1.3712, + "step": 3026 + }, + { + "epoch": 0.10840331620319803, + "grad_norm": 1.7951723337173462, + "learning_rate": 0.00019679395790274867, + "loss": 1.29, + "step": 3027 + }, + { + "epoch": 0.10843912833276631, + "grad_norm": 1.7887741327285767, + "learning_rate": 0.00019679104377511085, + "loss": 1.2895, + "step": 3028 + }, + { + "epoch": 0.1084749404623346, + "grad_norm": 1.7788869142532349, + "learning_rate": 0.0001967881283452744, + "loss": 1.2326, + "step": 3029 + }, + { + "epoch": 0.10851075259190288, + "grad_norm": 1.5840790271759033, + "learning_rate": 0.00019678521161327854, + "loss": 1.2573, + "step": 3030 + }, + { + "epoch": 0.10854656472147116, + "grad_norm": 1.5849202871322632, + "learning_rate": 0.0001967822935791625, + "loss": 1.3604, + "step": 3031 + }, + { + "epoch": 0.10858237685103944, + "grad_norm": 1.7930307388305664, + "learning_rate": 0.0001967793742429655, + "loss": 1.3976, + "step": 3032 + }, + { + "epoch": 0.10861818898060774, + "grad_norm": 1.7193841934204102, + "learning_rate": 0.00019677645360472693, + "loss": 1.188, + "step": 3033 + }, + { + "epoch": 0.10865400111017602, + "grad_norm": 2.2984635829925537, + "learning_rate": 0.00019677353166448595, + "loss": 1.3255, + "step": 3034 + }, + { + "epoch": 0.1086898132397443, + "grad_norm": 1.65117347240448, + "learning_rate": 0.00019677060842228193, + "loss": 1.1003, + "step": 3035 + }, + { + "epoch": 0.10872562536931259, + "grad_norm": 2.158609390258789, + "learning_rate": 0.00019676768387815423, + "loss": 1.3374, + "step": 3036 + }, + { + "epoch": 0.10876143749888087, + "grad_norm": 1.614426612854004, + "learning_rate": 0.00019676475803214217, + "loss": 1.0233, + "step": 3037 + }, + { + "epoch": 0.10879724962844915, + "grad_norm": 1.7291847467422485, + "learning_rate": 0.0001967618308842851, + "loss": 1.3465, + "step": 3038 + }, + { + "epoch": 0.10883306175801744, + "grad_norm": 2.084979295730591, + "learning_rate": 0.00019675890243462237, + "loss": 1.2799, + "step": 3039 + }, + { + "epoch": 0.10886887388758573, + "grad_norm": 1.1818249225616455, + "learning_rate": 0.00019675597268319344, + "loss": 1.2123, + "step": 3040 + }, + { + "epoch": 0.10890468601715401, + "grad_norm": 1.7394850254058838, + "learning_rate": 0.00019675304163003772, + "loss": 1.3266, + "step": 3041 + }, + { + "epoch": 0.1089404981467223, + "grad_norm": 1.78844153881073, + "learning_rate": 0.00019675010927519462, + "loss": 1.3296, + "step": 3042 + }, + { + "epoch": 0.10897631027629058, + "grad_norm": 1.5235034227371216, + "learning_rate": 0.0001967471756187036, + "loss": 1.205, + "step": 3043 + }, + { + "epoch": 0.10901212240585886, + "grad_norm": 2.3588359355926514, + "learning_rate": 0.0001967442406606041, + "loss": 1.1397, + "step": 3044 + }, + { + "epoch": 0.10904793453542715, + "grad_norm": 1.5638126134872437, + "learning_rate": 0.00019674130440093567, + "loss": 1.2665, + "step": 3045 + }, + { + "epoch": 0.10908374666499543, + "grad_norm": 1.554991364479065, + "learning_rate": 0.00019673836683973777, + "loss": 1.2276, + "step": 3046 + }, + { + "epoch": 0.10911955879456371, + "grad_norm": 2.357395648956299, + "learning_rate": 0.00019673542797704992, + "loss": 1.194, + "step": 3047 + }, + { + "epoch": 0.10915537092413201, + "grad_norm": 1.9651963710784912, + "learning_rate": 0.00019673248781291167, + "loss": 1.375, + "step": 3048 + }, + { + "epoch": 0.10919118305370029, + "grad_norm": 1.5754432678222656, + "learning_rate": 0.00019672954634736257, + "loss": 1.4374, + "step": 3049 + }, + { + "epoch": 0.10922699518326857, + "grad_norm": 2.0374982357025146, + "learning_rate": 0.00019672660358044218, + "loss": 1.1918, + "step": 3050 + }, + { + "epoch": 0.10926280731283686, + "grad_norm": 1.6274160146713257, + "learning_rate": 0.00019672365951219013, + "loss": 1.0361, + "step": 3051 + }, + { + "epoch": 0.10929861944240514, + "grad_norm": 1.4710670709609985, + "learning_rate": 0.00019672071414264598, + "loss": 1.251, + "step": 3052 + }, + { + "epoch": 0.10933443157197342, + "grad_norm": 1.5979846715927124, + "learning_rate": 0.0001967177674718494, + "loss": 1.3173, + "step": 3053 + }, + { + "epoch": 0.1093702437015417, + "grad_norm": 1.5418189764022827, + "learning_rate": 0.00019671481949984002, + "loss": 1.2536, + "step": 3054 + }, + { + "epoch": 0.10940605583111, + "grad_norm": 1.7097506523132324, + "learning_rate": 0.0001967118702266575, + "loss": 1.0802, + "step": 3055 + }, + { + "epoch": 0.10944186796067829, + "grad_norm": 1.6743814945220947, + "learning_rate": 0.0001967089196523415, + "loss": 1.4579, + "step": 3056 + }, + { + "epoch": 0.10947768009024657, + "grad_norm": 1.9694774150848389, + "learning_rate": 0.00019670596777693176, + "loss": 1.1217, + "step": 3057 + }, + { + "epoch": 0.10951349221981485, + "grad_norm": 1.9900200366973877, + "learning_rate": 0.00019670301460046795, + "loss": 1.2414, + "step": 3058 + }, + { + "epoch": 0.10954930434938313, + "grad_norm": 1.5555585622787476, + "learning_rate": 0.0001967000601229898, + "loss": 1.2658, + "step": 3059 + }, + { + "epoch": 0.10958511647895142, + "grad_norm": 2.722710132598877, + "learning_rate": 0.00019669710434453707, + "loss": 1.381, + "step": 3060 + }, + { + "epoch": 0.1096209286085197, + "grad_norm": 1.8094063997268677, + "learning_rate": 0.00019669414726514956, + "loss": 1.403, + "step": 3061 + }, + { + "epoch": 0.109656740738088, + "grad_norm": 1.4547473192214966, + "learning_rate": 0.000196691188884867, + "loss": 1.041, + "step": 3062 + }, + { + "epoch": 0.10969255286765628, + "grad_norm": 1.6490707397460938, + "learning_rate": 0.00019668822920372922, + "loss": 1.3548, + "step": 3063 + }, + { + "epoch": 0.10972836499722456, + "grad_norm": 1.8111627101898193, + "learning_rate": 0.00019668526822177605, + "loss": 1.1622, + "step": 3064 + }, + { + "epoch": 0.10976417712679284, + "grad_norm": 1.8029303550720215, + "learning_rate": 0.00019668230593904734, + "loss": 1.0635, + "step": 3065 + }, + { + "epoch": 0.10979998925636113, + "grad_norm": 1.2528207302093506, + "learning_rate": 0.00019667934235558285, + "loss": 1.0717, + "step": 3066 + }, + { + "epoch": 0.10983580138592941, + "grad_norm": 1.3346952199935913, + "learning_rate": 0.00019667637747142257, + "loss": 1.2732, + "step": 3067 + }, + { + "epoch": 0.10987161351549769, + "grad_norm": 2.185272216796875, + "learning_rate": 0.0001966734112866063, + "loss": 1.1225, + "step": 3068 + }, + { + "epoch": 0.10990742564506599, + "grad_norm": 1.913496494293213, + "learning_rate": 0.00019667044380117398, + "loss": 1.1753, + "step": 3069 + }, + { + "epoch": 0.10994323777463427, + "grad_norm": 1.405144453048706, + "learning_rate": 0.00019666747501516553, + "loss": 1.3429, + "step": 3070 + }, + { + "epoch": 0.10997904990420256, + "grad_norm": 1.2577519416809082, + "learning_rate": 0.00019666450492862093, + "loss": 0.9899, + "step": 3071 + }, + { + "epoch": 0.11001486203377084, + "grad_norm": 1.1936372518539429, + "learning_rate": 0.0001966615335415801, + "loss": 1.1925, + "step": 3072 + }, + { + "epoch": 0.11005067416333912, + "grad_norm": 1.7411127090454102, + "learning_rate": 0.000196658560854083, + "loss": 1.3385, + "step": 3073 + }, + { + "epoch": 0.1100864862929074, + "grad_norm": 1.3270512819290161, + "learning_rate": 0.00019665558686616965, + "loss": 1.4298, + "step": 3074 + }, + { + "epoch": 0.11012229842247569, + "grad_norm": 1.7618567943572998, + "learning_rate": 0.00019665261157788004, + "loss": 1.4188, + "step": 3075 + }, + { + "epoch": 0.11015811055204398, + "grad_norm": 1.5022156238555908, + "learning_rate": 0.00019664963498925423, + "loss": 1.2819, + "step": 3076 + }, + { + "epoch": 0.11019392268161227, + "grad_norm": 2.3539414405822754, + "learning_rate": 0.00019664665710033226, + "loss": 1.2872, + "step": 3077 + }, + { + "epoch": 0.11022973481118055, + "grad_norm": 1.848494291305542, + "learning_rate": 0.0001966436779111542, + "loss": 1.3518, + "step": 3078 + }, + { + "epoch": 0.11026554694074883, + "grad_norm": 1.5626026391983032, + "learning_rate": 0.00019664069742176006, + "loss": 1.2669, + "step": 3079 + }, + { + "epoch": 0.11030135907031711, + "grad_norm": 1.6697626113891602, + "learning_rate": 0.00019663771563219006, + "loss": 1.2013, + "step": 3080 + }, + { + "epoch": 0.1103371711998854, + "grad_norm": 1.5801141262054443, + "learning_rate": 0.00019663473254248417, + "loss": 1.3364, + "step": 3081 + }, + { + "epoch": 0.11037298332945368, + "grad_norm": 2.0585615634918213, + "learning_rate": 0.00019663174815268266, + "loss": 1.2447, + "step": 3082 + }, + { + "epoch": 0.11040879545902198, + "grad_norm": 2.0605616569519043, + "learning_rate": 0.0001966287624628256, + "loss": 1.3266, + "step": 3083 + }, + { + "epoch": 0.11044460758859026, + "grad_norm": 2.2343435287475586, + "learning_rate": 0.0001966257754729532, + "loss": 1.4348, + "step": 3084 + }, + { + "epoch": 0.11048041971815854, + "grad_norm": 1.467976689338684, + "learning_rate": 0.00019662278718310562, + "loss": 1.238, + "step": 3085 + }, + { + "epoch": 0.11051623184772683, + "grad_norm": 1.943094253540039, + "learning_rate": 0.0001966197975933231, + "loss": 1.1511, + "step": 3086 + }, + { + "epoch": 0.11055204397729511, + "grad_norm": 1.813822865486145, + "learning_rate": 0.0001966168067036458, + "loss": 1.1176, + "step": 3087 + }, + { + "epoch": 0.11058785610686339, + "grad_norm": 1.64815354347229, + "learning_rate": 0.000196613814514114, + "loss": 1.2398, + "step": 3088 + }, + { + "epoch": 0.11062366823643167, + "grad_norm": 1.3253110647201538, + "learning_rate": 0.00019661082102476795, + "loss": 1.2221, + "step": 3089 + }, + { + "epoch": 0.11065948036599997, + "grad_norm": 1.6050453186035156, + "learning_rate": 0.00019660782623564792, + "loss": 1.2314, + "step": 3090 + }, + { + "epoch": 0.11069529249556825, + "grad_norm": 1.8660022020339966, + "learning_rate": 0.0001966048301467942, + "loss": 1.1166, + "step": 3091 + }, + { + "epoch": 0.11073110462513654, + "grad_norm": 1.2904887199401855, + "learning_rate": 0.0001966018327582471, + "loss": 1.1979, + "step": 3092 + }, + { + "epoch": 0.11076691675470482, + "grad_norm": 1.3506687879562378, + "learning_rate": 0.00019659883407004697, + "loss": 1.2625, + "step": 3093 + }, + { + "epoch": 0.1108027288842731, + "grad_norm": 1.6077914237976074, + "learning_rate": 0.00019659583408223412, + "loss": 1.3037, + "step": 3094 + }, + { + "epoch": 0.11083854101384139, + "grad_norm": 1.4831385612487793, + "learning_rate": 0.00019659283279484891, + "loss": 1.2663, + "step": 3095 + }, + { + "epoch": 0.11087435314340967, + "grad_norm": 1.4315882921218872, + "learning_rate": 0.00019658983020793175, + "loss": 1.0592, + "step": 3096 + }, + { + "epoch": 0.11091016527297795, + "grad_norm": 1.9085558652877808, + "learning_rate": 0.000196586826321523, + "loss": 1.2692, + "step": 3097 + }, + { + "epoch": 0.11094597740254625, + "grad_norm": 1.5069574117660522, + "learning_rate": 0.0001965838211356631, + "loss": 1.3433, + "step": 3098 + }, + { + "epoch": 0.11098178953211453, + "grad_norm": 1.8815114498138428, + "learning_rate": 0.00019658081465039246, + "loss": 1.2345, + "step": 3099 + }, + { + "epoch": 0.11101760166168281, + "grad_norm": 1.4917411804199219, + "learning_rate": 0.00019657780686575157, + "loss": 1.2225, + "step": 3100 + }, + { + "epoch": 0.1110534137912511, + "grad_norm": 1.1930116415023804, + "learning_rate": 0.00019657479778178083, + "loss": 1.2475, + "step": 3101 + }, + { + "epoch": 0.11108922592081938, + "grad_norm": 1.6929601430892944, + "learning_rate": 0.00019657178739852075, + "loss": 1.1706, + "step": 3102 + }, + { + "epoch": 0.11112503805038766, + "grad_norm": 1.6179395914077759, + "learning_rate": 0.00019656877571601187, + "loss": 1.3648, + "step": 3103 + }, + { + "epoch": 0.11116085017995594, + "grad_norm": 1.511979103088379, + "learning_rate": 0.00019656576273429467, + "loss": 1.4164, + "step": 3104 + }, + { + "epoch": 0.11119666230952424, + "grad_norm": 1.3832341432571411, + "learning_rate": 0.0001965627484534097, + "loss": 1.1823, + "step": 3105 + }, + { + "epoch": 0.11123247443909252, + "grad_norm": 1.8210198879241943, + "learning_rate": 0.0001965597328733975, + "loss": 1.3232, + "step": 3106 + }, + { + "epoch": 0.1112682865686608, + "grad_norm": 1.1551519632339478, + "learning_rate": 0.00019655671599429865, + "loss": 1.2209, + "step": 3107 + }, + { + "epoch": 0.11130409869822909, + "grad_norm": 1.789966344833374, + "learning_rate": 0.0001965536978161537, + "loss": 1.3125, + "step": 3108 + }, + { + "epoch": 0.11133991082779737, + "grad_norm": 2.0829477310180664, + "learning_rate": 0.00019655067833900333, + "loss": 1.2224, + "step": 3109 + }, + { + "epoch": 0.11137572295736566, + "grad_norm": 1.5187098979949951, + "learning_rate": 0.00019654765756288813, + "loss": 1.4426, + "step": 3110 + }, + { + "epoch": 0.11141153508693394, + "grad_norm": 2.294898509979248, + "learning_rate": 0.00019654463548784873, + "loss": 1.3307, + "step": 3111 + }, + { + "epoch": 0.11144734721650223, + "grad_norm": 1.781096339225769, + "learning_rate": 0.00019654161211392576, + "loss": 1.2171, + "step": 3112 + }, + { + "epoch": 0.11148315934607052, + "grad_norm": 1.8161768913269043, + "learning_rate": 0.00019653858744115996, + "loss": 1.1645, + "step": 3113 + }, + { + "epoch": 0.1115189714756388, + "grad_norm": 1.790497899055481, + "learning_rate": 0.00019653556146959197, + "loss": 1.1414, + "step": 3114 + }, + { + "epoch": 0.11155478360520708, + "grad_norm": 1.5658843517303467, + "learning_rate": 0.00019653253419926254, + "loss": 1.3932, + "step": 3115 + }, + { + "epoch": 0.11159059573477537, + "grad_norm": 2.079388380050659, + "learning_rate": 0.00019652950563021237, + "loss": 1.235, + "step": 3116 + }, + { + "epoch": 0.11162640786434365, + "grad_norm": 1.6200380325317383, + "learning_rate": 0.00019652647576248223, + "loss": 1.2335, + "step": 3117 + }, + { + "epoch": 0.11166221999391193, + "grad_norm": 2.05619740486145, + "learning_rate": 0.00019652344459611287, + "loss": 1.3925, + "step": 3118 + }, + { + "epoch": 0.11169803212348023, + "grad_norm": 2.040635585784912, + "learning_rate": 0.00019652041213114504, + "loss": 1.4463, + "step": 3119 + }, + { + "epoch": 0.11173384425304851, + "grad_norm": 1.946821689605713, + "learning_rate": 0.0001965173783676196, + "loss": 1.1149, + "step": 3120 + }, + { + "epoch": 0.1117696563826168, + "grad_norm": 1.52767014503479, + "learning_rate": 0.0001965143433055773, + "loss": 1.2931, + "step": 3121 + }, + { + "epoch": 0.11180546851218508, + "grad_norm": 1.817968726158142, + "learning_rate": 0.00019651130694505904, + "loss": 1.2288, + "step": 3122 + }, + { + "epoch": 0.11184128064175336, + "grad_norm": 1.3640471696853638, + "learning_rate": 0.00019650826928610564, + "loss": 1.2652, + "step": 3123 + }, + { + "epoch": 0.11187709277132164, + "grad_norm": 1.6849696636199951, + "learning_rate": 0.00019650523032875791, + "loss": 1.0737, + "step": 3124 + }, + { + "epoch": 0.11191290490088993, + "grad_norm": 1.444207787513733, + "learning_rate": 0.00019650219007305686, + "loss": 1.1629, + "step": 3125 + }, + { + "epoch": 0.11194871703045822, + "grad_norm": 2.1553444862365723, + "learning_rate": 0.00019649914851904327, + "loss": 1.164, + "step": 3126 + }, + { + "epoch": 0.1119845291600265, + "grad_norm": 1.5282254219055176, + "learning_rate": 0.0001964961056667581, + "loss": 1.2375, + "step": 3127 + }, + { + "epoch": 0.11202034128959479, + "grad_norm": 1.7433973550796509, + "learning_rate": 0.00019649306151624235, + "loss": 1.1875, + "step": 3128 + }, + { + "epoch": 0.11205615341916307, + "grad_norm": 1.5794063806533813, + "learning_rate": 0.0001964900160675369, + "loss": 1.1999, + "step": 3129 + }, + { + "epoch": 0.11209196554873135, + "grad_norm": 2.031782865524292, + "learning_rate": 0.00019648696932068272, + "loss": 1.4308, + "step": 3130 + }, + { + "epoch": 0.11212777767829964, + "grad_norm": 1.538789987564087, + "learning_rate": 0.0001964839212757209, + "loss": 1.2289, + "step": 3131 + }, + { + "epoch": 0.11216358980786792, + "grad_norm": 1.7240490913391113, + "learning_rate": 0.00019648087193269232, + "loss": 1.1875, + "step": 3132 + }, + { + "epoch": 0.11219940193743622, + "grad_norm": 1.5663371086120605, + "learning_rate": 0.00019647782129163805, + "loss": 1.2488, + "step": 3133 + }, + { + "epoch": 0.1122352140670045, + "grad_norm": 1.4571062326431274, + "learning_rate": 0.00019647476935259916, + "loss": 1.1729, + "step": 3134 + }, + { + "epoch": 0.11227102619657278, + "grad_norm": 2.308295965194702, + "learning_rate": 0.0001964717161156167, + "loss": 1.2115, + "step": 3135 + }, + { + "epoch": 0.11230683832614106, + "grad_norm": 1.296649694442749, + "learning_rate": 0.00019646866158073173, + "loss": 1.0982, + "step": 3136 + }, + { + "epoch": 0.11234265045570935, + "grad_norm": 1.5956249237060547, + "learning_rate": 0.00019646560574798535, + "loss": 0.9104, + "step": 3137 + }, + { + "epoch": 0.11237846258527763, + "grad_norm": 1.2931467294692993, + "learning_rate": 0.0001964625486174187, + "loss": 1.0712, + "step": 3138 + }, + { + "epoch": 0.11241427471484591, + "grad_norm": 1.5110552310943604, + "learning_rate": 0.00019645949018907283, + "loss": 1.1533, + "step": 3139 + }, + { + "epoch": 0.11245008684441421, + "grad_norm": 1.5156255960464478, + "learning_rate": 0.000196456430462989, + "loss": 0.9901, + "step": 3140 + }, + { + "epoch": 0.11248589897398249, + "grad_norm": 1.3310749530792236, + "learning_rate": 0.00019645336943920828, + "loss": 1.2503, + "step": 3141 + }, + { + "epoch": 0.11252171110355078, + "grad_norm": 1.679858922958374, + "learning_rate": 0.00019645030711777192, + "loss": 1.0899, + "step": 3142 + }, + { + "epoch": 0.11255752323311906, + "grad_norm": 1.9810466766357422, + "learning_rate": 0.0001964472434987211, + "loss": 1.4057, + "step": 3143 + }, + { + "epoch": 0.11259333536268734, + "grad_norm": 1.3771454095840454, + "learning_rate": 0.00019644417858209702, + "loss": 1.1638, + "step": 3144 + }, + { + "epoch": 0.11262914749225562, + "grad_norm": 1.7264028787612915, + "learning_rate": 0.00019644111236794088, + "loss": 1.2655, + "step": 3145 + }, + { + "epoch": 0.1126649596218239, + "grad_norm": 1.936472773551941, + "learning_rate": 0.000196438044856294, + "loss": 1.2709, + "step": 3146 + }, + { + "epoch": 0.11270077175139219, + "grad_norm": 1.7968852519989014, + "learning_rate": 0.0001964349760471976, + "loss": 1.2133, + "step": 3147 + }, + { + "epoch": 0.11273658388096049, + "grad_norm": 1.7161732912063599, + "learning_rate": 0.00019643190594069302, + "loss": 1.4373, + "step": 3148 + }, + { + "epoch": 0.11277239601052877, + "grad_norm": 2.097959041595459, + "learning_rate": 0.00019642883453682152, + "loss": 1.3745, + "step": 3149 + }, + { + "epoch": 0.11280820814009705, + "grad_norm": 1.9481486082077026, + "learning_rate": 0.00019642576183562444, + "loss": 1.1402, + "step": 3150 + }, + { + "epoch": 0.11284402026966533, + "grad_norm": 1.8444058895111084, + "learning_rate": 0.00019642268783714312, + "loss": 1.4313, + "step": 3151 + }, + { + "epoch": 0.11287983239923362, + "grad_norm": 1.856662631034851, + "learning_rate": 0.0001964196125414189, + "loss": 1.2301, + "step": 3152 + }, + { + "epoch": 0.1129156445288019, + "grad_norm": 1.378593921661377, + "learning_rate": 0.0001964165359484932, + "loss": 1.2236, + "step": 3153 + }, + { + "epoch": 0.11295145665837018, + "grad_norm": 1.6731464862823486, + "learning_rate": 0.00019641345805840733, + "loss": 1.1247, + "step": 3154 + }, + { + "epoch": 0.11298726878793848, + "grad_norm": 1.6466379165649414, + "learning_rate": 0.00019641037887120277, + "loss": 1.3028, + "step": 3155 + }, + { + "epoch": 0.11302308091750676, + "grad_norm": 1.7709681987762451, + "learning_rate": 0.00019640729838692092, + "loss": 1.0653, + "step": 3156 + }, + { + "epoch": 0.11305889304707505, + "grad_norm": 1.5463871955871582, + "learning_rate": 0.00019640421660560323, + "loss": 1.1978, + "step": 3157 + }, + { + "epoch": 0.11309470517664333, + "grad_norm": 1.4446361064910889, + "learning_rate": 0.00019640113352729116, + "loss": 1.2781, + "step": 3158 + }, + { + "epoch": 0.11313051730621161, + "grad_norm": 1.3919930458068848, + "learning_rate": 0.00019639804915202617, + "loss": 1.1982, + "step": 3159 + }, + { + "epoch": 0.1131663294357799, + "grad_norm": 1.2673590183258057, + "learning_rate": 0.0001963949634798498, + "loss": 1.2044, + "step": 3160 + }, + { + "epoch": 0.11320214156534818, + "grad_norm": 1.4975892305374146, + "learning_rate": 0.0001963918765108035, + "loss": 1.0999, + "step": 3161 + }, + { + "epoch": 0.11323795369491647, + "grad_norm": 1.394385576248169, + "learning_rate": 0.00019638878824492886, + "loss": 1.1071, + "step": 3162 + }, + { + "epoch": 0.11327376582448476, + "grad_norm": 1.8313754796981812, + "learning_rate": 0.0001963856986822674, + "loss": 1.0623, + "step": 3163 + }, + { + "epoch": 0.11330957795405304, + "grad_norm": 1.5441484451293945, + "learning_rate": 0.00019638260782286072, + "loss": 1.0457, + "step": 3164 + }, + { + "epoch": 0.11334539008362132, + "grad_norm": 1.8952008485794067, + "learning_rate": 0.00019637951566675035, + "loss": 1.477, + "step": 3165 + }, + { + "epoch": 0.1133812022131896, + "grad_norm": 1.8150010108947754, + "learning_rate": 0.00019637642221397792, + "loss": 1.1854, + "step": 3166 + }, + { + "epoch": 0.11341701434275789, + "grad_norm": 1.8791371583938599, + "learning_rate": 0.00019637332746458506, + "loss": 1.3577, + "step": 3167 + }, + { + "epoch": 0.11345282647232617, + "grad_norm": 2.487931489944458, + "learning_rate": 0.00019637023141861338, + "loss": 1.3603, + "step": 3168 + }, + { + "epoch": 0.11348863860189447, + "grad_norm": 1.6032068729400635, + "learning_rate": 0.00019636713407610455, + "loss": 1.1556, + "step": 3169 + }, + { + "epoch": 0.11352445073146275, + "grad_norm": 1.7784035205841064, + "learning_rate": 0.0001963640354371002, + "loss": 1.5699, + "step": 3170 + }, + { + "epoch": 0.11356026286103103, + "grad_norm": 1.53371262550354, + "learning_rate": 0.00019636093550164208, + "loss": 1.2616, + "step": 3171 + }, + { + "epoch": 0.11359607499059932, + "grad_norm": 1.8701997995376587, + "learning_rate": 0.00019635783426977187, + "loss": 1.4666, + "step": 3172 + }, + { + "epoch": 0.1136318871201676, + "grad_norm": 1.465103030204773, + "learning_rate": 0.00019635473174153128, + "loss": 1.2741, + "step": 3173 + }, + { + "epoch": 0.11366769924973588, + "grad_norm": 1.3816897869110107, + "learning_rate": 0.00019635162791696212, + "loss": 1.2098, + "step": 3174 + }, + { + "epoch": 0.11370351137930416, + "grad_norm": 1.5653212070465088, + "learning_rate": 0.00019634852279610602, + "loss": 1.2489, + "step": 3175 + }, + { + "epoch": 0.11373932350887246, + "grad_norm": 2.2242093086242676, + "learning_rate": 0.00019634541637900487, + "loss": 1.2318, + "step": 3176 + }, + { + "epoch": 0.11377513563844074, + "grad_norm": 1.9457917213439941, + "learning_rate": 0.0001963423086657004, + "loss": 1.1177, + "step": 3177 + }, + { + "epoch": 0.11381094776800903, + "grad_norm": 1.5322331190109253, + "learning_rate": 0.00019633919965623444, + "loss": 1.3089, + "step": 3178 + }, + { + "epoch": 0.11384675989757731, + "grad_norm": 1.411341905593872, + "learning_rate": 0.0001963360893506488, + "loss": 1.0703, + "step": 3179 + }, + { + "epoch": 0.11388257202714559, + "grad_norm": 1.5790942907333374, + "learning_rate": 0.0001963329777489854, + "loss": 1.1477, + "step": 3180 + }, + { + "epoch": 0.11391838415671388, + "grad_norm": 1.8391317129135132, + "learning_rate": 0.00019632986485128602, + "loss": 1.2526, + "step": 3181 + }, + { + "epoch": 0.11395419628628216, + "grad_norm": 1.7155077457427979, + "learning_rate": 0.00019632675065759254, + "loss": 1.1206, + "step": 3182 + }, + { + "epoch": 0.11399000841585046, + "grad_norm": 2.3684730529785156, + "learning_rate": 0.0001963236351679469, + "loss": 1.3734, + "step": 3183 + }, + { + "epoch": 0.11402582054541874, + "grad_norm": 1.8422293663024902, + "learning_rate": 0.00019632051838239099, + "loss": 1.1945, + "step": 3184 + }, + { + "epoch": 0.11406163267498702, + "grad_norm": 1.8248080015182495, + "learning_rate": 0.00019631740030096677, + "loss": 1.4621, + "step": 3185 + }, + { + "epoch": 0.1140974448045553, + "grad_norm": 1.440938115119934, + "learning_rate": 0.00019631428092371612, + "loss": 1.2268, + "step": 3186 + }, + { + "epoch": 0.11413325693412359, + "grad_norm": 1.3884409666061401, + "learning_rate": 0.00019631116025068112, + "loss": 1.1873, + "step": 3187 + }, + { + "epoch": 0.11416906906369187, + "grad_norm": 1.8957469463348389, + "learning_rate": 0.00019630803828190368, + "loss": 1.4095, + "step": 3188 + }, + { + "epoch": 0.11420488119326015, + "grad_norm": 1.930794596672058, + "learning_rate": 0.00019630491501742577, + "loss": 1.1873, + "step": 3189 + }, + { + "epoch": 0.11424069332282845, + "grad_norm": 1.4723412990570068, + "learning_rate": 0.00019630179045728946, + "loss": 1.2613, + "step": 3190 + }, + { + "epoch": 0.11427650545239673, + "grad_norm": 1.5016422271728516, + "learning_rate": 0.00019629866460153683, + "loss": 1.2201, + "step": 3191 + }, + { + "epoch": 0.11431231758196501, + "grad_norm": 1.9082210063934326, + "learning_rate": 0.00019629553745020983, + "loss": 0.9191, + "step": 3192 + }, + { + "epoch": 0.1143481297115333, + "grad_norm": 1.3473032712936401, + "learning_rate": 0.00019629240900335062, + "loss": 1.1972, + "step": 3193 + }, + { + "epoch": 0.11438394184110158, + "grad_norm": 1.7120994329452515, + "learning_rate": 0.00019628927926100125, + "loss": 1.1437, + "step": 3194 + }, + { + "epoch": 0.11441975397066986, + "grad_norm": 1.5907862186431885, + "learning_rate": 0.0001962861482232038, + "loss": 1.1086, + "step": 3195 + }, + { + "epoch": 0.11445556610023815, + "grad_norm": 1.469390869140625, + "learning_rate": 0.00019628301589000047, + "loss": 1.1091, + "step": 3196 + }, + { + "epoch": 0.11449137822980643, + "grad_norm": 2.129167079925537, + "learning_rate": 0.00019627988226143334, + "loss": 1.119, + "step": 3197 + }, + { + "epoch": 0.11452719035937473, + "grad_norm": 1.4530543088912964, + "learning_rate": 0.00019627674733754458, + "loss": 1.1672, + "step": 3198 + }, + { + "epoch": 0.11456300248894301, + "grad_norm": 1.6876469850540161, + "learning_rate": 0.00019627361111837637, + "loss": 1.093, + "step": 3199 + }, + { + "epoch": 0.11459881461851129, + "grad_norm": 1.4332391023635864, + "learning_rate": 0.00019627047360397092, + "loss": 1.3656, + "step": 3200 + }, + { + "epoch": 0.11463462674807957, + "grad_norm": 1.767459511756897, + "learning_rate": 0.00019626733479437042, + "loss": 1.3802, + "step": 3201 + }, + { + "epoch": 0.11467043887764786, + "grad_norm": 1.743019461631775, + "learning_rate": 0.0001962641946896171, + "loss": 1.3125, + "step": 3202 + }, + { + "epoch": 0.11470625100721614, + "grad_norm": 2.0998106002807617, + "learning_rate": 0.0001962610532897532, + "loss": 1.5031, + "step": 3203 + }, + { + "epoch": 0.11474206313678442, + "grad_norm": 2.168558120727539, + "learning_rate": 0.00019625791059482106, + "loss": 1.0965, + "step": 3204 + }, + { + "epoch": 0.11477787526635272, + "grad_norm": 2.718688488006592, + "learning_rate": 0.00019625476660486285, + "loss": 1.183, + "step": 3205 + }, + { + "epoch": 0.114813687395921, + "grad_norm": 2.1164631843566895, + "learning_rate": 0.0001962516213199209, + "loss": 1.4101, + "step": 3206 + }, + { + "epoch": 0.11484949952548928, + "grad_norm": 1.6541707515716553, + "learning_rate": 0.00019624847474003756, + "loss": 1.3819, + "step": 3207 + }, + { + "epoch": 0.11488531165505757, + "grad_norm": 1.7489417791366577, + "learning_rate": 0.00019624532686525513, + "loss": 1.2364, + "step": 3208 + }, + { + "epoch": 0.11492112378462585, + "grad_norm": 1.2068191766738892, + "learning_rate": 0.000196242177695616, + "loss": 1.1161, + "step": 3209 + }, + { + "epoch": 0.11495693591419413, + "grad_norm": 1.7104312181472778, + "learning_rate": 0.0001962390272311625, + "loss": 1.3564, + "step": 3210 + }, + { + "epoch": 0.11499274804376242, + "grad_norm": 1.3586195707321167, + "learning_rate": 0.00019623587547193703, + "loss": 1.2004, + "step": 3211 + }, + { + "epoch": 0.11502856017333071, + "grad_norm": 1.812294363975525, + "learning_rate": 0.00019623272241798198, + "loss": 1.4124, + "step": 3212 + }, + { + "epoch": 0.115064372302899, + "grad_norm": 1.804189682006836, + "learning_rate": 0.0001962295680693398, + "loss": 1.2377, + "step": 3213 + }, + { + "epoch": 0.11510018443246728, + "grad_norm": 1.5341169834136963, + "learning_rate": 0.0001962264124260529, + "loss": 1.1645, + "step": 3214 + }, + { + "epoch": 0.11513599656203556, + "grad_norm": 1.5620911121368408, + "learning_rate": 0.00019622325548816373, + "loss": 1.1993, + "step": 3215 + }, + { + "epoch": 0.11517180869160384, + "grad_norm": 1.8674345016479492, + "learning_rate": 0.0001962200972557148, + "loss": 1.3055, + "step": 3216 + }, + { + "epoch": 0.11520762082117213, + "grad_norm": 1.490256905555725, + "learning_rate": 0.00019621693772874855, + "loss": 1.0593, + "step": 3217 + }, + { + "epoch": 0.11524343295074041, + "grad_norm": 1.4550861120224, + "learning_rate": 0.00019621377690730754, + "loss": 1.2007, + "step": 3218 + }, + { + "epoch": 0.1152792450803087, + "grad_norm": 1.7522491216659546, + "learning_rate": 0.00019621061479143425, + "loss": 1.1787, + "step": 3219 + }, + { + "epoch": 0.11531505720987699, + "grad_norm": 1.2830439805984497, + "learning_rate": 0.00019620745138117124, + "loss": 1.1094, + "step": 3220 + }, + { + "epoch": 0.11535086933944527, + "grad_norm": 1.3956905603408813, + "learning_rate": 0.00019620428667656108, + "loss": 1.2726, + "step": 3221 + }, + { + "epoch": 0.11538668146901356, + "grad_norm": 1.6379587650299072, + "learning_rate": 0.00019620112067764636, + "loss": 1.2903, + "step": 3222 + }, + { + "epoch": 0.11542249359858184, + "grad_norm": 1.5342439413070679, + "learning_rate": 0.0001961979533844696, + "loss": 1.0994, + "step": 3223 + }, + { + "epoch": 0.11545830572815012, + "grad_norm": 2.181086540222168, + "learning_rate": 0.0001961947847970735, + "loss": 1.2406, + "step": 3224 + }, + { + "epoch": 0.1154941178577184, + "grad_norm": 1.905002474784851, + "learning_rate": 0.00019619161491550065, + "loss": 0.9383, + "step": 3225 + }, + { + "epoch": 0.1155299299872867, + "grad_norm": 1.9139245748519897, + "learning_rate": 0.00019618844373979372, + "loss": 1.2071, + "step": 3226 + }, + { + "epoch": 0.11556574211685498, + "grad_norm": 1.7220063209533691, + "learning_rate": 0.0001961852712699953, + "loss": 1.0958, + "step": 3227 + }, + { + "epoch": 0.11560155424642327, + "grad_norm": 1.5378366708755493, + "learning_rate": 0.00019618209750614813, + "loss": 1.1329, + "step": 3228 + }, + { + "epoch": 0.11563736637599155, + "grad_norm": 1.6399130821228027, + "learning_rate": 0.00019617892244829495, + "loss": 1.2257, + "step": 3229 + }, + { + "epoch": 0.11567317850555983, + "grad_norm": 1.6919465065002441, + "learning_rate": 0.0001961757460964784, + "loss": 1.1381, + "step": 3230 + }, + { + "epoch": 0.11570899063512811, + "grad_norm": 1.8100955486297607, + "learning_rate": 0.00019617256845074125, + "loss": 1.2274, + "step": 3231 + }, + { + "epoch": 0.1157448027646964, + "grad_norm": 1.599463939666748, + "learning_rate": 0.00019616938951112623, + "loss": 1.0993, + "step": 3232 + }, + { + "epoch": 0.1157806148942647, + "grad_norm": 2.106494903564453, + "learning_rate": 0.00019616620927767614, + "loss": 1.068, + "step": 3233 + }, + { + "epoch": 0.11581642702383298, + "grad_norm": 1.682153582572937, + "learning_rate": 0.00019616302775043377, + "loss": 1.24, + "step": 3234 + }, + { + "epoch": 0.11585223915340126, + "grad_norm": 1.6491785049438477, + "learning_rate": 0.00019615984492944187, + "loss": 1.315, + "step": 3235 + }, + { + "epoch": 0.11588805128296954, + "grad_norm": 1.3360650539398193, + "learning_rate": 0.00019615666081474332, + "loss": 1.2198, + "step": 3236 + }, + { + "epoch": 0.11592386341253783, + "grad_norm": 1.5949509143829346, + "learning_rate": 0.00019615347540638092, + "loss": 1.2657, + "step": 3237 + }, + { + "epoch": 0.11595967554210611, + "grad_norm": 1.9191782474517822, + "learning_rate": 0.00019615028870439752, + "loss": 1.251, + "step": 3238 + }, + { + "epoch": 0.11599548767167439, + "grad_norm": 1.9771796464920044, + "learning_rate": 0.00019614710070883602, + "loss": 1.1105, + "step": 3239 + }, + { + "epoch": 0.11603129980124269, + "grad_norm": 1.414929986000061, + "learning_rate": 0.00019614391141973934, + "loss": 1.1286, + "step": 3240 + }, + { + "epoch": 0.11606711193081097, + "grad_norm": 1.7678372859954834, + "learning_rate": 0.00019614072083715028, + "loss": 1.1246, + "step": 3241 + }, + { + "epoch": 0.11610292406037925, + "grad_norm": 1.8011178970336914, + "learning_rate": 0.00019613752896111187, + "loss": 1.2411, + "step": 3242 + }, + { + "epoch": 0.11613873618994754, + "grad_norm": 2.3432369232177734, + "learning_rate": 0.00019613433579166706, + "loss": 1.5058, + "step": 3243 + }, + { + "epoch": 0.11617454831951582, + "grad_norm": 1.7754435539245605, + "learning_rate": 0.0001961311413288587, + "loss": 1.26, + "step": 3244 + }, + { + "epoch": 0.1162103604490841, + "grad_norm": 1.9865550994873047, + "learning_rate": 0.00019612794557272983, + "loss": 1.2328, + "step": 3245 + }, + { + "epoch": 0.11624617257865238, + "grad_norm": 1.353009581565857, + "learning_rate": 0.00019612474852332348, + "loss": 1.1414, + "step": 3246 + }, + { + "epoch": 0.11628198470822067, + "grad_norm": 1.569539189338684, + "learning_rate": 0.00019612155018068264, + "loss": 1.1328, + "step": 3247 + }, + { + "epoch": 0.11631779683778896, + "grad_norm": 2.6068270206451416, + "learning_rate": 0.00019611835054485032, + "loss": 1.4352, + "step": 3248 + }, + { + "epoch": 0.11635360896735725, + "grad_norm": 1.5074113607406616, + "learning_rate": 0.00019611514961586957, + "loss": 1.0729, + "step": 3249 + }, + { + "epoch": 0.11638942109692553, + "grad_norm": 1.461962342262268, + "learning_rate": 0.00019611194739378344, + "loss": 1.3025, + "step": 3250 + }, + { + "epoch": 0.11642523322649381, + "grad_norm": 1.8271667957305908, + "learning_rate": 0.00019610874387863508, + "loss": 1.1057, + "step": 3251 + }, + { + "epoch": 0.1164610453560621, + "grad_norm": 1.687117099761963, + "learning_rate": 0.00019610553907046748, + "loss": 1.2873, + "step": 3252 + }, + { + "epoch": 0.11649685748563038, + "grad_norm": 1.7939573526382446, + "learning_rate": 0.0001961023329693239, + "loss": 1.2771, + "step": 3253 + }, + { + "epoch": 0.11653266961519866, + "grad_norm": 1.6531643867492676, + "learning_rate": 0.00019609912557524734, + "loss": 1.1824, + "step": 3254 + }, + { + "epoch": 0.11656848174476696, + "grad_norm": 1.3313435316085815, + "learning_rate": 0.000196095916888281, + "loss": 1.1731, + "step": 3255 + }, + { + "epoch": 0.11660429387433524, + "grad_norm": 1.787941575050354, + "learning_rate": 0.00019609270690846807, + "loss": 1.3363, + "step": 3256 + }, + { + "epoch": 0.11664010600390352, + "grad_norm": 1.4001331329345703, + "learning_rate": 0.00019608949563585174, + "loss": 1.1009, + "step": 3257 + }, + { + "epoch": 0.1166759181334718, + "grad_norm": 2.660851001739502, + "learning_rate": 0.00019608628307047517, + "loss": 1.2702, + "step": 3258 + }, + { + "epoch": 0.11671173026304009, + "grad_norm": 1.4838459491729736, + "learning_rate": 0.0001960830692123816, + "loss": 0.9979, + "step": 3259 + }, + { + "epoch": 0.11674754239260837, + "grad_norm": 1.466352939605713, + "learning_rate": 0.00019607985406161425, + "loss": 1.1752, + "step": 3260 + }, + { + "epoch": 0.11678335452217666, + "grad_norm": 1.563137412071228, + "learning_rate": 0.00019607663761821644, + "loss": 1.3559, + "step": 3261 + }, + { + "epoch": 0.11681916665174495, + "grad_norm": 1.8242155313491821, + "learning_rate": 0.0001960734198822314, + "loss": 1.2599, + "step": 3262 + }, + { + "epoch": 0.11685497878131323, + "grad_norm": 2.0237319469451904, + "learning_rate": 0.0001960702008537024, + "loss": 1.2085, + "step": 3263 + }, + { + "epoch": 0.11689079091088152, + "grad_norm": 2.316171169281006, + "learning_rate": 0.00019606698053267277, + "loss": 1.4008, + "step": 3264 + }, + { + "epoch": 0.1169266030404498, + "grad_norm": 1.859920859336853, + "learning_rate": 0.00019606375891918583, + "loss": 1.3872, + "step": 3265 + }, + { + "epoch": 0.11696241517001808, + "grad_norm": 1.4649406671524048, + "learning_rate": 0.00019606053601328496, + "loss": 1.2661, + "step": 3266 + }, + { + "epoch": 0.11699822729958637, + "grad_norm": 2.2029640674591064, + "learning_rate": 0.00019605731181501342, + "loss": 1.0322, + "step": 3267 + }, + { + "epoch": 0.11703403942915465, + "grad_norm": 1.7063302993774414, + "learning_rate": 0.00019605408632441474, + "loss": 1.3291, + "step": 3268 + }, + { + "epoch": 0.11706985155872295, + "grad_norm": 1.4093822240829468, + "learning_rate": 0.00019605085954153218, + "loss": 1.3354, + "step": 3269 + }, + { + "epoch": 0.11710566368829123, + "grad_norm": 1.896452784538269, + "learning_rate": 0.00019604763146640922, + "loss": 1.2954, + "step": 3270 + }, + { + "epoch": 0.11714147581785951, + "grad_norm": 1.5280622243881226, + "learning_rate": 0.00019604440209908925, + "loss": 1.4784, + "step": 3271 + }, + { + "epoch": 0.1171772879474278, + "grad_norm": 1.9211255311965942, + "learning_rate": 0.00019604117143961575, + "loss": 1.0333, + "step": 3272 + }, + { + "epoch": 0.11721310007699608, + "grad_norm": 1.9889100790023804, + "learning_rate": 0.00019603793948803216, + "loss": 1.1552, + "step": 3273 + }, + { + "epoch": 0.11724891220656436, + "grad_norm": 2.271162748336792, + "learning_rate": 0.000196034706244382, + "loss": 1.2124, + "step": 3274 + }, + { + "epoch": 0.11728472433613264, + "grad_norm": 1.6901897192001343, + "learning_rate": 0.0001960314717087087, + "loss": 1.1224, + "step": 3275 + }, + { + "epoch": 0.11732053646570094, + "grad_norm": 1.5182793140411377, + "learning_rate": 0.00019602823588105585, + "loss": 1.1872, + "step": 3276 + }, + { + "epoch": 0.11735634859526922, + "grad_norm": 1.7282302379608154, + "learning_rate": 0.000196024998761467, + "loss": 1.2233, + "step": 3277 + }, + { + "epoch": 0.1173921607248375, + "grad_norm": 3.0508010387420654, + "learning_rate": 0.00019602176034998556, + "loss": 1.4208, + "step": 3278 + }, + { + "epoch": 0.11742797285440579, + "grad_norm": 1.5991183519363403, + "learning_rate": 0.00019601852064665524, + "loss": 1.2437, + "step": 3279 + }, + { + "epoch": 0.11746378498397407, + "grad_norm": 2.168799877166748, + "learning_rate": 0.0001960152796515196, + "loss": 1.2358, + "step": 3280 + }, + { + "epoch": 0.11749959711354235, + "grad_norm": 1.5082751512527466, + "learning_rate": 0.00019601203736462219, + "loss": 1.2962, + "step": 3281 + }, + { + "epoch": 0.11753540924311064, + "grad_norm": 2.6055402755737305, + "learning_rate": 0.00019600879378600666, + "loss": 1.3102, + "step": 3282 + }, + { + "epoch": 0.11757122137267893, + "grad_norm": 1.5780483484268188, + "learning_rate": 0.0001960055489157167, + "loss": 1.2992, + "step": 3283 + }, + { + "epoch": 0.11760703350224722, + "grad_norm": 1.3561161756515503, + "learning_rate": 0.00019600230275379588, + "loss": 1.3914, + "step": 3284 + }, + { + "epoch": 0.1176428456318155, + "grad_norm": 2.0068888664245605, + "learning_rate": 0.0001959990553002879, + "loss": 1.2048, + "step": 3285 + }, + { + "epoch": 0.11767865776138378, + "grad_norm": 1.4673808813095093, + "learning_rate": 0.0001959958065552365, + "loss": 1.3874, + "step": 3286 + }, + { + "epoch": 0.11771446989095206, + "grad_norm": 1.4518505334854126, + "learning_rate": 0.0001959925565186853, + "loss": 1.1188, + "step": 3287 + }, + { + "epoch": 0.11775028202052035, + "grad_norm": 1.6385209560394287, + "learning_rate": 0.00019598930519067813, + "loss": 1.2566, + "step": 3288 + }, + { + "epoch": 0.11778609415008863, + "grad_norm": 1.4854462146759033, + "learning_rate": 0.00019598605257125864, + "loss": 1.1057, + "step": 3289 + }, + { + "epoch": 0.11782190627965693, + "grad_norm": 2.0305540561676025, + "learning_rate": 0.0001959827986604706, + "loss": 1.1756, + "step": 3290 + }, + { + "epoch": 0.11785771840922521, + "grad_norm": 1.699625849723816, + "learning_rate": 0.00019597954345835787, + "loss": 1.2179, + "step": 3291 + }, + { + "epoch": 0.11789353053879349, + "grad_norm": 1.917992353439331, + "learning_rate": 0.00019597628696496418, + "loss": 1.3183, + "step": 3292 + }, + { + "epoch": 0.11792934266836178, + "grad_norm": 1.9767767190933228, + "learning_rate": 0.0001959730291803333, + "loss": 1.5066, + "step": 3293 + }, + { + "epoch": 0.11796515479793006, + "grad_norm": 2.053107500076294, + "learning_rate": 0.00019596977010450915, + "loss": 1.0921, + "step": 3294 + }, + { + "epoch": 0.11800096692749834, + "grad_norm": 1.317548155784607, + "learning_rate": 0.00019596650973753555, + "loss": 1.4611, + "step": 3295 + }, + { + "epoch": 0.11803677905706662, + "grad_norm": 2.0114073753356934, + "learning_rate": 0.00019596324807945632, + "loss": 1.1702, + "step": 3296 + }, + { + "epoch": 0.1180725911866349, + "grad_norm": 1.4958434104919434, + "learning_rate": 0.00019595998513031537, + "loss": 1.1519, + "step": 3297 + }, + { + "epoch": 0.1181084033162032, + "grad_norm": 2.875328779220581, + "learning_rate": 0.00019595672089015663, + "loss": 1.3533, + "step": 3298 + }, + { + "epoch": 0.11814421544577149, + "grad_norm": 1.9566287994384766, + "learning_rate": 0.00019595345535902394, + "loss": 1.0201, + "step": 3299 + }, + { + "epoch": 0.11818002757533977, + "grad_norm": 1.7578203678131104, + "learning_rate": 0.0001959501885369613, + "loss": 1.2028, + "step": 3300 + }, + { + "epoch": 0.11821583970490805, + "grad_norm": 2.475909948348999, + "learning_rate": 0.00019594692042401263, + "loss": 1.2891, + "step": 3301 + }, + { + "epoch": 0.11825165183447633, + "grad_norm": 1.9937820434570312, + "learning_rate": 0.00019594365102022193, + "loss": 1.3128, + "step": 3302 + }, + { + "epoch": 0.11828746396404462, + "grad_norm": 1.5909672975540161, + "learning_rate": 0.00019594038032563315, + "loss": 1.2517, + "step": 3303 + }, + { + "epoch": 0.1183232760936129, + "grad_norm": 1.728240966796875, + "learning_rate": 0.0001959371083402903, + "loss": 1.3736, + "step": 3304 + }, + { + "epoch": 0.1183590882231812, + "grad_norm": 1.6301099061965942, + "learning_rate": 0.00019593383506423743, + "loss": 1.2364, + "step": 3305 + }, + { + "epoch": 0.11839490035274948, + "grad_norm": 1.5758976936340332, + "learning_rate": 0.00019593056049751852, + "loss": 1.1733, + "step": 3306 + }, + { + "epoch": 0.11843071248231776, + "grad_norm": 1.4152430295944214, + "learning_rate": 0.0001959272846401777, + "loss": 1.1338, + "step": 3307 + }, + { + "epoch": 0.11846652461188605, + "grad_norm": 1.6814019680023193, + "learning_rate": 0.000195924007492259, + "loss": 1.0809, + "step": 3308 + }, + { + "epoch": 0.11850233674145433, + "grad_norm": 2.469101905822754, + "learning_rate": 0.00019592072905380648, + "loss": 1.3181, + "step": 3309 + }, + { + "epoch": 0.11853814887102261, + "grad_norm": 2.3670248985290527, + "learning_rate": 0.00019591744932486428, + "loss": 1.2324, + "step": 3310 + }, + { + "epoch": 0.1185739610005909, + "grad_norm": 1.7352133989334106, + "learning_rate": 0.00019591416830547657, + "loss": 1.185, + "step": 3311 + }, + { + "epoch": 0.11860977313015919, + "grad_norm": 1.8280692100524902, + "learning_rate": 0.0001959108859956874, + "loss": 1.287, + "step": 3312 + }, + { + "epoch": 0.11864558525972747, + "grad_norm": 2.4229044914245605, + "learning_rate": 0.00019590760239554097, + "loss": 1.2761, + "step": 3313 + }, + { + "epoch": 0.11868139738929576, + "grad_norm": 1.6661885976791382, + "learning_rate": 0.00019590431750508153, + "loss": 1.1878, + "step": 3314 + }, + { + "epoch": 0.11871720951886404, + "grad_norm": 1.5830167531967163, + "learning_rate": 0.00019590103132435314, + "loss": 1.3466, + "step": 3315 + }, + { + "epoch": 0.11875302164843232, + "grad_norm": 1.6322855949401855, + "learning_rate": 0.00019589774385340007, + "loss": 1.3415, + "step": 3316 + }, + { + "epoch": 0.1187888337780006, + "grad_norm": 2.247825860977173, + "learning_rate": 0.0001958944550922666, + "loss": 1.221, + "step": 3317 + }, + { + "epoch": 0.11882464590756889, + "grad_norm": 1.620654582977295, + "learning_rate": 0.0001958911650409969, + "loss": 1.0723, + "step": 3318 + }, + { + "epoch": 0.11886045803713718, + "grad_norm": 1.6880791187286377, + "learning_rate": 0.0001958878736996353, + "loss": 1.2884, + "step": 3319 + }, + { + "epoch": 0.11889627016670547, + "grad_norm": 1.343462586402893, + "learning_rate": 0.00019588458106822602, + "loss": 1.1811, + "step": 3320 + }, + { + "epoch": 0.11893208229627375, + "grad_norm": 1.8503739833831787, + "learning_rate": 0.00019588128714681337, + "loss": 1.2199, + "step": 3321 + }, + { + "epoch": 0.11896789442584203, + "grad_norm": 1.8702141046524048, + "learning_rate": 0.0001958779919354417, + "loss": 1.4809, + "step": 3322 + }, + { + "epoch": 0.11900370655541032, + "grad_norm": 1.4704456329345703, + "learning_rate": 0.00019587469543415532, + "loss": 1.1352, + "step": 3323 + }, + { + "epoch": 0.1190395186849786, + "grad_norm": 1.5397899150848389, + "learning_rate": 0.00019587139764299857, + "loss": 1.1995, + "step": 3324 + }, + { + "epoch": 0.11907533081454688, + "grad_norm": 1.6772550344467163, + "learning_rate": 0.00019586809856201586, + "loss": 1.3091, + "step": 3325 + }, + { + "epoch": 0.11911114294411518, + "grad_norm": 2.149824619293213, + "learning_rate": 0.00019586479819125153, + "loss": 1.2466, + "step": 3326 + }, + { + "epoch": 0.11914695507368346, + "grad_norm": 1.4862260818481445, + "learning_rate": 0.00019586149653074997, + "loss": 0.9555, + "step": 3327 + }, + { + "epoch": 0.11918276720325174, + "grad_norm": 1.2253000736236572, + "learning_rate": 0.00019585819358055567, + "loss": 1.1394, + "step": 3328 + }, + { + "epoch": 0.11921857933282003, + "grad_norm": 2.335902214050293, + "learning_rate": 0.00019585488934071302, + "loss": 1.3433, + "step": 3329 + }, + { + "epoch": 0.11925439146238831, + "grad_norm": 1.4649436473846436, + "learning_rate": 0.00019585158381126645, + "loss": 1.0602, + "step": 3330 + }, + { + "epoch": 0.11929020359195659, + "grad_norm": 2.0767085552215576, + "learning_rate": 0.00019584827699226044, + "loss": 1.207, + "step": 3331 + }, + { + "epoch": 0.11932601572152488, + "grad_norm": 1.7085540294647217, + "learning_rate": 0.00019584496888373955, + "loss": 1.0621, + "step": 3332 + }, + { + "epoch": 0.11936182785109317, + "grad_norm": 1.6900063753128052, + "learning_rate": 0.00019584165948574822, + "loss": 1.2936, + "step": 3333 + }, + { + "epoch": 0.11939763998066145, + "grad_norm": 2.278123378753662, + "learning_rate": 0.00019583834879833097, + "loss": 1.0604, + "step": 3334 + }, + { + "epoch": 0.11943345211022974, + "grad_norm": 1.8099993467330933, + "learning_rate": 0.0001958350368215324, + "loss": 1.1853, + "step": 3335 + }, + { + "epoch": 0.11946926423979802, + "grad_norm": 1.5281099081039429, + "learning_rate": 0.00019583172355539698, + "loss": 1.2368, + "step": 3336 + }, + { + "epoch": 0.1195050763693663, + "grad_norm": 2.1402721405029297, + "learning_rate": 0.00019582840899996936, + "loss": 1.1366, + "step": 3337 + }, + { + "epoch": 0.11954088849893459, + "grad_norm": 1.6550977230072021, + "learning_rate": 0.00019582509315529408, + "loss": 1.3002, + "step": 3338 + }, + { + "epoch": 0.11957670062850287, + "grad_norm": 2.219606876373291, + "learning_rate": 0.0001958217760214158, + "loss": 1.3626, + "step": 3339 + }, + { + "epoch": 0.11961251275807117, + "grad_norm": 1.5945137739181519, + "learning_rate": 0.00019581845759837914, + "loss": 1.2448, + "step": 3340 + }, + { + "epoch": 0.11964832488763945, + "grad_norm": 1.915103793144226, + "learning_rate": 0.0001958151378862287, + "loss": 1.1554, + "step": 3341 + }, + { + "epoch": 0.11968413701720773, + "grad_norm": 2.149441719055176, + "learning_rate": 0.00019581181688500918, + "loss": 1.3402, + "step": 3342 + }, + { + "epoch": 0.11971994914677601, + "grad_norm": 2.241023063659668, + "learning_rate": 0.00019580849459476527, + "loss": 1.1886, + "step": 3343 + }, + { + "epoch": 0.1197557612763443, + "grad_norm": 1.9235414266586304, + "learning_rate": 0.00019580517101554164, + "loss": 1.1942, + "step": 3344 + }, + { + "epoch": 0.11979157340591258, + "grad_norm": 1.3793772459030151, + "learning_rate": 0.00019580184614738299, + "loss": 1.2452, + "step": 3345 + }, + { + "epoch": 0.11982738553548086, + "grad_norm": 1.7663137912750244, + "learning_rate": 0.0001957985199903341, + "loss": 1.1406, + "step": 3346 + }, + { + "epoch": 0.11986319766504915, + "grad_norm": 1.7154361009597778, + "learning_rate": 0.00019579519254443967, + "loss": 1.3336, + "step": 3347 + }, + { + "epoch": 0.11989900979461744, + "grad_norm": 1.3215693235397339, + "learning_rate": 0.00019579186380974455, + "loss": 1.0479, + "step": 3348 + }, + { + "epoch": 0.11993482192418572, + "grad_norm": 1.6724573373794556, + "learning_rate": 0.0001957885337862934, + "loss": 1.287, + "step": 3349 + }, + { + "epoch": 0.11997063405375401, + "grad_norm": 1.6024593114852905, + "learning_rate": 0.00019578520247413113, + "loss": 1.2661, + "step": 3350 + }, + { + "epoch": 0.12000644618332229, + "grad_norm": 1.9433929920196533, + "learning_rate": 0.0001957818698733025, + "loss": 1.3419, + "step": 3351 + }, + { + "epoch": 0.12004225831289057, + "grad_norm": 2.333376407623291, + "learning_rate": 0.00019577853598385235, + "loss": 0.9934, + "step": 3352 + }, + { + "epoch": 0.12007807044245886, + "grad_norm": 1.9373081922531128, + "learning_rate": 0.00019577520080582556, + "loss": 1.3308, + "step": 3353 + }, + { + "epoch": 0.12011388257202714, + "grad_norm": 1.4264698028564453, + "learning_rate": 0.00019577186433926698, + "loss": 1.3944, + "step": 3354 + }, + { + "epoch": 0.12014969470159544, + "grad_norm": 1.728320837020874, + "learning_rate": 0.00019576852658422146, + "loss": 1.3237, + "step": 3355 + }, + { + "epoch": 0.12018550683116372, + "grad_norm": 2.4400460720062256, + "learning_rate": 0.000195765187540734, + "loss": 1.3948, + "step": 3356 + }, + { + "epoch": 0.120221318960732, + "grad_norm": 2.0699050426483154, + "learning_rate": 0.00019576184720884946, + "loss": 1.3813, + "step": 3357 + }, + { + "epoch": 0.12025713109030028, + "grad_norm": 1.615922212600708, + "learning_rate": 0.00019575850558861278, + "loss": 1.3131, + "step": 3358 + }, + { + "epoch": 0.12029294321986857, + "grad_norm": 1.6759737730026245, + "learning_rate": 0.00019575516268006892, + "loss": 1.1858, + "step": 3359 + }, + { + "epoch": 0.12032875534943685, + "grad_norm": 1.3411206007003784, + "learning_rate": 0.00019575181848326289, + "loss": 1.0367, + "step": 3360 + }, + { + "epoch": 0.12036456747900513, + "grad_norm": 1.5762470960617065, + "learning_rate": 0.00019574847299823965, + "loss": 1.2425, + "step": 3361 + }, + { + "epoch": 0.12040037960857343, + "grad_norm": 1.5277354717254639, + "learning_rate": 0.00019574512622504416, + "loss": 1.2675, + "step": 3362 + }, + { + "epoch": 0.12043619173814171, + "grad_norm": 1.3411957025527954, + "learning_rate": 0.00019574177816372154, + "loss": 1.1213, + "step": 3363 + }, + { + "epoch": 0.12047200386771, + "grad_norm": 1.3351263999938965, + "learning_rate": 0.0001957384288143168, + "loss": 1.3232, + "step": 3364 + }, + { + "epoch": 0.12050781599727828, + "grad_norm": 2.0293631553649902, + "learning_rate": 0.000195735078176875, + "loss": 1.1463, + "step": 3365 + }, + { + "epoch": 0.12054362812684656, + "grad_norm": 2.036923408508301, + "learning_rate": 0.0001957317262514412, + "loss": 1.1953, + "step": 3366 + }, + { + "epoch": 0.12057944025641484, + "grad_norm": 2.5202536582946777, + "learning_rate": 0.00019572837303806048, + "loss": 1.3889, + "step": 3367 + }, + { + "epoch": 0.12061525238598313, + "grad_norm": 1.6876493692398071, + "learning_rate": 0.00019572501853677802, + "loss": 1.1133, + "step": 3368 + }, + { + "epoch": 0.12065106451555142, + "grad_norm": 2.0812149047851562, + "learning_rate": 0.0001957216627476389, + "loss": 1.1399, + "step": 3369 + }, + { + "epoch": 0.1206868766451197, + "grad_norm": 2.4492738246917725, + "learning_rate": 0.0001957183056706883, + "loss": 1.3374, + "step": 3370 + }, + { + "epoch": 0.12072268877468799, + "grad_norm": 2.413912534713745, + "learning_rate": 0.0001957149473059713, + "loss": 1.1906, + "step": 3371 + }, + { + "epoch": 0.12075850090425627, + "grad_norm": 1.4304925203323364, + "learning_rate": 0.0001957115876535332, + "loss": 1.2637, + "step": 3372 + }, + { + "epoch": 0.12079431303382455, + "grad_norm": 1.9818978309631348, + "learning_rate": 0.00019570822671341915, + "loss": 1.2345, + "step": 3373 + }, + { + "epoch": 0.12083012516339284, + "grad_norm": 1.7940864562988281, + "learning_rate": 0.00019570486448567437, + "loss": 1.288, + "step": 3374 + }, + { + "epoch": 0.12086593729296112, + "grad_norm": 1.7134274244308472, + "learning_rate": 0.00019570150097034404, + "loss": 1.255, + "step": 3375 + }, + { + "epoch": 0.12090174942252942, + "grad_norm": 1.5690858364105225, + "learning_rate": 0.0001956981361674735, + "loss": 1.3387, + "step": 3376 + }, + { + "epoch": 0.1209375615520977, + "grad_norm": 1.68631911277771, + "learning_rate": 0.00019569477007710798, + "loss": 1.313, + "step": 3377 + }, + { + "epoch": 0.12097337368166598, + "grad_norm": 1.3937112092971802, + "learning_rate": 0.00019569140269929276, + "loss": 1.3103, + "step": 3378 + }, + { + "epoch": 0.12100918581123427, + "grad_norm": 1.9333696365356445, + "learning_rate": 0.00019568803403407315, + "loss": 1.3083, + "step": 3379 + }, + { + "epoch": 0.12104499794080255, + "grad_norm": 1.6884723901748657, + "learning_rate": 0.00019568466408149447, + "loss": 1.0347, + "step": 3380 + }, + { + "epoch": 0.12108081007037083, + "grad_norm": 1.699155569076538, + "learning_rate": 0.00019568129284160203, + "loss": 1.1284, + "step": 3381 + }, + { + "epoch": 0.12111662219993911, + "grad_norm": 1.4022928476333618, + "learning_rate": 0.00019567792031444125, + "loss": 1.0891, + "step": 3382 + }, + { + "epoch": 0.12115243432950741, + "grad_norm": 1.5060577392578125, + "learning_rate": 0.00019567454650005749, + "loss": 1.2924, + "step": 3383 + }, + { + "epoch": 0.1211882464590757, + "grad_norm": 1.8115978240966797, + "learning_rate": 0.00019567117139849605, + "loss": 1.3024, + "step": 3384 + }, + { + "epoch": 0.12122405858864398, + "grad_norm": 1.393578052520752, + "learning_rate": 0.00019566779500980247, + "loss": 0.9634, + "step": 3385 + }, + { + "epoch": 0.12125987071821226, + "grad_norm": 1.740544080734253, + "learning_rate": 0.00019566441733402207, + "loss": 0.9691, + "step": 3386 + }, + { + "epoch": 0.12129568284778054, + "grad_norm": 2.3696482181549072, + "learning_rate": 0.00019566103837120036, + "loss": 1.0505, + "step": 3387 + }, + { + "epoch": 0.12133149497734882, + "grad_norm": 1.4522120952606201, + "learning_rate": 0.00019565765812138274, + "loss": 1.1263, + "step": 3388 + }, + { + "epoch": 0.12136730710691711, + "grad_norm": 1.7515254020690918, + "learning_rate": 0.00019565427658461474, + "loss": 1.2417, + "step": 3389 + }, + { + "epoch": 0.1214031192364854, + "grad_norm": 1.552008867263794, + "learning_rate": 0.00019565089376094184, + "loss": 1.2679, + "step": 3390 + }, + { + "epoch": 0.12143893136605369, + "grad_norm": 1.8509860038757324, + "learning_rate": 0.0001956475096504095, + "loss": 1.3208, + "step": 3391 + }, + { + "epoch": 0.12147474349562197, + "grad_norm": 1.6061866283416748, + "learning_rate": 0.00019564412425306338, + "loss": 1.3342, + "step": 3392 + }, + { + "epoch": 0.12151055562519025, + "grad_norm": 2.0603091716766357, + "learning_rate": 0.00019564073756894889, + "loss": 1.3634, + "step": 3393 + }, + { + "epoch": 0.12154636775475854, + "grad_norm": 2.4436655044555664, + "learning_rate": 0.00019563734959811163, + "loss": 1.1359, + "step": 3394 + }, + { + "epoch": 0.12158217988432682, + "grad_norm": 2.723891019821167, + "learning_rate": 0.00019563396034059724, + "loss": 1.1577, + "step": 3395 + }, + { + "epoch": 0.1216179920138951, + "grad_norm": 1.4671378135681152, + "learning_rate": 0.00019563056979645123, + "loss": 1.176, + "step": 3396 + }, + { + "epoch": 0.12165380414346338, + "grad_norm": 2.0345118045806885, + "learning_rate": 0.00019562717796571929, + "loss": 1.1763, + "step": 3397 + }, + { + "epoch": 0.12168961627303168, + "grad_norm": 1.5550769567489624, + "learning_rate": 0.00019562378484844697, + "loss": 1.2988, + "step": 3398 + }, + { + "epoch": 0.12172542840259996, + "grad_norm": 1.3475502729415894, + "learning_rate": 0.00019562039044468, + "loss": 1.3448, + "step": 3399 + }, + { + "epoch": 0.12176124053216825, + "grad_norm": 1.8239307403564453, + "learning_rate": 0.00019561699475446401, + "loss": 1.1447, + "step": 3400 + }, + { + "epoch": 0.12179705266173653, + "grad_norm": 1.5480351448059082, + "learning_rate": 0.00019561359777784472, + "loss": 1.124, + "step": 3401 + }, + { + "epoch": 0.12183286479130481, + "grad_norm": 1.8290865421295166, + "learning_rate": 0.0001956101995148678, + "loss": 1.2743, + "step": 3402 + }, + { + "epoch": 0.1218686769208731, + "grad_norm": 1.546467900276184, + "learning_rate": 0.00019560679996557894, + "loss": 1.3055, + "step": 3403 + }, + { + "epoch": 0.12190448905044138, + "grad_norm": 1.3050352334976196, + "learning_rate": 0.00019560339913002396, + "loss": 1.2757, + "step": 3404 + }, + { + "epoch": 0.12194030118000967, + "grad_norm": 2.0538487434387207, + "learning_rate": 0.00019559999700824852, + "loss": 1.3136, + "step": 3405 + }, + { + "epoch": 0.12197611330957796, + "grad_norm": 1.405582070350647, + "learning_rate": 0.00019559659360029845, + "loss": 1.2607, + "step": 3406 + }, + { + "epoch": 0.12201192543914624, + "grad_norm": 2.2261979579925537, + "learning_rate": 0.0001955931889062195, + "loss": 1.28, + "step": 3407 + }, + { + "epoch": 0.12204773756871452, + "grad_norm": 1.74753737449646, + "learning_rate": 0.00019558978292605754, + "loss": 1.1607, + "step": 3408 + }, + { + "epoch": 0.1220835496982828, + "grad_norm": 2.0766396522521973, + "learning_rate": 0.00019558637565985834, + "loss": 1.0285, + "step": 3409 + }, + { + "epoch": 0.12211936182785109, + "grad_norm": 1.7329195737838745, + "learning_rate": 0.00019558296710766774, + "loss": 1.1474, + "step": 3410 + }, + { + "epoch": 0.12215517395741937, + "grad_norm": 1.3776458501815796, + "learning_rate": 0.00019557955726953163, + "loss": 1.1306, + "step": 3411 + }, + { + "epoch": 0.12219098608698767, + "grad_norm": 1.4767249822616577, + "learning_rate": 0.00019557614614549586, + "loss": 1.2332, + "step": 3412 + }, + { + "epoch": 0.12222679821655595, + "grad_norm": 2.113649606704712, + "learning_rate": 0.00019557273373560632, + "loss": 1.3377, + "step": 3413 + }, + { + "epoch": 0.12226261034612423, + "grad_norm": 1.2179756164550781, + "learning_rate": 0.00019556932003990892, + "loss": 1.0252, + "step": 3414 + }, + { + "epoch": 0.12229842247569252, + "grad_norm": 1.6309434175491333, + "learning_rate": 0.0001955659050584496, + "loss": 1.2337, + "step": 3415 + }, + { + "epoch": 0.1223342346052608, + "grad_norm": 2.049391031265259, + "learning_rate": 0.0001955624887912743, + "loss": 1.0594, + "step": 3416 + }, + { + "epoch": 0.12237004673482908, + "grad_norm": 1.8258986473083496, + "learning_rate": 0.00019555907123842902, + "loss": 1.1531, + "step": 3417 + }, + { + "epoch": 0.12240585886439737, + "grad_norm": 2.3163304328918457, + "learning_rate": 0.00019555565239995966, + "loss": 1.1349, + "step": 3418 + }, + { + "epoch": 0.12244167099396566, + "grad_norm": 1.7747559547424316, + "learning_rate": 0.00019555223227591225, + "loss": 1.2395, + "step": 3419 + }, + { + "epoch": 0.12247748312353395, + "grad_norm": 1.332226276397705, + "learning_rate": 0.0001955488108663328, + "loss": 1.2374, + "step": 3420 + }, + { + "epoch": 0.12251329525310223, + "grad_norm": 1.7024117708206177, + "learning_rate": 0.00019554538817126739, + "loss": 1.2033, + "step": 3421 + }, + { + "epoch": 0.12254910738267051, + "grad_norm": 1.6742013692855835, + "learning_rate": 0.000195541964190762, + "loss": 1.2592, + "step": 3422 + }, + { + "epoch": 0.1225849195122388, + "grad_norm": 2.019040107727051, + "learning_rate": 0.00019553853892486273, + "loss": 1.2353, + "step": 3423 + }, + { + "epoch": 0.12262073164180708, + "grad_norm": 1.465643286705017, + "learning_rate": 0.00019553511237361564, + "loss": 1.3148, + "step": 3424 + }, + { + "epoch": 0.12265654377137536, + "grad_norm": 2.7709405422210693, + "learning_rate": 0.00019553168453706685, + "loss": 1.1949, + "step": 3425 + }, + { + "epoch": 0.12269235590094366, + "grad_norm": 1.6717628240585327, + "learning_rate": 0.00019552825541526247, + "loss": 1.0839, + "step": 3426 + }, + { + "epoch": 0.12272816803051194, + "grad_norm": 1.4473333358764648, + "learning_rate": 0.00019552482500824865, + "loss": 1.3838, + "step": 3427 + }, + { + "epoch": 0.12276398016008022, + "grad_norm": 1.7233331203460693, + "learning_rate": 0.0001955213933160715, + "loss": 1.1389, + "step": 3428 + }, + { + "epoch": 0.1227997922896485, + "grad_norm": 2.0393800735473633, + "learning_rate": 0.00019551796033877726, + "loss": 0.9972, + "step": 3429 + }, + { + "epoch": 0.12283560441921679, + "grad_norm": 1.576380729675293, + "learning_rate": 0.00019551452607641205, + "loss": 1.3504, + "step": 3430 + }, + { + "epoch": 0.12287141654878507, + "grad_norm": 1.5997921228408813, + "learning_rate": 0.0001955110905290221, + "loss": 1.3985, + "step": 3431 + }, + { + "epoch": 0.12290722867835335, + "grad_norm": 1.703274130821228, + "learning_rate": 0.00019550765369665362, + "loss": 1.0618, + "step": 3432 + }, + { + "epoch": 0.12294304080792165, + "grad_norm": 1.6090327501296997, + "learning_rate": 0.00019550421557935286, + "loss": 1.2229, + "step": 3433 + }, + { + "epoch": 0.12297885293748993, + "grad_norm": 1.3286488056182861, + "learning_rate": 0.00019550077617716606, + "loss": 1.2173, + "step": 3434 + }, + { + "epoch": 0.12301466506705822, + "grad_norm": 1.3367928266525269, + "learning_rate": 0.00019549733549013954, + "loss": 1.266, + "step": 3435 + }, + { + "epoch": 0.1230504771966265, + "grad_norm": 1.4856594800949097, + "learning_rate": 0.0001954938935183195, + "loss": 1.2672, + "step": 3436 + }, + { + "epoch": 0.12308628932619478, + "grad_norm": 2.278393268585205, + "learning_rate": 0.00019549045026175232, + "loss": 1.2264, + "step": 3437 + }, + { + "epoch": 0.12312210145576306, + "grad_norm": 1.4539357423782349, + "learning_rate": 0.00019548700572048433, + "loss": 1.2002, + "step": 3438 + }, + { + "epoch": 0.12315791358533135, + "grad_norm": 1.6765538454055786, + "learning_rate": 0.00019548355989456182, + "loss": 1.1934, + "step": 3439 + }, + { + "epoch": 0.12319372571489964, + "grad_norm": 1.54740309715271, + "learning_rate": 0.0001954801127840312, + "loss": 1.1196, + "step": 3440 + }, + { + "epoch": 0.12322953784446793, + "grad_norm": 1.9623514413833618, + "learning_rate": 0.00019547666438893879, + "loss": 1.0752, + "step": 3441 + }, + { + "epoch": 0.12326534997403621, + "grad_norm": 1.6392574310302734, + "learning_rate": 0.00019547321470933103, + "loss": 1.1296, + "step": 3442 + }, + { + "epoch": 0.12330116210360449, + "grad_norm": 1.5002408027648926, + "learning_rate": 0.00019546976374525433, + "loss": 1.2583, + "step": 3443 + }, + { + "epoch": 0.12333697423317277, + "grad_norm": 1.3538259267807007, + "learning_rate": 0.0001954663114967551, + "loss": 0.9241, + "step": 3444 + }, + { + "epoch": 0.12337278636274106, + "grad_norm": 1.789853572845459, + "learning_rate": 0.0001954628579638798, + "loss": 0.8258, + "step": 3445 + }, + { + "epoch": 0.12340859849230934, + "grad_norm": 1.5795247554779053, + "learning_rate": 0.0001954594031466749, + "loss": 1.0578, + "step": 3446 + }, + { + "epoch": 0.12344441062187762, + "grad_norm": 1.3715051412582397, + "learning_rate": 0.00019545594704518682, + "loss": 1.3049, + "step": 3447 + }, + { + "epoch": 0.12348022275144592, + "grad_norm": 1.3695902824401855, + "learning_rate": 0.00019545248965946216, + "loss": 1.2747, + "step": 3448 + }, + { + "epoch": 0.1235160348810142, + "grad_norm": 1.9291332960128784, + "learning_rate": 0.00019544903098954732, + "loss": 1.1367, + "step": 3449 + }, + { + "epoch": 0.12355184701058249, + "grad_norm": 1.950408935546875, + "learning_rate": 0.0001954455710354889, + "loss": 1.1429, + "step": 3450 + }, + { + "epoch": 0.12358765914015077, + "grad_norm": 1.9268611669540405, + "learning_rate": 0.00019544210979733343, + "loss": 1.3323, + "step": 3451 + }, + { + "epoch": 0.12362347126971905, + "grad_norm": 1.5002727508544922, + "learning_rate": 0.0001954386472751275, + "loss": 1.4392, + "step": 3452 + }, + { + "epoch": 0.12365928339928733, + "grad_norm": 2.3010973930358887, + "learning_rate": 0.0001954351834689177, + "loss": 1.3664, + "step": 3453 + }, + { + "epoch": 0.12369509552885562, + "grad_norm": 2.8311336040496826, + "learning_rate": 0.0001954317183787506, + "loss": 1.3793, + "step": 3454 + }, + { + "epoch": 0.12373090765842391, + "grad_norm": 1.8716074228286743, + "learning_rate": 0.00019542825200467279, + "loss": 1.3133, + "step": 3455 + }, + { + "epoch": 0.1237667197879922, + "grad_norm": 1.2935175895690918, + "learning_rate": 0.00019542478434673096, + "loss": 1.2367, + "step": 3456 + }, + { + "epoch": 0.12380253191756048, + "grad_norm": 1.6644173860549927, + "learning_rate": 0.00019542131540497174, + "loss": 1.2141, + "step": 3457 + }, + { + "epoch": 0.12383834404712876, + "grad_norm": 1.7815769910812378, + "learning_rate": 0.00019541784517944182, + "loss": 1.3472, + "step": 3458 + }, + { + "epoch": 0.12387415617669705, + "grad_norm": 1.7410675287246704, + "learning_rate": 0.0001954143736701879, + "loss": 0.9717, + "step": 3459 + }, + { + "epoch": 0.12390996830626533, + "grad_norm": 1.559039831161499, + "learning_rate": 0.0001954109008772566, + "loss": 1.3285, + "step": 3460 + }, + { + "epoch": 0.12394578043583361, + "grad_norm": 1.7947685718536377, + "learning_rate": 0.00019540742680069473, + "loss": 1.2326, + "step": 3461 + }, + { + "epoch": 0.12398159256540191, + "grad_norm": 1.9499709606170654, + "learning_rate": 0.000195403951440549, + "loss": 1.2646, + "step": 3462 + }, + { + "epoch": 0.12401740469497019, + "grad_norm": 1.4210041761398315, + "learning_rate": 0.00019540047479686616, + "loss": 1.1889, + "step": 3463 + }, + { + "epoch": 0.12405321682453847, + "grad_norm": 1.3470717668533325, + "learning_rate": 0.00019539699686969302, + "loss": 1.0391, + "step": 3464 + }, + { + "epoch": 0.12408902895410676, + "grad_norm": 1.5703980922698975, + "learning_rate": 0.0001953935176590763, + "loss": 1.3745, + "step": 3465 + }, + { + "epoch": 0.12412484108367504, + "grad_norm": 1.6687908172607422, + "learning_rate": 0.00019539003716506287, + "loss": 1.2328, + "step": 3466 + }, + { + "epoch": 0.12416065321324332, + "grad_norm": 2.0336880683898926, + "learning_rate": 0.0001953865553876995, + "loss": 1.4604, + "step": 3467 + }, + { + "epoch": 0.1241964653428116, + "grad_norm": 1.856956958770752, + "learning_rate": 0.00019538307232703313, + "loss": 1.2152, + "step": 3468 + }, + { + "epoch": 0.1242322774723799, + "grad_norm": 1.367696762084961, + "learning_rate": 0.0001953795879831105, + "loss": 1.2228, + "step": 3469 + }, + { + "epoch": 0.12426808960194818, + "grad_norm": 1.5239533185958862, + "learning_rate": 0.00019537610235597857, + "loss": 1.2693, + "step": 3470 + }, + { + "epoch": 0.12430390173151647, + "grad_norm": 1.357938289642334, + "learning_rate": 0.00019537261544568421, + "loss": 1.0367, + "step": 3471 + }, + { + "epoch": 0.12433971386108475, + "grad_norm": 1.7828727960586548, + "learning_rate": 0.00019536912725227432, + "loss": 1.2741, + "step": 3472 + }, + { + "epoch": 0.12437552599065303, + "grad_norm": 1.404898762702942, + "learning_rate": 0.00019536563777579585, + "loss": 1.192, + "step": 3473 + }, + { + "epoch": 0.12441133812022132, + "grad_norm": 1.6117751598358154, + "learning_rate": 0.0001953621470162957, + "loss": 1.283, + "step": 3474 + }, + { + "epoch": 0.1244471502497896, + "grad_norm": 1.839181661605835, + "learning_rate": 0.00019535865497382094, + "loss": 1.0992, + "step": 3475 + }, + { + "epoch": 0.1244829623793579, + "grad_norm": 1.6550092697143555, + "learning_rate": 0.00019535516164841842, + "loss": 1.1866, + "step": 3476 + }, + { + "epoch": 0.12451877450892618, + "grad_norm": 1.5617272853851318, + "learning_rate": 0.00019535166704013522, + "loss": 1.1622, + "step": 3477 + }, + { + "epoch": 0.12455458663849446, + "grad_norm": 1.788174033164978, + "learning_rate": 0.00019534817114901833, + "loss": 1.289, + "step": 3478 + }, + { + "epoch": 0.12459039876806274, + "grad_norm": 1.6122461557388306, + "learning_rate": 0.0001953446739751148, + "loss": 1.2976, + "step": 3479 + }, + { + "epoch": 0.12462621089763103, + "grad_norm": 1.9981768131256104, + "learning_rate": 0.00019534117551847166, + "loss": 1.1888, + "step": 3480 + }, + { + "epoch": 0.12466202302719931, + "grad_norm": 1.588686227798462, + "learning_rate": 0.000195337675779136, + "loss": 1.3457, + "step": 3481 + }, + { + "epoch": 0.12469783515676759, + "grad_norm": 1.852001428604126, + "learning_rate": 0.00019533417475715487, + "loss": 1.1401, + "step": 3482 + }, + { + "epoch": 0.12473364728633589, + "grad_norm": 2.262359142303467, + "learning_rate": 0.0001953306724525754, + "loss": 1.2257, + "step": 3483 + }, + { + "epoch": 0.12476945941590417, + "grad_norm": 1.4519673585891724, + "learning_rate": 0.00019532716886544468, + "loss": 1.0113, + "step": 3484 + }, + { + "epoch": 0.12480527154547245, + "grad_norm": 2.2113444805145264, + "learning_rate": 0.0001953236639958099, + "loss": 1.2163, + "step": 3485 + }, + { + "epoch": 0.12484108367504074, + "grad_norm": 1.190718173980713, + "learning_rate": 0.00019532015784371818, + "loss": 1.3899, + "step": 3486 + }, + { + "epoch": 0.12487689580460902, + "grad_norm": 1.9034090042114258, + "learning_rate": 0.00019531665040921668, + "loss": 1.3298, + "step": 3487 + }, + { + "epoch": 0.1249127079341773, + "grad_norm": 1.5359928607940674, + "learning_rate": 0.00019531314169235259, + "loss": 0.9532, + "step": 3488 + }, + { + "epoch": 0.12494852006374559, + "grad_norm": 1.7498419284820557, + "learning_rate": 0.00019530963169317312, + "loss": 1.0156, + "step": 3489 + }, + { + "epoch": 0.12498433219331388, + "grad_norm": 1.939504861831665, + "learning_rate": 0.0001953061204117255, + "loss": 1.2668, + "step": 3490 + }, + { + "epoch": 0.12502014432288217, + "grad_norm": 1.8199338912963867, + "learning_rate": 0.00019530260784805697, + "loss": 1.039, + "step": 3491 + }, + { + "epoch": 0.12505595645245043, + "grad_norm": 1.4359776973724365, + "learning_rate": 0.00019529909400221475, + "loss": 0.9994, + "step": 3492 + }, + { + "epoch": 0.12509176858201873, + "grad_norm": 1.401432991027832, + "learning_rate": 0.00019529557887424618, + "loss": 1.3022, + "step": 3493 + }, + { + "epoch": 0.12512758071158703, + "grad_norm": 1.6837811470031738, + "learning_rate": 0.00019529206246419854, + "loss": 1.2176, + "step": 3494 + }, + { + "epoch": 0.1251633928411553, + "grad_norm": 1.4190561771392822, + "learning_rate": 0.00019528854477211908, + "loss": 1.0851, + "step": 3495 + }, + { + "epoch": 0.1251992049707236, + "grad_norm": 1.4431524276733398, + "learning_rate": 0.0001952850257980552, + "loss": 1.1807, + "step": 3496 + }, + { + "epoch": 0.12523501710029186, + "grad_norm": 1.2798283100128174, + "learning_rate": 0.00019528150554205419, + "loss": 1.1268, + "step": 3497 + }, + { + "epoch": 0.12527082922986016, + "grad_norm": 2.066631317138672, + "learning_rate": 0.00019527798400416338, + "loss": 1.2586, + "step": 3498 + }, + { + "epoch": 0.12530664135942843, + "grad_norm": 1.6284284591674805, + "learning_rate": 0.00019527446118443025, + "loss": 1.1559, + "step": 3499 + }, + { + "epoch": 0.12534245348899672, + "grad_norm": 1.464437484741211, + "learning_rate": 0.00019527093708290215, + "loss": 1.2859, + "step": 3500 + }, + { + "epoch": 0.12537826561856502, + "grad_norm": 1.4775131940841675, + "learning_rate": 0.00019526741169962643, + "loss": 1.3507, + "step": 3501 + }, + { + "epoch": 0.1254140777481333, + "grad_norm": 2.0801079273223877, + "learning_rate": 0.00019526388503465062, + "loss": 1.3007, + "step": 3502 + }, + { + "epoch": 0.1254498898777016, + "grad_norm": 1.7990822792053223, + "learning_rate": 0.00019526035708802207, + "loss": 1.0847, + "step": 3503 + }, + { + "epoch": 0.12548570200726986, + "grad_norm": 1.7157820463180542, + "learning_rate": 0.00019525682785978833, + "loss": 1.3349, + "step": 3504 + }, + { + "epoch": 0.12552151413683815, + "grad_norm": 1.6738520860671997, + "learning_rate": 0.00019525329734999683, + "loss": 1.3051, + "step": 3505 + }, + { + "epoch": 0.12555732626640642, + "grad_norm": 1.5837424993515015, + "learning_rate": 0.0001952497655586951, + "loss": 0.9879, + "step": 3506 + }, + { + "epoch": 0.12559313839597472, + "grad_norm": 1.57079017162323, + "learning_rate": 0.00019524623248593062, + "loss": 1.0661, + "step": 3507 + }, + { + "epoch": 0.12562895052554302, + "grad_norm": 1.4714502096176147, + "learning_rate": 0.00019524269813175096, + "loss": 1.0381, + "step": 3508 + }, + { + "epoch": 0.12566476265511128, + "grad_norm": 1.8120723962783813, + "learning_rate": 0.00019523916249620363, + "loss": 1.189, + "step": 3509 + }, + { + "epoch": 0.12570057478467958, + "grad_norm": 2.456559181213379, + "learning_rate": 0.0001952356255793362, + "loss": 1.2678, + "step": 3510 + }, + { + "epoch": 0.12573638691424785, + "grad_norm": 1.9166570901870728, + "learning_rate": 0.00019523208738119632, + "loss": 1.1487, + "step": 3511 + }, + { + "epoch": 0.12577219904381615, + "grad_norm": 2.3763582706451416, + "learning_rate": 0.00019522854790183152, + "loss": 1.223, + "step": 3512 + }, + { + "epoch": 0.12580801117338442, + "grad_norm": 1.4974249601364136, + "learning_rate": 0.00019522500714128942, + "loss": 1.1924, + "step": 3513 + }, + { + "epoch": 0.1258438233029527, + "grad_norm": 1.321512222290039, + "learning_rate": 0.0001952214650996177, + "loss": 1.0213, + "step": 3514 + }, + { + "epoch": 0.125879635432521, + "grad_norm": 2.002631902694702, + "learning_rate": 0.000195217921776864, + "loss": 1.2343, + "step": 3515 + }, + { + "epoch": 0.12591544756208928, + "grad_norm": 1.1275579929351807, + "learning_rate": 0.000195214377173076, + "loss": 1.1595, + "step": 3516 + }, + { + "epoch": 0.12595125969165757, + "grad_norm": 1.7928236722946167, + "learning_rate": 0.00019521083128830137, + "loss": 1.1209, + "step": 3517 + }, + { + "epoch": 0.12598707182122584, + "grad_norm": 1.3536807298660278, + "learning_rate": 0.0001952072841225878, + "loss": 1.3129, + "step": 3518 + }, + { + "epoch": 0.12602288395079414, + "grad_norm": 1.8541772365570068, + "learning_rate": 0.00019520373567598304, + "loss": 1.0367, + "step": 3519 + }, + { + "epoch": 0.1260586960803624, + "grad_norm": 2.0471673011779785, + "learning_rate": 0.0001952001859485348, + "loss": 1.3968, + "step": 3520 + }, + { + "epoch": 0.1260945082099307, + "grad_norm": 1.6680623292922974, + "learning_rate": 0.0001951966349402909, + "loss": 1.2075, + "step": 3521 + }, + { + "epoch": 0.12613032033949897, + "grad_norm": 1.7699054479599, + "learning_rate": 0.00019519308265129903, + "loss": 1.0567, + "step": 3522 + }, + { + "epoch": 0.12616613246906727, + "grad_norm": 1.5806000232696533, + "learning_rate": 0.00019518952908160705, + "loss": 1.2582, + "step": 3523 + }, + { + "epoch": 0.12620194459863557, + "grad_norm": 1.9963690042495728, + "learning_rate": 0.00019518597423126273, + "loss": 1.2361, + "step": 3524 + }, + { + "epoch": 0.12623775672820384, + "grad_norm": 1.9601541757583618, + "learning_rate": 0.0001951824181003139, + "loss": 1.0571, + "step": 3525 + }, + { + "epoch": 0.12627356885777213, + "grad_norm": 1.4461758136749268, + "learning_rate": 0.00019517886068880843, + "loss": 1.1878, + "step": 3526 + }, + { + "epoch": 0.1263093809873404, + "grad_norm": 1.8595314025878906, + "learning_rate": 0.00019517530199679415, + "loss": 1.1957, + "step": 3527 + }, + { + "epoch": 0.1263451931169087, + "grad_norm": 1.6223304271697998, + "learning_rate": 0.00019517174202431895, + "loss": 1.0704, + "step": 3528 + }, + { + "epoch": 0.12638100524647697, + "grad_norm": 1.3445109128952026, + "learning_rate": 0.00019516818077143071, + "loss": 1.195, + "step": 3529 + }, + { + "epoch": 0.12641681737604527, + "grad_norm": 2.0814692974090576, + "learning_rate": 0.00019516461823817737, + "loss": 1.0631, + "step": 3530 + }, + { + "epoch": 0.12645262950561356, + "grad_norm": 2.07169508934021, + "learning_rate": 0.00019516105442460684, + "loss": 1.0647, + "step": 3531 + }, + { + "epoch": 0.12648844163518183, + "grad_norm": 1.8019351959228516, + "learning_rate": 0.0001951574893307671, + "loss": 1.4113, + "step": 3532 + }, + { + "epoch": 0.12652425376475013, + "grad_norm": 1.6039155721664429, + "learning_rate": 0.00019515392295670604, + "loss": 1.197, + "step": 3533 + }, + { + "epoch": 0.1265600658943184, + "grad_norm": 1.177172303199768, + "learning_rate": 0.00019515035530247172, + "loss": 1.1001, + "step": 3534 + }, + { + "epoch": 0.1265958780238867, + "grad_norm": 1.3965284824371338, + "learning_rate": 0.0001951467863681121, + "loss": 1.2269, + "step": 3535 + }, + { + "epoch": 0.12663169015345496, + "grad_norm": 1.6613924503326416, + "learning_rate": 0.00019514321615367517, + "loss": 1.3946, + "step": 3536 + }, + { + "epoch": 0.12666750228302326, + "grad_norm": 1.3929816484451294, + "learning_rate": 0.000195139644659209, + "loss": 1.1687, + "step": 3537 + }, + { + "epoch": 0.12670331441259156, + "grad_norm": 1.7529644966125488, + "learning_rate": 0.00019513607188476168, + "loss": 1.1662, + "step": 3538 + }, + { + "epoch": 0.12673912654215982, + "grad_norm": 1.537915587425232, + "learning_rate": 0.00019513249783038118, + "loss": 1.2338, + "step": 3539 + }, + { + "epoch": 0.12677493867172812, + "grad_norm": 2.0685970783233643, + "learning_rate": 0.00019512892249611566, + "loss": 1.4346, + "step": 3540 + }, + { + "epoch": 0.1268107508012964, + "grad_norm": 2.063776969909668, + "learning_rate": 0.00019512534588201318, + "loss": 1.1964, + "step": 3541 + }, + { + "epoch": 0.1268465629308647, + "grad_norm": 1.9367022514343262, + "learning_rate": 0.00019512176798812189, + "loss": 1.2047, + "step": 3542 + }, + { + "epoch": 0.12688237506043296, + "grad_norm": 1.5148990154266357, + "learning_rate": 0.0001951181888144899, + "loss": 1.3424, + "step": 3543 + }, + { + "epoch": 0.12691818719000125, + "grad_norm": 1.5290327072143555, + "learning_rate": 0.00019511460836116537, + "loss": 1.3588, + "step": 3544 + }, + { + "epoch": 0.12695399931956955, + "grad_norm": 1.4554870128631592, + "learning_rate": 0.00019511102662819648, + "loss": 1.2028, + "step": 3545 + }, + { + "epoch": 0.12698981144913782, + "grad_norm": 2.8370962142944336, + "learning_rate": 0.0001951074436156314, + "loss": 1.2018, + "step": 3546 + }, + { + "epoch": 0.12702562357870611, + "grad_norm": 1.877184271812439, + "learning_rate": 0.00019510385932351837, + "loss": 1.1329, + "step": 3547 + }, + { + "epoch": 0.12706143570827438, + "grad_norm": 1.918304443359375, + "learning_rate": 0.00019510027375190556, + "loss": 1.3423, + "step": 3548 + }, + { + "epoch": 0.12709724783784268, + "grad_norm": 1.5726805925369263, + "learning_rate": 0.00019509668690084126, + "loss": 1.0819, + "step": 3549 + }, + { + "epoch": 0.12713305996741095, + "grad_norm": 1.6257178783416748, + "learning_rate": 0.00019509309877037369, + "loss": 1.0644, + "step": 3550 + }, + { + "epoch": 0.12716887209697925, + "grad_norm": 1.979759693145752, + "learning_rate": 0.00019508950936055115, + "loss": 1.2978, + "step": 3551 + }, + { + "epoch": 0.12720468422654754, + "grad_norm": 2.1221508979797363, + "learning_rate": 0.0001950859186714219, + "loss": 1.1915, + "step": 3552 + }, + { + "epoch": 0.1272404963561158, + "grad_norm": 1.450005054473877, + "learning_rate": 0.00019508232670303427, + "loss": 1.3147, + "step": 3553 + }, + { + "epoch": 0.1272763084856841, + "grad_norm": 1.5809009075164795, + "learning_rate": 0.00019507873345543658, + "loss": 1.1334, + "step": 3554 + }, + { + "epoch": 0.12731212061525238, + "grad_norm": 1.2394202947616577, + "learning_rate": 0.00019507513892867717, + "loss": 1.1822, + "step": 3555 + }, + { + "epoch": 0.12734793274482067, + "grad_norm": 1.672654628753662, + "learning_rate": 0.0001950715431228044, + "loss": 1.1077, + "step": 3556 + }, + { + "epoch": 0.12738374487438894, + "grad_norm": 2.0596015453338623, + "learning_rate": 0.0001950679460378667, + "loss": 1.3849, + "step": 3557 + }, + { + "epoch": 0.12741955700395724, + "grad_norm": 1.7857680320739746, + "learning_rate": 0.00019506434767391237, + "loss": 1.2593, + "step": 3558 + }, + { + "epoch": 0.12745536913352554, + "grad_norm": 1.6719520092010498, + "learning_rate": 0.00019506074803098987, + "loss": 1.4514, + "step": 3559 + }, + { + "epoch": 0.1274911812630938, + "grad_norm": 1.3895751237869263, + "learning_rate": 0.00019505714710914764, + "loss": 1.2182, + "step": 3560 + }, + { + "epoch": 0.1275269933926621, + "grad_norm": 2.51377534866333, + "learning_rate": 0.0001950535449084341, + "loss": 1.2603, + "step": 3561 + }, + { + "epoch": 0.12756280552223037, + "grad_norm": 1.6601526737213135, + "learning_rate": 0.0001950499414288977, + "loss": 1.0256, + "step": 3562 + }, + { + "epoch": 0.12759861765179867, + "grad_norm": 1.5013519525527954, + "learning_rate": 0.000195046336670587, + "loss": 1.1857, + "step": 3563 + }, + { + "epoch": 0.12763442978136694, + "grad_norm": 1.4431736469268799, + "learning_rate": 0.0001950427306335504, + "loss": 1.3928, + "step": 3564 + }, + { + "epoch": 0.12767024191093523, + "grad_norm": 1.2432829141616821, + "learning_rate": 0.00019503912331783648, + "loss": 1.1389, + "step": 3565 + }, + { + "epoch": 0.12770605404050353, + "grad_norm": 2.1057517528533936, + "learning_rate": 0.00019503551472349373, + "loss": 1.1055, + "step": 3566 + }, + { + "epoch": 0.1277418661700718, + "grad_norm": 1.5741525888442993, + "learning_rate": 0.0001950319048505707, + "loss": 1.0012, + "step": 3567 + }, + { + "epoch": 0.1277776782996401, + "grad_norm": 1.3043478727340698, + "learning_rate": 0.000195028293699116, + "loss": 1.1442, + "step": 3568 + }, + { + "epoch": 0.12781349042920837, + "grad_norm": 1.9013718366622925, + "learning_rate": 0.0001950246812691782, + "loss": 1.2805, + "step": 3569 + }, + { + "epoch": 0.12784930255877666, + "grad_norm": 2.272308111190796, + "learning_rate": 0.00019502106756080583, + "loss": 1.4782, + "step": 3570 + }, + { + "epoch": 0.12788511468834493, + "grad_norm": 2.4406046867370605, + "learning_rate": 0.00019501745257404762, + "loss": 1.2798, + "step": 3571 + }, + { + "epoch": 0.12792092681791323, + "grad_norm": 2.1572515964508057, + "learning_rate": 0.00019501383630895211, + "loss": 1.2379, + "step": 3572 + }, + { + "epoch": 0.12795673894748152, + "grad_norm": 1.107816457748413, + "learning_rate": 0.00019501021876556802, + "loss": 1.2639, + "step": 3573 + }, + { + "epoch": 0.1279925510770498, + "grad_norm": 1.5499293804168701, + "learning_rate": 0.00019500659994394398, + "loss": 1.1379, + "step": 3574 + }, + { + "epoch": 0.1280283632066181, + "grad_norm": 2.3614585399627686, + "learning_rate": 0.0001950029798441287, + "loss": 1.2651, + "step": 3575 + }, + { + "epoch": 0.12806417533618636, + "grad_norm": 1.5625566244125366, + "learning_rate": 0.00019499935846617084, + "loss": 1.3991, + "step": 3576 + }, + { + "epoch": 0.12809998746575466, + "grad_norm": 1.6808158159255981, + "learning_rate": 0.0001949957358101192, + "loss": 1.1484, + "step": 3577 + }, + { + "epoch": 0.12813579959532292, + "grad_norm": 1.4027035236358643, + "learning_rate": 0.00019499211187602242, + "loss": 1.2064, + "step": 3578 + }, + { + "epoch": 0.12817161172489122, + "grad_norm": 1.5302544832229614, + "learning_rate": 0.0001949884866639293, + "loss": 1.4244, + "step": 3579 + }, + { + "epoch": 0.12820742385445952, + "grad_norm": 1.2798593044281006, + "learning_rate": 0.00019498486017388865, + "loss": 1.2127, + "step": 3580 + }, + { + "epoch": 0.1282432359840278, + "grad_norm": 2.7895333766937256, + "learning_rate": 0.00019498123240594924, + "loss": 1.0757, + "step": 3581 + }, + { + "epoch": 0.12827904811359608, + "grad_norm": 2.149473190307617, + "learning_rate": 0.00019497760336015984, + "loss": 1.2193, + "step": 3582 + }, + { + "epoch": 0.12831486024316435, + "grad_norm": 1.4893298149108887, + "learning_rate": 0.0001949739730365693, + "loss": 1.3201, + "step": 3583 + }, + { + "epoch": 0.12835067237273265, + "grad_norm": 1.3671380281448364, + "learning_rate": 0.0001949703414352265, + "loss": 1.2155, + "step": 3584 + }, + { + "epoch": 0.12838648450230092, + "grad_norm": 1.8234277963638306, + "learning_rate": 0.0001949667085561802, + "loss": 1.2965, + "step": 3585 + }, + { + "epoch": 0.12842229663186921, + "grad_norm": 2.001923084259033, + "learning_rate": 0.00019496307439947937, + "loss": 1.2682, + "step": 3586 + }, + { + "epoch": 0.1284581087614375, + "grad_norm": 1.5110528469085693, + "learning_rate": 0.00019495943896517286, + "loss": 1.0057, + "step": 3587 + }, + { + "epoch": 0.12849392089100578, + "grad_norm": 1.880549430847168, + "learning_rate": 0.0001949558022533096, + "loss": 1.103, + "step": 3588 + }, + { + "epoch": 0.12852973302057408, + "grad_norm": 1.5238534212112427, + "learning_rate": 0.00019495216426393847, + "loss": 1.2737, + "step": 3589 + }, + { + "epoch": 0.12856554515014235, + "grad_norm": 1.4976166486740112, + "learning_rate": 0.0001949485249971085, + "loss": 1.1073, + "step": 3590 + }, + { + "epoch": 0.12860135727971064, + "grad_norm": 1.509246826171875, + "learning_rate": 0.00019494488445286856, + "loss": 1.0663, + "step": 3591 + }, + { + "epoch": 0.1286371694092789, + "grad_norm": 1.793238878250122, + "learning_rate": 0.00019494124263126766, + "loss": 1.1264, + "step": 3592 + }, + { + "epoch": 0.1286729815388472, + "grad_norm": 1.820534348487854, + "learning_rate": 0.00019493759953235484, + "loss": 1.2683, + "step": 3593 + }, + { + "epoch": 0.1287087936684155, + "grad_norm": 1.8486450910568237, + "learning_rate": 0.00019493395515617908, + "loss": 1.21, + "step": 3594 + }, + { + "epoch": 0.12874460579798377, + "grad_norm": 1.6037187576293945, + "learning_rate": 0.00019493030950278937, + "loss": 1.2285, + "step": 3595 + }, + { + "epoch": 0.12878041792755207, + "grad_norm": 1.5011444091796875, + "learning_rate": 0.00019492666257223484, + "loss": 1.3685, + "step": 3596 + }, + { + "epoch": 0.12881623005712034, + "grad_norm": 1.4984527826309204, + "learning_rate": 0.00019492301436456447, + "loss": 1.2781, + "step": 3597 + }, + { + "epoch": 0.12885204218668864, + "grad_norm": 1.8072823286056519, + "learning_rate": 0.00019491936487982744, + "loss": 1.2121, + "step": 3598 + }, + { + "epoch": 0.1288878543162569, + "grad_norm": 1.4723687171936035, + "learning_rate": 0.00019491571411807274, + "loss": 1.2135, + "step": 3599 + }, + { + "epoch": 0.1289236664458252, + "grad_norm": 2.432111978530884, + "learning_rate": 0.00019491206207934955, + "loss": 1.3906, + "step": 3600 + }, + { + "epoch": 0.1289594785753935, + "grad_norm": 1.7052611112594604, + "learning_rate": 0.00019490840876370703, + "loss": 1.0414, + "step": 3601 + }, + { + "epoch": 0.12899529070496177, + "grad_norm": 1.4951622486114502, + "learning_rate": 0.00019490475417119425, + "loss": 1.2504, + "step": 3602 + }, + { + "epoch": 0.12903110283453006, + "grad_norm": 1.8248775005340576, + "learning_rate": 0.00019490109830186042, + "loss": 1.214, + "step": 3603 + }, + { + "epoch": 0.12906691496409833, + "grad_norm": 1.2666187286376953, + "learning_rate": 0.00019489744115575475, + "loss": 1.1047, + "step": 3604 + }, + { + "epoch": 0.12910272709366663, + "grad_norm": 1.2961376905441284, + "learning_rate": 0.00019489378273292643, + "loss": 1.1309, + "step": 3605 + }, + { + "epoch": 0.1291385392232349, + "grad_norm": 1.3812462091445923, + "learning_rate": 0.00019489012303342462, + "loss": 1.287, + "step": 3606 + }, + { + "epoch": 0.1291743513528032, + "grad_norm": 1.9205892086029053, + "learning_rate": 0.00019488646205729864, + "loss": 1.154, + "step": 3607 + }, + { + "epoch": 0.1292101634823715, + "grad_norm": 1.817407488822937, + "learning_rate": 0.00019488279980459772, + "loss": 1.2415, + "step": 3608 + }, + { + "epoch": 0.12924597561193976, + "grad_norm": 1.4975507259368896, + "learning_rate": 0.00019487913627537108, + "loss": 1.3901, + "step": 3609 + }, + { + "epoch": 0.12928178774150806, + "grad_norm": 1.7680950164794922, + "learning_rate": 0.00019487547146966808, + "loss": 1.1767, + "step": 3610 + }, + { + "epoch": 0.12931759987107633, + "grad_norm": 2.185807704925537, + "learning_rate": 0.00019487180538753796, + "loss": 1.3031, + "step": 3611 + }, + { + "epoch": 0.12935341200064462, + "grad_norm": 1.8627290725708008, + "learning_rate": 0.0001948681380290301, + "loss": 1.3076, + "step": 3612 + }, + { + "epoch": 0.1293892241302129, + "grad_norm": 1.7168893814086914, + "learning_rate": 0.0001948644693941938, + "loss": 1.076, + "step": 3613 + }, + { + "epoch": 0.1294250362597812, + "grad_norm": 2.261016368865967, + "learning_rate": 0.00019486079948307844, + "loss": 1.1659, + "step": 3614 + }, + { + "epoch": 0.12946084838934946, + "grad_norm": 1.5947039127349854, + "learning_rate": 0.00019485712829573338, + "loss": 1.2626, + "step": 3615 + }, + { + "epoch": 0.12949666051891776, + "grad_norm": 1.7880897521972656, + "learning_rate": 0.000194853455832208, + "loss": 1.1298, + "step": 3616 + }, + { + "epoch": 0.12953247264848605, + "grad_norm": 1.3216421604156494, + "learning_rate": 0.00019484978209255175, + "loss": 0.9229, + "step": 3617 + }, + { + "epoch": 0.12956828477805432, + "grad_norm": 1.6742706298828125, + "learning_rate": 0.00019484610707681403, + "loss": 1.1702, + "step": 3618 + }, + { + "epoch": 0.12960409690762262, + "grad_norm": 1.7938730716705322, + "learning_rate": 0.00019484243078504428, + "loss": 1.2079, + "step": 3619 + }, + { + "epoch": 0.1296399090371909, + "grad_norm": 1.4435184001922607, + "learning_rate": 0.00019483875321729194, + "loss": 1.273, + "step": 3620 + }, + { + "epoch": 0.12967572116675918, + "grad_norm": 1.3861416578292847, + "learning_rate": 0.00019483507437360653, + "loss": 1.2656, + "step": 3621 + }, + { + "epoch": 0.12971153329632745, + "grad_norm": 1.7689940929412842, + "learning_rate": 0.0001948313942540375, + "loss": 1.2364, + "step": 3622 + }, + { + "epoch": 0.12974734542589575, + "grad_norm": 1.9835349321365356, + "learning_rate": 0.00019482771285863438, + "loss": 1.108, + "step": 3623 + }, + { + "epoch": 0.12978315755546405, + "grad_norm": 2.029775619506836, + "learning_rate": 0.00019482403018744674, + "loss": 1.2276, + "step": 3624 + }, + { + "epoch": 0.12981896968503231, + "grad_norm": 1.6460047960281372, + "learning_rate": 0.00019482034624052408, + "loss": 1.5105, + "step": 3625 + }, + { + "epoch": 0.1298547818146006, + "grad_norm": 1.6599452495574951, + "learning_rate": 0.00019481666101791594, + "loss": 1.1468, + "step": 3626 + }, + { + "epoch": 0.12989059394416888, + "grad_norm": 1.6492691040039062, + "learning_rate": 0.00019481297451967195, + "loss": 1.3804, + "step": 3627 + }, + { + "epoch": 0.12992640607373718, + "grad_norm": 1.6432744264602661, + "learning_rate": 0.0001948092867458417, + "loss": 0.9755, + "step": 3628 + }, + { + "epoch": 0.12996221820330545, + "grad_norm": 1.6829620599746704, + "learning_rate": 0.00019480559769647477, + "loss": 1.3532, + "step": 3629 + }, + { + "epoch": 0.12999803033287374, + "grad_norm": 1.383571743965149, + "learning_rate": 0.00019480190737162083, + "loss": 1.1729, + "step": 3630 + }, + { + "epoch": 0.13003384246244204, + "grad_norm": 1.799089789390564, + "learning_rate": 0.0001947982157713295, + "loss": 1.186, + "step": 3631 + }, + { + "epoch": 0.1300696545920103, + "grad_norm": 1.5398815870285034, + "learning_rate": 0.00019479452289565048, + "loss": 1.4063, + "step": 3632 + }, + { + "epoch": 0.1301054667215786, + "grad_norm": 1.3064801692962646, + "learning_rate": 0.00019479082874463338, + "loss": 1.1823, + "step": 3633 + }, + { + "epoch": 0.13014127885114687, + "grad_norm": 1.5321348905563354, + "learning_rate": 0.000194787133318328, + "loss": 1.3126, + "step": 3634 + }, + { + "epoch": 0.13017709098071517, + "grad_norm": 1.7304966449737549, + "learning_rate": 0.000194783436616784, + "loss": 1.0671, + "step": 3635 + }, + { + "epoch": 0.13021290311028344, + "grad_norm": 1.2683557271957397, + "learning_rate": 0.00019477973864005113, + "loss": 1.4004, + "step": 3636 + }, + { + "epoch": 0.13024871523985174, + "grad_norm": 1.6889400482177734, + "learning_rate": 0.0001947760393881791, + "loss": 1.0654, + "step": 3637 + }, + { + "epoch": 0.13028452736942003, + "grad_norm": 2.0068233013153076, + "learning_rate": 0.00019477233886121772, + "loss": 1.2427, + "step": 3638 + }, + { + "epoch": 0.1303203394989883, + "grad_norm": 1.4096657037734985, + "learning_rate": 0.00019476863705921677, + "loss": 1.2452, + "step": 3639 + }, + { + "epoch": 0.1303561516285566, + "grad_norm": 1.531909465789795, + "learning_rate": 0.00019476493398222608, + "loss": 1.1573, + "step": 3640 + }, + { + "epoch": 0.13039196375812487, + "grad_norm": 2.134215831756592, + "learning_rate": 0.0001947612296302954, + "loss": 0.9241, + "step": 3641 + }, + { + "epoch": 0.13042777588769316, + "grad_norm": 1.3249017000198364, + "learning_rate": 0.00019475752400347464, + "loss": 1.2229, + "step": 3642 + }, + { + "epoch": 0.13046358801726143, + "grad_norm": 1.5542323589324951, + "learning_rate": 0.00019475381710181363, + "loss": 0.9441, + "step": 3643 + }, + { + "epoch": 0.13049940014682973, + "grad_norm": 2.1152868270874023, + "learning_rate": 0.0001947501089253622, + "loss": 1.4603, + "step": 3644 + }, + { + "epoch": 0.13053521227639803, + "grad_norm": 1.602221131324768, + "learning_rate": 0.00019474639947417028, + "loss": 1.1164, + "step": 3645 + }, + { + "epoch": 0.1305710244059663, + "grad_norm": 1.9623887538909912, + "learning_rate": 0.0001947426887482878, + "loss": 1.1741, + "step": 3646 + }, + { + "epoch": 0.1306068365355346, + "grad_norm": 1.348570704460144, + "learning_rate": 0.0001947389767477646, + "loss": 1.2709, + "step": 3647 + }, + { + "epoch": 0.13064264866510286, + "grad_norm": 1.5085698366165161, + "learning_rate": 0.00019473526347265073, + "loss": 1.2166, + "step": 3648 + }, + { + "epoch": 0.13067846079467116, + "grad_norm": 1.8209490776062012, + "learning_rate": 0.00019473154892299608, + "loss": 1.3987, + "step": 3649 + }, + { + "epoch": 0.13071427292423943, + "grad_norm": 2.2363462448120117, + "learning_rate": 0.00019472783309885057, + "loss": 1.5035, + "step": 3650 + }, + { + "epoch": 0.13075008505380772, + "grad_norm": 1.9192228317260742, + "learning_rate": 0.0001947241160002643, + "loss": 1.2966, + "step": 3651 + }, + { + "epoch": 0.13078589718337602, + "grad_norm": 1.6753249168395996, + "learning_rate": 0.00019472039762728728, + "loss": 1.2093, + "step": 3652 + }, + { + "epoch": 0.1308217093129443, + "grad_norm": 1.6823550462722778, + "learning_rate": 0.00019471667797996944, + "loss": 1.1158, + "step": 3653 + }, + { + "epoch": 0.1308575214425126, + "grad_norm": 1.818329095840454, + "learning_rate": 0.00019471295705836088, + "loss": 1.1239, + "step": 3654 + }, + { + "epoch": 0.13089333357208086, + "grad_norm": 1.7063738107681274, + "learning_rate": 0.00019470923486251165, + "loss": 1.2254, + "step": 3655 + }, + { + "epoch": 0.13092914570164915, + "grad_norm": 2.025486946105957, + "learning_rate": 0.00019470551139247184, + "loss": 1.3572, + "step": 3656 + }, + { + "epoch": 0.13096495783121742, + "grad_norm": 1.6066782474517822, + "learning_rate": 0.00019470178664829154, + "loss": 1.2985, + "step": 3657 + }, + { + "epoch": 0.13100076996078572, + "grad_norm": 1.2578856945037842, + "learning_rate": 0.00019469806063002082, + "loss": 1.1889, + "step": 3658 + }, + { + "epoch": 0.13103658209035401, + "grad_norm": 3.0243449211120605, + "learning_rate": 0.0001946943333377099, + "loss": 1.2412, + "step": 3659 + }, + { + "epoch": 0.13107239421992228, + "grad_norm": 1.5060319900512695, + "learning_rate": 0.00019469060477140886, + "loss": 1.1448, + "step": 3660 + }, + { + "epoch": 0.13110820634949058, + "grad_norm": 1.507739782333374, + "learning_rate": 0.00019468687493116784, + "loss": 1.2259, + "step": 3661 + }, + { + "epoch": 0.13114401847905885, + "grad_norm": 2.0738353729248047, + "learning_rate": 0.00019468314381703708, + "loss": 1.3801, + "step": 3662 + }, + { + "epoch": 0.13117983060862715, + "grad_norm": 2.234265089035034, + "learning_rate": 0.00019467941142906674, + "loss": 1.272, + "step": 3663 + }, + { + "epoch": 0.13121564273819541, + "grad_norm": 1.4135339260101318, + "learning_rate": 0.00019467567776730707, + "loss": 1.2276, + "step": 3664 + }, + { + "epoch": 0.1312514548677637, + "grad_norm": 1.4434585571289062, + "learning_rate": 0.00019467194283180828, + "loss": 1.2056, + "step": 3665 + }, + { + "epoch": 0.131287266997332, + "grad_norm": 1.246997594833374, + "learning_rate": 0.0001946682066226206, + "loss": 1.1962, + "step": 3666 + }, + { + "epoch": 0.13132307912690028, + "grad_norm": 2.034248113632202, + "learning_rate": 0.0001946644691397943, + "loss": 1.1414, + "step": 3667 + }, + { + "epoch": 0.13135889125646857, + "grad_norm": 1.3468559980392456, + "learning_rate": 0.00019466073038337968, + "loss": 1.0008, + "step": 3668 + }, + { + "epoch": 0.13139470338603684, + "grad_norm": 1.6077325344085693, + "learning_rate": 0.00019465699035342706, + "loss": 1.202, + "step": 3669 + }, + { + "epoch": 0.13143051551560514, + "grad_norm": 1.5491340160369873, + "learning_rate": 0.00019465324904998672, + "loss": 1.2314, + "step": 3670 + }, + { + "epoch": 0.1314663276451734, + "grad_norm": 1.5736229419708252, + "learning_rate": 0.000194649506473109, + "loss": 1.2284, + "step": 3671 + }, + { + "epoch": 0.1315021397747417, + "grad_norm": 1.4649828672409058, + "learning_rate": 0.00019464576262284426, + "loss": 1.0924, + "step": 3672 + }, + { + "epoch": 0.13153795190431, + "grad_norm": 1.3102171421051025, + "learning_rate": 0.00019464201749924288, + "loss": 1.2653, + "step": 3673 + }, + { + "epoch": 0.13157376403387827, + "grad_norm": 1.837370753288269, + "learning_rate": 0.00019463827110235523, + "loss": 1.2135, + "step": 3674 + }, + { + "epoch": 0.13160957616344657, + "grad_norm": 2.6161046028137207, + "learning_rate": 0.00019463452343223173, + "loss": 1.3549, + "step": 3675 + }, + { + "epoch": 0.13164538829301484, + "grad_norm": 1.816907525062561, + "learning_rate": 0.00019463077448892278, + "loss": 1.0384, + "step": 3676 + }, + { + "epoch": 0.13168120042258313, + "grad_norm": 2.0349555015563965, + "learning_rate": 0.0001946270242724788, + "loss": 1.122, + "step": 3677 + }, + { + "epoch": 0.1317170125521514, + "grad_norm": 1.8596420288085938, + "learning_rate": 0.0001946232727829503, + "loss": 1.3245, + "step": 3678 + }, + { + "epoch": 0.1317528246817197, + "grad_norm": 2.6293697357177734, + "learning_rate": 0.00019461952002038771, + "loss": 1.3099, + "step": 3679 + }, + { + "epoch": 0.131788636811288, + "grad_norm": 1.9295055866241455, + "learning_rate": 0.0001946157659848415, + "loss": 1.5713, + "step": 3680 + }, + { + "epoch": 0.13182444894085626, + "grad_norm": 1.5618054866790771, + "learning_rate": 0.00019461201067636226, + "loss": 1.3336, + "step": 3681 + }, + { + "epoch": 0.13186026107042456, + "grad_norm": 1.9470295906066895, + "learning_rate": 0.00019460825409500042, + "loss": 1.3854, + "step": 3682 + }, + { + "epoch": 0.13189607319999283, + "grad_norm": 1.6477071046829224, + "learning_rate": 0.00019460449624080655, + "loss": 1.2405, + "step": 3683 + }, + { + "epoch": 0.13193188532956113, + "grad_norm": 2.1498258113861084, + "learning_rate": 0.00019460073711383125, + "loss": 1.1612, + "step": 3684 + }, + { + "epoch": 0.1319676974591294, + "grad_norm": 1.4369322061538696, + "learning_rate": 0.00019459697671412503, + "loss": 1.2698, + "step": 3685 + }, + { + "epoch": 0.1320035095886977, + "grad_norm": 1.589257836341858, + "learning_rate": 0.0001945932150417385, + "loss": 1.1009, + "step": 3686 + }, + { + "epoch": 0.132039321718266, + "grad_norm": 1.939494013786316, + "learning_rate": 0.0001945894520967223, + "loss": 1.3882, + "step": 3687 + }, + { + "epoch": 0.13207513384783426, + "grad_norm": 1.711140751838684, + "learning_rate": 0.00019458568787912703, + "loss": 1.1276, + "step": 3688 + }, + { + "epoch": 0.13211094597740256, + "grad_norm": 1.5815783739089966, + "learning_rate": 0.00019458192238900335, + "loss": 1.1094, + "step": 3689 + }, + { + "epoch": 0.13214675810697082, + "grad_norm": 1.5982741117477417, + "learning_rate": 0.00019457815562640187, + "loss": 0.9765, + "step": 3690 + }, + { + "epoch": 0.13218257023653912, + "grad_norm": 1.4955016374588013, + "learning_rate": 0.00019457438759137334, + "loss": 1.076, + "step": 3691 + }, + { + "epoch": 0.1322183823661074, + "grad_norm": 1.6879651546478271, + "learning_rate": 0.00019457061828396838, + "loss": 1.1235, + "step": 3692 + }, + { + "epoch": 0.1322541944956757, + "grad_norm": 1.684508204460144, + "learning_rate": 0.00019456684770423777, + "loss": 1.2787, + "step": 3693 + }, + { + "epoch": 0.13229000662524398, + "grad_norm": 2.2536675930023193, + "learning_rate": 0.00019456307585223218, + "loss": 1.1691, + "step": 3694 + }, + { + "epoch": 0.13232581875481225, + "grad_norm": 1.5004597902297974, + "learning_rate": 0.00019455930272800243, + "loss": 1.1929, + "step": 3695 + }, + { + "epoch": 0.13236163088438055, + "grad_norm": 1.3809250593185425, + "learning_rate": 0.00019455552833159918, + "loss": 1.2884, + "step": 3696 + }, + { + "epoch": 0.13239744301394882, + "grad_norm": 1.9219982624053955, + "learning_rate": 0.00019455175266307328, + "loss": 1.1682, + "step": 3697 + }, + { + "epoch": 0.13243325514351711, + "grad_norm": 1.2098259925842285, + "learning_rate": 0.00019454797572247552, + "loss": 1.246, + "step": 3698 + }, + { + "epoch": 0.13246906727308538, + "grad_norm": 1.508534550666809, + "learning_rate": 0.0001945441975098567, + "loss": 1.2474, + "step": 3699 + }, + { + "epoch": 0.13250487940265368, + "grad_norm": 1.9769538640975952, + "learning_rate": 0.00019454041802526766, + "loss": 1.3661, + "step": 3700 + }, + { + "epoch": 0.13254069153222198, + "grad_norm": 1.662620186805725, + "learning_rate": 0.00019453663726875923, + "loss": 1.1982, + "step": 3701 + }, + { + "epoch": 0.13257650366179025, + "grad_norm": 1.4894832372665405, + "learning_rate": 0.0001945328552403823, + "loss": 1.2571, + "step": 3702 + }, + { + "epoch": 0.13261231579135854, + "grad_norm": 1.8537240028381348, + "learning_rate": 0.00019452907194018776, + "loss": 1.1966, + "step": 3703 + }, + { + "epoch": 0.1326481279209268, + "grad_norm": 1.5245468616485596, + "learning_rate": 0.00019452528736822646, + "loss": 1.2687, + "step": 3704 + }, + { + "epoch": 0.1326839400504951, + "grad_norm": 1.5627422332763672, + "learning_rate": 0.00019452150152454936, + "loss": 1.5026, + "step": 3705 + }, + { + "epoch": 0.13271975218006338, + "grad_norm": 1.752436637878418, + "learning_rate": 0.0001945177144092074, + "loss": 1.0579, + "step": 3706 + }, + { + "epoch": 0.13275556430963167, + "grad_norm": 2.2014055252075195, + "learning_rate": 0.0001945139260222515, + "loss": 1.1696, + "step": 3707 + }, + { + "epoch": 0.13279137643919997, + "grad_norm": 2.1447389125823975, + "learning_rate": 0.00019451013636373262, + "loss": 1.2162, + "step": 3708 + }, + { + "epoch": 0.13282718856876824, + "grad_norm": 1.4673101902008057, + "learning_rate": 0.00019450634543370177, + "loss": 1.2435, + "step": 3709 + }, + { + "epoch": 0.13286300069833654, + "grad_norm": 1.9529545307159424, + "learning_rate": 0.00019450255323220995, + "loss": 1.1831, + "step": 3710 + }, + { + "epoch": 0.1328988128279048, + "grad_norm": 1.7163029909133911, + "learning_rate": 0.00019449875975930818, + "loss": 1.6304, + "step": 3711 + }, + { + "epoch": 0.1329346249574731, + "grad_norm": 1.7375589609146118, + "learning_rate": 0.00019449496501504747, + "loss": 1.1609, + "step": 3712 + }, + { + "epoch": 0.13297043708704137, + "grad_norm": 1.8505090475082397, + "learning_rate": 0.0001944911689994789, + "loss": 1.1018, + "step": 3713 + }, + { + "epoch": 0.13300624921660967, + "grad_norm": 1.3135404586791992, + "learning_rate": 0.0001944873717126536, + "loss": 1.1419, + "step": 3714 + }, + { + "epoch": 0.13304206134617794, + "grad_norm": 1.8809421062469482, + "learning_rate": 0.00019448357315462255, + "loss": 1.3374, + "step": 3715 + }, + { + "epoch": 0.13307787347574623, + "grad_norm": 1.506103754043579, + "learning_rate": 0.00019447977332543687, + "loss": 1.1068, + "step": 3716 + }, + { + "epoch": 0.13311368560531453, + "grad_norm": 2.2748031616210938, + "learning_rate": 0.00019447597222514772, + "loss": 1.4053, + "step": 3717 + }, + { + "epoch": 0.1331494977348828, + "grad_norm": 3.105186939239502, + "learning_rate": 0.00019447216985380626, + "loss": 1.0758, + "step": 3718 + }, + { + "epoch": 0.1331853098644511, + "grad_norm": 1.690515398979187, + "learning_rate": 0.0001944683662114636, + "loss": 1.2285, + "step": 3719 + }, + { + "epoch": 0.13322112199401936, + "grad_norm": 1.7843323945999146, + "learning_rate": 0.00019446456129817093, + "loss": 1.2717, + "step": 3720 + }, + { + "epoch": 0.13325693412358766, + "grad_norm": 1.3074381351470947, + "learning_rate": 0.00019446075511397943, + "loss": 1.0387, + "step": 3721 + }, + { + "epoch": 0.13329274625315593, + "grad_norm": 2.3016586303710938, + "learning_rate": 0.0001944569476589403, + "loss": 1.4151, + "step": 3722 + }, + { + "epoch": 0.13332855838272423, + "grad_norm": 1.4878370761871338, + "learning_rate": 0.00019445313893310482, + "loss": 1.3203, + "step": 3723 + }, + { + "epoch": 0.13336437051229252, + "grad_norm": 2.3976798057556152, + "learning_rate": 0.00019444932893652417, + "loss": 0.9449, + "step": 3724 + }, + { + "epoch": 0.1334001826418608, + "grad_norm": 1.610863208770752, + "learning_rate": 0.00019444551766924963, + "loss": 1.4349, + "step": 3725 + }, + { + "epoch": 0.1334359947714291, + "grad_norm": 1.2812316417694092, + "learning_rate": 0.00019444170513133248, + "loss": 1.2333, + "step": 3726 + }, + { + "epoch": 0.13347180690099736, + "grad_norm": 1.806597352027893, + "learning_rate": 0.00019443789132282403, + "loss": 1.3072, + "step": 3727 + }, + { + "epoch": 0.13350761903056566, + "grad_norm": 1.5292235612869263, + "learning_rate": 0.0001944340762437755, + "loss": 1.0057, + "step": 3728 + }, + { + "epoch": 0.13354343116013392, + "grad_norm": 2.4505808353424072, + "learning_rate": 0.00019443025989423834, + "loss": 1.2745, + "step": 3729 + }, + { + "epoch": 0.13357924328970222, + "grad_norm": 1.6290208101272583, + "learning_rate": 0.00019442644227426383, + "loss": 1.1865, + "step": 3730 + }, + { + "epoch": 0.13361505541927052, + "grad_norm": 1.749999761581421, + "learning_rate": 0.00019442262338390337, + "loss": 1.0683, + "step": 3731 + }, + { + "epoch": 0.1336508675488388, + "grad_norm": 1.4109066724777222, + "learning_rate": 0.00019441880322320824, + "loss": 1.2102, + "step": 3732 + }, + { + "epoch": 0.13368667967840708, + "grad_norm": 1.5348531007766724, + "learning_rate": 0.00019441498179222997, + "loss": 1.3938, + "step": 3733 + }, + { + "epoch": 0.13372249180797535, + "grad_norm": 2.6691224575042725, + "learning_rate": 0.00019441115909101986, + "loss": 1.216, + "step": 3734 + }, + { + "epoch": 0.13375830393754365, + "grad_norm": 1.6734209060668945, + "learning_rate": 0.0001944073351196294, + "loss": 1.1464, + "step": 3735 + }, + { + "epoch": 0.13379411606711192, + "grad_norm": 1.7819793224334717, + "learning_rate": 0.00019440350987811003, + "loss": 1.2509, + "step": 3736 + }, + { + "epoch": 0.13382992819668021, + "grad_norm": 1.4636861085891724, + "learning_rate": 0.0001943996833665132, + "loss": 1.1241, + "step": 3737 + }, + { + "epoch": 0.1338657403262485, + "grad_norm": 1.9566986560821533, + "learning_rate": 0.0001943958555848904, + "loss": 1.417, + "step": 3738 + }, + { + "epoch": 0.13390155245581678, + "grad_norm": 2.0324418544769287, + "learning_rate": 0.00019439202653329313, + "loss": 1.2634, + "step": 3739 + }, + { + "epoch": 0.13393736458538508, + "grad_norm": 1.8565906286239624, + "learning_rate": 0.00019438819621177289, + "loss": 1.334, + "step": 3740 + }, + { + "epoch": 0.13397317671495335, + "grad_norm": 1.7317408323287964, + "learning_rate": 0.00019438436462038125, + "loss": 1.0639, + "step": 3741 + }, + { + "epoch": 0.13400898884452164, + "grad_norm": 2.5832302570343018, + "learning_rate": 0.00019438053175916968, + "loss": 1.2424, + "step": 3742 + }, + { + "epoch": 0.1340448009740899, + "grad_norm": 2.07053542137146, + "learning_rate": 0.00019437669762818985, + "loss": 1.3602, + "step": 3743 + }, + { + "epoch": 0.1340806131036582, + "grad_norm": 2.111257314682007, + "learning_rate": 0.00019437286222749326, + "loss": 1.3086, + "step": 3744 + }, + { + "epoch": 0.1341164252332265, + "grad_norm": 1.9166315793991089, + "learning_rate": 0.00019436902555713153, + "loss": 1.3038, + "step": 3745 + }, + { + "epoch": 0.13415223736279477, + "grad_norm": 1.1709810495376587, + "learning_rate": 0.00019436518761715632, + "loss": 1.1925, + "step": 3746 + }, + { + "epoch": 0.13418804949236307, + "grad_norm": 2.0171146392822266, + "learning_rate": 0.0001943613484076192, + "loss": 1.058, + "step": 3747 + }, + { + "epoch": 0.13422386162193134, + "grad_norm": 1.568250298500061, + "learning_rate": 0.0001943575079285719, + "loss": 1.298, + "step": 3748 + }, + { + "epoch": 0.13425967375149964, + "grad_norm": 2.0341734886169434, + "learning_rate": 0.000194353666180066, + "loss": 1.2185, + "step": 3749 + }, + { + "epoch": 0.1342954858810679, + "grad_norm": 1.2428889274597168, + "learning_rate": 0.00019434982316215326, + "loss": 1.1952, + "step": 3750 + }, + { + "epoch": 0.1343312980106362, + "grad_norm": 1.6812258958816528, + "learning_rate": 0.00019434597887488532, + "loss": 1.1496, + "step": 3751 + }, + { + "epoch": 0.1343671101402045, + "grad_norm": 1.4739924669265747, + "learning_rate": 0.00019434213331831398, + "loss": 1.3748, + "step": 3752 + }, + { + "epoch": 0.13440292226977277, + "grad_norm": 1.4920786619186401, + "learning_rate": 0.00019433828649249087, + "loss": 1.0664, + "step": 3753 + }, + { + "epoch": 0.13443873439934106, + "grad_norm": 1.5182970762252808, + "learning_rate": 0.00019433443839746785, + "loss": 1.2002, + "step": 3754 + }, + { + "epoch": 0.13447454652890933, + "grad_norm": 1.424370527267456, + "learning_rate": 0.00019433058903329663, + "loss": 1.2884, + "step": 3755 + }, + { + "epoch": 0.13451035865847763, + "grad_norm": 1.9185832738876343, + "learning_rate": 0.00019432673840002898, + "loss": 1.3351, + "step": 3756 + }, + { + "epoch": 0.1345461707880459, + "grad_norm": 1.7480831146240234, + "learning_rate": 0.00019432288649771676, + "loss": 1.0919, + "step": 3757 + }, + { + "epoch": 0.1345819829176142, + "grad_norm": 1.2691439390182495, + "learning_rate": 0.0001943190333264118, + "loss": 1.2772, + "step": 3758 + }, + { + "epoch": 0.1346177950471825, + "grad_norm": 2.421154499053955, + "learning_rate": 0.0001943151788861659, + "loss": 1.352, + "step": 3759 + }, + { + "epoch": 0.13465360717675076, + "grad_norm": 1.82246732711792, + "learning_rate": 0.0001943113231770309, + "loss": 1.1695, + "step": 3760 + }, + { + "epoch": 0.13468941930631906, + "grad_norm": 1.6399964094161987, + "learning_rate": 0.0001943074661990587, + "loss": 1.4075, + "step": 3761 + }, + { + "epoch": 0.13472523143588733, + "grad_norm": 1.8059278726577759, + "learning_rate": 0.0001943036079523012, + "loss": 1.2046, + "step": 3762 + }, + { + "epoch": 0.13476104356545562, + "grad_norm": 1.547413945198059, + "learning_rate": 0.00019429974843681032, + "loss": 1.1032, + "step": 3763 + }, + { + "epoch": 0.1347968556950239, + "grad_norm": 1.525916337966919, + "learning_rate": 0.0001942958876526379, + "loss": 1.3311, + "step": 3764 + }, + { + "epoch": 0.1348326678245922, + "grad_norm": 1.6523547172546387, + "learning_rate": 0.000194292025599836, + "loss": 1.0808, + "step": 3765 + }, + { + "epoch": 0.1348684799541605, + "grad_norm": 1.810968279838562, + "learning_rate": 0.00019428816227845652, + "loss": 1.4143, + "step": 3766 + }, + { + "epoch": 0.13490429208372876, + "grad_norm": 1.6859594583511353, + "learning_rate": 0.0001942842976885514, + "loss": 1.2649, + "step": 3767 + }, + { + "epoch": 0.13494010421329705, + "grad_norm": 1.366675853729248, + "learning_rate": 0.00019428043183017274, + "loss": 1.2164, + "step": 3768 + }, + { + "epoch": 0.13497591634286532, + "grad_norm": 3.932572841644287, + "learning_rate": 0.00019427656470337242, + "loss": 1.4305, + "step": 3769 + }, + { + "epoch": 0.13501172847243362, + "grad_norm": 1.623166561126709, + "learning_rate": 0.00019427269630820258, + "loss": 1.1551, + "step": 3770 + }, + { + "epoch": 0.1350475406020019, + "grad_norm": 1.573548436164856, + "learning_rate": 0.00019426882664471515, + "loss": 1.1927, + "step": 3771 + }, + { + "epoch": 0.13508335273157018, + "grad_norm": 2.003708600997925, + "learning_rate": 0.00019426495571296234, + "loss": 1.2776, + "step": 3772 + }, + { + "epoch": 0.13511916486113848, + "grad_norm": 2.739943265914917, + "learning_rate": 0.00019426108351299607, + "loss": 1.3913, + "step": 3773 + }, + { + "epoch": 0.13515497699070675, + "grad_norm": 1.9346274137496948, + "learning_rate": 0.00019425721004486852, + "loss": 1.4574, + "step": 3774 + }, + { + "epoch": 0.13519078912027505, + "grad_norm": 1.46054208278656, + "learning_rate": 0.00019425333530863182, + "loss": 1.0988, + "step": 3775 + }, + { + "epoch": 0.13522660124984331, + "grad_norm": 1.8704214096069336, + "learning_rate": 0.00019424945930433807, + "loss": 1.3294, + "step": 3776 + }, + { + "epoch": 0.1352624133794116, + "grad_norm": 1.9789676666259766, + "learning_rate": 0.0001942455820320394, + "loss": 1.1255, + "step": 3777 + }, + { + "epoch": 0.13529822550897988, + "grad_norm": 1.3832300901412964, + "learning_rate": 0.00019424170349178802, + "loss": 1.2164, + "step": 3778 + }, + { + "epoch": 0.13533403763854818, + "grad_norm": 1.637256145477295, + "learning_rate": 0.00019423782368363604, + "loss": 1.1483, + "step": 3779 + }, + { + "epoch": 0.13536984976811647, + "grad_norm": 1.726043939590454, + "learning_rate": 0.00019423394260763573, + "loss": 1.1972, + "step": 3780 + }, + { + "epoch": 0.13540566189768474, + "grad_norm": 1.3907902240753174, + "learning_rate": 0.00019423006026383926, + "loss": 1.2329, + "step": 3781 + }, + { + "epoch": 0.13544147402725304, + "grad_norm": 2.146045207977295, + "learning_rate": 0.0001942261766522989, + "loss": 1.4093, + "step": 3782 + }, + { + "epoch": 0.1354772861568213, + "grad_norm": 1.6554672718048096, + "learning_rate": 0.00019422229177306686, + "loss": 1.1036, + "step": 3783 + }, + { + "epoch": 0.1355130982863896, + "grad_norm": 2.866483211517334, + "learning_rate": 0.0001942184056261954, + "loss": 1.2631, + "step": 3784 + }, + { + "epoch": 0.13554891041595787, + "grad_norm": 1.6854197978973389, + "learning_rate": 0.00019421451821173685, + "loss": 1.471, + "step": 3785 + }, + { + "epoch": 0.13558472254552617, + "grad_norm": 1.5095551013946533, + "learning_rate": 0.0001942106295297435, + "loss": 1.2604, + "step": 3786 + }, + { + "epoch": 0.13562053467509447, + "grad_norm": 1.605637550354004, + "learning_rate": 0.00019420673958026762, + "loss": 1.1973, + "step": 3787 + }, + { + "epoch": 0.13565634680466274, + "grad_norm": 2.077237844467163, + "learning_rate": 0.0001942028483633616, + "loss": 1.4813, + "step": 3788 + }, + { + "epoch": 0.13569215893423103, + "grad_norm": 1.5663530826568604, + "learning_rate": 0.00019419895587907777, + "loss": 1.1976, + "step": 3789 + }, + { + "epoch": 0.1357279710637993, + "grad_norm": 1.5733237266540527, + "learning_rate": 0.0001941950621274685, + "loss": 1.3988, + "step": 3790 + }, + { + "epoch": 0.1357637831933676, + "grad_norm": 1.6673483848571777, + "learning_rate": 0.00019419116710858614, + "loss": 1.2725, + "step": 3791 + }, + { + "epoch": 0.13579959532293587, + "grad_norm": 1.82987380027771, + "learning_rate": 0.00019418727082248316, + "loss": 1.0386, + "step": 3792 + }, + { + "epoch": 0.13583540745250416, + "grad_norm": 1.6305838823318481, + "learning_rate": 0.00019418337326921193, + "loss": 1.0916, + "step": 3793 + }, + { + "epoch": 0.13587121958207246, + "grad_norm": 1.6829408407211304, + "learning_rate": 0.0001941794744488249, + "loss": 1.3312, + "step": 3794 + }, + { + "epoch": 0.13590703171164073, + "grad_norm": 1.3704564571380615, + "learning_rate": 0.0001941755743613745, + "loss": 1.2209, + "step": 3795 + }, + { + "epoch": 0.13594284384120903, + "grad_norm": 2.1219818592071533, + "learning_rate": 0.00019417167300691328, + "loss": 1.242, + "step": 3796 + }, + { + "epoch": 0.1359786559707773, + "grad_norm": 1.412664532661438, + "learning_rate": 0.00019416777038549362, + "loss": 1.0413, + "step": 3797 + }, + { + "epoch": 0.1360144681003456, + "grad_norm": 1.4982202053070068, + "learning_rate": 0.00019416386649716812, + "loss": 1.1914, + "step": 3798 + }, + { + "epoch": 0.13605028022991386, + "grad_norm": 1.6664620637893677, + "learning_rate": 0.0001941599613419892, + "loss": 1.0177, + "step": 3799 + }, + { + "epoch": 0.13608609235948216, + "grad_norm": 1.9268770217895508, + "learning_rate": 0.00019415605492000953, + "loss": 1.3881, + "step": 3800 + }, + { + "epoch": 0.13612190448905045, + "grad_norm": 1.6566745042800903, + "learning_rate": 0.00019415214723128154, + "loss": 1.1618, + "step": 3801 + }, + { + "epoch": 0.13615771661861872, + "grad_norm": 2.1234419345855713, + "learning_rate": 0.0001941482382758579, + "loss": 1.1766, + "step": 3802 + }, + { + "epoch": 0.13619352874818702, + "grad_norm": 1.2707493305206299, + "learning_rate": 0.00019414432805379113, + "loss": 1.0383, + "step": 3803 + }, + { + "epoch": 0.1362293408777553, + "grad_norm": 2.4121787548065186, + "learning_rate": 0.00019414041656513385, + "loss": 1.0763, + "step": 3804 + }, + { + "epoch": 0.1362651530073236, + "grad_norm": 1.3251664638519287, + "learning_rate": 0.0001941365038099387, + "loss": 1.253, + "step": 3805 + }, + { + "epoch": 0.13630096513689186, + "grad_norm": 1.615216612815857, + "learning_rate": 0.00019413258978825834, + "loss": 1.2646, + "step": 3806 + }, + { + "epoch": 0.13633677726646015, + "grad_norm": 1.6384365558624268, + "learning_rate": 0.0001941286745001454, + "loss": 1.1926, + "step": 3807 + }, + { + "epoch": 0.13637258939602845, + "grad_norm": 1.513426661491394, + "learning_rate": 0.00019412475794565256, + "loss": 1.0672, + "step": 3808 + }, + { + "epoch": 0.13640840152559672, + "grad_norm": 1.611948013305664, + "learning_rate": 0.00019412084012483249, + "loss": 1.2264, + "step": 3809 + }, + { + "epoch": 0.13644421365516501, + "grad_norm": 1.9613397121429443, + "learning_rate": 0.00019411692103773795, + "loss": 1.0818, + "step": 3810 + }, + { + "epoch": 0.13648002578473328, + "grad_norm": 1.5819247961044312, + "learning_rate": 0.00019411300068442167, + "loss": 1.4039, + "step": 3811 + }, + { + "epoch": 0.13651583791430158, + "grad_norm": 1.586746096611023, + "learning_rate": 0.0001941090790649363, + "loss": 1.1583, + "step": 3812 + }, + { + "epoch": 0.13655165004386985, + "grad_norm": 1.6297763586044312, + "learning_rate": 0.00019410515617933468, + "loss": 0.9561, + "step": 3813 + }, + { + "epoch": 0.13658746217343815, + "grad_norm": 1.9986522197723389, + "learning_rate": 0.0001941012320276696, + "loss": 1.516, + "step": 3814 + }, + { + "epoch": 0.13662327430300641, + "grad_norm": 1.5270659923553467, + "learning_rate": 0.0001940973066099938, + "loss": 1.1336, + "step": 3815 + }, + { + "epoch": 0.1366590864325747, + "grad_norm": 1.8658851385116577, + "learning_rate": 0.00019409337992636015, + "loss": 1.181, + "step": 3816 + }, + { + "epoch": 0.136694898562143, + "grad_norm": 2.109266519546509, + "learning_rate": 0.0001940894519768214, + "loss": 1.0606, + "step": 3817 + }, + { + "epoch": 0.13673071069171128, + "grad_norm": 1.874842882156372, + "learning_rate": 0.00019408552276143045, + "loss": 1.3024, + "step": 3818 + }, + { + "epoch": 0.13676652282127957, + "grad_norm": 1.3942415714263916, + "learning_rate": 0.00019408159228024018, + "loss": 1.1218, + "step": 3819 + }, + { + "epoch": 0.13680233495084784, + "grad_norm": 1.5628331899642944, + "learning_rate": 0.00019407766053330342, + "loss": 1.0951, + "step": 3820 + }, + { + "epoch": 0.13683814708041614, + "grad_norm": 1.511309266090393, + "learning_rate": 0.00019407372752067308, + "loss": 1.0964, + "step": 3821 + }, + { + "epoch": 0.1368739592099844, + "grad_norm": 1.4916263818740845, + "learning_rate": 0.0001940697932424021, + "loss": 1.1489, + "step": 3822 + }, + { + "epoch": 0.1369097713395527, + "grad_norm": 1.3866722583770752, + "learning_rate": 0.0001940658576985434, + "loss": 1.1751, + "step": 3823 + }, + { + "epoch": 0.136945583469121, + "grad_norm": 1.4442417621612549, + "learning_rate": 0.0001940619208891499, + "loss": 1.3216, + "step": 3824 + }, + { + "epoch": 0.13698139559868927, + "grad_norm": 1.607844591140747, + "learning_rate": 0.0001940579828142746, + "loss": 1.2769, + "step": 3825 + }, + { + "epoch": 0.13701720772825757, + "grad_norm": 1.4012391567230225, + "learning_rate": 0.00019405404347397047, + "loss": 1.2829, + "step": 3826 + }, + { + "epoch": 0.13705301985782584, + "grad_norm": 1.7171849012374878, + "learning_rate": 0.0001940501028682905, + "loss": 1.2266, + "step": 3827 + }, + { + "epoch": 0.13708883198739413, + "grad_norm": 1.4206444025039673, + "learning_rate": 0.00019404616099728773, + "loss": 1.0956, + "step": 3828 + }, + { + "epoch": 0.1371246441169624, + "grad_norm": 1.474390983581543, + "learning_rate": 0.00019404221786101513, + "loss": 1.1688, + "step": 3829 + }, + { + "epoch": 0.1371604562465307, + "grad_norm": 1.62975013256073, + "learning_rate": 0.0001940382734595258, + "loss": 1.1792, + "step": 3830 + }, + { + "epoch": 0.137196268376099, + "grad_norm": 1.879928708076477, + "learning_rate": 0.00019403432779287286, + "loss": 1.2788, + "step": 3831 + }, + { + "epoch": 0.13723208050566726, + "grad_norm": 1.4938169717788696, + "learning_rate": 0.00019403038086110926, + "loss": 1.3498, + "step": 3832 + }, + { + "epoch": 0.13726789263523556, + "grad_norm": 1.4533069133758545, + "learning_rate": 0.00019402643266428822, + "loss": 1.3738, + "step": 3833 + }, + { + "epoch": 0.13730370476480383, + "grad_norm": 1.8871474266052246, + "learning_rate": 0.00019402248320246282, + "loss": 1.4763, + "step": 3834 + }, + { + "epoch": 0.13733951689437213, + "grad_norm": 1.8816747665405273, + "learning_rate": 0.00019401853247568614, + "loss": 1.1145, + "step": 3835 + }, + { + "epoch": 0.1373753290239404, + "grad_norm": 1.4090968370437622, + "learning_rate": 0.00019401458048401145, + "loss": 1.0002, + "step": 3836 + }, + { + "epoch": 0.1374111411535087, + "grad_norm": 1.761064887046814, + "learning_rate": 0.0001940106272274918, + "loss": 1.1355, + "step": 3837 + }, + { + "epoch": 0.137446953283077, + "grad_norm": 2.04732346534729, + "learning_rate": 0.00019400667270618046, + "loss": 1.1444, + "step": 3838 + }, + { + "epoch": 0.13748276541264526, + "grad_norm": 1.7416253089904785, + "learning_rate": 0.00019400271692013058, + "loss": 1.5248, + "step": 3839 + }, + { + "epoch": 0.13751857754221355, + "grad_norm": 1.8846087455749512, + "learning_rate": 0.0001939987598693954, + "loss": 1.2228, + "step": 3840 + }, + { + "epoch": 0.13755438967178182, + "grad_norm": 2.0424113273620605, + "learning_rate": 0.00019399480155402813, + "loss": 1.1409, + "step": 3841 + }, + { + "epoch": 0.13759020180135012, + "grad_norm": 1.3249925374984741, + "learning_rate": 0.0001939908419740821, + "loss": 1.3614, + "step": 3842 + }, + { + "epoch": 0.1376260139309184, + "grad_norm": 1.5908477306365967, + "learning_rate": 0.0001939868811296105, + "loss": 0.9232, + "step": 3843 + }, + { + "epoch": 0.1376618260604867, + "grad_norm": 1.4253493547439575, + "learning_rate": 0.00019398291902066666, + "loss": 1.1453, + "step": 3844 + }, + { + "epoch": 0.13769763819005498, + "grad_norm": 1.789834976196289, + "learning_rate": 0.00019397895564730386, + "loss": 1.0389, + "step": 3845 + }, + { + "epoch": 0.13773345031962325, + "grad_norm": 1.898138403892517, + "learning_rate": 0.00019397499100957542, + "loss": 1.4413, + "step": 3846 + }, + { + "epoch": 0.13776926244919155, + "grad_norm": 1.53733229637146, + "learning_rate": 0.00019397102510753473, + "loss": 1.0898, + "step": 3847 + }, + { + "epoch": 0.13780507457875982, + "grad_norm": 1.8739738464355469, + "learning_rate": 0.0001939670579412351, + "loss": 1.357, + "step": 3848 + }, + { + "epoch": 0.13784088670832811, + "grad_norm": 1.7880147695541382, + "learning_rate": 0.00019396308951072992, + "loss": 1.2188, + "step": 3849 + }, + { + "epoch": 0.13787669883789638, + "grad_norm": 2.1245129108428955, + "learning_rate": 0.00019395911981607254, + "loss": 1.1074, + "step": 3850 + }, + { + "epoch": 0.13791251096746468, + "grad_norm": 1.7916488647460938, + "learning_rate": 0.00019395514885731644, + "loss": 1.1795, + "step": 3851 + }, + { + "epoch": 0.13794832309703298, + "grad_norm": 1.8771685361862183, + "learning_rate": 0.000193951176634515, + "loss": 1.2839, + "step": 3852 + }, + { + "epoch": 0.13798413522660125, + "grad_norm": 1.863285779953003, + "learning_rate": 0.00019394720314772166, + "loss": 1.2836, + "step": 3853 + }, + { + "epoch": 0.13801994735616954, + "grad_norm": 1.7899121046066284, + "learning_rate": 0.00019394322839698988, + "loss": 1.3859, + "step": 3854 + }, + { + "epoch": 0.1380557594857378, + "grad_norm": 1.4952107667922974, + "learning_rate": 0.00019393925238237313, + "loss": 1.3213, + "step": 3855 + }, + { + "epoch": 0.1380915716153061, + "grad_norm": 1.3935914039611816, + "learning_rate": 0.00019393527510392494, + "loss": 1.356, + "step": 3856 + }, + { + "epoch": 0.13812738374487438, + "grad_norm": 1.779893159866333, + "learning_rate": 0.0001939312965616988, + "loss": 1.2498, + "step": 3857 + }, + { + "epoch": 0.13816319587444267, + "grad_norm": 2.023606538772583, + "learning_rate": 0.0001939273167557482, + "loss": 1.2402, + "step": 3858 + }, + { + "epoch": 0.13819900800401097, + "grad_norm": 1.2730605602264404, + "learning_rate": 0.00019392333568612672, + "loss": 1.2259, + "step": 3859 + }, + { + "epoch": 0.13823482013357924, + "grad_norm": 1.485270380973816, + "learning_rate": 0.00019391935335288788, + "loss": 1.2831, + "step": 3860 + }, + { + "epoch": 0.13827063226314754, + "grad_norm": 2.269993305206299, + "learning_rate": 0.00019391536975608533, + "loss": 1.2928, + "step": 3861 + }, + { + "epoch": 0.1383064443927158, + "grad_norm": 1.675038456916809, + "learning_rate": 0.0001939113848957726, + "loss": 0.9932, + "step": 3862 + }, + { + "epoch": 0.1383422565222841, + "grad_norm": 2.3034870624542236, + "learning_rate": 0.00019390739877200335, + "loss": 1.2402, + "step": 3863 + }, + { + "epoch": 0.13837806865185237, + "grad_norm": 1.5948288440704346, + "learning_rate": 0.00019390341138483117, + "loss": 1.259, + "step": 3864 + }, + { + "epoch": 0.13841388078142067, + "grad_norm": 1.4060418605804443, + "learning_rate": 0.0001938994227343097, + "loss": 1.2661, + "step": 3865 + }, + { + "epoch": 0.13844969291098896, + "grad_norm": 1.659236192703247, + "learning_rate": 0.00019389543282049263, + "loss": 1.1036, + "step": 3866 + }, + { + "epoch": 0.13848550504055723, + "grad_norm": 1.4474562406539917, + "learning_rate": 0.0001938914416434336, + "loss": 1.2742, + "step": 3867 + }, + { + "epoch": 0.13852131717012553, + "grad_norm": 1.4575443267822266, + "learning_rate": 0.00019388744920318638, + "loss": 1.2664, + "step": 3868 + }, + { + "epoch": 0.1385571292996938, + "grad_norm": 1.8088241815567017, + "learning_rate": 0.00019388345549980462, + "loss": 1.3949, + "step": 3869 + }, + { + "epoch": 0.1385929414292621, + "grad_norm": 1.4771618843078613, + "learning_rate": 0.00019387946053334206, + "loss": 1.4409, + "step": 3870 + }, + { + "epoch": 0.13862875355883036, + "grad_norm": 1.9491852521896362, + "learning_rate": 0.00019387546430385246, + "loss": 1.1717, + "step": 3871 + }, + { + "epoch": 0.13866456568839866, + "grad_norm": 1.9770007133483887, + "learning_rate": 0.00019387146681138957, + "loss": 1.2278, + "step": 3872 + }, + { + "epoch": 0.13870037781796696, + "grad_norm": 1.605725646018982, + "learning_rate": 0.00019386746805600717, + "loss": 1.2164, + "step": 3873 + }, + { + "epoch": 0.13873618994753523, + "grad_norm": 1.3768041133880615, + "learning_rate": 0.00019386346803775909, + "loss": 1.0643, + "step": 3874 + }, + { + "epoch": 0.13877200207710352, + "grad_norm": 2.027958869934082, + "learning_rate": 0.00019385946675669913, + "loss": 1.2233, + "step": 3875 + }, + { + "epoch": 0.1388078142066718, + "grad_norm": 1.835396409034729, + "learning_rate": 0.0001938554642128811, + "loss": 1.2593, + "step": 3876 + }, + { + "epoch": 0.1388436263362401, + "grad_norm": 1.4830487966537476, + "learning_rate": 0.00019385146040635886, + "loss": 1.2534, + "step": 3877 + }, + { + "epoch": 0.13887943846580836, + "grad_norm": 1.5021227598190308, + "learning_rate": 0.00019384745533718628, + "loss": 1.0981, + "step": 3878 + }, + { + "epoch": 0.13891525059537665, + "grad_norm": 2.283930540084839, + "learning_rate": 0.00019384344900541723, + "loss": 1.1155, + "step": 3879 + }, + { + "epoch": 0.13895106272494495, + "grad_norm": 1.7975435256958008, + "learning_rate": 0.00019383944141110565, + "loss": 1.1684, + "step": 3880 + }, + { + "epoch": 0.13898687485451322, + "grad_norm": 1.2852085828781128, + "learning_rate": 0.00019383543255430542, + "loss": 1.044, + "step": 3881 + }, + { + "epoch": 0.13902268698408152, + "grad_norm": 1.790181279182434, + "learning_rate": 0.00019383142243507048, + "loss": 1.4496, + "step": 3882 + }, + { + "epoch": 0.13905849911364979, + "grad_norm": 1.9038870334625244, + "learning_rate": 0.00019382741105345482, + "loss": 1.2933, + "step": 3883 + }, + { + "epoch": 0.13909431124321808, + "grad_norm": 1.3521850109100342, + "learning_rate": 0.0001938233984095123, + "loss": 1.4248, + "step": 3884 + }, + { + "epoch": 0.13913012337278635, + "grad_norm": 1.5137065649032593, + "learning_rate": 0.00019381938450329704, + "loss": 1.4058, + "step": 3885 + }, + { + "epoch": 0.13916593550235465, + "grad_norm": 2.130991220474243, + "learning_rate": 0.00019381536933486295, + "loss": 1.473, + "step": 3886 + }, + { + "epoch": 0.13920174763192295, + "grad_norm": 1.9059667587280273, + "learning_rate": 0.0001938113529042641, + "loss": 1.3214, + "step": 3887 + }, + { + "epoch": 0.13923755976149121, + "grad_norm": 1.2959721088409424, + "learning_rate": 0.0001938073352115545, + "loss": 1.1907, + "step": 3888 + }, + { + "epoch": 0.1392733718910595, + "grad_norm": 1.917816400527954, + "learning_rate": 0.00019380331625678821, + "loss": 1.3111, + "step": 3889 + }, + { + "epoch": 0.13930918402062778, + "grad_norm": 1.4171957969665527, + "learning_rate": 0.00019379929604001927, + "loss": 1.1841, + "step": 3890 + }, + { + "epoch": 0.13934499615019608, + "grad_norm": 2.049849033355713, + "learning_rate": 0.00019379527456130183, + "loss": 1.4166, + "step": 3891 + }, + { + "epoch": 0.13938080827976435, + "grad_norm": 1.9757475852966309, + "learning_rate": 0.00019379125182068994, + "loss": 1.1892, + "step": 3892 + }, + { + "epoch": 0.13941662040933264, + "grad_norm": 1.9772698879241943, + "learning_rate": 0.00019378722781823772, + "loss": 1.2727, + "step": 3893 + }, + { + "epoch": 0.13945243253890094, + "grad_norm": 1.9279121160507202, + "learning_rate": 0.00019378320255399934, + "loss": 1.294, + "step": 3894 + }, + { + "epoch": 0.1394882446684692, + "grad_norm": 1.673357605934143, + "learning_rate": 0.00019377917602802897, + "loss": 1.2049, + "step": 3895 + }, + { + "epoch": 0.1395240567980375, + "grad_norm": 2.1010494232177734, + "learning_rate": 0.00019377514824038073, + "loss": 1.4998, + "step": 3896 + }, + { + "epoch": 0.13955986892760577, + "grad_norm": 2.5087883472442627, + "learning_rate": 0.00019377111919110883, + "loss": 1.5368, + "step": 3897 + }, + { + "epoch": 0.13959568105717407, + "grad_norm": 1.4822754859924316, + "learning_rate": 0.00019376708888026747, + "loss": 1.2524, + "step": 3898 + }, + { + "epoch": 0.13963149318674234, + "grad_norm": 2.045320987701416, + "learning_rate": 0.0001937630573079109, + "loss": 1.1341, + "step": 3899 + }, + { + "epoch": 0.13966730531631064, + "grad_norm": 1.8475141525268555, + "learning_rate": 0.0001937590244740933, + "loss": 1.1097, + "step": 3900 + }, + { + "epoch": 0.13970311744587893, + "grad_norm": 1.8152955770492554, + "learning_rate": 0.000193754990378869, + "loss": 1.2383, + "step": 3901 + }, + { + "epoch": 0.1397389295754472, + "grad_norm": 1.4363402128219604, + "learning_rate": 0.00019375095502229223, + "loss": 1.0928, + "step": 3902 + }, + { + "epoch": 0.1397747417050155, + "grad_norm": 1.6513184309005737, + "learning_rate": 0.0001937469184044173, + "loss": 1.1891, + "step": 3903 + }, + { + "epoch": 0.13981055383458377, + "grad_norm": 1.573039174079895, + "learning_rate": 0.0001937428805252985, + "loss": 1.015, + "step": 3904 + }, + { + "epoch": 0.13984636596415206, + "grad_norm": 1.7117273807525635, + "learning_rate": 0.00019373884138499018, + "loss": 1.2206, + "step": 3905 + }, + { + "epoch": 0.13988217809372033, + "grad_norm": 1.6629964113235474, + "learning_rate": 0.00019373480098354665, + "loss": 1.1972, + "step": 3906 + }, + { + "epoch": 0.13991799022328863, + "grad_norm": 2.1111557483673096, + "learning_rate": 0.00019373075932102227, + "loss": 1.2065, + "step": 3907 + }, + { + "epoch": 0.13995380235285693, + "grad_norm": 1.5673630237579346, + "learning_rate": 0.00019372671639747145, + "loss": 1.3001, + "step": 3908 + }, + { + "epoch": 0.1399896144824252, + "grad_norm": 1.4174344539642334, + "learning_rate": 0.00019372267221294854, + "loss": 1.3383, + "step": 3909 + }, + { + "epoch": 0.1400254266119935, + "grad_norm": 1.6904226541519165, + "learning_rate": 0.00019371862676750796, + "loss": 1.2109, + "step": 3910 + }, + { + "epoch": 0.14006123874156176, + "grad_norm": 1.4195408821105957, + "learning_rate": 0.00019371458006120417, + "loss": 1.295, + "step": 3911 + }, + { + "epoch": 0.14009705087113006, + "grad_norm": 1.8054983615875244, + "learning_rate": 0.00019371053209409157, + "loss": 1.1858, + "step": 3912 + }, + { + "epoch": 0.14013286300069833, + "grad_norm": 1.796844482421875, + "learning_rate": 0.00019370648286622466, + "loss": 1.1023, + "step": 3913 + }, + { + "epoch": 0.14016867513026662, + "grad_norm": 1.859695315361023, + "learning_rate": 0.00019370243237765787, + "loss": 1.0159, + "step": 3914 + }, + { + "epoch": 0.1402044872598349, + "grad_norm": 1.3659417629241943, + "learning_rate": 0.00019369838062844577, + "loss": 1.2163, + "step": 3915 + }, + { + "epoch": 0.1402402993894032, + "grad_norm": 1.8125382661819458, + "learning_rate": 0.00019369432761864278, + "loss": 1.2172, + "step": 3916 + }, + { + "epoch": 0.14027611151897149, + "grad_norm": 1.4106041193008423, + "learning_rate": 0.00019369027334830346, + "loss": 1.2294, + "step": 3917 + }, + { + "epoch": 0.14031192364853975, + "grad_norm": 1.6075189113616943, + "learning_rate": 0.00019368621781748238, + "loss": 1.2626, + "step": 3918 + }, + { + "epoch": 0.14034773577810805, + "grad_norm": 1.6463679075241089, + "learning_rate": 0.0001936821610262341, + "loss": 1.2841, + "step": 3919 + }, + { + "epoch": 0.14038354790767632, + "grad_norm": 1.436765432357788, + "learning_rate": 0.00019367810297461313, + "loss": 1.1244, + "step": 3920 + }, + { + "epoch": 0.14041936003724462, + "grad_norm": 1.6742850542068481, + "learning_rate": 0.00019367404366267416, + "loss": 1.1236, + "step": 3921 + }, + { + "epoch": 0.14045517216681289, + "grad_norm": 1.7150018215179443, + "learning_rate": 0.0001936699830904718, + "loss": 1.2501, + "step": 3922 + }, + { + "epoch": 0.14049098429638118, + "grad_norm": 1.5636425018310547, + "learning_rate": 0.00019366592125806057, + "loss": 1.2078, + "step": 3923 + }, + { + "epoch": 0.14052679642594948, + "grad_norm": 1.6084883213043213, + "learning_rate": 0.00019366185816549524, + "loss": 1.0556, + "step": 3924 + }, + { + "epoch": 0.14056260855551775, + "grad_norm": 1.3750157356262207, + "learning_rate": 0.0001936577938128304, + "loss": 1.2371, + "step": 3925 + }, + { + "epoch": 0.14059842068508605, + "grad_norm": 1.7064673900604248, + "learning_rate": 0.00019365372820012077, + "loss": 1.0627, + "step": 3926 + }, + { + "epoch": 0.14063423281465431, + "grad_norm": 1.4918442964553833, + "learning_rate": 0.00019364966132742102, + "loss": 1.2918, + "step": 3927 + }, + { + "epoch": 0.1406700449442226, + "grad_norm": 2.0816304683685303, + "learning_rate": 0.00019364559319478585, + "loss": 1.1133, + "step": 3928 + }, + { + "epoch": 0.14070585707379088, + "grad_norm": 1.2794170379638672, + "learning_rate": 0.00019364152380227007, + "loss": 1.1496, + "step": 3929 + }, + { + "epoch": 0.14074166920335918, + "grad_norm": 2.4281489849090576, + "learning_rate": 0.00019363745314992836, + "loss": 1.2892, + "step": 3930 + }, + { + "epoch": 0.14077748133292747, + "grad_norm": 1.5795261859893799, + "learning_rate": 0.00019363338123781548, + "loss": 1.2562, + "step": 3931 + }, + { + "epoch": 0.14081329346249574, + "grad_norm": 2.294405937194824, + "learning_rate": 0.00019362930806598625, + "loss": 1.1648, + "step": 3932 + }, + { + "epoch": 0.14084910559206404, + "grad_norm": 1.257856011390686, + "learning_rate": 0.00019362523363449546, + "loss": 1.055, + "step": 3933 + }, + { + "epoch": 0.1408849177216323, + "grad_norm": 1.6763396263122559, + "learning_rate": 0.0001936211579433979, + "loss": 1.2278, + "step": 3934 + }, + { + "epoch": 0.1409207298512006, + "grad_norm": 1.5622103214263916, + "learning_rate": 0.00019361708099274844, + "loss": 1.4096, + "step": 3935 + }, + { + "epoch": 0.14095654198076887, + "grad_norm": 1.883415937423706, + "learning_rate": 0.00019361300278260193, + "loss": 1.3711, + "step": 3936 + }, + { + "epoch": 0.14099235411033717, + "grad_norm": 1.9400440454483032, + "learning_rate": 0.00019360892331301316, + "loss": 1.2541, + "step": 3937 + }, + { + "epoch": 0.14102816623990547, + "grad_norm": 1.283797025680542, + "learning_rate": 0.00019360484258403713, + "loss": 1.2604, + "step": 3938 + }, + { + "epoch": 0.14106397836947374, + "grad_norm": 2.1017584800720215, + "learning_rate": 0.00019360076059572867, + "loss": 1.1071, + "step": 3939 + }, + { + "epoch": 0.14109979049904203, + "grad_norm": 1.222893476486206, + "learning_rate": 0.0001935966773481427, + "loss": 1.2577, + "step": 3940 + }, + { + "epoch": 0.1411356026286103, + "grad_norm": 1.4515206813812256, + "learning_rate": 0.00019359259284133418, + "loss": 1.1621, + "step": 3941 + }, + { + "epoch": 0.1411714147581786, + "grad_norm": 1.7947192192077637, + "learning_rate": 0.00019358850707535804, + "loss": 1.0886, + "step": 3942 + }, + { + "epoch": 0.14120722688774687, + "grad_norm": 1.3816362619400024, + "learning_rate": 0.00019358442005026926, + "loss": 1.0798, + "step": 3943 + }, + { + "epoch": 0.14124303901731516, + "grad_norm": 1.6299675703048706, + "learning_rate": 0.0001935803317661228, + "loss": 1.2909, + "step": 3944 + }, + { + "epoch": 0.14127885114688346, + "grad_norm": 1.799340844154358, + "learning_rate": 0.0001935762422229737, + "loss": 1.3749, + "step": 3945 + }, + { + "epoch": 0.14131466327645173, + "grad_norm": 2.60453462600708, + "learning_rate": 0.00019357215142087699, + "loss": 1.0202, + "step": 3946 + }, + { + "epoch": 0.14135047540602003, + "grad_norm": 1.6083608865737915, + "learning_rate": 0.0001935680593598877, + "loss": 1.1044, + "step": 3947 + }, + { + "epoch": 0.1413862875355883, + "grad_norm": 1.5208561420440674, + "learning_rate": 0.00019356396604006083, + "loss": 1.0653, + "step": 3948 + }, + { + "epoch": 0.1414220996651566, + "grad_norm": 1.3531147241592407, + "learning_rate": 0.00019355987146145147, + "loss": 1.1652, + "step": 3949 + }, + { + "epoch": 0.14145791179472486, + "grad_norm": 1.4410161972045898, + "learning_rate": 0.00019355577562411473, + "loss": 1.2916, + "step": 3950 + }, + { + "epoch": 0.14149372392429316, + "grad_norm": 1.6777557134628296, + "learning_rate": 0.00019355167852810575, + "loss": 1.2156, + "step": 3951 + }, + { + "epoch": 0.14152953605386145, + "grad_norm": 1.335658073425293, + "learning_rate": 0.00019354758017347957, + "loss": 1.0817, + "step": 3952 + }, + { + "epoch": 0.14156534818342972, + "grad_norm": 1.4333369731903076, + "learning_rate": 0.00019354348056029136, + "loss": 1.3639, + "step": 3953 + }, + { + "epoch": 0.14160116031299802, + "grad_norm": 1.7953625917434692, + "learning_rate": 0.0001935393796885963, + "loss": 1.1525, + "step": 3954 + }, + { + "epoch": 0.1416369724425663, + "grad_norm": 1.5210254192352295, + "learning_rate": 0.00019353527755844953, + "loss": 1.193, + "step": 3955 + }, + { + "epoch": 0.14167278457213459, + "grad_norm": 1.5940616130828857, + "learning_rate": 0.00019353117416990627, + "loss": 1.1452, + "step": 3956 + }, + { + "epoch": 0.14170859670170285, + "grad_norm": 2.0237767696380615, + "learning_rate": 0.0001935270695230217, + "loss": 1.0516, + "step": 3957 + }, + { + "epoch": 0.14174440883127115, + "grad_norm": 1.7159591913223267, + "learning_rate": 0.00019352296361785105, + "loss": 1.1824, + "step": 3958 + }, + { + "epoch": 0.14178022096083945, + "grad_norm": 1.6991076469421387, + "learning_rate": 0.00019351885645444957, + "loss": 1.1748, + "step": 3959 + }, + { + "epoch": 0.14181603309040772, + "grad_norm": 2.1113829612731934, + "learning_rate": 0.0001935147480328725, + "loss": 1.3167, + "step": 3960 + }, + { + "epoch": 0.141851845219976, + "grad_norm": 1.7993522882461548, + "learning_rate": 0.0001935106383531751, + "loss": 1.2077, + "step": 3961 + }, + { + "epoch": 0.14188765734954428, + "grad_norm": 1.8339391946792603, + "learning_rate": 0.00019350652741541272, + "loss": 1.3146, + "step": 3962 + }, + { + "epoch": 0.14192346947911258, + "grad_norm": 1.8934698104858398, + "learning_rate": 0.00019350241521964062, + "loss": 1.2242, + "step": 3963 + }, + { + "epoch": 0.14195928160868085, + "grad_norm": 1.6405458450317383, + "learning_rate": 0.00019349830176591408, + "loss": 1.2733, + "step": 3964 + }, + { + "epoch": 0.14199509373824915, + "grad_norm": 1.7591601610183716, + "learning_rate": 0.00019349418705428854, + "loss": 1.036, + "step": 3965 + }, + { + "epoch": 0.14203090586781744, + "grad_norm": 1.7620686292648315, + "learning_rate": 0.0001934900710848193, + "loss": 1.2613, + "step": 3966 + }, + { + "epoch": 0.1420667179973857, + "grad_norm": 1.3282748460769653, + "learning_rate": 0.00019348595385756178, + "loss": 0.9327, + "step": 3967 + }, + { + "epoch": 0.142102530126954, + "grad_norm": 1.7955777645111084, + "learning_rate": 0.00019348183537257131, + "loss": 1.2441, + "step": 3968 + }, + { + "epoch": 0.14213834225652228, + "grad_norm": 2.6294784545898438, + "learning_rate": 0.00019347771562990332, + "loss": 1.392, + "step": 3969 + }, + { + "epoch": 0.14217415438609057, + "grad_norm": 1.5245628356933594, + "learning_rate": 0.00019347359462961326, + "loss": 1.1434, + "step": 3970 + }, + { + "epoch": 0.14220996651565884, + "grad_norm": 1.2800089120864868, + "learning_rate": 0.00019346947237175655, + "loss": 1.0229, + "step": 3971 + }, + { + "epoch": 0.14224577864522714, + "grad_norm": 1.581390380859375, + "learning_rate": 0.00019346534885638866, + "loss": 1.4063, + "step": 3972 + }, + { + "epoch": 0.14228159077479544, + "grad_norm": 1.6010406017303467, + "learning_rate": 0.00019346122408356507, + "loss": 1.3115, + "step": 3973 + }, + { + "epoch": 0.1423174029043637, + "grad_norm": 1.74225652217865, + "learning_rate": 0.00019345709805334123, + "loss": 1.2107, + "step": 3974 + }, + { + "epoch": 0.142353215033932, + "grad_norm": 1.6850101947784424, + "learning_rate": 0.00019345297076577272, + "loss": 1.3045, + "step": 3975 + }, + { + "epoch": 0.14238902716350027, + "grad_norm": 2.0627546310424805, + "learning_rate": 0.00019344884222091503, + "loss": 1.2662, + "step": 3976 + }, + { + "epoch": 0.14242483929306857, + "grad_norm": 1.6786539554595947, + "learning_rate": 0.00019344471241882372, + "loss": 1.2285, + "step": 3977 + }, + { + "epoch": 0.14246065142263684, + "grad_norm": 1.898452639579773, + "learning_rate": 0.0001934405813595543, + "loss": 1.3576, + "step": 3978 + }, + { + "epoch": 0.14249646355220513, + "grad_norm": 1.5774530172348022, + "learning_rate": 0.00019343644904316242, + "loss": 1.2229, + "step": 3979 + }, + { + "epoch": 0.14253227568177343, + "grad_norm": 1.2925701141357422, + "learning_rate": 0.0001934323154697036, + "loss": 1.0342, + "step": 3980 + }, + { + "epoch": 0.1425680878113417, + "grad_norm": 1.920512318611145, + "learning_rate": 0.00019342818063923357, + "loss": 1.2528, + "step": 3981 + }, + { + "epoch": 0.14260389994091, + "grad_norm": 1.2098218202590942, + "learning_rate": 0.00019342404455180784, + "loss": 1.0892, + "step": 3982 + }, + { + "epoch": 0.14263971207047826, + "grad_norm": 2.1409783363342285, + "learning_rate": 0.00019341990720748208, + "loss": 1.1471, + "step": 3983 + }, + { + "epoch": 0.14267552420004656, + "grad_norm": 1.5230920314788818, + "learning_rate": 0.000193415768606312, + "loss": 1.3864, + "step": 3984 + }, + { + "epoch": 0.14271133632961483, + "grad_norm": 1.7394078969955444, + "learning_rate": 0.00019341162874835326, + "loss": 1.513, + "step": 3985 + }, + { + "epoch": 0.14274714845918313, + "grad_norm": 1.4850836992263794, + "learning_rate": 0.00019340748763366152, + "loss": 1.2074, + "step": 3986 + }, + { + "epoch": 0.14278296058875142, + "grad_norm": 1.5210654735565186, + "learning_rate": 0.00019340334526229253, + "loss": 1.2825, + "step": 3987 + }, + { + "epoch": 0.1428187727183197, + "grad_norm": 1.836186408996582, + "learning_rate": 0.00019339920163430202, + "loss": 1.2825, + "step": 3988 + }, + { + "epoch": 0.142854584847888, + "grad_norm": 2.853250503540039, + "learning_rate": 0.0001933950567497457, + "loss": 1.3269, + "step": 3989 + }, + { + "epoch": 0.14289039697745626, + "grad_norm": 2.0440897941589355, + "learning_rate": 0.0001933909106086794, + "loss": 1.1824, + "step": 3990 + }, + { + "epoch": 0.14292620910702455, + "grad_norm": 1.6151930093765259, + "learning_rate": 0.00019338676321115883, + "loss": 1.1623, + "step": 3991 + }, + { + "epoch": 0.14296202123659282, + "grad_norm": 1.3916709423065186, + "learning_rate": 0.00019338261455723984, + "loss": 1.061, + "step": 3992 + }, + { + "epoch": 0.14299783336616112, + "grad_norm": 1.6421949863433838, + "learning_rate": 0.00019337846464697825, + "loss": 1.0378, + "step": 3993 + }, + { + "epoch": 0.14303364549572942, + "grad_norm": 1.4759881496429443, + "learning_rate": 0.00019337431348042983, + "loss": 1.1212, + "step": 3994 + }, + { + "epoch": 0.14306945762529769, + "grad_norm": 1.6444889307022095, + "learning_rate": 0.00019337016105765048, + "loss": 1.1022, + "step": 3995 + }, + { + "epoch": 0.14310526975486598, + "grad_norm": 1.2006852626800537, + "learning_rate": 0.00019336600737869603, + "loss": 1.3664, + "step": 3996 + }, + { + "epoch": 0.14314108188443425, + "grad_norm": 1.5304769277572632, + "learning_rate": 0.00019336185244362244, + "loss": 1.084, + "step": 3997 + }, + { + "epoch": 0.14317689401400255, + "grad_norm": 1.371273159980774, + "learning_rate": 0.0001933576962524855, + "loss": 0.9993, + "step": 3998 + }, + { + "epoch": 0.14321270614357082, + "grad_norm": 1.434452772140503, + "learning_rate": 0.0001933535388053412, + "loss": 1.1286, + "step": 3999 + }, + { + "epoch": 0.1432485182731391, + "grad_norm": 1.77420175075531, + "learning_rate": 0.00019334938010224546, + "loss": 1.2898, + "step": 4000 + }, + { + "epoch": 0.1432843304027074, + "grad_norm": 1.4724403619766235, + "learning_rate": 0.0001933452201432542, + "loss": 1.3327, + "step": 4001 + }, + { + "epoch": 0.14332014253227568, + "grad_norm": 1.5364574193954468, + "learning_rate": 0.00019334105892842342, + "loss": 1.1841, + "step": 4002 + }, + { + "epoch": 0.14335595466184398, + "grad_norm": 1.524446964263916, + "learning_rate": 0.00019333689645780912, + "loss": 1.2694, + "step": 4003 + }, + { + "epoch": 0.14339176679141225, + "grad_norm": 1.5650132894515991, + "learning_rate": 0.00019333273273146721, + "loss": 1.1902, + "step": 4004 + }, + { + "epoch": 0.14342757892098054, + "grad_norm": 1.8873403072357178, + "learning_rate": 0.00019332856774945383, + "loss": 1.268, + "step": 4005 + }, + { + "epoch": 0.1434633910505488, + "grad_norm": 2.0836634635925293, + "learning_rate": 0.00019332440151182493, + "loss": 1.0108, + "step": 4006 + }, + { + "epoch": 0.1434992031801171, + "grad_norm": 1.448052167892456, + "learning_rate": 0.00019332023401863658, + "loss": 1.0407, + "step": 4007 + }, + { + "epoch": 0.1435350153096854, + "grad_norm": 1.895379900932312, + "learning_rate": 0.00019331606526994488, + "loss": 1.1866, + "step": 4008 + }, + { + "epoch": 0.14357082743925367, + "grad_norm": 2.128133773803711, + "learning_rate": 0.0001933118952658059, + "loss": 1.1979, + "step": 4009 + }, + { + "epoch": 0.14360663956882197, + "grad_norm": 2.0220980644226074, + "learning_rate": 0.00019330772400627573, + "loss": 0.9256, + "step": 4010 + }, + { + "epoch": 0.14364245169839024, + "grad_norm": 1.7873870134353638, + "learning_rate": 0.00019330355149141046, + "loss": 1.2618, + "step": 4011 + }, + { + "epoch": 0.14367826382795854, + "grad_norm": 1.8645596504211426, + "learning_rate": 0.00019329937772126626, + "loss": 1.4238, + "step": 4012 + }, + { + "epoch": 0.1437140759575268, + "grad_norm": 2.0410170555114746, + "learning_rate": 0.0001932952026958993, + "loss": 1.1469, + "step": 4013 + }, + { + "epoch": 0.1437498880870951, + "grad_norm": 1.5216008424758911, + "learning_rate": 0.00019329102641536575, + "loss": 1.0865, + "step": 4014 + }, + { + "epoch": 0.14378570021666337, + "grad_norm": 1.6552379131317139, + "learning_rate": 0.00019328684887972173, + "loss": 1.211, + "step": 4015 + }, + { + "epoch": 0.14382151234623167, + "grad_norm": 2.09911847114563, + "learning_rate": 0.00019328267008902352, + "loss": 1.2462, + "step": 4016 + }, + { + "epoch": 0.14385732447579996, + "grad_norm": 2.0293045043945312, + "learning_rate": 0.00019327849004332728, + "loss": 1.3651, + "step": 4017 + }, + { + "epoch": 0.14389313660536823, + "grad_norm": 1.8108526468276978, + "learning_rate": 0.0001932743087426893, + "loss": 1.1908, + "step": 4018 + }, + { + "epoch": 0.14392894873493653, + "grad_norm": 1.5499550104141235, + "learning_rate": 0.00019327012618716583, + "loss": 1.1615, + "step": 4019 + }, + { + "epoch": 0.1439647608645048, + "grad_norm": 1.839896321296692, + "learning_rate": 0.00019326594237681311, + "loss": 1.4794, + "step": 4020 + }, + { + "epoch": 0.1440005729940731, + "grad_norm": 1.7744227647781372, + "learning_rate": 0.00019326175731168742, + "loss": 1.3646, + "step": 4021 + }, + { + "epoch": 0.14403638512364136, + "grad_norm": 1.4854953289031982, + "learning_rate": 0.00019325757099184507, + "loss": 1.01, + "step": 4022 + }, + { + "epoch": 0.14407219725320966, + "grad_norm": 1.7854678630828857, + "learning_rate": 0.00019325338341734245, + "loss": 1.2597, + "step": 4023 + }, + { + "epoch": 0.14410800938277796, + "grad_norm": 1.2821274995803833, + "learning_rate": 0.00019324919458823582, + "loss": 1.211, + "step": 4024 + }, + { + "epoch": 0.14414382151234623, + "grad_norm": 2.193368911743164, + "learning_rate": 0.00019324500450458153, + "loss": 1.2996, + "step": 4025 + }, + { + "epoch": 0.14417963364191452, + "grad_norm": 1.8708676099777222, + "learning_rate": 0.000193240813166436, + "loss": 1.2216, + "step": 4026 + }, + { + "epoch": 0.1442154457714828, + "grad_norm": 2.2283935546875, + "learning_rate": 0.0001932366205738556, + "loss": 1.1224, + "step": 4027 + }, + { + "epoch": 0.1442512579010511, + "grad_norm": 1.7700031995773315, + "learning_rate": 0.00019323242672689676, + "loss": 1.183, + "step": 4028 + }, + { + "epoch": 0.14428707003061936, + "grad_norm": 1.3877640962600708, + "learning_rate": 0.00019322823162561586, + "loss": 1.2685, + "step": 4029 + }, + { + "epoch": 0.14432288216018765, + "grad_norm": 1.803672194480896, + "learning_rate": 0.00019322403527006937, + "loss": 1.1844, + "step": 4030 + }, + { + "epoch": 0.14435869428975595, + "grad_norm": 1.5099321603775024, + "learning_rate": 0.00019321983766031373, + "loss": 1.4153, + "step": 4031 + }, + { + "epoch": 0.14439450641932422, + "grad_norm": 1.5434517860412598, + "learning_rate": 0.00019321563879640542, + "loss": 1.0842, + "step": 4032 + }, + { + "epoch": 0.14443031854889252, + "grad_norm": 1.7550690174102783, + "learning_rate": 0.00019321143867840091, + "loss": 1.1084, + "step": 4033 + }, + { + "epoch": 0.14446613067846079, + "grad_norm": 1.3539574146270752, + "learning_rate": 0.00019320723730635676, + "loss": 1.3951, + "step": 4034 + }, + { + "epoch": 0.14450194280802908, + "grad_norm": 2.4964685440063477, + "learning_rate": 0.00019320303468032944, + "loss": 1.3153, + "step": 4035 + }, + { + "epoch": 0.14453775493759735, + "grad_norm": 1.511718511581421, + "learning_rate": 0.00019319883080037552, + "loss": 1.3124, + "step": 4036 + }, + { + "epoch": 0.14457356706716565, + "grad_norm": 1.1952182054519653, + "learning_rate": 0.00019319462566655155, + "loss": 1.2906, + "step": 4037 + }, + { + "epoch": 0.14460937919673394, + "grad_norm": 1.4950565099716187, + "learning_rate": 0.0001931904192789141, + "loss": 1.2451, + "step": 4038 + }, + { + "epoch": 0.1446451913263022, + "grad_norm": 1.7570362091064453, + "learning_rate": 0.00019318621163751974, + "loss": 1.1662, + "step": 4039 + }, + { + "epoch": 0.1446810034558705, + "grad_norm": 1.6255706548690796, + "learning_rate": 0.00019318200274242515, + "loss": 1.3747, + "step": 4040 + }, + { + "epoch": 0.14471681558543878, + "grad_norm": 1.5378774404525757, + "learning_rate": 0.0001931777925936869, + "loss": 0.9509, + "step": 4041 + }, + { + "epoch": 0.14475262771500708, + "grad_norm": 1.4056363105773926, + "learning_rate": 0.00019317358119136163, + "loss": 1.2054, + "step": 4042 + }, + { + "epoch": 0.14478843984457535, + "grad_norm": 2.425856590270996, + "learning_rate": 0.000193169368535506, + "loss": 1.287, + "step": 4043 + }, + { + "epoch": 0.14482425197414364, + "grad_norm": 2.6064987182617188, + "learning_rate": 0.00019316515462617672, + "loss": 1.4229, + "step": 4044 + }, + { + "epoch": 0.14486006410371194, + "grad_norm": 1.973889708518982, + "learning_rate": 0.00019316093946343044, + "loss": 0.9998, + "step": 4045 + }, + { + "epoch": 0.1448958762332802, + "grad_norm": 1.9092237949371338, + "learning_rate": 0.00019315672304732388, + "loss": 1.0593, + "step": 4046 + }, + { + "epoch": 0.1449316883628485, + "grad_norm": 1.8676931858062744, + "learning_rate": 0.0001931525053779138, + "loss": 1.2113, + "step": 4047 + }, + { + "epoch": 0.14496750049241677, + "grad_norm": 1.6464760303497314, + "learning_rate": 0.00019314828645525692, + "loss": 1.0963, + "step": 4048 + }, + { + "epoch": 0.14500331262198507, + "grad_norm": 1.6227872371673584, + "learning_rate": 0.00019314406627940996, + "loss": 1.0851, + "step": 4049 + }, + { + "epoch": 0.14503912475155334, + "grad_norm": 2.4113311767578125, + "learning_rate": 0.00019313984485042976, + "loss": 1.2757, + "step": 4050 + }, + { + "epoch": 0.14507493688112164, + "grad_norm": 1.7336256504058838, + "learning_rate": 0.0001931356221683731, + "loss": 1.3248, + "step": 4051 + }, + { + "epoch": 0.14511074901068993, + "grad_norm": 1.9786752462387085, + "learning_rate": 0.00019313139823329677, + "loss": 1.1367, + "step": 4052 + }, + { + "epoch": 0.1451465611402582, + "grad_norm": 1.3353698253631592, + "learning_rate": 0.00019312717304525762, + "loss": 1.1371, + "step": 4053 + }, + { + "epoch": 0.1451823732698265, + "grad_norm": 2.596587896347046, + "learning_rate": 0.00019312294660431246, + "loss": 1.3797, + "step": 4054 + }, + { + "epoch": 0.14521818539939477, + "grad_norm": 1.432083010673523, + "learning_rate": 0.00019311871891051818, + "loss": 1.1582, + "step": 4055 + }, + { + "epoch": 0.14525399752896306, + "grad_norm": 1.7029708623886108, + "learning_rate": 0.00019311448996393163, + "loss": 1.1488, + "step": 4056 + }, + { + "epoch": 0.14528980965853133, + "grad_norm": 1.8351460695266724, + "learning_rate": 0.00019311025976460978, + "loss": 1.1466, + "step": 4057 + }, + { + "epoch": 0.14532562178809963, + "grad_norm": 1.9187780618667603, + "learning_rate": 0.00019310602831260944, + "loss": 1.3065, + "step": 4058 + }, + { + "epoch": 0.14536143391766793, + "grad_norm": 1.702524185180664, + "learning_rate": 0.0001931017956079876, + "loss": 1.3776, + "step": 4059 + }, + { + "epoch": 0.1453972460472362, + "grad_norm": 1.6843523979187012, + "learning_rate": 0.0001930975616508012, + "loss": 1.2611, + "step": 4060 + }, + { + "epoch": 0.1454330581768045, + "grad_norm": 2.058788776397705, + "learning_rate": 0.00019309332644110722, + "loss": 1.5156, + "step": 4061 + }, + { + "epoch": 0.14546887030637276, + "grad_norm": 1.9515049457550049, + "learning_rate": 0.0001930890899789626, + "loss": 1.0876, + "step": 4062 + }, + { + "epoch": 0.14550468243594106, + "grad_norm": 1.6328415870666504, + "learning_rate": 0.0001930848522644243, + "loss": 1.2591, + "step": 4063 + }, + { + "epoch": 0.14554049456550933, + "grad_norm": 1.487871766090393, + "learning_rate": 0.00019308061329754942, + "loss": 1.297, + "step": 4064 + }, + { + "epoch": 0.14557630669507762, + "grad_norm": 1.4730297327041626, + "learning_rate": 0.00019307637307839498, + "loss": 1.1845, + "step": 4065 + }, + { + "epoch": 0.14561211882464592, + "grad_norm": 2.0825657844543457, + "learning_rate": 0.00019307213160701798, + "loss": 1.1735, + "step": 4066 + }, + { + "epoch": 0.1456479309542142, + "grad_norm": 1.5830317735671997, + "learning_rate": 0.0001930678888834755, + "loss": 1.0889, + "step": 4067 + }, + { + "epoch": 0.14568374308378249, + "grad_norm": 1.4048733711242676, + "learning_rate": 0.00019306364490782462, + "loss": 1.2775, + "step": 4068 + }, + { + "epoch": 0.14571955521335075, + "grad_norm": 1.9784492254257202, + "learning_rate": 0.00019305939968012245, + "loss": 1.2397, + "step": 4069 + }, + { + "epoch": 0.14575536734291905, + "grad_norm": 1.5593181848526, + "learning_rate": 0.00019305515320042611, + "loss": 1.1975, + "step": 4070 + }, + { + "epoch": 0.14579117947248732, + "grad_norm": 1.5558393001556396, + "learning_rate": 0.00019305090546879267, + "loss": 1.1802, + "step": 4071 + }, + { + "epoch": 0.14582699160205562, + "grad_norm": 1.6281818151474, + "learning_rate": 0.00019304665648527935, + "loss": 1.0701, + "step": 4072 + }, + { + "epoch": 0.1458628037316239, + "grad_norm": 1.87656569480896, + "learning_rate": 0.00019304240624994328, + "loss": 1.2839, + "step": 4073 + }, + { + "epoch": 0.14589861586119218, + "grad_norm": 2.0323493480682373, + "learning_rate": 0.00019303815476284168, + "loss": 1.0714, + "step": 4074 + }, + { + "epoch": 0.14593442799076048, + "grad_norm": 1.8592158555984497, + "learning_rate": 0.0001930339020240317, + "loss": 1.2558, + "step": 4075 + }, + { + "epoch": 0.14597024012032875, + "grad_norm": 1.7381867170333862, + "learning_rate": 0.00019302964803357057, + "loss": 1.3733, + "step": 4076 + }, + { + "epoch": 0.14600605224989704, + "grad_norm": 1.5722525119781494, + "learning_rate": 0.00019302539279151553, + "loss": 1.1642, + "step": 4077 + }, + { + "epoch": 0.1460418643794653, + "grad_norm": 1.801121711730957, + "learning_rate": 0.00019302113629792383, + "loss": 0.9068, + "step": 4078 + }, + { + "epoch": 0.1460776765090336, + "grad_norm": 1.955649733543396, + "learning_rate": 0.0001930168785528527, + "loss": 1.2189, + "step": 4079 + }, + { + "epoch": 0.1461134886386019, + "grad_norm": 1.5312422513961792, + "learning_rate": 0.00019301261955635948, + "loss": 1.4061, + "step": 4080 + }, + { + "epoch": 0.14614930076817018, + "grad_norm": 1.4050699472427368, + "learning_rate": 0.00019300835930850143, + "loss": 1.0867, + "step": 4081 + }, + { + "epoch": 0.14618511289773847, + "grad_norm": 1.592042088508606, + "learning_rate": 0.0001930040978093359, + "loss": 1.108, + "step": 4082 + }, + { + "epoch": 0.14622092502730674, + "grad_norm": 1.666394591331482, + "learning_rate": 0.00019299983505892016, + "loss": 1.1015, + "step": 4083 + }, + { + "epoch": 0.14625673715687504, + "grad_norm": 2.1955199241638184, + "learning_rate": 0.00019299557105731166, + "loss": 1.0783, + "step": 4084 + }, + { + "epoch": 0.1462925492864433, + "grad_norm": 1.3652715682983398, + "learning_rate": 0.00019299130580456765, + "loss": 1.09, + "step": 4085 + }, + { + "epoch": 0.1463283614160116, + "grad_norm": 1.5289510488510132, + "learning_rate": 0.0001929870393007456, + "loss": 1.3031, + "step": 4086 + }, + { + "epoch": 0.1463641735455799, + "grad_norm": 2.373441696166992, + "learning_rate": 0.00019298277154590284, + "loss": 1.0522, + "step": 4087 + }, + { + "epoch": 0.14639998567514817, + "grad_norm": 1.9246727228164673, + "learning_rate": 0.0001929785025400969, + "loss": 1.1783, + "step": 4088 + }, + { + "epoch": 0.14643579780471647, + "grad_norm": 1.9228545427322388, + "learning_rate": 0.0001929742322833851, + "loss": 1.1695, + "step": 4089 + }, + { + "epoch": 0.14647160993428474, + "grad_norm": 1.728043794631958, + "learning_rate": 0.00019296996077582492, + "loss": 1.0831, + "step": 4090 + }, + { + "epoch": 0.14650742206385303, + "grad_norm": 1.9132351875305176, + "learning_rate": 0.00019296568801747385, + "loss": 1.3141, + "step": 4091 + }, + { + "epoch": 0.1465432341934213, + "grad_norm": 1.962683916091919, + "learning_rate": 0.00019296141400838938, + "loss": 1.2352, + "step": 4092 + }, + { + "epoch": 0.1465790463229896, + "grad_norm": 1.759524941444397, + "learning_rate": 0.00019295713874862896, + "loss": 1.0751, + "step": 4093 + }, + { + "epoch": 0.1466148584525579, + "grad_norm": 1.4407248497009277, + "learning_rate": 0.0001929528622382502, + "loss": 1.1641, + "step": 4094 + }, + { + "epoch": 0.14665067058212616, + "grad_norm": 1.6652607917785645, + "learning_rate": 0.00019294858447731054, + "loss": 1.1182, + "step": 4095 + }, + { + "epoch": 0.14668648271169446, + "grad_norm": 1.734060525894165, + "learning_rate": 0.0001929443054658676, + "loss": 1.3293, + "step": 4096 + }, + { + "epoch": 0.14672229484126273, + "grad_norm": 1.8977422714233398, + "learning_rate": 0.00019294002520397888, + "loss": 1.2851, + "step": 4097 + }, + { + "epoch": 0.14675810697083103, + "grad_norm": 2.164356231689453, + "learning_rate": 0.000192935743691702, + "loss": 1.2186, + "step": 4098 + }, + { + "epoch": 0.1467939191003993, + "grad_norm": 1.5437357425689697, + "learning_rate": 0.00019293146092909462, + "loss": 1.201, + "step": 4099 + }, + { + "epoch": 0.1468297312299676, + "grad_norm": 1.9638350009918213, + "learning_rate": 0.00019292717691621428, + "loss": 1.2618, + "step": 4100 + }, + { + "epoch": 0.1468655433595359, + "grad_norm": 1.5040152072906494, + "learning_rate": 0.00019292289165311863, + "loss": 1.182, + "step": 4101 + }, + { + "epoch": 0.14690135548910416, + "grad_norm": 1.5108528137207031, + "learning_rate": 0.00019291860513986534, + "loss": 1.2889, + "step": 4102 + }, + { + "epoch": 0.14693716761867245, + "grad_norm": 1.7375773191452026, + "learning_rate": 0.0001929143173765121, + "loss": 1.2748, + "step": 4103 + }, + { + "epoch": 0.14697297974824072, + "grad_norm": 1.4837801456451416, + "learning_rate": 0.00019291002836311654, + "loss": 1.2531, + "step": 4104 + }, + { + "epoch": 0.14700879187780902, + "grad_norm": 2.0652432441711426, + "learning_rate": 0.0001929057380997364, + "loss": 1.0847, + "step": 4105 + }, + { + "epoch": 0.1470446040073773, + "grad_norm": 1.534096360206604, + "learning_rate": 0.0001929014465864294, + "loss": 1.2267, + "step": 4106 + }, + { + "epoch": 0.14708041613694559, + "grad_norm": 1.355660080909729, + "learning_rate": 0.00019289715382325327, + "loss": 1.102, + "step": 4107 + }, + { + "epoch": 0.14711622826651388, + "grad_norm": 1.876499891281128, + "learning_rate": 0.00019289285981026577, + "loss": 1.459, + "step": 4108 + }, + { + "epoch": 0.14715204039608215, + "grad_norm": 1.5953261852264404, + "learning_rate": 0.00019288856454752464, + "loss": 1.1213, + "step": 4109 + }, + { + "epoch": 0.14718785252565045, + "grad_norm": 1.3306382894515991, + "learning_rate": 0.0001928842680350877, + "loss": 1.1867, + "step": 4110 + }, + { + "epoch": 0.14722366465521872, + "grad_norm": 1.5990322828292847, + "learning_rate": 0.00019287997027301275, + "loss": 1.2631, + "step": 4111 + }, + { + "epoch": 0.147259476784787, + "grad_norm": 1.6025015115737915, + "learning_rate": 0.00019287567126135763, + "loss": 1.0734, + "step": 4112 + }, + { + "epoch": 0.14729528891435528, + "grad_norm": 1.5528345108032227, + "learning_rate": 0.00019287137100018013, + "loss": 1.4328, + "step": 4113 + }, + { + "epoch": 0.14733110104392358, + "grad_norm": 1.8978655338287354, + "learning_rate": 0.00019286706948953812, + "loss": 1.4337, + "step": 4114 + }, + { + "epoch": 0.14736691317349185, + "grad_norm": 1.238855004310608, + "learning_rate": 0.00019286276672948952, + "loss": 1.2412, + "step": 4115 + }, + { + "epoch": 0.14740272530306014, + "grad_norm": 2.084327220916748, + "learning_rate": 0.00019285846272009213, + "loss": 1.1953, + "step": 4116 + }, + { + "epoch": 0.14743853743262844, + "grad_norm": 1.841536521911621, + "learning_rate": 0.00019285415746140392, + "loss": 1.2953, + "step": 4117 + }, + { + "epoch": 0.1474743495621967, + "grad_norm": 1.635081171989441, + "learning_rate": 0.0001928498509534828, + "loss": 1.2056, + "step": 4118 + }, + { + "epoch": 0.147510161691765, + "grad_norm": 1.2969706058502197, + "learning_rate": 0.0001928455431963867, + "loss": 1.1577, + "step": 4119 + }, + { + "epoch": 0.14754597382133328, + "grad_norm": 1.7435299158096313, + "learning_rate": 0.00019284123419017357, + "loss": 1.4046, + "step": 4120 + }, + { + "epoch": 0.14758178595090157, + "grad_norm": 2.1329898834228516, + "learning_rate": 0.0001928369239349014, + "loss": 1.3374, + "step": 4121 + }, + { + "epoch": 0.14761759808046984, + "grad_norm": 1.7307273149490356, + "learning_rate": 0.00019283261243062817, + "loss": 1.2584, + "step": 4122 + }, + { + "epoch": 0.14765341021003814, + "grad_norm": 1.3270539045333862, + "learning_rate": 0.0001928282996774119, + "loss": 1.1668, + "step": 4123 + }, + { + "epoch": 0.14768922233960644, + "grad_norm": 1.6562221050262451, + "learning_rate": 0.00019282398567531058, + "loss": 1.2097, + "step": 4124 + }, + { + "epoch": 0.1477250344691747, + "grad_norm": 1.9372332096099854, + "learning_rate": 0.00019281967042438227, + "loss": 1.188, + "step": 4125 + }, + { + "epoch": 0.147760846598743, + "grad_norm": 1.696405053138733, + "learning_rate": 0.000192815353924685, + "loss": 1.1837, + "step": 4126 + }, + { + "epoch": 0.14779665872831127, + "grad_norm": 1.3839404582977295, + "learning_rate": 0.0001928110361762769, + "loss": 1.1905, + "step": 4127 + }, + { + "epoch": 0.14783247085787957, + "grad_norm": 1.4105809926986694, + "learning_rate": 0.000192806717179216, + "loss": 1.2539, + "step": 4128 + }, + { + "epoch": 0.14786828298744784, + "grad_norm": 1.3970357179641724, + "learning_rate": 0.00019280239693356048, + "loss": 1.149, + "step": 4129 + }, + { + "epoch": 0.14790409511701613, + "grad_norm": 1.4507982730865479, + "learning_rate": 0.0001927980754393684, + "loss": 1.2883, + "step": 4130 + }, + { + "epoch": 0.14793990724658443, + "grad_norm": 2.0798254013061523, + "learning_rate": 0.00019279375269669785, + "loss": 1.1635, + "step": 4131 + }, + { + "epoch": 0.1479757193761527, + "grad_norm": 1.85867440700531, + "learning_rate": 0.00019278942870560713, + "loss": 1.3422, + "step": 4132 + }, + { + "epoch": 0.148011531505721, + "grad_norm": 1.441142201423645, + "learning_rate": 0.0001927851034661543, + "loss": 1.0392, + "step": 4133 + }, + { + "epoch": 0.14804734363528926, + "grad_norm": 1.4745756387710571, + "learning_rate": 0.0001927807769783976, + "loss": 1.0557, + "step": 4134 + }, + { + "epoch": 0.14808315576485756, + "grad_norm": 1.629359483718872, + "learning_rate": 0.0001927764492423952, + "loss": 1.1234, + "step": 4135 + }, + { + "epoch": 0.14811896789442583, + "grad_norm": 1.480472207069397, + "learning_rate": 0.0001927721202582054, + "loss": 1.1148, + "step": 4136 + }, + { + "epoch": 0.14815478002399413, + "grad_norm": 1.4933662414550781, + "learning_rate": 0.00019276779002588634, + "loss": 1.2085, + "step": 4137 + }, + { + "epoch": 0.14819059215356242, + "grad_norm": 1.186118721961975, + "learning_rate": 0.00019276345854549634, + "loss": 1.2872, + "step": 4138 + }, + { + "epoch": 0.1482264042831307, + "grad_norm": 1.623254418373108, + "learning_rate": 0.00019275912581709367, + "loss": 1.3303, + "step": 4139 + }, + { + "epoch": 0.148262216412699, + "grad_norm": 1.6673471927642822, + "learning_rate": 0.0001927547918407366, + "loss": 1.2462, + "step": 4140 + }, + { + "epoch": 0.14829802854226726, + "grad_norm": 1.5214701890945435, + "learning_rate": 0.00019275045661648344, + "loss": 1.3065, + "step": 4141 + }, + { + "epoch": 0.14833384067183555, + "grad_norm": 1.6444493532180786, + "learning_rate": 0.00019274612014439258, + "loss": 1.2762, + "step": 4142 + }, + { + "epoch": 0.14836965280140382, + "grad_norm": 2.308685541152954, + "learning_rate": 0.00019274178242452224, + "loss": 1.259, + "step": 4143 + }, + { + "epoch": 0.14840546493097212, + "grad_norm": 2.2085111141204834, + "learning_rate": 0.0001927374434569309, + "loss": 1.1563, + "step": 4144 + }, + { + "epoch": 0.14844127706054042, + "grad_norm": 1.7395786046981812, + "learning_rate": 0.00019273310324167687, + "loss": 1.3773, + "step": 4145 + }, + { + "epoch": 0.14847708919010869, + "grad_norm": 1.4093979597091675, + "learning_rate": 0.00019272876177881852, + "loss": 0.9985, + "step": 4146 + }, + { + "epoch": 0.14851290131967698, + "grad_norm": 2.2475574016571045, + "learning_rate": 0.00019272441906841432, + "loss": 1.1706, + "step": 4147 + }, + { + "epoch": 0.14854871344924525, + "grad_norm": 1.583628535270691, + "learning_rate": 0.00019272007511052266, + "loss": 1.3162, + "step": 4148 + }, + { + "epoch": 0.14858452557881355, + "grad_norm": 1.5448541641235352, + "learning_rate": 0.000192715729905202, + "loss": 1.2954, + "step": 4149 + }, + { + "epoch": 0.14862033770838182, + "grad_norm": 1.524349331855774, + "learning_rate": 0.00019271138345251077, + "loss": 1.3254, + "step": 4150 + }, + { + "epoch": 0.1486561498379501, + "grad_norm": 1.6890850067138672, + "learning_rate": 0.00019270703575250748, + "loss": 1.1823, + "step": 4151 + }, + { + "epoch": 0.1486919619675184, + "grad_norm": 2.375861644744873, + "learning_rate": 0.0001927026868052506, + "loss": 1.1358, + "step": 4152 + }, + { + "epoch": 0.14872777409708668, + "grad_norm": 1.5728029012680054, + "learning_rate": 0.00019269833661079866, + "loss": 1.2629, + "step": 4153 + }, + { + "epoch": 0.14876358622665498, + "grad_norm": 2.304456949234009, + "learning_rate": 0.00019269398516921015, + "loss": 1.5021, + "step": 4154 + }, + { + "epoch": 0.14879939835622324, + "grad_norm": 2.035371780395508, + "learning_rate": 0.00019268963248054367, + "loss": 1.3747, + "step": 4155 + }, + { + "epoch": 0.14883521048579154, + "grad_norm": 1.5691063404083252, + "learning_rate": 0.00019268527854485773, + "loss": 1.1943, + "step": 4156 + }, + { + "epoch": 0.1488710226153598, + "grad_norm": 1.6923892498016357, + "learning_rate": 0.0001926809233622109, + "loss": 1.4695, + "step": 4157 + }, + { + "epoch": 0.1489068347449281, + "grad_norm": 2.295781373977661, + "learning_rate": 0.0001926765669326618, + "loss": 1.1328, + "step": 4158 + }, + { + "epoch": 0.1489426468744964, + "grad_norm": 1.9550100564956665, + "learning_rate": 0.00019267220925626907, + "loss": 1.294, + "step": 4159 + }, + { + "epoch": 0.14897845900406467, + "grad_norm": 1.7108453512191772, + "learning_rate": 0.00019266785033309128, + "loss": 1.044, + "step": 4160 + }, + { + "epoch": 0.14901427113363297, + "grad_norm": 1.8903536796569824, + "learning_rate": 0.0001926634901631871, + "loss": 1.2427, + "step": 4161 + }, + { + "epoch": 0.14905008326320124, + "grad_norm": 2.1279799938201904, + "learning_rate": 0.00019265912874661515, + "loss": 1.0497, + "step": 4162 + }, + { + "epoch": 0.14908589539276954, + "grad_norm": 1.1809848546981812, + "learning_rate": 0.0001926547660834342, + "loss": 1.2355, + "step": 4163 + }, + { + "epoch": 0.1491217075223378, + "grad_norm": 1.948665738105774, + "learning_rate": 0.00019265040217370286, + "loss": 1.0522, + "step": 4164 + }, + { + "epoch": 0.1491575196519061, + "grad_norm": 1.920639157295227, + "learning_rate": 0.0001926460370174799, + "loss": 1.096, + "step": 4165 + }, + { + "epoch": 0.1491933317814744, + "grad_norm": 1.7080215215682983, + "learning_rate": 0.00019264167061482397, + "loss": 1.1299, + "step": 4166 + }, + { + "epoch": 0.14922914391104267, + "grad_norm": 1.554028034210205, + "learning_rate": 0.0001926373029657939, + "loss": 1.1915, + "step": 4167 + }, + { + "epoch": 0.14926495604061096, + "grad_norm": 1.9993752241134644, + "learning_rate": 0.00019263293407044838, + "loss": 1.3134, + "step": 4168 + }, + { + "epoch": 0.14930076817017923, + "grad_norm": 1.726814866065979, + "learning_rate": 0.00019262856392884625, + "loss": 1.3521, + "step": 4169 + }, + { + "epoch": 0.14933658029974753, + "grad_norm": 1.8719745874404907, + "learning_rate": 0.00019262419254104628, + "loss": 1.3725, + "step": 4170 + }, + { + "epoch": 0.1493723924293158, + "grad_norm": 1.621699571609497, + "learning_rate": 0.00019261981990710723, + "loss": 1.2489, + "step": 4171 + }, + { + "epoch": 0.1494082045588841, + "grad_norm": 1.443272590637207, + "learning_rate": 0.000192615446027088, + "loss": 1.1615, + "step": 4172 + }, + { + "epoch": 0.1494440166884524, + "grad_norm": 1.399842619895935, + "learning_rate": 0.00019261107090104743, + "loss": 1.1822, + "step": 4173 + }, + { + "epoch": 0.14947982881802066, + "grad_norm": 1.4572347402572632, + "learning_rate": 0.00019260669452904433, + "loss": 1.0016, + "step": 4174 + }, + { + "epoch": 0.14951564094758896, + "grad_norm": 1.3040486574172974, + "learning_rate": 0.00019260231691113763, + "loss": 1.3705, + "step": 4175 + }, + { + "epoch": 0.14955145307715723, + "grad_norm": 1.5427151918411255, + "learning_rate": 0.00019259793804738619, + "loss": 1.3059, + "step": 4176 + }, + { + "epoch": 0.14958726520672552, + "grad_norm": 1.4549710750579834, + "learning_rate": 0.00019259355793784893, + "loss": 1.2545, + "step": 4177 + }, + { + "epoch": 0.1496230773362938, + "grad_norm": 1.5458165407180786, + "learning_rate": 0.00019258917658258483, + "loss": 1.2855, + "step": 4178 + }, + { + "epoch": 0.1496588894658621, + "grad_norm": 1.4301310777664185, + "learning_rate": 0.00019258479398165273, + "loss": 1.3034, + "step": 4179 + }, + { + "epoch": 0.14969470159543039, + "grad_norm": 2.024251937866211, + "learning_rate": 0.00019258041013511167, + "loss": 1.1861, + "step": 4180 + }, + { + "epoch": 0.14973051372499865, + "grad_norm": 2.281102418899536, + "learning_rate": 0.00019257602504302063, + "loss": 1.4814, + "step": 4181 + }, + { + "epoch": 0.14976632585456695, + "grad_norm": 2.4223134517669678, + "learning_rate": 0.0001925716387054386, + "loss": 1.2811, + "step": 4182 + }, + { + "epoch": 0.14980213798413522, + "grad_norm": 1.8615962266921997, + "learning_rate": 0.00019256725112242455, + "loss": 1.4714, + "step": 4183 + }, + { + "epoch": 0.14983795011370352, + "grad_norm": 2.7037880420684814, + "learning_rate": 0.00019256286229403754, + "loss": 1.2788, + "step": 4184 + }, + { + "epoch": 0.14987376224327179, + "grad_norm": 1.811698317527771, + "learning_rate": 0.00019255847222033663, + "loss": 1.3298, + "step": 4185 + }, + { + "epoch": 0.14990957437284008, + "grad_norm": 2.0591022968292236, + "learning_rate": 0.00019255408090138086, + "loss": 1.2873, + "step": 4186 + }, + { + "epoch": 0.14994538650240838, + "grad_norm": 1.2529079914093018, + "learning_rate": 0.00019254968833722934, + "loss": 1.18, + "step": 4187 + }, + { + "epoch": 0.14998119863197665, + "grad_norm": 2.1085243225097656, + "learning_rate": 0.0001925452945279411, + "loss": 1.2148, + "step": 4188 + }, + { + "epoch": 0.15001701076154494, + "grad_norm": 1.6984211206436157, + "learning_rate": 0.00019254089947357534, + "loss": 1.1727, + "step": 4189 + }, + { + "epoch": 0.1500528228911132, + "grad_norm": 1.7549830675125122, + "learning_rate": 0.00019253650317419113, + "loss": 1.3639, + "step": 4190 + }, + { + "epoch": 0.1500886350206815, + "grad_norm": 1.861141562461853, + "learning_rate": 0.0001925321056298476, + "loss": 1.0665, + "step": 4191 + }, + { + "epoch": 0.15012444715024978, + "grad_norm": 1.7429007291793823, + "learning_rate": 0.000192527706840604, + "loss": 1.099, + "step": 4192 + }, + { + "epoch": 0.15016025927981808, + "grad_norm": 1.4746580123901367, + "learning_rate": 0.00019252330680651945, + "loss": 1.267, + "step": 4193 + }, + { + "epoch": 0.15019607140938637, + "grad_norm": 1.5567959547042847, + "learning_rate": 0.0001925189055276531, + "loss": 0.9812, + "step": 4194 + }, + { + "epoch": 0.15023188353895464, + "grad_norm": 1.2230925559997559, + "learning_rate": 0.00019251450300406426, + "loss": 1.1123, + "step": 4195 + }, + { + "epoch": 0.15026769566852294, + "grad_norm": 2.209616184234619, + "learning_rate": 0.00019251009923581213, + "loss": 1.3548, + "step": 4196 + }, + { + "epoch": 0.1503035077980912, + "grad_norm": 1.379439353942871, + "learning_rate": 0.0001925056942229559, + "loss": 1.2132, + "step": 4197 + }, + { + "epoch": 0.1503393199276595, + "grad_norm": 1.3142069578170776, + "learning_rate": 0.00019250128796555492, + "loss": 1.2592, + "step": 4198 + }, + { + "epoch": 0.15037513205722777, + "grad_norm": 1.433775544166565, + "learning_rate": 0.0001924968804636684, + "loss": 1.1314, + "step": 4199 + }, + { + "epoch": 0.15041094418679607, + "grad_norm": 2.2010419368743896, + "learning_rate": 0.0001924924717173557, + "loss": 1.1576, + "step": 4200 + }, + { + "epoch": 0.15044675631636437, + "grad_norm": 1.7225579023361206, + "learning_rate": 0.00019248806172667606, + "loss": 1.2346, + "step": 4201 + }, + { + "epoch": 0.15048256844593264, + "grad_norm": 1.6868363618850708, + "learning_rate": 0.00019248365049168888, + "loss": 1.2064, + "step": 4202 + }, + { + "epoch": 0.15051838057550093, + "grad_norm": 1.8711267709732056, + "learning_rate": 0.00019247923801245345, + "loss": 1.1556, + "step": 4203 + }, + { + "epoch": 0.1505541927050692, + "grad_norm": 1.384631633758545, + "learning_rate": 0.0001924748242890292, + "loss": 1.1246, + "step": 4204 + }, + { + "epoch": 0.1505900048346375, + "grad_norm": 1.5793060064315796, + "learning_rate": 0.00019247040932147546, + "loss": 1.2861, + "step": 4205 + }, + { + "epoch": 0.15062581696420577, + "grad_norm": 1.9783775806427002, + "learning_rate": 0.00019246599310985163, + "loss": 1.1975, + "step": 4206 + }, + { + "epoch": 0.15066162909377406, + "grad_norm": 1.4906336069107056, + "learning_rate": 0.0001924615756542171, + "loss": 1.1782, + "step": 4207 + }, + { + "epoch": 0.15069744122334236, + "grad_norm": 2.457226276397705, + "learning_rate": 0.0001924571569546314, + "loss": 1.3327, + "step": 4208 + }, + { + "epoch": 0.15073325335291063, + "grad_norm": 1.4362215995788574, + "learning_rate": 0.00019245273701115387, + "loss": 1.1497, + "step": 4209 + }, + { + "epoch": 0.15076906548247893, + "grad_norm": 1.9636234045028687, + "learning_rate": 0.00019244831582384406, + "loss": 1.0834, + "step": 4210 + }, + { + "epoch": 0.1508048776120472, + "grad_norm": 2.543954372406006, + "learning_rate": 0.0001924438933927614, + "loss": 1.2149, + "step": 4211 + }, + { + "epoch": 0.1508406897416155, + "grad_norm": 1.8596675395965576, + "learning_rate": 0.00019243946971796535, + "loss": 0.9496, + "step": 4212 + }, + { + "epoch": 0.15087650187118376, + "grad_norm": 2.211521863937378, + "learning_rate": 0.00019243504479951552, + "loss": 1.2572, + "step": 4213 + }, + { + "epoch": 0.15091231400075206, + "grad_norm": 1.684637427330017, + "learning_rate": 0.00019243061863747138, + "loss": 1.178, + "step": 4214 + }, + { + "epoch": 0.15094812613032033, + "grad_norm": 1.5638924837112427, + "learning_rate": 0.0001924261912318925, + "loss": 1.3015, + "step": 4215 + }, + { + "epoch": 0.15098393825988862, + "grad_norm": 1.4894062280654907, + "learning_rate": 0.00019242176258283845, + "loss": 1.0318, + "step": 4216 + }, + { + "epoch": 0.15101975038945692, + "grad_norm": 1.611350417137146, + "learning_rate": 0.00019241733269036878, + "loss": 1.3091, + "step": 4217 + }, + { + "epoch": 0.1510555625190252, + "grad_norm": 2.0361859798431396, + "learning_rate": 0.0001924129015545431, + "loss": 1.0792, + "step": 4218 + }, + { + "epoch": 0.15109137464859348, + "grad_norm": 1.4359923601150513, + "learning_rate": 0.00019240846917542107, + "loss": 1.036, + "step": 4219 + }, + { + "epoch": 0.15112718677816175, + "grad_norm": 1.9351422786712646, + "learning_rate": 0.00019240403555306225, + "loss": 1.4444, + "step": 4220 + }, + { + "epoch": 0.15116299890773005, + "grad_norm": 2.0752549171447754, + "learning_rate": 0.00019239960068752633, + "loss": 1.2194, + "step": 4221 + }, + { + "epoch": 0.15119881103729832, + "grad_norm": 1.660125494003296, + "learning_rate": 0.00019239516457887298, + "loss": 1.2749, + "step": 4222 + }, + { + "epoch": 0.15123462316686662, + "grad_norm": 1.7416834831237793, + "learning_rate": 0.00019239072722716186, + "loss": 1.1832, + "step": 4223 + }, + { + "epoch": 0.1512704352964349, + "grad_norm": 1.8140292167663574, + "learning_rate": 0.0001923862886324527, + "loss": 1.2154, + "step": 4224 + }, + { + "epoch": 0.15130624742600318, + "grad_norm": 1.9597752094268799, + "learning_rate": 0.00019238184879480518, + "loss": 1.2147, + "step": 4225 + }, + { + "epoch": 0.15134205955557148, + "grad_norm": 1.546859622001648, + "learning_rate": 0.00019237740771427906, + "loss": 0.96, + "step": 4226 + }, + { + "epoch": 0.15137787168513975, + "grad_norm": 2.4994428157806396, + "learning_rate": 0.00019237296539093408, + "loss": 1.4823, + "step": 4227 + }, + { + "epoch": 0.15141368381470804, + "grad_norm": 2.0119643211364746, + "learning_rate": 0.00019236852182482998, + "loss": 0.9465, + "step": 4228 + }, + { + "epoch": 0.1514494959442763, + "grad_norm": 2.427952289581299, + "learning_rate": 0.0001923640770160266, + "loss": 1.3526, + "step": 4229 + }, + { + "epoch": 0.1514853080738446, + "grad_norm": 1.4372848272323608, + "learning_rate": 0.00019235963096458366, + "loss": 1.3803, + "step": 4230 + }, + { + "epoch": 0.1515211202034129, + "grad_norm": 1.4672868251800537, + "learning_rate": 0.00019235518367056106, + "loss": 1.3684, + "step": 4231 + }, + { + "epoch": 0.15155693233298118, + "grad_norm": 1.3634363412857056, + "learning_rate": 0.0001923507351340186, + "loss": 1.3492, + "step": 4232 + }, + { + "epoch": 0.15159274446254947, + "grad_norm": 1.559186339378357, + "learning_rate": 0.00019234628535501607, + "loss": 1.1962, + "step": 4233 + }, + { + "epoch": 0.15162855659211774, + "grad_norm": 1.1756534576416016, + "learning_rate": 0.00019234183433361344, + "loss": 1.2562, + "step": 4234 + }, + { + "epoch": 0.15166436872168604, + "grad_norm": 1.4915255308151245, + "learning_rate": 0.0001923373820698705, + "loss": 1.1738, + "step": 4235 + }, + { + "epoch": 0.1517001808512543, + "grad_norm": 2.2005698680877686, + "learning_rate": 0.00019233292856384723, + "loss": 1.3093, + "step": 4236 + }, + { + "epoch": 0.1517359929808226, + "grad_norm": 1.5250815153121948, + "learning_rate": 0.00019232847381560347, + "loss": 1.2977, + "step": 4237 + }, + { + "epoch": 0.1517718051103909, + "grad_norm": 1.3498200178146362, + "learning_rate": 0.00019232401782519923, + "loss": 1.2386, + "step": 4238 + }, + { + "epoch": 0.15180761723995917, + "grad_norm": 1.6601605415344238, + "learning_rate": 0.0001923195605926944, + "loss": 1.1499, + "step": 4239 + }, + { + "epoch": 0.15184342936952747, + "grad_norm": 2.390753746032715, + "learning_rate": 0.00019231510211814896, + "loss": 1.191, + "step": 4240 + }, + { + "epoch": 0.15187924149909574, + "grad_norm": 2.64862322807312, + "learning_rate": 0.0001923106424016229, + "loss": 1.4281, + "step": 4241 + }, + { + "epoch": 0.15191505362866403, + "grad_norm": 1.1648635864257812, + "learning_rate": 0.00019230618144317624, + "loss": 1.217, + "step": 4242 + }, + { + "epoch": 0.1519508657582323, + "grad_norm": 1.7684968709945679, + "learning_rate": 0.00019230171924286896, + "loss": 1.1478, + "step": 4243 + }, + { + "epoch": 0.1519866778878006, + "grad_norm": 1.6858289241790771, + "learning_rate": 0.0001922972558007611, + "loss": 1.1837, + "step": 4244 + }, + { + "epoch": 0.1520224900173689, + "grad_norm": 1.3203492164611816, + "learning_rate": 0.00019229279111691272, + "loss": 1.1398, + "step": 4245 + }, + { + "epoch": 0.15205830214693716, + "grad_norm": 1.4210227727890015, + "learning_rate": 0.0001922883251913839, + "loss": 1.1832, + "step": 4246 + }, + { + "epoch": 0.15209411427650546, + "grad_norm": 2.2904062271118164, + "learning_rate": 0.00019228385802423469, + "loss": 1.4535, + "step": 4247 + }, + { + "epoch": 0.15212992640607373, + "grad_norm": 1.7628711462020874, + "learning_rate": 0.0001922793896155252, + "loss": 1.3495, + "step": 4248 + }, + { + "epoch": 0.15216573853564203, + "grad_norm": 1.7661798000335693, + "learning_rate": 0.00019227491996531558, + "loss": 1.2851, + "step": 4249 + }, + { + "epoch": 0.1522015506652103, + "grad_norm": 1.9101345539093018, + "learning_rate": 0.00019227044907366595, + "loss": 0.9741, + "step": 4250 + }, + { + "epoch": 0.1522373627947786, + "grad_norm": 1.5807220935821533, + "learning_rate": 0.00019226597694063638, + "loss": 1.1075, + "step": 4251 + }, + { + "epoch": 0.1522731749243469, + "grad_norm": 1.5753737688064575, + "learning_rate": 0.0001922615035662872, + "loss": 1.1358, + "step": 4252 + }, + { + "epoch": 0.15230898705391516, + "grad_norm": 1.4697372913360596, + "learning_rate": 0.00019225702895067843, + "loss": 1.0638, + "step": 4253 + }, + { + "epoch": 0.15234479918348345, + "grad_norm": 1.537718415260315, + "learning_rate": 0.00019225255309387036, + "loss": 1.2946, + "step": 4254 + }, + { + "epoch": 0.15238061131305172, + "grad_norm": 2.151106834411621, + "learning_rate": 0.00019224807599592318, + "loss": 1.2676, + "step": 4255 + }, + { + "epoch": 0.15241642344262002, + "grad_norm": 1.6313166618347168, + "learning_rate": 0.00019224359765689713, + "loss": 1.405, + "step": 4256 + }, + { + "epoch": 0.1524522355721883, + "grad_norm": 1.524391770362854, + "learning_rate": 0.00019223911807685244, + "loss": 1.1682, + "step": 4257 + }, + { + "epoch": 0.15248804770175658, + "grad_norm": 1.9701108932495117, + "learning_rate": 0.00019223463725584944, + "loss": 1.1719, + "step": 4258 + }, + { + "epoch": 0.15252385983132488, + "grad_norm": 1.5630806684494019, + "learning_rate": 0.00019223015519394834, + "loss": 1.2385, + "step": 4259 + }, + { + "epoch": 0.15255967196089315, + "grad_norm": 1.9739242792129517, + "learning_rate": 0.00019222567189120947, + "loss": 1.0837, + "step": 4260 + }, + { + "epoch": 0.15259548409046145, + "grad_norm": 1.9942901134490967, + "learning_rate": 0.00019222118734769317, + "loss": 1.112, + "step": 4261 + }, + { + "epoch": 0.15263129622002972, + "grad_norm": 2.421816825866699, + "learning_rate": 0.00019221670156345971, + "loss": 1.1807, + "step": 4262 + }, + { + "epoch": 0.152667108349598, + "grad_norm": 1.912240982055664, + "learning_rate": 0.00019221221453856954, + "loss": 1.428, + "step": 4263 + }, + { + "epoch": 0.15270292047916628, + "grad_norm": 1.5314058065414429, + "learning_rate": 0.00019220772627308292, + "loss": 1.1489, + "step": 4264 + }, + { + "epoch": 0.15273873260873458, + "grad_norm": 1.6595377922058105, + "learning_rate": 0.00019220323676706028, + "loss": 1.2596, + "step": 4265 + }, + { + "epoch": 0.15277454473830288, + "grad_norm": 2.7646865844726562, + "learning_rate": 0.00019219874602056204, + "loss": 1.3707, + "step": 4266 + }, + { + "epoch": 0.15281035686787114, + "grad_norm": 2.074577569961548, + "learning_rate": 0.0001921942540336486, + "loss": 1.2308, + "step": 4267 + }, + { + "epoch": 0.15284616899743944, + "grad_norm": 2.0578832626342773, + "learning_rate": 0.00019218976080638043, + "loss": 1.3674, + "step": 4268 + }, + { + "epoch": 0.1528819811270077, + "grad_norm": 1.5946264266967773, + "learning_rate": 0.0001921852663388179, + "loss": 1.2172, + "step": 4269 + }, + { + "epoch": 0.152917793256576, + "grad_norm": 1.7529840469360352, + "learning_rate": 0.0001921807706310215, + "loss": 1.211, + "step": 4270 + }, + { + "epoch": 0.15295360538614428, + "grad_norm": 1.2617487907409668, + "learning_rate": 0.00019217627368305176, + "loss": 1.2876, + "step": 4271 + }, + { + "epoch": 0.15298941751571257, + "grad_norm": 2.0639188289642334, + "learning_rate": 0.0001921717754949692, + "loss": 1.2187, + "step": 4272 + }, + { + "epoch": 0.15302522964528087, + "grad_norm": 1.5765366554260254, + "learning_rate": 0.00019216727606683425, + "loss": 1.1518, + "step": 4273 + }, + { + "epoch": 0.15306104177484914, + "grad_norm": 1.7234505414962769, + "learning_rate": 0.00019216277539870752, + "loss": 1.1161, + "step": 4274 + }, + { + "epoch": 0.15309685390441743, + "grad_norm": 1.3718329668045044, + "learning_rate": 0.00019215827349064948, + "loss": 1.242, + "step": 4275 + }, + { + "epoch": 0.1531326660339857, + "grad_norm": 1.828961730003357, + "learning_rate": 0.00019215377034272074, + "loss": 1.2893, + "step": 4276 + }, + { + "epoch": 0.153168478163554, + "grad_norm": 1.2977741956710815, + "learning_rate": 0.00019214926595498196, + "loss": 1.27, + "step": 4277 + }, + { + "epoch": 0.15320429029312227, + "grad_norm": 1.717916488647461, + "learning_rate": 0.0001921447603274936, + "loss": 1.1821, + "step": 4278 + }, + { + "epoch": 0.15324010242269057, + "grad_norm": 1.5709048509597778, + "learning_rate": 0.0001921402534603164, + "loss": 1.2803, + "step": 4279 + }, + { + "epoch": 0.15327591455225886, + "grad_norm": 1.4489820003509521, + "learning_rate": 0.00019213574535351092, + "loss": 1.2623, + "step": 4280 + }, + { + "epoch": 0.15331172668182713, + "grad_norm": 1.6640560626983643, + "learning_rate": 0.00019213123600713783, + "loss": 1.1939, + "step": 4281 + }, + { + "epoch": 0.15334753881139543, + "grad_norm": 1.517790675163269, + "learning_rate": 0.0001921267254212578, + "loss": 1.1327, + "step": 4282 + }, + { + "epoch": 0.1533833509409637, + "grad_norm": 1.6141036748886108, + "learning_rate": 0.00019212221359593152, + "loss": 1.0067, + "step": 4283 + }, + { + "epoch": 0.153419163070532, + "grad_norm": 2.1419546604156494, + "learning_rate": 0.00019211770053121968, + "loss": 1.2825, + "step": 4284 + }, + { + "epoch": 0.15345497520010026, + "grad_norm": 1.9848809242248535, + "learning_rate": 0.000192113186227183, + "loss": 1.1329, + "step": 4285 + }, + { + "epoch": 0.15349078732966856, + "grad_norm": 2.256805896759033, + "learning_rate": 0.0001921086706838822, + "loss": 1.1236, + "step": 4286 + }, + { + "epoch": 0.15352659945923686, + "grad_norm": 1.612509846687317, + "learning_rate": 0.0001921041539013781, + "loss": 1.263, + "step": 4287 + }, + { + "epoch": 0.15356241158880513, + "grad_norm": 1.4388744831085205, + "learning_rate": 0.00019209963587973138, + "loss": 1.3341, + "step": 4288 + }, + { + "epoch": 0.15359822371837342, + "grad_norm": 2.0712125301361084, + "learning_rate": 0.00019209511661900285, + "loss": 1.0489, + "step": 4289 + }, + { + "epoch": 0.1536340358479417, + "grad_norm": 1.4476298093795776, + "learning_rate": 0.00019209059611925336, + "loss": 1.142, + "step": 4290 + }, + { + "epoch": 0.15366984797751, + "grad_norm": 1.42405104637146, + "learning_rate": 0.00019208607438054364, + "loss": 1.1426, + "step": 4291 + }, + { + "epoch": 0.15370566010707826, + "grad_norm": 1.206735372543335, + "learning_rate": 0.0001920815514029346, + "loss": 1.3104, + "step": 4292 + }, + { + "epoch": 0.15374147223664655, + "grad_norm": 1.2605146169662476, + "learning_rate": 0.00019207702718648705, + "loss": 1.0705, + "step": 4293 + }, + { + "epoch": 0.15377728436621485, + "grad_norm": 1.6134769916534424, + "learning_rate": 0.00019207250173126187, + "loss": 1.1819, + "step": 4294 + }, + { + "epoch": 0.15381309649578312, + "grad_norm": 1.3705562353134155, + "learning_rate": 0.00019206797503731996, + "loss": 1.1938, + "step": 4295 + }, + { + "epoch": 0.15384890862535142, + "grad_norm": 1.7016879320144653, + "learning_rate": 0.0001920634471047222, + "loss": 1.0963, + "step": 4296 + }, + { + "epoch": 0.15388472075491968, + "grad_norm": 1.6008135080337524, + "learning_rate": 0.0001920589179335295, + "loss": 1.3035, + "step": 4297 + }, + { + "epoch": 0.15392053288448798, + "grad_norm": 1.7761482000350952, + "learning_rate": 0.00019205438752380283, + "loss": 1.2278, + "step": 4298 + }, + { + "epoch": 0.15395634501405625, + "grad_norm": 1.9665392637252808, + "learning_rate": 0.00019204985587560307, + "loss": 1.3344, + "step": 4299 + }, + { + "epoch": 0.15399215714362455, + "grad_norm": 1.9687954187393188, + "learning_rate": 0.00019204532298899127, + "loss": 1.3927, + "step": 4300 + }, + { + "epoch": 0.15402796927319284, + "grad_norm": 1.7490397691726685, + "learning_rate": 0.0001920407888640284, + "loss": 1.4672, + "step": 4301 + }, + { + "epoch": 0.1540637814027611, + "grad_norm": 1.8672930002212524, + "learning_rate": 0.00019203625350077541, + "loss": 1.3993, + "step": 4302 + }, + { + "epoch": 0.1540995935323294, + "grad_norm": 1.359995722770691, + "learning_rate": 0.00019203171689929333, + "loss": 1.3037, + "step": 4303 + }, + { + "epoch": 0.15413540566189768, + "grad_norm": 1.3511407375335693, + "learning_rate": 0.00019202717905964325, + "loss": 1.1403, + "step": 4304 + }, + { + "epoch": 0.15417121779146598, + "grad_norm": 1.2879115343093872, + "learning_rate": 0.00019202263998188617, + "loss": 1.2118, + "step": 4305 + }, + { + "epoch": 0.15420702992103424, + "grad_norm": 1.33463454246521, + "learning_rate": 0.00019201809966608316, + "loss": 1.1488, + "step": 4306 + }, + { + "epoch": 0.15424284205060254, + "grad_norm": 1.8852399587631226, + "learning_rate": 0.0001920135581122953, + "loss": 0.9906, + "step": 4307 + }, + { + "epoch": 0.1542786541801708, + "grad_norm": 2.334268093109131, + "learning_rate": 0.00019200901532058376, + "loss": 1.1775, + "step": 4308 + }, + { + "epoch": 0.1543144663097391, + "grad_norm": 1.5181312561035156, + "learning_rate": 0.00019200447129100954, + "loss": 1.0946, + "step": 4309 + }, + { + "epoch": 0.1543502784393074, + "grad_norm": 1.5620158910751343, + "learning_rate": 0.00019199992602363385, + "loss": 1.3032, + "step": 4310 + }, + { + "epoch": 0.15438609056887567, + "grad_norm": 1.6559255123138428, + "learning_rate": 0.00019199537951851788, + "loss": 1.2935, + "step": 4311 + }, + { + "epoch": 0.15442190269844397, + "grad_norm": 1.5609004497528076, + "learning_rate": 0.0001919908317757227, + "loss": 1.1537, + "step": 4312 + }, + { + "epoch": 0.15445771482801224, + "grad_norm": 2.1713333129882812, + "learning_rate": 0.00019198628279530952, + "loss": 1.3779, + "step": 4313 + }, + { + "epoch": 0.15449352695758053, + "grad_norm": 2.468302011489868, + "learning_rate": 0.00019198173257733961, + "loss": 1.1244, + "step": 4314 + }, + { + "epoch": 0.1545293390871488, + "grad_norm": 1.5065081119537354, + "learning_rate": 0.00019197718112187409, + "loss": 0.9288, + "step": 4315 + }, + { + "epoch": 0.1545651512167171, + "grad_norm": 2.204991102218628, + "learning_rate": 0.00019197262842897425, + "loss": 1.0026, + "step": 4316 + }, + { + "epoch": 0.1546009633462854, + "grad_norm": 1.5149450302124023, + "learning_rate": 0.00019196807449870133, + "loss": 1.2869, + "step": 4317 + }, + { + "epoch": 0.15463677547585367, + "grad_norm": 1.5095324516296387, + "learning_rate": 0.00019196351933111662, + "loss": 1.0316, + "step": 4318 + }, + { + "epoch": 0.15467258760542196, + "grad_norm": 1.6583994626998901, + "learning_rate": 0.00019195896292628138, + "loss": 1.2134, + "step": 4319 + }, + { + "epoch": 0.15470839973499023, + "grad_norm": 1.9380218982696533, + "learning_rate": 0.00019195440528425688, + "loss": 1.2056, + "step": 4320 + }, + { + "epoch": 0.15474421186455853, + "grad_norm": 1.6426316499710083, + "learning_rate": 0.00019194984640510447, + "loss": 1.1555, + "step": 4321 + }, + { + "epoch": 0.1547800239941268, + "grad_norm": 1.586598515510559, + "learning_rate": 0.00019194528628888554, + "loss": 1.3793, + "step": 4322 + }, + { + "epoch": 0.1548158361236951, + "grad_norm": 1.3909614086151123, + "learning_rate": 0.00019194072493566134, + "loss": 1.4468, + "step": 4323 + }, + { + "epoch": 0.1548516482532634, + "grad_norm": 1.3901642560958862, + "learning_rate": 0.00019193616234549328, + "loss": 1.1478, + "step": 4324 + }, + { + "epoch": 0.15488746038283166, + "grad_norm": 1.8111873865127563, + "learning_rate": 0.00019193159851844276, + "loss": 1.2678, + "step": 4325 + }, + { + "epoch": 0.15492327251239996, + "grad_norm": 1.7325929403305054, + "learning_rate": 0.00019192703345457114, + "loss": 1.343, + "step": 4326 + }, + { + "epoch": 0.15495908464196823, + "grad_norm": 1.4998432397842407, + "learning_rate": 0.00019192246715393988, + "loss": 1.3147, + "step": 4327 + }, + { + "epoch": 0.15499489677153652, + "grad_norm": 1.4813847541809082, + "learning_rate": 0.0001919178996166104, + "loss": 1.2877, + "step": 4328 + }, + { + "epoch": 0.1550307089011048, + "grad_norm": 1.7915174961090088, + "learning_rate": 0.00019191333084264412, + "loss": 1.4264, + "step": 4329 + }, + { + "epoch": 0.1550665210306731, + "grad_norm": 1.96388840675354, + "learning_rate": 0.00019190876083210258, + "loss": 1.3211, + "step": 4330 + }, + { + "epoch": 0.15510233316024138, + "grad_norm": 1.4728180170059204, + "learning_rate": 0.00019190418958504716, + "loss": 1.2513, + "step": 4331 + }, + { + "epoch": 0.15513814528980965, + "grad_norm": 1.5277820825576782, + "learning_rate": 0.00019189961710153948, + "loss": 1.3114, + "step": 4332 + }, + { + "epoch": 0.15517395741937795, + "grad_norm": 1.8226851224899292, + "learning_rate": 0.00019189504338164095, + "loss": 1.1109, + "step": 4333 + }, + { + "epoch": 0.15520976954894622, + "grad_norm": 1.924798607826233, + "learning_rate": 0.00019189046842541316, + "loss": 1.3881, + "step": 4334 + }, + { + "epoch": 0.15524558167851452, + "grad_norm": 2.1339287757873535, + "learning_rate": 0.00019188589223291763, + "loss": 1.3119, + "step": 4335 + }, + { + "epoch": 0.15528139380808278, + "grad_norm": 1.4805891513824463, + "learning_rate": 0.00019188131480421595, + "loss": 1.1515, + "step": 4336 + }, + { + "epoch": 0.15531720593765108, + "grad_norm": 1.480418086051941, + "learning_rate": 0.0001918767361393697, + "loss": 1.1962, + "step": 4337 + }, + { + "epoch": 0.15535301806721938, + "grad_norm": 2.010167360305786, + "learning_rate": 0.00019187215623844053, + "loss": 1.159, + "step": 4338 + }, + { + "epoch": 0.15538883019678765, + "grad_norm": 1.8449177742004395, + "learning_rate": 0.00019186757510148995, + "loss": 1.2877, + "step": 4339 + }, + { + "epoch": 0.15542464232635594, + "grad_norm": 1.8808649778366089, + "learning_rate": 0.00019186299272857965, + "loss": 1.2622, + "step": 4340 + }, + { + "epoch": 0.1554604544559242, + "grad_norm": 1.3880107402801514, + "learning_rate": 0.0001918584091197713, + "loss": 1.1183, + "step": 4341 + }, + { + "epoch": 0.1554962665854925, + "grad_norm": 1.2879066467285156, + "learning_rate": 0.00019185382427512653, + "loss": 1.0158, + "step": 4342 + }, + { + "epoch": 0.15553207871506078, + "grad_norm": 1.374577283859253, + "learning_rate": 0.00019184923819470703, + "loss": 1.201, + "step": 4343 + }, + { + "epoch": 0.15556789084462908, + "grad_norm": 1.8058313131332397, + "learning_rate": 0.0001918446508785745, + "loss": 1.1124, + "step": 4344 + }, + { + "epoch": 0.15560370297419737, + "grad_norm": 3.0403542518615723, + "learning_rate": 0.00019184006232679068, + "loss": 1.3109, + "step": 4345 + }, + { + "epoch": 0.15563951510376564, + "grad_norm": 1.6199623346328735, + "learning_rate": 0.00019183547253941733, + "loss": 1.1548, + "step": 4346 + }, + { + "epoch": 0.15567532723333394, + "grad_norm": 1.4806323051452637, + "learning_rate": 0.0001918308815165161, + "loss": 1.1667, + "step": 4347 + }, + { + "epoch": 0.1557111393629022, + "grad_norm": 1.5497148036956787, + "learning_rate": 0.0001918262892581488, + "loss": 1.1774, + "step": 4348 + }, + { + "epoch": 0.1557469514924705, + "grad_norm": 1.8839356899261475, + "learning_rate": 0.00019182169576437724, + "loss": 1.2524, + "step": 4349 + }, + { + "epoch": 0.15578276362203877, + "grad_norm": 1.349637746810913, + "learning_rate": 0.00019181710103526321, + "loss": 1.2056, + "step": 4350 + }, + { + "epoch": 0.15581857575160707, + "grad_norm": 1.7882013320922852, + "learning_rate": 0.00019181250507086854, + "loss": 1.1441, + "step": 4351 + }, + { + "epoch": 0.15585438788117537, + "grad_norm": 1.9163522720336914, + "learning_rate": 0.00019180790787125504, + "loss": 1.3887, + "step": 4352 + }, + { + "epoch": 0.15589020001074363, + "grad_norm": 1.577852725982666, + "learning_rate": 0.00019180330943648454, + "loss": 1.2585, + "step": 4353 + }, + { + "epoch": 0.15592601214031193, + "grad_norm": 1.8529714345932007, + "learning_rate": 0.00019179870976661895, + "loss": 1.1956, + "step": 4354 + }, + { + "epoch": 0.1559618242698802, + "grad_norm": 1.780099630355835, + "learning_rate": 0.0001917941088617201, + "loss": 1.1389, + "step": 4355 + }, + { + "epoch": 0.1559976363994485, + "grad_norm": 1.500920295715332, + "learning_rate": 0.00019178950672184996, + "loss": 1.2347, + "step": 4356 + }, + { + "epoch": 0.15603344852901677, + "grad_norm": 1.212587833404541, + "learning_rate": 0.0001917849033470704, + "loss": 1.2414, + "step": 4357 + }, + { + "epoch": 0.15606926065858506, + "grad_norm": 1.795747995376587, + "learning_rate": 0.00019178029873744335, + "loss": 1.3336, + "step": 4358 + }, + { + "epoch": 0.15610507278815336, + "grad_norm": 1.6547776460647583, + "learning_rate": 0.00019177569289303078, + "loss": 1.2447, + "step": 4359 + }, + { + "epoch": 0.15614088491772163, + "grad_norm": 1.9866677522659302, + "learning_rate": 0.00019177108581389462, + "loss": 1.4152, + "step": 4360 + }, + { + "epoch": 0.15617669704728993, + "grad_norm": 2.6384098529815674, + "learning_rate": 0.0001917664775000969, + "loss": 1.2157, + "step": 4361 + }, + { + "epoch": 0.1562125091768582, + "grad_norm": 1.492130994796753, + "learning_rate": 0.00019176186795169956, + "loss": 1.2257, + "step": 4362 + }, + { + "epoch": 0.1562483213064265, + "grad_norm": 1.567800521850586, + "learning_rate": 0.0001917572571687647, + "loss": 1.1279, + "step": 4363 + }, + { + "epoch": 0.15628413343599476, + "grad_norm": 1.8494219779968262, + "learning_rate": 0.00019175264515135427, + "loss": 1.3436, + "step": 4364 + }, + { + "epoch": 0.15631994556556306, + "grad_norm": 1.655542016029358, + "learning_rate": 0.00019174803189953035, + "loss": 1.5112, + "step": 4365 + }, + { + "epoch": 0.15635575769513135, + "grad_norm": 1.5740361213684082, + "learning_rate": 0.00019174341741335504, + "loss": 1.1049, + "step": 4366 + }, + { + "epoch": 0.15639156982469962, + "grad_norm": 1.3488227128982544, + "learning_rate": 0.00019173880169289035, + "loss": 1.2316, + "step": 4367 + }, + { + "epoch": 0.15642738195426792, + "grad_norm": 1.801393985748291, + "learning_rate": 0.00019173418473819844, + "loss": 1.1684, + "step": 4368 + }, + { + "epoch": 0.1564631940838362, + "grad_norm": 1.855544924736023, + "learning_rate": 0.0001917295665493414, + "loss": 1.1148, + "step": 4369 + }, + { + "epoch": 0.15649900621340448, + "grad_norm": 1.6929352283477783, + "learning_rate": 0.00019172494712638136, + "loss": 1.4215, + "step": 4370 + }, + { + "epoch": 0.15653481834297275, + "grad_norm": 1.9852956533432007, + "learning_rate": 0.0001917203264693805, + "loss": 1.2057, + "step": 4371 + }, + { + "epoch": 0.15657063047254105, + "grad_norm": 1.972158670425415, + "learning_rate": 0.0001917157045784009, + "loss": 1.2083, + "step": 4372 + }, + { + "epoch": 0.15660644260210935, + "grad_norm": 1.8836984634399414, + "learning_rate": 0.00019171108145350484, + "loss": 1.4167, + "step": 4373 + }, + { + "epoch": 0.15664225473167762, + "grad_norm": 1.546947956085205, + "learning_rate": 0.00019170645709475447, + "loss": 1.0082, + "step": 4374 + }, + { + "epoch": 0.1566780668612459, + "grad_norm": 1.5492771863937378, + "learning_rate": 0.00019170183150221201, + "loss": 1.0212, + "step": 4375 + }, + { + "epoch": 0.15671387899081418, + "grad_norm": 1.9802762269973755, + "learning_rate": 0.00019169720467593972, + "loss": 1.4072, + "step": 4376 + }, + { + "epoch": 0.15674969112038248, + "grad_norm": 1.625670313835144, + "learning_rate": 0.0001916925766159998, + "loss": 1.2083, + "step": 4377 + }, + { + "epoch": 0.15678550324995075, + "grad_norm": 1.4719518423080444, + "learning_rate": 0.0001916879473224545, + "loss": 0.9941, + "step": 4378 + }, + { + "epoch": 0.15682131537951904, + "grad_norm": 1.3464711904525757, + "learning_rate": 0.00019168331679536623, + "loss": 1.3943, + "step": 4379 + }, + { + "epoch": 0.15685712750908734, + "grad_norm": 1.4270274639129639, + "learning_rate": 0.00019167868503479712, + "loss": 1.2651, + "step": 4380 + }, + { + "epoch": 0.1568929396386556, + "grad_norm": 1.5083835124969482, + "learning_rate": 0.00019167405204080956, + "loss": 1.0996, + "step": 4381 + }, + { + "epoch": 0.1569287517682239, + "grad_norm": 1.742008090019226, + "learning_rate": 0.00019166941781346592, + "loss": 0.9665, + "step": 4382 + }, + { + "epoch": 0.15696456389779218, + "grad_norm": 2.241222620010376, + "learning_rate": 0.0001916647823528285, + "loss": 1.3534, + "step": 4383 + }, + { + "epoch": 0.15700037602736047, + "grad_norm": 1.9553377628326416, + "learning_rate": 0.00019166014565895966, + "loss": 1.2613, + "step": 4384 + }, + { + "epoch": 0.15703618815692874, + "grad_norm": 1.7417570352554321, + "learning_rate": 0.0001916555077319218, + "loss": 1.2942, + "step": 4385 + }, + { + "epoch": 0.15707200028649704, + "grad_norm": 1.951590657234192, + "learning_rate": 0.0001916508685717773, + "loss": 1.2192, + "step": 4386 + }, + { + "epoch": 0.15710781241606533, + "grad_norm": 1.4603610038757324, + "learning_rate": 0.0001916462281785886, + "loss": 1.1934, + "step": 4387 + }, + { + "epoch": 0.1571436245456336, + "grad_norm": 2.0082292556762695, + "learning_rate": 0.0001916415865524181, + "loss": 1.1991, + "step": 4388 + }, + { + "epoch": 0.1571794366752019, + "grad_norm": 1.6899062395095825, + "learning_rate": 0.00019163694369332825, + "loss": 1.0281, + "step": 4389 + }, + { + "epoch": 0.15721524880477017, + "grad_norm": 1.4902654886245728, + "learning_rate": 0.00019163229960138156, + "loss": 1.0911, + "step": 4390 + }, + { + "epoch": 0.15725106093433847, + "grad_norm": 1.568476915359497, + "learning_rate": 0.00019162765427664045, + "loss": 1.1758, + "step": 4391 + }, + { + "epoch": 0.15728687306390673, + "grad_norm": 1.5237959623336792, + "learning_rate": 0.00019162300771916746, + "loss": 1.2291, + "step": 4392 + }, + { + "epoch": 0.15732268519347503, + "grad_norm": 1.2749286890029907, + "learning_rate": 0.00019161835992902507, + "loss": 1.19, + "step": 4393 + }, + { + "epoch": 0.15735849732304333, + "grad_norm": 1.8608871698379517, + "learning_rate": 0.00019161371090627583, + "loss": 1.2632, + "step": 4394 + }, + { + "epoch": 0.1573943094526116, + "grad_norm": 1.5353600978851318, + "learning_rate": 0.00019160906065098228, + "loss": 1.1666, + "step": 4395 + }, + { + "epoch": 0.1574301215821799, + "grad_norm": 1.4482020139694214, + "learning_rate": 0.00019160440916320698, + "loss": 1.1944, + "step": 4396 + }, + { + "epoch": 0.15746593371174816, + "grad_norm": 1.6925541162490845, + "learning_rate": 0.00019159975644301256, + "loss": 1.4122, + "step": 4397 + }, + { + "epoch": 0.15750174584131646, + "grad_norm": 1.757089614868164, + "learning_rate": 0.00019159510249046154, + "loss": 1.2645, + "step": 4398 + }, + { + "epoch": 0.15753755797088473, + "grad_norm": 1.66544771194458, + "learning_rate": 0.00019159044730561656, + "loss": 1.3635, + "step": 4399 + }, + { + "epoch": 0.15757337010045303, + "grad_norm": 1.6643856763839722, + "learning_rate": 0.00019158579088854026, + "loss": 1.3598, + "step": 4400 + }, + { + "epoch": 0.15760918223002132, + "grad_norm": 1.2916673421859741, + "learning_rate": 0.0001915811332392953, + "loss": 1.1693, + "step": 4401 + }, + { + "epoch": 0.1576449943595896, + "grad_norm": 2.384567975997925, + "learning_rate": 0.00019157647435794428, + "loss": 1.0504, + "step": 4402 + }, + { + "epoch": 0.1576808064891579, + "grad_norm": 2.150205135345459, + "learning_rate": 0.00019157181424454996, + "loss": 1.3883, + "step": 4403 + }, + { + "epoch": 0.15771661861872616, + "grad_norm": 1.354878544807434, + "learning_rate": 0.00019156715289917497, + "loss": 1.196, + "step": 4404 + }, + { + "epoch": 0.15775243074829445, + "grad_norm": 1.7078908681869507, + "learning_rate": 0.0001915624903218821, + "loss": 1.0735, + "step": 4405 + }, + { + "epoch": 0.15778824287786272, + "grad_norm": 2.0418734550476074, + "learning_rate": 0.00019155782651273398, + "loss": 1.3011, + "step": 4406 + }, + { + "epoch": 0.15782405500743102, + "grad_norm": 1.706006646156311, + "learning_rate": 0.00019155316147179342, + "loss": 1.1664, + "step": 4407 + }, + { + "epoch": 0.1578598671369993, + "grad_norm": 1.3904839754104614, + "learning_rate": 0.00019154849519912318, + "loss": 1.3007, + "step": 4408 + }, + { + "epoch": 0.15789567926656758, + "grad_norm": 2.2421321868896484, + "learning_rate": 0.00019154382769478602, + "loss": 1.2685, + "step": 4409 + }, + { + "epoch": 0.15793149139613588, + "grad_norm": 1.634050726890564, + "learning_rate": 0.00019153915895884474, + "loss": 1.2385, + "step": 4410 + }, + { + "epoch": 0.15796730352570415, + "grad_norm": 1.7169800996780396, + "learning_rate": 0.00019153448899136212, + "loss": 1.033, + "step": 4411 + }, + { + "epoch": 0.15800311565527245, + "grad_norm": 1.4427543878555298, + "learning_rate": 0.00019152981779240106, + "loss": 1.3233, + "step": 4412 + }, + { + "epoch": 0.15803892778484072, + "grad_norm": 1.6913899183273315, + "learning_rate": 0.00019152514536202437, + "loss": 0.9405, + "step": 4413 + }, + { + "epoch": 0.158074739914409, + "grad_norm": 1.715579867362976, + "learning_rate": 0.0001915204717002949, + "loss": 1.3459, + "step": 4414 + }, + { + "epoch": 0.15811055204397728, + "grad_norm": 1.4300254583358765, + "learning_rate": 0.00019151579680727553, + "loss": 1.2128, + "step": 4415 + }, + { + "epoch": 0.15814636417354558, + "grad_norm": 2.2409706115722656, + "learning_rate": 0.00019151112068302917, + "loss": 1.3704, + "step": 4416 + }, + { + "epoch": 0.15818217630311388, + "grad_norm": 1.6053327322006226, + "learning_rate": 0.0001915064433276187, + "loss": 1.1605, + "step": 4417 + }, + { + "epoch": 0.15821798843268214, + "grad_norm": 1.787540316581726, + "learning_rate": 0.0001915017647411071, + "loss": 1.2343, + "step": 4418 + }, + { + "epoch": 0.15825380056225044, + "grad_norm": 2.236205577850342, + "learning_rate": 0.00019149708492355728, + "loss": 1.6795, + "step": 4419 + }, + { + "epoch": 0.1582896126918187, + "grad_norm": 1.581094741821289, + "learning_rate": 0.0001914924038750322, + "loss": 1.215, + "step": 4420 + }, + { + "epoch": 0.158325424821387, + "grad_norm": 1.5803074836730957, + "learning_rate": 0.00019148772159559486, + "loss": 1.1568, + "step": 4421 + }, + { + "epoch": 0.15836123695095528, + "grad_norm": 1.7399505376815796, + "learning_rate": 0.00019148303808530818, + "loss": 1.0843, + "step": 4422 + }, + { + "epoch": 0.15839704908052357, + "grad_norm": 1.3777583837509155, + "learning_rate": 0.00019147835334423527, + "loss": 1.1607, + "step": 4423 + }, + { + "epoch": 0.15843286121009187, + "grad_norm": 1.5028252601623535, + "learning_rate": 0.0001914736673724391, + "loss": 1.0658, + "step": 4424 + }, + { + "epoch": 0.15846867333966014, + "grad_norm": 2.361051559448242, + "learning_rate": 0.00019146898016998273, + "loss": 1.2551, + "step": 4425 + }, + { + "epoch": 0.15850448546922843, + "grad_norm": 1.825052261352539, + "learning_rate": 0.00019146429173692923, + "loss": 1.1091, + "step": 4426 + }, + { + "epoch": 0.1585402975987967, + "grad_norm": 1.9006389379501343, + "learning_rate": 0.00019145960207334165, + "loss": 1.1817, + "step": 4427 + }, + { + "epoch": 0.158576109728365, + "grad_norm": 1.4656016826629639, + "learning_rate": 0.00019145491117928312, + "loss": 1.3999, + "step": 4428 + }, + { + "epoch": 0.15861192185793327, + "grad_norm": 1.7033413648605347, + "learning_rate": 0.00019145021905481673, + "loss": 1.3686, + "step": 4429 + }, + { + "epoch": 0.15864773398750157, + "grad_norm": 1.261288046836853, + "learning_rate": 0.00019144552570000558, + "loss": 1.1114, + "step": 4430 + }, + { + "epoch": 0.15868354611706986, + "grad_norm": 1.4297480583190918, + "learning_rate": 0.00019144083111491284, + "loss": 1.075, + "step": 4431 + }, + { + "epoch": 0.15871935824663813, + "grad_norm": 1.8366742134094238, + "learning_rate": 0.0001914361352996017, + "loss": 1.0402, + "step": 4432 + }, + { + "epoch": 0.15875517037620643, + "grad_norm": 1.6987781524658203, + "learning_rate": 0.00019143143825413526, + "loss": 1.1776, + "step": 4433 + }, + { + "epoch": 0.1587909825057747, + "grad_norm": 1.5838309526443481, + "learning_rate": 0.00019142673997857678, + "loss": 1.1307, + "step": 4434 + }, + { + "epoch": 0.158826794635343, + "grad_norm": 1.3837953805923462, + "learning_rate": 0.00019142204047298945, + "loss": 1.0849, + "step": 4435 + }, + { + "epoch": 0.15886260676491126, + "grad_norm": 1.5738294124603271, + "learning_rate": 0.00019141733973743644, + "loss": 1.2144, + "step": 4436 + }, + { + "epoch": 0.15889841889447956, + "grad_norm": 2.0474908351898193, + "learning_rate": 0.0001914126377719811, + "loss": 1.135, + "step": 4437 + }, + { + "epoch": 0.15893423102404786, + "grad_norm": 1.5187320709228516, + "learning_rate": 0.00019140793457668665, + "loss": 1.0676, + "step": 4438 + }, + { + "epoch": 0.15897004315361613, + "grad_norm": 1.898295283317566, + "learning_rate": 0.0001914032301516163, + "loss": 1.3018, + "step": 4439 + }, + { + "epoch": 0.15900585528318442, + "grad_norm": 1.3666919469833374, + "learning_rate": 0.0001913985244968334, + "loss": 1.1435, + "step": 4440 + }, + { + "epoch": 0.1590416674127527, + "grad_norm": 1.71784245967865, + "learning_rate": 0.00019139381761240127, + "loss": 1.2174, + "step": 4441 + }, + { + "epoch": 0.159077479542321, + "grad_norm": 1.6504827737808228, + "learning_rate": 0.00019138910949838321, + "loss": 1.0874, + "step": 4442 + }, + { + "epoch": 0.15911329167188926, + "grad_norm": 1.7658859491348267, + "learning_rate": 0.0001913844001548425, + "loss": 1.2975, + "step": 4443 + }, + { + "epoch": 0.15914910380145755, + "grad_norm": 1.2512527704238892, + "learning_rate": 0.00019137968958184265, + "loss": 1.1752, + "step": 4444 + }, + { + "epoch": 0.15918491593102585, + "grad_norm": 1.4031134843826294, + "learning_rate": 0.00019137497777944691, + "loss": 0.9814, + "step": 4445 + }, + { + "epoch": 0.15922072806059412, + "grad_norm": 2.0043768882751465, + "learning_rate": 0.00019137026474771874, + "loss": 1.0221, + "step": 4446 + }, + { + "epoch": 0.15925654019016242, + "grad_norm": 2.0803236961364746, + "learning_rate": 0.00019136555048672145, + "loss": 1.0505, + "step": 4447 + }, + { + "epoch": 0.15929235231973068, + "grad_norm": 1.9464565515518188, + "learning_rate": 0.0001913608349965186, + "loss": 1.2133, + "step": 4448 + }, + { + "epoch": 0.15932816444929898, + "grad_norm": 1.301774263381958, + "learning_rate": 0.0001913561182771735, + "loss": 1.2294, + "step": 4449 + }, + { + "epoch": 0.15936397657886725, + "grad_norm": 1.675096035003662, + "learning_rate": 0.00019135140032874973, + "loss": 1.2657, + "step": 4450 + }, + { + "epoch": 0.15939978870843555, + "grad_norm": 1.689249038696289, + "learning_rate": 0.00019134668115131068, + "loss": 1.3188, + "step": 4451 + }, + { + "epoch": 0.15943560083800384, + "grad_norm": 1.6217703819274902, + "learning_rate": 0.00019134196074491988, + "loss": 1.2296, + "step": 4452 + }, + { + "epoch": 0.1594714129675721, + "grad_norm": 1.661340355873108, + "learning_rate": 0.00019133723910964078, + "loss": 1.1982, + "step": 4453 + }, + { + "epoch": 0.1595072250971404, + "grad_norm": 1.8453421592712402, + "learning_rate": 0.00019133251624553696, + "loss": 1.1536, + "step": 4454 + }, + { + "epoch": 0.15954303722670868, + "grad_norm": 1.5702019929885864, + "learning_rate": 0.00019132779215267197, + "loss": 1.3005, + "step": 4455 + }, + { + "epoch": 0.15957884935627697, + "grad_norm": 1.583754301071167, + "learning_rate": 0.00019132306683110933, + "loss": 1.111, + "step": 4456 + }, + { + "epoch": 0.15961466148584524, + "grad_norm": 1.6730464696884155, + "learning_rate": 0.0001913183402809126, + "loss": 1.3981, + "step": 4457 + }, + { + "epoch": 0.15965047361541354, + "grad_norm": 1.9863829612731934, + "learning_rate": 0.00019131361250214541, + "loss": 1.0159, + "step": 4458 + }, + { + "epoch": 0.15968628574498184, + "grad_norm": 1.4859025478363037, + "learning_rate": 0.00019130888349487134, + "loss": 1.006, + "step": 4459 + }, + { + "epoch": 0.1597220978745501, + "grad_norm": 1.7324304580688477, + "learning_rate": 0.00019130415325915406, + "loss": 1.2859, + "step": 4460 + }, + { + "epoch": 0.1597579100041184, + "grad_norm": 1.5711873769760132, + "learning_rate": 0.00019129942179505713, + "loss": 1.0831, + "step": 4461 + }, + { + "epoch": 0.15979372213368667, + "grad_norm": 1.5177385807037354, + "learning_rate": 0.00019129468910264428, + "loss": 1.251, + "step": 4462 + }, + { + "epoch": 0.15982953426325497, + "grad_norm": 2.154702663421631, + "learning_rate": 0.00019128995518197912, + "loss": 1.2765, + "step": 4463 + }, + { + "epoch": 0.15986534639282324, + "grad_norm": 1.1658060550689697, + "learning_rate": 0.00019128522003312537, + "loss": 0.9939, + "step": 4464 + }, + { + "epoch": 0.15990115852239153, + "grad_norm": 1.5976234674453735, + "learning_rate": 0.00019128048365614676, + "loss": 1.1066, + "step": 4465 + }, + { + "epoch": 0.15993697065195983, + "grad_norm": 1.4256579875946045, + "learning_rate": 0.00019127574605110693, + "loss": 1.2731, + "step": 4466 + }, + { + "epoch": 0.1599727827815281, + "grad_norm": 1.5094327926635742, + "learning_rate": 0.00019127100721806975, + "loss": 1.2665, + "step": 4467 + }, + { + "epoch": 0.1600085949110964, + "grad_norm": 1.501530408859253, + "learning_rate": 0.00019126626715709885, + "loss": 0.9212, + "step": 4468 + }, + { + "epoch": 0.16004440704066467, + "grad_norm": 1.4244840145111084, + "learning_rate": 0.00019126152586825806, + "loss": 1.0899, + "step": 4469 + }, + { + "epoch": 0.16008021917023296, + "grad_norm": 1.9686604738235474, + "learning_rate": 0.00019125678335161117, + "loss": 1.2237, + "step": 4470 + }, + { + "epoch": 0.16011603129980123, + "grad_norm": 1.2123782634735107, + "learning_rate": 0.00019125203960722198, + "loss": 1.1606, + "step": 4471 + }, + { + "epoch": 0.16015184342936953, + "grad_norm": 1.6117480993270874, + "learning_rate": 0.00019124729463515427, + "loss": 1.2924, + "step": 4472 + }, + { + "epoch": 0.16018765555893782, + "grad_norm": 1.4395455121994019, + "learning_rate": 0.00019124254843547195, + "loss": 1.1886, + "step": 4473 + }, + { + "epoch": 0.1602234676885061, + "grad_norm": 1.8975876569747925, + "learning_rate": 0.0001912378010082388, + "loss": 1.2259, + "step": 4474 + }, + { + "epoch": 0.1602592798180744, + "grad_norm": 2.0351486206054688, + "learning_rate": 0.00019123305235351873, + "loss": 1.1533, + "step": 4475 + }, + { + "epoch": 0.16029509194764266, + "grad_norm": 1.767136573791504, + "learning_rate": 0.00019122830247137563, + "loss": 1.0537, + "step": 4476 + }, + { + "epoch": 0.16033090407721096, + "grad_norm": 2.1347625255584717, + "learning_rate": 0.00019122355136187342, + "loss": 1.4171, + "step": 4477 + }, + { + "epoch": 0.16036671620677923, + "grad_norm": 1.6066044569015503, + "learning_rate": 0.00019121879902507595, + "loss": 1.1642, + "step": 4478 + }, + { + "epoch": 0.16040252833634752, + "grad_norm": 1.3249279260635376, + "learning_rate": 0.00019121404546104724, + "loss": 1.2427, + "step": 4479 + }, + { + "epoch": 0.16043834046591582, + "grad_norm": 1.4221374988555908, + "learning_rate": 0.00019120929066985122, + "loss": 0.9697, + "step": 4480 + }, + { + "epoch": 0.1604741525954841, + "grad_norm": 1.5807257890701294, + "learning_rate": 0.0001912045346515518, + "loss": 1.1283, + "step": 4481 + }, + { + "epoch": 0.16050996472505238, + "grad_norm": 1.3666576147079468, + "learning_rate": 0.00019119977740621305, + "loss": 1.1076, + "step": 4482 + }, + { + "epoch": 0.16054577685462065, + "grad_norm": 2.3156771659851074, + "learning_rate": 0.0001911950189338989, + "loss": 0.949, + "step": 4483 + }, + { + "epoch": 0.16058158898418895, + "grad_norm": 1.836609125137329, + "learning_rate": 0.00019119025923467343, + "loss": 1.1349, + "step": 4484 + }, + { + "epoch": 0.16061740111375722, + "grad_norm": 2.932831287384033, + "learning_rate": 0.00019118549830860065, + "loss": 1.1524, + "step": 4485 + }, + { + "epoch": 0.16065321324332552, + "grad_norm": 1.8395748138427734, + "learning_rate": 0.0001911807361557446, + "loss": 1.0901, + "step": 4486 + }, + { + "epoch": 0.1606890253728938, + "grad_norm": 1.8928135633468628, + "learning_rate": 0.00019117597277616932, + "loss": 1.2019, + "step": 4487 + }, + { + "epoch": 0.16072483750246208, + "grad_norm": 1.9746259450912476, + "learning_rate": 0.00019117120816993899, + "loss": 1.2212, + "step": 4488 + }, + { + "epoch": 0.16076064963203038, + "grad_norm": 1.759778618812561, + "learning_rate": 0.00019116644233711764, + "loss": 1.246, + "step": 4489 + }, + { + "epoch": 0.16079646176159865, + "grad_norm": 1.8963817358016968, + "learning_rate": 0.0001911616752777694, + "loss": 1.3384, + "step": 4490 + }, + { + "epoch": 0.16083227389116694, + "grad_norm": 1.4860639572143555, + "learning_rate": 0.0001911569069919584, + "loss": 1.2565, + "step": 4491 + }, + { + "epoch": 0.1608680860207352, + "grad_norm": 1.566386342048645, + "learning_rate": 0.00019115213747974882, + "loss": 0.9172, + "step": 4492 + }, + { + "epoch": 0.1609038981503035, + "grad_norm": 1.663780927658081, + "learning_rate": 0.0001911473667412048, + "loss": 1.1884, + "step": 4493 + }, + { + "epoch": 0.1609397102798718, + "grad_norm": 1.565576195716858, + "learning_rate": 0.00019114259477639057, + "loss": 1.2563, + "step": 4494 + }, + { + "epoch": 0.16097552240944007, + "grad_norm": 1.9322407245635986, + "learning_rate": 0.00019113782158537024, + "loss": 1.1021, + "step": 4495 + }, + { + "epoch": 0.16101133453900837, + "grad_norm": 1.2827671766281128, + "learning_rate": 0.0001911330471682081, + "loss": 1.0923, + "step": 4496 + }, + { + "epoch": 0.16104714666857664, + "grad_norm": 1.232258915901184, + "learning_rate": 0.00019112827152496835, + "loss": 1.3379, + "step": 4497 + }, + { + "epoch": 0.16108295879814494, + "grad_norm": 1.1920584440231323, + "learning_rate": 0.00019112349465571525, + "loss": 1.2191, + "step": 4498 + }, + { + "epoch": 0.1611187709277132, + "grad_norm": 2.7477171421051025, + "learning_rate": 0.0001911187165605131, + "loss": 1.2898, + "step": 4499 + }, + { + "epoch": 0.1611545830572815, + "grad_norm": 1.6775703430175781, + "learning_rate": 0.00019111393723942615, + "loss": 1.1616, + "step": 4500 + }, + { + "epoch": 0.1611903951868498, + "grad_norm": 1.9252902269363403, + "learning_rate": 0.00019110915669251868, + "loss": 1.1661, + "step": 4501 + }, + { + "epoch": 0.16122620731641807, + "grad_norm": 1.5417851209640503, + "learning_rate": 0.00019110437491985505, + "loss": 1.4419, + "step": 4502 + }, + { + "epoch": 0.16126201944598637, + "grad_norm": 2.090850830078125, + "learning_rate": 0.00019109959192149955, + "loss": 1.1571, + "step": 4503 + }, + { + "epoch": 0.16129783157555463, + "grad_norm": 1.5132218599319458, + "learning_rate": 0.0001910948076975166, + "loss": 1.0013, + "step": 4504 + }, + { + "epoch": 0.16133364370512293, + "grad_norm": 1.443742036819458, + "learning_rate": 0.00019109002224797046, + "loss": 1.3896, + "step": 4505 + }, + { + "epoch": 0.1613694558346912, + "grad_norm": 1.5443689823150635, + "learning_rate": 0.00019108523557292558, + "loss": 1.1635, + "step": 4506 + }, + { + "epoch": 0.1614052679642595, + "grad_norm": 1.2881214618682861, + "learning_rate": 0.00019108044767244636, + "loss": 1.2703, + "step": 4507 + }, + { + "epoch": 0.16144108009382777, + "grad_norm": 1.4532777070999146, + "learning_rate": 0.0001910756585465972, + "loss": 1.3246, + "step": 4508 + }, + { + "epoch": 0.16147689222339606, + "grad_norm": 1.4586206674575806, + "learning_rate": 0.0001910708681954425, + "loss": 1.1758, + "step": 4509 + }, + { + "epoch": 0.16151270435296436, + "grad_norm": 2.0806283950805664, + "learning_rate": 0.00019106607661904682, + "loss": 1.2516, + "step": 4510 + }, + { + "epoch": 0.16154851648253263, + "grad_norm": 1.4727798700332642, + "learning_rate": 0.00019106128381747448, + "loss": 1.206, + "step": 4511 + }, + { + "epoch": 0.16158432861210092, + "grad_norm": 1.2714958190917969, + "learning_rate": 0.00019105648979079006, + "loss": 1.1979, + "step": 4512 + }, + { + "epoch": 0.1616201407416692, + "grad_norm": 1.3788787126541138, + "learning_rate": 0.000191051694539058, + "loss": 1.1992, + "step": 4513 + }, + { + "epoch": 0.1616559528712375, + "grad_norm": 1.2828105688095093, + "learning_rate": 0.0001910468980623428, + "loss": 1.252, + "step": 4514 + }, + { + "epoch": 0.16169176500080576, + "grad_norm": 1.7463818788528442, + "learning_rate": 0.0001910421003607091, + "loss": 1.1684, + "step": 4515 + }, + { + "epoch": 0.16172757713037406, + "grad_norm": 1.3934979438781738, + "learning_rate": 0.00019103730143422135, + "loss": 1.3503, + "step": 4516 + }, + { + "epoch": 0.16176338925994235, + "grad_norm": 1.7444173097610474, + "learning_rate": 0.00019103250128294413, + "loss": 1.2894, + "step": 4517 + }, + { + "epoch": 0.16179920138951062, + "grad_norm": 1.6852136850357056, + "learning_rate": 0.00019102769990694208, + "loss": 1.0768, + "step": 4518 + }, + { + "epoch": 0.16183501351907892, + "grad_norm": 1.483176589012146, + "learning_rate": 0.00019102289730627968, + "loss": 1.1025, + "step": 4519 + }, + { + "epoch": 0.1618708256486472, + "grad_norm": 1.964627742767334, + "learning_rate": 0.0001910180934810216, + "loss": 1.2328, + "step": 4520 + }, + { + "epoch": 0.16190663777821548, + "grad_norm": 1.9798446893692017, + "learning_rate": 0.0001910132884312325, + "loss": 1.3574, + "step": 4521 + }, + { + "epoch": 0.16194244990778375, + "grad_norm": 1.300298810005188, + "learning_rate": 0.00019100848215697705, + "loss": 1.175, + "step": 4522 + }, + { + "epoch": 0.16197826203735205, + "grad_norm": 1.5079647302627563, + "learning_rate": 0.00019100367465831983, + "loss": 1.1517, + "step": 4523 + }, + { + "epoch": 0.16201407416692035, + "grad_norm": 1.461304783821106, + "learning_rate": 0.00019099886593532554, + "loss": 1.0835, + "step": 4524 + }, + { + "epoch": 0.16204988629648862, + "grad_norm": 1.4456837177276611, + "learning_rate": 0.00019099405598805888, + "loss": 1.1248, + "step": 4525 + }, + { + "epoch": 0.1620856984260569, + "grad_norm": 1.8282212018966675, + "learning_rate": 0.0001909892448165846, + "loss": 1.2003, + "step": 4526 + }, + { + "epoch": 0.16212151055562518, + "grad_norm": 1.472376823425293, + "learning_rate": 0.0001909844324209674, + "loss": 1.1921, + "step": 4527 + }, + { + "epoch": 0.16215732268519348, + "grad_norm": 1.9961879253387451, + "learning_rate": 0.00019097961880127203, + "loss": 1.1614, + "step": 4528 + }, + { + "epoch": 0.16219313481476175, + "grad_norm": 1.3660013675689697, + "learning_rate": 0.0001909748039575632, + "loss": 1.2597, + "step": 4529 + }, + { + "epoch": 0.16222894694433004, + "grad_norm": 1.67041015625, + "learning_rate": 0.00019096998788990574, + "loss": 1.0725, + "step": 4530 + }, + { + "epoch": 0.16226475907389834, + "grad_norm": 2.052351474761963, + "learning_rate": 0.00019096517059836448, + "loss": 1.2811, + "step": 4531 + }, + { + "epoch": 0.1623005712034666, + "grad_norm": 1.6514710187911987, + "learning_rate": 0.00019096035208300416, + "loss": 1.2127, + "step": 4532 + }, + { + "epoch": 0.1623363833330349, + "grad_norm": 1.3749306201934814, + "learning_rate": 0.00019095553234388962, + "loss": 1.2575, + "step": 4533 + }, + { + "epoch": 0.16237219546260317, + "grad_norm": 1.6522761583328247, + "learning_rate": 0.00019095071138108575, + "loss": 1.2969, + "step": 4534 + }, + { + "epoch": 0.16240800759217147, + "grad_norm": 1.2630161046981812, + "learning_rate": 0.00019094588919465734, + "loss": 1.0525, + "step": 4535 + }, + { + "epoch": 0.16244381972173974, + "grad_norm": 2.0246169567108154, + "learning_rate": 0.0001909410657846693, + "loss": 1.3504, + "step": 4536 + }, + { + "epoch": 0.16247963185130804, + "grad_norm": 1.7630584239959717, + "learning_rate": 0.00019093624115118656, + "loss": 1.194, + "step": 4537 + }, + { + "epoch": 0.16251544398087633, + "grad_norm": 1.5456621646881104, + "learning_rate": 0.00019093141529427396, + "loss": 1.2215, + "step": 4538 + }, + { + "epoch": 0.1625512561104446, + "grad_norm": 1.2659436464309692, + "learning_rate": 0.00019092658821399648, + "loss": 1.1474, + "step": 4539 + }, + { + "epoch": 0.1625870682400129, + "grad_norm": 1.6965339183807373, + "learning_rate": 0.00019092175991041905, + "loss": 1.2566, + "step": 4540 + }, + { + "epoch": 0.16262288036958117, + "grad_norm": 1.8538544178009033, + "learning_rate": 0.0001909169303836066, + "loss": 1.1764, + "step": 4541 + }, + { + "epoch": 0.16265869249914947, + "grad_norm": 1.5658061504364014, + "learning_rate": 0.00019091209963362416, + "loss": 1.1628, + "step": 4542 + }, + { + "epoch": 0.16269450462871773, + "grad_norm": 2.2986011505126953, + "learning_rate": 0.00019090726766053667, + "loss": 1.261, + "step": 4543 + }, + { + "epoch": 0.16273031675828603, + "grad_norm": 1.3701817989349365, + "learning_rate": 0.00019090243446440915, + "loss": 1.1242, + "step": 4544 + }, + { + "epoch": 0.16276612888785433, + "grad_norm": 1.5110810995101929, + "learning_rate": 0.0001908976000453066, + "loss": 1.3246, + "step": 4545 + }, + { + "epoch": 0.1628019410174226, + "grad_norm": 1.7055087089538574, + "learning_rate": 0.00019089276440329415, + "loss": 1.1046, + "step": 4546 + }, + { + "epoch": 0.1628377531469909, + "grad_norm": 1.8327715396881104, + "learning_rate": 0.00019088792753843675, + "loss": 1.3705, + "step": 4547 + }, + { + "epoch": 0.16287356527655916, + "grad_norm": 1.4606183767318726, + "learning_rate": 0.00019088308945079956, + "loss": 1.3669, + "step": 4548 + }, + { + "epoch": 0.16290937740612746, + "grad_norm": 1.4562917947769165, + "learning_rate": 0.00019087825014044762, + "loss": 1.185, + "step": 4549 + }, + { + "epoch": 0.16294518953569573, + "grad_norm": 2.2147698402404785, + "learning_rate": 0.00019087340960744604, + "loss": 1.2658, + "step": 4550 + }, + { + "epoch": 0.16298100166526402, + "grad_norm": 1.4063518047332764, + "learning_rate": 0.00019086856785185992, + "loss": 1.2103, + "step": 4551 + }, + { + "epoch": 0.16301681379483232, + "grad_norm": 1.8013921976089478, + "learning_rate": 0.0001908637248737545, + "loss": 1.1433, + "step": 4552 + }, + { + "epoch": 0.1630526259244006, + "grad_norm": 1.8997917175292969, + "learning_rate": 0.00019085888067319485, + "loss": 1.1368, + "step": 4553 + }, + { + "epoch": 0.1630884380539689, + "grad_norm": 2.0366384983062744, + "learning_rate": 0.00019085403525024612, + "loss": 1.1036, + "step": 4554 + }, + { + "epoch": 0.16312425018353716, + "grad_norm": 1.2805782556533813, + "learning_rate": 0.00019084918860497356, + "loss": 1.0085, + "step": 4555 + }, + { + "epoch": 0.16316006231310545, + "grad_norm": 1.7374486923217773, + "learning_rate": 0.00019084434073744238, + "loss": 0.8882, + "step": 4556 + }, + { + "epoch": 0.16319587444267372, + "grad_norm": 1.3663676977157593, + "learning_rate": 0.00019083949164771773, + "loss": 1.153, + "step": 4557 + }, + { + "epoch": 0.16323168657224202, + "grad_norm": 1.4205164909362793, + "learning_rate": 0.00019083464133586492, + "loss": 1.2618, + "step": 4558 + }, + { + "epoch": 0.16326749870181032, + "grad_norm": 1.5028676986694336, + "learning_rate": 0.00019082978980194918, + "loss": 1.2547, + "step": 4559 + }, + { + "epoch": 0.16330331083137858, + "grad_norm": 1.688193917274475, + "learning_rate": 0.00019082493704603576, + "loss": 1.1477, + "step": 4560 + }, + { + "epoch": 0.16333912296094688, + "grad_norm": 1.247304916381836, + "learning_rate": 0.00019082008306819001, + "loss": 1.28, + "step": 4561 + }, + { + "epoch": 0.16337493509051515, + "grad_norm": 1.6683900356292725, + "learning_rate": 0.00019081522786847717, + "loss": 1.2956, + "step": 4562 + }, + { + "epoch": 0.16341074722008345, + "grad_norm": 1.4954910278320312, + "learning_rate": 0.0001908103714469626, + "loss": 1.0233, + "step": 4563 + }, + { + "epoch": 0.16344655934965172, + "grad_norm": 1.8016539812088013, + "learning_rate": 0.00019080551380371157, + "loss": 1.2493, + "step": 4564 + }, + { + "epoch": 0.16348237147922, + "grad_norm": 1.3343099355697632, + "learning_rate": 0.0001908006549387895, + "loss": 1.0751, + "step": 4565 + }, + { + "epoch": 0.1635181836087883, + "grad_norm": 1.8809975385665894, + "learning_rate": 0.00019079579485226176, + "loss": 1.2141, + "step": 4566 + }, + { + "epoch": 0.16355399573835658, + "grad_norm": 1.708471655845642, + "learning_rate": 0.0001907909335441937, + "loss": 1.2121, + "step": 4567 + }, + { + "epoch": 0.16358980786792487, + "grad_norm": 1.6979390382766724, + "learning_rate": 0.00019078607101465078, + "loss": 1.1107, + "step": 4568 + }, + { + "epoch": 0.16362561999749314, + "grad_norm": 1.7507470846176147, + "learning_rate": 0.00019078120726369834, + "loss": 1.1862, + "step": 4569 + }, + { + "epoch": 0.16366143212706144, + "grad_norm": 1.6608753204345703, + "learning_rate": 0.00019077634229140188, + "loss": 1.1078, + "step": 4570 + }, + { + "epoch": 0.1636972442566297, + "grad_norm": 1.5076541900634766, + "learning_rate": 0.0001907714760978268, + "loss": 1.2742, + "step": 4571 + }, + { + "epoch": 0.163733056386198, + "grad_norm": 1.5532310009002686, + "learning_rate": 0.0001907666086830386, + "loss": 1.2085, + "step": 4572 + }, + { + "epoch": 0.1637688685157663, + "grad_norm": 1.3924639225006104, + "learning_rate": 0.0001907617400471028, + "loss": 1.0468, + "step": 4573 + }, + { + "epoch": 0.16380468064533457, + "grad_norm": 1.7256547212600708, + "learning_rate": 0.00019075687019008483, + "loss": 1.3657, + "step": 4574 + }, + { + "epoch": 0.16384049277490287, + "grad_norm": 1.511757493019104, + "learning_rate": 0.00019075199911205024, + "loss": 1.173, + "step": 4575 + }, + { + "epoch": 0.16387630490447114, + "grad_norm": 1.9438221454620361, + "learning_rate": 0.00019074712681306456, + "loss": 1.4583, + "step": 4576 + }, + { + "epoch": 0.16391211703403943, + "grad_norm": 1.3989675045013428, + "learning_rate": 0.00019074225329319337, + "loss": 1.178, + "step": 4577 + }, + { + "epoch": 0.1639479291636077, + "grad_norm": 1.4701060056686401, + "learning_rate": 0.00019073737855250218, + "loss": 1.202, + "step": 4578 + }, + { + "epoch": 0.163983741293176, + "grad_norm": 1.3102481365203857, + "learning_rate": 0.00019073250259105663, + "loss": 1.2497, + "step": 4579 + }, + { + "epoch": 0.1640195534227443, + "grad_norm": 1.3100061416625977, + "learning_rate": 0.00019072762540892226, + "loss": 1.1657, + "step": 4580 + }, + { + "epoch": 0.16405536555231257, + "grad_norm": 1.5840352773666382, + "learning_rate": 0.00019072274700616474, + "loss": 1.3976, + "step": 4581 + }, + { + "epoch": 0.16409117768188086, + "grad_norm": 1.4631420373916626, + "learning_rate": 0.00019071786738284968, + "loss": 1.0581, + "step": 4582 + }, + { + "epoch": 0.16412698981144913, + "grad_norm": 1.6327877044677734, + "learning_rate": 0.00019071298653904276, + "loss": 1.224, + "step": 4583 + }, + { + "epoch": 0.16416280194101743, + "grad_norm": 1.4716964960098267, + "learning_rate": 0.00019070810447480957, + "loss": 1.1824, + "step": 4584 + }, + { + "epoch": 0.1641986140705857, + "grad_norm": 1.2152650356292725, + "learning_rate": 0.00019070322119021588, + "loss": 1.1279, + "step": 4585 + }, + { + "epoch": 0.164234426200154, + "grad_norm": 1.6660319566726685, + "learning_rate": 0.00019069833668532732, + "loss": 1.1008, + "step": 4586 + }, + { + "epoch": 0.1642702383297223, + "grad_norm": 1.4652189016342163, + "learning_rate": 0.00019069345096020966, + "loss": 1.1832, + "step": 4587 + }, + { + "epoch": 0.16430605045929056, + "grad_norm": 1.1969157457351685, + "learning_rate": 0.00019068856401492857, + "loss": 1.2262, + "step": 4588 + }, + { + "epoch": 0.16434186258885886, + "grad_norm": 1.7352052927017212, + "learning_rate": 0.00019068367584954986, + "loss": 1.1018, + "step": 4589 + }, + { + "epoch": 0.16437767471842712, + "grad_norm": 1.361660122871399, + "learning_rate": 0.00019067878646413923, + "loss": 1.2004, + "step": 4590 + }, + { + "epoch": 0.16441348684799542, + "grad_norm": 1.2125896215438843, + "learning_rate": 0.0001906738958587625, + "loss": 1.1339, + "step": 4591 + }, + { + "epoch": 0.1644492989775637, + "grad_norm": 1.4821157455444336, + "learning_rate": 0.00019066900403348551, + "loss": 1.0628, + "step": 4592 + }, + { + "epoch": 0.164485111107132, + "grad_norm": 1.44423246383667, + "learning_rate": 0.000190664110988374, + "loss": 1.1986, + "step": 4593 + }, + { + "epoch": 0.16452092323670028, + "grad_norm": 2.3371238708496094, + "learning_rate": 0.00019065921672349384, + "loss": 1.27, + "step": 4594 + }, + { + "epoch": 0.16455673536626855, + "grad_norm": 1.8781839609146118, + "learning_rate": 0.00019065432123891083, + "loss": 1.2882, + "step": 4595 + }, + { + "epoch": 0.16459254749583685, + "grad_norm": 1.7105482816696167, + "learning_rate": 0.00019064942453469086, + "loss": 1.2405, + "step": 4596 + }, + { + "epoch": 0.16462835962540512, + "grad_norm": 1.992102026939392, + "learning_rate": 0.0001906445266108998, + "loss": 1.2953, + "step": 4597 + }, + { + "epoch": 0.16466417175497342, + "grad_norm": 2.0259788036346436, + "learning_rate": 0.0001906396274676036, + "loss": 1.1566, + "step": 4598 + }, + { + "epoch": 0.16469998388454168, + "grad_norm": 1.4824045896530151, + "learning_rate": 0.00019063472710486814, + "loss": 1.273, + "step": 4599 + }, + { + "epoch": 0.16473579601410998, + "grad_norm": 1.9188499450683594, + "learning_rate": 0.0001906298255227593, + "loss": 1.2497, + "step": 4600 + }, + { + "epoch": 0.16477160814367828, + "grad_norm": 1.7566003799438477, + "learning_rate": 0.00019062492272134307, + "loss": 1.0647, + "step": 4601 + }, + { + "epoch": 0.16480742027324655, + "grad_norm": 1.6887013912200928, + "learning_rate": 0.0001906200187006854, + "loss": 1.1372, + "step": 4602 + }, + { + "epoch": 0.16484323240281484, + "grad_norm": 1.4523262977600098, + "learning_rate": 0.0001906151134608523, + "loss": 1.2111, + "step": 4603 + }, + { + "epoch": 0.1648790445323831, + "grad_norm": 2.110483169555664, + "learning_rate": 0.0001906102070019097, + "loss": 1.2187, + "step": 4604 + }, + { + "epoch": 0.1649148566619514, + "grad_norm": 2.111218214035034, + "learning_rate": 0.00019060529932392366, + "loss": 1.0412, + "step": 4605 + }, + { + "epoch": 0.16495066879151968, + "grad_norm": 1.9994667768478394, + "learning_rate": 0.00019060039042696016, + "loss": 1.1999, + "step": 4606 + }, + { + "epoch": 0.16498648092108797, + "grad_norm": 1.333627462387085, + "learning_rate": 0.00019059548031108528, + "loss": 1.1168, + "step": 4607 + }, + { + "epoch": 0.16502229305065624, + "grad_norm": 1.9869219064712524, + "learning_rate": 0.0001905905689763651, + "loss": 1.0308, + "step": 4608 + }, + { + "epoch": 0.16505810518022454, + "grad_norm": 1.5410627126693726, + "learning_rate": 0.00019058565642286567, + "loss": 0.9635, + "step": 4609 + }, + { + "epoch": 0.16509391730979284, + "grad_norm": 1.1542079448699951, + "learning_rate": 0.00019058074265065303, + "loss": 1.1443, + "step": 4610 + }, + { + "epoch": 0.1651297294393611, + "grad_norm": 1.8268108367919922, + "learning_rate": 0.00019057582765979341, + "loss": 1.097, + "step": 4611 + }, + { + "epoch": 0.1651655415689294, + "grad_norm": 1.7950198650360107, + "learning_rate": 0.00019057091145035281, + "loss": 1.1773, + "step": 4612 + }, + { + "epoch": 0.16520135369849767, + "grad_norm": 1.3903026580810547, + "learning_rate": 0.00019056599402239742, + "loss": 1.1627, + "step": 4613 + }, + { + "epoch": 0.16523716582806597, + "grad_norm": 1.29165780544281, + "learning_rate": 0.00019056107537599343, + "loss": 1.1441, + "step": 4614 + }, + { + "epoch": 0.16527297795763424, + "grad_norm": 1.99923574924469, + "learning_rate": 0.000190556155511207, + "loss": 1.4764, + "step": 4615 + }, + { + "epoch": 0.16530879008720253, + "grad_norm": 2.2431225776672363, + "learning_rate": 0.00019055123442810427, + "loss": 1.216, + "step": 4616 + }, + { + "epoch": 0.16534460221677083, + "grad_norm": 1.6228532791137695, + "learning_rate": 0.00019054631212675152, + "loss": 0.9517, + "step": 4617 + }, + { + "epoch": 0.1653804143463391, + "grad_norm": 1.5099889039993286, + "learning_rate": 0.00019054138860721492, + "loss": 1.2627, + "step": 4618 + }, + { + "epoch": 0.1654162264759074, + "grad_norm": 2.181680917739868, + "learning_rate": 0.00019053646386956073, + "loss": 1.074, + "step": 4619 + }, + { + "epoch": 0.16545203860547567, + "grad_norm": 1.419111967086792, + "learning_rate": 0.00019053153791385516, + "loss": 1.239, + "step": 4620 + }, + { + "epoch": 0.16548785073504396, + "grad_norm": 1.9157251119613647, + "learning_rate": 0.00019052661074016458, + "loss": 1.2185, + "step": 4621 + }, + { + "epoch": 0.16552366286461223, + "grad_norm": 1.5868587493896484, + "learning_rate": 0.0001905216823485552, + "loss": 1.1938, + "step": 4622 + }, + { + "epoch": 0.16555947499418053, + "grad_norm": 1.305956244468689, + "learning_rate": 0.00019051675273909336, + "loss": 1.2823, + "step": 4623 + }, + { + "epoch": 0.16559528712374882, + "grad_norm": 2.0025367736816406, + "learning_rate": 0.00019051182191184537, + "loss": 1.0085, + "step": 4624 + }, + { + "epoch": 0.1656310992533171, + "grad_norm": 1.7980509996414185, + "learning_rate": 0.00019050688986687754, + "loss": 1.1315, + "step": 4625 + }, + { + "epoch": 0.1656669113828854, + "grad_norm": 1.5174260139465332, + "learning_rate": 0.00019050195660425627, + "loss": 1.1409, + "step": 4626 + }, + { + "epoch": 0.16570272351245366, + "grad_norm": 1.6812987327575684, + "learning_rate": 0.00019049702212404793, + "loss": 1.0607, + "step": 4627 + }, + { + "epoch": 0.16573853564202196, + "grad_norm": 1.2894487380981445, + "learning_rate": 0.00019049208642631885, + "loss": 1.1647, + "step": 4628 + }, + { + "epoch": 0.16577434777159022, + "grad_norm": 1.6481224298477173, + "learning_rate": 0.00019048714951113552, + "loss": 1.2951, + "step": 4629 + }, + { + "epoch": 0.16581015990115852, + "grad_norm": 1.6044607162475586, + "learning_rate": 0.00019048221137856427, + "loss": 1.3284, + "step": 4630 + }, + { + "epoch": 0.16584597203072682, + "grad_norm": 1.5551249980926514, + "learning_rate": 0.0001904772720286716, + "loss": 1.2884, + "step": 4631 + }, + { + "epoch": 0.1658817841602951, + "grad_norm": 1.700853705406189, + "learning_rate": 0.00019047233146152393, + "loss": 1.1267, + "step": 4632 + }, + { + "epoch": 0.16591759628986338, + "grad_norm": 1.543239712715149, + "learning_rate": 0.00019046738967718778, + "loss": 1.3238, + "step": 4633 + }, + { + "epoch": 0.16595340841943165, + "grad_norm": 1.521651029586792, + "learning_rate": 0.00019046244667572957, + "loss": 1.2309, + "step": 4634 + }, + { + "epoch": 0.16598922054899995, + "grad_norm": 1.80137300491333, + "learning_rate": 0.00019045750245721583, + "loss": 1.0074, + "step": 4635 + }, + { + "epoch": 0.16602503267856822, + "grad_norm": 2.034578561782837, + "learning_rate": 0.00019045255702171307, + "loss": 1.1494, + "step": 4636 + }, + { + "epoch": 0.16606084480813652, + "grad_norm": 1.9390887022018433, + "learning_rate": 0.00019044761036928783, + "loss": 1.3122, + "step": 4637 + }, + { + "epoch": 0.1660966569377048, + "grad_norm": 2.4497146606445312, + "learning_rate": 0.00019044266250000668, + "loss": 1.3293, + "step": 4638 + }, + { + "epoch": 0.16613246906727308, + "grad_norm": 1.8555299043655396, + "learning_rate": 0.00019043771341393614, + "loss": 1.2007, + "step": 4639 + }, + { + "epoch": 0.16616828119684138, + "grad_norm": 1.5204429626464844, + "learning_rate": 0.00019043276311114283, + "loss": 1.2661, + "step": 4640 + }, + { + "epoch": 0.16620409332640965, + "grad_norm": 3.0212838649749756, + "learning_rate": 0.00019042781159169336, + "loss": 1.26, + "step": 4641 + }, + { + "epoch": 0.16623990545597794, + "grad_norm": 1.396894097328186, + "learning_rate": 0.00019042285885565433, + "loss": 1.182, + "step": 4642 + }, + { + "epoch": 0.1662757175855462, + "grad_norm": 1.760754942893982, + "learning_rate": 0.00019041790490309233, + "loss": 0.9453, + "step": 4643 + }, + { + "epoch": 0.1663115297151145, + "grad_norm": 1.5238252878189087, + "learning_rate": 0.00019041294973407412, + "loss": 1.2236, + "step": 4644 + }, + { + "epoch": 0.1663473418446828, + "grad_norm": 2.1149866580963135, + "learning_rate": 0.00019040799334866626, + "loss": 1.3678, + "step": 4645 + }, + { + "epoch": 0.16638315397425107, + "grad_norm": 1.856547236442566, + "learning_rate": 0.00019040303574693545, + "loss": 1.2523, + "step": 4646 + }, + { + "epoch": 0.16641896610381937, + "grad_norm": 1.5953292846679688, + "learning_rate": 0.00019039807692894842, + "loss": 1.1412, + "step": 4647 + }, + { + "epoch": 0.16645477823338764, + "grad_norm": 1.6485199928283691, + "learning_rate": 0.00019039311689477185, + "loss": 1.2222, + "step": 4648 + }, + { + "epoch": 0.16649059036295594, + "grad_norm": 1.6217297315597534, + "learning_rate": 0.0001903881556444725, + "loss": 1.1151, + "step": 4649 + }, + { + "epoch": 0.1665264024925242, + "grad_norm": 1.8155043125152588, + "learning_rate": 0.00019038319317811714, + "loss": 1.1309, + "step": 4650 + }, + { + "epoch": 0.1665622146220925, + "grad_norm": 1.3995726108551025, + "learning_rate": 0.00019037822949577248, + "loss": 1.0119, + "step": 4651 + }, + { + "epoch": 0.1665980267516608, + "grad_norm": 1.3577842712402344, + "learning_rate": 0.00019037326459750534, + "loss": 1.2524, + "step": 4652 + }, + { + "epoch": 0.16663383888122907, + "grad_norm": 1.4114586114883423, + "learning_rate": 0.00019036829848338246, + "loss": 1.1469, + "step": 4653 + }, + { + "epoch": 0.16666965101079736, + "grad_norm": 1.9447810649871826, + "learning_rate": 0.00019036333115347073, + "loss": 1.2322, + "step": 4654 + }, + { + "epoch": 0.16670546314036563, + "grad_norm": 1.6723659038543701, + "learning_rate": 0.00019035836260783691, + "loss": 1.1307, + "step": 4655 + }, + { + "epoch": 0.16674127526993393, + "grad_norm": 1.3963121175765991, + "learning_rate": 0.00019035339284654787, + "loss": 1.0801, + "step": 4656 + }, + { + "epoch": 0.1667770873995022, + "grad_norm": 1.7298917770385742, + "learning_rate": 0.0001903484218696705, + "loss": 1.1887, + "step": 4657 + }, + { + "epoch": 0.1668128995290705, + "grad_norm": 1.557509422302246, + "learning_rate": 0.00019034344967727165, + "loss": 1.2738, + "step": 4658 + }, + { + "epoch": 0.1668487116586388, + "grad_norm": 1.9298263788223267, + "learning_rate": 0.0001903384762694182, + "loss": 1.331, + "step": 4659 + }, + { + "epoch": 0.16688452378820706, + "grad_norm": 1.522019863128662, + "learning_rate": 0.0001903335016461771, + "loss": 1.09, + "step": 4660 + }, + { + "epoch": 0.16692033591777536, + "grad_norm": 1.8806453943252563, + "learning_rate": 0.00019032852580761527, + "loss": 1.2488, + "step": 4661 + }, + { + "epoch": 0.16695614804734363, + "grad_norm": 1.224436640739441, + "learning_rate": 0.00019032354875379962, + "loss": 1.0414, + "step": 4662 + }, + { + "epoch": 0.16699196017691192, + "grad_norm": 1.854073166847229, + "learning_rate": 0.00019031857048479713, + "loss": 0.9687, + "step": 4663 + }, + { + "epoch": 0.1670277723064802, + "grad_norm": 1.6285663843154907, + "learning_rate": 0.00019031359100067478, + "loss": 1.3422, + "step": 4664 + }, + { + "epoch": 0.1670635844360485, + "grad_norm": 2.190809726715088, + "learning_rate": 0.00019030861030149956, + "loss": 1.0288, + "step": 4665 + }, + { + "epoch": 0.1670993965656168, + "grad_norm": 2.022881507873535, + "learning_rate": 0.00019030362838733846, + "loss": 1.0525, + "step": 4666 + }, + { + "epoch": 0.16713520869518506, + "grad_norm": 1.2658923864364624, + "learning_rate": 0.00019029864525825857, + "loss": 1.2009, + "step": 4667 + }, + { + "epoch": 0.16717102082475335, + "grad_norm": 1.243189811706543, + "learning_rate": 0.00019029366091432684, + "loss": 1.2471, + "step": 4668 + }, + { + "epoch": 0.16720683295432162, + "grad_norm": 1.7862849235534668, + "learning_rate": 0.0001902886753556104, + "loss": 1.0535, + "step": 4669 + }, + { + "epoch": 0.16724264508388992, + "grad_norm": 1.478222131729126, + "learning_rate": 0.0001902836885821763, + "loss": 1.1113, + "step": 4670 + }, + { + "epoch": 0.1672784572134582, + "grad_norm": 1.462168574333191, + "learning_rate": 0.00019027870059409158, + "loss": 1.0729, + "step": 4671 + }, + { + "epoch": 0.16731426934302648, + "grad_norm": 1.5069997310638428, + "learning_rate": 0.00019027371139142342, + "loss": 1.1671, + "step": 4672 + }, + { + "epoch": 0.16735008147259478, + "grad_norm": 1.4407960176467896, + "learning_rate": 0.00019026872097423894, + "loss": 1.0293, + "step": 4673 + }, + { + "epoch": 0.16738589360216305, + "grad_norm": 1.6638065576553345, + "learning_rate": 0.00019026372934260525, + "loss": 1.0186, + "step": 4674 + }, + { + "epoch": 0.16742170573173135, + "grad_norm": 1.8162329196929932, + "learning_rate": 0.0001902587364965895, + "loss": 1.0247, + "step": 4675 + }, + { + "epoch": 0.16745751786129962, + "grad_norm": 1.5036933422088623, + "learning_rate": 0.00019025374243625888, + "loss": 1.3096, + "step": 4676 + }, + { + "epoch": 0.1674933299908679, + "grad_norm": 1.4695520401000977, + "learning_rate": 0.0001902487471616806, + "loss": 1.3907, + "step": 4677 + }, + { + "epoch": 0.16752914212043618, + "grad_norm": 1.7133365869522095, + "learning_rate": 0.00019024375067292181, + "loss": 1.15, + "step": 4678 + }, + { + "epoch": 0.16756495425000448, + "grad_norm": 2.440105676651001, + "learning_rate": 0.00019023875297004977, + "loss": 1.3812, + "step": 4679 + }, + { + "epoch": 0.16760076637957277, + "grad_norm": 1.4445240497589111, + "learning_rate": 0.0001902337540531317, + "loss": 1.2551, + "step": 4680 + }, + { + "epoch": 0.16763657850914104, + "grad_norm": 1.9982246160507202, + "learning_rate": 0.00019022875392223486, + "loss": 1.2651, + "step": 4681 + }, + { + "epoch": 0.16767239063870934, + "grad_norm": 1.5661993026733398, + "learning_rate": 0.00019022375257742656, + "loss": 1.3113, + "step": 4682 + }, + { + "epoch": 0.1677082027682776, + "grad_norm": 1.5739009380340576, + "learning_rate": 0.000190218750018774, + "loss": 1.0716, + "step": 4683 + }, + { + "epoch": 0.1677440148978459, + "grad_norm": 2.654669761657715, + "learning_rate": 0.00019021374624634456, + "loss": 1.1663, + "step": 4684 + }, + { + "epoch": 0.16777982702741417, + "grad_norm": 1.7296613454818726, + "learning_rate": 0.0001902087412602055, + "loss": 1.2174, + "step": 4685 + }, + { + "epoch": 0.16781563915698247, + "grad_norm": 1.4873186349868774, + "learning_rate": 0.00019020373506042424, + "loss": 1.0642, + "step": 4686 + }, + { + "epoch": 0.16785145128655077, + "grad_norm": 1.5815755128860474, + "learning_rate": 0.00019019872764706804, + "loss": 1.3504, + "step": 4687 + }, + { + "epoch": 0.16788726341611904, + "grad_norm": 1.8232604265213013, + "learning_rate": 0.00019019371902020434, + "loss": 1.287, + "step": 4688 + }, + { + "epoch": 0.16792307554568733, + "grad_norm": 1.751360535621643, + "learning_rate": 0.00019018870917990045, + "loss": 1.4862, + "step": 4689 + }, + { + "epoch": 0.1679588876752556, + "grad_norm": 1.4138318300247192, + "learning_rate": 0.00019018369812622384, + "loss": 1.334, + "step": 4690 + }, + { + "epoch": 0.1679946998048239, + "grad_norm": 1.5787750482559204, + "learning_rate": 0.0001901786858592419, + "loss": 1.0002, + "step": 4691 + }, + { + "epoch": 0.16803051193439217, + "grad_norm": 1.6909499168395996, + "learning_rate": 0.00019017367237902206, + "loss": 1.0584, + "step": 4692 + }, + { + "epoch": 0.16806632406396046, + "grad_norm": 1.8485896587371826, + "learning_rate": 0.00019016865768563176, + "loss": 1.3755, + "step": 4693 + }, + { + "epoch": 0.16810213619352876, + "grad_norm": 1.836942195892334, + "learning_rate": 0.0001901636417791385, + "loss": 1.1855, + "step": 4694 + }, + { + "epoch": 0.16813794832309703, + "grad_norm": 1.6487377882003784, + "learning_rate": 0.00019015862465960974, + "loss": 1.31, + "step": 4695 + }, + { + "epoch": 0.16817376045266533, + "grad_norm": 1.494375228881836, + "learning_rate": 0.00019015360632711298, + "loss": 1.3274, + "step": 4696 + }, + { + "epoch": 0.1682095725822336, + "grad_norm": 1.5535235404968262, + "learning_rate": 0.00019014858678171573, + "loss": 1.1832, + "step": 4697 + }, + { + "epoch": 0.1682453847118019, + "grad_norm": 2.197988748550415, + "learning_rate": 0.0001901435660234855, + "loss": 1.3457, + "step": 4698 + }, + { + "epoch": 0.16828119684137016, + "grad_norm": 1.5146814584732056, + "learning_rate": 0.00019013854405248992, + "loss": 1.194, + "step": 4699 + }, + { + "epoch": 0.16831700897093846, + "grad_norm": 1.5298281908035278, + "learning_rate": 0.0001901335208687965, + "loss": 1.2491, + "step": 4700 + }, + { + "epoch": 0.16835282110050676, + "grad_norm": 1.7546117305755615, + "learning_rate": 0.00019012849647247277, + "loss": 1.0421, + "step": 4701 + }, + { + "epoch": 0.16838863323007502, + "grad_norm": 1.5282258987426758, + "learning_rate": 0.0001901234708635864, + "loss": 1.2008, + "step": 4702 + }, + { + "epoch": 0.16842444535964332, + "grad_norm": 1.7478458881378174, + "learning_rate": 0.00019011844404220497, + "loss": 1.1928, + "step": 4703 + }, + { + "epoch": 0.1684602574892116, + "grad_norm": 1.4669198989868164, + "learning_rate": 0.00019011341600839616, + "loss": 1.1758, + "step": 4704 + }, + { + "epoch": 0.1684960696187799, + "grad_norm": 1.4467236995697021, + "learning_rate": 0.00019010838676222755, + "loss": 1.2624, + "step": 4705 + }, + { + "epoch": 0.16853188174834816, + "grad_norm": 1.7796226739883423, + "learning_rate": 0.00019010335630376682, + "loss": 1.0346, + "step": 4706 + }, + { + "epoch": 0.16856769387791645, + "grad_norm": 1.726049542427063, + "learning_rate": 0.00019009832463308168, + "loss": 1.4576, + "step": 4707 + }, + { + "epoch": 0.16860350600748472, + "grad_norm": 2.073401689529419, + "learning_rate": 0.00019009329175023978, + "loss": 1.4013, + "step": 4708 + }, + { + "epoch": 0.16863931813705302, + "grad_norm": 1.3228259086608887, + "learning_rate": 0.00019008825765530886, + "loss": 1.1896, + "step": 4709 + }, + { + "epoch": 0.16867513026662131, + "grad_norm": 1.5485994815826416, + "learning_rate": 0.00019008322234835662, + "loss": 1.1407, + "step": 4710 + }, + { + "epoch": 0.16871094239618958, + "grad_norm": 1.42915678024292, + "learning_rate": 0.00019007818582945086, + "loss": 1.0734, + "step": 4711 + }, + { + "epoch": 0.16874675452575788, + "grad_norm": 1.6646885871887207, + "learning_rate": 0.00019007314809865928, + "loss": 1.2503, + "step": 4712 + }, + { + "epoch": 0.16878256665532615, + "grad_norm": 2.0191807746887207, + "learning_rate": 0.00019006810915604967, + "loss": 1.1801, + "step": 4713 + }, + { + "epoch": 0.16881837878489445, + "grad_norm": 1.8239531517028809, + "learning_rate": 0.00019006306900168983, + "loss": 1.1875, + "step": 4714 + }, + { + "epoch": 0.16885419091446272, + "grad_norm": 1.4287577867507935, + "learning_rate": 0.00019005802763564757, + "loss": 1.0476, + "step": 4715 + }, + { + "epoch": 0.168890003044031, + "grad_norm": 1.7913970947265625, + "learning_rate": 0.00019005298505799073, + "loss": 1.296, + "step": 4716 + }, + { + "epoch": 0.1689258151735993, + "grad_norm": 1.6100245714187622, + "learning_rate": 0.00019004794126878713, + "loss": 1.3598, + "step": 4717 + }, + { + "epoch": 0.16896162730316758, + "grad_norm": 1.8645174503326416, + "learning_rate": 0.00019004289626810462, + "loss": 1.0918, + "step": 4718 + }, + { + "epoch": 0.16899743943273587, + "grad_norm": 1.4445300102233887, + "learning_rate": 0.00019003785005601112, + "loss": 1.1552, + "step": 4719 + }, + { + "epoch": 0.16903325156230414, + "grad_norm": 1.4976195096969604, + "learning_rate": 0.00019003280263257447, + "loss": 1.2449, + "step": 4720 + }, + { + "epoch": 0.16906906369187244, + "grad_norm": 1.3155039548873901, + "learning_rate": 0.0001900277539978626, + "loss": 1.15, + "step": 4721 + }, + { + "epoch": 0.1691048758214407, + "grad_norm": 1.9472308158874512, + "learning_rate": 0.0001900227041519434, + "loss": 1.1428, + "step": 4722 + }, + { + "epoch": 0.169140687951009, + "grad_norm": 1.9459317922592163, + "learning_rate": 0.00019001765309488487, + "loss": 1.4198, + "step": 4723 + }, + { + "epoch": 0.1691765000805773, + "grad_norm": 1.9523215293884277, + "learning_rate": 0.00019001260082675492, + "loss": 1.3384, + "step": 4724 + }, + { + "epoch": 0.16921231221014557, + "grad_norm": 1.3982763290405273, + "learning_rate": 0.00019000754734762153, + "loss": 1.1903, + "step": 4725 + }, + { + "epoch": 0.16924812433971387, + "grad_norm": 1.971880555152893, + "learning_rate": 0.0001900024926575527, + "loss": 1.1724, + "step": 4726 + }, + { + "epoch": 0.16928393646928214, + "grad_norm": 1.7573609352111816, + "learning_rate": 0.0001899974367566164, + "loss": 1.2855, + "step": 4727 + }, + { + "epoch": 0.16931974859885043, + "grad_norm": 1.651070237159729, + "learning_rate": 0.00018999237964488074, + "loss": 1.0451, + "step": 4728 + }, + { + "epoch": 0.1693555607284187, + "grad_norm": 1.4095699787139893, + "learning_rate": 0.0001899873213224136, + "loss": 1.1424, + "step": 4729 + }, + { + "epoch": 0.169391372857987, + "grad_norm": 1.6626442670822144, + "learning_rate": 0.0001899822617892832, + "loss": 1.3311, + "step": 4730 + }, + { + "epoch": 0.1694271849875553, + "grad_norm": 1.5390369892120361, + "learning_rate": 0.0001899772010455575, + "loss": 1.1243, + "step": 4731 + }, + { + "epoch": 0.16946299711712356, + "grad_norm": 1.5373798608779907, + "learning_rate": 0.00018997213909130464, + "loss": 0.9989, + "step": 4732 + }, + { + "epoch": 0.16949880924669186, + "grad_norm": 1.806597352027893, + "learning_rate": 0.0001899670759265927, + "loss": 1.2411, + "step": 4733 + }, + { + "epoch": 0.16953462137626013, + "grad_norm": 1.9421225786209106, + "learning_rate": 0.00018996201155148983, + "loss": 1.1518, + "step": 4734 + }, + { + "epoch": 0.16957043350582843, + "grad_norm": 2.0039803981781006, + "learning_rate": 0.0001899569459660641, + "loss": 1.3374, + "step": 4735 + }, + { + "epoch": 0.1696062456353967, + "grad_norm": 1.3825805187225342, + "learning_rate": 0.0001899518791703837, + "loss": 1.1378, + "step": 4736 + }, + { + "epoch": 0.169642057764965, + "grad_norm": 1.4057689905166626, + "learning_rate": 0.0001899468111645168, + "loss": 1.2844, + "step": 4737 + }, + { + "epoch": 0.1696778698945333, + "grad_norm": 1.6171038150787354, + "learning_rate": 0.00018994174194853161, + "loss": 1.1064, + "step": 4738 + }, + { + "epoch": 0.16971368202410156, + "grad_norm": 1.4919551610946655, + "learning_rate": 0.00018993667152249625, + "loss": 1.1239, + "step": 4739 + }, + { + "epoch": 0.16974949415366986, + "grad_norm": 1.6611560583114624, + "learning_rate": 0.00018993159988647901, + "loss": 1.4132, + "step": 4740 + }, + { + "epoch": 0.16978530628323812, + "grad_norm": 1.6839720010757446, + "learning_rate": 0.0001899265270405481, + "loss": 1.2752, + "step": 4741 + }, + { + "epoch": 0.16982111841280642, + "grad_norm": 2.3424127101898193, + "learning_rate": 0.00018992145298477175, + "loss": 1.1639, + "step": 4742 + }, + { + "epoch": 0.1698569305423747, + "grad_norm": 1.774057388305664, + "learning_rate": 0.00018991637771921825, + "loss": 1.4078, + "step": 4743 + }, + { + "epoch": 0.169892742671943, + "grad_norm": 1.739351749420166, + "learning_rate": 0.00018991130124395585, + "loss": 1.3764, + "step": 4744 + }, + { + "epoch": 0.16992855480151128, + "grad_norm": 1.9991254806518555, + "learning_rate": 0.0001899062235590529, + "loss": 1.1499, + "step": 4745 + }, + { + "epoch": 0.16996436693107955, + "grad_norm": 1.6175276041030884, + "learning_rate": 0.00018990114466457768, + "loss": 1.3618, + "step": 4746 + }, + { + "epoch": 0.17000017906064785, + "grad_norm": 1.5790889263153076, + "learning_rate": 0.0001898960645605985, + "loss": 1.1919, + "step": 4747 + }, + { + "epoch": 0.17003599119021612, + "grad_norm": 1.88215970993042, + "learning_rate": 0.00018989098324718375, + "loss": 1.1912, + "step": 4748 + }, + { + "epoch": 0.17007180331978441, + "grad_norm": 1.4670408964157104, + "learning_rate": 0.00018988590072440176, + "loss": 1.2966, + "step": 4749 + }, + { + "epoch": 0.17010761544935268, + "grad_norm": 1.859674096107483, + "learning_rate": 0.00018988081699232095, + "loss": 1.2879, + "step": 4750 + }, + { + "epoch": 0.17014342757892098, + "grad_norm": 1.7120777368545532, + "learning_rate": 0.00018987573205100965, + "loss": 1.244, + "step": 4751 + }, + { + "epoch": 0.17017923970848928, + "grad_norm": 1.7116392850875854, + "learning_rate": 0.00018987064590053634, + "loss": 1.5195, + "step": 4752 + }, + { + "epoch": 0.17021505183805755, + "grad_norm": 1.6049576997756958, + "learning_rate": 0.0001898655585409694, + "loss": 1.1025, + "step": 4753 + }, + { + "epoch": 0.17025086396762584, + "grad_norm": 2.440330743789673, + "learning_rate": 0.00018986046997237726, + "loss": 1.2201, + "step": 4754 + }, + { + "epoch": 0.1702866760971941, + "grad_norm": 1.4258679151535034, + "learning_rate": 0.00018985538019482842, + "loss": 1.295, + "step": 4755 + }, + { + "epoch": 0.1703224882267624, + "grad_norm": 2.2950217723846436, + "learning_rate": 0.00018985028920839137, + "loss": 1.3321, + "step": 4756 + }, + { + "epoch": 0.17035830035633068, + "grad_norm": 1.5237529277801514, + "learning_rate": 0.00018984519701313455, + "loss": 1.1991, + "step": 4757 + }, + { + "epoch": 0.17039411248589897, + "grad_norm": 1.6332066059112549, + "learning_rate": 0.0001898401036091265, + "loss": 1.1966, + "step": 4758 + }, + { + "epoch": 0.17042992461546727, + "grad_norm": 1.5417430400848389, + "learning_rate": 0.00018983500899643577, + "loss": 1.4413, + "step": 4759 + }, + { + "epoch": 0.17046573674503554, + "grad_norm": 1.3178311586380005, + "learning_rate": 0.00018982991317513084, + "loss": 1.145, + "step": 4760 + }, + { + "epoch": 0.17050154887460384, + "grad_norm": 1.9986690282821655, + "learning_rate": 0.0001898248161452803, + "loss": 1.1234, + "step": 4761 + }, + { + "epoch": 0.1705373610041721, + "grad_norm": 1.4623093605041504, + "learning_rate": 0.00018981971790695275, + "loss": 1.1612, + "step": 4762 + }, + { + "epoch": 0.1705731731337404, + "grad_norm": 1.6274795532226562, + "learning_rate": 0.0001898146184602167, + "loss": 1.2643, + "step": 4763 + }, + { + "epoch": 0.17060898526330867, + "grad_norm": 1.4718364477157593, + "learning_rate": 0.0001898095178051409, + "loss": 1.1186, + "step": 4764 + }, + { + "epoch": 0.17064479739287697, + "grad_norm": 1.454458236694336, + "learning_rate": 0.0001898044159417938, + "loss": 1.2588, + "step": 4765 + }, + { + "epoch": 0.17068060952244526, + "grad_norm": 1.2591147422790527, + "learning_rate": 0.00018979931287024416, + "loss": 0.9728, + "step": 4766 + }, + { + "epoch": 0.17071642165201353, + "grad_norm": 1.8008278608322144, + "learning_rate": 0.00018979420859056062, + "loss": 1.4276, + "step": 4767 + }, + { + "epoch": 0.17075223378158183, + "grad_norm": 2.3183393478393555, + "learning_rate": 0.0001897891031028118, + "loss": 1.2964, + "step": 4768 + }, + { + "epoch": 0.1707880459111501, + "grad_norm": 2.018542528152466, + "learning_rate": 0.0001897839964070664, + "loss": 1.1841, + "step": 4769 + }, + { + "epoch": 0.1708238580407184, + "grad_norm": 2.064601182937622, + "learning_rate": 0.00018977888850339319, + "loss": 1.3482, + "step": 4770 + }, + { + "epoch": 0.17085967017028666, + "grad_norm": 1.3155854940414429, + "learning_rate": 0.0001897737793918608, + "loss": 1.2117, + "step": 4771 + }, + { + "epoch": 0.17089548229985496, + "grad_norm": 1.41620934009552, + "learning_rate": 0.00018976866907253803, + "loss": 0.9499, + "step": 4772 + }, + { + "epoch": 0.17093129442942326, + "grad_norm": 1.537757396697998, + "learning_rate": 0.0001897635575454936, + "loss": 1.1525, + "step": 4773 + }, + { + "epoch": 0.17096710655899153, + "grad_norm": 1.4225660562515259, + "learning_rate": 0.0001897584448107963, + "loss": 1.3132, + "step": 4774 + }, + { + "epoch": 0.17100291868855982, + "grad_norm": 1.663432240486145, + "learning_rate": 0.0001897533308685149, + "loss": 1.2619, + "step": 4775 + }, + { + "epoch": 0.1710387308181281, + "grad_norm": 1.5998328924179077, + "learning_rate": 0.00018974821571871822, + "loss": 1.1414, + "step": 4776 + }, + { + "epoch": 0.1710745429476964, + "grad_norm": 1.3194442987442017, + "learning_rate": 0.00018974309936147502, + "loss": 1.1023, + "step": 4777 + }, + { + "epoch": 0.17111035507726466, + "grad_norm": 1.5301756858825684, + "learning_rate": 0.00018973798179685423, + "loss": 1.1506, + "step": 4778 + }, + { + "epoch": 0.17114616720683296, + "grad_norm": 1.6422923803329468, + "learning_rate": 0.0001897328630249246, + "loss": 1.1961, + "step": 4779 + }, + { + "epoch": 0.17118197933640125, + "grad_norm": 1.340721845626831, + "learning_rate": 0.0001897277430457551, + "loss": 0.9904, + "step": 4780 + }, + { + "epoch": 0.17121779146596952, + "grad_norm": 1.7759246826171875, + "learning_rate": 0.00018972262185941452, + "loss": 1.2864, + "step": 4781 + }, + { + "epoch": 0.17125360359553782, + "grad_norm": 1.822115421295166, + "learning_rate": 0.00018971749946597178, + "loss": 1.1713, + "step": 4782 + }, + { + "epoch": 0.1712894157251061, + "grad_norm": 1.4087188243865967, + "learning_rate": 0.00018971237586549587, + "loss": 1.1848, + "step": 4783 + }, + { + "epoch": 0.17132522785467438, + "grad_norm": 1.4446126222610474, + "learning_rate": 0.00018970725105805562, + "loss": 1.1386, + "step": 4784 + }, + { + "epoch": 0.17136103998424265, + "grad_norm": 1.696129322052002, + "learning_rate": 0.00018970212504372004, + "loss": 1.1293, + "step": 4785 + }, + { + "epoch": 0.17139685211381095, + "grad_norm": 1.662096381187439, + "learning_rate": 0.00018969699782255808, + "loss": 1.3387, + "step": 4786 + }, + { + "epoch": 0.17143266424337925, + "grad_norm": 1.8126821517944336, + "learning_rate": 0.0001896918693946387, + "loss": 1.1688, + "step": 4787 + }, + { + "epoch": 0.17146847637294751, + "grad_norm": 1.2400062084197998, + "learning_rate": 0.0001896867397600309, + "loss": 1.3009, + "step": 4788 + }, + { + "epoch": 0.1715042885025158, + "grad_norm": 2.5058693885803223, + "learning_rate": 0.00018968160891880373, + "loss": 1.5603, + "step": 4789 + }, + { + "epoch": 0.17154010063208408, + "grad_norm": 1.278178334236145, + "learning_rate": 0.00018967647687102618, + "loss": 1.0685, + "step": 4790 + }, + { + "epoch": 0.17157591276165238, + "grad_norm": 2.025056838989258, + "learning_rate": 0.00018967134361676732, + "loss": 1.1106, + "step": 4791 + }, + { + "epoch": 0.17161172489122065, + "grad_norm": 1.3619027137756348, + "learning_rate": 0.00018966620915609618, + "loss": 1.2129, + "step": 4792 + }, + { + "epoch": 0.17164753702078894, + "grad_norm": 2.2306933403015137, + "learning_rate": 0.00018966107348908188, + "loss": 1.0709, + "step": 4793 + }, + { + "epoch": 0.17168334915035724, + "grad_norm": 1.6596362590789795, + "learning_rate": 0.00018965593661579347, + "loss": 1.1984, + "step": 4794 + }, + { + "epoch": 0.1717191612799255, + "grad_norm": 1.8007298707962036, + "learning_rate": 0.00018965079853630007, + "loss": 1.3355, + "step": 4795 + }, + { + "epoch": 0.1717549734094938, + "grad_norm": 1.813963770866394, + "learning_rate": 0.00018964565925067085, + "loss": 1.3287, + "step": 4796 + }, + { + "epoch": 0.17179078553906207, + "grad_norm": 1.5186985731124878, + "learning_rate": 0.0001896405187589749, + "loss": 1.0377, + "step": 4797 + }, + { + "epoch": 0.17182659766863037, + "grad_norm": 1.8111257553100586, + "learning_rate": 0.0001896353770612814, + "loss": 1.3567, + "step": 4798 + }, + { + "epoch": 0.17186240979819864, + "grad_norm": 2.105717420578003, + "learning_rate": 0.00018963023415765956, + "loss": 1.2043, + "step": 4799 + }, + { + "epoch": 0.17189822192776694, + "grad_norm": 1.75229811668396, + "learning_rate": 0.00018962509004817846, + "loss": 1.1882, + "step": 4800 + }, + { + "epoch": 0.17193403405733523, + "grad_norm": 1.64487624168396, + "learning_rate": 0.00018961994473290744, + "loss": 1.1523, + "step": 4801 + }, + { + "epoch": 0.1719698461869035, + "grad_norm": 1.8683810234069824, + "learning_rate": 0.00018961479821191562, + "loss": 1.2062, + "step": 4802 + }, + { + "epoch": 0.1720056583164718, + "grad_norm": 1.5621305704116821, + "learning_rate": 0.00018960965048527232, + "loss": 1.2192, + "step": 4803 + }, + { + "epoch": 0.17204147044604007, + "grad_norm": 1.6298075914382935, + "learning_rate": 0.00018960450155304677, + "loss": 1.0638, + "step": 4804 + }, + { + "epoch": 0.17207728257560836, + "grad_norm": 1.808280110359192, + "learning_rate": 0.00018959935141530821, + "loss": 1.2022, + "step": 4805 + }, + { + "epoch": 0.17211309470517663, + "grad_norm": 2.0043387413024902, + "learning_rate": 0.00018959420007212593, + "loss": 0.987, + "step": 4806 + }, + { + "epoch": 0.17214890683474493, + "grad_norm": 1.7056399583816528, + "learning_rate": 0.0001895890475235693, + "loss": 1.1488, + "step": 4807 + }, + { + "epoch": 0.1721847189643132, + "grad_norm": 1.5555033683776855, + "learning_rate": 0.00018958389376970758, + "loss": 1.2729, + "step": 4808 + }, + { + "epoch": 0.1722205310938815, + "grad_norm": 1.471787452697754, + "learning_rate": 0.00018957873881061014, + "loss": 1.1417, + "step": 4809 + }, + { + "epoch": 0.1722563432234498, + "grad_norm": 1.7012022733688354, + "learning_rate": 0.00018957358264634627, + "loss": 1.4235, + "step": 4810 + }, + { + "epoch": 0.17229215535301806, + "grad_norm": 1.6687122583389282, + "learning_rate": 0.0001895684252769854, + "loss": 1.0217, + "step": 4811 + }, + { + "epoch": 0.17232796748258636, + "grad_norm": 1.6322672367095947, + "learning_rate": 0.00018956326670259695, + "loss": 1.0648, + "step": 4812 + }, + { + "epoch": 0.17236377961215463, + "grad_norm": 1.7864867448806763, + "learning_rate": 0.00018955810692325025, + "loss": 1.229, + "step": 4813 + }, + { + "epoch": 0.17239959174172292, + "grad_norm": 1.800253987312317, + "learning_rate": 0.00018955294593901476, + "loss": 1.4059, + "step": 4814 + }, + { + "epoch": 0.1724354038712912, + "grad_norm": 1.7635972499847412, + "learning_rate": 0.00018954778374995988, + "loss": 1.0568, + "step": 4815 + }, + { + "epoch": 0.1724712160008595, + "grad_norm": 1.3471447229385376, + "learning_rate": 0.00018954262035615505, + "loss": 1.3959, + "step": 4816 + }, + { + "epoch": 0.1725070281304278, + "grad_norm": 1.7025173902511597, + "learning_rate": 0.0001895374557576698, + "loss": 1.2309, + "step": 4817 + }, + { + "epoch": 0.17254284025999606, + "grad_norm": 1.800162434577942, + "learning_rate": 0.00018953228995457355, + "loss": 1.2504, + "step": 4818 + }, + { + "epoch": 0.17257865238956435, + "grad_norm": 1.7077995538711548, + "learning_rate": 0.00018952712294693585, + "loss": 1.3124, + "step": 4819 + }, + { + "epoch": 0.17261446451913262, + "grad_norm": 2.154409408569336, + "learning_rate": 0.0001895219547348262, + "loss": 1.1135, + "step": 4820 + }, + { + "epoch": 0.17265027664870092, + "grad_norm": 1.9874820709228516, + "learning_rate": 0.0001895167853183141, + "loss": 1.3199, + "step": 4821 + }, + { + "epoch": 0.1726860887782692, + "grad_norm": 2.0398480892181396, + "learning_rate": 0.00018951161469746915, + "loss": 1.1404, + "step": 4822 + }, + { + "epoch": 0.17272190090783748, + "grad_norm": 1.6196132898330688, + "learning_rate": 0.00018950644287236084, + "loss": 1.4648, + "step": 4823 + }, + { + "epoch": 0.17275771303740578, + "grad_norm": 2.0002365112304688, + "learning_rate": 0.00018950126984305885, + "loss": 1.2058, + "step": 4824 + }, + { + "epoch": 0.17279352516697405, + "grad_norm": 1.9498478174209595, + "learning_rate": 0.0001894960956096327, + "loss": 1.087, + "step": 4825 + }, + { + "epoch": 0.17282933729654235, + "grad_norm": 2.2983922958374023, + "learning_rate": 0.000189490920172152, + "loss": 1.4322, + "step": 4826 + }, + { + "epoch": 0.17286514942611061, + "grad_norm": 1.4018276929855347, + "learning_rate": 0.00018948574353068643, + "loss": 1.3097, + "step": 4827 + }, + { + "epoch": 0.1729009615556789, + "grad_norm": 1.721871256828308, + "learning_rate": 0.0001894805656853056, + "loss": 1.2478, + "step": 4828 + }, + { + "epoch": 0.17293677368524718, + "grad_norm": 1.4413807392120361, + "learning_rate": 0.00018947538663607918, + "loss": 0.9122, + "step": 4829 + }, + { + "epoch": 0.17297258581481548, + "grad_norm": 2.3940441608428955, + "learning_rate": 0.00018947020638307687, + "loss": 1.3793, + "step": 4830 + }, + { + "epoch": 0.17300839794438377, + "grad_norm": 1.9251985549926758, + "learning_rate": 0.0001894650249263683, + "loss": 1.2549, + "step": 4831 + }, + { + "epoch": 0.17304421007395204, + "grad_norm": 1.348172664642334, + "learning_rate": 0.00018945984226602326, + "loss": 1.3082, + "step": 4832 + }, + { + "epoch": 0.17308002220352034, + "grad_norm": 1.5119335651397705, + "learning_rate": 0.00018945465840211143, + "loss": 1.2909, + "step": 4833 + }, + { + "epoch": 0.1731158343330886, + "grad_norm": 1.8641186952590942, + "learning_rate": 0.00018944947333470252, + "loss": 1.2338, + "step": 4834 + }, + { + "epoch": 0.1731516464626569, + "grad_norm": 2.1562392711639404, + "learning_rate": 0.0001894442870638664, + "loss": 1.2383, + "step": 4835 + }, + { + "epoch": 0.17318745859222517, + "grad_norm": 2.477849245071411, + "learning_rate": 0.00018943909958967273, + "loss": 1.4943, + "step": 4836 + }, + { + "epoch": 0.17322327072179347, + "grad_norm": 1.8404033184051514, + "learning_rate": 0.00018943391091219133, + "loss": 1.2357, + "step": 4837 + }, + { + "epoch": 0.17325908285136177, + "grad_norm": 1.3149681091308594, + "learning_rate": 0.00018942872103149206, + "loss": 1.253, + "step": 4838 + }, + { + "epoch": 0.17329489498093004, + "grad_norm": 1.7529081106185913, + "learning_rate": 0.00018942352994764464, + "loss": 1.1853, + "step": 4839 + }, + { + "epoch": 0.17333070711049833, + "grad_norm": 1.9548184871673584, + "learning_rate": 0.00018941833766071903, + "loss": 1.2559, + "step": 4840 + }, + { + "epoch": 0.1733665192400666, + "grad_norm": 1.67954421043396, + "learning_rate": 0.00018941314417078502, + "loss": 1.2163, + "step": 4841 + }, + { + "epoch": 0.1734023313696349, + "grad_norm": 1.3339837789535522, + "learning_rate": 0.00018940794947791247, + "loss": 1.1042, + "step": 4842 + }, + { + "epoch": 0.17343814349920317, + "grad_norm": 1.435136318206787, + "learning_rate": 0.0001894027535821713, + "loss": 1.1752, + "step": 4843 + }, + { + "epoch": 0.17347395562877146, + "grad_norm": 1.9161666631698608, + "learning_rate": 0.0001893975564836314, + "loss": 1.357, + "step": 4844 + }, + { + "epoch": 0.17350976775833976, + "grad_norm": 2.1066815853118896, + "learning_rate": 0.00018939235818236268, + "loss": 1.4351, + "step": 4845 + }, + { + "epoch": 0.17354557988790803, + "grad_norm": 1.7039475440979004, + "learning_rate": 0.00018938715867843512, + "loss": 1.1335, + "step": 4846 + }, + { + "epoch": 0.17358139201747633, + "grad_norm": 1.447229027748108, + "learning_rate": 0.0001893819579719186, + "loss": 1.095, + "step": 4847 + }, + { + "epoch": 0.1736172041470446, + "grad_norm": 1.7993618249893188, + "learning_rate": 0.00018937675606288317, + "loss": 1.0243, + "step": 4848 + }, + { + "epoch": 0.1736530162766129, + "grad_norm": 1.529617190361023, + "learning_rate": 0.00018937155295139878, + "loss": 1.3079, + "step": 4849 + }, + { + "epoch": 0.17368882840618116, + "grad_norm": 1.6598352193832397, + "learning_rate": 0.00018936634863753537, + "loss": 1.1617, + "step": 4850 + }, + { + "epoch": 0.17372464053574946, + "grad_norm": 1.3270262479782104, + "learning_rate": 0.00018936114312136307, + "loss": 1.0851, + "step": 4851 + }, + { + "epoch": 0.17376045266531776, + "grad_norm": 1.3828742504119873, + "learning_rate": 0.0001893559364029518, + "loss": 1.1083, + "step": 4852 + }, + { + "epoch": 0.17379626479488602, + "grad_norm": 1.7545074224472046, + "learning_rate": 0.00018935072848237172, + "loss": 1.2434, + "step": 4853 + }, + { + "epoch": 0.17383207692445432, + "grad_norm": 2.054216146469116, + "learning_rate": 0.00018934551935969284, + "loss": 1.1942, + "step": 4854 + }, + { + "epoch": 0.1738678890540226, + "grad_norm": 1.6056499481201172, + "learning_rate": 0.00018934030903498518, + "loss": 1.1933, + "step": 4855 + }, + { + "epoch": 0.1739037011835909, + "grad_norm": 1.4153995513916016, + "learning_rate": 0.00018933509750831897, + "loss": 1.2859, + "step": 4856 + }, + { + "epoch": 0.17393951331315916, + "grad_norm": 1.4348880052566528, + "learning_rate": 0.00018932988477976423, + "loss": 1.2171, + "step": 4857 + }, + { + "epoch": 0.17397532544272745, + "grad_norm": 1.7043797969818115, + "learning_rate": 0.0001893246708493911, + "loss": 1.1735, + "step": 4858 + }, + { + "epoch": 0.17401113757229575, + "grad_norm": 2.3084583282470703, + "learning_rate": 0.0001893194557172698, + "loss": 1.1754, + "step": 4859 + }, + { + "epoch": 0.17404694970186402, + "grad_norm": 1.4393792152404785, + "learning_rate": 0.0001893142393834704, + "loss": 1.3088, + "step": 4860 + }, + { + "epoch": 0.17408276183143231, + "grad_norm": 1.8129425048828125, + "learning_rate": 0.00018930902184806313, + "loss": 1.4781, + "step": 4861 + }, + { + "epoch": 0.17411857396100058, + "grad_norm": 1.6770737171173096, + "learning_rate": 0.00018930380311111815, + "loss": 1.0774, + "step": 4862 + }, + { + "epoch": 0.17415438609056888, + "grad_norm": 1.4997860193252563, + "learning_rate": 0.0001892985831727057, + "loss": 1.2414, + "step": 4863 + }, + { + "epoch": 0.17419019822013715, + "grad_norm": 1.488534688949585, + "learning_rate": 0.000189293362032896, + "loss": 1.1084, + "step": 4864 + }, + { + "epoch": 0.17422601034970545, + "grad_norm": 1.594786524772644, + "learning_rate": 0.00018928813969175932, + "loss": 1.1696, + "step": 4865 + }, + { + "epoch": 0.17426182247927374, + "grad_norm": 1.8167200088500977, + "learning_rate": 0.00018928291614936585, + "loss": 1.3622, + "step": 4866 + }, + { + "epoch": 0.174297634608842, + "grad_norm": 2.02768611907959, + "learning_rate": 0.00018927769140578593, + "loss": 1.1695, + "step": 4867 + }, + { + "epoch": 0.1743334467384103, + "grad_norm": 2.350593090057373, + "learning_rate": 0.00018927246546108985, + "loss": 1.3614, + "step": 4868 + }, + { + "epoch": 0.17436925886797858, + "grad_norm": 2.208631753921509, + "learning_rate": 0.00018926723831534789, + "loss": 1.0917, + "step": 4869 + }, + { + "epoch": 0.17440507099754687, + "grad_norm": 1.4298827648162842, + "learning_rate": 0.00018926200996863038, + "loss": 1.3344, + "step": 4870 + }, + { + "epoch": 0.17444088312711514, + "grad_norm": 1.348339319229126, + "learning_rate": 0.00018925678042100766, + "loss": 0.9773, + "step": 4871 + }, + { + "epoch": 0.17447669525668344, + "grad_norm": 1.6409673690795898, + "learning_rate": 0.0001892515496725501, + "loss": 1.0611, + "step": 4872 + }, + { + "epoch": 0.17451250738625174, + "grad_norm": 1.3890101909637451, + "learning_rate": 0.00018924631772332807, + "loss": 1.0716, + "step": 4873 + }, + { + "epoch": 0.17454831951582, + "grad_norm": 1.6732662916183472, + "learning_rate": 0.00018924108457341195, + "loss": 1.406, + "step": 4874 + }, + { + "epoch": 0.1745841316453883, + "grad_norm": 1.8554630279541016, + "learning_rate": 0.00018923585022287214, + "loss": 1.2123, + "step": 4875 + }, + { + "epoch": 0.17461994377495657, + "grad_norm": 1.413450002670288, + "learning_rate": 0.00018923061467177908, + "loss": 1.2092, + "step": 4876 + }, + { + "epoch": 0.17465575590452487, + "grad_norm": 1.2662445306777954, + "learning_rate": 0.0001892253779202032, + "loss": 1.0664, + "step": 4877 + }, + { + "epoch": 0.17469156803409314, + "grad_norm": 1.807662844657898, + "learning_rate": 0.00018922013996821492, + "loss": 0.8893, + "step": 4878 + }, + { + "epoch": 0.17472738016366143, + "grad_norm": 2.482591390609741, + "learning_rate": 0.0001892149008158848, + "loss": 1.0711, + "step": 4879 + }, + { + "epoch": 0.17476319229322973, + "grad_norm": 1.5350465774536133, + "learning_rate": 0.00018920966046328324, + "loss": 1.2241, + "step": 4880 + }, + { + "epoch": 0.174799004422798, + "grad_norm": 1.4678375720977783, + "learning_rate": 0.00018920441891048077, + "loss": 1.2541, + "step": 4881 + }, + { + "epoch": 0.1748348165523663, + "grad_norm": 1.2268840074539185, + "learning_rate": 0.00018919917615754792, + "loss": 1.1703, + "step": 4882 + }, + { + "epoch": 0.17487062868193456, + "grad_norm": 1.7285887002944946, + "learning_rate": 0.00018919393220455518, + "loss": 1.0549, + "step": 4883 + }, + { + "epoch": 0.17490644081150286, + "grad_norm": 1.7994532585144043, + "learning_rate": 0.00018918868705157318, + "loss": 1.1604, + "step": 4884 + }, + { + "epoch": 0.17494225294107113, + "grad_norm": 1.4907346963882446, + "learning_rate": 0.00018918344069867243, + "loss": 1.1762, + "step": 4885 + }, + { + "epoch": 0.17497806507063943, + "grad_norm": 2.764704465866089, + "learning_rate": 0.00018917819314592351, + "loss": 1.3659, + "step": 4886 + }, + { + "epoch": 0.17501387720020772, + "grad_norm": 1.8530787229537964, + "learning_rate": 0.00018917294439339705, + "loss": 1.1436, + "step": 4887 + }, + { + "epoch": 0.175049689329776, + "grad_norm": 1.4300445318222046, + "learning_rate": 0.0001891676944411636, + "loss": 1.1634, + "step": 4888 + }, + { + "epoch": 0.1750855014593443, + "grad_norm": 1.744749665260315, + "learning_rate": 0.00018916244328929388, + "loss": 1.0603, + "step": 4889 + }, + { + "epoch": 0.17512131358891256, + "grad_norm": 1.7929058074951172, + "learning_rate": 0.00018915719093785848, + "loss": 1.1368, + "step": 4890 + }, + { + "epoch": 0.17515712571848085, + "grad_norm": 1.3681459426879883, + "learning_rate": 0.00018915193738692812, + "loss": 1.1534, + "step": 4891 + }, + { + "epoch": 0.17519293784804912, + "grad_norm": 1.4892985820770264, + "learning_rate": 0.00018914668263657342, + "loss": 1.1289, + "step": 4892 + }, + { + "epoch": 0.17522874997761742, + "grad_norm": 1.5112477540969849, + "learning_rate": 0.00018914142668686505, + "loss": 1.1185, + "step": 4893 + }, + { + "epoch": 0.17526456210718572, + "grad_norm": 2.38442325592041, + "learning_rate": 0.0001891361695378738, + "loss": 1.3925, + "step": 4894 + }, + { + "epoch": 0.175300374236754, + "grad_norm": 2.6644551753997803, + "learning_rate": 0.00018913091118967037, + "loss": 1.4657, + "step": 4895 + }, + { + "epoch": 0.17533618636632228, + "grad_norm": 1.788188099861145, + "learning_rate": 0.00018912565164232552, + "loss": 1.1577, + "step": 4896 + }, + { + "epoch": 0.17537199849589055, + "grad_norm": 1.5300188064575195, + "learning_rate": 0.00018912039089590997, + "loss": 1.1789, + "step": 4897 + }, + { + "epoch": 0.17540781062545885, + "grad_norm": 2.317657709121704, + "learning_rate": 0.00018911512895049452, + "loss": 1.2874, + "step": 4898 + }, + { + "epoch": 0.17544362275502712, + "grad_norm": 1.4912575483322144, + "learning_rate": 0.00018910986580614997, + "loss": 1.0754, + "step": 4899 + }, + { + "epoch": 0.17547943488459541, + "grad_norm": 1.476576805114746, + "learning_rate": 0.00018910460146294707, + "loss": 1.0842, + "step": 4900 + }, + { + "epoch": 0.1755152470141637, + "grad_norm": 1.3094103336334229, + "learning_rate": 0.00018909933592095674, + "loss": 1.0632, + "step": 4901 + }, + { + "epoch": 0.17555105914373198, + "grad_norm": 1.8791388273239136, + "learning_rate": 0.0001890940691802498, + "loss": 1.2609, + "step": 4902 + }, + { + "epoch": 0.17558687127330028, + "grad_norm": 1.835625171661377, + "learning_rate": 0.00018908880124089702, + "loss": 1.0284, + "step": 4903 + }, + { + "epoch": 0.17562268340286855, + "grad_norm": 2.301825523376465, + "learning_rate": 0.0001890835321029694, + "loss": 1.1952, + "step": 4904 + }, + { + "epoch": 0.17565849553243684, + "grad_norm": 1.3281526565551758, + "learning_rate": 0.00018907826176653772, + "loss": 1.1104, + "step": 4905 + }, + { + "epoch": 0.1756943076620051, + "grad_norm": 1.3583227396011353, + "learning_rate": 0.00018907299023167293, + "loss": 1.2587, + "step": 4906 + }, + { + "epoch": 0.1757301197915734, + "grad_norm": 1.4455939531326294, + "learning_rate": 0.00018906771749844595, + "loss": 1.1991, + "step": 4907 + }, + { + "epoch": 0.17576593192114168, + "grad_norm": 1.4696850776672363, + "learning_rate": 0.00018906244356692775, + "loss": 1.3426, + "step": 4908 + }, + { + "epoch": 0.17580174405070997, + "grad_norm": 1.6776385307312012, + "learning_rate": 0.00018905716843718926, + "loss": 1.3442, + "step": 4909 + }, + { + "epoch": 0.17583755618027827, + "grad_norm": 1.7512859106063843, + "learning_rate": 0.00018905189210930142, + "loss": 1.1327, + "step": 4910 + }, + { + "epoch": 0.17587336830984654, + "grad_norm": 1.8062031269073486, + "learning_rate": 0.00018904661458333524, + "loss": 1.1924, + "step": 4911 + }, + { + "epoch": 0.17590918043941484, + "grad_norm": 1.9129385948181152, + "learning_rate": 0.00018904133585936173, + "loss": 1.1647, + "step": 4912 + }, + { + "epoch": 0.1759449925689831, + "grad_norm": 1.5833733081817627, + "learning_rate": 0.0001890360559374519, + "loss": 1.3663, + "step": 4913 + }, + { + "epoch": 0.1759808046985514, + "grad_norm": 1.4850317239761353, + "learning_rate": 0.00018903077481767676, + "loss": 0.9917, + "step": 4914 + }, + { + "epoch": 0.17601661682811967, + "grad_norm": 1.9070367813110352, + "learning_rate": 0.00018902549250010743, + "loss": 1.2242, + "step": 4915 + }, + { + "epoch": 0.17605242895768797, + "grad_norm": 1.8141504526138306, + "learning_rate": 0.0001890202089848149, + "loss": 1.1936, + "step": 4916 + }, + { + "epoch": 0.17608824108725626, + "grad_norm": 1.5890421867370605, + "learning_rate": 0.00018901492427187032, + "loss": 1.1432, + "step": 4917 + }, + { + "epoch": 0.17612405321682453, + "grad_norm": 1.560502052307129, + "learning_rate": 0.0001890096383613447, + "loss": 1.2722, + "step": 4918 + }, + { + "epoch": 0.17615986534639283, + "grad_norm": 1.3627618551254272, + "learning_rate": 0.00018900435125330923, + "loss": 1.1034, + "step": 4919 + }, + { + "epoch": 0.1761956774759611, + "grad_norm": 1.2981374263763428, + "learning_rate": 0.00018899906294783504, + "loss": 1.2789, + "step": 4920 + }, + { + "epoch": 0.1762314896055294, + "grad_norm": 1.7872167825698853, + "learning_rate": 0.00018899377344499328, + "loss": 1.1247, + "step": 4921 + }, + { + "epoch": 0.17626730173509766, + "grad_norm": 2.1824474334716797, + "learning_rate": 0.00018898848274485506, + "loss": 1.4795, + "step": 4922 + }, + { + "epoch": 0.17630311386466596, + "grad_norm": 1.5018762350082397, + "learning_rate": 0.00018898319084749158, + "loss": 0.9622, + "step": 4923 + }, + { + "epoch": 0.17633892599423426, + "grad_norm": 2.102220058441162, + "learning_rate": 0.00018897789775297404, + "loss": 1.3847, + "step": 4924 + }, + { + "epoch": 0.17637473812380253, + "grad_norm": 1.4397528171539307, + "learning_rate": 0.0001889726034613737, + "loss": 1.1718, + "step": 4925 + }, + { + "epoch": 0.17641055025337082, + "grad_norm": 2.0658085346221924, + "learning_rate": 0.00018896730797276175, + "loss": 1.3668, + "step": 4926 + }, + { + "epoch": 0.1764463623829391, + "grad_norm": 1.4740384817123413, + "learning_rate": 0.00018896201128720938, + "loss": 1.0867, + "step": 4927 + }, + { + "epoch": 0.1764821745125074, + "grad_norm": 1.4185761213302612, + "learning_rate": 0.00018895671340478796, + "loss": 1.2099, + "step": 4928 + }, + { + "epoch": 0.17651798664207566, + "grad_norm": 2.059544324874878, + "learning_rate": 0.00018895141432556867, + "loss": 1.2101, + "step": 4929 + }, + { + "epoch": 0.17655379877164395, + "grad_norm": 2.4504494667053223, + "learning_rate": 0.00018894611404962283, + "loss": 1.3068, + "step": 4930 + }, + { + "epoch": 0.17658961090121225, + "grad_norm": 1.2178417444229126, + "learning_rate": 0.0001889408125770218, + "loss": 1.1548, + "step": 4931 + }, + { + "epoch": 0.17662542303078052, + "grad_norm": 1.2851979732513428, + "learning_rate": 0.00018893550990783684, + "loss": 1.0379, + "step": 4932 + }, + { + "epoch": 0.17666123516034882, + "grad_norm": 1.9596595764160156, + "learning_rate": 0.00018893020604213932, + "loss": 1.223, + "step": 4933 + }, + { + "epoch": 0.1766970472899171, + "grad_norm": 1.7124497890472412, + "learning_rate": 0.00018892490098000055, + "loss": 1.0982, + "step": 4934 + }, + { + "epoch": 0.17673285941948538, + "grad_norm": 1.656323790550232, + "learning_rate": 0.00018891959472149198, + "loss": 1.1057, + "step": 4935 + }, + { + "epoch": 0.17676867154905365, + "grad_norm": 2.678682804107666, + "learning_rate": 0.00018891428726668495, + "loss": 1.3088, + "step": 4936 + }, + { + "epoch": 0.17680448367862195, + "grad_norm": 1.6592466831207275, + "learning_rate": 0.00018890897861565086, + "loss": 1.5127, + "step": 4937 + }, + { + "epoch": 0.17684029580819025, + "grad_norm": 1.819252371788025, + "learning_rate": 0.00018890366876846119, + "loss": 1.3372, + "step": 4938 + }, + { + "epoch": 0.17687610793775851, + "grad_norm": 1.8493282794952393, + "learning_rate": 0.00018889835772518731, + "loss": 1.0788, + "step": 4939 + }, + { + "epoch": 0.1769119200673268, + "grad_norm": 1.556828260421753, + "learning_rate": 0.00018889304548590067, + "loss": 1.2955, + "step": 4940 + }, + { + "epoch": 0.17694773219689508, + "grad_norm": 1.2848360538482666, + "learning_rate": 0.00018888773205067282, + "loss": 1.2889, + "step": 4941 + }, + { + "epoch": 0.17698354432646338, + "grad_norm": 1.3519372940063477, + "learning_rate": 0.00018888241741957514, + "loss": 1.2245, + "step": 4942 + }, + { + "epoch": 0.17701935645603165, + "grad_norm": 2.2200968265533447, + "learning_rate": 0.00018887710159267923, + "loss": 1.3665, + "step": 4943 + }, + { + "epoch": 0.17705516858559994, + "grad_norm": 1.276363492012024, + "learning_rate": 0.00018887178457005653, + "loss": 1.173, + "step": 4944 + }, + { + "epoch": 0.17709098071516824, + "grad_norm": 1.8068066835403442, + "learning_rate": 0.00018886646635177864, + "loss": 1.3552, + "step": 4945 + }, + { + "epoch": 0.1771267928447365, + "grad_norm": 1.8262608051300049, + "learning_rate": 0.00018886114693791704, + "loss": 1.1426, + "step": 4946 + }, + { + "epoch": 0.1771626049743048, + "grad_norm": 1.5962966680526733, + "learning_rate": 0.00018885582632854333, + "loss": 1.1347, + "step": 4947 + }, + { + "epoch": 0.17719841710387307, + "grad_norm": 1.2628027200698853, + "learning_rate": 0.00018885050452372912, + "loss": 1.1666, + "step": 4948 + }, + { + "epoch": 0.17723422923344137, + "grad_norm": 2.0372493267059326, + "learning_rate": 0.00018884518152354596, + "loss": 1.1454, + "step": 4949 + }, + { + "epoch": 0.17727004136300964, + "grad_norm": 1.7895387411117554, + "learning_rate": 0.00018883985732806547, + "loss": 1.2246, + "step": 4950 + }, + { + "epoch": 0.17730585349257794, + "grad_norm": 1.4656963348388672, + "learning_rate": 0.00018883453193735932, + "loss": 1.1031, + "step": 4951 + }, + { + "epoch": 0.17734166562214623, + "grad_norm": 1.666920781135559, + "learning_rate": 0.00018882920535149913, + "loss": 1.1739, + "step": 4952 + }, + { + "epoch": 0.1773774777517145, + "grad_norm": 1.563825249671936, + "learning_rate": 0.00018882387757055655, + "loss": 1.4353, + "step": 4953 + }, + { + "epoch": 0.1774132898812828, + "grad_norm": 1.1232571601867676, + "learning_rate": 0.00018881854859460328, + "loss": 1.2073, + "step": 4954 + }, + { + "epoch": 0.17744910201085107, + "grad_norm": 1.7529096603393555, + "learning_rate": 0.00018881321842371103, + "loss": 1.2896, + "step": 4955 + }, + { + "epoch": 0.17748491414041936, + "grad_norm": 1.7089818716049194, + "learning_rate": 0.00018880788705795144, + "loss": 1.3258, + "step": 4956 + }, + { + "epoch": 0.17752072626998763, + "grad_norm": 1.5099844932556152, + "learning_rate": 0.00018880255449739634, + "loss": 1.1877, + "step": 4957 + }, + { + "epoch": 0.17755653839955593, + "grad_norm": 1.357500433921814, + "learning_rate": 0.00018879722074211736, + "loss": 1.1743, + "step": 4958 + }, + { + "epoch": 0.17759235052912423, + "grad_norm": 1.3563216924667358, + "learning_rate": 0.00018879188579218635, + "loss": 1.1705, + "step": 4959 + }, + { + "epoch": 0.1776281626586925, + "grad_norm": 1.7187597751617432, + "learning_rate": 0.000188786549647675, + "loss": 1.0476, + "step": 4960 + }, + { + "epoch": 0.1776639747882608, + "grad_norm": 1.6963194608688354, + "learning_rate": 0.0001887812123086552, + "loss": 1.3594, + "step": 4961 + }, + { + "epoch": 0.17769978691782906, + "grad_norm": 1.6241540908813477, + "learning_rate": 0.0001887758737751987, + "loss": 1.1272, + "step": 4962 + }, + { + "epoch": 0.17773559904739736, + "grad_norm": 1.3006178140640259, + "learning_rate": 0.00018877053404737734, + "loss": 1.2296, + "step": 4963 + }, + { + "epoch": 0.17777141117696563, + "grad_norm": 1.419042944908142, + "learning_rate": 0.00018876519312526293, + "loss": 1.1977, + "step": 4964 + }, + { + "epoch": 0.17780722330653392, + "grad_norm": 1.9757895469665527, + "learning_rate": 0.00018875985100892738, + "loss": 0.9943, + "step": 4965 + }, + { + "epoch": 0.17784303543610222, + "grad_norm": 1.1925586462020874, + "learning_rate": 0.0001887545076984425, + "loss": 1.2207, + "step": 4966 + }, + { + "epoch": 0.1778788475656705, + "grad_norm": 1.3790775537490845, + "learning_rate": 0.0001887491631938802, + "loss": 0.9785, + "step": 4967 + }, + { + "epoch": 0.17791465969523879, + "grad_norm": 2.5968823432922363, + "learning_rate": 0.0001887438174953124, + "loss": 1.2983, + "step": 4968 + }, + { + "epoch": 0.17795047182480705, + "grad_norm": 1.6984015703201294, + "learning_rate": 0.000188738470602811, + "loss": 1.4603, + "step": 4969 + }, + { + "epoch": 0.17798628395437535, + "grad_norm": 1.87100088596344, + "learning_rate": 0.00018873312251644793, + "loss": 1.292, + "step": 4970 + }, + { + "epoch": 0.17802209608394362, + "grad_norm": 1.4950276613235474, + "learning_rate": 0.00018872777323629514, + "loss": 1.1187, + "step": 4971 + }, + { + "epoch": 0.17805790821351192, + "grad_norm": 2.0523462295532227, + "learning_rate": 0.00018872242276242465, + "loss": 1.3423, + "step": 4972 + }, + { + "epoch": 0.17809372034308021, + "grad_norm": 1.6870025396347046, + "learning_rate": 0.0001887170710949084, + "loss": 1.482, + "step": 4973 + }, + { + "epoch": 0.17812953247264848, + "grad_norm": 1.557790756225586, + "learning_rate": 0.00018871171823381836, + "loss": 1.4792, + "step": 4974 + }, + { + "epoch": 0.17816534460221678, + "grad_norm": 2.1374154090881348, + "learning_rate": 0.00018870636417922662, + "loss": 1.2006, + "step": 4975 + }, + { + "epoch": 0.17820115673178505, + "grad_norm": 2.181462287902832, + "learning_rate": 0.00018870100893120516, + "loss": 1.3964, + "step": 4976 + }, + { + "epoch": 0.17823696886135335, + "grad_norm": 1.6053320169448853, + "learning_rate": 0.00018869565248982607, + "loss": 1.0662, + "step": 4977 + }, + { + "epoch": 0.17827278099092161, + "grad_norm": 1.4421659708023071, + "learning_rate": 0.00018869029485516135, + "loss": 0.9421, + "step": 4978 + }, + { + "epoch": 0.1783085931204899, + "grad_norm": 1.7654504776000977, + "learning_rate": 0.0001886849360272831, + "loss": 1.2544, + "step": 4979 + }, + { + "epoch": 0.1783444052500582, + "grad_norm": 1.1967506408691406, + "learning_rate": 0.00018867957600626344, + "loss": 1.3538, + "step": 4980 + }, + { + "epoch": 0.17838021737962648, + "grad_norm": 1.3835008144378662, + "learning_rate": 0.0001886742147921745, + "loss": 1.2293, + "step": 4981 + }, + { + "epoch": 0.17841602950919477, + "grad_norm": 1.4455628395080566, + "learning_rate": 0.0001886688523850884, + "loss": 1.2615, + "step": 4982 + }, + { + "epoch": 0.17845184163876304, + "grad_norm": 1.4421252012252808, + "learning_rate": 0.0001886634887850772, + "loss": 1.383, + "step": 4983 + }, + { + "epoch": 0.17848765376833134, + "grad_norm": 1.5549116134643555, + "learning_rate": 0.00018865812399221317, + "loss": 1.1781, + "step": 4984 + }, + { + "epoch": 0.1785234658978996, + "grad_norm": 1.326514720916748, + "learning_rate": 0.00018865275800656844, + "loss": 1.2836, + "step": 4985 + }, + { + "epoch": 0.1785592780274679, + "grad_norm": 1.3947257995605469, + "learning_rate": 0.00018864739082821518, + "loss": 1.1654, + "step": 4986 + }, + { + "epoch": 0.1785950901570362, + "grad_norm": 1.3800934553146362, + "learning_rate": 0.00018864202245722566, + "loss": 1.2145, + "step": 4987 + }, + { + "epoch": 0.17863090228660447, + "grad_norm": 1.7603713274002075, + "learning_rate": 0.00018863665289367204, + "loss": 1.2375, + "step": 4988 + }, + { + "epoch": 0.17866671441617277, + "grad_norm": 1.5546749830245972, + "learning_rate": 0.0001886312821376266, + "loss": 1.1125, + "step": 4989 + }, + { + "epoch": 0.17870252654574104, + "grad_norm": 1.8378030061721802, + "learning_rate": 0.00018862591018916155, + "loss": 1.1886, + "step": 4990 + }, + { + "epoch": 0.17873833867530933, + "grad_norm": 1.4867974519729614, + "learning_rate": 0.00018862053704834925, + "loss": 1.127, + "step": 4991 + }, + { + "epoch": 0.1787741508048776, + "grad_norm": 1.7560725212097168, + "learning_rate": 0.00018861516271526191, + "loss": 1.0864, + "step": 4992 + }, + { + "epoch": 0.1788099629344459, + "grad_norm": 1.5766594409942627, + "learning_rate": 0.00018860978718997185, + "loss": 1.3602, + "step": 4993 + }, + { + "epoch": 0.1788457750640142, + "grad_norm": 1.5832605361938477, + "learning_rate": 0.00018860441047255144, + "loss": 1.2209, + "step": 4994 + }, + { + "epoch": 0.17888158719358246, + "grad_norm": 2.254561424255371, + "learning_rate": 0.00018859903256307297, + "loss": 1.4464, + "step": 4995 + }, + { + "epoch": 0.17891739932315076, + "grad_norm": 1.7363691329956055, + "learning_rate": 0.00018859365346160877, + "loss": 1.2688, + "step": 4996 + }, + { + "epoch": 0.17895321145271903, + "grad_norm": 1.5053730010986328, + "learning_rate": 0.00018858827316823126, + "loss": 1.0487, + "step": 4997 + }, + { + "epoch": 0.17898902358228733, + "grad_norm": 1.3010731935501099, + "learning_rate": 0.0001885828916830128, + "loss": 1.128, + "step": 4998 + }, + { + "epoch": 0.1790248357118556, + "grad_norm": 2.6008362770080566, + "learning_rate": 0.00018857750900602583, + "loss": 1.2833, + "step": 4999 + }, + { + "epoch": 0.1790606478414239, + "grad_norm": 1.806923270225525, + "learning_rate": 0.00018857212513734268, + "loss": 1.0728, + "step": 5000 + }, + { + "epoch": 0.17909645997099216, + "grad_norm": 1.7252516746520996, + "learning_rate": 0.00018856674007703585, + "loss": 1.2769, + "step": 5001 + }, + { + "epoch": 0.17913227210056046, + "grad_norm": 2.0774710178375244, + "learning_rate": 0.0001885613538251778, + "loss": 1.2267, + "step": 5002 + }, + { + "epoch": 0.17916808423012875, + "grad_norm": 1.424799919128418, + "learning_rate": 0.00018855596638184095, + "loss": 1.1411, + "step": 5003 + }, + { + "epoch": 0.17920389635969702, + "grad_norm": 2.0511982440948486, + "learning_rate": 0.0001885505777470978, + "loss": 1.3055, + "step": 5004 + }, + { + "epoch": 0.17923970848926532, + "grad_norm": 1.3469139337539673, + "learning_rate": 0.00018854518792102084, + "loss": 1.1458, + "step": 5005 + }, + { + "epoch": 0.1792755206188336, + "grad_norm": 1.3741822242736816, + "learning_rate": 0.00018853979690368263, + "loss": 1.3071, + "step": 5006 + }, + { + "epoch": 0.17931133274840189, + "grad_norm": 1.3421168327331543, + "learning_rate": 0.0001885344046951556, + "loss": 1.1868, + "step": 5007 + }, + { + "epoch": 0.17934714487797015, + "grad_norm": 1.2578328847885132, + "learning_rate": 0.00018852901129551243, + "loss": 1.1806, + "step": 5008 + }, + { + "epoch": 0.17938295700753845, + "grad_norm": 2.1601455211639404, + "learning_rate": 0.00018852361670482556, + "loss": 1.2945, + "step": 5009 + }, + { + "epoch": 0.17941876913710675, + "grad_norm": 1.664143681526184, + "learning_rate": 0.00018851822092316763, + "loss": 1.3289, + "step": 5010 + }, + { + "epoch": 0.17945458126667502, + "grad_norm": 1.5091066360473633, + "learning_rate": 0.00018851282395061122, + "loss": 1.0636, + "step": 5011 + }, + { + "epoch": 0.17949039339624331, + "grad_norm": 2.3369152545928955, + "learning_rate": 0.00018850742578722894, + "loss": 1.2628, + "step": 5012 + }, + { + "epoch": 0.17952620552581158, + "grad_norm": 1.610758900642395, + "learning_rate": 0.00018850202643309338, + "loss": 1.2277, + "step": 5013 + }, + { + "epoch": 0.17956201765537988, + "grad_norm": 1.5264900922775269, + "learning_rate": 0.00018849662588827723, + "loss": 1.1096, + "step": 5014 + }, + { + "epoch": 0.17959782978494815, + "grad_norm": 1.699971079826355, + "learning_rate": 0.00018849122415285315, + "loss": 1.2059, + "step": 5015 + }, + { + "epoch": 0.17963364191451645, + "grad_norm": 1.6303890943527222, + "learning_rate": 0.00018848582122689376, + "loss": 1.2284, + "step": 5016 + }, + { + "epoch": 0.17966945404408474, + "grad_norm": 1.3733372688293457, + "learning_rate": 0.0001884804171104718, + "loss": 1.1642, + "step": 5017 + }, + { + "epoch": 0.179705266173653, + "grad_norm": 2.2472853660583496, + "learning_rate": 0.00018847501180365995, + "loss": 1.4868, + "step": 5018 + }, + { + "epoch": 0.1797410783032213, + "grad_norm": 1.4765734672546387, + "learning_rate": 0.000188469605306531, + "loss": 1.2021, + "step": 5019 + }, + { + "epoch": 0.17977689043278958, + "grad_norm": 1.5052324533462524, + "learning_rate": 0.00018846419761915753, + "loss": 1.0045, + "step": 5020 + }, + { + "epoch": 0.17981270256235787, + "grad_norm": 1.8078304529190063, + "learning_rate": 0.00018845878874161249, + "loss": 0.8853, + "step": 5021 + }, + { + "epoch": 0.17984851469192614, + "grad_norm": 1.4316786527633667, + "learning_rate": 0.00018845337867396848, + "loss": 1.256, + "step": 5022 + }, + { + "epoch": 0.17988432682149444, + "grad_norm": 2.0059404373168945, + "learning_rate": 0.0001884479674162984, + "loss": 1.3728, + "step": 5023 + }, + { + "epoch": 0.17992013895106274, + "grad_norm": 1.2452020645141602, + "learning_rate": 0.00018844255496867497, + "loss": 1.0862, + "step": 5024 + }, + { + "epoch": 0.179955951080631, + "grad_norm": 1.914552092552185, + "learning_rate": 0.00018843714133117106, + "loss": 1.25, + "step": 5025 + }, + { + "epoch": 0.1799917632101993, + "grad_norm": 1.6009835004806519, + "learning_rate": 0.0001884317265038595, + "loss": 1.1581, + "step": 5026 + }, + { + "epoch": 0.18002757533976757, + "grad_norm": 1.4893797636032104, + "learning_rate": 0.00018842631048681313, + "loss": 0.9578, + "step": 5027 + }, + { + "epoch": 0.18006338746933587, + "grad_norm": 1.6481666564941406, + "learning_rate": 0.00018842089328010482, + "loss": 1.3253, + "step": 5028 + }, + { + "epoch": 0.18009919959890414, + "grad_norm": 1.946872353553772, + "learning_rate": 0.00018841547488380745, + "loss": 1.2765, + "step": 5029 + }, + { + "epoch": 0.18013501172847243, + "grad_norm": 1.8447160720825195, + "learning_rate": 0.00018841005529799388, + "loss": 1.2961, + "step": 5030 + }, + { + "epoch": 0.18017082385804073, + "grad_norm": 1.7316590547561646, + "learning_rate": 0.00018840463452273707, + "loss": 1.1818, + "step": 5031 + }, + { + "epoch": 0.180206635987609, + "grad_norm": 1.5543711185455322, + "learning_rate": 0.00018839921255810996, + "loss": 1.2834, + "step": 5032 + }, + { + "epoch": 0.1802424481171773, + "grad_norm": 1.5353494882583618, + "learning_rate": 0.00018839378940418544, + "loss": 1.0581, + "step": 5033 + }, + { + "epoch": 0.18027826024674556, + "grad_norm": 2.350623607635498, + "learning_rate": 0.00018838836506103652, + "loss": 1.2166, + "step": 5034 + }, + { + "epoch": 0.18031407237631386, + "grad_norm": 1.9018611907958984, + "learning_rate": 0.00018838293952873616, + "loss": 1.3091, + "step": 5035 + }, + { + "epoch": 0.18034988450588213, + "grad_norm": 1.4260408878326416, + "learning_rate": 0.0001883775128073573, + "loss": 1.3639, + "step": 5036 + }, + { + "epoch": 0.18038569663545043, + "grad_norm": 1.8075343370437622, + "learning_rate": 0.0001883720848969731, + "loss": 1.1575, + "step": 5037 + }, + { + "epoch": 0.18042150876501872, + "grad_norm": 2.328223705291748, + "learning_rate": 0.00018836665579765642, + "loss": 1.0382, + "step": 5038 + }, + { + "epoch": 0.180457320894587, + "grad_norm": 2.1374526023864746, + "learning_rate": 0.0001883612255094804, + "loss": 1.2742, + "step": 5039 + }, + { + "epoch": 0.1804931330241553, + "grad_norm": 1.6550828218460083, + "learning_rate": 0.00018835579403251806, + "loss": 1.1672, + "step": 5040 + }, + { + "epoch": 0.18052894515372356, + "grad_norm": 1.7217787504196167, + "learning_rate": 0.00018835036136684248, + "loss": 1.3719, + "step": 5041 + }, + { + "epoch": 0.18056475728329185, + "grad_norm": 1.3542956113815308, + "learning_rate": 0.00018834492751252678, + "loss": 1.1469, + "step": 5042 + }, + { + "epoch": 0.18060056941286012, + "grad_norm": 1.6626652479171753, + "learning_rate": 0.000188339492469644, + "loss": 1.143, + "step": 5043 + }, + { + "epoch": 0.18063638154242842, + "grad_norm": 2.344381332397461, + "learning_rate": 0.0001883340562382673, + "loss": 1.3464, + "step": 5044 + }, + { + "epoch": 0.18067219367199672, + "grad_norm": 1.5664763450622559, + "learning_rate": 0.00018832861881846983, + "loss": 1.1136, + "step": 5045 + }, + { + "epoch": 0.18070800580156499, + "grad_norm": 1.5811320543289185, + "learning_rate": 0.00018832318021032472, + "loss": 1.3375, + "step": 5046 + }, + { + "epoch": 0.18074381793113328, + "grad_norm": 1.3929229974746704, + "learning_rate": 0.00018831774041390517, + "loss": 0.8395, + "step": 5047 + }, + { + "epoch": 0.18077963006070155, + "grad_norm": 1.181016445159912, + "learning_rate": 0.00018831229942928434, + "loss": 1.1655, + "step": 5048 + }, + { + "epoch": 0.18081544219026985, + "grad_norm": 1.3651745319366455, + "learning_rate": 0.0001883068572565354, + "loss": 1.0428, + "step": 5049 + }, + { + "epoch": 0.18085125431983812, + "grad_norm": 1.480823040008545, + "learning_rate": 0.00018830141389573166, + "loss": 1.2321, + "step": 5050 + }, + { + "epoch": 0.18088706644940641, + "grad_norm": 1.3584136962890625, + "learning_rate": 0.00018829596934694624, + "loss": 1.3139, + "step": 5051 + }, + { + "epoch": 0.1809228785789747, + "grad_norm": 1.676684021949768, + "learning_rate": 0.0001882905236102525, + "loss": 1.2484, + "step": 5052 + }, + { + "epoch": 0.18095869070854298, + "grad_norm": 1.384189486503601, + "learning_rate": 0.0001882850766857236, + "loss": 1.0781, + "step": 5053 + }, + { + "epoch": 0.18099450283811128, + "grad_norm": 1.7330617904663086, + "learning_rate": 0.0001882796285734329, + "loss": 1.191, + "step": 5054 + }, + { + "epoch": 0.18103031496767955, + "grad_norm": 1.2987060546875, + "learning_rate": 0.0001882741792734537, + "loss": 1.0779, + "step": 5055 + }, + { + "epoch": 0.18106612709724784, + "grad_norm": 1.5816254615783691, + "learning_rate": 0.00018826872878585925, + "loss": 1.2002, + "step": 5056 + }, + { + "epoch": 0.1811019392268161, + "grad_norm": 1.4840813875198364, + "learning_rate": 0.0001882632771107229, + "loss": 1.218, + "step": 5057 + }, + { + "epoch": 0.1811377513563844, + "grad_norm": 1.1658384799957275, + "learning_rate": 0.00018825782424811802, + "loss": 1.1391, + "step": 5058 + }, + { + "epoch": 0.1811735634859527, + "grad_norm": 1.644037127494812, + "learning_rate": 0.00018825237019811796, + "loss": 1.2611, + "step": 5059 + }, + { + "epoch": 0.18120937561552097, + "grad_norm": 1.2600605487823486, + "learning_rate": 0.0001882469149607961, + "loss": 1.1379, + "step": 5060 + }, + { + "epoch": 0.18124518774508927, + "grad_norm": 1.251114010810852, + "learning_rate": 0.00018824145853622582, + "loss": 1.2917, + "step": 5061 + }, + { + "epoch": 0.18128099987465754, + "grad_norm": 1.6302874088287354, + "learning_rate": 0.00018823600092448054, + "loss": 1.2045, + "step": 5062 + }, + { + "epoch": 0.18131681200422584, + "grad_norm": 1.197837471961975, + "learning_rate": 0.0001882305421256337, + "loss": 1.1214, + "step": 5063 + }, + { + "epoch": 0.1813526241337941, + "grad_norm": 1.5983929634094238, + "learning_rate": 0.0001882250821397587, + "loss": 1.0802, + "step": 5064 + }, + { + "epoch": 0.1813884362633624, + "grad_norm": 1.7347968816757202, + "learning_rate": 0.00018821962096692905, + "loss": 1.1802, + "step": 5065 + }, + { + "epoch": 0.1814242483929307, + "grad_norm": 2.1455438137054443, + "learning_rate": 0.00018821415860721818, + "loss": 0.9516, + "step": 5066 + }, + { + "epoch": 0.18146006052249897, + "grad_norm": 1.259245753288269, + "learning_rate": 0.0001882086950606996, + "loss": 1.2633, + "step": 5067 + }, + { + "epoch": 0.18149587265206726, + "grad_norm": 1.5454808473587036, + "learning_rate": 0.0001882032303274468, + "loss": 1.0708, + "step": 5068 + }, + { + "epoch": 0.18153168478163553, + "grad_norm": 1.8083200454711914, + "learning_rate": 0.0001881977644075333, + "loss": 1.0914, + "step": 5069 + }, + { + "epoch": 0.18156749691120383, + "grad_norm": 2.058741331100464, + "learning_rate": 0.00018819229730103267, + "loss": 1.3082, + "step": 5070 + }, + { + "epoch": 0.1816033090407721, + "grad_norm": 1.8576359748840332, + "learning_rate": 0.00018818682900801842, + "loss": 1.0998, + "step": 5071 + }, + { + "epoch": 0.1816391211703404, + "grad_norm": 1.4982048273086548, + "learning_rate": 0.00018818135952856414, + "loss": 1.2178, + "step": 5072 + }, + { + "epoch": 0.1816749332999087, + "grad_norm": 1.382636308670044, + "learning_rate": 0.00018817588886274345, + "loss": 1.3004, + "step": 5073 + }, + { + "epoch": 0.18171074542947696, + "grad_norm": 1.6992204189300537, + "learning_rate": 0.00018817041701062987, + "loss": 1.1994, + "step": 5074 + }, + { + "epoch": 0.18174655755904526, + "grad_norm": 1.9188326597213745, + "learning_rate": 0.00018816494397229708, + "loss": 1.1843, + "step": 5075 + }, + { + "epoch": 0.18178236968861353, + "grad_norm": 2.538668632507324, + "learning_rate": 0.0001881594697478187, + "loss": 1.282, + "step": 5076 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.8505432605743408, + "learning_rate": 0.00018815399433726835, + "loss": 1.1992, + "step": 5077 + }, + { + "epoch": 0.1818539939477501, + "grad_norm": 2.2220535278320312, + "learning_rate": 0.00018814851774071972, + "loss": 1.2874, + "step": 5078 + }, + { + "epoch": 0.1818898060773184, + "grad_norm": 1.5924043655395508, + "learning_rate": 0.00018814303995824653, + "loss": 1.1783, + "step": 5079 + }, + { + "epoch": 0.18192561820688669, + "grad_norm": 1.4641693830490112, + "learning_rate": 0.00018813756098992238, + "loss": 1.1729, + "step": 5080 + }, + { + "epoch": 0.18196143033645495, + "grad_norm": 1.7742080688476562, + "learning_rate": 0.00018813208083582106, + "loss": 1.1271, + "step": 5081 + }, + { + "epoch": 0.18199724246602325, + "grad_norm": 1.375099539756775, + "learning_rate": 0.00018812659949601627, + "loss": 1.1099, + "step": 5082 + }, + { + "epoch": 0.18203305459559152, + "grad_norm": 1.264348030090332, + "learning_rate": 0.00018812111697058174, + "loss": 1.1268, + "step": 5083 + }, + { + "epoch": 0.18206886672515982, + "grad_norm": 2.9842894077301025, + "learning_rate": 0.0001881156332595913, + "loss": 1.2104, + "step": 5084 + }, + { + "epoch": 0.18210467885472809, + "grad_norm": 1.5549598932266235, + "learning_rate": 0.00018811014836311865, + "loss": 1.1216, + "step": 5085 + }, + { + "epoch": 0.18214049098429638, + "grad_norm": 1.4579311609268188, + "learning_rate": 0.00018810466228123758, + "loss": 1.1494, + "step": 5086 + }, + { + "epoch": 0.18217630311386468, + "grad_norm": 1.6301416158676147, + "learning_rate": 0.00018809917501402197, + "loss": 1.0859, + "step": 5087 + }, + { + "epoch": 0.18221211524343295, + "grad_norm": 1.3595716953277588, + "learning_rate": 0.00018809368656154556, + "loss": 1.0712, + "step": 5088 + }, + { + "epoch": 0.18224792737300125, + "grad_norm": 1.6425689458847046, + "learning_rate": 0.00018808819692388225, + "loss": 1.1387, + "step": 5089 + }, + { + "epoch": 0.18228373950256951, + "grad_norm": 2.1753268241882324, + "learning_rate": 0.00018808270610110584, + "loss": 1.0696, + "step": 5090 + }, + { + "epoch": 0.1823195516321378, + "grad_norm": 1.366904377937317, + "learning_rate": 0.0001880772140932903, + "loss": 1.2708, + "step": 5091 + }, + { + "epoch": 0.18235536376170608, + "grad_norm": 1.6952965259552002, + "learning_rate": 0.00018807172090050945, + "loss": 1.3346, + "step": 5092 + }, + { + "epoch": 0.18239117589127438, + "grad_norm": 1.4568768739700317, + "learning_rate": 0.00018806622652283713, + "loss": 1.2322, + "step": 5093 + }, + { + "epoch": 0.18242698802084267, + "grad_norm": 1.4199992418289185, + "learning_rate": 0.0001880607309603474, + "loss": 1.0636, + "step": 5094 + }, + { + "epoch": 0.18246280015041094, + "grad_norm": 1.4467459917068481, + "learning_rate": 0.00018805523421311411, + "loss": 1.179, + "step": 5095 + }, + { + "epoch": 0.18249861227997924, + "grad_norm": 1.524647831916809, + "learning_rate": 0.0001880497362812112, + "loss": 1.4124, + "step": 5096 + }, + { + "epoch": 0.1825344244095475, + "grad_norm": 1.5673344135284424, + "learning_rate": 0.00018804423716471268, + "loss": 1.1199, + "step": 5097 + }, + { + "epoch": 0.1825702365391158, + "grad_norm": 1.6886203289031982, + "learning_rate": 0.00018803873686369253, + "loss": 0.965, + "step": 5098 + }, + { + "epoch": 0.18260604866868407, + "grad_norm": 1.3323575258255005, + "learning_rate": 0.00018803323537822472, + "loss": 1.0781, + "step": 5099 + }, + { + "epoch": 0.18264186079825237, + "grad_norm": 1.1945807933807373, + "learning_rate": 0.00018802773270838329, + "loss": 1.0829, + "step": 5100 + }, + { + "epoch": 0.18267767292782064, + "grad_norm": 1.6517410278320312, + "learning_rate": 0.0001880222288542423, + "loss": 1.1231, + "step": 5101 + }, + { + "epoch": 0.18271348505738894, + "grad_norm": 1.6388558149337769, + "learning_rate": 0.0001880167238158757, + "loss": 1.201, + "step": 5102 + }, + { + "epoch": 0.18274929718695723, + "grad_norm": 1.6692355871200562, + "learning_rate": 0.00018801121759335764, + "loss": 1.2437, + "step": 5103 + }, + { + "epoch": 0.1827851093165255, + "grad_norm": 1.65425443649292, + "learning_rate": 0.0001880057101867622, + "loss": 1.1318, + "step": 5104 + }, + { + "epoch": 0.1828209214460938, + "grad_norm": 1.2910715341567993, + "learning_rate": 0.00018800020159616342, + "loss": 1.2821, + "step": 5105 + }, + { + "epoch": 0.18285673357566207, + "grad_norm": 1.5913504362106323, + "learning_rate": 0.00018799469182163544, + "loss": 1.1818, + "step": 5106 + }, + { + "epoch": 0.18289254570523036, + "grad_norm": 2.286322832107544, + "learning_rate": 0.00018798918086325236, + "loss": 1.3696, + "step": 5107 + }, + { + "epoch": 0.18292835783479863, + "grad_norm": 1.7597332000732422, + "learning_rate": 0.0001879836687210884, + "loss": 1.2939, + "step": 5108 + }, + { + "epoch": 0.18296416996436693, + "grad_norm": 1.6547759771347046, + "learning_rate": 0.00018797815539521763, + "loss": 1.2806, + "step": 5109 + }, + { + "epoch": 0.18299998209393523, + "grad_norm": 1.8523986339569092, + "learning_rate": 0.00018797264088571427, + "loss": 1.4393, + "step": 5110 + }, + { + "epoch": 0.1830357942235035, + "grad_norm": 2.8024344444274902, + "learning_rate": 0.0001879671251926525, + "loss": 1.2534, + "step": 5111 + }, + { + "epoch": 0.1830716063530718, + "grad_norm": 1.7240288257598877, + "learning_rate": 0.00018796160831610655, + "loss": 1.2756, + "step": 5112 + }, + { + "epoch": 0.18310741848264006, + "grad_norm": 1.638113021850586, + "learning_rate": 0.00018795609025615062, + "loss": 1.2273, + "step": 5113 + }, + { + "epoch": 0.18314323061220836, + "grad_norm": 1.5655713081359863, + "learning_rate": 0.00018795057101285895, + "loss": 1.2304, + "step": 5114 + }, + { + "epoch": 0.18317904274177663, + "grad_norm": 1.6967610120773315, + "learning_rate": 0.0001879450505863058, + "loss": 1.1911, + "step": 5115 + }, + { + "epoch": 0.18321485487134492, + "grad_norm": 1.821368932723999, + "learning_rate": 0.00018793952897656544, + "loss": 0.9868, + "step": 5116 + }, + { + "epoch": 0.18325066700091322, + "grad_norm": 1.395741581916809, + "learning_rate": 0.00018793400618371213, + "loss": 0.9441, + "step": 5117 + }, + { + "epoch": 0.1832864791304815, + "grad_norm": 1.3037892580032349, + "learning_rate": 0.0001879284822078202, + "loss": 1.1313, + "step": 5118 + }, + { + "epoch": 0.18332229126004979, + "grad_norm": 1.4252601861953735, + "learning_rate": 0.00018792295704896396, + "loss": 1.1665, + "step": 5119 + }, + { + "epoch": 0.18335810338961805, + "grad_norm": 1.376364827156067, + "learning_rate": 0.00018791743070721776, + "loss": 1.386, + "step": 5120 + }, + { + "epoch": 0.18339391551918635, + "grad_norm": 1.5832319259643555, + "learning_rate": 0.0001879119031826559, + "loss": 1.2166, + "step": 5121 + }, + { + "epoch": 0.18342972764875462, + "grad_norm": 1.469287395477295, + "learning_rate": 0.00018790637447535283, + "loss": 1.2433, + "step": 5122 + }, + { + "epoch": 0.18346553977832292, + "grad_norm": 1.813109278678894, + "learning_rate": 0.00018790084458538285, + "loss": 1.3447, + "step": 5123 + }, + { + "epoch": 0.1835013519078912, + "grad_norm": 1.7901971340179443, + "learning_rate": 0.0001878953135128204, + "loss": 1.2571, + "step": 5124 + }, + { + "epoch": 0.18353716403745948, + "grad_norm": 2.126194477081299, + "learning_rate": 0.00018788978125773987, + "loss": 1.1721, + "step": 5125 + }, + { + "epoch": 0.18357297616702778, + "grad_norm": 1.5508190393447876, + "learning_rate": 0.0001878842478202157, + "loss": 1.1689, + "step": 5126 + }, + { + "epoch": 0.18360878829659605, + "grad_norm": 1.510527491569519, + "learning_rate": 0.00018787871320032236, + "loss": 1.2523, + "step": 5127 + }, + { + "epoch": 0.18364460042616434, + "grad_norm": 2.5960021018981934, + "learning_rate": 0.0001878731773981343, + "loss": 1.4139, + "step": 5128 + }, + { + "epoch": 0.18368041255573261, + "grad_norm": 1.7932016849517822, + "learning_rate": 0.00018786764041372594, + "loss": 1.2649, + "step": 5129 + }, + { + "epoch": 0.1837162246853009, + "grad_norm": 1.582322120666504, + "learning_rate": 0.00018786210224717184, + "loss": 1.1736, + "step": 5130 + }, + { + "epoch": 0.1837520368148692, + "grad_norm": 1.5817537307739258, + "learning_rate": 0.0001878565628985465, + "loss": 1.2936, + "step": 5131 + }, + { + "epoch": 0.18378784894443748, + "grad_norm": 1.6726558208465576, + "learning_rate": 0.00018785102236792444, + "loss": 1.1484, + "step": 5132 + }, + { + "epoch": 0.18382366107400577, + "grad_norm": 1.5466740131378174, + "learning_rate": 0.00018784548065538018, + "loss": 1.2442, + "step": 5133 + }, + { + "epoch": 0.18385947320357404, + "grad_norm": 1.7736939191818237, + "learning_rate": 0.0001878399377609883, + "loss": 1.1552, + "step": 5134 + }, + { + "epoch": 0.18389528533314234, + "grad_norm": 1.7284821271896362, + "learning_rate": 0.00018783439368482335, + "loss": 1.3148, + "step": 5135 + }, + { + "epoch": 0.1839310974627106, + "grad_norm": 1.5124616622924805, + "learning_rate": 0.00018782884842695992, + "loss": 1.0704, + "step": 5136 + }, + { + "epoch": 0.1839669095922789, + "grad_norm": 2.0567872524261475, + "learning_rate": 0.00018782330198747265, + "loss": 1.2572, + "step": 5137 + }, + { + "epoch": 0.1840027217218472, + "grad_norm": 2.090442657470703, + "learning_rate": 0.0001878177543664361, + "loss": 1.4485, + "step": 5138 + }, + { + "epoch": 0.18403853385141547, + "grad_norm": 1.835807204246521, + "learning_rate": 0.00018781220556392497, + "loss": 1.213, + "step": 5139 + }, + { + "epoch": 0.18407434598098377, + "grad_norm": 1.9749032258987427, + "learning_rate": 0.00018780665558001388, + "loss": 1.2474, + "step": 5140 + }, + { + "epoch": 0.18411015811055204, + "grad_norm": 1.4902267456054688, + "learning_rate": 0.00018780110441477752, + "loss": 1.1193, + "step": 5141 + }, + { + "epoch": 0.18414597024012033, + "grad_norm": 1.6599916219711304, + "learning_rate": 0.00018779555206829054, + "loss": 1.1895, + "step": 5142 + }, + { + "epoch": 0.1841817823696886, + "grad_norm": 1.6947726011276245, + "learning_rate": 0.00018778999854062765, + "loss": 1.1311, + "step": 5143 + }, + { + "epoch": 0.1842175944992569, + "grad_norm": 1.4388104677200317, + "learning_rate": 0.00018778444383186357, + "loss": 1.1483, + "step": 5144 + }, + { + "epoch": 0.1842534066288252, + "grad_norm": 1.53462553024292, + "learning_rate": 0.00018777888794207302, + "loss": 1.1, + "step": 5145 + }, + { + "epoch": 0.18428921875839346, + "grad_norm": 1.923104166984558, + "learning_rate": 0.0001877733308713308, + "loss": 1.1699, + "step": 5146 + }, + { + "epoch": 0.18432503088796176, + "grad_norm": 2.0521600246429443, + "learning_rate": 0.00018776777261971162, + "loss": 1.1458, + "step": 5147 + }, + { + "epoch": 0.18436084301753003, + "grad_norm": 1.8784675598144531, + "learning_rate": 0.00018776221318729026, + "loss": 1.2806, + "step": 5148 + }, + { + "epoch": 0.18439665514709833, + "grad_norm": 2.671698808670044, + "learning_rate": 0.00018775665257414153, + "loss": 1.1205, + "step": 5149 + }, + { + "epoch": 0.1844324672766666, + "grad_norm": 1.7631534337997437, + "learning_rate": 0.00018775109078034022, + "loss": 1.1336, + "step": 5150 + }, + { + "epoch": 0.1844682794062349, + "grad_norm": 1.684458613395691, + "learning_rate": 0.00018774552780596117, + "loss": 1.24, + "step": 5151 + }, + { + "epoch": 0.1845040915358032, + "grad_norm": 1.526487946510315, + "learning_rate": 0.00018773996365107926, + "loss": 0.8424, + "step": 5152 + }, + { + "epoch": 0.18453990366537146, + "grad_norm": 1.4715887308120728, + "learning_rate": 0.00018773439831576929, + "loss": 1.0265, + "step": 5153 + }, + { + "epoch": 0.18457571579493975, + "grad_norm": 1.6682459115982056, + "learning_rate": 0.00018772883180010616, + "loss": 1.187, + "step": 5154 + }, + { + "epoch": 0.18461152792450802, + "grad_norm": 1.518559455871582, + "learning_rate": 0.0001877232641041648, + "loss": 1.1288, + "step": 5155 + }, + { + "epoch": 0.18464734005407632, + "grad_norm": 1.1492596864700317, + "learning_rate": 0.00018771769522802004, + "loss": 1.1866, + "step": 5156 + }, + { + "epoch": 0.1846831521836446, + "grad_norm": 1.9328402280807495, + "learning_rate": 0.00018771212517174686, + "loss": 1.3153, + "step": 5157 + }, + { + "epoch": 0.18471896431321289, + "grad_norm": 1.967233419418335, + "learning_rate": 0.00018770655393542012, + "loss": 1.2464, + "step": 5158 + }, + { + "epoch": 0.18475477644278118, + "grad_norm": 1.4367094039916992, + "learning_rate": 0.0001877009815191149, + "loss": 1.186, + "step": 5159 + }, + { + "epoch": 0.18479058857234945, + "grad_norm": 1.4395967721939087, + "learning_rate": 0.00018769540792290608, + "loss": 1.1578, + "step": 5160 + }, + { + "epoch": 0.18482640070191775, + "grad_norm": 2.024174451828003, + "learning_rate": 0.00018768983314686866, + "loss": 1.178, + "step": 5161 + }, + { + "epoch": 0.18486221283148602, + "grad_norm": 1.6083009243011475, + "learning_rate": 0.00018768425719107765, + "loss": 1.1666, + "step": 5162 + }, + { + "epoch": 0.1848980249610543, + "grad_norm": 1.4889087677001953, + "learning_rate": 0.00018767868005560806, + "loss": 1.2263, + "step": 5163 + }, + { + "epoch": 0.18493383709062258, + "grad_norm": 1.3075028657913208, + "learning_rate": 0.0001876731017405349, + "loss": 1.0558, + "step": 5164 + }, + { + "epoch": 0.18496964922019088, + "grad_norm": 2.229335308074951, + "learning_rate": 0.0001876675222459333, + "loss": 1.1131, + "step": 5165 + }, + { + "epoch": 0.18500546134975918, + "grad_norm": 1.4304817914962769, + "learning_rate": 0.0001876619415718782, + "loss": 1.1301, + "step": 5166 + }, + { + "epoch": 0.18504127347932744, + "grad_norm": 2.11869478225708, + "learning_rate": 0.00018765635971844483, + "loss": 1.2171, + "step": 5167 + }, + { + "epoch": 0.18507708560889574, + "grad_norm": 1.927475094795227, + "learning_rate": 0.00018765077668570816, + "loss": 1.4648, + "step": 5168 + }, + { + "epoch": 0.185112897738464, + "grad_norm": 1.8266632556915283, + "learning_rate": 0.00018764519247374336, + "loss": 1.2642, + "step": 5169 + }, + { + "epoch": 0.1851487098680323, + "grad_norm": 2.011923313140869, + "learning_rate": 0.00018763960708262557, + "loss": 1.2449, + "step": 5170 + }, + { + "epoch": 0.18518452199760058, + "grad_norm": 2.06638765335083, + "learning_rate": 0.0001876340205124299, + "loss": 1.1442, + "step": 5171 + }, + { + "epoch": 0.18522033412716887, + "grad_norm": 1.5684452056884766, + "learning_rate": 0.00018762843276323151, + "loss": 1.406, + "step": 5172 + }, + { + "epoch": 0.18525614625673717, + "grad_norm": 1.6358246803283691, + "learning_rate": 0.0001876228438351056, + "loss": 1.3411, + "step": 5173 + }, + { + "epoch": 0.18529195838630544, + "grad_norm": 1.485058307647705, + "learning_rate": 0.00018761725372812735, + "loss": 1.2832, + "step": 5174 + }, + { + "epoch": 0.18532777051587374, + "grad_norm": 2.511019229888916, + "learning_rate": 0.00018761166244237197, + "loss": 1.1908, + "step": 5175 + }, + { + "epoch": 0.185363582645442, + "grad_norm": 1.9309558868408203, + "learning_rate": 0.00018760606997791468, + "loss": 0.9772, + "step": 5176 + }, + { + "epoch": 0.1853993947750103, + "grad_norm": 1.7011768817901611, + "learning_rate": 0.00018760047633483074, + "loss": 1.2787, + "step": 5177 + }, + { + "epoch": 0.18543520690457857, + "grad_norm": 1.3686940670013428, + "learning_rate": 0.00018759488151319539, + "loss": 1.1625, + "step": 5178 + }, + { + "epoch": 0.18547101903414687, + "grad_norm": 1.4499132633209229, + "learning_rate": 0.00018758928551308385, + "loss": 1.1694, + "step": 5179 + }, + { + "epoch": 0.18550683116371516, + "grad_norm": 1.9811547994613647, + "learning_rate": 0.0001875836883345715, + "loss": 1.1535, + "step": 5180 + }, + { + "epoch": 0.18554264329328343, + "grad_norm": 1.5992767810821533, + "learning_rate": 0.00018757808997773358, + "loss": 1.0406, + "step": 5181 + }, + { + "epoch": 0.18557845542285173, + "grad_norm": 1.853322982788086, + "learning_rate": 0.00018757249044264542, + "loss": 1.1784, + "step": 5182 + }, + { + "epoch": 0.18561426755242, + "grad_norm": 1.4630482196807861, + "learning_rate": 0.00018756688972938239, + "loss": 1.2049, + "step": 5183 + }, + { + "epoch": 0.1856500796819883, + "grad_norm": 1.5294369459152222, + "learning_rate": 0.0001875612878380198, + "loss": 1.4269, + "step": 5184 + }, + { + "epoch": 0.18568589181155656, + "grad_norm": 1.2062664031982422, + "learning_rate": 0.00018755568476863302, + "loss": 1.2248, + "step": 5185 + }, + { + "epoch": 0.18572170394112486, + "grad_norm": 1.7123725414276123, + "learning_rate": 0.00018755008052129743, + "loss": 1.3112, + "step": 5186 + }, + { + "epoch": 0.18575751607069316, + "grad_norm": 2.887481689453125, + "learning_rate": 0.00018754447509608847, + "loss": 1.114, + "step": 5187 + }, + { + "epoch": 0.18579332820026143, + "grad_norm": 1.686383605003357, + "learning_rate": 0.0001875388684930815, + "loss": 1.031, + "step": 5188 + }, + { + "epoch": 0.18582914032982972, + "grad_norm": 1.480356216430664, + "learning_rate": 0.00018753326071235197, + "loss": 1.1935, + "step": 5189 + }, + { + "epoch": 0.185864952459398, + "grad_norm": 1.853214979171753, + "learning_rate": 0.00018752765175397533, + "loss": 1.2375, + "step": 5190 + }, + { + "epoch": 0.1859007645889663, + "grad_norm": 1.2903774976730347, + "learning_rate": 0.00018752204161802706, + "loss": 1.2491, + "step": 5191 + }, + { + "epoch": 0.18593657671853456, + "grad_norm": 1.8832334280014038, + "learning_rate": 0.00018751643030458256, + "loss": 1.2729, + "step": 5192 + }, + { + "epoch": 0.18597238884810285, + "grad_norm": 1.8131953477859497, + "learning_rate": 0.00018751081781371743, + "loss": 1.1166, + "step": 5193 + }, + { + "epoch": 0.18600820097767115, + "grad_norm": 1.5567123889923096, + "learning_rate": 0.00018750520414550711, + "loss": 1.1298, + "step": 5194 + }, + { + "epoch": 0.18604401310723942, + "grad_norm": 1.4616676568984985, + "learning_rate": 0.00018749958930002717, + "loss": 1.2171, + "step": 5195 + }, + { + "epoch": 0.18607982523680772, + "grad_norm": 1.7918716669082642, + "learning_rate": 0.00018749397327735308, + "loss": 1.0309, + "step": 5196 + }, + { + "epoch": 0.18611563736637599, + "grad_norm": 1.6738355159759521, + "learning_rate": 0.00018748835607756045, + "loss": 1.2118, + "step": 5197 + }, + { + "epoch": 0.18615144949594428, + "grad_norm": 1.275909185409546, + "learning_rate": 0.00018748273770072485, + "loss": 1.1938, + "step": 5198 + }, + { + "epoch": 0.18618726162551255, + "grad_norm": 1.5424880981445312, + "learning_rate": 0.00018747711814692185, + "loss": 1.3568, + "step": 5199 + }, + { + "epoch": 0.18622307375508085, + "grad_norm": 1.5678210258483887, + "learning_rate": 0.00018747149741622706, + "loss": 1.1857, + "step": 5200 + }, + { + "epoch": 0.18625888588464912, + "grad_norm": 1.571457028388977, + "learning_rate": 0.0001874658755087161, + "loss": 1.3281, + "step": 5201 + }, + { + "epoch": 0.1862946980142174, + "grad_norm": 1.7520310878753662, + "learning_rate": 0.00018746025242446463, + "loss": 1.0969, + "step": 5202 + }, + { + "epoch": 0.1863305101437857, + "grad_norm": 1.6003284454345703, + "learning_rate": 0.00018745462816354826, + "loss": 1.058, + "step": 5203 + }, + { + "epoch": 0.18636632227335398, + "grad_norm": 1.2575510740280151, + "learning_rate": 0.0001874490027260427, + "loss": 1.2464, + "step": 5204 + }, + { + "epoch": 0.18640213440292228, + "grad_norm": 1.5466363430023193, + "learning_rate": 0.0001874433761120236, + "loss": 1.0373, + "step": 5205 + }, + { + "epoch": 0.18643794653249054, + "grad_norm": 1.388698697090149, + "learning_rate": 0.00018743774832156667, + "loss": 1.1569, + "step": 5206 + }, + { + "epoch": 0.18647375866205884, + "grad_norm": 1.2932448387145996, + "learning_rate": 0.0001874321193547476, + "loss": 1.2422, + "step": 5207 + }, + { + "epoch": 0.1865095707916271, + "grad_norm": 2.2839620113372803, + "learning_rate": 0.00018742648921164215, + "loss": 1.2078, + "step": 5208 + }, + { + "epoch": 0.1865453829211954, + "grad_norm": 1.6905803680419922, + "learning_rate": 0.00018742085789232607, + "loss": 1.0809, + "step": 5209 + }, + { + "epoch": 0.1865811950507637, + "grad_norm": 1.8177646398544312, + "learning_rate": 0.0001874152253968751, + "loss": 1.1054, + "step": 5210 + }, + { + "epoch": 0.18661700718033197, + "grad_norm": 1.3388235569000244, + "learning_rate": 0.00018740959172536506, + "loss": 1.0361, + "step": 5211 + }, + { + "epoch": 0.18665281930990027, + "grad_norm": 1.846411108970642, + "learning_rate": 0.0001874039568778717, + "loss": 1.247, + "step": 5212 + }, + { + "epoch": 0.18668863143946854, + "grad_norm": 1.4525147676467896, + "learning_rate": 0.0001873983208544708, + "loss": 1.3429, + "step": 5213 + }, + { + "epoch": 0.18672444356903684, + "grad_norm": 1.333920955657959, + "learning_rate": 0.00018739268365523828, + "loss": 1.0377, + "step": 5214 + }, + { + "epoch": 0.1867602556986051, + "grad_norm": 2.1530871391296387, + "learning_rate": 0.00018738704528024994, + "loss": 1.4549, + "step": 5215 + }, + { + "epoch": 0.1867960678281734, + "grad_norm": 1.8433353900909424, + "learning_rate": 0.00018738140572958155, + "loss": 1.234, + "step": 5216 + }, + { + "epoch": 0.1868318799577417, + "grad_norm": 1.9900621175765991, + "learning_rate": 0.00018737576500330914, + "loss": 1.1193, + "step": 5217 + }, + { + "epoch": 0.18686769208730997, + "grad_norm": 1.3765482902526855, + "learning_rate": 0.00018737012310150847, + "loss": 1.1159, + "step": 5218 + }, + { + "epoch": 0.18690350421687826, + "grad_norm": 1.496553897857666, + "learning_rate": 0.00018736448002425554, + "loss": 1.1133, + "step": 5219 + }, + { + "epoch": 0.18693931634644653, + "grad_norm": 1.2739059925079346, + "learning_rate": 0.00018735883577162619, + "loss": 1.1254, + "step": 5220 + }, + { + "epoch": 0.18697512847601483, + "grad_norm": 1.7543584108352661, + "learning_rate": 0.0001873531903436964, + "loss": 1.2155, + "step": 5221 + }, + { + "epoch": 0.1870109406055831, + "grad_norm": 1.6308308839797974, + "learning_rate": 0.00018734754374054207, + "loss": 1.1707, + "step": 5222 + }, + { + "epoch": 0.1870467527351514, + "grad_norm": 1.9458035230636597, + "learning_rate": 0.0001873418959622393, + "loss": 1.1682, + "step": 5223 + }, + { + "epoch": 0.1870825648647197, + "grad_norm": 1.5927318334579468, + "learning_rate": 0.0001873362470088639, + "loss": 1.1963, + "step": 5224 + }, + { + "epoch": 0.18711837699428796, + "grad_norm": 1.6850411891937256, + "learning_rate": 0.00018733059688049198, + "loss": 1.1285, + "step": 5225 + }, + { + "epoch": 0.18715418912385626, + "grad_norm": 2.52518367767334, + "learning_rate": 0.00018732494557719952, + "loss": 1.3323, + "step": 5226 + }, + { + "epoch": 0.18719000125342453, + "grad_norm": 1.605754017829895, + "learning_rate": 0.00018731929309906254, + "loss": 1.1568, + "step": 5227 + }, + { + "epoch": 0.18722581338299282, + "grad_norm": 1.3564960956573486, + "learning_rate": 0.00018731363944615717, + "loss": 0.9865, + "step": 5228 + }, + { + "epoch": 0.1872616255125611, + "grad_norm": 1.6927738189697266, + "learning_rate": 0.00018730798461855938, + "loss": 1.1376, + "step": 5229 + }, + { + "epoch": 0.1872974376421294, + "grad_norm": 1.659547209739685, + "learning_rate": 0.00018730232861634524, + "loss": 1.1025, + "step": 5230 + }, + { + "epoch": 0.18733324977169769, + "grad_norm": 1.6610193252563477, + "learning_rate": 0.0001872966714395909, + "loss": 1.1507, + "step": 5231 + }, + { + "epoch": 0.18736906190126595, + "grad_norm": 1.6670571565628052, + "learning_rate": 0.00018729101308837245, + "loss": 1.289, + "step": 5232 + }, + { + "epoch": 0.18740487403083425, + "grad_norm": 2.0027332305908203, + "learning_rate": 0.000187285353562766, + "loss": 1.054, + "step": 5233 + }, + { + "epoch": 0.18744068616040252, + "grad_norm": 1.493294596672058, + "learning_rate": 0.00018727969286284776, + "loss": 1.1354, + "step": 5234 + }, + { + "epoch": 0.18747649828997082, + "grad_norm": 1.5575460195541382, + "learning_rate": 0.0001872740309886938, + "loss": 1.2903, + "step": 5235 + }, + { + "epoch": 0.18751231041953909, + "grad_norm": 2.8186981678009033, + "learning_rate": 0.00018726836794038035, + "loss": 1.1353, + "step": 5236 + }, + { + "epoch": 0.18754812254910738, + "grad_norm": 2.172595977783203, + "learning_rate": 0.00018726270371798357, + "loss": 1.1726, + "step": 5237 + }, + { + "epoch": 0.18758393467867568, + "grad_norm": 1.575480580329895, + "learning_rate": 0.00018725703832157966, + "loss": 1.299, + "step": 5238 + }, + { + "epoch": 0.18761974680824395, + "grad_norm": 1.4842729568481445, + "learning_rate": 0.00018725137175124482, + "loss": 1.1763, + "step": 5239 + }, + { + "epoch": 0.18765555893781224, + "grad_norm": 1.9456225633621216, + "learning_rate": 0.0001872457040070554, + "loss": 1.2886, + "step": 5240 + }, + { + "epoch": 0.1876913710673805, + "grad_norm": 1.6128742694854736, + "learning_rate": 0.0001872400350890875, + "loss": 1.1471, + "step": 5241 + }, + { + "epoch": 0.1877271831969488, + "grad_norm": 1.4566318988800049, + "learning_rate": 0.00018723436499741748, + "loss": 0.9894, + "step": 5242 + }, + { + "epoch": 0.18776299532651708, + "grad_norm": 1.586605191230774, + "learning_rate": 0.0001872286937321216, + "loss": 1.0064, + "step": 5243 + }, + { + "epoch": 0.18779880745608538, + "grad_norm": 1.4981569051742554, + "learning_rate": 0.00018722302129327618, + "loss": 1.058, + "step": 5244 + }, + { + "epoch": 0.18783461958565367, + "grad_norm": 1.5246200561523438, + "learning_rate": 0.0001872173476809575, + "loss": 1.0834, + "step": 5245 + }, + { + "epoch": 0.18787043171522194, + "grad_norm": 1.8068327903747559, + "learning_rate": 0.00018721167289524195, + "loss": 1.3222, + "step": 5246 + }, + { + "epoch": 0.18790624384479024, + "grad_norm": 1.2044543027877808, + "learning_rate": 0.0001872059969362058, + "loss": 1.2247, + "step": 5247 + }, + { + "epoch": 0.1879420559743585, + "grad_norm": 1.352595567703247, + "learning_rate": 0.00018720031980392544, + "loss": 1.1546, + "step": 5248 + }, + { + "epoch": 0.1879778681039268, + "grad_norm": 2.295567512512207, + "learning_rate": 0.0001871946414984773, + "loss": 1.1788, + "step": 5249 + }, + { + "epoch": 0.18801368023349507, + "grad_norm": 1.5749520063400269, + "learning_rate": 0.00018718896201993767, + "loss": 0.8726, + "step": 5250 + }, + { + "epoch": 0.18804949236306337, + "grad_norm": 1.7352313995361328, + "learning_rate": 0.00018718328136838305, + "loss": 1.3016, + "step": 5251 + }, + { + "epoch": 0.18808530449263167, + "grad_norm": 1.8330131769180298, + "learning_rate": 0.00018717759954388986, + "loss": 1.1219, + "step": 5252 + }, + { + "epoch": 0.18812111662219994, + "grad_norm": 1.4892184734344482, + "learning_rate": 0.00018717191654653452, + "loss": 1.3102, + "step": 5253 + }, + { + "epoch": 0.18815692875176823, + "grad_norm": 1.164198637008667, + "learning_rate": 0.00018716623237639347, + "loss": 1.0666, + "step": 5254 + }, + { + "epoch": 0.1881927408813365, + "grad_norm": 1.517343282699585, + "learning_rate": 0.00018716054703354318, + "loss": 1.2116, + "step": 5255 + }, + { + "epoch": 0.1882285530109048, + "grad_norm": 2.198455333709717, + "learning_rate": 0.0001871548605180602, + "loss": 1.1619, + "step": 5256 + }, + { + "epoch": 0.18826436514047307, + "grad_norm": 1.5350607633590698, + "learning_rate": 0.00018714917283002094, + "loss": 1.3748, + "step": 5257 + }, + { + "epoch": 0.18830017727004136, + "grad_norm": 1.4056100845336914, + "learning_rate": 0.000187143483969502, + "loss": 1.3186, + "step": 5258 + }, + { + "epoch": 0.18833598939960966, + "grad_norm": 1.6625293493270874, + "learning_rate": 0.00018713779393657993, + "loss": 1.1375, + "step": 5259 + }, + { + "epoch": 0.18837180152917793, + "grad_norm": 1.275696039199829, + "learning_rate": 0.00018713210273133118, + "loss": 1.2253, + "step": 5260 + }, + { + "epoch": 0.18840761365874623, + "grad_norm": 1.548079013824463, + "learning_rate": 0.00018712641035383243, + "loss": 1.1165, + "step": 5261 + }, + { + "epoch": 0.1884434257883145, + "grad_norm": 1.4785184860229492, + "learning_rate": 0.00018712071680416017, + "loss": 1.3207, + "step": 5262 + }, + { + "epoch": 0.1884792379178828, + "grad_norm": 1.6979035139083862, + "learning_rate": 0.00018711502208239108, + "loss": 0.9586, + "step": 5263 + }, + { + "epoch": 0.18851505004745106, + "grad_norm": 2.2027502059936523, + "learning_rate": 0.0001871093261886017, + "loss": 1.1903, + "step": 5264 + }, + { + "epoch": 0.18855086217701936, + "grad_norm": 1.5944570302963257, + "learning_rate": 0.00018710362912286872, + "loss": 1.218, + "step": 5265 + }, + { + "epoch": 0.18858667430658765, + "grad_norm": 1.7293773889541626, + "learning_rate": 0.00018709793088526877, + "loss": 1.227, + "step": 5266 + }, + { + "epoch": 0.18862248643615592, + "grad_norm": 1.7222366333007812, + "learning_rate": 0.0001870922314758785, + "loss": 1.0502, + "step": 5267 + }, + { + "epoch": 0.18865829856572422, + "grad_norm": 1.684194564819336, + "learning_rate": 0.0001870865308947746, + "loss": 1.1845, + "step": 5268 + }, + { + "epoch": 0.1886941106952925, + "grad_norm": 1.552920937538147, + "learning_rate": 0.00018708082914203376, + "loss": 1.1214, + "step": 5269 + }, + { + "epoch": 0.18872992282486079, + "grad_norm": 1.3904372453689575, + "learning_rate": 0.0001870751262177327, + "loss": 1.1167, + "step": 5270 + }, + { + "epoch": 0.18876573495442905, + "grad_norm": 2.268503189086914, + "learning_rate": 0.00018706942212194812, + "loss": 1.117, + "step": 5271 + }, + { + "epoch": 0.18880154708399735, + "grad_norm": 1.6002895832061768, + "learning_rate": 0.0001870637168547568, + "loss": 1.3543, + "step": 5272 + }, + { + "epoch": 0.18883735921356565, + "grad_norm": 1.6693837642669678, + "learning_rate": 0.00018705801041623546, + "loss": 1.2185, + "step": 5273 + }, + { + "epoch": 0.18887317134313392, + "grad_norm": 1.6282885074615479, + "learning_rate": 0.0001870523028064609, + "loss": 1.0769, + "step": 5274 + }, + { + "epoch": 0.1889089834727022, + "grad_norm": 1.649880051612854, + "learning_rate": 0.00018704659402550986, + "loss": 1.241, + "step": 5275 + }, + { + "epoch": 0.18894479560227048, + "grad_norm": 1.9194964170455933, + "learning_rate": 0.0001870408840734592, + "loss": 1.1821, + "step": 5276 + }, + { + "epoch": 0.18898060773183878, + "grad_norm": 1.8869719505310059, + "learning_rate": 0.00018703517295038573, + "loss": 0.9286, + "step": 5277 + }, + { + "epoch": 0.18901641986140705, + "grad_norm": 1.5856724977493286, + "learning_rate": 0.00018702946065636623, + "loss": 1.0167, + "step": 5278 + }, + { + "epoch": 0.18905223199097534, + "grad_norm": 1.9974020719528198, + "learning_rate": 0.00018702374719147766, + "loss": 1.012, + "step": 5279 + }, + { + "epoch": 0.18908804412054364, + "grad_norm": 1.5280981063842773, + "learning_rate": 0.00018701803255579677, + "loss": 1.3908, + "step": 5280 + }, + { + "epoch": 0.1891238562501119, + "grad_norm": 1.604126214981079, + "learning_rate": 0.00018701231674940054, + "loss": 1.3953, + "step": 5281 + }, + { + "epoch": 0.1891596683796802, + "grad_norm": 1.9647021293640137, + "learning_rate": 0.0001870065997723658, + "loss": 1.2956, + "step": 5282 + }, + { + "epoch": 0.18919548050924848, + "grad_norm": 1.5462199449539185, + "learning_rate": 0.00018700088162476952, + "loss": 1.0715, + "step": 5283 + }, + { + "epoch": 0.18923129263881677, + "grad_norm": 1.326987385749817, + "learning_rate": 0.00018699516230668856, + "loss": 1.2541, + "step": 5284 + }, + { + "epoch": 0.18926710476838504, + "grad_norm": 1.4160528182983398, + "learning_rate": 0.00018698944181819993, + "loss": 0.9947, + "step": 5285 + }, + { + "epoch": 0.18930291689795334, + "grad_norm": 1.4141687154769897, + "learning_rate": 0.00018698372015938058, + "loss": 1.4686, + "step": 5286 + }, + { + "epoch": 0.18933872902752164, + "grad_norm": 1.432426929473877, + "learning_rate": 0.00018697799733030746, + "loss": 1.2084, + "step": 5287 + }, + { + "epoch": 0.1893745411570899, + "grad_norm": 1.4783998727798462, + "learning_rate": 0.00018697227333105756, + "loss": 1.2465, + "step": 5288 + }, + { + "epoch": 0.1894103532866582, + "grad_norm": 1.8240966796875, + "learning_rate": 0.00018696654816170795, + "loss": 1.3001, + "step": 5289 + }, + { + "epoch": 0.18944616541622647, + "grad_norm": 1.4720064401626587, + "learning_rate": 0.0001869608218223356, + "loss": 0.9989, + "step": 5290 + }, + { + "epoch": 0.18948197754579477, + "grad_norm": 1.7098900079727173, + "learning_rate": 0.0001869550943130175, + "loss": 1.0922, + "step": 5291 + }, + { + "epoch": 0.18951778967536304, + "grad_norm": 1.4531803131103516, + "learning_rate": 0.00018694936563383086, + "loss": 1.047, + "step": 5292 + }, + { + "epoch": 0.18955360180493133, + "grad_norm": 1.4342273473739624, + "learning_rate": 0.00018694363578485262, + "loss": 1.1549, + "step": 5293 + }, + { + "epoch": 0.18958941393449963, + "grad_norm": 1.7216050624847412, + "learning_rate": 0.00018693790476615992, + "loss": 1.1924, + "step": 5294 + }, + { + "epoch": 0.1896252260640679, + "grad_norm": 1.6910263299942017, + "learning_rate": 0.00018693217257782985, + "loss": 1.0945, + "step": 5295 + }, + { + "epoch": 0.1896610381936362, + "grad_norm": 1.4725433588027954, + "learning_rate": 0.00018692643921993952, + "loss": 1.1661, + "step": 5296 + }, + { + "epoch": 0.18969685032320446, + "grad_norm": 1.444690227508545, + "learning_rate": 0.0001869207046925661, + "loss": 1.1401, + "step": 5297 + }, + { + "epoch": 0.18973266245277276, + "grad_norm": 1.6308059692382812, + "learning_rate": 0.0001869149689957867, + "loss": 1.282, + "step": 5298 + }, + { + "epoch": 0.18976847458234103, + "grad_norm": 1.915196180343628, + "learning_rate": 0.0001869092321296785, + "loss": 1.0631, + "step": 5299 + }, + { + "epoch": 0.18980428671190933, + "grad_norm": 1.441960334777832, + "learning_rate": 0.00018690349409431872, + "loss": 1.0129, + "step": 5300 + }, + { + "epoch": 0.1898400988414776, + "grad_norm": 1.5047211647033691, + "learning_rate": 0.00018689775488978452, + "loss": 1.1959, + "step": 5301 + }, + { + "epoch": 0.1898759109710459, + "grad_norm": 2.033831834793091, + "learning_rate": 0.0001868920145161531, + "loss": 1.152, + "step": 5302 + }, + { + "epoch": 0.1899117231006142, + "grad_norm": 1.6957100629806519, + "learning_rate": 0.0001868862729735017, + "loss": 1.3198, + "step": 5303 + }, + { + "epoch": 0.18994753523018246, + "grad_norm": 1.4119433164596558, + "learning_rate": 0.00018688053026190757, + "loss": 0.9751, + "step": 5304 + }, + { + "epoch": 0.18998334735975075, + "grad_norm": 1.842826247215271, + "learning_rate": 0.000186874786381448, + "loss": 1.1863, + "step": 5305 + }, + { + "epoch": 0.19001915948931902, + "grad_norm": 1.8707283735275269, + "learning_rate": 0.0001868690413322002, + "loss": 1.2721, + "step": 5306 + }, + { + "epoch": 0.19005497161888732, + "grad_norm": 1.6760303974151611, + "learning_rate": 0.00018686329511424153, + "loss": 1.3468, + "step": 5307 + }, + { + "epoch": 0.1900907837484556, + "grad_norm": 1.9748179912567139, + "learning_rate": 0.00018685754772764928, + "loss": 1.2585, + "step": 5308 + }, + { + "epoch": 0.19012659587802389, + "grad_norm": 1.3611046075820923, + "learning_rate": 0.00018685179917250072, + "loss": 1.165, + "step": 5309 + }, + { + "epoch": 0.19016240800759218, + "grad_norm": 1.4190073013305664, + "learning_rate": 0.0001868460494488733, + "loss": 1.0858, + "step": 5310 + }, + { + "epoch": 0.19019822013716045, + "grad_norm": 2.6077828407287598, + "learning_rate": 0.00018684029855684425, + "loss": 1.3584, + "step": 5311 + }, + { + "epoch": 0.19023403226672875, + "grad_norm": 1.3694159984588623, + "learning_rate": 0.00018683454649649103, + "loss": 1.1562, + "step": 5312 + }, + { + "epoch": 0.19026984439629702, + "grad_norm": 1.3548016548156738, + "learning_rate": 0.00018682879326789098, + "loss": 1.1597, + "step": 5313 + }, + { + "epoch": 0.1903056565258653, + "grad_norm": 1.7035913467407227, + "learning_rate": 0.00018682303887112154, + "loss": 1.1503, + "step": 5314 + }, + { + "epoch": 0.19034146865543358, + "grad_norm": 1.8393737077713013, + "learning_rate": 0.00018681728330626008, + "loss": 1.4359, + "step": 5315 + }, + { + "epoch": 0.19037728078500188, + "grad_norm": 1.4171319007873535, + "learning_rate": 0.00018681152657338404, + "loss": 1.35, + "step": 5316 + }, + { + "epoch": 0.19041309291457018, + "grad_norm": 1.6510313749313354, + "learning_rate": 0.00018680576867257095, + "loss": 1.2128, + "step": 5317 + }, + { + "epoch": 0.19044890504413844, + "grad_norm": 1.4547624588012695, + "learning_rate": 0.00018680000960389818, + "loss": 1.1832, + "step": 5318 + }, + { + "epoch": 0.19048471717370674, + "grad_norm": 1.3087458610534668, + "learning_rate": 0.00018679424936744323, + "loss": 1.0165, + "step": 5319 + }, + { + "epoch": 0.190520529303275, + "grad_norm": 2.1871273517608643, + "learning_rate": 0.00018678848796328362, + "loss": 0.9737, + "step": 5320 + }, + { + "epoch": 0.1905563414328433, + "grad_norm": 1.3672736883163452, + "learning_rate": 0.00018678272539149687, + "loss": 0.9648, + "step": 5321 + }, + { + "epoch": 0.19059215356241158, + "grad_norm": 1.8120441436767578, + "learning_rate": 0.00018677696165216048, + "loss": 1.2644, + "step": 5322 + }, + { + "epoch": 0.19062796569197987, + "grad_norm": 1.6367199420928955, + "learning_rate": 0.000186771196745352, + "loss": 1.3221, + "step": 5323 + }, + { + "epoch": 0.19066377782154817, + "grad_norm": 1.8570196628570557, + "learning_rate": 0.000186765430671149, + "loss": 1.1799, + "step": 5324 + }, + { + "epoch": 0.19069958995111644, + "grad_norm": 2.176551103591919, + "learning_rate": 0.00018675966342962904, + "loss": 1.184, + "step": 5325 + }, + { + "epoch": 0.19073540208068474, + "grad_norm": 1.7382574081420898, + "learning_rate": 0.00018675389502086976, + "loss": 1.2401, + "step": 5326 + }, + { + "epoch": 0.190771214210253, + "grad_norm": 1.7611408233642578, + "learning_rate": 0.00018674812544494865, + "loss": 1.2464, + "step": 5327 + }, + { + "epoch": 0.1908070263398213, + "grad_norm": 2.166661500930786, + "learning_rate": 0.00018674235470194348, + "loss": 1.331, + "step": 5328 + }, + { + "epoch": 0.19084283846938957, + "grad_norm": 1.5139257907867432, + "learning_rate": 0.0001867365827919318, + "loss": 1.1852, + "step": 5329 + }, + { + "epoch": 0.19087865059895787, + "grad_norm": 2.383754253387451, + "learning_rate": 0.00018673080971499126, + "loss": 1.1801, + "step": 5330 + }, + { + "epoch": 0.19091446272852616, + "grad_norm": 1.4625976085662842, + "learning_rate": 0.00018672503547119957, + "loss": 1.0333, + "step": 5331 + }, + { + "epoch": 0.19095027485809443, + "grad_norm": 1.491163730621338, + "learning_rate": 0.00018671926006063442, + "loss": 0.9923, + "step": 5332 + }, + { + "epoch": 0.19098608698766273, + "grad_norm": 2.1346192359924316, + "learning_rate": 0.00018671348348337343, + "loss": 1.2855, + "step": 5333 + }, + { + "epoch": 0.191021899117231, + "grad_norm": 1.4937313795089722, + "learning_rate": 0.00018670770573949442, + "loss": 1.2371, + "step": 5334 + }, + { + "epoch": 0.1910577112467993, + "grad_norm": 1.856185793876648, + "learning_rate": 0.00018670192682907505, + "loss": 1.2481, + "step": 5335 + }, + { + "epoch": 0.19109352337636756, + "grad_norm": 1.2867814302444458, + "learning_rate": 0.00018669614675219308, + "loss": 1.2409, + "step": 5336 + }, + { + "epoch": 0.19112933550593586, + "grad_norm": 1.7672617435455322, + "learning_rate": 0.0001866903655089263, + "loss": 1.2588, + "step": 5337 + }, + { + "epoch": 0.19116514763550416, + "grad_norm": 1.8491688966751099, + "learning_rate": 0.00018668458309935247, + "loss": 1.1892, + "step": 5338 + }, + { + "epoch": 0.19120095976507243, + "grad_norm": 1.452989935874939, + "learning_rate": 0.0001866787995235494, + "loss": 1.333, + "step": 5339 + }, + { + "epoch": 0.19123677189464072, + "grad_norm": 1.6978058815002441, + "learning_rate": 0.00018667301478159489, + "loss": 1.3351, + "step": 5340 + }, + { + "epoch": 0.191272584024209, + "grad_norm": 1.7369645833969116, + "learning_rate": 0.00018666722887356673, + "loss": 1.2978, + "step": 5341 + }, + { + "epoch": 0.1913083961537773, + "grad_norm": 1.522119402885437, + "learning_rate": 0.00018666144179954283, + "loss": 1.4492, + "step": 5342 + }, + { + "epoch": 0.19134420828334556, + "grad_norm": 1.684097409248352, + "learning_rate": 0.00018665565355960103, + "loss": 0.893, + "step": 5343 + }, + { + "epoch": 0.19138002041291385, + "grad_norm": 1.3567925691604614, + "learning_rate": 0.0001866498641538192, + "loss": 1.354, + "step": 5344 + }, + { + "epoch": 0.19141583254248215, + "grad_norm": 1.200348138809204, + "learning_rate": 0.00018664407358227517, + "loss": 1.1521, + "step": 5345 + }, + { + "epoch": 0.19145164467205042, + "grad_norm": 1.5701676607131958, + "learning_rate": 0.0001866382818450469, + "loss": 1.1129, + "step": 5346 + }, + { + "epoch": 0.19148745680161872, + "grad_norm": 1.5085904598236084, + "learning_rate": 0.00018663248894221232, + "loss": 1.0741, + "step": 5347 + }, + { + "epoch": 0.19152326893118699, + "grad_norm": 1.3974212408065796, + "learning_rate": 0.00018662669487384936, + "loss": 1.2482, + "step": 5348 + }, + { + "epoch": 0.19155908106075528, + "grad_norm": 1.9547728300094604, + "learning_rate": 0.00018662089964003594, + "loss": 1.0579, + "step": 5349 + }, + { + "epoch": 0.19159489319032355, + "grad_norm": 1.3949261903762817, + "learning_rate": 0.00018661510324085003, + "loss": 1.2346, + "step": 5350 + }, + { + "epoch": 0.19163070531989185, + "grad_norm": 1.2851594686508179, + "learning_rate": 0.00018660930567636968, + "loss": 1.144, + "step": 5351 + }, + { + "epoch": 0.19166651744946014, + "grad_norm": 1.44109046459198, + "learning_rate": 0.00018660350694667282, + "loss": 1.1482, + "step": 5352 + }, + { + "epoch": 0.1917023295790284, + "grad_norm": 1.4595109224319458, + "learning_rate": 0.00018659770705183748, + "loss": 1.0874, + "step": 5353 + }, + { + "epoch": 0.1917381417085967, + "grad_norm": 1.3148467540740967, + "learning_rate": 0.00018659190599194168, + "loss": 1.0136, + "step": 5354 + }, + { + "epoch": 0.19177395383816498, + "grad_norm": 1.586521029472351, + "learning_rate": 0.0001865861037670635, + "loss": 1.1718, + "step": 5355 + }, + { + "epoch": 0.19180976596773328, + "grad_norm": 1.4473111629486084, + "learning_rate": 0.00018658030037728098, + "loss": 1.1331, + "step": 5356 + }, + { + "epoch": 0.19184557809730154, + "grad_norm": 1.594092845916748, + "learning_rate": 0.00018657449582267218, + "loss": 1.2209, + "step": 5357 + }, + { + "epoch": 0.19188139022686984, + "grad_norm": 1.706128478050232, + "learning_rate": 0.00018656869010331523, + "loss": 1.2932, + "step": 5358 + }, + { + "epoch": 0.19191720235643814, + "grad_norm": 1.3623861074447632, + "learning_rate": 0.00018656288321928824, + "loss": 1.0024, + "step": 5359 + }, + { + "epoch": 0.1919530144860064, + "grad_norm": 1.5481764078140259, + "learning_rate": 0.0001865570751706693, + "loss": 1.2197, + "step": 5360 + }, + { + "epoch": 0.1919888266155747, + "grad_norm": 2.0283310413360596, + "learning_rate": 0.00018655126595753654, + "loss": 1.1113, + "step": 5361 + }, + { + "epoch": 0.19202463874514297, + "grad_norm": 1.4700065851211548, + "learning_rate": 0.00018654545557996816, + "loss": 1.2491, + "step": 5362 + }, + { + "epoch": 0.19206045087471127, + "grad_norm": 1.3863394260406494, + "learning_rate": 0.0001865396440380423, + "loss": 1.2744, + "step": 5363 + }, + { + "epoch": 0.19209626300427954, + "grad_norm": 1.9710079431533813, + "learning_rate": 0.00018653383133183718, + "loss": 1.4148, + "step": 5364 + }, + { + "epoch": 0.19213207513384783, + "grad_norm": 1.7655998468399048, + "learning_rate": 0.00018652801746143097, + "loss": 1.2089, + "step": 5365 + }, + { + "epoch": 0.19216788726341613, + "grad_norm": 1.3462042808532715, + "learning_rate": 0.00018652220242690187, + "loss": 1.2839, + "step": 5366 + }, + { + "epoch": 0.1922036993929844, + "grad_norm": 1.658882737159729, + "learning_rate": 0.00018651638622832817, + "loss": 1.0345, + "step": 5367 + }, + { + "epoch": 0.1922395115225527, + "grad_norm": 2.2809901237487793, + "learning_rate": 0.00018651056886578808, + "loss": 1.1566, + "step": 5368 + }, + { + "epoch": 0.19227532365212097, + "grad_norm": 1.8055344820022583, + "learning_rate": 0.00018650475033935992, + "loss": 1.1273, + "step": 5369 + }, + { + "epoch": 0.19231113578168926, + "grad_norm": 2.2734224796295166, + "learning_rate": 0.00018649893064912187, + "loss": 1.2334, + "step": 5370 + }, + { + "epoch": 0.19234694791125753, + "grad_norm": 1.5857789516448975, + "learning_rate": 0.00018649310979515228, + "loss": 1.1204, + "step": 5371 + }, + { + "epoch": 0.19238276004082583, + "grad_norm": 1.385394811630249, + "learning_rate": 0.0001864872877775295, + "loss": 1.3011, + "step": 5372 + }, + { + "epoch": 0.19241857217039413, + "grad_norm": 1.733188509941101, + "learning_rate": 0.00018648146459633182, + "loss": 1.0437, + "step": 5373 + }, + { + "epoch": 0.1924543842999624, + "grad_norm": 1.5901150703430176, + "learning_rate": 0.00018647564025163756, + "loss": 1.222, + "step": 5374 + }, + { + "epoch": 0.1924901964295307, + "grad_norm": 1.2199116945266724, + "learning_rate": 0.00018646981474352515, + "loss": 1.1761, + "step": 5375 + }, + { + "epoch": 0.19252600855909896, + "grad_norm": 1.3519306182861328, + "learning_rate": 0.0001864639880720729, + "loss": 1.1886, + "step": 5376 + }, + { + "epoch": 0.19256182068866726, + "grad_norm": 1.672835350036621, + "learning_rate": 0.0001864581602373592, + "loss": 1.1939, + "step": 5377 + }, + { + "epoch": 0.19259763281823553, + "grad_norm": 1.6740162372589111, + "learning_rate": 0.00018645233123946252, + "loss": 1.04, + "step": 5378 + }, + { + "epoch": 0.19263344494780382, + "grad_norm": 1.4057881832122803, + "learning_rate": 0.0001864465010784612, + "loss": 0.9873, + "step": 5379 + }, + { + "epoch": 0.19266925707737212, + "grad_norm": 1.7745954990386963, + "learning_rate": 0.00018644066975443373, + "loss": 1.1985, + "step": 5380 + }, + { + "epoch": 0.1927050692069404, + "grad_norm": 1.5258504152297974, + "learning_rate": 0.00018643483726745857, + "loss": 1.0048, + "step": 5381 + }, + { + "epoch": 0.19274088133650868, + "grad_norm": 2.201575756072998, + "learning_rate": 0.00018642900361761413, + "loss": 1.2319, + "step": 5382 + }, + { + "epoch": 0.19277669346607695, + "grad_norm": 1.7580864429473877, + "learning_rate": 0.00018642316880497893, + "loss": 1.2288, + "step": 5383 + }, + { + "epoch": 0.19281250559564525, + "grad_norm": 2.227632761001587, + "learning_rate": 0.00018641733282963153, + "loss": 1.5124, + "step": 5384 + }, + { + "epoch": 0.19284831772521352, + "grad_norm": 1.6055827140808105, + "learning_rate": 0.00018641149569165034, + "loss": 1.2524, + "step": 5385 + }, + { + "epoch": 0.19288412985478182, + "grad_norm": 1.6834646463394165, + "learning_rate": 0.00018640565739111393, + "loss": 1.1499, + "step": 5386 + }, + { + "epoch": 0.1929199419843501, + "grad_norm": 1.7152782678604126, + "learning_rate": 0.0001863998179281009, + "loss": 1.2102, + "step": 5387 + }, + { + "epoch": 0.19295575411391838, + "grad_norm": 1.563869595527649, + "learning_rate": 0.0001863939773026897, + "loss": 1.1709, + "step": 5388 + }, + { + "epoch": 0.19299156624348668, + "grad_norm": 1.556178092956543, + "learning_rate": 0.00018638813551495901, + "loss": 1.3865, + "step": 5389 + }, + { + "epoch": 0.19302737837305495, + "grad_norm": 1.3825156688690186, + "learning_rate": 0.0001863822925649874, + "loss": 1.3718, + "step": 5390 + }, + { + "epoch": 0.19306319050262324, + "grad_norm": 1.409233808517456, + "learning_rate": 0.00018637644845285344, + "loss": 1.0497, + "step": 5391 + }, + { + "epoch": 0.1930990026321915, + "grad_norm": 1.4196453094482422, + "learning_rate": 0.00018637060317863583, + "loss": 1.358, + "step": 5392 + }, + { + "epoch": 0.1931348147617598, + "grad_norm": 1.5505244731903076, + "learning_rate": 0.00018636475674241315, + "loss": 1.3293, + "step": 5393 + }, + { + "epoch": 0.1931706268913281, + "grad_norm": 1.4460728168487549, + "learning_rate": 0.00018635890914426404, + "loss": 1.1621, + "step": 5394 + }, + { + "epoch": 0.19320643902089638, + "grad_norm": 1.2154964208602905, + "learning_rate": 0.00018635306038426724, + "loss": 1.0386, + "step": 5395 + }, + { + "epoch": 0.19324225115046467, + "grad_norm": 2.0547728538513184, + "learning_rate": 0.00018634721046250139, + "loss": 1.1565, + "step": 5396 + }, + { + "epoch": 0.19327806328003294, + "grad_norm": 1.5923651456832886, + "learning_rate": 0.00018634135937904518, + "loss": 1.3199, + "step": 5397 + }, + { + "epoch": 0.19331387540960124, + "grad_norm": 1.5104615688323975, + "learning_rate": 0.00018633550713397737, + "loss": 1.1029, + "step": 5398 + }, + { + "epoch": 0.1933496875391695, + "grad_norm": 1.3760446310043335, + "learning_rate": 0.00018632965372737665, + "loss": 1.2336, + "step": 5399 + }, + { + "epoch": 0.1933854996687378, + "grad_norm": 2.4118897914886475, + "learning_rate": 0.00018632379915932185, + "loss": 1.3636, + "step": 5400 + }, + { + "epoch": 0.19342131179830607, + "grad_norm": 1.8166747093200684, + "learning_rate": 0.00018631794342989163, + "loss": 0.9639, + "step": 5401 + }, + { + "epoch": 0.19345712392787437, + "grad_norm": 1.3818808794021606, + "learning_rate": 0.00018631208653916486, + "loss": 1.1498, + "step": 5402 + }, + { + "epoch": 0.19349293605744267, + "grad_norm": 1.313680648803711, + "learning_rate": 0.0001863062284872203, + "loss": 0.9711, + "step": 5403 + }, + { + "epoch": 0.19352874818701093, + "grad_norm": 1.8002269268035889, + "learning_rate": 0.00018630036927413672, + "loss": 1.064, + "step": 5404 + }, + { + "epoch": 0.19356456031657923, + "grad_norm": 1.7936885356903076, + "learning_rate": 0.00018629450889999302, + "loss": 1.145, + "step": 5405 + }, + { + "epoch": 0.1936003724461475, + "grad_norm": 1.8337994813919067, + "learning_rate": 0.000186288647364868, + "loss": 1.0567, + "step": 5406 + }, + { + "epoch": 0.1936361845757158, + "grad_norm": 2.0044326782226562, + "learning_rate": 0.00018628278466884055, + "loss": 1.1795, + "step": 5407 + }, + { + "epoch": 0.19367199670528407, + "grad_norm": 1.4890154600143433, + "learning_rate": 0.00018627692081198954, + "loss": 1.1206, + "step": 5408 + }, + { + "epoch": 0.19370780883485236, + "grad_norm": 1.9811391830444336, + "learning_rate": 0.00018627105579439382, + "loss": 1.0744, + "step": 5409 + }, + { + "epoch": 0.19374362096442066, + "grad_norm": 1.6333580017089844, + "learning_rate": 0.00018626518961613236, + "loss": 1.066, + "step": 5410 + }, + { + "epoch": 0.19377943309398893, + "grad_norm": 1.721145749092102, + "learning_rate": 0.00018625932227728402, + "loss": 1.2409, + "step": 5411 + }, + { + "epoch": 0.19381524522355723, + "grad_norm": 1.4021919965744019, + "learning_rate": 0.00018625345377792777, + "loss": 1.1998, + "step": 5412 + }, + { + "epoch": 0.1938510573531255, + "grad_norm": 3.20247483253479, + "learning_rate": 0.0001862475841181426, + "loss": 1.2561, + "step": 5413 + }, + { + "epoch": 0.1938868694826938, + "grad_norm": 2.2783896923065186, + "learning_rate": 0.00018624171329800738, + "loss": 1.26, + "step": 5414 + }, + { + "epoch": 0.19392268161226206, + "grad_norm": 1.7242993116378784, + "learning_rate": 0.00018623584131760118, + "loss": 1.3751, + "step": 5415 + }, + { + "epoch": 0.19395849374183036, + "grad_norm": 2.066652536392212, + "learning_rate": 0.00018622996817700295, + "loss": 1.2045, + "step": 5416 + }, + { + "epoch": 0.19399430587139865, + "grad_norm": 2.781285047531128, + "learning_rate": 0.00018622409387629175, + "loss": 1.2415, + "step": 5417 + }, + { + "epoch": 0.19403011800096692, + "grad_norm": 1.7107020616531372, + "learning_rate": 0.0001862182184155466, + "loss": 1.3567, + "step": 5418 + }, + { + "epoch": 0.19406593013053522, + "grad_norm": 1.5555914640426636, + "learning_rate": 0.00018621234179484647, + "loss": 1.2023, + "step": 5419 + }, + { + "epoch": 0.1941017422601035, + "grad_norm": 2.062497854232788, + "learning_rate": 0.00018620646401427054, + "loss": 1.2584, + "step": 5420 + }, + { + "epoch": 0.19413755438967178, + "grad_norm": 1.9417593479156494, + "learning_rate": 0.00018620058507389783, + "loss": 1.2839, + "step": 5421 + }, + { + "epoch": 0.19417336651924005, + "grad_norm": 1.5924872159957886, + "learning_rate": 0.00018619470497380745, + "loss": 1.2937, + "step": 5422 + }, + { + "epoch": 0.19420917864880835, + "grad_norm": 1.368730068206787, + "learning_rate": 0.00018618882371407847, + "loss": 1.1599, + "step": 5423 + }, + { + "epoch": 0.19424499077837665, + "grad_norm": 1.3865190744400024, + "learning_rate": 0.00018618294129479007, + "loss": 1.0635, + "step": 5424 + }, + { + "epoch": 0.19428080290794492, + "grad_norm": 1.5997819900512695, + "learning_rate": 0.00018617705771602132, + "loss": 1.0948, + "step": 5425 + }, + { + "epoch": 0.1943166150375132, + "grad_norm": 2.2152764797210693, + "learning_rate": 0.00018617117297785145, + "loss": 1.3764, + "step": 5426 + }, + { + "epoch": 0.19435242716708148, + "grad_norm": 1.4905812740325928, + "learning_rate": 0.00018616528708035958, + "loss": 1.1715, + "step": 5427 + }, + { + "epoch": 0.19438823929664978, + "grad_norm": 1.5524505376815796, + "learning_rate": 0.00018615940002362496, + "loss": 1.0168, + "step": 5428 + }, + { + "epoch": 0.19442405142621805, + "grad_norm": 1.6811473369598389, + "learning_rate": 0.0001861535118077267, + "loss": 1.1781, + "step": 5429 + }, + { + "epoch": 0.19445986355578634, + "grad_norm": 2.308976411819458, + "learning_rate": 0.0001861476224327441, + "loss": 1.3006, + "step": 5430 + }, + { + "epoch": 0.19449567568535464, + "grad_norm": 1.581443190574646, + "learning_rate": 0.00018614173189875636, + "loss": 1.0722, + "step": 5431 + }, + { + "epoch": 0.1945314878149229, + "grad_norm": 1.9186171293258667, + "learning_rate": 0.0001861358402058427, + "loss": 1.2339, + "step": 5432 + }, + { + "epoch": 0.1945672999444912, + "grad_norm": 1.5260332822799683, + "learning_rate": 0.00018612994735408246, + "loss": 1.2635, + "step": 5433 + }, + { + "epoch": 0.19460311207405948, + "grad_norm": 1.3543435335159302, + "learning_rate": 0.00018612405334355488, + "loss": 1.1765, + "step": 5434 + }, + { + "epoch": 0.19463892420362777, + "grad_norm": 1.740868330001831, + "learning_rate": 0.00018611815817433925, + "loss": 1.3383, + "step": 5435 + }, + { + "epoch": 0.19467473633319604, + "grad_norm": 1.5243256092071533, + "learning_rate": 0.00018611226184651484, + "loss": 1.3211, + "step": 5436 + }, + { + "epoch": 0.19471054846276434, + "grad_norm": 1.5447046756744385, + "learning_rate": 0.00018610636436016106, + "loss": 1.2316, + "step": 5437 + }, + { + "epoch": 0.19474636059233263, + "grad_norm": 1.5774728059768677, + "learning_rate": 0.00018610046571535723, + "loss": 1.1895, + "step": 5438 + }, + { + "epoch": 0.1947821727219009, + "grad_norm": 1.6557519435882568, + "learning_rate": 0.00018609456591218266, + "loss": 0.8775, + "step": 5439 + }, + { + "epoch": 0.1948179848514692, + "grad_norm": 1.7781563997268677, + "learning_rate": 0.0001860886649507168, + "loss": 1.1754, + "step": 5440 + }, + { + "epoch": 0.19485379698103747, + "grad_norm": 1.6342003345489502, + "learning_rate": 0.00018608276283103896, + "loss": 1.3229, + "step": 5441 + }, + { + "epoch": 0.19488960911060577, + "grad_norm": 1.7039930820465088, + "learning_rate": 0.0001860768595532286, + "loss": 0.8992, + "step": 5442 + }, + { + "epoch": 0.19492542124017403, + "grad_norm": 1.3347408771514893, + "learning_rate": 0.00018607095511736515, + "loss": 1.2208, + "step": 5443 + }, + { + "epoch": 0.19496123336974233, + "grad_norm": 1.6041244268417358, + "learning_rate": 0.00018606504952352798, + "loss": 1.0481, + "step": 5444 + }, + { + "epoch": 0.19499704549931063, + "grad_norm": 2.438939332962036, + "learning_rate": 0.00018605914277179664, + "loss": 1.2849, + "step": 5445 + }, + { + "epoch": 0.1950328576288789, + "grad_norm": 1.5756186246871948, + "learning_rate": 0.00018605323486225049, + "loss": 1.2764, + "step": 5446 + }, + { + "epoch": 0.1950686697584472, + "grad_norm": 1.3796666860580444, + "learning_rate": 0.00018604732579496908, + "loss": 1.081, + "step": 5447 + }, + { + "epoch": 0.19510448188801546, + "grad_norm": 1.3952018022537231, + "learning_rate": 0.0001860414155700319, + "loss": 1.139, + "step": 5448 + }, + { + "epoch": 0.19514029401758376, + "grad_norm": 1.4777045249938965, + "learning_rate": 0.00018603550418751845, + "loss": 1.0835, + "step": 5449 + }, + { + "epoch": 0.19517610614715203, + "grad_norm": 2.3641419410705566, + "learning_rate": 0.0001860295916475083, + "loss": 1.2202, + "step": 5450 + }, + { + "epoch": 0.19521191827672033, + "grad_norm": 1.4703376293182373, + "learning_rate": 0.00018602367795008093, + "loss": 1.3953, + "step": 5451 + }, + { + "epoch": 0.19524773040628862, + "grad_norm": 1.415002703666687, + "learning_rate": 0.00018601776309531593, + "loss": 1.0888, + "step": 5452 + }, + { + "epoch": 0.1952835425358569, + "grad_norm": 1.6385419368743896, + "learning_rate": 0.00018601184708329292, + "loss": 1.1526, + "step": 5453 + }, + { + "epoch": 0.1953193546654252, + "grad_norm": 2.2754735946655273, + "learning_rate": 0.00018600592991409141, + "loss": 1.1442, + "step": 5454 + }, + { + "epoch": 0.19535516679499346, + "grad_norm": 1.9322744607925415, + "learning_rate": 0.00018600001158779108, + "loss": 1.1645, + "step": 5455 + }, + { + "epoch": 0.19539097892456175, + "grad_norm": 1.4823285341262817, + "learning_rate": 0.00018599409210447152, + "loss": 1.178, + "step": 5456 + }, + { + "epoch": 0.19542679105413002, + "grad_norm": 1.3805739879608154, + "learning_rate": 0.0001859881714642124, + "loss": 1.231, + "step": 5457 + }, + { + "epoch": 0.19546260318369832, + "grad_norm": 1.5708301067352295, + "learning_rate": 0.00018598224966709332, + "loss": 1.2643, + "step": 5458 + }, + { + "epoch": 0.19549841531326662, + "grad_norm": 1.4218462705612183, + "learning_rate": 0.00018597632671319398, + "loss": 1.0227, + "step": 5459 + }, + { + "epoch": 0.19553422744283488, + "grad_norm": 1.4160487651824951, + "learning_rate": 0.0001859704026025941, + "loss": 1.1664, + "step": 5460 + }, + { + "epoch": 0.19557003957240318, + "grad_norm": 1.4122307300567627, + "learning_rate": 0.0001859644773353733, + "loss": 1.1537, + "step": 5461 + }, + { + "epoch": 0.19560585170197145, + "grad_norm": 1.9004501104354858, + "learning_rate": 0.00018595855091161137, + "loss": 1.1503, + "step": 5462 + }, + { + "epoch": 0.19564166383153975, + "grad_norm": 1.3676494359970093, + "learning_rate": 0.00018595262333138802, + "loss": 1.2872, + "step": 5463 + }, + { + "epoch": 0.19567747596110802, + "grad_norm": 1.541651725769043, + "learning_rate": 0.000185946694594783, + "loss": 1.4052, + "step": 5464 + }, + { + "epoch": 0.1957132880906763, + "grad_norm": 1.5974011421203613, + "learning_rate": 0.000185940764701876, + "loss": 1.135, + "step": 5465 + }, + { + "epoch": 0.1957491002202446, + "grad_norm": 1.9617160558700562, + "learning_rate": 0.00018593483365274694, + "loss": 1.1146, + "step": 5466 + }, + { + "epoch": 0.19578491234981288, + "grad_norm": 1.6280970573425293, + "learning_rate": 0.00018592890144747553, + "loss": 1.2038, + "step": 5467 + }, + { + "epoch": 0.19582072447938118, + "grad_norm": 2.314329147338867, + "learning_rate": 0.00018592296808614156, + "loss": 1.4868, + "step": 5468 + }, + { + "epoch": 0.19585653660894944, + "grad_norm": 2.726766347885132, + "learning_rate": 0.0001859170335688249, + "loss": 1.1704, + "step": 5469 + }, + { + "epoch": 0.19589234873851774, + "grad_norm": 1.848445177078247, + "learning_rate": 0.0001859110978956054, + "loss": 1.1042, + "step": 5470 + }, + { + "epoch": 0.195928160868086, + "grad_norm": 1.5820012092590332, + "learning_rate": 0.00018590516106656288, + "loss": 1.2797, + "step": 5471 + }, + { + "epoch": 0.1959639729976543, + "grad_norm": 1.8861008882522583, + "learning_rate": 0.00018589922308177723, + "loss": 1.1453, + "step": 5472 + }, + { + "epoch": 0.1959997851272226, + "grad_norm": 1.6534818410873413, + "learning_rate": 0.0001858932839413283, + "loss": 1.1113, + "step": 5473 + }, + { + "epoch": 0.19603559725679087, + "grad_norm": 1.269147276878357, + "learning_rate": 0.0001858873436452961, + "loss": 0.9739, + "step": 5474 + }, + { + "epoch": 0.19607140938635917, + "grad_norm": 1.6341426372528076, + "learning_rate": 0.0001858814021937604, + "loss": 0.9501, + "step": 5475 + }, + { + "epoch": 0.19610722151592744, + "grad_norm": 1.6179311275482178, + "learning_rate": 0.0001858754595868013, + "loss": 1.0664, + "step": 5476 + }, + { + "epoch": 0.19614303364549573, + "grad_norm": 1.5063284635543823, + "learning_rate": 0.0001858695158244986, + "loss": 1.0659, + "step": 5477 + }, + { + "epoch": 0.196178845775064, + "grad_norm": 2.406686305999756, + "learning_rate": 0.00018586357090693233, + "loss": 1.5888, + "step": 5478 + }, + { + "epoch": 0.1962146579046323, + "grad_norm": 1.3106420040130615, + "learning_rate": 0.0001858576248341825, + "loss": 1.0992, + "step": 5479 + }, + { + "epoch": 0.1962504700342006, + "grad_norm": 2.3591692447662354, + "learning_rate": 0.00018585167760632905, + "loss": 1.2476, + "step": 5480 + }, + { + "epoch": 0.19628628216376887, + "grad_norm": 1.322333812713623, + "learning_rate": 0.00018584572922345202, + "loss": 1.0334, + "step": 5481 + }, + { + "epoch": 0.19632209429333716, + "grad_norm": 1.6053485870361328, + "learning_rate": 0.00018583977968563144, + "loss": 1.1813, + "step": 5482 + }, + { + "epoch": 0.19635790642290543, + "grad_norm": 2.000143051147461, + "learning_rate": 0.00018583382899294736, + "loss": 1.1146, + "step": 5483 + }, + { + "epoch": 0.19639371855247373, + "grad_norm": 1.6044002771377563, + "learning_rate": 0.00018582787714547982, + "loss": 1.0244, + "step": 5484 + }, + { + "epoch": 0.196429530682042, + "grad_norm": 1.4378491640090942, + "learning_rate": 0.0001858219241433089, + "loss": 1.3059, + "step": 5485 + }, + { + "epoch": 0.1964653428116103, + "grad_norm": 1.7768937349319458, + "learning_rate": 0.0001858159699865147, + "loss": 1.1595, + "step": 5486 + }, + { + "epoch": 0.1965011549411786, + "grad_norm": 1.4542794227600098, + "learning_rate": 0.00018581001467517734, + "loss": 1.2615, + "step": 5487 + }, + { + "epoch": 0.19653696707074686, + "grad_norm": 1.8201180696487427, + "learning_rate": 0.00018580405820937688, + "loss": 1.5474, + "step": 5488 + }, + { + "epoch": 0.19657277920031516, + "grad_norm": 1.1311732530593872, + "learning_rate": 0.0001857981005891935, + "loss": 1.0631, + "step": 5489 + }, + { + "epoch": 0.19660859132988343, + "grad_norm": 1.527860403060913, + "learning_rate": 0.00018579214181470736, + "loss": 1.052, + "step": 5490 + }, + { + "epoch": 0.19664440345945172, + "grad_norm": 1.643100380897522, + "learning_rate": 0.00018578618188599863, + "loss": 1.2136, + "step": 5491 + }, + { + "epoch": 0.19668021558902, + "grad_norm": 2.828942060470581, + "learning_rate": 0.00018578022080314747, + "loss": 1.1586, + "step": 5492 + }, + { + "epoch": 0.1967160277185883, + "grad_norm": 1.6133052110671997, + "learning_rate": 0.00018577425856623408, + "loss": 1.2449, + "step": 5493 + }, + { + "epoch": 0.19675183984815658, + "grad_norm": 1.5826549530029297, + "learning_rate": 0.00018576829517533868, + "loss": 0.9755, + "step": 5494 + }, + { + "epoch": 0.19678765197772485, + "grad_norm": 1.5129432678222656, + "learning_rate": 0.00018576233063054151, + "loss": 1.248, + "step": 5495 + }, + { + "epoch": 0.19682346410729315, + "grad_norm": 1.8787697553634644, + "learning_rate": 0.00018575636493192282, + "loss": 1.4023, + "step": 5496 + }, + { + "epoch": 0.19685927623686142, + "grad_norm": 1.2860835790634155, + "learning_rate": 0.00018575039807956282, + "loss": 1.2593, + "step": 5497 + }, + { + "epoch": 0.19689508836642972, + "grad_norm": 1.9298210144042969, + "learning_rate": 0.00018574443007354186, + "loss": 1.1001, + "step": 5498 + }, + { + "epoch": 0.19693090049599798, + "grad_norm": 1.359413981437683, + "learning_rate": 0.00018573846091394017, + "loss": 1.0898, + "step": 5499 + }, + { + "epoch": 0.19696671262556628, + "grad_norm": 1.4385204315185547, + "learning_rate": 0.00018573249060083812, + "loss": 1.1358, + "step": 5500 + }, + { + "epoch": 0.19700252475513455, + "grad_norm": 1.5750162601470947, + "learning_rate": 0.00018572651913431596, + "loss": 1.3439, + "step": 5501 + }, + { + "epoch": 0.19703833688470285, + "grad_norm": 1.4500905275344849, + "learning_rate": 0.00018572054651445408, + "loss": 1.1566, + "step": 5502 + }, + { + "epoch": 0.19707414901427114, + "grad_norm": 2.3436801433563232, + "learning_rate": 0.00018571457274133279, + "loss": 1.2384, + "step": 5503 + }, + { + "epoch": 0.1971099611438394, + "grad_norm": 1.6638256311416626, + "learning_rate": 0.0001857085978150325, + "loss": 1.2261, + "step": 5504 + }, + { + "epoch": 0.1971457732734077, + "grad_norm": 1.468759298324585, + "learning_rate": 0.0001857026217356336, + "loss": 1.1782, + "step": 5505 + }, + { + "epoch": 0.19718158540297598, + "grad_norm": 1.7099943161010742, + "learning_rate": 0.00018569664450321645, + "loss": 1.0802, + "step": 5506 + }, + { + "epoch": 0.19721739753254428, + "grad_norm": 1.2186987400054932, + "learning_rate": 0.00018569066611786152, + "loss": 1.1812, + "step": 5507 + }, + { + "epoch": 0.19725320966211254, + "grad_norm": 1.5861250162124634, + "learning_rate": 0.00018568468657964918, + "loss": 0.9978, + "step": 5508 + }, + { + "epoch": 0.19728902179168084, + "grad_norm": 1.2736619710922241, + "learning_rate": 0.00018567870588865994, + "loss": 1.1363, + "step": 5509 + }, + { + "epoch": 0.19732483392124914, + "grad_norm": 1.4466044902801514, + "learning_rate": 0.0001856727240449742, + "loss": 1.3483, + "step": 5510 + }, + { + "epoch": 0.1973606460508174, + "grad_norm": 1.4557915925979614, + "learning_rate": 0.0001856667410486725, + "loss": 1.3686, + "step": 5511 + }, + { + "epoch": 0.1973964581803857, + "grad_norm": 1.3577827215194702, + "learning_rate": 0.00018566075689983527, + "loss": 1.2834, + "step": 5512 + }, + { + "epoch": 0.19743227030995397, + "grad_norm": 1.2523231506347656, + "learning_rate": 0.00018565477159854306, + "loss": 1.277, + "step": 5513 + }, + { + "epoch": 0.19746808243952227, + "grad_norm": 1.7336382865905762, + "learning_rate": 0.00018564878514487637, + "loss": 1.0239, + "step": 5514 + }, + { + "epoch": 0.19750389456909054, + "grad_norm": 1.6833475828170776, + "learning_rate": 0.0001856427975389158, + "loss": 1.3618, + "step": 5515 + }, + { + "epoch": 0.19753970669865883, + "grad_norm": 2.105794668197632, + "learning_rate": 0.00018563680878074182, + "loss": 1.1295, + "step": 5516 + }, + { + "epoch": 0.19757551882822713, + "grad_norm": 1.9490333795547485, + "learning_rate": 0.00018563081887043505, + "loss": 1.2086, + "step": 5517 + }, + { + "epoch": 0.1976113309577954, + "grad_norm": 1.8695532083511353, + "learning_rate": 0.00018562482780807606, + "loss": 1.1105, + "step": 5518 + }, + { + "epoch": 0.1976471430873637, + "grad_norm": 1.6702526807785034, + "learning_rate": 0.00018561883559374548, + "loss": 1.0645, + "step": 5519 + }, + { + "epoch": 0.19768295521693197, + "grad_norm": 1.9434826374053955, + "learning_rate": 0.0001856128422275239, + "loss": 1.2097, + "step": 5520 + }, + { + "epoch": 0.19771876734650026, + "grad_norm": 1.5898545980453491, + "learning_rate": 0.00018560684770949198, + "loss": 1.2461, + "step": 5521 + }, + { + "epoch": 0.19775457947606853, + "grad_norm": 1.8402466773986816, + "learning_rate": 0.0001856008520397303, + "loss": 1.1868, + "step": 5522 + }, + { + "epoch": 0.19779039160563683, + "grad_norm": 1.5717374086380005, + "learning_rate": 0.00018559485521831958, + "loss": 1.1506, + "step": 5523 + }, + { + "epoch": 0.19782620373520513, + "grad_norm": 1.711726427078247, + "learning_rate": 0.00018558885724534054, + "loss": 1.1806, + "step": 5524 + }, + { + "epoch": 0.1978620158647734, + "grad_norm": 1.646470308303833, + "learning_rate": 0.00018558285812087378, + "loss": 1.3418, + "step": 5525 + }, + { + "epoch": 0.1978978279943417, + "grad_norm": 1.5218557119369507, + "learning_rate": 0.0001855768578450001, + "loss": 1.1852, + "step": 5526 + }, + { + "epoch": 0.19793364012390996, + "grad_norm": 1.6442193984985352, + "learning_rate": 0.00018557085641780018, + "loss": 1.3417, + "step": 5527 + }, + { + "epoch": 0.19796945225347826, + "grad_norm": 1.6525108814239502, + "learning_rate": 0.0001855648538393547, + "loss": 1.2965, + "step": 5528 + }, + { + "epoch": 0.19800526438304653, + "grad_norm": 1.8210242986679077, + "learning_rate": 0.00018555885010974454, + "loss": 1.1465, + "step": 5529 + }, + { + "epoch": 0.19804107651261482, + "grad_norm": 1.937224268913269, + "learning_rate": 0.00018555284522905042, + "loss": 1.2362, + "step": 5530 + }, + { + "epoch": 0.19807688864218312, + "grad_norm": 1.5259969234466553, + "learning_rate": 0.00018554683919735313, + "loss": 1.1786, + "step": 5531 + }, + { + "epoch": 0.1981127007717514, + "grad_norm": 1.3484342098236084, + "learning_rate": 0.0001855408320147334, + "loss": 1.1385, + "step": 5532 + }, + { + "epoch": 0.19814851290131968, + "grad_norm": 1.9700133800506592, + "learning_rate": 0.00018553482368127217, + "loss": 1.2505, + "step": 5533 + }, + { + "epoch": 0.19818432503088795, + "grad_norm": 1.4969534873962402, + "learning_rate": 0.0001855288141970502, + "loss": 1.1674, + "step": 5534 + }, + { + "epoch": 0.19822013716045625, + "grad_norm": 1.8112578392028809, + "learning_rate": 0.00018552280356214838, + "loss": 1.1591, + "step": 5535 + }, + { + "epoch": 0.19825594929002452, + "grad_norm": 1.6894129514694214, + "learning_rate": 0.00018551679177664755, + "loss": 1.0196, + "step": 5536 + }, + { + "epoch": 0.19829176141959282, + "grad_norm": 1.4706614017486572, + "learning_rate": 0.0001855107788406286, + "loss": 1.3727, + "step": 5537 + }, + { + "epoch": 0.1983275735491611, + "grad_norm": 1.2951226234436035, + "learning_rate": 0.0001855047647541724, + "loss": 1.1569, + "step": 5538 + }, + { + "epoch": 0.19836338567872938, + "grad_norm": 1.589166522026062, + "learning_rate": 0.00018549874951735988, + "loss": 1.3434, + "step": 5539 + }, + { + "epoch": 0.19839919780829768, + "grad_norm": 1.641939401626587, + "learning_rate": 0.00018549273313027198, + "loss": 1.2217, + "step": 5540 + }, + { + "epoch": 0.19843500993786595, + "grad_norm": 1.5693542957305908, + "learning_rate": 0.00018548671559298963, + "loss": 1.2179, + "step": 5541 + }, + { + "epoch": 0.19847082206743424, + "grad_norm": 1.605495810508728, + "learning_rate": 0.00018548069690559383, + "loss": 1.2084, + "step": 5542 + }, + { + "epoch": 0.1985066341970025, + "grad_norm": 1.6472214460372925, + "learning_rate": 0.00018547467706816546, + "loss": 1.0625, + "step": 5543 + }, + { + "epoch": 0.1985424463265708, + "grad_norm": 1.591853380203247, + "learning_rate": 0.00018546865608078559, + "loss": 1.226, + "step": 5544 + }, + { + "epoch": 0.1985782584561391, + "grad_norm": 1.5722146034240723, + "learning_rate": 0.0001854626339435352, + "loss": 1.3642, + "step": 5545 + }, + { + "epoch": 0.19861407058570738, + "grad_norm": 1.309429407119751, + "learning_rate": 0.0001854566106564953, + "loss": 1.0708, + "step": 5546 + }, + { + "epoch": 0.19864988271527567, + "grad_norm": 1.4604986906051636, + "learning_rate": 0.00018545058621974693, + "loss": 1.2111, + "step": 5547 + }, + { + "epoch": 0.19868569484484394, + "grad_norm": 1.3385380506515503, + "learning_rate": 0.00018544456063337116, + "loss": 1.0963, + "step": 5548 + }, + { + "epoch": 0.19872150697441224, + "grad_norm": 1.6139216423034668, + "learning_rate": 0.00018543853389744905, + "loss": 1.1832, + "step": 5549 + }, + { + "epoch": 0.1987573191039805, + "grad_norm": 1.4242664575576782, + "learning_rate": 0.00018543250601206165, + "loss": 1.3143, + "step": 5550 + }, + { + "epoch": 0.1987931312335488, + "grad_norm": 1.5678741931915283, + "learning_rate": 0.00018542647697729009, + "loss": 1.1815, + "step": 5551 + }, + { + "epoch": 0.1988289433631171, + "grad_norm": 1.8707845211029053, + "learning_rate": 0.00018542044679321549, + "loss": 1.2486, + "step": 5552 + }, + { + "epoch": 0.19886475549268537, + "grad_norm": 1.3513062000274658, + "learning_rate": 0.00018541441545991892, + "loss": 1.0779, + "step": 5553 + }, + { + "epoch": 0.19890056762225367, + "grad_norm": 1.7062551975250244, + "learning_rate": 0.00018540838297748162, + "loss": 0.9954, + "step": 5554 + }, + { + "epoch": 0.19893637975182193, + "grad_norm": 1.2607803344726562, + "learning_rate": 0.0001854023493459847, + "loss": 0.9574, + "step": 5555 + }, + { + "epoch": 0.19897219188139023, + "grad_norm": 1.1019786596298218, + "learning_rate": 0.00018539631456550927, + "loss": 0.9639, + "step": 5556 + }, + { + "epoch": 0.1990080040109585, + "grad_norm": 1.5840280055999756, + "learning_rate": 0.00018539027863613664, + "loss": 1.2437, + "step": 5557 + }, + { + "epoch": 0.1990438161405268, + "grad_norm": 1.3191173076629639, + "learning_rate": 0.0001853842415579479, + "loss": 1.2533, + "step": 5558 + }, + { + "epoch": 0.1990796282700951, + "grad_norm": 2.1385531425476074, + "learning_rate": 0.0001853782033310244, + "loss": 1.1988, + "step": 5559 + }, + { + "epoch": 0.19911544039966336, + "grad_norm": 1.4287512302398682, + "learning_rate": 0.00018537216395544723, + "loss": 1.1231, + "step": 5560 + }, + { + "epoch": 0.19915125252923166, + "grad_norm": 1.9745829105377197, + "learning_rate": 0.00018536612343129778, + "loss": 1.1639, + "step": 5561 + }, + { + "epoch": 0.19918706465879993, + "grad_norm": 1.466817021369934, + "learning_rate": 0.0001853600817586572, + "loss": 1.2861, + "step": 5562 + }, + { + "epoch": 0.19922287678836822, + "grad_norm": 2.0731616020202637, + "learning_rate": 0.00018535403893760684, + "loss": 1.3173, + "step": 5563 + }, + { + "epoch": 0.1992586889179365, + "grad_norm": 2.1758038997650146, + "learning_rate": 0.00018534799496822802, + "loss": 1.2014, + "step": 5564 + }, + { + "epoch": 0.1992945010475048, + "grad_norm": 1.8118274211883545, + "learning_rate": 0.00018534194985060198, + "loss": 1.3525, + "step": 5565 + }, + { + "epoch": 0.1993303131770731, + "grad_norm": 1.5485012531280518, + "learning_rate": 0.0001853359035848101, + "loss": 1.2688, + "step": 5566 + }, + { + "epoch": 0.19936612530664136, + "grad_norm": 1.7634280920028687, + "learning_rate": 0.0001853298561709337, + "loss": 1.1478, + "step": 5567 + }, + { + "epoch": 0.19940193743620965, + "grad_norm": 1.5654810667037964, + "learning_rate": 0.0001853238076090542, + "loss": 1.2053, + "step": 5568 + }, + { + "epoch": 0.19943774956577792, + "grad_norm": 1.3921397924423218, + "learning_rate": 0.00018531775789925288, + "loss": 1.1557, + "step": 5569 + }, + { + "epoch": 0.19947356169534622, + "grad_norm": 1.3667486906051636, + "learning_rate": 0.00018531170704161117, + "loss": 1.2038, + "step": 5570 + }, + { + "epoch": 0.1995093738249145, + "grad_norm": 1.8619238138198853, + "learning_rate": 0.00018530565503621052, + "loss": 1.1156, + "step": 5571 + }, + { + "epoch": 0.19954518595448278, + "grad_norm": 1.9847443103790283, + "learning_rate": 0.00018529960188313233, + "loss": 1.1148, + "step": 5572 + }, + { + "epoch": 0.19958099808405108, + "grad_norm": 1.4127296209335327, + "learning_rate": 0.000185293547582458, + "loss": 1.025, + "step": 5573 + }, + { + "epoch": 0.19961681021361935, + "grad_norm": 1.6443486213684082, + "learning_rate": 0.000185287492134269, + "loss": 1.2149, + "step": 5574 + }, + { + "epoch": 0.19965262234318765, + "grad_norm": 1.8859471082687378, + "learning_rate": 0.00018528143553864682, + "loss": 1.0458, + "step": 5575 + }, + { + "epoch": 0.19968843447275592, + "grad_norm": 1.9277607202529907, + "learning_rate": 0.00018527537779567294, + "loss": 1.3341, + "step": 5576 + }, + { + "epoch": 0.1997242466023242, + "grad_norm": 1.6015419960021973, + "learning_rate": 0.00018526931890542882, + "loss": 1.1337, + "step": 5577 + }, + { + "epoch": 0.19976005873189248, + "grad_norm": 1.9471124410629272, + "learning_rate": 0.00018526325886799601, + "loss": 1.0453, + "step": 5578 + }, + { + "epoch": 0.19979587086146078, + "grad_norm": 1.590812087059021, + "learning_rate": 0.00018525719768345606, + "loss": 1.3265, + "step": 5579 + }, + { + "epoch": 0.19983168299102907, + "grad_norm": 2.653160333633423, + "learning_rate": 0.00018525113535189047, + "loss": 1.3719, + "step": 5580 + }, + { + "epoch": 0.19986749512059734, + "grad_norm": 1.424807071685791, + "learning_rate": 0.00018524507187338082, + "loss": 1.2841, + "step": 5581 + }, + { + "epoch": 0.19990330725016564, + "grad_norm": 1.4514027833938599, + "learning_rate": 0.00018523900724800872, + "loss": 1.1154, + "step": 5582 + }, + { + "epoch": 0.1999391193797339, + "grad_norm": 1.912869930267334, + "learning_rate": 0.00018523294147585568, + "loss": 1.1426, + "step": 5583 + }, + { + "epoch": 0.1999749315093022, + "grad_norm": 2.0479085445404053, + "learning_rate": 0.00018522687455700337, + "loss": 1.2495, + "step": 5584 + }, + { + "epoch": 0.20001074363887048, + "grad_norm": 1.5522781610488892, + "learning_rate": 0.0001852208064915334, + "loss": 1.2389, + "step": 5585 + }, + { + "epoch": 0.20004655576843877, + "grad_norm": 1.4387106895446777, + "learning_rate": 0.00018521473727952742, + "loss": 1.1312, + "step": 5586 + }, + { + "epoch": 0.20008236789800707, + "grad_norm": 1.5917448997497559, + "learning_rate": 0.00018520866692106703, + "loss": 1.3244, + "step": 5587 + }, + { + "epoch": 0.20011818002757534, + "grad_norm": 1.4525855779647827, + "learning_rate": 0.00018520259541623398, + "loss": 1.3579, + "step": 5588 + }, + { + "epoch": 0.20015399215714363, + "grad_norm": 1.7744776010513306, + "learning_rate": 0.0001851965227651099, + "loss": 1.1945, + "step": 5589 + }, + { + "epoch": 0.2001898042867119, + "grad_norm": 1.6735714673995972, + "learning_rate": 0.00018519044896777648, + "loss": 1.4639, + "step": 5590 + }, + { + "epoch": 0.2002256164162802, + "grad_norm": 1.3976898193359375, + "learning_rate": 0.0001851843740243155, + "loss": 1.2882, + "step": 5591 + }, + { + "epoch": 0.20026142854584847, + "grad_norm": 1.643754005432129, + "learning_rate": 0.00018517829793480861, + "loss": 1.292, + "step": 5592 + }, + { + "epoch": 0.20029724067541677, + "grad_norm": 1.8597618341445923, + "learning_rate": 0.0001851722206993376, + "loss": 1.0908, + "step": 5593 + }, + { + "epoch": 0.20033305280498506, + "grad_norm": 1.775797963142395, + "learning_rate": 0.00018516614231798423, + "loss": 1.2748, + "step": 5594 + }, + { + "epoch": 0.20036886493455333, + "grad_norm": 1.4376592636108398, + "learning_rate": 0.00018516006279083026, + "loss": 1.3815, + "step": 5595 + }, + { + "epoch": 0.20040467706412163, + "grad_norm": 1.5083205699920654, + "learning_rate": 0.0001851539821179575, + "loss": 1.0715, + "step": 5596 + }, + { + "epoch": 0.2004404891936899, + "grad_norm": 1.4475585222244263, + "learning_rate": 0.00018514790029944777, + "loss": 1.2451, + "step": 5597 + }, + { + "epoch": 0.2004763013232582, + "grad_norm": 1.8563859462738037, + "learning_rate": 0.00018514181733538285, + "loss": 1.1528, + "step": 5598 + }, + { + "epoch": 0.20051211345282646, + "grad_norm": 1.3338218927383423, + "learning_rate": 0.00018513573322584463, + "loss": 1.2403, + "step": 5599 + }, + { + "epoch": 0.20054792558239476, + "grad_norm": 1.5304951667785645, + "learning_rate": 0.0001851296479709149, + "loss": 1.3024, + "step": 5600 + }, + { + "epoch": 0.20058373771196303, + "grad_norm": 1.6376433372497559, + "learning_rate": 0.00018512356157067558, + "loss": 1.2082, + "step": 5601 + }, + { + "epoch": 0.20061954984153132, + "grad_norm": 1.2693275213241577, + "learning_rate": 0.00018511747402520857, + "loss": 0.8763, + "step": 5602 + }, + { + "epoch": 0.20065536197109962, + "grad_norm": 1.3545167446136475, + "learning_rate": 0.0001851113853345957, + "loss": 1.2701, + "step": 5603 + }, + { + "epoch": 0.2006911741006679, + "grad_norm": 1.5375765562057495, + "learning_rate": 0.00018510529549891895, + "loss": 1.2068, + "step": 5604 + }, + { + "epoch": 0.2007269862302362, + "grad_norm": 2.0584585666656494, + "learning_rate": 0.00018509920451826022, + "loss": 1.0049, + "step": 5605 + }, + { + "epoch": 0.20076279835980446, + "grad_norm": 1.8351408243179321, + "learning_rate": 0.00018509311239270145, + "loss": 1.0711, + "step": 5606 + }, + { + "epoch": 0.20079861048937275, + "grad_norm": 1.724690556526184, + "learning_rate": 0.00018508701912232464, + "loss": 1.1477, + "step": 5607 + }, + { + "epoch": 0.20083442261894102, + "grad_norm": 1.6891528367996216, + "learning_rate": 0.00018508092470721175, + "loss": 1.1019, + "step": 5608 + }, + { + "epoch": 0.20087023474850932, + "grad_norm": 1.558571696281433, + "learning_rate": 0.0001850748291474447, + "loss": 1.2042, + "step": 5609 + }, + { + "epoch": 0.20090604687807762, + "grad_norm": 1.7499001026153564, + "learning_rate": 0.00018506873244310563, + "loss": 1.0422, + "step": 5610 + }, + { + "epoch": 0.20094185900764588, + "grad_norm": 1.5836446285247803, + "learning_rate": 0.00018506263459427648, + "loss": 1.2033, + "step": 5611 + }, + { + "epoch": 0.20097767113721418, + "grad_norm": 1.3632310628890991, + "learning_rate": 0.00018505653560103928, + "loss": 1.2208, + "step": 5612 + }, + { + "epoch": 0.20101348326678245, + "grad_norm": 1.5123504400253296, + "learning_rate": 0.00018505043546347612, + "loss": 1.0289, + "step": 5613 + }, + { + "epoch": 0.20104929539635075, + "grad_norm": 1.4539918899536133, + "learning_rate": 0.00018504433418166908, + "loss": 1.1127, + "step": 5614 + }, + { + "epoch": 0.20108510752591902, + "grad_norm": 1.4341799020767212, + "learning_rate": 0.00018503823175570021, + "loss": 1.2384, + "step": 5615 + }, + { + "epoch": 0.2011209196554873, + "grad_norm": 1.7121295928955078, + "learning_rate": 0.00018503212818565161, + "loss": 1.1199, + "step": 5616 + }, + { + "epoch": 0.2011567317850556, + "grad_norm": 1.7023324966430664, + "learning_rate": 0.00018502602347160544, + "loss": 1.2296, + "step": 5617 + }, + { + "epoch": 0.20119254391462388, + "grad_norm": 1.6691267490386963, + "learning_rate": 0.00018501991761364376, + "loss": 1.2268, + "step": 5618 + }, + { + "epoch": 0.20122835604419217, + "grad_norm": 1.445711374282837, + "learning_rate": 0.00018501381061184876, + "loss": 1.2353, + "step": 5619 + }, + { + "epoch": 0.20126416817376044, + "grad_norm": 1.78801691532135, + "learning_rate": 0.0001850077024663026, + "loss": 1.0375, + "step": 5620 + }, + { + "epoch": 0.20129998030332874, + "grad_norm": 1.3707906007766724, + "learning_rate": 0.00018500159317708749, + "loss": 1.4135, + "step": 5621 + }, + { + "epoch": 0.201335792432897, + "grad_norm": 1.5806629657745361, + "learning_rate": 0.00018499548274428557, + "loss": 1.0786, + "step": 5622 + }, + { + "epoch": 0.2013716045624653, + "grad_norm": 1.6732141971588135, + "learning_rate": 0.00018498937116797904, + "loss": 1.2222, + "step": 5623 + }, + { + "epoch": 0.2014074166920336, + "grad_norm": 1.2954885959625244, + "learning_rate": 0.0001849832584482502, + "loss": 1.2102, + "step": 5624 + }, + { + "epoch": 0.20144322882160187, + "grad_norm": 1.5995103120803833, + "learning_rate": 0.00018497714458518122, + "loss": 1.2059, + "step": 5625 + }, + { + "epoch": 0.20147904095117017, + "grad_norm": 1.4343912601470947, + "learning_rate": 0.00018497102957885434, + "loss": 1.1506, + "step": 5626 + }, + { + "epoch": 0.20151485308073844, + "grad_norm": 1.45311439037323, + "learning_rate": 0.0001849649134293519, + "loss": 1.1888, + "step": 5627 + }, + { + "epoch": 0.20155066521030673, + "grad_norm": 1.7754563093185425, + "learning_rate": 0.00018495879613675612, + "loss": 1.0173, + "step": 5628 + }, + { + "epoch": 0.201586477339875, + "grad_norm": 1.7730247974395752, + "learning_rate": 0.00018495267770114935, + "loss": 1.076, + "step": 5629 + }, + { + "epoch": 0.2016222894694433, + "grad_norm": 1.240389108657837, + "learning_rate": 0.00018494655812261387, + "loss": 1.1576, + "step": 5630 + }, + { + "epoch": 0.2016581015990116, + "grad_norm": 1.7536720037460327, + "learning_rate": 0.00018494043740123202, + "loss": 1.2588, + "step": 5631 + }, + { + "epoch": 0.20169391372857987, + "grad_norm": 1.593070149421692, + "learning_rate": 0.00018493431553708614, + "loss": 1.3414, + "step": 5632 + }, + { + "epoch": 0.20172972585814816, + "grad_norm": 1.595645546913147, + "learning_rate": 0.0001849281925302586, + "loss": 1.1856, + "step": 5633 + }, + { + "epoch": 0.20176553798771643, + "grad_norm": 1.5516860485076904, + "learning_rate": 0.0001849220683808318, + "loss": 1.2567, + "step": 5634 + }, + { + "epoch": 0.20180135011728473, + "grad_norm": 1.4141186475753784, + "learning_rate": 0.00018491594308888814, + "loss": 1.2423, + "step": 5635 + }, + { + "epoch": 0.201837162246853, + "grad_norm": 1.6589336395263672, + "learning_rate": 0.00018490981665450994, + "loss": 1.2578, + "step": 5636 + }, + { + "epoch": 0.2018729743764213, + "grad_norm": 1.3463112115859985, + "learning_rate": 0.00018490368907777974, + "loss": 1.3587, + "step": 5637 + }, + { + "epoch": 0.2019087865059896, + "grad_norm": 1.4763635396957397, + "learning_rate": 0.0001848975603587799, + "loss": 1.1616, + "step": 5638 + }, + { + "epoch": 0.20194459863555786, + "grad_norm": 1.8425880670547485, + "learning_rate": 0.00018489143049759286, + "loss": 1.2243, + "step": 5639 + }, + { + "epoch": 0.20198041076512616, + "grad_norm": 1.8861240148544312, + "learning_rate": 0.00018488529949430116, + "loss": 1.2055, + "step": 5640 + }, + { + "epoch": 0.20201622289469442, + "grad_norm": 1.5399963855743408, + "learning_rate": 0.00018487916734898722, + "loss": 1.2614, + "step": 5641 + }, + { + "epoch": 0.20205203502426272, + "grad_norm": 2.4605929851531982, + "learning_rate": 0.0001848730340617336, + "loss": 1.2863, + "step": 5642 + }, + { + "epoch": 0.202087847153831, + "grad_norm": 1.6150505542755127, + "learning_rate": 0.00018486689963262277, + "loss": 1.2396, + "step": 5643 + }, + { + "epoch": 0.2021236592833993, + "grad_norm": 1.5606422424316406, + "learning_rate": 0.00018486076406173726, + "loss": 1.1424, + "step": 5644 + }, + { + "epoch": 0.20215947141296758, + "grad_norm": 1.4226592779159546, + "learning_rate": 0.00018485462734915966, + "loss": 1.279, + "step": 5645 + }, + { + "epoch": 0.20219528354253585, + "grad_norm": 1.5522451400756836, + "learning_rate": 0.0001848484894949725, + "loss": 1.4417, + "step": 5646 + }, + { + "epoch": 0.20223109567210415, + "grad_norm": 1.4619139432907104, + "learning_rate": 0.00018484235049925836, + "loss": 1.2152, + "step": 5647 + }, + { + "epoch": 0.20226690780167242, + "grad_norm": 1.7783467769622803, + "learning_rate": 0.00018483621036209983, + "loss": 1.2844, + "step": 5648 + }, + { + "epoch": 0.20230271993124072, + "grad_norm": 1.429095983505249, + "learning_rate": 0.0001848300690835795, + "loss": 1.2397, + "step": 5649 + }, + { + "epoch": 0.20233853206080898, + "grad_norm": 1.229067087173462, + "learning_rate": 0.00018482392666378003, + "loss": 1.1734, + "step": 5650 + }, + { + "epoch": 0.20237434419037728, + "grad_norm": 1.1384248733520508, + "learning_rate": 0.00018481778310278405, + "loss": 1.1596, + "step": 5651 + }, + { + "epoch": 0.20241015631994558, + "grad_norm": 1.5326344966888428, + "learning_rate": 0.0001848116384006742, + "loss": 1.1994, + "step": 5652 + }, + { + "epoch": 0.20244596844951385, + "grad_norm": 1.4800068140029907, + "learning_rate": 0.00018480549255753313, + "loss": 1.0223, + "step": 5653 + }, + { + "epoch": 0.20248178057908214, + "grad_norm": 1.5276614427566528, + "learning_rate": 0.0001847993455734436, + "loss": 1.2204, + "step": 5654 + }, + { + "epoch": 0.2025175927086504, + "grad_norm": 1.6002328395843506, + "learning_rate": 0.00018479319744848821, + "loss": 1.3225, + "step": 5655 + }, + { + "epoch": 0.2025534048382187, + "grad_norm": 2.5998306274414062, + "learning_rate": 0.00018478704818274976, + "loss": 1.3149, + "step": 5656 + }, + { + "epoch": 0.20258921696778698, + "grad_norm": 1.792359709739685, + "learning_rate": 0.00018478089777631092, + "loss": 1.2404, + "step": 5657 + }, + { + "epoch": 0.20262502909735527, + "grad_norm": 1.251320481300354, + "learning_rate": 0.00018477474622925449, + "loss": 1.1095, + "step": 5658 + }, + { + "epoch": 0.20266084122692357, + "grad_norm": 1.4709254503250122, + "learning_rate": 0.00018476859354166317, + "loss": 1.1978, + "step": 5659 + }, + { + "epoch": 0.20269665335649184, + "grad_norm": 1.6654634475708008, + "learning_rate": 0.0001847624397136198, + "loss": 1.1599, + "step": 5660 + }, + { + "epoch": 0.20273246548606014, + "grad_norm": 1.5736348628997803, + "learning_rate": 0.0001847562847452071, + "loss": 1.2534, + "step": 5661 + }, + { + "epoch": 0.2027682776156284, + "grad_norm": 1.794840693473816, + "learning_rate": 0.0001847501286365079, + "loss": 1.173, + "step": 5662 + }, + { + "epoch": 0.2028040897451967, + "grad_norm": 1.307637333869934, + "learning_rate": 0.00018474397138760508, + "loss": 1.1769, + "step": 5663 + }, + { + "epoch": 0.20283990187476497, + "grad_norm": 1.6954820156097412, + "learning_rate": 0.00018473781299858146, + "loss": 1.2881, + "step": 5664 + }, + { + "epoch": 0.20287571400433327, + "grad_norm": 2.1980183124542236, + "learning_rate": 0.0001847316534695198, + "loss": 1.4199, + "step": 5665 + }, + { + "epoch": 0.20291152613390157, + "grad_norm": 1.3763688802719116, + "learning_rate": 0.0001847254928005031, + "loss": 1.2628, + "step": 5666 + }, + { + "epoch": 0.20294733826346983, + "grad_norm": 1.5667475461959839, + "learning_rate": 0.00018471933099161415, + "loss": 1.2823, + "step": 5667 + }, + { + "epoch": 0.20298315039303813, + "grad_norm": 2.12888240814209, + "learning_rate": 0.00018471316804293594, + "loss": 1.3439, + "step": 5668 + }, + { + "epoch": 0.2030189625226064, + "grad_norm": 1.4838321208953857, + "learning_rate": 0.00018470700395455125, + "loss": 1.3741, + "step": 5669 + }, + { + "epoch": 0.2030547746521747, + "grad_norm": 1.6266424655914307, + "learning_rate": 0.00018470083872654312, + "loss": 1.0459, + "step": 5670 + }, + { + "epoch": 0.20309058678174297, + "grad_norm": 1.5038843154907227, + "learning_rate": 0.00018469467235899444, + "loss": 1.1144, + "step": 5671 + }, + { + "epoch": 0.20312639891131126, + "grad_norm": 1.577218770980835, + "learning_rate": 0.00018468850485198822, + "loss": 1.2041, + "step": 5672 + }, + { + "epoch": 0.20316221104087956, + "grad_norm": 1.882367730140686, + "learning_rate": 0.00018468233620560739, + "loss": 0.94, + "step": 5673 + }, + { + "epoch": 0.20319802317044783, + "grad_norm": 1.5407545566558838, + "learning_rate": 0.00018467616641993498, + "loss": 1.2505, + "step": 5674 + }, + { + "epoch": 0.20323383530001612, + "grad_norm": 1.7741369009017944, + "learning_rate": 0.00018466999549505392, + "loss": 1.1419, + "step": 5675 + }, + { + "epoch": 0.2032696474295844, + "grad_norm": 1.7691892385482788, + "learning_rate": 0.00018466382343104734, + "loss": 1.1769, + "step": 5676 + }, + { + "epoch": 0.2033054595591527, + "grad_norm": 1.8440338373184204, + "learning_rate": 0.00018465765022799823, + "loss": 1.0947, + "step": 5677 + }, + { + "epoch": 0.20334127168872096, + "grad_norm": 1.614633560180664, + "learning_rate": 0.00018465147588598958, + "loss": 1.0431, + "step": 5678 + }, + { + "epoch": 0.20337708381828926, + "grad_norm": 1.6896435022354126, + "learning_rate": 0.00018464530040510456, + "loss": 1.3024, + "step": 5679 + }, + { + "epoch": 0.20341289594785755, + "grad_norm": 1.7185337543487549, + "learning_rate": 0.0001846391237854262, + "loss": 1.2343, + "step": 5680 + }, + { + "epoch": 0.20344870807742582, + "grad_norm": 1.3764500617980957, + "learning_rate": 0.0001846329460270376, + "loss": 1.1933, + "step": 5681 + }, + { + "epoch": 0.20348452020699412, + "grad_norm": 1.9724040031433105, + "learning_rate": 0.0001846267671300219, + "loss": 1.3377, + "step": 5682 + }, + { + "epoch": 0.2035203323365624, + "grad_norm": 2.1054139137268066, + "learning_rate": 0.00018462058709446216, + "loss": 1.3429, + "step": 5683 + }, + { + "epoch": 0.20355614446613068, + "grad_norm": 1.587171196937561, + "learning_rate": 0.00018461440592044165, + "loss": 1.3877, + "step": 5684 + }, + { + "epoch": 0.20359195659569895, + "grad_norm": 1.3183552026748657, + "learning_rate": 0.00018460822360804338, + "loss": 1.2926, + "step": 5685 + }, + { + "epoch": 0.20362776872526725, + "grad_norm": 1.412224292755127, + "learning_rate": 0.00018460204015735064, + "loss": 1.1962, + "step": 5686 + }, + { + "epoch": 0.20366358085483555, + "grad_norm": 1.6202760934829712, + "learning_rate": 0.00018459585556844656, + "loss": 1.3381, + "step": 5687 + }, + { + "epoch": 0.20369939298440382, + "grad_norm": 1.6764341592788696, + "learning_rate": 0.00018458966984141438, + "loss": 1.087, + "step": 5688 + }, + { + "epoch": 0.2037352051139721, + "grad_norm": 1.7893017530441284, + "learning_rate": 0.00018458348297633727, + "loss": 1.1771, + "step": 5689 + }, + { + "epoch": 0.20377101724354038, + "grad_norm": 1.6495959758758545, + "learning_rate": 0.00018457729497329853, + "loss": 1.0608, + "step": 5690 + }, + { + "epoch": 0.20380682937310868, + "grad_norm": 1.8015632629394531, + "learning_rate": 0.0001845711058323814, + "loss": 1.2931, + "step": 5691 + }, + { + "epoch": 0.20384264150267695, + "grad_norm": 1.3405944108963013, + "learning_rate": 0.0001845649155536691, + "loss": 1.2165, + "step": 5692 + }, + { + "epoch": 0.20387845363224524, + "grad_norm": 1.5007469654083252, + "learning_rate": 0.00018455872413724496, + "loss": 1.1719, + "step": 5693 + }, + { + "epoch": 0.2039142657618135, + "grad_norm": 1.7417266368865967, + "learning_rate": 0.00018455253158319225, + "loss": 1.0779, + "step": 5694 + }, + { + "epoch": 0.2039500778913818, + "grad_norm": 1.6824958324432373, + "learning_rate": 0.00018454633789159427, + "loss": 1.2879, + "step": 5695 + }, + { + "epoch": 0.2039858900209501, + "grad_norm": 1.3206311464309692, + "learning_rate": 0.0001845401430625344, + "loss": 1.2197, + "step": 5696 + }, + { + "epoch": 0.20402170215051837, + "grad_norm": 1.8253862857818604, + "learning_rate": 0.00018453394709609598, + "loss": 1.4991, + "step": 5697 + }, + { + "epoch": 0.20405751428008667, + "grad_norm": 1.5509058237075806, + "learning_rate": 0.0001845277499923623, + "loss": 1.3038, + "step": 5698 + }, + { + "epoch": 0.20409332640965494, + "grad_norm": 1.8411304950714111, + "learning_rate": 0.0001845215517514168, + "loss": 1.3209, + "step": 5699 + }, + { + "epoch": 0.20412913853922324, + "grad_norm": 1.4423882961273193, + "learning_rate": 0.0001845153523733428, + "loss": 1.2996, + "step": 5700 + }, + { + "epoch": 0.2041649506687915, + "grad_norm": 1.4836838245391846, + "learning_rate": 0.00018450915185822382, + "loss": 1.3073, + "step": 5701 + }, + { + "epoch": 0.2042007627983598, + "grad_norm": 1.747931957244873, + "learning_rate": 0.00018450295020614317, + "loss": 1.1895, + "step": 5702 + }, + { + "epoch": 0.2042365749279281, + "grad_norm": 1.3882687091827393, + "learning_rate": 0.00018449674741718433, + "loss": 1.3688, + "step": 5703 + }, + { + "epoch": 0.20427238705749637, + "grad_norm": 1.7264467477798462, + "learning_rate": 0.00018449054349143072, + "loss": 1.1334, + "step": 5704 + }, + { + "epoch": 0.20430819918706467, + "grad_norm": 1.5111839771270752, + "learning_rate": 0.0001844843384289659, + "loss": 1.3782, + "step": 5705 + }, + { + "epoch": 0.20434401131663293, + "grad_norm": 1.9728127717971802, + "learning_rate": 0.00018447813222987323, + "loss": 1.4903, + "step": 5706 + }, + { + "epoch": 0.20437982344620123, + "grad_norm": 1.8894506692886353, + "learning_rate": 0.00018447192489423625, + "loss": 1.2895, + "step": 5707 + }, + { + "epoch": 0.2044156355757695, + "grad_norm": 1.926550030708313, + "learning_rate": 0.00018446571642213852, + "loss": 1.4962, + "step": 5708 + }, + { + "epoch": 0.2044514477053378, + "grad_norm": 1.2439274787902832, + "learning_rate": 0.0001844595068136635, + "loss": 1.2184, + "step": 5709 + }, + { + "epoch": 0.2044872598349061, + "grad_norm": 1.4495917558670044, + "learning_rate": 0.0001844532960688948, + "loss": 1.0723, + "step": 5710 + }, + { + "epoch": 0.20452307196447436, + "grad_norm": 1.3969535827636719, + "learning_rate": 0.0001844470841879159, + "loss": 1.1694, + "step": 5711 + }, + { + "epoch": 0.20455888409404266, + "grad_norm": 1.2102735042572021, + "learning_rate": 0.00018444087117081042, + "loss": 1.1702, + "step": 5712 + }, + { + "epoch": 0.20459469622361093, + "grad_norm": 1.6083556413650513, + "learning_rate": 0.00018443465701766196, + "loss": 1.0602, + "step": 5713 + }, + { + "epoch": 0.20463050835317922, + "grad_norm": 2.327089548110962, + "learning_rate": 0.0001844284417285541, + "loss": 1.2917, + "step": 5714 + }, + { + "epoch": 0.2046663204827475, + "grad_norm": 1.3855392932891846, + "learning_rate": 0.00018442222530357043, + "loss": 1.3314, + "step": 5715 + }, + { + "epoch": 0.2047021326123158, + "grad_norm": 1.431308627128601, + "learning_rate": 0.00018441600774279465, + "loss": 1.1585, + "step": 5716 + }, + { + "epoch": 0.2047379447418841, + "grad_norm": 1.6833840608596802, + "learning_rate": 0.00018440978904631032, + "loss": 1.0137, + "step": 5717 + }, + { + "epoch": 0.20477375687145236, + "grad_norm": 1.4842368364334106, + "learning_rate": 0.00018440356921420122, + "loss": 1.3742, + "step": 5718 + }, + { + "epoch": 0.20480956900102065, + "grad_norm": 1.230654239654541, + "learning_rate": 0.00018439734824655092, + "loss": 1.0693, + "step": 5719 + }, + { + "epoch": 0.20484538113058892, + "grad_norm": 2.551546335220337, + "learning_rate": 0.00018439112614344322, + "loss": 1.0699, + "step": 5720 + }, + { + "epoch": 0.20488119326015722, + "grad_norm": 1.4214671850204468, + "learning_rate": 0.0001843849029049617, + "loss": 1.1373, + "step": 5721 + }, + { + "epoch": 0.2049170053897255, + "grad_norm": 1.5684329271316528, + "learning_rate": 0.00018437867853119023, + "loss": 1.0957, + "step": 5722 + }, + { + "epoch": 0.20495281751929378, + "grad_norm": 1.8979581594467163, + "learning_rate": 0.00018437245302221244, + "loss": 1.2407, + "step": 5723 + }, + { + "epoch": 0.20498862964886208, + "grad_norm": 1.2931841611862183, + "learning_rate": 0.00018436622637811215, + "loss": 1.1514, + "step": 5724 + }, + { + "epoch": 0.20502444177843035, + "grad_norm": 1.6948623657226562, + "learning_rate": 0.0001843599985989731, + "loss": 1.1731, + "step": 5725 + }, + { + "epoch": 0.20506025390799865, + "grad_norm": 2.1802875995635986, + "learning_rate": 0.0001843537696848791, + "loss": 1.179, + "step": 5726 + }, + { + "epoch": 0.20509606603756692, + "grad_norm": 1.3925975561141968, + "learning_rate": 0.0001843475396359139, + "loss": 0.955, + "step": 5727 + }, + { + "epoch": 0.2051318781671352, + "grad_norm": 1.3452024459838867, + "learning_rate": 0.00018434130845216138, + "loss": 1.1475, + "step": 5728 + }, + { + "epoch": 0.20516769029670348, + "grad_norm": 1.32256281375885, + "learning_rate": 0.00018433507613370534, + "loss": 1.2005, + "step": 5729 + }, + { + "epoch": 0.20520350242627178, + "grad_norm": 1.598803162574768, + "learning_rate": 0.00018432884268062964, + "loss": 1.2464, + "step": 5730 + }, + { + "epoch": 0.20523931455584007, + "grad_norm": 1.6527460813522339, + "learning_rate": 0.00018432260809301816, + "loss": 1.1625, + "step": 5731 + }, + { + "epoch": 0.20527512668540834, + "grad_norm": 1.4007484912872314, + "learning_rate": 0.00018431637237095472, + "loss": 1.0932, + "step": 5732 + }, + { + "epoch": 0.20531093881497664, + "grad_norm": 2.0654172897338867, + "learning_rate": 0.00018431013551452327, + "loss": 1.2982, + "step": 5733 + }, + { + "epoch": 0.2053467509445449, + "grad_norm": 1.4852917194366455, + "learning_rate": 0.0001843038975238077, + "loss": 0.9719, + "step": 5734 + }, + { + "epoch": 0.2053825630741132, + "grad_norm": 1.3660446405410767, + "learning_rate": 0.00018429765839889193, + "loss": 1.0387, + "step": 5735 + }, + { + "epoch": 0.20541837520368147, + "grad_norm": 1.5656299591064453, + "learning_rate": 0.0001842914181398599, + "loss": 1.166, + "step": 5736 + }, + { + "epoch": 0.20545418733324977, + "grad_norm": 1.6326770782470703, + "learning_rate": 0.00018428517674679557, + "loss": 1.0259, + "step": 5737 + }, + { + "epoch": 0.20548999946281807, + "grad_norm": 1.6408950090408325, + "learning_rate": 0.0001842789342197829, + "loss": 1.1974, + "step": 5738 + }, + { + "epoch": 0.20552581159238634, + "grad_norm": 1.7128431797027588, + "learning_rate": 0.00018427269055890588, + "loss": 1.3325, + "step": 5739 + }, + { + "epoch": 0.20556162372195463, + "grad_norm": 1.4046363830566406, + "learning_rate": 0.00018426644576424855, + "loss": 1.2332, + "step": 5740 + }, + { + "epoch": 0.2055974358515229, + "grad_norm": 1.2155282497406006, + "learning_rate": 0.00018426019983589482, + "loss": 1.1843, + "step": 5741 + }, + { + "epoch": 0.2056332479810912, + "grad_norm": 2.0360589027404785, + "learning_rate": 0.00018425395277392882, + "loss": 1.1798, + "step": 5742 + }, + { + "epoch": 0.20566906011065947, + "grad_norm": 1.5911599397659302, + "learning_rate": 0.0001842477045784346, + "loss": 1.1407, + "step": 5743 + }, + { + "epoch": 0.20570487224022777, + "grad_norm": 2.352126359939575, + "learning_rate": 0.00018424145524949614, + "loss": 1.3287, + "step": 5744 + }, + { + "epoch": 0.20574068436979606, + "grad_norm": 2.3703670501708984, + "learning_rate": 0.00018423520478719758, + "loss": 1.1754, + "step": 5745 + }, + { + "epoch": 0.20577649649936433, + "grad_norm": 1.6545145511627197, + "learning_rate": 0.00018422895319162298, + "loss": 1.1577, + "step": 5746 + }, + { + "epoch": 0.20581230862893263, + "grad_norm": 1.7815451622009277, + "learning_rate": 0.0001842227004628565, + "loss": 1.1038, + "step": 5747 + }, + { + "epoch": 0.2058481207585009, + "grad_norm": 1.9959211349487305, + "learning_rate": 0.00018421644660098217, + "loss": 1.218, + "step": 5748 + }, + { + "epoch": 0.2058839328880692, + "grad_norm": 1.775768518447876, + "learning_rate": 0.00018421019160608424, + "loss": 1.2191, + "step": 5749 + }, + { + "epoch": 0.20591974501763746, + "grad_norm": 1.6170830726623535, + "learning_rate": 0.00018420393547824676, + "loss": 1.0018, + "step": 5750 + }, + { + "epoch": 0.20595555714720576, + "grad_norm": 2.5513222217559814, + "learning_rate": 0.000184197678217554, + "loss": 1.278, + "step": 5751 + }, + { + "epoch": 0.20599136927677406, + "grad_norm": 1.7760447263717651, + "learning_rate": 0.00018419141982409001, + "loss": 1.01, + "step": 5752 + }, + { + "epoch": 0.20602718140634232, + "grad_norm": 1.4122519493103027, + "learning_rate": 0.00018418516029793916, + "loss": 1.2435, + "step": 5753 + }, + { + "epoch": 0.20606299353591062, + "grad_norm": 1.4283791780471802, + "learning_rate": 0.00018417889963918548, + "loss": 1.3784, + "step": 5754 + }, + { + "epoch": 0.2060988056654789, + "grad_norm": 1.3227452039718628, + "learning_rate": 0.00018417263784791335, + "loss": 1.3211, + "step": 5755 + }, + { + "epoch": 0.2061346177950472, + "grad_norm": 1.2324556112289429, + "learning_rate": 0.0001841663749242069, + "loss": 1.3296, + "step": 5756 + }, + { + "epoch": 0.20617042992461546, + "grad_norm": 1.3846900463104248, + "learning_rate": 0.0001841601108681505, + "loss": 1.0979, + "step": 5757 + }, + { + "epoch": 0.20620624205418375, + "grad_norm": 1.3352599143981934, + "learning_rate": 0.00018415384567982833, + "loss": 1.2382, + "step": 5758 + }, + { + "epoch": 0.20624205418375205, + "grad_norm": 1.5511225461959839, + "learning_rate": 0.0001841475793593247, + "loss": 1.1205, + "step": 5759 + }, + { + "epoch": 0.20627786631332032, + "grad_norm": 1.6957182884216309, + "learning_rate": 0.00018414131190672394, + "loss": 1.1837, + "step": 5760 + }, + { + "epoch": 0.20631367844288862, + "grad_norm": 2.000352621078491, + "learning_rate": 0.00018413504332211037, + "loss": 1.1646, + "step": 5761 + }, + { + "epoch": 0.20634949057245688, + "grad_norm": 1.856191873550415, + "learning_rate": 0.00018412877360556834, + "loss": 1.2529, + "step": 5762 + }, + { + "epoch": 0.20638530270202518, + "grad_norm": 1.682483434677124, + "learning_rate": 0.00018412250275718218, + "loss": 1.2097, + "step": 5763 + }, + { + "epoch": 0.20642111483159345, + "grad_norm": 1.4870235919952393, + "learning_rate": 0.00018411623077703624, + "loss": 1.3195, + "step": 5764 + }, + { + "epoch": 0.20645692696116175, + "grad_norm": 1.940490484237671, + "learning_rate": 0.0001841099576652149, + "loss": 1.3608, + "step": 5765 + }, + { + "epoch": 0.20649273909073004, + "grad_norm": 1.434676170349121, + "learning_rate": 0.00018410368342180263, + "loss": 1.0984, + "step": 5766 + }, + { + "epoch": 0.2065285512202983, + "grad_norm": 1.7190890312194824, + "learning_rate": 0.00018409740804688373, + "loss": 1.0951, + "step": 5767 + }, + { + "epoch": 0.2065643633498666, + "grad_norm": 1.3451989889144897, + "learning_rate": 0.0001840911315405427, + "loss": 1.0593, + "step": 5768 + }, + { + "epoch": 0.20660017547943488, + "grad_norm": 1.4363806247711182, + "learning_rate": 0.00018408485390286397, + "loss": 1.2249, + "step": 5769 + }, + { + "epoch": 0.20663598760900317, + "grad_norm": 1.7446151971817017, + "learning_rate": 0.00018407857513393197, + "loss": 1.1991, + "step": 5770 + }, + { + "epoch": 0.20667179973857144, + "grad_norm": 1.8351322412490845, + "learning_rate": 0.00018407229523383122, + "loss": 1.2527, + "step": 5771 + }, + { + "epoch": 0.20670761186813974, + "grad_norm": 1.6528198719024658, + "learning_rate": 0.00018406601420264618, + "loss": 1.0792, + "step": 5772 + }, + { + "epoch": 0.20674342399770804, + "grad_norm": 1.6726816892623901, + "learning_rate": 0.00018405973204046135, + "loss": 1.4012, + "step": 5773 + }, + { + "epoch": 0.2067792361272763, + "grad_norm": 1.9644086360931396, + "learning_rate": 0.00018405344874736126, + "loss": 1.0534, + "step": 5774 + }, + { + "epoch": 0.2068150482568446, + "grad_norm": 1.2206050157546997, + "learning_rate": 0.00018404716432343044, + "loss": 1.136, + "step": 5775 + }, + { + "epoch": 0.20685086038641287, + "grad_norm": 1.478287935256958, + "learning_rate": 0.0001840408787687534, + "loss": 0.95, + "step": 5776 + }, + { + "epoch": 0.20688667251598117, + "grad_norm": 1.5981229543685913, + "learning_rate": 0.0001840345920834148, + "loss": 1.1961, + "step": 5777 + }, + { + "epoch": 0.20692248464554944, + "grad_norm": 1.6242845058441162, + "learning_rate": 0.00018402830426749914, + "loss": 1.0915, + "step": 5778 + }, + { + "epoch": 0.20695829677511773, + "grad_norm": 1.5030514001846313, + "learning_rate": 0.00018402201532109102, + "loss": 1.2637, + "step": 5779 + }, + { + "epoch": 0.20699410890468603, + "grad_norm": 1.6898037195205688, + "learning_rate": 0.00018401572524427505, + "loss": 1.4727, + "step": 5780 + }, + { + "epoch": 0.2070299210342543, + "grad_norm": 1.6994915008544922, + "learning_rate": 0.0001840094340371359, + "loss": 1.2842, + "step": 5781 + }, + { + "epoch": 0.2070657331638226, + "grad_norm": 1.9566761255264282, + "learning_rate": 0.00018400314169975818, + "loss": 1.0613, + "step": 5782 + }, + { + "epoch": 0.20710154529339087, + "grad_norm": 1.5852376222610474, + "learning_rate": 0.00018399684823222653, + "loss": 0.9789, + "step": 5783 + }, + { + "epoch": 0.20713735742295916, + "grad_norm": 1.5041555166244507, + "learning_rate": 0.00018399055363462562, + "loss": 1.126, + "step": 5784 + }, + { + "epoch": 0.20717316955252743, + "grad_norm": 1.7747312784194946, + "learning_rate": 0.0001839842579070402, + "loss": 0.9781, + "step": 5785 + }, + { + "epoch": 0.20720898168209573, + "grad_norm": 1.3689985275268555, + "learning_rate": 0.0001839779610495549, + "loss": 1.0478, + "step": 5786 + }, + { + "epoch": 0.20724479381166402, + "grad_norm": 1.582324504852295, + "learning_rate": 0.00018397166306225444, + "loss": 1.2046, + "step": 5787 + }, + { + "epoch": 0.2072806059412323, + "grad_norm": 1.4689550399780273, + "learning_rate": 0.00018396536394522359, + "loss": 1.2026, + "step": 5788 + }, + { + "epoch": 0.2073164180708006, + "grad_norm": 1.6899322271347046, + "learning_rate": 0.00018395906369854704, + "loss": 1.1111, + "step": 5789 + }, + { + "epoch": 0.20735223020036886, + "grad_norm": 1.3251663446426392, + "learning_rate": 0.00018395276232230964, + "loss": 1.2245, + "step": 5790 + }, + { + "epoch": 0.20738804232993716, + "grad_norm": 1.926283597946167, + "learning_rate": 0.00018394645981659608, + "loss": 1.1499, + "step": 5791 + }, + { + "epoch": 0.20742385445950542, + "grad_norm": 1.7822729349136353, + "learning_rate": 0.00018394015618149122, + "loss": 1.1268, + "step": 5792 + }, + { + "epoch": 0.20745966658907372, + "grad_norm": 1.4486576318740845, + "learning_rate": 0.00018393385141707977, + "loss": 1.169, + "step": 5793 + }, + { + "epoch": 0.207495478718642, + "grad_norm": 1.509377121925354, + "learning_rate": 0.00018392754552344666, + "loss": 1.018, + "step": 5794 + }, + { + "epoch": 0.2075312908482103, + "grad_norm": 1.4083184003829956, + "learning_rate": 0.00018392123850067668, + "loss": 1.1781, + "step": 5795 + }, + { + "epoch": 0.20756710297777858, + "grad_norm": 1.7459287643432617, + "learning_rate": 0.00018391493034885468, + "loss": 1.2253, + "step": 5796 + }, + { + "epoch": 0.20760291510734685, + "grad_norm": 1.4573302268981934, + "learning_rate": 0.00018390862106806554, + "loss": 1.2302, + "step": 5797 + }, + { + "epoch": 0.20763872723691515, + "grad_norm": 1.8945271968841553, + "learning_rate": 0.00018390231065839414, + "loss": 1.2077, + "step": 5798 + }, + { + "epoch": 0.20767453936648342, + "grad_norm": 1.5822484493255615, + "learning_rate": 0.00018389599911992538, + "loss": 1.0805, + "step": 5799 + }, + { + "epoch": 0.20771035149605171, + "grad_norm": 1.6152626276016235, + "learning_rate": 0.00018388968645274416, + "loss": 1.0887, + "step": 5800 + }, + { + "epoch": 0.20774616362561998, + "grad_norm": 1.326729655265808, + "learning_rate": 0.00018388337265693542, + "loss": 1.1481, + "step": 5801 + }, + { + "epoch": 0.20778197575518828, + "grad_norm": 1.6072100400924683, + "learning_rate": 0.0001838770577325841, + "loss": 1.2513, + "step": 5802 + }, + { + "epoch": 0.20781778788475658, + "grad_norm": 1.4517700672149658, + "learning_rate": 0.00018387074167977517, + "loss": 1.0699, + "step": 5803 + }, + { + "epoch": 0.20785360001432485, + "grad_norm": 1.7080137729644775, + "learning_rate": 0.00018386442449859358, + "loss": 1.1754, + "step": 5804 + }, + { + "epoch": 0.20788941214389314, + "grad_norm": 1.5353113412857056, + "learning_rate": 0.00018385810618912435, + "loss": 1.1174, + "step": 5805 + }, + { + "epoch": 0.2079252242734614, + "grad_norm": 1.6411951780319214, + "learning_rate": 0.00018385178675145246, + "loss": 0.9983, + "step": 5806 + }, + { + "epoch": 0.2079610364030297, + "grad_norm": 1.509759545326233, + "learning_rate": 0.00018384546618566296, + "loss": 1.1045, + "step": 5807 + }, + { + "epoch": 0.20799684853259798, + "grad_norm": 2.2551989555358887, + "learning_rate": 0.00018383914449184084, + "loss": 1.3369, + "step": 5808 + }, + { + "epoch": 0.20803266066216627, + "grad_norm": 1.5289512872695923, + "learning_rate": 0.0001838328216700712, + "loss": 0.9483, + "step": 5809 + }, + { + "epoch": 0.20806847279173457, + "grad_norm": 2.950371026992798, + "learning_rate": 0.00018382649772043908, + "loss": 1.365, + "step": 5810 + }, + { + "epoch": 0.20810428492130284, + "grad_norm": 1.6345412731170654, + "learning_rate": 0.00018382017264302955, + "loss": 1.3626, + "step": 5811 + }, + { + "epoch": 0.20814009705087114, + "grad_norm": 1.544666051864624, + "learning_rate": 0.0001838138464379277, + "loss": 0.9959, + "step": 5812 + }, + { + "epoch": 0.2081759091804394, + "grad_norm": 1.6480481624603271, + "learning_rate": 0.0001838075191052187, + "loss": 1.4355, + "step": 5813 + }, + { + "epoch": 0.2082117213100077, + "grad_norm": 1.6931613683700562, + "learning_rate": 0.0001838011906449876, + "loss": 1.1853, + "step": 5814 + }, + { + "epoch": 0.20824753343957597, + "grad_norm": 1.3422468900680542, + "learning_rate": 0.0001837948610573196, + "loss": 1.122, + "step": 5815 + }, + { + "epoch": 0.20828334556914427, + "grad_norm": 1.8201117515563965, + "learning_rate": 0.0001837885303422998, + "loss": 1.3325, + "step": 5816 + }, + { + "epoch": 0.20831915769871256, + "grad_norm": 1.4026964902877808, + "learning_rate": 0.00018378219850001345, + "loss": 1.1604, + "step": 5817 + }, + { + "epoch": 0.20835496982828083, + "grad_norm": 1.3656692504882812, + "learning_rate": 0.00018377586553054565, + "loss": 1.2867, + "step": 5818 + }, + { + "epoch": 0.20839078195784913, + "grad_norm": 1.5778876543045044, + "learning_rate": 0.00018376953143398167, + "loss": 1.193, + "step": 5819 + }, + { + "epoch": 0.2084265940874174, + "grad_norm": 2.0215296745300293, + "learning_rate": 0.00018376319621040668, + "loss": 1.106, + "step": 5820 + }, + { + "epoch": 0.2084624062169857, + "grad_norm": 2.11993145942688, + "learning_rate": 0.00018375685985990594, + "loss": 1.5468, + "step": 5821 + }, + { + "epoch": 0.20849821834655397, + "grad_norm": 1.7853769063949585, + "learning_rate": 0.00018375052238256466, + "loss": 1.3175, + "step": 5822 + }, + { + "epoch": 0.20853403047612226, + "grad_norm": 1.980393648147583, + "learning_rate": 0.00018374418377846817, + "loss": 1.426, + "step": 5823 + }, + { + "epoch": 0.20856984260569056, + "grad_norm": 1.8124537467956543, + "learning_rate": 0.0001837378440477017, + "loss": 1.2873, + "step": 5824 + }, + { + "epoch": 0.20860565473525883, + "grad_norm": 1.523603081703186, + "learning_rate": 0.00018373150319035055, + "loss": 1.3263, + "step": 5825 + }, + { + "epoch": 0.20864146686482712, + "grad_norm": 1.655837059020996, + "learning_rate": 0.00018372516120650003, + "loss": 1.2292, + "step": 5826 + }, + { + "epoch": 0.2086772789943954, + "grad_norm": 1.5114275217056274, + "learning_rate": 0.00018371881809623545, + "loss": 1.1364, + "step": 5827 + }, + { + "epoch": 0.2087130911239637, + "grad_norm": 1.327652096748352, + "learning_rate": 0.0001837124738596422, + "loss": 1.2685, + "step": 5828 + }, + { + "epoch": 0.20874890325353196, + "grad_norm": 1.307649850845337, + "learning_rate": 0.00018370612849680557, + "loss": 1.2733, + "step": 5829 + }, + { + "epoch": 0.20878471538310026, + "grad_norm": 1.8177465200424194, + "learning_rate": 0.00018369978200781094, + "loss": 1.2267, + "step": 5830 + }, + { + "epoch": 0.20882052751266855, + "grad_norm": 1.6402497291564941, + "learning_rate": 0.00018369343439274372, + "loss": 1.3753, + "step": 5831 + }, + { + "epoch": 0.20885633964223682, + "grad_norm": 2.0513672828674316, + "learning_rate": 0.0001836870856516893, + "loss": 1.1256, + "step": 5832 + }, + { + "epoch": 0.20889215177180512, + "grad_norm": 1.3707754611968994, + "learning_rate": 0.0001836807357847331, + "loss": 1.0518, + "step": 5833 + }, + { + "epoch": 0.2089279639013734, + "grad_norm": 1.699007272720337, + "learning_rate": 0.00018367438479196055, + "loss": 1.2551, + "step": 5834 + }, + { + "epoch": 0.20896377603094168, + "grad_norm": 2.3364737033843994, + "learning_rate": 0.00018366803267345704, + "loss": 1.2596, + "step": 5835 + }, + { + "epoch": 0.20899958816050995, + "grad_norm": 1.1402415037155151, + "learning_rate": 0.0001836616794293081, + "loss": 1.1289, + "step": 5836 + }, + { + "epoch": 0.20903540029007825, + "grad_norm": 1.762323021888733, + "learning_rate": 0.00018365532505959918, + "loss": 1.2, + "step": 5837 + }, + { + "epoch": 0.20907121241964655, + "grad_norm": 1.6822434663772583, + "learning_rate": 0.00018364896956441577, + "loss": 1.2384, + "step": 5838 + }, + { + "epoch": 0.20910702454921481, + "grad_norm": 1.5170358419418335, + "learning_rate": 0.00018364261294384336, + "loss": 1.2277, + "step": 5839 + }, + { + "epoch": 0.2091428366787831, + "grad_norm": 1.2710429430007935, + "learning_rate": 0.0001836362551979675, + "loss": 1.1457, + "step": 5840 + }, + { + "epoch": 0.20917864880835138, + "grad_norm": 1.7667030096054077, + "learning_rate": 0.00018362989632687374, + "loss": 1.3821, + "step": 5841 + }, + { + "epoch": 0.20921446093791968, + "grad_norm": 1.4056719541549683, + "learning_rate": 0.00018362353633064754, + "loss": 1.229, + "step": 5842 + }, + { + "epoch": 0.20925027306748795, + "grad_norm": 2.0240862369537354, + "learning_rate": 0.00018361717520937458, + "loss": 1.4328, + "step": 5843 + }, + { + "epoch": 0.20928608519705624, + "grad_norm": 1.5466883182525635, + "learning_rate": 0.00018361081296314037, + "loss": 1.352, + "step": 5844 + }, + { + "epoch": 0.20932189732662454, + "grad_norm": 1.4761698246002197, + "learning_rate": 0.0001836044495920305, + "loss": 1.129, + "step": 5845 + }, + { + "epoch": 0.2093577094561928, + "grad_norm": 1.299889087677002, + "learning_rate": 0.00018359808509613062, + "loss": 1.0836, + "step": 5846 + }, + { + "epoch": 0.2093935215857611, + "grad_norm": 1.339091420173645, + "learning_rate": 0.00018359171947552631, + "loss": 1.0285, + "step": 5847 + }, + { + "epoch": 0.20942933371532937, + "grad_norm": 2.758042812347412, + "learning_rate": 0.00018358535273030327, + "loss": 1.284, + "step": 5848 + }, + { + "epoch": 0.20946514584489767, + "grad_norm": 1.7179064750671387, + "learning_rate": 0.0001835789848605471, + "loss": 1.0727, + "step": 5849 + }, + { + "epoch": 0.20950095797446594, + "grad_norm": 1.7606709003448486, + "learning_rate": 0.00018357261586634353, + "loss": 1.2229, + "step": 5850 + }, + { + "epoch": 0.20953677010403424, + "grad_norm": 1.6163064241409302, + "learning_rate": 0.00018356624574777822, + "loss": 1.1418, + "step": 5851 + }, + { + "epoch": 0.20957258223360253, + "grad_norm": 1.6309394836425781, + "learning_rate": 0.0001835598745049368, + "loss": 1.175, + "step": 5852 + }, + { + "epoch": 0.2096083943631708, + "grad_norm": 1.7484463453292847, + "learning_rate": 0.00018355350213790513, + "loss": 1.1945, + "step": 5853 + }, + { + "epoch": 0.2096442064927391, + "grad_norm": 1.5130842924118042, + "learning_rate": 0.00018354712864676885, + "loss": 1.0617, + "step": 5854 + }, + { + "epoch": 0.20968001862230737, + "grad_norm": 1.6597434282302856, + "learning_rate": 0.00018354075403161367, + "loss": 1.0573, + "step": 5855 + }, + { + "epoch": 0.20971583075187566, + "grad_norm": 2.05460524559021, + "learning_rate": 0.00018353437829252543, + "loss": 1.0941, + "step": 5856 + }, + { + "epoch": 0.20975164288144393, + "grad_norm": 1.5204064846038818, + "learning_rate": 0.00018352800142958992, + "loss": 1.1645, + "step": 5857 + }, + { + "epoch": 0.20978745501101223, + "grad_norm": 1.3446040153503418, + "learning_rate": 0.00018352162344289284, + "loss": 1.137, + "step": 5858 + }, + { + "epoch": 0.20982326714058053, + "grad_norm": 1.8675928115844727, + "learning_rate": 0.0001835152443325201, + "loss": 1.1931, + "step": 5859 + }, + { + "epoch": 0.2098590792701488, + "grad_norm": 1.4325242042541504, + "learning_rate": 0.00018350886409855744, + "loss": 1.3039, + "step": 5860 + }, + { + "epoch": 0.2098948913997171, + "grad_norm": 1.4989653825759888, + "learning_rate": 0.00018350248274109077, + "loss": 1.3777, + "step": 5861 + }, + { + "epoch": 0.20993070352928536, + "grad_norm": 1.287725567817688, + "learning_rate": 0.00018349610026020585, + "loss": 1.0466, + "step": 5862 + }, + { + "epoch": 0.20996651565885366, + "grad_norm": 1.627543568611145, + "learning_rate": 0.00018348971665598865, + "loss": 0.9576, + "step": 5863 + }, + { + "epoch": 0.21000232778842193, + "grad_norm": 1.7564266920089722, + "learning_rate": 0.000183483331928525, + "loss": 1.2513, + "step": 5864 + }, + { + "epoch": 0.21003813991799022, + "grad_norm": 1.6362595558166504, + "learning_rate": 0.00018347694607790077, + "loss": 1.1802, + "step": 5865 + }, + { + "epoch": 0.21007395204755852, + "grad_norm": 1.4015835523605347, + "learning_rate": 0.00018347055910420193, + "loss": 1.2374, + "step": 5866 + }, + { + "epoch": 0.2101097641771268, + "grad_norm": 2.206071376800537, + "learning_rate": 0.0001834641710075144, + "loss": 1.0997, + "step": 5867 + }, + { + "epoch": 0.2101455763066951, + "grad_norm": 1.9626057147979736, + "learning_rate": 0.0001834577817879241, + "loss": 0.9687, + "step": 5868 + }, + { + "epoch": 0.21018138843626336, + "grad_norm": 1.4613183736801147, + "learning_rate": 0.000183451391445517, + "loss": 1.2096, + "step": 5869 + }, + { + "epoch": 0.21021720056583165, + "grad_norm": 1.3751953840255737, + "learning_rate": 0.00018344499998037907, + "loss": 1.1914, + "step": 5870 + }, + { + "epoch": 0.21025301269539992, + "grad_norm": 1.4876588582992554, + "learning_rate": 0.0001834386073925963, + "loss": 1.2188, + "step": 5871 + }, + { + "epoch": 0.21028882482496822, + "grad_norm": 1.3358885049819946, + "learning_rate": 0.0001834322136822547, + "loss": 0.9269, + "step": 5872 + }, + { + "epoch": 0.21032463695453651, + "grad_norm": 1.3979681730270386, + "learning_rate": 0.00018342581884944027, + "loss": 1.2304, + "step": 5873 + }, + { + "epoch": 0.21036044908410478, + "grad_norm": 1.1786569356918335, + "learning_rate": 0.0001834194228942391, + "loss": 1.0957, + "step": 5874 + }, + { + "epoch": 0.21039626121367308, + "grad_norm": 1.527829647064209, + "learning_rate": 0.00018341302581673715, + "loss": 1.0016, + "step": 5875 + }, + { + "epoch": 0.21043207334324135, + "grad_norm": 2.0664775371551514, + "learning_rate": 0.00018340662761702055, + "loss": 1.3801, + "step": 5876 + }, + { + "epoch": 0.21046788547280965, + "grad_norm": 1.4661636352539062, + "learning_rate": 0.00018340022829517537, + "loss": 1.3154, + "step": 5877 + }, + { + "epoch": 0.21050369760237791, + "grad_norm": 1.681605577468872, + "learning_rate": 0.00018339382785128767, + "loss": 1.323, + "step": 5878 + }, + { + "epoch": 0.2105395097319462, + "grad_norm": 1.5225132703781128, + "learning_rate": 0.00018338742628544363, + "loss": 1.3471, + "step": 5879 + }, + { + "epoch": 0.2105753218615145, + "grad_norm": 1.5649735927581787, + "learning_rate": 0.0001833810235977293, + "loss": 1.2027, + "step": 5880 + }, + { + "epoch": 0.21061113399108278, + "grad_norm": 1.2944961786270142, + "learning_rate": 0.00018337461978823084, + "loss": 1.2111, + "step": 5881 + }, + { + "epoch": 0.21064694612065107, + "grad_norm": 1.7747769355773926, + "learning_rate": 0.00018336821485703445, + "loss": 1.103, + "step": 5882 + }, + { + "epoch": 0.21068275825021934, + "grad_norm": 1.2406574487686157, + "learning_rate": 0.00018336180880422625, + "loss": 1.2722, + "step": 5883 + }, + { + "epoch": 0.21071857037978764, + "grad_norm": 2.385615110397339, + "learning_rate": 0.00018335540162989244, + "loss": 1.3261, + "step": 5884 + }, + { + "epoch": 0.2107543825093559, + "grad_norm": 1.535895824432373, + "learning_rate": 0.00018334899333411926, + "loss": 1.2118, + "step": 5885 + }, + { + "epoch": 0.2107901946389242, + "grad_norm": 1.5190480947494507, + "learning_rate": 0.00018334258391699285, + "loss": 1.3231, + "step": 5886 + }, + { + "epoch": 0.2108260067684925, + "grad_norm": 1.4603886604309082, + "learning_rate": 0.00018333617337859946, + "loss": 1.1679, + "step": 5887 + }, + { + "epoch": 0.21086181889806077, + "grad_norm": 1.4174939393997192, + "learning_rate": 0.00018332976171902537, + "loss": 1.1923, + "step": 5888 + }, + { + "epoch": 0.21089763102762907, + "grad_norm": 1.6904760599136353, + "learning_rate": 0.00018332334893835683, + "loss": 1.0417, + "step": 5889 + }, + { + "epoch": 0.21093344315719734, + "grad_norm": 1.8942592144012451, + "learning_rate": 0.00018331693503668013, + "loss": 1.3193, + "step": 5890 + }, + { + "epoch": 0.21096925528676563, + "grad_norm": 1.4962157011032104, + "learning_rate": 0.00018331052001408152, + "loss": 1.0898, + "step": 5891 + }, + { + "epoch": 0.2110050674163339, + "grad_norm": 1.3931103944778442, + "learning_rate": 0.0001833041038706473, + "loss": 1.1796, + "step": 5892 + }, + { + "epoch": 0.2110408795459022, + "grad_norm": 1.885072112083435, + "learning_rate": 0.00018329768660646384, + "loss": 1.2425, + "step": 5893 + }, + { + "epoch": 0.21107669167547047, + "grad_norm": 1.3456915616989136, + "learning_rate": 0.00018329126822161747, + "loss": 1.1115, + "step": 5894 + }, + { + "epoch": 0.21111250380503876, + "grad_norm": 1.197265625, + "learning_rate": 0.0001832848487161945, + "loss": 1.1865, + "step": 5895 + }, + { + "epoch": 0.21114831593460706, + "grad_norm": 1.3523898124694824, + "learning_rate": 0.00018327842809028134, + "loss": 1.0751, + "step": 5896 + }, + { + "epoch": 0.21118412806417533, + "grad_norm": 1.5174789428710938, + "learning_rate": 0.00018327200634396434, + "loss": 1.1912, + "step": 5897 + }, + { + "epoch": 0.21121994019374363, + "grad_norm": 1.3614903688430786, + "learning_rate": 0.0001832655834773299, + "loss": 1.0873, + "step": 5898 + }, + { + "epoch": 0.2112557523233119, + "grad_norm": 1.3308957815170288, + "learning_rate": 0.00018325915949046444, + "loss": 1.0417, + "step": 5899 + }, + { + "epoch": 0.2112915644528802, + "grad_norm": 1.1764222383499146, + "learning_rate": 0.00018325273438345437, + "loss": 1.1594, + "step": 5900 + }, + { + "epoch": 0.21132737658244846, + "grad_norm": 1.7407574653625488, + "learning_rate": 0.0001832463081563862, + "loss": 1.2228, + "step": 5901 + }, + { + "epoch": 0.21136318871201676, + "grad_norm": 1.5726957321166992, + "learning_rate": 0.00018323988080934628, + "loss": 1.2498, + "step": 5902 + }, + { + "epoch": 0.21139900084158506, + "grad_norm": 1.468012809753418, + "learning_rate": 0.00018323345234242118, + "loss": 1.1564, + "step": 5903 + }, + { + "epoch": 0.21143481297115332, + "grad_norm": 1.5947173833847046, + "learning_rate": 0.0001832270227556973, + "loss": 0.842, + "step": 5904 + }, + { + "epoch": 0.21147062510072162, + "grad_norm": 1.186497449874878, + "learning_rate": 0.0001832205920492612, + "loss": 1.1561, + "step": 5905 + }, + { + "epoch": 0.2115064372302899, + "grad_norm": 2.2919352054595947, + "learning_rate": 0.0001832141602231994, + "loss": 1.1681, + "step": 5906 + }, + { + "epoch": 0.2115422493598582, + "grad_norm": 1.8020861148834229, + "learning_rate": 0.0001832077272775984, + "loss": 1.3281, + "step": 5907 + }, + { + "epoch": 0.21157806148942646, + "grad_norm": 1.9128295183181763, + "learning_rate": 0.0001832012932125448, + "loss": 1.3364, + "step": 5908 + }, + { + "epoch": 0.21161387361899475, + "grad_norm": 1.631540298461914, + "learning_rate": 0.00018319485802812503, + "loss": 1.267, + "step": 5909 + }, + { + "epoch": 0.21164968574856305, + "grad_norm": 1.7278393507003784, + "learning_rate": 0.00018318842172442582, + "loss": 1.2768, + "step": 5910 + }, + { + "epoch": 0.21168549787813132, + "grad_norm": 1.651802659034729, + "learning_rate": 0.0001831819843015337, + "loss": 1.1853, + "step": 5911 + }, + { + "epoch": 0.21172131000769961, + "grad_norm": 1.8009216785430908, + "learning_rate": 0.00018317554575953527, + "loss": 1.1516, + "step": 5912 + }, + { + "epoch": 0.21175712213726788, + "grad_norm": 1.7880243062973022, + "learning_rate": 0.00018316910609851713, + "loss": 1.0592, + "step": 5913 + }, + { + "epoch": 0.21179293426683618, + "grad_norm": 1.2784663438796997, + "learning_rate": 0.00018316266531856598, + "loss": 1.2911, + "step": 5914 + }, + { + "epoch": 0.21182874639640445, + "grad_norm": 1.459008812904358, + "learning_rate": 0.00018315622341976844, + "loss": 1.2537, + "step": 5915 + }, + { + "epoch": 0.21186455852597275, + "grad_norm": 1.6422984600067139, + "learning_rate": 0.0001831497804022112, + "loss": 1.0794, + "step": 5916 + }, + { + "epoch": 0.21190037065554104, + "grad_norm": 3.2136332988739014, + "learning_rate": 0.00018314333626598089, + "loss": 1.1846, + "step": 5917 + }, + { + "epoch": 0.2119361827851093, + "grad_norm": 1.6987121105194092, + "learning_rate": 0.0001831368910111642, + "loss": 1.1333, + "step": 5918 + }, + { + "epoch": 0.2119719949146776, + "grad_norm": 2.4710848331451416, + "learning_rate": 0.00018313044463784793, + "loss": 1.1367, + "step": 5919 + }, + { + "epoch": 0.21200780704424588, + "grad_norm": 1.9521551132202148, + "learning_rate": 0.00018312399714611876, + "loss": 1.2814, + "step": 5920 + }, + { + "epoch": 0.21204361917381417, + "grad_norm": 1.438788652420044, + "learning_rate": 0.00018311754853606344, + "loss": 1.259, + "step": 5921 + }, + { + "epoch": 0.21207943130338244, + "grad_norm": 1.5629158020019531, + "learning_rate": 0.00018311109880776868, + "loss": 1.3061, + "step": 5922 + }, + { + "epoch": 0.21211524343295074, + "grad_norm": 1.4570329189300537, + "learning_rate": 0.00018310464796132133, + "loss": 1.2352, + "step": 5923 + }, + { + "epoch": 0.21215105556251904, + "grad_norm": 1.7005980014801025, + "learning_rate": 0.0001830981959968081, + "loss": 1.2727, + "step": 5924 + }, + { + "epoch": 0.2121868676920873, + "grad_norm": 1.7779288291931152, + "learning_rate": 0.00018309174291431587, + "loss": 1.2765, + "step": 5925 + }, + { + "epoch": 0.2122226798216556, + "grad_norm": 1.449654459953308, + "learning_rate": 0.00018308528871393138, + "loss": 1.0216, + "step": 5926 + }, + { + "epoch": 0.21225849195122387, + "grad_norm": 1.5592617988586426, + "learning_rate": 0.00018307883339574153, + "loss": 1.1165, + "step": 5927 + }, + { + "epoch": 0.21229430408079217, + "grad_norm": 2.385946035385132, + "learning_rate": 0.00018307237695983314, + "loss": 1.1329, + "step": 5928 + }, + { + "epoch": 0.21233011621036044, + "grad_norm": 1.4196093082427979, + "learning_rate": 0.00018306591940629307, + "loss": 1.0812, + "step": 5929 + }, + { + "epoch": 0.21236592833992873, + "grad_norm": 1.3206510543823242, + "learning_rate": 0.00018305946073520822, + "loss": 1.0606, + "step": 5930 + }, + { + "epoch": 0.21240174046949703, + "grad_norm": 1.3282872438430786, + "learning_rate": 0.00018305300094666543, + "loss": 1.1236, + "step": 5931 + }, + { + "epoch": 0.2124375525990653, + "grad_norm": 1.7955880165100098, + "learning_rate": 0.00018304654004075167, + "loss": 1.2966, + "step": 5932 + }, + { + "epoch": 0.2124733647286336, + "grad_norm": 1.6051276922225952, + "learning_rate": 0.0001830400780175538, + "loss": 1.0167, + "step": 5933 + }, + { + "epoch": 0.21250917685820186, + "grad_norm": 1.5578813552856445, + "learning_rate": 0.00018303361487715883, + "loss": 1.0984, + "step": 5934 + }, + { + "epoch": 0.21254498898777016, + "grad_norm": 1.2830477952957153, + "learning_rate": 0.00018302715061965365, + "loss": 1.3548, + "step": 5935 + }, + { + "epoch": 0.21258080111733843, + "grad_norm": 1.335598111152649, + "learning_rate": 0.00018302068524512528, + "loss": 0.9548, + "step": 5936 + }, + { + "epoch": 0.21261661324690673, + "grad_norm": 2.051218032836914, + "learning_rate": 0.00018301421875366067, + "loss": 1.3451, + "step": 5937 + }, + { + "epoch": 0.21265242537647502, + "grad_norm": 1.6260132789611816, + "learning_rate": 0.00018300775114534683, + "loss": 1.2098, + "step": 5938 + }, + { + "epoch": 0.2126882375060433, + "grad_norm": 1.7299309968948364, + "learning_rate": 0.00018300128242027078, + "loss": 1.0883, + "step": 5939 + }, + { + "epoch": 0.2127240496356116, + "grad_norm": 1.4014966487884521, + "learning_rate": 0.00018299481257851952, + "loss": 1.3162, + "step": 5940 + }, + { + "epoch": 0.21275986176517986, + "grad_norm": 1.6174920797348022, + "learning_rate": 0.00018298834162018012, + "loss": 1.2443, + "step": 5941 + }, + { + "epoch": 0.21279567389474816, + "grad_norm": 2.025808095932007, + "learning_rate": 0.00018298186954533962, + "loss": 1.0517, + "step": 5942 + }, + { + "epoch": 0.21283148602431642, + "grad_norm": 3.668114423751831, + "learning_rate": 0.00018297539635408512, + "loss": 1.1735, + "step": 5943 + }, + { + "epoch": 0.21286729815388472, + "grad_norm": 1.8668879270553589, + "learning_rate": 0.00018296892204650367, + "loss": 1.2723, + "step": 5944 + }, + { + "epoch": 0.21290311028345302, + "grad_norm": 1.3148396015167236, + "learning_rate": 0.00018296244662268241, + "loss": 1.2732, + "step": 5945 + }, + { + "epoch": 0.2129389224130213, + "grad_norm": 1.3471726179122925, + "learning_rate": 0.00018295597008270847, + "loss": 1.2274, + "step": 5946 + }, + { + "epoch": 0.21297473454258958, + "grad_norm": 1.561423659324646, + "learning_rate": 0.00018294949242666895, + "loss": 1.2417, + "step": 5947 + }, + { + "epoch": 0.21301054667215785, + "grad_norm": 1.8578828573226929, + "learning_rate": 0.00018294301365465095, + "loss": 1.2567, + "step": 5948 + }, + { + "epoch": 0.21304635880172615, + "grad_norm": 1.5483378171920776, + "learning_rate": 0.00018293653376674177, + "loss": 1.1206, + "step": 5949 + }, + { + "epoch": 0.21308217093129442, + "grad_norm": 1.3357279300689697, + "learning_rate": 0.00018293005276302844, + "loss": 1.039, + "step": 5950 + }, + { + "epoch": 0.21311798306086271, + "grad_norm": 1.6707149744033813, + "learning_rate": 0.00018292357064359828, + "loss": 1.3274, + "step": 5951 + }, + { + "epoch": 0.213153795190431, + "grad_norm": 1.8392035961151123, + "learning_rate": 0.0001829170874085384, + "loss": 1.2073, + "step": 5952 + }, + { + "epoch": 0.21318960731999928, + "grad_norm": 1.3935143947601318, + "learning_rate": 0.00018291060305793608, + "loss": 1.1552, + "step": 5953 + }, + { + "epoch": 0.21322541944956758, + "grad_norm": 1.7574388980865479, + "learning_rate": 0.00018290411759187855, + "loss": 1.3328, + "step": 5954 + }, + { + "epoch": 0.21326123157913585, + "grad_norm": 2.401430606842041, + "learning_rate": 0.00018289763101045302, + "loss": 1.2141, + "step": 5955 + }, + { + "epoch": 0.21329704370870414, + "grad_norm": 2.0606865882873535, + "learning_rate": 0.00018289114331374685, + "loss": 1.1697, + "step": 5956 + }, + { + "epoch": 0.2133328558382724, + "grad_norm": 1.3911174535751343, + "learning_rate": 0.00018288465450184722, + "loss": 1.0673, + "step": 5957 + }, + { + "epoch": 0.2133686679678407, + "grad_norm": 1.8833529949188232, + "learning_rate": 0.0001828781645748415, + "loss": 1.2293, + "step": 5958 + }, + { + "epoch": 0.213404480097409, + "grad_norm": 1.2478693723678589, + "learning_rate": 0.00018287167353281698, + "loss": 1.0451, + "step": 5959 + }, + { + "epoch": 0.21344029222697727, + "grad_norm": 1.4047200679779053, + "learning_rate": 0.000182865181375861, + "loss": 0.9504, + "step": 5960 + }, + { + "epoch": 0.21347610435654557, + "grad_norm": 1.564349889755249, + "learning_rate": 0.0001828586881040609, + "loss": 1.2625, + "step": 5961 + }, + { + "epoch": 0.21351191648611384, + "grad_norm": 1.3700848817825317, + "learning_rate": 0.00018285219371750398, + "loss": 1.0777, + "step": 5962 + }, + { + "epoch": 0.21354772861568214, + "grad_norm": 1.8085166215896606, + "learning_rate": 0.0001828456982162777, + "loss": 1.2992, + "step": 5963 + }, + { + "epoch": 0.2135835407452504, + "grad_norm": 1.8577463626861572, + "learning_rate": 0.0001828392016004694, + "loss": 1.3197, + "step": 5964 + }, + { + "epoch": 0.2136193528748187, + "grad_norm": 1.5931975841522217, + "learning_rate": 0.00018283270387016654, + "loss": 1.1809, + "step": 5965 + }, + { + "epoch": 0.213655165004387, + "grad_norm": 2.1206068992614746, + "learning_rate": 0.00018282620502545647, + "loss": 1.3595, + "step": 5966 + }, + { + "epoch": 0.21369097713395527, + "grad_norm": 1.3388245105743408, + "learning_rate": 0.00018281970506642663, + "loss": 1.0302, + "step": 5967 + }, + { + "epoch": 0.21372678926352356, + "grad_norm": 2.189361810684204, + "learning_rate": 0.0001828132039931645, + "loss": 1.2489, + "step": 5968 + }, + { + "epoch": 0.21376260139309183, + "grad_norm": 1.2796142101287842, + "learning_rate": 0.00018280670180575754, + "loss": 1.0917, + "step": 5969 + }, + { + "epoch": 0.21379841352266013, + "grad_norm": 1.3251184225082397, + "learning_rate": 0.00018280019850429321, + "loss": 1.0995, + "step": 5970 + }, + { + "epoch": 0.2138342256522284, + "grad_norm": 1.5068703889846802, + "learning_rate": 0.000182793694088859, + "loss": 1.0744, + "step": 5971 + }, + { + "epoch": 0.2138700377817967, + "grad_norm": 2.4754645824432373, + "learning_rate": 0.00018278718855954247, + "loss": 1.3216, + "step": 5972 + }, + { + "epoch": 0.213905849911365, + "grad_norm": 1.7134411334991455, + "learning_rate": 0.00018278068191643107, + "loss": 1.3445, + "step": 5973 + }, + { + "epoch": 0.21394166204093326, + "grad_norm": 1.3397893905639648, + "learning_rate": 0.0001827741741596124, + "loss": 1.1135, + "step": 5974 + }, + { + "epoch": 0.21397747417050156, + "grad_norm": 1.9937351942062378, + "learning_rate": 0.00018276766528917398, + "loss": 1.3676, + "step": 5975 + }, + { + "epoch": 0.21401328630006983, + "grad_norm": 1.62259042263031, + "learning_rate": 0.00018276115530520336, + "loss": 1.2897, + "step": 5976 + }, + { + "epoch": 0.21404909842963812, + "grad_norm": 1.6082234382629395, + "learning_rate": 0.0001827546442077882, + "loss": 1.1268, + "step": 5977 + }, + { + "epoch": 0.2140849105592064, + "grad_norm": 1.7217257022857666, + "learning_rate": 0.000182748131997016, + "loss": 1.1048, + "step": 5978 + }, + { + "epoch": 0.2141207226887747, + "grad_norm": 1.3838475942611694, + "learning_rate": 0.00018274161867297447, + "loss": 1.292, + "step": 5979 + }, + { + "epoch": 0.214156534818343, + "grad_norm": 1.3099256753921509, + "learning_rate": 0.00018273510423575117, + "loss": 1.1662, + "step": 5980 + }, + { + "epoch": 0.21419234694791126, + "grad_norm": 1.4702374935150146, + "learning_rate": 0.00018272858868543374, + "loss": 1.0449, + "step": 5981 + }, + { + "epoch": 0.21422815907747955, + "grad_norm": 1.9855237007141113, + "learning_rate": 0.00018272207202210986, + "loss": 1.1535, + "step": 5982 + }, + { + "epoch": 0.21426397120704782, + "grad_norm": 1.4967195987701416, + "learning_rate": 0.00018271555424586723, + "loss": 1.3424, + "step": 5983 + }, + { + "epoch": 0.21429978333661612, + "grad_norm": 1.6590744256973267, + "learning_rate": 0.0001827090353567935, + "loss": 1.1625, + "step": 5984 + }, + { + "epoch": 0.2143355954661844, + "grad_norm": 1.6192785501480103, + "learning_rate": 0.0001827025153549764, + "loss": 1.3295, + "step": 5985 + }, + { + "epoch": 0.21437140759575268, + "grad_norm": 1.442513108253479, + "learning_rate": 0.00018269599424050362, + "loss": 1.2109, + "step": 5986 + }, + { + "epoch": 0.21440721972532098, + "grad_norm": 1.5017144680023193, + "learning_rate": 0.00018268947201346291, + "loss": 1.1684, + "step": 5987 + }, + { + "epoch": 0.21444303185488925, + "grad_norm": 1.3882544040679932, + "learning_rate": 0.00018268294867394204, + "loss": 1.1722, + "step": 5988 + }, + { + "epoch": 0.21447884398445755, + "grad_norm": 1.8731000423431396, + "learning_rate": 0.00018267642422202873, + "loss": 1.3751, + "step": 5989 + }, + { + "epoch": 0.21451465611402581, + "grad_norm": 1.7082462310791016, + "learning_rate": 0.00018266989865781076, + "loss": 1.2543, + "step": 5990 + }, + { + "epoch": 0.2145504682435941, + "grad_norm": 1.4769957065582275, + "learning_rate": 0.00018266337198137594, + "loss": 1.2006, + "step": 5991 + }, + { + "epoch": 0.21458628037316238, + "grad_norm": 1.6424782276153564, + "learning_rate": 0.00018265684419281213, + "loss": 1.3092, + "step": 5992 + }, + { + "epoch": 0.21462209250273068, + "grad_norm": 1.458537220954895, + "learning_rate": 0.00018265031529220705, + "loss": 1.1276, + "step": 5993 + }, + { + "epoch": 0.21465790463229895, + "grad_norm": 1.337550163269043, + "learning_rate": 0.0001826437852796486, + "loss": 1.1529, + "step": 5994 + }, + { + "epoch": 0.21469371676186724, + "grad_norm": 2.006822109222412, + "learning_rate": 0.00018263725415522462, + "loss": 1.1233, + "step": 5995 + }, + { + "epoch": 0.21472952889143554, + "grad_norm": 1.9410041570663452, + "learning_rate": 0.000182630721919023, + "loss": 1.1741, + "step": 5996 + }, + { + "epoch": 0.2147653410210038, + "grad_norm": 1.5222933292388916, + "learning_rate": 0.00018262418857113157, + "loss": 1.3509, + "step": 5997 + }, + { + "epoch": 0.2148011531505721, + "grad_norm": 1.4486408233642578, + "learning_rate": 0.00018261765411163827, + "loss": 1.3074, + "step": 5998 + }, + { + "epoch": 0.21483696528014037, + "grad_norm": 1.4212745428085327, + "learning_rate": 0.000182611118540631, + "loss": 1.0888, + "step": 5999 + }, + { + "epoch": 0.21487277740970867, + "grad_norm": 1.2979916334152222, + "learning_rate": 0.00018260458185819772, + "loss": 1.1595, + "step": 6000 + }, + { + "epoch": 0.21490858953927694, + "grad_norm": 1.1907687187194824, + "learning_rate": 0.00018259804406442633, + "loss": 1.026, + "step": 6001 + }, + { + "epoch": 0.21494440166884524, + "grad_norm": 2.079052448272705, + "learning_rate": 0.0001825915051594048, + "loss": 1.3202, + "step": 6002 + }, + { + "epoch": 0.21498021379841353, + "grad_norm": 1.615185022354126, + "learning_rate": 0.0001825849651432211, + "loss": 1.0647, + "step": 6003 + }, + { + "epoch": 0.2150160259279818, + "grad_norm": 1.3417376279830933, + "learning_rate": 0.0001825784240159632, + "loss": 1.0955, + "step": 6004 + }, + { + "epoch": 0.2150518380575501, + "grad_norm": 1.725746512413025, + "learning_rate": 0.00018257188177771914, + "loss": 1.2275, + "step": 6005 + }, + { + "epoch": 0.21508765018711837, + "grad_norm": 1.703240156173706, + "learning_rate": 0.00018256533842857695, + "loss": 1.0154, + "step": 6006 + }, + { + "epoch": 0.21512346231668666, + "grad_norm": 1.5877310037612915, + "learning_rate": 0.0001825587939686246, + "loss": 1.2157, + "step": 6007 + }, + { + "epoch": 0.21515927444625493, + "grad_norm": 1.3762083053588867, + "learning_rate": 0.00018255224839795018, + "loss": 1.2853, + "step": 6008 + }, + { + "epoch": 0.21519508657582323, + "grad_norm": 1.5469129085540771, + "learning_rate": 0.00018254570171664174, + "loss": 1.0446, + "step": 6009 + }, + { + "epoch": 0.21523089870539153, + "grad_norm": 1.31468665599823, + "learning_rate": 0.00018253915392478737, + "loss": 1.1348, + "step": 6010 + }, + { + "epoch": 0.2152667108349598, + "grad_norm": 2.21403169631958, + "learning_rate": 0.00018253260502247513, + "loss": 1.3616, + "step": 6011 + }, + { + "epoch": 0.2153025229645281, + "grad_norm": 1.7870758771896362, + "learning_rate": 0.00018252605500979316, + "loss": 1.3442, + "step": 6012 + }, + { + "epoch": 0.21533833509409636, + "grad_norm": 1.6176189184188843, + "learning_rate": 0.00018251950388682958, + "loss": 1.1761, + "step": 6013 + }, + { + "epoch": 0.21537414722366466, + "grad_norm": 1.3922330141067505, + "learning_rate": 0.0001825129516536725, + "loss": 1.1098, + "step": 6014 + }, + { + "epoch": 0.21540995935323293, + "grad_norm": 1.9538958072662354, + "learning_rate": 0.0001825063983104101, + "loss": 1.2693, + "step": 6015 + }, + { + "epoch": 0.21544577148280122, + "grad_norm": 2.611274003982544, + "learning_rate": 0.00018249984385713055, + "loss": 1.3974, + "step": 6016 + }, + { + "epoch": 0.21548158361236952, + "grad_norm": 1.7044440507888794, + "learning_rate": 0.000182493288293922, + "loss": 1.1269, + "step": 6017 + }, + { + "epoch": 0.2155173957419378, + "grad_norm": 1.4571449756622314, + "learning_rate": 0.00018248673162087268, + "loss": 1.2139, + "step": 6018 + }, + { + "epoch": 0.2155532078715061, + "grad_norm": 1.520183801651001, + "learning_rate": 0.00018248017383807076, + "loss": 1.1208, + "step": 6019 + }, + { + "epoch": 0.21558902000107436, + "grad_norm": 1.557117223739624, + "learning_rate": 0.0001824736149456045, + "loss": 1.2129, + "step": 6020 + }, + { + "epoch": 0.21562483213064265, + "grad_norm": 1.6524004936218262, + "learning_rate": 0.00018246705494356214, + "loss": 1.0319, + "step": 6021 + }, + { + "epoch": 0.21566064426021092, + "grad_norm": 1.7376459836959839, + "learning_rate": 0.00018246049383203192, + "loss": 1.1389, + "step": 6022 + }, + { + "epoch": 0.21569645638977922, + "grad_norm": 1.7697045803070068, + "learning_rate": 0.00018245393161110215, + "loss": 0.8622, + "step": 6023 + }, + { + "epoch": 0.21573226851934751, + "grad_norm": 1.4529592990875244, + "learning_rate": 0.00018244736828086107, + "loss": 0.9531, + "step": 6024 + }, + { + "epoch": 0.21576808064891578, + "grad_norm": 1.5195668935775757, + "learning_rate": 0.00018244080384139698, + "loss": 1.2667, + "step": 6025 + }, + { + "epoch": 0.21580389277848408, + "grad_norm": 1.4062906503677368, + "learning_rate": 0.00018243423829279824, + "loss": 1.228, + "step": 6026 + }, + { + "epoch": 0.21583970490805235, + "grad_norm": 1.4088739156723022, + "learning_rate": 0.00018242767163515318, + "loss": 1.1528, + "step": 6027 + }, + { + "epoch": 0.21587551703762065, + "grad_norm": 1.4596867561340332, + "learning_rate": 0.00018242110386855007, + "loss": 0.8913, + "step": 6028 + }, + { + "epoch": 0.21591132916718891, + "grad_norm": 1.388340950012207, + "learning_rate": 0.00018241453499307734, + "loss": 1.0773, + "step": 6029 + }, + { + "epoch": 0.2159471412967572, + "grad_norm": 1.9575836658477783, + "learning_rate": 0.00018240796500882338, + "loss": 1.3912, + "step": 6030 + }, + { + "epoch": 0.2159829534263255, + "grad_norm": 2.1397805213928223, + "learning_rate": 0.0001824013939158765, + "loss": 1.3917, + "step": 6031 + }, + { + "epoch": 0.21601876555589378, + "grad_norm": 1.241409182548523, + "learning_rate": 0.0001823948217143252, + "loss": 1.2939, + "step": 6032 + }, + { + "epoch": 0.21605457768546207, + "grad_norm": 1.4355294704437256, + "learning_rate": 0.00018238824840425785, + "loss": 1.0698, + "step": 6033 + }, + { + "epoch": 0.21609038981503034, + "grad_norm": 1.7295721769332886, + "learning_rate": 0.00018238167398576286, + "loss": 1.0513, + "step": 6034 + }, + { + "epoch": 0.21612620194459864, + "grad_norm": 2.10087513923645, + "learning_rate": 0.00018237509845892873, + "loss": 1.2063, + "step": 6035 + }, + { + "epoch": 0.2161620140741669, + "grad_norm": 1.4729150533676147, + "learning_rate": 0.00018236852182384393, + "loss": 1.2004, + "step": 6036 + }, + { + "epoch": 0.2161978262037352, + "grad_norm": 1.5544703006744385, + "learning_rate": 0.00018236194408059685, + "loss": 0.9106, + "step": 6037 + }, + { + "epoch": 0.2162336383333035, + "grad_norm": 1.5639525651931763, + "learning_rate": 0.00018235536522927611, + "loss": 1.2734, + "step": 6038 + }, + { + "epoch": 0.21626945046287177, + "grad_norm": 1.3623335361480713, + "learning_rate": 0.00018234878526997015, + "loss": 1.0626, + "step": 6039 + }, + { + "epoch": 0.21630526259244007, + "grad_norm": 1.3152885437011719, + "learning_rate": 0.0001823422042027675, + "loss": 1.2622, + "step": 6040 + }, + { + "epoch": 0.21634107472200834, + "grad_norm": 1.7479197978973389, + "learning_rate": 0.0001823356220277567, + "loss": 1.1753, + "step": 6041 + }, + { + "epoch": 0.21637688685157663, + "grad_norm": 1.7686129808425903, + "learning_rate": 0.00018232903874502632, + "loss": 1.0503, + "step": 6042 + }, + { + "epoch": 0.2164126989811449, + "grad_norm": 1.7411669492721558, + "learning_rate": 0.00018232245435466493, + "loss": 1.2054, + "step": 6043 + }, + { + "epoch": 0.2164485111107132, + "grad_norm": 1.718023419380188, + "learning_rate": 0.0001823158688567611, + "loss": 1.1854, + "step": 6044 + }, + { + "epoch": 0.2164843232402815, + "grad_norm": 1.4845569133758545, + "learning_rate": 0.00018230928225140342, + "loss": 1.3647, + "step": 6045 + }, + { + "epoch": 0.21652013536984976, + "grad_norm": 1.7571004629135132, + "learning_rate": 0.00018230269453868052, + "loss": 1.0928, + "step": 6046 + }, + { + "epoch": 0.21655594749941806, + "grad_norm": 1.6386191844940186, + "learning_rate": 0.00018229610571868102, + "loss": 1.5063, + "step": 6047 + }, + { + "epoch": 0.21659175962898633, + "grad_norm": 1.1858868598937988, + "learning_rate": 0.0001822895157914936, + "loss": 1.3004, + "step": 6048 + }, + { + "epoch": 0.21662757175855463, + "grad_norm": 1.3876453638076782, + "learning_rate": 0.00018228292475720687, + "loss": 1.2866, + "step": 6049 + }, + { + "epoch": 0.2166633838881229, + "grad_norm": 1.4315376281738281, + "learning_rate": 0.00018227633261590955, + "loss": 1.2025, + "step": 6050 + }, + { + "epoch": 0.2166991960176912, + "grad_norm": 1.4029706716537476, + "learning_rate": 0.00018226973936769027, + "loss": 1.2031, + "step": 6051 + }, + { + "epoch": 0.2167350081472595, + "grad_norm": 1.8706666231155396, + "learning_rate": 0.0001822631450126378, + "loss": 0.9924, + "step": 6052 + }, + { + "epoch": 0.21677082027682776, + "grad_norm": 1.5407401323318481, + "learning_rate": 0.00018225654955084079, + "loss": 1.2639, + "step": 6053 + }, + { + "epoch": 0.21680663240639605, + "grad_norm": 1.4469127655029297, + "learning_rate": 0.00018224995298238804, + "loss": 1.2486, + "step": 6054 + }, + { + "epoch": 0.21684244453596432, + "grad_norm": 1.7935962677001953, + "learning_rate": 0.00018224335530736825, + "loss": 1.3826, + "step": 6055 + }, + { + "epoch": 0.21687825666553262, + "grad_norm": 1.7432937622070312, + "learning_rate": 0.0001822367565258702, + "loss": 1.3368, + "step": 6056 + }, + { + "epoch": 0.2169140687951009, + "grad_norm": 1.3258392810821533, + "learning_rate": 0.0001822301566379827, + "loss": 1.2861, + "step": 6057 + }, + { + "epoch": 0.2169498809246692, + "grad_norm": 1.4058440923690796, + "learning_rate": 0.00018222355564379448, + "loss": 0.9821, + "step": 6058 + }, + { + "epoch": 0.21698569305423748, + "grad_norm": 1.4108790159225464, + "learning_rate": 0.00018221695354339435, + "loss": 1.0847, + "step": 6059 + }, + { + "epoch": 0.21702150518380575, + "grad_norm": 1.690443754196167, + "learning_rate": 0.00018221035033687123, + "loss": 1.1047, + "step": 6060 + }, + { + "epoch": 0.21705731731337405, + "grad_norm": 1.8489917516708374, + "learning_rate": 0.00018220374602431386, + "loss": 1.2788, + "step": 6061 + }, + { + "epoch": 0.21709312944294232, + "grad_norm": 1.4927116632461548, + "learning_rate": 0.0001821971406058111, + "loss": 1.2465, + "step": 6062 + }, + { + "epoch": 0.21712894157251061, + "grad_norm": 1.4604675769805908, + "learning_rate": 0.00018219053408145185, + "loss": 1.0098, + "step": 6063 + }, + { + "epoch": 0.21716475370207888, + "grad_norm": 1.429654598236084, + "learning_rate": 0.000182183926451325, + "loss": 1.1772, + "step": 6064 + }, + { + "epoch": 0.21720056583164718, + "grad_norm": 1.7137852907180786, + "learning_rate": 0.00018217731771551942, + "loss": 1.2631, + "step": 6065 + }, + { + "epoch": 0.21723637796121548, + "grad_norm": 1.7385077476501465, + "learning_rate": 0.00018217070787412404, + "loss": 1.1189, + "step": 6066 + }, + { + "epoch": 0.21727219009078375, + "grad_norm": 1.6179896593093872, + "learning_rate": 0.00018216409692722779, + "loss": 1.3419, + "step": 6067 + }, + { + "epoch": 0.21730800222035204, + "grad_norm": 1.6328660249710083, + "learning_rate": 0.00018215748487491958, + "loss": 1.1016, + "step": 6068 + }, + { + "epoch": 0.2173438143499203, + "grad_norm": 2.35105562210083, + "learning_rate": 0.00018215087171728837, + "loss": 1.5361, + "step": 6069 + }, + { + "epoch": 0.2173796264794886, + "grad_norm": 1.5921425819396973, + "learning_rate": 0.00018214425745442317, + "loss": 1.3574, + "step": 6070 + }, + { + "epoch": 0.21741543860905688, + "grad_norm": 1.624632716178894, + "learning_rate": 0.00018213764208641292, + "loss": 1.1898, + "step": 6071 + }, + { + "epoch": 0.21745125073862517, + "grad_norm": 1.8851943016052246, + "learning_rate": 0.00018213102561334668, + "loss": 1.1072, + "step": 6072 + }, + { + "epoch": 0.21748706286819347, + "grad_norm": 1.8186404705047607, + "learning_rate": 0.00018212440803531342, + "loss": 1.178, + "step": 6073 + }, + { + "epoch": 0.21752287499776174, + "grad_norm": 1.7398995161056519, + "learning_rate": 0.00018211778935240219, + "loss": 1.2297, + "step": 6074 + }, + { + "epoch": 0.21755868712733004, + "grad_norm": 1.4142756462097168, + "learning_rate": 0.00018211116956470203, + "loss": 1.0626, + "step": 6075 + }, + { + "epoch": 0.2175944992568983, + "grad_norm": 1.4644103050231934, + "learning_rate": 0.00018210454867230195, + "loss": 1.245, + "step": 6076 + }, + { + "epoch": 0.2176303113864666, + "grad_norm": 1.921090006828308, + "learning_rate": 0.00018209792667529112, + "loss": 1.2783, + "step": 6077 + }, + { + "epoch": 0.21766612351603487, + "grad_norm": 1.6927887201309204, + "learning_rate": 0.00018209130357375858, + "loss": 1.387, + "step": 6078 + }, + { + "epoch": 0.21770193564560317, + "grad_norm": 1.298884391784668, + "learning_rate": 0.00018208467936779347, + "loss": 1.3791, + "step": 6079 + }, + { + "epoch": 0.21773774777517146, + "grad_norm": 2.5529892444610596, + "learning_rate": 0.00018207805405748482, + "loss": 1.1886, + "step": 6080 + }, + { + "epoch": 0.21777355990473973, + "grad_norm": 1.3965200185775757, + "learning_rate": 0.00018207142764292187, + "loss": 1.1356, + "step": 6081 + }, + { + "epoch": 0.21780937203430803, + "grad_norm": 1.326150894165039, + "learning_rate": 0.00018206480012419372, + "loss": 1.2634, + "step": 6082 + }, + { + "epoch": 0.2178451841638763, + "grad_norm": 1.9085038900375366, + "learning_rate": 0.0001820581715013895, + "loss": 1.1775, + "step": 6083 + }, + { + "epoch": 0.2178809962934446, + "grad_norm": 1.564134955406189, + "learning_rate": 0.0001820515417745985, + "loss": 1.1025, + "step": 6084 + }, + { + "epoch": 0.21791680842301286, + "grad_norm": 1.2887275218963623, + "learning_rate": 0.0001820449109439098, + "loss": 1.1958, + "step": 6085 + }, + { + "epoch": 0.21795262055258116, + "grad_norm": 1.5944937467575073, + "learning_rate": 0.00018203827900941264, + "loss": 1.2906, + "step": 6086 + }, + { + "epoch": 0.21798843268214946, + "grad_norm": 1.6305270195007324, + "learning_rate": 0.0001820316459711963, + "loss": 1.1959, + "step": 6087 + }, + { + "epoch": 0.21802424481171773, + "grad_norm": 1.6168540716171265, + "learning_rate": 0.0001820250118293499, + "loss": 1.4525, + "step": 6088 + }, + { + "epoch": 0.21806005694128602, + "grad_norm": 1.424505591392517, + "learning_rate": 0.00018201837658396287, + "loss": 1.1874, + "step": 6089 + }, + { + "epoch": 0.2180958690708543, + "grad_norm": 1.8222310543060303, + "learning_rate": 0.00018201174023512433, + "loss": 1.1459, + "step": 6090 + }, + { + "epoch": 0.2181316812004226, + "grad_norm": 1.760902762413025, + "learning_rate": 0.0001820051027829236, + "loss": 1.0037, + "step": 6091 + }, + { + "epoch": 0.21816749332999086, + "grad_norm": 1.3264780044555664, + "learning_rate": 0.00018199846422745002, + "loss": 1.1926, + "step": 6092 + }, + { + "epoch": 0.21820330545955915, + "grad_norm": 1.206947922706604, + "learning_rate": 0.00018199182456879286, + "loss": 1.1044, + "step": 6093 + }, + { + "epoch": 0.21823911758912742, + "grad_norm": 1.6771348714828491, + "learning_rate": 0.00018198518380704143, + "loss": 1.4152, + "step": 6094 + }, + { + "epoch": 0.21827492971869572, + "grad_norm": 1.5694631338119507, + "learning_rate": 0.00018197854194228517, + "loss": 1.3148, + "step": 6095 + }, + { + "epoch": 0.21831074184826402, + "grad_norm": 1.830426573753357, + "learning_rate": 0.00018197189897461332, + "loss": 1.0874, + "step": 6096 + }, + { + "epoch": 0.2183465539778323, + "grad_norm": 1.4226763248443604, + "learning_rate": 0.00018196525490411534, + "loss": 1.1146, + "step": 6097 + }, + { + "epoch": 0.21838236610740058, + "grad_norm": 1.930633783340454, + "learning_rate": 0.00018195860973088058, + "loss": 1.2485, + "step": 6098 + }, + { + "epoch": 0.21841817823696885, + "grad_norm": 1.5981216430664062, + "learning_rate": 0.00018195196345499842, + "loss": 1.34, + "step": 6099 + }, + { + "epoch": 0.21845399036653715, + "grad_norm": 1.9425640106201172, + "learning_rate": 0.00018194531607655833, + "loss": 1.0736, + "step": 6100 + }, + { + "epoch": 0.21848980249610542, + "grad_norm": 1.2703111171722412, + "learning_rate": 0.0001819386675956497, + "loss": 1.09, + "step": 6101 + }, + { + "epoch": 0.21852561462567371, + "grad_norm": 1.4724929332733154, + "learning_rate": 0.000181932018012362, + "loss": 1.2747, + "step": 6102 + }, + { + "epoch": 0.218561426755242, + "grad_norm": 1.4707927703857422, + "learning_rate": 0.00018192536732678468, + "loss": 1.1684, + "step": 6103 + }, + { + "epoch": 0.21859723888481028, + "grad_norm": 1.7345246076583862, + "learning_rate": 0.00018191871553900718, + "loss": 1.2684, + "step": 6104 + }, + { + "epoch": 0.21863305101437858, + "grad_norm": 1.6982179880142212, + "learning_rate": 0.00018191206264911908, + "loss": 1.1042, + "step": 6105 + }, + { + "epoch": 0.21866886314394685, + "grad_norm": 1.7817472219467163, + "learning_rate": 0.0001819054086572098, + "loss": 1.2669, + "step": 6106 + }, + { + "epoch": 0.21870467527351514, + "grad_norm": 1.5483341217041016, + "learning_rate": 0.00018189875356336893, + "loss": 1.1814, + "step": 6107 + }, + { + "epoch": 0.2187404874030834, + "grad_norm": 1.6386258602142334, + "learning_rate": 0.00018189209736768595, + "loss": 1.1493, + "step": 6108 + }, + { + "epoch": 0.2187762995326517, + "grad_norm": 1.917580485343933, + "learning_rate": 0.00018188544007025043, + "loss": 1.2468, + "step": 6109 + }, + { + "epoch": 0.21881211166222, + "grad_norm": 1.440403699874878, + "learning_rate": 0.00018187878167115197, + "loss": 1.2287, + "step": 6110 + }, + { + "epoch": 0.21884792379178827, + "grad_norm": 1.458876132965088, + "learning_rate": 0.00018187212217048008, + "loss": 1.3054, + "step": 6111 + }, + { + "epoch": 0.21888373592135657, + "grad_norm": 1.8171433210372925, + "learning_rate": 0.00018186546156832444, + "loss": 1.2894, + "step": 6112 + }, + { + "epoch": 0.21891954805092484, + "grad_norm": 1.4996439218521118, + "learning_rate": 0.00018185879986477456, + "loss": 1.0722, + "step": 6113 + }, + { + "epoch": 0.21895536018049314, + "grad_norm": 1.8206934928894043, + "learning_rate": 0.00018185213705992014, + "loss": 1.207, + "step": 6114 + }, + { + "epoch": 0.2189911723100614, + "grad_norm": 1.630703330039978, + "learning_rate": 0.00018184547315385082, + "loss": 1.4004, + "step": 6115 + }, + { + "epoch": 0.2190269844396297, + "grad_norm": 1.2376163005828857, + "learning_rate": 0.0001818388081466562, + "loss": 1.0784, + "step": 6116 + }, + { + "epoch": 0.219062796569198, + "grad_norm": 1.5051442384719849, + "learning_rate": 0.00018183214203842601, + "loss": 1.4563, + "step": 6117 + }, + { + "epoch": 0.21909860869876627, + "grad_norm": 1.645464539527893, + "learning_rate": 0.00018182547482924988, + "loss": 1.1104, + "step": 6118 + }, + { + "epoch": 0.21913442082833456, + "grad_norm": 1.610579490661621, + "learning_rate": 0.00018181880651921755, + "loss": 1.2442, + "step": 6119 + }, + { + "epoch": 0.21917023295790283, + "grad_norm": 1.3105393648147583, + "learning_rate": 0.0001818121371084187, + "loss": 1.207, + "step": 6120 + }, + { + "epoch": 0.21920604508747113, + "grad_norm": 1.2061383724212646, + "learning_rate": 0.00018180546659694307, + "loss": 1.2235, + "step": 6121 + }, + { + "epoch": 0.2192418572170394, + "grad_norm": 1.368411660194397, + "learning_rate": 0.0001817987949848804, + "loss": 1.2133, + "step": 6122 + }, + { + "epoch": 0.2192776693466077, + "grad_norm": 2.632964611053467, + "learning_rate": 0.0001817921222723205, + "loss": 1.3322, + "step": 6123 + }, + { + "epoch": 0.219313481476176, + "grad_norm": 1.345131516456604, + "learning_rate": 0.00018178544845935308, + "loss": 1.2761, + "step": 6124 + }, + { + "epoch": 0.21934929360574426, + "grad_norm": 1.3546663522720337, + "learning_rate": 0.00018177877354606797, + "loss": 1.064, + "step": 6125 + }, + { + "epoch": 0.21938510573531256, + "grad_norm": 1.5753769874572754, + "learning_rate": 0.00018177209753255492, + "loss": 1.1218, + "step": 6126 + }, + { + "epoch": 0.21942091786488083, + "grad_norm": 1.5388144254684448, + "learning_rate": 0.00018176542041890376, + "loss": 1.1833, + "step": 6127 + }, + { + "epoch": 0.21945672999444912, + "grad_norm": 1.4409021139144897, + "learning_rate": 0.00018175874220520438, + "loss": 1.0971, + "step": 6128 + }, + { + "epoch": 0.2194925421240174, + "grad_norm": 1.8211675882339478, + "learning_rate": 0.00018175206289154655, + "loss": 1.2226, + "step": 6129 + }, + { + "epoch": 0.2195283542535857, + "grad_norm": 1.3824049234390259, + "learning_rate": 0.00018174538247802015, + "loss": 0.8818, + "step": 6130 + }, + { + "epoch": 0.21956416638315399, + "grad_norm": 1.4139151573181152, + "learning_rate": 0.00018173870096471512, + "loss": 1.2518, + "step": 6131 + }, + { + "epoch": 0.21959997851272225, + "grad_norm": 1.7601318359375, + "learning_rate": 0.00018173201835172128, + "loss": 1.4242, + "step": 6132 + }, + { + "epoch": 0.21963579064229055, + "grad_norm": 1.228637456893921, + "learning_rate": 0.00018172533463912857, + "loss": 1.0923, + "step": 6133 + }, + { + "epoch": 0.21967160277185882, + "grad_norm": 1.645239233970642, + "learning_rate": 0.00018171864982702692, + "loss": 1.0369, + "step": 6134 + }, + { + "epoch": 0.21970741490142712, + "grad_norm": 1.6247820854187012, + "learning_rate": 0.0001817119639155062, + "loss": 1.097, + "step": 6135 + }, + { + "epoch": 0.21974322703099539, + "grad_norm": 1.7392942905426025, + "learning_rate": 0.00018170527690465643, + "loss": 1.1566, + "step": 6136 + }, + { + "epoch": 0.21977903916056368, + "grad_norm": 1.7204903364181519, + "learning_rate": 0.00018169858879456757, + "loss": 1.2578, + "step": 6137 + }, + { + "epoch": 0.21981485129013198, + "grad_norm": 1.563789963722229, + "learning_rate": 0.00018169189958532953, + "loss": 1.2287, + "step": 6138 + }, + { + "epoch": 0.21985066341970025, + "grad_norm": 1.8435378074645996, + "learning_rate": 0.0001816852092770324, + "loss": 1.1816, + "step": 6139 + }, + { + "epoch": 0.21988647554926855, + "grad_norm": 1.4963282346725464, + "learning_rate": 0.00018167851786976612, + "loss": 1.1354, + "step": 6140 + }, + { + "epoch": 0.21992228767883681, + "grad_norm": 1.3961191177368164, + "learning_rate": 0.00018167182536362074, + "loss": 1.1309, + "step": 6141 + }, + { + "epoch": 0.2199580998084051, + "grad_norm": 1.506338357925415, + "learning_rate": 0.00018166513175868633, + "loss": 1.1194, + "step": 6142 + }, + { + "epoch": 0.21999391193797338, + "grad_norm": 1.5676851272583008, + "learning_rate": 0.0001816584370550529, + "loss": 0.9419, + "step": 6143 + }, + { + "epoch": 0.22002972406754168, + "grad_norm": 1.9994237422943115, + "learning_rate": 0.00018165174125281053, + "loss": 1.0974, + "step": 6144 + }, + { + "epoch": 0.22006553619710997, + "grad_norm": 1.5551707744598389, + "learning_rate": 0.0001816450443520493, + "loss": 1.124, + "step": 6145 + }, + { + "epoch": 0.22010134832667824, + "grad_norm": 1.5168503522872925, + "learning_rate": 0.00018163834635285931, + "loss": 0.9392, + "step": 6146 + }, + { + "epoch": 0.22013716045624654, + "grad_norm": 2.125554084777832, + "learning_rate": 0.00018163164725533068, + "loss": 1.3631, + "step": 6147 + }, + { + "epoch": 0.2201729725858148, + "grad_norm": 2.1406049728393555, + "learning_rate": 0.0001816249470595535, + "loss": 1.1743, + "step": 6148 + }, + { + "epoch": 0.2202087847153831, + "grad_norm": 1.7066822052001953, + "learning_rate": 0.000181618245765618, + "loss": 1.1526, + "step": 6149 + }, + { + "epoch": 0.22024459684495137, + "grad_norm": 1.4997525215148926, + "learning_rate": 0.00018161154337361426, + "loss": 1.1771, + "step": 6150 + }, + { + "epoch": 0.22028040897451967, + "grad_norm": 1.4738956689834595, + "learning_rate": 0.00018160483988363248, + "loss": 1.3557, + "step": 6151 + }, + { + "epoch": 0.22031622110408797, + "grad_norm": 1.4356935024261475, + "learning_rate": 0.00018159813529576284, + "loss": 1.4047, + "step": 6152 + }, + { + "epoch": 0.22035203323365624, + "grad_norm": 1.5905729532241821, + "learning_rate": 0.0001815914296100955, + "loss": 1.3393, + "step": 6153 + }, + { + "epoch": 0.22038784536322453, + "grad_norm": 1.7778306007385254, + "learning_rate": 0.00018158472282672078, + "loss": 1.2564, + "step": 6154 + }, + { + "epoch": 0.2204236574927928, + "grad_norm": 1.4404395818710327, + "learning_rate": 0.00018157801494572885, + "loss": 1.211, + "step": 6155 + }, + { + "epoch": 0.2204594696223611, + "grad_norm": 1.4709093570709229, + "learning_rate": 0.00018157130596720996, + "loss": 1.2028, + "step": 6156 + }, + { + "epoch": 0.22049528175192937, + "grad_norm": 1.4092514514923096, + "learning_rate": 0.0001815645958912543, + "loss": 1.0152, + "step": 6157 + }, + { + "epoch": 0.22053109388149766, + "grad_norm": 1.8910341262817383, + "learning_rate": 0.0001815578847179523, + "loss": 1.2186, + "step": 6158 + }, + { + "epoch": 0.22056690601106596, + "grad_norm": 1.7492977380752563, + "learning_rate": 0.0001815511724473941, + "loss": 1.2689, + "step": 6159 + }, + { + "epoch": 0.22060271814063423, + "grad_norm": 1.8436650037765503, + "learning_rate": 0.0001815444590796701, + "loss": 1.072, + "step": 6160 + }, + { + "epoch": 0.22063853027020253, + "grad_norm": 2.165684223175049, + "learning_rate": 0.00018153774461487058, + "loss": 1.2427, + "step": 6161 + }, + { + "epoch": 0.2206743423997708, + "grad_norm": 1.5918545722961426, + "learning_rate": 0.00018153102905308589, + "loss": 1.3226, + "step": 6162 + }, + { + "epoch": 0.2207101545293391, + "grad_norm": 1.138702154159546, + "learning_rate": 0.00018152431239440637, + "loss": 1.2199, + "step": 6163 + }, + { + "epoch": 0.22074596665890736, + "grad_norm": 1.620971441268921, + "learning_rate": 0.00018151759463892235, + "loss": 1.3117, + "step": 6164 + }, + { + "epoch": 0.22078177878847566, + "grad_norm": 1.688135027885437, + "learning_rate": 0.00018151087578672427, + "loss": 1.1642, + "step": 6165 + }, + { + "epoch": 0.22081759091804395, + "grad_norm": 1.640334963798523, + "learning_rate": 0.00018150415583790253, + "loss": 0.9944, + "step": 6166 + }, + { + "epoch": 0.22085340304761222, + "grad_norm": 1.6061384677886963, + "learning_rate": 0.00018149743479254745, + "loss": 1.2822, + "step": 6167 + }, + { + "epoch": 0.22088921517718052, + "grad_norm": 1.4280778169631958, + "learning_rate": 0.00018149071265074955, + "loss": 1.2172, + "step": 6168 + }, + { + "epoch": 0.2209250273067488, + "grad_norm": 1.4413847923278809, + "learning_rate": 0.0001814839894125992, + "loss": 0.8536, + "step": 6169 + }, + { + "epoch": 0.22096083943631709, + "grad_norm": 1.405049443244934, + "learning_rate": 0.0001814772650781869, + "loss": 1.2526, + "step": 6170 + }, + { + "epoch": 0.22099665156588535, + "grad_norm": 1.787839412689209, + "learning_rate": 0.0001814705396476031, + "loss": 1.0109, + "step": 6171 + }, + { + "epoch": 0.22103246369545365, + "grad_norm": 1.4489576816558838, + "learning_rate": 0.00018146381312093826, + "loss": 1.0953, + "step": 6172 + }, + { + "epoch": 0.22106827582502195, + "grad_norm": 1.5132304430007935, + "learning_rate": 0.00018145708549828287, + "loss": 1.4079, + "step": 6173 + }, + { + "epoch": 0.22110408795459022, + "grad_norm": 1.3405301570892334, + "learning_rate": 0.00018145035677972753, + "loss": 1.2673, + "step": 6174 + }, + { + "epoch": 0.2211399000841585, + "grad_norm": 1.4821919202804565, + "learning_rate": 0.00018144362696536267, + "loss": 1.1983, + "step": 6175 + }, + { + "epoch": 0.22117571221372678, + "grad_norm": 1.7805500030517578, + "learning_rate": 0.00018143689605527885, + "loss": 0.9299, + "step": 6176 + }, + { + "epoch": 0.22121152434329508, + "grad_norm": 1.7311859130859375, + "learning_rate": 0.00018143016404956669, + "loss": 0.9155, + "step": 6177 + }, + { + "epoch": 0.22124733647286335, + "grad_norm": 1.4663119316101074, + "learning_rate": 0.00018142343094831667, + "loss": 1.2997, + "step": 6178 + }, + { + "epoch": 0.22128314860243165, + "grad_norm": 1.426738977432251, + "learning_rate": 0.0001814166967516194, + "loss": 1.0686, + "step": 6179 + }, + { + "epoch": 0.22131896073199994, + "grad_norm": 1.3733510971069336, + "learning_rate": 0.00018140996145956552, + "loss": 0.9534, + "step": 6180 + }, + { + "epoch": 0.2213547728615682, + "grad_norm": 1.5790903568267822, + "learning_rate": 0.00018140322507224563, + "loss": 1.1359, + "step": 6181 + }, + { + "epoch": 0.2213905849911365, + "grad_norm": 1.6846582889556885, + "learning_rate": 0.00018139648758975032, + "loss": 0.9743, + "step": 6182 + }, + { + "epoch": 0.22142639712070478, + "grad_norm": 2.1946330070495605, + "learning_rate": 0.00018138974901217027, + "loss": 1.3454, + "step": 6183 + }, + { + "epoch": 0.22146220925027307, + "grad_norm": 2.0368857383728027, + "learning_rate": 0.00018138300933959615, + "loss": 1.2356, + "step": 6184 + }, + { + "epoch": 0.22149802137984134, + "grad_norm": 2.3810276985168457, + "learning_rate": 0.0001813762685721186, + "loss": 1.142, + "step": 6185 + }, + { + "epoch": 0.22153383350940964, + "grad_norm": 1.5970600843429565, + "learning_rate": 0.00018136952670982833, + "loss": 0.9894, + "step": 6186 + }, + { + "epoch": 0.22156964563897794, + "grad_norm": 1.5154091119766235, + "learning_rate": 0.00018136278375281605, + "loss": 1.179, + "step": 6187 + }, + { + "epoch": 0.2216054577685462, + "grad_norm": 1.6056166887283325, + "learning_rate": 0.00018135603970117242, + "loss": 1.1784, + "step": 6188 + }, + { + "epoch": 0.2216412698981145, + "grad_norm": 1.3035842180252075, + "learning_rate": 0.00018134929455498828, + "loss": 1.1514, + "step": 6189 + }, + { + "epoch": 0.22167708202768277, + "grad_norm": 2.0612964630126953, + "learning_rate": 0.0001813425483143543, + "loss": 1.3291, + "step": 6190 + }, + { + "epoch": 0.22171289415725107, + "grad_norm": 1.8122187852859497, + "learning_rate": 0.00018133580097936123, + "loss": 1.2614, + "step": 6191 + }, + { + "epoch": 0.22174870628681934, + "grad_norm": 1.9474247694015503, + "learning_rate": 0.00018132905255009986, + "loss": 1.1603, + "step": 6192 + }, + { + "epoch": 0.22178451841638763, + "grad_norm": 1.634992241859436, + "learning_rate": 0.00018132230302666104, + "loss": 1.2289, + "step": 6193 + }, + { + "epoch": 0.2218203305459559, + "grad_norm": 1.9024602174758911, + "learning_rate": 0.0001813155524091355, + "loss": 1.0055, + "step": 6194 + }, + { + "epoch": 0.2218561426755242, + "grad_norm": 1.5261272192001343, + "learning_rate": 0.00018130880069761412, + "loss": 1.3058, + "step": 6195 + }, + { + "epoch": 0.2218919548050925, + "grad_norm": 1.4952951669692993, + "learning_rate": 0.00018130204789218769, + "loss": 1.386, + "step": 6196 + }, + { + "epoch": 0.22192776693466076, + "grad_norm": 1.8099796772003174, + "learning_rate": 0.00018129529399294706, + "loss": 1.4152, + "step": 6197 + }, + { + "epoch": 0.22196357906422906, + "grad_norm": 1.5334937572479248, + "learning_rate": 0.00018128853899998312, + "loss": 1.0509, + "step": 6198 + }, + { + "epoch": 0.22199939119379733, + "grad_norm": 1.4724843502044678, + "learning_rate": 0.00018128178291338678, + "loss": 1.3215, + "step": 6199 + }, + { + "epoch": 0.22203520332336563, + "grad_norm": 1.3141013383865356, + "learning_rate": 0.00018127502573324887, + "loss": 1.1044, + "step": 6200 + }, + { + "epoch": 0.2220710154529339, + "grad_norm": 1.51041579246521, + "learning_rate": 0.00018126826745966032, + "loss": 1.2502, + "step": 6201 + }, + { + "epoch": 0.2221068275825022, + "grad_norm": 1.40494704246521, + "learning_rate": 0.00018126150809271208, + "loss": 1.2274, + "step": 6202 + }, + { + "epoch": 0.2221426397120705, + "grad_norm": 1.749536395072937, + "learning_rate": 0.00018125474763249505, + "loss": 1.3843, + "step": 6203 + }, + { + "epoch": 0.22217845184163876, + "grad_norm": 2.3232028484344482, + "learning_rate": 0.00018124798607910018, + "loss": 1.3318, + "step": 6204 + }, + { + "epoch": 0.22221426397120705, + "grad_norm": 1.3128881454467773, + "learning_rate": 0.0001812412234326185, + "loss": 1.1291, + "step": 6205 + }, + { + "epoch": 0.22225007610077532, + "grad_norm": 1.6593819856643677, + "learning_rate": 0.00018123445969314095, + "loss": 1.3807, + "step": 6206 + }, + { + "epoch": 0.22228588823034362, + "grad_norm": 2.2797951698303223, + "learning_rate": 0.00018122769486075854, + "loss": 1.2973, + "step": 6207 + }, + { + "epoch": 0.2223217003599119, + "grad_norm": 1.2514702081680298, + "learning_rate": 0.00018122092893556224, + "loss": 1.1775, + "step": 6208 + }, + { + "epoch": 0.22235751248948019, + "grad_norm": 1.2925174236297607, + "learning_rate": 0.0001812141619176431, + "loss": 1.1756, + "step": 6209 + }, + { + "epoch": 0.22239332461904848, + "grad_norm": 1.7077758312225342, + "learning_rate": 0.00018120739380709218, + "loss": 1.2734, + "step": 6210 + }, + { + "epoch": 0.22242913674861675, + "grad_norm": 1.662938117980957, + "learning_rate": 0.00018120062460400056, + "loss": 1.1202, + "step": 6211 + }, + { + "epoch": 0.22246494887818505, + "grad_norm": 1.5963820219039917, + "learning_rate": 0.00018119385430845925, + "loss": 1.1806, + "step": 6212 + }, + { + "epoch": 0.22250076100775332, + "grad_norm": 1.6635326147079468, + "learning_rate": 0.00018118708292055936, + "loss": 1.4219, + "step": 6213 + }, + { + "epoch": 0.2225365731373216, + "grad_norm": 2.2738752365112305, + "learning_rate": 0.00018118031044039198, + "loss": 1.1892, + "step": 6214 + }, + { + "epoch": 0.22257238526688988, + "grad_norm": 1.3065983057022095, + "learning_rate": 0.00018117353686804825, + "loss": 1.0659, + "step": 6215 + }, + { + "epoch": 0.22260819739645818, + "grad_norm": 1.4280078411102295, + "learning_rate": 0.00018116676220361933, + "loss": 1.2087, + "step": 6216 + }, + { + "epoch": 0.22264400952602648, + "grad_norm": 1.8552905321121216, + "learning_rate": 0.00018115998644719627, + "loss": 1.4259, + "step": 6217 + }, + { + "epoch": 0.22267982165559475, + "grad_norm": 1.20035982131958, + "learning_rate": 0.0001811532095988703, + "loss": 1.0594, + "step": 6218 + }, + { + "epoch": 0.22271563378516304, + "grad_norm": 1.3735891580581665, + "learning_rate": 0.00018114643165873258, + "loss": 1.1311, + "step": 6219 + }, + { + "epoch": 0.2227514459147313, + "grad_norm": 1.6588460206985474, + "learning_rate": 0.00018113965262687426, + "loss": 1.19, + "step": 6220 + }, + { + "epoch": 0.2227872580442996, + "grad_norm": 1.5918787717819214, + "learning_rate": 0.00018113287250338662, + "loss": 1.0107, + "step": 6221 + }, + { + "epoch": 0.22282307017386788, + "grad_norm": 1.3902353048324585, + "learning_rate": 0.0001811260912883608, + "loss": 1.0661, + "step": 6222 + }, + { + "epoch": 0.22285888230343617, + "grad_norm": 1.5725704431533813, + "learning_rate": 0.0001811193089818881, + "loss": 1.0849, + "step": 6223 + }, + { + "epoch": 0.22289469443300447, + "grad_norm": 1.854818344116211, + "learning_rate": 0.0001811125255840597, + "loss": 1.2188, + "step": 6224 + }, + { + "epoch": 0.22293050656257274, + "grad_norm": 1.2977608442306519, + "learning_rate": 0.00018110574109496692, + "loss": 1.0447, + "step": 6225 + }, + { + "epoch": 0.22296631869214104, + "grad_norm": 1.8905771970748901, + "learning_rate": 0.000181098955514701, + "loss": 1.4151, + "step": 6226 + }, + { + "epoch": 0.2230021308217093, + "grad_norm": 1.5767186880111694, + "learning_rate": 0.00018109216884335325, + "loss": 1.2466, + "step": 6227 + }, + { + "epoch": 0.2230379429512776, + "grad_norm": 1.9624028205871582, + "learning_rate": 0.00018108538108101496, + "loss": 1.3582, + "step": 6228 + }, + { + "epoch": 0.22307375508084587, + "grad_norm": 1.4853765964508057, + "learning_rate": 0.00018107859222777747, + "loss": 1.2539, + "step": 6229 + }, + { + "epoch": 0.22310956721041417, + "grad_norm": 1.541543960571289, + "learning_rate": 0.0001810718022837321, + "loss": 1.0494, + "step": 6230 + }, + { + "epoch": 0.22314537933998246, + "grad_norm": 1.6067883968353271, + "learning_rate": 0.00018106501124897024, + "loss": 1.2405, + "step": 6231 + }, + { + "epoch": 0.22318119146955073, + "grad_norm": 1.3110731840133667, + "learning_rate": 0.00018105821912358318, + "loss": 1.1859, + "step": 6232 + }, + { + "epoch": 0.22321700359911903, + "grad_norm": 2.0484087467193604, + "learning_rate": 0.00018105142590766235, + "loss": 1.0709, + "step": 6233 + }, + { + "epoch": 0.2232528157286873, + "grad_norm": 1.74288010597229, + "learning_rate": 0.00018104463160129912, + "loss": 1.1669, + "step": 6234 + }, + { + "epoch": 0.2232886278582556, + "grad_norm": 1.2169049978256226, + "learning_rate": 0.00018103783620458495, + "loss": 0.9558, + "step": 6235 + }, + { + "epoch": 0.22332443998782386, + "grad_norm": 1.5259519815444946, + "learning_rate": 0.0001810310397176112, + "loss": 1.2247, + "step": 6236 + }, + { + "epoch": 0.22336025211739216, + "grad_norm": 1.4007843732833862, + "learning_rate": 0.0001810242421404693, + "loss": 1.1218, + "step": 6237 + }, + { + "epoch": 0.22339606424696046, + "grad_norm": 1.773490309715271, + "learning_rate": 0.00018101744347325078, + "loss": 1.1407, + "step": 6238 + }, + { + "epoch": 0.22343187637652873, + "grad_norm": 1.6592057943344116, + "learning_rate": 0.00018101064371604705, + "loss": 1.1899, + "step": 6239 + }, + { + "epoch": 0.22346768850609702, + "grad_norm": 1.851503849029541, + "learning_rate": 0.0001810038428689496, + "loss": 1.3417, + "step": 6240 + }, + { + "epoch": 0.2235035006356653, + "grad_norm": 1.5078750848770142, + "learning_rate": 0.00018099704093204997, + "loss": 1.2077, + "step": 6241 + }, + { + "epoch": 0.2235393127652336, + "grad_norm": 1.7574124336242676, + "learning_rate": 0.00018099023790543956, + "loss": 1.1576, + "step": 6242 + }, + { + "epoch": 0.22357512489480186, + "grad_norm": 1.9294384717941284, + "learning_rate": 0.00018098343378921002, + "loss": 1.0939, + "step": 6243 + }, + { + "epoch": 0.22361093702437015, + "grad_norm": 1.7331653833389282, + "learning_rate": 0.00018097662858345282, + "loss": 0.9282, + "step": 6244 + }, + { + "epoch": 0.22364674915393845, + "grad_norm": 1.807331919670105, + "learning_rate": 0.00018096982228825957, + "loss": 1.2255, + "step": 6245 + }, + { + "epoch": 0.22368256128350672, + "grad_norm": 1.7921392917633057, + "learning_rate": 0.00018096301490372175, + "loss": 1.249, + "step": 6246 + }, + { + "epoch": 0.22371837341307502, + "grad_norm": 1.6845133304595947, + "learning_rate": 0.00018095620642993106, + "loss": 1.1594, + "step": 6247 + }, + { + "epoch": 0.22375418554264329, + "grad_norm": 9.199856758117676, + "learning_rate": 0.000180949396866979, + "loss": 1.1909, + "step": 6248 + }, + { + "epoch": 0.22378999767221158, + "grad_norm": 1.776192307472229, + "learning_rate": 0.00018094258621495724, + "loss": 1.2223, + "step": 6249 + }, + { + "epoch": 0.22382580980177985, + "grad_norm": 1.4511460065841675, + "learning_rate": 0.00018093577447395737, + "loss": 1.279, + "step": 6250 + }, + { + "epoch": 0.22386162193134815, + "grad_norm": 1.9926121234893799, + "learning_rate": 0.00018092896164407108, + "loss": 1.1129, + "step": 6251 + }, + { + "epoch": 0.22389743406091644, + "grad_norm": 2.1933059692382812, + "learning_rate": 0.00018092214772538994, + "loss": 1.326, + "step": 6252 + }, + { + "epoch": 0.2239332461904847, + "grad_norm": 2.2700133323669434, + "learning_rate": 0.00018091533271800576, + "loss": 1.2273, + "step": 6253 + }, + { + "epoch": 0.223969058320053, + "grad_norm": 1.3883789777755737, + "learning_rate": 0.00018090851662201011, + "loss": 1.164, + "step": 6254 + }, + { + "epoch": 0.22400487044962128, + "grad_norm": 1.2905442714691162, + "learning_rate": 0.00018090169943749476, + "loss": 1.2424, + "step": 6255 + }, + { + "epoch": 0.22404068257918958, + "grad_norm": 1.6342167854309082, + "learning_rate": 0.00018089488116455137, + "loss": 1.3108, + "step": 6256 + }, + { + "epoch": 0.22407649470875785, + "grad_norm": 1.6717005968093872, + "learning_rate": 0.00018088806180327174, + "loss": 1.2799, + "step": 6257 + }, + { + "epoch": 0.22411230683832614, + "grad_norm": 2.133274555206299, + "learning_rate": 0.00018088124135374754, + "loss": 1.0878, + "step": 6258 + }, + { + "epoch": 0.22414811896789444, + "grad_norm": 1.9566408395767212, + "learning_rate": 0.00018087441981607056, + "loss": 0.9837, + "step": 6259 + }, + { + "epoch": 0.2241839310974627, + "grad_norm": 1.585748553276062, + "learning_rate": 0.00018086759719033261, + "loss": 1.1632, + "step": 6260 + }, + { + "epoch": 0.224219743227031, + "grad_norm": 2.3211050033569336, + "learning_rate": 0.00018086077347662544, + "loss": 1.0847, + "step": 6261 + }, + { + "epoch": 0.22425555535659927, + "grad_norm": 1.6721147298812866, + "learning_rate": 0.00018085394867504087, + "loss": 1.3571, + "step": 6262 + }, + { + "epoch": 0.22429136748616757, + "grad_norm": 1.3581912517547607, + "learning_rate": 0.00018084712278567072, + "loss": 1.1793, + "step": 6263 + }, + { + "epoch": 0.22432717961573584, + "grad_norm": 2.019355297088623, + "learning_rate": 0.00018084029580860679, + "loss": 1.3864, + "step": 6264 + }, + { + "epoch": 0.22436299174530414, + "grad_norm": 1.9514970779418945, + "learning_rate": 0.000180833467743941, + "loss": 1.1678, + "step": 6265 + }, + { + "epoch": 0.22439880387487243, + "grad_norm": 1.5085241794586182, + "learning_rate": 0.00018082663859176514, + "loss": 1.0373, + "step": 6266 + }, + { + "epoch": 0.2244346160044407, + "grad_norm": 1.4787169694900513, + "learning_rate": 0.00018081980835217115, + "loss": 1.1308, + "step": 6267 + }, + { + "epoch": 0.224470428134009, + "grad_norm": 1.6092264652252197, + "learning_rate": 0.00018081297702525083, + "loss": 1.3767, + "step": 6268 + }, + { + "epoch": 0.22450624026357727, + "grad_norm": 1.8415021896362305, + "learning_rate": 0.0001808061446110962, + "loss": 1.2447, + "step": 6269 + }, + { + "epoch": 0.22454205239314556, + "grad_norm": 2.3893957138061523, + "learning_rate": 0.0001807993111097991, + "loss": 1.33, + "step": 6270 + }, + { + "epoch": 0.22457786452271383, + "grad_norm": 1.6627132892608643, + "learning_rate": 0.0001807924765214515, + "loss": 1.0387, + "step": 6271 + }, + { + "epoch": 0.22461367665228213, + "grad_norm": 1.9369231462478638, + "learning_rate": 0.00018078564084614534, + "loss": 1.2239, + "step": 6272 + }, + { + "epoch": 0.22464948878185043, + "grad_norm": 1.890769600868225, + "learning_rate": 0.0001807788040839726, + "loss": 1.2705, + "step": 6273 + }, + { + "epoch": 0.2246853009114187, + "grad_norm": 1.4824820756912231, + "learning_rate": 0.0001807719662350252, + "loss": 1.1696, + "step": 6274 + }, + { + "epoch": 0.224721113040987, + "grad_norm": 1.5665630102157593, + "learning_rate": 0.00018076512729939522, + "loss": 1.3403, + "step": 6275 + }, + { + "epoch": 0.22475692517055526, + "grad_norm": 1.6681995391845703, + "learning_rate": 0.00018075828727717464, + "loss": 1.4269, + "step": 6276 + }, + { + "epoch": 0.22479273730012356, + "grad_norm": 1.8709626197814941, + "learning_rate": 0.00018075144616845544, + "loss": 1.2227, + "step": 6277 + }, + { + "epoch": 0.22482854942969183, + "grad_norm": 1.4257827997207642, + "learning_rate": 0.00018074460397332973, + "loss": 1.2658, + "step": 6278 + }, + { + "epoch": 0.22486436155926012, + "grad_norm": 1.367699146270752, + "learning_rate": 0.00018073776069188954, + "loss": 1.1565, + "step": 6279 + }, + { + "epoch": 0.22490017368882842, + "grad_norm": 1.5811935663223267, + "learning_rate": 0.0001807309163242269, + "loss": 0.9634, + "step": 6280 + }, + { + "epoch": 0.2249359858183967, + "grad_norm": 1.6494228839874268, + "learning_rate": 0.0001807240708704339, + "loss": 1.3718, + "step": 6281 + }, + { + "epoch": 0.22497179794796499, + "grad_norm": 1.7860969305038452, + "learning_rate": 0.0001807172243306027, + "loss": 1.2239, + "step": 6282 + }, + { + "epoch": 0.22500761007753325, + "grad_norm": 1.759442687034607, + "learning_rate": 0.00018071037670482532, + "loss": 1.2192, + "step": 6283 + }, + { + "epoch": 0.22504342220710155, + "grad_norm": 1.5037062168121338, + "learning_rate": 0.00018070352799319395, + "loss": 1.1493, + "step": 6284 + }, + { + "epoch": 0.22507923433666982, + "grad_norm": 2.2688794136047363, + "learning_rate": 0.0001806966781958007, + "loss": 1.2428, + "step": 6285 + }, + { + "epoch": 0.22511504646623812, + "grad_norm": 1.9081186056137085, + "learning_rate": 0.00018068982731273773, + "loss": 1.4098, + "step": 6286 + }, + { + "epoch": 0.2251508585958064, + "grad_norm": 1.4602291584014893, + "learning_rate": 0.00018068297534409725, + "loss": 1.2939, + "step": 6287 + }, + { + "epoch": 0.22518667072537468, + "grad_norm": 1.5343518257141113, + "learning_rate": 0.00018067612228997137, + "loss": 1.2431, + "step": 6288 + }, + { + "epoch": 0.22522248285494298, + "grad_norm": 1.4989640712738037, + "learning_rate": 0.00018066926815045236, + "loss": 1.3369, + "step": 6289 + }, + { + "epoch": 0.22525829498451125, + "grad_norm": 1.6013189554214478, + "learning_rate": 0.00018066241292563238, + "loss": 1.0276, + "step": 6290 + }, + { + "epoch": 0.22529410711407954, + "grad_norm": 1.5463844537734985, + "learning_rate": 0.00018065555661560368, + "loss": 1.3069, + "step": 6291 + }, + { + "epoch": 0.2253299192436478, + "grad_norm": 1.5418213605880737, + "learning_rate": 0.00018064869922045852, + "loss": 1.1361, + "step": 6292 + }, + { + "epoch": 0.2253657313732161, + "grad_norm": 2.3628756999969482, + "learning_rate": 0.00018064184074028915, + "loss": 1.1854, + "step": 6293 + }, + { + "epoch": 0.22540154350278438, + "grad_norm": 2.309203624725342, + "learning_rate": 0.0001806349811751878, + "loss": 1.2715, + "step": 6294 + }, + { + "epoch": 0.22543735563235268, + "grad_norm": 1.4673250913619995, + "learning_rate": 0.00018062812052524683, + "loss": 1.1353, + "step": 6295 + }, + { + "epoch": 0.22547316776192097, + "grad_norm": 1.643247365951538, + "learning_rate": 0.00018062125879055846, + "loss": 1.1687, + "step": 6296 + }, + { + "epoch": 0.22550897989148924, + "grad_norm": 1.9224193096160889, + "learning_rate": 0.00018061439597121508, + "loss": 1.2213, + "step": 6297 + }, + { + "epoch": 0.22554479202105754, + "grad_norm": 1.362475872039795, + "learning_rate": 0.000180607532067309, + "loss": 1.0371, + "step": 6298 + }, + { + "epoch": 0.2255806041506258, + "grad_norm": 2.2019073963165283, + "learning_rate": 0.0001806006670789325, + "loss": 1.3377, + "step": 6299 + }, + { + "epoch": 0.2256164162801941, + "grad_norm": 1.7080262899398804, + "learning_rate": 0.00018059380100617802, + "loss": 1.2272, + "step": 6300 + }, + { + "epoch": 0.22565222840976237, + "grad_norm": 2.191504716873169, + "learning_rate": 0.0001805869338491379, + "loss": 1.1099, + "step": 6301 + }, + { + "epoch": 0.22568804053933067, + "grad_norm": 1.76483154296875, + "learning_rate": 0.00018058006560790453, + "loss": 0.9652, + "step": 6302 + }, + { + "epoch": 0.22572385266889897, + "grad_norm": 1.7999125719070435, + "learning_rate": 0.00018057319628257034, + "loss": 1.3336, + "step": 6303 + }, + { + "epoch": 0.22575966479846724, + "grad_norm": 1.82087242603302, + "learning_rate": 0.0001805663258732277, + "loss": 1.5184, + "step": 6304 + }, + { + "epoch": 0.22579547692803553, + "grad_norm": 1.3384838104248047, + "learning_rate": 0.0001805594543799691, + "loss": 1.2396, + "step": 6305 + }, + { + "epoch": 0.2258312890576038, + "grad_norm": 1.4789539575576782, + "learning_rate": 0.00018055258180288696, + "loss": 1.2025, + "step": 6306 + }, + { + "epoch": 0.2258671011871721, + "grad_norm": 1.2865229845046997, + "learning_rate": 0.0001805457081420737, + "loss": 1.293, + "step": 6307 + }, + { + "epoch": 0.22590291331674037, + "grad_norm": 2.069483518600464, + "learning_rate": 0.00018053883339762183, + "loss": 1.329, + "step": 6308 + }, + { + "epoch": 0.22593872544630866, + "grad_norm": 1.5348390340805054, + "learning_rate": 0.00018053195756962388, + "loss": 1.0861, + "step": 6309 + }, + { + "epoch": 0.22597453757587696, + "grad_norm": 1.5319578647613525, + "learning_rate": 0.0001805250806581723, + "loss": 1.0458, + "step": 6310 + }, + { + "epoch": 0.22601034970544523, + "grad_norm": 1.6966511011123657, + "learning_rate": 0.00018051820266335963, + "loss": 1.0518, + "step": 6311 + }, + { + "epoch": 0.22604616183501353, + "grad_norm": 1.616368055343628, + "learning_rate": 0.0001805113235852784, + "loss": 1.1615, + "step": 6312 + }, + { + "epoch": 0.2260819739645818, + "grad_norm": 1.3071706295013428, + "learning_rate": 0.00018050444342402114, + "loss": 1.016, + "step": 6313 + }, + { + "epoch": 0.2261177860941501, + "grad_norm": 1.7682405710220337, + "learning_rate": 0.0001804975621796805, + "loss": 1.0169, + "step": 6314 + }, + { + "epoch": 0.22615359822371836, + "grad_norm": 1.4678562879562378, + "learning_rate": 0.00018049067985234895, + "loss": 1.0291, + "step": 6315 + }, + { + "epoch": 0.22618941035328666, + "grad_norm": 1.5395869016647339, + "learning_rate": 0.00018048379644211915, + "loss": 1.0871, + "step": 6316 + }, + { + "epoch": 0.22622522248285495, + "grad_norm": 1.543986439704895, + "learning_rate": 0.00018047691194908368, + "loss": 1.1738, + "step": 6317 + }, + { + "epoch": 0.22626103461242322, + "grad_norm": 1.759164810180664, + "learning_rate": 0.00018047002637333517, + "loss": 1.2297, + "step": 6318 + }, + { + "epoch": 0.22629684674199152, + "grad_norm": 1.4492011070251465, + "learning_rate": 0.00018046313971496622, + "loss": 1.0896, + "step": 6319 + }, + { + "epoch": 0.2263326588715598, + "grad_norm": 1.8728190660476685, + "learning_rate": 0.00018045625197406957, + "loss": 1.1327, + "step": 6320 + }, + { + "epoch": 0.22636847100112809, + "grad_norm": 1.2285199165344238, + "learning_rate": 0.00018044936315073779, + "loss": 1.2098, + "step": 6321 + }, + { + "epoch": 0.22640428313069635, + "grad_norm": 2.749210834503174, + "learning_rate": 0.0001804424732450636, + "loss": 1.4093, + "step": 6322 + }, + { + "epoch": 0.22644009526026465, + "grad_norm": 1.7635828256607056, + "learning_rate": 0.0001804355822571397, + "loss": 1.2204, + "step": 6323 + }, + { + "epoch": 0.22647590738983295, + "grad_norm": 1.5287224054336548, + "learning_rate": 0.00018042869018705882, + "loss": 1.0861, + "step": 6324 + }, + { + "epoch": 0.22651171951940122, + "grad_norm": 1.6886777877807617, + "learning_rate": 0.00018042179703491365, + "loss": 1.1385, + "step": 6325 + }, + { + "epoch": 0.2265475316489695, + "grad_norm": 1.828983187675476, + "learning_rate": 0.00018041490280079693, + "loss": 1.2291, + "step": 6326 + }, + { + "epoch": 0.22658334377853778, + "grad_norm": 1.4571372270584106, + "learning_rate": 0.00018040800748480142, + "loss": 1.0637, + "step": 6327 + }, + { + "epoch": 0.22661915590810608, + "grad_norm": 1.650603175163269, + "learning_rate": 0.00018040111108701988, + "loss": 1.2946, + "step": 6328 + }, + { + "epoch": 0.22665496803767435, + "grad_norm": 1.7101067304611206, + "learning_rate": 0.00018039421360754513, + "loss": 1.202, + "step": 6329 + }, + { + "epoch": 0.22669078016724264, + "grad_norm": 1.8698807954788208, + "learning_rate": 0.0001803873150464699, + "loss": 1.1065, + "step": 6330 + }, + { + "epoch": 0.22672659229681094, + "grad_norm": 2.0062386989593506, + "learning_rate": 0.00018038041540388705, + "loss": 1.1168, + "step": 6331 + }, + { + "epoch": 0.2267624044263792, + "grad_norm": 1.9985909461975098, + "learning_rate": 0.00018037351467988942, + "loss": 1.0931, + "step": 6332 + }, + { + "epoch": 0.2267982165559475, + "grad_norm": 1.3465447425842285, + "learning_rate": 0.0001803666128745698, + "loss": 1.0384, + "step": 6333 + }, + { + "epoch": 0.22683402868551578, + "grad_norm": 2.0310745239257812, + "learning_rate": 0.00018035970998802106, + "loss": 1.2406, + "step": 6334 + }, + { + "epoch": 0.22686984081508407, + "grad_norm": 2.334548234939575, + "learning_rate": 0.0001803528060203361, + "loss": 1.2597, + "step": 6335 + }, + { + "epoch": 0.22690565294465234, + "grad_norm": 1.2634953260421753, + "learning_rate": 0.00018034590097160778, + "loss": 1.1837, + "step": 6336 + }, + { + "epoch": 0.22694146507422064, + "grad_norm": 1.5814220905303955, + "learning_rate": 0.000180338994841929, + "loss": 1.1136, + "step": 6337 + }, + { + "epoch": 0.22697727720378894, + "grad_norm": 1.3497282266616821, + "learning_rate": 0.00018033208763139266, + "loss": 1.2811, + "step": 6338 + }, + { + "epoch": 0.2270130893333572, + "grad_norm": 1.4690123796463013, + "learning_rate": 0.0001803251793400917, + "loss": 1.1861, + "step": 6339 + }, + { + "epoch": 0.2270489014629255, + "grad_norm": 1.2982438802719116, + "learning_rate": 0.0001803182699681191, + "loss": 1.1161, + "step": 6340 + }, + { + "epoch": 0.22708471359249377, + "grad_norm": 1.730849266052246, + "learning_rate": 0.00018031135951556774, + "loss": 1.108, + "step": 6341 + }, + { + "epoch": 0.22712052572206207, + "grad_norm": 1.943929672241211, + "learning_rate": 0.00018030444798253066, + "loss": 1.1311, + "step": 6342 + }, + { + "epoch": 0.22715633785163034, + "grad_norm": 1.3914917707443237, + "learning_rate": 0.0001802975353691008, + "loss": 1.2376, + "step": 6343 + }, + { + "epoch": 0.22719214998119863, + "grad_norm": 1.2468549013137817, + "learning_rate": 0.00018029062167537117, + "loss": 1.1603, + "step": 6344 + }, + { + "epoch": 0.22722796211076693, + "grad_norm": 1.7529582977294922, + "learning_rate": 0.00018028370690143482, + "loss": 1.3355, + "step": 6345 + }, + { + "epoch": 0.2272637742403352, + "grad_norm": 1.7637943029403687, + "learning_rate": 0.00018027679104738473, + "loss": 1.0901, + "step": 6346 + }, + { + "epoch": 0.2272995863699035, + "grad_norm": 2.361942768096924, + "learning_rate": 0.00018026987411331398, + "loss": 1.2123, + "step": 6347 + }, + { + "epoch": 0.22733539849947176, + "grad_norm": 1.402681827545166, + "learning_rate": 0.0001802629560993156, + "loss": 0.9253, + "step": 6348 + }, + { + "epoch": 0.22737121062904006, + "grad_norm": 1.398399829864502, + "learning_rate": 0.0001802560370054827, + "loss": 1.3004, + "step": 6349 + }, + { + "epoch": 0.22740702275860833, + "grad_norm": 1.503109097480774, + "learning_rate": 0.00018024911683190833, + "loss": 1.0325, + "step": 6350 + }, + { + "epoch": 0.22744283488817663, + "grad_norm": 1.6640022993087769, + "learning_rate": 0.0001802421955786856, + "loss": 1.384, + "step": 6351 + }, + { + "epoch": 0.22747864701774492, + "grad_norm": 1.735912799835205, + "learning_rate": 0.00018023527324590764, + "loss": 1.1193, + "step": 6352 + }, + { + "epoch": 0.2275144591473132, + "grad_norm": 1.6703435182571411, + "learning_rate": 0.0001802283498336676, + "loss": 1.1412, + "step": 6353 + }, + { + "epoch": 0.2275502712768815, + "grad_norm": 1.6370019912719727, + "learning_rate": 0.00018022142534205858, + "loss": 1.2411, + "step": 6354 + }, + { + "epoch": 0.22758608340644976, + "grad_norm": 1.3570674657821655, + "learning_rate": 0.00018021449977117374, + "loss": 1.1225, + "step": 6355 + }, + { + "epoch": 0.22762189553601805, + "grad_norm": 1.589860200881958, + "learning_rate": 0.00018020757312110628, + "loss": 1.2636, + "step": 6356 + }, + { + "epoch": 0.22765770766558632, + "grad_norm": 1.5085405111312866, + "learning_rate": 0.0001802006453919494, + "loss": 1.2845, + "step": 6357 + }, + { + "epoch": 0.22769351979515462, + "grad_norm": 1.2969141006469727, + "learning_rate": 0.0001801937165837963, + "loss": 1.2635, + "step": 6358 + }, + { + "epoch": 0.22772933192472292, + "grad_norm": 1.5932103395462036, + "learning_rate": 0.00018018678669674015, + "loss": 1.3275, + "step": 6359 + }, + { + "epoch": 0.22776514405429119, + "grad_norm": 1.7389870882034302, + "learning_rate": 0.00018017985573087425, + "loss": 1.1409, + "step": 6360 + }, + { + "epoch": 0.22780095618385948, + "grad_norm": 2.268998861312866, + "learning_rate": 0.0001801729236862918, + "loss": 1.3231, + "step": 6361 + }, + { + "epoch": 0.22783676831342775, + "grad_norm": 1.4658268690109253, + "learning_rate": 0.0001801659905630861, + "loss": 1.2101, + "step": 6362 + }, + { + "epoch": 0.22787258044299605, + "grad_norm": 1.5067239999771118, + "learning_rate": 0.00018015905636135037, + "loss": 1.0562, + "step": 6363 + }, + { + "epoch": 0.22790839257256432, + "grad_norm": 1.4619146585464478, + "learning_rate": 0.00018015212108117793, + "loss": 1.3144, + "step": 6364 + }, + { + "epoch": 0.2279442047021326, + "grad_norm": 2.1932435035705566, + "learning_rate": 0.0001801451847226621, + "loss": 1.498, + "step": 6365 + }, + { + "epoch": 0.2279800168317009, + "grad_norm": 1.4509611129760742, + "learning_rate": 0.0001801382472858962, + "loss": 1.1862, + "step": 6366 + }, + { + "epoch": 0.22801582896126918, + "grad_norm": 1.7292462587356567, + "learning_rate": 0.00018013130877097357, + "loss": 1.16, + "step": 6367 + }, + { + "epoch": 0.22805164109083748, + "grad_norm": 1.7766531705856323, + "learning_rate": 0.0001801243691779875, + "loss": 1.1533, + "step": 6368 + }, + { + "epoch": 0.22808745322040574, + "grad_norm": 1.6280714273452759, + "learning_rate": 0.00018011742850703146, + "loss": 1.0439, + "step": 6369 + }, + { + "epoch": 0.22812326534997404, + "grad_norm": 1.6317572593688965, + "learning_rate": 0.00018011048675819872, + "loss": 1.0844, + "step": 6370 + }, + { + "epoch": 0.2281590774795423, + "grad_norm": 1.5086579322814941, + "learning_rate": 0.0001801035439315827, + "loss": 1.1798, + "step": 6371 + }, + { + "epoch": 0.2281948896091106, + "grad_norm": 1.7376680374145508, + "learning_rate": 0.00018009660002727684, + "loss": 1.1636, + "step": 6372 + }, + { + "epoch": 0.2282307017386789, + "grad_norm": 1.4254311323165894, + "learning_rate": 0.00018008965504537455, + "loss": 1.2282, + "step": 6373 + }, + { + "epoch": 0.22826651386824717, + "grad_norm": 1.700535774230957, + "learning_rate": 0.00018008270898596927, + "loss": 1.38, + "step": 6374 + }, + { + "epoch": 0.22830232599781547, + "grad_norm": 1.5054891109466553, + "learning_rate": 0.00018007576184915443, + "loss": 1.2034, + "step": 6375 + }, + { + "epoch": 0.22833813812738374, + "grad_norm": 1.3242995738983154, + "learning_rate": 0.00018006881363502348, + "loss": 1.0384, + "step": 6376 + }, + { + "epoch": 0.22837395025695204, + "grad_norm": 1.544811725616455, + "learning_rate": 0.00018006186434366996, + "loss": 1.3883, + "step": 6377 + }, + { + "epoch": 0.2284097623865203, + "grad_norm": 2.030498743057251, + "learning_rate": 0.0001800549139751873, + "loss": 1.1053, + "step": 6378 + }, + { + "epoch": 0.2284455745160886, + "grad_norm": 1.4821089506149292, + "learning_rate": 0.00018004796252966908, + "loss": 1.1414, + "step": 6379 + }, + { + "epoch": 0.2284813866456569, + "grad_norm": 1.5294089317321777, + "learning_rate": 0.00018004101000720872, + "loss": 1.3367, + "step": 6380 + }, + { + "epoch": 0.22851719877522517, + "grad_norm": 1.3016033172607422, + "learning_rate": 0.00018003405640789987, + "loss": 1.1288, + "step": 6381 + }, + { + "epoch": 0.22855301090479346, + "grad_norm": 2.1125364303588867, + "learning_rate": 0.00018002710173183596, + "loss": 1.2537, + "step": 6382 + }, + { + "epoch": 0.22858882303436173, + "grad_norm": 2.3106002807617188, + "learning_rate": 0.00018002014597911066, + "loss": 1.2729, + "step": 6383 + }, + { + "epoch": 0.22862463516393003, + "grad_norm": 1.3685702085494995, + "learning_rate": 0.00018001318914981753, + "loss": 1.1766, + "step": 6384 + }, + { + "epoch": 0.2286604472934983, + "grad_norm": 2.0223076343536377, + "learning_rate": 0.00018000623124405014, + "loss": 1.1762, + "step": 6385 + }, + { + "epoch": 0.2286962594230666, + "grad_norm": 1.6300933361053467, + "learning_rate": 0.0001799992722619021, + "loss": 0.9489, + "step": 6386 + }, + { + "epoch": 0.22873207155263486, + "grad_norm": 1.4272630214691162, + "learning_rate": 0.00017999231220346707, + "loss": 1.1587, + "step": 6387 + }, + { + "epoch": 0.22876788368220316, + "grad_norm": 1.7665987014770508, + "learning_rate": 0.00017998535106883862, + "loss": 0.8907, + "step": 6388 + }, + { + "epoch": 0.22880369581177146, + "grad_norm": 1.5623161792755127, + "learning_rate": 0.00017997838885811047, + "loss": 1.2043, + "step": 6389 + }, + { + "epoch": 0.22883950794133973, + "grad_norm": 1.8842748403549194, + "learning_rate": 0.00017997142557137625, + "loss": 1.0911, + "step": 6390 + }, + { + "epoch": 0.22887532007090802, + "grad_norm": 2.100118637084961, + "learning_rate": 0.00017996446120872967, + "loss": 1.4202, + "step": 6391 + }, + { + "epoch": 0.2289111322004763, + "grad_norm": 1.831519603729248, + "learning_rate": 0.00017995749577026443, + "loss": 1.2696, + "step": 6392 + }, + { + "epoch": 0.2289469443300446, + "grad_norm": 1.7156790494918823, + "learning_rate": 0.0001799505292560742, + "loss": 1.3551, + "step": 6393 + }, + { + "epoch": 0.22898275645961286, + "grad_norm": 1.5621800422668457, + "learning_rate": 0.00017994356166625271, + "loss": 1.1997, + "step": 6394 + }, + { + "epoch": 0.22901856858918115, + "grad_norm": 1.564731478691101, + "learning_rate": 0.0001799365930008937, + "loss": 1.3124, + "step": 6395 + }, + { + "epoch": 0.22905438071874945, + "grad_norm": 1.2313860654830933, + "learning_rate": 0.000179929623260091, + "loss": 1.1379, + "step": 6396 + }, + { + "epoch": 0.22909019284831772, + "grad_norm": 1.5157219171524048, + "learning_rate": 0.0001799226524439383, + "loss": 1.1303, + "step": 6397 + }, + { + "epoch": 0.22912600497788602, + "grad_norm": 1.8906829357147217, + "learning_rate": 0.0001799156805525294, + "loss": 1.1309, + "step": 6398 + }, + { + "epoch": 0.22916181710745429, + "grad_norm": 1.4019941091537476, + "learning_rate": 0.00017990870758595811, + "loss": 1.1279, + "step": 6399 + }, + { + "epoch": 0.22919762923702258, + "grad_norm": 2.4353222846984863, + "learning_rate": 0.0001799017335443182, + "loss": 1.1484, + "step": 6400 + }, + { + "epoch": 0.22923344136659085, + "grad_norm": 1.5030633211135864, + "learning_rate": 0.00017989475842770358, + "loss": 1.1426, + "step": 6401 + }, + { + "epoch": 0.22926925349615915, + "grad_norm": 2.143343687057495, + "learning_rate": 0.00017988778223620799, + "loss": 1.3419, + "step": 6402 + }, + { + "epoch": 0.22930506562572744, + "grad_norm": 1.717750072479248, + "learning_rate": 0.0001798808049699254, + "loss": 1.3273, + "step": 6403 + }, + { + "epoch": 0.2293408777552957, + "grad_norm": 1.748896837234497, + "learning_rate": 0.00017987382662894955, + "loss": 1.151, + "step": 6404 + }, + { + "epoch": 0.229376689884864, + "grad_norm": 1.8465347290039062, + "learning_rate": 0.00017986684721337442, + "loss": 1.2185, + "step": 6405 + }, + { + "epoch": 0.22941250201443228, + "grad_norm": 1.597379207611084, + "learning_rate": 0.00017985986672329392, + "loss": 1.244, + "step": 6406 + }, + { + "epoch": 0.22944831414400058, + "grad_norm": 1.6250989437103271, + "learning_rate": 0.0001798528851588019, + "loss": 1.3521, + "step": 6407 + }, + { + "epoch": 0.22948412627356884, + "grad_norm": 1.785508632659912, + "learning_rate": 0.0001798459025199923, + "loss": 1.2147, + "step": 6408 + }, + { + "epoch": 0.22951993840313714, + "grad_norm": 1.5459461212158203, + "learning_rate": 0.00017983891880695907, + "loss": 1.2306, + "step": 6409 + }, + { + "epoch": 0.22955575053270544, + "grad_norm": 1.5071203708648682, + "learning_rate": 0.00017983193401979616, + "loss": 1.2555, + "step": 6410 + }, + { + "epoch": 0.2295915626622737, + "grad_norm": 1.7904905080795288, + "learning_rate": 0.0001798249481585976, + "loss": 1.1141, + "step": 6411 + }, + { + "epoch": 0.229627374791842, + "grad_norm": 1.5815308094024658, + "learning_rate": 0.0001798179612234573, + "loss": 1.0919, + "step": 6412 + }, + { + "epoch": 0.22966318692141027, + "grad_norm": 1.2086875438690186, + "learning_rate": 0.0001798109732144693, + "loss": 1.1874, + "step": 6413 + }, + { + "epoch": 0.22969899905097857, + "grad_norm": 2.0082695484161377, + "learning_rate": 0.0001798039841317276, + "loss": 1.1658, + "step": 6414 + }, + { + "epoch": 0.22973481118054684, + "grad_norm": 1.9516031742095947, + "learning_rate": 0.00017979699397532625, + "loss": 1.3238, + "step": 6415 + }, + { + "epoch": 0.22977062331011514, + "grad_norm": 1.4445347785949707, + "learning_rate": 0.00017979000274535926, + "loss": 1.2143, + "step": 6416 + }, + { + "epoch": 0.22980643543968343, + "grad_norm": 2.2531869411468506, + "learning_rate": 0.0001797830104419207, + "loss": 1.0593, + "step": 6417 + }, + { + "epoch": 0.2298422475692517, + "grad_norm": 1.3010872602462769, + "learning_rate": 0.00017977601706510465, + "loss": 1.1946, + "step": 6418 + }, + { + "epoch": 0.22987805969882, + "grad_norm": 1.8295570611953735, + "learning_rate": 0.0001797690226150052, + "loss": 1.2084, + "step": 6419 + }, + { + "epoch": 0.22991387182838827, + "grad_norm": 1.347680926322937, + "learning_rate": 0.00017976202709171643, + "loss": 1.0833, + "step": 6420 + }, + { + "epoch": 0.22994968395795656, + "grad_norm": 1.7595570087432861, + "learning_rate": 0.0001797550304953325, + "loss": 1.1271, + "step": 6421 + }, + { + "epoch": 0.22998549608752483, + "grad_norm": 1.9168617725372314, + "learning_rate": 0.00017974803282594747, + "loss": 1.1364, + "step": 6422 + }, + { + "epoch": 0.23002130821709313, + "grad_norm": 1.6527292728424072, + "learning_rate": 0.00017974103408365557, + "loss": 1.1911, + "step": 6423 + }, + { + "epoch": 0.23005712034666143, + "grad_norm": 1.7314921617507935, + "learning_rate": 0.00017973403426855088, + "loss": 1.2227, + "step": 6424 + }, + { + "epoch": 0.2300929324762297, + "grad_norm": 1.3122756481170654, + "learning_rate": 0.00017972703338072762, + "loss": 1.1497, + "step": 6425 + }, + { + "epoch": 0.230128744605798, + "grad_norm": 1.7151799201965332, + "learning_rate": 0.00017972003142027997, + "loss": 1.5018, + "step": 6426 + }, + { + "epoch": 0.23016455673536626, + "grad_norm": 1.2618896961212158, + "learning_rate": 0.00017971302838730213, + "loss": 1.1305, + "step": 6427 + }, + { + "epoch": 0.23020036886493456, + "grad_norm": 1.452684998512268, + "learning_rate": 0.00017970602428188834, + "loss": 1.089, + "step": 6428 + }, + { + "epoch": 0.23023618099450283, + "grad_norm": 1.8269551992416382, + "learning_rate": 0.00017969901910413276, + "loss": 1.069, + "step": 6429 + }, + { + "epoch": 0.23027199312407112, + "grad_norm": 1.689496636390686, + "learning_rate": 0.00017969201285412972, + "loss": 1.2353, + "step": 6430 + }, + { + "epoch": 0.23030780525363942, + "grad_norm": 1.9198263883590698, + "learning_rate": 0.0001796850055319734, + "loss": 1.0746, + "step": 6431 + }, + { + "epoch": 0.2303436173832077, + "grad_norm": 1.6455634832382202, + "learning_rate": 0.00017967799713775815, + "loss": 1.3259, + "step": 6432 + }, + { + "epoch": 0.23037942951277599, + "grad_norm": 1.993094801902771, + "learning_rate": 0.00017967098767157822, + "loss": 1.3578, + "step": 6433 + }, + { + "epoch": 0.23041524164234425, + "grad_norm": 1.8181920051574707, + "learning_rate": 0.00017966397713352792, + "loss": 0.979, + "step": 6434 + }, + { + "epoch": 0.23045105377191255, + "grad_norm": 1.7688971757888794, + "learning_rate": 0.00017965696552370156, + "loss": 1.105, + "step": 6435 + }, + { + "epoch": 0.23048686590148082, + "grad_norm": 1.5167020559310913, + "learning_rate": 0.00017964995284219348, + "loss": 1.0633, + "step": 6436 + }, + { + "epoch": 0.23052267803104912, + "grad_norm": 1.7564020156860352, + "learning_rate": 0.00017964293908909803, + "loss": 1.5009, + "step": 6437 + }, + { + "epoch": 0.2305584901606174, + "grad_norm": 1.3875713348388672, + "learning_rate": 0.00017963592426450956, + "loss": 1.1026, + "step": 6438 + }, + { + "epoch": 0.23059430229018568, + "grad_norm": 1.6341291666030884, + "learning_rate": 0.0001796289083685225, + "loss": 1.3128, + "step": 6439 + }, + { + "epoch": 0.23063011441975398, + "grad_norm": 1.613972783088684, + "learning_rate": 0.0001796218914012311, + "loss": 1.2254, + "step": 6440 + }, + { + "epoch": 0.23066592654932225, + "grad_norm": 1.437293529510498, + "learning_rate": 0.0001796148733627299, + "loss": 1.253, + "step": 6441 + }, + { + "epoch": 0.23070173867889054, + "grad_norm": 1.4872446060180664, + "learning_rate": 0.00017960785425311332, + "loss": 1.0743, + "step": 6442 + }, + { + "epoch": 0.2307375508084588, + "grad_norm": 2.4534149169921875, + "learning_rate": 0.0001796008340724757, + "loss": 1.4041, + "step": 6443 + }, + { + "epoch": 0.2307733629380271, + "grad_norm": 1.9721717834472656, + "learning_rate": 0.00017959381282091152, + "loss": 1.1755, + "step": 6444 + }, + { + "epoch": 0.2308091750675954, + "grad_norm": 1.9814666509628296, + "learning_rate": 0.0001795867904985153, + "loss": 1.3324, + "step": 6445 + }, + { + "epoch": 0.23084498719716368, + "grad_norm": 1.3728965520858765, + "learning_rate": 0.00017957976710538144, + "loss": 1.3335, + "step": 6446 + }, + { + "epoch": 0.23088079932673197, + "grad_norm": 1.7165091037750244, + "learning_rate": 0.00017957274264160448, + "loss": 1.3148, + "step": 6447 + }, + { + "epoch": 0.23091661145630024, + "grad_norm": 1.55254065990448, + "learning_rate": 0.0001795657171072789, + "loss": 1.2551, + "step": 6448 + }, + { + "epoch": 0.23095242358586854, + "grad_norm": 1.6750601530075073, + "learning_rate": 0.00017955869050249925, + "loss": 1.5637, + "step": 6449 + }, + { + "epoch": 0.2309882357154368, + "grad_norm": 1.609794020652771, + "learning_rate": 0.00017955166282736002, + "loss": 1.2292, + "step": 6450 + }, + { + "epoch": 0.2310240478450051, + "grad_norm": 1.3900059461593628, + "learning_rate": 0.0001795446340819558, + "loss": 0.9079, + "step": 6451 + }, + { + "epoch": 0.2310598599745734, + "grad_norm": 1.4024003744125366, + "learning_rate": 0.0001795376042663811, + "loss": 1.0804, + "step": 6452 + }, + { + "epoch": 0.23109567210414167, + "grad_norm": 1.6548972129821777, + "learning_rate": 0.00017953057338073055, + "loss": 1.2394, + "step": 6453 + }, + { + "epoch": 0.23113148423370997, + "grad_norm": 1.647518277168274, + "learning_rate": 0.00017952354142509872, + "loss": 1.1095, + "step": 6454 + }, + { + "epoch": 0.23116729636327824, + "grad_norm": 1.4643096923828125, + "learning_rate": 0.00017951650839958023, + "loss": 1.1189, + "step": 6455 + }, + { + "epoch": 0.23120310849284653, + "grad_norm": 1.5912576913833618, + "learning_rate": 0.0001795094743042697, + "loss": 1.1691, + "step": 6456 + }, + { + "epoch": 0.2312389206224148, + "grad_norm": 1.592500925064087, + "learning_rate": 0.00017950243913926171, + "loss": 1.0948, + "step": 6457 + }, + { + "epoch": 0.2312747327519831, + "grad_norm": 2.130035161972046, + "learning_rate": 0.000179495402904651, + "loss": 1.331, + "step": 6458 + }, + { + "epoch": 0.2313105448815514, + "grad_norm": 1.4225754737854004, + "learning_rate": 0.00017948836560053216, + "loss": 0.9842, + "step": 6459 + }, + { + "epoch": 0.23134635701111966, + "grad_norm": 1.7222230434417725, + "learning_rate": 0.00017948132722699992, + "loss": 1.2667, + "step": 6460 + }, + { + "epoch": 0.23138216914068796, + "grad_norm": 1.4846590757369995, + "learning_rate": 0.0001794742877841489, + "loss": 1.2852, + "step": 6461 + }, + { + "epoch": 0.23141798127025623, + "grad_norm": 1.3349905014038086, + "learning_rate": 0.00017946724727207388, + "loss": 1.0263, + "step": 6462 + }, + { + "epoch": 0.23145379339982453, + "grad_norm": 2.3815770149230957, + "learning_rate": 0.00017946020569086955, + "loss": 0.9885, + "step": 6463 + }, + { + "epoch": 0.2314896055293928, + "grad_norm": 1.390413761138916, + "learning_rate": 0.00017945316304063066, + "loss": 1.1038, + "step": 6464 + }, + { + "epoch": 0.2315254176589611, + "grad_norm": 1.536272644996643, + "learning_rate": 0.00017944611932145193, + "loss": 1.2541, + "step": 6465 + }, + { + "epoch": 0.2315612297885294, + "grad_norm": 1.5373085737228394, + "learning_rate": 0.0001794390745334281, + "loss": 0.9233, + "step": 6466 + }, + { + "epoch": 0.23159704191809766, + "grad_norm": 1.8973153829574585, + "learning_rate": 0.00017943202867665408, + "loss": 1.2309, + "step": 6467 + }, + { + "epoch": 0.23163285404766595, + "grad_norm": 1.7683377265930176, + "learning_rate": 0.00017942498175122453, + "loss": 1.482, + "step": 6468 + }, + { + "epoch": 0.23166866617723422, + "grad_norm": 1.8468736410140991, + "learning_rate": 0.0001794179337572343, + "loss": 1.1203, + "step": 6469 + }, + { + "epoch": 0.23170447830680252, + "grad_norm": 1.7906218767166138, + "learning_rate": 0.0001794108846947782, + "loss": 1.2626, + "step": 6470 + }, + { + "epoch": 0.2317402904363708, + "grad_norm": 2.165318012237549, + "learning_rate": 0.00017940383456395109, + "loss": 1.1113, + "step": 6471 + }, + { + "epoch": 0.23177610256593908, + "grad_norm": 1.3968905210494995, + "learning_rate": 0.00017939678336484783, + "loss": 1.1899, + "step": 6472 + }, + { + "epoch": 0.23181191469550738, + "grad_norm": 1.7886288166046143, + "learning_rate": 0.00017938973109756323, + "loss": 1.2378, + "step": 6473 + }, + { + "epoch": 0.23184772682507565, + "grad_norm": 1.847670555114746, + "learning_rate": 0.00017938267776219225, + "loss": 1.2196, + "step": 6474 + }, + { + "epoch": 0.23188353895464395, + "grad_norm": 1.409990906715393, + "learning_rate": 0.00017937562335882968, + "loss": 1.2612, + "step": 6475 + }, + { + "epoch": 0.23191935108421222, + "grad_norm": 1.6399743556976318, + "learning_rate": 0.00017936856788757055, + "loss": 1.1142, + "step": 6476 + }, + { + "epoch": 0.2319551632137805, + "grad_norm": 1.3464100360870361, + "learning_rate": 0.00017936151134850966, + "loss": 1.2322, + "step": 6477 + }, + { + "epoch": 0.23199097534334878, + "grad_norm": 1.65673828125, + "learning_rate": 0.00017935445374174202, + "loss": 1.1394, + "step": 6478 + }, + { + "epoch": 0.23202678747291708, + "grad_norm": 1.7789695262908936, + "learning_rate": 0.0001793473950673626, + "loss": 1.2804, + "step": 6479 + }, + { + "epoch": 0.23206259960248538, + "grad_norm": 1.7016518115997314, + "learning_rate": 0.00017934033532546632, + "loss": 1.2465, + "step": 6480 + }, + { + "epoch": 0.23209841173205364, + "grad_norm": 1.606576919555664, + "learning_rate": 0.00017933327451614812, + "loss": 1.234, + "step": 6481 + }, + { + "epoch": 0.23213422386162194, + "grad_norm": 1.4611696004867554, + "learning_rate": 0.0001793262126395031, + "loss": 1.2307, + "step": 6482 + }, + { + "epoch": 0.2321700359911902, + "grad_norm": 1.3919026851654053, + "learning_rate": 0.00017931914969562617, + "loss": 1.1827, + "step": 6483 + }, + { + "epoch": 0.2322058481207585, + "grad_norm": 1.7127737998962402, + "learning_rate": 0.0001793120856846124, + "loss": 1.2127, + "step": 6484 + }, + { + "epoch": 0.23224166025032678, + "grad_norm": 1.2491861581802368, + "learning_rate": 0.00017930502060655682, + "loss": 1.104, + "step": 6485 + }, + { + "epoch": 0.23227747237989507, + "grad_norm": 2.0083394050598145, + "learning_rate": 0.0001792979544615545, + "loss": 1.2798, + "step": 6486 + }, + { + "epoch": 0.23231328450946334, + "grad_norm": 1.5320472717285156, + "learning_rate": 0.00017929088724970052, + "loss": 1.1736, + "step": 6487 + }, + { + "epoch": 0.23234909663903164, + "grad_norm": 1.6743110418319702, + "learning_rate": 0.0001792838189710899, + "loss": 1.2641, + "step": 6488 + }, + { + "epoch": 0.23238490876859993, + "grad_norm": 1.2845525741577148, + "learning_rate": 0.00017927674962581774, + "loss": 1.094, + "step": 6489 + }, + { + "epoch": 0.2324207208981682, + "grad_norm": 1.756885290145874, + "learning_rate": 0.0001792696792139792, + "loss": 1.0813, + "step": 6490 + }, + { + "epoch": 0.2324565330277365, + "grad_norm": 1.432459831237793, + "learning_rate": 0.0001792626077356694, + "loss": 1.2039, + "step": 6491 + }, + { + "epoch": 0.23249234515730477, + "grad_norm": 1.346209168434143, + "learning_rate": 0.0001792555351909834, + "loss": 1.4071, + "step": 6492 + }, + { + "epoch": 0.23252815728687307, + "grad_norm": 1.208433985710144, + "learning_rate": 0.0001792484615800164, + "loss": 1.0488, + "step": 6493 + }, + { + "epoch": 0.23256396941644134, + "grad_norm": 1.5714714527130127, + "learning_rate": 0.00017924138690286366, + "loss": 1.1759, + "step": 6494 + }, + { + "epoch": 0.23259978154600963, + "grad_norm": 1.4875552654266357, + "learning_rate": 0.0001792343111596202, + "loss": 1.239, + "step": 6495 + }, + { + "epoch": 0.23263559367557793, + "grad_norm": 1.3629910945892334, + "learning_rate": 0.00017922723435038131, + "loss": 1.0009, + "step": 6496 + }, + { + "epoch": 0.2326714058051462, + "grad_norm": 1.6079113483428955, + "learning_rate": 0.00017922015647524217, + "loss": 1.1845, + "step": 6497 + }, + { + "epoch": 0.2327072179347145, + "grad_norm": 1.2227866649627686, + "learning_rate": 0.00017921307753429803, + "loss": 0.8303, + "step": 6498 + }, + { + "epoch": 0.23274303006428276, + "grad_norm": 1.269921064376831, + "learning_rate": 0.00017920599752764408, + "loss": 0.9062, + "step": 6499 + }, + { + "epoch": 0.23277884219385106, + "grad_norm": 2.4576873779296875, + "learning_rate": 0.00017919891645537563, + "loss": 1.2418, + "step": 6500 + }, + { + "epoch": 0.23281465432341933, + "grad_norm": 1.9915226697921753, + "learning_rate": 0.0001791918343175879, + "loss": 1.4652, + "step": 6501 + }, + { + "epoch": 0.23285046645298763, + "grad_norm": 1.6082826852798462, + "learning_rate": 0.0001791847511143762, + "loss": 1.0503, + "step": 6502 + }, + { + "epoch": 0.23288627858255592, + "grad_norm": 1.548007607460022, + "learning_rate": 0.0001791776668458358, + "loss": 1.234, + "step": 6503 + }, + { + "epoch": 0.2329220907121242, + "grad_norm": 1.4587184190750122, + "learning_rate": 0.00017917058151206204, + "loss": 1.2253, + "step": 6504 + }, + { + "epoch": 0.2329579028416925, + "grad_norm": 1.459812879562378, + "learning_rate": 0.00017916349511315022, + "loss": 1.253, + "step": 6505 + }, + { + "epoch": 0.23299371497126076, + "grad_norm": 1.4848847389221191, + "learning_rate": 0.0001791564076491957, + "loss": 1.1358, + "step": 6506 + }, + { + "epoch": 0.23302952710082905, + "grad_norm": 1.7560590505599976, + "learning_rate": 0.0001791493191202938, + "loss": 1.489, + "step": 6507 + }, + { + "epoch": 0.23306533923039732, + "grad_norm": 1.594439148902893, + "learning_rate": 0.00017914222952653992, + "loss": 1.1957, + "step": 6508 + }, + { + "epoch": 0.23310115135996562, + "grad_norm": 1.5020387172698975, + "learning_rate": 0.00017913513886802943, + "loss": 1.1136, + "step": 6509 + }, + { + "epoch": 0.23313696348953392, + "grad_norm": 1.6846486330032349, + "learning_rate": 0.0001791280471448577, + "loss": 1.2456, + "step": 6510 + }, + { + "epoch": 0.23317277561910218, + "grad_norm": 1.4743435382843018, + "learning_rate": 0.00017912095435712017, + "loss": 1.2084, + "step": 6511 + }, + { + "epoch": 0.23320858774867048, + "grad_norm": 1.3325461149215698, + "learning_rate": 0.0001791138605049123, + "loss": 1.315, + "step": 6512 + }, + { + "epoch": 0.23324439987823875, + "grad_norm": 1.2936242818832397, + "learning_rate": 0.00017910676558832944, + "loss": 1.2456, + "step": 6513 + }, + { + "epoch": 0.23328021200780705, + "grad_norm": 2.2223732471466064, + "learning_rate": 0.0001790996696074671, + "loss": 1.2536, + "step": 6514 + }, + { + "epoch": 0.23331602413737532, + "grad_norm": 1.29911208152771, + "learning_rate": 0.00017909257256242076, + "loss": 1.0894, + "step": 6515 + }, + { + "epoch": 0.2333518362669436, + "grad_norm": 1.2766624689102173, + "learning_rate": 0.00017908547445328585, + "loss": 1.2595, + "step": 6516 + }, + { + "epoch": 0.2333876483965119, + "grad_norm": 1.4476516246795654, + "learning_rate": 0.0001790783752801579, + "loss": 1.2158, + "step": 6517 + }, + { + "epoch": 0.23342346052608018, + "grad_norm": 1.5496809482574463, + "learning_rate": 0.00017907127504313241, + "loss": 1.0938, + "step": 6518 + }, + { + "epoch": 0.23345927265564848, + "grad_norm": 1.4282965660095215, + "learning_rate": 0.00017906417374230493, + "loss": 1.1227, + "step": 6519 + }, + { + "epoch": 0.23349508478521674, + "grad_norm": 1.6786824464797974, + "learning_rate": 0.00017905707137777098, + "loss": 0.9882, + "step": 6520 + }, + { + "epoch": 0.23353089691478504, + "grad_norm": 1.402723789215088, + "learning_rate": 0.00017904996794962608, + "loss": 1.0409, + "step": 6521 + }, + { + "epoch": 0.2335667090443533, + "grad_norm": 1.6408259868621826, + "learning_rate": 0.00017904286345796582, + "loss": 1.2437, + "step": 6522 + }, + { + "epoch": 0.2336025211739216, + "grad_norm": 1.516430139541626, + "learning_rate": 0.00017903575790288585, + "loss": 1.1051, + "step": 6523 + }, + { + "epoch": 0.2336383333034899, + "grad_norm": 1.8020477294921875, + "learning_rate": 0.00017902865128448166, + "loss": 1.3548, + "step": 6524 + }, + { + "epoch": 0.23367414543305817, + "grad_norm": 1.4688093662261963, + "learning_rate": 0.00017902154360284893, + "loss": 0.9875, + "step": 6525 + }, + { + "epoch": 0.23370995756262647, + "grad_norm": 1.6409159898757935, + "learning_rate": 0.00017901443485808324, + "loss": 1.1761, + "step": 6526 + }, + { + "epoch": 0.23374576969219474, + "grad_norm": 1.5470293760299683, + "learning_rate": 0.00017900732505028025, + "loss": 1.1527, + "step": 6527 + }, + { + "epoch": 0.23378158182176303, + "grad_norm": 1.5823485851287842, + "learning_rate": 0.00017900021417953564, + "loss": 1.2304, + "step": 6528 + }, + { + "epoch": 0.2338173939513313, + "grad_norm": 1.5468363761901855, + "learning_rate": 0.000178993102245945, + "loss": 1.2155, + "step": 6529 + }, + { + "epoch": 0.2338532060808996, + "grad_norm": 1.842511773109436, + "learning_rate": 0.0001789859892496041, + "loss": 1.173, + "step": 6530 + }, + { + "epoch": 0.2338890182104679, + "grad_norm": 1.328555703163147, + "learning_rate": 0.00017897887519060862, + "loss": 1.019, + "step": 6531 + }, + { + "epoch": 0.23392483034003617, + "grad_norm": 1.4685403108596802, + "learning_rate": 0.0001789717600690542, + "loss": 1.3919, + "step": 6532 + }, + { + "epoch": 0.23396064246960446, + "grad_norm": 1.6433663368225098, + "learning_rate": 0.00017896464388503664, + "loss": 0.9942, + "step": 6533 + }, + { + "epoch": 0.23399645459917273, + "grad_norm": 1.8452528715133667, + "learning_rate": 0.00017895752663865167, + "loss": 1.2509, + "step": 6534 + }, + { + "epoch": 0.23403226672874103, + "grad_norm": 2.169668674468994, + "learning_rate": 0.00017895040832999502, + "loss": 1.2176, + "step": 6535 + }, + { + "epoch": 0.2340680788583093, + "grad_norm": 1.4182482957839966, + "learning_rate": 0.00017894328895916244, + "loss": 1.3203, + "step": 6536 + }, + { + "epoch": 0.2341038909878776, + "grad_norm": 1.841698169708252, + "learning_rate": 0.00017893616852624974, + "loss": 1.1089, + "step": 6537 + }, + { + "epoch": 0.2341397031174459, + "grad_norm": 1.5617938041687012, + "learning_rate": 0.00017892904703135272, + "loss": 1.2593, + "step": 6538 + }, + { + "epoch": 0.23417551524701416, + "grad_norm": 1.7703583240509033, + "learning_rate": 0.0001789219244745672, + "loss": 0.8029, + "step": 6539 + }, + { + "epoch": 0.23421132737658246, + "grad_norm": 1.2518033981323242, + "learning_rate": 0.00017891480085598896, + "loss": 1.1354, + "step": 6540 + }, + { + "epoch": 0.23424713950615073, + "grad_norm": 1.746680736541748, + "learning_rate": 0.00017890767617571388, + "loss": 1.174, + "step": 6541 + }, + { + "epoch": 0.23428295163571902, + "grad_norm": 1.6999220848083496, + "learning_rate": 0.00017890055043383782, + "loss": 1.0742, + "step": 6542 + }, + { + "epoch": 0.2343187637652873, + "grad_norm": 2.2195000648498535, + "learning_rate": 0.0001788934236304566, + "loss": 1.1883, + "step": 6543 + }, + { + "epoch": 0.2343545758948556, + "grad_norm": 2.209517240524292, + "learning_rate": 0.00017888629576566614, + "loss": 1.448, + "step": 6544 + }, + { + "epoch": 0.23439038802442388, + "grad_norm": 1.6196156740188599, + "learning_rate": 0.00017887916683956233, + "loss": 1.1419, + "step": 6545 + }, + { + "epoch": 0.23442620015399215, + "grad_norm": 1.5512895584106445, + "learning_rate": 0.0001788720368522411, + "loss": 1.1156, + "step": 6546 + }, + { + "epoch": 0.23446201228356045, + "grad_norm": 2.174320936203003, + "learning_rate": 0.0001788649058037983, + "loss": 1.2395, + "step": 6547 + }, + { + "epoch": 0.23449782441312872, + "grad_norm": 1.6576850414276123, + "learning_rate": 0.00017885777369432994, + "loss": 1.1276, + "step": 6548 + }, + { + "epoch": 0.23453363654269702, + "grad_norm": 1.3317694664001465, + "learning_rate": 0.000178850640523932, + "loss": 1.2013, + "step": 6549 + }, + { + "epoch": 0.23456944867226528, + "grad_norm": 1.3169124126434326, + "learning_rate": 0.00017884350629270035, + "loss": 1.1974, + "step": 6550 + }, + { + "epoch": 0.23460526080183358, + "grad_norm": 1.6373071670532227, + "learning_rate": 0.00017883637100073104, + "loss": 0.9649, + "step": 6551 + }, + { + "epoch": 0.23464107293140188, + "grad_norm": 1.5850285291671753, + "learning_rate": 0.00017882923464812006, + "loss": 0.9283, + "step": 6552 + }, + { + "epoch": 0.23467688506097015, + "grad_norm": 1.685111165046692, + "learning_rate": 0.00017882209723496338, + "loss": 1.3674, + "step": 6553 + }, + { + "epoch": 0.23471269719053844, + "grad_norm": 1.8440793752670288, + "learning_rate": 0.00017881495876135708, + "loss": 1.2819, + "step": 6554 + }, + { + "epoch": 0.2347485093201067, + "grad_norm": 1.6434139013290405, + "learning_rate": 0.00017880781922739717, + "loss": 1.5416, + "step": 6555 + }, + { + "epoch": 0.234784321449675, + "grad_norm": 1.6804347038269043, + "learning_rate": 0.0001788006786331797, + "loss": 1.0634, + "step": 6556 + }, + { + "epoch": 0.23482013357924328, + "grad_norm": 1.5545058250427246, + "learning_rate": 0.00017879353697880073, + "loss": 1.2036, + "step": 6557 + }, + { + "epoch": 0.23485594570881158, + "grad_norm": 1.5690522193908691, + "learning_rate": 0.00017878639426435638, + "loss": 1.0862, + "step": 6558 + }, + { + "epoch": 0.23489175783837987, + "grad_norm": 1.2246313095092773, + "learning_rate": 0.00017877925048994273, + "loss": 1.0965, + "step": 6559 + }, + { + "epoch": 0.23492756996794814, + "grad_norm": 1.5620317459106445, + "learning_rate": 0.00017877210565565586, + "loss": 1.4243, + "step": 6560 + }, + { + "epoch": 0.23496338209751644, + "grad_norm": 1.6258330345153809, + "learning_rate": 0.0001787649597615919, + "loss": 1.1942, + "step": 6561 + }, + { + "epoch": 0.2349991942270847, + "grad_norm": 1.4353623390197754, + "learning_rate": 0.00017875781280784705, + "loss": 1.1504, + "step": 6562 + }, + { + "epoch": 0.235035006356653, + "grad_norm": 1.8389403820037842, + "learning_rate": 0.0001787506647945174, + "loss": 1.0751, + "step": 6563 + }, + { + "epoch": 0.23507081848622127, + "grad_norm": 2.2019364833831787, + "learning_rate": 0.00017874351572169913, + "loss": 1.1808, + "step": 6564 + }, + { + "epoch": 0.23510663061578957, + "grad_norm": 1.6081691980361938, + "learning_rate": 0.00017873636558948846, + "loss": 1.1215, + "step": 6565 + }, + { + "epoch": 0.23514244274535787, + "grad_norm": 1.3956047296524048, + "learning_rate": 0.00017872921439798152, + "loss": 1.292, + "step": 6566 + }, + { + "epoch": 0.23517825487492613, + "grad_norm": 1.2679853439331055, + "learning_rate": 0.00017872206214727455, + "loss": 1.0825, + "step": 6567 + }, + { + "epoch": 0.23521406700449443, + "grad_norm": 1.4290771484375, + "learning_rate": 0.0001787149088374638, + "loss": 1.1306, + "step": 6568 + }, + { + "epoch": 0.2352498791340627, + "grad_norm": 1.3365906476974487, + "learning_rate": 0.00017870775446864547, + "loss": 1.1475, + "step": 6569 + }, + { + "epoch": 0.235285691263631, + "grad_norm": 1.8449329137802124, + "learning_rate": 0.00017870059904091584, + "loss": 1.0386, + "step": 6570 + }, + { + "epoch": 0.23532150339319927, + "grad_norm": 1.4977753162384033, + "learning_rate": 0.00017869344255437117, + "loss": 1.1181, + "step": 6571 + }, + { + "epoch": 0.23535731552276756, + "grad_norm": 1.5463663339614868, + "learning_rate": 0.00017868628500910773, + "loss": 1.2988, + "step": 6572 + }, + { + "epoch": 0.23539312765233586, + "grad_norm": 1.2094626426696777, + "learning_rate": 0.00017867912640522182, + "loss": 1.3273, + "step": 6573 + }, + { + "epoch": 0.23542893978190413, + "grad_norm": 1.644053339958191, + "learning_rate": 0.00017867196674280976, + "loss": 1.4154, + "step": 6574 + }, + { + "epoch": 0.23546475191147243, + "grad_norm": 1.528545618057251, + "learning_rate": 0.00017866480602196787, + "loss": 1.021, + "step": 6575 + }, + { + "epoch": 0.2355005640410407, + "grad_norm": 1.942249059677124, + "learning_rate": 0.00017865764424279248, + "loss": 1.0829, + "step": 6576 + }, + { + "epoch": 0.235536376170609, + "grad_norm": 1.8080992698669434, + "learning_rate": 0.00017865048140537995, + "loss": 1.3134, + "step": 6577 + }, + { + "epoch": 0.23557218830017726, + "grad_norm": 1.601040244102478, + "learning_rate": 0.00017864331750982665, + "loss": 1.1186, + "step": 6578 + }, + { + "epoch": 0.23560800042974556, + "grad_norm": 1.5243803262710571, + "learning_rate": 0.0001786361525562289, + "loss": 1.1968, + "step": 6579 + }, + { + "epoch": 0.23564381255931385, + "grad_norm": 1.2979930639266968, + "learning_rate": 0.0001786289865446832, + "loss": 1.1327, + "step": 6580 + }, + { + "epoch": 0.23567962468888212, + "grad_norm": 2.0095081329345703, + "learning_rate": 0.00017862181947528592, + "loss": 1.1896, + "step": 6581 + }, + { + "epoch": 0.23571543681845042, + "grad_norm": 1.8242744207382202, + "learning_rate": 0.00017861465134813348, + "loss": 1.0923, + "step": 6582 + }, + { + "epoch": 0.2357512489480187, + "grad_norm": 1.5743978023529053, + "learning_rate": 0.00017860748216332227, + "loss": 1.087, + "step": 6583 + }, + { + "epoch": 0.23578706107758698, + "grad_norm": 1.5239804983139038, + "learning_rate": 0.00017860031192094882, + "loss": 1.0968, + "step": 6584 + }, + { + "epoch": 0.23582287320715525, + "grad_norm": 1.5184880495071411, + "learning_rate": 0.00017859314062110954, + "loss": 1.1861, + "step": 6585 + }, + { + "epoch": 0.23585868533672355, + "grad_norm": 2.073988437652588, + "learning_rate": 0.00017858596826390093, + "loss": 1.3035, + "step": 6586 + }, + { + "epoch": 0.23589449746629182, + "grad_norm": 1.582937240600586, + "learning_rate": 0.0001785787948494195, + "loss": 1.3556, + "step": 6587 + }, + { + "epoch": 0.23593030959586012, + "grad_norm": 1.403565526008606, + "learning_rate": 0.00017857162037776173, + "loss": 1.1024, + "step": 6588 + }, + { + "epoch": 0.2359661217254284, + "grad_norm": 1.573805809020996, + "learning_rate": 0.0001785644448490242, + "loss": 1.1614, + "step": 6589 + }, + { + "epoch": 0.23600193385499668, + "grad_norm": 1.4389519691467285, + "learning_rate": 0.00017855726826330334, + "loss": 1.2863, + "step": 6590 + }, + { + "epoch": 0.23603774598456498, + "grad_norm": 1.7273446321487427, + "learning_rate": 0.00017855009062069582, + "loss": 1.1532, + "step": 6591 + }, + { + "epoch": 0.23607355811413325, + "grad_norm": 1.9195250272750854, + "learning_rate": 0.00017854291192129812, + "loss": 1.166, + "step": 6592 + }, + { + "epoch": 0.23610937024370154, + "grad_norm": 1.4077982902526855, + "learning_rate": 0.00017853573216520684, + "loss": 1.2363, + "step": 6593 + }, + { + "epoch": 0.2361451823732698, + "grad_norm": 1.4161927700042725, + "learning_rate": 0.00017852855135251864, + "loss": 0.9578, + "step": 6594 + }, + { + "epoch": 0.2361809945028381, + "grad_norm": 1.6588902473449707, + "learning_rate": 0.00017852136948333006, + "loss": 1.288, + "step": 6595 + }, + { + "epoch": 0.2362168066324064, + "grad_norm": 1.6443901062011719, + "learning_rate": 0.00017851418655773772, + "loss": 1.2083, + "step": 6596 + }, + { + "epoch": 0.23625261876197468, + "grad_norm": 1.9217793941497803, + "learning_rate": 0.00017850700257583828, + "loss": 1.4066, + "step": 6597 + }, + { + "epoch": 0.23628843089154297, + "grad_norm": 1.698299765586853, + "learning_rate": 0.00017849981753772836, + "loss": 1.2655, + "step": 6598 + }, + { + "epoch": 0.23632424302111124, + "grad_norm": 1.4867080450057983, + "learning_rate": 0.0001784926314435047, + "loss": 1.34, + "step": 6599 + }, + { + "epoch": 0.23636005515067954, + "grad_norm": 1.9663246870040894, + "learning_rate": 0.00017848544429326392, + "loss": 1.0613, + "step": 6600 + }, + { + "epoch": 0.2363958672802478, + "grad_norm": 2.3637278079986572, + "learning_rate": 0.00017847825608710273, + "loss": 1.131, + "step": 6601 + }, + { + "epoch": 0.2364316794098161, + "grad_norm": 1.4313836097717285, + "learning_rate": 0.00017847106682511782, + "loss": 1.1986, + "step": 6602 + }, + { + "epoch": 0.2364674915393844, + "grad_norm": 2.3891584873199463, + "learning_rate": 0.00017846387650740592, + "loss": 1.1995, + "step": 6603 + }, + { + "epoch": 0.23650330366895267, + "grad_norm": 1.4455604553222656, + "learning_rate": 0.00017845668513406378, + "loss": 1.0744, + "step": 6604 + }, + { + "epoch": 0.23653911579852097, + "grad_norm": 1.3173528909683228, + "learning_rate": 0.00017844949270518816, + "loss": 0.8803, + "step": 6605 + }, + { + "epoch": 0.23657492792808923, + "grad_norm": 1.5018843412399292, + "learning_rate": 0.00017844229922087582, + "loss": 1.0882, + "step": 6606 + }, + { + "epoch": 0.23661074005765753, + "grad_norm": 1.613588809967041, + "learning_rate": 0.00017843510468122347, + "loss": 1.1454, + "step": 6607 + }, + { + "epoch": 0.2366465521872258, + "grad_norm": 1.9066826105117798, + "learning_rate": 0.00017842790908632802, + "loss": 1.1433, + "step": 6608 + }, + { + "epoch": 0.2366823643167941, + "grad_norm": 1.748289942741394, + "learning_rate": 0.00017842071243628617, + "loss": 1.0953, + "step": 6609 + }, + { + "epoch": 0.2367181764463624, + "grad_norm": 1.535718321800232, + "learning_rate": 0.0001784135147311948, + "loss": 1.188, + "step": 6610 + }, + { + "epoch": 0.23675398857593066, + "grad_norm": 1.544402003288269, + "learning_rate": 0.00017840631597115076, + "loss": 1.2188, + "step": 6611 + }, + { + "epoch": 0.23678980070549896, + "grad_norm": 1.6709446907043457, + "learning_rate": 0.00017839911615625086, + "loss": 1.2238, + "step": 6612 + }, + { + "epoch": 0.23682561283506723, + "grad_norm": 1.387172818183899, + "learning_rate": 0.00017839191528659198, + "loss": 1.1421, + "step": 6613 + }, + { + "epoch": 0.23686142496463553, + "grad_norm": 1.4070106744766235, + "learning_rate": 0.000178384713362271, + "loss": 1.1759, + "step": 6614 + }, + { + "epoch": 0.2368972370942038, + "grad_norm": 1.609897255897522, + "learning_rate": 0.00017837751038338482, + "loss": 1.2401, + "step": 6615 + }, + { + "epoch": 0.2369330492237721, + "grad_norm": 1.5236730575561523, + "learning_rate": 0.00017837030635003032, + "loss": 1.227, + "step": 6616 + }, + { + "epoch": 0.2369688613533404, + "grad_norm": 1.3396612405776978, + "learning_rate": 0.00017836310126230444, + "loss": 1.0722, + "step": 6617 + }, + { + "epoch": 0.23700467348290866, + "grad_norm": 1.5324392318725586, + "learning_rate": 0.00017835589512030413, + "loss": 1.234, + "step": 6618 + }, + { + "epoch": 0.23704048561247695, + "grad_norm": 1.3557580709457397, + "learning_rate": 0.00017834868792412632, + "loss": 1.1227, + "step": 6619 + }, + { + "epoch": 0.23707629774204522, + "grad_norm": 1.6105607748031616, + "learning_rate": 0.00017834147967386797, + "loss": 1.0419, + "step": 6620 + }, + { + "epoch": 0.23711210987161352, + "grad_norm": 1.8046458959579468, + "learning_rate": 0.00017833427036962604, + "loss": 1.1976, + "step": 6621 + }, + { + "epoch": 0.2371479220011818, + "grad_norm": 1.4367218017578125, + "learning_rate": 0.0001783270600114976, + "loss": 1.1073, + "step": 6622 + }, + { + "epoch": 0.23718373413075008, + "grad_norm": 1.6592007875442505, + "learning_rate": 0.00017831984859957955, + "loss": 1.1498, + "step": 6623 + }, + { + "epoch": 0.23721954626031838, + "grad_norm": 1.6921358108520508, + "learning_rate": 0.00017831263613396898, + "loss": 1.3757, + "step": 6624 + }, + { + "epoch": 0.23725535838988665, + "grad_norm": 2.154862880706787, + "learning_rate": 0.0001783054226147629, + "loss": 1.1641, + "step": 6625 + }, + { + "epoch": 0.23729117051945495, + "grad_norm": 1.608426570892334, + "learning_rate": 0.0001782982080420584, + "loss": 1.0344, + "step": 6626 + }, + { + "epoch": 0.23732698264902322, + "grad_norm": 1.533535361289978, + "learning_rate": 0.00017829099241595245, + "loss": 1.2299, + "step": 6627 + }, + { + "epoch": 0.2373627947785915, + "grad_norm": 1.341589093208313, + "learning_rate": 0.00017828377573654225, + "loss": 1.3322, + "step": 6628 + }, + { + "epoch": 0.23739860690815978, + "grad_norm": 1.4667844772338867, + "learning_rate": 0.00017827655800392478, + "loss": 1.3007, + "step": 6629 + }, + { + "epoch": 0.23743441903772808, + "grad_norm": 1.63994300365448, + "learning_rate": 0.00017826933921819723, + "loss": 1.1543, + "step": 6630 + }, + { + "epoch": 0.23747023116729638, + "grad_norm": 1.6657774448394775, + "learning_rate": 0.00017826211937945665, + "loss": 1.4233, + "step": 6631 + }, + { + "epoch": 0.23750604329686464, + "grad_norm": 1.6392691135406494, + "learning_rate": 0.00017825489848780022, + "loss": 1.3093, + "step": 6632 + }, + { + "epoch": 0.23754185542643294, + "grad_norm": 2.2956299781799316, + "learning_rate": 0.00017824767654332505, + "loss": 1.4229, + "step": 6633 + }, + { + "epoch": 0.2375776675560012, + "grad_norm": 1.3744940757751465, + "learning_rate": 0.00017824045354612836, + "loss": 1.0477, + "step": 6634 + }, + { + "epoch": 0.2376134796855695, + "grad_norm": 1.5985265970230103, + "learning_rate": 0.00017823322949630727, + "loss": 1.1964, + "step": 6635 + }, + { + "epoch": 0.23764929181513778, + "grad_norm": 1.628487467765808, + "learning_rate": 0.000178226004393959, + "loss": 1.1042, + "step": 6636 + }, + { + "epoch": 0.23768510394470607, + "grad_norm": 1.3105449676513672, + "learning_rate": 0.0001782187782391807, + "loss": 1.0335, + "step": 6637 + }, + { + "epoch": 0.23772091607427437, + "grad_norm": 2.1579880714416504, + "learning_rate": 0.0001782115510320697, + "loss": 1.379, + "step": 6638 + }, + { + "epoch": 0.23775672820384264, + "grad_norm": 1.5707300901412964, + "learning_rate": 0.00017820432277272313, + "loss": 1.173, + "step": 6639 + }, + { + "epoch": 0.23779254033341093, + "grad_norm": 1.417056918144226, + "learning_rate": 0.00017819709346123826, + "loss": 1.3164, + "step": 6640 + }, + { + "epoch": 0.2378283524629792, + "grad_norm": 1.5211182832717896, + "learning_rate": 0.0001781898630977124, + "loss": 1.0842, + "step": 6641 + }, + { + "epoch": 0.2378641645925475, + "grad_norm": 1.65911066532135, + "learning_rate": 0.00017818263168224276, + "loss": 1.158, + "step": 6642 + }, + { + "epoch": 0.23789997672211577, + "grad_norm": 1.8380882740020752, + "learning_rate": 0.0001781753992149267, + "loss": 1.1746, + "step": 6643 + }, + { + "epoch": 0.23793578885168407, + "grad_norm": 1.2298704385757446, + "learning_rate": 0.00017816816569586144, + "loss": 1.1739, + "step": 6644 + }, + { + "epoch": 0.23797160098125236, + "grad_norm": 1.6242049932479858, + "learning_rate": 0.00017816093112514437, + "loss": 1.1777, + "step": 6645 + }, + { + "epoch": 0.23800741311082063, + "grad_norm": 1.4578819274902344, + "learning_rate": 0.00017815369550287278, + "loss": 1.1063, + "step": 6646 + }, + { + "epoch": 0.23804322524038893, + "grad_norm": 1.253287672996521, + "learning_rate": 0.00017814645882914402, + "loss": 1.2506, + "step": 6647 + }, + { + "epoch": 0.2380790373699572, + "grad_norm": 1.6570844650268555, + "learning_rate": 0.00017813922110405548, + "loss": 1.3524, + "step": 6648 + }, + { + "epoch": 0.2381148494995255, + "grad_norm": 1.2918310165405273, + "learning_rate": 0.00017813198232770447, + "loss": 1.0436, + "step": 6649 + }, + { + "epoch": 0.23815066162909376, + "grad_norm": 1.9056556224822998, + "learning_rate": 0.00017812474250018844, + "loss": 1.1903, + "step": 6650 + }, + { + "epoch": 0.23818647375866206, + "grad_norm": 1.258172631263733, + "learning_rate": 0.00017811750162160478, + "loss": 1.09, + "step": 6651 + }, + { + "epoch": 0.23822228588823036, + "grad_norm": 1.2487553358078003, + "learning_rate": 0.00017811025969205092, + "loss": 1.0176, + "step": 6652 + }, + { + "epoch": 0.23825809801779863, + "grad_norm": 1.5371112823486328, + "learning_rate": 0.00017810301671162426, + "loss": 1.3264, + "step": 6653 + }, + { + "epoch": 0.23829391014736692, + "grad_norm": 1.4076074361801147, + "learning_rate": 0.00017809577268042224, + "loss": 1.2652, + "step": 6654 + }, + { + "epoch": 0.2383297222769352, + "grad_norm": 1.5089248418807983, + "learning_rate": 0.00017808852759854235, + "loss": 1.0968, + "step": 6655 + }, + { + "epoch": 0.2383655344065035, + "grad_norm": 1.2651790380477905, + "learning_rate": 0.00017808128146608204, + "loss": 1.0779, + "step": 6656 + }, + { + "epoch": 0.23840134653607176, + "grad_norm": 1.5546025037765503, + "learning_rate": 0.0001780740342831388, + "loss": 1.1586, + "step": 6657 + }, + { + "epoch": 0.23843715866564005, + "grad_norm": 1.4502668380737305, + "learning_rate": 0.00017806678604981012, + "loss": 0.9819, + "step": 6658 + }, + { + "epoch": 0.23847297079520835, + "grad_norm": 1.633884310722351, + "learning_rate": 0.00017805953676619356, + "loss": 1.2202, + "step": 6659 + }, + { + "epoch": 0.23850878292477662, + "grad_norm": 1.7264028787612915, + "learning_rate": 0.00017805228643238662, + "loss": 1.1898, + "step": 6660 + }, + { + "epoch": 0.23854459505434492, + "grad_norm": 1.9910950660705566, + "learning_rate": 0.00017804503504848684, + "loss": 1.147, + "step": 6661 + }, + { + "epoch": 0.23858040718391318, + "grad_norm": 1.9079251289367676, + "learning_rate": 0.00017803778261459181, + "loss": 1.2128, + "step": 6662 + }, + { + "epoch": 0.23861621931348148, + "grad_norm": 2.0574920177459717, + "learning_rate": 0.00017803052913079905, + "loss": 1.0654, + "step": 6663 + }, + { + "epoch": 0.23865203144304975, + "grad_norm": 1.2321815490722656, + "learning_rate": 0.0001780232745972062, + "loss": 1.0709, + "step": 6664 + }, + { + "epoch": 0.23868784357261805, + "grad_norm": 1.6169469356536865, + "learning_rate": 0.00017801601901391078, + "loss": 1.1478, + "step": 6665 + }, + { + "epoch": 0.23872365570218634, + "grad_norm": 1.4925166368484497, + "learning_rate": 0.0001780087623810105, + "loss": 1.3663, + "step": 6666 + }, + { + "epoch": 0.2387594678317546, + "grad_norm": 1.366723656654358, + "learning_rate": 0.00017800150469860293, + "loss": 1.2678, + "step": 6667 + }, + { + "epoch": 0.2387952799613229, + "grad_norm": 1.32651948928833, + "learning_rate": 0.00017799424596678573, + "loss": 1.209, + "step": 6668 + }, + { + "epoch": 0.23883109209089118, + "grad_norm": 1.5348175764083862, + "learning_rate": 0.00017798698618565653, + "loss": 1.1536, + "step": 6669 + }, + { + "epoch": 0.23886690422045948, + "grad_norm": 1.4471558332443237, + "learning_rate": 0.0001779797253553131, + "loss": 1.3862, + "step": 6670 + }, + { + "epoch": 0.23890271635002774, + "grad_norm": 1.7074192762374878, + "learning_rate": 0.000177972463475853, + "loss": 1.2583, + "step": 6671 + }, + { + "epoch": 0.23893852847959604, + "grad_norm": 1.6374374628067017, + "learning_rate": 0.000177965200547374, + "loss": 1.3237, + "step": 6672 + }, + { + "epoch": 0.23897434060916434, + "grad_norm": 2.0027976036071777, + "learning_rate": 0.00017795793656997377, + "loss": 1.1387, + "step": 6673 + }, + { + "epoch": 0.2390101527387326, + "grad_norm": 1.5807641744613647, + "learning_rate": 0.00017795067154375007, + "loss": 1.1226, + "step": 6674 + }, + { + "epoch": 0.2390459648683009, + "grad_norm": 1.3983135223388672, + "learning_rate": 0.00017794340546880064, + "loss": 1.1775, + "step": 6675 + }, + { + "epoch": 0.23908177699786917, + "grad_norm": 1.740599513053894, + "learning_rate": 0.00017793613834522326, + "loss": 1.1592, + "step": 6676 + }, + { + "epoch": 0.23911758912743747, + "grad_norm": 1.2650352716445923, + "learning_rate": 0.0001779288701731156, + "loss": 1.2283, + "step": 6677 + }, + { + "epoch": 0.23915340125700574, + "grad_norm": 2.3034915924072266, + "learning_rate": 0.00017792160095257556, + "loss": 1.2086, + "step": 6678 + }, + { + "epoch": 0.23918921338657403, + "grad_norm": 1.2608919143676758, + "learning_rate": 0.00017791433068370087, + "loss": 1.0904, + "step": 6679 + }, + { + "epoch": 0.23922502551614233, + "grad_norm": 1.6857128143310547, + "learning_rate": 0.00017790705936658938, + "loss": 0.9787, + "step": 6680 + }, + { + "epoch": 0.2392608376457106, + "grad_norm": 1.7556869983673096, + "learning_rate": 0.00017789978700133888, + "loss": 1.4086, + "step": 6681 + }, + { + "epoch": 0.2392966497752789, + "grad_norm": 1.6927303075790405, + "learning_rate": 0.00017789251358804725, + "loss": 1.1104, + "step": 6682 + }, + { + "epoch": 0.23933246190484717, + "grad_norm": 1.376960277557373, + "learning_rate": 0.00017788523912681231, + "loss": 1.2014, + "step": 6683 + }, + { + "epoch": 0.23936827403441546, + "grad_norm": 1.7035386562347412, + "learning_rate": 0.00017787796361773197, + "loss": 1.0708, + "step": 6684 + }, + { + "epoch": 0.23940408616398373, + "grad_norm": 1.3369581699371338, + "learning_rate": 0.00017787068706090405, + "loss": 1.2026, + "step": 6685 + }, + { + "epoch": 0.23943989829355203, + "grad_norm": 1.684841513633728, + "learning_rate": 0.0001778634094564265, + "loss": 1.0864, + "step": 6686 + }, + { + "epoch": 0.2394757104231203, + "grad_norm": 1.5815086364746094, + "learning_rate": 0.0001778561308043972, + "loss": 1.1298, + "step": 6687 + }, + { + "epoch": 0.2395115225526886, + "grad_norm": 1.8305387496948242, + "learning_rate": 0.00017784885110491412, + "loss": 1.3066, + "step": 6688 + }, + { + "epoch": 0.2395473346822569, + "grad_norm": 1.3473076820373535, + "learning_rate": 0.00017784157035807515, + "loss": 1.191, + "step": 6689 + }, + { + "epoch": 0.23958314681182516, + "grad_norm": 1.5315905809402466, + "learning_rate": 0.00017783428856397825, + "loss": 1.1041, + "step": 6690 + }, + { + "epoch": 0.23961895894139346, + "grad_norm": 1.4599074125289917, + "learning_rate": 0.00017782700572272137, + "loss": 1.1743, + "step": 6691 + }, + { + "epoch": 0.23965477107096173, + "grad_norm": 1.6862092018127441, + "learning_rate": 0.00017781972183440254, + "loss": 1.1732, + "step": 6692 + }, + { + "epoch": 0.23969058320053002, + "grad_norm": 1.635854959487915, + "learning_rate": 0.00017781243689911973, + "loss": 1.1689, + "step": 6693 + }, + { + "epoch": 0.2397263953300983, + "grad_norm": 1.3785161972045898, + "learning_rate": 0.00017780515091697096, + "loss": 1.2588, + "step": 6694 + }, + { + "epoch": 0.2397622074596666, + "grad_norm": 1.5527567863464355, + "learning_rate": 0.00017779786388805424, + "loss": 1.0334, + "step": 6695 + }, + { + "epoch": 0.23979801958923488, + "grad_norm": 1.3528339862823486, + "learning_rate": 0.00017779057581246763, + "loss": 1.1562, + "step": 6696 + }, + { + "epoch": 0.23983383171880315, + "grad_norm": 1.669845461845398, + "learning_rate": 0.00017778328669030918, + "loss": 1.5751, + "step": 6697 + }, + { + "epoch": 0.23986964384837145, + "grad_norm": 1.4239513874053955, + "learning_rate": 0.0001777759965216769, + "loss": 1.1414, + "step": 6698 + }, + { + "epoch": 0.23990545597793972, + "grad_norm": 1.6687391996383667, + "learning_rate": 0.0001777687053066689, + "loss": 1.2733, + "step": 6699 + }, + { + "epoch": 0.23994126810750802, + "grad_norm": 1.2837566137313843, + "learning_rate": 0.00017776141304538332, + "loss": 1.1087, + "step": 6700 + }, + { + "epoch": 0.23997708023707628, + "grad_norm": 1.645704746246338, + "learning_rate": 0.00017775411973791822, + "loss": 1.2597, + "step": 6701 + }, + { + "epoch": 0.24001289236664458, + "grad_norm": 2.0076279640197754, + "learning_rate": 0.00017774682538437175, + "loss": 1.163, + "step": 6702 + }, + { + "epoch": 0.24004870449621288, + "grad_norm": 1.678077220916748, + "learning_rate": 0.00017773952998484204, + "loss": 1.2174, + "step": 6703 + }, + { + "epoch": 0.24008451662578115, + "grad_norm": 1.7188485860824585, + "learning_rate": 0.0001777322335394272, + "loss": 1.3137, + "step": 6704 + }, + { + "epoch": 0.24012032875534944, + "grad_norm": 2.005155563354492, + "learning_rate": 0.00017772493604822543, + "loss": 1.1958, + "step": 6705 + }, + { + "epoch": 0.2401561408849177, + "grad_norm": 1.5914640426635742, + "learning_rate": 0.00017771763751133488, + "loss": 1.215, + "step": 6706 + }, + { + "epoch": 0.240191953014486, + "grad_norm": 1.1758989095687866, + "learning_rate": 0.0001777103379288538, + "loss": 1.0616, + "step": 6707 + }, + { + "epoch": 0.24022776514405428, + "grad_norm": 1.58502995967865, + "learning_rate": 0.00017770303730088035, + "loss": 1.1797, + "step": 6708 + }, + { + "epoch": 0.24026357727362257, + "grad_norm": 2.367452621459961, + "learning_rate": 0.00017769573562751275, + "loss": 1.1998, + "step": 6709 + }, + { + "epoch": 0.24029938940319087, + "grad_norm": 1.6662729978561401, + "learning_rate": 0.0001776884329088493, + "loss": 1.2903, + "step": 6710 + }, + { + "epoch": 0.24033520153275914, + "grad_norm": 1.375241756439209, + "learning_rate": 0.00017768112914498817, + "loss": 1.001, + "step": 6711 + }, + { + "epoch": 0.24037101366232744, + "grad_norm": 1.5477970838546753, + "learning_rate": 0.00017767382433602762, + "loss": 1.0649, + "step": 6712 + }, + { + "epoch": 0.2404068257918957, + "grad_norm": 1.9034072160720825, + "learning_rate": 0.00017766651848206597, + "loss": 1.3909, + "step": 6713 + }, + { + "epoch": 0.240442637921464, + "grad_norm": 1.4506175518035889, + "learning_rate": 0.00017765921158320152, + "loss": 1.2023, + "step": 6714 + }, + { + "epoch": 0.24047845005103227, + "grad_norm": 1.5316356420516968, + "learning_rate": 0.00017765190363953253, + "loss": 0.8989, + "step": 6715 + }, + { + "epoch": 0.24051426218060057, + "grad_norm": 1.7360984086990356, + "learning_rate": 0.00017764459465115736, + "loss": 1.291, + "step": 6716 + }, + { + "epoch": 0.24055007431016887, + "grad_norm": 1.6217504739761353, + "learning_rate": 0.0001776372846181743, + "loss": 1.1416, + "step": 6717 + }, + { + "epoch": 0.24058588643973713, + "grad_norm": 1.9607553482055664, + "learning_rate": 0.00017762997354068172, + "loss": 1.4013, + "step": 6718 + }, + { + "epoch": 0.24062169856930543, + "grad_norm": 1.7733148336410522, + "learning_rate": 0.00017762266141877796, + "loss": 1.071, + "step": 6719 + }, + { + "epoch": 0.2406575106988737, + "grad_norm": 1.8119690418243408, + "learning_rate": 0.00017761534825256144, + "loss": 1.1406, + "step": 6720 + }, + { + "epoch": 0.240693322828442, + "grad_norm": 1.3359935283660889, + "learning_rate": 0.00017760803404213052, + "loss": 0.916, + "step": 6721 + }, + { + "epoch": 0.24072913495801027, + "grad_norm": 1.979259967803955, + "learning_rate": 0.00017760071878758363, + "loss": 1.1253, + "step": 6722 + }, + { + "epoch": 0.24076494708757856, + "grad_norm": 1.684201717376709, + "learning_rate": 0.00017759340248901917, + "loss": 1.2174, + "step": 6723 + }, + { + "epoch": 0.24080075921714686, + "grad_norm": 1.4150118827819824, + "learning_rate": 0.00017758608514653555, + "loss": 1.0637, + "step": 6724 + }, + { + "epoch": 0.24083657134671513, + "grad_norm": 1.4768247604370117, + "learning_rate": 0.00017757876676023125, + "loss": 1.1627, + "step": 6725 + }, + { + "epoch": 0.24087238347628342, + "grad_norm": 1.6312659978866577, + "learning_rate": 0.0001775714473302047, + "loss": 1.1623, + "step": 6726 + }, + { + "epoch": 0.2409081956058517, + "grad_norm": 1.3278111219406128, + "learning_rate": 0.0001775641268565544, + "loss": 1.2097, + "step": 6727 + }, + { + "epoch": 0.24094400773542, + "grad_norm": 2.1430563926696777, + "learning_rate": 0.0001775568053393788, + "loss": 1.2887, + "step": 6728 + }, + { + "epoch": 0.24097981986498826, + "grad_norm": 1.5869516134262085, + "learning_rate": 0.00017754948277877642, + "loss": 1.1969, + "step": 6729 + }, + { + "epoch": 0.24101563199455656, + "grad_norm": 1.7527828216552734, + "learning_rate": 0.0001775421591748458, + "loss": 1.1335, + "step": 6730 + }, + { + "epoch": 0.24105144412412485, + "grad_norm": 1.408781886100769, + "learning_rate": 0.00017753483452768545, + "loss": 1.154, + "step": 6731 + }, + { + "epoch": 0.24108725625369312, + "grad_norm": 1.9065208435058594, + "learning_rate": 0.0001775275088373939, + "loss": 1.2306, + "step": 6732 + }, + { + "epoch": 0.24112306838326142, + "grad_norm": 1.7392830848693848, + "learning_rate": 0.00017752018210406972, + "loss": 1.2198, + "step": 6733 + }, + { + "epoch": 0.2411588805128297, + "grad_norm": 1.3293102979660034, + "learning_rate": 0.00017751285432781152, + "loss": 1.1174, + "step": 6734 + }, + { + "epoch": 0.24119469264239798, + "grad_norm": 1.5690313577651978, + "learning_rate": 0.00017750552550871782, + "loss": 1.1538, + "step": 6735 + }, + { + "epoch": 0.24123050477196625, + "grad_norm": 1.510910153388977, + "learning_rate": 0.00017749819564688725, + "loss": 0.9653, + "step": 6736 + }, + { + "epoch": 0.24126631690153455, + "grad_norm": 1.2111831903457642, + "learning_rate": 0.00017749086474241844, + "loss": 1.1555, + "step": 6737 + }, + { + "epoch": 0.24130212903110285, + "grad_norm": 1.8610153198242188, + "learning_rate": 0.00017748353279540999, + "loss": 1.2458, + "step": 6738 + }, + { + "epoch": 0.24133794116067112, + "grad_norm": 1.6111321449279785, + "learning_rate": 0.00017747619980596055, + "loss": 1.2727, + "step": 6739 + }, + { + "epoch": 0.2413737532902394, + "grad_norm": 2.2168684005737305, + "learning_rate": 0.00017746886577416876, + "loss": 1.1636, + "step": 6740 + }, + { + "epoch": 0.24140956541980768, + "grad_norm": 1.5907087326049805, + "learning_rate": 0.00017746153070013335, + "loss": 1.3871, + "step": 6741 + }, + { + "epoch": 0.24144537754937598, + "grad_norm": 1.432680368423462, + "learning_rate": 0.00017745419458395294, + "loss": 1.2631, + "step": 6742 + }, + { + "epoch": 0.24148118967894425, + "grad_norm": 1.6417070627212524, + "learning_rate": 0.00017744685742572625, + "loss": 1.3854, + "step": 6743 + }, + { + "epoch": 0.24151700180851254, + "grad_norm": 1.8828281164169312, + "learning_rate": 0.000177439519225552, + "loss": 1.2555, + "step": 6744 + }, + { + "epoch": 0.24155281393808084, + "grad_norm": 1.516442894935608, + "learning_rate": 0.0001774321799835289, + "loss": 1.2099, + "step": 6745 + }, + { + "epoch": 0.2415886260676491, + "grad_norm": 1.5112133026123047, + "learning_rate": 0.00017742483969975572, + "loss": 1.1494, + "step": 6746 + }, + { + "epoch": 0.2416244381972174, + "grad_norm": 1.3810558319091797, + "learning_rate": 0.00017741749837433117, + "loss": 1.2539, + "step": 6747 + }, + { + "epoch": 0.24166025032678567, + "grad_norm": 1.59540593624115, + "learning_rate": 0.00017741015600735403, + "loss": 1.1651, + "step": 6748 + }, + { + "epoch": 0.24169606245635397, + "grad_norm": 1.538267731666565, + "learning_rate": 0.0001774028125989231, + "loss": 1.2712, + "step": 6749 + }, + { + "epoch": 0.24173187458592224, + "grad_norm": 1.8449903726577759, + "learning_rate": 0.00017739546814913722, + "loss": 1.2744, + "step": 6750 + }, + { + "epoch": 0.24176768671549054, + "grad_norm": 1.1666584014892578, + "learning_rate": 0.00017738812265809508, + "loss": 1.2101, + "step": 6751 + }, + { + "epoch": 0.24180349884505883, + "grad_norm": 1.5145328044891357, + "learning_rate": 0.0001773807761258956, + "loss": 1.1185, + "step": 6752 + }, + { + "epoch": 0.2418393109746271, + "grad_norm": 1.935210108757019, + "learning_rate": 0.0001773734285526376, + "loss": 1.1671, + "step": 6753 + }, + { + "epoch": 0.2418751231041954, + "grad_norm": 1.3492019176483154, + "learning_rate": 0.0001773660799384199, + "loss": 1.1551, + "step": 6754 + }, + { + "epoch": 0.24191093523376367, + "grad_norm": 1.6698315143585205, + "learning_rate": 0.0001773587302833414, + "loss": 0.9628, + "step": 6755 + }, + { + "epoch": 0.24194674736333197, + "grad_norm": 1.5143558979034424, + "learning_rate": 0.000177351379587501, + "loss": 1.0116, + "step": 6756 + }, + { + "epoch": 0.24198255949290023, + "grad_norm": 1.188547134399414, + "learning_rate": 0.0001773440278509975, + "loss": 1.2512, + "step": 6757 + }, + { + "epoch": 0.24201837162246853, + "grad_norm": 1.6219719648361206, + "learning_rate": 0.00017733667507392991, + "loss": 1.1958, + "step": 6758 + }, + { + "epoch": 0.24205418375203683, + "grad_norm": 1.7092152833938599, + "learning_rate": 0.00017732932125639713, + "loss": 1.1706, + "step": 6759 + }, + { + "epoch": 0.2420899958816051, + "grad_norm": 1.5511833429336548, + "learning_rate": 0.00017732196639849804, + "loss": 1.2562, + "step": 6760 + }, + { + "epoch": 0.2421258080111734, + "grad_norm": 1.4424821138381958, + "learning_rate": 0.0001773146105003317, + "loss": 1.1302, + "step": 6761 + }, + { + "epoch": 0.24216162014074166, + "grad_norm": 1.194706678390503, + "learning_rate": 0.00017730725356199692, + "loss": 1.2092, + "step": 6762 + }, + { + "epoch": 0.24219743227030996, + "grad_norm": 2.171982765197754, + "learning_rate": 0.0001772998955835928, + "loss": 1.2729, + "step": 6763 + }, + { + "epoch": 0.24223324439987823, + "grad_norm": 1.8078298568725586, + "learning_rate": 0.00017729253656521832, + "loss": 1.0654, + "step": 6764 + }, + { + "epoch": 0.24226905652944652, + "grad_norm": 1.4816991090774536, + "learning_rate": 0.00017728517650697243, + "loss": 1.2405, + "step": 6765 + }, + { + "epoch": 0.24230486865901482, + "grad_norm": 2.0018093585968018, + "learning_rate": 0.0001772778154089542, + "loss": 1.2201, + "step": 6766 + }, + { + "epoch": 0.2423406807885831, + "grad_norm": 1.5105581283569336, + "learning_rate": 0.0001772704532712626, + "loss": 1.2577, + "step": 6767 + }, + { + "epoch": 0.2423764929181514, + "grad_norm": 1.699346661567688, + "learning_rate": 0.00017726309009399676, + "loss": 1.2949, + "step": 6768 + }, + { + "epoch": 0.24241230504771966, + "grad_norm": 1.6380810737609863, + "learning_rate": 0.0001772557258772557, + "loss": 1.2059, + "step": 6769 + }, + { + "epoch": 0.24244811717728795, + "grad_norm": 1.4893676042556763, + "learning_rate": 0.0001772483606211385, + "loss": 1.1996, + "step": 6770 + }, + { + "epoch": 0.24248392930685622, + "grad_norm": 1.186768889427185, + "learning_rate": 0.00017724099432574425, + "loss": 1.1641, + "step": 6771 + }, + { + "epoch": 0.24251974143642452, + "grad_norm": 1.7309666872024536, + "learning_rate": 0.00017723362699117206, + "loss": 1.415, + "step": 6772 + }, + { + "epoch": 0.24255555356599282, + "grad_norm": 1.9619293212890625, + "learning_rate": 0.00017722625861752103, + "loss": 1.2764, + "step": 6773 + }, + { + "epoch": 0.24259136569556108, + "grad_norm": 1.419368863105774, + "learning_rate": 0.0001772188892048903, + "loss": 1.1357, + "step": 6774 + }, + { + "epoch": 0.24262717782512938, + "grad_norm": 2.3950774669647217, + "learning_rate": 0.00017721151875337907, + "loss": 1.4993, + "step": 6775 + }, + { + "epoch": 0.24266298995469765, + "grad_norm": 1.389258623123169, + "learning_rate": 0.00017720414726308642, + "loss": 1.1647, + "step": 6776 + }, + { + "epoch": 0.24269880208426595, + "grad_norm": 1.254736304283142, + "learning_rate": 0.00017719677473411154, + "loss": 0.9885, + "step": 6777 + }, + { + "epoch": 0.24273461421383422, + "grad_norm": 1.4066237211227417, + "learning_rate": 0.00017718940116655363, + "loss": 1.3538, + "step": 6778 + }, + { + "epoch": 0.2427704263434025, + "grad_norm": 1.5529917478561401, + "learning_rate": 0.00017718202656051194, + "loss": 1.3441, + "step": 6779 + }, + { + "epoch": 0.2428062384729708, + "grad_norm": 1.4387781620025635, + "learning_rate": 0.0001771746509160856, + "loss": 1.0036, + "step": 6780 + }, + { + "epoch": 0.24284205060253908, + "grad_norm": 1.4135140180587769, + "learning_rate": 0.00017716727423337388, + "loss": 1.0758, + "step": 6781 + }, + { + "epoch": 0.24287786273210737, + "grad_norm": 1.5998225212097168, + "learning_rate": 0.00017715989651247602, + "loss": 1.2097, + "step": 6782 + }, + { + "epoch": 0.24291367486167564, + "grad_norm": 1.7434452772140503, + "learning_rate": 0.0001771525177534913, + "loss": 1.1806, + "step": 6783 + }, + { + "epoch": 0.24294948699124394, + "grad_norm": 1.3116443157196045, + "learning_rate": 0.00017714513795651898, + "loss": 1.1679, + "step": 6784 + }, + { + "epoch": 0.2429852991208122, + "grad_norm": 1.3251231908798218, + "learning_rate": 0.00017713775712165832, + "loss": 1.2792, + "step": 6785 + }, + { + "epoch": 0.2430211112503805, + "grad_norm": 1.54373300075531, + "learning_rate": 0.00017713037524900863, + "loss": 1.2182, + "step": 6786 + }, + { + "epoch": 0.24305692337994877, + "grad_norm": 1.4152860641479492, + "learning_rate": 0.00017712299233866923, + "loss": 1.1247, + "step": 6787 + }, + { + "epoch": 0.24309273550951707, + "grad_norm": 1.8330579996109009, + "learning_rate": 0.0001771156083907395, + "loss": 1.3099, + "step": 6788 + }, + { + "epoch": 0.24312854763908537, + "grad_norm": 1.2336864471435547, + "learning_rate": 0.0001771082234053187, + "loss": 1.2784, + "step": 6789 + }, + { + "epoch": 0.24316435976865364, + "grad_norm": 1.7373096942901611, + "learning_rate": 0.0001771008373825062, + "loss": 1.0707, + "step": 6790 + }, + { + "epoch": 0.24320017189822193, + "grad_norm": 1.6842570304870605, + "learning_rate": 0.0001770934503224014, + "loss": 1.1078, + "step": 6791 + }, + { + "epoch": 0.2432359840277902, + "grad_norm": 1.4983638525009155, + "learning_rate": 0.00017708606222510367, + "loss": 1.0072, + "step": 6792 + }, + { + "epoch": 0.2432717961573585, + "grad_norm": 1.9205924272537231, + "learning_rate": 0.0001770786730907124, + "loss": 1.3536, + "step": 6793 + }, + { + "epoch": 0.24330760828692677, + "grad_norm": 1.4661051034927368, + "learning_rate": 0.00017707128291932702, + "loss": 1.1321, + "step": 6794 + }, + { + "epoch": 0.24334342041649507, + "grad_norm": 2.2797605991363525, + "learning_rate": 0.00017706389171104694, + "loss": 1.3445, + "step": 6795 + }, + { + "epoch": 0.24337923254606336, + "grad_norm": 2.002254009246826, + "learning_rate": 0.00017705649946597157, + "loss": 1.2672, + "step": 6796 + }, + { + "epoch": 0.24341504467563163, + "grad_norm": 1.369136929512024, + "learning_rate": 0.00017704910618420044, + "loss": 1.1553, + "step": 6797 + }, + { + "epoch": 0.24345085680519993, + "grad_norm": 1.635637879371643, + "learning_rate": 0.00017704171186583295, + "loss": 1.2852, + "step": 6798 + }, + { + "epoch": 0.2434866689347682, + "grad_norm": 1.5871466398239136, + "learning_rate": 0.00017703431651096862, + "loss": 1.4036, + "step": 6799 + }, + { + "epoch": 0.2435224810643365, + "grad_norm": 2.1505110263824463, + "learning_rate": 0.00017702692011970693, + "loss": 1.0506, + "step": 6800 + }, + { + "epoch": 0.24355829319390476, + "grad_norm": 1.6885453462600708, + "learning_rate": 0.00017701952269214737, + "loss": 1.1705, + "step": 6801 + }, + { + "epoch": 0.24359410532347306, + "grad_norm": 1.539516568183899, + "learning_rate": 0.00017701212422838948, + "loss": 1.1624, + "step": 6802 + }, + { + "epoch": 0.24362991745304136, + "grad_norm": 1.5745352506637573, + "learning_rate": 0.00017700472472853283, + "loss": 1.2621, + "step": 6803 + }, + { + "epoch": 0.24366572958260962, + "grad_norm": 1.391173005104065, + "learning_rate": 0.00017699732419267688, + "loss": 1.079, + "step": 6804 + }, + { + "epoch": 0.24370154171217792, + "grad_norm": 1.8522040843963623, + "learning_rate": 0.0001769899226209213, + "loss": 0.9527, + "step": 6805 + }, + { + "epoch": 0.2437373538417462, + "grad_norm": 1.754431962966919, + "learning_rate": 0.0001769825200133656, + "loss": 1.2855, + "step": 6806 + }, + { + "epoch": 0.2437731659713145, + "grad_norm": 1.1170536279678345, + "learning_rate": 0.00017697511637010938, + "loss": 0.9627, + "step": 6807 + }, + { + "epoch": 0.24380897810088276, + "grad_norm": 1.9336646795272827, + "learning_rate": 0.0001769677116912523, + "loss": 1.1771, + "step": 6808 + }, + { + "epoch": 0.24384479023045105, + "grad_norm": 1.448656439781189, + "learning_rate": 0.00017696030597689393, + "loss": 1.2062, + "step": 6809 + }, + { + "epoch": 0.24388060236001935, + "grad_norm": 1.806641936302185, + "learning_rate": 0.00017695289922713389, + "loss": 1.0576, + "step": 6810 + }, + { + "epoch": 0.24391641448958762, + "grad_norm": 1.7021892070770264, + "learning_rate": 0.00017694549144207185, + "loss": 1.3378, + "step": 6811 + }, + { + "epoch": 0.24395222661915592, + "grad_norm": 1.5133734941482544, + "learning_rate": 0.0001769380826218075, + "loss": 1.2964, + "step": 6812 + }, + { + "epoch": 0.24398803874872418, + "grad_norm": 1.5824968814849854, + "learning_rate": 0.00017693067276644049, + "loss": 1.212, + "step": 6813 + }, + { + "epoch": 0.24402385087829248, + "grad_norm": 1.717682123184204, + "learning_rate": 0.00017692326187607052, + "loss": 1.1169, + "step": 6814 + }, + { + "epoch": 0.24405966300786075, + "grad_norm": 1.192223072052002, + "learning_rate": 0.00017691584995079725, + "loss": 0.9687, + "step": 6815 + }, + { + "epoch": 0.24409547513742905, + "grad_norm": 1.4196245670318604, + "learning_rate": 0.00017690843699072045, + "loss": 1.3182, + "step": 6816 + }, + { + "epoch": 0.24413128726699734, + "grad_norm": 1.5555247068405151, + "learning_rate": 0.00017690102299593985, + "loss": 1.3178, + "step": 6817 + }, + { + "epoch": 0.2441670993965656, + "grad_norm": 1.4039623737335205, + "learning_rate": 0.00017689360796655515, + "loss": 1.1561, + "step": 6818 + }, + { + "epoch": 0.2442029115261339, + "grad_norm": 1.446697473526001, + "learning_rate": 0.00017688619190266616, + "loss": 1.2926, + "step": 6819 + }, + { + "epoch": 0.24423872365570218, + "grad_norm": 1.9138054847717285, + "learning_rate": 0.00017687877480437262, + "loss": 1.1603, + "step": 6820 + }, + { + "epoch": 0.24427453578527047, + "grad_norm": 1.4130088090896606, + "learning_rate": 0.00017687135667177436, + "loss": 1.3251, + "step": 6821 + }, + { + "epoch": 0.24431034791483874, + "grad_norm": 1.8042715787887573, + "learning_rate": 0.00017686393750497112, + "loss": 1.1302, + "step": 6822 + }, + { + "epoch": 0.24434616004440704, + "grad_norm": 1.7331831455230713, + "learning_rate": 0.0001768565173040628, + "loss": 0.9652, + "step": 6823 + }, + { + "epoch": 0.24438197217397534, + "grad_norm": 1.7895381450653076, + "learning_rate": 0.0001768490960691491, + "loss": 1.2262, + "step": 6824 + }, + { + "epoch": 0.2444177843035436, + "grad_norm": 1.1704381704330444, + "learning_rate": 0.00017684167380033002, + "loss": 1.157, + "step": 6825 + }, + { + "epoch": 0.2444535964331119, + "grad_norm": 1.8827852010726929, + "learning_rate": 0.00017683425049770527, + "loss": 1.3188, + "step": 6826 + }, + { + "epoch": 0.24448940856268017, + "grad_norm": 1.499713659286499, + "learning_rate": 0.00017682682616137484, + "loss": 1.1753, + "step": 6827 + }, + { + "epoch": 0.24452522069224847, + "grad_norm": 1.5482592582702637, + "learning_rate": 0.00017681940079143855, + "loss": 1.1906, + "step": 6828 + }, + { + "epoch": 0.24456103282181674, + "grad_norm": 1.9279130697250366, + "learning_rate": 0.0001768119743879963, + "loss": 1.1001, + "step": 6829 + }, + { + "epoch": 0.24459684495138503, + "grad_norm": 1.4255386590957642, + "learning_rate": 0.00017680454695114802, + "loss": 1.3324, + "step": 6830 + }, + { + "epoch": 0.24463265708095333, + "grad_norm": 1.3060718774795532, + "learning_rate": 0.00017679711848099362, + "loss": 1.0955, + "step": 6831 + }, + { + "epoch": 0.2446684692105216, + "grad_norm": 1.931302785873413, + "learning_rate": 0.0001767896889776331, + "loss": 1.352, + "step": 6832 + }, + { + "epoch": 0.2447042813400899, + "grad_norm": 1.5580946207046509, + "learning_rate": 0.00017678225844116628, + "loss": 1.1577, + "step": 6833 + }, + { + "epoch": 0.24474009346965817, + "grad_norm": 1.502519130706787, + "learning_rate": 0.00017677482687169328, + "loss": 1.2605, + "step": 6834 + }, + { + "epoch": 0.24477590559922646, + "grad_norm": 1.3734053373336792, + "learning_rate": 0.000176767394269314, + "loss": 1.1992, + "step": 6835 + }, + { + "epoch": 0.24481171772879473, + "grad_norm": 1.3164204359054565, + "learning_rate": 0.00017675996063412844, + "loss": 1.1568, + "step": 6836 + }, + { + "epoch": 0.24484752985836303, + "grad_norm": 1.50732421875, + "learning_rate": 0.00017675252596623665, + "loss": 1.2509, + "step": 6837 + }, + { + "epoch": 0.24488334198793132, + "grad_norm": 1.725959062576294, + "learning_rate": 0.00017674509026573864, + "loss": 1.349, + "step": 6838 + }, + { + "epoch": 0.2449191541174996, + "grad_norm": 1.4366824626922607, + "learning_rate": 0.00017673765353273438, + "loss": 1.17, + "step": 6839 + }, + { + "epoch": 0.2449549662470679, + "grad_norm": 1.7110153436660767, + "learning_rate": 0.00017673021576732404, + "loss": 1.2498, + "step": 6840 + }, + { + "epoch": 0.24499077837663616, + "grad_norm": 1.9927059412002563, + "learning_rate": 0.00017672277696960756, + "loss": 1.0209, + "step": 6841 + }, + { + "epoch": 0.24502659050620446, + "grad_norm": 1.4345731735229492, + "learning_rate": 0.0001767153371396851, + "loss": 1.2744, + "step": 6842 + }, + { + "epoch": 0.24506240263577272, + "grad_norm": 1.3287636041641235, + "learning_rate": 0.00017670789627765676, + "loss": 1.2237, + "step": 6843 + }, + { + "epoch": 0.24509821476534102, + "grad_norm": 1.7391217947006226, + "learning_rate": 0.0001767004543836226, + "loss": 1.2542, + "step": 6844 + }, + { + "epoch": 0.24513402689490932, + "grad_norm": 1.3967113494873047, + "learning_rate": 0.00017669301145768277, + "loss": 1.362, + "step": 6845 + }, + { + "epoch": 0.2451698390244776, + "grad_norm": 1.6437686681747437, + "learning_rate": 0.0001766855674999374, + "loss": 1.129, + "step": 6846 + }, + { + "epoch": 0.24520565115404588, + "grad_norm": 1.647951364517212, + "learning_rate": 0.00017667812251048664, + "loss": 1.1294, + "step": 6847 + }, + { + "epoch": 0.24524146328361415, + "grad_norm": 1.7916377782821655, + "learning_rate": 0.00017667067648943064, + "loss": 1.4493, + "step": 6848 + }, + { + "epoch": 0.24527727541318245, + "grad_norm": 1.3997303247451782, + "learning_rate": 0.00017666322943686957, + "loss": 1.2516, + "step": 6849 + }, + { + "epoch": 0.24531308754275072, + "grad_norm": 1.9569077491760254, + "learning_rate": 0.00017665578135290364, + "loss": 1.0581, + "step": 6850 + }, + { + "epoch": 0.24534889967231902, + "grad_norm": 2.1476516723632812, + "learning_rate": 0.00017664833223763306, + "loss": 1.2392, + "step": 6851 + }, + { + "epoch": 0.2453847118018873, + "grad_norm": 1.7473291158676147, + "learning_rate": 0.00017664088209115805, + "loss": 1.2241, + "step": 6852 + }, + { + "epoch": 0.24542052393145558, + "grad_norm": 1.6306310892105103, + "learning_rate": 0.00017663343091357881, + "loss": 1.4449, + "step": 6853 + }, + { + "epoch": 0.24545633606102388, + "grad_norm": 1.4273197650909424, + "learning_rate": 0.00017662597870499562, + "loss": 1.135, + "step": 6854 + }, + { + "epoch": 0.24549214819059215, + "grad_norm": 1.4953649044036865, + "learning_rate": 0.00017661852546550875, + "loss": 1.2044, + "step": 6855 + }, + { + "epoch": 0.24552796032016044, + "grad_norm": 1.3661226034164429, + "learning_rate": 0.00017661107119521842, + "loss": 1.2612, + "step": 6856 + }, + { + "epoch": 0.2455637724497287, + "grad_norm": 1.7001441717147827, + "learning_rate": 0.00017660361589422497, + "loss": 1.0029, + "step": 6857 + }, + { + "epoch": 0.245599584579297, + "grad_norm": 1.4277349710464478, + "learning_rate": 0.00017659615956262865, + "loss": 1.3527, + "step": 6858 + }, + { + "epoch": 0.2456353967088653, + "grad_norm": 1.2881057262420654, + "learning_rate": 0.00017658870220052983, + "loss": 1.0551, + "step": 6859 + }, + { + "epoch": 0.24567120883843357, + "grad_norm": 1.4978264570236206, + "learning_rate": 0.00017658124380802882, + "loss": 1.0749, + "step": 6860 + }, + { + "epoch": 0.24570702096800187, + "grad_norm": 1.459607481956482, + "learning_rate": 0.00017657378438522593, + "loss": 1.25, + "step": 6861 + }, + { + "epoch": 0.24574283309757014, + "grad_norm": 1.797658920288086, + "learning_rate": 0.00017656632393222156, + "loss": 1.1328, + "step": 6862 + }, + { + "epoch": 0.24577864522713844, + "grad_norm": 1.9255379438400269, + "learning_rate": 0.00017655886244911603, + "loss": 1.2797, + "step": 6863 + }, + { + "epoch": 0.2458144573567067, + "grad_norm": 1.6403851509094238, + "learning_rate": 0.00017655139993600982, + "loss": 1.0078, + "step": 6864 + }, + { + "epoch": 0.245850269486275, + "grad_norm": 1.3189709186553955, + "learning_rate": 0.0001765439363930032, + "loss": 1.1323, + "step": 6865 + }, + { + "epoch": 0.2458860816158433, + "grad_norm": 1.5708354711532593, + "learning_rate": 0.00017653647182019671, + "loss": 1.2465, + "step": 6866 + }, + { + "epoch": 0.24592189374541157, + "grad_norm": 2.420866012573242, + "learning_rate": 0.0001765290062176907, + "loss": 1.2275, + "step": 6867 + }, + { + "epoch": 0.24595770587497987, + "grad_norm": 1.7600715160369873, + "learning_rate": 0.00017652153958558562, + "loss": 1.0745, + "step": 6868 + }, + { + "epoch": 0.24599351800454813, + "grad_norm": 2.6974761486053467, + "learning_rate": 0.00017651407192398195, + "loss": 0.9558, + "step": 6869 + }, + { + "epoch": 0.24602933013411643, + "grad_norm": 1.7356503009796143, + "learning_rate": 0.0001765066032329801, + "loss": 1.2704, + "step": 6870 + }, + { + "epoch": 0.2460651422636847, + "grad_norm": 1.4844895601272583, + "learning_rate": 0.0001764991335126806, + "loss": 1.119, + "step": 6871 + }, + { + "epoch": 0.246100954393253, + "grad_norm": 1.5499835014343262, + "learning_rate": 0.000176491662763184, + "loss": 1.2394, + "step": 6872 + }, + { + "epoch": 0.2461367665228213, + "grad_norm": 1.6073681116104126, + "learning_rate": 0.0001764841909845907, + "loss": 1.2443, + "step": 6873 + }, + { + "epoch": 0.24617257865238956, + "grad_norm": 1.7055435180664062, + "learning_rate": 0.00017647671817700122, + "loss": 0.9192, + "step": 6874 + }, + { + "epoch": 0.24620839078195786, + "grad_norm": 1.643152117729187, + "learning_rate": 0.00017646924434051617, + "loss": 1.3864, + "step": 6875 + }, + { + "epoch": 0.24624420291152613, + "grad_norm": 1.5396192073822021, + "learning_rate": 0.0001764617694752361, + "loss": 1.0207, + "step": 6876 + }, + { + "epoch": 0.24628001504109442, + "grad_norm": 1.8057568073272705, + "learning_rate": 0.00017645429358126156, + "loss": 1.2316, + "step": 6877 + }, + { + "epoch": 0.2463158271706627, + "grad_norm": 1.6093379259109497, + "learning_rate": 0.0001764468166586931, + "loss": 1.1991, + "step": 6878 + }, + { + "epoch": 0.246351639300231, + "grad_norm": 1.399366021156311, + "learning_rate": 0.00017643933870763133, + "loss": 1.1368, + "step": 6879 + }, + { + "epoch": 0.2463874514297993, + "grad_norm": 1.7931088209152222, + "learning_rate": 0.00017643185972817684, + "loss": 1.077, + "step": 6880 + }, + { + "epoch": 0.24642326355936756, + "grad_norm": 1.5344324111938477, + "learning_rate": 0.0001764243797204303, + "loss": 1.0162, + "step": 6881 + }, + { + "epoch": 0.24645907568893585, + "grad_norm": 1.6807363033294678, + "learning_rate": 0.0001764168986844923, + "loss": 1.2479, + "step": 6882 + }, + { + "epoch": 0.24649488781850412, + "grad_norm": 1.83706796169281, + "learning_rate": 0.00017640941662046345, + "loss": 1.2766, + "step": 6883 + }, + { + "epoch": 0.24653069994807242, + "grad_norm": 1.5919657945632935, + "learning_rate": 0.00017640193352844454, + "loss": 1.0704, + "step": 6884 + }, + { + "epoch": 0.2465665120776407, + "grad_norm": 1.259521245956421, + "learning_rate": 0.00017639444940853612, + "loss": 1.1184, + "step": 6885 + }, + { + "epoch": 0.24660232420720898, + "grad_norm": 1.6898640394210815, + "learning_rate": 0.00017638696426083893, + "loss": 1.2144, + "step": 6886 + }, + { + "epoch": 0.24663813633677725, + "grad_norm": 1.8451439142227173, + "learning_rate": 0.00017637947808545369, + "loss": 1.0074, + "step": 6887 + }, + { + "epoch": 0.24667394846634555, + "grad_norm": 1.2948733568191528, + "learning_rate": 0.00017637199088248106, + "loss": 1.3199, + "step": 6888 + }, + { + "epoch": 0.24670976059591385, + "grad_norm": 1.6208205223083496, + "learning_rate": 0.00017636450265202185, + "loss": 1.1535, + "step": 6889 + }, + { + "epoch": 0.24674557272548212, + "grad_norm": 1.3151262998580933, + "learning_rate": 0.00017635701339417672, + "loss": 1.0097, + "step": 6890 + }, + { + "epoch": 0.2467813848550504, + "grad_norm": 1.5779258012771606, + "learning_rate": 0.0001763495231090465, + "loss": 1.4535, + "step": 6891 + }, + { + "epoch": 0.24681719698461868, + "grad_norm": 1.6953518390655518, + "learning_rate": 0.0001763420317967319, + "loss": 1.0348, + "step": 6892 + }, + { + "epoch": 0.24685300911418698, + "grad_norm": 1.4543631076812744, + "learning_rate": 0.00017633453945733373, + "loss": 1.0293, + "step": 6893 + }, + { + "epoch": 0.24688882124375525, + "grad_norm": 1.9309182167053223, + "learning_rate": 0.00017632704609095283, + "loss": 1.3773, + "step": 6894 + }, + { + "epoch": 0.24692463337332354, + "grad_norm": 1.4144504070281982, + "learning_rate": 0.00017631955169768998, + "loss": 1.3353, + "step": 6895 + }, + { + "epoch": 0.24696044550289184, + "grad_norm": 1.3037524223327637, + "learning_rate": 0.00017631205627764598, + "loss": 1.1864, + "step": 6896 + }, + { + "epoch": 0.2469962576324601, + "grad_norm": 1.334747314453125, + "learning_rate": 0.0001763045598309217, + "loss": 1.3512, + "step": 6897 + }, + { + "epoch": 0.2470320697620284, + "grad_norm": 1.7555752992630005, + "learning_rate": 0.00017629706235761802, + "loss": 1.2572, + "step": 6898 + }, + { + "epoch": 0.24706788189159667, + "grad_norm": 1.5317105054855347, + "learning_rate": 0.00017628956385783577, + "loss": 1.322, + "step": 6899 + }, + { + "epoch": 0.24710369402116497, + "grad_norm": 1.7863472700119019, + "learning_rate": 0.00017628206433167583, + "loss": 1.1923, + "step": 6900 + }, + { + "epoch": 0.24713950615073324, + "grad_norm": 1.5038975477218628, + "learning_rate": 0.00017627456377923911, + "loss": 1.145, + "step": 6901 + }, + { + "epoch": 0.24717531828030154, + "grad_norm": 1.4173437356948853, + "learning_rate": 0.00017626706220062654, + "loss": 1.2502, + "step": 6902 + }, + { + "epoch": 0.24721113040986983, + "grad_norm": 2.6659607887268066, + "learning_rate": 0.00017625955959593904, + "loss": 1.3424, + "step": 6903 + }, + { + "epoch": 0.2472469425394381, + "grad_norm": 1.795927882194519, + "learning_rate": 0.0001762520559652775, + "loss": 1.214, + "step": 6904 + }, + { + "epoch": 0.2472827546690064, + "grad_norm": 2.3665947914123535, + "learning_rate": 0.00017624455130874292, + "loss": 1.2954, + "step": 6905 + }, + { + "epoch": 0.24731856679857467, + "grad_norm": 1.4865883588790894, + "learning_rate": 0.00017623704562643624, + "loss": 1.3528, + "step": 6906 + }, + { + "epoch": 0.24735437892814297, + "grad_norm": 1.7575072050094604, + "learning_rate": 0.00017622953891845847, + "loss": 1.059, + "step": 6907 + }, + { + "epoch": 0.24739019105771123, + "grad_norm": 2.082791805267334, + "learning_rate": 0.00017622203118491055, + "loss": 1.2549, + "step": 6908 + }, + { + "epoch": 0.24742600318727953, + "grad_norm": 2.008176326751709, + "learning_rate": 0.00017621452242589354, + "loss": 1.134, + "step": 6909 + }, + { + "epoch": 0.24746181531684783, + "grad_norm": 1.4217827320098877, + "learning_rate": 0.00017620701264150845, + "loss": 1.3029, + "step": 6910 + }, + { + "epoch": 0.2474976274464161, + "grad_norm": 1.709494948387146, + "learning_rate": 0.0001761995018318563, + "loss": 1.1332, + "step": 6911 + }, + { + "epoch": 0.2475334395759844, + "grad_norm": 1.2756450176239014, + "learning_rate": 0.00017619198999703812, + "loss": 1.061, + "step": 6912 + }, + { + "epoch": 0.24756925170555266, + "grad_norm": 1.5074715614318848, + "learning_rate": 0.00017618447713715503, + "loss": 1.3208, + "step": 6913 + }, + { + "epoch": 0.24760506383512096, + "grad_norm": 1.3438138961791992, + "learning_rate": 0.00017617696325230805, + "loss": 1.2037, + "step": 6914 + }, + { + "epoch": 0.24764087596468923, + "grad_norm": 1.4406810998916626, + "learning_rate": 0.0001761694483425983, + "loss": 1.091, + "step": 6915 + }, + { + "epoch": 0.24767668809425752, + "grad_norm": 1.6558492183685303, + "learning_rate": 0.00017616193240812687, + "loss": 1.1992, + "step": 6916 + }, + { + "epoch": 0.24771250022382582, + "grad_norm": 1.3758883476257324, + "learning_rate": 0.00017615441544899488, + "loss": 1.3413, + "step": 6917 + }, + { + "epoch": 0.2477483123533941, + "grad_norm": 1.5926074981689453, + "learning_rate": 0.00017614689746530345, + "loss": 1.3257, + "step": 6918 + }, + { + "epoch": 0.2477841244829624, + "grad_norm": 2.3961856365203857, + "learning_rate": 0.00017613937845715376, + "loss": 1.2179, + "step": 6919 + }, + { + "epoch": 0.24781993661253066, + "grad_norm": 1.5887528657913208, + "learning_rate": 0.00017613185842464693, + "loss": 1.2502, + "step": 6920 + }, + { + "epoch": 0.24785574874209895, + "grad_norm": 1.8956705331802368, + "learning_rate": 0.00017612433736788417, + "loss": 1.2658, + "step": 6921 + }, + { + "epoch": 0.24789156087166722, + "grad_norm": 1.5714540481567383, + "learning_rate": 0.0001761168152869666, + "loss": 1.2248, + "step": 6922 + }, + { + "epoch": 0.24792737300123552, + "grad_norm": 1.510380744934082, + "learning_rate": 0.00017610929218199553, + "loss": 1.225, + "step": 6923 + }, + { + "epoch": 0.24796318513080381, + "grad_norm": 1.2994334697723389, + "learning_rate": 0.00017610176805307206, + "loss": 1.051, + "step": 6924 + }, + { + "epoch": 0.24799899726037208, + "grad_norm": 1.7617374658584595, + "learning_rate": 0.00017609424290029746, + "loss": 1.2485, + "step": 6925 + }, + { + "epoch": 0.24803480938994038, + "grad_norm": 2.5006065368652344, + "learning_rate": 0.000176086716723773, + "loss": 1.2877, + "step": 6926 + }, + { + "epoch": 0.24807062151950865, + "grad_norm": 1.5213685035705566, + "learning_rate": 0.0001760791895235999, + "loss": 0.9052, + "step": 6927 + }, + { + "epoch": 0.24810643364907695, + "grad_norm": 1.6159536838531494, + "learning_rate": 0.00017607166129987944, + "loss": 1.2183, + "step": 6928 + }, + { + "epoch": 0.24814224577864522, + "grad_norm": 1.787126898765564, + "learning_rate": 0.0001760641320527129, + "loss": 1.0598, + "step": 6929 + }, + { + "epoch": 0.2481780579082135, + "grad_norm": 1.8065812587738037, + "learning_rate": 0.00017605660178220158, + "loss": 1.2786, + "step": 6930 + }, + { + "epoch": 0.2482138700377818, + "grad_norm": 1.5157959461212158, + "learning_rate": 0.0001760490704884468, + "loss": 1.3344, + "step": 6931 + }, + { + "epoch": 0.24824968216735008, + "grad_norm": 1.8986259698867798, + "learning_rate": 0.00017604153817154985, + "loss": 1.346, + "step": 6932 + }, + { + "epoch": 0.24828549429691837, + "grad_norm": 1.5875250101089478, + "learning_rate": 0.00017603400483161212, + "loss": 1.0184, + "step": 6933 + }, + { + "epoch": 0.24832130642648664, + "grad_norm": 1.5250909328460693, + "learning_rate": 0.0001760264704687349, + "loss": 1.2173, + "step": 6934 + }, + { + "epoch": 0.24835711855605494, + "grad_norm": 1.425767421722412, + "learning_rate": 0.00017601893508301962, + "loss": 0.9976, + "step": 6935 + }, + { + "epoch": 0.2483929306856232, + "grad_norm": 1.486551284790039, + "learning_rate": 0.0001760113986745676, + "loss": 1.4106, + "step": 6936 + }, + { + "epoch": 0.2484287428151915, + "grad_norm": 1.7791316509246826, + "learning_rate": 0.00017600386124348028, + "loss": 1.1233, + "step": 6937 + }, + { + "epoch": 0.2484645549447598, + "grad_norm": 1.3184198141098022, + "learning_rate": 0.00017599632278985904, + "loss": 1.1582, + "step": 6938 + }, + { + "epoch": 0.24850036707432807, + "grad_norm": 1.5505707263946533, + "learning_rate": 0.00017598878331380528, + "loss": 1.1599, + "step": 6939 + }, + { + "epoch": 0.24853617920389637, + "grad_norm": 1.7496182918548584, + "learning_rate": 0.0001759812428154205, + "loss": 1.1087, + "step": 6940 + }, + { + "epoch": 0.24857199133346464, + "grad_norm": 1.6300804615020752, + "learning_rate": 0.00017597370129480606, + "loss": 0.9539, + "step": 6941 + }, + { + "epoch": 0.24860780346303293, + "grad_norm": 1.31636643409729, + "learning_rate": 0.00017596615875206347, + "loss": 1.2534, + "step": 6942 + }, + { + "epoch": 0.2486436155926012, + "grad_norm": 1.4562270641326904, + "learning_rate": 0.00017595861518729424, + "loss": 1.2153, + "step": 6943 + }, + { + "epoch": 0.2486794277221695, + "grad_norm": 1.6034414768218994, + "learning_rate": 0.00017595107060059984, + "loss": 1.2034, + "step": 6944 + }, + { + "epoch": 0.2487152398517378, + "grad_norm": 1.4991284608840942, + "learning_rate": 0.0001759435249920817, + "loss": 1.4198, + "step": 6945 + }, + { + "epoch": 0.24875105198130606, + "grad_norm": 1.620441198348999, + "learning_rate": 0.0001759359783618414, + "loss": 1.3449, + "step": 6946 + }, + { + "epoch": 0.24878686411087436, + "grad_norm": 1.3268835544586182, + "learning_rate": 0.00017592843070998049, + "loss": 1.2221, + "step": 6947 + }, + { + "epoch": 0.24882267624044263, + "grad_norm": 1.1846861839294434, + "learning_rate": 0.00017592088203660045, + "loss": 1.0228, + "step": 6948 + }, + { + "epoch": 0.24885848837001093, + "grad_norm": 1.3322689533233643, + "learning_rate": 0.00017591333234180293, + "loss": 1.2545, + "step": 6949 + }, + { + "epoch": 0.2488943004995792, + "grad_norm": 1.2325718402862549, + "learning_rate": 0.0001759057816256894, + "loss": 1.1669, + "step": 6950 + }, + { + "epoch": 0.2489301126291475, + "grad_norm": 1.9471315145492554, + "learning_rate": 0.00017589822988836148, + "loss": 1.1786, + "step": 6951 + }, + { + "epoch": 0.2489659247587158, + "grad_norm": 1.7470251321792603, + "learning_rate": 0.00017589067712992082, + "loss": 1.0511, + "step": 6952 + }, + { + "epoch": 0.24900173688828406, + "grad_norm": 1.8171417713165283, + "learning_rate": 0.00017588312335046897, + "loss": 1.2584, + "step": 6953 + }, + { + "epoch": 0.24903754901785236, + "grad_norm": 1.3494971990585327, + "learning_rate": 0.00017587556855010755, + "loss": 1.3149, + "step": 6954 + }, + { + "epoch": 0.24907336114742062, + "grad_norm": 1.5060662031173706, + "learning_rate": 0.00017586801272893827, + "loss": 1.2877, + "step": 6955 + }, + { + "epoch": 0.24910917327698892, + "grad_norm": 1.4610350131988525, + "learning_rate": 0.00017586045588706273, + "loss": 0.9562, + "step": 6956 + }, + { + "epoch": 0.2491449854065572, + "grad_norm": 1.561164140701294, + "learning_rate": 0.0001758528980245826, + "loss": 1.1926, + "step": 6957 + }, + { + "epoch": 0.2491807975361255, + "grad_norm": 1.3045276403427124, + "learning_rate": 0.00017584533914159956, + "loss": 0.9348, + "step": 6958 + }, + { + "epoch": 0.24921660966569378, + "grad_norm": 1.780137062072754, + "learning_rate": 0.00017583777923821533, + "loss": 1.192, + "step": 6959 + }, + { + "epoch": 0.24925242179526205, + "grad_norm": 1.5121835470199585, + "learning_rate": 0.0001758302183145316, + "loss": 1.2264, + "step": 6960 + }, + { + "epoch": 0.24928823392483035, + "grad_norm": 1.9274852275848389, + "learning_rate": 0.00017582265637065012, + "loss": 1.3559, + "step": 6961 + }, + { + "epoch": 0.24932404605439862, + "grad_norm": 1.4614145755767822, + "learning_rate": 0.00017581509340667257, + "loss": 1.2972, + "step": 6962 + }, + { + "epoch": 0.24935985818396691, + "grad_norm": 1.6656363010406494, + "learning_rate": 0.00017580752942270077, + "loss": 1.3064, + "step": 6963 + }, + { + "epoch": 0.24939567031353518, + "grad_norm": 1.5381349325180054, + "learning_rate": 0.0001757999644188364, + "loss": 1.2878, + "step": 6964 + }, + { + "epoch": 0.24943148244310348, + "grad_norm": 1.5522284507751465, + "learning_rate": 0.0001757923983951813, + "loss": 1.2377, + "step": 6965 + }, + { + "epoch": 0.24946729457267178, + "grad_norm": 1.5940330028533936, + "learning_rate": 0.00017578483135183726, + "loss": 1.3239, + "step": 6966 + }, + { + "epoch": 0.24950310670224005, + "grad_norm": 1.5080609321594238, + "learning_rate": 0.00017577726328890604, + "loss": 1.0303, + "step": 6967 + }, + { + "epoch": 0.24953891883180834, + "grad_norm": 1.589513897895813, + "learning_rate": 0.0001757696942064895, + "loss": 1.0967, + "step": 6968 + }, + { + "epoch": 0.2495747309613766, + "grad_norm": 1.5773226022720337, + "learning_rate": 0.00017576212410468949, + "loss": 1.0602, + "step": 6969 + }, + { + "epoch": 0.2496105430909449, + "grad_norm": 1.3057199716567993, + "learning_rate": 0.00017575455298360782, + "loss": 1.1904, + "step": 6970 + }, + { + "epoch": 0.24964635522051318, + "grad_norm": 1.5127432346343994, + "learning_rate": 0.00017574698084334633, + "loss": 1.2911, + "step": 6971 + }, + { + "epoch": 0.24968216735008147, + "grad_norm": 1.5553947687149048, + "learning_rate": 0.00017573940768400692, + "loss": 1.1061, + "step": 6972 + }, + { + "epoch": 0.24971797947964977, + "grad_norm": 1.2066068649291992, + "learning_rate": 0.00017573183350569148, + "loss": 1.024, + "step": 6973 + }, + { + "epoch": 0.24975379160921804, + "grad_norm": 1.4632699489593506, + "learning_rate": 0.00017572425830850193, + "loss": 1.1733, + "step": 6974 + }, + { + "epoch": 0.24978960373878634, + "grad_norm": 1.4938944578170776, + "learning_rate": 0.00017571668209254013, + "loss": 1.271, + "step": 6975 + }, + { + "epoch": 0.2498254158683546, + "grad_norm": 1.5449507236480713, + "learning_rate": 0.00017570910485790805, + "loss": 1.0149, + "step": 6976 + }, + { + "epoch": 0.2498612279979229, + "grad_norm": 1.5069782733917236, + "learning_rate": 0.00017570152660470765, + "loss": 1.2901, + "step": 6977 + }, + { + "epoch": 0.24989704012749117, + "grad_norm": 1.519176721572876, + "learning_rate": 0.00017569394733304083, + "loss": 1.2762, + "step": 6978 + }, + { + "epoch": 0.24993285225705947, + "grad_norm": 1.943303108215332, + "learning_rate": 0.00017568636704300958, + "loss": 1.3441, + "step": 6979 + }, + { + "epoch": 0.24996866438662776, + "grad_norm": 1.6762633323669434, + "learning_rate": 0.0001756787857347159, + "loss": 1.109, + "step": 6980 + }, + { + "epoch": 0.25000447651619606, + "grad_norm": 1.3843663930892944, + "learning_rate": 0.00017567120340826177, + "loss": 1.1571, + "step": 6981 + }, + { + "epoch": 0.25004028864576433, + "grad_norm": 1.480919599533081, + "learning_rate": 0.0001756636200637492, + "loss": 1.0491, + "step": 6982 + }, + { + "epoch": 0.2500761007753326, + "grad_norm": 1.7757961750030518, + "learning_rate": 0.00017565603570128023, + "loss": 1.229, + "step": 6983 + }, + { + "epoch": 0.25011191290490087, + "grad_norm": 1.5696722269058228, + "learning_rate": 0.0001756484503209569, + "loss": 1.1627, + "step": 6984 + }, + { + "epoch": 0.2501477250344692, + "grad_norm": 1.7215487957000732, + "learning_rate": 0.00017564086392288125, + "loss": 1.3247, + "step": 6985 + }, + { + "epoch": 0.25018353716403746, + "grad_norm": 1.348347783088684, + "learning_rate": 0.00017563327650715535, + "loss": 1.381, + "step": 6986 + }, + { + "epoch": 0.25021934929360573, + "grad_norm": 1.7724815607070923, + "learning_rate": 0.00017562568807388126, + "loss": 1.2587, + "step": 6987 + }, + { + "epoch": 0.25025516142317406, + "grad_norm": 1.5093902349472046, + "learning_rate": 0.0001756180986231611, + "loss": 1.0294, + "step": 6988 + }, + { + "epoch": 0.2502909735527423, + "grad_norm": 1.3860375881195068, + "learning_rate": 0.00017561050815509695, + "loss": 1.1623, + "step": 6989 + }, + { + "epoch": 0.2503267856823106, + "grad_norm": 1.6744294166564941, + "learning_rate": 0.00017560291666979095, + "loss": 0.9504, + "step": 6990 + }, + { + "epoch": 0.25036259781187886, + "grad_norm": 1.4780210256576538, + "learning_rate": 0.00017559532416734524, + "loss": 1.2905, + "step": 6991 + }, + { + "epoch": 0.2503984099414472, + "grad_norm": 1.6162041425704956, + "learning_rate": 0.00017558773064786193, + "loss": 1.126, + "step": 6992 + }, + { + "epoch": 0.25043422207101546, + "grad_norm": 2.2053394317626953, + "learning_rate": 0.0001755801361114432, + "loss": 1.3676, + "step": 6993 + }, + { + "epoch": 0.2504700342005837, + "grad_norm": 1.429555892944336, + "learning_rate": 0.00017557254055819126, + "loss": 1.141, + "step": 6994 + }, + { + "epoch": 0.25050584633015205, + "grad_norm": 1.752230167388916, + "learning_rate": 0.00017556494398820823, + "loss": 1.0686, + "step": 6995 + }, + { + "epoch": 0.2505416584597203, + "grad_norm": 1.5589138269424438, + "learning_rate": 0.0001755573464015964, + "loss": 1.2437, + "step": 6996 + }, + { + "epoch": 0.2505774705892886, + "grad_norm": 1.583090901374817, + "learning_rate": 0.00017554974779845792, + "loss": 1.3823, + "step": 6997 + }, + { + "epoch": 0.25061328271885686, + "grad_norm": 1.5981390476226807, + "learning_rate": 0.000175542148178895, + "loss": 1.3333, + "step": 6998 + }, + { + "epoch": 0.2506490948484252, + "grad_norm": 1.9166666269302368, + "learning_rate": 0.00017553454754300996, + "loss": 0.9984, + "step": 6999 + }, + { + "epoch": 0.25068490697799345, + "grad_norm": 1.6161549091339111, + "learning_rate": 0.000175526945890905, + "loss": 1.1487, + "step": 7000 + }, + { + "epoch": 0.2507207191075617, + "grad_norm": 1.545116662979126, + "learning_rate": 0.0001755193432226824, + "loss": 1.1963, + "step": 7001 + }, + { + "epoch": 0.25075653123713004, + "grad_norm": 1.6151524782180786, + "learning_rate": 0.00017551173953844445, + "loss": 1.1759, + "step": 7002 + }, + { + "epoch": 0.2507923433666983, + "grad_norm": 1.4607309103012085, + "learning_rate": 0.00017550413483829344, + "loss": 1.2836, + "step": 7003 + }, + { + "epoch": 0.2508281554962666, + "grad_norm": 1.7012648582458496, + "learning_rate": 0.0001754965291223317, + "loss": 1.1084, + "step": 7004 + }, + { + "epoch": 0.25086396762583485, + "grad_norm": 1.4325213432312012, + "learning_rate": 0.00017548892239066156, + "loss": 0.9489, + "step": 7005 + }, + { + "epoch": 0.2508997797554032, + "grad_norm": 1.4297716617584229, + "learning_rate": 0.00017548131464338533, + "loss": 1.3198, + "step": 7006 + }, + { + "epoch": 0.25093559188497144, + "grad_norm": 1.5560568571090698, + "learning_rate": 0.00017547370588060537, + "loss": 1.0019, + "step": 7007 + }, + { + "epoch": 0.2509714040145397, + "grad_norm": 1.247488021850586, + "learning_rate": 0.00017546609610242405, + "loss": 1.1654, + "step": 7008 + }, + { + "epoch": 0.25100721614410804, + "grad_norm": 1.4044173955917358, + "learning_rate": 0.00017545848530894377, + "loss": 1.3008, + "step": 7009 + }, + { + "epoch": 0.2510430282736763, + "grad_norm": 1.5363157987594604, + "learning_rate": 0.0001754508735002669, + "loss": 1.317, + "step": 7010 + }, + { + "epoch": 0.2510788404032446, + "grad_norm": 1.633845567703247, + "learning_rate": 0.00017544326067649583, + "loss": 1.2724, + "step": 7011 + }, + { + "epoch": 0.25111465253281284, + "grad_norm": 1.3897778987884521, + "learning_rate": 0.00017543564683773302, + "loss": 1.2287, + "step": 7012 + }, + { + "epoch": 0.25115046466238117, + "grad_norm": 1.2499061822891235, + "learning_rate": 0.00017542803198408087, + "loss": 1.2865, + "step": 7013 + }, + { + "epoch": 0.25118627679194944, + "grad_norm": 1.7507588863372803, + "learning_rate": 0.00017542041611564186, + "loss": 1.0507, + "step": 7014 + }, + { + "epoch": 0.2512220889215177, + "grad_norm": 1.7075941562652588, + "learning_rate": 0.00017541279923251844, + "loss": 1.2325, + "step": 7015 + }, + { + "epoch": 0.25125790105108603, + "grad_norm": 1.7512236833572388, + "learning_rate": 0.00017540518133481308, + "loss": 1.0149, + "step": 7016 + }, + { + "epoch": 0.2512937131806543, + "grad_norm": 1.856297492980957, + "learning_rate": 0.00017539756242262826, + "loss": 1.2462, + "step": 7017 + }, + { + "epoch": 0.25132952531022257, + "grad_norm": 1.4325414896011353, + "learning_rate": 0.0001753899424960665, + "loss": 1.2776, + "step": 7018 + }, + { + "epoch": 0.25136533743979084, + "grad_norm": 1.6601237058639526, + "learning_rate": 0.0001753823215552303, + "loss": 1.0191, + "step": 7019 + }, + { + "epoch": 0.25140114956935916, + "grad_norm": 1.526804804801941, + "learning_rate": 0.00017537469960022221, + "loss": 1.2052, + "step": 7020 + }, + { + "epoch": 0.25143696169892743, + "grad_norm": 1.3619654178619385, + "learning_rate": 0.00017536707663114477, + "loss": 1.1346, + "step": 7021 + }, + { + "epoch": 0.2514727738284957, + "grad_norm": 1.7509902715682983, + "learning_rate": 0.00017535945264810052, + "loss": 1.1267, + "step": 7022 + }, + { + "epoch": 0.251508585958064, + "grad_norm": 1.3162586688995361, + "learning_rate": 0.00017535182765119204, + "loss": 1.068, + "step": 7023 + }, + { + "epoch": 0.2515443980876323, + "grad_norm": 1.6392780542373657, + "learning_rate": 0.00017534420164052193, + "loss": 1.1364, + "step": 7024 + }, + { + "epoch": 0.25158021021720056, + "grad_norm": 1.5794309377670288, + "learning_rate": 0.00017533657461619274, + "loss": 1.1599, + "step": 7025 + }, + { + "epoch": 0.25161602234676883, + "grad_norm": 1.3031127452850342, + "learning_rate": 0.00017532894657830715, + "loss": 1.2997, + "step": 7026 + }, + { + "epoch": 0.25165183447633716, + "grad_norm": 1.570569634437561, + "learning_rate": 0.00017532131752696776, + "loss": 1.2072, + "step": 7027 + }, + { + "epoch": 0.2516876466059054, + "grad_norm": 1.6221058368682861, + "learning_rate": 0.00017531368746227718, + "loss": 1.1358, + "step": 7028 + }, + { + "epoch": 0.2517234587354737, + "grad_norm": 1.6600606441497803, + "learning_rate": 0.00017530605638433805, + "loss": 1.2359, + "step": 7029 + }, + { + "epoch": 0.251759270865042, + "grad_norm": 1.6840671300888062, + "learning_rate": 0.00017529842429325312, + "loss": 1.2857, + "step": 7030 + }, + { + "epoch": 0.2517950829946103, + "grad_norm": 1.7347328662872314, + "learning_rate": 0.00017529079118912502, + "loss": 0.9099, + "step": 7031 + }, + { + "epoch": 0.25183089512417856, + "grad_norm": 1.3435026407241821, + "learning_rate": 0.00017528315707205643, + "loss": 1.3693, + "step": 7032 + }, + { + "epoch": 0.2518667072537468, + "grad_norm": 1.6826081275939941, + "learning_rate": 0.00017527552194215005, + "loss": 1.1351, + "step": 7033 + }, + { + "epoch": 0.25190251938331515, + "grad_norm": 1.704973816871643, + "learning_rate": 0.00017526788579950864, + "loss": 1.2214, + "step": 7034 + }, + { + "epoch": 0.2519383315128834, + "grad_norm": 1.6812753677368164, + "learning_rate": 0.0001752602486442349, + "loss": 1.2081, + "step": 7035 + }, + { + "epoch": 0.2519741436424517, + "grad_norm": 1.7803236246109009, + "learning_rate": 0.0001752526104764316, + "loss": 1.1927, + "step": 7036 + }, + { + "epoch": 0.25200995577201996, + "grad_norm": 1.56562340259552, + "learning_rate": 0.0001752449712962015, + "loss": 1.0783, + "step": 7037 + }, + { + "epoch": 0.2520457679015883, + "grad_norm": 1.4491761922836304, + "learning_rate": 0.00017523733110364736, + "loss": 1.0768, + "step": 7038 + }, + { + "epoch": 0.25208158003115655, + "grad_norm": 1.8191121816635132, + "learning_rate": 0.000175229689898872, + "loss": 1.327, + "step": 7039 + }, + { + "epoch": 0.2521173921607248, + "grad_norm": 1.770431399345398, + "learning_rate": 0.00017522204768197818, + "loss": 1.1677, + "step": 7040 + }, + { + "epoch": 0.25215320429029314, + "grad_norm": 1.4679042100906372, + "learning_rate": 0.00017521440445306875, + "loss": 1.2885, + "step": 7041 + }, + { + "epoch": 0.2521890164198614, + "grad_norm": 1.4695771932601929, + "learning_rate": 0.00017520676021224652, + "loss": 1.1152, + "step": 7042 + }, + { + "epoch": 0.2522248285494297, + "grad_norm": 1.6256440877914429, + "learning_rate": 0.00017519911495961435, + "loss": 1.2529, + "step": 7043 + }, + { + "epoch": 0.25226064067899795, + "grad_norm": 1.4248756170272827, + "learning_rate": 0.0001751914686952751, + "loss": 1.2185, + "step": 7044 + }, + { + "epoch": 0.2522964528085663, + "grad_norm": 1.423560380935669, + "learning_rate": 0.0001751838214193316, + "loss": 1.1768, + "step": 7045 + }, + { + "epoch": 0.25233226493813454, + "grad_norm": 1.9326244592666626, + "learning_rate": 0.0001751761731318868, + "loss": 1.0913, + "step": 7046 + }, + { + "epoch": 0.2523680770677028, + "grad_norm": 1.6514819860458374, + "learning_rate": 0.00017516852383304353, + "loss": 1.164, + "step": 7047 + }, + { + "epoch": 0.25240388919727114, + "grad_norm": 1.6959822177886963, + "learning_rate": 0.00017516087352290472, + "loss": 1.2258, + "step": 7048 + }, + { + "epoch": 0.2524397013268394, + "grad_norm": 1.454109787940979, + "learning_rate": 0.00017515322220157333, + "loss": 0.916, + "step": 7049 + }, + { + "epoch": 0.2524755134564077, + "grad_norm": 1.904162049293518, + "learning_rate": 0.0001751455698691523, + "loss": 0.8915, + "step": 7050 + }, + { + "epoch": 0.25251132558597594, + "grad_norm": 1.712014079093933, + "learning_rate": 0.00017513791652574453, + "loss": 1.3056, + "step": 7051 + }, + { + "epoch": 0.25254713771554427, + "grad_norm": 1.3918757438659668, + "learning_rate": 0.00017513026217145302, + "loss": 1.2624, + "step": 7052 + }, + { + "epoch": 0.25258294984511254, + "grad_norm": 1.2729716300964355, + "learning_rate": 0.00017512260680638072, + "loss": 1.0545, + "step": 7053 + }, + { + "epoch": 0.2526187619746808, + "grad_norm": 1.8565258979797363, + "learning_rate": 0.00017511495043063066, + "loss": 1.2389, + "step": 7054 + }, + { + "epoch": 0.25265457410424913, + "grad_norm": 1.2893513441085815, + "learning_rate": 0.00017510729304430584, + "loss": 1.2066, + "step": 7055 + }, + { + "epoch": 0.2526903862338174, + "grad_norm": 1.4234213829040527, + "learning_rate": 0.00017509963464750928, + "loss": 0.9941, + "step": 7056 + }, + { + "epoch": 0.25272619836338567, + "grad_norm": 1.7210487127304077, + "learning_rate": 0.000175091975240344, + "loss": 1.2282, + "step": 7057 + }, + { + "epoch": 0.25276201049295394, + "grad_norm": 1.5455288887023926, + "learning_rate": 0.00017508431482291304, + "loss": 1.1138, + "step": 7058 + }, + { + "epoch": 0.25279782262252226, + "grad_norm": 1.4997084140777588, + "learning_rate": 0.0001750766533953195, + "loss": 1.3298, + "step": 7059 + }, + { + "epoch": 0.25283363475209053, + "grad_norm": 1.5806487798690796, + "learning_rate": 0.00017506899095766641, + "loss": 1.1912, + "step": 7060 + }, + { + "epoch": 0.2528694468816588, + "grad_norm": 1.3807047605514526, + "learning_rate": 0.0001750613275100569, + "loss": 1.0255, + "step": 7061 + }, + { + "epoch": 0.2529052590112271, + "grad_norm": 1.6970329284667969, + "learning_rate": 0.00017505366305259402, + "loss": 1.1729, + "step": 7062 + }, + { + "epoch": 0.2529410711407954, + "grad_norm": 1.4124537706375122, + "learning_rate": 0.00017504599758538095, + "loss": 1.0616, + "step": 7063 + }, + { + "epoch": 0.25297688327036366, + "grad_norm": 1.6423226594924927, + "learning_rate": 0.00017503833110852078, + "loss": 1.0889, + "step": 7064 + }, + { + "epoch": 0.25301269539993193, + "grad_norm": 2.2432615756988525, + "learning_rate": 0.00017503066362211663, + "loss": 1.1842, + "step": 7065 + }, + { + "epoch": 0.25304850752950026, + "grad_norm": 1.6316828727722168, + "learning_rate": 0.00017502299512627172, + "loss": 1.2297, + "step": 7066 + }, + { + "epoch": 0.2530843196590685, + "grad_norm": 2.330566167831421, + "learning_rate": 0.00017501532562108916, + "loss": 1.1693, + "step": 7067 + }, + { + "epoch": 0.2531201317886368, + "grad_norm": 1.677132487297058, + "learning_rate": 0.00017500765510667217, + "loss": 1.1658, + "step": 7068 + }, + { + "epoch": 0.2531559439182051, + "grad_norm": 2.0298759937286377, + "learning_rate": 0.0001749999835831239, + "loss": 1.3484, + "step": 7069 + }, + { + "epoch": 0.2531917560477734, + "grad_norm": 1.6495641469955444, + "learning_rate": 0.00017499231105054763, + "loss": 1.0789, + "step": 7070 + }, + { + "epoch": 0.25322756817734166, + "grad_norm": 1.6207647323608398, + "learning_rate": 0.00017498463750904652, + "loss": 0.9069, + "step": 7071 + }, + { + "epoch": 0.2532633803069099, + "grad_norm": 1.6446360349655151, + "learning_rate": 0.00017497696295872385, + "loss": 1.3124, + "step": 7072 + }, + { + "epoch": 0.25329919243647825, + "grad_norm": 1.7044203281402588, + "learning_rate": 0.00017496928739968288, + "loss": 1.3966, + "step": 7073 + }, + { + "epoch": 0.2533350045660465, + "grad_norm": 1.238172173500061, + "learning_rate": 0.0001749616108320268, + "loss": 1.0996, + "step": 7074 + }, + { + "epoch": 0.2533708166956148, + "grad_norm": 1.4781423807144165, + "learning_rate": 0.000174953933255859, + "loss": 1.1959, + "step": 7075 + }, + { + "epoch": 0.2534066288251831, + "grad_norm": 1.1623748540878296, + "learning_rate": 0.0001749462546712827, + "loss": 0.9648, + "step": 7076 + }, + { + "epoch": 0.2534424409547514, + "grad_norm": 1.4262140989303589, + "learning_rate": 0.00017493857507840116, + "loss": 0.9595, + "step": 7077 + }, + { + "epoch": 0.25347825308431965, + "grad_norm": 2.0026445388793945, + "learning_rate": 0.0001749308944773178, + "loss": 1.3694, + "step": 7078 + }, + { + "epoch": 0.2535140652138879, + "grad_norm": 1.3472440242767334, + "learning_rate": 0.0001749232128681359, + "loss": 1.2008, + "step": 7079 + }, + { + "epoch": 0.25354987734345624, + "grad_norm": 1.609649658203125, + "learning_rate": 0.00017491553025095882, + "loss": 1.3174, + "step": 7080 + }, + { + "epoch": 0.2535856894730245, + "grad_norm": 1.2357585430145264, + "learning_rate": 0.00017490784662588992, + "loss": 1.1113, + "step": 7081 + }, + { + "epoch": 0.2536215016025928, + "grad_norm": 1.3808903694152832, + "learning_rate": 0.00017490016199303256, + "loss": 1.1678, + "step": 7082 + }, + { + "epoch": 0.2536573137321611, + "grad_norm": 1.4951591491699219, + "learning_rate": 0.00017489247635249012, + "loss": 1.34, + "step": 7083 + }, + { + "epoch": 0.2536931258617294, + "grad_norm": 1.6677045822143555, + "learning_rate": 0.00017488478970436604, + "loss": 1.184, + "step": 7084 + }, + { + "epoch": 0.25372893799129764, + "grad_norm": 1.6028698682785034, + "learning_rate": 0.0001748771020487637, + "loss": 1.1406, + "step": 7085 + }, + { + "epoch": 0.2537647501208659, + "grad_norm": 1.4776318073272705, + "learning_rate": 0.00017486941338578653, + "loss": 1.0536, + "step": 7086 + }, + { + "epoch": 0.25380056225043424, + "grad_norm": 1.503678321838379, + "learning_rate": 0.000174861723715538, + "loss": 1.1491, + "step": 7087 + }, + { + "epoch": 0.2538363743800025, + "grad_norm": 1.769368290901184, + "learning_rate": 0.0001748540330381215, + "loss": 1.2154, + "step": 7088 + }, + { + "epoch": 0.2538721865095708, + "grad_norm": 1.647094488143921, + "learning_rate": 0.00017484634135364057, + "loss": 1.0276, + "step": 7089 + }, + { + "epoch": 0.2539079986391391, + "grad_norm": 1.572364330291748, + "learning_rate": 0.00017483864866219868, + "loss": 1.0822, + "step": 7090 + }, + { + "epoch": 0.25394381076870737, + "grad_norm": 1.4320013523101807, + "learning_rate": 0.00017483095496389928, + "loss": 1.0057, + "step": 7091 + }, + { + "epoch": 0.25397962289827564, + "grad_norm": 1.649052381515503, + "learning_rate": 0.0001748232602588459, + "loss": 1.1916, + "step": 7092 + }, + { + "epoch": 0.2540154350278439, + "grad_norm": 1.3940110206604004, + "learning_rate": 0.0001748155645471421, + "loss": 1.3154, + "step": 7093 + }, + { + "epoch": 0.25405124715741223, + "grad_norm": 1.5033340454101562, + "learning_rate": 0.00017480786782889137, + "loss": 1.1045, + "step": 7094 + }, + { + "epoch": 0.2540870592869805, + "grad_norm": 1.681386947631836, + "learning_rate": 0.00017480017010419724, + "loss": 1.2959, + "step": 7095 + }, + { + "epoch": 0.25412287141654877, + "grad_norm": 1.9101759195327759, + "learning_rate": 0.00017479247137316335, + "loss": 1.1928, + "step": 7096 + }, + { + "epoch": 0.2541586835461171, + "grad_norm": 1.5014939308166504, + "learning_rate": 0.0001747847716358932, + "loss": 1.0501, + "step": 7097 + }, + { + "epoch": 0.25419449567568536, + "grad_norm": 1.432726263999939, + "learning_rate": 0.00017477707089249043, + "loss": 1.046, + "step": 7098 + }, + { + "epoch": 0.25423030780525363, + "grad_norm": 1.197654128074646, + "learning_rate": 0.00017476936914305862, + "loss": 1.2208, + "step": 7099 + }, + { + "epoch": 0.2542661199348219, + "grad_norm": 1.4489916563034058, + "learning_rate": 0.00017476166638770142, + "loss": 1.0943, + "step": 7100 + }, + { + "epoch": 0.2543019320643902, + "grad_norm": 1.777800440788269, + "learning_rate": 0.0001747539626265224, + "loss": 1.1633, + "step": 7101 + }, + { + "epoch": 0.2543377441939585, + "grad_norm": 1.706864595413208, + "learning_rate": 0.00017474625785962524, + "loss": 1.1411, + "step": 7102 + }, + { + "epoch": 0.25437355632352676, + "grad_norm": 2.1371195316314697, + "learning_rate": 0.00017473855208711362, + "loss": 1.2486, + "step": 7103 + }, + { + "epoch": 0.2544093684530951, + "grad_norm": 1.4981892108917236, + "learning_rate": 0.00017473084530909117, + "loss": 1.14, + "step": 7104 + }, + { + "epoch": 0.25444518058266336, + "grad_norm": 1.449744462966919, + "learning_rate": 0.0001747231375256616, + "loss": 1.3334, + "step": 7105 + }, + { + "epoch": 0.2544809927122316, + "grad_norm": 1.7326080799102783, + "learning_rate": 0.00017471542873692862, + "loss": 1.2081, + "step": 7106 + }, + { + "epoch": 0.2545168048417999, + "grad_norm": 1.5881582498550415, + "learning_rate": 0.0001747077189429959, + "loss": 0.9744, + "step": 7107 + }, + { + "epoch": 0.2545526169713682, + "grad_norm": 1.4175212383270264, + "learning_rate": 0.00017470000814396718, + "loss": 1.3476, + "step": 7108 + }, + { + "epoch": 0.2545884291009365, + "grad_norm": 2.172529697418213, + "learning_rate": 0.0001746922963399462, + "loss": 1.0353, + "step": 7109 + }, + { + "epoch": 0.25462424123050476, + "grad_norm": 1.305872917175293, + "learning_rate": 0.00017468458353103676, + "loss": 1.1265, + "step": 7110 + }, + { + "epoch": 0.2546600533600731, + "grad_norm": 1.3033521175384521, + "learning_rate": 0.00017467686971734257, + "loss": 1.2163, + "step": 7111 + }, + { + "epoch": 0.25469586548964135, + "grad_norm": 1.3496493101119995, + "learning_rate": 0.0001746691548989674, + "loss": 0.9418, + "step": 7112 + }, + { + "epoch": 0.2547316776192096, + "grad_norm": 1.3298815488815308, + "learning_rate": 0.00017466143907601508, + "loss": 1.2345, + "step": 7113 + }, + { + "epoch": 0.2547674897487779, + "grad_norm": 1.3271453380584717, + "learning_rate": 0.00017465372224858937, + "loss": 1.1406, + "step": 7114 + }, + { + "epoch": 0.2548033018783462, + "grad_norm": 1.2018744945526123, + "learning_rate": 0.00017464600441679417, + "loss": 1.1747, + "step": 7115 + }, + { + "epoch": 0.2548391140079145, + "grad_norm": 1.6546918153762817, + "learning_rate": 0.0001746382855807333, + "loss": 1.1576, + "step": 7116 + }, + { + "epoch": 0.25487492613748275, + "grad_norm": 1.4785339832305908, + "learning_rate": 0.0001746305657405105, + "loss": 1.1828, + "step": 7117 + }, + { + "epoch": 0.2549107382670511, + "grad_norm": 1.5939679145812988, + "learning_rate": 0.00017462284489622973, + "loss": 1.1828, + "step": 7118 + }, + { + "epoch": 0.25494655039661934, + "grad_norm": 1.6142559051513672, + "learning_rate": 0.00017461512304799484, + "loss": 1.1441, + "step": 7119 + }, + { + "epoch": 0.2549823625261876, + "grad_norm": 1.6222745180130005, + "learning_rate": 0.0001746074001959097, + "loss": 0.9842, + "step": 7120 + }, + { + "epoch": 0.2550181746557559, + "grad_norm": 1.70821213722229, + "learning_rate": 0.00017459967634007826, + "loss": 1.3246, + "step": 7121 + }, + { + "epoch": 0.2550539867853242, + "grad_norm": 1.4416874647140503, + "learning_rate": 0.00017459195148060438, + "loss": 0.9696, + "step": 7122 + }, + { + "epoch": 0.2550897989148925, + "grad_norm": 2.2505693435668945, + "learning_rate": 0.00017458422561759203, + "loss": 1.4691, + "step": 7123 + }, + { + "epoch": 0.25512561104446074, + "grad_norm": 1.598983645439148, + "learning_rate": 0.0001745764987511451, + "loss": 1.0056, + "step": 7124 + }, + { + "epoch": 0.25516142317402907, + "grad_norm": 1.9150594472885132, + "learning_rate": 0.0001745687708813676, + "loss": 1.2343, + "step": 7125 + }, + { + "epoch": 0.25519723530359734, + "grad_norm": 1.5203630924224854, + "learning_rate": 0.00017456104200836347, + "loss": 1.0413, + "step": 7126 + }, + { + "epoch": 0.2552330474331656, + "grad_norm": 1.5128974914550781, + "learning_rate": 0.00017455331213223668, + "loss": 1.2607, + "step": 7127 + }, + { + "epoch": 0.2552688595627339, + "grad_norm": 1.4420478343963623, + "learning_rate": 0.00017454558125309125, + "loss": 1.078, + "step": 7128 + }, + { + "epoch": 0.2553046716923022, + "grad_norm": 1.8487310409545898, + "learning_rate": 0.00017453784937103122, + "loss": 1.0365, + "step": 7129 + }, + { + "epoch": 0.25534048382187047, + "grad_norm": 1.4514355659484863, + "learning_rate": 0.00017453011648616053, + "loss": 1.1374, + "step": 7130 + }, + { + "epoch": 0.25537629595143874, + "grad_norm": 1.529703140258789, + "learning_rate": 0.00017452238259858327, + "loss": 1.2883, + "step": 7131 + }, + { + "epoch": 0.25541210808100706, + "grad_norm": 1.1709120273590088, + "learning_rate": 0.00017451464770840348, + "loss": 1.1243, + "step": 7132 + }, + { + "epoch": 0.25544792021057533, + "grad_norm": 2.034545421600342, + "learning_rate": 0.00017450691181572522, + "loss": 1.1086, + "step": 7133 + }, + { + "epoch": 0.2554837323401436, + "grad_norm": 1.9611557722091675, + "learning_rate": 0.00017449917492065256, + "loss": 1.1475, + "step": 7134 + }, + { + "epoch": 0.25551954446971187, + "grad_norm": 1.3155715465545654, + "learning_rate": 0.0001744914370232896, + "loss": 1.2407, + "step": 7135 + }, + { + "epoch": 0.2555553565992802, + "grad_norm": 1.7545604705810547, + "learning_rate": 0.00017448369812374045, + "loss": 1.0816, + "step": 7136 + }, + { + "epoch": 0.25559116872884846, + "grad_norm": 1.6955724954605103, + "learning_rate": 0.00017447595822210924, + "loss": 0.9886, + "step": 7137 + }, + { + "epoch": 0.25562698085841673, + "grad_norm": 1.534593939781189, + "learning_rate": 0.00017446821731850008, + "loss": 1.1112, + "step": 7138 + }, + { + "epoch": 0.25566279298798505, + "grad_norm": 1.3749630451202393, + "learning_rate": 0.00017446047541301707, + "loss": 1.1141, + "step": 7139 + }, + { + "epoch": 0.2556986051175533, + "grad_norm": 2.2915029525756836, + "learning_rate": 0.00017445273250576442, + "loss": 1.2206, + "step": 7140 + }, + { + "epoch": 0.2557344172471216, + "grad_norm": 1.863507866859436, + "learning_rate": 0.0001744449885968463, + "loss": 1.2121, + "step": 7141 + }, + { + "epoch": 0.25577022937668986, + "grad_norm": 1.7177835702896118, + "learning_rate": 0.00017443724368636693, + "loss": 1.0511, + "step": 7142 + }, + { + "epoch": 0.2558060415062582, + "grad_norm": 1.8923636674880981, + "learning_rate": 0.00017442949777443038, + "loss": 1.3147, + "step": 7143 + }, + { + "epoch": 0.25584185363582646, + "grad_norm": 1.4344220161437988, + "learning_rate": 0.000174421750861141, + "loss": 1.2453, + "step": 7144 + }, + { + "epoch": 0.2558776657653947, + "grad_norm": 1.6646997928619385, + "learning_rate": 0.00017441400294660294, + "loss": 1.2662, + "step": 7145 + }, + { + "epoch": 0.25591347789496305, + "grad_norm": 1.4710465669631958, + "learning_rate": 0.00017440625403092045, + "loss": 1.1635, + "step": 7146 + }, + { + "epoch": 0.2559492900245313, + "grad_norm": 1.6254370212554932, + "learning_rate": 0.00017439850411419782, + "loss": 1.2087, + "step": 7147 + }, + { + "epoch": 0.2559851021540996, + "grad_norm": 1.576823115348816, + "learning_rate": 0.00017439075319653928, + "loss": 1.3735, + "step": 7148 + }, + { + "epoch": 0.25602091428366786, + "grad_norm": 1.647093653678894, + "learning_rate": 0.0001743830012780491, + "loss": 1.1936, + "step": 7149 + }, + { + "epoch": 0.2560567264132362, + "grad_norm": 1.8962364196777344, + "learning_rate": 0.00017437524835883157, + "loss": 1.3083, + "step": 7150 + }, + { + "epoch": 0.25609253854280445, + "grad_norm": 1.3505964279174805, + "learning_rate": 0.00017436749443899103, + "loss": 1.2571, + "step": 7151 + }, + { + "epoch": 0.2561283506723727, + "grad_norm": 1.8000810146331787, + "learning_rate": 0.00017435973951863179, + "loss": 1.1305, + "step": 7152 + }, + { + "epoch": 0.25616416280194104, + "grad_norm": 2.2368438243865967, + "learning_rate": 0.0001743519835978581, + "loss": 1.0568, + "step": 7153 + }, + { + "epoch": 0.2561999749315093, + "grad_norm": 1.4775089025497437, + "learning_rate": 0.00017434422667677446, + "loss": 1.2193, + "step": 7154 + }, + { + "epoch": 0.2562357870610776, + "grad_norm": 1.4503251314163208, + "learning_rate": 0.00017433646875548512, + "loss": 1.1911, + "step": 7155 + }, + { + "epoch": 0.25627159919064585, + "grad_norm": 1.5076212882995605, + "learning_rate": 0.0001743287098340945, + "loss": 1.2264, + "step": 7156 + }, + { + "epoch": 0.2563074113202142, + "grad_norm": 1.700514554977417, + "learning_rate": 0.00017432094991270692, + "loss": 1.2037, + "step": 7157 + }, + { + "epoch": 0.25634322344978244, + "grad_norm": 1.6732909679412842, + "learning_rate": 0.00017431318899142686, + "loss": 1.384, + "step": 7158 + }, + { + "epoch": 0.2563790355793507, + "grad_norm": 1.4447669982910156, + "learning_rate": 0.0001743054270703587, + "loss": 1.1093, + "step": 7159 + }, + { + "epoch": 0.25641484770891904, + "grad_norm": 1.8685051202774048, + "learning_rate": 0.00017429766414960685, + "loss": 1.2571, + "step": 7160 + }, + { + "epoch": 0.2564506598384873, + "grad_norm": 1.913029432296753, + "learning_rate": 0.0001742899002292758, + "loss": 1.407, + "step": 7161 + }, + { + "epoch": 0.2564864719680556, + "grad_norm": 1.1556092500686646, + "learning_rate": 0.00017428213530946995, + "loss": 1.2041, + "step": 7162 + }, + { + "epoch": 0.25652228409762384, + "grad_norm": 2.2842626571655273, + "learning_rate": 0.00017427436939029378, + "loss": 1.2859, + "step": 7163 + }, + { + "epoch": 0.25655809622719217, + "grad_norm": 1.7235199213027954, + "learning_rate": 0.00017426660247185177, + "loss": 1.1219, + "step": 7164 + }, + { + "epoch": 0.25659390835676044, + "grad_norm": 1.5752438306808472, + "learning_rate": 0.0001742588345542484, + "loss": 1.354, + "step": 7165 + }, + { + "epoch": 0.2566297204863287, + "grad_norm": 1.3234152793884277, + "learning_rate": 0.00017425106563758824, + "loss": 1.1277, + "step": 7166 + }, + { + "epoch": 0.25666553261589703, + "grad_norm": 1.8224986791610718, + "learning_rate": 0.00017424329572197578, + "loss": 1.1957, + "step": 7167 + }, + { + "epoch": 0.2567013447454653, + "grad_norm": 1.4783951044082642, + "learning_rate": 0.0001742355248075155, + "loss": 1.126, + "step": 7168 + }, + { + "epoch": 0.25673715687503357, + "grad_norm": 1.3045965433120728, + "learning_rate": 0.00017422775289431202, + "loss": 1.173, + "step": 7169 + }, + { + "epoch": 0.25677296900460184, + "grad_norm": 1.5593605041503906, + "learning_rate": 0.00017421997998246985, + "loss": 1.1709, + "step": 7170 + }, + { + "epoch": 0.25680878113417016, + "grad_norm": 2.246901035308838, + "learning_rate": 0.0001742122060720936, + "loss": 1.3015, + "step": 7171 + }, + { + "epoch": 0.25684459326373843, + "grad_norm": 1.3841832876205444, + "learning_rate": 0.00017420443116328784, + "loss": 0.9719, + "step": 7172 + }, + { + "epoch": 0.2568804053933067, + "grad_norm": 1.2411484718322754, + "learning_rate": 0.0001741966552561572, + "loss": 1.0677, + "step": 7173 + }, + { + "epoch": 0.256916217522875, + "grad_norm": 1.8778218030929565, + "learning_rate": 0.00017418887835080624, + "loss": 1.1167, + "step": 7174 + }, + { + "epoch": 0.2569520296524433, + "grad_norm": 1.5798085927963257, + "learning_rate": 0.0001741811004473396, + "loss": 1.0552, + "step": 7175 + }, + { + "epoch": 0.25698784178201156, + "grad_norm": 2.240093469619751, + "learning_rate": 0.000174173321545862, + "loss": 1.1182, + "step": 7176 + }, + { + "epoch": 0.25702365391157983, + "grad_norm": 1.3528108596801758, + "learning_rate": 0.000174165541646478, + "loss": 1.194, + "step": 7177 + }, + { + "epoch": 0.25705946604114815, + "grad_norm": 1.3988733291625977, + "learning_rate": 0.0001741577607492923, + "loss": 1.1188, + "step": 7178 + }, + { + "epoch": 0.2570952781707164, + "grad_norm": 1.42152738571167, + "learning_rate": 0.00017414997885440957, + "loss": 1.1683, + "step": 7179 + }, + { + "epoch": 0.2571310903002847, + "grad_norm": 1.5007855892181396, + "learning_rate": 0.00017414219596193455, + "loss": 1.0256, + "step": 7180 + }, + { + "epoch": 0.257166902429853, + "grad_norm": 1.6339985132217407, + "learning_rate": 0.0001741344120719719, + "loss": 1.0901, + "step": 7181 + }, + { + "epoch": 0.2572027145594213, + "grad_norm": 1.6358048915863037, + "learning_rate": 0.00017412662718462637, + "loss": 1.1589, + "step": 7182 + }, + { + "epoch": 0.25723852668898955, + "grad_norm": 1.4716299772262573, + "learning_rate": 0.00017411884130000271, + "loss": 1.1468, + "step": 7183 + }, + { + "epoch": 0.2572743388185578, + "grad_norm": 1.9147197008132935, + "learning_rate": 0.00017411105441820563, + "loss": 1.2272, + "step": 7184 + }, + { + "epoch": 0.25731015094812615, + "grad_norm": 1.601464867591858, + "learning_rate": 0.0001741032665393399, + "loss": 1.2458, + "step": 7185 + }, + { + "epoch": 0.2573459630776944, + "grad_norm": 1.8370518684387207, + "learning_rate": 0.00017409547766351034, + "loss": 1.107, + "step": 7186 + }, + { + "epoch": 0.2573817752072627, + "grad_norm": 1.7786418199539185, + "learning_rate": 0.00017408768779082165, + "loss": 1.1815, + "step": 7187 + }, + { + "epoch": 0.257417587336831, + "grad_norm": 1.4073587656021118, + "learning_rate": 0.00017407989692137872, + "loss": 1.1436, + "step": 7188 + }, + { + "epoch": 0.2574533994663993, + "grad_norm": 2.088529586791992, + "learning_rate": 0.0001740721050552863, + "loss": 1.2332, + "step": 7189 + }, + { + "epoch": 0.25748921159596755, + "grad_norm": 1.9605083465576172, + "learning_rate": 0.0001740643121926493, + "loss": 1.0957, + "step": 7190 + }, + { + "epoch": 0.2575250237255358, + "grad_norm": 1.8863017559051514, + "learning_rate": 0.0001740565183335725, + "loss": 1.1425, + "step": 7191 + }, + { + "epoch": 0.25756083585510414, + "grad_norm": 1.4277348518371582, + "learning_rate": 0.00017404872347816076, + "loss": 0.9569, + "step": 7192 + }, + { + "epoch": 0.2575966479846724, + "grad_norm": 1.7897828817367554, + "learning_rate": 0.00017404092762651898, + "loss": 1.1977, + "step": 7193 + }, + { + "epoch": 0.2576324601142407, + "grad_norm": 1.9901039600372314, + "learning_rate": 0.000174033130778752, + "loss": 1.1982, + "step": 7194 + }, + { + "epoch": 0.257668272243809, + "grad_norm": 1.6629902124404907, + "learning_rate": 0.00017402533293496477, + "loss": 1.0872, + "step": 7195 + }, + { + "epoch": 0.2577040843733773, + "grad_norm": 1.5523282289505005, + "learning_rate": 0.00017401753409526216, + "loss": 1.3142, + "step": 7196 + }, + { + "epoch": 0.25773989650294554, + "grad_norm": 1.8505008220672607, + "learning_rate": 0.0001740097342597491, + "loss": 1.2569, + "step": 7197 + }, + { + "epoch": 0.2577757086325138, + "grad_norm": 1.7557306289672852, + "learning_rate": 0.0001740019334285305, + "loss": 1.1292, + "step": 7198 + }, + { + "epoch": 0.25781152076208214, + "grad_norm": 1.7141106128692627, + "learning_rate": 0.0001739941316017114, + "loss": 1.1407, + "step": 7199 + }, + { + "epoch": 0.2578473328916504, + "grad_norm": 1.6724861860275269, + "learning_rate": 0.00017398632877939666, + "loss": 1.1329, + "step": 7200 + }, + { + "epoch": 0.2578831450212187, + "grad_norm": 1.262244701385498, + "learning_rate": 0.00017397852496169134, + "loss": 1.0871, + "step": 7201 + }, + { + "epoch": 0.257918957150787, + "grad_norm": 1.6686761379241943, + "learning_rate": 0.00017397072014870037, + "loss": 1.0849, + "step": 7202 + }, + { + "epoch": 0.25795476928035527, + "grad_norm": 1.7917226552963257, + "learning_rate": 0.00017396291434052877, + "loss": 1.1155, + "step": 7203 + }, + { + "epoch": 0.25799058140992354, + "grad_norm": 1.345262050628662, + "learning_rate": 0.00017395510753728157, + "loss": 1.0341, + "step": 7204 + }, + { + "epoch": 0.2580263935394918, + "grad_norm": 1.7890543937683105, + "learning_rate": 0.0001739472997390638, + "loss": 1.2916, + "step": 7205 + }, + { + "epoch": 0.25806220566906013, + "grad_norm": 2.20346999168396, + "learning_rate": 0.00017393949094598047, + "loss": 1.063, + "step": 7206 + }, + { + "epoch": 0.2580980177986284, + "grad_norm": 1.5609447956085205, + "learning_rate": 0.00017393168115813673, + "loss": 1.1983, + "step": 7207 + }, + { + "epoch": 0.25813382992819667, + "grad_norm": 1.4354445934295654, + "learning_rate": 0.0001739238703756375, + "loss": 1.2157, + "step": 7208 + }, + { + "epoch": 0.258169642057765, + "grad_norm": 1.6613332033157349, + "learning_rate": 0.00017391605859858798, + "loss": 1.1714, + "step": 7209 + }, + { + "epoch": 0.25820545418733326, + "grad_norm": 1.5549300909042358, + "learning_rate": 0.00017390824582709326, + "loss": 1.3959, + "step": 7210 + }, + { + "epoch": 0.25824126631690153, + "grad_norm": 1.3821275234222412, + "learning_rate": 0.0001739004320612584, + "loss": 1.284, + "step": 7211 + }, + { + "epoch": 0.2582770784464698, + "grad_norm": 1.6809078454971313, + "learning_rate": 0.00017389261730118858, + "loss": 1.3263, + "step": 7212 + }, + { + "epoch": 0.2583128905760381, + "grad_norm": 1.6380234956741333, + "learning_rate": 0.0001738848015469889, + "loss": 1.1384, + "step": 7213 + }, + { + "epoch": 0.2583487027056064, + "grad_norm": 1.9011222124099731, + "learning_rate": 0.0001738769847987645, + "loss": 1.0339, + "step": 7214 + }, + { + "epoch": 0.25838451483517466, + "grad_norm": 1.616592526435852, + "learning_rate": 0.0001738691670566206, + "loss": 1.051, + "step": 7215 + }, + { + "epoch": 0.258420326964743, + "grad_norm": 1.412545084953308, + "learning_rate": 0.0001738613483206623, + "loss": 1.2047, + "step": 7216 + }, + { + "epoch": 0.25845613909431125, + "grad_norm": 1.502881407737732, + "learning_rate": 0.00017385352859099483, + "loss": 1.3366, + "step": 7217 + }, + { + "epoch": 0.2584919512238795, + "grad_norm": 1.7075620889663696, + "learning_rate": 0.00017384570786772345, + "loss": 1.4103, + "step": 7218 + }, + { + "epoch": 0.2585277633534478, + "grad_norm": 2.047956943511963, + "learning_rate": 0.00017383788615095327, + "loss": 1.2683, + "step": 7219 + }, + { + "epoch": 0.2585635754830161, + "grad_norm": 1.4336856603622437, + "learning_rate": 0.0001738300634407896, + "loss": 1.1843, + "step": 7220 + }, + { + "epoch": 0.2585993876125844, + "grad_norm": 1.1876418590545654, + "learning_rate": 0.00017382223973733767, + "loss": 1.1454, + "step": 7221 + }, + { + "epoch": 0.25863519974215265, + "grad_norm": 1.5936414003372192, + "learning_rate": 0.0001738144150407027, + "loss": 1.2843, + "step": 7222 + }, + { + "epoch": 0.258671011871721, + "grad_norm": 1.5446679592132568, + "learning_rate": 0.00017380658935099, + "loss": 1.2551, + "step": 7223 + }, + { + "epoch": 0.25870682400128925, + "grad_norm": 1.8139145374298096, + "learning_rate": 0.00017379876266830486, + "loss": 1.2992, + "step": 7224 + }, + { + "epoch": 0.2587426361308575, + "grad_norm": 1.3816314935684204, + "learning_rate": 0.00017379093499275258, + "loss": 1.2002, + "step": 7225 + }, + { + "epoch": 0.2587784482604258, + "grad_norm": 1.1046422719955444, + "learning_rate": 0.00017378310632443843, + "loss": 0.899, + "step": 7226 + }, + { + "epoch": 0.2588142603899941, + "grad_norm": 1.6747568845748901, + "learning_rate": 0.00017377527666346772, + "loss": 1.212, + "step": 7227 + }, + { + "epoch": 0.2588500725195624, + "grad_norm": 1.2782466411590576, + "learning_rate": 0.00017376744600994587, + "loss": 1.2298, + "step": 7228 + }, + { + "epoch": 0.25888588464913065, + "grad_norm": 1.5557337999343872, + "learning_rate": 0.00017375961436397818, + "loss": 0.9647, + "step": 7229 + }, + { + "epoch": 0.2589216967786989, + "grad_norm": 1.5775898694992065, + "learning_rate": 0.00017375178172567002, + "loss": 1.1811, + "step": 7230 + }, + { + "epoch": 0.25895750890826724, + "grad_norm": 1.6436522006988525, + "learning_rate": 0.00017374394809512676, + "loss": 1.1322, + "step": 7231 + }, + { + "epoch": 0.2589933210378355, + "grad_norm": 1.3805245161056519, + "learning_rate": 0.0001737361134724538, + "loss": 1.1028, + "step": 7232 + }, + { + "epoch": 0.2590291331674038, + "grad_norm": 1.6215946674346924, + "learning_rate": 0.00017372827785775655, + "loss": 1.3378, + "step": 7233 + }, + { + "epoch": 0.2590649452969721, + "grad_norm": 1.921344518661499, + "learning_rate": 0.00017372044125114045, + "loss": 1.1338, + "step": 7234 + }, + { + "epoch": 0.2591007574265404, + "grad_norm": 2.3236236572265625, + "learning_rate": 0.0001737126036527109, + "loss": 1.2852, + "step": 7235 + }, + { + "epoch": 0.25913656955610864, + "grad_norm": 1.5052186250686646, + "learning_rate": 0.00017370476506257333, + "loss": 1.1768, + "step": 7236 + }, + { + "epoch": 0.2591723816856769, + "grad_norm": 1.8754909038543701, + "learning_rate": 0.0001736969254808332, + "loss": 1.2468, + "step": 7237 + }, + { + "epoch": 0.25920819381524524, + "grad_norm": 2.0417158603668213, + "learning_rate": 0.00017368908490759605, + "loss": 1.2782, + "step": 7238 + }, + { + "epoch": 0.2592440059448135, + "grad_norm": 1.9712940454483032, + "learning_rate": 0.00017368124334296727, + "loss": 1.1325, + "step": 7239 + }, + { + "epoch": 0.2592798180743818, + "grad_norm": 1.922357201576233, + "learning_rate": 0.00017367340078705242, + "loss": 1.1854, + "step": 7240 + }, + { + "epoch": 0.2593156302039501, + "grad_norm": 1.7811118364334106, + "learning_rate": 0.000173665557239957, + "loss": 1.3036, + "step": 7241 + }, + { + "epoch": 0.25935144233351837, + "grad_norm": 1.8732304573059082, + "learning_rate": 0.00017365771270178652, + "loss": 1.2322, + "step": 7242 + }, + { + "epoch": 0.25938725446308664, + "grad_norm": 1.7698942422866821, + "learning_rate": 0.00017364986717264652, + "loss": 1.2604, + "step": 7243 + }, + { + "epoch": 0.2594230665926549, + "grad_norm": 1.4482903480529785, + "learning_rate": 0.00017364202065264258, + "loss": 1.2888, + "step": 7244 + }, + { + "epoch": 0.25945887872222323, + "grad_norm": 2.052699565887451, + "learning_rate": 0.00017363417314188024, + "loss": 1.3979, + "step": 7245 + }, + { + "epoch": 0.2594946908517915, + "grad_norm": 1.4508880376815796, + "learning_rate": 0.00017362632464046506, + "loss": 1.2219, + "step": 7246 + }, + { + "epoch": 0.25953050298135977, + "grad_norm": 1.7148813009262085, + "learning_rate": 0.00017361847514850266, + "loss": 1.1648, + "step": 7247 + }, + { + "epoch": 0.2595663151109281, + "grad_norm": 1.492763638496399, + "learning_rate": 0.00017361062466609867, + "loss": 0.9444, + "step": 7248 + }, + { + "epoch": 0.25960212724049636, + "grad_norm": 2.012239694595337, + "learning_rate": 0.00017360277319335865, + "loss": 1.2838, + "step": 7249 + }, + { + "epoch": 0.25963793937006463, + "grad_norm": 1.9670828580856323, + "learning_rate": 0.00017359492073038826, + "loss": 1.438, + "step": 7250 + }, + { + "epoch": 0.2596737514996329, + "grad_norm": 1.4769389629364014, + "learning_rate": 0.00017358706727729311, + "loss": 1.0992, + "step": 7251 + }, + { + "epoch": 0.2597095636292012, + "grad_norm": 1.2985421419143677, + "learning_rate": 0.00017357921283417892, + "loss": 1.2063, + "step": 7252 + }, + { + "epoch": 0.2597453757587695, + "grad_norm": 1.6455059051513672, + "learning_rate": 0.00017357135740115137, + "loss": 1.1314, + "step": 7253 + }, + { + "epoch": 0.25978118788833776, + "grad_norm": 1.6343854665756226, + "learning_rate": 0.00017356350097831605, + "loss": 1.0559, + "step": 7254 + }, + { + "epoch": 0.2598170000179061, + "grad_norm": 1.5274266004562378, + "learning_rate": 0.00017355564356577873, + "loss": 1.2557, + "step": 7255 + }, + { + "epoch": 0.25985281214747435, + "grad_norm": 1.7981528043746948, + "learning_rate": 0.00017354778516364512, + "loss": 1.0807, + "step": 7256 + }, + { + "epoch": 0.2598886242770426, + "grad_norm": 1.6358364820480347, + "learning_rate": 0.00017353992577202093, + "loss": 1.2064, + "step": 7257 + }, + { + "epoch": 0.2599244364066109, + "grad_norm": 1.7501251697540283, + "learning_rate": 0.00017353206539101186, + "loss": 1.1552, + "step": 7258 + }, + { + "epoch": 0.2599602485361792, + "grad_norm": 1.4871864318847656, + "learning_rate": 0.00017352420402072375, + "loss": 1.0859, + "step": 7259 + }, + { + "epoch": 0.2599960606657475, + "grad_norm": 1.3607617616653442, + "learning_rate": 0.00017351634166126227, + "loss": 1.2684, + "step": 7260 + }, + { + "epoch": 0.26003187279531575, + "grad_norm": 1.6695120334625244, + "learning_rate": 0.00017350847831273329, + "loss": 1.3523, + "step": 7261 + }, + { + "epoch": 0.2600676849248841, + "grad_norm": 1.6184518337249756, + "learning_rate": 0.00017350061397524252, + "loss": 1.2264, + "step": 7262 + }, + { + "epoch": 0.26010349705445235, + "grad_norm": 1.334600567817688, + "learning_rate": 0.0001734927486488958, + "loss": 1.2186, + "step": 7263 + }, + { + "epoch": 0.2601393091840206, + "grad_norm": 1.6608799695968628, + "learning_rate": 0.00017348488233379897, + "loss": 1.3569, + "step": 7264 + }, + { + "epoch": 0.2601751213135889, + "grad_norm": 1.5877805948257446, + "learning_rate": 0.0001734770150300578, + "loss": 1.2099, + "step": 7265 + }, + { + "epoch": 0.2602109334431572, + "grad_norm": 1.5152349472045898, + "learning_rate": 0.00017346914673777822, + "loss": 1.3287, + "step": 7266 + }, + { + "epoch": 0.2602467455727255, + "grad_norm": 1.5900753736495972, + "learning_rate": 0.000173461277457066, + "loss": 1.1304, + "step": 7267 + }, + { + "epoch": 0.26028255770229375, + "grad_norm": 1.422868251800537, + "learning_rate": 0.00017345340718802704, + "loss": 1.2217, + "step": 7268 + }, + { + "epoch": 0.2603183698318621, + "grad_norm": 2.4070417881011963, + "learning_rate": 0.00017344553593076726, + "loss": 1.0859, + "step": 7269 + }, + { + "epoch": 0.26035418196143034, + "grad_norm": 2.009625196456909, + "learning_rate": 0.00017343766368539253, + "loss": 1.221, + "step": 7270 + }, + { + "epoch": 0.2603899940909986, + "grad_norm": 1.6786373853683472, + "learning_rate": 0.00017342979045200876, + "loss": 1.4259, + "step": 7271 + }, + { + "epoch": 0.2604258062205669, + "grad_norm": 1.911194920539856, + "learning_rate": 0.00017342191623072187, + "loss": 1.0755, + "step": 7272 + }, + { + "epoch": 0.2604616183501352, + "grad_norm": 1.3173848390579224, + "learning_rate": 0.00017341404102163782, + "loss": 1.2238, + "step": 7273 + }, + { + "epoch": 0.2604974304797035, + "grad_norm": 1.8591009378433228, + "learning_rate": 0.00017340616482486253, + "loss": 1.2994, + "step": 7274 + }, + { + "epoch": 0.26053324260927174, + "grad_norm": 1.3631057739257812, + "learning_rate": 0.00017339828764050198, + "loss": 1.0757, + "step": 7275 + }, + { + "epoch": 0.26056905473884007, + "grad_norm": 1.3886592388153076, + "learning_rate": 0.00017339040946866217, + "loss": 1.2404, + "step": 7276 + }, + { + "epoch": 0.26060486686840834, + "grad_norm": 1.7074180841445923, + "learning_rate": 0.00017338253030944905, + "loss": 1.1965, + "step": 7277 + }, + { + "epoch": 0.2606406789979766, + "grad_norm": 1.7405105829238892, + "learning_rate": 0.00017337465016296864, + "loss": 1.1424, + "step": 7278 + }, + { + "epoch": 0.2606764911275449, + "grad_norm": 1.6604058742523193, + "learning_rate": 0.00017336676902932695, + "loss": 1.0883, + "step": 7279 + }, + { + "epoch": 0.2607123032571132, + "grad_norm": 1.6709595918655396, + "learning_rate": 0.00017335888690863, + "loss": 1.0818, + "step": 7280 + }, + { + "epoch": 0.26074811538668147, + "grad_norm": 1.6881581544876099, + "learning_rate": 0.00017335100380098392, + "loss": 1.272, + "step": 7281 + }, + { + "epoch": 0.26078392751624974, + "grad_norm": 1.4436806440353394, + "learning_rate": 0.00017334311970649465, + "loss": 1.0874, + "step": 7282 + }, + { + "epoch": 0.26081973964581806, + "grad_norm": 1.5656092166900635, + "learning_rate": 0.00017333523462526832, + "loss": 1.1484, + "step": 7283 + }, + { + "epoch": 0.26085555177538633, + "grad_norm": 1.8230680227279663, + "learning_rate": 0.000173327348557411, + "loss": 1.2727, + "step": 7284 + }, + { + "epoch": 0.2608913639049546, + "grad_norm": 1.3366475105285645, + "learning_rate": 0.00017331946150302878, + "loss": 1.1538, + "step": 7285 + }, + { + "epoch": 0.26092717603452287, + "grad_norm": 1.5695160627365112, + "learning_rate": 0.00017331157346222779, + "loss": 1.0371, + "step": 7286 + }, + { + "epoch": 0.2609629881640912, + "grad_norm": 1.6189770698547363, + "learning_rate": 0.00017330368443511417, + "loss": 1.27, + "step": 7287 + }, + { + "epoch": 0.26099880029365946, + "grad_norm": 1.2986291646957397, + "learning_rate": 0.00017329579442179401, + "loss": 1.2444, + "step": 7288 + }, + { + "epoch": 0.26103461242322773, + "grad_norm": 1.8389496803283691, + "learning_rate": 0.00017328790342237347, + "loss": 1.0744, + "step": 7289 + }, + { + "epoch": 0.26107042455279605, + "grad_norm": 1.2849242687225342, + "learning_rate": 0.00017328001143695874, + "loss": 0.9641, + "step": 7290 + }, + { + "epoch": 0.2611062366823643, + "grad_norm": 1.4691364765167236, + "learning_rate": 0.00017327211846565596, + "loss": 1.1263, + "step": 7291 + }, + { + "epoch": 0.2611420488119326, + "grad_norm": 1.4898836612701416, + "learning_rate": 0.0001732642245085714, + "loss": 1.2284, + "step": 7292 + }, + { + "epoch": 0.26117786094150086, + "grad_norm": 1.2873886823654175, + "learning_rate": 0.00017325632956581113, + "loss": 1.1529, + "step": 7293 + }, + { + "epoch": 0.2612136730710692, + "grad_norm": 1.7339701652526855, + "learning_rate": 0.00017324843363748148, + "loss": 1.1707, + "step": 7294 + }, + { + "epoch": 0.26124948520063745, + "grad_norm": 1.555395483970642, + "learning_rate": 0.00017324053672368862, + "loss": 1.1575, + "step": 7295 + }, + { + "epoch": 0.2612852973302057, + "grad_norm": 1.8398436307907104, + "learning_rate": 0.0001732326388245388, + "loss": 1.1484, + "step": 7296 + }, + { + "epoch": 0.26132110945977405, + "grad_norm": 2.757620096206665, + "learning_rate": 0.00017322473994013833, + "loss": 1.1362, + "step": 7297 + }, + { + "epoch": 0.2613569215893423, + "grad_norm": 1.592272162437439, + "learning_rate": 0.00017321684007059343, + "loss": 1.0656, + "step": 7298 + }, + { + "epoch": 0.2613927337189106, + "grad_norm": 1.6221569776535034, + "learning_rate": 0.00017320893921601036, + "loss": 1.0265, + "step": 7299 + }, + { + "epoch": 0.26142854584847885, + "grad_norm": 1.4163581132888794, + "learning_rate": 0.00017320103737649548, + "loss": 1.0619, + "step": 7300 + }, + { + "epoch": 0.2614643579780472, + "grad_norm": 1.1924874782562256, + "learning_rate": 0.00017319313455215504, + "loss": 1.0446, + "step": 7301 + }, + { + "epoch": 0.26150017010761545, + "grad_norm": 1.4980510473251343, + "learning_rate": 0.00017318523074309538, + "loss": 1.3772, + "step": 7302 + }, + { + "epoch": 0.2615359822371837, + "grad_norm": 1.5520496368408203, + "learning_rate": 0.00017317732594942286, + "loss": 1.1475, + "step": 7303 + }, + { + "epoch": 0.26157179436675204, + "grad_norm": 2.0780911445617676, + "learning_rate": 0.0001731694201712438, + "loss": 1.1659, + "step": 7304 + }, + { + "epoch": 0.2616076064963203, + "grad_norm": 3.3492543697357178, + "learning_rate": 0.0001731615134086646, + "loss": 1.1829, + "step": 7305 + }, + { + "epoch": 0.2616434186258886, + "grad_norm": 1.7790703773498535, + "learning_rate": 0.00017315360566179158, + "loss": 1.1606, + "step": 7306 + }, + { + "epoch": 0.26167923075545685, + "grad_norm": 1.5327907800674438, + "learning_rate": 0.00017314569693073115, + "loss": 1.1879, + "step": 7307 + }, + { + "epoch": 0.2617150428850252, + "grad_norm": 1.6576557159423828, + "learning_rate": 0.00017313778721558975, + "loss": 1.2047, + "step": 7308 + }, + { + "epoch": 0.26175085501459344, + "grad_norm": 1.6729618310928345, + "learning_rate": 0.00017312987651647374, + "loss": 1.1581, + "step": 7309 + }, + { + "epoch": 0.2617866671441617, + "grad_norm": 1.4508607387542725, + "learning_rate": 0.0001731219648334896, + "loss": 1.4385, + "step": 7310 + }, + { + "epoch": 0.26182247927373004, + "grad_norm": 1.6770191192626953, + "learning_rate": 0.00017311405216674373, + "loss": 1.226, + "step": 7311 + }, + { + "epoch": 0.2618582914032983, + "grad_norm": 1.9501619338989258, + "learning_rate": 0.00017310613851634257, + "loss": 1.3391, + "step": 7312 + }, + { + "epoch": 0.2618941035328666, + "grad_norm": 2.0423357486724854, + "learning_rate": 0.00017309822388239266, + "loss": 1.3121, + "step": 7313 + }, + { + "epoch": 0.26192991566243484, + "grad_norm": 1.6718720197677612, + "learning_rate": 0.0001730903082650004, + "loss": 1.1174, + "step": 7314 + }, + { + "epoch": 0.26196572779200317, + "grad_norm": 1.826229214668274, + "learning_rate": 0.00017308239166427232, + "loss": 1.3993, + "step": 7315 + }, + { + "epoch": 0.26200153992157144, + "grad_norm": 1.764124870300293, + "learning_rate": 0.00017307447408031497, + "loss": 1.2414, + "step": 7316 + }, + { + "epoch": 0.2620373520511397, + "grad_norm": 1.8078981637954712, + "learning_rate": 0.0001730665555132348, + "loss": 1.0949, + "step": 7317 + }, + { + "epoch": 0.26207316418070803, + "grad_norm": 1.3857616186141968, + "learning_rate": 0.00017305863596313837, + "loss": 1.1573, + "step": 7318 + }, + { + "epoch": 0.2621089763102763, + "grad_norm": 1.4906349182128906, + "learning_rate": 0.00017305071543013227, + "loss": 1.1502, + "step": 7319 + }, + { + "epoch": 0.26214478843984457, + "grad_norm": 1.3596410751342773, + "learning_rate": 0.000173042793914323, + "loss": 1.0726, + "step": 7320 + }, + { + "epoch": 0.26218060056941284, + "grad_norm": 1.5081144571304321, + "learning_rate": 0.00017303487141581716, + "loss": 1.2169, + "step": 7321 + }, + { + "epoch": 0.26221641269898116, + "grad_norm": 2.1751115322113037, + "learning_rate": 0.0001730269479347213, + "loss": 1.2382, + "step": 7322 + }, + { + "epoch": 0.26225222482854943, + "grad_norm": 1.5981663465499878, + "learning_rate": 0.00017301902347114208, + "loss": 1.1012, + "step": 7323 + }, + { + "epoch": 0.2622880369581177, + "grad_norm": 1.8189399242401123, + "learning_rate": 0.0001730110980251861, + "loss": 1.3035, + "step": 7324 + }, + { + "epoch": 0.262323849087686, + "grad_norm": 1.6273865699768066, + "learning_rate": 0.00017300317159695995, + "loss": 1.1135, + "step": 7325 + }, + { + "epoch": 0.2623596612172543, + "grad_norm": 1.882949948310852, + "learning_rate": 0.0001729952441865703, + "loss": 1.3242, + "step": 7326 + }, + { + "epoch": 0.26239547334682256, + "grad_norm": 1.4525009393692017, + "learning_rate": 0.0001729873157941238, + "loss": 1.1714, + "step": 7327 + }, + { + "epoch": 0.26243128547639083, + "grad_norm": 1.4362889528274536, + "learning_rate": 0.00017297938641972716, + "loss": 1.0731, + "step": 7328 + }, + { + "epoch": 0.26246709760595915, + "grad_norm": 2.1219899654388428, + "learning_rate": 0.00017297145606348695, + "loss": 1.0916, + "step": 7329 + }, + { + "epoch": 0.2625029097355274, + "grad_norm": 1.6916478872299194, + "learning_rate": 0.00017296352472550994, + "loss": 1.3608, + "step": 7330 + }, + { + "epoch": 0.2625387218650957, + "grad_norm": 1.8163094520568848, + "learning_rate": 0.00017295559240590282, + "loss": 1.3979, + "step": 7331 + }, + { + "epoch": 0.262574533994664, + "grad_norm": 1.5225406885147095, + "learning_rate": 0.00017294765910477234, + "loss": 1.2785, + "step": 7332 + }, + { + "epoch": 0.2626103461242323, + "grad_norm": 1.3070849180221558, + "learning_rate": 0.00017293972482222515, + "loss": 1.1813, + "step": 7333 + }, + { + "epoch": 0.26264615825380055, + "grad_norm": 2.067491292953491, + "learning_rate": 0.00017293178955836807, + "loss": 1.2882, + "step": 7334 + }, + { + "epoch": 0.2626819703833688, + "grad_norm": 1.5734772682189941, + "learning_rate": 0.00017292385331330786, + "loss": 1.1799, + "step": 7335 + }, + { + "epoch": 0.26271778251293715, + "grad_norm": 1.383597493171692, + "learning_rate": 0.00017291591608715123, + "loss": 1.1337, + "step": 7336 + }, + { + "epoch": 0.2627535946425054, + "grad_norm": 1.340307354927063, + "learning_rate": 0.00017290797788000503, + "loss": 1.1895, + "step": 7337 + }, + { + "epoch": 0.2627894067720737, + "grad_norm": 1.559213399887085, + "learning_rate": 0.00017290003869197603, + "loss": 1.3329, + "step": 7338 + }, + { + "epoch": 0.262825218901642, + "grad_norm": 1.4489567279815674, + "learning_rate": 0.00017289209852317102, + "loss": 0.9668, + "step": 7339 + }, + { + "epoch": 0.2628610310312103, + "grad_norm": 2.353193521499634, + "learning_rate": 0.00017288415737369689, + "loss": 1.1954, + "step": 7340 + }, + { + "epoch": 0.26289684316077855, + "grad_norm": 2.0701611042022705, + "learning_rate": 0.0001728762152436604, + "loss": 1.2315, + "step": 7341 + }, + { + "epoch": 0.2629326552903468, + "grad_norm": 1.778817057609558, + "learning_rate": 0.00017286827213316844, + "loss": 1.0562, + "step": 7342 + }, + { + "epoch": 0.26296846741991514, + "grad_norm": 1.4904241561889648, + "learning_rate": 0.0001728603280423279, + "loss": 1.2533, + "step": 7343 + }, + { + "epoch": 0.2630042795494834, + "grad_norm": 1.7368394136428833, + "learning_rate": 0.00017285238297124562, + "loss": 1.0094, + "step": 7344 + }, + { + "epoch": 0.2630400916790517, + "grad_norm": 1.7026493549346924, + "learning_rate": 0.00017284443692002846, + "loss": 1.3126, + "step": 7345 + }, + { + "epoch": 0.26307590380862, + "grad_norm": 1.2568191289901733, + "learning_rate": 0.00017283648988878343, + "loss": 1.2058, + "step": 7346 + }, + { + "epoch": 0.2631117159381883, + "grad_norm": 1.3988277912139893, + "learning_rate": 0.00017282854187761735, + "loss": 1.3001, + "step": 7347 + }, + { + "epoch": 0.26314752806775654, + "grad_norm": 1.7278004884719849, + "learning_rate": 0.00017282059288663715, + "loss": 1.177, + "step": 7348 + }, + { + "epoch": 0.2631833401973248, + "grad_norm": 1.699567198753357, + "learning_rate": 0.00017281264291594983, + "loss": 1.3038, + "step": 7349 + }, + { + "epoch": 0.26321915232689314, + "grad_norm": 1.8270342350006104, + "learning_rate": 0.00017280469196566235, + "loss": 1.1799, + "step": 7350 + }, + { + "epoch": 0.2632549644564614, + "grad_norm": 1.4414900541305542, + "learning_rate": 0.0001727967400358816, + "loss": 1.2131, + "step": 7351 + }, + { + "epoch": 0.2632907765860297, + "grad_norm": 1.9254294633865356, + "learning_rate": 0.00017278878712671464, + "loss": 1.2693, + "step": 7352 + }, + { + "epoch": 0.263326588715598, + "grad_norm": 1.4882842302322388, + "learning_rate": 0.00017278083323826846, + "loss": 1.2462, + "step": 7353 + }, + { + "epoch": 0.26336240084516627, + "grad_norm": 1.5258337259292603, + "learning_rate": 0.00017277287837065002, + "loss": 1.0989, + "step": 7354 + }, + { + "epoch": 0.26339821297473454, + "grad_norm": 1.716902256011963, + "learning_rate": 0.0001727649225239664, + "loss": 0.9756, + "step": 7355 + }, + { + "epoch": 0.2634340251043028, + "grad_norm": 1.6458845138549805, + "learning_rate": 0.00017275696569832457, + "loss": 1.2228, + "step": 7356 + }, + { + "epoch": 0.26346983723387113, + "grad_norm": 1.8200689554214478, + "learning_rate": 0.00017274900789383165, + "loss": 1.1489, + "step": 7357 + }, + { + "epoch": 0.2635056493634394, + "grad_norm": 1.344401240348816, + "learning_rate": 0.0001727410491105946, + "loss": 1.0635, + "step": 7358 + }, + { + "epoch": 0.26354146149300767, + "grad_norm": 1.720895767211914, + "learning_rate": 0.00017273308934872064, + "loss": 1.279, + "step": 7359 + }, + { + "epoch": 0.263577273622576, + "grad_norm": 1.5047377347946167, + "learning_rate": 0.00017272512860831674, + "loss": 1.4052, + "step": 7360 + }, + { + "epoch": 0.26361308575214426, + "grad_norm": 1.584534764289856, + "learning_rate": 0.00017271716688949007, + "loss": 1.2413, + "step": 7361 + }, + { + "epoch": 0.26364889788171253, + "grad_norm": 1.9319980144500732, + "learning_rate": 0.0001727092041923477, + "loss": 1.2008, + "step": 7362 + }, + { + "epoch": 0.2636847100112808, + "grad_norm": 1.5328196287155151, + "learning_rate": 0.00017270124051699682, + "loss": 1.2435, + "step": 7363 + }, + { + "epoch": 0.2637205221408491, + "grad_norm": 1.4358375072479248, + "learning_rate": 0.00017269327586354446, + "loss": 1.1821, + "step": 7364 + }, + { + "epoch": 0.2637563342704174, + "grad_norm": 1.516510009765625, + "learning_rate": 0.00017268531023209788, + "loss": 1.2237, + "step": 7365 + }, + { + "epoch": 0.26379214639998566, + "grad_norm": 2.1648635864257812, + "learning_rate": 0.0001726773436227642, + "loss": 1.2611, + "step": 7366 + }, + { + "epoch": 0.263827958529554, + "grad_norm": 1.533369541168213, + "learning_rate": 0.0001726693760356506, + "loss": 1.1408, + "step": 7367 + }, + { + "epoch": 0.26386377065912225, + "grad_norm": 1.4820351600646973, + "learning_rate": 0.0001726614074708643, + "loss": 0.9812, + "step": 7368 + }, + { + "epoch": 0.2638995827886905, + "grad_norm": 1.4317364692687988, + "learning_rate": 0.00017265343792851248, + "loss": 1.0077, + "step": 7369 + }, + { + "epoch": 0.2639353949182588, + "grad_norm": 1.423218011856079, + "learning_rate": 0.00017264546740870234, + "loss": 1.2552, + "step": 7370 + }, + { + "epoch": 0.2639712070478271, + "grad_norm": 1.3020496368408203, + "learning_rate": 0.0001726374959115412, + "loss": 1.0618, + "step": 7371 + }, + { + "epoch": 0.2640070191773954, + "grad_norm": 1.5702335834503174, + "learning_rate": 0.0001726295234371362, + "loss": 1.16, + "step": 7372 + }, + { + "epoch": 0.26404283130696365, + "grad_norm": 1.535727620124817, + "learning_rate": 0.00017262154998559466, + "loss": 1.2343, + "step": 7373 + }, + { + "epoch": 0.264078643436532, + "grad_norm": 1.702688455581665, + "learning_rate": 0.00017261357555702387, + "loss": 1.1807, + "step": 7374 + }, + { + "epoch": 0.26411445556610025, + "grad_norm": 1.7612309455871582, + "learning_rate": 0.00017260560015153106, + "loss": 1.3103, + "step": 7375 + }, + { + "epoch": 0.2641502676956685, + "grad_norm": 2.0340518951416016, + "learning_rate": 0.00017259762376922356, + "loss": 1.4239, + "step": 7376 + }, + { + "epoch": 0.2641860798252368, + "grad_norm": 1.7041476964950562, + "learning_rate": 0.00017258964641020868, + "loss": 1.1576, + "step": 7377 + }, + { + "epoch": 0.2642218919548051, + "grad_norm": 2.487048864364624, + "learning_rate": 0.0001725816680745937, + "loss": 1.1252, + "step": 7378 + }, + { + "epoch": 0.2642577040843734, + "grad_norm": 2.3464252948760986, + "learning_rate": 0.00017257368876248604, + "loss": 1.1348, + "step": 7379 + }, + { + "epoch": 0.26429351621394165, + "grad_norm": 1.2797359228134155, + "learning_rate": 0.000172565708473993, + "loss": 1.1235, + "step": 7380 + }, + { + "epoch": 0.26432932834351, + "grad_norm": 1.2892802953720093, + "learning_rate": 0.00017255772720922195, + "loss": 1.2394, + "step": 7381 + }, + { + "epoch": 0.26436514047307824, + "grad_norm": 1.2485171556472778, + "learning_rate": 0.0001725497449682803, + "loss": 1.0017, + "step": 7382 + }, + { + "epoch": 0.2644009526026465, + "grad_norm": 1.4161735773086548, + "learning_rate": 0.00017254176175127538, + "loss": 1.0983, + "step": 7383 + }, + { + "epoch": 0.2644367647322148, + "grad_norm": 2.123352289199829, + "learning_rate": 0.00017253377755831466, + "loss": 1.3545, + "step": 7384 + }, + { + "epoch": 0.2644725768617831, + "grad_norm": 1.549028992652893, + "learning_rate": 0.00017252579238950552, + "loss": 1.2624, + "step": 7385 + }, + { + "epoch": 0.2645083889913514, + "grad_norm": 1.774410367012024, + "learning_rate": 0.00017251780624495536, + "loss": 1.1682, + "step": 7386 + }, + { + "epoch": 0.26454420112091964, + "grad_norm": 1.9072667360305786, + "learning_rate": 0.0001725098191247717, + "loss": 1.1765, + "step": 7387 + }, + { + "epoch": 0.26458001325048797, + "grad_norm": 1.3498033285140991, + "learning_rate": 0.00017250183102906195, + "loss": 1.2857, + "step": 7388 + }, + { + "epoch": 0.26461582538005624, + "grad_norm": 1.2935940027236938, + "learning_rate": 0.00017249384195793357, + "loss": 1.0824, + "step": 7389 + }, + { + "epoch": 0.2646516375096245, + "grad_norm": 1.6049389839172363, + "learning_rate": 0.000172485851911494, + "loss": 1.3889, + "step": 7390 + }, + { + "epoch": 0.2646874496391928, + "grad_norm": 1.9654911756515503, + "learning_rate": 0.00017247786088985087, + "loss": 1.2713, + "step": 7391 + }, + { + "epoch": 0.2647232617687611, + "grad_norm": 1.7399989366531372, + "learning_rate": 0.0001724698688931116, + "loss": 1.3147, + "step": 7392 + }, + { + "epoch": 0.26475907389832937, + "grad_norm": 3.2606635093688965, + "learning_rate": 0.0001724618759213837, + "loss": 1.3502, + "step": 7393 + }, + { + "epoch": 0.26479488602789764, + "grad_norm": 1.9230647087097168, + "learning_rate": 0.00017245388197477477, + "loss": 1.1292, + "step": 7394 + }, + { + "epoch": 0.26483069815746596, + "grad_norm": 1.829436182975769, + "learning_rate": 0.0001724458870533923, + "loss": 1.2272, + "step": 7395 + }, + { + "epoch": 0.26486651028703423, + "grad_norm": 1.7466132640838623, + "learning_rate": 0.00017243789115734383, + "loss": 1.362, + "step": 7396 + }, + { + "epoch": 0.2649023224166025, + "grad_norm": 1.488226056098938, + "learning_rate": 0.00017242989428673701, + "loss": 1.1685, + "step": 7397 + }, + { + "epoch": 0.26493813454617077, + "grad_norm": 1.6506309509277344, + "learning_rate": 0.0001724218964416794, + "loss": 1.0455, + "step": 7398 + }, + { + "epoch": 0.2649739466757391, + "grad_norm": 2.077061176300049, + "learning_rate": 0.00017241389762227857, + "loss": 1.3576, + "step": 7399 + }, + { + "epoch": 0.26500975880530736, + "grad_norm": 1.8978804349899292, + "learning_rate": 0.00017240589782864215, + "loss": 1.2308, + "step": 7400 + }, + { + "epoch": 0.26504557093487563, + "grad_norm": 1.4193648099899292, + "learning_rate": 0.00017239789706087778, + "loss": 1.2513, + "step": 7401 + }, + { + "epoch": 0.26508138306444395, + "grad_norm": 1.3452273607254028, + "learning_rate": 0.0001723898953190931, + "loss": 1.3294, + "step": 7402 + }, + { + "epoch": 0.2651171951940122, + "grad_norm": 1.6419113874435425, + "learning_rate": 0.00017238189260339573, + "loss": 1.2377, + "step": 7403 + }, + { + "epoch": 0.2651530073235805, + "grad_norm": 1.2653530836105347, + "learning_rate": 0.00017237388891389336, + "loss": 1.1264, + "step": 7404 + }, + { + "epoch": 0.26518881945314876, + "grad_norm": 1.855352759361267, + "learning_rate": 0.0001723658842506937, + "loss": 1.0762, + "step": 7405 + }, + { + "epoch": 0.2652246315827171, + "grad_norm": 1.5729089975357056, + "learning_rate": 0.0001723578786139044, + "loss": 1.0039, + "step": 7406 + }, + { + "epoch": 0.26526044371228535, + "grad_norm": 1.8483325242996216, + "learning_rate": 0.00017234987200363317, + "loss": 1.1056, + "step": 7407 + }, + { + "epoch": 0.2652962558418536, + "grad_norm": 2.067095994949341, + "learning_rate": 0.00017234186441998777, + "loss": 1.2194, + "step": 7408 + }, + { + "epoch": 0.26533206797142195, + "grad_norm": 1.4411251544952393, + "learning_rate": 0.00017233385586307588, + "loss": 1.2185, + "step": 7409 + }, + { + "epoch": 0.2653678801009902, + "grad_norm": 1.511083960533142, + "learning_rate": 0.00017232584633300522, + "loss": 1.2313, + "step": 7410 + }, + { + "epoch": 0.2654036922305585, + "grad_norm": 1.3955132961273193, + "learning_rate": 0.00017231783582988367, + "loss": 1.1809, + "step": 7411 + }, + { + "epoch": 0.26543950436012675, + "grad_norm": 1.5914759635925293, + "learning_rate": 0.00017230982435381887, + "loss": 1.2787, + "step": 7412 + }, + { + "epoch": 0.2654753164896951, + "grad_norm": 1.7087160348892212, + "learning_rate": 0.00017230181190491862, + "loss": 1.1328, + "step": 7413 + }, + { + "epoch": 0.26551112861926335, + "grad_norm": 1.3701186180114746, + "learning_rate": 0.0001722937984832908, + "loss": 1.2365, + "step": 7414 + }, + { + "epoch": 0.2655469407488316, + "grad_norm": 1.452074646949768, + "learning_rate": 0.0001722857840890432, + "loss": 1.0252, + "step": 7415 + }, + { + "epoch": 0.26558275287839994, + "grad_norm": 1.663928508758545, + "learning_rate": 0.00017227776872228359, + "loss": 1.1407, + "step": 7416 + }, + { + "epoch": 0.2656185650079682, + "grad_norm": 1.481101155281067, + "learning_rate": 0.00017226975238311982, + "loss": 1.1144, + "step": 7417 + }, + { + "epoch": 0.2656543771375365, + "grad_norm": 1.9516242742538452, + "learning_rate": 0.00017226173507165976, + "loss": 1.3143, + "step": 7418 + }, + { + "epoch": 0.26569018926710475, + "grad_norm": 1.5094354152679443, + "learning_rate": 0.0001722537167880113, + "loss": 1.3574, + "step": 7419 + }, + { + "epoch": 0.2657260013966731, + "grad_norm": 1.637215495109558, + "learning_rate": 0.00017224569753228225, + "loss": 1.2047, + "step": 7420 + }, + { + "epoch": 0.26576181352624134, + "grad_norm": 1.61125910282135, + "learning_rate": 0.00017223767730458053, + "loss": 1.2594, + "step": 7421 + }, + { + "epoch": 0.2657976256558096, + "grad_norm": 1.3256739377975464, + "learning_rate": 0.00017222965610501405, + "loss": 1.1325, + "step": 7422 + }, + { + "epoch": 0.26583343778537794, + "grad_norm": 1.5749343633651733, + "learning_rate": 0.00017222163393369071, + "loss": 1.1945, + "step": 7423 + }, + { + "epoch": 0.2658692499149462, + "grad_norm": 1.3675041198730469, + "learning_rate": 0.00017221361079071846, + "loss": 1.4055, + "step": 7424 + }, + { + "epoch": 0.2659050620445145, + "grad_norm": 1.7534555196762085, + "learning_rate": 0.00017220558667620518, + "loss": 1.1929, + "step": 7425 + }, + { + "epoch": 0.26594087417408274, + "grad_norm": 1.6422646045684814, + "learning_rate": 0.0001721975615902589, + "loss": 1.2347, + "step": 7426 + }, + { + "epoch": 0.26597668630365107, + "grad_norm": 1.798872947692871, + "learning_rate": 0.00017218953553298759, + "loss": 1.2091, + "step": 7427 + }, + { + "epoch": 0.26601249843321934, + "grad_norm": 1.4656471014022827, + "learning_rate": 0.00017218150850449915, + "loss": 0.9486, + "step": 7428 + }, + { + "epoch": 0.2660483105627876, + "grad_norm": 1.3080193996429443, + "learning_rate": 0.00017217348050490162, + "loss": 1.3054, + "step": 7429 + }, + { + "epoch": 0.2660841226923559, + "grad_norm": 1.2965269088745117, + "learning_rate": 0.00017216545153430303, + "loss": 1.135, + "step": 7430 + }, + { + "epoch": 0.2661199348219242, + "grad_norm": 1.2715035676956177, + "learning_rate": 0.00017215742159281137, + "loss": 1.1285, + "step": 7431 + }, + { + "epoch": 0.26615574695149247, + "grad_norm": 1.5319154262542725, + "learning_rate": 0.00017214939068053468, + "loss": 1.152, + "step": 7432 + }, + { + "epoch": 0.26619155908106074, + "grad_norm": 1.665763020515442, + "learning_rate": 0.000172141358797581, + "loss": 1.2055, + "step": 7433 + }, + { + "epoch": 0.26622737121062906, + "grad_norm": 1.2327442169189453, + "learning_rate": 0.0001721333259440584, + "loss": 1.1269, + "step": 7434 + }, + { + "epoch": 0.26626318334019733, + "grad_norm": 1.2448166608810425, + "learning_rate": 0.00017212529212007492, + "loss": 1.348, + "step": 7435 + }, + { + "epoch": 0.2662989954697656, + "grad_norm": 1.470304250717163, + "learning_rate": 0.0001721172573257387, + "loss": 1.1025, + "step": 7436 + }, + { + "epoch": 0.26633480759933387, + "grad_norm": 1.2161977291107178, + "learning_rate": 0.0001721092215611578, + "loss": 0.7957, + "step": 7437 + }, + { + "epoch": 0.2663706197289022, + "grad_norm": 1.3188951015472412, + "learning_rate": 0.00017210118482644036, + "loss": 1.1686, + "step": 7438 + }, + { + "epoch": 0.26640643185847046, + "grad_norm": 1.525809645652771, + "learning_rate": 0.00017209314712169445, + "loss": 1.029, + "step": 7439 + }, + { + "epoch": 0.26644224398803873, + "grad_norm": 1.3587522506713867, + "learning_rate": 0.00017208510844702823, + "loss": 1.2284, + "step": 7440 + }, + { + "epoch": 0.26647805611760705, + "grad_norm": 1.3401761054992676, + "learning_rate": 0.00017207706880254987, + "loss": 1.2362, + "step": 7441 + }, + { + "epoch": 0.2665138682471753, + "grad_norm": 1.4956825971603394, + "learning_rate": 0.00017206902818836756, + "loss": 1.0632, + "step": 7442 + }, + { + "epoch": 0.2665496803767436, + "grad_norm": 1.8247087001800537, + "learning_rate": 0.00017206098660458937, + "loss": 1.31, + "step": 7443 + }, + { + "epoch": 0.26658549250631186, + "grad_norm": 1.4943424463272095, + "learning_rate": 0.00017205294405132362, + "loss": 1.0546, + "step": 7444 + }, + { + "epoch": 0.2666213046358802, + "grad_norm": 1.4705015420913696, + "learning_rate": 0.00017204490052867842, + "loss": 1.0055, + "step": 7445 + }, + { + "epoch": 0.26665711676544845, + "grad_norm": 1.2794184684753418, + "learning_rate": 0.00017203685603676202, + "loss": 1.2818, + "step": 7446 + }, + { + "epoch": 0.2666929288950167, + "grad_norm": 2.2079811096191406, + "learning_rate": 0.0001720288105756826, + "loss": 1.3719, + "step": 7447 + }, + { + "epoch": 0.26672874102458505, + "grad_norm": 1.4198933839797974, + "learning_rate": 0.0001720207641455485, + "loss": 1.1806, + "step": 7448 + }, + { + "epoch": 0.2667645531541533, + "grad_norm": 1.348096251487732, + "learning_rate": 0.0001720127167464679, + "loss": 1.1894, + "step": 7449 + }, + { + "epoch": 0.2668003652837216, + "grad_norm": 1.3896377086639404, + "learning_rate": 0.00017200466837854908, + "loss": 1.1656, + "step": 7450 + }, + { + "epoch": 0.26683617741328985, + "grad_norm": 1.7732754945755005, + "learning_rate": 0.00017199661904190037, + "loss": 1.1264, + "step": 7451 + }, + { + "epoch": 0.2668719895428582, + "grad_norm": 1.660509467124939, + "learning_rate": 0.00017198856873662996, + "loss": 1.2013, + "step": 7452 + }, + { + "epoch": 0.26690780167242645, + "grad_norm": 1.3672994375228882, + "learning_rate": 0.00017198051746284624, + "loss": 1.1011, + "step": 7453 + }, + { + "epoch": 0.2669436138019947, + "grad_norm": 1.3835691213607788, + "learning_rate": 0.00017197246522065752, + "loss": 1.1719, + "step": 7454 + }, + { + "epoch": 0.26697942593156304, + "grad_norm": 1.3936142921447754, + "learning_rate": 0.00017196441201017208, + "loss": 1.0068, + "step": 7455 + }, + { + "epoch": 0.2670152380611313, + "grad_norm": 1.6903526782989502, + "learning_rate": 0.00017195635783149834, + "loss": 1.3438, + "step": 7456 + }, + { + "epoch": 0.2670510501906996, + "grad_norm": 2.0800392627716064, + "learning_rate": 0.0001719483026847446, + "loss": 1.3069, + "step": 7457 + }, + { + "epoch": 0.26708686232026785, + "grad_norm": 1.6757912635803223, + "learning_rate": 0.00017194024657001927, + "loss": 1.1067, + "step": 7458 + }, + { + "epoch": 0.2671226744498362, + "grad_norm": 1.5923818349838257, + "learning_rate": 0.0001719321894874307, + "loss": 1.3473, + "step": 7459 + }, + { + "epoch": 0.26715848657940444, + "grad_norm": 1.5198948383331299, + "learning_rate": 0.00017192413143708735, + "loss": 1.2313, + "step": 7460 + }, + { + "epoch": 0.2671942987089727, + "grad_norm": 1.8660460710525513, + "learning_rate": 0.00017191607241909753, + "loss": 1.1778, + "step": 7461 + }, + { + "epoch": 0.26723011083854104, + "grad_norm": 1.42656409740448, + "learning_rate": 0.00017190801243356977, + "loss": 1.1884, + "step": 7462 + }, + { + "epoch": 0.2672659229681093, + "grad_norm": 1.6985740661621094, + "learning_rate": 0.0001718999514806124, + "loss": 1.0674, + "step": 7463 + }, + { + "epoch": 0.2673017350976776, + "grad_norm": 1.8287229537963867, + "learning_rate": 0.000171891889560334, + "loss": 1.1647, + "step": 7464 + }, + { + "epoch": 0.26733754722724584, + "grad_norm": 2.015532970428467, + "learning_rate": 0.0001718838266728429, + "loss": 1.1792, + "step": 7465 + }, + { + "epoch": 0.26737335935681417, + "grad_norm": 1.741951823234558, + "learning_rate": 0.00017187576281824766, + "loss": 1.1453, + "step": 7466 + }, + { + "epoch": 0.26740917148638244, + "grad_norm": 1.5642669200897217, + "learning_rate": 0.00017186769799665673, + "loss": 1.1603, + "step": 7467 + }, + { + "epoch": 0.2674449836159507, + "grad_norm": 1.5438326597213745, + "learning_rate": 0.00017185963220817864, + "loss": 1.1382, + "step": 7468 + }, + { + "epoch": 0.26748079574551903, + "grad_norm": 1.0792758464813232, + "learning_rate": 0.0001718515654529219, + "loss": 1.0574, + "step": 7469 + }, + { + "epoch": 0.2675166078750873, + "grad_norm": 1.6378521919250488, + "learning_rate": 0.000171843497730995, + "loss": 1.2834, + "step": 7470 + }, + { + "epoch": 0.26755242000465557, + "grad_norm": 1.5264146327972412, + "learning_rate": 0.00017183542904250656, + "loss": 1.2594, + "step": 7471 + }, + { + "epoch": 0.26758823213422384, + "grad_norm": 1.7073619365692139, + "learning_rate": 0.00017182735938756506, + "loss": 1.1218, + "step": 7472 + }, + { + "epoch": 0.26762404426379216, + "grad_norm": 1.4878227710723877, + "learning_rate": 0.00017181928876627907, + "loss": 1.1258, + "step": 7473 + }, + { + "epoch": 0.26765985639336043, + "grad_norm": 1.99684476852417, + "learning_rate": 0.0001718112171787572, + "loss": 1.2539, + "step": 7474 + }, + { + "epoch": 0.2676956685229287, + "grad_norm": 1.4789865016937256, + "learning_rate": 0.000171803144625108, + "loss": 1.0881, + "step": 7475 + }, + { + "epoch": 0.267731480652497, + "grad_norm": 1.8643052577972412, + "learning_rate": 0.00017179507110544014, + "loss": 1.4023, + "step": 7476 + }, + { + "epoch": 0.2677672927820653, + "grad_norm": 1.301580786705017, + "learning_rate": 0.0001717869966198622, + "loss": 1.172, + "step": 7477 + }, + { + "epoch": 0.26780310491163356, + "grad_norm": 1.81546950340271, + "learning_rate": 0.00017177892116848284, + "loss": 1.1792, + "step": 7478 + }, + { + "epoch": 0.26783891704120183, + "grad_norm": 1.517357587814331, + "learning_rate": 0.00017177084475141069, + "loss": 1.3029, + "step": 7479 + }, + { + "epoch": 0.26787472917077015, + "grad_norm": 1.579571008682251, + "learning_rate": 0.0001717627673687544, + "loss": 1.2018, + "step": 7480 + }, + { + "epoch": 0.2679105413003384, + "grad_norm": 2.287308692932129, + "learning_rate": 0.0001717546890206226, + "loss": 1.0084, + "step": 7481 + }, + { + "epoch": 0.2679463534299067, + "grad_norm": 1.819448471069336, + "learning_rate": 0.00017174660970712403, + "loss": 1.1305, + "step": 7482 + }, + { + "epoch": 0.267982165559475, + "grad_norm": 1.6665725708007812, + "learning_rate": 0.00017173852942836739, + "loss": 1.2286, + "step": 7483 + }, + { + "epoch": 0.2680179776890433, + "grad_norm": 1.5412025451660156, + "learning_rate": 0.00017173044818446137, + "loss": 1.2361, + "step": 7484 + }, + { + "epoch": 0.26805378981861155, + "grad_norm": 2.5877785682678223, + "learning_rate": 0.00017172236597551467, + "loss": 1.171, + "step": 7485 + }, + { + "epoch": 0.2680896019481798, + "grad_norm": 1.3882758617401123, + "learning_rate": 0.0001717142828016361, + "loss": 1.2554, + "step": 7486 + }, + { + "epoch": 0.26812541407774815, + "grad_norm": 1.8300966024398804, + "learning_rate": 0.00017170619866293434, + "loss": 0.96, + "step": 7487 + }, + { + "epoch": 0.2681612262073164, + "grad_norm": 2.048234701156616, + "learning_rate": 0.00017169811355951815, + "loss": 1.233, + "step": 7488 + }, + { + "epoch": 0.2681970383368847, + "grad_norm": 1.507015585899353, + "learning_rate": 0.0001716900274914963, + "loss": 0.9908, + "step": 7489 + }, + { + "epoch": 0.268232850466453, + "grad_norm": 1.7637373208999634, + "learning_rate": 0.00017168194045897767, + "loss": 1.1017, + "step": 7490 + }, + { + "epoch": 0.2682686625960213, + "grad_norm": 1.5495771169662476, + "learning_rate": 0.000171673852462071, + "loss": 1.0646, + "step": 7491 + }, + { + "epoch": 0.26830447472558955, + "grad_norm": 1.6689916849136353, + "learning_rate": 0.00017166576350088506, + "loss": 1.0205, + "step": 7492 + }, + { + "epoch": 0.2683402868551578, + "grad_norm": 2.0447463989257812, + "learning_rate": 0.0001716576735755287, + "loss": 1.2011, + "step": 7493 + }, + { + "epoch": 0.26837609898472614, + "grad_norm": 1.9152439832687378, + "learning_rate": 0.00017164958268611077, + "loss": 1.0679, + "step": 7494 + }, + { + "epoch": 0.2684119111142944, + "grad_norm": 1.7429478168487549, + "learning_rate": 0.00017164149083274017, + "loss": 1.2569, + "step": 7495 + }, + { + "epoch": 0.2684477232438627, + "grad_norm": 1.4659607410430908, + "learning_rate": 0.0001716333980155257, + "loss": 1.185, + "step": 7496 + }, + { + "epoch": 0.268483535373431, + "grad_norm": 1.6189353466033936, + "learning_rate": 0.00017162530423457626, + "loss": 1.3114, + "step": 7497 + }, + { + "epoch": 0.2685193475029993, + "grad_norm": 1.5922613143920898, + "learning_rate": 0.00017161720949000075, + "loss": 1.1995, + "step": 7498 + }, + { + "epoch": 0.26855515963256754, + "grad_norm": 1.8258767127990723, + "learning_rate": 0.00017160911378190808, + "loss": 1.1191, + "step": 7499 + }, + { + "epoch": 0.2685909717621358, + "grad_norm": 2.1504454612731934, + "learning_rate": 0.00017160101711040713, + "loss": 1.3379, + "step": 7500 + }, + { + "epoch": 0.26862678389170414, + "grad_norm": 1.7490127086639404, + "learning_rate": 0.00017159291947560682, + "loss": 1.1276, + "step": 7501 + }, + { + "epoch": 0.2686625960212724, + "grad_norm": 1.4710766077041626, + "learning_rate": 0.00017158482087761617, + "loss": 1.2509, + "step": 7502 + }, + { + "epoch": 0.2686984081508407, + "grad_norm": 1.538650631904602, + "learning_rate": 0.0001715767213165441, + "loss": 1.2473, + "step": 7503 + }, + { + "epoch": 0.268734220280409, + "grad_norm": 1.432449221611023, + "learning_rate": 0.00017156862079249953, + "loss": 1.3319, + "step": 7504 + }, + { + "epoch": 0.26877003240997727, + "grad_norm": 1.7935694456100464, + "learning_rate": 0.00017156051930559155, + "loss": 1.26, + "step": 7505 + }, + { + "epoch": 0.26880584453954554, + "grad_norm": 1.6402740478515625, + "learning_rate": 0.00017155241685592903, + "loss": 1.3065, + "step": 7506 + }, + { + "epoch": 0.2688416566691138, + "grad_norm": 1.3133989572525024, + "learning_rate": 0.00017154431344362106, + "loss": 1.1516, + "step": 7507 + }, + { + "epoch": 0.26887746879868213, + "grad_norm": 1.8181458711624146, + "learning_rate": 0.00017153620906877666, + "loss": 1.3182, + "step": 7508 + }, + { + "epoch": 0.2689132809282504, + "grad_norm": 1.3803306818008423, + "learning_rate": 0.00017152810373150478, + "loss": 1.2427, + "step": 7509 + }, + { + "epoch": 0.26894909305781867, + "grad_norm": 1.7015846967697144, + "learning_rate": 0.00017151999743191456, + "loss": 1.1645, + "step": 7510 + }, + { + "epoch": 0.268984905187387, + "grad_norm": 2.0035321712493896, + "learning_rate": 0.00017151189017011503, + "loss": 1.4253, + "step": 7511 + }, + { + "epoch": 0.26902071731695526, + "grad_norm": 1.3911627531051636, + "learning_rate": 0.00017150378194621529, + "loss": 1.1873, + "step": 7512 + }, + { + "epoch": 0.26905652944652353, + "grad_norm": 1.8004144430160522, + "learning_rate": 0.0001714956727603244, + "loss": 1.1127, + "step": 7513 + }, + { + "epoch": 0.2690923415760918, + "grad_norm": 1.8168281316757202, + "learning_rate": 0.0001714875626125514, + "loss": 1.1859, + "step": 7514 + }, + { + "epoch": 0.2691281537056601, + "grad_norm": 1.6386977434158325, + "learning_rate": 0.0001714794515030055, + "loss": 1.488, + "step": 7515 + }, + { + "epoch": 0.2691639658352284, + "grad_norm": 1.56617271900177, + "learning_rate": 0.00017147133943179577, + "loss": 1.1715, + "step": 7516 + }, + { + "epoch": 0.26919977796479666, + "grad_norm": 1.4683891534805298, + "learning_rate": 0.00017146322639903137, + "loss": 1.2363, + "step": 7517 + }, + { + "epoch": 0.269235590094365, + "grad_norm": 1.7126514911651611, + "learning_rate": 0.00017145511240482142, + "loss": 1.1551, + "step": 7518 + }, + { + "epoch": 0.26927140222393325, + "grad_norm": 1.7213553190231323, + "learning_rate": 0.00017144699744927507, + "loss": 1.229, + "step": 7519 + }, + { + "epoch": 0.2693072143535015, + "grad_norm": 1.7834662199020386, + "learning_rate": 0.0001714388815325016, + "loss": 1.1746, + "step": 7520 + }, + { + "epoch": 0.2693430264830698, + "grad_norm": 2.3189826011657715, + "learning_rate": 0.0001714307646546101, + "loss": 1.4838, + "step": 7521 + }, + { + "epoch": 0.2693788386126381, + "grad_norm": 1.4145745038986206, + "learning_rate": 0.00017142264681570978, + "loss": 1.2492, + "step": 7522 + }, + { + "epoch": 0.2694146507422064, + "grad_norm": 1.5658732652664185, + "learning_rate": 0.00017141452801590988, + "loss": 0.9978, + "step": 7523 + }, + { + "epoch": 0.26945046287177465, + "grad_norm": 1.471814751625061, + "learning_rate": 0.00017140640825531967, + "loss": 1.206, + "step": 7524 + }, + { + "epoch": 0.269486275001343, + "grad_norm": 1.4741177558898926, + "learning_rate": 0.0001713982875340483, + "loss": 1.2706, + "step": 7525 + }, + { + "epoch": 0.26952208713091125, + "grad_norm": 1.7161659002304077, + "learning_rate": 0.00017139016585220512, + "loss": 1.1238, + "step": 7526 + }, + { + "epoch": 0.2695578992604795, + "grad_norm": 1.2792588472366333, + "learning_rate": 0.0001713820432098993, + "loss": 1.043, + "step": 7527 + }, + { + "epoch": 0.2695937113900478, + "grad_norm": 1.832725167274475, + "learning_rate": 0.00017137391960724013, + "loss": 1.1674, + "step": 7528 + }, + { + "epoch": 0.2696295235196161, + "grad_norm": 1.6892982721328735, + "learning_rate": 0.000171365795044337, + "loss": 1.1571, + "step": 7529 + }, + { + "epoch": 0.2696653356491844, + "grad_norm": 1.4483256340026855, + "learning_rate": 0.00017135766952129913, + "loss": 1.0759, + "step": 7530 + }, + { + "epoch": 0.26970114777875265, + "grad_norm": 1.6056139469146729, + "learning_rate": 0.00017134954303823588, + "loss": 1.1866, + "step": 7531 + }, + { + "epoch": 0.269736959908321, + "grad_norm": 1.3984564542770386, + "learning_rate": 0.00017134141559525654, + "loss": 1.0353, + "step": 7532 + }, + { + "epoch": 0.26977277203788924, + "grad_norm": 1.3953073024749756, + "learning_rate": 0.00017133328719247048, + "loss": 1.0719, + "step": 7533 + }, + { + "epoch": 0.2698085841674575, + "grad_norm": 1.430668592453003, + "learning_rate": 0.00017132515782998704, + "loss": 1.0583, + "step": 7534 + }, + { + "epoch": 0.2698443962970258, + "grad_norm": 1.674222469329834, + "learning_rate": 0.00017131702750791564, + "loss": 1.0329, + "step": 7535 + }, + { + "epoch": 0.2698802084265941, + "grad_norm": 1.526803970336914, + "learning_rate": 0.0001713088962263656, + "loss": 1.2459, + "step": 7536 + }, + { + "epoch": 0.2699160205561624, + "grad_norm": 1.5370043516159058, + "learning_rate": 0.00017130076398544635, + "loss": 1.3153, + "step": 7537 + }, + { + "epoch": 0.26995183268573064, + "grad_norm": 1.6347123384475708, + "learning_rate": 0.0001712926307852673, + "loss": 1.1583, + "step": 7538 + }, + { + "epoch": 0.26998764481529897, + "grad_norm": 1.5582990646362305, + "learning_rate": 0.00017128449662593786, + "loss": 1.1324, + "step": 7539 + }, + { + "epoch": 0.27002345694486724, + "grad_norm": 1.4588099718093872, + "learning_rate": 0.00017127636150756747, + "loss": 1.3487, + "step": 7540 + }, + { + "epoch": 0.2700592690744355, + "grad_norm": 1.413360834121704, + "learning_rate": 0.00017126822543026555, + "loss": 1.2136, + "step": 7541 + }, + { + "epoch": 0.2700950812040038, + "grad_norm": 1.8445534706115723, + "learning_rate": 0.00017126008839414163, + "loss": 1.405, + "step": 7542 + }, + { + "epoch": 0.2701308933335721, + "grad_norm": 1.4772019386291504, + "learning_rate": 0.00017125195039930508, + "loss": 1.0075, + "step": 7543 + }, + { + "epoch": 0.27016670546314037, + "grad_norm": 2.435997247695923, + "learning_rate": 0.0001712438114458655, + "loss": 1.2558, + "step": 7544 + }, + { + "epoch": 0.27020251759270864, + "grad_norm": 1.3885102272033691, + "learning_rate": 0.00017123567153393233, + "loss": 1.0579, + "step": 7545 + }, + { + "epoch": 0.27023832972227696, + "grad_norm": 2.1693453788757324, + "learning_rate": 0.00017122753066361508, + "loss": 1.146, + "step": 7546 + }, + { + "epoch": 0.27027414185184523, + "grad_norm": 1.5302921533584595, + "learning_rate": 0.00017121938883502328, + "loss": 1.2924, + "step": 7547 + }, + { + "epoch": 0.2703099539814135, + "grad_norm": 1.7205816507339478, + "learning_rate": 0.00017121124604826645, + "loss": 1.2728, + "step": 7548 + }, + { + "epoch": 0.27034576611098177, + "grad_norm": 1.7962088584899902, + "learning_rate": 0.00017120310230345418, + "loss": 0.9968, + "step": 7549 + }, + { + "epoch": 0.2703815782405501, + "grad_norm": 1.2816373109817505, + "learning_rate": 0.000171194957600696, + "loss": 1.1626, + "step": 7550 + }, + { + "epoch": 0.27041739037011836, + "grad_norm": 1.4500492811203003, + "learning_rate": 0.00017118681194010153, + "loss": 1.0261, + "step": 7551 + }, + { + "epoch": 0.27045320249968663, + "grad_norm": 1.767217755317688, + "learning_rate": 0.0001711786653217803, + "loss": 1.1042, + "step": 7552 + }, + { + "epoch": 0.27048901462925495, + "grad_norm": 1.6455119848251343, + "learning_rate": 0.00017117051774584194, + "loss": 1.06, + "step": 7553 + }, + { + "epoch": 0.2705248267588232, + "grad_norm": 1.4862453937530518, + "learning_rate": 0.00017116236921239607, + "loss": 1.0244, + "step": 7554 + }, + { + "epoch": 0.2705606388883915, + "grad_norm": 1.7961466312408447, + "learning_rate": 0.00017115421972155234, + "loss": 1.1245, + "step": 7555 + }, + { + "epoch": 0.27059645101795976, + "grad_norm": 1.4363895654678345, + "learning_rate": 0.00017114606927342036, + "loss": 1.1136, + "step": 7556 + }, + { + "epoch": 0.2706322631475281, + "grad_norm": 1.5338973999023438, + "learning_rate": 0.0001711379178681098, + "loss": 0.9987, + "step": 7557 + }, + { + "epoch": 0.27066807527709635, + "grad_norm": 1.8341132402420044, + "learning_rate": 0.00017112976550573026, + "loss": 1.2594, + "step": 7558 + }, + { + "epoch": 0.2707038874066646, + "grad_norm": 1.5914729833602905, + "learning_rate": 0.00017112161218639152, + "loss": 1.1677, + "step": 7559 + }, + { + "epoch": 0.27073969953623295, + "grad_norm": 1.6505793333053589, + "learning_rate": 0.00017111345791020324, + "loss": 1.1025, + "step": 7560 + }, + { + "epoch": 0.2707755116658012, + "grad_norm": 1.7910611629486084, + "learning_rate": 0.0001711053026772751, + "loss": 1.1483, + "step": 7561 + }, + { + "epoch": 0.2708113237953695, + "grad_norm": 1.8301961421966553, + "learning_rate": 0.00017109714648771683, + "loss": 1.0633, + "step": 7562 + }, + { + "epoch": 0.27084713592493775, + "grad_norm": 1.4782778024673462, + "learning_rate": 0.00017108898934163814, + "loss": 1.326, + "step": 7563 + }, + { + "epoch": 0.2708829480545061, + "grad_norm": 1.6443839073181152, + "learning_rate": 0.0001710808312391488, + "loss": 0.9657, + "step": 7564 + }, + { + "epoch": 0.27091876018407435, + "grad_norm": 1.5884402990341187, + "learning_rate": 0.0001710726721803586, + "loss": 1.0412, + "step": 7565 + }, + { + "epoch": 0.2709545723136426, + "grad_norm": 1.586340308189392, + "learning_rate": 0.00017106451216537723, + "loss": 1.1342, + "step": 7566 + }, + { + "epoch": 0.27099038444321094, + "grad_norm": 1.3676133155822754, + "learning_rate": 0.00017105635119431457, + "loss": 1.0765, + "step": 7567 + }, + { + "epoch": 0.2710261965727792, + "grad_norm": 1.7038363218307495, + "learning_rate": 0.0001710481892672803, + "loss": 1.2126, + "step": 7568 + }, + { + "epoch": 0.2710620087023475, + "grad_norm": 1.5685844421386719, + "learning_rate": 0.00017104002638438433, + "loss": 1.2691, + "step": 7569 + }, + { + "epoch": 0.27109782083191575, + "grad_norm": 1.907149076461792, + "learning_rate": 0.00017103186254573642, + "loss": 1.1978, + "step": 7570 + }, + { + "epoch": 0.2711336329614841, + "grad_norm": 2.862241744995117, + "learning_rate": 0.00017102369775144643, + "loss": 1.1809, + "step": 7571 + }, + { + "epoch": 0.27116944509105234, + "grad_norm": 2.0206286907196045, + "learning_rate": 0.0001710155320016242, + "loss": 1.2896, + "step": 7572 + }, + { + "epoch": 0.2712052572206206, + "grad_norm": 1.5547641515731812, + "learning_rate": 0.00017100736529637958, + "loss": 1.2199, + "step": 7573 + }, + { + "epoch": 0.27124106935018893, + "grad_norm": 1.375912070274353, + "learning_rate": 0.0001709991976358225, + "loss": 1.3043, + "step": 7574 + }, + { + "epoch": 0.2712768814797572, + "grad_norm": 2.647552728652954, + "learning_rate": 0.00017099102902006275, + "loss": 1.262, + "step": 7575 + }, + { + "epoch": 0.2713126936093255, + "grad_norm": 1.9751975536346436, + "learning_rate": 0.00017098285944921028, + "loss": 1.471, + "step": 7576 + }, + { + "epoch": 0.27134850573889374, + "grad_norm": 1.5266613960266113, + "learning_rate": 0.00017097468892337503, + "loss": 1.2304, + "step": 7577 + }, + { + "epoch": 0.27138431786846207, + "grad_norm": 1.3912138938903809, + "learning_rate": 0.00017096651744266686, + "loss": 1.3435, + "step": 7578 + }, + { + "epoch": 0.27142012999803034, + "grad_norm": 1.4410455226898193, + "learning_rate": 0.00017095834500719574, + "loss": 1.147, + "step": 7579 + }, + { + "epoch": 0.2714559421275986, + "grad_norm": 2.271430253982544, + "learning_rate": 0.00017095017161707164, + "loss": 1.2823, + "step": 7580 + }, + { + "epoch": 0.27149175425716693, + "grad_norm": 1.7735415697097778, + "learning_rate": 0.00017094199727240447, + "loss": 1.0768, + "step": 7581 + }, + { + "epoch": 0.2715275663867352, + "grad_norm": 1.2244523763656616, + "learning_rate": 0.00017093382197330427, + "loss": 1.089, + "step": 7582 + }, + { + "epoch": 0.27156337851630347, + "grad_norm": 1.2885204553604126, + "learning_rate": 0.00017092564571988096, + "loss": 1.0395, + "step": 7583 + }, + { + "epoch": 0.27159919064587174, + "grad_norm": 2.615150213241577, + "learning_rate": 0.0001709174685122446, + "loss": 1.1094, + "step": 7584 + }, + { + "epoch": 0.27163500277544006, + "grad_norm": 1.447703242301941, + "learning_rate": 0.00017090929035050513, + "loss": 1.2254, + "step": 7585 + }, + { + "epoch": 0.27167081490500833, + "grad_norm": 1.448148250579834, + "learning_rate": 0.00017090111123477266, + "loss": 1.1345, + "step": 7586 + }, + { + "epoch": 0.2717066270345766, + "grad_norm": 1.6399766206741333, + "learning_rate": 0.0001708929311651572, + "loss": 1.2291, + "step": 7587 + }, + { + "epoch": 0.2717424391641449, + "grad_norm": 1.601775050163269, + "learning_rate": 0.0001708847501417688, + "loss": 1.0929, + "step": 7588 + }, + { + "epoch": 0.2717782512937132, + "grad_norm": 1.320278286933899, + "learning_rate": 0.00017087656816471754, + "loss": 0.9371, + "step": 7589 + }, + { + "epoch": 0.27181406342328146, + "grad_norm": 1.8539279699325562, + "learning_rate": 0.00017086838523411343, + "loss": 1.3312, + "step": 7590 + }, + { + "epoch": 0.27184987555284973, + "grad_norm": 1.3440790176391602, + "learning_rate": 0.00017086020135006664, + "loss": 0.9954, + "step": 7591 + }, + { + "epoch": 0.27188568768241805, + "grad_norm": 1.3599324226379395, + "learning_rate": 0.00017085201651268722, + "loss": 1.0861, + "step": 7592 + }, + { + "epoch": 0.2719214998119863, + "grad_norm": 1.8609673976898193, + "learning_rate": 0.00017084383072208534, + "loss": 1.367, + "step": 7593 + }, + { + "epoch": 0.2719573119415546, + "grad_norm": 1.495007872581482, + "learning_rate": 0.00017083564397837108, + "loss": 1.1639, + "step": 7594 + }, + { + "epoch": 0.2719931240711229, + "grad_norm": 1.3250771760940552, + "learning_rate": 0.00017082745628165463, + "loss": 1.2498, + "step": 7595 + }, + { + "epoch": 0.2720289362006912, + "grad_norm": 2.0565202236175537, + "learning_rate": 0.0001708192676320461, + "loss": 1.1131, + "step": 7596 + }, + { + "epoch": 0.27206474833025945, + "grad_norm": 1.5756092071533203, + "learning_rate": 0.00017081107802965564, + "loss": 1.1325, + "step": 7597 + }, + { + "epoch": 0.2721005604598277, + "grad_norm": 1.664131760597229, + "learning_rate": 0.0001708028874745935, + "loss": 1.2888, + "step": 7598 + }, + { + "epoch": 0.27213637258939605, + "grad_norm": 2.0447678565979004, + "learning_rate": 0.0001707946959669698, + "loss": 1.176, + "step": 7599 + }, + { + "epoch": 0.2721721847189643, + "grad_norm": 1.7501680850982666, + "learning_rate": 0.00017078650350689482, + "loss": 1.1515, + "step": 7600 + }, + { + "epoch": 0.2722079968485326, + "grad_norm": 1.270140528678894, + "learning_rate": 0.00017077831009447878, + "loss": 1.029, + "step": 7601 + }, + { + "epoch": 0.2722438089781009, + "grad_norm": 1.4784846305847168, + "learning_rate": 0.00017077011572983183, + "loss": 1.046, + "step": 7602 + }, + { + "epoch": 0.2722796211076692, + "grad_norm": 1.7188282012939453, + "learning_rate": 0.00017076192041306425, + "loss": 1.1747, + "step": 7603 + }, + { + "epoch": 0.27231543323723745, + "grad_norm": 1.6662992238998413, + "learning_rate": 0.00017075372414428633, + "loss": 1.4142, + "step": 7604 + }, + { + "epoch": 0.2723512453668057, + "grad_norm": 1.6177631616592407, + "learning_rate": 0.00017074552692360832, + "loss": 0.9817, + "step": 7605 + }, + { + "epoch": 0.27238705749637404, + "grad_norm": 1.592407464981079, + "learning_rate": 0.00017073732875114045, + "loss": 1.1408, + "step": 7606 + }, + { + "epoch": 0.2724228696259423, + "grad_norm": 1.5143073797225952, + "learning_rate": 0.0001707291296269931, + "loss": 1.1223, + "step": 7607 + }, + { + "epoch": 0.2724586817555106, + "grad_norm": 1.5978717803955078, + "learning_rate": 0.00017072092955127657, + "loss": 1.1529, + "step": 7608 + }, + { + "epoch": 0.2724944938850789, + "grad_norm": 1.3789716958999634, + "learning_rate": 0.00017071272852410113, + "loss": 1.1035, + "step": 7609 + }, + { + "epoch": 0.2725303060146472, + "grad_norm": 1.45754075050354, + "learning_rate": 0.00017070452654557717, + "loss": 1.2222, + "step": 7610 + }, + { + "epoch": 0.27256611814421544, + "grad_norm": 1.3922597169876099, + "learning_rate": 0.00017069632361581497, + "loss": 1.3085, + "step": 7611 + }, + { + "epoch": 0.2726019302737837, + "grad_norm": 1.7085111141204834, + "learning_rate": 0.00017068811973492497, + "loss": 1.1953, + "step": 7612 + }, + { + "epoch": 0.27263774240335203, + "grad_norm": 1.6307653188705444, + "learning_rate": 0.00017067991490301744, + "loss": 1.2384, + "step": 7613 + }, + { + "epoch": 0.2726735545329203, + "grad_norm": 2.0947062969207764, + "learning_rate": 0.00017067170912020286, + "loss": 1.3155, + "step": 7614 + }, + { + "epoch": 0.2727093666624886, + "grad_norm": 1.5844002962112427, + "learning_rate": 0.0001706635023865916, + "loss": 1.1999, + "step": 7615 + }, + { + "epoch": 0.2727451787920569, + "grad_norm": 1.5589311122894287, + "learning_rate": 0.00017065529470229403, + "loss": 1.0942, + "step": 7616 + }, + { + "epoch": 0.27278099092162517, + "grad_norm": 1.437756896018982, + "learning_rate": 0.00017064708606742067, + "loss": 1.4296, + "step": 7617 + }, + { + "epoch": 0.27281680305119343, + "grad_norm": 1.5128093957901, + "learning_rate": 0.00017063887648208185, + "loss": 1.3931, + "step": 7618 + }, + { + "epoch": 0.2728526151807617, + "grad_norm": 2.086167573928833, + "learning_rate": 0.00017063066594638805, + "loss": 1.1979, + "step": 7619 + }, + { + "epoch": 0.27288842731033003, + "grad_norm": 1.4915664196014404, + "learning_rate": 0.0001706224544604498, + "loss": 1.2286, + "step": 7620 + }, + { + "epoch": 0.2729242394398983, + "grad_norm": 2.015349864959717, + "learning_rate": 0.00017061424202437748, + "loss": 1.3149, + "step": 7621 + }, + { + "epoch": 0.27296005156946657, + "grad_norm": 1.2442028522491455, + "learning_rate": 0.00017060602863828165, + "loss": 1.0815, + "step": 7622 + }, + { + "epoch": 0.2729958636990349, + "grad_norm": 1.7010383605957031, + "learning_rate": 0.00017059781430227275, + "loss": 1.4307, + "step": 7623 + }, + { + "epoch": 0.27303167582860316, + "grad_norm": 1.5405778884887695, + "learning_rate": 0.00017058959901646134, + "loss": 1.1946, + "step": 7624 + }, + { + "epoch": 0.27306748795817143, + "grad_norm": 2.4392552375793457, + "learning_rate": 0.00017058138278095792, + "loss": 1.1954, + "step": 7625 + }, + { + "epoch": 0.2731033000877397, + "grad_norm": 2.1707165241241455, + "learning_rate": 0.00017057316559587307, + "loss": 1.2649, + "step": 7626 + }, + { + "epoch": 0.273139112217308, + "grad_norm": 1.4758304357528687, + "learning_rate": 0.00017056494746131725, + "loss": 1.165, + "step": 7627 + }, + { + "epoch": 0.2731749243468763, + "grad_norm": 1.6074739694595337, + "learning_rate": 0.00017055672837740113, + "loss": 1.0666, + "step": 7628 + }, + { + "epoch": 0.27321073647644456, + "grad_norm": 1.3912423849105835, + "learning_rate": 0.00017054850834423522, + "loss": 1.2036, + "step": 7629 + }, + { + "epoch": 0.27324654860601283, + "grad_norm": 1.5518784523010254, + "learning_rate": 0.00017054028736193013, + "loss": 0.977, + "step": 7630 + }, + { + "epoch": 0.27328236073558115, + "grad_norm": 1.5494199991226196, + "learning_rate": 0.00017053206543059647, + "loss": 1.3114, + "step": 7631 + }, + { + "epoch": 0.2733181728651494, + "grad_norm": 1.3944522142410278, + "learning_rate": 0.00017052384255034485, + "loss": 1.1754, + "step": 7632 + }, + { + "epoch": 0.2733539849947177, + "grad_norm": 2.0536067485809326, + "learning_rate": 0.00017051561872128592, + "loss": 1.1928, + "step": 7633 + }, + { + "epoch": 0.273389797124286, + "grad_norm": 1.52769136428833, + "learning_rate": 0.00017050739394353028, + "loss": 1.1241, + "step": 7634 + }, + { + "epoch": 0.2734256092538543, + "grad_norm": 1.9389945268630981, + "learning_rate": 0.00017049916821718861, + "loss": 1.2469, + "step": 7635 + }, + { + "epoch": 0.27346142138342255, + "grad_norm": 1.311187505722046, + "learning_rate": 0.00017049094154237155, + "loss": 1.1467, + "step": 7636 + }, + { + "epoch": 0.2734972335129908, + "grad_norm": 1.5005223751068115, + "learning_rate": 0.0001704827139191898, + "loss": 1.2545, + "step": 7637 + }, + { + "epoch": 0.27353304564255915, + "grad_norm": 1.4588043689727783, + "learning_rate": 0.00017047448534775406, + "loss": 1.2343, + "step": 7638 + }, + { + "epoch": 0.2735688577721274, + "grad_norm": 1.5121698379516602, + "learning_rate": 0.00017046625582817503, + "loss": 1.0187, + "step": 7639 + }, + { + "epoch": 0.2736046699016957, + "grad_norm": 1.9783787727355957, + "learning_rate": 0.00017045802536056344, + "loss": 1.4388, + "step": 7640 + }, + { + "epoch": 0.273640482031264, + "grad_norm": 1.4372299909591675, + "learning_rate": 0.00017044979394502995, + "loss": 1.2348, + "step": 7641 + }, + { + "epoch": 0.2736762941608323, + "grad_norm": 1.9864096641540527, + "learning_rate": 0.0001704415615816854, + "loss": 1.2409, + "step": 7642 + }, + { + "epoch": 0.27371210629040055, + "grad_norm": 1.4610484838485718, + "learning_rate": 0.0001704333282706405, + "loss": 1.175, + "step": 7643 + }, + { + "epoch": 0.2737479184199688, + "grad_norm": 1.6219489574432373, + "learning_rate": 0.00017042509401200598, + "loss": 1.245, + "step": 7644 + }, + { + "epoch": 0.27378373054953714, + "grad_norm": 1.8600332736968994, + "learning_rate": 0.00017041685880589272, + "loss": 1.2935, + "step": 7645 + }, + { + "epoch": 0.2738195426791054, + "grad_norm": 1.1831316947937012, + "learning_rate": 0.0001704086226524114, + "loss": 1.0476, + "step": 7646 + }, + { + "epoch": 0.2738553548086737, + "grad_norm": 1.6636264324188232, + "learning_rate": 0.0001704003855516729, + "loss": 1.2829, + "step": 7647 + }, + { + "epoch": 0.273891166938242, + "grad_norm": 1.9048793315887451, + "learning_rate": 0.00017039214750378805, + "loss": 1.2758, + "step": 7648 + }, + { + "epoch": 0.2739269790678103, + "grad_norm": 1.6954694986343384, + "learning_rate": 0.00017038390850886766, + "loss": 1.1294, + "step": 7649 + }, + { + "epoch": 0.27396279119737854, + "grad_norm": 1.4319027662277222, + "learning_rate": 0.00017037566856702255, + "loss": 1.0186, + "step": 7650 + }, + { + "epoch": 0.2739986033269468, + "grad_norm": 1.4484748840332031, + "learning_rate": 0.00017036742767836355, + "loss": 1.236, + "step": 7651 + }, + { + "epoch": 0.27403441545651513, + "grad_norm": 1.6730636358261108, + "learning_rate": 0.00017035918584300163, + "loss": 1.1421, + "step": 7652 + }, + { + "epoch": 0.2740702275860834, + "grad_norm": 1.5141139030456543, + "learning_rate": 0.00017035094306104762, + "loss": 1.3112, + "step": 7653 + }, + { + "epoch": 0.2741060397156517, + "grad_norm": 1.356305480003357, + "learning_rate": 0.0001703426993326124, + "loss": 1.1716, + "step": 7654 + }, + { + "epoch": 0.27414185184522, + "grad_norm": 1.651108980178833, + "learning_rate": 0.0001703344546578069, + "loss": 1.3206, + "step": 7655 + }, + { + "epoch": 0.27417766397478827, + "grad_norm": 1.6114798784255981, + "learning_rate": 0.00017032620903674207, + "loss": 1.1814, + "step": 7656 + }, + { + "epoch": 0.27421347610435653, + "grad_norm": 2.2223012447357178, + "learning_rate": 0.0001703179624695288, + "loss": 1.1652, + "step": 7657 + }, + { + "epoch": 0.2742492882339248, + "grad_norm": 2.128021478652954, + "learning_rate": 0.00017030971495627802, + "loss": 1.2204, + "step": 7658 + }, + { + "epoch": 0.27428510036349313, + "grad_norm": 1.4637842178344727, + "learning_rate": 0.00017030146649710072, + "loss": 1.1792, + "step": 7659 + }, + { + "epoch": 0.2743209124930614, + "grad_norm": 1.7602823972702026, + "learning_rate": 0.00017029321709210787, + "loss": 1.3126, + "step": 7660 + }, + { + "epoch": 0.27435672462262967, + "grad_norm": 3.1603198051452637, + "learning_rate": 0.00017028496674141051, + "loss": 0.9683, + "step": 7661 + }, + { + "epoch": 0.274392536752198, + "grad_norm": 1.9578624963760376, + "learning_rate": 0.0001702767154451195, + "loss": 1.2157, + "step": 7662 + }, + { + "epoch": 0.27442834888176626, + "grad_norm": 1.4205299615859985, + "learning_rate": 0.000170268463203346, + "loss": 1.1686, + "step": 7663 + }, + { + "epoch": 0.27446416101133453, + "grad_norm": 1.3895554542541504, + "learning_rate": 0.00017026021001620095, + "loss": 1.2735, + "step": 7664 + }, + { + "epoch": 0.2744999731409028, + "grad_norm": 1.8671528100967407, + "learning_rate": 0.00017025195588379538, + "loss": 1.0969, + "step": 7665 + }, + { + "epoch": 0.2745357852704711, + "grad_norm": 1.5356212854385376, + "learning_rate": 0.0001702437008062404, + "loss": 0.9886, + "step": 7666 + }, + { + "epoch": 0.2745715974000394, + "grad_norm": 1.5258326530456543, + "learning_rate": 0.00017023544478364698, + "loss": 1.0558, + "step": 7667 + }, + { + "epoch": 0.27460740952960766, + "grad_norm": 1.495056390762329, + "learning_rate": 0.0001702271878161263, + "loss": 1.0005, + "step": 7668 + }, + { + "epoch": 0.274643221659176, + "grad_norm": 1.4252089262008667, + "learning_rate": 0.0001702189299037894, + "loss": 1.192, + "step": 7669 + }, + { + "epoch": 0.27467903378874425, + "grad_norm": 1.5654501914978027, + "learning_rate": 0.00017021067104674734, + "loss": 1.4199, + "step": 7670 + }, + { + "epoch": 0.2747148459183125, + "grad_norm": 2.279989719390869, + "learning_rate": 0.00017020241124511128, + "loss": 1.2345, + "step": 7671 + }, + { + "epoch": 0.2747506580478808, + "grad_norm": 1.3275092840194702, + "learning_rate": 0.0001701941504989923, + "loss": 1.0112, + "step": 7672 + }, + { + "epoch": 0.2747864701774491, + "grad_norm": 1.5335594415664673, + "learning_rate": 0.00017018588880850162, + "loss": 1.1943, + "step": 7673 + }, + { + "epoch": 0.2748222823070174, + "grad_norm": 1.2425601482391357, + "learning_rate": 0.0001701776261737503, + "loss": 1.2329, + "step": 7674 + }, + { + "epoch": 0.27485809443658565, + "grad_norm": 1.6345511674880981, + "learning_rate": 0.00017016936259484953, + "loss": 1.2554, + "step": 7675 + }, + { + "epoch": 0.274893906566154, + "grad_norm": 1.4177786111831665, + "learning_rate": 0.00017016109807191056, + "loss": 1.0907, + "step": 7676 + }, + { + "epoch": 0.27492971869572225, + "grad_norm": 1.2455544471740723, + "learning_rate": 0.00017015283260504447, + "loss": 1.137, + "step": 7677 + }, + { + "epoch": 0.2749655308252905, + "grad_norm": 2.0547983646392822, + "learning_rate": 0.00017014456619436253, + "loss": 1.2042, + "step": 7678 + }, + { + "epoch": 0.2750013429548588, + "grad_norm": 1.5150011777877808, + "learning_rate": 0.00017013629883997594, + "loss": 1.2034, + "step": 7679 + }, + { + "epoch": 0.2750371550844271, + "grad_norm": 1.420972466468811, + "learning_rate": 0.00017012803054199587, + "loss": 1.347, + "step": 7680 + }, + { + "epoch": 0.2750729672139954, + "grad_norm": 1.9368606805801392, + "learning_rate": 0.00017011976130053367, + "loss": 1.1486, + "step": 7681 + }, + { + "epoch": 0.27510877934356365, + "grad_norm": 1.4412055015563965, + "learning_rate": 0.00017011149111570051, + "loss": 1.1582, + "step": 7682 + }, + { + "epoch": 0.27514459147313197, + "grad_norm": 1.9715265035629272, + "learning_rate": 0.00017010321998760762, + "loss": 1.2514, + "step": 7683 + }, + { + "epoch": 0.27518040360270024, + "grad_norm": 1.4985159635543823, + "learning_rate": 0.0001700949479163664, + "loss": 1.093, + "step": 7684 + }, + { + "epoch": 0.2752162157322685, + "grad_norm": 1.7432842254638672, + "learning_rate": 0.00017008667490208803, + "loss": 1.0688, + "step": 7685 + }, + { + "epoch": 0.2752520278618368, + "grad_norm": 1.4697972536087036, + "learning_rate": 0.00017007840094488387, + "loss": 1.2895, + "step": 7686 + }, + { + "epoch": 0.2752878399914051, + "grad_norm": 1.4939841032028198, + "learning_rate": 0.00017007012604486525, + "loss": 1.2408, + "step": 7687 + }, + { + "epoch": 0.2753236521209734, + "grad_norm": 1.3698832988739014, + "learning_rate": 0.0001700618502021434, + "loss": 1.1646, + "step": 7688 + }, + { + "epoch": 0.27535946425054164, + "grad_norm": 1.2604336738586426, + "learning_rate": 0.00017005357341682979, + "loss": 1.1815, + "step": 7689 + }, + { + "epoch": 0.27539527638010997, + "grad_norm": 1.8805720806121826, + "learning_rate": 0.0001700452956890357, + "loss": 1.345, + "step": 7690 + }, + { + "epoch": 0.27543108850967823, + "grad_norm": 1.3913273811340332, + "learning_rate": 0.0001700370170188725, + "loss": 1.1982, + "step": 7691 + }, + { + "epoch": 0.2754669006392465, + "grad_norm": 1.6161259412765503, + "learning_rate": 0.00017002873740645157, + "loss": 1.3707, + "step": 7692 + }, + { + "epoch": 0.2755027127688148, + "grad_norm": 1.4571408033370972, + "learning_rate": 0.00017002045685188431, + "loss": 1.0355, + "step": 7693 + }, + { + "epoch": 0.2755385248983831, + "grad_norm": 1.534745693206787, + "learning_rate": 0.00017001217535528215, + "loss": 1.0932, + "step": 7694 + }, + { + "epoch": 0.27557433702795137, + "grad_norm": 1.468727469444275, + "learning_rate": 0.00017000389291675644, + "loss": 1.2764, + "step": 7695 + }, + { + "epoch": 0.27561014915751963, + "grad_norm": 1.5528837442398071, + "learning_rate": 0.00016999560953641867, + "loss": 1.3073, + "step": 7696 + }, + { + "epoch": 0.27564596128708796, + "grad_norm": 1.2361522912979126, + "learning_rate": 0.00016998732521438024, + "loss": 0.8982, + "step": 7697 + }, + { + "epoch": 0.27568177341665623, + "grad_norm": 1.8044737577438354, + "learning_rate": 0.00016997903995075265, + "loss": 1.0177, + "step": 7698 + }, + { + "epoch": 0.2757175855462245, + "grad_norm": 1.6667808294296265, + "learning_rate": 0.00016997075374564733, + "loss": 1.0818, + "step": 7699 + }, + { + "epoch": 0.27575339767579277, + "grad_norm": 1.2096736431121826, + "learning_rate": 0.00016996246659917578, + "loss": 0.9956, + "step": 7700 + }, + { + "epoch": 0.2757892098053611, + "grad_norm": 1.5478484630584717, + "learning_rate": 0.0001699541785114495, + "loss": 1.0905, + "step": 7701 + }, + { + "epoch": 0.27582502193492936, + "grad_norm": 1.5650089979171753, + "learning_rate": 0.00016994588948257997, + "loss": 0.9671, + "step": 7702 + }, + { + "epoch": 0.27586083406449763, + "grad_norm": 1.6939797401428223, + "learning_rate": 0.0001699375995126787, + "loss": 1.2303, + "step": 7703 + }, + { + "epoch": 0.27589664619406595, + "grad_norm": 1.3396904468536377, + "learning_rate": 0.00016992930860185726, + "loss": 1.082, + "step": 7704 + }, + { + "epoch": 0.2759324583236342, + "grad_norm": 1.5023809671401978, + "learning_rate": 0.0001699210167502272, + "loss": 1.0364, + "step": 7705 + }, + { + "epoch": 0.2759682704532025, + "grad_norm": 1.327124834060669, + "learning_rate": 0.00016991272395790007, + "loss": 1.0712, + "step": 7706 + }, + { + "epoch": 0.27600408258277076, + "grad_norm": 1.9838831424713135, + "learning_rate": 0.00016990443022498735, + "loss": 1.1738, + "step": 7707 + }, + { + "epoch": 0.2760398947123391, + "grad_norm": 1.28135347366333, + "learning_rate": 0.0001698961355516007, + "loss": 1.1677, + "step": 7708 + }, + { + "epoch": 0.27607570684190735, + "grad_norm": 2.3330318927764893, + "learning_rate": 0.00016988783993785177, + "loss": 1.2011, + "step": 7709 + }, + { + "epoch": 0.2761115189714756, + "grad_norm": 1.6458501815795898, + "learning_rate": 0.00016987954338385202, + "loss": 1.0749, + "step": 7710 + }, + { + "epoch": 0.27614733110104395, + "grad_norm": 1.5465775728225708, + "learning_rate": 0.0001698712458897132, + "loss": 1.1556, + "step": 7711 + }, + { + "epoch": 0.2761831432306122, + "grad_norm": 1.7253347635269165, + "learning_rate": 0.0001698629474555469, + "loss": 1.1435, + "step": 7712 + }, + { + "epoch": 0.2762189553601805, + "grad_norm": 1.8876067399978638, + "learning_rate": 0.00016985464808146473, + "loss": 1.3263, + "step": 7713 + }, + { + "epoch": 0.27625476748974875, + "grad_norm": 1.5002371072769165, + "learning_rate": 0.0001698463477675784, + "loss": 1.0788, + "step": 7714 + }, + { + "epoch": 0.2762905796193171, + "grad_norm": 1.6441421508789062, + "learning_rate": 0.00016983804651399956, + "loss": 1.1886, + "step": 7715 + }, + { + "epoch": 0.27632639174888535, + "grad_norm": 1.9360268115997314, + "learning_rate": 0.00016982974432083986, + "loss": 1.2738, + "step": 7716 + }, + { + "epoch": 0.2763622038784536, + "grad_norm": 1.48494291305542, + "learning_rate": 0.00016982144118821103, + "loss": 1.2326, + "step": 7717 + }, + { + "epoch": 0.27639801600802194, + "grad_norm": 1.7441169023513794, + "learning_rate": 0.0001698131371162248, + "loss": 1.3373, + "step": 7718 + }, + { + "epoch": 0.2764338281375902, + "grad_norm": 1.4463623762130737, + "learning_rate": 0.00016980483210499286, + "loss": 1.1257, + "step": 7719 + }, + { + "epoch": 0.2764696402671585, + "grad_norm": 1.7369368076324463, + "learning_rate": 0.00016979652615462692, + "loss": 1.2914, + "step": 7720 + }, + { + "epoch": 0.27650545239672675, + "grad_norm": 1.504203200340271, + "learning_rate": 0.00016978821926523873, + "loss": 1.1273, + "step": 7721 + }, + { + "epoch": 0.27654126452629507, + "grad_norm": 1.6385915279388428, + "learning_rate": 0.00016977991143694014, + "loss": 1.2466, + "step": 7722 + }, + { + "epoch": 0.27657707665586334, + "grad_norm": 1.3168153762817383, + "learning_rate": 0.00016977160266984283, + "loss": 1.0865, + "step": 7723 + }, + { + "epoch": 0.2766128887854316, + "grad_norm": 1.7673097848892212, + "learning_rate": 0.00016976329296405855, + "loss": 1.0654, + "step": 7724 + }, + { + "epoch": 0.27664870091499993, + "grad_norm": 1.4311822652816772, + "learning_rate": 0.0001697549823196992, + "loss": 1.3345, + "step": 7725 + }, + { + "epoch": 0.2766845130445682, + "grad_norm": 1.7974562644958496, + "learning_rate": 0.00016974667073687655, + "loss": 1.165, + "step": 7726 + }, + { + "epoch": 0.27672032517413647, + "grad_norm": 1.5604274272918701, + "learning_rate": 0.00016973835821570236, + "loss": 1.2174, + "step": 7727 + }, + { + "epoch": 0.27675613730370474, + "grad_norm": 1.7345993518829346, + "learning_rate": 0.00016973004475628856, + "loss": 1.0565, + "step": 7728 + }, + { + "epoch": 0.27679194943327307, + "grad_norm": 1.652790904045105, + "learning_rate": 0.00016972173035874693, + "loss": 1.3173, + "step": 7729 + }, + { + "epoch": 0.27682776156284133, + "grad_norm": 1.533132791519165, + "learning_rate": 0.00016971341502318936, + "loss": 1.1111, + "step": 7730 + }, + { + "epoch": 0.2768635736924096, + "grad_norm": 1.9227255582809448, + "learning_rate": 0.00016970509874972774, + "loss": 1.3948, + "step": 7731 + }, + { + "epoch": 0.27689938582197793, + "grad_norm": 1.6927275657653809, + "learning_rate": 0.0001696967815384739, + "loss": 1.2439, + "step": 7732 + }, + { + "epoch": 0.2769351979515462, + "grad_norm": 1.4723879098892212, + "learning_rate": 0.0001696884633895398, + "loss": 1.2089, + "step": 7733 + }, + { + "epoch": 0.27697101008111447, + "grad_norm": 1.4699729681015015, + "learning_rate": 0.00016968014430303728, + "loss": 1.1597, + "step": 7734 + }, + { + "epoch": 0.27700682221068273, + "grad_norm": 1.3718491792678833, + "learning_rate": 0.0001696718242790783, + "loss": 1.1253, + "step": 7735 + }, + { + "epoch": 0.27704263434025106, + "grad_norm": 1.593003749847412, + "learning_rate": 0.0001696635033177748, + "loss": 1.1001, + "step": 7736 + }, + { + "epoch": 0.27707844646981933, + "grad_norm": 1.4522851705551147, + "learning_rate": 0.00016965518141923874, + "loss": 1.2081, + "step": 7737 + }, + { + "epoch": 0.2771142585993876, + "grad_norm": 1.6111152172088623, + "learning_rate": 0.00016964685858358202, + "loss": 1.3245, + "step": 7738 + }, + { + "epoch": 0.2771500707289559, + "grad_norm": 2.095527172088623, + "learning_rate": 0.0001696385348109167, + "loss": 1.2472, + "step": 7739 + }, + { + "epoch": 0.2771858828585242, + "grad_norm": 1.5707728862762451, + "learning_rate": 0.0001696302101013547, + "loss": 1.2542, + "step": 7740 + }, + { + "epoch": 0.27722169498809246, + "grad_norm": 1.3443212509155273, + "learning_rate": 0.00016962188445500807, + "loss": 1.1921, + "step": 7741 + }, + { + "epoch": 0.27725750711766073, + "grad_norm": 1.6612989902496338, + "learning_rate": 0.00016961355787198875, + "loss": 1.2872, + "step": 7742 + }, + { + "epoch": 0.27729331924722905, + "grad_norm": 1.3904482126235962, + "learning_rate": 0.00016960523035240883, + "loss": 1.15, + "step": 7743 + }, + { + "epoch": 0.2773291313767973, + "grad_norm": 1.2011346817016602, + "learning_rate": 0.0001695969018963803, + "loss": 1.1735, + "step": 7744 + }, + { + "epoch": 0.2773649435063656, + "grad_norm": 1.5788087844848633, + "learning_rate": 0.00016958857250401525, + "loss": 1.3203, + "step": 7745 + }, + { + "epoch": 0.2774007556359339, + "grad_norm": 1.4079513549804688, + "learning_rate": 0.0001695802421754257, + "loss": 1.3048, + "step": 7746 + }, + { + "epoch": 0.2774365677655022, + "grad_norm": 1.414125680923462, + "learning_rate": 0.00016957191091072376, + "loss": 1.0489, + "step": 7747 + }, + { + "epoch": 0.27747237989507045, + "grad_norm": 1.5383268594741821, + "learning_rate": 0.0001695635787100215, + "loss": 1.0442, + "step": 7748 + }, + { + "epoch": 0.2775081920246387, + "grad_norm": 1.5336531400680542, + "learning_rate": 0.000169555245573431, + "loss": 1.0502, + "step": 7749 + }, + { + "epoch": 0.27754400415420705, + "grad_norm": 1.635715365409851, + "learning_rate": 0.0001695469115010644, + "loss": 1.0245, + "step": 7750 + }, + { + "epoch": 0.2775798162837753, + "grad_norm": 1.3087096214294434, + "learning_rate": 0.00016953857649303381, + "loss": 1.244, + "step": 7751 + }, + { + "epoch": 0.2776156284133436, + "grad_norm": 1.4762247800827026, + "learning_rate": 0.00016953024054945138, + "loss": 1.0745, + "step": 7752 + }, + { + "epoch": 0.2776514405429119, + "grad_norm": 2.644057512283325, + "learning_rate": 0.00016952190367042926, + "loss": 1.2236, + "step": 7753 + }, + { + "epoch": 0.2776872526724802, + "grad_norm": 2.3128418922424316, + "learning_rate": 0.0001695135658560796, + "loss": 1.4195, + "step": 7754 + }, + { + "epoch": 0.27772306480204845, + "grad_norm": 1.4004504680633545, + "learning_rate": 0.00016950522710651455, + "loss": 1.0558, + "step": 7755 + }, + { + "epoch": 0.2777588769316167, + "grad_norm": 1.604570984840393, + "learning_rate": 0.00016949688742184637, + "loss": 1.1668, + "step": 7756 + }, + { + "epoch": 0.27779468906118504, + "grad_norm": 1.5237549543380737, + "learning_rate": 0.0001694885468021872, + "loss": 1.1655, + "step": 7757 + }, + { + "epoch": 0.2778305011907533, + "grad_norm": 1.8344531059265137, + "learning_rate": 0.00016948020524764924, + "loss": 1.0875, + "step": 7758 + }, + { + "epoch": 0.2778663133203216, + "grad_norm": 1.6127196550369263, + "learning_rate": 0.00016947186275834475, + "loss": 1.2533, + "step": 7759 + }, + { + "epoch": 0.2779021254498899, + "grad_norm": 1.901363730430603, + "learning_rate": 0.00016946351933438595, + "loss": 1.0955, + "step": 7760 + }, + { + "epoch": 0.27793793757945817, + "grad_norm": 1.4294970035552979, + "learning_rate": 0.00016945517497588512, + "loss": 1.1703, + "step": 7761 + }, + { + "epoch": 0.27797374970902644, + "grad_norm": 2.002462148666382, + "learning_rate": 0.00016944682968295452, + "loss": 1.1136, + "step": 7762 + }, + { + "epoch": 0.2780095618385947, + "grad_norm": 1.9101659059524536, + "learning_rate": 0.00016943848345570638, + "loss": 1.2769, + "step": 7763 + }, + { + "epoch": 0.27804537396816303, + "grad_norm": 1.5722832679748535, + "learning_rate": 0.00016943013629425302, + "loss": 1.1263, + "step": 7764 + }, + { + "epoch": 0.2780811860977313, + "grad_norm": 1.3865430355072021, + "learning_rate": 0.00016942178819870672, + "loss": 1.0083, + "step": 7765 + }, + { + "epoch": 0.27811699822729957, + "grad_norm": 1.3278512954711914, + "learning_rate": 0.00016941343916917982, + "loss": 1.1819, + "step": 7766 + }, + { + "epoch": 0.2781528103568679, + "grad_norm": 1.5365707874298096, + "learning_rate": 0.00016940508920578463, + "loss": 1.3782, + "step": 7767 + }, + { + "epoch": 0.27818862248643617, + "grad_norm": 1.8087279796600342, + "learning_rate": 0.00016939673830863348, + "loss": 1.1388, + "step": 7768 + }, + { + "epoch": 0.27822443461600443, + "grad_norm": 1.8104170560836792, + "learning_rate": 0.00016938838647783877, + "loss": 1.1561, + "step": 7769 + }, + { + "epoch": 0.2782602467455727, + "grad_norm": 1.9375722408294678, + "learning_rate": 0.00016938003371351278, + "loss": 1.3681, + "step": 7770 + }, + { + "epoch": 0.27829605887514103, + "grad_norm": 1.9626212120056152, + "learning_rate": 0.00016937168001576795, + "loss": 1.0588, + "step": 7771 + }, + { + "epoch": 0.2783318710047093, + "grad_norm": 1.4951245784759521, + "learning_rate": 0.00016936332538471666, + "loss": 1.0706, + "step": 7772 + }, + { + "epoch": 0.27836768313427757, + "grad_norm": 2.3644793033599854, + "learning_rate": 0.00016935496982047128, + "loss": 1.4721, + "step": 7773 + }, + { + "epoch": 0.2784034952638459, + "grad_norm": 1.3738638162612915, + "learning_rate": 0.00016934661332314424, + "loss": 1.3051, + "step": 7774 + }, + { + "epoch": 0.27843930739341416, + "grad_norm": 1.5028232336044312, + "learning_rate": 0.000169338255892848, + "loss": 1.1478, + "step": 7775 + }, + { + "epoch": 0.27847511952298243, + "grad_norm": 1.6852961778640747, + "learning_rate": 0.00016932989752969495, + "loss": 1.1909, + "step": 7776 + }, + { + "epoch": 0.2785109316525507, + "grad_norm": 1.815198540687561, + "learning_rate": 0.00016932153823379754, + "loss": 1.2829, + "step": 7777 + }, + { + "epoch": 0.278546743782119, + "grad_norm": 1.8809387683868408, + "learning_rate": 0.00016931317800526828, + "loss": 1.4346, + "step": 7778 + }, + { + "epoch": 0.2785825559116873, + "grad_norm": 1.705649733543396, + "learning_rate": 0.0001693048168442196, + "loss": 1.1822, + "step": 7779 + }, + { + "epoch": 0.27861836804125556, + "grad_norm": 1.4594931602478027, + "learning_rate": 0.000169296454750764, + "loss": 1.1507, + "step": 7780 + }, + { + "epoch": 0.2786541801708239, + "grad_norm": 1.6385860443115234, + "learning_rate": 0.00016928809172501397, + "loss": 1.3643, + "step": 7781 + }, + { + "epoch": 0.27868999230039215, + "grad_norm": 1.8448505401611328, + "learning_rate": 0.00016927972776708208, + "loss": 1.1206, + "step": 7782 + }, + { + "epoch": 0.2787258044299604, + "grad_norm": 1.62285578250885, + "learning_rate": 0.0001692713628770808, + "loss": 0.9006, + "step": 7783 + }, + { + "epoch": 0.2787616165595287, + "grad_norm": 2.2985994815826416, + "learning_rate": 0.00016926299705512273, + "loss": 1.0344, + "step": 7784 + }, + { + "epoch": 0.278797428689097, + "grad_norm": 1.529619574546814, + "learning_rate": 0.0001692546303013203, + "loss": 1.2977, + "step": 7785 + }, + { + "epoch": 0.2788332408186653, + "grad_norm": 2.014907121658325, + "learning_rate": 0.0001692462626157862, + "loss": 1.1646, + "step": 7786 + }, + { + "epoch": 0.27886905294823355, + "grad_norm": 1.822267770767212, + "learning_rate": 0.00016923789399863294, + "loss": 1.1927, + "step": 7787 + }, + { + "epoch": 0.2789048650778019, + "grad_norm": 1.6903704404830933, + "learning_rate": 0.00016922952444997313, + "loss": 1.1878, + "step": 7788 + }, + { + "epoch": 0.27894067720737015, + "grad_norm": 1.5856666564941406, + "learning_rate": 0.00016922115396991939, + "loss": 1.085, + "step": 7789 + }, + { + "epoch": 0.2789764893369384, + "grad_norm": 1.473075032234192, + "learning_rate": 0.00016921278255858425, + "loss": 1.2013, + "step": 7790 + }, + { + "epoch": 0.2790123014665067, + "grad_norm": 1.5139038562774658, + "learning_rate": 0.00016920441021608048, + "loss": 1.4089, + "step": 7791 + }, + { + "epoch": 0.279048113596075, + "grad_norm": 1.5954444408416748, + "learning_rate": 0.0001691960369425206, + "loss": 1.1941, + "step": 7792 + }, + { + "epoch": 0.2790839257256433, + "grad_norm": 1.6590332984924316, + "learning_rate": 0.0001691876627380173, + "loss": 1.1355, + "step": 7793 + }, + { + "epoch": 0.27911973785521155, + "grad_norm": 1.6005423069000244, + "learning_rate": 0.00016917928760268325, + "loss": 1.2508, + "step": 7794 + }, + { + "epoch": 0.27915554998477987, + "grad_norm": 1.2277590036392212, + "learning_rate": 0.0001691709115366311, + "loss": 1.1517, + "step": 7795 + }, + { + "epoch": 0.27919136211434814, + "grad_norm": 1.445702314376831, + "learning_rate": 0.00016916253453997358, + "loss": 1.214, + "step": 7796 + }, + { + "epoch": 0.2792271742439164, + "grad_norm": 1.3356939554214478, + "learning_rate": 0.00016915415661282335, + "loss": 1.2152, + "step": 7797 + }, + { + "epoch": 0.2792629863734847, + "grad_norm": 1.637850046157837, + "learning_rate": 0.00016914577775529316, + "loss": 0.9437, + "step": 7798 + }, + { + "epoch": 0.279298798503053, + "grad_norm": 1.3731663227081299, + "learning_rate": 0.0001691373979674957, + "loss": 1.3764, + "step": 7799 + }, + { + "epoch": 0.27933461063262127, + "grad_norm": 1.6288890838623047, + "learning_rate": 0.00016912901724954377, + "loss": 1.2655, + "step": 7800 + }, + { + "epoch": 0.27937042276218954, + "grad_norm": 1.5614951848983765, + "learning_rate": 0.00016912063560155005, + "loss": 1.2023, + "step": 7801 + }, + { + "epoch": 0.27940623489175787, + "grad_norm": 1.9276036024093628, + "learning_rate": 0.00016911225302362738, + "loss": 1.134, + "step": 7802 + }, + { + "epoch": 0.27944204702132613, + "grad_norm": 1.663224458694458, + "learning_rate": 0.00016910386951588845, + "loss": 1.2327, + "step": 7803 + }, + { + "epoch": 0.2794778591508944, + "grad_norm": 1.4252976179122925, + "learning_rate": 0.0001690954850784461, + "loss": 1.3495, + "step": 7804 + }, + { + "epoch": 0.27951367128046267, + "grad_norm": 1.276404619216919, + "learning_rate": 0.00016908709971141312, + "loss": 1.2178, + "step": 7805 + }, + { + "epoch": 0.279549483410031, + "grad_norm": 2.218754291534424, + "learning_rate": 0.00016907871341490235, + "loss": 1.2505, + "step": 7806 + }, + { + "epoch": 0.27958529553959927, + "grad_norm": 4.20759391784668, + "learning_rate": 0.00016907032618902661, + "loss": 1.2555, + "step": 7807 + }, + { + "epoch": 0.27962110766916753, + "grad_norm": 1.5035035610198975, + "learning_rate": 0.00016906193803389868, + "loss": 1.1637, + "step": 7808 + }, + { + "epoch": 0.27965691979873586, + "grad_norm": 2.028827428817749, + "learning_rate": 0.00016905354894963147, + "loss": 1.3682, + "step": 7809 + }, + { + "epoch": 0.27969273192830413, + "grad_norm": 1.6089534759521484, + "learning_rate": 0.00016904515893633785, + "loss": 1.2886, + "step": 7810 + }, + { + "epoch": 0.2797285440578724, + "grad_norm": 1.5393871068954468, + "learning_rate": 0.0001690367679941307, + "loss": 1.1779, + "step": 7811 + }, + { + "epoch": 0.27976435618744067, + "grad_norm": 1.2825130224227905, + "learning_rate": 0.00016902837612312285, + "loss": 1.2138, + "step": 7812 + }, + { + "epoch": 0.279800168317009, + "grad_norm": 1.4812836647033691, + "learning_rate": 0.00016901998332342726, + "loss": 1.2475, + "step": 7813 + }, + { + "epoch": 0.27983598044657726, + "grad_norm": 1.2240378856658936, + "learning_rate": 0.00016901158959515682, + "loss": 1.1594, + "step": 7814 + }, + { + "epoch": 0.27987179257614553, + "grad_norm": 1.6199263334274292, + "learning_rate": 0.00016900319493842446, + "loss": 1.161, + "step": 7815 + }, + { + "epoch": 0.27990760470571385, + "grad_norm": 1.8229326009750366, + "learning_rate": 0.00016899479935334307, + "loss": 1.4038, + "step": 7816 + }, + { + "epoch": 0.2799434168352821, + "grad_norm": 1.5035443305969238, + "learning_rate": 0.0001689864028400257, + "loss": 1.1599, + "step": 7817 + }, + { + "epoch": 0.2799792289648504, + "grad_norm": 1.8030600547790527, + "learning_rate": 0.00016897800539858527, + "loss": 1.2696, + "step": 7818 + }, + { + "epoch": 0.28001504109441866, + "grad_norm": 1.4706969261169434, + "learning_rate": 0.00016896960702913476, + "loss": 1.2585, + "step": 7819 + }, + { + "epoch": 0.280050853223987, + "grad_norm": 1.414884328842163, + "learning_rate": 0.00016896120773178712, + "loss": 1.3214, + "step": 7820 + }, + { + "epoch": 0.28008666535355525, + "grad_norm": 1.7351728677749634, + "learning_rate": 0.00016895280750665542, + "loss": 1.2766, + "step": 7821 + }, + { + "epoch": 0.2801224774831235, + "grad_norm": 1.4033920764923096, + "learning_rate": 0.0001689444063538526, + "loss": 1.1574, + "step": 7822 + }, + { + "epoch": 0.2801582896126918, + "grad_norm": 3.1922378540039062, + "learning_rate": 0.00016893600427349173, + "loss": 1.0415, + "step": 7823 + }, + { + "epoch": 0.2801941017422601, + "grad_norm": 1.521330714225769, + "learning_rate": 0.00016892760126568584, + "loss": 1.1084, + "step": 7824 + }, + { + "epoch": 0.2802299138718284, + "grad_norm": 1.526863932609558, + "learning_rate": 0.00016891919733054802, + "loss": 1.1658, + "step": 7825 + }, + { + "epoch": 0.28026572600139665, + "grad_norm": 1.260337471961975, + "learning_rate": 0.00016891079246819128, + "loss": 1.1552, + "step": 7826 + }, + { + "epoch": 0.280301538130965, + "grad_norm": 1.317183256149292, + "learning_rate": 0.0001689023866787287, + "loss": 1.2575, + "step": 7827 + }, + { + "epoch": 0.28033735026053325, + "grad_norm": 1.4170114994049072, + "learning_rate": 0.00016889397996227342, + "loss": 1.2817, + "step": 7828 + }, + { + "epoch": 0.2803731623901015, + "grad_norm": 1.4670145511627197, + "learning_rate": 0.00016888557231893846, + "loss": 1.1562, + "step": 7829 + }, + { + "epoch": 0.2804089745196698, + "grad_norm": 1.298601508140564, + "learning_rate": 0.00016887716374883703, + "loss": 1.1659, + "step": 7830 + }, + { + "epoch": 0.2804447866492381, + "grad_norm": 1.307503581047058, + "learning_rate": 0.0001688687542520822, + "loss": 1.1841, + "step": 7831 + }, + { + "epoch": 0.2804805987788064, + "grad_norm": 1.2910513877868652, + "learning_rate": 0.0001688603438287871, + "loss": 1.1543, + "step": 7832 + }, + { + "epoch": 0.28051641090837465, + "grad_norm": 2.048854112625122, + "learning_rate": 0.00016885193247906488, + "loss": 1.0536, + "step": 7833 + }, + { + "epoch": 0.28055222303794297, + "grad_norm": 1.6912153959274292, + "learning_rate": 0.00016884352020302875, + "loss": 1.2872, + "step": 7834 + }, + { + "epoch": 0.28058803516751124, + "grad_norm": 1.412658929824829, + "learning_rate": 0.00016883510700079182, + "loss": 1.2582, + "step": 7835 + }, + { + "epoch": 0.2806238472970795, + "grad_norm": 1.3779224157333374, + "learning_rate": 0.00016882669287246734, + "loss": 1.2088, + "step": 7836 + }, + { + "epoch": 0.2806596594266478, + "grad_norm": 1.41794753074646, + "learning_rate": 0.0001688182778181685, + "loss": 1.1825, + "step": 7837 + }, + { + "epoch": 0.2806954715562161, + "grad_norm": 1.5987069606781006, + "learning_rate": 0.0001688098618380085, + "loss": 0.9548, + "step": 7838 + }, + { + "epoch": 0.28073128368578437, + "grad_norm": 1.592089056968689, + "learning_rate": 0.00016880144493210052, + "loss": 1.2039, + "step": 7839 + }, + { + "epoch": 0.28076709581535264, + "grad_norm": 1.6749271154403687, + "learning_rate": 0.00016879302710055792, + "loss": 1.2472, + "step": 7840 + }, + { + "epoch": 0.28080290794492097, + "grad_norm": 1.4797885417938232, + "learning_rate": 0.0001687846083434938, + "loss": 1.2968, + "step": 7841 + }, + { + "epoch": 0.28083872007448923, + "grad_norm": 1.5384573936462402, + "learning_rate": 0.00016877618866102155, + "loss": 1.2751, + "step": 7842 + }, + { + "epoch": 0.2808745322040575, + "grad_norm": 1.4807273149490356, + "learning_rate": 0.0001687677680532544, + "loss": 1.0182, + "step": 7843 + }, + { + "epoch": 0.28091034433362577, + "grad_norm": 1.8014850616455078, + "learning_rate": 0.00016875934652030563, + "loss": 1.1636, + "step": 7844 + }, + { + "epoch": 0.2809461564631941, + "grad_norm": 1.5031015872955322, + "learning_rate": 0.00016875092406228853, + "loss": 1.1143, + "step": 7845 + }, + { + "epoch": 0.28098196859276237, + "grad_norm": 2.1283063888549805, + "learning_rate": 0.00016874250067931644, + "loss": 1.2495, + "step": 7846 + }, + { + "epoch": 0.28101778072233063, + "grad_norm": 1.3208321332931519, + "learning_rate": 0.00016873407637150268, + "loss": 1.1265, + "step": 7847 + }, + { + "epoch": 0.28105359285189896, + "grad_norm": 1.807210087776184, + "learning_rate": 0.00016872565113896056, + "loss": 1.0792, + "step": 7848 + }, + { + "epoch": 0.28108940498146723, + "grad_norm": 1.9734461307525635, + "learning_rate": 0.00016871722498180346, + "loss": 1.3567, + "step": 7849 + }, + { + "epoch": 0.2811252171110355, + "grad_norm": 1.5887194871902466, + "learning_rate": 0.00016870879790014474, + "loss": 1.1578, + "step": 7850 + }, + { + "epoch": 0.28116102924060377, + "grad_norm": 1.638609766960144, + "learning_rate": 0.00016870036989409778, + "loss": 1.1259, + "step": 7851 + }, + { + "epoch": 0.2811968413701721, + "grad_norm": 1.7065489292144775, + "learning_rate": 0.00016869194096377597, + "loss": 1.1913, + "step": 7852 + }, + { + "epoch": 0.28123265349974036, + "grad_norm": 1.4921420812606812, + "learning_rate": 0.00016868351110929268, + "loss": 1.2503, + "step": 7853 + }, + { + "epoch": 0.28126846562930863, + "grad_norm": 1.3213798999786377, + "learning_rate": 0.00016867508033076135, + "loss": 0.9513, + "step": 7854 + }, + { + "epoch": 0.28130427775887695, + "grad_norm": 1.4816935062408447, + "learning_rate": 0.00016866664862829543, + "loss": 1.1104, + "step": 7855 + }, + { + "epoch": 0.2813400898884452, + "grad_norm": 1.5035449266433716, + "learning_rate": 0.00016865821600200827, + "loss": 1.3029, + "step": 7856 + }, + { + "epoch": 0.2813759020180135, + "grad_norm": 1.5792735815048218, + "learning_rate": 0.0001686497824520134, + "loss": 1.3416, + "step": 7857 + }, + { + "epoch": 0.28141171414758176, + "grad_norm": 1.4636744260787964, + "learning_rate": 0.00016864134797842426, + "loss": 1.0651, + "step": 7858 + }, + { + "epoch": 0.2814475262771501, + "grad_norm": 1.7954957485198975, + "learning_rate": 0.00016863291258135434, + "loss": 1.1419, + "step": 7859 + }, + { + "epoch": 0.28148333840671835, + "grad_norm": 1.690830111503601, + "learning_rate": 0.00016862447626091707, + "loss": 1.2627, + "step": 7860 + }, + { + "epoch": 0.2815191505362866, + "grad_norm": 1.8549681901931763, + "learning_rate": 0.00016861603901722601, + "loss": 1.0641, + "step": 7861 + }, + { + "epoch": 0.28155496266585495, + "grad_norm": 1.440242886543274, + "learning_rate": 0.00016860760085039467, + "loss": 1.1878, + "step": 7862 + }, + { + "epoch": 0.2815907747954232, + "grad_norm": 1.4408069849014282, + "learning_rate": 0.00016859916176053657, + "loss": 1.1655, + "step": 7863 + }, + { + "epoch": 0.2816265869249915, + "grad_norm": 1.7723580598831177, + "learning_rate": 0.00016859072174776522, + "loss": 1.032, + "step": 7864 + }, + { + "epoch": 0.28166239905455975, + "grad_norm": 1.576386570930481, + "learning_rate": 0.00016858228081219416, + "loss": 1.1987, + "step": 7865 + }, + { + "epoch": 0.2816982111841281, + "grad_norm": 1.7572746276855469, + "learning_rate": 0.000168573838953937, + "loss": 1.2806, + "step": 7866 + }, + { + "epoch": 0.28173402331369635, + "grad_norm": 1.8365319967269897, + "learning_rate": 0.00016856539617310728, + "loss": 1.1453, + "step": 7867 + }, + { + "epoch": 0.2817698354432646, + "grad_norm": 1.5942387580871582, + "learning_rate": 0.0001685569524698186, + "loss": 1.2367, + "step": 7868 + }, + { + "epoch": 0.28180564757283294, + "grad_norm": 1.636447787284851, + "learning_rate": 0.00016854850784418457, + "loss": 1.3408, + "step": 7869 + }, + { + "epoch": 0.2818414597024012, + "grad_norm": 1.455823302268982, + "learning_rate": 0.00016854006229631877, + "loss": 1.2418, + "step": 7870 + }, + { + "epoch": 0.2818772718319695, + "grad_norm": 1.7591160535812378, + "learning_rate": 0.00016853161582633486, + "loss": 1.3692, + "step": 7871 + }, + { + "epoch": 0.28191308396153775, + "grad_norm": 1.5553009510040283, + "learning_rate": 0.00016852316843434645, + "loss": 1.0137, + "step": 7872 + }, + { + "epoch": 0.28194889609110607, + "grad_norm": 1.4550565481185913, + "learning_rate": 0.0001685147201204672, + "loss": 1.3515, + "step": 7873 + }, + { + "epoch": 0.28198470822067434, + "grad_norm": 1.6851247549057007, + "learning_rate": 0.00016850627088481077, + "loss": 1.1369, + "step": 7874 + }, + { + "epoch": 0.2820205203502426, + "grad_norm": 1.6319035291671753, + "learning_rate": 0.0001684978207274908, + "loss": 1.2788, + "step": 7875 + }, + { + "epoch": 0.28205633247981093, + "grad_norm": 1.400048851966858, + "learning_rate": 0.00016848936964862106, + "loss": 1.0631, + "step": 7876 + }, + { + "epoch": 0.2820921446093792, + "grad_norm": 1.5202577114105225, + "learning_rate": 0.00016848091764831518, + "loss": 1.1271, + "step": 7877 + }, + { + "epoch": 0.28212795673894747, + "grad_norm": 1.9621756076812744, + "learning_rate": 0.00016847246472668684, + "loss": 1.1236, + "step": 7878 + }, + { + "epoch": 0.28216376886851574, + "grad_norm": 1.2630966901779175, + "learning_rate": 0.00016846401088384987, + "loss": 1.1755, + "step": 7879 + }, + { + "epoch": 0.28219958099808407, + "grad_norm": 1.3433924913406372, + "learning_rate": 0.0001684555561199179, + "loss": 1.1933, + "step": 7880 + }, + { + "epoch": 0.28223539312765233, + "grad_norm": 1.6120604276657104, + "learning_rate": 0.00016844710043500478, + "loss": 1.1347, + "step": 7881 + }, + { + "epoch": 0.2822712052572206, + "grad_norm": 2.0665948390960693, + "learning_rate": 0.00016843864382922418, + "loss": 1.253, + "step": 7882 + }, + { + "epoch": 0.2823070173867889, + "grad_norm": 1.880582571029663, + "learning_rate": 0.0001684301863026899, + "loss": 1.0563, + "step": 7883 + }, + { + "epoch": 0.2823428295163572, + "grad_norm": 1.6128588914871216, + "learning_rate": 0.00016842172785551572, + "loss": 1.2766, + "step": 7884 + }, + { + "epoch": 0.28237864164592547, + "grad_norm": 1.575729250907898, + "learning_rate": 0.00016841326848781546, + "loss": 1.2913, + "step": 7885 + }, + { + "epoch": 0.28241445377549373, + "grad_norm": 1.7683690786361694, + "learning_rate": 0.00016840480819970294, + "loss": 1.263, + "step": 7886 + }, + { + "epoch": 0.28245026590506206, + "grad_norm": 1.6461036205291748, + "learning_rate": 0.00016839634699129197, + "loss": 1.12, + "step": 7887 + }, + { + "epoch": 0.28248607803463033, + "grad_norm": 1.545164942741394, + "learning_rate": 0.00016838788486269634, + "loss": 1.0401, + "step": 7888 + }, + { + "epoch": 0.2825218901641986, + "grad_norm": 1.3374141454696655, + "learning_rate": 0.00016837942181402993, + "loss": 1.0428, + "step": 7889 + }, + { + "epoch": 0.2825577022937669, + "grad_norm": 1.5328984260559082, + "learning_rate": 0.00016837095784540663, + "loss": 1.1325, + "step": 7890 + }, + { + "epoch": 0.2825935144233352, + "grad_norm": 1.3864349126815796, + "learning_rate": 0.0001683624929569403, + "loss": 1.2908, + "step": 7891 + }, + { + "epoch": 0.28262932655290346, + "grad_norm": 1.5494613647460938, + "learning_rate": 0.0001683540271487448, + "loss": 1.0968, + "step": 7892 + }, + { + "epoch": 0.28266513868247173, + "grad_norm": 1.6090970039367676, + "learning_rate": 0.000168345560420934, + "loss": 1.2931, + "step": 7893 + }, + { + "epoch": 0.28270095081204005, + "grad_norm": 1.849301815032959, + "learning_rate": 0.00016833709277362186, + "loss": 1.3343, + "step": 7894 + }, + { + "epoch": 0.2827367629416083, + "grad_norm": 1.6146354675292969, + "learning_rate": 0.0001683286242069223, + "loss": 1.0503, + "step": 7895 + }, + { + "epoch": 0.2827725750711766, + "grad_norm": 2.2484848499298096, + "learning_rate": 0.00016832015472094923, + "loss": 1.0129, + "step": 7896 + }, + { + "epoch": 0.2828083872007449, + "grad_norm": 1.4916529655456543, + "learning_rate": 0.0001683116843158166, + "loss": 1.129, + "step": 7897 + }, + { + "epoch": 0.2828441993303132, + "grad_norm": 1.827230453491211, + "learning_rate": 0.00016830321299163837, + "loss": 1.2086, + "step": 7898 + }, + { + "epoch": 0.28288001145988145, + "grad_norm": 1.8629677295684814, + "learning_rate": 0.0001682947407485285, + "loss": 1.1145, + "step": 7899 + }, + { + "epoch": 0.2829158235894497, + "grad_norm": 2.0414257049560547, + "learning_rate": 0.00016828626758660104, + "loss": 1.3545, + "step": 7900 + }, + { + "epoch": 0.28295163571901805, + "grad_norm": 1.7574697732925415, + "learning_rate": 0.00016827779350596988, + "loss": 1.1382, + "step": 7901 + }, + { + "epoch": 0.2829874478485863, + "grad_norm": 1.5186235904693604, + "learning_rate": 0.00016826931850674913, + "loss": 1.22, + "step": 7902 + }, + { + "epoch": 0.2830232599781546, + "grad_norm": 1.5333185195922852, + "learning_rate": 0.0001682608425890527, + "loss": 1.1013, + "step": 7903 + }, + { + "epoch": 0.2830590721077229, + "grad_norm": 1.2719920873641968, + "learning_rate": 0.00016825236575299473, + "loss": 1.0238, + "step": 7904 + }, + { + "epoch": 0.2830948842372912, + "grad_norm": 1.6190199851989746, + "learning_rate": 0.0001682438879986892, + "loss": 1.0415, + "step": 7905 + }, + { + "epoch": 0.28313069636685945, + "grad_norm": 1.74473237991333, + "learning_rate": 0.0001682354093262502, + "loss": 1.2066, + "step": 7906 + }, + { + "epoch": 0.2831665084964277, + "grad_norm": 2.303750991821289, + "learning_rate": 0.00016822692973579177, + "loss": 1.146, + "step": 7907 + }, + { + "epoch": 0.28320232062599604, + "grad_norm": 2.065650701522827, + "learning_rate": 0.000168218449227428, + "loss": 1.3001, + "step": 7908 + }, + { + "epoch": 0.2832381327555643, + "grad_norm": 2.116710662841797, + "learning_rate": 0.00016820996780127302, + "loss": 1.1254, + "step": 7909 + }, + { + "epoch": 0.2832739448851326, + "grad_norm": 1.5394452810287476, + "learning_rate": 0.00016820148545744089, + "loss": 1.1715, + "step": 7910 + }, + { + "epoch": 0.2833097570147009, + "grad_norm": 1.7059975862503052, + "learning_rate": 0.00016819300219604572, + "loss": 1.2375, + "step": 7911 + }, + { + "epoch": 0.28334556914426917, + "grad_norm": 2.064436435699463, + "learning_rate": 0.00016818451801720169, + "loss": 1.223, + "step": 7912 + }, + { + "epoch": 0.28338138127383744, + "grad_norm": 1.8502864837646484, + "learning_rate": 0.00016817603292102292, + "loss": 1.2785, + "step": 7913 + }, + { + "epoch": 0.2834171934034057, + "grad_norm": 1.8634480237960815, + "learning_rate": 0.00016816754690762356, + "loss": 1.2917, + "step": 7914 + }, + { + "epoch": 0.28345300553297403, + "grad_norm": 2.61289381980896, + "learning_rate": 0.0001681590599771178, + "loss": 1.1359, + "step": 7915 + }, + { + "epoch": 0.2834888176625423, + "grad_norm": 1.3727120161056519, + "learning_rate": 0.00016815057212961985, + "loss": 1.167, + "step": 7916 + }, + { + "epoch": 0.28352462979211057, + "grad_norm": 1.5804487466812134, + "learning_rate": 0.0001681420833652438, + "loss": 1.1948, + "step": 7917 + }, + { + "epoch": 0.2835604419216789, + "grad_norm": 1.4904811382293701, + "learning_rate": 0.00016813359368410394, + "loss": 1.2999, + "step": 7918 + }, + { + "epoch": 0.28359625405124717, + "grad_norm": 1.2961273193359375, + "learning_rate": 0.00016812510308631445, + "loss": 1.0124, + "step": 7919 + }, + { + "epoch": 0.28363206618081543, + "grad_norm": 2.610222101211548, + "learning_rate": 0.00016811661157198956, + "loss": 1.058, + "step": 7920 + }, + { + "epoch": 0.2836678783103837, + "grad_norm": 1.512258529663086, + "learning_rate": 0.00016810811914124354, + "loss": 1.1095, + "step": 7921 + }, + { + "epoch": 0.283703690439952, + "grad_norm": 1.7233489751815796, + "learning_rate": 0.00016809962579419064, + "loss": 1.1462, + "step": 7922 + }, + { + "epoch": 0.2837395025695203, + "grad_norm": 1.8434414863586426, + "learning_rate": 0.0001680911315309451, + "loss": 1.1227, + "step": 7923 + }, + { + "epoch": 0.28377531469908857, + "grad_norm": 1.6842381954193115, + "learning_rate": 0.00016808263635162123, + "loss": 1.3436, + "step": 7924 + }, + { + "epoch": 0.2838111268286569, + "grad_norm": 2.108870267868042, + "learning_rate": 0.0001680741402563333, + "loss": 0.9564, + "step": 7925 + }, + { + "epoch": 0.28384693895822516, + "grad_norm": 1.5980929136276245, + "learning_rate": 0.00016806564324519565, + "loss": 1.1687, + "step": 7926 + }, + { + "epoch": 0.28388275108779343, + "grad_norm": 1.42555832862854, + "learning_rate": 0.00016805714531832253, + "loss": 1.1646, + "step": 7927 + }, + { + "epoch": 0.2839185632173617, + "grad_norm": 2.040743589401245, + "learning_rate": 0.00016804864647582832, + "loss": 1.3754, + "step": 7928 + }, + { + "epoch": 0.28395437534693, + "grad_norm": 1.4014520645141602, + "learning_rate": 0.00016804014671782736, + "loss": 1.2232, + "step": 7929 + }, + { + "epoch": 0.2839901874764983, + "grad_norm": 1.506685733795166, + "learning_rate": 0.00016803164604443395, + "loss": 0.9566, + "step": 7930 + }, + { + "epoch": 0.28402599960606656, + "grad_norm": 1.9319409132003784, + "learning_rate": 0.00016802314445576254, + "loss": 0.9624, + "step": 7931 + }, + { + "epoch": 0.2840618117356349, + "grad_norm": 1.742565393447876, + "learning_rate": 0.00016801464195192746, + "loss": 1.2377, + "step": 7932 + }, + { + "epoch": 0.28409762386520315, + "grad_norm": 1.6164462566375732, + "learning_rate": 0.00016800613853304311, + "loss": 1.0499, + "step": 7933 + }, + { + "epoch": 0.2841334359947714, + "grad_norm": 1.551490068435669, + "learning_rate": 0.00016799763419922387, + "loss": 1.3919, + "step": 7934 + }, + { + "epoch": 0.2841692481243397, + "grad_norm": 1.776598572731018, + "learning_rate": 0.00016798912895058416, + "loss": 1.1366, + "step": 7935 + }, + { + "epoch": 0.284205060253908, + "grad_norm": 1.8811665773391724, + "learning_rate": 0.00016798062278723845, + "loss": 1.2767, + "step": 7936 + }, + { + "epoch": 0.2842408723834763, + "grad_norm": 1.4863827228546143, + "learning_rate": 0.00016797211570930115, + "loss": 1.1666, + "step": 7937 + }, + { + "epoch": 0.28427668451304455, + "grad_norm": 2.006511688232422, + "learning_rate": 0.0001679636077168867, + "loss": 1.4062, + "step": 7938 + }, + { + "epoch": 0.2843124966426129, + "grad_norm": 1.1677989959716797, + "learning_rate": 0.00016795509881010955, + "loss": 1.1489, + "step": 7939 + }, + { + "epoch": 0.28434830877218115, + "grad_norm": 1.4772629737854004, + "learning_rate": 0.00016794658898908424, + "loss": 1.1926, + "step": 7940 + }, + { + "epoch": 0.2843841209017494, + "grad_norm": 1.9621127843856812, + "learning_rate": 0.00016793807825392517, + "loss": 1.2169, + "step": 7941 + }, + { + "epoch": 0.2844199330313177, + "grad_norm": 1.5559625625610352, + "learning_rate": 0.00016792956660474694, + "loss": 1.2533, + "step": 7942 + }, + { + "epoch": 0.284455745160886, + "grad_norm": 1.4378386735916138, + "learning_rate": 0.00016792105404166404, + "loss": 1.2568, + "step": 7943 + }, + { + "epoch": 0.2844915572904543, + "grad_norm": 1.6056135892868042, + "learning_rate": 0.00016791254056479092, + "loss": 0.9825, + "step": 7944 + }, + { + "epoch": 0.28452736942002255, + "grad_norm": 1.751481056213379, + "learning_rate": 0.00016790402617424216, + "loss": 1.1473, + "step": 7945 + }, + { + "epoch": 0.28456318154959087, + "grad_norm": 1.5342391729354858, + "learning_rate": 0.00016789551087013232, + "loss": 1.3686, + "step": 7946 + }, + { + "epoch": 0.28459899367915914, + "grad_norm": 1.4484928846359253, + "learning_rate": 0.00016788699465257597, + "loss": 1.1546, + "step": 7947 + }, + { + "epoch": 0.2846348058087274, + "grad_norm": 2.30951189994812, + "learning_rate": 0.00016787847752168769, + "loss": 1.0333, + "step": 7948 + }, + { + "epoch": 0.2846706179382957, + "grad_norm": 1.6998987197875977, + "learning_rate": 0.00016786995947758204, + "loss": 1.2318, + "step": 7949 + }, + { + "epoch": 0.284706430067864, + "grad_norm": 1.4360953569412231, + "learning_rate": 0.00016786144052037365, + "loss": 1.1033, + "step": 7950 + }, + { + "epoch": 0.28474224219743227, + "grad_norm": 1.6733812093734741, + "learning_rate": 0.00016785292065017707, + "loss": 1.1167, + "step": 7951 + }, + { + "epoch": 0.28477805432700054, + "grad_norm": 2.0459580421447754, + "learning_rate": 0.000167844399867107, + "loss": 1.1325, + "step": 7952 + }, + { + "epoch": 0.28481386645656886, + "grad_norm": 1.5506911277770996, + "learning_rate": 0.00016783587817127804, + "loss": 1.0077, + "step": 7953 + }, + { + "epoch": 0.28484967858613713, + "grad_norm": 1.800087809562683, + "learning_rate": 0.00016782735556280484, + "loss": 1.2656, + "step": 7954 + }, + { + "epoch": 0.2848854907157054, + "grad_norm": 1.414759635925293, + "learning_rate": 0.00016781883204180207, + "loss": 1.1504, + "step": 7955 + }, + { + "epoch": 0.28492130284527367, + "grad_norm": 1.715881109237671, + "learning_rate": 0.00016781030760838436, + "loss": 1.3658, + "step": 7956 + }, + { + "epoch": 0.284957114974842, + "grad_norm": 1.5025968551635742, + "learning_rate": 0.00016780178226266646, + "loss": 1.2579, + "step": 7957 + }, + { + "epoch": 0.28499292710441027, + "grad_norm": 1.6438994407653809, + "learning_rate": 0.00016779325600476303, + "loss": 1.2848, + "step": 7958 + }, + { + "epoch": 0.28502873923397853, + "grad_norm": 1.7322893142700195, + "learning_rate": 0.00016778472883478878, + "loss": 1.1508, + "step": 7959 + }, + { + "epoch": 0.28506455136354686, + "grad_norm": 1.5117928981781006, + "learning_rate": 0.00016777620075285847, + "loss": 1.0268, + "step": 7960 + }, + { + "epoch": 0.2851003634931151, + "grad_norm": 1.9253220558166504, + "learning_rate": 0.00016776767175908676, + "loss": 1.2933, + "step": 7961 + }, + { + "epoch": 0.2851361756226834, + "grad_norm": 1.5724892616271973, + "learning_rate": 0.00016775914185358846, + "loss": 1.2161, + "step": 7962 + }, + { + "epoch": 0.28517198775225167, + "grad_norm": 2.4197516441345215, + "learning_rate": 0.00016775061103647834, + "loss": 1.2328, + "step": 7963 + }, + { + "epoch": 0.28520779988182, + "grad_norm": 1.441402554512024, + "learning_rate": 0.00016774207930787108, + "loss": 1.2313, + "step": 7964 + }, + { + "epoch": 0.28524361201138826, + "grad_norm": 2.1083645820617676, + "learning_rate": 0.00016773354666788155, + "loss": 0.9026, + "step": 7965 + }, + { + "epoch": 0.28527942414095653, + "grad_norm": 1.7798165082931519, + "learning_rate": 0.00016772501311662454, + "loss": 1.2427, + "step": 7966 + }, + { + "epoch": 0.28531523627052485, + "grad_norm": 1.5478897094726562, + "learning_rate": 0.00016771647865421483, + "loss": 1.2062, + "step": 7967 + }, + { + "epoch": 0.2853510484000931, + "grad_norm": 1.6237306594848633, + "learning_rate": 0.00016770794328076726, + "loss": 1.2446, + "step": 7968 + }, + { + "epoch": 0.2853868605296614, + "grad_norm": 1.4814374446868896, + "learning_rate": 0.00016769940699639662, + "loss": 1.1358, + "step": 7969 + }, + { + "epoch": 0.28542267265922966, + "grad_norm": 1.455759048461914, + "learning_rate": 0.0001676908698012178, + "loss": 1.3051, + "step": 7970 + }, + { + "epoch": 0.285458484788798, + "grad_norm": 1.1294089555740356, + "learning_rate": 0.0001676823316953456, + "loss": 0.891, + "step": 7971 + }, + { + "epoch": 0.28549429691836625, + "grad_norm": 2.428621768951416, + "learning_rate": 0.00016767379267889498, + "loss": 1.2428, + "step": 7972 + }, + { + "epoch": 0.2855301090479345, + "grad_norm": 1.295573115348816, + "learning_rate": 0.00016766525275198078, + "loss": 1.2519, + "step": 7973 + }, + { + "epoch": 0.28556592117750285, + "grad_norm": 1.469852328300476, + "learning_rate": 0.00016765671191471785, + "loss": 1.162, + "step": 7974 + }, + { + "epoch": 0.2856017333070711, + "grad_norm": 1.3055137395858765, + "learning_rate": 0.00016764817016722114, + "loss": 1.2029, + "step": 7975 + }, + { + "epoch": 0.2856375454366394, + "grad_norm": 1.4202829599380493, + "learning_rate": 0.00016763962750960558, + "loss": 1.1864, + "step": 7976 + }, + { + "epoch": 0.28567335756620765, + "grad_norm": 2.09348464012146, + "learning_rate": 0.00016763108394198605, + "loss": 1.4691, + "step": 7977 + }, + { + "epoch": 0.285709169695776, + "grad_norm": 1.6974684000015259, + "learning_rate": 0.00016762253946447757, + "loss": 1.2094, + "step": 7978 + }, + { + "epoch": 0.28574498182534425, + "grad_norm": 1.7124760150909424, + "learning_rate": 0.000167613994077195, + "loss": 1.2269, + "step": 7979 + }, + { + "epoch": 0.2857807939549125, + "grad_norm": 1.5560866594314575, + "learning_rate": 0.00016760544778025337, + "loss": 1.3767, + "step": 7980 + }, + { + "epoch": 0.28581660608448084, + "grad_norm": 2.3291079998016357, + "learning_rate": 0.00016759690057376769, + "loss": 1.3598, + "step": 7981 + }, + { + "epoch": 0.2858524182140491, + "grad_norm": 1.5138038396835327, + "learning_rate": 0.00016758835245785284, + "loss": 1.2322, + "step": 7982 + }, + { + "epoch": 0.2858882303436174, + "grad_norm": 1.5037084817886353, + "learning_rate": 0.00016757980343262393, + "loss": 1.3124, + "step": 7983 + }, + { + "epoch": 0.28592404247318565, + "grad_norm": 1.4848960638046265, + "learning_rate": 0.00016757125349819592, + "loss": 1.3295, + "step": 7984 + }, + { + "epoch": 0.28595985460275397, + "grad_norm": 1.3306736946105957, + "learning_rate": 0.00016756270265468385, + "loss": 1.0548, + "step": 7985 + }, + { + "epoch": 0.28599566673232224, + "grad_norm": 1.753389596939087, + "learning_rate": 0.00016755415090220278, + "loss": 1.1712, + "step": 7986 + }, + { + "epoch": 0.2860314788618905, + "grad_norm": 1.6381059885025024, + "learning_rate": 0.00016754559824086774, + "loss": 1.0729, + "step": 7987 + }, + { + "epoch": 0.28606729099145883, + "grad_norm": 1.4113043546676636, + "learning_rate": 0.00016753704467079383, + "loss": 1.2383, + "step": 7988 + }, + { + "epoch": 0.2861031031210271, + "grad_norm": 1.3656153678894043, + "learning_rate": 0.00016752849019209607, + "loss": 1.059, + "step": 7989 + }, + { + "epoch": 0.28613891525059537, + "grad_norm": 1.5359585285186768, + "learning_rate": 0.00016751993480488956, + "loss": 1.3204, + "step": 7990 + }, + { + "epoch": 0.28617472738016364, + "grad_norm": 2.9075515270233154, + "learning_rate": 0.0001675113785092895, + "loss": 1.3485, + "step": 7991 + }, + { + "epoch": 0.28621053950973196, + "grad_norm": 1.4807275533676147, + "learning_rate": 0.00016750282130541084, + "loss": 1.1407, + "step": 7992 + }, + { + "epoch": 0.28624635163930023, + "grad_norm": 1.4793347120285034, + "learning_rate": 0.00016749426319336884, + "loss": 1.2019, + "step": 7993 + }, + { + "epoch": 0.2862821637688685, + "grad_norm": 1.322943925857544, + "learning_rate": 0.00016748570417327857, + "loss": 1.1931, + "step": 7994 + }, + { + "epoch": 0.2863179758984368, + "grad_norm": 1.8958892822265625, + "learning_rate": 0.0001674771442452552, + "loss": 1.2137, + "step": 7995 + }, + { + "epoch": 0.2863537880280051, + "grad_norm": 1.5802631378173828, + "learning_rate": 0.0001674685834094139, + "loss": 1.2206, + "step": 7996 + }, + { + "epoch": 0.28638960015757337, + "grad_norm": 1.9816296100616455, + "learning_rate": 0.00016746002166586984, + "loss": 1.2278, + "step": 7997 + }, + { + "epoch": 0.28642541228714163, + "grad_norm": 1.107828974723816, + "learning_rate": 0.00016745145901473819, + "loss": 1.0519, + "step": 7998 + }, + { + "epoch": 0.28646122441670996, + "grad_norm": 1.6063913106918335, + "learning_rate": 0.0001674428954561342, + "loss": 1.2248, + "step": 7999 + }, + { + "epoch": 0.2864970365462782, + "grad_norm": 1.5787841081619263, + "learning_rate": 0.000167434330990173, + "loss": 1.1257, + "step": 8000 + }, + { + "epoch": 0.2865328486758465, + "grad_norm": 1.7356359958648682, + "learning_rate": 0.0001674257656169699, + "loss": 1.3958, + "step": 8001 + }, + { + "epoch": 0.2865686608054148, + "grad_norm": 1.3748154640197754, + "learning_rate": 0.00016741719933664008, + "loss": 1.1319, + "step": 8002 + }, + { + "epoch": 0.2866044729349831, + "grad_norm": 1.9505913257598877, + "learning_rate": 0.00016740863214929883, + "loss": 1.0992, + "step": 8003 + }, + { + "epoch": 0.28664028506455136, + "grad_norm": 1.3370112180709839, + "learning_rate": 0.00016740006405506133, + "loss": 1.0943, + "step": 8004 + }, + { + "epoch": 0.28667609719411963, + "grad_norm": 1.2519564628601074, + "learning_rate": 0.00016739149505404298, + "loss": 0.9158, + "step": 8005 + }, + { + "epoch": 0.28671190932368795, + "grad_norm": 1.827633261680603, + "learning_rate": 0.00016738292514635893, + "loss": 1.0601, + "step": 8006 + }, + { + "epoch": 0.2867477214532562, + "grad_norm": 1.4122846126556396, + "learning_rate": 0.0001673743543321246, + "loss": 1.1201, + "step": 8007 + }, + { + "epoch": 0.2867835335828245, + "grad_norm": 2.0478696823120117, + "learning_rate": 0.00016736578261145518, + "loss": 1.3813, + "step": 8008 + }, + { + "epoch": 0.2868193457123928, + "grad_norm": 1.5580182075500488, + "learning_rate": 0.00016735720998446607, + "loss": 0.9973, + "step": 8009 + }, + { + "epoch": 0.2868551578419611, + "grad_norm": 1.7295955419540405, + "learning_rate": 0.0001673486364512726, + "loss": 1.1728, + "step": 8010 + }, + { + "epoch": 0.28689096997152935, + "grad_norm": 1.2298508882522583, + "learning_rate": 0.00016734006201199006, + "loss": 1.0518, + "step": 8011 + }, + { + "epoch": 0.2869267821010976, + "grad_norm": 1.8944278955459595, + "learning_rate": 0.00016733148666673388, + "loss": 1.1177, + "step": 8012 + }, + { + "epoch": 0.28696259423066595, + "grad_norm": 1.6743289232254028, + "learning_rate": 0.0001673229104156194, + "loss": 1.1448, + "step": 8013 + }, + { + "epoch": 0.2869984063602342, + "grad_norm": 1.459035873413086, + "learning_rate": 0.000167314333258762, + "loss": 1.1881, + "step": 8014 + }, + { + "epoch": 0.2870342184898025, + "grad_norm": 1.7478358745574951, + "learning_rate": 0.00016730575519627707, + "loss": 1.1464, + "step": 8015 + }, + { + "epoch": 0.2870700306193708, + "grad_norm": 1.5208065509796143, + "learning_rate": 0.00016729717622828002, + "loss": 1.0701, + "step": 8016 + }, + { + "epoch": 0.2871058427489391, + "grad_norm": 1.7472354173660278, + "learning_rate": 0.00016728859635488626, + "loss": 1.2165, + "step": 8017 + }, + { + "epoch": 0.28714165487850735, + "grad_norm": 1.68996000289917, + "learning_rate": 0.00016728001557621126, + "loss": 1.1786, + "step": 8018 + }, + { + "epoch": 0.2871774670080756, + "grad_norm": 1.3784352540969849, + "learning_rate": 0.00016727143389237042, + "loss": 1.1162, + "step": 8019 + }, + { + "epoch": 0.28721327913764394, + "grad_norm": 1.649593710899353, + "learning_rate": 0.0001672628513034792, + "loss": 1.1973, + "step": 8020 + }, + { + "epoch": 0.2872490912672122, + "grad_norm": 1.70454740524292, + "learning_rate": 0.0001672542678096531, + "loss": 1.3082, + "step": 8021 + }, + { + "epoch": 0.2872849033967805, + "grad_norm": 1.8595786094665527, + "learning_rate": 0.00016724568341100758, + "loss": 1.1546, + "step": 8022 + }, + { + "epoch": 0.28732071552634875, + "grad_norm": 1.2866063117980957, + "learning_rate": 0.0001672370981076581, + "loss": 1.0944, + "step": 8023 + }, + { + "epoch": 0.28735652765591707, + "grad_norm": 1.701101303100586, + "learning_rate": 0.00016722851189972024, + "loss": 1.1395, + "step": 8024 + }, + { + "epoch": 0.28739233978548534, + "grad_norm": 1.4356117248535156, + "learning_rate": 0.00016721992478730942, + "loss": 1.0265, + "step": 8025 + }, + { + "epoch": 0.2874281519150536, + "grad_norm": 1.3101977109909058, + "learning_rate": 0.00016721133677054123, + "loss": 1.0928, + "step": 8026 + }, + { + "epoch": 0.28746396404462193, + "grad_norm": 1.3240046501159668, + "learning_rate": 0.00016720274784953122, + "loss": 0.9151, + "step": 8027 + }, + { + "epoch": 0.2874997761741902, + "grad_norm": 1.416304588317871, + "learning_rate": 0.00016719415802439493, + "loss": 1.1234, + "step": 8028 + }, + { + "epoch": 0.28753558830375847, + "grad_norm": 1.5309293270111084, + "learning_rate": 0.0001671855672952479, + "loss": 1.039, + "step": 8029 + }, + { + "epoch": 0.28757140043332674, + "grad_norm": 1.6284602880477905, + "learning_rate": 0.00016717697566220573, + "loss": 1.2168, + "step": 8030 + }, + { + "epoch": 0.28760721256289506, + "grad_norm": 1.510253667831421, + "learning_rate": 0.00016716838312538402, + "loss": 1.2025, + "step": 8031 + }, + { + "epoch": 0.28764302469246333, + "grad_norm": 1.4302504062652588, + "learning_rate": 0.00016715978968489834, + "loss": 1.1975, + "step": 8032 + }, + { + "epoch": 0.2876788368220316, + "grad_norm": 1.4557428359985352, + "learning_rate": 0.0001671511953408643, + "loss": 1.1901, + "step": 8033 + }, + { + "epoch": 0.2877146489515999, + "grad_norm": 1.4463789463043213, + "learning_rate": 0.0001671426000933976, + "loss": 1.2003, + "step": 8034 + }, + { + "epoch": 0.2877504610811682, + "grad_norm": 1.3606035709381104, + "learning_rate": 0.00016713400394261378, + "loss": 1.1915, + "step": 8035 + }, + { + "epoch": 0.28778627321073647, + "grad_norm": 1.3841557502746582, + "learning_rate": 0.00016712540688862854, + "loss": 1.4126, + "step": 8036 + }, + { + "epoch": 0.28782208534030473, + "grad_norm": 1.3239991664886475, + "learning_rate": 0.0001671168089315575, + "loss": 1.1983, + "step": 8037 + }, + { + "epoch": 0.28785789746987306, + "grad_norm": 2.0507869720458984, + "learning_rate": 0.00016710821007151646, + "loss": 1.2569, + "step": 8038 + }, + { + "epoch": 0.2878937095994413, + "grad_norm": 2.6280429363250732, + "learning_rate": 0.00016709961030862092, + "loss": 1.0492, + "step": 8039 + }, + { + "epoch": 0.2879295217290096, + "grad_norm": 1.4531506299972534, + "learning_rate": 0.00016709100964298673, + "loss": 0.865, + "step": 8040 + }, + { + "epoch": 0.2879653338585779, + "grad_norm": 1.7660350799560547, + "learning_rate": 0.00016708240807472956, + "loss": 1.4253, + "step": 8041 + }, + { + "epoch": 0.2880011459881462, + "grad_norm": 1.2696508169174194, + "learning_rate": 0.00016707380560396508, + "loss": 1.2577, + "step": 8042 + }, + { + "epoch": 0.28803695811771446, + "grad_norm": 1.9519377946853638, + "learning_rate": 0.0001670652022308091, + "loss": 1.1453, + "step": 8043 + }, + { + "epoch": 0.28807277024728273, + "grad_norm": 1.4691392183303833, + "learning_rate": 0.0001670565979553773, + "loss": 1.2781, + "step": 8044 + }, + { + "epoch": 0.28810858237685105, + "grad_norm": 1.7306978702545166, + "learning_rate": 0.0001670479927777855, + "loss": 1.0571, + "step": 8045 + }, + { + "epoch": 0.2881443945064193, + "grad_norm": 1.8794736862182617, + "learning_rate": 0.0001670393866981494, + "loss": 1.2832, + "step": 8046 + }, + { + "epoch": 0.2881802066359876, + "grad_norm": 1.1929636001586914, + "learning_rate": 0.00016703077971658487, + "loss": 1.1059, + "step": 8047 + }, + { + "epoch": 0.2882160187655559, + "grad_norm": 1.718522548675537, + "learning_rate": 0.00016702217183320762, + "loss": 1.2021, + "step": 8048 + }, + { + "epoch": 0.2882518308951242, + "grad_norm": 1.3166879415512085, + "learning_rate": 0.00016701356304813357, + "loss": 1.1097, + "step": 8049 + }, + { + "epoch": 0.28828764302469245, + "grad_norm": 1.9643526077270508, + "learning_rate": 0.00016700495336147841, + "loss": 1.1997, + "step": 8050 + }, + { + "epoch": 0.2883234551542607, + "grad_norm": 1.383708119392395, + "learning_rate": 0.00016699634277335805, + "loss": 1.0974, + "step": 8051 + }, + { + "epoch": 0.28835926728382905, + "grad_norm": 2.118687629699707, + "learning_rate": 0.00016698773128388832, + "loss": 1.3139, + "step": 8052 + }, + { + "epoch": 0.2883950794133973, + "grad_norm": 1.3097846508026123, + "learning_rate": 0.00016697911889318508, + "loss": 1.0788, + "step": 8053 + }, + { + "epoch": 0.2884308915429656, + "grad_norm": 1.3427093029022217, + "learning_rate": 0.00016697050560136417, + "loss": 1.029, + "step": 8054 + }, + { + "epoch": 0.2884667036725339, + "grad_norm": 1.1717629432678223, + "learning_rate": 0.0001669618914085415, + "loss": 1.1278, + "step": 8055 + }, + { + "epoch": 0.2885025158021022, + "grad_norm": 1.7847486734390259, + "learning_rate": 0.00016695327631483298, + "loss": 1.4957, + "step": 8056 + }, + { + "epoch": 0.28853832793167045, + "grad_norm": 1.2964529991149902, + "learning_rate": 0.00016694466032035447, + "loss": 1.1242, + "step": 8057 + }, + { + "epoch": 0.2885741400612387, + "grad_norm": 1.2974356412887573, + "learning_rate": 0.0001669360434252219, + "loss": 1.1973, + "step": 8058 + }, + { + "epoch": 0.28860995219080704, + "grad_norm": 2.2001662254333496, + "learning_rate": 0.00016692742562955123, + "loss": 1.1595, + "step": 8059 + }, + { + "epoch": 0.2886457643203753, + "grad_norm": 1.3302936553955078, + "learning_rate": 0.00016691880693345837, + "loss": 1.1163, + "step": 8060 + }, + { + "epoch": 0.2886815764499436, + "grad_norm": 1.262312650680542, + "learning_rate": 0.00016691018733705926, + "loss": 1.1674, + "step": 8061 + }, + { + "epoch": 0.2887173885795119, + "grad_norm": 1.5988377332687378, + "learning_rate": 0.00016690156684046991, + "loss": 1.1402, + "step": 8062 + }, + { + "epoch": 0.28875320070908017, + "grad_norm": 1.6477662324905396, + "learning_rate": 0.00016689294544380628, + "loss": 1.2734, + "step": 8063 + }, + { + "epoch": 0.28878901283864844, + "grad_norm": 1.8863489627838135, + "learning_rate": 0.00016688432314718434, + "loss": 1.21, + "step": 8064 + }, + { + "epoch": 0.2888248249682167, + "grad_norm": 1.7038155794143677, + "learning_rate": 0.0001668756999507201, + "loss": 1.3308, + "step": 8065 + }, + { + "epoch": 0.28886063709778503, + "grad_norm": 2.4914073944091797, + "learning_rate": 0.00016686707585452962, + "loss": 1.3897, + "step": 8066 + }, + { + "epoch": 0.2888964492273533, + "grad_norm": 1.2741557359695435, + "learning_rate": 0.00016685845085872883, + "loss": 0.9721, + "step": 8067 + }, + { + "epoch": 0.28893226135692157, + "grad_norm": 1.7769873142242432, + "learning_rate": 0.00016684982496343386, + "loss": 1.3938, + "step": 8068 + }, + { + "epoch": 0.2889680734864899, + "grad_norm": 1.7354387044906616, + "learning_rate": 0.0001668411981687607, + "loss": 1.0534, + "step": 8069 + }, + { + "epoch": 0.28900388561605816, + "grad_norm": 1.7217462062835693, + "learning_rate": 0.00016683257047482548, + "loss": 1.3798, + "step": 8070 + }, + { + "epoch": 0.28903969774562643, + "grad_norm": 1.4092457294464111, + "learning_rate": 0.0001668239418817442, + "loss": 1.1991, + "step": 8071 + }, + { + "epoch": 0.2890755098751947, + "grad_norm": 1.7520629167556763, + "learning_rate": 0.000166815312389633, + "loss": 1.0425, + "step": 8072 + }, + { + "epoch": 0.289111322004763, + "grad_norm": 1.7505134344100952, + "learning_rate": 0.00016680668199860793, + "loss": 1.3464, + "step": 8073 + }, + { + "epoch": 0.2891471341343313, + "grad_norm": 1.6201471090316772, + "learning_rate": 0.00016679805070878514, + "loss": 1.2471, + "step": 8074 + }, + { + "epoch": 0.28918294626389957, + "grad_norm": 1.6702793836593628, + "learning_rate": 0.00016678941852028075, + "loss": 1.3453, + "step": 8075 + }, + { + "epoch": 0.2892187583934679, + "grad_norm": 1.8024673461914062, + "learning_rate": 0.0001667807854332109, + "loss": 1.145, + "step": 8076 + }, + { + "epoch": 0.28925457052303616, + "grad_norm": 1.5180423259735107, + "learning_rate": 0.0001667721514476917, + "loss": 1.2635, + "step": 8077 + }, + { + "epoch": 0.2892903826526044, + "grad_norm": 1.5333855152130127, + "learning_rate": 0.0001667635165638393, + "loss": 1.1535, + "step": 8078 + }, + { + "epoch": 0.2893261947821727, + "grad_norm": 1.3337444067001343, + "learning_rate": 0.00016675488078176994, + "loss": 1.1691, + "step": 8079 + }, + { + "epoch": 0.289362006911741, + "grad_norm": 1.5022257566452026, + "learning_rate": 0.00016674624410159978, + "loss": 1.2396, + "step": 8080 + }, + { + "epoch": 0.2893978190413093, + "grad_norm": 1.5448452234268188, + "learning_rate": 0.000166737606523445, + "loss": 1.3456, + "step": 8081 + }, + { + "epoch": 0.28943363117087756, + "grad_norm": 1.6367753744125366, + "learning_rate": 0.00016672896804742178, + "loss": 1.3737, + "step": 8082 + }, + { + "epoch": 0.2894694433004459, + "grad_norm": 1.4022997617721558, + "learning_rate": 0.00016672032867364638, + "loss": 1.2676, + "step": 8083 + }, + { + "epoch": 0.28950525543001415, + "grad_norm": 1.4363073110580444, + "learning_rate": 0.00016671168840223503, + "loss": 1.1635, + "step": 8084 + }, + { + "epoch": 0.2895410675595824, + "grad_norm": 1.7110960483551025, + "learning_rate": 0.00016670304723330397, + "loss": 1.1666, + "step": 8085 + }, + { + "epoch": 0.2895768796891507, + "grad_norm": 1.737815499305725, + "learning_rate": 0.00016669440516696945, + "loss": 1.0689, + "step": 8086 + }, + { + "epoch": 0.289612691818719, + "grad_norm": 1.623739242553711, + "learning_rate": 0.0001666857622033477, + "loss": 1.2408, + "step": 8087 + }, + { + "epoch": 0.2896485039482873, + "grad_norm": 1.8960233926773071, + "learning_rate": 0.00016667711834255505, + "loss": 1.2934, + "step": 8088 + }, + { + "epoch": 0.28968431607785555, + "grad_norm": 1.4767926931381226, + "learning_rate": 0.0001666684735847078, + "loss": 0.9162, + "step": 8089 + }, + { + "epoch": 0.2897201282074239, + "grad_norm": 1.361946702003479, + "learning_rate": 0.00016665982792992226, + "loss": 1.1001, + "step": 8090 + }, + { + "epoch": 0.28975594033699215, + "grad_norm": 1.6917883157730103, + "learning_rate": 0.00016665118137831468, + "loss": 1.0502, + "step": 8091 + }, + { + "epoch": 0.2897917524665604, + "grad_norm": 1.5359561443328857, + "learning_rate": 0.00016664253393000144, + "loss": 1.3591, + "step": 8092 + }, + { + "epoch": 0.2898275645961287, + "grad_norm": 1.7416077852249146, + "learning_rate": 0.00016663388558509887, + "loss": 1.2398, + "step": 8093 + }, + { + "epoch": 0.289863376725697, + "grad_norm": 1.2893725633621216, + "learning_rate": 0.00016662523634372334, + "loss": 1.1943, + "step": 8094 + }, + { + "epoch": 0.2898991888552653, + "grad_norm": 1.5725760459899902, + "learning_rate": 0.00016661658620599113, + "loss": 1.2111, + "step": 8095 + }, + { + "epoch": 0.28993500098483355, + "grad_norm": 1.828575611114502, + "learning_rate": 0.00016660793517201875, + "loss": 1.2075, + "step": 8096 + }, + { + "epoch": 0.28997081311440187, + "grad_norm": 1.7075490951538086, + "learning_rate": 0.00016659928324192248, + "loss": 1.3437, + "step": 8097 + }, + { + "epoch": 0.29000662524397014, + "grad_norm": 1.6411799192428589, + "learning_rate": 0.0001665906304158188, + "loss": 1.1974, + "step": 8098 + }, + { + "epoch": 0.2900424373735384, + "grad_norm": 1.4476815462112427, + "learning_rate": 0.00016658197669382405, + "loss": 1.1505, + "step": 8099 + }, + { + "epoch": 0.2900782495031067, + "grad_norm": 2.383030414581299, + "learning_rate": 0.0001665733220760547, + "loss": 1.241, + "step": 8100 + }, + { + "epoch": 0.290114061632675, + "grad_norm": 1.681451678276062, + "learning_rate": 0.00016656466656262718, + "loss": 1.0772, + "step": 8101 + }, + { + "epoch": 0.29014987376224327, + "grad_norm": 1.3654322624206543, + "learning_rate": 0.00016655601015365794, + "loss": 1.1675, + "step": 8102 + }, + { + "epoch": 0.29018568589181154, + "grad_norm": 1.1934351921081543, + "learning_rate": 0.00016654735284926341, + "loss": 1.1811, + "step": 8103 + }, + { + "epoch": 0.29022149802137986, + "grad_norm": 1.2242465019226074, + "learning_rate": 0.00016653869464956008, + "loss": 1.1242, + "step": 8104 + }, + { + "epoch": 0.29025731015094813, + "grad_norm": 1.7996327877044678, + "learning_rate": 0.00016653003555466448, + "loss": 1.036, + "step": 8105 + }, + { + "epoch": 0.2902931222805164, + "grad_norm": 1.376822590827942, + "learning_rate": 0.00016652137556469305, + "loss": 1.0009, + "step": 8106 + }, + { + "epoch": 0.29032893441008467, + "grad_norm": 2.079237937927246, + "learning_rate": 0.00016651271467976232, + "loss": 1.1166, + "step": 8107 + }, + { + "epoch": 0.290364746539653, + "grad_norm": 1.9181843996047974, + "learning_rate": 0.0001665040528999888, + "loss": 1.1711, + "step": 8108 + }, + { + "epoch": 0.29040055866922126, + "grad_norm": 1.4172486066818237, + "learning_rate": 0.00016649539022548903, + "loss": 1.1643, + "step": 8109 + }, + { + "epoch": 0.29043637079878953, + "grad_norm": 1.7491180896759033, + "learning_rate": 0.00016648672665637958, + "loss": 1.1627, + "step": 8110 + }, + { + "epoch": 0.29047218292835786, + "grad_norm": 1.6299803256988525, + "learning_rate": 0.00016647806219277698, + "loss": 1.1655, + "step": 8111 + }, + { + "epoch": 0.2905079950579261, + "grad_norm": 1.8102400302886963, + "learning_rate": 0.0001664693968347978, + "loss": 1.3002, + "step": 8112 + }, + { + "epoch": 0.2905438071874944, + "grad_norm": 2.3789608478546143, + "learning_rate": 0.00016646073058255862, + "loss": 1.0981, + "step": 8113 + }, + { + "epoch": 0.29057961931706267, + "grad_norm": 1.767542839050293, + "learning_rate": 0.00016645206343617603, + "loss": 1.3367, + "step": 8114 + }, + { + "epoch": 0.290615431446631, + "grad_norm": 1.666918396949768, + "learning_rate": 0.00016644339539576664, + "loss": 1.1529, + "step": 8115 + }, + { + "epoch": 0.29065124357619926, + "grad_norm": 1.4458760023117065, + "learning_rate": 0.0001664347264614471, + "loss": 1.2537, + "step": 8116 + }, + { + "epoch": 0.2906870557057675, + "grad_norm": 1.543701410293579, + "learning_rate": 0.000166426056633334, + "loss": 1.3265, + "step": 8117 + }, + { + "epoch": 0.29072286783533585, + "grad_norm": 1.6016907691955566, + "learning_rate": 0.00016641738591154396, + "loss": 1.1844, + "step": 8118 + }, + { + "epoch": 0.2907586799649041, + "grad_norm": 1.37652587890625, + "learning_rate": 0.00016640871429619372, + "loss": 1.1474, + "step": 8119 + }, + { + "epoch": 0.2907944920944724, + "grad_norm": 1.7977780103683472, + "learning_rate": 0.00016640004178739985, + "loss": 1.2157, + "step": 8120 + }, + { + "epoch": 0.29083030422404066, + "grad_norm": 1.9022331237792969, + "learning_rate": 0.0001663913683852791, + "loss": 0.8922, + "step": 8121 + }, + { + "epoch": 0.290866116353609, + "grad_norm": 1.4317246675491333, + "learning_rate": 0.00016638269408994808, + "loss": 1.2038, + "step": 8122 + }, + { + "epoch": 0.29090192848317725, + "grad_norm": 1.638366937637329, + "learning_rate": 0.00016637401890152358, + "loss": 1.0698, + "step": 8123 + }, + { + "epoch": 0.2909377406127455, + "grad_norm": 1.8835279941558838, + "learning_rate": 0.00016636534282012225, + "loss": 1.1287, + "step": 8124 + }, + { + "epoch": 0.29097355274231385, + "grad_norm": 2.037776470184326, + "learning_rate": 0.00016635666584586083, + "loss": 1.1346, + "step": 8125 + }, + { + "epoch": 0.2910093648718821, + "grad_norm": 1.909350872039795, + "learning_rate": 0.00016634798797885607, + "loss": 1.2519, + "step": 8126 + }, + { + "epoch": 0.2910451770014504, + "grad_norm": 1.7102725505828857, + "learning_rate": 0.00016633930921922474, + "loss": 1.1752, + "step": 8127 + }, + { + "epoch": 0.29108098913101865, + "grad_norm": 1.5282775163650513, + "learning_rate": 0.00016633062956708354, + "loss": 1.0715, + "step": 8128 + }, + { + "epoch": 0.291116801260587, + "grad_norm": 1.7743877172470093, + "learning_rate": 0.0001663219490225493, + "loss": 1.0518, + "step": 8129 + }, + { + "epoch": 0.29115261339015525, + "grad_norm": 1.4563425779342651, + "learning_rate": 0.0001663132675857388, + "loss": 1.4151, + "step": 8130 + }, + { + "epoch": 0.2911884255197235, + "grad_norm": 1.5126514434814453, + "learning_rate": 0.0001663045852567688, + "loss": 1.2058, + "step": 8131 + }, + { + "epoch": 0.29122423764929184, + "grad_norm": 1.5654454231262207, + "learning_rate": 0.00016629590203575613, + "loss": 1.0321, + "step": 8132 + }, + { + "epoch": 0.2912600497788601, + "grad_norm": 1.421779990196228, + "learning_rate": 0.0001662872179228176, + "loss": 1.2487, + "step": 8133 + }, + { + "epoch": 0.2912958619084284, + "grad_norm": 1.8693279027938843, + "learning_rate": 0.0001662785329180701, + "loss": 1.2083, + "step": 8134 + }, + { + "epoch": 0.29133167403799665, + "grad_norm": 1.325974464416504, + "learning_rate": 0.0001662698470216304, + "loss": 1.0403, + "step": 8135 + }, + { + "epoch": 0.29136748616756497, + "grad_norm": 1.5304944515228271, + "learning_rate": 0.0001662611602336154, + "loss": 1.0958, + "step": 8136 + }, + { + "epoch": 0.29140329829713324, + "grad_norm": 1.2294920682907104, + "learning_rate": 0.00016625247255414198, + "loss": 1.2007, + "step": 8137 + }, + { + "epoch": 0.2914391104267015, + "grad_norm": 1.0933688879013062, + "learning_rate": 0.000166243783983327, + "loss": 1.0316, + "step": 8138 + }, + { + "epoch": 0.29147492255626983, + "grad_norm": 1.5981239080429077, + "learning_rate": 0.00016623509452128732, + "loss": 1.175, + "step": 8139 + }, + { + "epoch": 0.2915107346858381, + "grad_norm": 1.3587627410888672, + "learning_rate": 0.00016622640416813988, + "loss": 1.1738, + "step": 8140 + }, + { + "epoch": 0.29154654681540637, + "grad_norm": 1.3871757984161377, + "learning_rate": 0.00016621771292400162, + "loss": 1.1707, + "step": 8141 + }, + { + "epoch": 0.29158235894497464, + "grad_norm": 1.8419315814971924, + "learning_rate": 0.00016620902078898943, + "loss": 1.0345, + "step": 8142 + }, + { + "epoch": 0.29161817107454296, + "grad_norm": 1.3923730850219727, + "learning_rate": 0.0001662003277632203, + "loss": 1.1377, + "step": 8143 + }, + { + "epoch": 0.29165398320411123, + "grad_norm": 1.8022476434707642, + "learning_rate": 0.0001661916338468111, + "loss": 1.0399, + "step": 8144 + }, + { + "epoch": 0.2916897953336795, + "grad_norm": 1.7083078622817993, + "learning_rate": 0.00016618293903987888, + "loss": 1.1465, + "step": 8145 + }, + { + "epoch": 0.2917256074632478, + "grad_norm": 1.8886581659317017, + "learning_rate": 0.00016617424334254061, + "loss": 1.1166, + "step": 8146 + }, + { + "epoch": 0.2917614195928161, + "grad_norm": 1.553911566734314, + "learning_rate": 0.00016616554675491325, + "loss": 1.3605, + "step": 8147 + }, + { + "epoch": 0.29179723172238436, + "grad_norm": 1.289354920387268, + "learning_rate": 0.00016615684927711376, + "loss": 1.0923, + "step": 8148 + }, + { + "epoch": 0.29183304385195263, + "grad_norm": 1.6587151288986206, + "learning_rate": 0.00016614815090925923, + "loss": 1.1251, + "step": 8149 + }, + { + "epoch": 0.29186885598152096, + "grad_norm": 1.4888391494750977, + "learning_rate": 0.00016613945165146668, + "loss": 1.2321, + "step": 8150 + }, + { + "epoch": 0.2919046681110892, + "grad_norm": 1.4687397480010986, + "learning_rate": 0.00016613075150385308, + "loss": 1.0501, + "step": 8151 + }, + { + "epoch": 0.2919404802406575, + "grad_norm": 1.5412240028381348, + "learning_rate": 0.00016612205046653554, + "loss": 1.1893, + "step": 8152 + }, + { + "epoch": 0.2919762923702258, + "grad_norm": 1.547686219215393, + "learning_rate": 0.00016611334853963106, + "loss": 1.0241, + "step": 8153 + }, + { + "epoch": 0.2920121044997941, + "grad_norm": 1.5880247354507446, + "learning_rate": 0.0001661046457232568, + "loss": 0.9791, + "step": 8154 + }, + { + "epoch": 0.29204791662936236, + "grad_norm": 1.7011065483093262, + "learning_rate": 0.00016609594201752982, + "loss": 1.2006, + "step": 8155 + }, + { + "epoch": 0.2920837287589306, + "grad_norm": 1.5271458625793457, + "learning_rate": 0.00016608723742256719, + "loss": 1.1586, + "step": 8156 + }, + { + "epoch": 0.29211954088849895, + "grad_norm": 1.4758622646331787, + "learning_rate": 0.00016607853193848597, + "loss": 1.1369, + "step": 8157 + }, + { + "epoch": 0.2921553530180672, + "grad_norm": 1.6127346754074097, + "learning_rate": 0.0001660698255654034, + "loss": 1.1992, + "step": 8158 + }, + { + "epoch": 0.2921911651476355, + "grad_norm": 1.6295706033706665, + "learning_rate": 0.0001660611183034365, + "loss": 1.2412, + "step": 8159 + }, + { + "epoch": 0.2922269772772038, + "grad_norm": 1.5403640270233154, + "learning_rate": 0.00016605241015270247, + "loss": 1.1751, + "step": 8160 + }, + { + "epoch": 0.2922627894067721, + "grad_norm": 1.9405112266540527, + "learning_rate": 0.0001660437011133185, + "loss": 1.3952, + "step": 8161 + }, + { + "epoch": 0.29229860153634035, + "grad_norm": 1.5335521697998047, + "learning_rate": 0.0001660349911854017, + "loss": 1.2441, + "step": 8162 + }, + { + "epoch": 0.2923344136659086, + "grad_norm": 1.8237025737762451, + "learning_rate": 0.0001660262803690693, + "loss": 1.0362, + "step": 8163 + }, + { + "epoch": 0.29237022579547695, + "grad_norm": 1.3729710578918457, + "learning_rate": 0.00016601756866443845, + "loss": 1.3109, + "step": 8164 + }, + { + "epoch": 0.2924060379250452, + "grad_norm": 1.3823214769363403, + "learning_rate": 0.00016600885607162636, + "loss": 0.9267, + "step": 8165 + }, + { + "epoch": 0.2924418500546135, + "grad_norm": 1.3649708032608032, + "learning_rate": 0.00016600014259075024, + "loss": 1.1381, + "step": 8166 + }, + { + "epoch": 0.2924776621841818, + "grad_norm": 1.9021387100219727, + "learning_rate": 0.00016599142822192736, + "loss": 0.8887, + "step": 8167 + }, + { + "epoch": 0.2925134743137501, + "grad_norm": 1.4105093479156494, + "learning_rate": 0.00016598271296527494, + "loss": 1.4141, + "step": 8168 + }, + { + "epoch": 0.29254928644331835, + "grad_norm": 1.622109293937683, + "learning_rate": 0.00016597399682091024, + "loss": 0.9728, + "step": 8169 + }, + { + "epoch": 0.2925850985728866, + "grad_norm": 1.602568507194519, + "learning_rate": 0.00016596527978895046, + "loss": 1.0498, + "step": 8170 + }, + { + "epoch": 0.29262091070245494, + "grad_norm": 1.2128233909606934, + "learning_rate": 0.00016595656186951297, + "loss": 1.3245, + "step": 8171 + }, + { + "epoch": 0.2926567228320232, + "grad_norm": 1.7244452238082886, + "learning_rate": 0.00016594784306271502, + "loss": 1.2824, + "step": 8172 + }, + { + "epoch": 0.2926925349615915, + "grad_norm": 1.5835041999816895, + "learning_rate": 0.00016593912336867393, + "loss": 1.0436, + "step": 8173 + }, + { + "epoch": 0.2927283470911598, + "grad_norm": 1.559402346611023, + "learning_rate": 0.00016593040278750694, + "loss": 1.1783, + "step": 8174 + }, + { + "epoch": 0.29276415922072807, + "grad_norm": 1.4669878482818604, + "learning_rate": 0.00016592168131933144, + "loss": 1.0636, + "step": 8175 + }, + { + "epoch": 0.29279997135029634, + "grad_norm": 2.1150503158569336, + "learning_rate": 0.00016591295896426476, + "loss": 0.9309, + "step": 8176 + }, + { + "epoch": 0.2928357834798646, + "grad_norm": 1.6587193012237549, + "learning_rate": 0.00016590423572242422, + "loss": 1.2621, + "step": 8177 + }, + { + "epoch": 0.29287159560943293, + "grad_norm": 1.4871327877044678, + "learning_rate": 0.0001658955115939272, + "loss": 1.0597, + "step": 8178 + }, + { + "epoch": 0.2929074077390012, + "grad_norm": 1.94606614112854, + "learning_rate": 0.00016588678657889112, + "loss": 1.2908, + "step": 8179 + }, + { + "epoch": 0.29294321986856947, + "grad_norm": 1.5516635179519653, + "learning_rate": 0.00016587806067743327, + "loss": 1.4736, + "step": 8180 + }, + { + "epoch": 0.2929790319981378, + "grad_norm": 1.7875971794128418, + "learning_rate": 0.00016586933388967109, + "loss": 1.1478, + "step": 8181 + }, + { + "epoch": 0.29301484412770606, + "grad_norm": 1.7943695783615112, + "learning_rate": 0.000165860606215722, + "loss": 1.1048, + "step": 8182 + }, + { + "epoch": 0.29305065625727433, + "grad_norm": 2.075282096862793, + "learning_rate": 0.0001658518776557034, + "loss": 0.9537, + "step": 8183 + }, + { + "epoch": 0.2930864683868426, + "grad_norm": 1.2871848344802856, + "learning_rate": 0.00016584314820973273, + "loss": 1.1443, + "step": 8184 + }, + { + "epoch": 0.2931222805164109, + "grad_norm": 1.3742963075637817, + "learning_rate": 0.00016583441787792745, + "loss": 1.0596, + "step": 8185 + }, + { + "epoch": 0.2931580926459792, + "grad_norm": 1.5276155471801758, + "learning_rate": 0.00016582568666040497, + "loss": 1.249, + "step": 8186 + }, + { + "epoch": 0.29319390477554746, + "grad_norm": 1.2189489603042603, + "learning_rate": 0.0001658169545572828, + "loss": 1.1733, + "step": 8187 + }, + { + "epoch": 0.2932297169051158, + "grad_norm": 1.7565499544143677, + "learning_rate": 0.0001658082215686784, + "loss": 1.0313, + "step": 8188 + }, + { + "epoch": 0.29326552903468406, + "grad_norm": 1.980645775794983, + "learning_rate": 0.00016579948769470927, + "loss": 1.3844, + "step": 8189 + }, + { + "epoch": 0.2933013411642523, + "grad_norm": 1.7903727293014526, + "learning_rate": 0.00016579075293549292, + "loss": 1.0926, + "step": 8190 + }, + { + "epoch": 0.2933371532938206, + "grad_norm": 1.5200183391571045, + "learning_rate": 0.00016578201729114682, + "loss": 1.2466, + "step": 8191 + }, + { + "epoch": 0.2933729654233889, + "grad_norm": 1.8921464681625366, + "learning_rate": 0.00016577328076178855, + "loss": 1.1547, + "step": 8192 + }, + { + "epoch": 0.2934087775529572, + "grad_norm": 1.5072718858718872, + "learning_rate": 0.0001657645433475356, + "loss": 1.0504, + "step": 8193 + }, + { + "epoch": 0.29344458968252546, + "grad_norm": 1.4286744594573975, + "learning_rate": 0.0001657558050485056, + "loss": 1.0177, + "step": 8194 + }, + { + "epoch": 0.2934804018120938, + "grad_norm": 1.6349303722381592, + "learning_rate": 0.00016574706586481607, + "loss": 0.9429, + "step": 8195 + }, + { + "epoch": 0.29351621394166205, + "grad_norm": 2.1043217182159424, + "learning_rate": 0.0001657383257965845, + "loss": 1.3046, + "step": 8196 + }, + { + "epoch": 0.2935520260712303, + "grad_norm": 1.8287688493728638, + "learning_rate": 0.0001657295848439286, + "loss": 0.976, + "step": 8197 + }, + { + "epoch": 0.2935878382007986, + "grad_norm": 1.6221023797988892, + "learning_rate": 0.00016572084300696594, + "loss": 1.2025, + "step": 8198 + }, + { + "epoch": 0.2936236503303669, + "grad_norm": 1.911760926246643, + "learning_rate": 0.0001657121002858141, + "loss": 1.3627, + "step": 8199 + }, + { + "epoch": 0.2936594624599352, + "grad_norm": 1.479008436203003, + "learning_rate": 0.0001657033566805907, + "loss": 0.9378, + "step": 8200 + }, + { + "epoch": 0.29369527458950345, + "grad_norm": 1.6655240058898926, + "learning_rate": 0.00016569461219141337, + "loss": 1.3713, + "step": 8201 + }, + { + "epoch": 0.2937310867190718, + "grad_norm": 2.0623905658721924, + "learning_rate": 0.00016568586681839982, + "loss": 1.1008, + "step": 8202 + }, + { + "epoch": 0.29376689884864005, + "grad_norm": 1.7179086208343506, + "learning_rate": 0.00016567712056166762, + "loss": 1.0692, + "step": 8203 + }, + { + "epoch": 0.2938027109782083, + "grad_norm": 1.3593605756759644, + "learning_rate": 0.0001656683734213345, + "loss": 1.1806, + "step": 8204 + }, + { + "epoch": 0.2938385231077766, + "grad_norm": 1.4529451131820679, + "learning_rate": 0.00016565962539751808, + "loss": 1.2101, + "step": 8205 + }, + { + "epoch": 0.2938743352373449, + "grad_norm": 1.6455321311950684, + "learning_rate": 0.00016565087649033614, + "loss": 1.0177, + "step": 8206 + }, + { + "epoch": 0.2939101473669132, + "grad_norm": 1.610344409942627, + "learning_rate": 0.00016564212669990634, + "loss": 1.1532, + "step": 8207 + }, + { + "epoch": 0.29394595949648145, + "grad_norm": 1.5025447607040405, + "learning_rate": 0.00016563337602634642, + "loss": 1.2491, + "step": 8208 + }, + { + "epoch": 0.29398177162604977, + "grad_norm": 1.416930913925171, + "learning_rate": 0.00016562462446977403, + "loss": 1.2603, + "step": 8209 + }, + { + "epoch": 0.29401758375561804, + "grad_norm": 1.74039888381958, + "learning_rate": 0.000165615872030307, + "loss": 1.1651, + "step": 8210 + }, + { + "epoch": 0.2940533958851863, + "grad_norm": 1.568834662437439, + "learning_rate": 0.00016560711870806303, + "loss": 1.1587, + "step": 8211 + }, + { + "epoch": 0.2940892080147546, + "grad_norm": 1.76903235912323, + "learning_rate": 0.00016559836450315992, + "loss": 1.2351, + "step": 8212 + }, + { + "epoch": 0.2941250201443229, + "grad_norm": 1.6961992979049683, + "learning_rate": 0.00016558960941571543, + "loss": 1.1093, + "step": 8213 + }, + { + "epoch": 0.29416083227389117, + "grad_norm": 1.4649475812911987, + "learning_rate": 0.00016558085344584736, + "loss": 1.0346, + "step": 8214 + }, + { + "epoch": 0.29419664440345944, + "grad_norm": 1.4006158113479614, + "learning_rate": 0.00016557209659367347, + "loss": 1.2515, + "step": 8215 + }, + { + "epoch": 0.29423245653302776, + "grad_norm": 1.9428192377090454, + "learning_rate": 0.00016556333885931162, + "loss": 1.222, + "step": 8216 + }, + { + "epoch": 0.29426826866259603, + "grad_norm": 1.639729619026184, + "learning_rate": 0.00016555458024287964, + "loss": 1.3938, + "step": 8217 + }, + { + "epoch": 0.2943040807921643, + "grad_norm": 1.8911609649658203, + "learning_rate": 0.0001655458207444953, + "loss": 1.3142, + "step": 8218 + }, + { + "epoch": 0.29433989292173257, + "grad_norm": 1.3249202966690063, + "learning_rate": 0.0001655370603642765, + "loss": 1.1552, + "step": 8219 + }, + { + "epoch": 0.2943757050513009, + "grad_norm": 1.3111982345581055, + "learning_rate": 0.0001655282991023411, + "loss": 1.2046, + "step": 8220 + }, + { + "epoch": 0.29441151718086916, + "grad_norm": 1.9114937782287598, + "learning_rate": 0.000165519536958807, + "loss": 1.2034, + "step": 8221 + }, + { + "epoch": 0.29444732931043743, + "grad_norm": 1.7763501405715942, + "learning_rate": 0.000165510773933792, + "loss": 1.0552, + "step": 8222 + }, + { + "epoch": 0.2944831414400057, + "grad_norm": 1.526631474494934, + "learning_rate": 0.00016550201002741403, + "loss": 1.242, + "step": 8223 + }, + { + "epoch": 0.294518953569574, + "grad_norm": 1.5912314653396606, + "learning_rate": 0.00016549324523979102, + "loss": 1.3148, + "step": 8224 + }, + { + "epoch": 0.2945547656991423, + "grad_norm": 1.8353331089019775, + "learning_rate": 0.0001654844795710409, + "loss": 1.3514, + "step": 8225 + }, + { + "epoch": 0.29459057782871056, + "grad_norm": 1.717878818511963, + "learning_rate": 0.00016547571302128153, + "loss": 1.1759, + "step": 8226 + }, + { + "epoch": 0.2946263899582789, + "grad_norm": 1.5434927940368652, + "learning_rate": 0.00016546694559063093, + "loss": 1.1358, + "step": 8227 + }, + { + "epoch": 0.29466220208784716, + "grad_norm": 1.5463213920593262, + "learning_rate": 0.000165458177279207, + "loss": 1.2809, + "step": 8228 + }, + { + "epoch": 0.2946980142174154, + "grad_norm": 1.5844237804412842, + "learning_rate": 0.00016544940808712775, + "loss": 1.2279, + "step": 8229 + }, + { + "epoch": 0.2947338263469837, + "grad_norm": 1.7013137340545654, + "learning_rate": 0.00016544063801451114, + "loss": 1.4399, + "step": 8230 + }, + { + "epoch": 0.294769638476552, + "grad_norm": 1.8431209325790405, + "learning_rate": 0.00016543186706147514, + "loss": 1.253, + "step": 8231 + }, + { + "epoch": 0.2948054506061203, + "grad_norm": 2.0415799617767334, + "learning_rate": 0.00016542309522813779, + "loss": 1.1108, + "step": 8232 + }, + { + "epoch": 0.29484126273568856, + "grad_norm": 1.3407597541809082, + "learning_rate": 0.00016541432251461705, + "loss": 1.1864, + "step": 8233 + }, + { + "epoch": 0.2948770748652569, + "grad_norm": 1.3796371221542358, + "learning_rate": 0.000165405548921031, + "loss": 1.0775, + "step": 8234 + }, + { + "epoch": 0.29491288699482515, + "grad_norm": 1.983489751815796, + "learning_rate": 0.0001653967744474977, + "loss": 1.1109, + "step": 8235 + }, + { + "epoch": 0.2949486991243934, + "grad_norm": 1.7928661108016968, + "learning_rate": 0.00016538799909413508, + "loss": 1.1313, + "step": 8236 + }, + { + "epoch": 0.2949845112539617, + "grad_norm": 1.6872329711914062, + "learning_rate": 0.00016537922286106134, + "loss": 1.2204, + "step": 8237 + }, + { + "epoch": 0.29502032338353, + "grad_norm": 1.6700689792633057, + "learning_rate": 0.00016537044574839444, + "loss": 1.1895, + "step": 8238 + }, + { + "epoch": 0.2950561355130983, + "grad_norm": 1.6949191093444824, + "learning_rate": 0.00016536166775625252, + "loss": 1.3945, + "step": 8239 + }, + { + "epoch": 0.29509194764266655, + "grad_norm": 1.2069326639175415, + "learning_rate": 0.0001653528888847537, + "loss": 1.0788, + "step": 8240 + }, + { + "epoch": 0.2951277597722349, + "grad_norm": 1.4759175777435303, + "learning_rate": 0.00016534410913401603, + "loss": 1.099, + "step": 8241 + }, + { + "epoch": 0.29516357190180315, + "grad_norm": 1.5363385677337646, + "learning_rate": 0.0001653353285041577, + "loss": 1.0436, + "step": 8242 + }, + { + "epoch": 0.2951993840313714, + "grad_norm": 1.4938173294067383, + "learning_rate": 0.00016532654699529678, + "loss": 1.2561, + "step": 8243 + }, + { + "epoch": 0.2952351961609397, + "grad_norm": 1.4759892225265503, + "learning_rate": 0.00016531776460755143, + "loss": 1.0028, + "step": 8244 + }, + { + "epoch": 0.295271008290508, + "grad_norm": 1.905588984489441, + "learning_rate": 0.0001653089813410398, + "loss": 1.2501, + "step": 8245 + }, + { + "epoch": 0.2953068204200763, + "grad_norm": 1.3686895370483398, + "learning_rate": 0.00016530019719588007, + "loss": 1.113, + "step": 8246 + }, + { + "epoch": 0.29534263254964455, + "grad_norm": 1.4514057636260986, + "learning_rate": 0.00016529141217219045, + "loss": 1.1897, + "step": 8247 + }, + { + "epoch": 0.29537844467921287, + "grad_norm": 1.4960085153579712, + "learning_rate": 0.00016528262627008906, + "loss": 1.1309, + "step": 8248 + }, + { + "epoch": 0.29541425680878114, + "grad_norm": 1.9527997970581055, + "learning_rate": 0.00016527383948969416, + "loss": 1.3485, + "step": 8249 + }, + { + "epoch": 0.2954500689383494, + "grad_norm": 1.573303461074829, + "learning_rate": 0.00016526505183112394, + "loss": 1.2389, + "step": 8250 + }, + { + "epoch": 0.2954858810679177, + "grad_norm": 1.6833937168121338, + "learning_rate": 0.00016525626329449668, + "loss": 1.3434, + "step": 8251 + }, + { + "epoch": 0.295521693197486, + "grad_norm": 1.6960405111312866, + "learning_rate": 0.0001652474738799305, + "loss": 1.2726, + "step": 8252 + }, + { + "epoch": 0.29555750532705427, + "grad_norm": 1.4664548635482788, + "learning_rate": 0.00016523868358754378, + "loss": 1.1169, + "step": 8253 + }, + { + "epoch": 0.29559331745662254, + "grad_norm": 2.294020414352417, + "learning_rate": 0.00016522989241745469, + "loss": 1.1382, + "step": 8254 + }, + { + "epoch": 0.29562912958619086, + "grad_norm": 1.8984702825546265, + "learning_rate": 0.00016522110036978153, + "loss": 1.0849, + "step": 8255 + }, + { + "epoch": 0.29566494171575913, + "grad_norm": 1.940454363822937, + "learning_rate": 0.0001652123074446426, + "loss": 1.2385, + "step": 8256 + }, + { + "epoch": 0.2957007538453274, + "grad_norm": 2.0734190940856934, + "learning_rate": 0.00016520351364215623, + "loss": 1.5211, + "step": 8257 + }, + { + "epoch": 0.29573656597489567, + "grad_norm": 1.8028337955474854, + "learning_rate": 0.00016519471896244063, + "loss": 1.2014, + "step": 8258 + }, + { + "epoch": 0.295772378104464, + "grad_norm": 1.9575212001800537, + "learning_rate": 0.00016518592340561422, + "loss": 1.1843, + "step": 8259 + }, + { + "epoch": 0.29580819023403226, + "grad_norm": 1.538577914237976, + "learning_rate": 0.0001651771269717953, + "loss": 1.1613, + "step": 8260 + }, + { + "epoch": 0.29584400236360053, + "grad_norm": 1.5544569492340088, + "learning_rate": 0.0001651683296611022, + "loss": 1.3091, + "step": 8261 + }, + { + "epoch": 0.29587981449316886, + "grad_norm": 1.9594866037368774, + "learning_rate": 0.0001651595314736533, + "loss": 1.1429, + "step": 8262 + }, + { + "epoch": 0.2959156266227371, + "grad_norm": 1.6986420154571533, + "learning_rate": 0.00016515073240956692, + "loss": 1.1975, + "step": 8263 + }, + { + "epoch": 0.2959514387523054, + "grad_norm": 1.611853837966919, + "learning_rate": 0.0001651419324689615, + "loss": 1.1613, + "step": 8264 + }, + { + "epoch": 0.29598725088187366, + "grad_norm": 1.4676263332366943, + "learning_rate": 0.00016513313165195538, + "loss": 1.0534, + "step": 8265 + }, + { + "epoch": 0.296023063011442, + "grad_norm": 1.5344127416610718, + "learning_rate": 0.00016512432995866702, + "loss": 1.051, + "step": 8266 + }, + { + "epoch": 0.29605887514101026, + "grad_norm": 1.4619224071502686, + "learning_rate": 0.00016511552738921479, + "loss": 1.2949, + "step": 8267 + }, + { + "epoch": 0.2960946872705785, + "grad_norm": 1.6261708736419678, + "learning_rate": 0.0001651067239437171, + "loss": 1.231, + "step": 8268 + }, + { + "epoch": 0.29613049940014685, + "grad_norm": 1.4027454853057861, + "learning_rate": 0.00016509791962229247, + "loss": 1.2139, + "step": 8269 + }, + { + "epoch": 0.2961663115297151, + "grad_norm": 1.919278621673584, + "learning_rate": 0.0001650891144250593, + "loss": 1.2568, + "step": 8270 + }, + { + "epoch": 0.2962021236592834, + "grad_norm": 1.483976125717163, + "learning_rate": 0.00016508030835213605, + "loss": 1.1235, + "step": 8271 + }, + { + "epoch": 0.29623793578885166, + "grad_norm": 1.689498782157898, + "learning_rate": 0.00016507150140364116, + "loss": 1.0914, + "step": 8272 + }, + { + "epoch": 0.29627374791842, + "grad_norm": 1.423421859741211, + "learning_rate": 0.0001650626935796932, + "loss": 1.144, + "step": 8273 + }, + { + "epoch": 0.29630956004798825, + "grad_norm": 1.7756469249725342, + "learning_rate": 0.00016505388488041058, + "loss": 1.185, + "step": 8274 + }, + { + "epoch": 0.2963453721775565, + "grad_norm": 1.4991822242736816, + "learning_rate": 0.0001650450753059119, + "loss": 1.0498, + "step": 8275 + }, + { + "epoch": 0.29638118430712485, + "grad_norm": 2.0735301971435547, + "learning_rate": 0.00016503626485631561, + "loss": 1.3155, + "step": 8276 + }, + { + "epoch": 0.2964169964366931, + "grad_norm": 1.3739265203475952, + "learning_rate": 0.00016502745353174026, + "loss": 1.2926, + "step": 8277 + }, + { + "epoch": 0.2964528085662614, + "grad_norm": 1.9867258071899414, + "learning_rate": 0.0001650186413323044, + "loss": 1.3382, + "step": 8278 + }, + { + "epoch": 0.29648862069582965, + "grad_norm": 1.4616589546203613, + "learning_rate": 0.0001650098282581266, + "loss": 1.0516, + "step": 8279 + }, + { + "epoch": 0.296524432825398, + "grad_norm": 1.6023280620574951, + "learning_rate": 0.00016500101430932541, + "loss": 1.1586, + "step": 8280 + }, + { + "epoch": 0.29656024495496625, + "grad_norm": 1.5426716804504395, + "learning_rate": 0.00016499219948601943, + "loss": 1.2869, + "step": 8281 + }, + { + "epoch": 0.2965960570845345, + "grad_norm": 1.4001213312149048, + "learning_rate": 0.00016498338378832724, + "loss": 1.1067, + "step": 8282 + }, + { + "epoch": 0.29663186921410284, + "grad_norm": 1.6084911823272705, + "learning_rate": 0.00016497456721636743, + "loss": 1.1875, + "step": 8283 + }, + { + "epoch": 0.2966676813436711, + "grad_norm": 1.6355007886886597, + "learning_rate": 0.00016496574977025862, + "loss": 1.2522, + "step": 8284 + }, + { + "epoch": 0.2967034934732394, + "grad_norm": 1.3197362422943115, + "learning_rate": 0.00016495693145011947, + "loss": 0.974, + "step": 8285 + }, + { + "epoch": 0.29673930560280765, + "grad_norm": 1.6156702041625977, + "learning_rate": 0.00016494811225606858, + "loss": 1.0609, + "step": 8286 + }, + { + "epoch": 0.29677511773237597, + "grad_norm": 1.7792130708694458, + "learning_rate": 0.00016493929218822467, + "loss": 0.9822, + "step": 8287 + }, + { + "epoch": 0.29681092986194424, + "grad_norm": 1.894665002822876, + "learning_rate": 0.0001649304712467063, + "loss": 1.1351, + "step": 8288 + }, + { + "epoch": 0.2968467419915125, + "grad_norm": 1.7986706495285034, + "learning_rate": 0.00016492164943163217, + "loss": 1.2367, + "step": 8289 + }, + { + "epoch": 0.29688255412108083, + "grad_norm": 1.3468233346939087, + "learning_rate": 0.00016491282674312103, + "loss": 1.205, + "step": 8290 + }, + { + "epoch": 0.2969183662506491, + "grad_norm": 1.5086969137191772, + "learning_rate": 0.00016490400318129153, + "loss": 1.0754, + "step": 8291 + }, + { + "epoch": 0.29695417838021737, + "grad_norm": 1.3507224321365356, + "learning_rate": 0.0001648951787462624, + "loss": 1.1775, + "step": 8292 + }, + { + "epoch": 0.29698999050978564, + "grad_norm": 1.778799057006836, + "learning_rate": 0.0001648863534381523, + "loss": 1.27, + "step": 8293 + }, + { + "epoch": 0.29702580263935396, + "grad_norm": 1.5074681043624878, + "learning_rate": 0.00016487752725708005, + "loss": 1.2611, + "step": 8294 + }, + { + "epoch": 0.29706161476892223, + "grad_norm": 1.4830124378204346, + "learning_rate": 0.00016486870020316437, + "loss": 1.0908, + "step": 8295 + }, + { + "epoch": 0.2970974268984905, + "grad_norm": 1.4512361288070679, + "learning_rate": 0.000164859872276524, + "loss": 1.2161, + "step": 8296 + }, + { + "epoch": 0.2971332390280588, + "grad_norm": 1.5999419689178467, + "learning_rate": 0.0001648510434772777, + "loss": 1.2397, + "step": 8297 + }, + { + "epoch": 0.2971690511576271, + "grad_norm": 1.5689831972122192, + "learning_rate": 0.00016484221380554424, + "loss": 1.0927, + "step": 8298 + }, + { + "epoch": 0.29720486328719536, + "grad_norm": 1.9188692569732666, + "learning_rate": 0.00016483338326144244, + "loss": 1.2113, + "step": 8299 + }, + { + "epoch": 0.29724067541676363, + "grad_norm": 1.4943267107009888, + "learning_rate": 0.0001648245518450911, + "loss": 0.9915, + "step": 8300 + }, + { + "epoch": 0.29727648754633196, + "grad_norm": 1.826572060585022, + "learning_rate": 0.00016481571955660903, + "loss": 1.2228, + "step": 8301 + }, + { + "epoch": 0.2973122996759002, + "grad_norm": 1.3858635425567627, + "learning_rate": 0.0001648068863961151, + "loss": 1.1327, + "step": 8302 + }, + { + "epoch": 0.2973481118054685, + "grad_norm": 1.3565785884857178, + "learning_rate": 0.00016479805236372806, + "loss": 1.0942, + "step": 8303 + }, + { + "epoch": 0.2973839239350368, + "grad_norm": 1.6411765813827515, + "learning_rate": 0.00016478921745956686, + "loss": 1.3517, + "step": 8304 + }, + { + "epoch": 0.2974197360646051, + "grad_norm": 1.845913290977478, + "learning_rate": 0.00016478038168375028, + "loss": 1.492, + "step": 8305 + }, + { + "epoch": 0.29745554819417336, + "grad_norm": 1.683200478553772, + "learning_rate": 0.00016477154503639723, + "loss": 1.2436, + "step": 8306 + }, + { + "epoch": 0.2974913603237416, + "grad_norm": 1.4818953275680542, + "learning_rate": 0.00016476270751762656, + "loss": 0.9025, + "step": 8307 + }, + { + "epoch": 0.29752717245330995, + "grad_norm": 1.4540287256240845, + "learning_rate": 0.00016475386912755724, + "loss": 1.1335, + "step": 8308 + }, + { + "epoch": 0.2975629845828782, + "grad_norm": 1.655297040939331, + "learning_rate": 0.0001647450298663081, + "loss": 1.3057, + "step": 8309 + }, + { + "epoch": 0.2975987967124465, + "grad_norm": 1.8399049043655396, + "learning_rate": 0.00016473618973399811, + "loss": 1.1942, + "step": 8310 + }, + { + "epoch": 0.2976346088420148, + "grad_norm": 1.7606158256530762, + "learning_rate": 0.00016472734873074622, + "loss": 1.3331, + "step": 8311 + }, + { + "epoch": 0.2976704209715831, + "grad_norm": 1.5666282176971436, + "learning_rate": 0.00016471850685667133, + "loss": 1.1639, + "step": 8312 + }, + { + "epoch": 0.29770623310115135, + "grad_norm": 1.6966527700424194, + "learning_rate": 0.0001647096641118924, + "loss": 1.2803, + "step": 8313 + }, + { + "epoch": 0.2977420452307196, + "grad_norm": 1.5161687135696411, + "learning_rate": 0.00016470082049652843, + "loss": 1.1573, + "step": 8314 + }, + { + "epoch": 0.29777785736028795, + "grad_norm": 1.3142815828323364, + "learning_rate": 0.00016469197601069838, + "loss": 1.0586, + "step": 8315 + }, + { + "epoch": 0.2978136694898562, + "grad_norm": 1.3845125436782837, + "learning_rate": 0.00016468313065452121, + "loss": 1.1757, + "step": 8316 + }, + { + "epoch": 0.2978494816194245, + "grad_norm": 1.5361560583114624, + "learning_rate": 0.00016467428442811595, + "loss": 0.9918, + "step": 8317 + }, + { + "epoch": 0.2978852937489928, + "grad_norm": 1.7048897743225098, + "learning_rate": 0.00016466543733160163, + "loss": 1.3181, + "step": 8318 + }, + { + "epoch": 0.2979211058785611, + "grad_norm": 1.4278795719146729, + "learning_rate": 0.00016465658936509726, + "loss": 1.0878, + "step": 8319 + }, + { + "epoch": 0.29795691800812935, + "grad_norm": 1.4830702543258667, + "learning_rate": 0.0001646477405287219, + "loss": 1.2514, + "step": 8320 + }, + { + "epoch": 0.2979927301376976, + "grad_norm": 1.39218008518219, + "learning_rate": 0.00016463889082259456, + "loss": 1.2239, + "step": 8321 + }, + { + "epoch": 0.29802854226726594, + "grad_norm": 1.852612018585205, + "learning_rate": 0.00016463004024683432, + "loss": 1.216, + "step": 8322 + }, + { + "epoch": 0.2980643543968342, + "grad_norm": 1.4265192747116089, + "learning_rate": 0.0001646211888015603, + "loss": 0.9936, + "step": 8323 + }, + { + "epoch": 0.2981001665264025, + "grad_norm": 1.470229983329773, + "learning_rate": 0.0001646123364868915, + "loss": 1.023, + "step": 8324 + }, + { + "epoch": 0.2981359786559708, + "grad_norm": 1.6624137163162231, + "learning_rate": 0.00016460348330294704, + "loss": 1.064, + "step": 8325 + }, + { + "epoch": 0.29817179078553907, + "grad_norm": 1.3349603414535522, + "learning_rate": 0.00016459462924984605, + "loss": 1.323, + "step": 8326 + }, + { + "epoch": 0.29820760291510734, + "grad_norm": 1.2284175157546997, + "learning_rate": 0.00016458577432770766, + "loss": 1.1587, + "step": 8327 + }, + { + "epoch": 0.2982434150446756, + "grad_norm": 1.8591517210006714, + "learning_rate": 0.000164576918536651, + "loss": 1.1069, + "step": 8328 + }, + { + "epoch": 0.29827922717424393, + "grad_norm": 1.269019365310669, + "learning_rate": 0.0001645680618767952, + "loss": 1.0739, + "step": 8329 + }, + { + "epoch": 0.2983150393038122, + "grad_norm": 1.5900176763534546, + "learning_rate": 0.00016455920434825936, + "loss": 1.2493, + "step": 8330 + }, + { + "epoch": 0.29835085143338047, + "grad_norm": 1.1904910802841187, + "learning_rate": 0.00016455034595116278, + "loss": 0.9992, + "step": 8331 + }, + { + "epoch": 0.2983866635629488, + "grad_norm": 1.7700215578079224, + "learning_rate": 0.00016454148668562454, + "loss": 0.969, + "step": 8332 + }, + { + "epoch": 0.29842247569251706, + "grad_norm": 1.9240727424621582, + "learning_rate": 0.0001645326265517638, + "loss": 1.0699, + "step": 8333 + }, + { + "epoch": 0.29845828782208533, + "grad_norm": 1.4014614820480347, + "learning_rate": 0.00016452376554969983, + "loss": 1.2459, + "step": 8334 + }, + { + "epoch": 0.2984940999516536, + "grad_norm": 1.504016399383545, + "learning_rate": 0.00016451490367955183, + "loss": 1.1692, + "step": 8335 + }, + { + "epoch": 0.2985299120812219, + "grad_norm": 1.8755345344543457, + "learning_rate": 0.00016450604094143904, + "loss": 1.2338, + "step": 8336 + }, + { + "epoch": 0.2985657242107902, + "grad_norm": 1.3299551010131836, + "learning_rate": 0.00016449717733548066, + "loss": 1.1798, + "step": 8337 + }, + { + "epoch": 0.29860153634035846, + "grad_norm": 1.6623239517211914, + "learning_rate": 0.00016448831286179595, + "loss": 1.2276, + "step": 8338 + }, + { + "epoch": 0.2986373484699268, + "grad_norm": 1.5385732650756836, + "learning_rate": 0.00016447944752050417, + "loss": 1.2266, + "step": 8339 + }, + { + "epoch": 0.29867316059949506, + "grad_norm": 1.3969274759292603, + "learning_rate": 0.00016447058131172462, + "loss": 1.1614, + "step": 8340 + }, + { + "epoch": 0.2987089727290633, + "grad_norm": 1.822195291519165, + "learning_rate": 0.00016446171423557652, + "loss": 1.2252, + "step": 8341 + }, + { + "epoch": 0.2987447848586316, + "grad_norm": 1.9116876125335693, + "learning_rate": 0.00016445284629217923, + "loss": 1.2012, + "step": 8342 + }, + { + "epoch": 0.2987805969881999, + "grad_norm": 1.4911556243896484, + "learning_rate": 0.00016444397748165205, + "loss": 1.235, + "step": 8343 + }, + { + "epoch": 0.2988164091177682, + "grad_norm": 1.6038975715637207, + "learning_rate": 0.00016443510780411423, + "loss": 1.1679, + "step": 8344 + }, + { + "epoch": 0.29885222124733646, + "grad_norm": 1.8494973182678223, + "learning_rate": 0.0001644262372596852, + "loss": 1.111, + "step": 8345 + }, + { + "epoch": 0.2988880333769048, + "grad_norm": 1.5265573263168335, + "learning_rate": 0.00016441736584848422, + "loss": 1.1249, + "step": 8346 + }, + { + "epoch": 0.29892384550647305, + "grad_norm": 1.418458342552185, + "learning_rate": 0.0001644084935706307, + "loss": 1.1535, + "step": 8347 + }, + { + "epoch": 0.2989596576360413, + "grad_norm": 1.828518271446228, + "learning_rate": 0.00016439962042624396, + "loss": 1.3243, + "step": 8348 + }, + { + "epoch": 0.2989954697656096, + "grad_norm": 1.4431995153427124, + "learning_rate": 0.0001643907464154434, + "loss": 1.1393, + "step": 8349 + }, + { + "epoch": 0.2990312818951779, + "grad_norm": 1.4861443042755127, + "learning_rate": 0.00016438187153834842, + "loss": 1.1298, + "step": 8350 + }, + { + "epoch": 0.2990670940247462, + "grad_norm": 1.7499743700027466, + "learning_rate": 0.0001643729957950784, + "loss": 1.2627, + "step": 8351 + }, + { + "epoch": 0.29910290615431445, + "grad_norm": 2.0400400161743164, + "learning_rate": 0.00016436411918575275, + "loss": 1.1317, + "step": 8352 + }, + { + "epoch": 0.2991387182838828, + "grad_norm": 1.6752665042877197, + "learning_rate": 0.00016435524171049088, + "loss": 1.3413, + "step": 8353 + }, + { + "epoch": 0.29917453041345105, + "grad_norm": 1.4912091493606567, + "learning_rate": 0.00016434636336941228, + "loss": 1.4449, + "step": 8354 + }, + { + "epoch": 0.2992103425430193, + "grad_norm": 1.3523526191711426, + "learning_rate": 0.00016433748416263633, + "loss": 1.3269, + "step": 8355 + }, + { + "epoch": 0.2992461546725876, + "grad_norm": 1.8373702764511108, + "learning_rate": 0.00016432860409028253, + "loss": 1.1786, + "step": 8356 + }, + { + "epoch": 0.2992819668021559, + "grad_norm": 1.2781089544296265, + "learning_rate": 0.00016431972315247037, + "loss": 1.0278, + "step": 8357 + }, + { + "epoch": 0.2993177789317242, + "grad_norm": 1.3384571075439453, + "learning_rate": 0.00016431084134931927, + "loss": 1.3748, + "step": 8358 + }, + { + "epoch": 0.29935359106129245, + "grad_norm": 1.448007345199585, + "learning_rate": 0.00016430195868094875, + "loss": 1.2332, + "step": 8359 + }, + { + "epoch": 0.29938940319086077, + "grad_norm": 1.2279000282287598, + "learning_rate": 0.00016429307514747834, + "loss": 1.175, + "step": 8360 + }, + { + "epoch": 0.29942521532042904, + "grad_norm": 1.3970508575439453, + "learning_rate": 0.00016428419074902752, + "loss": 1.1185, + "step": 8361 + }, + { + "epoch": 0.2994610274499973, + "grad_norm": 1.8461207151412964, + "learning_rate": 0.00016427530548571585, + "loss": 1.1783, + "step": 8362 + }, + { + "epoch": 0.2994968395795656, + "grad_norm": 1.4953193664550781, + "learning_rate": 0.00016426641935766284, + "loss": 1.1803, + "step": 8363 + }, + { + "epoch": 0.2995326517091339, + "grad_norm": 1.5694085359573364, + "learning_rate": 0.00016425753236498807, + "loss": 1.0651, + "step": 8364 + }, + { + "epoch": 0.29956846383870217, + "grad_norm": 1.8044143915176392, + "learning_rate": 0.00016424864450781108, + "loss": 1.3517, + "step": 8365 + }, + { + "epoch": 0.29960427596827044, + "grad_norm": 1.5013200044631958, + "learning_rate": 0.00016423975578625142, + "loss": 1.1889, + "step": 8366 + }, + { + "epoch": 0.29964008809783876, + "grad_norm": 1.8176320791244507, + "learning_rate": 0.00016423086620042879, + "loss": 1.092, + "step": 8367 + }, + { + "epoch": 0.29967590022740703, + "grad_norm": 1.3964438438415527, + "learning_rate": 0.00016422197575046265, + "loss": 1.3321, + "step": 8368 + }, + { + "epoch": 0.2997117123569753, + "grad_norm": 1.244991660118103, + "learning_rate": 0.00016421308443647265, + "loss": 1.0564, + "step": 8369 + }, + { + "epoch": 0.29974752448654357, + "grad_norm": 1.8524776697158813, + "learning_rate": 0.00016420419225857846, + "loss": 1.1379, + "step": 8370 + }, + { + "epoch": 0.2997833366161119, + "grad_norm": 1.2420967817306519, + "learning_rate": 0.00016419529921689967, + "loss": 1.018, + "step": 8371 + }, + { + "epoch": 0.29981914874568016, + "grad_norm": 1.4244372844696045, + "learning_rate": 0.00016418640531155597, + "loss": 1.2327, + "step": 8372 + }, + { + "epoch": 0.29985496087524843, + "grad_norm": 1.6407923698425293, + "learning_rate": 0.00016417751054266692, + "loss": 1.1392, + "step": 8373 + }, + { + "epoch": 0.29989077300481676, + "grad_norm": 1.3002123832702637, + "learning_rate": 0.00016416861491035228, + "loss": 0.8303, + "step": 8374 + }, + { + "epoch": 0.299926585134385, + "grad_norm": 1.4414702653884888, + "learning_rate": 0.0001641597184147317, + "loss": 1.1625, + "step": 8375 + }, + { + "epoch": 0.2999623972639533, + "grad_norm": 1.2965035438537598, + "learning_rate": 0.0001641508210559249, + "loss": 1.0954, + "step": 8376 + }, + { + "epoch": 0.29999820939352156, + "grad_norm": 1.3499760627746582, + "learning_rate": 0.00016414192283405147, + "loss": 0.9212, + "step": 8377 + }, + { + "epoch": 0.3000340215230899, + "grad_norm": 2.1806857585906982, + "learning_rate": 0.00016413302374923124, + "loss": 1.2416, + "step": 8378 + }, + { + "epoch": 0.30006983365265816, + "grad_norm": 1.5070432424545288, + "learning_rate": 0.00016412412380158392, + "loss": 1.2767, + "step": 8379 + }, + { + "epoch": 0.3001056457822264, + "grad_norm": 1.526115894317627, + "learning_rate": 0.00016411522299122924, + "loss": 1.309, + "step": 8380 + }, + { + "epoch": 0.30014145791179475, + "grad_norm": 1.7965779304504395, + "learning_rate": 0.0001641063213182869, + "loss": 1.248, + "step": 8381 + }, + { + "epoch": 0.300177270041363, + "grad_norm": 1.5404866933822632, + "learning_rate": 0.00016409741878287671, + "loss": 1.1294, + "step": 8382 + }, + { + "epoch": 0.3002130821709313, + "grad_norm": 1.5137661695480347, + "learning_rate": 0.00016408851538511846, + "loss": 1.0828, + "step": 8383 + }, + { + "epoch": 0.30024889430049956, + "grad_norm": 1.4487659931182861, + "learning_rate": 0.0001640796111251319, + "loss": 0.9734, + "step": 8384 + }, + { + "epoch": 0.3002847064300679, + "grad_norm": 1.4137576818466187, + "learning_rate": 0.0001640707060030368, + "loss": 1.133, + "step": 8385 + }, + { + "epoch": 0.30032051855963615, + "grad_norm": 1.4109846353530884, + "learning_rate": 0.00016406180001895298, + "loss": 1.3182, + "step": 8386 + }, + { + "epoch": 0.3003563306892044, + "grad_norm": 1.4681360721588135, + "learning_rate": 0.00016405289317300033, + "loss": 1.0035, + "step": 8387 + }, + { + "epoch": 0.30039214281877274, + "grad_norm": 1.87667977809906, + "learning_rate": 0.00016404398546529859, + "loss": 1.3425, + "step": 8388 + }, + { + "epoch": 0.300427954948341, + "grad_norm": 1.5491056442260742, + "learning_rate": 0.00016403507689596763, + "loss": 1.2124, + "step": 8389 + }, + { + "epoch": 0.3004637670779093, + "grad_norm": 1.8183287382125854, + "learning_rate": 0.0001640261674651273, + "loss": 1.1069, + "step": 8390 + }, + { + "epoch": 0.30049957920747755, + "grad_norm": 1.8269599676132202, + "learning_rate": 0.0001640172571728975, + "loss": 1.1557, + "step": 8391 + }, + { + "epoch": 0.3005353913370459, + "grad_norm": 1.492116093635559, + "learning_rate": 0.0001640083460193981, + "loss": 1.0213, + "step": 8392 + }, + { + "epoch": 0.30057120346661415, + "grad_norm": 1.652378797531128, + "learning_rate": 0.00016399943400474895, + "loss": 1.0984, + "step": 8393 + }, + { + "epoch": 0.3006070155961824, + "grad_norm": 1.559085726737976, + "learning_rate": 0.00016399052112906994, + "loss": 1.122, + "step": 8394 + }, + { + "epoch": 0.30064282772575074, + "grad_norm": 1.222636103630066, + "learning_rate": 0.00016398160739248104, + "loss": 0.8831, + "step": 8395 + }, + { + "epoch": 0.300678639855319, + "grad_norm": 1.570224404335022, + "learning_rate": 0.00016397269279510215, + "loss": 1.1039, + "step": 8396 + }, + { + "epoch": 0.3007144519848873, + "grad_norm": 1.4942373037338257, + "learning_rate": 0.00016396377733705317, + "loss": 1.1795, + "step": 8397 + }, + { + "epoch": 0.30075026411445555, + "grad_norm": 1.6114163398742676, + "learning_rate": 0.00016395486101845408, + "loss": 1.0415, + "step": 8398 + }, + { + "epoch": 0.30078607624402387, + "grad_norm": 1.8706040382385254, + "learning_rate": 0.00016394594383942486, + "loss": 1.0274, + "step": 8399 + }, + { + "epoch": 0.30082188837359214, + "grad_norm": 1.9482815265655518, + "learning_rate": 0.00016393702580008542, + "loss": 1.0693, + "step": 8400 + }, + { + "epoch": 0.3008577005031604, + "grad_norm": 1.5981242656707764, + "learning_rate": 0.00016392810690055577, + "loss": 1.1059, + "step": 8401 + }, + { + "epoch": 0.30089351263272873, + "grad_norm": 1.947324275970459, + "learning_rate": 0.00016391918714095592, + "loss": 1.2532, + "step": 8402 + }, + { + "epoch": 0.300929324762297, + "grad_norm": 1.4780583381652832, + "learning_rate": 0.00016391026652140585, + "loss": 1.1037, + "step": 8403 + }, + { + "epoch": 0.30096513689186527, + "grad_norm": 1.910905361175537, + "learning_rate": 0.00016390134504202557, + "loss": 1.1422, + "step": 8404 + }, + { + "epoch": 0.30100094902143354, + "grad_norm": 1.6914708614349365, + "learning_rate": 0.00016389242270293514, + "loss": 0.9213, + "step": 8405 + }, + { + "epoch": 0.30103676115100186, + "grad_norm": 1.7898890972137451, + "learning_rate": 0.00016388349950425456, + "loss": 1.2586, + "step": 8406 + }, + { + "epoch": 0.30107257328057013, + "grad_norm": 1.5885273218154907, + "learning_rate": 0.0001638745754461039, + "loss": 1.0251, + "step": 8407 + }, + { + "epoch": 0.3011083854101384, + "grad_norm": 1.7408686876296997, + "learning_rate": 0.00016386565052860323, + "loss": 1.2662, + "step": 8408 + }, + { + "epoch": 0.3011441975397067, + "grad_norm": 1.288952350616455, + "learning_rate": 0.00016385672475187262, + "loss": 1.1002, + "step": 8409 + }, + { + "epoch": 0.301180009669275, + "grad_norm": 1.3986061811447144, + "learning_rate": 0.00016384779811603214, + "loss": 1.0959, + "step": 8410 + }, + { + "epoch": 0.30121582179884326, + "grad_norm": 1.540024995803833, + "learning_rate": 0.0001638388706212019, + "loss": 1.0726, + "step": 8411 + }, + { + "epoch": 0.30125163392841153, + "grad_norm": 1.4175351858139038, + "learning_rate": 0.000163829942267502, + "loss": 1.1363, + "step": 8412 + }, + { + "epoch": 0.30128744605797986, + "grad_norm": 1.9669687747955322, + "learning_rate": 0.00016382101305505254, + "loss": 1.275, + "step": 8413 + }, + { + "epoch": 0.3013232581875481, + "grad_norm": 1.4113069772720337, + "learning_rate": 0.0001638120829839737, + "loss": 1.1208, + "step": 8414 + }, + { + "epoch": 0.3013590703171164, + "grad_norm": 2.4960291385650635, + "learning_rate": 0.00016380315205438554, + "loss": 1.0226, + "step": 8415 + }, + { + "epoch": 0.3013948824466847, + "grad_norm": 2.2110562324523926, + "learning_rate": 0.00016379422026640831, + "loss": 1.1314, + "step": 8416 + }, + { + "epoch": 0.301430694576253, + "grad_norm": 1.2584011554718018, + "learning_rate": 0.00016378528762016218, + "loss": 0.9073, + "step": 8417 + }, + { + "epoch": 0.30146650670582126, + "grad_norm": 1.4689924716949463, + "learning_rate": 0.00016377635411576723, + "loss": 1.1098, + "step": 8418 + }, + { + "epoch": 0.3015023188353895, + "grad_norm": 1.9847384691238403, + "learning_rate": 0.00016376741975334368, + "loss": 1.2725, + "step": 8419 + }, + { + "epoch": 0.30153813096495785, + "grad_norm": 2.0214943885803223, + "learning_rate": 0.0001637584845330118, + "loss": 1.3746, + "step": 8420 + }, + { + "epoch": 0.3015739430945261, + "grad_norm": 1.62454354763031, + "learning_rate": 0.00016374954845489175, + "loss": 1.1835, + "step": 8421 + }, + { + "epoch": 0.3016097552240944, + "grad_norm": 1.0858556032180786, + "learning_rate": 0.00016374061151910372, + "loss": 1.0058, + "step": 8422 + }, + { + "epoch": 0.30164556735366266, + "grad_norm": 1.3336619138717651, + "learning_rate": 0.000163731673725768, + "loss": 1.0561, + "step": 8423 + }, + { + "epoch": 0.301681379483231, + "grad_norm": 1.389056921005249, + "learning_rate": 0.00016372273507500481, + "loss": 1.2294, + "step": 8424 + }, + { + "epoch": 0.30171719161279925, + "grad_norm": 1.3834075927734375, + "learning_rate": 0.00016371379556693442, + "loss": 1.1542, + "step": 8425 + }, + { + "epoch": 0.3017530037423675, + "grad_norm": 1.555402159690857, + "learning_rate": 0.0001637048552016771, + "loss": 1.1674, + "step": 8426 + }, + { + "epoch": 0.30178881587193584, + "grad_norm": 1.4934643507003784, + "learning_rate": 0.00016369591397935314, + "loss": 1.2031, + "step": 8427 + }, + { + "epoch": 0.3018246280015041, + "grad_norm": 1.403955101966858, + "learning_rate": 0.0001636869719000828, + "loss": 1.1176, + "step": 8428 + }, + { + "epoch": 0.3018604401310724, + "grad_norm": 1.3738188743591309, + "learning_rate": 0.0001636780289639864, + "loss": 1.1753, + "step": 8429 + }, + { + "epoch": 0.30189625226064065, + "grad_norm": 1.7596608400344849, + "learning_rate": 0.00016366908517118428, + "loss": 1.0016, + "step": 8430 + }, + { + "epoch": 0.301932064390209, + "grad_norm": 1.631496787071228, + "learning_rate": 0.00016366014052179674, + "loss": 0.9749, + "step": 8431 + }, + { + "epoch": 0.30196787651977725, + "grad_norm": 1.7311794757843018, + "learning_rate": 0.0001636511950159441, + "loss": 1.098, + "step": 8432 + }, + { + "epoch": 0.3020036886493455, + "grad_norm": 1.557485818862915, + "learning_rate": 0.00016364224865374677, + "loss": 1.2257, + "step": 8433 + }, + { + "epoch": 0.30203950077891384, + "grad_norm": 1.4572123289108276, + "learning_rate": 0.00016363330143532508, + "loss": 1.1294, + "step": 8434 + }, + { + "epoch": 0.3020753129084821, + "grad_norm": 1.6207196712493896, + "learning_rate": 0.00016362435336079938, + "loss": 1.2071, + "step": 8435 + }, + { + "epoch": 0.3021111250380504, + "grad_norm": 1.6316046714782715, + "learning_rate": 0.00016361540443029008, + "loss": 1.3509, + "step": 8436 + }, + { + "epoch": 0.30214693716761865, + "grad_norm": 1.9858677387237549, + "learning_rate": 0.00016360645464391754, + "loss": 1.0935, + "step": 8437 + }, + { + "epoch": 0.30218274929718697, + "grad_norm": 1.7040866613388062, + "learning_rate": 0.00016359750400180226, + "loss": 1.2836, + "step": 8438 + }, + { + "epoch": 0.30221856142675524, + "grad_norm": 1.9513963460922241, + "learning_rate": 0.00016358855250406455, + "loss": 1.191, + "step": 8439 + }, + { + "epoch": 0.3022543735563235, + "grad_norm": 1.3895254135131836, + "learning_rate": 0.0001635796001508249, + "loss": 1.2454, + "step": 8440 + }, + { + "epoch": 0.30229018568589183, + "grad_norm": 1.753265380859375, + "learning_rate": 0.00016357064694220375, + "loss": 1.304, + "step": 8441 + }, + { + "epoch": 0.3023259978154601, + "grad_norm": 1.6251821517944336, + "learning_rate": 0.00016356169287832156, + "loss": 1.1816, + "step": 8442 + }, + { + "epoch": 0.30236180994502837, + "grad_norm": 1.4640588760375977, + "learning_rate": 0.00016355273795929875, + "loss": 1.2685, + "step": 8443 + }, + { + "epoch": 0.30239762207459664, + "grad_norm": 1.865112066268921, + "learning_rate": 0.00016354378218525584, + "loss": 1.3681, + "step": 8444 + }, + { + "epoch": 0.30243343420416496, + "grad_norm": 1.6928788423538208, + "learning_rate": 0.00016353482555631334, + "loss": 1.255, + "step": 8445 + }, + { + "epoch": 0.30246924633373323, + "grad_norm": 1.5954506397247314, + "learning_rate": 0.00016352586807259168, + "loss": 1.4303, + "step": 8446 + }, + { + "epoch": 0.3025050584633015, + "grad_norm": 1.3485113382339478, + "learning_rate": 0.00016351690973421138, + "loss": 1.2439, + "step": 8447 + }, + { + "epoch": 0.3025408705928698, + "grad_norm": 1.3562754392623901, + "learning_rate": 0.00016350795054129305, + "loss": 1.1665, + "step": 8448 + }, + { + "epoch": 0.3025766827224381, + "grad_norm": 1.441849946975708, + "learning_rate": 0.00016349899049395713, + "loss": 1.2609, + "step": 8449 + }, + { + "epoch": 0.30261249485200636, + "grad_norm": 1.543131947517395, + "learning_rate": 0.0001634900295923242, + "loss": 1.2669, + "step": 8450 + }, + { + "epoch": 0.30264830698157463, + "grad_norm": 1.5300389528274536, + "learning_rate": 0.00016348106783651482, + "loss": 0.9213, + "step": 8451 + }, + { + "epoch": 0.30268411911114296, + "grad_norm": 2.5530405044555664, + "learning_rate": 0.00016347210522664956, + "loss": 1.3652, + "step": 8452 + }, + { + "epoch": 0.3027199312407112, + "grad_norm": 1.4831140041351318, + "learning_rate": 0.000163463141762849, + "loss": 1.118, + "step": 8453 + }, + { + "epoch": 0.3027557433702795, + "grad_norm": 1.7309688329696655, + "learning_rate": 0.00016345417744523374, + "loss": 1.166, + "step": 8454 + }, + { + "epoch": 0.3027915554998478, + "grad_norm": 1.9041332006454468, + "learning_rate": 0.00016344521227392437, + "loss": 1.0905, + "step": 8455 + }, + { + "epoch": 0.3028273676294161, + "grad_norm": 1.2785197496414185, + "learning_rate": 0.00016343624624904151, + "loss": 1.2878, + "step": 8456 + }, + { + "epoch": 0.30286317975898436, + "grad_norm": 1.8579375743865967, + "learning_rate": 0.00016342727937070577, + "loss": 1.1555, + "step": 8457 + }, + { + "epoch": 0.3028989918885526, + "grad_norm": 1.7278960943222046, + "learning_rate": 0.0001634183116390378, + "loss": 1.3948, + "step": 8458 + }, + { + "epoch": 0.30293480401812095, + "grad_norm": 1.4677878618240356, + "learning_rate": 0.00016340934305415823, + "loss": 1.2576, + "step": 8459 + }, + { + "epoch": 0.3029706161476892, + "grad_norm": 1.4073762893676758, + "learning_rate": 0.00016340037361618778, + "loss": 0.9936, + "step": 8460 + }, + { + "epoch": 0.3030064282772575, + "grad_norm": 1.7620666027069092, + "learning_rate": 0.00016339140332524707, + "loss": 1.2054, + "step": 8461 + }, + { + "epoch": 0.3030422404068258, + "grad_norm": 2.107293128967285, + "learning_rate": 0.0001633824321814568, + "loss": 1.2854, + "step": 8462 + }, + { + "epoch": 0.3030780525363941, + "grad_norm": 1.4253169298171997, + "learning_rate": 0.00016337346018493768, + "loss": 1.1264, + "step": 8463 + }, + { + "epoch": 0.30311386466596235, + "grad_norm": 1.5582337379455566, + "learning_rate": 0.00016336448733581037, + "loss": 1.2887, + "step": 8464 + }, + { + "epoch": 0.3031496767955306, + "grad_norm": 1.4700133800506592, + "learning_rate": 0.00016335551363419562, + "loss": 1.1561, + "step": 8465 + }, + { + "epoch": 0.30318548892509894, + "grad_norm": 1.4035735130310059, + "learning_rate": 0.00016334653908021415, + "loss": 1.3132, + "step": 8466 + }, + { + "epoch": 0.3032213010546672, + "grad_norm": 2.1092352867126465, + "learning_rate": 0.00016333756367398674, + "loss": 1.4017, + "step": 8467 + }, + { + "epoch": 0.3032571131842355, + "grad_norm": 1.4637699127197266, + "learning_rate": 0.00016332858741563408, + "loss": 1.1475, + "step": 8468 + }, + { + "epoch": 0.3032929253138038, + "grad_norm": 1.702841877937317, + "learning_rate": 0.00016331961030527698, + "loss": 1.1433, + "step": 8469 + }, + { + "epoch": 0.3033287374433721, + "grad_norm": 1.5061774253845215, + "learning_rate": 0.00016331063234303618, + "loss": 1.1919, + "step": 8470 + }, + { + "epoch": 0.30336454957294035, + "grad_norm": 1.386439561843872, + "learning_rate": 0.0001633016535290325, + "loss": 0.9667, + "step": 8471 + }, + { + "epoch": 0.3034003617025086, + "grad_norm": 1.626469612121582, + "learning_rate": 0.00016329267386338674, + "loss": 1.0212, + "step": 8472 + }, + { + "epoch": 0.30343617383207694, + "grad_norm": 1.4529403448104858, + "learning_rate": 0.0001632836933462197, + "loss": 0.9696, + "step": 8473 + }, + { + "epoch": 0.3034719859616452, + "grad_norm": 1.5760221481323242, + "learning_rate": 0.00016327471197765216, + "loss": 1.1638, + "step": 8474 + }, + { + "epoch": 0.3035077980912135, + "grad_norm": 1.914120078086853, + "learning_rate": 0.000163265729757805, + "loss": 1.2154, + "step": 8475 + }, + { + "epoch": 0.3035436102207818, + "grad_norm": 1.6455943584442139, + "learning_rate": 0.00016325674668679906, + "loss": 1.1807, + "step": 8476 + }, + { + "epoch": 0.30357942235035007, + "grad_norm": 1.7030339241027832, + "learning_rate": 0.00016324776276475518, + "loss": 1.3235, + "step": 8477 + }, + { + "epoch": 0.30361523447991834, + "grad_norm": 1.6749515533447266, + "learning_rate": 0.0001632387779917943, + "loss": 1.2255, + "step": 8478 + }, + { + "epoch": 0.3036510466094866, + "grad_norm": 1.3964307308197021, + "learning_rate": 0.00016322979236803713, + "loss": 1.265, + "step": 8479 + }, + { + "epoch": 0.30368685873905493, + "grad_norm": 1.6718271970748901, + "learning_rate": 0.00016322080589360472, + "loss": 1.3311, + "step": 8480 + }, + { + "epoch": 0.3037226708686232, + "grad_norm": 1.6986629962921143, + "learning_rate": 0.0001632118185686179, + "loss": 1.1674, + "step": 8481 + }, + { + "epoch": 0.30375848299819147, + "grad_norm": 1.3006080389022827, + "learning_rate": 0.0001632028303931976, + "loss": 1.1108, + "step": 8482 + }, + { + "epoch": 0.3037942951277598, + "grad_norm": 1.4372096061706543, + "learning_rate": 0.00016319384136746477, + "loss": 1.2395, + "step": 8483 + }, + { + "epoch": 0.30383010725732806, + "grad_norm": 1.2273885011672974, + "learning_rate": 0.0001631848514915403, + "loss": 1.1953, + "step": 8484 + }, + { + "epoch": 0.30386591938689633, + "grad_norm": 1.4106993675231934, + "learning_rate": 0.00016317586076554515, + "loss": 1.2661, + "step": 8485 + }, + { + "epoch": 0.3039017315164646, + "grad_norm": 1.4796817302703857, + "learning_rate": 0.0001631668691896003, + "loss": 1.1564, + "step": 8486 + }, + { + "epoch": 0.3039375436460329, + "grad_norm": 1.5287870168685913, + "learning_rate": 0.00016315787676382667, + "loss": 1.2009, + "step": 8487 + }, + { + "epoch": 0.3039733557756012, + "grad_norm": 2.1176950931549072, + "learning_rate": 0.0001631488834883453, + "loss": 1.1734, + "step": 8488 + }, + { + "epoch": 0.30400916790516946, + "grad_norm": 1.961983323097229, + "learning_rate": 0.00016313988936327717, + "loss": 1.3311, + "step": 8489 + }, + { + "epoch": 0.3040449800347378, + "grad_norm": 1.603257656097412, + "learning_rate": 0.00016313089438874326, + "loss": 1.3757, + "step": 8490 + }, + { + "epoch": 0.30408079216430606, + "grad_norm": 1.8779070377349854, + "learning_rate": 0.00016312189856486462, + "loss": 1.3801, + "step": 8491 + }, + { + "epoch": 0.3041166042938743, + "grad_norm": 1.3627333641052246, + "learning_rate": 0.00016311290189176223, + "loss": 1.1814, + "step": 8492 + }, + { + "epoch": 0.3041524164234426, + "grad_norm": 1.6173611879348755, + "learning_rate": 0.00016310390436955716, + "loss": 1.1166, + "step": 8493 + }, + { + "epoch": 0.3041882285530109, + "grad_norm": 1.3844155073165894, + "learning_rate": 0.00016309490599837045, + "loss": 1.3327, + "step": 8494 + }, + { + "epoch": 0.3042240406825792, + "grad_norm": 1.5411195755004883, + "learning_rate": 0.00016308590677832315, + "loss": 1.098, + "step": 8495 + }, + { + "epoch": 0.30425985281214746, + "grad_norm": 1.2838990688323975, + "learning_rate": 0.0001630769067095364, + "loss": 1.0656, + "step": 8496 + }, + { + "epoch": 0.3042956649417158, + "grad_norm": 2.135709047317505, + "learning_rate": 0.0001630679057921312, + "loss": 1.0565, + "step": 8497 + }, + { + "epoch": 0.30433147707128405, + "grad_norm": 1.3932876586914062, + "learning_rate": 0.0001630589040262287, + "loss": 1.1278, + "step": 8498 + }, + { + "epoch": 0.3043672892008523, + "grad_norm": 1.8509734869003296, + "learning_rate": 0.00016304990141194996, + "loss": 1.1644, + "step": 8499 + }, + { + "epoch": 0.3044031013304206, + "grad_norm": 1.5195817947387695, + "learning_rate": 0.00016304089794941614, + "loss": 1.251, + "step": 8500 + }, + { + "epoch": 0.3044389134599889, + "grad_norm": 1.5418577194213867, + "learning_rate": 0.00016303189363874835, + "loss": 1.0577, + "step": 8501 + }, + { + "epoch": 0.3044747255895572, + "grad_norm": 1.4127378463745117, + "learning_rate": 0.00016302288848006776, + "loss": 0.9872, + "step": 8502 + }, + { + "epoch": 0.30451053771912545, + "grad_norm": 2.390159845352173, + "learning_rate": 0.00016301388247349545, + "loss": 1.1021, + "step": 8503 + }, + { + "epoch": 0.3045463498486938, + "grad_norm": 1.5833901166915894, + "learning_rate": 0.00016300487561915266, + "loss": 1.1385, + "step": 8504 + }, + { + "epoch": 0.30458216197826204, + "grad_norm": 1.9407470226287842, + "learning_rate": 0.00016299586791716054, + "loss": 1.4208, + "step": 8505 + }, + { + "epoch": 0.3046179741078303, + "grad_norm": 1.4106816053390503, + "learning_rate": 0.00016298685936764026, + "loss": 1.0733, + "step": 8506 + }, + { + "epoch": 0.3046537862373986, + "grad_norm": 1.582789659500122, + "learning_rate": 0.00016297784997071308, + "loss": 1.1991, + "step": 8507 + }, + { + "epoch": 0.3046895983669669, + "grad_norm": 1.5226975679397583, + "learning_rate": 0.00016296883972650013, + "loss": 1.0497, + "step": 8508 + }, + { + "epoch": 0.3047254104965352, + "grad_norm": 1.4798544645309448, + "learning_rate": 0.00016295982863512266, + "loss": 1.0427, + "step": 8509 + }, + { + "epoch": 0.30476122262610345, + "grad_norm": 1.9687749147415161, + "learning_rate": 0.00016295081669670191, + "loss": 1.2475, + "step": 8510 + }, + { + "epoch": 0.30479703475567177, + "grad_norm": 1.411702275276184, + "learning_rate": 0.00016294180391135914, + "loss": 1.1062, + "step": 8511 + }, + { + "epoch": 0.30483284688524004, + "grad_norm": 1.5741877555847168, + "learning_rate": 0.00016293279027921557, + "loss": 1.1018, + "step": 8512 + }, + { + "epoch": 0.3048686590148083, + "grad_norm": 2.075650453567505, + "learning_rate": 0.0001629237758003925, + "loss": 1.341, + "step": 8513 + }, + { + "epoch": 0.3049044711443766, + "grad_norm": 1.3563563823699951, + "learning_rate": 0.00016291476047501115, + "loss": 1.2538, + "step": 8514 + }, + { + "epoch": 0.3049402832739449, + "grad_norm": 1.477524995803833, + "learning_rate": 0.0001629057443031929, + "loss": 1.1564, + "step": 8515 + }, + { + "epoch": 0.30497609540351317, + "grad_norm": 1.6263113021850586, + "learning_rate": 0.000162896727285059, + "loss": 1.0504, + "step": 8516 + }, + { + "epoch": 0.30501190753308144, + "grad_norm": 1.7168818712234497, + "learning_rate": 0.00016288770942073075, + "loss": 1.2634, + "step": 8517 + }, + { + "epoch": 0.30504771966264976, + "grad_norm": 1.9247437715530396, + "learning_rate": 0.00016287869071032952, + "loss": 1.2508, + "step": 8518 + }, + { + "epoch": 0.30508353179221803, + "grad_norm": 1.667773723602295, + "learning_rate": 0.00016286967115397655, + "loss": 1.1299, + "step": 8519 + }, + { + "epoch": 0.3051193439217863, + "grad_norm": 2.019408702850342, + "learning_rate": 0.00016286065075179332, + "loss": 1.3514, + "step": 8520 + }, + { + "epoch": 0.30515515605135457, + "grad_norm": 2.0149385929107666, + "learning_rate": 0.00016285162950390104, + "loss": 1.1189, + "step": 8521 + }, + { + "epoch": 0.3051909681809229, + "grad_norm": 2.4864964485168457, + "learning_rate": 0.00016284260741042123, + "loss": 1.2077, + "step": 8522 + }, + { + "epoch": 0.30522678031049116, + "grad_norm": 1.741622805595398, + "learning_rate": 0.00016283358447147516, + "loss": 1.3237, + "step": 8523 + }, + { + "epoch": 0.30526259244005943, + "grad_norm": 2.1023030281066895, + "learning_rate": 0.0001628245606871843, + "loss": 1.085, + "step": 8524 + }, + { + "epoch": 0.30529840456962776, + "grad_norm": 1.7771161794662476, + "learning_rate": 0.00016281553605766998, + "loss": 1.2843, + "step": 8525 + }, + { + "epoch": 0.305334216699196, + "grad_norm": 1.8599132299423218, + "learning_rate": 0.00016280651058305363, + "loss": 1.1384, + "step": 8526 + }, + { + "epoch": 0.3053700288287643, + "grad_norm": 1.5380957126617432, + "learning_rate": 0.00016279748426345673, + "loss": 1.1292, + "step": 8527 + }, + { + "epoch": 0.30540584095833256, + "grad_norm": 1.2730565071105957, + "learning_rate": 0.0001627884570990007, + "loss": 1.1993, + "step": 8528 + }, + { + "epoch": 0.3054416530879009, + "grad_norm": 1.370141625404358, + "learning_rate": 0.0001627794290898069, + "loss": 1.3408, + "step": 8529 + }, + { + "epoch": 0.30547746521746916, + "grad_norm": 1.2499113082885742, + "learning_rate": 0.00016277040023599692, + "loss": 1.2084, + "step": 8530 + }, + { + "epoch": 0.3055132773470374, + "grad_norm": 1.515315055847168, + "learning_rate": 0.00016276137053769217, + "loss": 0.9788, + "step": 8531 + }, + { + "epoch": 0.30554908947660575, + "grad_norm": 1.3745925426483154, + "learning_rate": 0.0001627523399950141, + "loss": 1.1429, + "step": 8532 + }, + { + "epoch": 0.305584901606174, + "grad_norm": 1.2595326900482178, + "learning_rate": 0.00016274330860808426, + "loss": 1.1191, + "step": 8533 + }, + { + "epoch": 0.3056207137357423, + "grad_norm": 1.3395544290542603, + "learning_rate": 0.00016273427637702415, + "loss": 0.9627, + "step": 8534 + }, + { + "epoch": 0.30565652586531056, + "grad_norm": 1.2498524188995361, + "learning_rate": 0.00016272524330195525, + "loss": 0.9561, + "step": 8535 + }, + { + "epoch": 0.3056923379948789, + "grad_norm": 2.1511423587799072, + "learning_rate": 0.00016271620938299912, + "loss": 1.3263, + "step": 8536 + }, + { + "epoch": 0.30572815012444715, + "grad_norm": 1.4170628786087036, + "learning_rate": 0.0001627071746202773, + "loss": 1.312, + "step": 8537 + }, + { + "epoch": 0.3057639622540154, + "grad_norm": 2.577139139175415, + "learning_rate": 0.00016269813901391132, + "loss": 1.4065, + "step": 8538 + }, + { + "epoch": 0.30579977438358374, + "grad_norm": 1.8703845739364624, + "learning_rate": 0.00016268910256402277, + "loss": 1.2183, + "step": 8539 + }, + { + "epoch": 0.305835586513152, + "grad_norm": 1.8554120063781738, + "learning_rate": 0.00016268006527073322, + "loss": 1.1933, + "step": 8540 + }, + { + "epoch": 0.3058713986427203, + "grad_norm": 1.4961694478988647, + "learning_rate": 0.00016267102713416417, + "loss": 1.0699, + "step": 8541 + }, + { + "epoch": 0.30590721077228855, + "grad_norm": 1.4181106090545654, + "learning_rate": 0.00016266198815443738, + "loss": 1.0535, + "step": 8542 + }, + { + "epoch": 0.3059430229018569, + "grad_norm": 1.5474495887756348, + "learning_rate": 0.00016265294833167434, + "loss": 1.1475, + "step": 8543 + }, + { + "epoch": 0.30597883503142514, + "grad_norm": 1.3645308017730713, + "learning_rate": 0.0001626439076659967, + "loss": 1.0511, + "step": 8544 + }, + { + "epoch": 0.3060146471609934, + "grad_norm": 1.4466532468795776, + "learning_rate": 0.00016263486615752606, + "loss": 1.0376, + "step": 8545 + }, + { + "epoch": 0.30605045929056174, + "grad_norm": 1.5058766603469849, + "learning_rate": 0.00016262582380638407, + "loss": 1.1508, + "step": 8546 + }, + { + "epoch": 0.30608627142013, + "grad_norm": 1.5207470655441284, + "learning_rate": 0.00016261678061269244, + "loss": 1.2748, + "step": 8547 + }, + { + "epoch": 0.3061220835496983, + "grad_norm": 2.1046156883239746, + "learning_rate": 0.0001626077365765728, + "loss": 1.2248, + "step": 8548 + }, + { + "epoch": 0.30615789567926655, + "grad_norm": 1.4178498983383179, + "learning_rate": 0.00016259869169814678, + "loss": 1.4332, + "step": 8549 + }, + { + "epoch": 0.30619370780883487, + "grad_norm": 1.4264739751815796, + "learning_rate": 0.00016258964597753615, + "loss": 1.0167, + "step": 8550 + }, + { + "epoch": 0.30622951993840314, + "grad_norm": 1.7800389528274536, + "learning_rate": 0.00016258059941486259, + "loss": 1.0305, + "step": 8551 + }, + { + "epoch": 0.3062653320679714, + "grad_norm": 1.6956294775009155, + "learning_rate": 0.00016257155201024776, + "loss": 1.0955, + "step": 8552 + }, + { + "epoch": 0.30630114419753973, + "grad_norm": 1.8510578870773315, + "learning_rate": 0.0001625625037638134, + "loss": 1.17, + "step": 8553 + }, + { + "epoch": 0.306336956327108, + "grad_norm": 1.7557320594787598, + "learning_rate": 0.00016255345467568126, + "loss": 1.1639, + "step": 8554 + }, + { + "epoch": 0.30637276845667627, + "grad_norm": 1.534202218055725, + "learning_rate": 0.00016254440474597307, + "loss": 1.1645, + "step": 8555 + }, + { + "epoch": 0.30640858058624454, + "grad_norm": 1.3360987901687622, + "learning_rate": 0.0001625353539748106, + "loss": 1.0842, + "step": 8556 + }, + { + "epoch": 0.30644439271581286, + "grad_norm": 1.8540589809417725, + "learning_rate": 0.00016252630236231557, + "loss": 0.9391, + "step": 8557 + }, + { + "epoch": 0.30648020484538113, + "grad_norm": 1.3655922412872314, + "learning_rate": 0.00016251724990860983, + "loss": 1.2013, + "step": 8558 + }, + { + "epoch": 0.3065160169749494, + "grad_norm": 1.6350390911102295, + "learning_rate": 0.00016250819661381516, + "loss": 1.1963, + "step": 8559 + }, + { + "epoch": 0.3065518291045177, + "grad_norm": 1.2852349281311035, + "learning_rate": 0.0001624991424780533, + "loss": 0.9467, + "step": 8560 + }, + { + "epoch": 0.306587641234086, + "grad_norm": 1.378571629524231, + "learning_rate": 0.0001624900875014461, + "loss": 1.1425, + "step": 8561 + }, + { + "epoch": 0.30662345336365426, + "grad_norm": 1.2628884315490723, + "learning_rate": 0.0001624810316841154, + "loss": 1.2175, + "step": 8562 + }, + { + "epoch": 0.30665926549322253, + "grad_norm": 1.4105592966079712, + "learning_rate": 0.000162471975026183, + "loss": 1.0563, + "step": 8563 + }, + { + "epoch": 0.30669507762279086, + "grad_norm": 1.5842384099960327, + "learning_rate": 0.0001624629175277707, + "loss": 1.1688, + "step": 8564 + }, + { + "epoch": 0.3067308897523591, + "grad_norm": 1.6600080728530884, + "learning_rate": 0.0001624538591890005, + "loss": 1.1094, + "step": 8565 + }, + { + "epoch": 0.3067667018819274, + "grad_norm": 1.5835617780685425, + "learning_rate": 0.00016244480000999416, + "loss": 0.9956, + "step": 8566 + }, + { + "epoch": 0.3068025140114957, + "grad_norm": 1.62114679813385, + "learning_rate": 0.0001624357399908736, + "loss": 1.4039, + "step": 8567 + }, + { + "epoch": 0.306838326141064, + "grad_norm": 1.2879735231399536, + "learning_rate": 0.00016242667913176064, + "loss": 1.2045, + "step": 8568 + }, + { + "epoch": 0.30687413827063226, + "grad_norm": 1.755541443824768, + "learning_rate": 0.00016241761743277726, + "loss": 1.2748, + "step": 8569 + }, + { + "epoch": 0.3069099504002005, + "grad_norm": 1.7984334230422974, + "learning_rate": 0.00016240855489404535, + "loss": 1.0553, + "step": 8570 + }, + { + "epoch": 0.30694576252976885, + "grad_norm": 1.4692410230636597, + "learning_rate": 0.00016239949151568688, + "loss": 1.1323, + "step": 8571 + }, + { + "epoch": 0.3069815746593371, + "grad_norm": 1.536725640296936, + "learning_rate": 0.0001623904272978237, + "loss": 1.1322, + "step": 8572 + }, + { + "epoch": 0.3070173867889054, + "grad_norm": 2.45461106300354, + "learning_rate": 0.00016238136224057777, + "loss": 1.2709, + "step": 8573 + }, + { + "epoch": 0.3070531989184737, + "grad_norm": 1.6152485609054565, + "learning_rate": 0.00016237229634407112, + "loss": 1.2657, + "step": 8574 + }, + { + "epoch": 0.307089011048042, + "grad_norm": 1.635525107383728, + "learning_rate": 0.0001623632296084257, + "loss": 1.1669, + "step": 8575 + }, + { + "epoch": 0.30712482317761025, + "grad_norm": 1.9170528650283813, + "learning_rate": 0.0001623541620337634, + "loss": 1.4314, + "step": 8576 + }, + { + "epoch": 0.3071606353071785, + "grad_norm": 1.3207323551177979, + "learning_rate": 0.00016234509362020633, + "loss": 1.3314, + "step": 8577 + }, + { + "epoch": 0.30719644743674684, + "grad_norm": 2.26776385307312, + "learning_rate": 0.00016233602436787644, + "loss": 1.2219, + "step": 8578 + }, + { + "epoch": 0.3072322595663151, + "grad_norm": 1.37172269821167, + "learning_rate": 0.00016232695427689575, + "loss": 1.2401, + "step": 8579 + }, + { + "epoch": 0.3072680716958834, + "grad_norm": 2.8130040168762207, + "learning_rate": 0.00016231788334738627, + "loss": 1.0659, + "step": 8580 + }, + { + "epoch": 0.3073038838254517, + "grad_norm": 1.4310991764068604, + "learning_rate": 0.00016230881157947006, + "loss": 1.19, + "step": 8581 + }, + { + "epoch": 0.30733969595502, + "grad_norm": 1.3403431177139282, + "learning_rate": 0.00016229973897326919, + "loss": 1.213, + "step": 8582 + }, + { + "epoch": 0.30737550808458824, + "grad_norm": 1.5482901334762573, + "learning_rate": 0.00016229066552890563, + "loss": 1.1399, + "step": 8583 + }, + { + "epoch": 0.3074113202141565, + "grad_norm": 1.9216983318328857, + "learning_rate": 0.0001622815912465016, + "loss": 1.1055, + "step": 8584 + }, + { + "epoch": 0.30744713234372484, + "grad_norm": 1.696237564086914, + "learning_rate": 0.00016227251612617902, + "loss": 1.16, + "step": 8585 + }, + { + "epoch": 0.3074829444732931, + "grad_norm": 1.7690720558166504, + "learning_rate": 0.0001622634401680601, + "loss": 1.2207, + "step": 8586 + }, + { + "epoch": 0.3075187566028614, + "grad_norm": 1.6936194896697998, + "learning_rate": 0.0001622543633722669, + "loss": 1.0678, + "step": 8587 + }, + { + "epoch": 0.3075545687324297, + "grad_norm": 1.3898451328277588, + "learning_rate": 0.00016224528573892153, + "loss": 1.1785, + "step": 8588 + }, + { + "epoch": 0.30759038086199797, + "grad_norm": 1.5279182195663452, + "learning_rate": 0.00016223620726814615, + "loss": 1.0542, + "step": 8589 + }, + { + "epoch": 0.30762619299156624, + "grad_norm": 2.0892233848571777, + "learning_rate": 0.00016222712796006285, + "loss": 1.1565, + "step": 8590 + }, + { + "epoch": 0.3076620051211345, + "grad_norm": 1.510156512260437, + "learning_rate": 0.00016221804781479384, + "loss": 1.1088, + "step": 8591 + }, + { + "epoch": 0.30769781725070283, + "grad_norm": 1.6812881231307983, + "learning_rate": 0.00016220896683246126, + "loss": 0.958, + "step": 8592 + }, + { + "epoch": 0.3077336293802711, + "grad_norm": 1.7384670972824097, + "learning_rate": 0.00016219988501318727, + "loss": 1.1489, + "step": 8593 + }, + { + "epoch": 0.30776944150983937, + "grad_norm": 1.9324885606765747, + "learning_rate": 0.00016219080235709403, + "loss": 1.2896, + "step": 8594 + }, + { + "epoch": 0.3078052536394077, + "grad_norm": 1.462056279182434, + "learning_rate": 0.0001621817188643038, + "loss": 1.1438, + "step": 8595 + }, + { + "epoch": 0.30784106576897596, + "grad_norm": 1.5486034154891968, + "learning_rate": 0.0001621726345349387, + "loss": 1.2069, + "step": 8596 + }, + { + "epoch": 0.30787687789854423, + "grad_norm": 1.4411839246749878, + "learning_rate": 0.00016216354936912105, + "loss": 1.2149, + "step": 8597 + }, + { + "epoch": 0.3079126900281125, + "grad_norm": 1.4540739059448242, + "learning_rate": 0.000162154463366973, + "loss": 1.3063, + "step": 8598 + }, + { + "epoch": 0.3079485021576808, + "grad_norm": 1.5505518913269043, + "learning_rate": 0.00016214537652861687, + "loss": 0.8986, + "step": 8599 + }, + { + "epoch": 0.3079843142872491, + "grad_norm": 1.3972070217132568, + "learning_rate": 0.00016213628885417483, + "loss": 1.1246, + "step": 8600 + }, + { + "epoch": 0.30802012641681736, + "grad_norm": 1.5110498666763306, + "learning_rate": 0.00016212720034376914, + "loss": 1.2336, + "step": 8601 + }, + { + "epoch": 0.3080559385463857, + "grad_norm": 1.4464168548583984, + "learning_rate": 0.00016211811099752215, + "loss": 1.1067, + "step": 8602 + }, + { + "epoch": 0.30809175067595396, + "grad_norm": 1.7480340003967285, + "learning_rate": 0.00016210902081555605, + "loss": 1.1766, + "step": 8603 + }, + { + "epoch": 0.3081275628055222, + "grad_norm": 1.2848155498504639, + "learning_rate": 0.00016209992979799326, + "loss": 1.0699, + "step": 8604 + }, + { + "epoch": 0.3081633749350905, + "grad_norm": 1.9579294919967651, + "learning_rate": 0.00016209083794495598, + "loss": 1.2162, + "step": 8605 + }, + { + "epoch": 0.3081991870646588, + "grad_norm": 1.4275661706924438, + "learning_rate": 0.00016208174525656656, + "loss": 0.9591, + "step": 8606 + }, + { + "epoch": 0.3082349991942271, + "grad_norm": 1.5784162282943726, + "learning_rate": 0.00016207265173294734, + "loss": 1.2425, + "step": 8607 + }, + { + "epoch": 0.30827081132379536, + "grad_norm": 2.0583817958831787, + "learning_rate": 0.00016206355737422067, + "loss": 1.2415, + "step": 8608 + }, + { + "epoch": 0.3083066234533637, + "grad_norm": 1.545795202255249, + "learning_rate": 0.00016205446218050892, + "loss": 0.8668, + "step": 8609 + }, + { + "epoch": 0.30834243558293195, + "grad_norm": 1.5371829271316528, + "learning_rate": 0.00016204536615193439, + "loss": 1.2568, + "step": 8610 + }, + { + "epoch": 0.3083782477125002, + "grad_norm": 1.83467698097229, + "learning_rate": 0.00016203626928861948, + "loss": 1.2377, + "step": 8611 + }, + { + "epoch": 0.3084140598420685, + "grad_norm": 2.0947158336639404, + "learning_rate": 0.00016202717159068662, + "loss": 1.1524, + "step": 8612 + }, + { + "epoch": 0.3084498719716368, + "grad_norm": 1.7983481884002686, + "learning_rate": 0.00016201807305825817, + "loss": 1.1299, + "step": 8613 + }, + { + "epoch": 0.3084856841012051, + "grad_norm": 1.7447012662887573, + "learning_rate": 0.0001620089736914565, + "loss": 1.0072, + "step": 8614 + }, + { + "epoch": 0.30852149623077335, + "grad_norm": 1.6850483417510986, + "learning_rate": 0.0001619998734904041, + "loss": 1.3254, + "step": 8615 + }, + { + "epoch": 0.3085573083603416, + "grad_norm": 1.3878064155578613, + "learning_rate": 0.00016199077245522341, + "loss": 1.2114, + "step": 8616 + }, + { + "epoch": 0.30859312048990994, + "grad_norm": 1.7361465692520142, + "learning_rate": 0.00016198167058603682, + "loss": 1.1758, + "step": 8617 + }, + { + "epoch": 0.3086289326194782, + "grad_norm": 1.664039134979248, + "learning_rate": 0.0001619725678829668, + "loss": 1.2062, + "step": 8618 + }, + { + "epoch": 0.3086647447490465, + "grad_norm": 1.511802315711975, + "learning_rate": 0.0001619634643461358, + "loss": 1.2619, + "step": 8619 + }, + { + "epoch": 0.3087005568786148, + "grad_norm": 2.1597840785980225, + "learning_rate": 0.00016195435997566632, + "loss": 1.4158, + "step": 8620 + }, + { + "epoch": 0.3087363690081831, + "grad_norm": 1.7769004106521606, + "learning_rate": 0.00016194525477168087, + "loss": 1.2724, + "step": 8621 + }, + { + "epoch": 0.30877218113775134, + "grad_norm": 1.8733152151107788, + "learning_rate": 0.0001619361487343019, + "loss": 1.1369, + "step": 8622 + }, + { + "epoch": 0.3088079932673196, + "grad_norm": 1.9050359725952148, + "learning_rate": 0.00016192704186365195, + "loss": 1.1383, + "step": 8623 + }, + { + "epoch": 0.30884380539688794, + "grad_norm": 1.7245118618011475, + "learning_rate": 0.00016191793415985353, + "loss": 1.3885, + "step": 8624 + }, + { + "epoch": 0.3088796175264562, + "grad_norm": 1.7479420900344849, + "learning_rate": 0.00016190882562302914, + "loss": 1.2098, + "step": 8625 + }, + { + "epoch": 0.3089154296560245, + "grad_norm": 1.431227684020996, + "learning_rate": 0.0001618997162533014, + "loss": 1.2087, + "step": 8626 + }, + { + "epoch": 0.3089512417855928, + "grad_norm": 1.532562255859375, + "learning_rate": 0.0001618906060507928, + "loss": 1.1683, + "step": 8627 + }, + { + "epoch": 0.30898705391516107, + "grad_norm": 1.5869332551956177, + "learning_rate": 0.00016188149501562596, + "loss": 1.0045, + "step": 8628 + }, + { + "epoch": 0.30902286604472934, + "grad_norm": 1.5806242227554321, + "learning_rate": 0.00016187238314792338, + "loss": 1.0425, + "step": 8629 + }, + { + "epoch": 0.3090586781742976, + "grad_norm": 1.3555889129638672, + "learning_rate": 0.00016186327044780772, + "loss": 1.1515, + "step": 8630 + }, + { + "epoch": 0.30909449030386593, + "grad_norm": 1.6676982641220093, + "learning_rate": 0.00016185415691540156, + "loss": 1.1716, + "step": 8631 + }, + { + "epoch": 0.3091303024334342, + "grad_norm": 2.4261622428894043, + "learning_rate": 0.0001618450425508275, + "loss": 1.0743, + "step": 8632 + }, + { + "epoch": 0.30916611456300247, + "grad_norm": 1.4590678215026855, + "learning_rate": 0.00016183592735420817, + "loss": 1.0156, + "step": 8633 + }, + { + "epoch": 0.3092019266925708, + "grad_norm": 1.41183602809906, + "learning_rate": 0.0001618268113256662, + "loss": 1.2209, + "step": 8634 + }, + { + "epoch": 0.30923773882213906, + "grad_norm": 1.3465666770935059, + "learning_rate": 0.0001618176944653242, + "loss": 1.1793, + "step": 8635 + }, + { + "epoch": 0.30927355095170733, + "grad_norm": 1.2679080963134766, + "learning_rate": 0.0001618085767733049, + "loss": 1.1194, + "step": 8636 + }, + { + "epoch": 0.3093093630812756, + "grad_norm": 1.5564440488815308, + "learning_rate": 0.0001617994582497309, + "loss": 1.0681, + "step": 8637 + }, + { + "epoch": 0.3093451752108439, + "grad_norm": 1.8687019348144531, + "learning_rate": 0.00016179033889472493, + "loss": 1.1449, + "step": 8638 + }, + { + "epoch": 0.3093809873404122, + "grad_norm": 1.4858633279800415, + "learning_rate": 0.00016178121870840965, + "loss": 1.1828, + "step": 8639 + }, + { + "epoch": 0.30941679946998046, + "grad_norm": 1.714883804321289, + "learning_rate": 0.00016177209769090774, + "loss": 1.1866, + "step": 8640 + }, + { + "epoch": 0.3094526115995488, + "grad_norm": 1.311575174331665, + "learning_rate": 0.00016176297584234196, + "loss": 1.2277, + "step": 8641 + }, + { + "epoch": 0.30948842372911706, + "grad_norm": 1.4500019550323486, + "learning_rate": 0.00016175385316283502, + "loss": 1.1304, + "step": 8642 + }, + { + "epoch": 0.3095242358586853, + "grad_norm": 2.106606960296631, + "learning_rate": 0.00016174472965250965, + "loss": 1.195, + "step": 8643 + }, + { + "epoch": 0.3095600479882536, + "grad_norm": 2.7091481685638428, + "learning_rate": 0.00016173560531148855, + "loss": 1.4102, + "step": 8644 + }, + { + "epoch": 0.3095958601178219, + "grad_norm": 1.475685715675354, + "learning_rate": 0.0001617264801398945, + "loss": 1.2711, + "step": 8645 + }, + { + "epoch": 0.3096316722473902, + "grad_norm": 1.7037007808685303, + "learning_rate": 0.0001617173541378503, + "loss": 1.2443, + "step": 8646 + }, + { + "epoch": 0.30966748437695846, + "grad_norm": 1.9223546981811523, + "learning_rate": 0.0001617082273054787, + "loss": 1.216, + "step": 8647 + }, + { + "epoch": 0.3097032965065268, + "grad_norm": 1.9208606481552124, + "learning_rate": 0.00016169909964290256, + "loss": 1.091, + "step": 8648 + }, + { + "epoch": 0.30973910863609505, + "grad_norm": 1.5749123096466064, + "learning_rate": 0.00016168997115024458, + "loss": 1.1984, + "step": 8649 + }, + { + "epoch": 0.3097749207656633, + "grad_norm": 1.4901601076126099, + "learning_rate": 0.0001616808418276276, + "loss": 1.1773, + "step": 8650 + }, + { + "epoch": 0.3098107328952316, + "grad_norm": 1.615326166152954, + "learning_rate": 0.00016167171167517447, + "loss": 1.245, + "step": 8651 + }, + { + "epoch": 0.3098465450247999, + "grad_norm": 1.3948417901992798, + "learning_rate": 0.00016166258069300803, + "loss": 1.1649, + "step": 8652 + }, + { + "epoch": 0.3098823571543682, + "grad_norm": 1.3273259401321411, + "learning_rate": 0.00016165344888125106, + "loss": 1.2267, + "step": 8653 + }, + { + "epoch": 0.30991816928393645, + "grad_norm": 1.6672254800796509, + "learning_rate": 0.00016164431624002647, + "loss": 1.195, + "step": 8654 + }, + { + "epoch": 0.3099539814135048, + "grad_norm": 1.4055334329605103, + "learning_rate": 0.00016163518276945715, + "loss": 1.2407, + "step": 8655 + }, + { + "epoch": 0.30998979354307304, + "grad_norm": 1.2579345703125, + "learning_rate": 0.00016162604846966594, + "loss": 1.1567, + "step": 8656 + }, + { + "epoch": 0.3100256056726413, + "grad_norm": 1.5872184038162231, + "learning_rate": 0.0001616169133407757, + "loss": 1.4666, + "step": 8657 + }, + { + "epoch": 0.3100614178022096, + "grad_norm": 1.440415382385254, + "learning_rate": 0.00016160777738290945, + "loss": 1.235, + "step": 8658 + }, + { + "epoch": 0.3100972299317779, + "grad_norm": 1.5176734924316406, + "learning_rate": 0.00016159864059618997, + "loss": 1.2429, + "step": 8659 + }, + { + "epoch": 0.3101330420613462, + "grad_norm": 1.6537644863128662, + "learning_rate": 0.00016158950298074022, + "loss": 1.2394, + "step": 8660 + }, + { + "epoch": 0.31016885419091444, + "grad_norm": 1.6115870475769043, + "learning_rate": 0.00016158036453668318, + "loss": 1.1624, + "step": 8661 + }, + { + "epoch": 0.31020466632048277, + "grad_norm": 1.5790889263153076, + "learning_rate": 0.00016157122526414176, + "loss": 1.2898, + "step": 8662 + }, + { + "epoch": 0.31024047845005104, + "grad_norm": 1.6728990077972412, + "learning_rate": 0.00016156208516323895, + "loss": 1.0797, + "step": 8663 + }, + { + "epoch": 0.3102762905796193, + "grad_norm": 1.3839727640151978, + "learning_rate": 0.00016155294423409768, + "loss": 1.1586, + "step": 8664 + }, + { + "epoch": 0.3103121027091876, + "grad_norm": 1.6079671382904053, + "learning_rate": 0.00016154380247684094, + "loss": 1.0969, + "step": 8665 + }, + { + "epoch": 0.3103479148387559, + "grad_norm": 1.7885090112686157, + "learning_rate": 0.00016153465989159172, + "loss": 1.1969, + "step": 8666 + }, + { + "epoch": 0.31038372696832417, + "grad_norm": 1.4352436065673828, + "learning_rate": 0.00016152551647847304, + "loss": 1.3595, + "step": 8667 + }, + { + "epoch": 0.31041953909789244, + "grad_norm": 1.6418925523757935, + "learning_rate": 0.00016151637223760785, + "loss": 1.1881, + "step": 8668 + }, + { + "epoch": 0.31045535122746076, + "grad_norm": 1.4416180849075317, + "learning_rate": 0.0001615072271691193, + "loss": 1.1178, + "step": 8669 + }, + { + "epoch": 0.31049116335702903, + "grad_norm": 1.8889800310134888, + "learning_rate": 0.00016149808127313025, + "loss": 1.3578, + "step": 8670 + }, + { + "epoch": 0.3105269754865973, + "grad_norm": 1.5791561603546143, + "learning_rate": 0.00016148893454976393, + "loss": 1.2316, + "step": 8671 + }, + { + "epoch": 0.31056278761616557, + "grad_norm": 1.8258134126663208, + "learning_rate": 0.00016147978699914325, + "loss": 1.0946, + "step": 8672 + }, + { + "epoch": 0.3105985997457339, + "grad_norm": 1.292256474494934, + "learning_rate": 0.00016147063862139138, + "loss": 1.1441, + "step": 8673 + }, + { + "epoch": 0.31063441187530216, + "grad_norm": 1.5790197849273682, + "learning_rate": 0.00016146148941663136, + "loss": 1.1183, + "step": 8674 + }, + { + "epoch": 0.31067022400487043, + "grad_norm": 1.3386865854263306, + "learning_rate": 0.00016145233938498626, + "loss": 1.2142, + "step": 8675 + }, + { + "epoch": 0.31070603613443876, + "grad_norm": 1.8613605499267578, + "learning_rate": 0.00016144318852657921, + "loss": 1.2111, + "step": 8676 + }, + { + "epoch": 0.310741848264007, + "grad_norm": 1.6100250482559204, + "learning_rate": 0.00016143403684153328, + "loss": 1.4308, + "step": 8677 + }, + { + "epoch": 0.3107776603935753, + "grad_norm": 1.6607122421264648, + "learning_rate": 0.00016142488432997168, + "loss": 1.1917, + "step": 8678 + }, + { + "epoch": 0.31081347252314356, + "grad_norm": 2.0252790451049805, + "learning_rate": 0.00016141573099201744, + "loss": 1.088, + "step": 8679 + }, + { + "epoch": 0.3108492846527119, + "grad_norm": 1.3406927585601807, + "learning_rate": 0.00016140657682779384, + "loss": 0.9351, + "step": 8680 + }, + { + "epoch": 0.31088509678228016, + "grad_norm": 1.5686582326889038, + "learning_rate": 0.0001613974218374239, + "loss": 1.1702, + "step": 8681 + }, + { + "epoch": 0.3109209089118484, + "grad_norm": 1.4228252172470093, + "learning_rate": 0.00016138826602103085, + "loss": 1.3546, + "step": 8682 + }, + { + "epoch": 0.31095672104141675, + "grad_norm": 1.3292087316513062, + "learning_rate": 0.00016137910937873788, + "loss": 1.2229, + "step": 8683 + }, + { + "epoch": 0.310992533170985, + "grad_norm": 1.8345659971237183, + "learning_rate": 0.00016136995191066818, + "loss": 1.3533, + "step": 8684 + }, + { + "epoch": 0.3110283453005533, + "grad_norm": 1.566933274269104, + "learning_rate": 0.0001613607936169449, + "loss": 1.3978, + "step": 8685 + }, + { + "epoch": 0.31106415743012156, + "grad_norm": 2.047131299972534, + "learning_rate": 0.00016135163449769132, + "loss": 1.1231, + "step": 8686 + }, + { + "epoch": 0.3110999695596899, + "grad_norm": 2.1134862899780273, + "learning_rate": 0.0001613424745530306, + "loss": 1.2135, + "step": 8687 + }, + { + "epoch": 0.31113578168925815, + "grad_norm": 1.8301069736480713, + "learning_rate": 0.00016133331378308604, + "loss": 1.3535, + "step": 8688 + }, + { + "epoch": 0.3111715938188264, + "grad_norm": 1.670212984085083, + "learning_rate": 0.00016132415218798085, + "loss": 1.1987, + "step": 8689 + }, + { + "epoch": 0.31120740594839474, + "grad_norm": 1.4002984762191772, + "learning_rate": 0.0001613149897678383, + "loss": 1.1341, + "step": 8690 + }, + { + "epoch": 0.311243218077963, + "grad_norm": 2.3098883628845215, + "learning_rate": 0.00016130582652278163, + "loss": 1.1148, + "step": 8691 + }, + { + "epoch": 0.3112790302075313, + "grad_norm": 1.2418853044509888, + "learning_rate": 0.0001612966624529342, + "loss": 1.2507, + "step": 8692 + }, + { + "epoch": 0.31131484233709955, + "grad_norm": 1.418245553970337, + "learning_rate": 0.0001612874975584192, + "loss": 1.1068, + "step": 8693 + }, + { + "epoch": 0.3113506544666679, + "grad_norm": 1.5077203512191772, + "learning_rate": 0.00016127833183936, + "loss": 1.1851, + "step": 8694 + }, + { + "epoch": 0.31138646659623614, + "grad_norm": 1.9751828908920288, + "learning_rate": 0.00016126916529587987, + "loss": 1.182, + "step": 8695 + }, + { + "epoch": 0.3114222787258044, + "grad_norm": 1.4976015090942383, + "learning_rate": 0.00016125999792810213, + "loss": 0.944, + "step": 8696 + }, + { + "epoch": 0.31145809085537274, + "grad_norm": 1.4347585439682007, + "learning_rate": 0.00016125082973615017, + "loss": 0.9489, + "step": 8697 + }, + { + "epoch": 0.311493902984941, + "grad_norm": 1.6864339113235474, + "learning_rate": 0.00016124166072014728, + "loss": 1.0909, + "step": 8698 + }, + { + "epoch": 0.3115297151145093, + "grad_norm": 1.9020100831985474, + "learning_rate": 0.00016123249088021688, + "loss": 1.255, + "step": 8699 + }, + { + "epoch": 0.31156552724407754, + "grad_norm": 1.580141544342041, + "learning_rate": 0.00016122332021648226, + "loss": 1.219, + "step": 8700 + }, + { + "epoch": 0.31160133937364587, + "grad_norm": 1.729596734046936, + "learning_rate": 0.00016121414872906687, + "loss": 1.162, + "step": 8701 + }, + { + "epoch": 0.31163715150321414, + "grad_norm": 1.5200968980789185, + "learning_rate": 0.00016120497641809408, + "loss": 1.264, + "step": 8702 + }, + { + "epoch": 0.3116729636327824, + "grad_norm": 1.8695322275161743, + "learning_rate": 0.00016119580328368725, + "loss": 1.324, + "step": 8703 + }, + { + "epoch": 0.31170877576235073, + "grad_norm": 1.694458246231079, + "learning_rate": 0.0001611866293259698, + "loss": 1.1055, + "step": 8704 + }, + { + "epoch": 0.311744587891919, + "grad_norm": 1.805552363395691, + "learning_rate": 0.00016117745454506522, + "loss": 1.105, + "step": 8705 + }, + { + "epoch": 0.31178040002148727, + "grad_norm": 1.8747038841247559, + "learning_rate": 0.00016116827894109686, + "loss": 0.9817, + "step": 8706 + }, + { + "epoch": 0.31181621215105554, + "grad_norm": 1.8646128177642822, + "learning_rate": 0.00016115910251418827, + "loss": 1.1439, + "step": 8707 + }, + { + "epoch": 0.31185202428062386, + "grad_norm": 1.3459265232086182, + "learning_rate": 0.0001611499252644628, + "loss": 1.1982, + "step": 8708 + }, + { + "epoch": 0.31188783641019213, + "grad_norm": 1.3484317064285278, + "learning_rate": 0.00016114074719204396, + "loss": 1.2159, + "step": 8709 + }, + { + "epoch": 0.3119236485397604, + "grad_norm": 1.8100037574768066, + "learning_rate": 0.00016113156829705526, + "loss": 1.1007, + "step": 8710 + }, + { + "epoch": 0.3119594606693287, + "grad_norm": 1.4303606748580933, + "learning_rate": 0.00016112238857962017, + "loss": 1.1923, + "step": 8711 + }, + { + "epoch": 0.311995272798897, + "grad_norm": 1.5292985439300537, + "learning_rate": 0.00016111320803986217, + "loss": 1.2557, + "step": 8712 + }, + { + "epoch": 0.31203108492846526, + "grad_norm": 1.404022455215454, + "learning_rate": 0.00016110402667790475, + "loss": 1.2592, + "step": 8713 + }, + { + "epoch": 0.31206689705803353, + "grad_norm": 1.7304773330688477, + "learning_rate": 0.0001610948444938715, + "loss": 1.4211, + "step": 8714 + }, + { + "epoch": 0.31210270918760186, + "grad_norm": 1.5086398124694824, + "learning_rate": 0.00016108566148788594, + "loss": 0.9597, + "step": 8715 + }, + { + "epoch": 0.3121385213171701, + "grad_norm": 1.4469795227050781, + "learning_rate": 0.00016107647766007159, + "loss": 1.1456, + "step": 8716 + }, + { + "epoch": 0.3121743334467384, + "grad_norm": 1.8933906555175781, + "learning_rate": 0.000161067293010552, + "loss": 1.2173, + "step": 8717 + }, + { + "epoch": 0.3122101455763067, + "grad_norm": 1.5752097368240356, + "learning_rate": 0.00016105810753945076, + "loss": 1.2198, + "step": 8718 + }, + { + "epoch": 0.312245957705875, + "grad_norm": 1.93033766746521, + "learning_rate": 0.00016104892124689147, + "loss": 1.348, + "step": 8719 + }, + { + "epoch": 0.31228176983544326, + "grad_norm": 1.2772696018218994, + "learning_rate": 0.00016103973413299767, + "loss": 1.1453, + "step": 8720 + }, + { + "epoch": 0.3123175819650115, + "grad_norm": 1.7015581130981445, + "learning_rate": 0.00016103054619789298, + "loss": 1.2257, + "step": 8721 + }, + { + "epoch": 0.31235339409457985, + "grad_norm": 1.6378401517868042, + "learning_rate": 0.00016102135744170098, + "loss": 1.3487, + "step": 8722 + }, + { + "epoch": 0.3123892062241481, + "grad_norm": 2.4715139865875244, + "learning_rate": 0.00016101216786454538, + "loss": 1.3323, + "step": 8723 + }, + { + "epoch": 0.3124250183537164, + "grad_norm": 2.243699789047241, + "learning_rate": 0.00016100297746654975, + "loss": 1.3484, + "step": 8724 + }, + { + "epoch": 0.3124608304832847, + "grad_norm": 1.4816820621490479, + "learning_rate": 0.00016099378624783773, + "loss": 1.4227, + "step": 8725 + }, + { + "epoch": 0.312496642612853, + "grad_norm": 1.5447473526000977, + "learning_rate": 0.00016098459420853302, + "loss": 1.3362, + "step": 8726 + }, + { + "epoch": 0.31253245474242125, + "grad_norm": 1.7988617420196533, + "learning_rate": 0.00016097540134875924, + "loss": 1.1828, + "step": 8727 + }, + { + "epoch": 0.3125682668719895, + "grad_norm": 1.6082279682159424, + "learning_rate": 0.00016096620766864011, + "loss": 1.3388, + "step": 8728 + }, + { + "epoch": 0.31260407900155784, + "grad_norm": 1.5702368021011353, + "learning_rate": 0.00016095701316829925, + "loss": 0.9982, + "step": 8729 + }, + { + "epoch": 0.3126398911311261, + "grad_norm": 1.466307282447815, + "learning_rate": 0.00016094781784786044, + "loss": 1.1821, + "step": 8730 + }, + { + "epoch": 0.3126757032606944, + "grad_norm": 1.7247300148010254, + "learning_rate": 0.00016093862170744733, + "loss": 1.1939, + "step": 8731 + }, + { + "epoch": 0.3127115153902627, + "grad_norm": 1.9052692651748657, + "learning_rate": 0.00016092942474718372, + "loss": 1.4252, + "step": 8732 + }, + { + "epoch": 0.312747327519831, + "grad_norm": 1.6616138219833374, + "learning_rate": 0.00016092022696719327, + "loss": 1.3681, + "step": 8733 + }, + { + "epoch": 0.31278313964939924, + "grad_norm": 1.4143447875976562, + "learning_rate": 0.00016091102836759974, + "loss": 1.3651, + "step": 8734 + }, + { + "epoch": 0.3128189517789675, + "grad_norm": 1.2230521440505981, + "learning_rate": 0.00016090182894852687, + "loss": 0.9571, + "step": 8735 + }, + { + "epoch": 0.31285476390853584, + "grad_norm": 1.8764896392822266, + "learning_rate": 0.0001608926287100985, + "loss": 1.2848, + "step": 8736 + }, + { + "epoch": 0.3128905760381041, + "grad_norm": 1.6488420963287354, + "learning_rate": 0.00016088342765243832, + "loss": 1.2627, + "step": 8737 + }, + { + "epoch": 0.3129263881676724, + "grad_norm": 2.2215194702148438, + "learning_rate": 0.00016087422577567016, + "loss": 1.215, + "step": 8738 + }, + { + "epoch": 0.3129622002972407, + "grad_norm": 1.2457228899002075, + "learning_rate": 0.00016086502307991783, + "loss": 1.1424, + "step": 8739 + }, + { + "epoch": 0.31299801242680897, + "grad_norm": 1.3347328901290894, + "learning_rate": 0.0001608558195653051, + "loss": 0.9437, + "step": 8740 + }, + { + "epoch": 0.31303382455637724, + "grad_norm": 1.5983428955078125, + "learning_rate": 0.00016084661523195585, + "loss": 1.3603, + "step": 8741 + }, + { + "epoch": 0.3130696366859455, + "grad_norm": 2.024381637573242, + "learning_rate": 0.00016083741007999388, + "loss": 1.2584, + "step": 8742 + }, + { + "epoch": 0.31310544881551383, + "grad_norm": 1.3448294401168823, + "learning_rate": 0.00016082820410954297, + "loss": 1.2531, + "step": 8743 + }, + { + "epoch": 0.3131412609450821, + "grad_norm": 1.7001458406448364, + "learning_rate": 0.0001608189973207271, + "loss": 1.3405, + "step": 8744 + }, + { + "epoch": 0.31317707307465037, + "grad_norm": 2.1118853092193604, + "learning_rate": 0.00016080978971367004, + "loss": 1.2954, + "step": 8745 + }, + { + "epoch": 0.3132128852042187, + "grad_norm": 1.609196662902832, + "learning_rate": 0.00016080058128849572, + "loss": 1.0773, + "step": 8746 + }, + { + "epoch": 0.31324869733378696, + "grad_norm": 1.3322819471359253, + "learning_rate": 0.000160791372045328, + "loss": 1.102, + "step": 8747 + }, + { + "epoch": 0.31328450946335523, + "grad_norm": 1.9300302267074585, + "learning_rate": 0.00016078216198429077, + "loss": 1.1698, + "step": 8748 + }, + { + "epoch": 0.3133203215929235, + "grad_norm": 1.456827998161316, + "learning_rate": 0.00016077295110550796, + "loss": 1.0814, + "step": 8749 + }, + { + "epoch": 0.3133561337224918, + "grad_norm": 1.4454290866851807, + "learning_rate": 0.0001607637394091035, + "loss": 1.188, + "step": 8750 + }, + { + "epoch": 0.3133919458520601, + "grad_norm": 2.012561082839966, + "learning_rate": 0.00016075452689520128, + "loss": 1.3023, + "step": 8751 + }, + { + "epoch": 0.31342775798162836, + "grad_norm": 1.7164925336837769, + "learning_rate": 0.0001607453135639253, + "loss": 1.2113, + "step": 8752 + }, + { + "epoch": 0.3134635701111967, + "grad_norm": 1.4213062524795532, + "learning_rate": 0.00016073609941539944, + "loss": 1.0378, + "step": 8753 + }, + { + "epoch": 0.31349938224076496, + "grad_norm": 2.304598569869995, + "learning_rate": 0.00016072688444974774, + "loss": 1.3182, + "step": 8754 + }, + { + "epoch": 0.3135351943703332, + "grad_norm": 1.4762814044952393, + "learning_rate": 0.00016071766866709413, + "loss": 1.0275, + "step": 8755 + }, + { + "epoch": 0.3135710064999015, + "grad_norm": 1.52603018283844, + "learning_rate": 0.00016070845206756263, + "loss": 1.2419, + "step": 8756 + }, + { + "epoch": 0.3136068186294698, + "grad_norm": 1.3543126583099365, + "learning_rate": 0.00016069923465127718, + "loss": 1.2105, + "step": 8757 + }, + { + "epoch": 0.3136426307590381, + "grad_norm": 1.2974112033843994, + "learning_rate": 0.00016069001641836182, + "loss": 0.9753, + "step": 8758 + }, + { + "epoch": 0.31367844288860636, + "grad_norm": 1.37090003490448, + "learning_rate": 0.00016068079736894058, + "loss": 1.2036, + "step": 8759 + }, + { + "epoch": 0.3137142550181747, + "grad_norm": 1.4401767253875732, + "learning_rate": 0.00016067157750313752, + "loss": 1.0645, + "step": 8760 + }, + { + "epoch": 0.31375006714774295, + "grad_norm": 1.3878966569900513, + "learning_rate": 0.00016066235682107662, + "loss": 1.2037, + "step": 8761 + }, + { + "epoch": 0.3137858792773112, + "grad_norm": 1.535106897354126, + "learning_rate": 0.00016065313532288196, + "loss": 1.1346, + "step": 8762 + }, + { + "epoch": 0.3138216914068795, + "grad_norm": 1.6284167766571045, + "learning_rate": 0.0001606439130086776, + "loss": 1.1306, + "step": 8763 + }, + { + "epoch": 0.3138575035364478, + "grad_norm": 2.0086214542388916, + "learning_rate": 0.00016063468987858763, + "loss": 1.2361, + "step": 8764 + }, + { + "epoch": 0.3138933156660161, + "grad_norm": 1.357069492340088, + "learning_rate": 0.00016062546593273612, + "loss": 1.0325, + "step": 8765 + }, + { + "epoch": 0.31392912779558435, + "grad_norm": 1.7100811004638672, + "learning_rate": 0.00016061624117124715, + "loss": 1.3058, + "step": 8766 + }, + { + "epoch": 0.3139649399251527, + "grad_norm": 1.7258042097091675, + "learning_rate": 0.00016060701559424484, + "loss": 1.1752, + "step": 8767 + }, + { + "epoch": 0.31400075205472094, + "grad_norm": 1.4441914558410645, + "learning_rate": 0.00016059778920185332, + "loss": 1.0176, + "step": 8768 + }, + { + "epoch": 0.3140365641842892, + "grad_norm": 1.4213522672653198, + "learning_rate": 0.0001605885619941967, + "loss": 1.1592, + "step": 8769 + }, + { + "epoch": 0.3140723763138575, + "grad_norm": 1.7609481811523438, + "learning_rate": 0.00016057933397139914, + "loss": 1.1813, + "step": 8770 + }, + { + "epoch": 0.3141081884434258, + "grad_norm": 1.5500562191009521, + "learning_rate": 0.0001605701051335848, + "loss": 1.219, + "step": 8771 + }, + { + "epoch": 0.3141440005729941, + "grad_norm": 1.531135082244873, + "learning_rate": 0.00016056087548087784, + "loss": 1.0638, + "step": 8772 + }, + { + "epoch": 0.31417981270256234, + "grad_norm": 1.502111554145813, + "learning_rate": 0.00016055164501340235, + "loss": 1.1791, + "step": 8773 + }, + { + "epoch": 0.31421562483213067, + "grad_norm": 1.9600918292999268, + "learning_rate": 0.00016054241373128264, + "loss": 1.251, + "step": 8774 + }, + { + "epoch": 0.31425143696169894, + "grad_norm": 1.468337893486023, + "learning_rate": 0.00016053318163464285, + "loss": 1.1324, + "step": 8775 + }, + { + "epoch": 0.3142872490912672, + "grad_norm": 1.8454415798187256, + "learning_rate": 0.00016052394872360717, + "loss": 1.269, + "step": 8776 + }, + { + "epoch": 0.3143230612208355, + "grad_norm": 1.6421587467193604, + "learning_rate": 0.00016051471499829983, + "loss": 1.2118, + "step": 8777 + }, + { + "epoch": 0.3143588733504038, + "grad_norm": 1.3286335468292236, + "learning_rate": 0.00016050548045884501, + "loss": 1.079, + "step": 8778 + }, + { + "epoch": 0.31439468547997207, + "grad_norm": 1.4586429595947266, + "learning_rate": 0.00016049624510536704, + "loss": 1.3849, + "step": 8779 + }, + { + "epoch": 0.31443049760954034, + "grad_norm": 1.5696327686309814, + "learning_rate": 0.00016048700893799014, + "loss": 1.0149, + "step": 8780 + }, + { + "epoch": 0.31446630973910866, + "grad_norm": 1.2526307106018066, + "learning_rate": 0.00016047777195683858, + "loss": 1.0522, + "step": 8781 + }, + { + "epoch": 0.31450212186867693, + "grad_norm": 1.7881062030792236, + "learning_rate": 0.00016046853416203655, + "loss": 1.0323, + "step": 8782 + }, + { + "epoch": 0.3145379339982452, + "grad_norm": 1.4661383628845215, + "learning_rate": 0.0001604592955537084, + "loss": 1.2708, + "step": 8783 + }, + { + "epoch": 0.31457374612781347, + "grad_norm": 1.1885040998458862, + "learning_rate": 0.00016045005613197843, + "loss": 0.9787, + "step": 8784 + }, + { + "epoch": 0.3146095582573818, + "grad_norm": 1.7092645168304443, + "learning_rate": 0.00016044081589697092, + "loss": 1.2355, + "step": 8785 + }, + { + "epoch": 0.31464537038695006, + "grad_norm": 1.4734821319580078, + "learning_rate": 0.0001604315748488102, + "loss": 1.1293, + "step": 8786 + }, + { + "epoch": 0.31468118251651833, + "grad_norm": 1.4516034126281738, + "learning_rate": 0.00016042233298762062, + "loss": 1.1394, + "step": 8787 + }, + { + "epoch": 0.31471699464608666, + "grad_norm": 1.436822772026062, + "learning_rate": 0.00016041309031352644, + "loss": 0.9969, + "step": 8788 + }, + { + "epoch": 0.3147528067756549, + "grad_norm": 2.0728182792663574, + "learning_rate": 0.00016040384682665214, + "loss": 1.2308, + "step": 8789 + }, + { + "epoch": 0.3147886189052232, + "grad_norm": 1.544194221496582, + "learning_rate": 0.0001603946025271219, + "loss": 1.3354, + "step": 8790 + }, + { + "epoch": 0.31482443103479146, + "grad_norm": 1.6420962810516357, + "learning_rate": 0.00016038535741506025, + "loss": 1.1292, + "step": 8791 + }, + { + "epoch": 0.3148602431643598, + "grad_norm": 1.4769207239151, + "learning_rate": 0.00016037611149059147, + "loss": 1.0251, + "step": 8792 + }, + { + "epoch": 0.31489605529392806, + "grad_norm": 1.645047903060913, + "learning_rate": 0.00016036686475384002, + "loss": 1.0334, + "step": 8793 + }, + { + "epoch": 0.3149318674234963, + "grad_norm": 1.5304971933364868, + "learning_rate": 0.00016035761720493023, + "loss": 1.1744, + "step": 8794 + }, + { + "epoch": 0.31496767955306465, + "grad_norm": 1.3700590133666992, + "learning_rate": 0.0001603483688439866, + "loss": 1.0422, + "step": 8795 + }, + { + "epoch": 0.3150034916826329, + "grad_norm": 1.675028920173645, + "learning_rate": 0.00016033911967113347, + "loss": 1.1811, + "step": 8796 + }, + { + "epoch": 0.3150393038122012, + "grad_norm": 1.6892948150634766, + "learning_rate": 0.00016032986968649536, + "loss": 1.1809, + "step": 8797 + }, + { + "epoch": 0.31507511594176946, + "grad_norm": 1.804660677909851, + "learning_rate": 0.00016032061889019662, + "loss": 1.0936, + "step": 8798 + }, + { + "epoch": 0.3151109280713378, + "grad_norm": 1.389407753944397, + "learning_rate": 0.00016031136728236184, + "loss": 1.0515, + "step": 8799 + }, + { + "epoch": 0.31514674020090605, + "grad_norm": 1.3587857484817505, + "learning_rate": 0.00016030211486311533, + "loss": 1.0434, + "step": 8800 + }, + { + "epoch": 0.3151825523304743, + "grad_norm": 1.478757381439209, + "learning_rate": 0.0001602928616325817, + "loss": 1.409, + "step": 8801 + }, + { + "epoch": 0.31521836446004264, + "grad_norm": 1.8754408359527588, + "learning_rate": 0.00016028360759088534, + "loss": 1.2483, + "step": 8802 + }, + { + "epoch": 0.3152541765896109, + "grad_norm": 1.7604761123657227, + "learning_rate": 0.00016027435273815085, + "loss": 1.1241, + "step": 8803 + }, + { + "epoch": 0.3152899887191792, + "grad_norm": 1.2350589036941528, + "learning_rate": 0.00016026509707450266, + "loss": 1.0341, + "step": 8804 + }, + { + "epoch": 0.31532580084874745, + "grad_norm": 3.494727373123169, + "learning_rate": 0.00016025584060006532, + "loss": 1.2478, + "step": 8805 + }, + { + "epoch": 0.3153616129783158, + "grad_norm": 1.3793567419052124, + "learning_rate": 0.0001602465833149634, + "loss": 1.2206, + "step": 8806 + }, + { + "epoch": 0.31539742510788404, + "grad_norm": 1.6861071586608887, + "learning_rate": 0.0001602373252193214, + "loss": 1.2387, + "step": 8807 + }, + { + "epoch": 0.3154332372374523, + "grad_norm": 2.0008630752563477, + "learning_rate": 0.0001602280663132639, + "loss": 1.0489, + "step": 8808 + }, + { + "epoch": 0.31546904936702064, + "grad_norm": 1.6631911993026733, + "learning_rate": 0.00016021880659691546, + "loss": 1.0118, + "step": 8809 + }, + { + "epoch": 0.3155048614965889, + "grad_norm": 1.3419182300567627, + "learning_rate": 0.00016020954607040065, + "loss": 1.236, + "step": 8810 + }, + { + "epoch": 0.3155406736261572, + "grad_norm": 1.5294520854949951, + "learning_rate": 0.00016020028473384402, + "loss": 1.3113, + "step": 8811 + }, + { + "epoch": 0.31557648575572544, + "grad_norm": 1.5441851615905762, + "learning_rate": 0.00016019102258737027, + "loss": 1.0342, + "step": 8812 + }, + { + "epoch": 0.31561229788529377, + "grad_norm": 1.2743465900421143, + "learning_rate": 0.00016018175963110389, + "loss": 1.0694, + "step": 8813 + }, + { + "epoch": 0.31564811001486204, + "grad_norm": 1.7531509399414062, + "learning_rate": 0.00016017249586516963, + "loss": 1.1595, + "step": 8814 + }, + { + "epoch": 0.3156839221444303, + "grad_norm": 1.6750191450119019, + "learning_rate": 0.000160163231289692, + "loss": 1.222, + "step": 8815 + }, + { + "epoch": 0.3157197342739986, + "grad_norm": 1.9437286853790283, + "learning_rate": 0.00016015396590479575, + "loss": 1.2437, + "step": 8816 + }, + { + "epoch": 0.3157555464035669, + "grad_norm": 1.300172209739685, + "learning_rate": 0.00016014469971060543, + "loss": 1.0609, + "step": 8817 + }, + { + "epoch": 0.31579135853313517, + "grad_norm": 1.2924976348876953, + "learning_rate": 0.0001601354327072458, + "loss": 1.0754, + "step": 8818 + }, + { + "epoch": 0.31582717066270344, + "grad_norm": 1.5915316343307495, + "learning_rate": 0.00016012616489484148, + "loss": 1.1037, + "step": 8819 + }, + { + "epoch": 0.31586298279227176, + "grad_norm": 1.483031988143921, + "learning_rate": 0.00016011689627351712, + "loss": 1.0874, + "step": 8820 + }, + { + "epoch": 0.31589879492184003, + "grad_norm": 2.230269432067871, + "learning_rate": 0.00016010762684339752, + "loss": 1.2041, + "step": 8821 + }, + { + "epoch": 0.3159346070514083, + "grad_norm": 1.234607458114624, + "learning_rate": 0.00016009835660460732, + "loss": 1.0622, + "step": 8822 + }, + { + "epoch": 0.31597041918097657, + "grad_norm": 1.5450620651245117, + "learning_rate": 0.00016008908555727123, + "loss": 1.1622, + "step": 8823 + }, + { + "epoch": 0.3160062313105449, + "grad_norm": 1.730654239654541, + "learning_rate": 0.00016007981370151406, + "loss": 1.064, + "step": 8824 + }, + { + "epoch": 0.31604204344011316, + "grad_norm": 1.706353783607483, + "learning_rate": 0.00016007054103746047, + "loss": 1.185, + "step": 8825 + }, + { + "epoch": 0.31607785556968143, + "grad_norm": 1.5138137340545654, + "learning_rate": 0.00016006126756523524, + "loss": 1.2244, + "step": 8826 + }, + { + "epoch": 0.31611366769924976, + "grad_norm": 1.6418768167495728, + "learning_rate": 0.0001600519932849631, + "loss": 1.0843, + "step": 8827 + }, + { + "epoch": 0.316149479828818, + "grad_norm": 1.4462900161743164, + "learning_rate": 0.00016004271819676887, + "loss": 1.2581, + "step": 8828 + }, + { + "epoch": 0.3161852919583863, + "grad_norm": 1.2587993144989014, + "learning_rate": 0.0001600334423007773, + "loss": 1.0662, + "step": 8829 + }, + { + "epoch": 0.31622110408795456, + "grad_norm": 2.0788180828094482, + "learning_rate": 0.0001600241655971132, + "loss": 1.2279, + "step": 8830 + }, + { + "epoch": 0.3162569162175229, + "grad_norm": 1.8177601099014282, + "learning_rate": 0.0001600148880859014, + "loss": 1.1204, + "step": 8831 + }, + { + "epoch": 0.31629272834709116, + "grad_norm": 1.9333659410476685, + "learning_rate": 0.0001600056097672667, + "loss": 1.2487, + "step": 8832 + }, + { + "epoch": 0.3163285404766594, + "grad_norm": 1.3471119403839111, + "learning_rate": 0.00015999633064133392, + "loss": 0.9853, + "step": 8833 + }, + { + "epoch": 0.31636435260622775, + "grad_norm": 1.6641117334365845, + "learning_rate": 0.0001599870507082279, + "loss": 1.186, + "step": 8834 + }, + { + "epoch": 0.316400164735796, + "grad_norm": 1.2119730710983276, + "learning_rate": 0.0001599777699680735, + "loss": 1.1898, + "step": 8835 + }, + { + "epoch": 0.3164359768653643, + "grad_norm": 1.8191298246383667, + "learning_rate": 0.0001599684884209955, + "loss": 1.2184, + "step": 8836 + }, + { + "epoch": 0.31647178899493256, + "grad_norm": 1.5748666524887085, + "learning_rate": 0.00015995920606711893, + "loss": 0.9792, + "step": 8837 + }, + { + "epoch": 0.3165076011245009, + "grad_norm": 1.7372043132781982, + "learning_rate": 0.00015994992290656855, + "loss": 1.3112, + "step": 8838 + }, + { + "epoch": 0.31654341325406915, + "grad_norm": 1.3080569505691528, + "learning_rate": 0.00015994063893946928, + "loss": 1.2003, + "step": 8839 + }, + { + "epoch": 0.3165792253836374, + "grad_norm": 1.3817867040634155, + "learning_rate": 0.000159931354165946, + "loss": 1.2997, + "step": 8840 + }, + { + "epoch": 0.31661503751320574, + "grad_norm": 1.8736841678619385, + "learning_rate": 0.0001599220685861237, + "loss": 1.1917, + "step": 8841 + }, + { + "epoch": 0.316650849642774, + "grad_norm": 1.4558610916137695, + "learning_rate": 0.00015991278220012727, + "loss": 1.2522, + "step": 8842 + }, + { + "epoch": 0.3166866617723423, + "grad_norm": 1.6811654567718506, + "learning_rate": 0.00015990349500808162, + "loss": 1.148, + "step": 8843 + }, + { + "epoch": 0.31672247390191055, + "grad_norm": 1.616883635520935, + "learning_rate": 0.00015989420701011171, + "loss": 1.2287, + "step": 8844 + }, + { + "epoch": 0.3167582860314789, + "grad_norm": 1.285925030708313, + "learning_rate": 0.0001598849182063425, + "loss": 1.1621, + "step": 8845 + }, + { + "epoch": 0.31679409816104714, + "grad_norm": 1.7907902002334595, + "learning_rate": 0.00015987562859689898, + "loss": 1.1295, + "step": 8846 + }, + { + "epoch": 0.3168299102906154, + "grad_norm": 1.1794289350509644, + "learning_rate": 0.0001598663381819061, + "loss": 1.1321, + "step": 8847 + }, + { + "epoch": 0.31686572242018374, + "grad_norm": 1.3017473220825195, + "learning_rate": 0.00015985704696148885, + "loss": 1.3534, + "step": 8848 + }, + { + "epoch": 0.316901534549752, + "grad_norm": 1.4429848194122314, + "learning_rate": 0.00015984775493577225, + "loss": 1.195, + "step": 8849 + }, + { + "epoch": 0.3169373466793203, + "grad_norm": 1.744667887687683, + "learning_rate": 0.0001598384621048813, + "loss": 1.129, + "step": 8850 + }, + { + "epoch": 0.31697315880888854, + "grad_norm": 1.4070016145706177, + "learning_rate": 0.00015982916846894106, + "loss": 1.145, + "step": 8851 + }, + { + "epoch": 0.31700897093845687, + "grad_norm": 1.6887904405593872, + "learning_rate": 0.0001598198740280765, + "loss": 1.2116, + "step": 8852 + }, + { + "epoch": 0.31704478306802514, + "grad_norm": 1.4545613527297974, + "learning_rate": 0.00015981057878241273, + "loss": 0.9588, + "step": 8853 + }, + { + "epoch": 0.3170805951975934, + "grad_norm": 1.4053138494491577, + "learning_rate": 0.00015980128273207473, + "loss": 1.2218, + "step": 8854 + }, + { + "epoch": 0.31711640732716173, + "grad_norm": 1.6396347284317017, + "learning_rate": 0.00015979198587718764, + "loss": 1.2386, + "step": 8855 + }, + { + "epoch": 0.31715221945673, + "grad_norm": 1.5900300741195679, + "learning_rate": 0.00015978268821787648, + "loss": 1.4692, + "step": 8856 + }, + { + "epoch": 0.31718803158629827, + "grad_norm": 1.4821839332580566, + "learning_rate": 0.0001597733897542664, + "loss": 1.2206, + "step": 8857 + }, + { + "epoch": 0.31722384371586654, + "grad_norm": 1.3947293758392334, + "learning_rate": 0.0001597640904864824, + "loss": 1.1066, + "step": 8858 + }, + { + "epoch": 0.31725965584543486, + "grad_norm": 2.533789873123169, + "learning_rate": 0.00015975479041464974, + "loss": 1.1507, + "step": 8859 + }, + { + "epoch": 0.31729546797500313, + "grad_norm": 1.650001049041748, + "learning_rate": 0.0001597454895388934, + "loss": 1.0279, + "step": 8860 + }, + { + "epoch": 0.3173312801045714, + "grad_norm": 1.589218258857727, + "learning_rate": 0.00015973618785933858, + "loss": 1.2492, + "step": 8861 + }, + { + "epoch": 0.3173670922341397, + "grad_norm": 1.670323133468628, + "learning_rate": 0.00015972688537611038, + "loss": 1.2751, + "step": 8862 + }, + { + "epoch": 0.317402904363708, + "grad_norm": 1.5589622259140015, + "learning_rate": 0.000159717582089334, + "loss": 1.1708, + "step": 8863 + }, + { + "epoch": 0.31743871649327626, + "grad_norm": 1.6760109663009644, + "learning_rate": 0.0001597082779991346, + "loss": 1.1203, + "step": 8864 + }, + { + "epoch": 0.31747452862284453, + "grad_norm": 1.4168516397476196, + "learning_rate": 0.0001596989731056373, + "loss": 1.0827, + "step": 8865 + }, + { + "epoch": 0.31751034075241286, + "grad_norm": 1.5397920608520508, + "learning_rate": 0.00015968966740896736, + "loss": 1.3056, + "step": 8866 + }, + { + "epoch": 0.3175461528819811, + "grad_norm": 1.353535771369934, + "learning_rate": 0.0001596803609092499, + "loss": 1.0448, + "step": 8867 + }, + { + "epoch": 0.3175819650115494, + "grad_norm": 1.5065422058105469, + "learning_rate": 0.0001596710536066102, + "loss": 0.9049, + "step": 8868 + }, + { + "epoch": 0.3176177771411177, + "grad_norm": 1.5428167581558228, + "learning_rate": 0.00015966174550117342, + "loss": 1.1996, + "step": 8869 + }, + { + "epoch": 0.317653589270686, + "grad_norm": 1.5400327444076538, + "learning_rate": 0.00015965243659306482, + "loss": 1.0823, + "step": 8870 + }, + { + "epoch": 0.31768940140025426, + "grad_norm": 1.497359037399292, + "learning_rate": 0.00015964312688240967, + "loss": 1.3852, + "step": 8871 + }, + { + "epoch": 0.3177252135298225, + "grad_norm": 1.8060824871063232, + "learning_rate": 0.00015963381636933312, + "loss": 1.0172, + "step": 8872 + }, + { + "epoch": 0.31776102565939085, + "grad_norm": 1.375948190689087, + "learning_rate": 0.00015962450505396051, + "loss": 0.9863, + "step": 8873 + }, + { + "epoch": 0.3177968377889591, + "grad_norm": 1.6032425165176392, + "learning_rate": 0.00015961519293641714, + "loss": 1.0825, + "step": 8874 + }, + { + "epoch": 0.3178326499185274, + "grad_norm": 1.665005087852478, + "learning_rate": 0.0001596058800168282, + "loss": 1.1141, + "step": 8875 + }, + { + "epoch": 0.3178684620480957, + "grad_norm": 1.3346561193466187, + "learning_rate": 0.00015959656629531904, + "loss": 1.0436, + "step": 8876 + }, + { + "epoch": 0.317904274177664, + "grad_norm": 1.4879631996154785, + "learning_rate": 0.00015958725177201495, + "loss": 1.1868, + "step": 8877 + }, + { + "epoch": 0.31794008630723225, + "grad_norm": 1.5094008445739746, + "learning_rate": 0.0001595779364470413, + "loss": 1.1637, + "step": 8878 + }, + { + "epoch": 0.3179758984368005, + "grad_norm": 1.327484369277954, + "learning_rate": 0.0001595686203205233, + "loss": 1.3782, + "step": 8879 + }, + { + "epoch": 0.31801171056636884, + "grad_norm": 1.2296048402786255, + "learning_rate": 0.00015955930339258634, + "loss": 1.1632, + "step": 8880 + }, + { + "epoch": 0.3180475226959371, + "grad_norm": 1.6585893630981445, + "learning_rate": 0.00015954998566335583, + "loss": 1.4109, + "step": 8881 + }, + { + "epoch": 0.3180833348255054, + "grad_norm": 1.4640969038009644, + "learning_rate": 0.00015954066713295707, + "loss": 1.1369, + "step": 8882 + }, + { + "epoch": 0.3181191469550737, + "grad_norm": 1.4738589525222778, + "learning_rate": 0.00015953134780151543, + "loss": 1.3071, + "step": 8883 + }, + { + "epoch": 0.318154959084642, + "grad_norm": 1.6421856880187988, + "learning_rate": 0.00015952202766915627, + "loss": 1.3453, + "step": 8884 + }, + { + "epoch": 0.31819077121421024, + "grad_norm": 1.3272426128387451, + "learning_rate": 0.00015951270673600503, + "loss": 0.8391, + "step": 8885 + }, + { + "epoch": 0.3182265833437785, + "grad_norm": 1.2920825481414795, + "learning_rate": 0.0001595033850021871, + "loss": 1.0387, + "step": 8886 + }, + { + "epoch": 0.31826239547334684, + "grad_norm": 1.7031333446502686, + "learning_rate": 0.00015949406246782785, + "loss": 0.9712, + "step": 8887 + }, + { + "epoch": 0.3182982076029151, + "grad_norm": 1.5531755685806274, + "learning_rate": 0.00015948473913305274, + "loss": 1.126, + "step": 8888 + }, + { + "epoch": 0.3183340197324834, + "grad_norm": 1.6988670825958252, + "learning_rate": 0.00015947541499798721, + "loss": 1.1683, + "step": 8889 + }, + { + "epoch": 0.3183698318620517, + "grad_norm": 1.9645134210586548, + "learning_rate": 0.00015946609006275666, + "loss": 1.0811, + "step": 8890 + }, + { + "epoch": 0.31840564399161997, + "grad_norm": 1.7375531196594238, + "learning_rate": 0.0001594567643274866, + "loss": 1.2398, + "step": 8891 + }, + { + "epoch": 0.31844145612118824, + "grad_norm": 2.5537405014038086, + "learning_rate": 0.00015944743779230244, + "loss": 1.1886, + "step": 8892 + }, + { + "epoch": 0.3184772682507565, + "grad_norm": 1.238587498664856, + "learning_rate": 0.00015943811045732973, + "loss": 1.1062, + "step": 8893 + }, + { + "epoch": 0.31851308038032483, + "grad_norm": 1.971927523612976, + "learning_rate": 0.00015942878232269388, + "loss": 1.1696, + "step": 8894 + }, + { + "epoch": 0.3185488925098931, + "grad_norm": 1.4979546070098877, + "learning_rate": 0.00015941945338852044, + "loss": 0.8566, + "step": 8895 + }, + { + "epoch": 0.31858470463946137, + "grad_norm": 1.4069342613220215, + "learning_rate": 0.0001594101236549349, + "loss": 0.9965, + "step": 8896 + }, + { + "epoch": 0.3186205167690297, + "grad_norm": 1.3395401239395142, + "learning_rate": 0.00015940079312206276, + "loss": 1.0135, + "step": 8897 + }, + { + "epoch": 0.31865632889859796, + "grad_norm": 1.6094424724578857, + "learning_rate": 0.00015939146179002957, + "loss": 1.2613, + "step": 8898 + }, + { + "epoch": 0.31869214102816623, + "grad_norm": 1.4847077131271362, + "learning_rate": 0.00015938212965896088, + "loss": 1.0539, + "step": 8899 + }, + { + "epoch": 0.3187279531577345, + "grad_norm": 1.467976689338684, + "learning_rate": 0.00015937279672898223, + "loss": 1.197, + "step": 8900 + }, + { + "epoch": 0.3187637652873028, + "grad_norm": 1.3867275714874268, + "learning_rate": 0.0001593634630002192, + "loss": 1.1901, + "step": 8901 + }, + { + "epoch": 0.3187995774168711, + "grad_norm": 3.943316698074341, + "learning_rate": 0.00015935412847279735, + "loss": 1.3246, + "step": 8902 + }, + { + "epoch": 0.31883538954643936, + "grad_norm": 1.4033734798431396, + "learning_rate": 0.00015934479314684224, + "loss": 1.1253, + "step": 8903 + }, + { + "epoch": 0.3188712016760077, + "grad_norm": 1.6851963996887207, + "learning_rate": 0.00015933545702247952, + "loss": 1.2578, + "step": 8904 + }, + { + "epoch": 0.31890701380557596, + "grad_norm": 1.5943151712417603, + "learning_rate": 0.00015932612009983475, + "loss": 1.1329, + "step": 8905 + }, + { + "epoch": 0.3189428259351442, + "grad_norm": 1.9033187627792358, + "learning_rate": 0.00015931678237903353, + "loss": 1.1929, + "step": 8906 + }, + { + "epoch": 0.3189786380647125, + "grad_norm": 2.222475528717041, + "learning_rate": 0.00015930744386020152, + "loss": 1.0619, + "step": 8907 + }, + { + "epoch": 0.3190144501942808, + "grad_norm": 1.8867216110229492, + "learning_rate": 0.0001592981045434644, + "loss": 1.0946, + "step": 8908 + }, + { + "epoch": 0.3190502623238491, + "grad_norm": 1.4646399021148682, + "learning_rate": 0.0001592887644289477, + "loss": 1.1852, + "step": 8909 + }, + { + "epoch": 0.31908607445341736, + "grad_norm": 2.1436679363250732, + "learning_rate": 0.0001592794235167772, + "loss": 0.942, + "step": 8910 + }, + { + "epoch": 0.3191218865829857, + "grad_norm": 1.3684090375900269, + "learning_rate": 0.00015927008180707854, + "loss": 1.1698, + "step": 8911 + }, + { + "epoch": 0.31915769871255395, + "grad_norm": 1.8780956268310547, + "learning_rate": 0.00015926073929997735, + "loss": 1.0655, + "step": 8912 + }, + { + "epoch": 0.3191935108421222, + "grad_norm": 1.4764078855514526, + "learning_rate": 0.00015925139599559939, + "loss": 1.1629, + "step": 8913 + }, + { + "epoch": 0.3192293229716905, + "grad_norm": 1.6846532821655273, + "learning_rate": 0.0001592420518940703, + "loss": 1.0672, + "step": 8914 + }, + { + "epoch": 0.3192651351012588, + "grad_norm": 2.0349507331848145, + "learning_rate": 0.0001592327069955158, + "loss": 1.3743, + "step": 8915 + }, + { + "epoch": 0.3193009472308271, + "grad_norm": 1.2261691093444824, + "learning_rate": 0.00015922336130006162, + "loss": 1.0185, + "step": 8916 + }, + { + "epoch": 0.31933675936039535, + "grad_norm": 1.6061526536941528, + "learning_rate": 0.00015921401480783356, + "loss": 1.3409, + "step": 8917 + }, + { + "epoch": 0.3193725714899637, + "grad_norm": 1.30286705493927, + "learning_rate": 0.0001592046675189573, + "loss": 1.1234, + "step": 8918 + }, + { + "epoch": 0.31940838361953194, + "grad_norm": 1.2861350774765015, + "learning_rate": 0.00015919531943355857, + "loss": 0.9891, + "step": 8919 + }, + { + "epoch": 0.3194441957491002, + "grad_norm": 1.739784836769104, + "learning_rate": 0.0001591859705517632, + "loss": 1.153, + "step": 8920 + }, + { + "epoch": 0.3194800078786685, + "grad_norm": 2.075777530670166, + "learning_rate": 0.00015917662087369693, + "loss": 1.4032, + "step": 8921 + }, + { + "epoch": 0.3195158200082368, + "grad_norm": 1.5709234476089478, + "learning_rate": 0.0001591672703994856, + "loss": 1.0709, + "step": 8922 + }, + { + "epoch": 0.3195516321378051, + "grad_norm": 1.5477533340454102, + "learning_rate": 0.00015915791912925493, + "loss": 1.1008, + "step": 8923 + }, + { + "epoch": 0.31958744426737334, + "grad_norm": 1.703967571258545, + "learning_rate": 0.00015914856706313076, + "loss": 1.1267, + "step": 8924 + }, + { + "epoch": 0.31962325639694167, + "grad_norm": 1.5246831178665161, + "learning_rate": 0.00015913921420123892, + "loss": 1.3536, + "step": 8925 + }, + { + "epoch": 0.31965906852650994, + "grad_norm": 1.511366844177246, + "learning_rate": 0.00015912986054370524, + "loss": 1.0835, + "step": 8926 + }, + { + "epoch": 0.3196948806560782, + "grad_norm": 1.459030270576477, + "learning_rate": 0.00015912050609065556, + "loss": 0.9122, + "step": 8927 + }, + { + "epoch": 0.3197306927856465, + "grad_norm": 1.5180784463882446, + "learning_rate": 0.00015911115084221575, + "loss": 1.2414, + "step": 8928 + }, + { + "epoch": 0.3197665049152148, + "grad_norm": 1.6793153285980225, + "learning_rate": 0.00015910179479851163, + "loss": 1.1196, + "step": 8929 + }, + { + "epoch": 0.31980231704478307, + "grad_norm": 1.7419824600219727, + "learning_rate": 0.0001590924379596691, + "loss": 1.2257, + "step": 8930 + }, + { + "epoch": 0.31983812917435134, + "grad_norm": 1.7307265996932983, + "learning_rate": 0.00015908308032581406, + "loss": 1.3594, + "step": 8931 + }, + { + "epoch": 0.31987394130391966, + "grad_norm": 1.443402647972107, + "learning_rate": 0.00015907372189707237, + "loss": 1.1274, + "step": 8932 + }, + { + "epoch": 0.31990975343348793, + "grad_norm": 1.662224292755127, + "learning_rate": 0.00015906436267356993, + "loss": 1.0536, + "step": 8933 + }, + { + "epoch": 0.3199455655630562, + "grad_norm": 1.470943570137024, + "learning_rate": 0.00015905500265543272, + "loss": 1.1414, + "step": 8934 + }, + { + "epoch": 0.31998137769262447, + "grad_norm": 1.9122693538665771, + "learning_rate": 0.0001590456418427866, + "loss": 1.2132, + "step": 8935 + }, + { + "epoch": 0.3200171898221928, + "grad_norm": 1.6879210472106934, + "learning_rate": 0.00015903628023575755, + "loss": 0.9774, + "step": 8936 + }, + { + "epoch": 0.32005300195176106, + "grad_norm": 1.3634120225906372, + "learning_rate": 0.00015902691783447142, + "loss": 1.1193, + "step": 8937 + }, + { + "epoch": 0.32008881408132933, + "grad_norm": 1.4968308210372925, + "learning_rate": 0.00015901755463905434, + "loss": 1.1681, + "step": 8938 + }, + { + "epoch": 0.32012462621089766, + "grad_norm": 1.4899775981903076, + "learning_rate": 0.00015900819064963218, + "loss": 1.21, + "step": 8939 + }, + { + "epoch": 0.3201604383404659, + "grad_norm": 1.6099286079406738, + "learning_rate": 0.00015899882586633093, + "loss": 0.9, + "step": 8940 + }, + { + "epoch": 0.3201962504700342, + "grad_norm": 1.425756573677063, + "learning_rate": 0.00015898946028927656, + "loss": 1.1774, + "step": 8941 + }, + { + "epoch": 0.32023206259960246, + "grad_norm": 1.5232856273651123, + "learning_rate": 0.0001589800939185951, + "loss": 1.124, + "step": 8942 + }, + { + "epoch": 0.3202678747291708, + "grad_norm": 1.8085843324661255, + "learning_rate": 0.00015897072675441254, + "loss": 1.2086, + "step": 8943 + }, + { + "epoch": 0.32030368685873906, + "grad_norm": 2.363452196121216, + "learning_rate": 0.00015896135879685494, + "loss": 1.2344, + "step": 8944 + }, + { + "epoch": 0.3203394989883073, + "grad_norm": 1.925740361213684, + "learning_rate": 0.0001589519900460483, + "loss": 1.2579, + "step": 8945 + }, + { + "epoch": 0.32037531111787565, + "grad_norm": 1.722765326499939, + "learning_rate": 0.00015894262050211868, + "loss": 1.1914, + "step": 8946 + }, + { + "epoch": 0.3204111232474439, + "grad_norm": 1.8601047992706299, + "learning_rate": 0.00015893325016519213, + "loss": 1.1792, + "step": 8947 + }, + { + "epoch": 0.3204469353770122, + "grad_norm": 2.33085036277771, + "learning_rate": 0.0001589238790353947, + "loss": 1.2448, + "step": 8948 + }, + { + "epoch": 0.32048274750658046, + "grad_norm": 1.4069111347198486, + "learning_rate": 0.00015891450711285254, + "loss": 1.1344, + "step": 8949 + }, + { + "epoch": 0.3205185596361488, + "grad_norm": 2.1323530673980713, + "learning_rate": 0.00015890513439769164, + "loss": 1.3448, + "step": 8950 + }, + { + "epoch": 0.32055437176571705, + "grad_norm": 1.37254798412323, + "learning_rate": 0.00015889576089003814, + "loss": 1.048, + "step": 8951 + }, + { + "epoch": 0.3205901838952853, + "grad_norm": 1.5069202184677124, + "learning_rate": 0.00015888638659001815, + "loss": 1.1643, + "step": 8952 + }, + { + "epoch": 0.32062599602485364, + "grad_norm": 1.751829981803894, + "learning_rate": 0.0001588770114977578, + "loss": 1.1429, + "step": 8953 + }, + { + "epoch": 0.3206618081544219, + "grad_norm": 1.7296637296676636, + "learning_rate": 0.00015886763561338317, + "loss": 1.3244, + "step": 8954 + }, + { + "epoch": 0.3206976202839902, + "grad_norm": 1.2798609733581543, + "learning_rate": 0.00015885825893702048, + "loss": 1.2688, + "step": 8955 + }, + { + "epoch": 0.32073343241355845, + "grad_norm": 1.3441495895385742, + "learning_rate": 0.0001588488814687958, + "loss": 1.2091, + "step": 8956 + }, + { + "epoch": 0.3207692445431268, + "grad_norm": 1.478143572807312, + "learning_rate": 0.00015883950320883536, + "loss": 1.0384, + "step": 8957 + }, + { + "epoch": 0.32080505667269504, + "grad_norm": 1.554840087890625, + "learning_rate": 0.0001588301241572653, + "loss": 1.2523, + "step": 8958 + }, + { + "epoch": 0.3208408688022633, + "grad_norm": 1.3060357570648193, + "learning_rate": 0.0001588207443142118, + "loss": 1.2202, + "step": 8959 + }, + { + "epoch": 0.32087668093183164, + "grad_norm": 1.461972713470459, + "learning_rate": 0.00015881136367980103, + "loss": 1.1771, + "step": 8960 + }, + { + "epoch": 0.3209124930613999, + "grad_norm": 1.8060604333877563, + "learning_rate": 0.00015880198225415925, + "loss": 1.0542, + "step": 8961 + }, + { + "epoch": 0.3209483051909682, + "grad_norm": 1.4930371046066284, + "learning_rate": 0.00015879260003741265, + "loss": 1.322, + "step": 8962 + }, + { + "epoch": 0.32098411732053644, + "grad_norm": 1.4082540273666382, + "learning_rate": 0.00015878321702968745, + "loss": 1.2064, + "step": 8963 + }, + { + "epoch": 0.32101992945010477, + "grad_norm": 2.3617727756500244, + "learning_rate": 0.0001587738332311099, + "loss": 1.7783, + "step": 8964 + }, + { + "epoch": 0.32105574157967304, + "grad_norm": 1.4011845588684082, + "learning_rate": 0.0001587644486418062, + "loss": 1.1017, + "step": 8965 + }, + { + "epoch": 0.3210915537092413, + "grad_norm": 1.5199804306030273, + "learning_rate": 0.00015875506326190267, + "loss": 1.0178, + "step": 8966 + }, + { + "epoch": 0.32112736583880963, + "grad_norm": 1.9224541187286377, + "learning_rate": 0.00015874567709152557, + "loss": 1.0504, + "step": 8967 + }, + { + "epoch": 0.3211631779683779, + "grad_norm": 1.3849655389785767, + "learning_rate": 0.00015873629013080114, + "loss": 1.2484, + "step": 8968 + }, + { + "epoch": 0.32119899009794617, + "grad_norm": 1.7983161211013794, + "learning_rate": 0.0001587269023798557, + "loss": 1.0147, + "step": 8969 + }, + { + "epoch": 0.32123480222751444, + "grad_norm": 1.5515251159667969, + "learning_rate": 0.0001587175138388155, + "loss": 1.1273, + "step": 8970 + }, + { + "epoch": 0.32127061435708276, + "grad_norm": 1.6957802772521973, + "learning_rate": 0.00015870812450780695, + "loss": 1.1532, + "step": 8971 + }, + { + "epoch": 0.32130642648665103, + "grad_norm": 1.6496061086654663, + "learning_rate": 0.00015869873438695628, + "loss": 1.1661, + "step": 8972 + }, + { + "epoch": 0.3213422386162193, + "grad_norm": 1.2829391956329346, + "learning_rate": 0.00015868934347638985, + "loss": 1.0726, + "step": 8973 + }, + { + "epoch": 0.3213780507457876, + "grad_norm": 1.74334716796875, + "learning_rate": 0.00015867995177623403, + "loss": 1.1969, + "step": 8974 + }, + { + "epoch": 0.3214138628753559, + "grad_norm": 1.697222113609314, + "learning_rate": 0.00015867055928661517, + "loss": 1.1677, + "step": 8975 + }, + { + "epoch": 0.32144967500492416, + "grad_norm": 1.8068134784698486, + "learning_rate": 0.00015866116600765957, + "loss": 1.1338, + "step": 8976 + }, + { + "epoch": 0.32148548713449243, + "grad_norm": 1.426978588104248, + "learning_rate": 0.00015865177193949366, + "loss": 1.1253, + "step": 8977 + }, + { + "epoch": 0.32152129926406076, + "grad_norm": 1.8561643362045288, + "learning_rate": 0.0001586423770822438, + "loss": 1.1862, + "step": 8978 + }, + { + "epoch": 0.321557111393629, + "grad_norm": 1.8542108535766602, + "learning_rate": 0.0001586329814360364, + "loss": 1.295, + "step": 8979 + }, + { + "epoch": 0.3215929235231973, + "grad_norm": 1.630292296409607, + "learning_rate": 0.0001586235850009979, + "loss": 1.0108, + "step": 8980 + }, + { + "epoch": 0.3216287356527656, + "grad_norm": 1.9672024250030518, + "learning_rate": 0.00015861418777725467, + "loss": 1.1054, + "step": 8981 + }, + { + "epoch": 0.3216645477823339, + "grad_norm": 1.3546115159988403, + "learning_rate": 0.00015860478976493313, + "loss": 1.0902, + "step": 8982 + }, + { + "epoch": 0.32170035991190216, + "grad_norm": 1.577762484550476, + "learning_rate": 0.00015859539096415976, + "loss": 1.2414, + "step": 8983 + }, + { + "epoch": 0.3217361720414704, + "grad_norm": 1.5543943643569946, + "learning_rate": 0.000158585991375061, + "loss": 1.2093, + "step": 8984 + }, + { + "epoch": 0.32177198417103875, + "grad_norm": 1.5452404022216797, + "learning_rate": 0.00015857659099776327, + "loss": 1.1786, + "step": 8985 + }, + { + "epoch": 0.321807796300607, + "grad_norm": 1.550658941268921, + "learning_rate": 0.0001585671898323931, + "loss": 1.2801, + "step": 8986 + }, + { + "epoch": 0.3218436084301753, + "grad_norm": 1.4834903478622437, + "learning_rate": 0.0001585577878790769, + "loss": 0.884, + "step": 8987 + }, + { + "epoch": 0.3218794205597436, + "grad_norm": 1.4673378467559814, + "learning_rate": 0.00015854838513794118, + "loss": 1.3239, + "step": 8988 + }, + { + "epoch": 0.3219152326893119, + "grad_norm": 1.492674708366394, + "learning_rate": 0.00015853898160911252, + "loss": 1.3544, + "step": 8989 + }, + { + "epoch": 0.32195104481888015, + "grad_norm": 1.4738471508026123, + "learning_rate": 0.00015852957729271735, + "loss": 0.9398, + "step": 8990 + }, + { + "epoch": 0.3219868569484484, + "grad_norm": 1.7081282138824463, + "learning_rate": 0.00015852017218888218, + "loss": 1.0341, + "step": 8991 + }, + { + "epoch": 0.32202266907801674, + "grad_norm": 1.319769024848938, + "learning_rate": 0.0001585107662977336, + "loss": 1.2242, + "step": 8992 + }, + { + "epoch": 0.322058481207585, + "grad_norm": 1.9207274913787842, + "learning_rate": 0.00015850135961939814, + "loss": 1.0412, + "step": 8993 + }, + { + "epoch": 0.3220942933371533, + "grad_norm": 1.363855004310608, + "learning_rate": 0.00015849195215400234, + "loss": 1.3433, + "step": 8994 + }, + { + "epoch": 0.3221301054667216, + "grad_norm": 1.3902758359909058, + "learning_rate": 0.0001584825439016728, + "loss": 1.1966, + "step": 8995 + }, + { + "epoch": 0.3221659175962899, + "grad_norm": 2.098628282546997, + "learning_rate": 0.00015847313486253603, + "loss": 1.3309, + "step": 8996 + }, + { + "epoch": 0.32220172972585814, + "grad_norm": 1.337232232093811, + "learning_rate": 0.0001584637250367187, + "loss": 1.1018, + "step": 8997 + }, + { + "epoch": 0.3222375418554264, + "grad_norm": 1.7093586921691895, + "learning_rate": 0.00015845431442434733, + "loss": 1.4344, + "step": 8998 + }, + { + "epoch": 0.32227335398499474, + "grad_norm": 1.5663609504699707, + "learning_rate": 0.00015844490302554856, + "loss": 1.2499, + "step": 8999 + }, + { + "epoch": 0.322309166114563, + "grad_norm": 1.4690366983413696, + "learning_rate": 0.00015843549084044903, + "loss": 1.1887, + "step": 9000 + }, + { + "epoch": 0.3223449782441313, + "grad_norm": 1.7440780401229858, + "learning_rate": 0.0001584260778691753, + "loss": 1.1236, + "step": 9001 + }, + { + "epoch": 0.3223807903736996, + "grad_norm": 1.8427873849868774, + "learning_rate": 0.00015841666411185411, + "loss": 1.0237, + "step": 9002 + }, + { + "epoch": 0.32241660250326787, + "grad_norm": 1.2682275772094727, + "learning_rate": 0.000158407249568612, + "loss": 1.1058, + "step": 9003 + }, + { + "epoch": 0.32245241463283614, + "grad_norm": 1.4595767259597778, + "learning_rate": 0.00015839783423957576, + "loss": 1.1125, + "step": 9004 + }, + { + "epoch": 0.3224882267624044, + "grad_norm": 1.8212066888809204, + "learning_rate": 0.00015838841812487194, + "loss": 1.3275, + "step": 9005 + }, + { + "epoch": 0.32252403889197273, + "grad_norm": 1.6462815999984741, + "learning_rate": 0.00015837900122462725, + "loss": 1.0373, + "step": 9006 + }, + { + "epoch": 0.322559851021541, + "grad_norm": 1.7524150609970093, + "learning_rate": 0.00015836958353896845, + "loss": 1.0574, + "step": 9007 + }, + { + "epoch": 0.32259566315110927, + "grad_norm": 1.3968027830123901, + "learning_rate": 0.00015836016506802218, + "loss": 1.0191, + "step": 9008 + }, + { + "epoch": 0.3226314752806776, + "grad_norm": 1.8942807912826538, + "learning_rate": 0.00015835074581191516, + "loss": 1.2544, + "step": 9009 + }, + { + "epoch": 0.32266728741024586, + "grad_norm": 1.6089799404144287, + "learning_rate": 0.00015834132577077412, + "loss": 1.2585, + "step": 9010 + }, + { + "epoch": 0.32270309953981413, + "grad_norm": 1.7249996662139893, + "learning_rate": 0.00015833190494472582, + "loss": 1.3773, + "step": 9011 + }, + { + "epoch": 0.3227389116693824, + "grad_norm": 1.462255835533142, + "learning_rate": 0.00015832248333389693, + "loss": 1.2158, + "step": 9012 + }, + { + "epoch": 0.3227747237989507, + "grad_norm": 1.377328634262085, + "learning_rate": 0.00015831306093841432, + "loss": 0.9843, + "step": 9013 + }, + { + "epoch": 0.322810535928519, + "grad_norm": 2.3324873447418213, + "learning_rate": 0.00015830363775840467, + "loss": 1.0166, + "step": 9014 + }, + { + "epoch": 0.32284634805808726, + "grad_norm": 1.6809396743774414, + "learning_rate": 0.00015829421379399475, + "loss": 1.2923, + "step": 9015 + }, + { + "epoch": 0.32288216018765553, + "grad_norm": 1.3376625776290894, + "learning_rate": 0.00015828478904531142, + "loss": 1.1127, + "step": 9016 + }, + { + "epoch": 0.32291797231722386, + "grad_norm": 1.490435242652893, + "learning_rate": 0.0001582753635124814, + "loss": 0.9981, + "step": 9017 + }, + { + "epoch": 0.3229537844467921, + "grad_norm": 1.38615083694458, + "learning_rate": 0.00015826593719563156, + "loss": 1.0429, + "step": 9018 + }, + { + "epoch": 0.3229895965763604, + "grad_norm": 1.4850118160247803, + "learning_rate": 0.0001582565100948887, + "loss": 1.1825, + "step": 9019 + }, + { + "epoch": 0.3230254087059287, + "grad_norm": 1.7252081632614136, + "learning_rate": 0.00015824708221037965, + "loss": 1.1541, + "step": 9020 + }, + { + "epoch": 0.323061220835497, + "grad_norm": 1.7162225246429443, + "learning_rate": 0.0001582376535422312, + "loss": 1.0111, + "step": 9021 + }, + { + "epoch": 0.32309703296506526, + "grad_norm": 1.5967528820037842, + "learning_rate": 0.00015822822409057024, + "loss": 1.1446, + "step": 9022 + }, + { + "epoch": 0.3231328450946335, + "grad_norm": 1.2434910535812378, + "learning_rate": 0.00015821879385552367, + "loss": 1.0291, + "step": 9023 + }, + { + "epoch": 0.32316865722420185, + "grad_norm": 1.4169111251831055, + "learning_rate": 0.00015820936283721834, + "loss": 1.2374, + "step": 9024 + }, + { + "epoch": 0.3232044693537701, + "grad_norm": 1.6790285110473633, + "learning_rate": 0.00015819993103578106, + "loss": 1.2004, + "step": 9025 + }, + { + "epoch": 0.3232402814833384, + "grad_norm": 1.5349366664886475, + "learning_rate": 0.0001581904984513388, + "loss": 1.0543, + "step": 9026 + }, + { + "epoch": 0.3232760936129067, + "grad_norm": 3.728522539138794, + "learning_rate": 0.00015818106508401847, + "loss": 1.2132, + "step": 9027 + }, + { + "epoch": 0.323311905742475, + "grad_norm": 1.9048042297363281, + "learning_rate": 0.00015817163093394693, + "loss": 1.0753, + "step": 9028 + }, + { + "epoch": 0.32334771787204325, + "grad_norm": 1.3571566343307495, + "learning_rate": 0.00015816219600125114, + "loss": 1.0346, + "step": 9029 + }, + { + "epoch": 0.3233835300016115, + "grad_norm": 1.656421184539795, + "learning_rate": 0.00015815276028605807, + "loss": 0.9838, + "step": 9030 + }, + { + "epoch": 0.32341934213117984, + "grad_norm": 1.4696658849716187, + "learning_rate": 0.00015814332378849457, + "loss": 1.2245, + "step": 9031 + }, + { + "epoch": 0.3234551542607481, + "grad_norm": 1.4364594221115112, + "learning_rate": 0.00015813388650868766, + "loss": 1.0568, + "step": 9032 + }, + { + "epoch": 0.3234909663903164, + "grad_norm": 1.3598124980926514, + "learning_rate": 0.00015812444844676428, + "loss": 1.2169, + "step": 9033 + }, + { + "epoch": 0.3235267785198847, + "grad_norm": 1.9815555810928345, + "learning_rate": 0.00015811500960285143, + "loss": 1.0776, + "step": 9034 + }, + { + "epoch": 0.323562590649453, + "grad_norm": 2.162384271621704, + "learning_rate": 0.00015810556997707608, + "loss": 0.9584, + "step": 9035 + }, + { + "epoch": 0.32359840277902124, + "grad_norm": 1.7156201601028442, + "learning_rate": 0.00015809612956956527, + "loss": 0.968, + "step": 9036 + }, + { + "epoch": 0.3236342149085895, + "grad_norm": 1.506207823753357, + "learning_rate": 0.00015808668838044595, + "loss": 1.1854, + "step": 9037 + }, + { + "epoch": 0.32367002703815784, + "grad_norm": 2.043530225753784, + "learning_rate": 0.00015807724640984518, + "loss": 1.1549, + "step": 9038 + }, + { + "epoch": 0.3237058391677261, + "grad_norm": 1.6394429206848145, + "learning_rate": 0.00015806780365788998, + "loss": 1.4064, + "step": 9039 + }, + { + "epoch": 0.3237416512972944, + "grad_norm": 1.78749680519104, + "learning_rate": 0.00015805836012470733, + "loss": 1.248, + "step": 9040 + }, + { + "epoch": 0.3237774634268627, + "grad_norm": 1.8145784139633179, + "learning_rate": 0.0001580489158104244, + "loss": 1.1897, + "step": 9041 + }, + { + "epoch": 0.32381327555643097, + "grad_norm": 1.6974968910217285, + "learning_rate": 0.00015803947071516813, + "loss": 0.9918, + "step": 9042 + }, + { + "epoch": 0.32384908768599924, + "grad_norm": 1.4689418077468872, + "learning_rate": 0.00015803002483906568, + "loss": 1.2981, + "step": 9043 + }, + { + "epoch": 0.3238848998155675, + "grad_norm": 1.5928845405578613, + "learning_rate": 0.0001580205781822441, + "loss": 1.0855, + "step": 9044 + }, + { + "epoch": 0.32392071194513583, + "grad_norm": 1.1917381286621094, + "learning_rate": 0.00015801113074483046, + "loss": 1.2294, + "step": 9045 + }, + { + "epoch": 0.3239565240747041, + "grad_norm": 2.025371789932251, + "learning_rate": 0.0001580016825269519, + "loss": 1.2312, + "step": 9046 + }, + { + "epoch": 0.32399233620427237, + "grad_norm": 2.014054298400879, + "learning_rate": 0.00015799223352873555, + "loss": 1.1208, + "step": 9047 + }, + { + "epoch": 0.3240281483338407, + "grad_norm": 1.4782837629318237, + "learning_rate": 0.00015798278375030845, + "loss": 0.8936, + "step": 9048 + }, + { + "epoch": 0.32406396046340896, + "grad_norm": 1.3147625923156738, + "learning_rate": 0.0001579733331917978, + "loss": 1.1628, + "step": 9049 + }, + { + "epoch": 0.32409977259297723, + "grad_norm": 1.4679800271987915, + "learning_rate": 0.00015796388185333076, + "loss": 1.1087, + "step": 9050 + }, + { + "epoch": 0.3241355847225455, + "grad_norm": 1.6562761068344116, + "learning_rate": 0.00015795442973503442, + "loss": 1.2282, + "step": 9051 + }, + { + "epoch": 0.3241713968521138, + "grad_norm": 1.5199953317642212, + "learning_rate": 0.00015794497683703601, + "loss": 1.2886, + "step": 9052 + }, + { + "epoch": 0.3242072089816821, + "grad_norm": 1.494160532951355, + "learning_rate": 0.00015793552315946266, + "loss": 1.1601, + "step": 9053 + }, + { + "epoch": 0.32424302111125036, + "grad_norm": 2.2501134872436523, + "learning_rate": 0.00015792606870244162, + "loss": 1.2144, + "step": 9054 + }, + { + "epoch": 0.3242788332408187, + "grad_norm": 1.885783076286316, + "learning_rate": 0.0001579166134661, + "loss": 1.0624, + "step": 9055 + }, + { + "epoch": 0.32431464537038696, + "grad_norm": 1.4683514833450317, + "learning_rate": 0.00015790715745056506, + "loss": 1.2538, + "step": 9056 + }, + { + "epoch": 0.3243504574999552, + "grad_norm": 1.6246237754821777, + "learning_rate": 0.00015789770065596404, + "loss": 1.3116, + "step": 9057 + }, + { + "epoch": 0.3243862696295235, + "grad_norm": 1.4287855625152588, + "learning_rate": 0.00015788824308242408, + "loss": 1.4551, + "step": 9058 + }, + { + "epoch": 0.3244220817590918, + "grad_norm": 1.6399073600769043, + "learning_rate": 0.00015787878473007253, + "loss": 1.2677, + "step": 9059 + }, + { + "epoch": 0.3244578938886601, + "grad_norm": 2.357074737548828, + "learning_rate": 0.00015786932559903657, + "loss": 1.227, + "step": 9060 + }, + { + "epoch": 0.32449370601822836, + "grad_norm": 1.8582717180252075, + "learning_rate": 0.00015785986568944352, + "loss": 1.3111, + "step": 9061 + }, + { + "epoch": 0.3245295181477967, + "grad_norm": 1.511978030204773, + "learning_rate": 0.00015785040500142057, + "loss": 1.2402, + "step": 9062 + }, + { + "epoch": 0.32456533027736495, + "grad_norm": 1.9329071044921875, + "learning_rate": 0.00015784094353509507, + "loss": 1.1195, + "step": 9063 + }, + { + "epoch": 0.3246011424069332, + "grad_norm": 1.651551365852356, + "learning_rate": 0.00015783148129059425, + "loss": 1.1974, + "step": 9064 + }, + { + "epoch": 0.3246369545365015, + "grad_norm": 1.7226146459579468, + "learning_rate": 0.00015782201826804548, + "loss": 1.1675, + "step": 9065 + }, + { + "epoch": 0.3246727666660698, + "grad_norm": 1.4754321575164795, + "learning_rate": 0.000157812554467576, + "loss": 1.0601, + "step": 9066 + }, + { + "epoch": 0.3247085787956381, + "grad_norm": 1.5307024717330933, + "learning_rate": 0.0001578030898893132, + "loss": 1.3263, + "step": 9067 + }, + { + "epoch": 0.32474439092520635, + "grad_norm": 1.5462263822555542, + "learning_rate": 0.00015779362453338438, + "loss": 1.2954, + "step": 9068 + }, + { + "epoch": 0.3247802030547747, + "grad_norm": 1.5329277515411377, + "learning_rate": 0.0001577841583999169, + "loss": 1.1453, + "step": 9069 + }, + { + "epoch": 0.32481601518434294, + "grad_norm": 1.4288994073867798, + "learning_rate": 0.00015777469148903808, + "loss": 1.2048, + "step": 9070 + }, + { + "epoch": 0.3248518273139112, + "grad_norm": 1.4550023078918457, + "learning_rate": 0.00015776522380087532, + "loss": 1.1692, + "step": 9071 + }, + { + "epoch": 0.3248876394434795, + "grad_norm": 1.8150231838226318, + "learning_rate": 0.00015775575533555602, + "loss": 1.1757, + "step": 9072 + }, + { + "epoch": 0.3249234515730478, + "grad_norm": 1.2412525415420532, + "learning_rate": 0.0001577462860932075, + "loss": 1.0942, + "step": 9073 + }, + { + "epoch": 0.3249592637026161, + "grad_norm": 1.4400767087936401, + "learning_rate": 0.00015773681607395717, + "loss": 0.9914, + "step": 9074 + }, + { + "epoch": 0.32499507583218434, + "grad_norm": 1.8630541563034058, + "learning_rate": 0.0001577273452779325, + "loss": 1.4267, + "step": 9075 + }, + { + "epoch": 0.32503088796175267, + "grad_norm": 1.4452595710754395, + "learning_rate": 0.00015771787370526084, + "loss": 1.1588, + "step": 9076 + }, + { + "epoch": 0.32506670009132094, + "grad_norm": 1.3441641330718994, + "learning_rate": 0.0001577084013560696, + "loss": 1.0002, + "step": 9077 + }, + { + "epoch": 0.3251025122208892, + "grad_norm": 2.5139007568359375, + "learning_rate": 0.0001576989282304863, + "loss": 1.0797, + "step": 9078 + }, + { + "epoch": 0.3251383243504575, + "grad_norm": 1.4184015989303589, + "learning_rate": 0.00015768945432863835, + "loss": 1.1009, + "step": 9079 + }, + { + "epoch": 0.3251741364800258, + "grad_norm": 2.220208168029785, + "learning_rate": 0.00015767997965065322, + "loss": 1.2606, + "step": 9080 + }, + { + "epoch": 0.32520994860959407, + "grad_norm": 1.5852655172348022, + "learning_rate": 0.00015767050419665836, + "loss": 1.0937, + "step": 9081 + }, + { + "epoch": 0.32524576073916234, + "grad_norm": 1.3806978464126587, + "learning_rate": 0.00015766102796678123, + "loss": 1.2646, + "step": 9082 + }, + { + "epoch": 0.32528157286873066, + "grad_norm": 1.6033092737197876, + "learning_rate": 0.00015765155096114934, + "loss": 1.1768, + "step": 9083 + }, + { + "epoch": 0.32531738499829893, + "grad_norm": 1.4919465780258179, + "learning_rate": 0.00015764207317989023, + "loss": 0.973, + "step": 9084 + }, + { + "epoch": 0.3253531971278672, + "grad_norm": 1.5793644189834595, + "learning_rate": 0.00015763259462313136, + "loss": 1.2168, + "step": 9085 + }, + { + "epoch": 0.32538900925743547, + "grad_norm": 1.7913364171981812, + "learning_rate": 0.00015762311529100024, + "loss": 1.4612, + "step": 9086 + }, + { + "epoch": 0.3254248213870038, + "grad_norm": 1.578572154045105, + "learning_rate": 0.00015761363518362447, + "loss": 1.2359, + "step": 9087 + }, + { + "epoch": 0.32546063351657206, + "grad_norm": 1.4386789798736572, + "learning_rate": 0.00015760415430113157, + "loss": 1.2918, + "step": 9088 + }, + { + "epoch": 0.32549644564614033, + "grad_norm": 1.518807291984558, + "learning_rate": 0.00015759467264364905, + "loss": 1.3302, + "step": 9089 + }, + { + "epoch": 0.32553225777570866, + "grad_norm": 1.8602869510650635, + "learning_rate": 0.00015758519021130451, + "loss": 1.2211, + "step": 9090 + }, + { + "epoch": 0.3255680699052769, + "grad_norm": 1.4596136808395386, + "learning_rate": 0.0001575757070042255, + "loss": 1.1881, + "step": 9091 + }, + { + "epoch": 0.3256038820348452, + "grad_norm": 1.5380911827087402, + "learning_rate": 0.00015756622302253966, + "loss": 1.2237, + "step": 9092 + }, + { + "epoch": 0.32563969416441346, + "grad_norm": 1.189141869544983, + "learning_rate": 0.0001575567382663745, + "loss": 1.204, + "step": 9093 + }, + { + "epoch": 0.3256755062939818, + "grad_norm": 1.450945258140564, + "learning_rate": 0.00015754725273585767, + "loss": 1.0412, + "step": 9094 + }, + { + "epoch": 0.32571131842355006, + "grad_norm": 1.4881539344787598, + "learning_rate": 0.0001575377664311168, + "loss": 1.1331, + "step": 9095 + }, + { + "epoch": 0.3257471305531183, + "grad_norm": 1.5460001230239868, + "learning_rate": 0.00015752827935227952, + "loss": 1.0824, + "step": 9096 + }, + { + "epoch": 0.32578294268268665, + "grad_norm": 1.8696707487106323, + "learning_rate": 0.00015751879149947343, + "loss": 1.2142, + "step": 9097 + }, + { + "epoch": 0.3258187548122549, + "grad_norm": 1.4973596334457397, + "learning_rate": 0.0001575093028728262, + "loss": 0.9082, + "step": 9098 + }, + { + "epoch": 0.3258545669418232, + "grad_norm": 2.6095595359802246, + "learning_rate": 0.00015749981347246549, + "loss": 1.266, + "step": 9099 + }, + { + "epoch": 0.32589037907139146, + "grad_norm": 2.0984816551208496, + "learning_rate": 0.00015749032329851894, + "loss": 1.1032, + "step": 9100 + }, + { + "epoch": 0.3259261912009598, + "grad_norm": 1.4505112171173096, + "learning_rate": 0.00015748083235111424, + "loss": 1.0614, + "step": 9101 + }, + { + "epoch": 0.32596200333052805, + "grad_norm": 1.5790090560913086, + "learning_rate": 0.00015747134063037908, + "loss": 1.2667, + "step": 9102 + }, + { + "epoch": 0.3259978154600963, + "grad_norm": 1.5633909702301025, + "learning_rate": 0.0001574618481364412, + "loss": 1.2263, + "step": 9103 + }, + { + "epoch": 0.32603362758966464, + "grad_norm": 1.9618427753448486, + "learning_rate": 0.00015745235486942826, + "loss": 1.252, + "step": 9104 + }, + { + "epoch": 0.3260694397192329, + "grad_norm": 1.427702784538269, + "learning_rate": 0.00015744286082946797, + "loss": 1.0747, + "step": 9105 + }, + { + "epoch": 0.3261052518488012, + "grad_norm": 1.562048316001892, + "learning_rate": 0.0001574333660166881, + "loss": 1.1918, + "step": 9106 + }, + { + "epoch": 0.32614106397836945, + "grad_norm": 2.048452615737915, + "learning_rate": 0.0001574238704312164, + "loss": 1.0039, + "step": 9107 + }, + { + "epoch": 0.3261768761079378, + "grad_norm": 1.8142485618591309, + "learning_rate": 0.00015741437407318056, + "loss": 1.2515, + "step": 9108 + }, + { + "epoch": 0.32621268823750604, + "grad_norm": 1.4087649583816528, + "learning_rate": 0.00015740487694270838, + "loss": 0.9486, + "step": 9109 + }, + { + "epoch": 0.3262485003670743, + "grad_norm": 1.5805524587631226, + "learning_rate": 0.00015739537903992765, + "loss": 1.0203, + "step": 9110 + }, + { + "epoch": 0.32628431249664264, + "grad_norm": 1.4839452505111694, + "learning_rate": 0.0001573858803649661, + "loss": 1.0324, + "step": 9111 + }, + { + "epoch": 0.3263201246262109, + "grad_norm": 2.126516103744507, + "learning_rate": 0.00015737638091795157, + "loss": 1.3619, + "step": 9112 + }, + { + "epoch": 0.3263559367557792, + "grad_norm": 1.7353442907333374, + "learning_rate": 0.00015736688069901183, + "loss": 1.1316, + "step": 9113 + }, + { + "epoch": 0.32639174888534744, + "grad_norm": 1.5950394868850708, + "learning_rate": 0.00015735737970827473, + "loss": 0.9312, + "step": 9114 + }, + { + "epoch": 0.32642756101491577, + "grad_norm": 1.631212830543518, + "learning_rate": 0.00015734787794586806, + "loss": 1.1243, + "step": 9115 + }, + { + "epoch": 0.32646337314448404, + "grad_norm": 1.7835320234298706, + "learning_rate": 0.00015733837541191968, + "loss": 1.2529, + "step": 9116 + }, + { + "epoch": 0.3264991852740523, + "grad_norm": 1.5937395095825195, + "learning_rate": 0.00015732887210655742, + "loss": 1.1422, + "step": 9117 + }, + { + "epoch": 0.32653499740362063, + "grad_norm": 1.5082273483276367, + "learning_rate": 0.00015731936802990912, + "loss": 1.2603, + "step": 9118 + }, + { + "epoch": 0.3265708095331889, + "grad_norm": 2.039132833480835, + "learning_rate": 0.00015730986318210265, + "loss": 1.0463, + "step": 9119 + }, + { + "epoch": 0.32660662166275717, + "grad_norm": 1.4175186157226562, + "learning_rate": 0.00015730035756326592, + "loss": 1.3031, + "step": 9120 + }, + { + "epoch": 0.32664243379232544, + "grad_norm": 1.7756500244140625, + "learning_rate": 0.00015729085117352674, + "loss": 1.2066, + "step": 9121 + }, + { + "epoch": 0.32667824592189376, + "grad_norm": 1.7428083419799805, + "learning_rate": 0.00015728134401301312, + "loss": 0.9653, + "step": 9122 + }, + { + "epoch": 0.32671405805146203, + "grad_norm": 1.5597965717315674, + "learning_rate": 0.0001572718360818529, + "loss": 1.0483, + "step": 9123 + }, + { + "epoch": 0.3267498701810303, + "grad_norm": 1.72866690158844, + "learning_rate": 0.00015726232738017397, + "loss": 1.2172, + "step": 9124 + }, + { + "epoch": 0.3267856823105986, + "grad_norm": 1.3777672052383423, + "learning_rate": 0.00015725281790810431, + "loss": 1.1079, + "step": 9125 + }, + { + "epoch": 0.3268214944401669, + "grad_norm": 1.3351390361785889, + "learning_rate": 0.00015724330766577182, + "loss": 1.2701, + "step": 9126 + }, + { + "epoch": 0.32685730656973516, + "grad_norm": 1.707980751991272, + "learning_rate": 0.0001572337966533045, + "loss": 1.5111, + "step": 9127 + }, + { + "epoch": 0.32689311869930343, + "grad_norm": 1.592113971710205, + "learning_rate": 0.0001572242848708302, + "loss": 1.276, + "step": 9128 + }, + { + "epoch": 0.32692893082887176, + "grad_norm": 2.2302732467651367, + "learning_rate": 0.00015721477231847702, + "loss": 1.2185, + "step": 9129 + }, + { + "epoch": 0.32696474295844, + "grad_norm": 2.3419697284698486, + "learning_rate": 0.00015720525899637285, + "loss": 1.2576, + "step": 9130 + }, + { + "epoch": 0.3270005550880083, + "grad_norm": 1.7610814571380615, + "learning_rate": 0.00015719574490464573, + "loss": 1.2385, + "step": 9131 + }, + { + "epoch": 0.3270363672175766, + "grad_norm": 2.0381596088409424, + "learning_rate": 0.00015718623004342362, + "loss": 1.4105, + "step": 9132 + }, + { + "epoch": 0.3270721793471449, + "grad_norm": 1.7362427711486816, + "learning_rate": 0.00015717671441283458, + "loss": 1.2907, + "step": 9133 + }, + { + "epoch": 0.32710799147671316, + "grad_norm": 1.6149425506591797, + "learning_rate": 0.0001571671980130066, + "loss": 1.3624, + "step": 9134 + }, + { + "epoch": 0.3271438036062814, + "grad_norm": 1.3958244323730469, + "learning_rate": 0.00015715768084406765, + "loss": 1.1203, + "step": 9135 + }, + { + "epoch": 0.32717961573584975, + "grad_norm": 1.51193106174469, + "learning_rate": 0.0001571481629061459, + "loss": 1.2074, + "step": 9136 + }, + { + "epoch": 0.327215427865418, + "grad_norm": 1.6999800205230713, + "learning_rate": 0.0001571386441993693, + "loss": 1.2692, + "step": 9137 + }, + { + "epoch": 0.3272512399949863, + "grad_norm": 1.3757169246673584, + "learning_rate": 0.00015712912472386597, + "loss": 1.2789, + "step": 9138 + }, + { + "epoch": 0.3272870521245546, + "grad_norm": 1.3244624137878418, + "learning_rate": 0.00015711960447976393, + "loss": 1.395, + "step": 9139 + }, + { + "epoch": 0.3273228642541229, + "grad_norm": 1.4161608219146729, + "learning_rate": 0.0001571100834671913, + "loss": 1.0767, + "step": 9140 + }, + { + "epoch": 0.32735867638369115, + "grad_norm": 1.3522825241088867, + "learning_rate": 0.00015710056168627618, + "loss": 1.2521, + "step": 9141 + }, + { + "epoch": 0.3273944885132594, + "grad_norm": 2.0328657627105713, + "learning_rate": 0.00015709103913714664, + "loss": 1.1575, + "step": 9142 + }, + { + "epoch": 0.32743030064282774, + "grad_norm": 1.334712266921997, + "learning_rate": 0.0001570815158199308, + "loss": 1.0086, + "step": 9143 + }, + { + "epoch": 0.327466112772396, + "grad_norm": 1.6248427629470825, + "learning_rate": 0.00015707199173475682, + "loss": 1.0762, + "step": 9144 + }, + { + "epoch": 0.3275019249019643, + "grad_norm": 2.5207884311676025, + "learning_rate": 0.00015706246688175282, + "loss": 0.9659, + "step": 9145 + }, + { + "epoch": 0.3275377370315326, + "grad_norm": 1.4467089176177979, + "learning_rate": 0.00015705294126104692, + "loss": 1.062, + "step": 9146 + }, + { + "epoch": 0.3275735491611009, + "grad_norm": 1.352792501449585, + "learning_rate": 0.00015704341487276726, + "loss": 0.9607, + "step": 9147 + }, + { + "epoch": 0.32760936129066914, + "grad_norm": 1.5811479091644287, + "learning_rate": 0.00015703388771704205, + "loss": 1.2859, + "step": 9148 + }, + { + "epoch": 0.3276451734202374, + "grad_norm": 1.6747124195098877, + "learning_rate": 0.00015702435979399946, + "loss": 1.1547, + "step": 9149 + }, + { + "epoch": 0.32768098554980574, + "grad_norm": 1.3666020631790161, + "learning_rate": 0.00015701483110376762, + "loss": 1.0985, + "step": 9150 + }, + { + "epoch": 0.327716797679374, + "grad_norm": 1.7258838415145874, + "learning_rate": 0.00015700530164647485, + "loss": 1.1541, + "step": 9151 + }, + { + "epoch": 0.3277526098089423, + "grad_norm": 2.2083230018615723, + "learning_rate": 0.00015699577142224924, + "loss": 1.1089, + "step": 9152 + }, + { + "epoch": 0.3277884219385106, + "grad_norm": 1.7268062829971313, + "learning_rate": 0.000156986240431219, + "loss": 1.1368, + "step": 9153 + }, + { + "epoch": 0.32782423406807887, + "grad_norm": 1.4487242698669434, + "learning_rate": 0.00015697670867351247, + "loss": 1.1807, + "step": 9154 + }, + { + "epoch": 0.32786004619764714, + "grad_norm": 1.7326481342315674, + "learning_rate": 0.0001569671761492578, + "loss": 1.303, + "step": 9155 + }, + { + "epoch": 0.3278958583272154, + "grad_norm": 1.5618234872817993, + "learning_rate": 0.00015695764285858323, + "loss": 1.2741, + "step": 9156 + }, + { + "epoch": 0.32793167045678373, + "grad_norm": 1.3108497858047485, + "learning_rate": 0.00015694810880161706, + "loss": 1.0064, + "step": 9157 + }, + { + "epoch": 0.327967482586352, + "grad_norm": 1.5650267601013184, + "learning_rate": 0.00015693857397848756, + "loss": 1.2195, + "step": 9158 + }, + { + "epoch": 0.32800329471592027, + "grad_norm": 2.1292624473571777, + "learning_rate": 0.00015692903838932299, + "loss": 1.1606, + "step": 9159 + }, + { + "epoch": 0.3280391068454886, + "grad_norm": 1.9834003448486328, + "learning_rate": 0.00015691950203425162, + "loss": 1.1577, + "step": 9160 + }, + { + "epoch": 0.32807491897505686, + "grad_norm": 1.51247239112854, + "learning_rate": 0.0001569099649134018, + "loss": 1.0177, + "step": 9161 + }, + { + "epoch": 0.32811073110462513, + "grad_norm": 1.5422078371047974, + "learning_rate": 0.0001569004270269018, + "loss": 1.1461, + "step": 9162 + }, + { + "epoch": 0.3281465432341934, + "grad_norm": 1.5874481201171875, + "learning_rate": 0.00015689088837487995, + "loss": 1.2816, + "step": 9163 + }, + { + "epoch": 0.3281823553637617, + "grad_norm": 1.2615132331848145, + "learning_rate": 0.00015688134895746459, + "loss": 1.1358, + "step": 9164 + }, + { + "epoch": 0.32821816749333, + "grad_norm": 1.8630238771438599, + "learning_rate": 0.000156871808774784, + "loss": 1.3081, + "step": 9165 + }, + { + "epoch": 0.32825397962289826, + "grad_norm": 1.6203070878982544, + "learning_rate": 0.00015686226782696662, + "loss": 1.2543, + "step": 9166 + }, + { + "epoch": 0.3282897917524666, + "grad_norm": 1.8827050924301147, + "learning_rate": 0.0001568527261141408, + "loss": 1.0928, + "step": 9167 + }, + { + "epoch": 0.32832560388203486, + "grad_norm": 1.4021246433258057, + "learning_rate": 0.00015684318363643485, + "loss": 1.1888, + "step": 9168 + }, + { + "epoch": 0.3283614160116031, + "grad_norm": 1.618639349937439, + "learning_rate": 0.0001568336403939772, + "loss": 1.1636, + "step": 9169 + }, + { + "epoch": 0.3283972281411714, + "grad_norm": 1.385210633277893, + "learning_rate": 0.00015682409638689623, + "loss": 1.2533, + "step": 9170 + }, + { + "epoch": 0.3284330402707397, + "grad_norm": 1.379796028137207, + "learning_rate": 0.00015681455161532034, + "loss": 1.0546, + "step": 9171 + }, + { + "epoch": 0.328468852400308, + "grad_norm": 1.365638256072998, + "learning_rate": 0.00015680500607937793, + "loss": 0.9647, + "step": 9172 + }, + { + "epoch": 0.32850466452987626, + "grad_norm": 1.6585071086883545, + "learning_rate": 0.00015679545977919745, + "loss": 1.2998, + "step": 9173 + }, + { + "epoch": 0.3285404766594446, + "grad_norm": 1.264241099357605, + "learning_rate": 0.0001567859127149073, + "loss": 1.2602, + "step": 9174 + }, + { + "epoch": 0.32857628878901285, + "grad_norm": 1.4172636270523071, + "learning_rate": 0.00015677636488663595, + "loss": 1.1842, + "step": 9175 + }, + { + "epoch": 0.3286121009185811, + "grad_norm": 1.537340521812439, + "learning_rate": 0.00015676681629451185, + "loss": 1.2976, + "step": 9176 + }, + { + "epoch": 0.3286479130481494, + "grad_norm": 1.4157925844192505, + "learning_rate": 0.0001567572669386635, + "loss": 1.2516, + "step": 9177 + }, + { + "epoch": 0.3286837251777177, + "grad_norm": 1.9174137115478516, + "learning_rate": 0.0001567477168192193, + "loss": 1.425, + "step": 9178 + }, + { + "epoch": 0.328719537307286, + "grad_norm": 1.4906309843063354, + "learning_rate": 0.00015673816593630776, + "loss": 1.0114, + "step": 9179 + }, + { + "epoch": 0.32875534943685425, + "grad_norm": 1.5058034658432007, + "learning_rate": 0.00015672861429005737, + "loss": 1.116, + "step": 9180 + }, + { + "epoch": 0.3287911615664226, + "grad_norm": 1.5623418092727661, + "learning_rate": 0.00015671906188059672, + "loss": 1.2202, + "step": 9181 + }, + { + "epoch": 0.32882697369599084, + "grad_norm": 1.5180778503417969, + "learning_rate": 0.0001567095087080542, + "loss": 1.277, + "step": 9182 + }, + { + "epoch": 0.3288627858255591, + "grad_norm": 1.527577519416809, + "learning_rate": 0.00015669995477255838, + "loss": 1.3769, + "step": 9183 + }, + { + "epoch": 0.3288985979551274, + "grad_norm": 1.8552333116531372, + "learning_rate": 0.00015669040007423784, + "loss": 1.23, + "step": 9184 + }, + { + "epoch": 0.3289344100846957, + "grad_norm": 1.6632142066955566, + "learning_rate": 0.00015668084461322108, + "loss": 1.2374, + "step": 9185 + }, + { + "epoch": 0.328970222214264, + "grad_norm": 1.567264437675476, + "learning_rate": 0.00015667128838963668, + "loss": 1.2993, + "step": 9186 + }, + { + "epoch": 0.32900603434383224, + "grad_norm": 1.7490872144699097, + "learning_rate": 0.00015666173140361315, + "loss": 1.3812, + "step": 9187 + }, + { + "epoch": 0.32904184647340057, + "grad_norm": 1.8086212873458862, + "learning_rate": 0.00015665217365527917, + "loss": 1.3988, + "step": 9188 + }, + { + "epoch": 0.32907765860296884, + "grad_norm": 2.008340358734131, + "learning_rate": 0.00015664261514476322, + "loss": 1.1083, + "step": 9189 + }, + { + "epoch": 0.3291134707325371, + "grad_norm": 1.6551111936569214, + "learning_rate": 0.00015663305587219396, + "loss": 1.0844, + "step": 9190 + }, + { + "epoch": 0.3291492828621054, + "grad_norm": 1.4958890676498413, + "learning_rate": 0.00015662349583770002, + "loss": 1.3042, + "step": 9191 + }, + { + "epoch": 0.3291850949916737, + "grad_norm": 1.7695177793502808, + "learning_rate": 0.00015661393504140994, + "loss": 1.0895, + "step": 9192 + }, + { + "epoch": 0.32922090712124197, + "grad_norm": 1.421471357345581, + "learning_rate": 0.0001566043734834524, + "loss": 1.3912, + "step": 9193 + }, + { + "epoch": 0.32925671925081024, + "grad_norm": 1.7663644552230835, + "learning_rate": 0.00015659481116395604, + "loss": 1.1741, + "step": 9194 + }, + { + "epoch": 0.32929253138037856, + "grad_norm": 2.686823606491089, + "learning_rate": 0.0001565852480830495, + "loss": 1.0144, + "step": 9195 + }, + { + "epoch": 0.32932834350994683, + "grad_norm": 1.8893334865570068, + "learning_rate": 0.00015657568424086145, + "loss": 1.3837, + "step": 9196 + }, + { + "epoch": 0.3293641556395151, + "grad_norm": 1.3086657524108887, + "learning_rate": 0.0001565661196375205, + "loss": 1.0142, + "step": 9197 + }, + { + "epoch": 0.32939996776908337, + "grad_norm": 1.6397119760513306, + "learning_rate": 0.00015655655427315542, + "loss": 1.1598, + "step": 9198 + }, + { + "epoch": 0.3294357798986517, + "grad_norm": 1.563028335571289, + "learning_rate": 0.00015654698814789484, + "loss": 1.2287, + "step": 9199 + }, + { + "epoch": 0.32947159202821996, + "grad_norm": 2.287395477294922, + "learning_rate": 0.00015653742126186745, + "loss": 1.2611, + "step": 9200 + }, + { + "epoch": 0.32950740415778823, + "grad_norm": 1.6842126846313477, + "learning_rate": 0.00015652785361520204, + "loss": 1.2909, + "step": 9201 + }, + { + "epoch": 0.32954321628735656, + "grad_norm": 1.9840290546417236, + "learning_rate": 0.00015651828520802722, + "loss": 1.2931, + "step": 9202 + }, + { + "epoch": 0.3295790284169248, + "grad_norm": 1.502670168876648, + "learning_rate": 0.00015650871604047182, + "loss": 1.2821, + "step": 9203 + }, + { + "epoch": 0.3296148405464931, + "grad_norm": 1.5224722623825073, + "learning_rate": 0.0001564991461126645, + "loss": 1.3336, + "step": 9204 + }, + { + "epoch": 0.32965065267606136, + "grad_norm": 1.817708134651184, + "learning_rate": 0.00015648957542473406, + "loss": 1.1836, + "step": 9205 + }, + { + "epoch": 0.3296864648056297, + "grad_norm": 2.2494008541107178, + "learning_rate": 0.00015648000397680924, + "loss": 1.1458, + "step": 9206 + }, + { + "epoch": 0.32972227693519796, + "grad_norm": 1.5707820653915405, + "learning_rate": 0.00015647043176901886, + "loss": 1.1877, + "step": 9207 + }, + { + "epoch": 0.3297580890647662, + "grad_norm": 1.2705204486846924, + "learning_rate": 0.00015646085880149162, + "loss": 1.2477, + "step": 9208 + }, + { + "epoch": 0.3297939011943345, + "grad_norm": 1.2885280847549438, + "learning_rate": 0.00015645128507435637, + "loss": 0.9946, + "step": 9209 + }, + { + "epoch": 0.3298297133239028, + "grad_norm": 1.6687562465667725, + "learning_rate": 0.00015644171058774192, + "loss": 1.2067, + "step": 9210 + }, + { + "epoch": 0.3298655254534711, + "grad_norm": 1.3993239402770996, + "learning_rate": 0.000156432135341777, + "loss": 1.2082, + "step": 9211 + }, + { + "epoch": 0.32990133758303936, + "grad_norm": 1.4217232465744019, + "learning_rate": 0.00015642255933659053, + "loss": 1.1211, + "step": 9212 + }, + { + "epoch": 0.3299371497126077, + "grad_norm": 1.5589687824249268, + "learning_rate": 0.0001564129825723113, + "loss": 1.1039, + "step": 9213 + }, + { + "epoch": 0.32997296184217595, + "grad_norm": 2.020745277404785, + "learning_rate": 0.00015640340504906818, + "loss": 1.326, + "step": 9214 + }, + { + "epoch": 0.3300087739717442, + "grad_norm": 1.6998833417892456, + "learning_rate": 0.00015639382676698997, + "loss": 1.3637, + "step": 9215 + }, + { + "epoch": 0.3300445861013125, + "grad_norm": 1.6998929977416992, + "learning_rate": 0.00015638424772620554, + "loss": 1.5077, + "step": 9216 + }, + { + "epoch": 0.3300803982308808, + "grad_norm": 1.8840709924697876, + "learning_rate": 0.00015637466792684383, + "loss": 1.1423, + "step": 9217 + }, + { + "epoch": 0.3301162103604491, + "grad_norm": 1.5180296897888184, + "learning_rate": 0.00015636508736903366, + "loss": 1.0051, + "step": 9218 + }, + { + "epoch": 0.33015202249001735, + "grad_norm": 1.327406644821167, + "learning_rate": 0.00015635550605290396, + "loss": 1.2427, + "step": 9219 + }, + { + "epoch": 0.3301878346195857, + "grad_norm": 1.33887779712677, + "learning_rate": 0.00015634592397858362, + "loss": 1.0121, + "step": 9220 + }, + { + "epoch": 0.33022364674915394, + "grad_norm": 2.4319612979888916, + "learning_rate": 0.00015633634114620154, + "loss": 1.53, + "step": 9221 + }, + { + "epoch": 0.3302594588787222, + "grad_norm": 2.017099380493164, + "learning_rate": 0.00015632675755588668, + "loss": 1.5224, + "step": 9222 + }, + { + "epoch": 0.3302952710082905, + "grad_norm": 1.373799204826355, + "learning_rate": 0.00015631717320776795, + "loss": 0.8609, + "step": 9223 + }, + { + "epoch": 0.3303310831378588, + "grad_norm": 1.481022834777832, + "learning_rate": 0.00015630758810197427, + "loss": 1.2914, + "step": 9224 + }, + { + "epoch": 0.3303668952674271, + "grad_norm": 1.812497615814209, + "learning_rate": 0.00015629800223863465, + "loss": 1.1333, + "step": 9225 + }, + { + "epoch": 0.33040270739699534, + "grad_norm": 1.251869559288025, + "learning_rate": 0.000156288415617878, + "loss": 1.1081, + "step": 9226 + }, + { + "epoch": 0.33043851952656367, + "grad_norm": 1.5333133935928345, + "learning_rate": 0.00015627882823983336, + "loss": 1.1922, + "step": 9227 + }, + { + "epoch": 0.33047433165613194, + "grad_norm": 2.310601234436035, + "learning_rate": 0.00015626924010462968, + "loss": 1.3528, + "step": 9228 + }, + { + "epoch": 0.3305101437857002, + "grad_norm": 1.5529359579086304, + "learning_rate": 0.00015625965121239592, + "loss": 1.4144, + "step": 9229 + }, + { + "epoch": 0.3305459559152685, + "grad_norm": 2.011582374572754, + "learning_rate": 0.00015625006156326117, + "loss": 1.3806, + "step": 9230 + }, + { + "epoch": 0.3305817680448368, + "grad_norm": 1.920530080795288, + "learning_rate": 0.00015624047115735435, + "loss": 1.1355, + "step": 9231 + }, + { + "epoch": 0.33061758017440507, + "grad_norm": 2.5588765144348145, + "learning_rate": 0.00015623087999480458, + "loss": 1.2847, + "step": 9232 + }, + { + "epoch": 0.33065339230397334, + "grad_norm": 1.4218058586120605, + "learning_rate": 0.00015622128807574081, + "loss": 1.2685, + "step": 9233 + }, + { + "epoch": 0.33068920443354166, + "grad_norm": 1.5790073871612549, + "learning_rate": 0.00015621169540029216, + "loss": 1.0246, + "step": 9234 + }, + { + "epoch": 0.33072501656310993, + "grad_norm": 2.2380361557006836, + "learning_rate": 0.00015620210196858763, + "loss": 1.1446, + "step": 9235 + }, + { + "epoch": 0.3307608286926782, + "grad_norm": 1.3722422122955322, + "learning_rate": 0.00015619250778075634, + "loss": 1.2068, + "step": 9236 + }, + { + "epoch": 0.33079664082224647, + "grad_norm": 1.345026969909668, + "learning_rate": 0.00015618291283692735, + "loss": 1.1098, + "step": 9237 + }, + { + "epoch": 0.3308324529518148, + "grad_norm": 1.4090304374694824, + "learning_rate": 0.0001561733171372297, + "loss": 1.0838, + "step": 9238 + }, + { + "epoch": 0.33086826508138306, + "grad_norm": 1.1256980895996094, + "learning_rate": 0.00015616372068179255, + "loss": 1.0045, + "step": 9239 + }, + { + "epoch": 0.33090407721095133, + "grad_norm": 1.722213864326477, + "learning_rate": 0.00015615412347074498, + "loss": 1.105, + "step": 9240 + }, + { + "epoch": 0.33093988934051966, + "grad_norm": 2.7950096130371094, + "learning_rate": 0.0001561445255042161, + "loss": 1.053, + "step": 9241 + }, + { + "epoch": 0.3309757014700879, + "grad_norm": 1.5162198543548584, + "learning_rate": 0.00015613492678233509, + "loss": 1.1632, + "step": 9242 + }, + { + "epoch": 0.3310115135996562, + "grad_norm": 1.640545129776001, + "learning_rate": 0.000156125327305231, + "loss": 1.2205, + "step": 9243 + }, + { + "epoch": 0.33104732572922446, + "grad_norm": 1.39910888671875, + "learning_rate": 0.00015611572707303307, + "loss": 1.1487, + "step": 9244 + }, + { + "epoch": 0.3310831378587928, + "grad_norm": 1.511659026145935, + "learning_rate": 0.00015610612608587035, + "loss": 1.3056, + "step": 9245 + }, + { + "epoch": 0.33111894998836106, + "grad_norm": 1.9335922002792358, + "learning_rate": 0.00015609652434387216, + "loss": 0.9813, + "step": 9246 + }, + { + "epoch": 0.3311547621179293, + "grad_norm": 1.605541706085205, + "learning_rate": 0.00015608692184716753, + "loss": 1.1152, + "step": 9247 + }, + { + "epoch": 0.33119057424749765, + "grad_norm": 1.7129889726638794, + "learning_rate": 0.00015607731859588575, + "loss": 1.0686, + "step": 9248 + }, + { + "epoch": 0.3312263863770659, + "grad_norm": 1.5592997074127197, + "learning_rate": 0.00015606771459015598, + "loss": 1.4298, + "step": 9249 + }, + { + "epoch": 0.3312621985066342, + "grad_norm": 1.8422874212265015, + "learning_rate": 0.00015605810983010743, + "loss": 1.0305, + "step": 9250 + }, + { + "epoch": 0.33129801063620246, + "grad_norm": 2.907151699066162, + "learning_rate": 0.0001560485043158693, + "loss": 1.1289, + "step": 9251 + }, + { + "epoch": 0.3313338227657708, + "grad_norm": 1.583552598953247, + "learning_rate": 0.00015603889804757085, + "loss": 1.1347, + "step": 9252 + }, + { + "epoch": 0.33136963489533905, + "grad_norm": 1.287333607673645, + "learning_rate": 0.00015602929102534132, + "loss": 1.2531, + "step": 9253 + }, + { + "epoch": 0.3314054470249073, + "grad_norm": 1.2777210474014282, + "learning_rate": 0.00015601968324930997, + "loss": 1.1048, + "step": 9254 + }, + { + "epoch": 0.33144125915447564, + "grad_norm": 1.4724643230438232, + "learning_rate": 0.000156010074719606, + "loss": 1.0008, + "step": 9255 + }, + { + "epoch": 0.3314770712840439, + "grad_norm": 1.5311708450317383, + "learning_rate": 0.00015600046543635875, + "loss": 1.0465, + "step": 9256 + }, + { + "epoch": 0.3315128834136122, + "grad_norm": 1.3519338369369507, + "learning_rate": 0.0001559908553996975, + "loss": 1.1922, + "step": 9257 + }, + { + "epoch": 0.33154869554318045, + "grad_norm": 1.730715036392212, + "learning_rate": 0.00015598124460975148, + "loss": 1.2462, + "step": 9258 + }, + { + "epoch": 0.3315845076727488, + "grad_norm": 1.7080330848693848, + "learning_rate": 0.00015597163306665002, + "loss": 0.8995, + "step": 9259 + }, + { + "epoch": 0.33162031980231704, + "grad_norm": 1.46969473361969, + "learning_rate": 0.00015596202077052245, + "loss": 0.9997, + "step": 9260 + }, + { + "epoch": 0.3316561319318853, + "grad_norm": 1.287918210029602, + "learning_rate": 0.00015595240772149803, + "loss": 0.9452, + "step": 9261 + }, + { + "epoch": 0.33169194406145364, + "grad_norm": 1.3952991962432861, + "learning_rate": 0.0001559427939197062, + "loss": 1.3378, + "step": 9262 + }, + { + "epoch": 0.3317277561910219, + "grad_norm": 1.3939646482467651, + "learning_rate": 0.0001559331793652762, + "loss": 1.0101, + "step": 9263 + }, + { + "epoch": 0.3317635683205902, + "grad_norm": 1.403964638710022, + "learning_rate": 0.00015592356405833745, + "loss": 1.1563, + "step": 9264 + }, + { + "epoch": 0.33179938045015844, + "grad_norm": 1.5628684759140015, + "learning_rate": 0.00015591394799901927, + "loss": 1.1816, + "step": 9265 + }, + { + "epoch": 0.33183519257972677, + "grad_norm": 2.3515262603759766, + "learning_rate": 0.00015590433118745106, + "loss": 1.4308, + "step": 9266 + }, + { + "epoch": 0.33187100470929504, + "grad_norm": 1.800581693649292, + "learning_rate": 0.00015589471362376217, + "loss": 1.4041, + "step": 9267 + }, + { + "epoch": 0.3319068168388633, + "grad_norm": 1.5670593976974487, + "learning_rate": 0.00015588509530808199, + "loss": 1.4362, + "step": 9268 + }, + { + "epoch": 0.33194262896843163, + "grad_norm": 1.3416303396224976, + "learning_rate": 0.00015587547624053993, + "loss": 0.965, + "step": 9269 + }, + { + "epoch": 0.3319784410979999, + "grad_norm": 1.6795955896377563, + "learning_rate": 0.00015586585642126543, + "loss": 1.2534, + "step": 9270 + }, + { + "epoch": 0.33201425322756817, + "grad_norm": 1.3987977504730225, + "learning_rate": 0.00015585623585038792, + "loss": 1.1471, + "step": 9271 + }, + { + "epoch": 0.33205006535713644, + "grad_norm": 1.7095545530319214, + "learning_rate": 0.00015584661452803676, + "loss": 1.2023, + "step": 9272 + }, + { + "epoch": 0.33208587748670476, + "grad_norm": 1.3950568437576294, + "learning_rate": 0.00015583699245434146, + "loss": 1.1234, + "step": 9273 + }, + { + "epoch": 0.33212168961627303, + "grad_norm": 1.4168286323547363, + "learning_rate": 0.00015582736962943148, + "loss": 1.2875, + "step": 9274 + }, + { + "epoch": 0.3321575017458413, + "grad_norm": 1.1917799711227417, + "learning_rate": 0.00015581774605343622, + "loss": 1.125, + "step": 9275 + }, + { + "epoch": 0.3321933138754096, + "grad_norm": 1.4621995687484741, + "learning_rate": 0.0001558081217264852, + "loss": 1.2066, + "step": 9276 + }, + { + "epoch": 0.3322291260049779, + "grad_norm": 1.246384859085083, + "learning_rate": 0.00015579849664870788, + "loss": 1.1273, + "step": 9277 + }, + { + "epoch": 0.33226493813454616, + "grad_norm": 1.4199637174606323, + "learning_rate": 0.00015578887082023373, + "loss": 1.0507, + "step": 9278 + }, + { + "epoch": 0.33230075026411443, + "grad_norm": 1.4721342325210571, + "learning_rate": 0.00015577924424119233, + "loss": 1.0731, + "step": 9279 + }, + { + "epoch": 0.33233656239368276, + "grad_norm": 1.5514739751815796, + "learning_rate": 0.00015576961691171314, + "loss": 1.141, + "step": 9280 + }, + { + "epoch": 0.332372374523251, + "grad_norm": 1.5185071229934692, + "learning_rate": 0.0001557599888319257, + "loss": 1.2855, + "step": 9281 + }, + { + "epoch": 0.3324081866528193, + "grad_norm": 1.5042825937271118, + "learning_rate": 0.00015575036000195952, + "loss": 1.0823, + "step": 9282 + }, + { + "epoch": 0.3324439987823876, + "grad_norm": 1.4924014806747437, + "learning_rate": 0.00015574073042194417, + "loss": 1.3235, + "step": 9283 + }, + { + "epoch": 0.3324798109119559, + "grad_norm": 1.4049031734466553, + "learning_rate": 0.0001557311000920092, + "loss": 1.2017, + "step": 9284 + }, + { + "epoch": 0.33251562304152416, + "grad_norm": 1.723809003829956, + "learning_rate": 0.00015572146901228414, + "loss": 1.2064, + "step": 9285 + }, + { + "epoch": 0.3325514351710924, + "grad_norm": 1.5092613697052002, + "learning_rate": 0.0001557118371828986, + "loss": 1.095, + "step": 9286 + }, + { + "epoch": 0.33258724730066075, + "grad_norm": 1.545177698135376, + "learning_rate": 0.00015570220460398216, + "loss": 1.0118, + "step": 9287 + }, + { + "epoch": 0.332623059430229, + "grad_norm": 1.296292781829834, + "learning_rate": 0.00015569257127566441, + "loss": 1.2179, + "step": 9288 + }, + { + "epoch": 0.3326588715597973, + "grad_norm": 1.7826217412948608, + "learning_rate": 0.00015568293719807493, + "loss": 1.188, + "step": 9289 + }, + { + "epoch": 0.3326946836893656, + "grad_norm": 1.483933925628662, + "learning_rate": 0.00015567330237134338, + "loss": 1.3144, + "step": 9290 + }, + { + "epoch": 0.3327304958189339, + "grad_norm": 1.3896371126174927, + "learning_rate": 0.00015566366679559937, + "loss": 1.1815, + "step": 9291 + }, + { + "epoch": 0.33276630794850215, + "grad_norm": 1.451907992362976, + "learning_rate": 0.0001556540304709725, + "loss": 1.0768, + "step": 9292 + }, + { + "epoch": 0.3328021200780704, + "grad_norm": 1.7232232093811035, + "learning_rate": 0.00015564439339759245, + "loss": 1.276, + "step": 9293 + }, + { + "epoch": 0.33283793220763874, + "grad_norm": 2.0219972133636475, + "learning_rate": 0.00015563475557558887, + "loss": 1.0907, + "step": 9294 + }, + { + "epoch": 0.332873744337207, + "grad_norm": 2.060387372970581, + "learning_rate": 0.00015562511700509138, + "loss": 1.0712, + "step": 9295 + }, + { + "epoch": 0.3329095564667753, + "grad_norm": 1.9020543098449707, + "learning_rate": 0.00015561547768622974, + "loss": 1.2308, + "step": 9296 + }, + { + "epoch": 0.3329453685963436, + "grad_norm": 1.7370940446853638, + "learning_rate": 0.00015560583761913357, + "loss": 1.0176, + "step": 9297 + }, + { + "epoch": 0.3329811807259119, + "grad_norm": 1.6287310123443604, + "learning_rate": 0.00015559619680393256, + "loss": 1.1502, + "step": 9298 + }, + { + "epoch": 0.33301699285548014, + "grad_norm": 1.566928744316101, + "learning_rate": 0.00015558655524075646, + "loss": 1.0764, + "step": 9299 + }, + { + "epoch": 0.3330528049850484, + "grad_norm": 1.6709691286087036, + "learning_rate": 0.00015557691292973494, + "loss": 1.1174, + "step": 9300 + }, + { + "epoch": 0.33308861711461674, + "grad_norm": 1.5542023181915283, + "learning_rate": 0.0001555672698709978, + "loss": 1.0328, + "step": 9301 + }, + { + "epoch": 0.333124429244185, + "grad_norm": 1.7071218490600586, + "learning_rate": 0.00015555762606467465, + "loss": 1.0963, + "step": 9302 + }, + { + "epoch": 0.3331602413737533, + "grad_norm": 1.696084976196289, + "learning_rate": 0.00015554798151089534, + "loss": 1.0411, + "step": 9303 + }, + { + "epoch": 0.3331960535033216, + "grad_norm": 1.993564248085022, + "learning_rate": 0.00015553833620978957, + "loss": 1.2059, + "step": 9304 + }, + { + "epoch": 0.33323186563288987, + "grad_norm": 1.9398750066757202, + "learning_rate": 0.00015552869016148714, + "loss": 1.4542, + "step": 9305 + }, + { + "epoch": 0.33326767776245814, + "grad_norm": 1.639435887336731, + "learning_rate": 0.0001555190433661178, + "loss": 1.1126, + "step": 9306 + }, + { + "epoch": 0.3333034898920264, + "grad_norm": 1.3819907903671265, + "learning_rate": 0.00015550939582381135, + "loss": 1.0169, + "step": 9307 + }, + { + "epoch": 0.33333930202159473, + "grad_norm": 1.694944977760315, + "learning_rate": 0.00015549974753469763, + "loss": 1.1523, + "step": 9308 + }, + { + "epoch": 0.333375114151163, + "grad_norm": 1.9754638671875, + "learning_rate": 0.00015549009849890634, + "loss": 1.119, + "step": 9309 + }, + { + "epoch": 0.33341092628073127, + "grad_norm": 1.5281800031661987, + "learning_rate": 0.0001554804487165674, + "loss": 1.2906, + "step": 9310 + }, + { + "epoch": 0.3334467384102996, + "grad_norm": 1.9392694234848022, + "learning_rate": 0.00015547079818781055, + "loss": 1.1628, + "step": 9311 + }, + { + "epoch": 0.33348255053986786, + "grad_norm": 1.7666032314300537, + "learning_rate": 0.00015546114691276567, + "loss": 1.2182, + "step": 9312 + }, + { + "epoch": 0.33351836266943613, + "grad_norm": 2.0946149826049805, + "learning_rate": 0.0001554514948915626, + "loss": 1.222, + "step": 9313 + }, + { + "epoch": 0.3335541747990044, + "grad_norm": 1.7983962297439575, + "learning_rate": 0.00015544184212433116, + "loss": 1.1966, + "step": 9314 + }, + { + "epoch": 0.3335899869285727, + "grad_norm": 1.7194751501083374, + "learning_rate": 0.00015543218861120125, + "loss": 1.0539, + "step": 9315 + }, + { + "epoch": 0.333625799058141, + "grad_norm": 1.587265968322754, + "learning_rate": 0.00015542253435230278, + "loss": 1.1129, + "step": 9316 + }, + { + "epoch": 0.33366161118770926, + "grad_norm": 1.4794853925704956, + "learning_rate": 0.0001554128793477656, + "loss": 0.9678, + "step": 9317 + }, + { + "epoch": 0.3336974233172776, + "grad_norm": 1.4458521604537964, + "learning_rate": 0.0001554032235977196, + "loss": 1.1505, + "step": 9318 + }, + { + "epoch": 0.33373323544684586, + "grad_norm": 1.7227346897125244, + "learning_rate": 0.0001553935671022947, + "loss": 1.1237, + "step": 9319 + }, + { + "epoch": 0.3337690475764141, + "grad_norm": 1.5552278757095337, + "learning_rate": 0.00015538390986162082, + "loss": 1.1466, + "step": 9320 + }, + { + "epoch": 0.3338048597059824, + "grad_norm": 1.521482229232788, + "learning_rate": 0.00015537425187582785, + "loss": 1.3718, + "step": 9321 + }, + { + "epoch": 0.3338406718355507, + "grad_norm": 1.6175791025161743, + "learning_rate": 0.00015536459314504573, + "loss": 1.3682, + "step": 9322 + }, + { + "epoch": 0.333876483965119, + "grad_norm": 1.616468071937561, + "learning_rate": 0.00015535493366940442, + "loss": 1.2386, + "step": 9323 + }, + { + "epoch": 0.33391229609468726, + "grad_norm": 1.5988117456436157, + "learning_rate": 0.0001553452734490339, + "loss": 1.1373, + "step": 9324 + }, + { + "epoch": 0.3339481082242556, + "grad_norm": 1.2833340167999268, + "learning_rate": 0.00015533561248406413, + "loss": 1.1375, + "step": 9325 + }, + { + "epoch": 0.33398392035382385, + "grad_norm": 1.6266130208969116, + "learning_rate": 0.00015532595077462507, + "loss": 1.1923, + "step": 9326 + }, + { + "epoch": 0.3340197324833921, + "grad_norm": 3.417973518371582, + "learning_rate": 0.0001553162883208467, + "loss": 1.1879, + "step": 9327 + }, + { + "epoch": 0.3340555446129604, + "grad_norm": 2.2995059490203857, + "learning_rate": 0.00015530662512285902, + "loss": 1.0513, + "step": 9328 + }, + { + "epoch": 0.3340913567425287, + "grad_norm": 1.662327527999878, + "learning_rate": 0.00015529696118079205, + "loss": 1.1415, + "step": 9329 + }, + { + "epoch": 0.334127168872097, + "grad_norm": 1.9256610870361328, + "learning_rate": 0.00015528729649477574, + "loss": 1.2166, + "step": 9330 + }, + { + "epoch": 0.33416298100166525, + "grad_norm": 1.8110530376434326, + "learning_rate": 0.00015527763106494024, + "loss": 1.0838, + "step": 9331 + }, + { + "epoch": 0.3341987931312336, + "grad_norm": 1.9739553928375244, + "learning_rate": 0.0001552679648914155, + "loss": 1.4523, + "step": 9332 + }, + { + "epoch": 0.33423460526080184, + "grad_norm": 1.3431679010391235, + "learning_rate": 0.00015525829797433157, + "loss": 1.0645, + "step": 9333 + }, + { + "epoch": 0.3342704173903701, + "grad_norm": 1.5912567377090454, + "learning_rate": 0.00015524863031381853, + "loss": 1.063, + "step": 9334 + }, + { + "epoch": 0.3343062295199384, + "grad_norm": 1.5187122821807861, + "learning_rate": 0.00015523896191000643, + "loss": 1.3147, + "step": 9335 + }, + { + "epoch": 0.3343420416495067, + "grad_norm": 1.7614284753799438, + "learning_rate": 0.00015522929276302536, + "loss": 1.1314, + "step": 9336 + }, + { + "epoch": 0.334377853779075, + "grad_norm": 1.5688053369522095, + "learning_rate": 0.0001552196228730054, + "loss": 1.0352, + "step": 9337 + }, + { + "epoch": 0.33441366590864324, + "grad_norm": 1.6132919788360596, + "learning_rate": 0.00015520995224007662, + "loss": 1.1242, + "step": 9338 + }, + { + "epoch": 0.33444947803821157, + "grad_norm": 1.631592869758606, + "learning_rate": 0.00015520028086436915, + "loss": 1.095, + "step": 9339 + }, + { + "epoch": 0.33448529016777984, + "grad_norm": 1.3491302728652954, + "learning_rate": 0.00015519060874601313, + "loss": 1.0047, + "step": 9340 + }, + { + "epoch": 0.3345211022973481, + "grad_norm": 1.4680368900299072, + "learning_rate": 0.00015518093588513863, + "loss": 1.2384, + "step": 9341 + }, + { + "epoch": 0.3345569144269164, + "grad_norm": 1.8025187253952026, + "learning_rate": 0.0001551712622818758, + "loss": 1.2253, + "step": 9342 + }, + { + "epoch": 0.3345927265564847, + "grad_norm": 1.5375769138336182, + "learning_rate": 0.00015516158793635486, + "loss": 1.0181, + "step": 9343 + }, + { + "epoch": 0.33462853868605297, + "grad_norm": 1.6001405715942383, + "learning_rate": 0.00015515191284870588, + "loss": 1.2611, + "step": 9344 + }, + { + "epoch": 0.33466435081562124, + "grad_norm": 1.6486072540283203, + "learning_rate": 0.00015514223701905904, + "loss": 1.1406, + "step": 9345 + }, + { + "epoch": 0.33470016294518956, + "grad_norm": 1.443271279335022, + "learning_rate": 0.00015513256044754457, + "loss": 1.242, + "step": 9346 + }, + { + "epoch": 0.33473597507475783, + "grad_norm": 1.767303228378296, + "learning_rate": 0.00015512288313429258, + "loss": 1.222, + "step": 9347 + }, + { + "epoch": 0.3347717872043261, + "grad_norm": 2.1369528770446777, + "learning_rate": 0.0001551132050794333, + "loss": 1.0257, + "step": 9348 + }, + { + "epoch": 0.33480759933389437, + "grad_norm": 1.693066954612732, + "learning_rate": 0.00015510352628309693, + "loss": 1.2612, + "step": 9349 + }, + { + "epoch": 0.3348434114634627, + "grad_norm": 1.546708345413208, + "learning_rate": 0.00015509384674541372, + "loss": 1.1165, + "step": 9350 + }, + { + "epoch": 0.33487922359303096, + "grad_norm": 1.4805935621261597, + "learning_rate": 0.00015508416646651385, + "loss": 1.1812, + "step": 9351 + }, + { + "epoch": 0.33491503572259923, + "grad_norm": 1.5449471473693848, + "learning_rate": 0.0001550744854465276, + "loss": 1.2692, + "step": 9352 + }, + { + "epoch": 0.33495084785216755, + "grad_norm": 1.7097676992416382, + "learning_rate": 0.00015506480368558516, + "loss": 1.2575, + "step": 9353 + }, + { + "epoch": 0.3349866599817358, + "grad_norm": 1.9258707761764526, + "learning_rate": 0.00015505512118381683, + "loss": 1.1038, + "step": 9354 + }, + { + "epoch": 0.3350224721113041, + "grad_norm": 1.5878294706344604, + "learning_rate": 0.00015504543794135284, + "loss": 1.2328, + "step": 9355 + }, + { + "epoch": 0.33505828424087236, + "grad_norm": 1.3388012647628784, + "learning_rate": 0.00015503575395832352, + "loss": 1.0977, + "step": 9356 + }, + { + "epoch": 0.3350940963704407, + "grad_norm": 1.9607563018798828, + "learning_rate": 0.00015502606923485906, + "loss": 1.084, + "step": 9357 + }, + { + "epoch": 0.33512990850000896, + "grad_norm": 1.3679319620132446, + "learning_rate": 0.00015501638377108987, + "loss": 1.1453, + "step": 9358 + }, + { + "epoch": 0.3351657206295772, + "grad_norm": 1.2702571153640747, + "learning_rate": 0.00015500669756714618, + "loss": 0.9767, + "step": 9359 + }, + { + "epoch": 0.33520153275914555, + "grad_norm": 1.9277809858322144, + "learning_rate": 0.0001549970106231583, + "loss": 1.15, + "step": 9360 + }, + { + "epoch": 0.3352373448887138, + "grad_norm": 1.6373703479766846, + "learning_rate": 0.00015498732293925667, + "loss": 1.1393, + "step": 9361 + }, + { + "epoch": 0.3352731570182821, + "grad_norm": 1.5169281959533691, + "learning_rate": 0.00015497763451557148, + "loss": 1.203, + "step": 9362 + }, + { + "epoch": 0.33530896914785036, + "grad_norm": 1.6904114484786987, + "learning_rate": 0.00015496794535223315, + "loss": 0.8999, + "step": 9363 + }, + { + "epoch": 0.3353447812774187, + "grad_norm": 1.6167004108428955, + "learning_rate": 0.000154958255449372, + "loss": 1.1594, + "step": 9364 + }, + { + "epoch": 0.33538059340698695, + "grad_norm": 1.7256909608840942, + "learning_rate": 0.00015494856480711844, + "loss": 1.0889, + "step": 9365 + }, + { + "epoch": 0.3354164055365552, + "grad_norm": 1.8583862781524658, + "learning_rate": 0.0001549388734256028, + "loss": 1.3843, + "step": 9366 + }, + { + "epoch": 0.33545221766612354, + "grad_norm": 1.5851376056671143, + "learning_rate": 0.00015492918130495547, + "loss": 1.0811, + "step": 9367 + }, + { + "epoch": 0.3354880297956918, + "grad_norm": 1.5616225004196167, + "learning_rate": 0.0001549194884453069, + "loss": 1.0475, + "step": 9368 + }, + { + "epoch": 0.3355238419252601, + "grad_norm": 1.4150042533874512, + "learning_rate": 0.00015490979484678743, + "loss": 1.0235, + "step": 9369 + }, + { + "epoch": 0.33555965405482835, + "grad_norm": 1.7521148920059204, + "learning_rate": 0.0001549001005095275, + "loss": 1.2159, + "step": 9370 + }, + { + "epoch": 0.3355954661843967, + "grad_norm": 1.387341856956482, + "learning_rate": 0.00015489040543365754, + "loss": 1.0668, + "step": 9371 + }, + { + "epoch": 0.33563127831396494, + "grad_norm": 1.537010669708252, + "learning_rate": 0.00015488070961930796, + "loss": 1.1704, + "step": 9372 + }, + { + "epoch": 0.3356670904435332, + "grad_norm": 1.3833014965057373, + "learning_rate": 0.00015487101306660924, + "loss": 1.1413, + "step": 9373 + }, + { + "epoch": 0.33570290257310154, + "grad_norm": 1.6578857898712158, + "learning_rate": 0.00015486131577569182, + "loss": 1.1004, + "step": 9374 + }, + { + "epoch": 0.3357387147026698, + "grad_norm": 1.29314386844635, + "learning_rate": 0.00015485161774668615, + "loss": 1.1385, + "step": 9375 + }, + { + "epoch": 0.3357745268322381, + "grad_norm": 1.3138821125030518, + "learning_rate": 0.00015484191897972274, + "loss": 1.1384, + "step": 9376 + }, + { + "epoch": 0.33581033896180634, + "grad_norm": 1.7955459356307983, + "learning_rate": 0.000154832219474932, + "loss": 1.0583, + "step": 9377 + }, + { + "epoch": 0.33584615109137467, + "grad_norm": 1.5514675378799438, + "learning_rate": 0.00015482251923244452, + "loss": 1.0799, + "step": 9378 + }, + { + "epoch": 0.33588196322094294, + "grad_norm": 1.344207763671875, + "learning_rate": 0.00015481281825239072, + "loss": 1.1838, + "step": 9379 + }, + { + "epoch": 0.3359177753505112, + "grad_norm": 1.3368449211120605, + "learning_rate": 0.00015480311653490124, + "loss": 1.098, + "step": 9380 + }, + { + "epoch": 0.33595358748007953, + "grad_norm": 1.4666204452514648, + "learning_rate": 0.00015479341408010643, + "loss": 1.0801, + "step": 9381 + }, + { + "epoch": 0.3359893996096478, + "grad_norm": 1.7793225049972534, + "learning_rate": 0.00015478371088813696, + "loss": 1.2998, + "step": 9382 + }, + { + "epoch": 0.33602521173921607, + "grad_norm": 1.7626646757125854, + "learning_rate": 0.0001547740069591233, + "loss": 1.2642, + "step": 9383 + }, + { + "epoch": 0.33606102386878434, + "grad_norm": 1.806421160697937, + "learning_rate": 0.00015476430229319603, + "loss": 1.1662, + "step": 9384 + }, + { + "epoch": 0.33609683599835266, + "grad_norm": 1.5555216073989868, + "learning_rate": 0.00015475459689048572, + "loss": 1.107, + "step": 9385 + }, + { + "epoch": 0.33613264812792093, + "grad_norm": 1.5963133573532104, + "learning_rate": 0.00015474489075112296, + "loss": 1.1577, + "step": 9386 + }, + { + "epoch": 0.3361684602574892, + "grad_norm": 1.4297221899032593, + "learning_rate": 0.00015473518387523825, + "loss": 1.2517, + "step": 9387 + }, + { + "epoch": 0.3362042723870575, + "grad_norm": 1.8961101770401, + "learning_rate": 0.0001547254762629623, + "loss": 1.1872, + "step": 9388 + }, + { + "epoch": 0.3362400845166258, + "grad_norm": 1.432710886001587, + "learning_rate": 0.00015471576791442564, + "loss": 1.1564, + "step": 9389 + }, + { + "epoch": 0.33627589664619406, + "grad_norm": 1.6677558422088623, + "learning_rate": 0.00015470605882975891, + "loss": 1.1145, + "step": 9390 + }, + { + "epoch": 0.33631170877576233, + "grad_norm": 1.612631916999817, + "learning_rate": 0.00015469634900909271, + "loss": 1.2259, + "step": 9391 + }, + { + "epoch": 0.33634752090533065, + "grad_norm": 1.358488917350769, + "learning_rate": 0.00015468663845255768, + "loss": 1.1938, + "step": 9392 + }, + { + "epoch": 0.3363833330348989, + "grad_norm": 1.9852408170700073, + "learning_rate": 0.0001546769271602845, + "loss": 1.1068, + "step": 9393 + }, + { + "epoch": 0.3364191451644672, + "grad_norm": 1.8587549924850464, + "learning_rate": 0.0001546672151324038, + "loss": 1.1846, + "step": 9394 + }, + { + "epoch": 0.3364549572940355, + "grad_norm": 1.4312388896942139, + "learning_rate": 0.0001546575023690462, + "loss": 1.2271, + "step": 9395 + }, + { + "epoch": 0.3364907694236038, + "grad_norm": 1.679551601409912, + "learning_rate": 0.00015464778887034242, + "loss": 0.9841, + "step": 9396 + }, + { + "epoch": 0.33652658155317206, + "grad_norm": 1.554856300354004, + "learning_rate": 0.0001546380746364231, + "loss": 1.2921, + "step": 9397 + }, + { + "epoch": 0.3365623936827403, + "grad_norm": 1.787235140800476, + "learning_rate": 0.00015462835966741903, + "loss": 0.7727, + "step": 9398 + }, + { + "epoch": 0.33659820581230865, + "grad_norm": 1.3927935361862183, + "learning_rate": 0.0001546186439634608, + "loss": 1.1804, + "step": 9399 + }, + { + "epoch": 0.3366340179418769, + "grad_norm": 1.6887887716293335, + "learning_rate": 0.0001546089275246792, + "loss": 1.2915, + "step": 9400 + }, + { + "epoch": 0.3366698300714452, + "grad_norm": 1.53842294216156, + "learning_rate": 0.00015459921035120488, + "loss": 1.1129, + "step": 9401 + }, + { + "epoch": 0.3367056422010135, + "grad_norm": 1.3175371885299683, + "learning_rate": 0.00015458949244316866, + "loss": 1.1915, + "step": 9402 + }, + { + "epoch": 0.3367414543305818, + "grad_norm": 1.478389859199524, + "learning_rate": 0.00015457977380070118, + "loss": 1.1567, + "step": 9403 + }, + { + "epoch": 0.33677726646015005, + "grad_norm": 2.039062023162842, + "learning_rate": 0.00015457005442393327, + "loss": 1.2654, + "step": 9404 + }, + { + "epoch": 0.3368130785897183, + "grad_norm": 1.5230510234832764, + "learning_rate": 0.00015456033431299567, + "loss": 0.9958, + "step": 9405 + }, + { + "epoch": 0.33684889071928664, + "grad_norm": 1.5522255897521973, + "learning_rate": 0.00015455061346801916, + "loss": 1.0864, + "step": 9406 + }, + { + "epoch": 0.3368847028488549, + "grad_norm": 1.8264758586883545, + "learning_rate": 0.00015454089188913454, + "loss": 1.1622, + "step": 9407 + }, + { + "epoch": 0.3369205149784232, + "grad_norm": 1.950891137123108, + "learning_rate": 0.00015453116957647254, + "loss": 1.1106, + "step": 9408 + }, + { + "epoch": 0.33695632710799145, + "grad_norm": 1.4607633352279663, + "learning_rate": 0.00015452144653016397, + "loss": 1.2456, + "step": 9409 + }, + { + "epoch": 0.3369921392375598, + "grad_norm": 1.5765186548233032, + "learning_rate": 0.0001545117227503397, + "loss": 1.1463, + "step": 9410 + }, + { + "epoch": 0.33702795136712804, + "grad_norm": 1.4701000452041626, + "learning_rate": 0.00015450199823713047, + "loss": 1.0194, + "step": 9411 + }, + { + "epoch": 0.3370637634966963, + "grad_norm": 3.5960209369659424, + "learning_rate": 0.0001544922729906672, + "loss": 1.021, + "step": 9412 + }, + { + "epoch": 0.33709957562626464, + "grad_norm": 1.3440697193145752, + "learning_rate": 0.00015448254701108067, + "loss": 1.0972, + "step": 9413 + }, + { + "epoch": 0.3371353877558329, + "grad_norm": 1.3863211870193481, + "learning_rate": 0.00015447282029850174, + "loss": 0.9779, + "step": 9414 + }, + { + "epoch": 0.3371711998854012, + "grad_norm": 1.6555179357528687, + "learning_rate": 0.00015446309285306131, + "loss": 1.2342, + "step": 9415 + }, + { + "epoch": 0.33720701201496944, + "grad_norm": 1.4641039371490479, + "learning_rate": 0.0001544533646748902, + "loss": 1.0625, + "step": 9416 + }, + { + "epoch": 0.33724282414453777, + "grad_norm": 1.3416274785995483, + "learning_rate": 0.00015444363576411929, + "loss": 1.0517, + "step": 9417 + }, + { + "epoch": 0.33727863627410604, + "grad_norm": 1.819218635559082, + "learning_rate": 0.00015443390612087952, + "loss": 1.0384, + "step": 9418 + }, + { + "epoch": 0.3373144484036743, + "grad_norm": 1.8136471509933472, + "learning_rate": 0.00015442417574530173, + "loss": 1.2006, + "step": 9419 + }, + { + "epoch": 0.33735026053324263, + "grad_norm": 1.4833272695541382, + "learning_rate": 0.00015441444463751687, + "loss": 1.231, + "step": 9420 + }, + { + "epoch": 0.3373860726628109, + "grad_norm": 1.7020957469940186, + "learning_rate": 0.00015440471279765583, + "loss": 1.2015, + "step": 9421 + }, + { + "epoch": 0.33742188479237917, + "grad_norm": 1.300756812095642, + "learning_rate": 0.00015439498022584957, + "loss": 1.1035, + "step": 9422 + }, + { + "epoch": 0.33745769692194744, + "grad_norm": 1.8729486465454102, + "learning_rate": 0.00015438524692222902, + "loss": 1.2044, + "step": 9423 + }, + { + "epoch": 0.33749350905151576, + "grad_norm": 1.4103506803512573, + "learning_rate": 0.00015437551288692512, + "loss": 1.1139, + "step": 9424 + }, + { + "epoch": 0.33752932118108403, + "grad_norm": 1.6412907838821411, + "learning_rate": 0.00015436577812006884, + "loss": 1.1157, + "step": 9425 + }, + { + "epoch": 0.3375651333106523, + "grad_norm": 2.1418135166168213, + "learning_rate": 0.00015435604262179116, + "loss": 1.0001, + "step": 9426 + }, + { + "epoch": 0.3376009454402206, + "grad_norm": 1.7714242935180664, + "learning_rate": 0.000154346306392223, + "loss": 1.1041, + "step": 9427 + }, + { + "epoch": 0.3376367575697889, + "grad_norm": 1.6712678670883179, + "learning_rate": 0.00015433656943149543, + "loss": 1.3295, + "step": 9428 + }, + { + "epoch": 0.33767256969935716, + "grad_norm": 1.50557279586792, + "learning_rate": 0.00015432683173973935, + "loss": 1.3105, + "step": 9429 + }, + { + "epoch": 0.33770838182892543, + "grad_norm": 1.5158710479736328, + "learning_rate": 0.0001543170933170859, + "loss": 1.1904, + "step": 9430 + }, + { + "epoch": 0.33774419395849375, + "grad_norm": 1.9119740724563599, + "learning_rate": 0.00015430735416366596, + "loss": 1.4095, + "step": 9431 + }, + { + "epoch": 0.337780006088062, + "grad_norm": 1.3410300016403198, + "learning_rate": 0.00015429761427961065, + "loss": 1.1233, + "step": 9432 + }, + { + "epoch": 0.3378158182176303, + "grad_norm": 1.5688095092773438, + "learning_rate": 0.00015428787366505094, + "loss": 1.1585, + "step": 9433 + }, + { + "epoch": 0.3378516303471986, + "grad_norm": 1.9656267166137695, + "learning_rate": 0.00015427813232011799, + "loss": 1.2528, + "step": 9434 + }, + { + "epoch": 0.3378874424767669, + "grad_norm": 2.0548596382141113, + "learning_rate": 0.00015426839024494272, + "loss": 1.2172, + "step": 9435 + }, + { + "epoch": 0.33792325460633515, + "grad_norm": 1.6359905004501343, + "learning_rate": 0.0001542586474396563, + "loss": 1.0839, + "step": 9436 + }, + { + "epoch": 0.3379590667359034, + "grad_norm": 2.2203712463378906, + "learning_rate": 0.00015424890390438974, + "loss": 1.279, + "step": 9437 + }, + { + "epoch": 0.33799487886547175, + "grad_norm": 1.6484078168869019, + "learning_rate": 0.00015423915963927418, + "loss": 1.0484, + "step": 9438 + }, + { + "epoch": 0.33803069099504, + "grad_norm": 1.2192832231521606, + "learning_rate": 0.00015422941464444064, + "loss": 1.1858, + "step": 9439 + }, + { + "epoch": 0.3380665031246083, + "grad_norm": 1.610754132270813, + "learning_rate": 0.00015421966892002032, + "loss": 1.1922, + "step": 9440 + }, + { + "epoch": 0.3381023152541766, + "grad_norm": 1.7944527864456177, + "learning_rate": 0.00015420992246614428, + "loss": 1.167, + "step": 9441 + }, + { + "epoch": 0.3381381273837449, + "grad_norm": 1.3404579162597656, + "learning_rate": 0.00015420017528294368, + "loss": 1.0803, + "step": 9442 + }, + { + "epoch": 0.33817393951331315, + "grad_norm": 1.7292122840881348, + "learning_rate": 0.00015419042737054963, + "loss": 1.3016, + "step": 9443 + }, + { + "epoch": 0.3382097516428814, + "grad_norm": 1.4143527746200562, + "learning_rate": 0.00015418067872909326, + "loss": 1.1871, + "step": 9444 + }, + { + "epoch": 0.33824556377244974, + "grad_norm": 1.3768342733383179, + "learning_rate": 0.00015417092935870574, + "loss": 1.3598, + "step": 9445 + }, + { + "epoch": 0.338281375902018, + "grad_norm": 1.7161815166473389, + "learning_rate": 0.00015416117925951827, + "loss": 0.9516, + "step": 9446 + }, + { + "epoch": 0.3383171880315863, + "grad_norm": 1.4318798780441284, + "learning_rate": 0.000154151428431662, + "loss": 1.0513, + "step": 9447 + }, + { + "epoch": 0.3383530001611546, + "grad_norm": 1.6439528465270996, + "learning_rate": 0.00015414167687526805, + "loss": 1.0948, + "step": 9448 + }, + { + "epoch": 0.3383888122907229, + "grad_norm": 1.789784550666809, + "learning_rate": 0.00015413192459046772, + "loss": 1.0864, + "step": 9449 + }, + { + "epoch": 0.33842462442029114, + "grad_norm": 1.9490666389465332, + "learning_rate": 0.00015412217157739216, + "loss": 1.0747, + "step": 9450 + }, + { + "epoch": 0.3384604365498594, + "grad_norm": 1.3676568269729614, + "learning_rate": 0.00015411241783617262, + "loss": 1.151, + "step": 9451 + }, + { + "epoch": 0.33849624867942774, + "grad_norm": 1.6286402940750122, + "learning_rate": 0.0001541026633669403, + "loss": 1.0431, + "step": 9452 + }, + { + "epoch": 0.338532060808996, + "grad_norm": 2.144502639770508, + "learning_rate": 0.0001540929081698264, + "loss": 1.0096, + "step": 9453 + }, + { + "epoch": 0.3385678729385643, + "grad_norm": 1.723382830619812, + "learning_rate": 0.00015408315224496222, + "loss": 1.1215, + "step": 9454 + }, + { + "epoch": 0.3386036850681326, + "grad_norm": 1.6695696115493774, + "learning_rate": 0.00015407339559247895, + "loss": 1.1975, + "step": 9455 + }, + { + "epoch": 0.33863949719770087, + "grad_norm": 1.660327434539795, + "learning_rate": 0.00015406363821250793, + "loss": 1.2556, + "step": 9456 + }, + { + "epoch": 0.33867530932726914, + "grad_norm": 1.6852799654006958, + "learning_rate": 0.00015405388010518038, + "loss": 1.4745, + "step": 9457 + }, + { + "epoch": 0.3387111214568374, + "grad_norm": 2.452394723892212, + "learning_rate": 0.00015404412127062762, + "loss": 1.3245, + "step": 9458 + }, + { + "epoch": 0.33874693358640573, + "grad_norm": 2.066319704055786, + "learning_rate": 0.00015403436170898088, + "loss": 1.1817, + "step": 9459 + }, + { + "epoch": 0.338782745715974, + "grad_norm": 1.3717730045318604, + "learning_rate": 0.00015402460142037154, + "loss": 1.1755, + "step": 9460 + }, + { + "epoch": 0.33881855784554227, + "grad_norm": 1.9684600830078125, + "learning_rate": 0.00015401484040493085, + "loss": 1.2175, + "step": 9461 + }, + { + "epoch": 0.3388543699751106, + "grad_norm": 1.7584764957427979, + "learning_rate": 0.00015400507866279018, + "loss": 1.2374, + "step": 9462 + }, + { + "epoch": 0.33889018210467886, + "grad_norm": 1.3895982503890991, + "learning_rate": 0.0001539953161940808, + "loss": 1.2807, + "step": 9463 + }, + { + "epoch": 0.33892599423424713, + "grad_norm": 1.770262360572815, + "learning_rate": 0.00015398555299893412, + "loss": 1.0618, + "step": 9464 + }, + { + "epoch": 0.3389618063638154, + "grad_norm": 1.4226878881454468, + "learning_rate": 0.00015397578907748146, + "loss": 1.2343, + "step": 9465 + }, + { + "epoch": 0.3389976184933837, + "grad_norm": 1.940524935722351, + "learning_rate": 0.00015396602442985417, + "loss": 1.0772, + "step": 9466 + }, + { + "epoch": 0.339033430622952, + "grad_norm": 1.398921251296997, + "learning_rate": 0.00015395625905618364, + "loss": 1.081, + "step": 9467 + }, + { + "epoch": 0.33906924275252026, + "grad_norm": 1.4468436241149902, + "learning_rate": 0.00015394649295660123, + "loss": 1.2561, + "step": 9468 + }, + { + "epoch": 0.3391050548820886, + "grad_norm": 1.3994896411895752, + "learning_rate": 0.00015393672613123836, + "loss": 1.0656, + "step": 9469 + }, + { + "epoch": 0.33914086701165685, + "grad_norm": 1.3560876846313477, + "learning_rate": 0.0001539269585802264, + "loss": 1.1937, + "step": 9470 + }, + { + "epoch": 0.3391766791412251, + "grad_norm": 1.2983362674713135, + "learning_rate": 0.0001539171903036968, + "loss": 1.2475, + "step": 9471 + }, + { + "epoch": 0.3392124912707934, + "grad_norm": 1.2887375354766846, + "learning_rate": 0.0001539074213017809, + "loss": 0.9591, + "step": 9472 + }, + { + "epoch": 0.3392483034003617, + "grad_norm": 1.5050984621047974, + "learning_rate": 0.00015389765157461022, + "loss": 0.9954, + "step": 9473 + }, + { + "epoch": 0.33928411552993, + "grad_norm": 1.3626652956008911, + "learning_rate": 0.00015388788112231615, + "loss": 1.0128, + "step": 9474 + }, + { + "epoch": 0.33931992765949825, + "grad_norm": 1.303463339805603, + "learning_rate": 0.00015387810994503016, + "loss": 1.0756, + "step": 9475 + }, + { + "epoch": 0.3393557397890666, + "grad_norm": 1.5339077711105347, + "learning_rate": 0.0001538683380428837, + "loss": 0.9802, + "step": 9476 + }, + { + "epoch": 0.33939155191863485, + "grad_norm": 1.9730784893035889, + "learning_rate": 0.00015385856541600825, + "loss": 1.0436, + "step": 9477 + }, + { + "epoch": 0.3394273640482031, + "grad_norm": 1.722670078277588, + "learning_rate": 0.00015384879206453524, + "loss": 1.1659, + "step": 9478 + }, + { + "epoch": 0.3394631761777714, + "grad_norm": 1.915269374847412, + "learning_rate": 0.00015383901798859622, + "loss": 1.1614, + "step": 9479 + }, + { + "epoch": 0.3394989883073397, + "grad_norm": 1.3942240476608276, + "learning_rate": 0.00015382924318832264, + "loss": 1.0828, + "step": 9480 + }, + { + "epoch": 0.339534800436908, + "grad_norm": 1.1679518222808838, + "learning_rate": 0.00015381946766384602, + "loss": 1.0844, + "step": 9481 + }, + { + "epoch": 0.33957061256647625, + "grad_norm": 1.4692243337631226, + "learning_rate": 0.0001538096914152979, + "loss": 1.2726, + "step": 9482 + }, + { + "epoch": 0.3396064246960446, + "grad_norm": 1.6770132780075073, + "learning_rate": 0.00015379991444280979, + "loss": 1.279, + "step": 9483 + }, + { + "epoch": 0.33964223682561284, + "grad_norm": 1.6194911003112793, + "learning_rate": 0.00015379013674651323, + "loss": 1.1871, + "step": 9484 + }, + { + "epoch": 0.3396780489551811, + "grad_norm": 1.4982327222824097, + "learning_rate": 0.00015378035832653975, + "loss": 1.4486, + "step": 9485 + }, + { + "epoch": 0.3397138610847494, + "grad_norm": 1.5968992710113525, + "learning_rate": 0.00015377057918302097, + "loss": 1.1674, + "step": 9486 + }, + { + "epoch": 0.3397496732143177, + "grad_norm": 1.3048651218414307, + "learning_rate": 0.00015376079931608838, + "loss": 1.1858, + "step": 9487 + }, + { + "epoch": 0.339785485343886, + "grad_norm": 1.4209002256393433, + "learning_rate": 0.00015375101872587357, + "loss": 1.1162, + "step": 9488 + }, + { + "epoch": 0.33982129747345424, + "grad_norm": 1.4350895881652832, + "learning_rate": 0.00015374123741250815, + "loss": 1.0724, + "step": 9489 + }, + { + "epoch": 0.33985710960302257, + "grad_norm": 1.4492056369781494, + "learning_rate": 0.00015373145537612369, + "loss": 1.212, + "step": 9490 + }, + { + "epoch": 0.33989292173259084, + "grad_norm": 1.87797212600708, + "learning_rate": 0.00015372167261685178, + "loss": 1.0229, + "step": 9491 + }, + { + "epoch": 0.3399287338621591, + "grad_norm": 2.069861888885498, + "learning_rate": 0.0001537118891348241, + "loss": 1.1803, + "step": 9492 + }, + { + "epoch": 0.3399645459917274, + "grad_norm": 1.620638370513916, + "learning_rate": 0.00015370210493017222, + "loss": 1.2012, + "step": 9493 + }, + { + "epoch": 0.3400003581212957, + "grad_norm": 1.8631258010864258, + "learning_rate": 0.00015369232000302777, + "loss": 1.2488, + "step": 9494 + }, + { + "epoch": 0.34003617025086397, + "grad_norm": 2.228572130203247, + "learning_rate": 0.00015368253435352246, + "loss": 1.0727, + "step": 9495 + }, + { + "epoch": 0.34007198238043224, + "grad_norm": 1.4529756307601929, + "learning_rate": 0.00015367274798178788, + "loss": 1.2015, + "step": 9496 + }, + { + "epoch": 0.34010779451000056, + "grad_norm": 1.5171754360198975, + "learning_rate": 0.0001536629608879557, + "loss": 1.1622, + "step": 9497 + }, + { + "epoch": 0.34014360663956883, + "grad_norm": 1.5022085905075073, + "learning_rate": 0.00015365317307215759, + "loss": 1.3407, + "step": 9498 + }, + { + "epoch": 0.3401794187691371, + "grad_norm": 1.6639324426651, + "learning_rate": 0.00015364338453452528, + "loss": 1.0873, + "step": 9499 + }, + { + "epoch": 0.34021523089870537, + "grad_norm": 1.3297526836395264, + "learning_rate": 0.00015363359527519036, + "loss": 1.2233, + "step": 9500 + }, + { + "epoch": 0.3402510430282737, + "grad_norm": 1.82257878780365, + "learning_rate": 0.00015362380529428466, + "loss": 1.2033, + "step": 9501 + }, + { + "epoch": 0.34028685515784196, + "grad_norm": 2.4764654636383057, + "learning_rate": 0.0001536140145919398, + "loss": 1.0988, + "step": 9502 + }, + { + "epoch": 0.34032266728741023, + "grad_norm": 1.5113221406936646, + "learning_rate": 0.00015360422316828754, + "loss": 1.2631, + "step": 9503 + }, + { + "epoch": 0.34035847941697855, + "grad_norm": 1.3668409585952759, + "learning_rate": 0.0001535944310234596, + "loss": 1.3073, + "step": 9504 + }, + { + "epoch": 0.3403942915465468, + "grad_norm": 1.7105666399002075, + "learning_rate": 0.0001535846381575877, + "loss": 1.2429, + "step": 9505 + }, + { + "epoch": 0.3404301036761151, + "grad_norm": 1.2715353965759277, + "learning_rate": 0.00015357484457080366, + "loss": 0.9187, + "step": 9506 + }, + { + "epoch": 0.34046591580568336, + "grad_norm": 1.2895420789718628, + "learning_rate": 0.00015356505026323917, + "loss": 1.2218, + "step": 9507 + }, + { + "epoch": 0.3405017279352517, + "grad_norm": 1.6621992588043213, + "learning_rate": 0.00015355525523502603, + "loss": 1.2637, + "step": 9508 + }, + { + "epoch": 0.34053754006481995, + "grad_norm": 1.4151612520217896, + "learning_rate": 0.00015354545948629598, + "loss": 1.1652, + "step": 9509 + }, + { + "epoch": 0.3405733521943882, + "grad_norm": 1.2661523818969727, + "learning_rate": 0.00015353566301718087, + "loss": 1.0518, + "step": 9510 + }, + { + "epoch": 0.34060916432395655, + "grad_norm": 1.3867603540420532, + "learning_rate": 0.00015352586582781247, + "loss": 1.1989, + "step": 9511 + }, + { + "epoch": 0.3406449764535248, + "grad_norm": 1.054750680923462, + "learning_rate": 0.0001535160679183226, + "loss": 1.011, + "step": 9512 + }, + { + "epoch": 0.3406807885830931, + "grad_norm": 1.3829423189163208, + "learning_rate": 0.00015350626928884307, + "loss": 1.2427, + "step": 9513 + }, + { + "epoch": 0.34071660071266135, + "grad_norm": 1.3865256309509277, + "learning_rate": 0.00015349646993950567, + "loss": 1.0689, + "step": 9514 + }, + { + "epoch": 0.3407524128422297, + "grad_norm": 1.4890847206115723, + "learning_rate": 0.00015348666987044228, + "loss": 1.1995, + "step": 9515 + }, + { + "epoch": 0.34078822497179795, + "grad_norm": 1.2949410676956177, + "learning_rate": 0.00015347686908178475, + "loss": 0.9592, + "step": 9516 + }, + { + "epoch": 0.3408240371013662, + "grad_norm": 1.3217215538024902, + "learning_rate": 0.0001534670675736649, + "loss": 0.991, + "step": 9517 + }, + { + "epoch": 0.34085984923093454, + "grad_norm": 1.2596313953399658, + "learning_rate": 0.00015345726534621466, + "loss": 1.1949, + "step": 9518 + }, + { + "epoch": 0.3408956613605028, + "grad_norm": 1.5577688217163086, + "learning_rate": 0.00015344746239956587, + "loss": 1.2558, + "step": 9519 + }, + { + "epoch": 0.3409314734900711, + "grad_norm": 1.2174620628356934, + "learning_rate": 0.00015343765873385037, + "loss": 1.3334, + "step": 9520 + }, + { + "epoch": 0.34096728561963935, + "grad_norm": 1.5602349042892456, + "learning_rate": 0.00015342785434920017, + "loss": 1.1037, + "step": 9521 + }, + { + "epoch": 0.3410030977492077, + "grad_norm": 1.6718690395355225, + "learning_rate": 0.00015341804924574707, + "loss": 1.1948, + "step": 9522 + }, + { + "epoch": 0.34103890987877594, + "grad_norm": 1.4512139558792114, + "learning_rate": 0.00015340824342362303, + "loss": 1.308, + "step": 9523 + }, + { + "epoch": 0.3410747220083442, + "grad_norm": 1.5640931129455566, + "learning_rate": 0.00015339843688295997, + "loss": 1.2221, + "step": 9524 + }, + { + "epoch": 0.34111053413791254, + "grad_norm": 2.0319972038269043, + "learning_rate": 0.00015338862962388977, + "loss": 1.044, + "step": 9525 + }, + { + "epoch": 0.3411463462674808, + "grad_norm": 1.6691817045211792, + "learning_rate": 0.0001533788216465445, + "loss": 1.1771, + "step": 9526 + }, + { + "epoch": 0.3411821583970491, + "grad_norm": 1.8015412092208862, + "learning_rate": 0.00015336901295105596, + "loss": 1.3372, + "step": 9527 + }, + { + "epoch": 0.34121797052661734, + "grad_norm": 1.9800865650177002, + "learning_rate": 0.00015335920353755627, + "loss": 1.1538, + "step": 9528 + }, + { + "epoch": 0.34125378265618567, + "grad_norm": 2.20184588432312, + "learning_rate": 0.00015334939340617726, + "loss": 1.26, + "step": 9529 + }, + { + "epoch": 0.34128959478575394, + "grad_norm": 1.4129714965820312, + "learning_rate": 0.00015333958255705102, + "loss": 1.2342, + "step": 9530 + }, + { + "epoch": 0.3413254069153222, + "grad_norm": 1.3521151542663574, + "learning_rate": 0.00015332977099030953, + "loss": 1.2808, + "step": 9531 + }, + { + "epoch": 0.34136121904489053, + "grad_norm": 1.560713291168213, + "learning_rate": 0.0001533199587060847, + "loss": 1.0222, + "step": 9532 + }, + { + "epoch": 0.3413970311744588, + "grad_norm": 1.2807600498199463, + "learning_rate": 0.0001533101457045086, + "loss": 1.18, + "step": 9533 + }, + { + "epoch": 0.34143284330402707, + "grad_norm": 1.5458533763885498, + "learning_rate": 0.0001533003319857133, + "loss": 0.9677, + "step": 9534 + }, + { + "epoch": 0.34146865543359534, + "grad_norm": 1.2134628295898438, + "learning_rate": 0.00015329051754983076, + "loss": 1.0546, + "step": 9535 + }, + { + "epoch": 0.34150446756316366, + "grad_norm": 1.6064984798431396, + "learning_rate": 0.00015328070239699305, + "loss": 1.1249, + "step": 9536 + }, + { + "epoch": 0.34154027969273193, + "grad_norm": 1.386572241783142, + "learning_rate": 0.0001532708865273322, + "loss": 1.0614, + "step": 9537 + }, + { + "epoch": 0.3415760918223002, + "grad_norm": 1.4977784156799316, + "learning_rate": 0.0001532610699409803, + "loss": 1.13, + "step": 9538 + }, + { + "epoch": 0.3416119039518685, + "grad_norm": 2.1100683212280273, + "learning_rate": 0.00015325125263806943, + "loss": 1.1423, + "step": 9539 + }, + { + "epoch": 0.3416477160814368, + "grad_norm": 1.5350663661956787, + "learning_rate": 0.0001532414346187316, + "loss": 1.2842, + "step": 9540 + }, + { + "epoch": 0.34168352821100506, + "grad_norm": 1.997750163078308, + "learning_rate": 0.000153231615883099, + "loss": 1.1698, + "step": 9541 + }, + { + "epoch": 0.34171934034057333, + "grad_norm": 2.0986523628234863, + "learning_rate": 0.0001532217964313036, + "loss": 1.1347, + "step": 9542 + }, + { + "epoch": 0.34175515247014165, + "grad_norm": 1.7596735954284668, + "learning_rate": 0.00015321197626347766, + "loss": 1.2462, + "step": 9543 + }, + { + "epoch": 0.3417909645997099, + "grad_norm": 1.3147863149642944, + "learning_rate": 0.00015320215537975313, + "loss": 1.1968, + "step": 9544 + }, + { + "epoch": 0.3418267767292782, + "grad_norm": 1.6891016960144043, + "learning_rate": 0.0001531923337802623, + "loss": 1.3102, + "step": 9545 + }, + { + "epoch": 0.3418625888588465, + "grad_norm": 1.5207985639572144, + "learning_rate": 0.0001531825114651372, + "loss": 1.1886, + "step": 9546 + }, + { + "epoch": 0.3418984009884148, + "grad_norm": 1.790817379951477, + "learning_rate": 0.00015317268843451003, + "loss": 1.1792, + "step": 9547 + }, + { + "epoch": 0.34193421311798305, + "grad_norm": 1.6169546842575073, + "learning_rate": 0.0001531628646885129, + "loss": 1.3324, + "step": 9548 + }, + { + "epoch": 0.3419700252475513, + "grad_norm": 1.5700955390930176, + "learning_rate": 0.000153153040227278, + "loss": 1.0746, + "step": 9549 + }, + { + "epoch": 0.34200583737711965, + "grad_norm": 1.3235890865325928, + "learning_rate": 0.00015314321505093751, + "loss": 1.2124, + "step": 9550 + }, + { + "epoch": 0.3420416495066879, + "grad_norm": 1.6126346588134766, + "learning_rate": 0.00015313338915962362, + "loss": 1.1393, + "step": 9551 + }, + { + "epoch": 0.3420774616362562, + "grad_norm": 1.4172085523605347, + "learning_rate": 0.0001531235625534685, + "loss": 0.9876, + "step": 9552 + }, + { + "epoch": 0.3421132737658245, + "grad_norm": 1.4654741287231445, + "learning_rate": 0.00015311373523260437, + "loss": 1.0941, + "step": 9553 + }, + { + "epoch": 0.3421490858953928, + "grad_norm": 1.805468201637268, + "learning_rate": 0.00015310390719716348, + "loss": 1.2438, + "step": 9554 + }, + { + "epoch": 0.34218489802496105, + "grad_norm": 1.4833403825759888, + "learning_rate": 0.000153094078447278, + "loss": 1.0777, + "step": 9555 + }, + { + "epoch": 0.3422207101545293, + "grad_norm": 1.802106499671936, + "learning_rate": 0.00015308424898308017, + "loss": 1.2324, + "step": 9556 + }, + { + "epoch": 0.34225652228409764, + "grad_norm": 1.2765405178070068, + "learning_rate": 0.00015307441880470227, + "loss": 1.1853, + "step": 9557 + }, + { + "epoch": 0.3422923344136659, + "grad_norm": 1.2834960222244263, + "learning_rate": 0.00015306458791227646, + "loss": 1.2505, + "step": 9558 + }, + { + "epoch": 0.3423281465432342, + "grad_norm": 2.643662452697754, + "learning_rate": 0.00015305475630593516, + "loss": 1.2287, + "step": 9559 + }, + { + "epoch": 0.3423639586728025, + "grad_norm": 1.9332960844039917, + "learning_rate": 0.00015304492398581046, + "loss": 1.0769, + "step": 9560 + }, + { + "epoch": 0.3423997708023708, + "grad_norm": 1.8139201402664185, + "learning_rate": 0.0001530350909520348, + "loss": 1.2133, + "step": 9561 + }, + { + "epoch": 0.34243558293193904, + "grad_norm": 1.77981698513031, + "learning_rate": 0.00015302525720474038, + "loss": 1.2488, + "step": 9562 + }, + { + "epoch": 0.3424713950615073, + "grad_norm": 1.6823062896728516, + "learning_rate": 0.00015301542274405948, + "loss": 1.1256, + "step": 9563 + }, + { + "epoch": 0.34250720719107564, + "grad_norm": 1.818016529083252, + "learning_rate": 0.00015300558757012448, + "loss": 1.0176, + "step": 9564 + }, + { + "epoch": 0.3425430193206439, + "grad_norm": 2.355372428894043, + "learning_rate": 0.00015299575168306774, + "loss": 1.1935, + "step": 9565 + }, + { + "epoch": 0.3425788314502122, + "grad_norm": 1.5968785285949707, + "learning_rate": 0.00015298591508302142, + "loss": 0.9995, + "step": 9566 + }, + { + "epoch": 0.3426146435797805, + "grad_norm": 1.3371225595474243, + "learning_rate": 0.000152976077770118, + "loss": 1.1472, + "step": 9567 + }, + { + "epoch": 0.34265045570934877, + "grad_norm": 1.8261845111846924, + "learning_rate": 0.00015296623974448982, + "loss": 1.2599, + "step": 9568 + }, + { + "epoch": 0.34268626783891704, + "grad_norm": 1.2878512144088745, + "learning_rate": 0.00015295640100626914, + "loss": 1.0279, + "step": 9569 + }, + { + "epoch": 0.3427220799684853, + "grad_norm": 1.5249112844467163, + "learning_rate": 0.00015294656155558843, + "loss": 0.9294, + "step": 9570 + }, + { + "epoch": 0.34275789209805363, + "grad_norm": 1.398984670639038, + "learning_rate": 0.00015293672139258003, + "loss": 1.144, + "step": 9571 + }, + { + "epoch": 0.3427937042276219, + "grad_norm": 1.7544749975204468, + "learning_rate": 0.00015292688051737633, + "loss": 1.1995, + "step": 9572 + }, + { + "epoch": 0.34282951635719017, + "grad_norm": 2.3699138164520264, + "learning_rate": 0.0001529170389301097, + "loss": 1.3226, + "step": 9573 + }, + { + "epoch": 0.3428653284867585, + "grad_norm": 1.3292350769042969, + "learning_rate": 0.00015290719663091262, + "loss": 1.142, + "step": 9574 + }, + { + "epoch": 0.34290114061632676, + "grad_norm": 1.9487642049789429, + "learning_rate": 0.00015289735361991743, + "loss": 1.0106, + "step": 9575 + }, + { + "epoch": 0.34293695274589503, + "grad_norm": 1.5851879119873047, + "learning_rate": 0.00015288750989725657, + "loss": 1.0492, + "step": 9576 + }, + { + "epoch": 0.3429727648754633, + "grad_norm": 1.4632664918899536, + "learning_rate": 0.00015287766546306247, + "loss": 1.1768, + "step": 9577 + }, + { + "epoch": 0.3430085770050316, + "grad_norm": 1.3295665979385376, + "learning_rate": 0.00015286782031746763, + "loss": 1.1443, + "step": 9578 + }, + { + "epoch": 0.3430443891345999, + "grad_norm": 1.957970380783081, + "learning_rate": 0.00015285797446060442, + "loss": 1.3591, + "step": 9579 + }, + { + "epoch": 0.34308020126416816, + "grad_norm": 1.3647724390029907, + "learning_rate": 0.00015284812789260536, + "loss": 1.1992, + "step": 9580 + }, + { + "epoch": 0.3431160133937365, + "grad_norm": 1.9365417957305908, + "learning_rate": 0.00015283828061360291, + "loss": 1.164, + "step": 9581 + }, + { + "epoch": 0.34315182552330475, + "grad_norm": 2.604904890060425, + "learning_rate": 0.00015282843262372955, + "loss": 1.363, + "step": 9582 + }, + { + "epoch": 0.343187637652873, + "grad_norm": 1.3333133459091187, + "learning_rate": 0.0001528185839231178, + "loss": 1.3624, + "step": 9583 + }, + { + "epoch": 0.3432234497824413, + "grad_norm": 1.5194308757781982, + "learning_rate": 0.00015280873451190008, + "loss": 1.1191, + "step": 9584 + }, + { + "epoch": 0.3432592619120096, + "grad_norm": 2.4023520946502686, + "learning_rate": 0.000152798884390209, + "loss": 1.2383, + "step": 9585 + }, + { + "epoch": 0.3432950740415779, + "grad_norm": 1.6975901126861572, + "learning_rate": 0.000152789033558177, + "loss": 1.163, + "step": 9586 + }, + { + "epoch": 0.34333088617114615, + "grad_norm": 1.6529203653335571, + "learning_rate": 0.0001527791820159367, + "loss": 1.0628, + "step": 9587 + }, + { + "epoch": 0.3433666983007145, + "grad_norm": 1.6875580549240112, + "learning_rate": 0.00015276932976362052, + "loss": 0.9766, + "step": 9588 + }, + { + "epoch": 0.34340251043028275, + "grad_norm": 1.3545644283294678, + "learning_rate": 0.00015275947680136112, + "loss": 1.1739, + "step": 9589 + }, + { + "epoch": 0.343438322559851, + "grad_norm": 1.7683098316192627, + "learning_rate": 0.000152749623129291, + "loss": 1.2271, + "step": 9590 + }, + { + "epoch": 0.3434741346894193, + "grad_norm": 1.9092854261398315, + "learning_rate": 0.00015273976874754274, + "loss": 1.1099, + "step": 9591 + }, + { + "epoch": 0.3435099468189876, + "grad_norm": 1.6686033010482788, + "learning_rate": 0.00015272991365624896, + "loss": 1.3542, + "step": 9592 + }, + { + "epoch": 0.3435457589485559, + "grad_norm": 1.5867096185684204, + "learning_rate": 0.00015272005785554215, + "loss": 1.3401, + "step": 9593 + }, + { + "epoch": 0.34358157107812415, + "grad_norm": 1.4275401830673218, + "learning_rate": 0.000152710201345555, + "loss": 1.0911, + "step": 9594 + }, + { + "epoch": 0.3436173832076925, + "grad_norm": 1.7948700189590454, + "learning_rate": 0.00015270034412642007, + "loss": 1.3703, + "step": 9595 + }, + { + "epoch": 0.34365319533726074, + "grad_norm": 2.173154354095459, + "learning_rate": 0.00015269048619827, + "loss": 1.3361, + "step": 9596 + }, + { + "epoch": 0.343689007466829, + "grad_norm": 1.8356138467788696, + "learning_rate": 0.0001526806275612374, + "loss": 1.1847, + "step": 9597 + }, + { + "epoch": 0.3437248195963973, + "grad_norm": 1.4745250940322876, + "learning_rate": 0.00015267076821545489, + "loss": 1.2244, + "step": 9598 + }, + { + "epoch": 0.3437606317259656, + "grad_norm": 1.6805270910263062, + "learning_rate": 0.00015266090816105514, + "loss": 1.422, + "step": 9599 + }, + { + "epoch": 0.3437964438555339, + "grad_norm": 1.589485764503479, + "learning_rate": 0.00015265104739817082, + "loss": 1.3522, + "step": 9600 + }, + { + "epoch": 0.34383225598510214, + "grad_norm": 1.8467923402786255, + "learning_rate": 0.00015264118592693457, + "loss": 0.9976, + "step": 9601 + }, + { + "epoch": 0.34386806811467047, + "grad_norm": 1.5657449960708618, + "learning_rate": 0.00015263132374747907, + "loss": 1.0713, + "step": 9602 + }, + { + "epoch": 0.34390388024423874, + "grad_norm": 1.9207648038864136, + "learning_rate": 0.00015262146085993697, + "loss": 1.4739, + "step": 9603 + }, + { + "epoch": 0.343939692373807, + "grad_norm": 1.6555887460708618, + "learning_rate": 0.00015261159726444098, + "loss": 1.3017, + "step": 9604 + }, + { + "epoch": 0.3439755045033753, + "grad_norm": 1.3943760395050049, + "learning_rate": 0.00015260173296112385, + "loss": 1.1584, + "step": 9605 + }, + { + "epoch": 0.3440113166329436, + "grad_norm": 1.815686821937561, + "learning_rate": 0.00015259186795011823, + "loss": 1.4572, + "step": 9606 + }, + { + "epoch": 0.34404712876251187, + "grad_norm": 1.719547986984253, + "learning_rate": 0.0001525820022315569, + "loss": 1.2612, + "step": 9607 + }, + { + "epoch": 0.34408294089208014, + "grad_norm": 1.7400864362716675, + "learning_rate": 0.0001525721358055725, + "loss": 1.0812, + "step": 9608 + }, + { + "epoch": 0.3441187530216484, + "grad_norm": 1.3057868480682373, + "learning_rate": 0.0001525622686722979, + "loss": 1.0695, + "step": 9609 + }, + { + "epoch": 0.34415456515121673, + "grad_norm": 1.707817792892456, + "learning_rate": 0.00015255240083186572, + "loss": 1.0628, + "step": 9610 + }, + { + "epoch": 0.344190377280785, + "grad_norm": 1.4533143043518066, + "learning_rate": 0.00015254253228440877, + "loss": 1.1715, + "step": 9611 + }, + { + "epoch": 0.34422618941035327, + "grad_norm": 2.4127800464630127, + "learning_rate": 0.00015253266303005987, + "loss": 1.0617, + "step": 9612 + }, + { + "epoch": 0.3442620015399216, + "grad_norm": 1.5702675580978394, + "learning_rate": 0.00015252279306895172, + "loss": 1.079, + "step": 9613 + }, + { + "epoch": 0.34429781366948986, + "grad_norm": 1.3684226274490356, + "learning_rate": 0.00015251292240121714, + "loss": 1.0358, + "step": 9614 + }, + { + "epoch": 0.34433362579905813, + "grad_norm": 1.583186149597168, + "learning_rate": 0.0001525030510269889, + "loss": 1.1973, + "step": 9615 + }, + { + "epoch": 0.3443694379286264, + "grad_norm": 1.248952865600586, + "learning_rate": 0.00015249317894639987, + "loss": 1.1541, + "step": 9616 + }, + { + "epoch": 0.3444052500581947, + "grad_norm": 1.4508904218673706, + "learning_rate": 0.00015248330615958282, + "loss": 1.0848, + "step": 9617 + }, + { + "epoch": 0.344441062187763, + "grad_norm": 1.6473090648651123, + "learning_rate": 0.00015247343266667061, + "loss": 1.221, + "step": 9618 + }, + { + "epoch": 0.34447687431733126, + "grad_norm": 1.651487946510315, + "learning_rate": 0.00015246355846779602, + "loss": 1.3703, + "step": 9619 + }, + { + "epoch": 0.3445126864468996, + "grad_norm": 1.4134849309921265, + "learning_rate": 0.00015245368356309194, + "loss": 1.0929, + "step": 9620 + }, + { + "epoch": 0.34454849857646785, + "grad_norm": 1.4873024225234985, + "learning_rate": 0.00015244380795269118, + "loss": 1.3041, + "step": 9621 + }, + { + "epoch": 0.3445843107060361, + "grad_norm": 1.6809308528900146, + "learning_rate": 0.00015243393163672664, + "loss": 1.1308, + "step": 9622 + }, + { + "epoch": 0.3446201228356044, + "grad_norm": 1.646581768989563, + "learning_rate": 0.00015242405461533118, + "loss": 1.2324, + "step": 9623 + }, + { + "epoch": 0.3446559349651727, + "grad_norm": 1.4818466901779175, + "learning_rate": 0.0001524141768886377, + "loss": 1.1836, + "step": 9624 + }, + { + "epoch": 0.344691747094741, + "grad_norm": 1.867539405822754, + "learning_rate": 0.0001524042984567791, + "loss": 1.2264, + "step": 9625 + }, + { + "epoch": 0.34472755922430925, + "grad_norm": 1.5543105602264404, + "learning_rate": 0.0001523944193198882, + "loss": 1.112, + "step": 9626 + }, + { + "epoch": 0.3447633713538776, + "grad_norm": 1.8285465240478516, + "learning_rate": 0.00015238453947809805, + "loss": 1.3324, + "step": 9627 + }, + { + "epoch": 0.34479918348344585, + "grad_norm": 1.6862468719482422, + "learning_rate": 0.00015237465893154143, + "loss": 1.1982, + "step": 9628 + }, + { + "epoch": 0.3448349956130141, + "grad_norm": 1.9227724075317383, + "learning_rate": 0.00015236477768035137, + "loss": 1.2084, + "step": 9629 + }, + { + "epoch": 0.3448708077425824, + "grad_norm": 1.3495662212371826, + "learning_rate": 0.00015235489572466078, + "loss": 1.2651, + "step": 9630 + }, + { + "epoch": 0.3449066198721507, + "grad_norm": 1.7470605373382568, + "learning_rate": 0.00015234501306460256, + "loss": 1.1367, + "step": 9631 + }, + { + "epoch": 0.344942432001719, + "grad_norm": 1.482189416885376, + "learning_rate": 0.0001523351297003097, + "loss": 1.1475, + "step": 9632 + }, + { + "epoch": 0.34497824413128725, + "grad_norm": 1.512742519378662, + "learning_rate": 0.00015232524563191523, + "loss": 0.9843, + "step": 9633 + }, + { + "epoch": 0.3450140562608556, + "grad_norm": 2.0052013397216797, + "learning_rate": 0.00015231536085955205, + "loss": 1.2066, + "step": 9634 + }, + { + "epoch": 0.34504986839042384, + "grad_norm": 1.3280797004699707, + "learning_rate": 0.00015230547538335317, + "loss": 1.252, + "step": 9635 + }, + { + "epoch": 0.3450856805199921, + "grad_norm": 1.7224472761154175, + "learning_rate": 0.00015229558920345162, + "loss": 1.1889, + "step": 9636 + }, + { + "epoch": 0.3451214926495604, + "grad_norm": 1.6281859874725342, + "learning_rate": 0.00015228570231998033, + "loss": 1.2308, + "step": 9637 + }, + { + "epoch": 0.3451573047791287, + "grad_norm": 1.5207011699676514, + "learning_rate": 0.00015227581473307238, + "loss": 1.2229, + "step": 9638 + }, + { + "epoch": 0.345193116908697, + "grad_norm": 1.4097263813018799, + "learning_rate": 0.00015226592644286075, + "loss": 1.0353, + "step": 9639 + }, + { + "epoch": 0.34522892903826524, + "grad_norm": 1.6506962776184082, + "learning_rate": 0.00015225603744947852, + "loss": 1.2311, + "step": 9640 + }, + { + "epoch": 0.34526474116783357, + "grad_norm": 1.25148606300354, + "learning_rate": 0.0001522461477530587, + "loss": 1.1935, + "step": 9641 + }, + { + "epoch": 0.34530055329740184, + "grad_norm": 1.2874782085418701, + "learning_rate": 0.00015223625735373436, + "loss": 1.1363, + "step": 9642 + }, + { + "epoch": 0.3453363654269701, + "grad_norm": 1.538773775100708, + "learning_rate": 0.00015222636625163854, + "loss": 1.2132, + "step": 9643 + }, + { + "epoch": 0.3453721775565384, + "grad_norm": 1.4156872034072876, + "learning_rate": 0.00015221647444690437, + "loss": 1.0673, + "step": 9644 + }, + { + "epoch": 0.3454079896861067, + "grad_norm": 1.3999695777893066, + "learning_rate": 0.00015220658193966489, + "loss": 1.1227, + "step": 9645 + }, + { + "epoch": 0.34544380181567497, + "grad_norm": 1.6978250741958618, + "learning_rate": 0.00015219668873005314, + "loss": 1.2266, + "step": 9646 + }, + { + "epoch": 0.34547961394524324, + "grad_norm": 1.4824182987213135, + "learning_rate": 0.0001521867948182023, + "loss": 1.1772, + "step": 9647 + }, + { + "epoch": 0.34551542607481156, + "grad_norm": 1.6255099773406982, + "learning_rate": 0.00015217690020424547, + "loss": 1.3704, + "step": 9648 + }, + { + "epoch": 0.34555123820437983, + "grad_norm": 1.2984426021575928, + "learning_rate": 0.00015216700488831573, + "loss": 1.2364, + "step": 9649 + }, + { + "epoch": 0.3455870503339481, + "grad_norm": 1.2877962589263916, + "learning_rate": 0.00015215710887054622, + "loss": 1.1701, + "step": 9650 + }, + { + "epoch": 0.34562286246351637, + "grad_norm": 1.5184756517410278, + "learning_rate": 0.00015214721215107011, + "loss": 1.0819, + "step": 9651 + }, + { + "epoch": 0.3456586745930847, + "grad_norm": 1.2996724843978882, + "learning_rate": 0.0001521373147300205, + "loss": 1.0046, + "step": 9652 + }, + { + "epoch": 0.34569448672265296, + "grad_norm": 1.9473332166671753, + "learning_rate": 0.0001521274166075306, + "loss": 1.2629, + "step": 9653 + }, + { + "epoch": 0.34573029885222123, + "grad_norm": 1.2508527040481567, + "learning_rate": 0.00015211751778373357, + "loss": 1.0896, + "step": 9654 + }, + { + "epoch": 0.34576611098178955, + "grad_norm": 1.6940336227416992, + "learning_rate": 0.0001521076182587625, + "loss": 1.2743, + "step": 9655 + }, + { + "epoch": 0.3458019231113578, + "grad_norm": 1.3343433141708374, + "learning_rate": 0.0001520977180327507, + "loss": 1.0103, + "step": 9656 + }, + { + "epoch": 0.3458377352409261, + "grad_norm": 1.7165130376815796, + "learning_rate": 0.00015208781710583126, + "loss": 0.9345, + "step": 9657 + }, + { + "epoch": 0.34587354737049436, + "grad_norm": 2.094696521759033, + "learning_rate": 0.00015207791547813744, + "loss": 1.2253, + "step": 9658 + }, + { + "epoch": 0.3459093595000627, + "grad_norm": 1.8207379579544067, + "learning_rate": 0.00015206801314980245, + "loss": 1.3148, + "step": 9659 + }, + { + "epoch": 0.34594517162963095, + "grad_norm": 1.5047814846038818, + "learning_rate": 0.00015205811012095952, + "loss": 1.3364, + "step": 9660 + }, + { + "epoch": 0.3459809837591992, + "grad_norm": 1.6303040981292725, + "learning_rate": 0.00015204820639174184, + "loss": 1.2462, + "step": 9661 + }, + { + "epoch": 0.34601679588876755, + "grad_norm": 1.2657214403152466, + "learning_rate": 0.00015203830196228272, + "loss": 1.3293, + "step": 9662 + }, + { + "epoch": 0.3460526080183358, + "grad_norm": 1.7310987710952759, + "learning_rate": 0.00015202839683271536, + "loss": 0.9023, + "step": 9663 + }, + { + "epoch": 0.3460884201479041, + "grad_norm": 1.4321213960647583, + "learning_rate": 0.000152018491003173, + "loss": 1.0939, + "step": 9664 + }, + { + "epoch": 0.34612423227747235, + "grad_norm": 1.4699749946594238, + "learning_rate": 0.00015200858447378897, + "loss": 1.2409, + "step": 9665 + }, + { + "epoch": 0.3461600444070407, + "grad_norm": 1.656327247619629, + "learning_rate": 0.0001519986772446965, + "loss": 1.1497, + "step": 9666 + }, + { + "epoch": 0.34619585653660895, + "grad_norm": 1.5885419845581055, + "learning_rate": 0.00015198876931602894, + "loss": 1.2424, + "step": 9667 + }, + { + "epoch": 0.3462316686661772, + "grad_norm": 1.5385602712631226, + "learning_rate": 0.00015197886068791952, + "loss": 1.1401, + "step": 9668 + }, + { + "epoch": 0.34626748079574554, + "grad_norm": 1.5430552959442139, + "learning_rate": 0.00015196895136050157, + "loss": 1.1273, + "step": 9669 + }, + { + "epoch": 0.3463032929253138, + "grad_norm": 1.4798239469528198, + "learning_rate": 0.00015195904133390842, + "loss": 0.9645, + "step": 9670 + }, + { + "epoch": 0.3463391050548821, + "grad_norm": 1.5810937881469727, + "learning_rate": 0.00015194913060827343, + "loss": 0.9953, + "step": 9671 + }, + { + "epoch": 0.34637491718445035, + "grad_norm": 1.5200392007827759, + "learning_rate": 0.00015193921918372984, + "loss": 1.1679, + "step": 9672 + }, + { + "epoch": 0.3464107293140187, + "grad_norm": 1.645698070526123, + "learning_rate": 0.00015192930706041112, + "loss": 1.2402, + "step": 9673 + }, + { + "epoch": 0.34644654144358694, + "grad_norm": 1.2313026189804077, + "learning_rate": 0.00015191939423845049, + "loss": 1.0954, + "step": 9674 + }, + { + "epoch": 0.3464823535731552, + "grad_norm": 1.5932331085205078, + "learning_rate": 0.0001519094807179814, + "loss": 1.1513, + "step": 9675 + }, + { + "epoch": 0.34651816570272354, + "grad_norm": 1.6778525114059448, + "learning_rate": 0.00015189956649913722, + "loss": 1.0716, + "step": 9676 + }, + { + "epoch": 0.3465539778322918, + "grad_norm": 1.4357529878616333, + "learning_rate": 0.0001518896515820513, + "loss": 1.3896, + "step": 9677 + }, + { + "epoch": 0.3465897899618601, + "grad_norm": 1.5641772747039795, + "learning_rate": 0.00015187973596685706, + "loss": 1.1552, + "step": 9678 + }, + { + "epoch": 0.34662560209142834, + "grad_norm": 1.2926721572875977, + "learning_rate": 0.0001518698196536879, + "loss": 1.0391, + "step": 9679 + }, + { + "epoch": 0.34666141422099667, + "grad_norm": 1.203285813331604, + "learning_rate": 0.00015185990264267725, + "loss": 1.1151, + "step": 9680 + }, + { + "epoch": 0.34669722635056494, + "grad_norm": 1.8319977521896362, + "learning_rate": 0.00015184998493395846, + "loss": 1.3633, + "step": 9681 + }, + { + "epoch": 0.3467330384801332, + "grad_norm": 1.7328323125839233, + "learning_rate": 0.00015184006652766503, + "loss": 1.0443, + "step": 9682 + }, + { + "epoch": 0.34676885060970153, + "grad_norm": 1.7407183647155762, + "learning_rate": 0.00015183014742393036, + "loss": 1.2559, + "step": 9683 + }, + { + "epoch": 0.3468046627392698, + "grad_norm": 1.6119823455810547, + "learning_rate": 0.0001518202276228879, + "loss": 1.0358, + "step": 9684 + }, + { + "epoch": 0.34684047486883807, + "grad_norm": 1.346174716949463, + "learning_rate": 0.00015181030712467113, + "loss": 1.0378, + "step": 9685 + }, + { + "epoch": 0.34687628699840634, + "grad_norm": 1.369115948677063, + "learning_rate": 0.0001518003859294135, + "loss": 1.2381, + "step": 9686 + }, + { + "epoch": 0.34691209912797466, + "grad_norm": 1.5480401515960693, + "learning_rate": 0.00015179046403724852, + "loss": 1.2369, + "step": 9687 + }, + { + "epoch": 0.34694791125754293, + "grad_norm": 1.6652370691299438, + "learning_rate": 0.00015178054144830965, + "loss": 1.2568, + "step": 9688 + }, + { + "epoch": 0.3469837233871112, + "grad_norm": 1.8954780101776123, + "learning_rate": 0.0001517706181627304, + "loss": 1.0836, + "step": 9689 + }, + { + "epoch": 0.3470195355166795, + "grad_norm": 2.0534305572509766, + "learning_rate": 0.0001517606941806442, + "loss": 1.3777, + "step": 9690 + }, + { + "epoch": 0.3470553476462478, + "grad_norm": 1.4262809753417969, + "learning_rate": 0.00015175076950218468, + "loss": 1.0737, + "step": 9691 + }, + { + "epoch": 0.34709115977581606, + "grad_norm": 1.75150728225708, + "learning_rate": 0.00015174084412748529, + "loss": 1.1452, + "step": 9692 + }, + { + "epoch": 0.34712697190538433, + "grad_norm": 1.3214682340621948, + "learning_rate": 0.00015173091805667957, + "loss": 1.0941, + "step": 9693 + }, + { + "epoch": 0.34716278403495265, + "grad_norm": 1.4653339385986328, + "learning_rate": 0.0001517209912899011, + "loss": 1.2431, + "step": 9694 + }, + { + "epoch": 0.3471985961645209, + "grad_norm": 1.596140742301941, + "learning_rate": 0.00015171106382728342, + "loss": 0.9859, + "step": 9695 + }, + { + "epoch": 0.3472344082940892, + "grad_norm": 1.5870321989059448, + "learning_rate": 0.00015170113566896005, + "loss": 1.1128, + "step": 9696 + }, + { + "epoch": 0.3472702204236575, + "grad_norm": 1.6176552772521973, + "learning_rate": 0.0001516912068150646, + "loss": 1.2027, + "step": 9697 + }, + { + "epoch": 0.3473060325532258, + "grad_norm": 1.3415840864181519, + "learning_rate": 0.00015168127726573064, + "loss": 1.2293, + "step": 9698 + }, + { + "epoch": 0.34734184468279405, + "grad_norm": 2.0355167388916016, + "learning_rate": 0.00015167134702109177, + "loss": 1.1386, + "step": 9699 + }, + { + "epoch": 0.3473776568123623, + "grad_norm": 1.3568644523620605, + "learning_rate": 0.00015166141608128158, + "loss": 1.2783, + "step": 9700 + }, + { + "epoch": 0.34741346894193065, + "grad_norm": 1.9533430337905884, + "learning_rate": 0.0001516514844464336, + "loss": 1.295, + "step": 9701 + }, + { + "epoch": 0.3474492810714989, + "grad_norm": 1.4866571426391602, + "learning_rate": 0.00015164155211668163, + "loss": 0.9664, + "step": 9702 + }, + { + "epoch": 0.3474850932010672, + "grad_norm": 1.5984506607055664, + "learning_rate": 0.00015163161909215913, + "loss": 1.1281, + "step": 9703 + }, + { + "epoch": 0.3475209053306355, + "grad_norm": 1.674021601676941, + "learning_rate": 0.00015162168537299979, + "loss": 1.2735, + "step": 9704 + }, + { + "epoch": 0.3475567174602038, + "grad_norm": 1.7028120756149292, + "learning_rate": 0.00015161175095933729, + "loss": 1.0605, + "step": 9705 + }, + { + "epoch": 0.34759252958977205, + "grad_norm": 2.2827718257904053, + "learning_rate": 0.00015160181585130523, + "loss": 1.1753, + "step": 9706 + }, + { + "epoch": 0.3476283417193403, + "grad_norm": 1.8715972900390625, + "learning_rate": 0.00015159188004903733, + "loss": 1.2299, + "step": 9707 + }, + { + "epoch": 0.34766415384890864, + "grad_norm": 2.161724090576172, + "learning_rate": 0.0001515819435526672, + "loss": 1.2662, + "step": 9708 + }, + { + "epoch": 0.3476999659784769, + "grad_norm": 1.7287324666976929, + "learning_rate": 0.00015157200636232857, + "loss": 1.2333, + "step": 9709 + }, + { + "epoch": 0.3477357781080452, + "grad_norm": 1.5967175960540771, + "learning_rate": 0.0001515620684781551, + "loss": 1.2496, + "step": 9710 + }, + { + "epoch": 0.3477715902376135, + "grad_norm": 1.1962029933929443, + "learning_rate": 0.00015155212990028053, + "loss": 1.0958, + "step": 9711 + }, + { + "epoch": 0.3478074023671818, + "grad_norm": 1.5559639930725098, + "learning_rate": 0.00015154219062883854, + "loss": 1.0981, + "step": 9712 + }, + { + "epoch": 0.34784321449675004, + "grad_norm": 1.583907961845398, + "learning_rate": 0.00015153225066396288, + "loss": 1.0959, + "step": 9713 + }, + { + "epoch": 0.3478790266263183, + "grad_norm": 1.3993730545043945, + "learning_rate": 0.00015152231000578723, + "loss": 1.1058, + "step": 9714 + }, + { + "epoch": 0.34791483875588664, + "grad_norm": 1.2110188007354736, + "learning_rate": 0.00015151236865444537, + "loss": 1.2218, + "step": 9715 + }, + { + "epoch": 0.3479506508854549, + "grad_norm": 1.8573819398880005, + "learning_rate": 0.00015150242661007103, + "loss": 1.0971, + "step": 9716 + }, + { + "epoch": 0.3479864630150232, + "grad_norm": 1.6154427528381348, + "learning_rate": 0.000151492483872798, + "loss": 1.1404, + "step": 9717 + }, + { + "epoch": 0.3480222751445915, + "grad_norm": 1.8010661602020264, + "learning_rate": 0.00015148254044276, + "loss": 1.2908, + "step": 9718 + }, + { + "epoch": 0.34805808727415977, + "grad_norm": 1.494921088218689, + "learning_rate": 0.00015147259632009082, + "loss": 1.0577, + "step": 9719 + }, + { + "epoch": 0.34809389940372804, + "grad_norm": 1.2275960445404053, + "learning_rate": 0.00015146265150492428, + "loss": 1.1126, + "step": 9720 + }, + { + "epoch": 0.3481297115332963, + "grad_norm": 1.5217235088348389, + "learning_rate": 0.0001514527059973941, + "loss": 1.1609, + "step": 9721 + }, + { + "epoch": 0.34816552366286463, + "grad_norm": 1.4670363664627075, + "learning_rate": 0.00015144275979763416, + "loss": 1.2838, + "step": 9722 + }, + { + "epoch": 0.3482013357924329, + "grad_norm": 1.5391273498535156, + "learning_rate": 0.0001514328129057782, + "loss": 1.1218, + "step": 9723 + }, + { + "epoch": 0.34823714792200117, + "grad_norm": 2.6264989376068115, + "learning_rate": 0.00015142286532196018, + "loss": 1.1955, + "step": 9724 + }, + { + "epoch": 0.3482729600515695, + "grad_norm": 1.689054012298584, + "learning_rate": 0.00015141291704631374, + "loss": 1.1878, + "step": 9725 + }, + { + "epoch": 0.34830877218113776, + "grad_norm": 1.8116180896759033, + "learning_rate": 0.00015140296807897289, + "loss": 1.1649, + "step": 9726 + }, + { + "epoch": 0.34834458431070603, + "grad_norm": 1.7240902185440063, + "learning_rate": 0.00015139301842007137, + "loss": 1.2593, + "step": 9727 + }, + { + "epoch": 0.3483803964402743, + "grad_norm": 1.3776490688323975, + "learning_rate": 0.0001513830680697431, + "loss": 1.0586, + "step": 9728 + }, + { + "epoch": 0.3484162085698426, + "grad_norm": 1.5433671474456787, + "learning_rate": 0.0001513731170281219, + "loss": 1.0142, + "step": 9729 + }, + { + "epoch": 0.3484520206994109, + "grad_norm": 1.4980100393295288, + "learning_rate": 0.00015136316529534168, + "loss": 1.2663, + "step": 9730 + }, + { + "epoch": 0.34848783282897916, + "grad_norm": 1.911441683769226, + "learning_rate": 0.00015135321287153636, + "loss": 0.965, + "step": 9731 + }, + { + "epoch": 0.3485236449585475, + "grad_norm": 1.823591709136963, + "learning_rate": 0.00015134325975683975, + "loss": 1.1078, + "step": 9732 + }, + { + "epoch": 0.34855945708811575, + "grad_norm": 1.893239974975586, + "learning_rate": 0.00015133330595138586, + "loss": 1.3123, + "step": 9733 + }, + { + "epoch": 0.348595269217684, + "grad_norm": 1.243790864944458, + "learning_rate": 0.00015132335145530854, + "loss": 1.0676, + "step": 9734 + }, + { + "epoch": 0.3486310813472523, + "grad_norm": 1.5625475645065308, + "learning_rate": 0.00015131339626874173, + "loss": 0.9817, + "step": 9735 + }, + { + "epoch": 0.3486668934768206, + "grad_norm": 1.605320930480957, + "learning_rate": 0.00015130344039181935, + "loss": 1.0534, + "step": 9736 + }, + { + "epoch": 0.3487027056063889, + "grad_norm": 1.5703614950180054, + "learning_rate": 0.00015129348382467535, + "loss": 1.4608, + "step": 9737 + }, + { + "epoch": 0.34873851773595715, + "grad_norm": 1.282514214515686, + "learning_rate": 0.00015128352656744372, + "loss": 1.2242, + "step": 9738 + }, + { + "epoch": 0.3487743298655255, + "grad_norm": 1.9980137348175049, + "learning_rate": 0.00015127356862025835, + "loss": 1.1318, + "step": 9739 + }, + { + "epoch": 0.34881014199509375, + "grad_norm": 1.3702197074890137, + "learning_rate": 0.00015126360998325326, + "loss": 1.1147, + "step": 9740 + }, + { + "epoch": 0.348845954124662, + "grad_norm": 1.5789941549301147, + "learning_rate": 0.00015125365065656244, + "loss": 0.9998, + "step": 9741 + }, + { + "epoch": 0.3488817662542303, + "grad_norm": 2.100034713745117, + "learning_rate": 0.00015124369064031988, + "loss": 1.3359, + "step": 9742 + }, + { + "epoch": 0.3489175783837986, + "grad_norm": 1.489302158355713, + "learning_rate": 0.00015123372993465953, + "loss": 1.2621, + "step": 9743 + }, + { + "epoch": 0.3489533905133669, + "grad_norm": 1.373498797416687, + "learning_rate": 0.00015122376853971545, + "loss": 1.2669, + "step": 9744 + }, + { + "epoch": 0.34898920264293515, + "grad_norm": 2.072453498840332, + "learning_rate": 0.00015121380645562163, + "loss": 1.1901, + "step": 9745 + }, + { + "epoch": 0.3490250147725035, + "grad_norm": 1.3221815824508667, + "learning_rate": 0.0001512038436825121, + "loss": 1.0993, + "step": 9746 + }, + { + "epoch": 0.34906082690207174, + "grad_norm": 1.5770299434661865, + "learning_rate": 0.0001511938802205209, + "loss": 1.22, + "step": 9747 + }, + { + "epoch": 0.34909663903164, + "grad_norm": 1.5741233825683594, + "learning_rate": 0.0001511839160697821, + "loss": 1.0259, + "step": 9748 + }, + { + "epoch": 0.3491324511612083, + "grad_norm": 1.3751839399337769, + "learning_rate": 0.00015117395123042968, + "loss": 0.9657, + "step": 9749 + }, + { + "epoch": 0.3491682632907766, + "grad_norm": 1.6203888654708862, + "learning_rate": 0.0001511639857025978, + "loss": 1.3591, + "step": 9750 + }, + { + "epoch": 0.3492040754203449, + "grad_norm": 1.236509919166565, + "learning_rate": 0.0001511540194864205, + "loss": 1.1532, + "step": 9751 + }, + { + "epoch": 0.34923988754991314, + "grad_norm": 1.207047700881958, + "learning_rate": 0.0001511440525820318, + "loss": 1.1559, + "step": 9752 + }, + { + "epoch": 0.34927569967948147, + "grad_norm": 1.3727480173110962, + "learning_rate": 0.0001511340849895659, + "loss": 1.0458, + "step": 9753 + }, + { + "epoch": 0.34931151180904974, + "grad_norm": 1.7945502996444702, + "learning_rate": 0.0001511241167091568, + "loss": 1.2905, + "step": 9754 + }, + { + "epoch": 0.349347323938618, + "grad_norm": 1.9165256023406982, + "learning_rate": 0.0001511141477409387, + "loss": 1.1151, + "step": 9755 + }, + { + "epoch": 0.3493831360681863, + "grad_norm": 1.2926796674728394, + "learning_rate": 0.00015110417808504562, + "loss": 1.2626, + "step": 9756 + }, + { + "epoch": 0.3494189481977546, + "grad_norm": 1.6297129392623901, + "learning_rate": 0.00015109420774161178, + "loss": 1.2329, + "step": 9757 + }, + { + "epoch": 0.34945476032732287, + "grad_norm": 1.3772050142288208, + "learning_rate": 0.00015108423671077124, + "loss": 1.2864, + "step": 9758 + }, + { + "epoch": 0.34949057245689114, + "grad_norm": 1.7811938524246216, + "learning_rate": 0.00015107426499265825, + "loss": 1.2375, + "step": 9759 + }, + { + "epoch": 0.34952638458645946, + "grad_norm": 1.4164825677871704, + "learning_rate": 0.00015106429258740687, + "loss": 1.1594, + "step": 9760 + }, + { + "epoch": 0.34956219671602773, + "grad_norm": 1.3440722227096558, + "learning_rate": 0.00015105431949515133, + "loss": 0.9316, + "step": 9761 + }, + { + "epoch": 0.349598008845596, + "grad_norm": 1.4197653532028198, + "learning_rate": 0.00015104434571602577, + "loss": 1.1023, + "step": 9762 + }, + { + "epoch": 0.34963382097516427, + "grad_norm": 1.4260345697402954, + "learning_rate": 0.00015103437125016432, + "loss": 0.9996, + "step": 9763 + }, + { + "epoch": 0.3496696331047326, + "grad_norm": 1.9381436109542847, + "learning_rate": 0.0001510243960977013, + "loss": 1.3128, + "step": 9764 + }, + { + "epoch": 0.34970544523430086, + "grad_norm": 1.8666207790374756, + "learning_rate": 0.00015101442025877083, + "loss": 1.0583, + "step": 9765 + }, + { + "epoch": 0.34974125736386913, + "grad_norm": 1.7008190155029297, + "learning_rate": 0.00015100444373350715, + "loss": 1.2329, + "step": 9766 + }, + { + "epoch": 0.34977706949343745, + "grad_norm": 1.585594654083252, + "learning_rate": 0.00015099446652204446, + "loss": 1.2517, + "step": 9767 + }, + { + "epoch": 0.3498128816230057, + "grad_norm": 1.9226524829864502, + "learning_rate": 0.00015098448862451702, + "loss": 1.0946, + "step": 9768 + }, + { + "epoch": 0.349848693752574, + "grad_norm": 1.4130514860153198, + "learning_rate": 0.00015097451004105904, + "loss": 1.0972, + "step": 9769 + }, + { + "epoch": 0.34988450588214226, + "grad_norm": 1.4189722537994385, + "learning_rate": 0.00015096453077180475, + "loss": 1.1864, + "step": 9770 + }, + { + "epoch": 0.3499203180117106, + "grad_norm": 1.919880986213684, + "learning_rate": 0.00015095455081688846, + "loss": 1.4798, + "step": 9771 + }, + { + "epoch": 0.34995613014127885, + "grad_norm": 1.5445538759231567, + "learning_rate": 0.0001509445701764444, + "loss": 1.2372, + "step": 9772 + }, + { + "epoch": 0.3499919422708471, + "grad_norm": 1.3368805646896362, + "learning_rate": 0.00015093458885060687, + "loss": 0.9419, + "step": 9773 + }, + { + "epoch": 0.35002775440041545, + "grad_norm": 1.5968114137649536, + "learning_rate": 0.00015092460683951015, + "loss": 1.1254, + "step": 9774 + }, + { + "epoch": 0.3500635665299837, + "grad_norm": 1.8087244033813477, + "learning_rate": 0.00015091462414328855, + "loss": 1.3993, + "step": 9775 + }, + { + "epoch": 0.350099378659552, + "grad_norm": 1.9582949876785278, + "learning_rate": 0.00015090464076207634, + "loss": 1.2614, + "step": 9776 + }, + { + "epoch": 0.35013519078912025, + "grad_norm": 1.5172375440597534, + "learning_rate": 0.0001508946566960079, + "loss": 1.2369, + "step": 9777 + }, + { + "epoch": 0.3501710029186886, + "grad_norm": 1.4542908668518066, + "learning_rate": 0.0001508846719452174, + "loss": 1.0478, + "step": 9778 + }, + { + "epoch": 0.35020681504825685, + "grad_norm": 1.4836713075637817, + "learning_rate": 0.00015087468650983935, + "loss": 1.1035, + "step": 9779 + }, + { + "epoch": 0.3502426271778251, + "grad_norm": 1.6385128498077393, + "learning_rate": 0.00015086470039000802, + "loss": 1.0415, + "step": 9780 + }, + { + "epoch": 0.35027843930739344, + "grad_norm": 1.740248203277588, + "learning_rate": 0.00015085471358585774, + "loss": 1.0886, + "step": 9781 + }, + { + "epoch": 0.3503142514369617, + "grad_norm": 1.6153805255889893, + "learning_rate": 0.00015084472609752284, + "loss": 1.2256, + "step": 9782 + }, + { + "epoch": 0.35035006356653, + "grad_norm": 1.3679765462875366, + "learning_rate": 0.0001508347379251378, + "loss": 1.214, + "step": 9783 + }, + { + "epoch": 0.35038587569609825, + "grad_norm": 1.8758317232131958, + "learning_rate": 0.0001508247490688369, + "loss": 1.1657, + "step": 9784 + }, + { + "epoch": 0.3504216878256666, + "grad_norm": 1.5831730365753174, + "learning_rate": 0.0001508147595287546, + "loss": 1.1962, + "step": 9785 + }, + { + "epoch": 0.35045749995523484, + "grad_norm": 1.2796611785888672, + "learning_rate": 0.00015080476930502522, + "loss": 1.1921, + "step": 9786 + }, + { + "epoch": 0.3504933120848031, + "grad_norm": 1.4531158208847046, + "learning_rate": 0.0001507947783977832, + "loss": 1.1649, + "step": 9787 + }, + { + "epoch": 0.35052912421437143, + "grad_norm": 1.4559426307678223, + "learning_rate": 0.00015078478680716299, + "loss": 1.0531, + "step": 9788 + }, + { + "epoch": 0.3505649363439397, + "grad_norm": 1.4340333938598633, + "learning_rate": 0.00015077479453329894, + "loss": 1.0395, + "step": 9789 + }, + { + "epoch": 0.350600748473508, + "grad_norm": 1.9389071464538574, + "learning_rate": 0.0001507648015763256, + "loss": 1.2013, + "step": 9790 + }, + { + "epoch": 0.35063656060307624, + "grad_norm": 1.7823364734649658, + "learning_rate": 0.00015075480793637724, + "loss": 1.3354, + "step": 9791 + }, + { + "epoch": 0.35067237273264457, + "grad_norm": 1.4107086658477783, + "learning_rate": 0.00015074481361358844, + "loss": 1.2087, + "step": 9792 + }, + { + "epoch": 0.35070818486221284, + "grad_norm": 1.4939305782318115, + "learning_rate": 0.00015073481860809363, + "loss": 1.0343, + "step": 9793 + }, + { + "epoch": 0.3507439969917811, + "grad_norm": 1.6083067655563354, + "learning_rate": 0.00015072482292002732, + "loss": 1.0915, + "step": 9794 + }, + { + "epoch": 0.35077980912134943, + "grad_norm": 1.4494825601577759, + "learning_rate": 0.0001507148265495239, + "loss": 1.121, + "step": 9795 + }, + { + "epoch": 0.3508156212509177, + "grad_norm": 1.5477017164230347, + "learning_rate": 0.00015070482949671794, + "loss": 0.9908, + "step": 9796 + }, + { + "epoch": 0.35085143338048597, + "grad_norm": 1.8897614479064941, + "learning_rate": 0.0001506948317617439, + "loss": 1.3689, + "step": 9797 + }, + { + "epoch": 0.35088724551005424, + "grad_norm": 1.4916149377822876, + "learning_rate": 0.00015068483334473623, + "loss": 1.2331, + "step": 9798 + }, + { + "epoch": 0.35092305763962256, + "grad_norm": 1.3958134651184082, + "learning_rate": 0.00015067483424582956, + "loss": 1.0929, + "step": 9799 + }, + { + "epoch": 0.35095886976919083, + "grad_norm": 1.6310174465179443, + "learning_rate": 0.00015066483446515836, + "loss": 0.9185, + "step": 9800 + }, + { + "epoch": 0.3509946818987591, + "grad_norm": 1.4658968448638916, + "learning_rate": 0.00015065483400285716, + "loss": 1.1841, + "step": 9801 + }, + { + "epoch": 0.3510304940283274, + "grad_norm": 1.703964352607727, + "learning_rate": 0.00015064483285906052, + "loss": 1.0964, + "step": 9802 + }, + { + "epoch": 0.3510663061578957, + "grad_norm": 1.2864514589309692, + "learning_rate": 0.00015063483103390296, + "loss": 1.1339, + "step": 9803 + }, + { + "epoch": 0.35110211828746396, + "grad_norm": 1.504599928855896, + "learning_rate": 0.00015062482852751908, + "loss": 1.0879, + "step": 9804 + }, + { + "epoch": 0.35113793041703223, + "grad_norm": 1.543170690536499, + "learning_rate": 0.0001506148253400434, + "loss": 1.1107, + "step": 9805 + }, + { + "epoch": 0.35117374254660055, + "grad_norm": 1.4892404079437256, + "learning_rate": 0.00015060482147161058, + "loss": 1.0544, + "step": 9806 + }, + { + "epoch": 0.3512095546761688, + "grad_norm": 1.5005767345428467, + "learning_rate": 0.00015059481692235514, + "loss": 1.2147, + "step": 9807 + }, + { + "epoch": 0.3512453668057371, + "grad_norm": 1.5710654258728027, + "learning_rate": 0.0001505848116924117, + "loss": 1.1175, + "step": 9808 + }, + { + "epoch": 0.35128117893530536, + "grad_norm": 1.4932194948196411, + "learning_rate": 0.00015057480578191485, + "loss": 0.8719, + "step": 9809 + }, + { + "epoch": 0.3513169910648737, + "grad_norm": 1.626858115196228, + "learning_rate": 0.00015056479919099927, + "loss": 1.2125, + "step": 9810 + }, + { + "epoch": 0.35135280319444195, + "grad_norm": 1.3680399656295776, + "learning_rate": 0.0001505547919197995, + "loss": 1.1009, + "step": 9811 + }, + { + "epoch": 0.3513886153240102, + "grad_norm": 1.3560386896133423, + "learning_rate": 0.00015054478396845026, + "loss": 1.1544, + "step": 9812 + }, + { + "epoch": 0.35142442745357855, + "grad_norm": 1.5983729362487793, + "learning_rate": 0.00015053477533708608, + "loss": 1.224, + "step": 9813 + }, + { + "epoch": 0.3514602395831468, + "grad_norm": 1.7613227367401123, + "learning_rate": 0.00015052476602584177, + "loss": 1.1351, + "step": 9814 + }, + { + "epoch": 0.3514960517127151, + "grad_norm": 1.5099636316299438, + "learning_rate": 0.00015051475603485183, + "loss": 0.938, + "step": 9815 + }, + { + "epoch": 0.35153186384228335, + "grad_norm": 1.3762383460998535, + "learning_rate": 0.00015050474536425101, + "loss": 1.1244, + "step": 9816 + }, + { + "epoch": 0.3515676759718517, + "grad_norm": 1.5999419689178467, + "learning_rate": 0.00015049473401417403, + "loss": 1.2481, + "step": 9817 + }, + { + "epoch": 0.35160348810141995, + "grad_norm": 1.5760712623596191, + "learning_rate": 0.00015048472198475553, + "loss": 1.2545, + "step": 9818 + }, + { + "epoch": 0.3516393002309882, + "grad_norm": 1.5166449546813965, + "learning_rate": 0.00015047470927613018, + "loss": 1.3029, + "step": 9819 + }, + { + "epoch": 0.35167511236055654, + "grad_norm": 1.6240808963775635, + "learning_rate": 0.00015046469588843276, + "loss": 1.1691, + "step": 9820 + }, + { + "epoch": 0.3517109244901248, + "grad_norm": 2.1581602096557617, + "learning_rate": 0.00015045468182179795, + "loss": 1.4329, + "step": 9821 + }, + { + "epoch": 0.3517467366196931, + "grad_norm": 1.870644211769104, + "learning_rate": 0.0001504446670763605, + "loss": 1.2312, + "step": 9822 + }, + { + "epoch": 0.35178254874926135, + "grad_norm": 1.62003493309021, + "learning_rate": 0.0001504346516522551, + "loss": 1.3216, + "step": 9823 + }, + { + "epoch": 0.3518183608788297, + "grad_norm": 1.4517337083816528, + "learning_rate": 0.00015042463554961648, + "loss": 1.0915, + "step": 9824 + }, + { + "epoch": 0.35185417300839794, + "grad_norm": 1.8905670642852783, + "learning_rate": 0.0001504146187685795, + "loss": 1.1774, + "step": 9825 + }, + { + "epoch": 0.3518899851379662, + "grad_norm": 1.5447136163711548, + "learning_rate": 0.0001504046013092788, + "loss": 0.8985, + "step": 9826 + }, + { + "epoch": 0.35192579726753453, + "grad_norm": 1.9797468185424805, + "learning_rate": 0.00015039458317184923, + "loss": 1.2335, + "step": 9827 + }, + { + "epoch": 0.3519616093971028, + "grad_norm": 1.3299579620361328, + "learning_rate": 0.00015038456435642554, + "loss": 1.2107, + "step": 9828 + }, + { + "epoch": 0.3519974215266711, + "grad_norm": 1.3810408115386963, + "learning_rate": 0.0001503745448631425, + "loss": 1.1959, + "step": 9829 + }, + { + "epoch": 0.35203323365623934, + "grad_norm": 2.1322410106658936, + "learning_rate": 0.00015036452469213504, + "loss": 1.1761, + "step": 9830 + }, + { + "epoch": 0.35206904578580767, + "grad_norm": 1.524885892868042, + "learning_rate": 0.00015035450384353775, + "loss": 1.1891, + "step": 9831 + }, + { + "epoch": 0.35210485791537594, + "grad_norm": 1.7269960641860962, + "learning_rate": 0.0001503444823174856, + "loss": 1.3084, + "step": 9832 + }, + { + "epoch": 0.3521406700449442, + "grad_norm": 1.5470001697540283, + "learning_rate": 0.0001503344601141134, + "loss": 1.0617, + "step": 9833 + }, + { + "epoch": 0.35217648217451253, + "grad_norm": 1.303488850593567, + "learning_rate": 0.00015032443723355597, + "loss": 1.2065, + "step": 9834 + }, + { + "epoch": 0.3522122943040808, + "grad_norm": 1.7679214477539062, + "learning_rate": 0.0001503144136759481, + "loss": 1.0728, + "step": 9835 + }, + { + "epoch": 0.35224810643364907, + "grad_norm": 1.3973852396011353, + "learning_rate": 0.00015030438944142475, + "loss": 1.0837, + "step": 9836 + }, + { + "epoch": 0.35228391856321734, + "grad_norm": 1.3096765279769897, + "learning_rate": 0.0001502943645301207, + "loss": 0.9484, + "step": 9837 + }, + { + "epoch": 0.35231973069278566, + "grad_norm": 1.613806128501892, + "learning_rate": 0.00015028433894217087, + "loss": 0.9552, + "step": 9838 + }, + { + "epoch": 0.35235554282235393, + "grad_norm": 1.4625219106674194, + "learning_rate": 0.0001502743126777101, + "loss": 1.079, + "step": 9839 + }, + { + "epoch": 0.3523913549519222, + "grad_norm": 1.744978427886963, + "learning_rate": 0.0001502642857368733, + "loss": 1.2026, + "step": 9840 + }, + { + "epoch": 0.3524271670814905, + "grad_norm": 1.7407716512680054, + "learning_rate": 0.00015025425811979542, + "loss": 1.0092, + "step": 9841 + }, + { + "epoch": 0.3524629792110588, + "grad_norm": 1.2519451379776, + "learning_rate": 0.00015024422982661125, + "loss": 1.1686, + "step": 9842 + }, + { + "epoch": 0.35249879134062706, + "grad_norm": 1.30332350730896, + "learning_rate": 0.00015023420085745584, + "loss": 1.1629, + "step": 9843 + }, + { + "epoch": 0.35253460347019533, + "grad_norm": 1.898923635482788, + "learning_rate": 0.00015022417121246398, + "loss": 1.0846, + "step": 9844 + }, + { + "epoch": 0.35257041559976365, + "grad_norm": 1.7823002338409424, + "learning_rate": 0.00015021414089177077, + "loss": 1.26, + "step": 9845 + }, + { + "epoch": 0.3526062277293319, + "grad_norm": 1.4894944429397583, + "learning_rate": 0.000150204109895511, + "loss": 1.2415, + "step": 9846 + }, + { + "epoch": 0.3526420398589002, + "grad_norm": 1.5033265352249146, + "learning_rate": 0.00015019407822381973, + "loss": 1.1053, + "step": 9847 + }, + { + "epoch": 0.3526778519884685, + "grad_norm": 1.5556507110595703, + "learning_rate": 0.00015018404587683186, + "loss": 1.2754, + "step": 9848 + }, + { + "epoch": 0.3527136641180368, + "grad_norm": 1.2979432344436646, + "learning_rate": 0.0001501740128546824, + "loss": 0.9111, + "step": 9849 + }, + { + "epoch": 0.35274947624760505, + "grad_norm": 1.3297159671783447, + "learning_rate": 0.00015016397915750633, + "loss": 1.0328, + "step": 9850 + }, + { + "epoch": 0.3527852883771733, + "grad_norm": 2.4332544803619385, + "learning_rate": 0.0001501539447854386, + "loss": 1.1812, + "step": 9851 + }, + { + "epoch": 0.35282110050674165, + "grad_norm": 1.5905245542526245, + "learning_rate": 0.00015014390973861424, + "loss": 1.2242, + "step": 9852 + }, + { + "epoch": 0.3528569126363099, + "grad_norm": 1.6262837648391724, + "learning_rate": 0.00015013387401716823, + "loss": 0.9777, + "step": 9853 + }, + { + "epoch": 0.3528927247658782, + "grad_norm": 1.8818694353103638, + "learning_rate": 0.00015012383762123566, + "loss": 1.368, + "step": 9854 + }, + { + "epoch": 0.3529285368954465, + "grad_norm": 1.7907822132110596, + "learning_rate": 0.0001501138005509515, + "loss": 1.1741, + "step": 9855 + }, + { + "epoch": 0.3529643490250148, + "grad_norm": 1.8452285528182983, + "learning_rate": 0.00015010376280645077, + "loss": 1.1205, + "step": 9856 + }, + { + "epoch": 0.35300016115458305, + "grad_norm": 1.8573046922683716, + "learning_rate": 0.00015009372438786858, + "loss": 1.3318, + "step": 9857 + }, + { + "epoch": 0.3530359732841513, + "grad_norm": 1.7760368585586548, + "learning_rate": 0.00015008368529533992, + "loss": 1.2491, + "step": 9858 + }, + { + "epoch": 0.35307178541371964, + "grad_norm": 1.3232274055480957, + "learning_rate": 0.00015007364552899988, + "loss": 1.1911, + "step": 9859 + }, + { + "epoch": 0.3531075975432879, + "grad_norm": 1.6055084466934204, + "learning_rate": 0.0001500636050889835, + "loss": 0.9475, + "step": 9860 + }, + { + "epoch": 0.3531434096728562, + "grad_norm": 1.6156290769577026, + "learning_rate": 0.00015005356397542597, + "loss": 1.3315, + "step": 9861 + }, + { + "epoch": 0.3531792218024245, + "grad_norm": 1.509942650794983, + "learning_rate": 0.00015004352218846222, + "loss": 1.2644, + "step": 9862 + }, + { + "epoch": 0.3532150339319928, + "grad_norm": 2.013720989227295, + "learning_rate": 0.00015003347972822746, + "loss": 1.378, + "step": 9863 + }, + { + "epoch": 0.35325084606156104, + "grad_norm": 1.4673702716827393, + "learning_rate": 0.00015002343659485678, + "loss": 1.1465, + "step": 9864 + }, + { + "epoch": 0.3532866581911293, + "grad_norm": 1.5034077167510986, + "learning_rate": 0.0001500133927884853, + "loss": 1.2391, + "step": 9865 + }, + { + "epoch": 0.35332247032069763, + "grad_norm": 1.447718858718872, + "learning_rate": 0.0001500033483092481, + "loss": 1.1952, + "step": 9866 + }, + { + "epoch": 0.3533582824502659, + "grad_norm": 1.9349154233932495, + "learning_rate": 0.00014999330315728035, + "loss": 1.0679, + "step": 9867 + }, + { + "epoch": 0.3533940945798342, + "grad_norm": 1.5311660766601562, + "learning_rate": 0.00014998325733271722, + "loss": 1.069, + "step": 9868 + }, + { + "epoch": 0.3534299067094025, + "grad_norm": 1.7797675132751465, + "learning_rate": 0.0001499732108356938, + "loss": 1.2216, + "step": 9869 + }, + { + "epoch": 0.35346571883897077, + "grad_norm": 1.3776856660842896, + "learning_rate": 0.00014996316366634532, + "loss": 1.0168, + "step": 9870 + }, + { + "epoch": 0.35350153096853904, + "grad_norm": 2.101534843444824, + "learning_rate": 0.0001499531158248069, + "loss": 1.3319, + "step": 9871 + }, + { + "epoch": 0.3535373430981073, + "grad_norm": 1.4254441261291504, + "learning_rate": 0.00014994306731121374, + "loss": 1.3898, + "step": 9872 + }, + { + "epoch": 0.35357315522767563, + "grad_norm": 1.632056713104248, + "learning_rate": 0.00014993301812570104, + "loss": 1.0744, + "step": 9873 + }, + { + "epoch": 0.3536089673572439, + "grad_norm": 1.5497018098831177, + "learning_rate": 0.00014992296826840402, + "loss": 1.1397, + "step": 9874 + }, + { + "epoch": 0.35364477948681217, + "grad_norm": 1.7146087884902954, + "learning_rate": 0.00014991291773945782, + "loss": 1.0893, + "step": 9875 + }, + { + "epoch": 0.3536805916163805, + "grad_norm": 1.4841564893722534, + "learning_rate": 0.0001499028665389977, + "loss": 1.3818, + "step": 9876 + }, + { + "epoch": 0.35371640374594876, + "grad_norm": 1.5348974466323853, + "learning_rate": 0.00014989281466715887, + "loss": 1.1242, + "step": 9877 + }, + { + "epoch": 0.35375221587551703, + "grad_norm": 1.2998963594436646, + "learning_rate": 0.0001498827621240766, + "loss": 0.9683, + "step": 9878 + }, + { + "epoch": 0.3537880280050853, + "grad_norm": 1.62977933883667, + "learning_rate": 0.0001498727089098861, + "loss": 1.3977, + "step": 9879 + }, + { + "epoch": 0.3538238401346536, + "grad_norm": 1.9272698163986206, + "learning_rate": 0.00014986265502472262, + "loss": 1.3281, + "step": 9880 + }, + { + "epoch": 0.3538596522642219, + "grad_norm": 1.4520589113235474, + "learning_rate": 0.00014985260046872145, + "loss": 0.9513, + "step": 9881 + }, + { + "epoch": 0.35389546439379016, + "grad_norm": 1.5212454795837402, + "learning_rate": 0.00014984254524201784, + "loss": 1.0318, + "step": 9882 + }, + { + "epoch": 0.3539312765233585, + "grad_norm": 1.349360704421997, + "learning_rate": 0.0001498324893447471, + "loss": 0.969, + "step": 9883 + }, + { + "epoch": 0.35396708865292675, + "grad_norm": 1.5738508701324463, + "learning_rate": 0.00014982243277704446, + "loss": 1.1741, + "step": 9884 + }, + { + "epoch": 0.354002900782495, + "grad_norm": 1.6995205879211426, + "learning_rate": 0.0001498123755390453, + "loss": 1.0549, + "step": 9885 + }, + { + "epoch": 0.3540387129120633, + "grad_norm": 1.895370364189148, + "learning_rate": 0.00014980231763088482, + "loss": 1.1431, + "step": 9886 + }, + { + "epoch": 0.3540745250416316, + "grad_norm": 1.3506393432617188, + "learning_rate": 0.00014979225905269842, + "loss": 1.0352, + "step": 9887 + }, + { + "epoch": 0.3541103371711999, + "grad_norm": 1.3570785522460938, + "learning_rate": 0.0001497821998046214, + "loss": 1.1534, + "step": 9888 + }, + { + "epoch": 0.35414614930076815, + "grad_norm": 1.6351977586746216, + "learning_rate": 0.0001497721398867891, + "loss": 1.0195, + "step": 9889 + }, + { + "epoch": 0.3541819614303365, + "grad_norm": 1.4064701795578003, + "learning_rate": 0.00014976207929933688, + "loss": 1.2249, + "step": 9890 + }, + { + "epoch": 0.35421777355990475, + "grad_norm": 1.5306637287139893, + "learning_rate": 0.00014975201804240005, + "loss": 1.2733, + "step": 9891 + }, + { + "epoch": 0.354253585689473, + "grad_norm": 2.0531435012817383, + "learning_rate": 0.00014974195611611402, + "loss": 1.323, + "step": 9892 + }, + { + "epoch": 0.3542893978190413, + "grad_norm": 1.621541142463684, + "learning_rate": 0.00014973189352061409, + "loss": 1.055, + "step": 9893 + }, + { + "epoch": 0.3543252099486096, + "grad_norm": 1.925011396408081, + "learning_rate": 0.0001497218302560357, + "loss": 1.1617, + "step": 9894 + }, + { + "epoch": 0.3543610220781779, + "grad_norm": 1.5546512603759766, + "learning_rate": 0.0001497117663225142, + "loss": 1.2067, + "step": 9895 + }, + { + "epoch": 0.35439683420774615, + "grad_norm": 1.5123398303985596, + "learning_rate": 0.00014970170172018505, + "loss": 1.2155, + "step": 9896 + }, + { + "epoch": 0.35443264633731447, + "grad_norm": 2.5121731758117676, + "learning_rate": 0.00014969163644918358, + "loss": 1.1123, + "step": 9897 + }, + { + "epoch": 0.35446845846688274, + "grad_norm": 1.6583205461502075, + "learning_rate": 0.00014968157050964526, + "loss": 1.1815, + "step": 9898 + }, + { + "epoch": 0.354504270596451, + "grad_norm": 1.619205355644226, + "learning_rate": 0.00014967150390170547, + "loss": 1.1541, + "step": 9899 + }, + { + "epoch": 0.3545400827260193, + "grad_norm": 1.4862487316131592, + "learning_rate": 0.0001496614366254997, + "loss": 1.0579, + "step": 9900 + }, + { + "epoch": 0.3545758948555876, + "grad_norm": 1.9496465921401978, + "learning_rate": 0.00014965136868116334, + "loss": 1.3108, + "step": 9901 + }, + { + "epoch": 0.3546117069851559, + "grad_norm": 1.7597970962524414, + "learning_rate": 0.00014964130006883187, + "loss": 1.346, + "step": 9902 + }, + { + "epoch": 0.35464751911472414, + "grad_norm": 1.3578381538391113, + "learning_rate": 0.00014963123078864073, + "loss": 1.0803, + "step": 9903 + }, + { + "epoch": 0.35468333124429247, + "grad_norm": 1.5754430294036865, + "learning_rate": 0.0001496211608407254, + "loss": 1.2689, + "step": 9904 + }, + { + "epoch": 0.35471914337386073, + "grad_norm": 2.028059244155884, + "learning_rate": 0.00014961109022522135, + "loss": 1.3202, + "step": 9905 + }, + { + "epoch": 0.354754955503429, + "grad_norm": 1.8251923322677612, + "learning_rate": 0.0001496010189422641, + "loss": 1.0414, + "step": 9906 + }, + { + "epoch": 0.3547907676329973, + "grad_norm": 1.3864855766296387, + "learning_rate": 0.00014959094699198907, + "loss": 1.0337, + "step": 9907 + }, + { + "epoch": 0.3548265797625656, + "grad_norm": 1.4648710489273071, + "learning_rate": 0.00014958087437453186, + "loss": 1.1572, + "step": 9908 + }, + { + "epoch": 0.35486239189213387, + "grad_norm": 1.3347917795181274, + "learning_rate": 0.00014957080109002794, + "loss": 1.0756, + "step": 9909 + }, + { + "epoch": 0.35489820402170213, + "grad_norm": 1.742512583732605, + "learning_rate": 0.00014956072713861286, + "loss": 1.2376, + "step": 9910 + }, + { + "epoch": 0.35493401615127046, + "grad_norm": 1.5908455848693848, + "learning_rate": 0.00014955065252042206, + "loss": 1.285, + "step": 9911 + }, + { + "epoch": 0.35496982828083873, + "grad_norm": 2.1822526454925537, + "learning_rate": 0.00014954057723559115, + "loss": 1.4349, + "step": 9912 + }, + { + "epoch": 0.355005640410407, + "grad_norm": 1.6651301383972168, + "learning_rate": 0.0001495305012842557, + "loss": 1.2246, + "step": 9913 + }, + { + "epoch": 0.35504145253997527, + "grad_norm": 1.7716532945632935, + "learning_rate": 0.00014952042466655126, + "loss": 1.1799, + "step": 9914 + }, + { + "epoch": 0.3550772646695436, + "grad_norm": 1.6261059045791626, + "learning_rate": 0.00014951034738261337, + "loss": 1.2888, + "step": 9915 + }, + { + "epoch": 0.35511307679911186, + "grad_norm": 1.457478642463684, + "learning_rate": 0.00014950026943257762, + "loss": 1.2534, + "step": 9916 + }, + { + "epoch": 0.35514888892868013, + "grad_norm": 1.7452863454818726, + "learning_rate": 0.00014949019081657959, + "loss": 1.1441, + "step": 9917 + }, + { + "epoch": 0.35518470105824845, + "grad_norm": 1.972368597984314, + "learning_rate": 0.00014948011153475491, + "loss": 1.2837, + "step": 9918 + }, + { + "epoch": 0.3552205131878167, + "grad_norm": 1.5417578220367432, + "learning_rate": 0.0001494700315872391, + "loss": 1.2096, + "step": 9919 + }, + { + "epoch": 0.355256325317385, + "grad_norm": 2.061330556869507, + "learning_rate": 0.00014945995097416788, + "loss": 1.2586, + "step": 9920 + }, + { + "epoch": 0.35529213744695326, + "grad_norm": 1.3010426759719849, + "learning_rate": 0.0001494498696956768, + "loss": 1.2014, + "step": 9921 + }, + { + "epoch": 0.3553279495765216, + "grad_norm": 1.4543670415878296, + "learning_rate": 0.0001494397877519015, + "loss": 1.1694, + "step": 9922 + }, + { + "epoch": 0.35536376170608985, + "grad_norm": 1.3269355297088623, + "learning_rate": 0.00014942970514297761, + "loss": 0.9365, + "step": 9923 + }, + { + "epoch": 0.3553995738356581, + "grad_norm": 1.3955373764038086, + "learning_rate": 0.00014941962186904083, + "loss": 1.189, + "step": 9924 + }, + { + "epoch": 0.35543538596522645, + "grad_norm": 1.53092622756958, + "learning_rate": 0.00014940953793022676, + "loss": 1.1755, + "step": 9925 + }, + { + "epoch": 0.3554711980947947, + "grad_norm": 2.223764419555664, + "learning_rate": 0.00014939945332667108, + "loss": 1.1591, + "step": 9926 + }, + { + "epoch": 0.355507010224363, + "grad_norm": 1.528232455253601, + "learning_rate": 0.00014938936805850955, + "loss": 1.0226, + "step": 9927 + }, + { + "epoch": 0.35554282235393125, + "grad_norm": 1.5883843898773193, + "learning_rate": 0.0001493792821258777, + "loss": 1.1772, + "step": 9928 + }, + { + "epoch": 0.3555786344834996, + "grad_norm": 1.5065370798110962, + "learning_rate": 0.00014936919552891134, + "loss": 0.9974, + "step": 9929 + }, + { + "epoch": 0.35561444661306785, + "grad_norm": 1.4708603620529175, + "learning_rate": 0.00014935910826774612, + "loss": 1.32, + "step": 9930 + }, + { + "epoch": 0.3556502587426361, + "grad_norm": 1.7068907022476196, + "learning_rate": 0.0001493490203425178, + "loss": 1.1033, + "step": 9931 + }, + { + "epoch": 0.35568607087220444, + "grad_norm": 1.7582863569259644, + "learning_rate": 0.00014933893175336202, + "loss": 1.2475, + "step": 9932 + }, + { + "epoch": 0.3557218830017727, + "grad_norm": 1.6924091577529907, + "learning_rate": 0.0001493288425004146, + "loss": 1.1765, + "step": 9933 + }, + { + "epoch": 0.355757695131341, + "grad_norm": 1.7326093912124634, + "learning_rate": 0.00014931875258381117, + "loss": 1.2945, + "step": 9934 + }, + { + "epoch": 0.35579350726090925, + "grad_norm": 1.3792593479156494, + "learning_rate": 0.00014930866200368761, + "loss": 1.303, + "step": 9935 + }, + { + "epoch": 0.35582931939047757, + "grad_norm": 1.7573635578155518, + "learning_rate": 0.0001492985707601796, + "loss": 1.2397, + "step": 9936 + }, + { + "epoch": 0.35586513152004584, + "grad_norm": 1.6930618286132812, + "learning_rate": 0.00014928847885342287, + "loss": 1.1368, + "step": 9937 + }, + { + "epoch": 0.3559009436496141, + "grad_norm": 1.5544538497924805, + "learning_rate": 0.00014927838628355327, + "loss": 1.0768, + "step": 9938 + }, + { + "epoch": 0.35593675577918243, + "grad_norm": 1.1760685443878174, + "learning_rate": 0.0001492682930507065, + "loss": 1.0269, + "step": 9939 + }, + { + "epoch": 0.3559725679087507, + "grad_norm": 1.751328945159912, + "learning_rate": 0.00014925819915501847, + "loss": 1.247, + "step": 9940 + }, + { + "epoch": 0.356008380038319, + "grad_norm": 1.8983820676803589, + "learning_rate": 0.00014924810459662484, + "loss": 1.4111, + "step": 9941 + }, + { + "epoch": 0.35604419216788724, + "grad_norm": 1.7549796104431152, + "learning_rate": 0.0001492380093756615, + "loss": 1.067, + "step": 9942 + }, + { + "epoch": 0.35608000429745557, + "grad_norm": 1.2942321300506592, + "learning_rate": 0.0001492279134922643, + "loss": 1.1052, + "step": 9943 + }, + { + "epoch": 0.35611581642702383, + "grad_norm": 1.8963521718978882, + "learning_rate": 0.000149217816946569, + "loss": 1.0809, + "step": 9944 + }, + { + "epoch": 0.3561516285565921, + "grad_norm": 1.6714656352996826, + "learning_rate": 0.00014920771973871147, + "loss": 1.3825, + "step": 9945 + }, + { + "epoch": 0.35618744068616043, + "grad_norm": 1.6547831296920776, + "learning_rate": 0.00014919762186882754, + "loss": 1.1221, + "step": 9946 + }, + { + "epoch": 0.3562232528157287, + "grad_norm": 1.4799480438232422, + "learning_rate": 0.00014918752333705303, + "loss": 0.8536, + "step": 9947 + }, + { + "epoch": 0.35625906494529697, + "grad_norm": 1.6180773973464966, + "learning_rate": 0.00014917742414352386, + "loss": 1.1439, + "step": 9948 + }, + { + "epoch": 0.35629487707486523, + "grad_norm": 1.8168809413909912, + "learning_rate": 0.00014916732428837593, + "loss": 1.3821, + "step": 9949 + }, + { + "epoch": 0.35633068920443356, + "grad_norm": 1.472667932510376, + "learning_rate": 0.00014915722377174503, + "loss": 1.0832, + "step": 9950 + }, + { + "epoch": 0.35636650133400183, + "grad_norm": 1.4810538291931152, + "learning_rate": 0.0001491471225937671, + "loss": 1.2364, + "step": 9951 + }, + { + "epoch": 0.3564023134635701, + "grad_norm": 1.4295247793197632, + "learning_rate": 0.000149137020754578, + "loss": 1.1444, + "step": 9952 + }, + { + "epoch": 0.3564381255931384, + "grad_norm": 1.4190547466278076, + "learning_rate": 0.0001491269182543137, + "loss": 1.1403, + "step": 9953 + }, + { + "epoch": 0.3564739377227067, + "grad_norm": 2.085313558578491, + "learning_rate": 0.0001491168150931101, + "loss": 1.0817, + "step": 9954 + }, + { + "epoch": 0.35650974985227496, + "grad_norm": 1.82062828540802, + "learning_rate": 0.00014910671127110308, + "loss": 1.0832, + "step": 9955 + }, + { + "epoch": 0.35654556198184323, + "grad_norm": 1.5688157081604004, + "learning_rate": 0.00014909660678842862, + "loss": 1.1161, + "step": 9956 + }, + { + "epoch": 0.35658137411141155, + "grad_norm": 1.6024221181869507, + "learning_rate": 0.0001490865016452226, + "loss": 1.279, + "step": 9957 + }, + { + "epoch": 0.3566171862409798, + "grad_norm": 1.4036805629730225, + "learning_rate": 0.00014907639584162109, + "loss": 1.247, + "step": 9958 + }, + { + "epoch": 0.3566529983705481, + "grad_norm": 1.528833270072937, + "learning_rate": 0.00014906628937775995, + "loss": 1.2812, + "step": 9959 + }, + { + "epoch": 0.3566888105001164, + "grad_norm": 1.5632951259613037, + "learning_rate": 0.00014905618225377517, + "loss": 1.0044, + "step": 9960 + }, + { + "epoch": 0.3567246226296847, + "grad_norm": 1.8925098180770874, + "learning_rate": 0.00014904607446980273, + "loss": 1.1152, + "step": 9961 + }, + { + "epoch": 0.35676043475925295, + "grad_norm": 1.651370882987976, + "learning_rate": 0.00014903596602597864, + "loss": 1.3329, + "step": 9962 + }, + { + "epoch": 0.3567962468888212, + "grad_norm": 1.8060799837112427, + "learning_rate": 0.00014902585692243885, + "loss": 1.1972, + "step": 9963 + }, + { + "epoch": 0.35683205901838955, + "grad_norm": 1.5328514575958252, + "learning_rate": 0.00014901574715931942, + "loss": 1.0935, + "step": 9964 + }, + { + "epoch": 0.3568678711479578, + "grad_norm": 1.5265860557556152, + "learning_rate": 0.00014900563673675633, + "loss": 1.0813, + "step": 9965 + }, + { + "epoch": 0.3569036832775261, + "grad_norm": 1.7016756534576416, + "learning_rate": 0.00014899552565488563, + "loss": 1.3155, + "step": 9966 + }, + { + "epoch": 0.3569394954070944, + "grad_norm": 1.5904252529144287, + "learning_rate": 0.0001489854139138433, + "loss": 1.1827, + "step": 9967 + }, + { + "epoch": 0.3569753075366627, + "grad_norm": 1.470960259437561, + "learning_rate": 0.00014897530151376545, + "loss": 1.2639, + "step": 9968 + }, + { + "epoch": 0.35701111966623095, + "grad_norm": 1.5675864219665527, + "learning_rate": 0.00014896518845478805, + "loss": 1.0919, + "step": 9969 + }, + { + "epoch": 0.3570469317957992, + "grad_norm": 1.5472854375839233, + "learning_rate": 0.00014895507473704718, + "loss": 1.2247, + "step": 9970 + }, + { + "epoch": 0.35708274392536754, + "grad_norm": 1.8024758100509644, + "learning_rate": 0.00014894496036067903, + "loss": 1.0984, + "step": 9971 + }, + { + "epoch": 0.3571185560549358, + "grad_norm": 2.0242316722869873, + "learning_rate": 0.00014893484532581947, + "loss": 1.1866, + "step": 9972 + }, + { + "epoch": 0.3571543681845041, + "grad_norm": 1.411063313484192, + "learning_rate": 0.00014892472963260475, + "loss": 1.3674, + "step": 9973 + }, + { + "epoch": 0.3571901803140724, + "grad_norm": 1.668316125869751, + "learning_rate": 0.00014891461328117087, + "loss": 1.0358, + "step": 9974 + }, + { + "epoch": 0.35722599244364067, + "grad_norm": 1.433045506477356, + "learning_rate": 0.00014890449627165398, + "loss": 1.0438, + "step": 9975 + }, + { + "epoch": 0.35726180457320894, + "grad_norm": 1.4465934038162231, + "learning_rate": 0.00014889437860419013, + "loss": 1.2768, + "step": 9976 + }, + { + "epoch": 0.3572976167027772, + "grad_norm": 1.690127968788147, + "learning_rate": 0.00014888426027891553, + "loss": 1.2402, + "step": 9977 + }, + { + "epoch": 0.35733342883234553, + "grad_norm": 1.3642487525939941, + "learning_rate": 0.00014887414129596623, + "loss": 1.2365, + "step": 9978 + }, + { + "epoch": 0.3573692409619138, + "grad_norm": 1.7012896537780762, + "learning_rate": 0.00014886402165547845, + "loss": 1.0863, + "step": 9979 + }, + { + "epoch": 0.3574050530914821, + "grad_norm": 1.24565851688385, + "learning_rate": 0.00014885390135758826, + "loss": 1.135, + "step": 9980 + }, + { + "epoch": 0.3574408652210504, + "grad_norm": 1.7465468645095825, + "learning_rate": 0.00014884378040243184, + "loss": 1.3168, + "step": 9981 + }, + { + "epoch": 0.35747667735061867, + "grad_norm": 1.1902148723602295, + "learning_rate": 0.0001488336587901454, + "loss": 1.1037, + "step": 9982 + }, + { + "epoch": 0.35751248948018693, + "grad_norm": 1.619979977607727, + "learning_rate": 0.000148823536520865, + "loss": 1.1154, + "step": 9983 + }, + { + "epoch": 0.3575483016097552, + "grad_norm": 1.54076087474823, + "learning_rate": 0.00014881341359472696, + "loss": 1.2519, + "step": 9984 + }, + { + "epoch": 0.35758411373932353, + "grad_norm": 2.0008814334869385, + "learning_rate": 0.00014880329001186736, + "loss": 1.1014, + "step": 9985 + }, + { + "epoch": 0.3576199258688918, + "grad_norm": 1.4983762502670288, + "learning_rate": 0.00014879316577242246, + "loss": 1.0861, + "step": 9986 + }, + { + "epoch": 0.35765573799846007, + "grad_norm": 2.0293140411376953, + "learning_rate": 0.00014878304087652847, + "loss": 1.1643, + "step": 9987 + }, + { + "epoch": 0.3576915501280284, + "grad_norm": 1.6483116149902344, + "learning_rate": 0.00014877291532432158, + "loss": 1.1083, + "step": 9988 + }, + { + "epoch": 0.35772736225759666, + "grad_norm": 1.558372974395752, + "learning_rate": 0.00014876278911593802, + "loss": 1.2422, + "step": 9989 + }, + { + "epoch": 0.35776317438716493, + "grad_norm": 1.5747052431106567, + "learning_rate": 0.00014875266225151403, + "loss": 1.0174, + "step": 9990 + }, + { + "epoch": 0.3577989865167332, + "grad_norm": 1.6416763067245483, + "learning_rate": 0.00014874253473118586, + "loss": 1.1064, + "step": 9991 + }, + { + "epoch": 0.3578347986463015, + "grad_norm": 1.4228304624557495, + "learning_rate": 0.00014873240655508975, + "loss": 1.087, + "step": 9992 + }, + { + "epoch": 0.3578706107758698, + "grad_norm": 1.5811141729354858, + "learning_rate": 0.00014872227772336197, + "loss": 1.2629, + "step": 9993 + }, + { + "epoch": 0.35790642290543806, + "grad_norm": 1.400210976600647, + "learning_rate": 0.0001487121482361388, + "loss": 1.0186, + "step": 9994 + }, + { + "epoch": 0.3579422350350064, + "grad_norm": 1.7294005155563354, + "learning_rate": 0.00014870201809355653, + "loss": 1.1167, + "step": 9995 + }, + { + "epoch": 0.35797804716457465, + "grad_norm": 1.6089154481887817, + "learning_rate": 0.0001486918872957514, + "loss": 1.0522, + "step": 9996 + }, + { + "epoch": 0.3580138592941429, + "grad_norm": 1.794985055923462, + "learning_rate": 0.00014868175584285974, + "loss": 1.1592, + "step": 9997 + }, + { + "epoch": 0.3580496714237112, + "grad_norm": 1.6904035806655884, + "learning_rate": 0.00014867162373501786, + "loss": 1.1878, + "step": 9998 + }, + { + "epoch": 0.3580854835532795, + "grad_norm": 1.5313382148742676, + "learning_rate": 0.00014866149097236204, + "loss": 1.0745, + "step": 9999 + }, + { + "epoch": 0.3581212956828478, + "grad_norm": 1.6024516820907593, + "learning_rate": 0.00014865135755502866, + "loss": 1.0659, + "step": 10000 + }, + { + "epoch": 0.35815710781241605, + "grad_norm": 1.3833664655685425, + "learning_rate": 0.000148641223483154, + "loss": 1.111, + "step": 10001 + }, + { + "epoch": 0.3581929199419843, + "grad_norm": 1.8405494689941406, + "learning_rate": 0.00014863108875687444, + "loss": 1.0268, + "step": 10002 + }, + { + "epoch": 0.35822873207155265, + "grad_norm": 2.0618538856506348, + "learning_rate": 0.00014862095337632626, + "loss": 1.1072, + "step": 10003 + }, + { + "epoch": 0.3582645442011209, + "grad_norm": 1.5912472009658813, + "learning_rate": 0.00014861081734164592, + "loss": 1.2031, + "step": 10004 + }, + { + "epoch": 0.3583003563306892, + "grad_norm": 1.7056297063827515, + "learning_rate": 0.0001486006806529697, + "loss": 1.2253, + "step": 10005 + }, + { + "epoch": 0.3583361684602575, + "grad_norm": 1.4825495481491089, + "learning_rate": 0.00014859054331043406, + "loss": 1.2279, + "step": 10006 + }, + { + "epoch": 0.3583719805898258, + "grad_norm": 1.6487232446670532, + "learning_rate": 0.0001485804053141753, + "loss": 1.253, + "step": 10007 + }, + { + "epoch": 0.35840779271939405, + "grad_norm": 1.5332564115524292, + "learning_rate": 0.00014857026666432988, + "loss": 1.1815, + "step": 10008 + }, + { + "epoch": 0.3584436048489623, + "grad_norm": 1.4887186288833618, + "learning_rate": 0.00014856012736103413, + "loss": 1.3188, + "step": 10009 + }, + { + "epoch": 0.35847941697853064, + "grad_norm": 1.6696367263793945, + "learning_rate": 0.00014854998740442454, + "loss": 1.2531, + "step": 10010 + }, + { + "epoch": 0.3585152291080989, + "grad_norm": 1.4433315992355347, + "learning_rate": 0.00014853984679463747, + "loss": 1.1849, + "step": 10011 + }, + { + "epoch": 0.3585510412376672, + "grad_norm": 1.475480318069458, + "learning_rate": 0.00014852970553180938, + "loss": 0.9323, + "step": 10012 + }, + { + "epoch": 0.3585868533672355, + "grad_norm": 1.2462831735610962, + "learning_rate": 0.0001485195636160767, + "loss": 1.0433, + "step": 10013 + }, + { + "epoch": 0.35862266549680377, + "grad_norm": 8.797764778137207, + "learning_rate": 0.00014850942104757588, + "loss": 1.1693, + "step": 10014 + }, + { + "epoch": 0.35865847762637204, + "grad_norm": 1.594314455986023, + "learning_rate": 0.0001484992778264434, + "loss": 1.1223, + "step": 10015 + }, + { + "epoch": 0.3586942897559403, + "grad_norm": 1.477168321609497, + "learning_rate": 0.00014848913395281568, + "loss": 1.0587, + "step": 10016 + }, + { + "epoch": 0.35873010188550863, + "grad_norm": 1.4987590312957764, + "learning_rate": 0.00014847898942682922, + "loss": 1.1413, + "step": 10017 + }, + { + "epoch": 0.3587659140150769, + "grad_norm": 1.4779413938522339, + "learning_rate": 0.00014846884424862044, + "loss": 1.1076, + "step": 10018 + }, + { + "epoch": 0.35880172614464517, + "grad_norm": 1.3978538513183594, + "learning_rate": 0.0001484586984183259, + "loss": 1.1533, + "step": 10019 + }, + { + "epoch": 0.3588375382742135, + "grad_norm": 1.568698763847351, + "learning_rate": 0.0001484485519360821, + "loss": 1.1467, + "step": 10020 + }, + { + "epoch": 0.35887335040378177, + "grad_norm": 1.768151879310608, + "learning_rate": 0.00014843840480202554, + "loss": 1.1859, + "step": 10021 + }, + { + "epoch": 0.35890916253335003, + "grad_norm": 1.5240693092346191, + "learning_rate": 0.00014842825701629267, + "loss": 1.0279, + "step": 10022 + }, + { + "epoch": 0.3589449746629183, + "grad_norm": 1.5371402502059937, + "learning_rate": 0.0001484181085790201, + "loss": 1.111, + "step": 10023 + }, + { + "epoch": 0.35898078679248663, + "grad_norm": 1.464701771736145, + "learning_rate": 0.00014840795949034439, + "loss": 1.1694, + "step": 10024 + }, + { + "epoch": 0.3590165989220549, + "grad_norm": 1.6503924131393433, + "learning_rate": 0.00014839780975040194, + "loss": 1.1398, + "step": 10025 + }, + { + "epoch": 0.35905241105162317, + "grad_norm": 1.3522515296936035, + "learning_rate": 0.00014838765935932944, + "loss": 1.1715, + "step": 10026 + }, + { + "epoch": 0.3590882231811915, + "grad_norm": 3.1149566173553467, + "learning_rate": 0.00014837750831726338, + "loss": 1.171, + "step": 10027 + }, + { + "epoch": 0.35912403531075976, + "grad_norm": 1.7269411087036133, + "learning_rate": 0.00014836735662434035, + "loss": 1.2377, + "step": 10028 + }, + { + "epoch": 0.35915984744032803, + "grad_norm": 1.543023943901062, + "learning_rate": 0.00014835720428069693, + "loss": 0.8269, + "step": 10029 + }, + { + "epoch": 0.3591956595698963, + "grad_norm": 1.5594792366027832, + "learning_rate": 0.0001483470512864697, + "loss": 1.08, + "step": 10030 + }, + { + "epoch": 0.3592314716994646, + "grad_norm": 1.4114876985549927, + "learning_rate": 0.00014833689764179523, + "loss": 1.2794, + "step": 10031 + }, + { + "epoch": 0.3592672838290329, + "grad_norm": 1.281861424446106, + "learning_rate": 0.00014832674334681022, + "loss": 1.0507, + "step": 10032 + }, + { + "epoch": 0.35930309595860116, + "grad_norm": 1.663297414779663, + "learning_rate": 0.00014831658840165117, + "loss": 1.3264, + "step": 10033 + }, + { + "epoch": 0.3593389080881695, + "grad_norm": 1.7764054536819458, + "learning_rate": 0.00014830643280645472, + "loss": 1.1187, + "step": 10034 + }, + { + "epoch": 0.35937472021773775, + "grad_norm": 1.2981988191604614, + "learning_rate": 0.00014829627656135757, + "loss": 1.0744, + "step": 10035 + }, + { + "epoch": 0.359410532347306, + "grad_norm": 1.5554753541946411, + "learning_rate": 0.0001482861196664963, + "loss": 1.1313, + "step": 10036 + }, + { + "epoch": 0.3594463444768743, + "grad_norm": 1.4123824834823608, + "learning_rate": 0.00014827596212200762, + "loss": 1.0523, + "step": 10037 + }, + { + "epoch": 0.3594821566064426, + "grad_norm": 1.9121085405349731, + "learning_rate": 0.00014826580392802806, + "loss": 1.3497, + "step": 10038 + }, + { + "epoch": 0.3595179687360109, + "grad_norm": 2.2713611125946045, + "learning_rate": 0.00014825564508469443, + "loss": 1.4311, + "step": 10039 + }, + { + "epoch": 0.35955378086557915, + "grad_norm": 3.1128196716308594, + "learning_rate": 0.00014824548559214332, + "loss": 1.1429, + "step": 10040 + }, + { + "epoch": 0.3595895929951475, + "grad_norm": 1.4343270063400269, + "learning_rate": 0.0001482353254505114, + "loss": 1.2336, + "step": 10041 + }, + { + "epoch": 0.35962540512471575, + "grad_norm": 1.5832581520080566, + "learning_rate": 0.00014822516465993546, + "loss": 1.1325, + "step": 10042 + }, + { + "epoch": 0.359661217254284, + "grad_norm": 1.3456978797912598, + "learning_rate": 0.0001482150032205521, + "loss": 1.239, + "step": 10043 + }, + { + "epoch": 0.3596970293838523, + "grad_norm": 1.406951665878296, + "learning_rate": 0.00014820484113249805, + "loss": 1.1134, + "step": 10044 + }, + { + "epoch": 0.3597328415134206, + "grad_norm": 1.3235896825790405, + "learning_rate": 0.00014819467839591007, + "loss": 1.1235, + "step": 10045 + }, + { + "epoch": 0.3597686536429889, + "grad_norm": 1.5649739503860474, + "learning_rate": 0.00014818451501092485, + "loss": 1.1856, + "step": 10046 + }, + { + "epoch": 0.35980446577255715, + "grad_norm": 1.5148497819900513, + "learning_rate": 0.00014817435097767912, + "loss": 1.2475, + "step": 10047 + }, + { + "epoch": 0.35984027790212547, + "grad_norm": 1.2941906452178955, + "learning_rate": 0.00014816418629630968, + "loss": 1.0132, + "step": 10048 + }, + { + "epoch": 0.35987609003169374, + "grad_norm": 1.8339210748672485, + "learning_rate": 0.0001481540209669532, + "loss": 1.1741, + "step": 10049 + }, + { + "epoch": 0.359911902161262, + "grad_norm": 1.702880620956421, + "learning_rate": 0.0001481438549897465, + "loss": 1.0537, + "step": 10050 + }, + { + "epoch": 0.3599477142908303, + "grad_norm": 1.3481134176254272, + "learning_rate": 0.00014813368836482632, + "loss": 0.9136, + "step": 10051 + }, + { + "epoch": 0.3599835264203986, + "grad_norm": 1.9219114780426025, + "learning_rate": 0.00014812352109232947, + "loss": 1.0406, + "step": 10052 + }, + { + "epoch": 0.36001933854996687, + "grad_norm": 1.7949159145355225, + "learning_rate": 0.0001481133531723927, + "loss": 1.0389, + "step": 10053 + }, + { + "epoch": 0.36005515067953514, + "grad_norm": 2.2268218994140625, + "learning_rate": 0.00014810318460515282, + "loss": 1.3636, + "step": 10054 + }, + { + "epoch": 0.36009096280910347, + "grad_norm": 1.7730791568756104, + "learning_rate": 0.00014809301539074667, + "loss": 1.1012, + "step": 10055 + }, + { + "epoch": 0.36012677493867173, + "grad_norm": 1.9007648229599, + "learning_rate": 0.000148082845529311, + "loss": 1.2254, + "step": 10056 + }, + { + "epoch": 0.36016258706824, + "grad_norm": 1.4087740182876587, + "learning_rate": 0.00014807267502098267, + "loss": 1.0637, + "step": 10057 + }, + { + "epoch": 0.36019839919780827, + "grad_norm": 1.5188648700714111, + "learning_rate": 0.00014806250386589851, + "loss": 1.1862, + "step": 10058 + }, + { + "epoch": 0.3602342113273766, + "grad_norm": 1.7672334909439087, + "learning_rate": 0.0001480523320641954, + "loss": 0.9373, + "step": 10059 + }, + { + "epoch": 0.36027002345694487, + "grad_norm": 2.1847407817840576, + "learning_rate": 0.00014804215961601008, + "loss": 1.158, + "step": 10060 + }, + { + "epoch": 0.36030583558651313, + "grad_norm": 2.076361894607544, + "learning_rate": 0.00014803198652147952, + "loss": 1.3697, + "step": 10061 + }, + { + "epoch": 0.36034164771608146, + "grad_norm": 1.765182614326477, + "learning_rate": 0.00014802181278074052, + "loss": 1.2934, + "step": 10062 + }, + { + "epoch": 0.36037745984564973, + "grad_norm": 1.669805884361267, + "learning_rate": 0.00014801163839392998, + "loss": 1.0966, + "step": 10063 + }, + { + "epoch": 0.360413271975218, + "grad_norm": 1.6779811382293701, + "learning_rate": 0.00014800146336118474, + "loss": 1.2726, + "step": 10064 + }, + { + "epoch": 0.36044908410478627, + "grad_norm": 1.797054648399353, + "learning_rate": 0.0001479912876826418, + "loss": 1.2041, + "step": 10065 + }, + { + "epoch": 0.3604848962343546, + "grad_norm": 1.367297649383545, + "learning_rate": 0.0001479811113584379, + "loss": 1.1815, + "step": 10066 + }, + { + "epoch": 0.36052070836392286, + "grad_norm": 1.457230567932129, + "learning_rate": 0.00014797093438871008, + "loss": 1.0765, + "step": 10067 + }, + { + "epoch": 0.36055652049349113, + "grad_norm": 1.523238182067871, + "learning_rate": 0.00014796075677359525, + "loss": 1.1627, + "step": 10068 + }, + { + "epoch": 0.36059233262305945, + "grad_norm": 1.523218035697937, + "learning_rate": 0.00014795057851323023, + "loss": 1.1829, + "step": 10069 + }, + { + "epoch": 0.3606281447526277, + "grad_norm": 1.5231448411941528, + "learning_rate": 0.0001479403996077521, + "loss": 0.8992, + "step": 10070 + }, + { + "epoch": 0.360663956882196, + "grad_norm": 2.102266788482666, + "learning_rate": 0.0001479302200572977, + "loss": 1.1011, + "step": 10071 + }, + { + "epoch": 0.36069976901176426, + "grad_norm": 1.4190372228622437, + "learning_rate": 0.00014792003986200403, + "loss": 1.0941, + "step": 10072 + }, + { + "epoch": 0.3607355811413326, + "grad_norm": 2.0923972129821777, + "learning_rate": 0.000147909859022008, + "loss": 1.1801, + "step": 10073 + }, + { + "epoch": 0.36077139327090085, + "grad_norm": 1.2213666439056396, + "learning_rate": 0.00014789967753744664, + "loss": 1.0994, + "step": 10074 + }, + { + "epoch": 0.3608072054004691, + "grad_norm": 1.725895643234253, + "learning_rate": 0.00014788949540845689, + "loss": 1.07, + "step": 10075 + }, + { + "epoch": 0.36084301753003745, + "grad_norm": 1.3486530780792236, + "learning_rate": 0.0001478793126351758, + "loss": 1.1366, + "step": 10076 + }, + { + "epoch": 0.3608788296596057, + "grad_norm": 1.716391682624817, + "learning_rate": 0.00014786912921774028, + "loss": 1.2712, + "step": 10077 + }, + { + "epoch": 0.360914641789174, + "grad_norm": 2.1078522205352783, + "learning_rate": 0.00014785894515628736, + "loss": 1.2544, + "step": 10078 + }, + { + "epoch": 0.36095045391874225, + "grad_norm": 1.6265997886657715, + "learning_rate": 0.0001478487604509541, + "loss": 1.1831, + "step": 10079 + }, + { + "epoch": 0.3609862660483106, + "grad_norm": 1.5680893659591675, + "learning_rate": 0.00014783857510187743, + "loss": 1.287, + "step": 10080 + }, + { + "epoch": 0.36102207817787885, + "grad_norm": 1.7272233963012695, + "learning_rate": 0.00014782838910919449, + "loss": 1.0813, + "step": 10081 + }, + { + "epoch": 0.3610578903074471, + "grad_norm": 1.7600164413452148, + "learning_rate": 0.00014781820247304227, + "loss": 1.2179, + "step": 10082 + }, + { + "epoch": 0.36109370243701544, + "grad_norm": 1.6152929067611694, + "learning_rate": 0.00014780801519355782, + "loss": 1.1072, + "step": 10083 + }, + { + "epoch": 0.3611295145665837, + "grad_norm": 1.7318254709243774, + "learning_rate": 0.00014779782727087815, + "loss": 1.1262, + "step": 10084 + }, + { + "epoch": 0.361165326696152, + "grad_norm": 1.984965205192566, + "learning_rate": 0.0001477876387051404, + "loss": 1.2773, + "step": 10085 + }, + { + "epoch": 0.36120113882572025, + "grad_norm": 1.2699191570281982, + "learning_rate": 0.00014777744949648163, + "loss": 1.0636, + "step": 10086 + }, + { + "epoch": 0.36123695095528857, + "grad_norm": 1.5229862928390503, + "learning_rate": 0.00014776725964503888, + "loss": 1.1318, + "step": 10087 + }, + { + "epoch": 0.36127276308485684, + "grad_norm": 1.8044993877410889, + "learning_rate": 0.00014775706915094928, + "loss": 1.2668, + "step": 10088 + }, + { + "epoch": 0.3613085752144251, + "grad_norm": 2.605436325073242, + "learning_rate": 0.0001477468780143499, + "loss": 1.1147, + "step": 10089 + }, + { + "epoch": 0.36134438734399343, + "grad_norm": 1.460282325744629, + "learning_rate": 0.00014773668623537786, + "loss": 1.0229, + "step": 10090 + }, + { + "epoch": 0.3613801994735617, + "grad_norm": 1.4074604511260986, + "learning_rate": 0.0001477264938141703, + "loss": 1.2113, + "step": 10091 + }, + { + "epoch": 0.36141601160312997, + "grad_norm": 1.3921787738800049, + "learning_rate": 0.00014771630075086434, + "loss": 1.1784, + "step": 10092 + }, + { + "epoch": 0.36145182373269824, + "grad_norm": 1.4569685459136963, + "learning_rate": 0.00014770610704559708, + "loss": 1.265, + "step": 10093 + }, + { + "epoch": 0.36148763586226657, + "grad_norm": 1.6548079252243042, + "learning_rate": 0.0001476959126985057, + "loss": 1.0356, + "step": 10094 + }, + { + "epoch": 0.36152344799183483, + "grad_norm": 1.6696362495422363, + "learning_rate": 0.00014768571770972734, + "loss": 1.1777, + "step": 10095 + }, + { + "epoch": 0.3615592601214031, + "grad_norm": 1.5856258869171143, + "learning_rate": 0.00014767552207939913, + "loss": 0.9784, + "step": 10096 + }, + { + "epoch": 0.3615950722509714, + "grad_norm": 1.4582496881484985, + "learning_rate": 0.0001476653258076583, + "loss": 1.2024, + "step": 10097 + }, + { + "epoch": 0.3616308843805397, + "grad_norm": 1.4219856262207031, + "learning_rate": 0.00014765512889464198, + "loss": 1.028, + "step": 10098 + }, + { + "epoch": 0.36166669651010797, + "grad_norm": 1.3425030708312988, + "learning_rate": 0.00014764493134048737, + "loss": 1.2128, + "step": 10099 + }, + { + "epoch": 0.36170250863967623, + "grad_norm": 1.7323471307754517, + "learning_rate": 0.00014763473314533166, + "loss": 1.0861, + "step": 10100 + }, + { + "epoch": 0.36173832076924456, + "grad_norm": 1.5314821004867554, + "learning_rate": 0.0001476245343093121, + "loss": 1.1089, + "step": 10101 + }, + { + "epoch": 0.36177413289881283, + "grad_norm": 1.4727953672409058, + "learning_rate": 0.00014761433483256582, + "loss": 1.1438, + "step": 10102 + }, + { + "epoch": 0.3618099450283811, + "grad_norm": 1.303435206413269, + "learning_rate": 0.00014760413471523012, + "loss": 1.035, + "step": 10103 + }, + { + "epoch": 0.3618457571579494, + "grad_norm": 1.5040369033813477, + "learning_rate": 0.00014759393395744215, + "loss": 1.2302, + "step": 10104 + }, + { + "epoch": 0.3618815692875177, + "grad_norm": 2.234722375869751, + "learning_rate": 0.00014758373255933924, + "loss": 1.4117, + "step": 10105 + }, + { + "epoch": 0.36191738141708596, + "grad_norm": 1.5493662357330322, + "learning_rate": 0.00014757353052105853, + "loss": 0.8949, + "step": 10106 + }, + { + "epoch": 0.36195319354665423, + "grad_norm": 2.1716349124908447, + "learning_rate": 0.00014756332784273738, + "loss": 1.2431, + "step": 10107 + }, + { + "epoch": 0.36198900567622255, + "grad_norm": 1.4196513891220093, + "learning_rate": 0.00014755312452451296, + "loss": 1.3441, + "step": 10108 + }, + { + "epoch": 0.3620248178057908, + "grad_norm": 1.3427472114562988, + "learning_rate": 0.0001475429205665226, + "loss": 1.0032, + "step": 10109 + }, + { + "epoch": 0.3620606299353591, + "grad_norm": 1.3500571250915527, + "learning_rate": 0.0001475327159689036, + "loss": 1.192, + "step": 10110 + }, + { + "epoch": 0.3620964420649274, + "grad_norm": 2.6423397064208984, + "learning_rate": 0.0001475225107317932, + "loss": 1.0002, + "step": 10111 + }, + { + "epoch": 0.3621322541944957, + "grad_norm": 2.217785358428955, + "learning_rate": 0.00014751230485532873, + "loss": 1.1477, + "step": 10112 + }, + { + "epoch": 0.36216806632406395, + "grad_norm": 2.0779497623443604, + "learning_rate": 0.00014750209833964747, + "loss": 1.2469, + "step": 10113 + }, + { + "epoch": 0.3622038784536322, + "grad_norm": 1.5484553575515747, + "learning_rate": 0.00014749189118488677, + "loss": 1.0809, + "step": 10114 + }, + { + "epoch": 0.36223969058320055, + "grad_norm": 1.5928325653076172, + "learning_rate": 0.0001474816833911839, + "loss": 1.1765, + "step": 10115 + }, + { + "epoch": 0.3622755027127688, + "grad_norm": 2.2348084449768066, + "learning_rate": 0.00014747147495867627, + "loss": 1.111, + "step": 10116 + }, + { + "epoch": 0.3623113148423371, + "grad_norm": 1.8760329484939575, + "learning_rate": 0.00014746126588750116, + "loss": 1.2269, + "step": 10117 + }, + { + "epoch": 0.3623471269719054, + "grad_norm": 2.070812940597534, + "learning_rate": 0.00014745105617779594, + "loss": 1.1456, + "step": 10118 + }, + { + "epoch": 0.3623829391014737, + "grad_norm": 1.6603468656539917, + "learning_rate": 0.00014744084582969793, + "loss": 1.3894, + "step": 10119 + }, + { + "epoch": 0.36241875123104195, + "grad_norm": 1.5430848598480225, + "learning_rate": 0.00014743063484334455, + "loss": 1.1859, + "step": 10120 + }, + { + "epoch": 0.3624545633606102, + "grad_norm": 1.7808287143707275, + "learning_rate": 0.00014742042321887322, + "loss": 1.2853, + "step": 10121 + }, + { + "epoch": 0.36249037549017854, + "grad_norm": 1.5389230251312256, + "learning_rate": 0.00014741021095642117, + "loss": 1.3329, + "step": 10122 + }, + { + "epoch": 0.3625261876197468, + "grad_norm": 1.335506558418274, + "learning_rate": 0.00014739999805612596, + "loss": 0.8972, + "step": 10123 + }, + { + "epoch": 0.3625619997493151, + "grad_norm": 1.7109079360961914, + "learning_rate": 0.00014738978451812488, + "loss": 1.0698, + "step": 10124 + }, + { + "epoch": 0.3625978118788834, + "grad_norm": 1.285657525062561, + "learning_rate": 0.00014737957034255538, + "loss": 1.2823, + "step": 10125 + }, + { + "epoch": 0.36263362400845167, + "grad_norm": 1.3205238580703735, + "learning_rate": 0.00014736935552955488, + "loss": 1.1348, + "step": 10126 + }, + { + "epoch": 0.36266943613801994, + "grad_norm": 1.4576444625854492, + "learning_rate": 0.00014735914007926084, + "loss": 1.15, + "step": 10127 + }, + { + "epoch": 0.3627052482675882, + "grad_norm": 1.7874659299850464, + "learning_rate": 0.0001473489239918106, + "loss": 1.0746, + "step": 10128 + }, + { + "epoch": 0.36274106039715653, + "grad_norm": 1.2705789804458618, + "learning_rate": 0.0001473387072673417, + "loss": 1.0891, + "step": 10129 + }, + { + "epoch": 0.3627768725267248, + "grad_norm": 1.4749045372009277, + "learning_rate": 0.00014732848990599154, + "loss": 1.1569, + "step": 10130 + }, + { + "epoch": 0.36281268465629307, + "grad_norm": 1.357578992843628, + "learning_rate": 0.0001473182719078976, + "loss": 1.1644, + "step": 10131 + }, + { + "epoch": 0.3628484967858614, + "grad_norm": 1.3589375019073486, + "learning_rate": 0.00014730805327319737, + "loss": 1.1878, + "step": 10132 + }, + { + "epoch": 0.36288430891542967, + "grad_norm": 1.4176702499389648, + "learning_rate": 0.00014729783400202828, + "loss": 1.2, + "step": 10133 + }, + { + "epoch": 0.36292012104499793, + "grad_norm": 2.1109859943389893, + "learning_rate": 0.00014728761409452785, + "loss": 1.167, + "step": 10134 + }, + { + "epoch": 0.3629559331745662, + "grad_norm": 2.133897542953491, + "learning_rate": 0.00014727739355083357, + "loss": 1.2149, + "step": 10135 + }, + { + "epoch": 0.3629917453041345, + "grad_norm": 1.6820111274719238, + "learning_rate": 0.00014726717237108293, + "loss": 1.2042, + "step": 10136 + }, + { + "epoch": 0.3630275574337028, + "grad_norm": 1.4089354276657104, + "learning_rate": 0.00014725695055541348, + "loss": 1.0727, + "step": 10137 + }, + { + "epoch": 0.36306336956327107, + "grad_norm": 1.3702677488327026, + "learning_rate": 0.00014724672810396272, + "loss": 1.0875, + "step": 10138 + }, + { + "epoch": 0.3630991816928394, + "grad_norm": 1.2813756465911865, + "learning_rate": 0.00014723650501686817, + "loss": 1.1038, + "step": 10139 + }, + { + "epoch": 0.36313499382240766, + "grad_norm": 1.4552891254425049, + "learning_rate": 0.00014722628129426734, + "loss": 1.0691, + "step": 10140 + }, + { + "epoch": 0.36317080595197593, + "grad_norm": 1.453718662261963, + "learning_rate": 0.0001472160569362979, + "loss": 1.221, + "step": 10141 + }, + { + "epoch": 0.3632066180815442, + "grad_norm": 1.9610761404037476, + "learning_rate": 0.0001472058319430972, + "loss": 1.1632, + "step": 10142 + }, + { + "epoch": 0.3632424302111125, + "grad_norm": 1.6482576131820679, + "learning_rate": 0.000147195606314803, + "loss": 1.1774, + "step": 10143 + }, + { + "epoch": 0.3632782423406808, + "grad_norm": 2.2952706813812256, + "learning_rate": 0.0001471853800515528, + "loss": 1.2232, + "step": 10144 + }, + { + "epoch": 0.36331405447024906, + "grad_norm": 1.4293609857559204, + "learning_rate": 0.00014717515315348413, + "loss": 1.3008, + "step": 10145 + }, + { + "epoch": 0.3633498665998174, + "grad_norm": 1.8843226432800293, + "learning_rate": 0.00014716492562073466, + "loss": 1.198, + "step": 10146 + }, + { + "epoch": 0.36338567872938565, + "grad_norm": 1.38600754737854, + "learning_rate": 0.00014715469745344196, + "loss": 1.0276, + "step": 10147 + }, + { + "epoch": 0.3634214908589539, + "grad_norm": 2.1023576259613037, + "learning_rate": 0.00014714446865174362, + "loss": 1.2244, + "step": 10148 + }, + { + "epoch": 0.3634573029885222, + "grad_norm": 1.7808927297592163, + "learning_rate": 0.00014713423921577725, + "loss": 1.0577, + "step": 10149 + }, + { + "epoch": 0.3634931151180905, + "grad_norm": 1.8251972198486328, + "learning_rate": 0.0001471240091456805, + "loss": 1.1962, + "step": 10150 + }, + { + "epoch": 0.3635289272476588, + "grad_norm": 1.5825400352478027, + "learning_rate": 0.00014711377844159099, + "loss": 1.1133, + "step": 10151 + }, + { + "epoch": 0.36356473937722705, + "grad_norm": 2.2078351974487305, + "learning_rate": 0.00014710354710364637, + "loss": 0.9839, + "step": 10152 + }, + { + "epoch": 0.3636005515067954, + "grad_norm": 1.694775938987732, + "learning_rate": 0.00014709331513198425, + "loss": 1.164, + "step": 10153 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.6966567039489746, + "learning_rate": 0.00014708308252674236, + "loss": 1.0828, + "step": 10154 + }, + { + "epoch": 0.3636721757659319, + "grad_norm": 1.4390692710876465, + "learning_rate": 0.0001470728492880583, + "loss": 1.2185, + "step": 10155 + }, + { + "epoch": 0.3637079878955002, + "grad_norm": 1.5495678186416626, + "learning_rate": 0.00014706261541606983, + "loss": 1.2105, + "step": 10156 + }, + { + "epoch": 0.3637438000250685, + "grad_norm": 1.4501078128814697, + "learning_rate": 0.00014705238091091455, + "loss": 1.2022, + "step": 10157 + }, + { + "epoch": 0.3637796121546368, + "grad_norm": 1.5145426988601685, + "learning_rate": 0.00014704214577273016, + "loss": 1.1372, + "step": 10158 + }, + { + "epoch": 0.36381542428420505, + "grad_norm": 1.350497841835022, + "learning_rate": 0.00014703191000165438, + "loss": 1.2231, + "step": 10159 + }, + { + "epoch": 0.36385123641377337, + "grad_norm": 1.574840784072876, + "learning_rate": 0.00014702167359782493, + "loss": 1.1791, + "step": 10160 + }, + { + "epoch": 0.36388704854334164, + "grad_norm": 1.6828198432922363, + "learning_rate": 0.0001470114365613795, + "loss": 0.9989, + "step": 10161 + }, + { + "epoch": 0.3639228606729099, + "grad_norm": 1.5437126159667969, + "learning_rate": 0.00014700119889245582, + "loss": 1.1279, + "step": 10162 + }, + { + "epoch": 0.3639586728024782, + "grad_norm": 1.2495217323303223, + "learning_rate": 0.00014699096059119166, + "loss": 1.0701, + "step": 10163 + }, + { + "epoch": 0.3639944849320465, + "grad_norm": 1.6375486850738525, + "learning_rate": 0.0001469807216577247, + "loss": 1.2365, + "step": 10164 + }, + { + "epoch": 0.36403029706161477, + "grad_norm": 2.1795706748962402, + "learning_rate": 0.0001469704820921928, + "loss": 1.0864, + "step": 10165 + }, + { + "epoch": 0.36406610919118304, + "grad_norm": 1.737825632095337, + "learning_rate": 0.00014696024189473362, + "loss": 1.1132, + "step": 10166 + }, + { + "epoch": 0.36410192132075136, + "grad_norm": 1.5882232189178467, + "learning_rate": 0.00014695000106548496, + "loss": 1.1457, + "step": 10167 + }, + { + "epoch": 0.36413773345031963, + "grad_norm": 1.6965373754501343, + "learning_rate": 0.0001469397596045846, + "loss": 1.2281, + "step": 10168 + }, + { + "epoch": 0.3641735455798879, + "grad_norm": 1.7842035293579102, + "learning_rate": 0.0001469295175121703, + "loss": 1.3402, + "step": 10169 + }, + { + "epoch": 0.36420935770945617, + "grad_norm": 1.581007957458496, + "learning_rate": 0.00014691927478837987, + "loss": 1.2363, + "step": 10170 + }, + { + "epoch": 0.3642451698390245, + "grad_norm": 1.5174388885498047, + "learning_rate": 0.00014690903143335117, + "loss": 1.0429, + "step": 10171 + }, + { + "epoch": 0.36428098196859277, + "grad_norm": 1.4778114557266235, + "learning_rate": 0.00014689878744722192, + "loss": 1.1018, + "step": 10172 + }, + { + "epoch": 0.36431679409816103, + "grad_norm": 1.3787548542022705, + "learning_rate": 0.00014688854283013001, + "loss": 0.9858, + "step": 10173 + }, + { + "epoch": 0.36435260622772936, + "grad_norm": 1.451682686805725, + "learning_rate": 0.0001468782975822132, + "loss": 1.0391, + "step": 10174 + }, + { + "epoch": 0.3643884183572976, + "grad_norm": 1.7494145631790161, + "learning_rate": 0.0001468680517036094, + "loss": 1.22, + "step": 10175 + }, + { + "epoch": 0.3644242304868659, + "grad_norm": 1.492238998413086, + "learning_rate": 0.0001468578051944564, + "loss": 1.1045, + "step": 10176 + }, + { + "epoch": 0.36446004261643417, + "grad_norm": 1.9698872566223145, + "learning_rate": 0.00014684755805489206, + "loss": 1.3262, + "step": 10177 + }, + { + "epoch": 0.3644958547460025, + "grad_norm": 1.4343699216842651, + "learning_rate": 0.0001468373102850543, + "loss": 1.1111, + "step": 10178 + }, + { + "epoch": 0.36453166687557076, + "grad_norm": 1.4145736694335938, + "learning_rate": 0.0001468270618850809, + "loss": 1.3112, + "step": 10179 + }, + { + "epoch": 0.36456747900513903, + "grad_norm": 1.780834436416626, + "learning_rate": 0.0001468168128551098, + "loss": 1.193, + "step": 10180 + }, + { + "epoch": 0.36460329113470735, + "grad_norm": 1.7795794010162354, + "learning_rate": 0.00014680656319527886, + "loss": 1.0956, + "step": 10181 + }, + { + "epoch": 0.3646391032642756, + "grad_norm": 1.4075981378555298, + "learning_rate": 0.00014679631290572602, + "loss": 1.117, + "step": 10182 + }, + { + "epoch": 0.3646749153938439, + "grad_norm": 1.6016145944595337, + "learning_rate": 0.00014678606198658916, + "loss": 1.0508, + "step": 10183 + }, + { + "epoch": 0.36471072752341216, + "grad_norm": 1.4088815450668335, + "learning_rate": 0.00014677581043800615, + "loss": 1.0656, + "step": 10184 + }, + { + "epoch": 0.3647465396529805, + "grad_norm": 1.5005613565444946, + "learning_rate": 0.00014676555826011496, + "loss": 1.0739, + "step": 10185 + }, + { + "epoch": 0.36478235178254875, + "grad_norm": 1.6275074481964111, + "learning_rate": 0.0001467553054530535, + "loss": 1.3987, + "step": 10186 + }, + { + "epoch": 0.364818163912117, + "grad_norm": 1.7962366342544556, + "learning_rate": 0.00014674505201695971, + "loss": 1.1239, + "step": 10187 + }, + { + "epoch": 0.36485397604168535, + "grad_norm": 1.6270785331726074, + "learning_rate": 0.00014673479795197154, + "loss": 0.8851, + "step": 10188 + }, + { + "epoch": 0.3648897881712536, + "grad_norm": 1.4471001625061035, + "learning_rate": 0.00014672454325822696, + "loss": 1.1402, + "step": 10189 + }, + { + "epoch": 0.3649256003008219, + "grad_norm": 1.5301926136016846, + "learning_rate": 0.00014671428793586392, + "loss": 1.0711, + "step": 10190 + }, + { + "epoch": 0.36496141243039015, + "grad_norm": 1.445727825164795, + "learning_rate": 0.0001467040319850204, + "loss": 1.1096, + "step": 10191 + }, + { + "epoch": 0.3649972245599585, + "grad_norm": 1.83064866065979, + "learning_rate": 0.0001466937754058344, + "loss": 1.207, + "step": 10192 + }, + { + "epoch": 0.36503303668952675, + "grad_norm": 1.3298965692520142, + "learning_rate": 0.00014668351819844384, + "loss": 1.0104, + "step": 10193 + }, + { + "epoch": 0.365068848819095, + "grad_norm": 1.7107380628585815, + "learning_rate": 0.00014667326036298675, + "loss": 1.2505, + "step": 10194 + }, + { + "epoch": 0.36510466094866334, + "grad_norm": 1.2967441082000732, + "learning_rate": 0.00014666300189960116, + "loss": 0.8507, + "step": 10195 + }, + { + "epoch": 0.3651404730782316, + "grad_norm": 1.6302907466888428, + "learning_rate": 0.00014665274280842508, + "loss": 1.1517, + "step": 10196 + }, + { + "epoch": 0.3651762852077999, + "grad_norm": 1.4841407537460327, + "learning_rate": 0.0001466424830895965, + "loss": 1.1063, + "step": 10197 + }, + { + "epoch": 0.36521209733736815, + "grad_norm": 1.622147798538208, + "learning_rate": 0.00014663222274325353, + "loss": 0.9979, + "step": 10198 + }, + { + "epoch": 0.36524790946693647, + "grad_norm": 1.640760898590088, + "learning_rate": 0.0001466219617695341, + "loss": 1.0037, + "step": 10199 + }, + { + "epoch": 0.36528372159650474, + "grad_norm": 2.1065919399261475, + "learning_rate": 0.00014661170016857633, + "loss": 1.2436, + "step": 10200 + }, + { + "epoch": 0.365319533726073, + "grad_norm": 1.3776403665542603, + "learning_rate": 0.00014660143794051827, + "loss": 1.2419, + "step": 10201 + }, + { + "epoch": 0.3653553458556413, + "grad_norm": 2.2468221187591553, + "learning_rate": 0.00014659117508549797, + "loss": 1.1775, + "step": 10202 + }, + { + "epoch": 0.3653911579852096, + "grad_norm": 1.5415728092193604, + "learning_rate": 0.0001465809116036535, + "loss": 1.2249, + "step": 10203 + }, + { + "epoch": 0.36542697011477787, + "grad_norm": 1.5397197008132935, + "learning_rate": 0.00014657064749512295, + "loss": 1.3681, + "step": 10204 + }, + { + "epoch": 0.36546278224434614, + "grad_norm": 1.5627810955047607, + "learning_rate": 0.0001465603827600444, + "loss": 1.1749, + "step": 10205 + }, + { + "epoch": 0.36549859437391446, + "grad_norm": 1.6053462028503418, + "learning_rate": 0.00014655011739855595, + "loss": 1.1128, + "step": 10206 + }, + { + "epoch": 0.36553440650348273, + "grad_norm": 1.3075929880142212, + "learning_rate": 0.00014653985141079576, + "loss": 1.0227, + "step": 10207 + }, + { + "epoch": 0.365570218633051, + "grad_norm": 1.7376984357833862, + "learning_rate": 0.00014652958479690185, + "loss": 0.9858, + "step": 10208 + }, + { + "epoch": 0.36560603076261927, + "grad_norm": 1.7681280374526978, + "learning_rate": 0.00014651931755701246, + "loss": 1.1721, + "step": 10209 + }, + { + "epoch": 0.3656418428921876, + "grad_norm": 1.4321454763412476, + "learning_rate": 0.0001465090496912656, + "loss": 1.0024, + "step": 10210 + }, + { + "epoch": 0.36567765502175587, + "grad_norm": 1.7506979703903198, + "learning_rate": 0.0001464987811997995, + "loss": 1.1875, + "step": 10211 + }, + { + "epoch": 0.36571346715132413, + "grad_norm": 1.5633145570755005, + "learning_rate": 0.00014648851208275224, + "loss": 1.3253, + "step": 10212 + }, + { + "epoch": 0.36574927928089246, + "grad_norm": 1.3936399221420288, + "learning_rate": 0.00014647824234026205, + "loss": 0.9981, + "step": 10213 + }, + { + "epoch": 0.3657850914104607, + "grad_norm": 1.7278743982315063, + "learning_rate": 0.00014646797197246706, + "loss": 0.9574, + "step": 10214 + }, + { + "epoch": 0.365820903540029, + "grad_norm": 1.336218237876892, + "learning_rate": 0.00014645770097950544, + "loss": 1.0979, + "step": 10215 + }, + { + "epoch": 0.36585671566959727, + "grad_norm": 1.2757453918457031, + "learning_rate": 0.00014644742936151538, + "loss": 1.1104, + "step": 10216 + }, + { + "epoch": 0.3658925277991656, + "grad_norm": 1.504571795463562, + "learning_rate": 0.00014643715711863507, + "loss": 1.1839, + "step": 10217 + }, + { + "epoch": 0.36592833992873386, + "grad_norm": 1.7478975057601929, + "learning_rate": 0.00014642688425100273, + "loss": 1.3897, + "step": 10218 + }, + { + "epoch": 0.36596415205830213, + "grad_norm": 2.0384750366210938, + "learning_rate": 0.00014641661075875652, + "loss": 1.0786, + "step": 10219 + }, + { + "epoch": 0.36599996418787045, + "grad_norm": 1.5288299322128296, + "learning_rate": 0.0001464063366420347, + "loss": 1.1289, + "step": 10220 + }, + { + "epoch": 0.3660357763174387, + "grad_norm": 1.4345828294754028, + "learning_rate": 0.00014639606190097547, + "loss": 1.1557, + "step": 10221 + }, + { + "epoch": 0.366071588447007, + "grad_norm": 1.5757791996002197, + "learning_rate": 0.00014638578653571708, + "loss": 1.2064, + "step": 10222 + }, + { + "epoch": 0.36610740057657526, + "grad_norm": 1.618928074836731, + "learning_rate": 0.00014637551054639774, + "loss": 1.1773, + "step": 10223 + }, + { + "epoch": 0.3661432127061436, + "grad_norm": 1.6207671165466309, + "learning_rate": 0.00014636523393315578, + "loss": 0.979, + "step": 10224 + }, + { + "epoch": 0.36617902483571185, + "grad_norm": 1.3548725843429565, + "learning_rate": 0.00014635495669612934, + "loss": 1.1992, + "step": 10225 + }, + { + "epoch": 0.3662148369652801, + "grad_norm": 1.6060603857040405, + "learning_rate": 0.0001463446788354568, + "loss": 1.2718, + "step": 10226 + }, + { + "epoch": 0.36625064909484845, + "grad_norm": 1.7679516077041626, + "learning_rate": 0.00014633440035127638, + "loss": 1.2296, + "step": 10227 + }, + { + "epoch": 0.3662864612244167, + "grad_norm": 1.600614309310913, + "learning_rate": 0.00014632412124372635, + "loss": 1.0607, + "step": 10228 + }, + { + "epoch": 0.366322273353985, + "grad_norm": 1.6240226030349731, + "learning_rate": 0.00014631384151294507, + "loss": 1.3932, + "step": 10229 + }, + { + "epoch": 0.36635808548355325, + "grad_norm": 1.4476392269134521, + "learning_rate": 0.00014630356115907073, + "loss": 1.1889, + "step": 10230 + }, + { + "epoch": 0.3663938976131216, + "grad_norm": 1.3434808254241943, + "learning_rate": 0.00014629328018224175, + "loss": 1.0819, + "step": 10231 + }, + { + "epoch": 0.36642970974268985, + "grad_norm": 1.5093432664871216, + "learning_rate": 0.00014628299858259638, + "loss": 1.1012, + "step": 10232 + }, + { + "epoch": 0.3664655218722581, + "grad_norm": 1.2977194786071777, + "learning_rate": 0.00014627271636027297, + "loss": 1.1825, + "step": 10233 + }, + { + "epoch": 0.36650133400182644, + "grad_norm": 1.3820290565490723, + "learning_rate": 0.00014626243351540983, + "loss": 0.9091, + "step": 10234 + }, + { + "epoch": 0.3665371461313947, + "grad_norm": 1.9988056421279907, + "learning_rate": 0.00014625215004814533, + "loss": 1.2736, + "step": 10235 + }, + { + "epoch": 0.366572958260963, + "grad_norm": 1.4736089706420898, + "learning_rate": 0.00014624186595861785, + "loss": 1.093, + "step": 10236 + }, + { + "epoch": 0.36660877039053125, + "grad_norm": 1.6831952333450317, + "learning_rate": 0.00014623158124696565, + "loss": 1.3994, + "step": 10237 + }, + { + "epoch": 0.36664458252009957, + "grad_norm": 1.3553693294525146, + "learning_rate": 0.00014622129591332722, + "loss": 1.202, + "step": 10238 + }, + { + "epoch": 0.36668039464966784, + "grad_norm": 1.7295199632644653, + "learning_rate": 0.0001462110099578408, + "loss": 1.2944, + "step": 10239 + }, + { + "epoch": 0.3667162067792361, + "grad_norm": 1.3457646369934082, + "learning_rate": 0.0001462007233806449, + "loss": 1.0636, + "step": 10240 + }, + { + "epoch": 0.36675201890880443, + "grad_norm": 1.6281592845916748, + "learning_rate": 0.00014619043618187784, + "loss": 0.8739, + "step": 10241 + }, + { + "epoch": 0.3667878310383727, + "grad_norm": 1.7433851957321167, + "learning_rate": 0.00014618014836167807, + "loss": 1.3777, + "step": 10242 + }, + { + "epoch": 0.36682364316794097, + "grad_norm": 1.4907419681549072, + "learning_rate": 0.00014616985992018394, + "loss": 1.1835, + "step": 10243 + }, + { + "epoch": 0.36685945529750924, + "grad_norm": 1.4456708431243896, + "learning_rate": 0.00014615957085753394, + "loss": 1.1753, + "step": 10244 + }, + { + "epoch": 0.36689526742707756, + "grad_norm": 1.780623197555542, + "learning_rate": 0.00014614928117386643, + "loss": 1.1039, + "step": 10245 + }, + { + "epoch": 0.36693107955664583, + "grad_norm": 2.080355167388916, + "learning_rate": 0.0001461389908693199, + "loss": 1.1429, + "step": 10246 + }, + { + "epoch": 0.3669668916862141, + "grad_norm": 1.4278651475906372, + "learning_rate": 0.0001461286999440327, + "loss": 0.9192, + "step": 10247 + }, + { + "epoch": 0.3670027038157824, + "grad_norm": 1.4938682317733765, + "learning_rate": 0.00014611840839814336, + "loss": 1.0361, + "step": 10248 + }, + { + "epoch": 0.3670385159453507, + "grad_norm": 1.6940748691558838, + "learning_rate": 0.00014610811623179038, + "loss": 1.1132, + "step": 10249 + }, + { + "epoch": 0.36707432807491897, + "grad_norm": 1.598029375076294, + "learning_rate": 0.00014609782344511213, + "loss": 1.0888, + "step": 10250 + }, + { + "epoch": 0.36711014020448723, + "grad_norm": 1.2694755792617798, + "learning_rate": 0.0001460875300382471, + "loss": 1.0086, + "step": 10251 + }, + { + "epoch": 0.36714595233405556, + "grad_norm": 1.5728025436401367, + "learning_rate": 0.00014607723601133384, + "loss": 1.1034, + "step": 10252 + }, + { + "epoch": 0.3671817644636238, + "grad_norm": 1.5915447473526, + "learning_rate": 0.00014606694136451082, + "loss": 1.2168, + "step": 10253 + }, + { + "epoch": 0.3672175765931921, + "grad_norm": 1.3104326725006104, + "learning_rate": 0.0001460566460979165, + "loss": 1.1058, + "step": 10254 + }, + { + "epoch": 0.3672533887227604, + "grad_norm": 1.6392563581466675, + "learning_rate": 0.00014604635021168942, + "loss": 1.1903, + "step": 10255 + }, + { + "epoch": 0.3672892008523287, + "grad_norm": 1.6805120706558228, + "learning_rate": 0.00014603605370596808, + "loss": 0.8747, + "step": 10256 + }, + { + "epoch": 0.36732501298189696, + "grad_norm": 1.2805397510528564, + "learning_rate": 0.00014602575658089105, + "loss": 1.0775, + "step": 10257 + }, + { + "epoch": 0.36736082511146523, + "grad_norm": 1.6759974956512451, + "learning_rate": 0.0001460154588365968, + "loss": 1.0848, + "step": 10258 + }, + { + "epoch": 0.36739663724103355, + "grad_norm": 1.5081287622451782, + "learning_rate": 0.00014600516047322392, + "loss": 1.2343, + "step": 10259 + }, + { + "epoch": 0.3674324493706018, + "grad_norm": 2.1872777938842773, + "learning_rate": 0.00014599486149091096, + "loss": 1.1775, + "step": 10260 + }, + { + "epoch": 0.3674682615001701, + "grad_norm": 1.5729113817214966, + "learning_rate": 0.00014598456188979643, + "loss": 1.3319, + "step": 10261 + }, + { + "epoch": 0.3675040736297384, + "grad_norm": 1.4194223880767822, + "learning_rate": 0.000145974261670019, + "loss": 1.0224, + "step": 10262 + }, + { + "epoch": 0.3675398857593067, + "grad_norm": 1.3824254274368286, + "learning_rate": 0.00014596396083171715, + "loss": 1.306, + "step": 10263 + }, + { + "epoch": 0.36757569788887495, + "grad_norm": 1.3569759130477905, + "learning_rate": 0.0001459536593750295, + "loss": 1.0007, + "step": 10264 + }, + { + "epoch": 0.3676115100184432, + "grad_norm": 1.7883782386779785, + "learning_rate": 0.00014594335730009462, + "loss": 1.2423, + "step": 10265 + }, + { + "epoch": 0.36764732214801155, + "grad_norm": 1.3982688188552856, + "learning_rate": 0.00014593305460705114, + "loss": 1.1574, + "step": 10266 + }, + { + "epoch": 0.3676831342775798, + "grad_norm": 1.3593698740005493, + "learning_rate": 0.00014592275129603766, + "loss": 1.1414, + "step": 10267 + }, + { + "epoch": 0.3677189464071481, + "grad_norm": 1.7397785186767578, + "learning_rate": 0.00014591244736719282, + "loss": 1.2811, + "step": 10268 + }, + { + "epoch": 0.3677547585367164, + "grad_norm": 1.4247277975082397, + "learning_rate": 0.00014590214282065518, + "loss": 1.1592, + "step": 10269 + }, + { + "epoch": 0.3677905706662847, + "grad_norm": 1.3650145530700684, + "learning_rate": 0.00014589183765656343, + "loss": 1.0565, + "step": 10270 + }, + { + "epoch": 0.36782638279585295, + "grad_norm": 1.6268584728240967, + "learning_rate": 0.00014588153187505625, + "loss": 1.1307, + "step": 10271 + }, + { + "epoch": 0.3678621949254212, + "grad_norm": 1.387117624282837, + "learning_rate": 0.00014587122547627217, + "loss": 1.1413, + "step": 10272 + }, + { + "epoch": 0.36789800705498954, + "grad_norm": 1.2180728912353516, + "learning_rate": 0.00014586091846034997, + "loss": 0.9983, + "step": 10273 + }, + { + "epoch": 0.3679338191845578, + "grad_norm": 1.3391273021697998, + "learning_rate": 0.00014585061082742824, + "loss": 1.1997, + "step": 10274 + }, + { + "epoch": 0.3679696313141261, + "grad_norm": 1.400262475013733, + "learning_rate": 0.0001458403025776457, + "loss": 1.261, + "step": 10275 + }, + { + "epoch": 0.3680054434436944, + "grad_norm": 1.7002747058868408, + "learning_rate": 0.000145829993711141, + "loss": 1.1634, + "step": 10276 + }, + { + "epoch": 0.36804125557326267, + "grad_norm": 2.0593526363372803, + "learning_rate": 0.00014581968422805287, + "loss": 1.1837, + "step": 10277 + }, + { + "epoch": 0.36807706770283094, + "grad_norm": 1.5934066772460938, + "learning_rate": 0.00014580937412852, + "loss": 1.1826, + "step": 10278 + }, + { + "epoch": 0.3681128798323992, + "grad_norm": 1.4913181066513062, + "learning_rate": 0.0001457990634126811, + "loss": 1.1926, + "step": 10279 + }, + { + "epoch": 0.36814869196196753, + "grad_norm": 1.3964769840240479, + "learning_rate": 0.00014578875208067483, + "loss": 0.9769, + "step": 10280 + }, + { + "epoch": 0.3681845040915358, + "grad_norm": 1.3705568313598633, + "learning_rate": 0.00014577844013264, + "loss": 1.1519, + "step": 10281 + }, + { + "epoch": 0.36822031622110407, + "grad_norm": 1.4618734121322632, + "learning_rate": 0.0001457681275687153, + "loss": 1.3157, + "step": 10282 + }, + { + "epoch": 0.3682561283506724, + "grad_norm": 1.499370813369751, + "learning_rate": 0.00014575781438903946, + "loss": 1.1331, + "step": 10283 + }, + { + "epoch": 0.36829194048024066, + "grad_norm": 1.2989789247512817, + "learning_rate": 0.0001457475005937513, + "loss": 1.1153, + "step": 10284 + }, + { + "epoch": 0.36832775260980893, + "grad_norm": 2.6623141765594482, + "learning_rate": 0.0001457371861829895, + "loss": 1.2332, + "step": 10285 + }, + { + "epoch": 0.3683635647393772, + "grad_norm": 1.3339117765426636, + "learning_rate": 0.00014572687115689282, + "loss": 1.0169, + "step": 10286 + }, + { + "epoch": 0.3683993768689455, + "grad_norm": 1.4847960472106934, + "learning_rate": 0.0001457165555156001, + "loss": 1.0262, + "step": 10287 + }, + { + "epoch": 0.3684351889985138, + "grad_norm": 1.8445957899093628, + "learning_rate": 0.00014570623925925014, + "loss": 0.9652, + "step": 10288 + }, + { + "epoch": 0.36847100112808207, + "grad_norm": 1.5666780471801758, + "learning_rate": 0.00014569592238798163, + "loss": 1.0542, + "step": 10289 + }, + { + "epoch": 0.3685068132576504, + "grad_norm": 1.845860242843628, + "learning_rate": 0.00014568560490193345, + "loss": 1.1727, + "step": 10290 + }, + { + "epoch": 0.36854262538721866, + "grad_norm": 1.4671051502227783, + "learning_rate": 0.0001456752868012444, + "loss": 1.1634, + "step": 10291 + }, + { + "epoch": 0.3685784375167869, + "grad_norm": 2.4438841342926025, + "learning_rate": 0.00014566496808605326, + "loss": 1.108, + "step": 10292 + }, + { + "epoch": 0.3686142496463552, + "grad_norm": 1.5734028816223145, + "learning_rate": 0.00014565464875649888, + "loss": 1.1758, + "step": 10293 + }, + { + "epoch": 0.3686500617759235, + "grad_norm": 1.3563404083251953, + "learning_rate": 0.0001456443288127201, + "loss": 1.1141, + "step": 10294 + }, + { + "epoch": 0.3686858739054918, + "grad_norm": 1.5672085285186768, + "learning_rate": 0.00014563400825485576, + "loss": 1.2728, + "step": 10295 + }, + { + "epoch": 0.36872168603506006, + "grad_norm": 1.4384769201278687, + "learning_rate": 0.00014562368708304467, + "loss": 1.0567, + "step": 10296 + }, + { + "epoch": 0.3687574981646284, + "grad_norm": 1.4766343832015991, + "learning_rate": 0.00014561336529742575, + "loss": 1.0822, + "step": 10297 + }, + { + "epoch": 0.36879331029419665, + "grad_norm": 1.6578742265701294, + "learning_rate": 0.00014560304289813785, + "loss": 1.0631, + "step": 10298 + }, + { + "epoch": 0.3688291224237649, + "grad_norm": 1.6444659233093262, + "learning_rate": 0.0001455927198853198, + "loss": 1.2238, + "step": 10299 + }, + { + "epoch": 0.3688649345533332, + "grad_norm": 2.0652666091918945, + "learning_rate": 0.00014558239625911052, + "loss": 1.4601, + "step": 10300 + }, + { + "epoch": 0.3689007466829015, + "grad_norm": 1.4216798543930054, + "learning_rate": 0.00014557207201964893, + "loss": 1.1465, + "step": 10301 + }, + { + "epoch": 0.3689365588124698, + "grad_norm": 2.2214772701263428, + "learning_rate": 0.00014556174716707384, + "loss": 1.1783, + "step": 10302 + }, + { + "epoch": 0.36897237094203805, + "grad_norm": 1.49165940284729, + "learning_rate": 0.00014555142170152423, + "loss": 1.2693, + "step": 10303 + }, + { + "epoch": 0.3690081830716064, + "grad_norm": 1.4121403694152832, + "learning_rate": 0.00014554109562313903, + "loss": 1.068, + "step": 10304 + }, + { + "epoch": 0.36904399520117465, + "grad_norm": 1.9425990581512451, + "learning_rate": 0.00014553076893205708, + "loss": 1.0799, + "step": 10305 + }, + { + "epoch": 0.3690798073307429, + "grad_norm": 1.6165108680725098, + "learning_rate": 0.00014552044162841743, + "loss": 0.9174, + "step": 10306 + }, + { + "epoch": 0.3691156194603112, + "grad_norm": 1.603857159614563, + "learning_rate": 0.0001455101137123589, + "loss": 1.2893, + "step": 10307 + }, + { + "epoch": 0.3691514315898795, + "grad_norm": 1.2577816247940063, + "learning_rate": 0.00014549978518402053, + "loss": 1.163, + "step": 10308 + }, + { + "epoch": 0.3691872437194478, + "grad_norm": 1.7883297204971313, + "learning_rate": 0.0001454894560435412, + "loss": 1.0026, + "step": 10309 + }, + { + "epoch": 0.36922305584901605, + "grad_norm": 1.2056766748428345, + "learning_rate": 0.00014547912629105995, + "loss": 1.0067, + "step": 10310 + }, + { + "epoch": 0.36925886797858437, + "grad_norm": 1.3757487535476685, + "learning_rate": 0.00014546879592671573, + "loss": 1.1242, + "step": 10311 + }, + { + "epoch": 0.36929468010815264, + "grad_norm": 1.487725853919983, + "learning_rate": 0.00014545846495064748, + "loss": 1.2482, + "step": 10312 + }, + { + "epoch": 0.3693304922377209, + "grad_norm": 1.6113399267196655, + "learning_rate": 0.0001454481333629942, + "loss": 1.2408, + "step": 10313 + }, + { + "epoch": 0.3693663043672892, + "grad_norm": 1.6796178817749023, + "learning_rate": 0.00014543780116389496, + "loss": 1.1583, + "step": 10314 + }, + { + "epoch": 0.3694021164968575, + "grad_norm": 1.78257155418396, + "learning_rate": 0.0001454274683534887, + "loss": 0.9062, + "step": 10315 + }, + { + "epoch": 0.36943792862642577, + "grad_norm": 1.8971065282821655, + "learning_rate": 0.00014541713493191444, + "loss": 1.1241, + "step": 10316 + }, + { + "epoch": 0.36947374075599404, + "grad_norm": 1.7811377048492432, + "learning_rate": 0.00014540680089931125, + "loss": 1.2005, + "step": 10317 + }, + { + "epoch": 0.36950955288556236, + "grad_norm": 1.3922691345214844, + "learning_rate": 0.00014539646625581805, + "loss": 0.9198, + "step": 10318 + }, + { + "epoch": 0.36954536501513063, + "grad_norm": 1.8672831058502197, + "learning_rate": 0.00014538613100157404, + "loss": 1.0836, + "step": 10319 + }, + { + "epoch": 0.3695811771446989, + "grad_norm": 1.7405376434326172, + "learning_rate": 0.0001453757951367181, + "loss": 1.1997, + "step": 10320 + }, + { + "epoch": 0.36961698927426717, + "grad_norm": 1.317000150680542, + "learning_rate": 0.00014536545866138941, + "loss": 1.0715, + "step": 10321 + }, + { + "epoch": 0.3696528014038355, + "grad_norm": 1.5889583826065063, + "learning_rate": 0.000145355121575727, + "loss": 1.0643, + "step": 10322 + }, + { + "epoch": 0.36968861353340376, + "grad_norm": 1.3292734622955322, + "learning_rate": 0.00014534478387986992, + "loss": 1.0668, + "step": 10323 + }, + { + "epoch": 0.36972442566297203, + "grad_norm": 1.620405912399292, + "learning_rate": 0.0001453344455739573, + "loss": 1.0355, + "step": 10324 + }, + { + "epoch": 0.36976023779254036, + "grad_norm": 1.5308345556259155, + "learning_rate": 0.0001453241066581281, + "loss": 1.0408, + "step": 10325 + }, + { + "epoch": 0.3697960499221086, + "grad_norm": 1.7498178482055664, + "learning_rate": 0.0001453137671325216, + "loss": 1.0806, + "step": 10326 + }, + { + "epoch": 0.3698318620516769, + "grad_norm": 1.4744629859924316, + "learning_rate": 0.00014530342699727676, + "loss": 0.9818, + "step": 10327 + }, + { + "epoch": 0.36986767418124517, + "grad_norm": 1.6324210166931152, + "learning_rate": 0.0001452930862525328, + "loss": 1.2205, + "step": 10328 + }, + { + "epoch": 0.3699034863108135, + "grad_norm": 2.1235153675079346, + "learning_rate": 0.00014528274489842872, + "loss": 0.9917, + "step": 10329 + }, + { + "epoch": 0.36993929844038176, + "grad_norm": 1.58273184299469, + "learning_rate": 0.00014527240293510377, + "loss": 1.0382, + "step": 10330 + }, + { + "epoch": 0.36997511056995, + "grad_norm": 1.8782628774642944, + "learning_rate": 0.000145262060362697, + "loss": 1.3187, + "step": 10331 + }, + { + "epoch": 0.37001092269951835, + "grad_norm": 1.677203893661499, + "learning_rate": 0.00014525171718134762, + "loss": 1.3184, + "step": 10332 + }, + { + "epoch": 0.3700467348290866, + "grad_norm": 1.3605105876922607, + "learning_rate": 0.00014524137339119478, + "loss": 1.2044, + "step": 10333 + }, + { + "epoch": 0.3700825469586549, + "grad_norm": 1.975957989692688, + "learning_rate": 0.00014523102899237754, + "loss": 1.0908, + "step": 10334 + }, + { + "epoch": 0.37011835908822316, + "grad_norm": 2.000730514526367, + "learning_rate": 0.00014522068398503522, + "loss": 1.2972, + "step": 10335 + }, + { + "epoch": 0.3701541712177915, + "grad_norm": 1.544758677482605, + "learning_rate": 0.00014521033836930689, + "loss": 0.8722, + "step": 10336 + }, + { + "epoch": 0.37018998334735975, + "grad_norm": 1.9027223587036133, + "learning_rate": 0.0001451999921453318, + "loss": 1.172, + "step": 10337 + }, + { + "epoch": 0.370225795476928, + "grad_norm": 1.5668352842330933, + "learning_rate": 0.00014518964531324907, + "loss": 1.1711, + "step": 10338 + }, + { + "epoch": 0.37026160760649635, + "grad_norm": 1.7756946086883545, + "learning_rate": 0.000145179297873198, + "loss": 1.468, + "step": 10339 + }, + { + "epoch": 0.3702974197360646, + "grad_norm": 1.5677233934402466, + "learning_rate": 0.00014516894982531775, + "loss": 1.0886, + "step": 10340 + }, + { + "epoch": 0.3703332318656329, + "grad_norm": 1.4947408437728882, + "learning_rate": 0.00014515860116974752, + "loss": 1.2955, + "step": 10341 + }, + { + "epoch": 0.37036904399520115, + "grad_norm": 1.3835188150405884, + "learning_rate": 0.0001451482519066266, + "loss": 0.9605, + "step": 10342 + }, + { + "epoch": 0.3704048561247695, + "grad_norm": 1.542248010635376, + "learning_rate": 0.00014513790203609416, + "loss": 1.0178, + "step": 10343 + }, + { + "epoch": 0.37044066825433775, + "grad_norm": 1.601357340812683, + "learning_rate": 0.0001451275515582895, + "loss": 1.2769, + "step": 10344 + }, + { + "epoch": 0.370476480383906, + "grad_norm": 1.3413798809051514, + "learning_rate": 0.0001451172004733518, + "loss": 1.2282, + "step": 10345 + }, + { + "epoch": 0.37051229251347434, + "grad_norm": 1.6398290395736694, + "learning_rate": 0.00014510684878142038, + "loss": 1.2555, + "step": 10346 + }, + { + "epoch": 0.3705481046430426, + "grad_norm": 1.4033088684082031, + "learning_rate": 0.00014509649648263449, + "loss": 1.2465, + "step": 10347 + }, + { + "epoch": 0.3705839167726109, + "grad_norm": 1.5917178392410278, + "learning_rate": 0.00014508614357713342, + "loss": 1.082, + "step": 10348 + }, + { + "epoch": 0.37061972890217915, + "grad_norm": 1.4031621217727661, + "learning_rate": 0.00014507579006505642, + "loss": 1.0671, + "step": 10349 + }, + { + "epoch": 0.37065554103174747, + "grad_norm": 1.9115808010101318, + "learning_rate": 0.00014506543594654288, + "loss": 1.0652, + "step": 10350 + }, + { + "epoch": 0.37069135316131574, + "grad_norm": 2.047037363052368, + "learning_rate": 0.00014505508122173198, + "loss": 1.2934, + "step": 10351 + }, + { + "epoch": 0.370727165290884, + "grad_norm": 1.4323323965072632, + "learning_rate": 0.00014504472589076307, + "loss": 1.3262, + "step": 10352 + }, + { + "epoch": 0.37076297742045233, + "grad_norm": 1.3678412437438965, + "learning_rate": 0.00014503436995377548, + "loss": 1.2218, + "step": 10353 + }, + { + "epoch": 0.3707987895500206, + "grad_norm": 1.383673071861267, + "learning_rate": 0.00014502401341090853, + "loss": 1.0588, + "step": 10354 + }, + { + "epoch": 0.37083460167958887, + "grad_norm": 1.3877700567245483, + "learning_rate": 0.00014501365626230157, + "loss": 1.3448, + "step": 10355 + }, + { + "epoch": 0.37087041380915714, + "grad_norm": 1.9005107879638672, + "learning_rate": 0.00014500329850809394, + "loss": 1.219, + "step": 10356 + }, + { + "epoch": 0.37090622593872546, + "grad_norm": 1.2821680307388306, + "learning_rate": 0.00014499294014842494, + "loss": 0.9249, + "step": 10357 + }, + { + "epoch": 0.37094203806829373, + "grad_norm": 1.2554670572280884, + "learning_rate": 0.000144982581183434, + "loss": 0.9205, + "step": 10358 + }, + { + "epoch": 0.370977850197862, + "grad_norm": 1.4187244176864624, + "learning_rate": 0.00014497222161326045, + "loss": 1.0365, + "step": 10359 + }, + { + "epoch": 0.3710136623274303, + "grad_norm": 1.335163950920105, + "learning_rate": 0.00014496186143804366, + "loss": 1.0519, + "step": 10360 + }, + { + "epoch": 0.3710494744569986, + "grad_norm": 1.5487794876098633, + "learning_rate": 0.000144951500657923, + "loss": 1.1269, + "step": 10361 + }, + { + "epoch": 0.37108528658656686, + "grad_norm": 1.6330479383468628, + "learning_rate": 0.00014494113927303792, + "loss": 1.0824, + "step": 10362 + }, + { + "epoch": 0.37112109871613513, + "grad_norm": 1.8847358226776123, + "learning_rate": 0.00014493077728352778, + "loss": 1.2692, + "step": 10363 + }, + { + "epoch": 0.37115691084570346, + "grad_norm": 1.5343530178070068, + "learning_rate": 0.00014492041468953194, + "loss": 0.9915, + "step": 10364 + }, + { + "epoch": 0.3711927229752717, + "grad_norm": 1.3022723197937012, + "learning_rate": 0.0001449100514911899, + "loss": 0.9667, + "step": 10365 + }, + { + "epoch": 0.37122853510484, + "grad_norm": 1.6260838508605957, + "learning_rate": 0.00014489968768864107, + "loss": 1.1717, + "step": 10366 + }, + { + "epoch": 0.3712643472344083, + "grad_norm": 1.5207087993621826, + "learning_rate": 0.00014488932328202484, + "loss": 1.1775, + "step": 10367 + }, + { + "epoch": 0.3713001593639766, + "grad_norm": 1.7596250772476196, + "learning_rate": 0.00014487895827148067, + "loss": 1.3565, + "step": 10368 + }, + { + "epoch": 0.37133597149354486, + "grad_norm": 2.0709879398345947, + "learning_rate": 0.00014486859265714798, + "loss": 1.19, + "step": 10369 + }, + { + "epoch": 0.3713717836231131, + "grad_norm": 1.914086937904358, + "learning_rate": 0.00014485822643916626, + "loss": 1.0265, + "step": 10370 + }, + { + "epoch": 0.37140759575268145, + "grad_norm": 1.392157793045044, + "learning_rate": 0.00014484785961767498, + "loss": 1.135, + "step": 10371 + }, + { + "epoch": 0.3714434078822497, + "grad_norm": 1.9352325201034546, + "learning_rate": 0.0001448374921928136, + "loss": 1.0919, + "step": 10372 + }, + { + "epoch": 0.371479220011818, + "grad_norm": 1.6601988077163696, + "learning_rate": 0.00014482712416472157, + "loss": 1.4059, + "step": 10373 + }, + { + "epoch": 0.3715150321413863, + "grad_norm": 1.455023169517517, + "learning_rate": 0.00014481675553353843, + "loss": 0.9733, + "step": 10374 + }, + { + "epoch": 0.3715508442709546, + "grad_norm": 1.7432026863098145, + "learning_rate": 0.00014480638629940366, + "loss": 1.3743, + "step": 10375 + }, + { + "epoch": 0.37158665640052285, + "grad_norm": 1.5364363193511963, + "learning_rate": 0.00014479601646245676, + "loss": 1.0548, + "step": 10376 + }, + { + "epoch": 0.3716224685300911, + "grad_norm": 1.6837997436523438, + "learning_rate": 0.00014478564602283725, + "loss": 1.1013, + "step": 10377 + }, + { + "epoch": 0.37165828065965945, + "grad_norm": 1.689933180809021, + "learning_rate": 0.0001447752749806846, + "loss": 1.1552, + "step": 10378 + }, + { + "epoch": 0.3716940927892277, + "grad_norm": 1.8003361225128174, + "learning_rate": 0.00014476490333613842, + "loss": 1.1719, + "step": 10379 + }, + { + "epoch": 0.371729904918796, + "grad_norm": 1.7540369033813477, + "learning_rate": 0.00014475453108933817, + "loss": 1.2923, + "step": 10380 + }, + { + "epoch": 0.3717657170483643, + "grad_norm": 1.549452543258667, + "learning_rate": 0.00014474415824042346, + "loss": 0.9743, + "step": 10381 + }, + { + "epoch": 0.3718015291779326, + "grad_norm": 1.4493972063064575, + "learning_rate": 0.0001447337847895338, + "loss": 1.2722, + "step": 10382 + }, + { + "epoch": 0.37183734130750085, + "grad_norm": 2.1160247325897217, + "learning_rate": 0.00014472341073680883, + "loss": 1.504, + "step": 10383 + }, + { + "epoch": 0.3718731534370691, + "grad_norm": 1.6895430088043213, + "learning_rate": 0.00014471303608238798, + "loss": 1.0734, + "step": 10384 + }, + { + "epoch": 0.37190896556663744, + "grad_norm": 1.8025500774383545, + "learning_rate": 0.00014470266082641095, + "loss": 1.2961, + "step": 10385 + }, + { + "epoch": 0.3719447776962057, + "grad_norm": 1.9869662523269653, + "learning_rate": 0.00014469228496901727, + "loss": 1.305, + "step": 10386 + }, + { + "epoch": 0.371980589825774, + "grad_norm": 1.6500483751296997, + "learning_rate": 0.00014468190851034656, + "loss": 1.2464, + "step": 10387 + }, + { + "epoch": 0.3720164019553423, + "grad_norm": 1.4363914728164673, + "learning_rate": 0.0001446715314505384, + "loss": 1.3419, + "step": 10388 + }, + { + "epoch": 0.37205221408491057, + "grad_norm": 1.413333773612976, + "learning_rate": 0.00014466115378973236, + "loss": 1.1666, + "step": 10389 + }, + { + "epoch": 0.37208802621447884, + "grad_norm": 1.5483415126800537, + "learning_rate": 0.00014465077552806813, + "loss": 1.29, + "step": 10390 + }, + { + "epoch": 0.3721238383440471, + "grad_norm": 1.509721279144287, + "learning_rate": 0.00014464039666568532, + "loss": 0.9888, + "step": 10391 + }, + { + "epoch": 0.37215965047361543, + "grad_norm": 1.460334062576294, + "learning_rate": 0.00014463001720272357, + "loss": 1.2245, + "step": 10392 + }, + { + "epoch": 0.3721954626031837, + "grad_norm": 1.3562787771224976, + "learning_rate": 0.00014461963713932247, + "loss": 0.9882, + "step": 10393 + }, + { + "epoch": 0.37223127473275197, + "grad_norm": 1.7697736024856567, + "learning_rate": 0.00014460925647562174, + "loss": 1.0666, + "step": 10394 + }, + { + "epoch": 0.3722670868623203, + "grad_norm": 1.5959316492080688, + "learning_rate": 0.000144598875211761, + "loss": 1.2833, + "step": 10395 + }, + { + "epoch": 0.37230289899188856, + "grad_norm": 1.2369204759597778, + "learning_rate": 0.00014458849334787993, + "loss": 1.0864, + "step": 10396 + }, + { + "epoch": 0.37233871112145683, + "grad_norm": 1.5716081857681274, + "learning_rate": 0.00014457811088411816, + "loss": 1.2417, + "step": 10397 + }, + { + "epoch": 0.3723745232510251, + "grad_norm": 1.3588340282440186, + "learning_rate": 0.00014456772782061545, + "loss": 0.9758, + "step": 10398 + }, + { + "epoch": 0.3724103353805934, + "grad_norm": 2.1585912704467773, + "learning_rate": 0.00014455734415751143, + "loss": 1.2011, + "step": 10399 + }, + { + "epoch": 0.3724461475101617, + "grad_norm": 1.4919794797897339, + "learning_rate": 0.00014454695989494582, + "loss": 1.2109, + "step": 10400 + }, + { + "epoch": 0.37248195963972996, + "grad_norm": 1.514877438545227, + "learning_rate": 0.00014453657503305832, + "loss": 1.0302, + "step": 10401 + }, + { + "epoch": 0.37251777176929823, + "grad_norm": 2.1386218070983887, + "learning_rate": 0.00014452618957198866, + "loss": 1.1488, + "step": 10402 + }, + { + "epoch": 0.37255358389886656, + "grad_norm": 1.3604559898376465, + "learning_rate": 0.00014451580351187656, + "loss": 1.2579, + "step": 10403 + }, + { + "epoch": 0.3725893960284348, + "grad_norm": 1.9289608001708984, + "learning_rate": 0.00014450541685286173, + "loss": 1.2863, + "step": 10404 + }, + { + "epoch": 0.3726252081580031, + "grad_norm": 1.3389389514923096, + "learning_rate": 0.00014449502959508394, + "loss": 1.2549, + "step": 10405 + }, + { + "epoch": 0.3726610202875714, + "grad_norm": 1.4950898885726929, + "learning_rate": 0.00014448464173868293, + "loss": 1.0552, + "step": 10406 + }, + { + "epoch": 0.3726968324171397, + "grad_norm": 1.710965871810913, + "learning_rate": 0.00014447425328379843, + "loss": 0.879, + "step": 10407 + }, + { + "epoch": 0.37273264454670796, + "grad_norm": 1.6750081777572632, + "learning_rate": 0.00014446386423057022, + "loss": 0.9728, + "step": 10408 + }, + { + "epoch": 0.3727684566762762, + "grad_norm": 1.745764970779419, + "learning_rate": 0.00014445347457913807, + "loss": 1.0002, + "step": 10409 + }, + { + "epoch": 0.37280426880584455, + "grad_norm": 1.5952363014221191, + "learning_rate": 0.00014444308432964175, + "loss": 1.0224, + "step": 10410 + }, + { + "epoch": 0.3728400809354128, + "grad_norm": 2.680539846420288, + "learning_rate": 0.00014443269348222109, + "loss": 1.2661, + "step": 10411 + }, + { + "epoch": 0.3728758930649811, + "grad_norm": 1.682770013809204, + "learning_rate": 0.00014442230203701582, + "loss": 1.0806, + "step": 10412 + }, + { + "epoch": 0.3729117051945494, + "grad_norm": 1.7116855382919312, + "learning_rate": 0.0001444119099941658, + "loss": 1.1244, + "step": 10413 + }, + { + "epoch": 0.3729475173241177, + "grad_norm": 1.6776255369186401, + "learning_rate": 0.0001444015173538108, + "loss": 1.2354, + "step": 10414 + }, + { + "epoch": 0.37298332945368595, + "grad_norm": 1.5159648656845093, + "learning_rate": 0.00014439112411609065, + "loss": 1.0611, + "step": 10415 + }, + { + "epoch": 0.3730191415832542, + "grad_norm": 2.122450351715088, + "learning_rate": 0.00014438073028114523, + "loss": 1.1365, + "step": 10416 + }, + { + "epoch": 0.37305495371282255, + "grad_norm": 1.6313506364822388, + "learning_rate": 0.00014437033584911428, + "loss": 1.212, + "step": 10417 + }, + { + "epoch": 0.3730907658423908, + "grad_norm": 1.8764395713806152, + "learning_rate": 0.00014435994082013772, + "loss": 1.0486, + "step": 10418 + }, + { + "epoch": 0.3731265779719591, + "grad_norm": 1.320555567741394, + "learning_rate": 0.00014434954519435537, + "loss": 1.1576, + "step": 10419 + }, + { + "epoch": 0.3731623901015274, + "grad_norm": 1.7596385478973389, + "learning_rate": 0.0001443391489719071, + "loss": 1.1025, + "step": 10420 + }, + { + "epoch": 0.3731982022310957, + "grad_norm": 1.855676531791687, + "learning_rate": 0.0001443287521529328, + "loss": 1.1418, + "step": 10421 + }, + { + "epoch": 0.37323401436066395, + "grad_norm": 1.51803457736969, + "learning_rate": 0.00014431835473757227, + "loss": 0.9116, + "step": 10422 + }, + { + "epoch": 0.3732698264902322, + "grad_norm": 1.390568733215332, + "learning_rate": 0.0001443079567259655, + "loss": 1.2114, + "step": 10423 + }, + { + "epoch": 0.37330563861980054, + "grad_norm": 1.9044382572174072, + "learning_rate": 0.00014429755811825226, + "loss": 1.095, + "step": 10424 + }, + { + "epoch": 0.3733414507493688, + "grad_norm": 1.4102803468704224, + "learning_rate": 0.00014428715891457255, + "loss": 1.1338, + "step": 10425 + }, + { + "epoch": 0.3733772628789371, + "grad_norm": 1.7611323595046997, + "learning_rate": 0.00014427675911506623, + "loss": 1.0785, + "step": 10426 + }, + { + "epoch": 0.3734130750085054, + "grad_norm": 1.5695034265518188, + "learning_rate": 0.00014426635871987327, + "loss": 1.1314, + "step": 10427 + }, + { + "epoch": 0.37344888713807367, + "grad_norm": 1.460630178451538, + "learning_rate": 0.0001442559577291335, + "loss": 0.9958, + "step": 10428 + }, + { + "epoch": 0.37348469926764194, + "grad_norm": 1.5520685911178589, + "learning_rate": 0.00014424555614298693, + "loss": 1.2002, + "step": 10429 + }, + { + "epoch": 0.3735205113972102, + "grad_norm": 1.6285802125930786, + "learning_rate": 0.0001442351539615735, + "loss": 1.0631, + "step": 10430 + }, + { + "epoch": 0.37355632352677853, + "grad_norm": 1.4034781455993652, + "learning_rate": 0.00014422475118503307, + "loss": 1.3143, + "step": 10431 + }, + { + "epoch": 0.3735921356563468, + "grad_norm": 1.6311842203140259, + "learning_rate": 0.0001442143478135057, + "loss": 1.1058, + "step": 10432 + }, + { + "epoch": 0.37362794778591507, + "grad_norm": 1.4094206094741821, + "learning_rate": 0.00014420394384713129, + "loss": 1.202, + "step": 10433 + }, + { + "epoch": 0.3736637599154834, + "grad_norm": 1.5041565895080566, + "learning_rate": 0.00014419353928604988, + "loss": 1.1199, + "step": 10434 + }, + { + "epoch": 0.37369957204505166, + "grad_norm": 1.8962926864624023, + "learning_rate": 0.00014418313413040138, + "loss": 0.9312, + "step": 10435 + }, + { + "epoch": 0.37373538417461993, + "grad_norm": 1.7728314399719238, + "learning_rate": 0.00014417272838032578, + "loss": 1.1764, + "step": 10436 + }, + { + "epoch": 0.3737711963041882, + "grad_norm": 1.944075107574463, + "learning_rate": 0.00014416232203596312, + "loss": 1.1033, + "step": 10437 + }, + { + "epoch": 0.3738070084337565, + "grad_norm": 1.6606757640838623, + "learning_rate": 0.00014415191509745338, + "loss": 1.0336, + "step": 10438 + }, + { + "epoch": 0.3738428205633248, + "grad_norm": 2.244061231613159, + "learning_rate": 0.0001441415075649366, + "loss": 1.1406, + "step": 10439 + }, + { + "epoch": 0.37387863269289306, + "grad_norm": 1.6896034479141235, + "learning_rate": 0.00014413109943855275, + "loss": 0.9889, + "step": 10440 + }, + { + "epoch": 0.3739144448224614, + "grad_norm": 1.6852695941925049, + "learning_rate": 0.00014412069071844186, + "loss": 1.118, + "step": 10441 + }, + { + "epoch": 0.37395025695202966, + "grad_norm": 1.7655450105667114, + "learning_rate": 0.00014411028140474402, + "loss": 1.2729, + "step": 10442 + }, + { + "epoch": 0.3739860690815979, + "grad_norm": 1.6019580364227295, + "learning_rate": 0.0001440998714975992, + "loss": 1.1964, + "step": 10443 + }, + { + "epoch": 0.3740218812111662, + "grad_norm": 1.710414171218872, + "learning_rate": 0.00014408946099714754, + "loss": 1.3003, + "step": 10444 + }, + { + "epoch": 0.3740576933407345, + "grad_norm": 1.591282606124878, + "learning_rate": 0.00014407904990352904, + "loss": 1.201, + "step": 10445 + }, + { + "epoch": 0.3740935054703028, + "grad_norm": 1.7533255815505981, + "learning_rate": 0.00014406863821688374, + "loss": 1.3085, + "step": 10446 + }, + { + "epoch": 0.37412931759987106, + "grad_norm": 1.3973376750946045, + "learning_rate": 0.00014405822593735183, + "loss": 1.2687, + "step": 10447 + }, + { + "epoch": 0.3741651297294394, + "grad_norm": 1.3887263536453247, + "learning_rate": 0.0001440478130650733, + "loss": 0.8822, + "step": 10448 + }, + { + "epoch": 0.37420094185900765, + "grad_norm": 1.4797534942626953, + "learning_rate": 0.00014403739960018824, + "loss": 1.0771, + "step": 10449 + }, + { + "epoch": 0.3742367539885759, + "grad_norm": 1.2296708822250366, + "learning_rate": 0.00014402698554283675, + "loss": 1.2119, + "step": 10450 + }, + { + "epoch": 0.3742725661181442, + "grad_norm": 1.9264649152755737, + "learning_rate": 0.00014401657089315904, + "loss": 1.2122, + "step": 10451 + }, + { + "epoch": 0.3743083782477125, + "grad_norm": 1.88303542137146, + "learning_rate": 0.00014400615565129507, + "loss": 1.0886, + "step": 10452 + }, + { + "epoch": 0.3743441903772808, + "grad_norm": 2.202104091644287, + "learning_rate": 0.00014399573981738507, + "loss": 1.0785, + "step": 10453 + }, + { + "epoch": 0.37438000250684905, + "grad_norm": 1.5108681917190552, + "learning_rate": 0.00014398532339156912, + "loss": 1.0839, + "step": 10454 + }, + { + "epoch": 0.3744158146364174, + "grad_norm": 1.869485855102539, + "learning_rate": 0.00014397490637398742, + "loss": 1.07, + "step": 10455 + }, + { + "epoch": 0.37445162676598565, + "grad_norm": 1.2426440715789795, + "learning_rate": 0.00014396448876478007, + "loss": 1.1379, + "step": 10456 + }, + { + "epoch": 0.3744874388955539, + "grad_norm": 1.529839277267456, + "learning_rate": 0.00014395407056408722, + "loss": 1.1619, + "step": 10457 + }, + { + "epoch": 0.3745232510251222, + "grad_norm": 2.118504285812378, + "learning_rate": 0.00014394365177204904, + "loss": 1.3331, + "step": 10458 + }, + { + "epoch": 0.3745590631546905, + "grad_norm": 1.7520041465759277, + "learning_rate": 0.00014393323238880571, + "loss": 0.9849, + "step": 10459 + }, + { + "epoch": 0.3745948752842588, + "grad_norm": 1.3344172239303589, + "learning_rate": 0.00014392281241449743, + "loss": 1.0842, + "step": 10460 + }, + { + "epoch": 0.37463068741382705, + "grad_norm": 1.4448012113571167, + "learning_rate": 0.00014391239184926433, + "loss": 1.0424, + "step": 10461 + }, + { + "epoch": 0.37466649954339537, + "grad_norm": 1.348976492881775, + "learning_rate": 0.00014390197069324667, + "loss": 1.2781, + "step": 10462 + }, + { + "epoch": 0.37470231167296364, + "grad_norm": 1.5094586610794067, + "learning_rate": 0.0001438915489465846, + "loss": 1.0351, + "step": 10463 + }, + { + "epoch": 0.3747381238025319, + "grad_norm": 1.7666783332824707, + "learning_rate": 0.0001438811266094184, + "loss": 1.3367, + "step": 10464 + }, + { + "epoch": 0.3747739359321002, + "grad_norm": 1.5230839252471924, + "learning_rate": 0.0001438707036818882, + "loss": 1.1778, + "step": 10465 + }, + { + "epoch": 0.3748097480616685, + "grad_norm": 1.505164384841919, + "learning_rate": 0.00014386028016413426, + "loss": 1.0497, + "step": 10466 + }, + { + "epoch": 0.37484556019123677, + "grad_norm": 2.0688259601593018, + "learning_rate": 0.00014384985605629685, + "loss": 1.121, + "step": 10467 + }, + { + "epoch": 0.37488137232080504, + "grad_norm": 1.7429180145263672, + "learning_rate": 0.0001438394313585162, + "loss": 1.0509, + "step": 10468 + }, + { + "epoch": 0.37491718445037336, + "grad_norm": 1.6239659786224365, + "learning_rate": 0.00014382900607093254, + "loss": 1.2542, + "step": 10469 + }, + { + "epoch": 0.37495299657994163, + "grad_norm": 2.159344434738159, + "learning_rate": 0.00014381858019368613, + "loss": 1.3441, + "step": 10470 + }, + { + "epoch": 0.3749888087095099, + "grad_norm": 1.6711456775665283, + "learning_rate": 0.00014380815372691728, + "loss": 1.0533, + "step": 10471 + }, + { + "epoch": 0.37502462083907817, + "grad_norm": 1.5430355072021484, + "learning_rate": 0.00014379772667076618, + "loss": 1.3258, + "step": 10472 + }, + { + "epoch": 0.3750604329686465, + "grad_norm": 1.6612141132354736, + "learning_rate": 0.0001437872990253732, + "loss": 1.1585, + "step": 10473 + }, + { + "epoch": 0.37509624509821476, + "grad_norm": 1.4697226285934448, + "learning_rate": 0.0001437768707908786, + "loss": 1.2478, + "step": 10474 + }, + { + "epoch": 0.37513205722778303, + "grad_norm": 1.380982518196106, + "learning_rate": 0.00014376644196742263, + "loss": 1.0438, + "step": 10475 + }, + { + "epoch": 0.37516786935735136, + "grad_norm": 1.210137963294983, + "learning_rate": 0.00014375601255514565, + "loss": 0.9079, + "step": 10476 + }, + { + "epoch": 0.3752036814869196, + "grad_norm": 2.4806694984436035, + "learning_rate": 0.00014374558255418797, + "loss": 1.014, + "step": 10477 + }, + { + "epoch": 0.3752394936164879, + "grad_norm": 1.9158201217651367, + "learning_rate": 0.00014373515196468991, + "loss": 0.9972, + "step": 10478 + }, + { + "epoch": 0.37527530574605616, + "grad_norm": 1.4997025728225708, + "learning_rate": 0.00014372472078679177, + "loss": 1.0439, + "step": 10479 + }, + { + "epoch": 0.3753111178756245, + "grad_norm": 2.392381191253662, + "learning_rate": 0.00014371428902063395, + "loss": 1.2145, + "step": 10480 + }, + { + "epoch": 0.37534693000519276, + "grad_norm": 1.579236388206482, + "learning_rate": 0.00014370385666635674, + "loss": 0.9845, + "step": 10481 + }, + { + "epoch": 0.375382742134761, + "grad_norm": 1.7665538787841797, + "learning_rate": 0.00014369342372410053, + "loss": 1.1719, + "step": 10482 + }, + { + "epoch": 0.37541855426432935, + "grad_norm": 1.54545259475708, + "learning_rate": 0.00014368299019400563, + "loss": 0.82, + "step": 10483 + }, + { + "epoch": 0.3754543663938976, + "grad_norm": 1.6557285785675049, + "learning_rate": 0.0001436725560762125, + "loss": 1.3152, + "step": 10484 + }, + { + "epoch": 0.3754901785234659, + "grad_norm": 1.5518207550048828, + "learning_rate": 0.0001436621213708614, + "loss": 1.1616, + "step": 10485 + }, + { + "epoch": 0.37552599065303416, + "grad_norm": 1.797467827796936, + "learning_rate": 0.0001436516860780928, + "loss": 1.1944, + "step": 10486 + }, + { + "epoch": 0.3755618027826025, + "grad_norm": 1.5801595449447632, + "learning_rate": 0.00014364125019804708, + "loss": 1.3844, + "step": 10487 + }, + { + "epoch": 0.37559761491217075, + "grad_norm": 2.3659234046936035, + "learning_rate": 0.00014363081373086462, + "loss": 1.3182, + "step": 10488 + }, + { + "epoch": 0.375633427041739, + "grad_norm": 1.6590784788131714, + "learning_rate": 0.00014362037667668584, + "loss": 0.9555, + "step": 10489 + }, + { + "epoch": 0.37566923917130735, + "grad_norm": 1.996625304222107, + "learning_rate": 0.00014360993903565116, + "loss": 1.4798, + "step": 10490 + }, + { + "epoch": 0.3757050513008756, + "grad_norm": 1.32607901096344, + "learning_rate": 0.00014359950080790101, + "loss": 1.1752, + "step": 10491 + }, + { + "epoch": 0.3757408634304439, + "grad_norm": 1.4647752046585083, + "learning_rate": 0.0001435890619935758, + "loss": 1.0466, + "step": 10492 + }, + { + "epoch": 0.37577667556001215, + "grad_norm": 1.3597625494003296, + "learning_rate": 0.00014357862259281603, + "loss": 1.2667, + "step": 10493 + }, + { + "epoch": 0.3758124876895805, + "grad_norm": 1.787951946258545, + "learning_rate": 0.00014356818260576206, + "loss": 1.0982, + "step": 10494 + }, + { + "epoch": 0.37584829981914875, + "grad_norm": 1.540905475616455, + "learning_rate": 0.0001435577420325544, + "loss": 1.1813, + "step": 10495 + }, + { + "epoch": 0.375884111948717, + "grad_norm": 1.4616777896881104, + "learning_rate": 0.0001435473008733335, + "loss": 1.197, + "step": 10496 + }, + { + "epoch": 0.37591992407828534, + "grad_norm": 1.9380156993865967, + "learning_rate": 0.00014353685912823987, + "loss": 1.0369, + "step": 10497 + }, + { + "epoch": 0.3759557362078536, + "grad_norm": 1.281128168106079, + "learning_rate": 0.00014352641679741393, + "loss": 1.2292, + "step": 10498 + }, + { + "epoch": 0.3759915483374219, + "grad_norm": 1.2548588514328003, + "learning_rate": 0.0001435159738809962, + "loss": 0.9847, + "step": 10499 + }, + { + "epoch": 0.37602736046699015, + "grad_norm": 1.6887837648391724, + "learning_rate": 0.0001435055303791272, + "loss": 1.1209, + "step": 10500 + }, + { + "epoch": 0.37606317259655847, + "grad_norm": 1.9397847652435303, + "learning_rate": 0.00014349508629194738, + "loss": 1.1504, + "step": 10501 + }, + { + "epoch": 0.37609898472612674, + "grad_norm": 1.5845630168914795, + "learning_rate": 0.00014348464161959728, + "loss": 1.3004, + "step": 10502 + }, + { + "epoch": 0.376134796855695, + "grad_norm": 1.6772267818450928, + "learning_rate": 0.0001434741963622174, + "loss": 1.0555, + "step": 10503 + }, + { + "epoch": 0.37617060898526333, + "grad_norm": 1.5182394981384277, + "learning_rate": 0.00014346375051994833, + "loss": 1.1118, + "step": 10504 + }, + { + "epoch": 0.3762064211148316, + "grad_norm": 1.4239946603775024, + "learning_rate": 0.00014345330409293053, + "loss": 1.1148, + "step": 10505 + }, + { + "epoch": 0.37624223324439987, + "grad_norm": 1.5598915815353394, + "learning_rate": 0.0001434428570813046, + "loss": 1.1884, + "step": 10506 + }, + { + "epoch": 0.37627804537396814, + "grad_norm": 1.425937294960022, + "learning_rate": 0.00014343240948521104, + "loss": 1.0785, + "step": 10507 + }, + { + "epoch": 0.37631385750353646, + "grad_norm": 1.7741011381149292, + "learning_rate": 0.00014342196130479043, + "loss": 1.2437, + "step": 10508 + }, + { + "epoch": 0.37634966963310473, + "grad_norm": 1.6043827533721924, + "learning_rate": 0.0001434115125401834, + "loss": 1.2311, + "step": 10509 + }, + { + "epoch": 0.376385481762673, + "grad_norm": 1.6382536888122559, + "learning_rate": 0.00014340106319153038, + "loss": 1.3076, + "step": 10510 + }, + { + "epoch": 0.3764212938922413, + "grad_norm": 1.5487408638000488, + "learning_rate": 0.0001433906132589721, + "loss": 1.0228, + "step": 10511 + }, + { + "epoch": 0.3764571060218096, + "grad_norm": 1.3585165739059448, + "learning_rate": 0.00014338016274264905, + "loss": 1.2419, + "step": 10512 + }, + { + "epoch": 0.37649291815137786, + "grad_norm": 1.7043477296829224, + "learning_rate": 0.0001433697116427019, + "loss": 1.3073, + "step": 10513 + }, + { + "epoch": 0.37652873028094613, + "grad_norm": 1.430136799812317, + "learning_rate": 0.0001433592599592712, + "loss": 1.073, + "step": 10514 + }, + { + "epoch": 0.37656454241051446, + "grad_norm": 1.854625940322876, + "learning_rate": 0.00014334880769249758, + "loss": 1.1561, + "step": 10515 + }, + { + "epoch": 0.3766003545400827, + "grad_norm": 1.389679193496704, + "learning_rate": 0.00014333835484252167, + "loss": 1.3172, + "step": 10516 + }, + { + "epoch": 0.376636166669651, + "grad_norm": 1.4084864854812622, + "learning_rate": 0.00014332790140948414, + "loss": 1.1498, + "step": 10517 + }, + { + "epoch": 0.3766719787992193, + "grad_norm": 1.7700613737106323, + "learning_rate": 0.00014331744739352556, + "loss": 1.3227, + "step": 10518 + }, + { + "epoch": 0.3767077909287876, + "grad_norm": 1.2470078468322754, + "learning_rate": 0.0001433069927947866, + "loss": 1.2069, + "step": 10519 + }, + { + "epoch": 0.37674360305835586, + "grad_norm": 1.8183808326721191, + "learning_rate": 0.0001432965376134079, + "loss": 1.3925, + "step": 10520 + }, + { + "epoch": 0.3767794151879241, + "grad_norm": 1.5808792114257812, + "learning_rate": 0.00014328608184953012, + "loss": 1.0555, + "step": 10521 + }, + { + "epoch": 0.37681522731749245, + "grad_norm": 1.5300661325454712, + "learning_rate": 0.000143275625503294, + "loss": 1.1257, + "step": 10522 + }, + { + "epoch": 0.3768510394470607, + "grad_norm": 1.8388609886169434, + "learning_rate": 0.0001432651685748401, + "loss": 1.1607, + "step": 10523 + }, + { + "epoch": 0.376886851576629, + "grad_norm": 1.7156720161437988, + "learning_rate": 0.0001432547110643092, + "loss": 1.227, + "step": 10524 + }, + { + "epoch": 0.3769226637061973, + "grad_norm": 1.6666383743286133, + "learning_rate": 0.00014324425297184193, + "loss": 1.2272, + "step": 10525 + }, + { + "epoch": 0.3769584758357656, + "grad_norm": 1.7671054601669312, + "learning_rate": 0.00014323379429757906, + "loss": 1.3191, + "step": 10526 + }, + { + "epoch": 0.37699428796533385, + "grad_norm": 1.8710664510726929, + "learning_rate": 0.00014322333504166124, + "loss": 1.3009, + "step": 10527 + }, + { + "epoch": 0.3770301000949021, + "grad_norm": 2.749913454055786, + "learning_rate": 0.00014321287520422917, + "loss": 1.1724, + "step": 10528 + }, + { + "epoch": 0.37706591222447045, + "grad_norm": 1.2870534658432007, + "learning_rate": 0.00014320241478542363, + "loss": 1.0878, + "step": 10529 + }, + { + "epoch": 0.3771017243540387, + "grad_norm": 1.62325918674469, + "learning_rate": 0.0001431919537853853, + "loss": 1.1916, + "step": 10530 + }, + { + "epoch": 0.377137536483607, + "grad_norm": 1.7270344495773315, + "learning_rate": 0.000143181492204255, + "loss": 1.0877, + "step": 10531 + }, + { + "epoch": 0.3771733486131753, + "grad_norm": 1.445548176765442, + "learning_rate": 0.0001431710300421734, + "loss": 1.2213, + "step": 10532 + }, + { + "epoch": 0.3772091607427436, + "grad_norm": 1.5974400043487549, + "learning_rate": 0.00014316056729928126, + "loss": 1.3816, + "step": 10533 + }, + { + "epoch": 0.37724497287231185, + "grad_norm": 2.315377712249756, + "learning_rate": 0.00014315010397571937, + "loss": 1.2644, + "step": 10534 + }, + { + "epoch": 0.3772807850018801, + "grad_norm": 2.7439260482788086, + "learning_rate": 0.0001431396400716285, + "loss": 1.4684, + "step": 10535 + }, + { + "epoch": 0.37731659713144844, + "grad_norm": 1.3973640203475952, + "learning_rate": 0.00014312917558714943, + "loss": 1.1144, + "step": 10536 + }, + { + "epoch": 0.3773524092610167, + "grad_norm": 1.6425228118896484, + "learning_rate": 0.00014311871052242293, + "loss": 1.2622, + "step": 10537 + }, + { + "epoch": 0.377388221390585, + "grad_norm": 2.1687521934509277, + "learning_rate": 0.00014310824487758975, + "loss": 1.1171, + "step": 10538 + }, + { + "epoch": 0.3774240335201533, + "grad_norm": 1.4746471643447876, + "learning_rate": 0.00014309777865279078, + "loss": 0.9743, + "step": 10539 + }, + { + "epoch": 0.37745984564972157, + "grad_norm": 1.558475375175476, + "learning_rate": 0.00014308731184816678, + "loss": 1.2443, + "step": 10540 + }, + { + "epoch": 0.37749565777928984, + "grad_norm": 1.2878092527389526, + "learning_rate": 0.00014307684446385855, + "loss": 1.0372, + "step": 10541 + }, + { + "epoch": 0.3775314699088581, + "grad_norm": 1.3610345125198364, + "learning_rate": 0.000143066376500007, + "loss": 1.1964, + "step": 10542 + }, + { + "epoch": 0.37756728203842643, + "grad_norm": 1.444318413734436, + "learning_rate": 0.00014305590795675286, + "loss": 1.1685, + "step": 10543 + }, + { + "epoch": 0.3776030941679947, + "grad_norm": 1.7477495670318604, + "learning_rate": 0.00014304543883423708, + "loss": 1.3574, + "step": 10544 + }, + { + "epoch": 0.37763890629756297, + "grad_norm": 2.221991539001465, + "learning_rate": 0.0001430349691326004, + "loss": 1.1442, + "step": 10545 + }, + { + "epoch": 0.3776747184271313, + "grad_norm": 1.6133168935775757, + "learning_rate": 0.00014302449885198373, + "loss": 1.2102, + "step": 10546 + }, + { + "epoch": 0.37771053055669956, + "grad_norm": 1.3743300437927246, + "learning_rate": 0.00014301402799252793, + "loss": 1.0502, + "step": 10547 + }, + { + "epoch": 0.37774634268626783, + "grad_norm": 1.504784345626831, + "learning_rate": 0.00014300355655437385, + "loss": 1.1962, + "step": 10548 + }, + { + "epoch": 0.3777821548158361, + "grad_norm": 1.280097484588623, + "learning_rate": 0.00014299308453766238, + "loss": 1.1448, + "step": 10549 + }, + { + "epoch": 0.3778179669454044, + "grad_norm": 1.4433566331863403, + "learning_rate": 0.00014298261194253443, + "loss": 1.3357, + "step": 10550 + }, + { + "epoch": 0.3778537790749727, + "grad_norm": 1.5846706628799438, + "learning_rate": 0.00014297213876913087, + "loss": 1.2767, + "step": 10551 + }, + { + "epoch": 0.37788959120454096, + "grad_norm": 1.9302610158920288, + "learning_rate": 0.00014296166501759263, + "loss": 1.2199, + "step": 10552 + }, + { + "epoch": 0.3779254033341093, + "grad_norm": 1.4131442308425903, + "learning_rate": 0.00014295119068806063, + "loss": 1.1619, + "step": 10553 + }, + { + "epoch": 0.37796121546367756, + "grad_norm": 1.417655348777771, + "learning_rate": 0.00014294071578067568, + "loss": 1.1358, + "step": 10554 + }, + { + "epoch": 0.3779970275932458, + "grad_norm": 1.586219072341919, + "learning_rate": 0.00014293024029557886, + "loss": 1.0825, + "step": 10555 + }, + { + "epoch": 0.3780328397228141, + "grad_norm": 1.47831392288208, + "learning_rate": 0.000142919764232911, + "loss": 1.3174, + "step": 10556 + }, + { + "epoch": 0.3780686518523824, + "grad_norm": 1.4008208513259888, + "learning_rate": 0.0001429092875928131, + "loss": 1.1037, + "step": 10557 + }, + { + "epoch": 0.3781044639819507, + "grad_norm": 1.493168830871582, + "learning_rate": 0.00014289881037542605, + "loss": 1.1333, + "step": 10558 + }, + { + "epoch": 0.37814027611151896, + "grad_norm": 1.6311225891113281, + "learning_rate": 0.00014288833258089086, + "loss": 1.1244, + "step": 10559 + }, + { + "epoch": 0.3781760882410873, + "grad_norm": 1.2776373624801636, + "learning_rate": 0.00014287785420934846, + "loss": 1.1195, + "step": 10560 + }, + { + "epoch": 0.37821190037065555, + "grad_norm": 1.797434687614441, + "learning_rate": 0.0001428673752609399, + "loss": 0.9156, + "step": 10561 + }, + { + "epoch": 0.3782477125002238, + "grad_norm": 1.4112979173660278, + "learning_rate": 0.00014285689573580607, + "loss": 1.1527, + "step": 10562 + }, + { + "epoch": 0.3782835246297921, + "grad_norm": 1.6021002531051636, + "learning_rate": 0.00014284641563408796, + "loss": 1.3415, + "step": 10563 + }, + { + "epoch": 0.3783193367593604, + "grad_norm": 1.400423288345337, + "learning_rate": 0.00014283593495592663, + "loss": 1.1728, + "step": 10564 + }, + { + "epoch": 0.3783551488889287, + "grad_norm": 2.0995001792907715, + "learning_rate": 0.000142825453701463, + "loss": 1.2258, + "step": 10565 + }, + { + "epoch": 0.37839096101849695, + "grad_norm": 1.6746764183044434, + "learning_rate": 0.00014281497187083818, + "loss": 1.0417, + "step": 10566 + }, + { + "epoch": 0.3784267731480653, + "grad_norm": 1.499566674232483, + "learning_rate": 0.00014280448946419312, + "loss": 1.0985, + "step": 10567 + }, + { + "epoch": 0.37846258527763355, + "grad_norm": 1.3587145805358887, + "learning_rate": 0.0001427940064816689, + "loss": 1.1455, + "step": 10568 + }, + { + "epoch": 0.3784983974072018, + "grad_norm": 1.9506030082702637, + "learning_rate": 0.00014278352292340646, + "loss": 1.0587, + "step": 10569 + }, + { + "epoch": 0.3785342095367701, + "grad_norm": 1.6050993204116821, + "learning_rate": 0.00014277303878954694, + "loss": 1.1876, + "step": 10570 + }, + { + "epoch": 0.3785700216663384, + "grad_norm": 1.9197604656219482, + "learning_rate": 0.00014276255408023138, + "loss": 1.1622, + "step": 10571 + }, + { + "epoch": 0.3786058337959067, + "grad_norm": 1.2302416563034058, + "learning_rate": 0.00014275206879560079, + "loss": 1.1397, + "step": 10572 + }, + { + "epoch": 0.37864164592547495, + "grad_norm": 1.5080682039260864, + "learning_rate": 0.00014274158293579628, + "loss": 1.2517, + "step": 10573 + }, + { + "epoch": 0.37867745805504327, + "grad_norm": 1.4235804080963135, + "learning_rate": 0.00014273109650095886, + "loss": 1.0741, + "step": 10574 + }, + { + "epoch": 0.37871327018461154, + "grad_norm": 1.7373746633529663, + "learning_rate": 0.0001427206094912297, + "loss": 1.0623, + "step": 10575 + }, + { + "epoch": 0.3787490823141798, + "grad_norm": 1.4526360034942627, + "learning_rate": 0.00014271012190674983, + "loss": 0.9839, + "step": 10576 + }, + { + "epoch": 0.3787848944437481, + "grad_norm": 1.3972949981689453, + "learning_rate": 0.00014269963374766034, + "loss": 1.0191, + "step": 10577 + }, + { + "epoch": 0.3788207065733164, + "grad_norm": 1.920261025428772, + "learning_rate": 0.00014268914501410239, + "loss": 1.1592, + "step": 10578 + }, + { + "epoch": 0.37885651870288467, + "grad_norm": 1.4666427373886108, + "learning_rate": 0.00014267865570621706, + "loss": 1.1992, + "step": 10579 + }, + { + "epoch": 0.37889233083245294, + "grad_norm": 1.8517146110534668, + "learning_rate": 0.00014266816582414547, + "loss": 1.1069, + "step": 10580 + }, + { + "epoch": 0.37892814296202126, + "grad_norm": 2.0744035243988037, + "learning_rate": 0.00014265767536802873, + "loss": 1.2337, + "step": 10581 + }, + { + "epoch": 0.37896395509158953, + "grad_norm": 1.3761391639709473, + "learning_rate": 0.000142647184338008, + "loss": 1.2, + "step": 10582 + }, + { + "epoch": 0.3789997672211578, + "grad_norm": 1.6792322397232056, + "learning_rate": 0.0001426366927342244, + "loss": 1.278, + "step": 10583 + }, + { + "epoch": 0.37903557935072607, + "grad_norm": 1.4410008192062378, + "learning_rate": 0.0001426262005568191, + "loss": 0.9883, + "step": 10584 + }, + { + "epoch": 0.3790713914802944, + "grad_norm": 1.4186745882034302, + "learning_rate": 0.00014261570780593327, + "loss": 1.1539, + "step": 10585 + }, + { + "epoch": 0.37910720360986266, + "grad_norm": 1.7986811399459839, + "learning_rate": 0.00014260521448170805, + "loss": 1.2312, + "step": 10586 + }, + { + "epoch": 0.37914301573943093, + "grad_norm": 1.5876169204711914, + "learning_rate": 0.0001425947205842846, + "loss": 1.0553, + "step": 10587 + }, + { + "epoch": 0.37917882786899926, + "grad_norm": 1.5271782875061035, + "learning_rate": 0.00014258422611380418, + "loss": 1.1644, + "step": 10588 + }, + { + "epoch": 0.3792146399985675, + "grad_norm": 1.8598933219909668, + "learning_rate": 0.0001425737310704079, + "loss": 0.9528, + "step": 10589 + }, + { + "epoch": 0.3792504521281358, + "grad_norm": 1.6201739311218262, + "learning_rate": 0.000142563235454237, + "loss": 1.4285, + "step": 10590 + }, + { + "epoch": 0.37928626425770406, + "grad_norm": 1.6022567749023438, + "learning_rate": 0.00014255273926543264, + "loss": 1.1224, + "step": 10591 + }, + { + "epoch": 0.3793220763872724, + "grad_norm": 1.4672765731811523, + "learning_rate": 0.0001425422425041361, + "loss": 1.2658, + "step": 10592 + }, + { + "epoch": 0.37935788851684066, + "grad_norm": 1.4375985860824585, + "learning_rate": 0.00014253174517048854, + "loss": 1.0741, + "step": 10593 + }, + { + "epoch": 0.3793937006464089, + "grad_norm": 1.4182236194610596, + "learning_rate": 0.00014252124726463121, + "loss": 1.0336, + "step": 10594 + }, + { + "epoch": 0.37942951277597725, + "grad_norm": 1.3672842979431152, + "learning_rate": 0.00014251074878670537, + "loss": 1.3017, + "step": 10595 + }, + { + "epoch": 0.3794653249055455, + "grad_norm": 1.7530051469802856, + "learning_rate": 0.00014250024973685218, + "loss": 1.2273, + "step": 10596 + }, + { + "epoch": 0.3795011370351138, + "grad_norm": 1.9602117538452148, + "learning_rate": 0.000142489750115213, + "loss": 1.1103, + "step": 10597 + }, + { + "epoch": 0.37953694916468206, + "grad_norm": 1.4474519491195679, + "learning_rate": 0.00014247924992192906, + "loss": 1.1692, + "step": 10598 + }, + { + "epoch": 0.3795727612942504, + "grad_norm": 1.410824179649353, + "learning_rate": 0.00014246874915714157, + "loss": 0.8552, + "step": 10599 + }, + { + "epoch": 0.37960857342381865, + "grad_norm": 1.2287418842315674, + "learning_rate": 0.00014245824782099185, + "loss": 1.0571, + "step": 10600 + }, + { + "epoch": 0.3796443855533869, + "grad_norm": 1.3635748624801636, + "learning_rate": 0.00014244774591362118, + "loss": 1.0488, + "step": 10601 + }, + { + "epoch": 0.3796801976829552, + "grad_norm": 1.5861951112747192, + "learning_rate": 0.00014243724343517082, + "loss": 1.1449, + "step": 10602 + }, + { + "epoch": 0.3797160098125235, + "grad_norm": 1.5977239608764648, + "learning_rate": 0.0001424267403857821, + "loss": 0.9841, + "step": 10603 + }, + { + "epoch": 0.3797518219420918, + "grad_norm": 1.9519745111465454, + "learning_rate": 0.00014241623676559633, + "loss": 0.9771, + "step": 10604 + }, + { + "epoch": 0.37978763407166005, + "grad_norm": 1.5972720384597778, + "learning_rate": 0.0001424057325747548, + "loss": 1.1974, + "step": 10605 + }, + { + "epoch": 0.3798234462012284, + "grad_norm": 1.9883801937103271, + "learning_rate": 0.00014239522781339884, + "loss": 1.0856, + "step": 10606 + }, + { + "epoch": 0.37985925833079665, + "grad_norm": 1.4717730283737183, + "learning_rate": 0.00014238472248166977, + "loss": 1.2121, + "step": 10607 + }, + { + "epoch": 0.3798950704603649, + "grad_norm": 1.699467420578003, + "learning_rate": 0.00014237421657970894, + "loss": 1.349, + "step": 10608 + }, + { + "epoch": 0.3799308825899332, + "grad_norm": 1.666683316230774, + "learning_rate": 0.00014236371010765766, + "loss": 1.1315, + "step": 10609 + }, + { + "epoch": 0.3799666947195015, + "grad_norm": 1.6778793334960938, + "learning_rate": 0.00014235320306565732, + "loss": 1.2573, + "step": 10610 + }, + { + "epoch": 0.3800025068490698, + "grad_norm": 1.4987640380859375, + "learning_rate": 0.00014234269545384927, + "loss": 1.2712, + "step": 10611 + }, + { + "epoch": 0.38003831897863805, + "grad_norm": 1.4886412620544434, + "learning_rate": 0.00014233218727237489, + "loss": 1.2265, + "step": 10612 + }, + { + "epoch": 0.38007413110820637, + "grad_norm": 1.4773731231689453, + "learning_rate": 0.00014232167852137547, + "loss": 1.2523, + "step": 10613 + }, + { + "epoch": 0.38010994323777464, + "grad_norm": 1.6213843822479248, + "learning_rate": 0.00014231116920099252, + "loss": 1.1425, + "step": 10614 + }, + { + "epoch": 0.3801457553673429, + "grad_norm": 1.1718209981918335, + "learning_rate": 0.00014230065931136735, + "loss": 1.0501, + "step": 10615 + }, + { + "epoch": 0.3801815674969112, + "grad_norm": 1.7064013481140137, + "learning_rate": 0.00014229014885264136, + "loss": 1.0771, + "step": 10616 + }, + { + "epoch": 0.3802173796264795, + "grad_norm": 1.4498484134674072, + "learning_rate": 0.00014227963782495598, + "loss": 1.1471, + "step": 10617 + }, + { + "epoch": 0.38025319175604777, + "grad_norm": 2.217349052429199, + "learning_rate": 0.0001422691262284526, + "loss": 1.1695, + "step": 10618 + }, + { + "epoch": 0.38028900388561604, + "grad_norm": 1.4766497611999512, + "learning_rate": 0.00014225861406327265, + "loss": 1.1227, + "step": 10619 + }, + { + "epoch": 0.38032481601518436, + "grad_norm": 1.60890793800354, + "learning_rate": 0.00014224810132955755, + "loss": 1.1122, + "step": 10620 + }, + { + "epoch": 0.38036062814475263, + "grad_norm": 1.1755268573760986, + "learning_rate": 0.00014223758802744878, + "loss": 1.1162, + "step": 10621 + }, + { + "epoch": 0.3803964402743209, + "grad_norm": 1.7723145484924316, + "learning_rate": 0.0001422270741570877, + "loss": 1.0782, + "step": 10622 + }, + { + "epoch": 0.38043225240388917, + "grad_norm": 1.4952970743179321, + "learning_rate": 0.00014221655971861582, + "loss": 1.2139, + "step": 10623 + }, + { + "epoch": 0.3804680645334575, + "grad_norm": 1.6348990201950073, + "learning_rate": 0.0001422060447121746, + "loss": 1.0252, + "step": 10624 + }, + { + "epoch": 0.38050387666302576, + "grad_norm": 1.602293848991394, + "learning_rate": 0.0001421955291379055, + "loss": 1.2338, + "step": 10625 + }, + { + "epoch": 0.38053968879259403, + "grad_norm": 1.6257212162017822, + "learning_rate": 0.00014218501299594996, + "loss": 1.1478, + "step": 10626 + }, + { + "epoch": 0.38057550092216236, + "grad_norm": 1.5756233930587769, + "learning_rate": 0.00014217449628644947, + "loss": 0.9909, + "step": 10627 + }, + { + "epoch": 0.3806113130517306, + "grad_norm": 1.9401086568832397, + "learning_rate": 0.00014216397900954558, + "loss": 1.1689, + "step": 10628 + }, + { + "epoch": 0.3806471251812989, + "grad_norm": 1.7039048671722412, + "learning_rate": 0.00014215346116537968, + "loss": 1.2753, + "step": 10629 + }, + { + "epoch": 0.38068293731086716, + "grad_norm": 1.4986765384674072, + "learning_rate": 0.0001421429427540934, + "loss": 1.0585, + "step": 10630 + }, + { + "epoch": 0.3807187494404355, + "grad_norm": 1.5750433206558228, + "learning_rate": 0.00014213242377582815, + "loss": 1.1442, + "step": 10631 + }, + { + "epoch": 0.38075456157000376, + "grad_norm": 1.3994122743606567, + "learning_rate": 0.0001421219042307255, + "loss": 1.1958, + "step": 10632 + }, + { + "epoch": 0.380790373699572, + "grad_norm": 1.694651484489441, + "learning_rate": 0.00014211138411892696, + "loss": 1.2368, + "step": 10633 + }, + { + "epoch": 0.38082618582914035, + "grad_norm": 1.500256061553955, + "learning_rate": 0.00014210086344057404, + "loss": 1.2494, + "step": 10634 + }, + { + "epoch": 0.3808619979587086, + "grad_norm": 1.2508199214935303, + "learning_rate": 0.00014209034219580833, + "loss": 1.0141, + "step": 10635 + }, + { + "epoch": 0.3808978100882769, + "grad_norm": 1.652545690536499, + "learning_rate": 0.00014207982038477135, + "loss": 1.2621, + "step": 10636 + }, + { + "epoch": 0.38093362221784516, + "grad_norm": 1.7463428974151611, + "learning_rate": 0.00014206929800760466, + "loss": 1.3781, + "step": 10637 + }, + { + "epoch": 0.3809694343474135, + "grad_norm": 1.6062507629394531, + "learning_rate": 0.00014205877506444982, + "loss": 1.2245, + "step": 10638 + }, + { + "epoch": 0.38100524647698175, + "grad_norm": 1.9932353496551514, + "learning_rate": 0.00014204825155544846, + "loss": 1.0311, + "step": 10639 + }, + { + "epoch": 0.38104105860655, + "grad_norm": 1.448678970336914, + "learning_rate": 0.00014203772748074206, + "loss": 1.2146, + "step": 10640 + }, + { + "epoch": 0.38107687073611834, + "grad_norm": 1.2814091444015503, + "learning_rate": 0.00014202720284047234, + "loss": 1.0634, + "step": 10641 + }, + { + "epoch": 0.3811126828656866, + "grad_norm": 1.5927743911743164, + "learning_rate": 0.00014201667763478074, + "loss": 1.2241, + "step": 10642 + }, + { + "epoch": 0.3811484949952549, + "grad_norm": 1.571252465248108, + "learning_rate": 0.00014200615186380899, + "loss": 1.2841, + "step": 10643 + }, + { + "epoch": 0.38118430712482315, + "grad_norm": 1.6563849449157715, + "learning_rate": 0.0001419956255276986, + "loss": 1.2015, + "step": 10644 + }, + { + "epoch": 0.3812201192543915, + "grad_norm": 1.4554392099380493, + "learning_rate": 0.00014198509862659129, + "loss": 1.0925, + "step": 10645 + }, + { + "epoch": 0.38125593138395975, + "grad_norm": 1.4083744287490845, + "learning_rate": 0.00014197457116062857, + "loss": 1.2741, + "step": 10646 + }, + { + "epoch": 0.381291743513528, + "grad_norm": 1.4634402990341187, + "learning_rate": 0.0001419640431299522, + "loss": 1.1188, + "step": 10647 + }, + { + "epoch": 0.38132755564309634, + "grad_norm": 1.7841798067092896, + "learning_rate": 0.00014195351453470374, + "loss": 1.0086, + "step": 10648 + }, + { + "epoch": 0.3813633677726646, + "grad_norm": 1.8969779014587402, + "learning_rate": 0.00014194298537502487, + "loss": 1.1822, + "step": 10649 + }, + { + "epoch": 0.3813991799022329, + "grad_norm": 1.5186681747436523, + "learning_rate": 0.00014193245565105722, + "loss": 1.2112, + "step": 10650 + }, + { + "epoch": 0.38143499203180115, + "grad_norm": 1.5824767351150513, + "learning_rate": 0.00014192192536294245, + "loss": 1.1833, + "step": 10651 + }, + { + "epoch": 0.38147080416136947, + "grad_norm": 1.6960227489471436, + "learning_rate": 0.00014191139451082228, + "loss": 1.2812, + "step": 10652 + }, + { + "epoch": 0.38150661629093774, + "grad_norm": 1.5017950534820557, + "learning_rate": 0.00014190086309483834, + "loss": 1.0751, + "step": 10653 + }, + { + "epoch": 0.381542428420506, + "grad_norm": 1.6422674655914307, + "learning_rate": 0.00014189033111513234, + "loss": 1.2608, + "step": 10654 + }, + { + "epoch": 0.38157824055007433, + "grad_norm": 1.8932439088821411, + "learning_rate": 0.00014187979857184597, + "loss": 1.1823, + "step": 10655 + }, + { + "epoch": 0.3816140526796426, + "grad_norm": 1.6841164827346802, + "learning_rate": 0.00014186926546512095, + "loss": 1.2781, + "step": 10656 + }, + { + "epoch": 0.38164986480921087, + "grad_norm": 1.5453945398330688, + "learning_rate": 0.00014185873179509893, + "loss": 1.0194, + "step": 10657 + }, + { + "epoch": 0.38168567693877914, + "grad_norm": 1.6436132192611694, + "learning_rate": 0.00014184819756192168, + "loss": 1.0296, + "step": 10658 + }, + { + "epoch": 0.38172148906834746, + "grad_norm": 1.621096134185791, + "learning_rate": 0.00014183766276573096, + "loss": 1.1523, + "step": 10659 + }, + { + "epoch": 0.38175730119791573, + "grad_norm": 2.041544198989868, + "learning_rate": 0.00014182712740666838, + "loss": 1.5182, + "step": 10660 + }, + { + "epoch": 0.381793113327484, + "grad_norm": 1.793533444404602, + "learning_rate": 0.00014181659148487582, + "loss": 1.0421, + "step": 10661 + }, + { + "epoch": 0.3818289254570523, + "grad_norm": 1.3543336391448975, + "learning_rate": 0.00014180605500049493, + "loss": 1.3262, + "step": 10662 + }, + { + "epoch": 0.3818647375866206, + "grad_norm": 1.3897836208343506, + "learning_rate": 0.0001417955179536675, + "loss": 1.1681, + "step": 10663 + }, + { + "epoch": 0.38190054971618886, + "grad_norm": 1.4271198511123657, + "learning_rate": 0.00014178498034453528, + "loss": 1.1383, + "step": 10664 + }, + { + "epoch": 0.38193636184575713, + "grad_norm": 1.6239309310913086, + "learning_rate": 0.00014177444217324005, + "loss": 1.0059, + "step": 10665 + }, + { + "epoch": 0.38197217397532546, + "grad_norm": 1.5276453495025635, + "learning_rate": 0.00014176390343992358, + "loss": 1.2794, + "step": 10666 + }, + { + "epoch": 0.3820079861048937, + "grad_norm": 1.3408405780792236, + "learning_rate": 0.0001417533641447277, + "loss": 1.1407, + "step": 10667 + }, + { + "epoch": 0.382043798234462, + "grad_norm": 1.3130462169647217, + "learning_rate": 0.00014174282428779412, + "loss": 1.332, + "step": 10668 + }, + { + "epoch": 0.3820796103640303, + "grad_norm": 1.7169933319091797, + "learning_rate": 0.0001417322838692647, + "loss": 1.001, + "step": 10669 + }, + { + "epoch": 0.3821154224935986, + "grad_norm": 1.5015872716903687, + "learning_rate": 0.00014172174288928124, + "loss": 1.1681, + "step": 10670 + }, + { + "epoch": 0.38215123462316686, + "grad_norm": 1.3004928827285767, + "learning_rate": 0.00014171120134798552, + "loss": 1.0768, + "step": 10671 + }, + { + "epoch": 0.3821870467527351, + "grad_norm": 1.636813759803772, + "learning_rate": 0.00014170065924551942, + "loss": 0.9874, + "step": 10672 + }, + { + "epoch": 0.38222285888230345, + "grad_norm": 1.7149536609649658, + "learning_rate": 0.00014169011658202472, + "loss": 1.029, + "step": 10673 + }, + { + "epoch": 0.3822586710118717, + "grad_norm": 1.5408331155776978, + "learning_rate": 0.00014167957335764331, + "loss": 1.2425, + "step": 10674 + }, + { + "epoch": 0.38229448314144, + "grad_norm": 1.9910101890563965, + "learning_rate": 0.00014166902957251696, + "loss": 1.2547, + "step": 10675 + }, + { + "epoch": 0.3823302952710083, + "grad_norm": 1.3597030639648438, + "learning_rate": 0.00014165848522678756, + "loss": 0.8821, + "step": 10676 + }, + { + "epoch": 0.3823661074005766, + "grad_norm": 1.7837733030319214, + "learning_rate": 0.00014164794032059703, + "loss": 1.2623, + "step": 10677 + }, + { + "epoch": 0.38240191953014485, + "grad_norm": 1.7612011432647705, + "learning_rate": 0.00014163739485408716, + "loss": 1.1628, + "step": 10678 + }, + { + "epoch": 0.3824377316597131, + "grad_norm": 1.4151420593261719, + "learning_rate": 0.00014162684882739984, + "loss": 1.1227, + "step": 10679 + }, + { + "epoch": 0.38247354378928144, + "grad_norm": 1.8775395154953003, + "learning_rate": 0.00014161630224067694, + "loss": 1.0876, + "step": 10680 + }, + { + "epoch": 0.3825093559188497, + "grad_norm": 1.829663872718811, + "learning_rate": 0.0001416057550940604, + "loss": 1.2948, + "step": 10681 + }, + { + "epoch": 0.382545168048418, + "grad_norm": 1.3948043584823608, + "learning_rate": 0.00014159520738769212, + "loss": 1.128, + "step": 10682 + }, + { + "epoch": 0.3825809801779863, + "grad_norm": 1.6130727529525757, + "learning_rate": 0.00014158465912171396, + "loss": 1.0912, + "step": 10683 + }, + { + "epoch": 0.3826167923075546, + "grad_norm": 1.6324862241744995, + "learning_rate": 0.00014157411029626783, + "loss": 1.065, + "step": 10684 + }, + { + "epoch": 0.38265260443712285, + "grad_norm": 1.320162057876587, + "learning_rate": 0.0001415635609114957, + "loss": 1.1046, + "step": 10685 + }, + { + "epoch": 0.3826884165666911, + "grad_norm": 1.7021591663360596, + "learning_rate": 0.00014155301096753945, + "loss": 1.1796, + "step": 10686 + }, + { + "epoch": 0.38272422869625944, + "grad_norm": 1.6817179918289185, + "learning_rate": 0.00014154246046454107, + "loss": 1.0951, + "step": 10687 + }, + { + "epoch": 0.3827600408258277, + "grad_norm": 1.3561136722564697, + "learning_rate": 0.00014153190940264246, + "loss": 1.202, + "step": 10688 + }, + { + "epoch": 0.382795852955396, + "grad_norm": 1.8523972034454346, + "learning_rate": 0.00014152135778198557, + "loss": 1.2456, + "step": 10689 + }, + { + "epoch": 0.3828316650849643, + "grad_norm": 1.5458388328552246, + "learning_rate": 0.00014151080560271235, + "loss": 1.0423, + "step": 10690 + }, + { + "epoch": 0.38286747721453257, + "grad_norm": 1.3777143955230713, + "learning_rate": 0.00014150025286496483, + "loss": 1.0273, + "step": 10691 + }, + { + "epoch": 0.38290328934410084, + "grad_norm": 1.9505817890167236, + "learning_rate": 0.0001414896995688849, + "loss": 1.2676, + "step": 10692 + }, + { + "epoch": 0.3829391014736691, + "grad_norm": 1.337141752243042, + "learning_rate": 0.00014147914571461455, + "loss": 1.1361, + "step": 10693 + }, + { + "epoch": 0.38297491360323743, + "grad_norm": 1.6002370119094849, + "learning_rate": 0.0001414685913022959, + "loss": 1.191, + "step": 10694 + }, + { + "epoch": 0.3830107257328057, + "grad_norm": 1.8758877515792847, + "learning_rate": 0.00014145803633207077, + "loss": 1.3046, + "step": 10695 + }, + { + "epoch": 0.38304653786237397, + "grad_norm": 1.5924655199050903, + "learning_rate": 0.00014144748080408126, + "loss": 1.1856, + "step": 10696 + }, + { + "epoch": 0.3830823499919423, + "grad_norm": 1.3094215393066406, + "learning_rate": 0.00014143692471846935, + "loss": 0.9759, + "step": 10697 + }, + { + "epoch": 0.38311816212151056, + "grad_norm": 1.514888882637024, + "learning_rate": 0.0001414263680753771, + "loss": 1.2245, + "step": 10698 + }, + { + "epoch": 0.38315397425107883, + "grad_norm": 1.346139907836914, + "learning_rate": 0.00014141581087494644, + "loss": 1.1559, + "step": 10699 + }, + { + "epoch": 0.3831897863806471, + "grad_norm": 1.3064484596252441, + "learning_rate": 0.00014140525311731952, + "loss": 1.1292, + "step": 10700 + }, + { + "epoch": 0.3832255985102154, + "grad_norm": 1.4657912254333496, + "learning_rate": 0.00014139469480263828, + "loss": 1.2326, + "step": 10701 + }, + { + "epoch": 0.3832614106397837, + "grad_norm": 1.8967844247817993, + "learning_rate": 0.00014138413593104486, + "loss": 0.9783, + "step": 10702 + }, + { + "epoch": 0.38329722276935196, + "grad_norm": 1.6768584251403809, + "learning_rate": 0.0001413735765026813, + "loss": 1.214, + "step": 10703 + }, + { + "epoch": 0.3833330348989203, + "grad_norm": 1.7874236106872559, + "learning_rate": 0.00014136301651768957, + "loss": 1.2751, + "step": 10704 + }, + { + "epoch": 0.38336884702848856, + "grad_norm": 1.6978073120117188, + "learning_rate": 0.00014135245597621184, + "loss": 0.9827, + "step": 10705 + }, + { + "epoch": 0.3834046591580568, + "grad_norm": 1.8561135530471802, + "learning_rate": 0.00014134189487839013, + "loss": 1.2486, + "step": 10706 + }, + { + "epoch": 0.3834404712876251, + "grad_norm": 1.5140055418014526, + "learning_rate": 0.0001413313332243666, + "loss": 1.0798, + "step": 10707 + }, + { + "epoch": 0.3834762834171934, + "grad_norm": 1.4238368272781372, + "learning_rate": 0.00014132077101428324, + "loss": 1.1374, + "step": 10708 + }, + { + "epoch": 0.3835120955467617, + "grad_norm": 1.466712474822998, + "learning_rate": 0.00014131020824828224, + "loss": 1.1816, + "step": 10709 + }, + { + "epoch": 0.38354790767632996, + "grad_norm": 2.239673137664795, + "learning_rate": 0.00014129964492650568, + "loss": 1.2631, + "step": 10710 + }, + { + "epoch": 0.3835837198058983, + "grad_norm": 1.664505124092102, + "learning_rate": 0.00014128908104909567, + "loss": 1.1575, + "step": 10711 + }, + { + "epoch": 0.38361953193546655, + "grad_norm": 1.8002755641937256, + "learning_rate": 0.00014127851661619432, + "loss": 0.9893, + "step": 10712 + }, + { + "epoch": 0.3836553440650348, + "grad_norm": 1.686881184577942, + "learning_rate": 0.00014126795162794378, + "loss": 1.2331, + "step": 10713 + }, + { + "epoch": 0.3836911561946031, + "grad_norm": 1.766732096672058, + "learning_rate": 0.00014125738608448618, + "loss": 1.1627, + "step": 10714 + }, + { + "epoch": 0.3837269683241714, + "grad_norm": 1.4794577360153198, + "learning_rate": 0.00014124681998596366, + "loss": 1.2112, + "step": 10715 + }, + { + "epoch": 0.3837627804537397, + "grad_norm": 1.606205940246582, + "learning_rate": 0.0001412362533325184, + "loss": 1.1938, + "step": 10716 + }, + { + "epoch": 0.38379859258330795, + "grad_norm": 1.487744688987732, + "learning_rate": 0.0001412256861242925, + "loss": 1.1163, + "step": 10717 + }, + { + "epoch": 0.3838344047128763, + "grad_norm": 1.5653159618377686, + "learning_rate": 0.00014121511836142823, + "loss": 1.2529, + "step": 10718 + }, + { + "epoch": 0.38387021684244454, + "grad_norm": 1.5312024354934692, + "learning_rate": 0.00014120455004406766, + "loss": 1.1803, + "step": 10719 + }, + { + "epoch": 0.3839060289720128, + "grad_norm": 1.8198802471160889, + "learning_rate": 0.00014119398117235304, + "loss": 1.2921, + "step": 10720 + }, + { + "epoch": 0.3839418411015811, + "grad_norm": 1.4280147552490234, + "learning_rate": 0.00014118341174642653, + "loss": 1.0761, + "step": 10721 + }, + { + "epoch": 0.3839776532311494, + "grad_norm": 1.5333375930786133, + "learning_rate": 0.00014117284176643033, + "loss": 1.0586, + "step": 10722 + }, + { + "epoch": 0.3840134653607177, + "grad_norm": 2.0470080375671387, + "learning_rate": 0.00014116227123250668, + "loss": 1.2076, + "step": 10723 + }, + { + "epoch": 0.38404927749028595, + "grad_norm": 1.26034677028656, + "learning_rate": 0.00014115170014479775, + "loss": 1.2269, + "step": 10724 + }, + { + "epoch": 0.38408508961985427, + "grad_norm": 1.7923071384429932, + "learning_rate": 0.0001411411285034458, + "loss": 1.2262, + "step": 10725 + }, + { + "epoch": 0.38412090174942254, + "grad_norm": 1.5160092115402222, + "learning_rate": 0.000141130556308593, + "loss": 1.1997, + "step": 10726 + }, + { + "epoch": 0.3841567138789908, + "grad_norm": 1.5554713010787964, + "learning_rate": 0.00014111998356038162, + "loss": 1.0574, + "step": 10727 + }, + { + "epoch": 0.3841925260085591, + "grad_norm": 1.6591445207595825, + "learning_rate": 0.00014110941025895392, + "loss": 1.0916, + "step": 10728 + }, + { + "epoch": 0.3842283381381274, + "grad_norm": 1.3562458753585815, + "learning_rate": 0.00014109883640445214, + "loss": 1.0475, + "step": 10729 + }, + { + "epoch": 0.38426415026769567, + "grad_norm": 2.0946052074432373, + "learning_rate": 0.00014108826199701852, + "loss": 1.1178, + "step": 10730 + }, + { + "epoch": 0.38429996239726394, + "grad_norm": 1.7829252481460571, + "learning_rate": 0.00014107768703679533, + "loss": 1.0416, + "step": 10731 + }, + { + "epoch": 0.38433577452683226, + "grad_norm": 1.414819598197937, + "learning_rate": 0.00014106711152392484, + "loss": 1.2324, + "step": 10732 + }, + { + "epoch": 0.38437158665640053, + "grad_norm": 2.1576106548309326, + "learning_rate": 0.00014105653545854935, + "loss": 1.1484, + "step": 10733 + }, + { + "epoch": 0.3844073987859688, + "grad_norm": 1.5301218032836914, + "learning_rate": 0.00014104595884081113, + "loss": 1.1253, + "step": 10734 + }, + { + "epoch": 0.38444321091553707, + "grad_norm": 1.673040747642517, + "learning_rate": 0.00014103538167085247, + "loss": 1.1752, + "step": 10735 + }, + { + "epoch": 0.3844790230451054, + "grad_norm": 1.6870970726013184, + "learning_rate": 0.0001410248039488157, + "loss": 1.1237, + "step": 10736 + }, + { + "epoch": 0.38451483517467366, + "grad_norm": 1.2985707521438599, + "learning_rate": 0.0001410142256748431, + "loss": 1.055, + "step": 10737 + }, + { + "epoch": 0.38455064730424193, + "grad_norm": 1.349631667137146, + "learning_rate": 0.00014100364684907702, + "loss": 0.9959, + "step": 10738 + }, + { + "epoch": 0.38458645943381026, + "grad_norm": 1.6012438535690308, + "learning_rate": 0.00014099306747165975, + "loss": 1.018, + "step": 10739 + }, + { + "epoch": 0.3846222715633785, + "grad_norm": 1.2426559925079346, + "learning_rate": 0.00014098248754273364, + "loss": 1.3376, + "step": 10740 + }, + { + "epoch": 0.3846580836929468, + "grad_norm": 1.5165462493896484, + "learning_rate": 0.000140971907062441, + "loss": 0.9859, + "step": 10741 + }, + { + "epoch": 0.38469389582251506, + "grad_norm": 1.6359885931015015, + "learning_rate": 0.0001409613260309242, + "loss": 1.1273, + "step": 10742 + }, + { + "epoch": 0.3847297079520834, + "grad_norm": 1.7260098457336426, + "learning_rate": 0.00014095074444832561, + "loss": 1.225, + "step": 10743 + }, + { + "epoch": 0.38476552008165166, + "grad_norm": 1.3348742723464966, + "learning_rate": 0.0001409401623147876, + "loss": 1.2138, + "step": 10744 + }, + { + "epoch": 0.3848013322112199, + "grad_norm": 1.9077093601226807, + "learning_rate": 0.00014092957963045245, + "loss": 1.4372, + "step": 10745 + }, + { + "epoch": 0.38483714434078825, + "grad_norm": 1.204251766204834, + "learning_rate": 0.00014091899639546263, + "loss": 0.8095, + "step": 10746 + }, + { + "epoch": 0.3848729564703565, + "grad_norm": 1.5800561904907227, + "learning_rate": 0.00014090841260996055, + "loss": 1.1845, + "step": 10747 + }, + { + "epoch": 0.3849087685999248, + "grad_norm": 1.3775256872177124, + "learning_rate": 0.0001408978282740885, + "loss": 1.0513, + "step": 10748 + }, + { + "epoch": 0.38494458072949306, + "grad_norm": 1.3509628772735596, + "learning_rate": 0.0001408872433879889, + "loss": 1.0272, + "step": 10749 + }, + { + "epoch": 0.3849803928590614, + "grad_norm": 1.4706060886383057, + "learning_rate": 0.00014087665795180422, + "loss": 1.1332, + "step": 10750 + }, + { + "epoch": 0.38501620498862965, + "grad_norm": 1.3751792907714844, + "learning_rate": 0.00014086607196567682, + "loss": 1.1382, + "step": 10751 + }, + { + "epoch": 0.3850520171181979, + "grad_norm": 1.3106950521469116, + "learning_rate": 0.00014085548542974914, + "loss": 1.0744, + "step": 10752 + }, + { + "epoch": 0.38508782924776624, + "grad_norm": 1.4032173156738281, + "learning_rate": 0.0001408448983441636, + "loss": 1.0405, + "step": 10753 + }, + { + "epoch": 0.3851236413773345, + "grad_norm": 1.627578854560852, + "learning_rate": 0.00014083431070906262, + "loss": 1.2971, + "step": 10754 + }, + { + "epoch": 0.3851594535069028, + "grad_norm": 1.9930490255355835, + "learning_rate": 0.0001408237225245887, + "loss": 1.1129, + "step": 10755 + }, + { + "epoch": 0.38519526563647105, + "grad_norm": 1.5636780261993408, + "learning_rate": 0.00014081313379088424, + "loss": 1.2042, + "step": 10756 + }, + { + "epoch": 0.3852310777660394, + "grad_norm": 1.4364736080169678, + "learning_rate": 0.0001408025445080917, + "loss": 1.1234, + "step": 10757 + }, + { + "epoch": 0.38526688989560764, + "grad_norm": 1.4395678043365479, + "learning_rate": 0.00014079195467635354, + "loss": 1.0636, + "step": 10758 + }, + { + "epoch": 0.3853027020251759, + "grad_norm": 1.5606831312179565, + "learning_rate": 0.00014078136429581227, + "loss": 1.2293, + "step": 10759 + }, + { + "epoch": 0.38533851415474424, + "grad_norm": 1.7937113046646118, + "learning_rate": 0.00014077077336661036, + "loss": 0.9869, + "step": 10760 + }, + { + "epoch": 0.3853743262843125, + "grad_norm": 1.659661054611206, + "learning_rate": 0.00014076018188889026, + "loss": 1.1028, + "step": 10761 + }, + { + "epoch": 0.3854101384138808, + "grad_norm": 1.7633919715881348, + "learning_rate": 0.0001407495898627945, + "loss": 1.245, + "step": 10762 + }, + { + "epoch": 0.38544595054344905, + "grad_norm": 1.6451648473739624, + "learning_rate": 0.00014073899728846555, + "loss": 1.104, + "step": 10763 + }, + { + "epoch": 0.38548176267301737, + "grad_norm": 1.3138970136642456, + "learning_rate": 0.00014072840416604597, + "loss": 1.0999, + "step": 10764 + }, + { + "epoch": 0.38551757480258564, + "grad_norm": 1.4607913494110107, + "learning_rate": 0.00014071781049567825, + "loss": 1.2052, + "step": 10765 + }, + { + "epoch": 0.3855533869321539, + "grad_norm": 1.6008567810058594, + "learning_rate": 0.0001407072162775049, + "loss": 1.2739, + "step": 10766 + }, + { + "epoch": 0.38558919906172223, + "grad_norm": 1.3632045984268188, + "learning_rate": 0.00014069662151166846, + "loss": 1.1803, + "step": 10767 + }, + { + "epoch": 0.3856250111912905, + "grad_norm": 1.4266724586486816, + "learning_rate": 0.00014068602619831148, + "loss": 1.0479, + "step": 10768 + }, + { + "epoch": 0.38566082332085877, + "grad_norm": 1.516648292541504, + "learning_rate": 0.0001406754303375765, + "loss": 1.3434, + "step": 10769 + }, + { + "epoch": 0.38569663545042704, + "grad_norm": 1.4575469493865967, + "learning_rate": 0.00014066483392960604, + "loss": 1.1487, + "step": 10770 + }, + { + "epoch": 0.38573244757999536, + "grad_norm": 1.783286452293396, + "learning_rate": 0.00014065423697454273, + "loss": 1.0614, + "step": 10771 + }, + { + "epoch": 0.38576825970956363, + "grad_norm": 1.4007909297943115, + "learning_rate": 0.0001406436394725291, + "loss": 1.1532, + "step": 10772 + }, + { + "epoch": 0.3858040718391319, + "grad_norm": 1.8715051412582397, + "learning_rate": 0.00014063304142370773, + "loss": 0.9295, + "step": 10773 + }, + { + "epoch": 0.3858398839687002, + "grad_norm": 2.0300498008728027, + "learning_rate": 0.0001406224428282212, + "loss": 1.3157, + "step": 10774 + }, + { + "epoch": 0.3858756960982685, + "grad_norm": 1.6672981977462769, + "learning_rate": 0.0001406118436862121, + "loss": 1.1283, + "step": 10775 + }, + { + "epoch": 0.38591150822783676, + "grad_norm": 1.5270498991012573, + "learning_rate": 0.000140601243997823, + "loss": 1.295, + "step": 10776 + }, + { + "epoch": 0.38594732035740503, + "grad_norm": 1.4913636445999146, + "learning_rate": 0.00014059064376319657, + "loss": 1.2447, + "step": 10777 + }, + { + "epoch": 0.38598313248697336, + "grad_norm": 1.4376540184020996, + "learning_rate": 0.00014058004298247537, + "loss": 1.0925, + "step": 10778 + }, + { + "epoch": 0.3860189446165416, + "grad_norm": 1.8074800968170166, + "learning_rate": 0.00014056944165580202, + "loss": 1.2475, + "step": 10779 + }, + { + "epoch": 0.3860547567461099, + "grad_norm": 1.3276946544647217, + "learning_rate": 0.00014055883978331916, + "loss": 1.1469, + "step": 10780 + }, + { + "epoch": 0.3860905688756782, + "grad_norm": 1.5700477361679077, + "learning_rate": 0.00014054823736516945, + "loss": 1.2645, + "step": 10781 + }, + { + "epoch": 0.3861263810052465, + "grad_norm": 1.4302537441253662, + "learning_rate": 0.00014053763440149552, + "loss": 1.1686, + "step": 10782 + }, + { + "epoch": 0.38616219313481476, + "grad_norm": 1.7234095335006714, + "learning_rate": 0.00014052703089244, + "loss": 1.2141, + "step": 10783 + }, + { + "epoch": 0.386198005264383, + "grad_norm": 1.4943876266479492, + "learning_rate": 0.00014051642683814557, + "loss": 1.1503, + "step": 10784 + }, + { + "epoch": 0.38623381739395135, + "grad_norm": 1.5004255771636963, + "learning_rate": 0.00014050582223875484, + "loss": 1.1562, + "step": 10785 + }, + { + "epoch": 0.3862696295235196, + "grad_norm": 2.0631325244903564, + "learning_rate": 0.00014049521709441057, + "loss": 1.1488, + "step": 10786 + }, + { + "epoch": 0.3863054416530879, + "grad_norm": 1.242364525794983, + "learning_rate": 0.00014048461140525533, + "loss": 1.1911, + "step": 10787 + }, + { + "epoch": 0.3863412537826562, + "grad_norm": 1.3723233938217163, + "learning_rate": 0.0001404740051714319, + "loss": 1.18, + "step": 10788 + }, + { + "epoch": 0.3863770659122245, + "grad_norm": 1.4637155532836914, + "learning_rate": 0.00014046339839308294, + "loss": 1.215, + "step": 10789 + }, + { + "epoch": 0.38641287804179275, + "grad_norm": 1.4933373928070068, + "learning_rate": 0.00014045279107035116, + "loss": 1.1671, + "step": 10790 + }, + { + "epoch": 0.386448690171361, + "grad_norm": 1.382350206375122, + "learning_rate": 0.00014044218320337923, + "loss": 1.1447, + "step": 10791 + }, + { + "epoch": 0.38648450230092934, + "grad_norm": 1.4281742572784424, + "learning_rate": 0.00014043157479230988, + "loss": 1.0851, + "step": 10792 + }, + { + "epoch": 0.3865203144304976, + "grad_norm": 1.926426649093628, + "learning_rate": 0.00014042096583728587, + "loss": 1.1942, + "step": 10793 + }, + { + "epoch": 0.3865561265600659, + "grad_norm": 1.3442635536193848, + "learning_rate": 0.0001404103563384499, + "loss": 1.1306, + "step": 10794 + }, + { + "epoch": 0.38659193868963415, + "grad_norm": 1.462384581565857, + "learning_rate": 0.00014039974629594473, + "loss": 1.3207, + "step": 10795 + }, + { + "epoch": 0.3866277508192025, + "grad_norm": 1.601415991783142, + "learning_rate": 0.00014038913570991302, + "loss": 1.0266, + "step": 10796 + }, + { + "epoch": 0.38666356294877074, + "grad_norm": 2.258451223373413, + "learning_rate": 0.00014037852458049764, + "loss": 1.1912, + "step": 10797 + }, + { + "epoch": 0.386699375078339, + "grad_norm": 1.711651086807251, + "learning_rate": 0.0001403679129078413, + "loss": 1.1148, + "step": 10798 + }, + { + "epoch": 0.38673518720790734, + "grad_norm": 1.6923120021820068, + "learning_rate": 0.00014035730069208676, + "loss": 1.1238, + "step": 10799 + }, + { + "epoch": 0.3867709993374756, + "grad_norm": 1.565880537033081, + "learning_rate": 0.0001403466879333768, + "loss": 1.1341, + "step": 10800 + }, + { + "epoch": 0.3868068114670439, + "grad_norm": 1.5123814344406128, + "learning_rate": 0.00014033607463185416, + "loss": 1.0529, + "step": 10801 + }, + { + "epoch": 0.38684262359661215, + "grad_norm": 1.3611154556274414, + "learning_rate": 0.0001403254607876617, + "loss": 1.2277, + "step": 10802 + }, + { + "epoch": 0.38687843572618047, + "grad_norm": 1.9661635160446167, + "learning_rate": 0.00014031484640094217, + "loss": 1.4982, + "step": 10803 + }, + { + "epoch": 0.38691424785574874, + "grad_norm": 1.5019874572753906, + "learning_rate": 0.0001403042314718384, + "loss": 1.1863, + "step": 10804 + }, + { + "epoch": 0.386950059985317, + "grad_norm": 1.3160066604614258, + "learning_rate": 0.00014029361600049315, + "loss": 1.0148, + "step": 10805 + }, + { + "epoch": 0.38698587211488533, + "grad_norm": 1.4739426374435425, + "learning_rate": 0.0001402829999870493, + "loss": 0.9599, + "step": 10806 + }, + { + "epoch": 0.3870216842444536, + "grad_norm": 1.4929380416870117, + "learning_rate": 0.00014027238343164965, + "loss": 1.119, + "step": 10807 + }, + { + "epoch": 0.38705749637402187, + "grad_norm": 1.528579831123352, + "learning_rate": 0.000140261766334437, + "loss": 1.2656, + "step": 10808 + }, + { + "epoch": 0.38709330850359014, + "grad_norm": 1.4254580736160278, + "learning_rate": 0.00014025114869555425, + "loss": 1.307, + "step": 10809 + }, + { + "epoch": 0.38712912063315846, + "grad_norm": 1.5242232084274292, + "learning_rate": 0.00014024053051514418, + "loss": 1.3003, + "step": 10810 + }, + { + "epoch": 0.38716493276272673, + "grad_norm": 1.389315128326416, + "learning_rate": 0.00014022991179334971, + "loss": 1.1047, + "step": 10811 + }, + { + "epoch": 0.387200744892295, + "grad_norm": 1.4513025283813477, + "learning_rate": 0.00014021929253031366, + "loss": 1.0304, + "step": 10812 + }, + { + "epoch": 0.3872365570218633, + "grad_norm": 1.7634708881378174, + "learning_rate": 0.0001402086727261789, + "loss": 1.245, + "step": 10813 + }, + { + "epoch": 0.3872723691514316, + "grad_norm": 2.6414988040924072, + "learning_rate": 0.0001401980523810883, + "loss": 1.2215, + "step": 10814 + }, + { + "epoch": 0.38730818128099986, + "grad_norm": 1.9010162353515625, + "learning_rate": 0.0001401874314951848, + "loss": 1.1446, + "step": 10815 + }, + { + "epoch": 0.38734399341056813, + "grad_norm": 1.5143288373947144, + "learning_rate": 0.0001401768100686112, + "loss": 1.1758, + "step": 10816 + }, + { + "epoch": 0.38737980554013646, + "grad_norm": 1.4128963947296143, + "learning_rate": 0.00014016618810151047, + "loss": 1.1418, + "step": 10817 + }, + { + "epoch": 0.3874156176697047, + "grad_norm": 1.2761280536651611, + "learning_rate": 0.00014015556559402551, + "loss": 0.9534, + "step": 10818 + }, + { + "epoch": 0.387451429799273, + "grad_norm": 1.560268521308899, + "learning_rate": 0.0001401449425462992, + "loss": 1.1888, + "step": 10819 + }, + { + "epoch": 0.3874872419288413, + "grad_norm": 1.4940054416656494, + "learning_rate": 0.00014013431895847447, + "loss": 1.0973, + "step": 10820 + }, + { + "epoch": 0.3875230540584096, + "grad_norm": 1.48980712890625, + "learning_rate": 0.0001401236948306942, + "loss": 1.1577, + "step": 10821 + }, + { + "epoch": 0.38755886618797786, + "grad_norm": 1.5724784135818481, + "learning_rate": 0.00014011307016310144, + "loss": 1.0124, + "step": 10822 + }, + { + "epoch": 0.3875946783175461, + "grad_norm": 2.410255193710327, + "learning_rate": 0.00014010244495583901, + "loss": 1.3504, + "step": 10823 + }, + { + "epoch": 0.38763049044711445, + "grad_norm": 1.7284127473831177, + "learning_rate": 0.00014009181920904995, + "loss": 1.1381, + "step": 10824 + }, + { + "epoch": 0.3876663025766827, + "grad_norm": 2.1486692428588867, + "learning_rate": 0.00014008119292287715, + "loss": 1.2954, + "step": 10825 + }, + { + "epoch": 0.387702114706251, + "grad_norm": 1.5161041021347046, + "learning_rate": 0.00014007056609746362, + "loss": 1.1406, + "step": 10826 + }, + { + "epoch": 0.3877379268358193, + "grad_norm": 1.5738234519958496, + "learning_rate": 0.00014005993873295234, + "loss": 1.1112, + "step": 10827 + }, + { + "epoch": 0.3877737389653876, + "grad_norm": 1.3898944854736328, + "learning_rate": 0.0001400493108294862, + "loss": 1.1814, + "step": 10828 + }, + { + "epoch": 0.38780955109495585, + "grad_norm": 1.775344967842102, + "learning_rate": 0.00014003868238720828, + "loss": 1.2941, + "step": 10829 + }, + { + "epoch": 0.3878453632245241, + "grad_norm": 2.113210439682007, + "learning_rate": 0.0001400280534062615, + "loss": 1.2388, + "step": 10830 + }, + { + "epoch": 0.38788117535409244, + "grad_norm": 1.6050254106521606, + "learning_rate": 0.0001400174238867889, + "loss": 1.0773, + "step": 10831 + }, + { + "epoch": 0.3879169874836607, + "grad_norm": 1.425668716430664, + "learning_rate": 0.00014000679382893352, + "loss": 1.0615, + "step": 10832 + }, + { + "epoch": 0.387952799613229, + "grad_norm": 1.426527976989746, + "learning_rate": 0.0001399961632328383, + "loss": 1.2582, + "step": 10833 + }, + { + "epoch": 0.3879886117427973, + "grad_norm": 1.4814820289611816, + "learning_rate": 0.00013998553209864628, + "loss": 1.1851, + "step": 10834 + }, + { + "epoch": 0.3880244238723656, + "grad_norm": 1.287793517112732, + "learning_rate": 0.00013997490042650054, + "loss": 0.9888, + "step": 10835 + }, + { + "epoch": 0.38806023600193384, + "grad_norm": 2.344306230545044, + "learning_rate": 0.00013996426821654407, + "loss": 1.1954, + "step": 10836 + }, + { + "epoch": 0.3880960481315021, + "grad_norm": 1.4447723627090454, + "learning_rate": 0.00013995363546891992, + "loss": 1.2226, + "step": 10837 + }, + { + "epoch": 0.38813186026107044, + "grad_norm": 1.6948219537734985, + "learning_rate": 0.00013994300218377113, + "loss": 1.4217, + "step": 10838 + }, + { + "epoch": 0.3881676723906387, + "grad_norm": 1.60816490650177, + "learning_rate": 0.0001399323683612408, + "loss": 1.2021, + "step": 10839 + }, + { + "epoch": 0.388203484520207, + "grad_norm": 1.3836288452148438, + "learning_rate": 0.00013992173400147193, + "loss": 0.966, + "step": 10840 + }, + { + "epoch": 0.3882392966497753, + "grad_norm": 1.2788670063018799, + "learning_rate": 0.00013991109910460763, + "loss": 1.0432, + "step": 10841 + }, + { + "epoch": 0.38827510877934357, + "grad_norm": 1.2875338792800903, + "learning_rate": 0.00013990046367079098, + "loss": 1.1457, + "step": 10842 + }, + { + "epoch": 0.38831092090891184, + "grad_norm": 1.425485372543335, + "learning_rate": 0.00013988982770016505, + "loss": 1.1213, + "step": 10843 + }, + { + "epoch": 0.3883467330384801, + "grad_norm": 1.5842069387435913, + "learning_rate": 0.00013987919119287296, + "loss": 1.2481, + "step": 10844 + }, + { + "epoch": 0.38838254516804843, + "grad_norm": 1.738357424736023, + "learning_rate": 0.00013986855414905777, + "loss": 1.0796, + "step": 10845 + }, + { + "epoch": 0.3884183572976167, + "grad_norm": 1.438183307647705, + "learning_rate": 0.00013985791656886262, + "loss": 1.1631, + "step": 10846 + }, + { + "epoch": 0.38845416942718497, + "grad_norm": 1.5724940299987793, + "learning_rate": 0.00013984727845243062, + "loss": 1.1575, + "step": 10847 + }, + { + "epoch": 0.3884899815567533, + "grad_norm": 1.3839017152786255, + "learning_rate": 0.00013983663979990488, + "loss": 1.1171, + "step": 10848 + }, + { + "epoch": 0.38852579368632156, + "grad_norm": 2.2130937576293945, + "learning_rate": 0.00013982600061142854, + "loss": 1.1442, + "step": 10849 + }, + { + "epoch": 0.38856160581588983, + "grad_norm": 1.5596321821212769, + "learning_rate": 0.00013981536088714474, + "loss": 1.1719, + "step": 10850 + }, + { + "epoch": 0.3885974179454581, + "grad_norm": 1.599600076675415, + "learning_rate": 0.0001398047206271966, + "loss": 1.1426, + "step": 10851 + }, + { + "epoch": 0.3886332300750264, + "grad_norm": 1.5103462934494019, + "learning_rate": 0.00013979407983172733, + "loss": 1.0329, + "step": 10852 + }, + { + "epoch": 0.3886690422045947, + "grad_norm": 1.773205041885376, + "learning_rate": 0.00013978343850088002, + "loss": 1.1887, + "step": 10853 + }, + { + "epoch": 0.38870485433416296, + "grad_norm": 1.8405461311340332, + "learning_rate": 0.00013977279663479784, + "loss": 1.1302, + "step": 10854 + }, + { + "epoch": 0.3887406664637313, + "grad_norm": 1.6010640859603882, + "learning_rate": 0.000139762154233624, + "loss": 1.1257, + "step": 10855 + }, + { + "epoch": 0.38877647859329956, + "grad_norm": 1.8201407194137573, + "learning_rate": 0.00013975151129750168, + "loss": 1.2612, + "step": 10856 + }, + { + "epoch": 0.3888122907228678, + "grad_norm": 1.451341152191162, + "learning_rate": 0.00013974086782657404, + "loss": 1.2593, + "step": 10857 + }, + { + "epoch": 0.3888481028524361, + "grad_norm": 1.8507534265518188, + "learning_rate": 0.00013973022382098428, + "loss": 0.9942, + "step": 10858 + }, + { + "epoch": 0.3888839149820044, + "grad_norm": 1.254355788230896, + "learning_rate": 0.0001397195792808756, + "loss": 1.1371, + "step": 10859 + }, + { + "epoch": 0.3889197271115727, + "grad_norm": 1.6322108507156372, + "learning_rate": 0.00013970893420639123, + "loss": 1.1843, + "step": 10860 + }, + { + "epoch": 0.38895553924114096, + "grad_norm": 1.763843059539795, + "learning_rate": 0.00013969828859767438, + "loss": 1.0847, + "step": 10861 + }, + { + "epoch": 0.3889913513707093, + "grad_norm": 1.8926513195037842, + "learning_rate": 0.00013968764245486824, + "loss": 1.0173, + "step": 10862 + }, + { + "epoch": 0.38902716350027755, + "grad_norm": 1.8864343166351318, + "learning_rate": 0.0001396769957781161, + "loss": 1.1731, + "step": 10863 + }, + { + "epoch": 0.3890629756298458, + "grad_norm": 1.9926187992095947, + "learning_rate": 0.00013966634856756114, + "loss": 1.3623, + "step": 10864 + }, + { + "epoch": 0.3890987877594141, + "grad_norm": 1.8318089246749878, + "learning_rate": 0.0001396557008233466, + "loss": 1.1441, + "step": 10865 + }, + { + "epoch": 0.3891345998889824, + "grad_norm": 1.6292909383773804, + "learning_rate": 0.0001396450525456158, + "loss": 1.3967, + "step": 10866 + }, + { + "epoch": 0.3891704120185507, + "grad_norm": 1.4611480236053467, + "learning_rate": 0.0001396344037345119, + "loss": 1.0687, + "step": 10867 + }, + { + "epoch": 0.38920622414811895, + "grad_norm": 2.2570509910583496, + "learning_rate": 0.0001396237543901783, + "loss": 1.0566, + "step": 10868 + }, + { + "epoch": 0.3892420362776873, + "grad_norm": 2.081791639328003, + "learning_rate": 0.00013961310451275814, + "loss": 1.1891, + "step": 10869 + }, + { + "epoch": 0.38927784840725554, + "grad_norm": 1.5289301872253418, + "learning_rate": 0.00013960245410239478, + "loss": 1.1078, + "step": 10870 + }, + { + "epoch": 0.3893136605368238, + "grad_norm": 1.3141332864761353, + "learning_rate": 0.00013959180315923148, + "loss": 0.9553, + "step": 10871 + }, + { + "epoch": 0.3893494726663921, + "grad_norm": 2.113666534423828, + "learning_rate": 0.00013958115168341155, + "loss": 1.073, + "step": 10872 + }, + { + "epoch": 0.3893852847959604, + "grad_norm": 1.5179417133331299, + "learning_rate": 0.00013957049967507824, + "loss": 0.9365, + "step": 10873 + }, + { + "epoch": 0.3894210969255287, + "grad_norm": 1.514445185661316, + "learning_rate": 0.00013955984713437492, + "loss": 1.2014, + "step": 10874 + }, + { + "epoch": 0.38945690905509694, + "grad_norm": 1.7940737009048462, + "learning_rate": 0.00013954919406144488, + "loss": 1.1546, + "step": 10875 + }, + { + "epoch": 0.38949272118466527, + "grad_norm": 1.4419658184051514, + "learning_rate": 0.00013953854045643146, + "loss": 1.3518, + "step": 10876 + }, + { + "epoch": 0.38952853331423354, + "grad_norm": 1.4942480325698853, + "learning_rate": 0.00013952788631947798, + "loss": 1.1164, + "step": 10877 + }, + { + "epoch": 0.3895643454438018, + "grad_norm": 1.5162851810455322, + "learning_rate": 0.00013951723165072776, + "loss": 1.2818, + "step": 10878 + }, + { + "epoch": 0.3896001575733701, + "grad_norm": 1.395817756652832, + "learning_rate": 0.00013950657645032418, + "loss": 1.2371, + "step": 10879 + }, + { + "epoch": 0.3896359697029384, + "grad_norm": 1.808361291885376, + "learning_rate": 0.0001394959207184106, + "loss": 1.2082, + "step": 10880 + }, + { + "epoch": 0.38967178183250667, + "grad_norm": 1.5588864088058472, + "learning_rate": 0.00013948526445513033, + "loss": 1.2469, + "step": 10881 + }, + { + "epoch": 0.38970759396207494, + "grad_norm": 2.2851758003234863, + "learning_rate": 0.00013947460766062673, + "loss": 1.1979, + "step": 10882 + }, + { + "epoch": 0.38974340609164326, + "grad_norm": 1.5693045854568481, + "learning_rate": 0.00013946395033504323, + "loss": 1.0219, + "step": 10883 + }, + { + "epoch": 0.38977921822121153, + "grad_norm": 1.375240445137024, + "learning_rate": 0.00013945329247852317, + "loss": 1.2466, + "step": 10884 + }, + { + "epoch": 0.3898150303507798, + "grad_norm": 1.5434613227844238, + "learning_rate": 0.00013944263409120997, + "loss": 0.9636, + "step": 10885 + }, + { + "epoch": 0.38985084248034807, + "grad_norm": 1.6043273210525513, + "learning_rate": 0.00013943197517324698, + "loss": 1.3153, + "step": 10886 + }, + { + "epoch": 0.3898866546099164, + "grad_norm": 1.3661344051361084, + "learning_rate": 0.00013942131572477763, + "loss": 1.1837, + "step": 10887 + }, + { + "epoch": 0.38992246673948466, + "grad_norm": 1.5645201206207275, + "learning_rate": 0.00013941065574594536, + "loss": 1.1428, + "step": 10888 + }, + { + "epoch": 0.38995827886905293, + "grad_norm": 1.7355390787124634, + "learning_rate": 0.0001393999952368935, + "loss": 1.0091, + "step": 10889 + }, + { + "epoch": 0.38999409099862126, + "grad_norm": 1.54319429397583, + "learning_rate": 0.0001393893341977656, + "loss": 1.1222, + "step": 10890 + }, + { + "epoch": 0.3900299031281895, + "grad_norm": 1.684592366218567, + "learning_rate": 0.00013937867262870494, + "loss": 1.0383, + "step": 10891 + }, + { + "epoch": 0.3900657152577578, + "grad_norm": 1.6112600564956665, + "learning_rate": 0.00013936801052985508, + "loss": 1.2454, + "step": 10892 + }, + { + "epoch": 0.39010152738732606, + "grad_norm": 1.8831716775894165, + "learning_rate": 0.0001393573479013594, + "loss": 1.1807, + "step": 10893 + }, + { + "epoch": 0.3901373395168944, + "grad_norm": 1.6419109106063843, + "learning_rate": 0.00013934668474336137, + "loss": 1.3114, + "step": 10894 + }, + { + "epoch": 0.39017315164646266, + "grad_norm": 1.4154380559921265, + "learning_rate": 0.00013933602105600446, + "loss": 1.0421, + "step": 10895 + }, + { + "epoch": 0.3902089637760309, + "grad_norm": 1.3070162534713745, + "learning_rate": 0.00013932535683943212, + "loss": 0.9388, + "step": 10896 + }, + { + "epoch": 0.39024477590559925, + "grad_norm": 1.4977788925170898, + "learning_rate": 0.00013931469209378788, + "loss": 1.1501, + "step": 10897 + }, + { + "epoch": 0.3902805880351675, + "grad_norm": 1.709435224533081, + "learning_rate": 0.0001393040268192151, + "loss": 1.3079, + "step": 10898 + }, + { + "epoch": 0.3903164001647358, + "grad_norm": 1.6322529315948486, + "learning_rate": 0.00013929336101585737, + "loss": 1.1336, + "step": 10899 + }, + { + "epoch": 0.39035221229430406, + "grad_norm": 1.3519244194030762, + "learning_rate": 0.00013928269468385814, + "loss": 1.0894, + "step": 10900 + }, + { + "epoch": 0.3903880244238724, + "grad_norm": 1.739013671875, + "learning_rate": 0.00013927202782336093, + "loss": 1.1862, + "step": 10901 + }, + { + "epoch": 0.39042383655344065, + "grad_norm": 1.3443593978881836, + "learning_rate": 0.0001392613604345092, + "loss": 0.9389, + "step": 10902 + }, + { + "epoch": 0.3904596486830089, + "grad_norm": 1.4347686767578125, + "learning_rate": 0.00013925069251744657, + "loss": 1.1985, + "step": 10903 + }, + { + "epoch": 0.39049546081257724, + "grad_norm": 1.6214340925216675, + "learning_rate": 0.0001392400240723165, + "loss": 1.202, + "step": 10904 + }, + { + "epoch": 0.3905312729421455, + "grad_norm": 1.5861231088638306, + "learning_rate": 0.00013922935509926249, + "loss": 1.2055, + "step": 10905 + }, + { + "epoch": 0.3905670850717138, + "grad_norm": 1.674906611442566, + "learning_rate": 0.00013921868559842813, + "loss": 1.2771, + "step": 10906 + }, + { + "epoch": 0.39060289720128205, + "grad_norm": 1.6490753889083862, + "learning_rate": 0.00013920801556995693, + "loss": 1.1852, + "step": 10907 + }, + { + "epoch": 0.3906387093308504, + "grad_norm": 1.7696576118469238, + "learning_rate": 0.00013919734501399248, + "loss": 1.0262, + "step": 10908 + }, + { + "epoch": 0.39067452146041864, + "grad_norm": 1.8578169345855713, + "learning_rate": 0.0001391866739306783, + "loss": 1.1622, + "step": 10909 + }, + { + "epoch": 0.3907103335899869, + "grad_norm": 1.6147013902664185, + "learning_rate": 0.00013917600232015798, + "loss": 1.1248, + "step": 10910 + }, + { + "epoch": 0.39074614571955524, + "grad_norm": 1.4373103380203247, + "learning_rate": 0.00013916533018257506, + "loss": 1.1902, + "step": 10911 + }, + { + "epoch": 0.3907819578491235, + "grad_norm": 1.3796343803405762, + "learning_rate": 0.00013915465751807314, + "loss": 1.1745, + "step": 10912 + }, + { + "epoch": 0.3908177699786918, + "grad_norm": 1.3638838529586792, + "learning_rate": 0.00013914398432679582, + "loss": 1.1587, + "step": 10913 + }, + { + "epoch": 0.39085358210826004, + "grad_norm": 1.3753916025161743, + "learning_rate": 0.00013913331060888667, + "loss": 0.8603, + "step": 10914 + }, + { + "epoch": 0.39088939423782837, + "grad_norm": 1.662827968597412, + "learning_rate": 0.00013912263636448936, + "loss": 0.9474, + "step": 10915 + }, + { + "epoch": 0.39092520636739664, + "grad_norm": 1.410487174987793, + "learning_rate": 0.00013911196159374737, + "loss": 1.1595, + "step": 10916 + }, + { + "epoch": 0.3909610184969649, + "grad_norm": 1.5242422819137573, + "learning_rate": 0.00013910128629680441, + "loss": 1.1121, + "step": 10917 + }, + { + "epoch": 0.39099683062653323, + "grad_norm": 1.5005086660385132, + "learning_rate": 0.0001390906104738041, + "loss": 1.2664, + "step": 10918 + }, + { + "epoch": 0.3910326427561015, + "grad_norm": 1.7902626991271973, + "learning_rate": 0.00013907993412489003, + "loss": 1.0807, + "step": 10919 + }, + { + "epoch": 0.39106845488566977, + "grad_norm": 1.6320263147354126, + "learning_rate": 0.00013906925725020586, + "loss": 1.2536, + "step": 10920 + }, + { + "epoch": 0.39110426701523804, + "grad_norm": 1.652199387550354, + "learning_rate": 0.00013905857984989524, + "loss": 1.1656, + "step": 10921 + }, + { + "epoch": 0.39114007914480636, + "grad_norm": 1.4831689596176147, + "learning_rate": 0.00013904790192410178, + "loss": 1.1026, + "step": 10922 + }, + { + "epoch": 0.39117589127437463, + "grad_norm": 1.2804460525512695, + "learning_rate": 0.0001390372234729692, + "loss": 1.0082, + "step": 10923 + }, + { + "epoch": 0.3912117034039429, + "grad_norm": 1.4610369205474854, + "learning_rate": 0.00013902654449664115, + "loss": 1.138, + "step": 10924 + }, + { + "epoch": 0.3912475155335112, + "grad_norm": 1.3403488397598267, + "learning_rate": 0.00013901586499526125, + "loss": 1.214, + "step": 10925 + }, + { + "epoch": 0.3912833276630795, + "grad_norm": 1.5579601526260376, + "learning_rate": 0.0001390051849689732, + "loss": 1.1177, + "step": 10926 + }, + { + "epoch": 0.39131913979264776, + "grad_norm": 1.614186406135559, + "learning_rate": 0.00013899450441792074, + "loss": 0.7957, + "step": 10927 + }, + { + "epoch": 0.39135495192221603, + "grad_norm": 1.9155383110046387, + "learning_rate": 0.00013898382334224748, + "loss": 1.0336, + "step": 10928 + }, + { + "epoch": 0.39139076405178436, + "grad_norm": 1.737666368484497, + "learning_rate": 0.0001389731417420972, + "loss": 1.1358, + "step": 10929 + }, + { + "epoch": 0.3914265761813526, + "grad_norm": 1.6148030757904053, + "learning_rate": 0.00013896245961761354, + "loss": 1.0904, + "step": 10930 + }, + { + "epoch": 0.3914623883109209, + "grad_norm": 1.4726492166519165, + "learning_rate": 0.00013895177696894023, + "loss": 1.0258, + "step": 10931 + }, + { + "epoch": 0.3914982004404892, + "grad_norm": 1.5271472930908203, + "learning_rate": 0.00013894109379622104, + "loss": 1.0592, + "step": 10932 + }, + { + "epoch": 0.3915340125700575, + "grad_norm": 1.6170673370361328, + "learning_rate": 0.00013893041009959968, + "loss": 1.1838, + "step": 10933 + }, + { + "epoch": 0.39156982469962576, + "grad_norm": 1.4743291139602661, + "learning_rate": 0.00013891972587921987, + "loss": 1.292, + "step": 10934 + }, + { + "epoch": 0.391605636829194, + "grad_norm": 1.3494369983673096, + "learning_rate": 0.0001389090411352253, + "loss": 1.1855, + "step": 10935 + }, + { + "epoch": 0.39164144895876235, + "grad_norm": 1.9229321479797363, + "learning_rate": 0.0001388983558677598, + "loss": 1.0909, + "step": 10936 + }, + { + "epoch": 0.3916772610883306, + "grad_norm": 2.3089582920074463, + "learning_rate": 0.00013888767007696709, + "loss": 1.0277, + "step": 10937 + }, + { + "epoch": 0.3917130732178989, + "grad_norm": 1.6326868534088135, + "learning_rate": 0.00013887698376299095, + "loss": 1.138, + "step": 10938 + }, + { + "epoch": 0.3917488853474672, + "grad_norm": 1.9805235862731934, + "learning_rate": 0.00013886629692597512, + "loss": 1.0801, + "step": 10939 + }, + { + "epoch": 0.3917846974770355, + "grad_norm": 1.5621111392974854, + "learning_rate": 0.00013885560956606344, + "loss": 1.2669, + "step": 10940 + }, + { + "epoch": 0.39182050960660375, + "grad_norm": 1.5883170366287231, + "learning_rate": 0.00013884492168339963, + "loss": 1.2103, + "step": 10941 + }, + { + "epoch": 0.391856321736172, + "grad_norm": 1.6096582412719727, + "learning_rate": 0.00013883423327812748, + "loss": 1.0438, + "step": 10942 + }, + { + "epoch": 0.39189213386574034, + "grad_norm": 1.4078572988510132, + "learning_rate": 0.00013882354435039085, + "loss": 0.9401, + "step": 10943 + }, + { + "epoch": 0.3919279459953086, + "grad_norm": 1.7751905918121338, + "learning_rate": 0.00013881285490033348, + "loss": 0.9931, + "step": 10944 + }, + { + "epoch": 0.3919637581248769, + "grad_norm": 1.6092041730880737, + "learning_rate": 0.00013880216492809924, + "loss": 1.3563, + "step": 10945 + }, + { + "epoch": 0.3919995702544452, + "grad_norm": 1.659403920173645, + "learning_rate": 0.00013879147443383188, + "loss": 1.0164, + "step": 10946 + }, + { + "epoch": 0.3920353823840135, + "grad_norm": 1.375616192817688, + "learning_rate": 0.00013878078341767532, + "loss": 1.0748, + "step": 10947 + }, + { + "epoch": 0.39207119451358174, + "grad_norm": 1.8484100103378296, + "learning_rate": 0.00013877009187977332, + "loss": 1.2834, + "step": 10948 + }, + { + "epoch": 0.39210700664315, + "grad_norm": 1.644431471824646, + "learning_rate": 0.00013875939982026976, + "loss": 1.2001, + "step": 10949 + }, + { + "epoch": 0.39214281877271834, + "grad_norm": 1.6522222757339478, + "learning_rate": 0.00013874870723930847, + "loss": 1.2427, + "step": 10950 + }, + { + "epoch": 0.3921786309022866, + "grad_norm": 1.3691465854644775, + "learning_rate": 0.00013873801413703327, + "loss": 1.0946, + "step": 10951 + }, + { + "epoch": 0.3922144430318549, + "grad_norm": 1.5971145629882812, + "learning_rate": 0.00013872732051358808, + "loss": 1.3198, + "step": 10952 + }, + { + "epoch": 0.3922502551614232, + "grad_norm": 1.4714429378509521, + "learning_rate": 0.00013871662636911672, + "loss": 1.2039, + "step": 10953 + }, + { + "epoch": 0.39228606729099147, + "grad_norm": 1.698266863822937, + "learning_rate": 0.00013870593170376317, + "loss": 1.2131, + "step": 10954 + }, + { + "epoch": 0.39232187942055974, + "grad_norm": 1.4656447172164917, + "learning_rate": 0.00013869523651767116, + "loss": 1.4063, + "step": 10955 + }, + { + "epoch": 0.392357691550128, + "grad_norm": 1.3849778175354004, + "learning_rate": 0.0001386845408109847, + "loss": 1.0914, + "step": 10956 + }, + { + "epoch": 0.39239350367969633, + "grad_norm": 1.5043892860412598, + "learning_rate": 0.0001386738445838476, + "loss": 1.2276, + "step": 10957 + }, + { + "epoch": 0.3924293158092646, + "grad_norm": 1.642017126083374, + "learning_rate": 0.00013866314783640384, + "loss": 1.2184, + "step": 10958 + }, + { + "epoch": 0.39246512793883287, + "grad_norm": 1.6051830053329468, + "learning_rate": 0.0001386524505687973, + "loss": 1.25, + "step": 10959 + }, + { + "epoch": 0.3925009400684012, + "grad_norm": 1.717272162437439, + "learning_rate": 0.00013864175278117187, + "loss": 1.1775, + "step": 10960 + }, + { + "epoch": 0.39253675219796946, + "grad_norm": 1.8468143939971924, + "learning_rate": 0.00013863105447367154, + "loss": 1.2631, + "step": 10961 + }, + { + "epoch": 0.39257256432753773, + "grad_norm": 2.152616024017334, + "learning_rate": 0.00013862035564644017, + "loss": 1.2527, + "step": 10962 + }, + { + "epoch": 0.392608376457106, + "grad_norm": 1.3315397500991821, + "learning_rate": 0.00013860965629962176, + "loss": 1.295, + "step": 10963 + }, + { + "epoch": 0.3926441885866743, + "grad_norm": 1.4446122646331787, + "learning_rate": 0.0001385989564333602, + "loss": 1.259, + "step": 10964 + }, + { + "epoch": 0.3926800007162426, + "grad_norm": 1.411852240562439, + "learning_rate": 0.00013858825604779945, + "loss": 1.2784, + "step": 10965 + }, + { + "epoch": 0.39271581284581086, + "grad_norm": 1.6750668287277222, + "learning_rate": 0.00013857755514308352, + "loss": 1.1975, + "step": 10966 + }, + { + "epoch": 0.3927516249753792, + "grad_norm": 1.4723635911941528, + "learning_rate": 0.00013856685371935637, + "loss": 1.291, + "step": 10967 + }, + { + "epoch": 0.39278743710494746, + "grad_norm": 1.5552462339401245, + "learning_rate": 0.00013855615177676191, + "loss": 1.125, + "step": 10968 + }, + { + "epoch": 0.3928232492345157, + "grad_norm": 1.4101855754852295, + "learning_rate": 0.0001385454493154442, + "loss": 1.2067, + "step": 10969 + }, + { + "epoch": 0.392859061364084, + "grad_norm": 2.0911285877227783, + "learning_rate": 0.00013853474633554715, + "loss": 1.0741, + "step": 10970 + }, + { + "epoch": 0.3928948734936523, + "grad_norm": 1.4221930503845215, + "learning_rate": 0.00013852404283721482, + "loss": 1.1301, + "step": 10971 + }, + { + "epoch": 0.3929306856232206, + "grad_norm": 1.9714624881744385, + "learning_rate": 0.00013851333882059118, + "loss": 0.9307, + "step": 10972 + }, + { + "epoch": 0.39296649775278886, + "grad_norm": 1.776806354522705, + "learning_rate": 0.00013850263428582022, + "loss": 1.2025, + "step": 10973 + }, + { + "epoch": 0.3930023098823572, + "grad_norm": 1.5934330224990845, + "learning_rate": 0.000138491929233046, + "loss": 1.2881, + "step": 10974 + }, + { + "epoch": 0.39303812201192545, + "grad_norm": 1.4189409017562866, + "learning_rate": 0.00013848122366241254, + "loss": 1.2277, + "step": 10975 + }, + { + "epoch": 0.3930739341414937, + "grad_norm": 1.7660354375839233, + "learning_rate": 0.00013847051757406384, + "loss": 1.1979, + "step": 10976 + }, + { + "epoch": 0.393109746271062, + "grad_norm": 1.8595491647720337, + "learning_rate": 0.00013845981096814397, + "loss": 1.063, + "step": 10977 + }, + { + "epoch": 0.3931455584006303, + "grad_norm": 1.5479165315628052, + "learning_rate": 0.00013844910384479693, + "loss": 1.2572, + "step": 10978 + }, + { + "epoch": 0.3931813705301986, + "grad_norm": 2.060671329498291, + "learning_rate": 0.00013843839620416678, + "loss": 1.3852, + "step": 10979 + }, + { + "epoch": 0.39321718265976685, + "grad_norm": 1.6408567428588867, + "learning_rate": 0.00013842768804639763, + "loss": 1.0195, + "step": 10980 + }, + { + "epoch": 0.3932529947893352, + "grad_norm": 1.7582327127456665, + "learning_rate": 0.00013841697937163344, + "loss": 1.1303, + "step": 10981 + }, + { + "epoch": 0.39328880691890344, + "grad_norm": 1.347931981086731, + "learning_rate": 0.0001384062701800184, + "loss": 1.1922, + "step": 10982 + }, + { + "epoch": 0.3933246190484717, + "grad_norm": 1.7995479106903076, + "learning_rate": 0.00013839556047169654, + "loss": 1.3424, + "step": 10983 + }, + { + "epoch": 0.39336043117804, + "grad_norm": 1.5684152841567993, + "learning_rate": 0.00013838485024681192, + "loss": 1.2748, + "step": 10984 + }, + { + "epoch": 0.3933962433076083, + "grad_norm": 1.5058616399765015, + "learning_rate": 0.00013837413950550865, + "loss": 1.3624, + "step": 10985 + }, + { + "epoch": 0.3934320554371766, + "grad_norm": 1.3797556161880493, + "learning_rate": 0.00013836342824793084, + "loss": 0.7772, + "step": 10986 + }, + { + "epoch": 0.39346786756674484, + "grad_norm": 1.5464379787445068, + "learning_rate": 0.00013835271647422262, + "loss": 1.1192, + "step": 10987 + }, + { + "epoch": 0.39350367969631317, + "grad_norm": 1.406862735748291, + "learning_rate": 0.000138342004184528, + "loss": 1.054, + "step": 10988 + }, + { + "epoch": 0.39353949182588144, + "grad_norm": 1.4572889804840088, + "learning_rate": 0.00013833129137899122, + "loss": 1.0092, + "step": 10989 + }, + { + "epoch": 0.3935753039554497, + "grad_norm": 2.250074863433838, + "learning_rate": 0.00013832057805775636, + "loss": 1.3547, + "step": 10990 + }, + { + "epoch": 0.393611116085018, + "grad_norm": 1.4998323917388916, + "learning_rate": 0.00013830986422096756, + "loss": 1.1996, + "step": 10991 + }, + { + "epoch": 0.3936469282145863, + "grad_norm": 1.7722835540771484, + "learning_rate": 0.0001382991498687689, + "loss": 1.2739, + "step": 10992 + }, + { + "epoch": 0.39368274034415457, + "grad_norm": 1.8480466604232788, + "learning_rate": 0.00013828843500130462, + "loss": 1.2831, + "step": 10993 + }, + { + "epoch": 0.39371855247372284, + "grad_norm": 1.3405214548110962, + "learning_rate": 0.00013827771961871885, + "loss": 1.1085, + "step": 10994 + }, + { + "epoch": 0.3937543646032911, + "grad_norm": 1.5033886432647705, + "learning_rate": 0.0001382670037211557, + "loss": 0.9954, + "step": 10995 + }, + { + "epoch": 0.39379017673285943, + "grad_norm": 1.5024293661117554, + "learning_rate": 0.0001382562873087594, + "loss": 1.3022, + "step": 10996 + }, + { + "epoch": 0.3938259888624277, + "grad_norm": 1.6119903326034546, + "learning_rate": 0.00013824557038167408, + "loss": 1.0996, + "step": 10997 + }, + { + "epoch": 0.39386180099199597, + "grad_norm": 1.298298716545105, + "learning_rate": 0.00013823485294004397, + "loss": 0.9918, + "step": 10998 + }, + { + "epoch": 0.3938976131215643, + "grad_norm": 1.7288939952850342, + "learning_rate": 0.00013822413498401322, + "loss": 1.0897, + "step": 10999 + }, + { + "epoch": 0.39393342525113256, + "grad_norm": 1.7094658613204956, + "learning_rate": 0.00013821341651372603, + "loss": 1.0836, + "step": 11000 + }, + { + "epoch": 0.39396923738070083, + "grad_norm": 1.5171291828155518, + "learning_rate": 0.00013820269752932662, + "loss": 0.8895, + "step": 11001 + }, + { + "epoch": 0.3940050495102691, + "grad_norm": 1.4648566246032715, + "learning_rate": 0.0001381919780309592, + "loss": 1.238, + "step": 11002 + }, + { + "epoch": 0.3940408616398374, + "grad_norm": 2.1040565967559814, + "learning_rate": 0.000138181258018768, + "loss": 1.3505, + "step": 11003 + }, + { + "epoch": 0.3940766737694057, + "grad_norm": 1.8147730827331543, + "learning_rate": 0.00013817053749289718, + "loss": 1.18, + "step": 11004 + }, + { + "epoch": 0.39411248589897396, + "grad_norm": 2.3539891242980957, + "learning_rate": 0.00013815981645349105, + "loss": 1.2329, + "step": 11005 + }, + { + "epoch": 0.3941482980285423, + "grad_norm": 1.45710027217865, + "learning_rate": 0.00013814909490069378, + "loss": 1.1108, + "step": 11006 + }, + { + "epoch": 0.39418411015811056, + "grad_norm": 1.3371624946594238, + "learning_rate": 0.00013813837283464968, + "loss": 1.1801, + "step": 11007 + }, + { + "epoch": 0.3942199222876788, + "grad_norm": 1.6823095083236694, + "learning_rate": 0.00013812765025550294, + "loss": 1.0917, + "step": 11008 + }, + { + "epoch": 0.3942557344172471, + "grad_norm": 1.5755517482757568, + "learning_rate": 0.00013811692716339785, + "loss": 1.1892, + "step": 11009 + }, + { + "epoch": 0.3942915465468154, + "grad_norm": 1.4189430475234985, + "learning_rate": 0.00013810620355847868, + "loss": 1.1426, + "step": 11010 + }, + { + "epoch": 0.3943273586763837, + "grad_norm": 1.316659927368164, + "learning_rate": 0.00013809547944088968, + "loss": 1.4396, + "step": 11011 + }, + { + "epoch": 0.39436317080595196, + "grad_norm": 1.9760653972625732, + "learning_rate": 0.00013808475481077518, + "loss": 1.2497, + "step": 11012 + }, + { + "epoch": 0.3943989829355203, + "grad_norm": 1.4751070737838745, + "learning_rate": 0.00013807402966827944, + "loss": 1.097, + "step": 11013 + }, + { + "epoch": 0.39443479506508855, + "grad_norm": 1.4562599658966064, + "learning_rate": 0.00013806330401354671, + "loss": 1.0855, + "step": 11014 + }, + { + "epoch": 0.3944706071946568, + "grad_norm": 1.6772425174713135, + "learning_rate": 0.0001380525778467213, + "loss": 1.1321, + "step": 11015 + }, + { + "epoch": 0.3945064193242251, + "grad_norm": 1.3435089588165283, + "learning_rate": 0.00013804185116794755, + "loss": 1.124, + "step": 11016 + }, + { + "epoch": 0.3945422314537934, + "grad_norm": 1.9495357275009155, + "learning_rate": 0.00013803112397736976, + "loss": 1.1936, + "step": 11017 + }, + { + "epoch": 0.3945780435833617, + "grad_norm": 1.344006896018982, + "learning_rate": 0.0001380203962751323, + "loss": 1.0205, + "step": 11018 + }, + { + "epoch": 0.39461385571292995, + "grad_norm": 1.2908375263214111, + "learning_rate": 0.0001380096680613794, + "loss": 1.0866, + "step": 11019 + }, + { + "epoch": 0.3946496678424983, + "grad_norm": 2.147547960281372, + "learning_rate": 0.00013799893933625547, + "loss": 1.0966, + "step": 11020 + }, + { + "epoch": 0.39468547997206654, + "grad_norm": 1.2990623712539673, + "learning_rate": 0.00013798821009990486, + "loss": 1.1226, + "step": 11021 + }, + { + "epoch": 0.3947212921016348, + "grad_norm": 1.4621583223342896, + "learning_rate": 0.00013797748035247184, + "loss": 1.1075, + "step": 11022 + }, + { + "epoch": 0.3947571042312031, + "grad_norm": 2.9454641342163086, + "learning_rate": 0.0001379667500941008, + "loss": 1.1662, + "step": 11023 + }, + { + "epoch": 0.3947929163607714, + "grad_norm": 1.6421496868133545, + "learning_rate": 0.00013795601932493613, + "loss": 1.2125, + "step": 11024 + }, + { + "epoch": 0.3948287284903397, + "grad_norm": 1.7953903675079346, + "learning_rate": 0.0001379452880451222, + "loss": 1.0773, + "step": 11025 + }, + { + "epoch": 0.39486454061990794, + "grad_norm": 2.070511817932129, + "learning_rate": 0.00013793455625480332, + "loss": 1.2402, + "step": 11026 + }, + { + "epoch": 0.39490035274947627, + "grad_norm": 1.3758100271224976, + "learning_rate": 0.00013792382395412392, + "loss": 1.3445, + "step": 11027 + }, + { + "epoch": 0.39493616487904454, + "grad_norm": 1.1997243165969849, + "learning_rate": 0.00013791309114322841, + "loss": 0.9422, + "step": 11028 + }, + { + "epoch": 0.3949719770086128, + "grad_norm": 1.6305580139160156, + "learning_rate": 0.00013790235782226118, + "loss": 1.1385, + "step": 11029 + }, + { + "epoch": 0.3950077891381811, + "grad_norm": 1.4113308191299438, + "learning_rate": 0.0001378916239913666, + "loss": 1.2461, + "step": 11030 + }, + { + "epoch": 0.3950436012677494, + "grad_norm": 1.5720272064208984, + "learning_rate": 0.0001378808896506891, + "loss": 1.1777, + "step": 11031 + }, + { + "epoch": 0.39507941339731767, + "grad_norm": 1.5645430088043213, + "learning_rate": 0.00013787015480037307, + "loss": 1.221, + "step": 11032 + }, + { + "epoch": 0.39511522552688594, + "grad_norm": 2.1643712520599365, + "learning_rate": 0.00013785941944056298, + "loss": 1.1413, + "step": 11033 + }, + { + "epoch": 0.39515103765645426, + "grad_norm": 1.6777374744415283, + "learning_rate": 0.00013784868357140322, + "loss": 1.2397, + "step": 11034 + }, + { + "epoch": 0.39518684978602253, + "grad_norm": 1.783079981803894, + "learning_rate": 0.00013783794719303825, + "loss": 1.1835, + "step": 11035 + }, + { + "epoch": 0.3952226619155908, + "grad_norm": 1.4679272174835205, + "learning_rate": 0.0001378272103056125, + "loss": 1.225, + "step": 11036 + }, + { + "epoch": 0.39525847404515907, + "grad_norm": 1.5045790672302246, + "learning_rate": 0.00013781647290927044, + "loss": 1.0633, + "step": 11037 + }, + { + "epoch": 0.3952942861747274, + "grad_norm": 1.3688522577285767, + "learning_rate": 0.00013780573500415654, + "loss": 1.0708, + "step": 11038 + }, + { + "epoch": 0.39533009830429566, + "grad_norm": 1.4610631465911865, + "learning_rate": 0.0001377949965904152, + "loss": 1.1117, + "step": 11039 + }, + { + "epoch": 0.39536591043386393, + "grad_norm": 1.2278196811676025, + "learning_rate": 0.00013778425766819096, + "loss": 0.999, + "step": 11040 + }, + { + "epoch": 0.39540172256343226, + "grad_norm": 1.9504257440567017, + "learning_rate": 0.00013777351823762826, + "loss": 1.2105, + "step": 11041 + }, + { + "epoch": 0.3954375346930005, + "grad_norm": 1.4205855131149292, + "learning_rate": 0.0001377627782988716, + "loss": 1.0794, + "step": 11042 + }, + { + "epoch": 0.3954733468225688, + "grad_norm": 1.4208701848983765, + "learning_rate": 0.00013775203785206544, + "loss": 0.9014, + "step": 11043 + }, + { + "epoch": 0.39550915895213706, + "grad_norm": 1.4626874923706055, + "learning_rate": 0.00013774129689735437, + "loss": 1.0677, + "step": 11044 + }, + { + "epoch": 0.3955449710817054, + "grad_norm": 1.6636028289794922, + "learning_rate": 0.00013773055543488276, + "loss": 1.1339, + "step": 11045 + }, + { + "epoch": 0.39558078321127366, + "grad_norm": 1.4185606241226196, + "learning_rate": 0.00013771981346479524, + "loss": 1.2396, + "step": 11046 + }, + { + "epoch": 0.3956165953408419, + "grad_norm": 1.5893808603286743, + "learning_rate": 0.0001377090709872363, + "loss": 1.2475, + "step": 11047 + }, + { + "epoch": 0.39565240747041025, + "grad_norm": 1.5757571458816528, + "learning_rate": 0.00013769832800235041, + "loss": 1.0866, + "step": 11048 + }, + { + "epoch": 0.3956882195999785, + "grad_norm": 1.6351374387741089, + "learning_rate": 0.00013768758451028216, + "loss": 1.1249, + "step": 11049 + }, + { + "epoch": 0.3957240317295468, + "grad_norm": 1.6831293106079102, + "learning_rate": 0.00013767684051117605, + "loss": 1.2482, + "step": 11050 + }, + { + "epoch": 0.39575984385911506, + "grad_norm": 1.5079941749572754, + "learning_rate": 0.0001376660960051767, + "loss": 1.2122, + "step": 11051 + }, + { + "epoch": 0.3957956559886834, + "grad_norm": 1.4727623462677002, + "learning_rate": 0.0001376553509924286, + "loss": 1.0921, + "step": 11052 + }, + { + "epoch": 0.39583146811825165, + "grad_norm": 1.5777161121368408, + "learning_rate": 0.00013764460547307632, + "loss": 1.1791, + "step": 11053 + }, + { + "epoch": 0.3958672802478199, + "grad_norm": 1.5932589769363403, + "learning_rate": 0.0001376338594472644, + "loss": 1.123, + "step": 11054 + }, + { + "epoch": 0.39590309237738824, + "grad_norm": 1.8545756340026855, + "learning_rate": 0.00013762311291513747, + "loss": 1.2429, + "step": 11055 + }, + { + "epoch": 0.3959389045069565, + "grad_norm": 1.6220186948776245, + "learning_rate": 0.0001376123658768401, + "loss": 1.2026, + "step": 11056 + }, + { + "epoch": 0.3959747166365248, + "grad_norm": 1.322306513786316, + "learning_rate": 0.00013760161833251683, + "loss": 1.0721, + "step": 11057 + }, + { + "epoch": 0.39601052876609305, + "grad_norm": 1.2666889429092407, + "learning_rate": 0.00013759087028231232, + "loss": 1.0329, + "step": 11058 + }, + { + "epoch": 0.3960463408956614, + "grad_norm": 1.6955205202102661, + "learning_rate": 0.0001375801217263711, + "loss": 1.1627, + "step": 11059 + }, + { + "epoch": 0.39608215302522964, + "grad_norm": 1.4156631231307983, + "learning_rate": 0.00013756937266483788, + "loss": 1.2115, + "step": 11060 + }, + { + "epoch": 0.3961179651547979, + "grad_norm": 1.56638765335083, + "learning_rate": 0.00013755862309785716, + "loss": 1.2479, + "step": 11061 + }, + { + "epoch": 0.39615377728436624, + "grad_norm": 1.8369884490966797, + "learning_rate": 0.00013754787302557364, + "loss": 1.1986, + "step": 11062 + }, + { + "epoch": 0.3961895894139345, + "grad_norm": 1.5588657855987549, + "learning_rate": 0.0001375371224481319, + "loss": 1.3337, + "step": 11063 + }, + { + "epoch": 0.3962254015435028, + "grad_norm": 1.9973620176315308, + "learning_rate": 0.0001375263713656766, + "loss": 1.1113, + "step": 11064 + }, + { + "epoch": 0.39626121367307104, + "grad_norm": 1.8426103591918945, + "learning_rate": 0.00013751561977835242, + "loss": 1.3727, + "step": 11065 + }, + { + "epoch": 0.39629702580263937, + "grad_norm": 1.352287769317627, + "learning_rate": 0.00013750486768630393, + "loss": 1.171, + "step": 11066 + }, + { + "epoch": 0.39633283793220764, + "grad_norm": 1.8209214210510254, + "learning_rate": 0.00013749411508967582, + "loss": 1.2111, + "step": 11067 + }, + { + "epoch": 0.3963686500617759, + "grad_norm": 1.8601410388946533, + "learning_rate": 0.0001374833619886128, + "loss": 1.4446, + "step": 11068 + }, + { + "epoch": 0.39640446219134423, + "grad_norm": 1.5486446619033813, + "learning_rate": 0.0001374726083832594, + "loss": 1.0549, + "step": 11069 + }, + { + "epoch": 0.3964402743209125, + "grad_norm": 1.465664267539978, + "learning_rate": 0.00013746185427376047, + "loss": 1.1276, + "step": 11070 + }, + { + "epoch": 0.39647608645048077, + "grad_norm": 1.237675428390503, + "learning_rate": 0.00013745109966026056, + "loss": 1.1182, + "step": 11071 + }, + { + "epoch": 0.39651189858004904, + "grad_norm": 1.2596451044082642, + "learning_rate": 0.00013744034454290447, + "loss": 1.1911, + "step": 11072 + }, + { + "epoch": 0.39654771070961736, + "grad_norm": 1.4354841709136963, + "learning_rate": 0.0001374295889218368, + "loss": 1.07, + "step": 11073 + }, + { + "epoch": 0.39658352283918563, + "grad_norm": 1.3563402891159058, + "learning_rate": 0.0001374188327972023, + "loss": 1.1348, + "step": 11074 + }, + { + "epoch": 0.3966193349687539, + "grad_norm": 1.3617116212844849, + "learning_rate": 0.0001374080761691457, + "loss": 1.1614, + "step": 11075 + }, + { + "epoch": 0.3966551470983222, + "grad_norm": 1.6185399293899536, + "learning_rate": 0.00013739731903781164, + "loss": 1.1148, + "step": 11076 + }, + { + "epoch": 0.3966909592278905, + "grad_norm": 1.732047200202942, + "learning_rate": 0.00013738656140334493, + "loss": 1.1527, + "step": 11077 + }, + { + "epoch": 0.39672677135745876, + "grad_norm": 1.6872307062149048, + "learning_rate": 0.00013737580326589024, + "loss": 1.2965, + "step": 11078 + }, + { + "epoch": 0.39676258348702703, + "grad_norm": 1.835649013519287, + "learning_rate": 0.00013736504462559234, + "loss": 1.0726, + "step": 11079 + }, + { + "epoch": 0.39679839561659536, + "grad_norm": 1.8576315641403198, + "learning_rate": 0.00013735428548259594, + "loss": 1.118, + "step": 11080 + }, + { + "epoch": 0.3968342077461636, + "grad_norm": 2.024268388748169, + "learning_rate": 0.00013734352583704587, + "loss": 1.2729, + "step": 11081 + }, + { + "epoch": 0.3968700198757319, + "grad_norm": 1.3729242086410522, + "learning_rate": 0.00013733276568908678, + "loss": 1.2496, + "step": 11082 + }, + { + "epoch": 0.3969058320053002, + "grad_norm": 1.451158881187439, + "learning_rate": 0.00013732200503886348, + "loss": 0.9743, + "step": 11083 + }, + { + "epoch": 0.3969416441348685, + "grad_norm": 1.6895438432693481, + "learning_rate": 0.00013731124388652077, + "loss": 1.1102, + "step": 11084 + }, + { + "epoch": 0.39697745626443676, + "grad_norm": 1.9632341861724854, + "learning_rate": 0.00013730048223220336, + "loss": 1.112, + "step": 11085 + }, + { + "epoch": 0.397013268394005, + "grad_norm": 1.4981615543365479, + "learning_rate": 0.00013728972007605612, + "loss": 1.2343, + "step": 11086 + }, + { + "epoch": 0.39704908052357335, + "grad_norm": 1.37602961063385, + "learning_rate": 0.00013727895741822379, + "loss": 1.0468, + "step": 11087 + }, + { + "epoch": 0.3970848926531416, + "grad_norm": 1.390188217163086, + "learning_rate": 0.00013726819425885115, + "loss": 1.1957, + "step": 11088 + }, + { + "epoch": 0.3971207047827099, + "grad_norm": 1.972608208656311, + "learning_rate": 0.00013725743059808305, + "loss": 1.234, + "step": 11089 + }, + { + "epoch": 0.3971565169122782, + "grad_norm": 1.5771992206573486, + "learning_rate": 0.00013724666643606429, + "loss": 1.1549, + "step": 11090 + }, + { + "epoch": 0.3971923290418465, + "grad_norm": 1.4878829717636108, + "learning_rate": 0.00013723590177293967, + "loss": 1.171, + "step": 11091 + }, + { + "epoch": 0.39722814117141475, + "grad_norm": 2.0455713272094727, + "learning_rate": 0.000137225136608854, + "loss": 1.3756, + "step": 11092 + }, + { + "epoch": 0.397263953300983, + "grad_norm": 1.753700852394104, + "learning_rate": 0.00013721437094395213, + "loss": 1.0446, + "step": 11093 + }, + { + "epoch": 0.39729976543055134, + "grad_norm": 1.9955272674560547, + "learning_rate": 0.00013720360477837893, + "loss": 1.0801, + "step": 11094 + }, + { + "epoch": 0.3973355775601196, + "grad_norm": 1.3636784553527832, + "learning_rate": 0.00013719283811227921, + "loss": 1.1904, + "step": 11095 + }, + { + "epoch": 0.3973713896896879, + "grad_norm": 1.7221887111663818, + "learning_rate": 0.0001371820709457978, + "loss": 1.3605, + "step": 11096 + }, + { + "epoch": 0.3974072018192562, + "grad_norm": 2.0329205989837646, + "learning_rate": 0.00013717130327907964, + "loss": 1.3182, + "step": 11097 + }, + { + "epoch": 0.3974430139488245, + "grad_norm": 1.9145394563674927, + "learning_rate": 0.00013716053511226949, + "loss": 1.0442, + "step": 11098 + }, + { + "epoch": 0.39747882607839274, + "grad_norm": 1.6126124858856201, + "learning_rate": 0.00013714976644551232, + "loss": 1.1687, + "step": 11099 + }, + { + "epoch": 0.397514638207961, + "grad_norm": 1.2931900024414062, + "learning_rate": 0.00013713899727895294, + "loss": 1.1019, + "step": 11100 + }, + { + "epoch": 0.39755045033752934, + "grad_norm": 1.8124977350234985, + "learning_rate": 0.00013712822761273625, + "loss": 1.195, + "step": 11101 + }, + { + "epoch": 0.3975862624670976, + "grad_norm": 1.3835965394973755, + "learning_rate": 0.00013711745744700714, + "loss": 1.1763, + "step": 11102 + }, + { + "epoch": 0.3976220745966659, + "grad_norm": 1.6267002820968628, + "learning_rate": 0.00013710668678191054, + "loss": 1.2631, + "step": 11103 + }, + { + "epoch": 0.3976578867262342, + "grad_norm": 1.7345430850982666, + "learning_rate": 0.00013709591561759135, + "loss": 1.0725, + "step": 11104 + }, + { + "epoch": 0.39769369885580247, + "grad_norm": 1.4765450954437256, + "learning_rate": 0.00013708514395419444, + "loss": 1.2762, + "step": 11105 + }, + { + "epoch": 0.39772951098537074, + "grad_norm": 1.3779988288879395, + "learning_rate": 0.00013707437179186476, + "loss": 1.0072, + "step": 11106 + }, + { + "epoch": 0.397765323114939, + "grad_norm": 1.5551273822784424, + "learning_rate": 0.0001370635991307472, + "loss": 1.2301, + "step": 11107 + }, + { + "epoch": 0.39780113524450733, + "grad_norm": 1.455592155456543, + "learning_rate": 0.00013705282597098676, + "loss": 0.8982, + "step": 11108 + }, + { + "epoch": 0.3978369473740756, + "grad_norm": 1.6461873054504395, + "learning_rate": 0.00013704205231272835, + "loss": 1.4093, + "step": 11109 + }, + { + "epoch": 0.39787275950364387, + "grad_norm": 1.7737711668014526, + "learning_rate": 0.0001370312781561169, + "loss": 1.2534, + "step": 11110 + }, + { + "epoch": 0.3979085716332122, + "grad_norm": 1.8352036476135254, + "learning_rate": 0.00013702050350129734, + "loss": 1.2859, + "step": 11111 + }, + { + "epoch": 0.39794438376278046, + "grad_norm": 1.4779471158981323, + "learning_rate": 0.00013700972834841468, + "loss": 1.3689, + "step": 11112 + }, + { + "epoch": 0.39798019589234873, + "grad_norm": 1.5025595426559448, + "learning_rate": 0.00013699895269761385, + "loss": 0.9429, + "step": 11113 + }, + { + "epoch": 0.398016008021917, + "grad_norm": 1.317108154296875, + "learning_rate": 0.00013698817654903984, + "loss": 1.1762, + "step": 11114 + }, + { + "epoch": 0.3980518201514853, + "grad_norm": 1.4746346473693848, + "learning_rate": 0.00013697739990283764, + "loss": 1.2589, + "step": 11115 + }, + { + "epoch": 0.3980876322810536, + "grad_norm": 1.785088062286377, + "learning_rate": 0.00013696662275915222, + "loss": 1.0016, + "step": 11116 + }, + { + "epoch": 0.39812344441062186, + "grad_norm": 1.925941824913025, + "learning_rate": 0.00013695584511812857, + "loss": 1.1862, + "step": 11117 + }, + { + "epoch": 0.3981592565401902, + "grad_norm": 1.7058091163635254, + "learning_rate": 0.00013694506697991169, + "loss": 1.1791, + "step": 11118 + }, + { + "epoch": 0.39819506866975846, + "grad_norm": 1.7207900285720825, + "learning_rate": 0.00013693428834464662, + "loss": 1.154, + "step": 11119 + }, + { + "epoch": 0.3982308807993267, + "grad_norm": 1.5384005308151245, + "learning_rate": 0.00013692350921247829, + "loss": 1.0442, + "step": 11120 + }, + { + "epoch": 0.398266692928895, + "grad_norm": 1.528791069984436, + "learning_rate": 0.00013691272958355182, + "loss": 1.2216, + "step": 11121 + }, + { + "epoch": 0.3983025050584633, + "grad_norm": 1.5231900215148926, + "learning_rate": 0.00013690194945801214, + "loss": 1.2565, + "step": 11122 + }, + { + "epoch": 0.3983383171880316, + "grad_norm": 2.500988483428955, + "learning_rate": 0.00013689116883600436, + "loss": 1.1422, + "step": 11123 + }, + { + "epoch": 0.39837412931759986, + "grad_norm": 1.3719956874847412, + "learning_rate": 0.00013688038771767345, + "loss": 1.1087, + "step": 11124 + }, + { + "epoch": 0.3984099414471682, + "grad_norm": 1.71195387840271, + "learning_rate": 0.00013686960610316456, + "loss": 0.9799, + "step": 11125 + }, + { + "epoch": 0.39844575357673645, + "grad_norm": 1.9711216688156128, + "learning_rate": 0.00013685882399262265, + "loss": 1.3149, + "step": 11126 + }, + { + "epoch": 0.3984815657063047, + "grad_norm": 1.4029719829559326, + "learning_rate": 0.0001368480413861928, + "loss": 1.1942, + "step": 11127 + }, + { + "epoch": 0.398517377835873, + "grad_norm": 1.470272183418274, + "learning_rate": 0.0001368372582840201, + "loss": 1.1594, + "step": 11128 + }, + { + "epoch": 0.3985531899654413, + "grad_norm": 1.4304648637771606, + "learning_rate": 0.00013682647468624958, + "loss": 1.2225, + "step": 11129 + }, + { + "epoch": 0.3985890020950096, + "grad_norm": 1.4060752391815186, + "learning_rate": 0.00013681569059302638, + "loss": 1.0628, + "step": 11130 + }, + { + "epoch": 0.39862481422457785, + "grad_norm": 1.8301560878753662, + "learning_rate": 0.00013680490600449552, + "loss": 1.1942, + "step": 11131 + }, + { + "epoch": 0.3986606263541462, + "grad_norm": 1.595718264579773, + "learning_rate": 0.00013679412092080213, + "loss": 1.2134, + "step": 11132 + }, + { + "epoch": 0.39869643848371444, + "grad_norm": 1.5056065320968628, + "learning_rate": 0.0001367833353420913, + "loss": 1.3189, + "step": 11133 + }, + { + "epoch": 0.3987322506132827, + "grad_norm": 1.3729811906814575, + "learning_rate": 0.00013677254926850818, + "loss": 1.1745, + "step": 11134 + }, + { + "epoch": 0.398768062742851, + "grad_norm": 1.4213913679122925, + "learning_rate": 0.0001367617627001978, + "loss": 1.1162, + "step": 11135 + }, + { + "epoch": 0.3988038748724193, + "grad_norm": 1.6187292337417603, + "learning_rate": 0.0001367509756373053, + "loss": 1.1065, + "step": 11136 + }, + { + "epoch": 0.3988396870019876, + "grad_norm": 1.5394669771194458, + "learning_rate": 0.0001367401880799759, + "loss": 1.1899, + "step": 11137 + }, + { + "epoch": 0.39887549913155584, + "grad_norm": 1.3008971214294434, + "learning_rate": 0.0001367294000283546, + "loss": 1.1325, + "step": 11138 + }, + { + "epoch": 0.39891131126112417, + "grad_norm": 1.5818136930465698, + "learning_rate": 0.00013671861148258665, + "loss": 1.2981, + "step": 11139 + }, + { + "epoch": 0.39894712339069244, + "grad_norm": 1.4717556238174438, + "learning_rate": 0.0001367078224428171, + "loss": 1.1897, + "step": 11140 + }, + { + "epoch": 0.3989829355202607, + "grad_norm": 1.5390032529830933, + "learning_rate": 0.00013669703290919118, + "loss": 1.139, + "step": 11141 + }, + { + "epoch": 0.399018747649829, + "grad_norm": 1.6559422016143799, + "learning_rate": 0.00013668624288185402, + "loss": 1.2718, + "step": 11142 + }, + { + "epoch": 0.3990545597793973, + "grad_norm": 1.471861481666565, + "learning_rate": 0.00013667545236095076, + "loss": 1.0884, + "step": 11143 + }, + { + "epoch": 0.39909037190896557, + "grad_norm": 1.4308053255081177, + "learning_rate": 0.00013666466134662662, + "loss": 1.1977, + "step": 11144 + }, + { + "epoch": 0.39912618403853384, + "grad_norm": 1.4296433925628662, + "learning_rate": 0.00013665386983902672, + "loss": 1.2608, + "step": 11145 + }, + { + "epoch": 0.39916199616810216, + "grad_norm": 1.765353798866272, + "learning_rate": 0.00013664307783829634, + "loss": 1.2057, + "step": 11146 + }, + { + "epoch": 0.39919780829767043, + "grad_norm": 1.6171599626541138, + "learning_rate": 0.00013663228534458054, + "loss": 1.2089, + "step": 11147 + }, + { + "epoch": 0.3992336204272387, + "grad_norm": 1.7634974718093872, + "learning_rate": 0.00013662149235802465, + "loss": 1.0028, + "step": 11148 + }, + { + "epoch": 0.39926943255680697, + "grad_norm": 1.7793338298797607, + "learning_rate": 0.0001366106988787738, + "loss": 1.3879, + "step": 11149 + }, + { + "epoch": 0.3993052446863753, + "grad_norm": 1.4962031841278076, + "learning_rate": 0.00013659990490697322, + "loss": 1.1952, + "step": 11150 + }, + { + "epoch": 0.39934105681594356, + "grad_norm": 1.5902522802352905, + "learning_rate": 0.0001365891104427681, + "loss": 1.2705, + "step": 11151 + }, + { + "epoch": 0.39937686894551183, + "grad_norm": 1.655560851097107, + "learning_rate": 0.00013657831548630377, + "loss": 1.2331, + "step": 11152 + }, + { + "epoch": 0.39941268107508016, + "grad_norm": 1.5384715795516968, + "learning_rate": 0.00013656752003772535, + "loss": 1.1559, + "step": 11153 + }, + { + "epoch": 0.3994484932046484, + "grad_norm": 1.4833201169967651, + "learning_rate": 0.00013655672409717813, + "loss": 1.1228, + "step": 11154 + }, + { + "epoch": 0.3994843053342167, + "grad_norm": 2.7800564765930176, + "learning_rate": 0.0001365459276648073, + "loss": 1.4186, + "step": 11155 + }, + { + "epoch": 0.39952011746378496, + "grad_norm": 1.3237253427505493, + "learning_rate": 0.00013653513074075816, + "loss": 1.129, + "step": 11156 + }, + { + "epoch": 0.3995559295933533, + "grad_norm": 1.3268241882324219, + "learning_rate": 0.000136524333325176, + "loss": 1.2254, + "step": 11157 + }, + { + "epoch": 0.39959174172292156, + "grad_norm": 1.3598263263702393, + "learning_rate": 0.00013651353541820603, + "loss": 0.9724, + "step": 11158 + }, + { + "epoch": 0.3996275538524898, + "grad_norm": 1.4591912031173706, + "learning_rate": 0.00013650273701999353, + "loss": 1.0697, + "step": 11159 + }, + { + "epoch": 0.39966336598205815, + "grad_norm": 1.385761022567749, + "learning_rate": 0.0001364919381306838, + "loss": 1.1865, + "step": 11160 + }, + { + "epoch": 0.3996991781116264, + "grad_norm": 1.8331865072250366, + "learning_rate": 0.00013648113875042213, + "loss": 1.1594, + "step": 11161 + }, + { + "epoch": 0.3997349902411947, + "grad_norm": 3.274385452270508, + "learning_rate": 0.00013647033887935378, + "loss": 1.0493, + "step": 11162 + }, + { + "epoch": 0.39977080237076296, + "grad_norm": 2.1678919792175293, + "learning_rate": 0.00013645953851762406, + "loss": 1.0124, + "step": 11163 + }, + { + "epoch": 0.3998066145003313, + "grad_norm": 1.6149855852127075, + "learning_rate": 0.00013644873766537828, + "loss": 1.1544, + "step": 11164 + }, + { + "epoch": 0.39984242662989955, + "grad_norm": 1.431489109992981, + "learning_rate": 0.00013643793632276175, + "loss": 0.8328, + "step": 11165 + }, + { + "epoch": 0.3998782387594678, + "grad_norm": 1.6470329761505127, + "learning_rate": 0.00013642713448991977, + "loss": 0.8151, + "step": 11166 + }, + { + "epoch": 0.39991405088903614, + "grad_norm": 2.0392467975616455, + "learning_rate": 0.0001364163321669977, + "loss": 1.2887, + "step": 11167 + }, + { + "epoch": 0.3999498630186044, + "grad_norm": 1.721431851387024, + "learning_rate": 0.00013640552935414085, + "loss": 1.0846, + "step": 11168 + }, + { + "epoch": 0.3999856751481727, + "grad_norm": 1.5290380716323853, + "learning_rate": 0.00013639472605149456, + "loss": 1.0545, + "step": 11169 + }, + { + "epoch": 0.40002148727774095, + "grad_norm": 1.578355073928833, + "learning_rate": 0.00013638392225920418, + "loss": 1.3209, + "step": 11170 + }, + { + "epoch": 0.4000572994073093, + "grad_norm": 1.310037612915039, + "learning_rate": 0.00013637311797741507, + "loss": 1.13, + "step": 11171 + }, + { + "epoch": 0.40009311153687754, + "grad_norm": 1.7671077251434326, + "learning_rate": 0.00013636231320627258, + "loss": 1.149, + "step": 11172 + }, + { + "epoch": 0.4001289236664458, + "grad_norm": 1.5626415014266968, + "learning_rate": 0.00013635150794592205, + "loss": 0.9132, + "step": 11173 + }, + { + "epoch": 0.40016473579601414, + "grad_norm": 1.363291621208191, + "learning_rate": 0.00013634070219650888, + "loss": 1.1146, + "step": 11174 + }, + { + "epoch": 0.4002005479255824, + "grad_norm": 1.4121274948120117, + "learning_rate": 0.00013632989595817842, + "loss": 0.972, + "step": 11175 + }, + { + "epoch": 0.4002363600551507, + "grad_norm": 1.5563912391662598, + "learning_rate": 0.0001363190892310761, + "loss": 1.051, + "step": 11176 + }, + { + "epoch": 0.40027217218471894, + "grad_norm": 1.8687381744384766, + "learning_rate": 0.00013630828201534727, + "loss": 1.0578, + "step": 11177 + }, + { + "epoch": 0.40030798431428727, + "grad_norm": 1.414817214012146, + "learning_rate": 0.00013629747431113734, + "loss": 1.3423, + "step": 11178 + }, + { + "epoch": 0.40034379644385554, + "grad_norm": 1.6464450359344482, + "learning_rate": 0.0001362866661185917, + "loss": 1.091, + "step": 11179 + }, + { + "epoch": 0.4003796085734238, + "grad_norm": 1.3092440366744995, + "learning_rate": 0.00013627585743785582, + "loss": 1.2221, + "step": 11180 + }, + { + "epoch": 0.40041542070299213, + "grad_norm": 1.3770411014556885, + "learning_rate": 0.000136265048269075, + "loss": 1.1035, + "step": 11181 + }, + { + "epoch": 0.4004512328325604, + "grad_norm": 1.5426234006881714, + "learning_rate": 0.0001362542386123948, + "loss": 1.082, + "step": 11182 + }, + { + "epoch": 0.40048704496212867, + "grad_norm": 1.786787748336792, + "learning_rate": 0.00013624342846796058, + "loss": 1.0394, + "step": 11183 + }, + { + "epoch": 0.40052285709169694, + "grad_norm": 1.4659271240234375, + "learning_rate": 0.00013623261783591773, + "loss": 1.2804, + "step": 11184 + }, + { + "epoch": 0.40055866922126526, + "grad_norm": 1.6420917510986328, + "learning_rate": 0.00013622180671641178, + "loss": 1.0876, + "step": 11185 + }, + { + "epoch": 0.40059448135083353, + "grad_norm": 1.822031855583191, + "learning_rate": 0.0001362109951095881, + "loss": 1.258, + "step": 11186 + }, + { + "epoch": 0.4006302934804018, + "grad_norm": 1.6027960777282715, + "learning_rate": 0.00013620018301559224, + "loss": 1.2453, + "step": 11187 + }, + { + "epoch": 0.4006661056099701, + "grad_norm": 1.6881145238876343, + "learning_rate": 0.0001361893704345696, + "loss": 1.249, + "step": 11188 + }, + { + "epoch": 0.4007019177395384, + "grad_norm": 1.3635296821594238, + "learning_rate": 0.00013617855736666566, + "loss": 1.1021, + "step": 11189 + }, + { + "epoch": 0.40073772986910666, + "grad_norm": 2.5274930000305176, + "learning_rate": 0.00013616774381202591, + "loss": 1.2435, + "step": 11190 + }, + { + "epoch": 0.40077354199867493, + "grad_norm": 1.489745020866394, + "learning_rate": 0.00013615692977079577, + "loss": 0.9893, + "step": 11191 + }, + { + "epoch": 0.40080935412824326, + "grad_norm": 1.4739577770233154, + "learning_rate": 0.00013614611524312084, + "loss": 1.1975, + "step": 11192 + }, + { + "epoch": 0.4008451662578115, + "grad_norm": 1.7538821697235107, + "learning_rate": 0.0001361353002291465, + "loss": 1.148, + "step": 11193 + }, + { + "epoch": 0.4008809783873798, + "grad_norm": 1.7549766302108765, + "learning_rate": 0.00013612448472901834, + "loss": 1.1517, + "step": 11194 + }, + { + "epoch": 0.40091679051694806, + "grad_norm": 1.5883152484893799, + "learning_rate": 0.00013611366874288186, + "loss": 1.4029, + "step": 11195 + }, + { + "epoch": 0.4009526026465164, + "grad_norm": 1.7295842170715332, + "learning_rate": 0.0001361028522708825, + "loss": 1.2187, + "step": 11196 + }, + { + "epoch": 0.40098841477608466, + "grad_norm": 1.6166092157363892, + "learning_rate": 0.00013609203531316587, + "loss": 1.1811, + "step": 11197 + }, + { + "epoch": 0.4010242269056529, + "grad_norm": 1.5193450450897217, + "learning_rate": 0.0001360812178698774, + "loss": 0.9124, + "step": 11198 + }, + { + "epoch": 0.40106003903522125, + "grad_norm": 1.6554416418075562, + "learning_rate": 0.00013607039994116278, + "loss": 1.0276, + "step": 11199 + }, + { + "epoch": 0.4010958511647895, + "grad_norm": 2.0932066440582275, + "learning_rate": 0.00013605958152716738, + "loss": 1.3679, + "step": 11200 + }, + { + "epoch": 0.4011316632943578, + "grad_norm": 1.4532256126403809, + "learning_rate": 0.00013604876262803686, + "loss": 1.0803, + "step": 11201 + }, + { + "epoch": 0.40116747542392606, + "grad_norm": 2.2837395668029785, + "learning_rate": 0.00013603794324391672, + "loss": 1.0902, + "step": 11202 + }, + { + "epoch": 0.4012032875534944, + "grad_norm": 1.7119640111923218, + "learning_rate": 0.00013602712337495255, + "loss": 1.1417, + "step": 11203 + }, + { + "epoch": 0.40123909968306265, + "grad_norm": 1.465251088142395, + "learning_rate": 0.0001360163030212899, + "loss": 1.2832, + "step": 11204 + }, + { + "epoch": 0.4012749118126309, + "grad_norm": 1.851870059967041, + "learning_rate": 0.00013600548218307436, + "loss": 1.0353, + "step": 11205 + }, + { + "epoch": 0.40131072394219924, + "grad_norm": 1.391998529434204, + "learning_rate": 0.0001359946608604515, + "loss": 1.2738, + "step": 11206 + }, + { + "epoch": 0.4013465360717675, + "grad_norm": 1.5049867630004883, + "learning_rate": 0.00013598383905356692, + "loss": 0.9418, + "step": 11207 + }, + { + "epoch": 0.4013823482013358, + "grad_norm": 1.7416377067565918, + "learning_rate": 0.00013597301676256617, + "loss": 1.1101, + "step": 11208 + }, + { + "epoch": 0.40141816033090405, + "grad_norm": 1.7084896564483643, + "learning_rate": 0.0001359621939875949, + "loss": 1.2362, + "step": 11209 + }, + { + "epoch": 0.4014539724604724, + "grad_norm": 1.3271247148513794, + "learning_rate": 0.00013595137072879867, + "loss": 0.9173, + "step": 11210 + }, + { + "epoch": 0.40148978459004064, + "grad_norm": 1.3208390474319458, + "learning_rate": 0.00013594054698632315, + "loss": 1.0731, + "step": 11211 + }, + { + "epoch": 0.4015255967196089, + "grad_norm": 1.6244232654571533, + "learning_rate": 0.00013592972276031394, + "loss": 1.2364, + "step": 11212 + }, + { + "epoch": 0.40156140884917724, + "grad_norm": 1.8059965372085571, + "learning_rate": 0.00013591889805091663, + "loss": 1.25, + "step": 11213 + }, + { + "epoch": 0.4015972209787455, + "grad_norm": 1.749160885810852, + "learning_rate": 0.00013590807285827688, + "loss": 1.2476, + "step": 11214 + }, + { + "epoch": 0.4016330331083138, + "grad_norm": 1.453265905380249, + "learning_rate": 0.00013589724718254036, + "loss": 1.0145, + "step": 11215 + }, + { + "epoch": 0.40166884523788204, + "grad_norm": 1.7036688327789307, + "learning_rate": 0.00013588642102385266, + "loss": 1.3091, + "step": 11216 + }, + { + "epoch": 0.40170465736745037, + "grad_norm": 1.3878021240234375, + "learning_rate": 0.00013587559438235945, + "loss": 1.2161, + "step": 11217 + }, + { + "epoch": 0.40174046949701864, + "grad_norm": 1.6837899684906006, + "learning_rate": 0.0001358647672582064, + "loss": 1.1033, + "step": 11218 + }, + { + "epoch": 0.4017762816265869, + "grad_norm": 1.2954835891723633, + "learning_rate": 0.00013585393965153916, + "loss": 0.9959, + "step": 11219 + }, + { + "epoch": 0.40181209375615523, + "grad_norm": 1.2081544399261475, + "learning_rate": 0.00013584311156250342, + "loss": 1.2468, + "step": 11220 + }, + { + "epoch": 0.4018479058857235, + "grad_norm": 1.4341942071914673, + "learning_rate": 0.00013583228299124484, + "loss": 1.3072, + "step": 11221 + }, + { + "epoch": 0.40188371801529177, + "grad_norm": 1.5182627439498901, + "learning_rate": 0.00013582145393790913, + "loss": 1.1432, + "step": 11222 + }, + { + "epoch": 0.40191953014486004, + "grad_norm": 1.619230031967163, + "learning_rate": 0.00013581062440264194, + "loss": 1.1121, + "step": 11223 + }, + { + "epoch": 0.40195534227442836, + "grad_norm": 1.6430588960647583, + "learning_rate": 0.000135799794385589, + "loss": 1.1282, + "step": 11224 + }, + { + "epoch": 0.40199115440399663, + "grad_norm": 1.7408621311187744, + "learning_rate": 0.00013578896388689602, + "loss": 1.1005, + "step": 11225 + }, + { + "epoch": 0.4020269665335649, + "grad_norm": 1.3481826782226562, + "learning_rate": 0.00013577813290670867, + "loss": 1.1128, + "step": 11226 + }, + { + "epoch": 0.4020627786631332, + "grad_norm": 1.8166859149932861, + "learning_rate": 0.00013576730144517271, + "loss": 1.3222, + "step": 11227 + }, + { + "epoch": 0.4020985907927015, + "grad_norm": 1.6916091442108154, + "learning_rate": 0.00013575646950243384, + "loss": 1.3517, + "step": 11228 + }, + { + "epoch": 0.40213440292226976, + "grad_norm": 1.8335497379302979, + "learning_rate": 0.0001357456370786378, + "loss": 1.0361, + "step": 11229 + }, + { + "epoch": 0.40217021505183803, + "grad_norm": 1.563218116760254, + "learning_rate": 0.0001357348041739303, + "loss": 1.1918, + "step": 11230 + }, + { + "epoch": 0.40220602718140636, + "grad_norm": 1.6106045246124268, + "learning_rate": 0.00013572397078845716, + "loss": 1.084, + "step": 11231 + }, + { + "epoch": 0.4022418393109746, + "grad_norm": 1.5328147411346436, + "learning_rate": 0.00013571313692236405, + "loss": 1.0982, + "step": 11232 + }, + { + "epoch": 0.4022776514405429, + "grad_norm": 1.5537692308425903, + "learning_rate": 0.0001357023025757967, + "loss": 1.1392, + "step": 11233 + }, + { + "epoch": 0.4023134635701112, + "grad_norm": 1.830716609954834, + "learning_rate": 0.00013569146774890099, + "loss": 1.2769, + "step": 11234 + }, + { + "epoch": 0.4023492756996795, + "grad_norm": 1.4981743097305298, + "learning_rate": 0.00013568063244182257, + "loss": 1.157, + "step": 11235 + }, + { + "epoch": 0.40238508782924776, + "grad_norm": 1.4209059476852417, + "learning_rate": 0.00013566979665470728, + "loss": 1.1181, + "step": 11236 + }, + { + "epoch": 0.402420899958816, + "grad_norm": 1.7679741382598877, + "learning_rate": 0.0001356589603877009, + "loss": 1.2166, + "step": 11237 + }, + { + "epoch": 0.40245671208838435, + "grad_norm": 2.001664400100708, + "learning_rate": 0.0001356481236409492, + "loss": 1.0221, + "step": 11238 + }, + { + "epoch": 0.4024925242179526, + "grad_norm": 1.495166540145874, + "learning_rate": 0.00013563728641459793, + "loss": 1.2238, + "step": 11239 + }, + { + "epoch": 0.4025283363475209, + "grad_norm": 1.3037998676300049, + "learning_rate": 0.000135626448708793, + "loss": 1.1563, + "step": 11240 + }, + { + "epoch": 0.4025641484770892, + "grad_norm": 1.3027243614196777, + "learning_rate": 0.00013561561052368015, + "loss": 1.1615, + "step": 11241 + }, + { + "epoch": 0.4025999606066575, + "grad_norm": 1.3519775867462158, + "learning_rate": 0.0001356047718594052, + "loss": 1.1569, + "step": 11242 + }, + { + "epoch": 0.40263577273622575, + "grad_norm": 1.6728477478027344, + "learning_rate": 0.00013559393271611397, + "loss": 1.2997, + "step": 11243 + }, + { + "epoch": 0.402671584865794, + "grad_norm": 1.6076161861419678, + "learning_rate": 0.00013558309309395224, + "loss": 0.9429, + "step": 11244 + }, + { + "epoch": 0.40270739699536234, + "grad_norm": 2.051030397415161, + "learning_rate": 0.00013557225299306599, + "loss": 1.2082, + "step": 11245 + }, + { + "epoch": 0.4027432091249306, + "grad_norm": 1.6124062538146973, + "learning_rate": 0.00013556141241360088, + "loss": 1.3812, + "step": 11246 + }, + { + "epoch": 0.4027790212544989, + "grad_norm": 1.36955726146698, + "learning_rate": 0.00013555057135570286, + "loss": 1.1619, + "step": 11247 + }, + { + "epoch": 0.4028148333840672, + "grad_norm": 2.4195566177368164, + "learning_rate": 0.00013553972981951776, + "loss": 1.146, + "step": 11248 + }, + { + "epoch": 0.4028506455136355, + "grad_norm": 1.1521602869033813, + "learning_rate": 0.00013552888780519144, + "loss": 1.1511, + "step": 11249 + }, + { + "epoch": 0.40288645764320374, + "grad_norm": 1.264478325843811, + "learning_rate": 0.00013551804531286975, + "loss": 1.1541, + "step": 11250 + }, + { + "epoch": 0.402922269772772, + "grad_norm": 1.5111082792282104, + "learning_rate": 0.0001355072023426986, + "loss": 1.151, + "step": 11251 + }, + { + "epoch": 0.40295808190234034, + "grad_norm": 1.6069495677947998, + "learning_rate": 0.00013549635889482383, + "loss": 1.1897, + "step": 11252 + }, + { + "epoch": 0.4029938940319086, + "grad_norm": 2.3078510761260986, + "learning_rate": 0.00013548551496939132, + "loss": 1.1736, + "step": 11253 + }, + { + "epoch": 0.4030297061614769, + "grad_norm": 1.5538890361785889, + "learning_rate": 0.00013547467056654702, + "loss": 1.2953, + "step": 11254 + }, + { + "epoch": 0.4030655182910452, + "grad_norm": 1.32401704788208, + "learning_rate": 0.00013546382568643676, + "loss": 1.3071, + "step": 11255 + }, + { + "epoch": 0.40310133042061347, + "grad_norm": 1.2607526779174805, + "learning_rate": 0.00013545298032920647, + "loss": 1.1876, + "step": 11256 + }, + { + "epoch": 0.40313714255018174, + "grad_norm": 1.4203006029129028, + "learning_rate": 0.00013544213449500204, + "loss": 1.1271, + "step": 11257 + }, + { + "epoch": 0.40317295467975, + "grad_norm": 1.5663456916809082, + "learning_rate": 0.00013543128818396946, + "loss": 1.1308, + "step": 11258 + }, + { + "epoch": 0.40320876680931833, + "grad_norm": 1.5732173919677734, + "learning_rate": 0.0001354204413962546, + "loss": 1.1138, + "step": 11259 + }, + { + "epoch": 0.4032445789388866, + "grad_norm": 1.8729130029678345, + "learning_rate": 0.00013540959413200335, + "loss": 1.1475, + "step": 11260 + }, + { + "epoch": 0.40328039106845487, + "grad_norm": 1.636218786239624, + "learning_rate": 0.0001353987463913617, + "loss": 1.2584, + "step": 11261 + }, + { + "epoch": 0.4033162031980232, + "grad_norm": 1.816329002380371, + "learning_rate": 0.0001353878981744756, + "loss": 1.2488, + "step": 11262 + }, + { + "epoch": 0.40335201532759146, + "grad_norm": 1.2289009094238281, + "learning_rate": 0.00013537704948149093, + "loss": 1.1147, + "step": 11263 + }, + { + "epoch": 0.40338782745715973, + "grad_norm": 2.3801934719085693, + "learning_rate": 0.00013536620031255373, + "loss": 1.3729, + "step": 11264 + }, + { + "epoch": 0.403423639586728, + "grad_norm": 1.701791763305664, + "learning_rate": 0.0001353553506678099, + "loss": 1.1856, + "step": 11265 + }, + { + "epoch": 0.4034594517162963, + "grad_norm": 1.5441887378692627, + "learning_rate": 0.00013534450054740544, + "loss": 1.1711, + "step": 11266 + }, + { + "epoch": 0.4034952638458646, + "grad_norm": 1.5057464838027954, + "learning_rate": 0.0001353336499514863, + "loss": 1.0591, + "step": 11267 + }, + { + "epoch": 0.40353107597543286, + "grad_norm": 2.342479705810547, + "learning_rate": 0.00013532279888019851, + "loss": 1.0144, + "step": 11268 + }, + { + "epoch": 0.4035668881050012, + "grad_norm": 2.021944046020508, + "learning_rate": 0.00013531194733368805, + "loss": 1.3326, + "step": 11269 + }, + { + "epoch": 0.40360270023456946, + "grad_norm": 1.5700405836105347, + "learning_rate": 0.00013530109531210082, + "loss": 1.1499, + "step": 11270 + }, + { + "epoch": 0.4036385123641377, + "grad_norm": 1.6276450157165527, + "learning_rate": 0.00013529024281558292, + "loss": 1.1718, + "step": 11271 + }, + { + "epoch": 0.403674324493706, + "grad_norm": 1.861865520477295, + "learning_rate": 0.00013527938984428031, + "loss": 1.2504, + "step": 11272 + }, + { + "epoch": 0.4037101366232743, + "grad_norm": 1.6616631746292114, + "learning_rate": 0.00013526853639833904, + "loss": 1.2623, + "step": 11273 + }, + { + "epoch": 0.4037459487528426, + "grad_norm": 1.4624296426773071, + "learning_rate": 0.0001352576824779051, + "loss": 1.0329, + "step": 11274 + }, + { + "epoch": 0.40378176088241086, + "grad_norm": 1.5996993780136108, + "learning_rate": 0.0001352468280831245, + "loss": 1.01, + "step": 11275 + }, + { + "epoch": 0.4038175730119792, + "grad_norm": 1.6590691804885864, + "learning_rate": 0.00013523597321414332, + "loss": 1.1352, + "step": 11276 + }, + { + "epoch": 0.40385338514154745, + "grad_norm": 1.8270632028579712, + "learning_rate": 0.00013522511787110756, + "loss": 1.2779, + "step": 11277 + }, + { + "epoch": 0.4038891972711157, + "grad_norm": 1.3976081609725952, + "learning_rate": 0.00013521426205416326, + "loss": 1.0669, + "step": 11278 + }, + { + "epoch": 0.403925009400684, + "grad_norm": 1.4848263263702393, + "learning_rate": 0.00013520340576345653, + "loss": 1.2518, + "step": 11279 + }, + { + "epoch": 0.4039608215302523, + "grad_norm": 1.6102906465530396, + "learning_rate": 0.00013519254899913333, + "loss": 1.3086, + "step": 11280 + }, + { + "epoch": 0.4039966336598206, + "grad_norm": 1.4480026960372925, + "learning_rate": 0.0001351816917613398, + "loss": 1.1112, + "step": 11281 + }, + { + "epoch": 0.40403244578938885, + "grad_norm": 1.3922817707061768, + "learning_rate": 0.00013517083405022203, + "loss": 1.2063, + "step": 11282 + }, + { + "epoch": 0.4040682579189572, + "grad_norm": 1.7715654373168945, + "learning_rate": 0.000135159975865926, + "loss": 1.2452, + "step": 11283 + }, + { + "epoch": 0.40410407004852544, + "grad_norm": 2.0573697090148926, + "learning_rate": 0.00013514911720859785, + "loss": 1.2106, + "step": 11284 + }, + { + "epoch": 0.4041398821780937, + "grad_norm": 1.2651151418685913, + "learning_rate": 0.00013513825807838373, + "loss": 1.0447, + "step": 11285 + }, + { + "epoch": 0.404175694307662, + "grad_norm": 1.3463444709777832, + "learning_rate": 0.0001351273984754296, + "loss": 1.1083, + "step": 11286 + }, + { + "epoch": 0.4042115064372303, + "grad_norm": 1.441053032875061, + "learning_rate": 0.00013511653839988168, + "loss": 1.212, + "step": 11287 + }, + { + "epoch": 0.4042473185667986, + "grad_norm": 1.9512711763381958, + "learning_rate": 0.000135105677851886, + "loss": 1.1359, + "step": 11288 + }, + { + "epoch": 0.40428313069636684, + "grad_norm": 1.5904825925827026, + "learning_rate": 0.00013509481683158874, + "loss": 1.1776, + "step": 11289 + }, + { + "epoch": 0.40431894282593517, + "grad_norm": 1.4839062690734863, + "learning_rate": 0.00013508395533913593, + "loss": 1.1291, + "step": 11290 + }, + { + "epoch": 0.40435475495550344, + "grad_norm": 1.413978099822998, + "learning_rate": 0.0001350730933746738, + "loss": 1.1184, + "step": 11291 + }, + { + "epoch": 0.4043905670850717, + "grad_norm": 1.5678945779800415, + "learning_rate": 0.00013506223093834844, + "loss": 1.212, + "step": 11292 + }, + { + "epoch": 0.40442637921464, + "grad_norm": 1.6923338174819946, + "learning_rate": 0.000135051368030306, + "loss": 1.273, + "step": 11293 + }, + { + "epoch": 0.4044621913442083, + "grad_norm": 1.4023898839950562, + "learning_rate": 0.00013504050465069263, + "loss": 1.2404, + "step": 11294 + }, + { + "epoch": 0.40449800347377657, + "grad_norm": 1.455000877380371, + "learning_rate": 0.0001350296407996544, + "loss": 1.1364, + "step": 11295 + }, + { + "epoch": 0.40453381560334484, + "grad_norm": 1.6415921449661255, + "learning_rate": 0.0001350187764773376, + "loss": 1.1535, + "step": 11296 + }, + { + "epoch": 0.40456962773291316, + "grad_norm": 1.4376757144927979, + "learning_rate": 0.0001350079116838883, + "loss": 1.1922, + "step": 11297 + }, + { + "epoch": 0.40460543986248143, + "grad_norm": 1.3017642498016357, + "learning_rate": 0.0001349970464194527, + "loss": 1.0008, + "step": 11298 + }, + { + "epoch": 0.4046412519920497, + "grad_norm": 1.4529356956481934, + "learning_rate": 0.000134986180684177, + "loss": 1.1191, + "step": 11299 + }, + { + "epoch": 0.40467706412161797, + "grad_norm": 1.3637619018554688, + "learning_rate": 0.0001349753144782074, + "loss": 1.126, + "step": 11300 + }, + { + "epoch": 0.4047128762511863, + "grad_norm": 2.103764533996582, + "learning_rate": 0.00013496444780169, + "loss": 1.2023, + "step": 11301 + }, + { + "epoch": 0.40474868838075456, + "grad_norm": 1.3207083940505981, + "learning_rate": 0.0001349535806547711, + "loss": 1.0041, + "step": 11302 + }, + { + "epoch": 0.40478450051032283, + "grad_norm": 1.4936374425888062, + "learning_rate": 0.00013494271303759686, + "loss": 1.3263, + "step": 11303 + }, + { + "epoch": 0.40482031263989116, + "grad_norm": 1.2028409242630005, + "learning_rate": 0.0001349318449503135, + "loss": 0.9896, + "step": 11304 + }, + { + "epoch": 0.4048561247694594, + "grad_norm": 1.5086771249771118, + "learning_rate": 0.00013492097639306716, + "loss": 1.1627, + "step": 11305 + }, + { + "epoch": 0.4048919368990277, + "grad_norm": 1.7056670188903809, + "learning_rate": 0.00013491010736600418, + "loss": 1.291, + "step": 11306 + }, + { + "epoch": 0.40492774902859596, + "grad_norm": 1.529719591140747, + "learning_rate": 0.0001348992378692707, + "loss": 1.1179, + "step": 11307 + }, + { + "epoch": 0.4049635611581643, + "grad_norm": 1.614398717880249, + "learning_rate": 0.000134888367903013, + "loss": 0.9721, + "step": 11308 + }, + { + "epoch": 0.40499937328773256, + "grad_norm": 1.525844693183899, + "learning_rate": 0.00013487749746737734, + "loss": 1.0838, + "step": 11309 + }, + { + "epoch": 0.4050351854173008, + "grad_norm": 1.4960054159164429, + "learning_rate": 0.0001348666265625099, + "loss": 1.0841, + "step": 11310 + }, + { + "epoch": 0.40507099754686915, + "grad_norm": 1.302823781967163, + "learning_rate": 0.00013485575518855703, + "loss": 1.17, + "step": 11311 + }, + { + "epoch": 0.4051068096764374, + "grad_norm": 2.1826651096343994, + "learning_rate": 0.00013484488334566488, + "loss": 1.0996, + "step": 11312 + }, + { + "epoch": 0.4051426218060057, + "grad_norm": 1.5029515027999878, + "learning_rate": 0.00013483401103397982, + "loss": 1.1608, + "step": 11313 + }, + { + "epoch": 0.40517843393557396, + "grad_norm": 1.557921290397644, + "learning_rate": 0.00013482313825364804, + "loss": 1.0779, + "step": 11314 + }, + { + "epoch": 0.4052142460651423, + "grad_norm": 1.8561131954193115, + "learning_rate": 0.00013481226500481588, + "loss": 1.2093, + "step": 11315 + }, + { + "epoch": 0.40525005819471055, + "grad_norm": 1.7163931131362915, + "learning_rate": 0.00013480139128762956, + "loss": 1.2047, + "step": 11316 + }, + { + "epoch": 0.4052858703242788, + "grad_norm": 1.5294655561447144, + "learning_rate": 0.00013479051710223544, + "loss": 1.1982, + "step": 11317 + }, + { + "epoch": 0.40532168245384714, + "grad_norm": 1.3223471641540527, + "learning_rate": 0.00013477964244877977, + "loss": 1.1657, + "step": 11318 + }, + { + "epoch": 0.4053574945834154, + "grad_norm": 1.2408074140548706, + "learning_rate": 0.0001347687673274089, + "loss": 0.95, + "step": 11319 + }, + { + "epoch": 0.4053933067129837, + "grad_norm": 1.2678711414337158, + "learning_rate": 0.00013475789173826908, + "loss": 1.1373, + "step": 11320 + }, + { + "epoch": 0.40542911884255195, + "grad_norm": 1.3221596479415894, + "learning_rate": 0.0001347470156815067, + "loss": 1.1502, + "step": 11321 + }, + { + "epoch": 0.4054649309721203, + "grad_norm": 1.6059190034866333, + "learning_rate": 0.000134736139157268, + "loss": 1.2597, + "step": 11322 + }, + { + "epoch": 0.40550074310168854, + "grad_norm": 1.7549272775650024, + "learning_rate": 0.0001347252621656994, + "loss": 1.1935, + "step": 11323 + }, + { + "epoch": 0.4055365552312568, + "grad_norm": 1.5481117963790894, + "learning_rate": 0.00013471438470694715, + "loss": 1.2748, + "step": 11324 + }, + { + "epoch": 0.40557236736082514, + "grad_norm": 2.0800321102142334, + "learning_rate": 0.00013470350678115763, + "loss": 1.0712, + "step": 11325 + }, + { + "epoch": 0.4056081794903934, + "grad_norm": 1.5459129810333252, + "learning_rate": 0.00013469262838847724, + "loss": 0.9715, + "step": 11326 + }, + { + "epoch": 0.4056439916199617, + "grad_norm": 1.6255378723144531, + "learning_rate": 0.00013468174952905223, + "loss": 0.9993, + "step": 11327 + }, + { + "epoch": 0.40567980374952994, + "grad_norm": 1.815807819366455, + "learning_rate": 0.00013467087020302906, + "loss": 1.0664, + "step": 11328 + }, + { + "epoch": 0.40571561587909827, + "grad_norm": 1.6389068365097046, + "learning_rate": 0.00013465999041055405, + "loss": 1.0721, + "step": 11329 + }, + { + "epoch": 0.40575142800866654, + "grad_norm": 1.5020796060562134, + "learning_rate": 0.00013464911015177356, + "loss": 1.0391, + "step": 11330 + }, + { + "epoch": 0.4057872401382348, + "grad_norm": 1.4255144596099854, + "learning_rate": 0.000134638229426834, + "loss": 1.1891, + "step": 11331 + }, + { + "epoch": 0.40582305226780313, + "grad_norm": 1.4715979099273682, + "learning_rate": 0.0001346273482358817, + "loss": 1.0909, + "step": 11332 + }, + { + "epoch": 0.4058588643973714, + "grad_norm": 1.4732025861740112, + "learning_rate": 0.00013461646657906315, + "loss": 1.2162, + "step": 11333 + }, + { + "epoch": 0.40589467652693967, + "grad_norm": 1.6134861707687378, + "learning_rate": 0.00013460558445652467, + "loss": 1.4531, + "step": 11334 + }, + { + "epoch": 0.40593048865650794, + "grad_norm": 1.3869401216506958, + "learning_rate": 0.0001345947018684127, + "loss": 1.2491, + "step": 11335 + }, + { + "epoch": 0.40596630078607626, + "grad_norm": 1.3888612985610962, + "learning_rate": 0.00013458381881487362, + "loss": 1.0851, + "step": 11336 + }, + { + "epoch": 0.40600211291564453, + "grad_norm": 1.5619386434555054, + "learning_rate": 0.0001345729352960539, + "loss": 1.0101, + "step": 11337 + }, + { + "epoch": 0.4060379250452128, + "grad_norm": 1.4530189037322998, + "learning_rate": 0.00013456205131209988, + "loss": 1.2437, + "step": 11338 + }, + { + "epoch": 0.4060737371747811, + "grad_norm": 1.589477300643921, + "learning_rate": 0.0001345511668631581, + "loss": 1.0669, + "step": 11339 + }, + { + "epoch": 0.4061095493043494, + "grad_norm": 1.4261202812194824, + "learning_rate": 0.0001345402819493749, + "loss": 1.2052, + "step": 11340 + }, + { + "epoch": 0.40614536143391766, + "grad_norm": 1.412002444267273, + "learning_rate": 0.00013452939657089677, + "loss": 1.135, + "step": 11341 + }, + { + "epoch": 0.40618117356348593, + "grad_norm": 1.4420788288116455, + "learning_rate": 0.00013451851072787013, + "loss": 0.9441, + "step": 11342 + }, + { + "epoch": 0.40621698569305426, + "grad_norm": 1.547714352607727, + "learning_rate": 0.00013450762442044148, + "loss": 1.0174, + "step": 11343 + }, + { + "epoch": 0.4062527978226225, + "grad_norm": 1.3132750988006592, + "learning_rate": 0.00013449673764875724, + "loss": 1.0524, + "step": 11344 + }, + { + "epoch": 0.4062886099521908, + "grad_norm": 1.3150289058685303, + "learning_rate": 0.00013448585041296392, + "loss": 1.1367, + "step": 11345 + }, + { + "epoch": 0.4063244220817591, + "grad_norm": 1.27207350730896, + "learning_rate": 0.00013447496271320794, + "loss": 1.0872, + "step": 11346 + }, + { + "epoch": 0.4063602342113274, + "grad_norm": 1.4751765727996826, + "learning_rate": 0.00013446407454963582, + "loss": 1.1808, + "step": 11347 + }, + { + "epoch": 0.40639604634089566, + "grad_norm": 1.899837851524353, + "learning_rate": 0.00013445318592239405, + "loss": 1.1409, + "step": 11348 + }, + { + "epoch": 0.4064318584704639, + "grad_norm": 1.5762003660202026, + "learning_rate": 0.00013444229683162904, + "loss": 1.2142, + "step": 11349 + }, + { + "epoch": 0.40646767060003225, + "grad_norm": 1.8556190729141235, + "learning_rate": 0.00013443140727748738, + "loss": 1.104, + "step": 11350 + }, + { + "epoch": 0.4065034827296005, + "grad_norm": 1.663715124130249, + "learning_rate": 0.0001344205172601156, + "loss": 1.1905, + "step": 11351 + }, + { + "epoch": 0.4065392948591688, + "grad_norm": 1.7598750591278076, + "learning_rate": 0.00013440962677966012, + "loss": 0.9264, + "step": 11352 + }, + { + "epoch": 0.4065751069887371, + "grad_norm": 1.334956169128418, + "learning_rate": 0.0001343987358362675, + "loss": 1.309, + "step": 11353 + }, + { + "epoch": 0.4066109191183054, + "grad_norm": 1.5323807001113892, + "learning_rate": 0.00013438784443008426, + "loss": 1.0462, + "step": 11354 + }, + { + "epoch": 0.40664673124787365, + "grad_norm": 1.3937528133392334, + "learning_rate": 0.00013437695256125694, + "loss": 1.2687, + "step": 11355 + }, + { + "epoch": 0.4066825433774419, + "grad_norm": 1.6691933870315552, + "learning_rate": 0.00013436606022993207, + "loss": 1.1364, + "step": 11356 + }, + { + "epoch": 0.40671835550701024, + "grad_norm": 1.5632743835449219, + "learning_rate": 0.00013435516743625617, + "loss": 1.3979, + "step": 11357 + }, + { + "epoch": 0.4067541676365785, + "grad_norm": 1.643844723701477, + "learning_rate": 0.0001343442741803758, + "loss": 1.1255, + "step": 11358 + }, + { + "epoch": 0.4067899797661468, + "grad_norm": 1.3671529293060303, + "learning_rate": 0.00013433338046243753, + "loss": 1.1859, + "step": 11359 + }, + { + "epoch": 0.4068257918957151, + "grad_norm": 1.8310626745224, + "learning_rate": 0.0001343224862825879, + "loss": 1.1372, + "step": 11360 + }, + { + "epoch": 0.4068616040252834, + "grad_norm": 1.2512379884719849, + "learning_rate": 0.00013431159164097354, + "loss": 1.1946, + "step": 11361 + }, + { + "epoch": 0.40689741615485164, + "grad_norm": 1.6334959268569946, + "learning_rate": 0.0001343006965377409, + "loss": 0.8866, + "step": 11362 + }, + { + "epoch": 0.4069332282844199, + "grad_norm": 1.862403392791748, + "learning_rate": 0.00013428980097303668, + "loss": 1.2941, + "step": 11363 + }, + { + "epoch": 0.40696904041398824, + "grad_norm": 1.697129726409912, + "learning_rate": 0.0001342789049470074, + "loss": 1.1328, + "step": 11364 + }, + { + "epoch": 0.4070048525435565, + "grad_norm": 1.3880308866500854, + "learning_rate": 0.0001342680084597997, + "loss": 0.9788, + "step": 11365 + }, + { + "epoch": 0.4070406646731248, + "grad_norm": 1.6186320781707764, + "learning_rate": 0.00013425711151156014, + "loss": 1.3387, + "step": 11366 + }, + { + "epoch": 0.4070764768026931, + "grad_norm": 1.5264357328414917, + "learning_rate": 0.00013424621410243533, + "loss": 1.1765, + "step": 11367 + }, + { + "epoch": 0.40711228893226137, + "grad_norm": 1.7230759859085083, + "learning_rate": 0.00013423531623257189, + "loss": 1.025, + "step": 11368 + }, + { + "epoch": 0.40714810106182964, + "grad_norm": 1.654958724975586, + "learning_rate": 0.0001342244179021164, + "loss": 1.3482, + "step": 11369 + }, + { + "epoch": 0.4071839131913979, + "grad_norm": 2.1714987754821777, + "learning_rate": 0.00013421351911121554, + "loss": 1.1688, + "step": 11370 + }, + { + "epoch": 0.40721972532096623, + "grad_norm": 1.7491713762283325, + "learning_rate": 0.00013420261986001587, + "loss": 1.2582, + "step": 11371 + }, + { + "epoch": 0.4072555374505345, + "grad_norm": 1.8208177089691162, + "learning_rate": 0.00013419172014866412, + "loss": 1.1266, + "step": 11372 + }, + { + "epoch": 0.40729134958010277, + "grad_norm": 1.6185930967330933, + "learning_rate": 0.00013418081997730686, + "loss": 1.2086, + "step": 11373 + }, + { + "epoch": 0.4073271617096711, + "grad_norm": 1.4098742008209229, + "learning_rate": 0.00013416991934609075, + "loss": 1.1398, + "step": 11374 + }, + { + "epoch": 0.40736297383923936, + "grad_norm": 2.235719680786133, + "learning_rate": 0.00013415901825516248, + "loss": 1.1536, + "step": 11375 + }, + { + "epoch": 0.40739878596880763, + "grad_norm": 1.707482099533081, + "learning_rate": 0.00013414811670466864, + "loss": 1.2034, + "step": 11376 + }, + { + "epoch": 0.4074345980983759, + "grad_norm": 1.1902461051940918, + "learning_rate": 0.00013413721469475597, + "loss": 1.0493, + "step": 11377 + }, + { + "epoch": 0.4074704102279442, + "grad_norm": 1.6310187578201294, + "learning_rate": 0.00013412631222557112, + "loss": 1.1727, + "step": 11378 + }, + { + "epoch": 0.4075062223575125, + "grad_norm": 1.2751080989837646, + "learning_rate": 0.00013411540929726072, + "loss": 1.0088, + "step": 11379 + }, + { + "epoch": 0.40754203448708076, + "grad_norm": 1.3744661808013916, + "learning_rate": 0.0001341045059099715, + "loss": 1.2793, + "step": 11380 + }, + { + "epoch": 0.4075778466166491, + "grad_norm": 2.3720827102661133, + "learning_rate": 0.00013409360206385017, + "loss": 1.2233, + "step": 11381 + }, + { + "epoch": 0.40761365874621736, + "grad_norm": 1.7099735736846924, + "learning_rate": 0.00013408269775904338, + "loss": 1.2761, + "step": 11382 + }, + { + "epoch": 0.4076494708757856, + "grad_norm": 1.6578395366668701, + "learning_rate": 0.00013407179299569787, + "loss": 1.0745, + "step": 11383 + }, + { + "epoch": 0.4076852830053539, + "grad_norm": 2.5661158561706543, + "learning_rate": 0.00013406088777396033, + "loss": 1.1282, + "step": 11384 + }, + { + "epoch": 0.4077210951349222, + "grad_norm": 1.3150012493133545, + "learning_rate": 0.00013404998209397748, + "loss": 1.2343, + "step": 11385 + }, + { + "epoch": 0.4077569072644905, + "grad_norm": 1.886649250984192, + "learning_rate": 0.00013403907595589605, + "loss": 1.1746, + "step": 11386 + }, + { + "epoch": 0.40779271939405876, + "grad_norm": 1.7254194021224976, + "learning_rate": 0.0001340281693598627, + "loss": 1.2409, + "step": 11387 + }, + { + "epoch": 0.407828531523627, + "grad_norm": 1.5671777725219727, + "learning_rate": 0.0001340172623060243, + "loss": 0.972, + "step": 11388 + }, + { + "epoch": 0.40786434365319535, + "grad_norm": 1.8005319833755493, + "learning_rate": 0.0001340063547945275, + "loss": 1.2184, + "step": 11389 + }, + { + "epoch": 0.4079001557827636, + "grad_norm": 2.0667576789855957, + "learning_rate": 0.00013399544682551903, + "loss": 1.2225, + "step": 11390 + }, + { + "epoch": 0.4079359679123319, + "grad_norm": 1.4095345735549927, + "learning_rate": 0.00013398453839914574, + "loss": 1.2117, + "step": 11391 + }, + { + "epoch": 0.4079717800419002, + "grad_norm": 2.054354429244995, + "learning_rate": 0.00013397362951555425, + "loss": 1.1558, + "step": 11392 + }, + { + "epoch": 0.4080075921714685, + "grad_norm": 1.3406370878219604, + "learning_rate": 0.00013396272017489143, + "loss": 1.1296, + "step": 11393 + }, + { + "epoch": 0.40804340430103675, + "grad_norm": 1.8360228538513184, + "learning_rate": 0.000133951810377304, + "loss": 1.2085, + "step": 11394 + }, + { + "epoch": 0.408079216430605, + "grad_norm": 1.1478716135025024, + "learning_rate": 0.00013394090012293879, + "loss": 1.2746, + "step": 11395 + }, + { + "epoch": 0.40811502856017334, + "grad_norm": 2.196234941482544, + "learning_rate": 0.0001339299894119425, + "loss": 1.2233, + "step": 11396 + }, + { + "epoch": 0.4081508406897416, + "grad_norm": 1.5769718885421753, + "learning_rate": 0.00013391907824446202, + "loss": 1.1457, + "step": 11397 + }, + { + "epoch": 0.4081866528193099, + "grad_norm": 1.3413803577423096, + "learning_rate": 0.00013390816662064406, + "loss": 1.1574, + "step": 11398 + }, + { + "epoch": 0.4082224649488782, + "grad_norm": 1.4391398429870605, + "learning_rate": 0.00013389725454063549, + "loss": 1.1083, + "step": 11399 + }, + { + "epoch": 0.4082582770784465, + "grad_norm": 2.0179800987243652, + "learning_rate": 0.00013388634200458305, + "loss": 1.23, + "step": 11400 + }, + { + "epoch": 0.40829408920801474, + "grad_norm": 2.166086435317993, + "learning_rate": 0.00013387542901263362, + "loss": 1.187, + "step": 11401 + }, + { + "epoch": 0.408329901337583, + "grad_norm": 1.3352351188659668, + "learning_rate": 0.00013386451556493396, + "loss": 1.3955, + "step": 11402 + }, + { + "epoch": 0.40836571346715134, + "grad_norm": 1.386296033859253, + "learning_rate": 0.00013385360166163094, + "loss": 1.1935, + "step": 11403 + }, + { + "epoch": 0.4084015255967196, + "grad_norm": 1.845186471939087, + "learning_rate": 0.00013384268730287136, + "loss": 1.1457, + "step": 11404 + }, + { + "epoch": 0.4084373377262879, + "grad_norm": 1.4966092109680176, + "learning_rate": 0.0001338317724888021, + "loss": 0.9078, + "step": 11405 + }, + { + "epoch": 0.4084731498558562, + "grad_norm": 1.5690648555755615, + "learning_rate": 0.00013382085721956997, + "loss": 1.2362, + "step": 11406 + }, + { + "epoch": 0.40850896198542447, + "grad_norm": 1.3675588369369507, + "learning_rate": 0.00013380994149532181, + "loss": 1.0551, + "step": 11407 + }, + { + "epoch": 0.40854477411499274, + "grad_norm": 1.3035943508148193, + "learning_rate": 0.00013379902531620455, + "loss": 1.1983, + "step": 11408 + }, + { + "epoch": 0.408580586244561, + "grad_norm": 1.780292272567749, + "learning_rate": 0.00013378810868236497, + "loss": 1.2395, + "step": 11409 + }, + { + "epoch": 0.40861639837412933, + "grad_norm": 1.5090771913528442, + "learning_rate": 0.00013377719159394998, + "loss": 1.2172, + "step": 11410 + }, + { + "epoch": 0.4086522105036976, + "grad_norm": 1.4563946723937988, + "learning_rate": 0.00013376627405110644, + "loss": 1.0707, + "step": 11411 + }, + { + "epoch": 0.40868802263326587, + "grad_norm": 1.6102405786514282, + "learning_rate": 0.00013375535605398127, + "loss": 1.1609, + "step": 11412 + }, + { + "epoch": 0.4087238347628342, + "grad_norm": 1.7599645853042603, + "learning_rate": 0.00013374443760272127, + "loss": 1.082, + "step": 11413 + }, + { + "epoch": 0.40875964689240246, + "grad_norm": 1.5818537473678589, + "learning_rate": 0.0001337335186974734, + "loss": 1.1468, + "step": 11414 + }, + { + "epoch": 0.40879545902197073, + "grad_norm": 1.2001920938491821, + "learning_rate": 0.00013372259933838458, + "loss": 1.1002, + "step": 11415 + }, + { + "epoch": 0.408831271151539, + "grad_norm": 1.559090256690979, + "learning_rate": 0.00013371167952560168, + "loss": 0.9405, + "step": 11416 + }, + { + "epoch": 0.4088670832811073, + "grad_norm": 1.9057070016860962, + "learning_rate": 0.00013370075925927158, + "loss": 1.0554, + "step": 11417 + }, + { + "epoch": 0.4089028954106756, + "grad_norm": 1.5240813493728638, + "learning_rate": 0.00013368983853954126, + "loss": 1.1233, + "step": 11418 + }, + { + "epoch": 0.40893870754024386, + "grad_norm": 1.514455795288086, + "learning_rate": 0.00013367891736655764, + "loss": 1.1532, + "step": 11419 + }, + { + "epoch": 0.4089745196698122, + "grad_norm": 1.6028668880462646, + "learning_rate": 0.0001336679957404676, + "loss": 1.2145, + "step": 11420 + }, + { + "epoch": 0.40901033179938046, + "grad_norm": 1.3136330842971802, + "learning_rate": 0.00013365707366141814, + "loss": 1.2469, + "step": 11421 + }, + { + "epoch": 0.4090461439289487, + "grad_norm": 1.7450716495513916, + "learning_rate": 0.00013364615112955612, + "loss": 1.1508, + "step": 11422 + }, + { + "epoch": 0.409081956058517, + "grad_norm": 1.6917316913604736, + "learning_rate": 0.0001336352281450286, + "loss": 1.247, + "step": 11423 + }, + { + "epoch": 0.4091177681880853, + "grad_norm": 1.4504810571670532, + "learning_rate": 0.0001336243047079824, + "loss": 1.1923, + "step": 11424 + }, + { + "epoch": 0.4091535803176536, + "grad_norm": 1.684409737586975, + "learning_rate": 0.00013361338081856457, + "loss": 1.271, + "step": 11425 + }, + { + "epoch": 0.40918939244722186, + "grad_norm": 1.53290855884552, + "learning_rate": 0.0001336024564769221, + "loss": 0.9833, + "step": 11426 + }, + { + "epoch": 0.4092252045767902, + "grad_norm": 1.4636331796646118, + "learning_rate": 0.00013359153168320188, + "loss": 0.9723, + "step": 11427 + }, + { + "epoch": 0.40926101670635845, + "grad_norm": 1.579563021659851, + "learning_rate": 0.00013358060643755098, + "loss": 1.2457, + "step": 11428 + }, + { + "epoch": 0.4092968288359267, + "grad_norm": 1.2723549604415894, + "learning_rate": 0.00013356968074011626, + "loss": 1.1158, + "step": 11429 + }, + { + "epoch": 0.409332640965495, + "grad_norm": 1.5476524829864502, + "learning_rate": 0.00013355875459104485, + "loss": 1.2351, + "step": 11430 + }, + { + "epoch": 0.4093684530950633, + "grad_norm": 1.5317386388778687, + "learning_rate": 0.00013354782799048366, + "loss": 1.0235, + "step": 11431 + }, + { + "epoch": 0.4094042652246316, + "grad_norm": 1.5430917739868164, + "learning_rate": 0.00013353690093857972, + "loss": 1.1692, + "step": 11432 + }, + { + "epoch": 0.40944007735419985, + "grad_norm": 1.6842831373214722, + "learning_rate": 0.00013352597343548004, + "loss": 1.135, + "step": 11433 + }, + { + "epoch": 0.4094758894837682, + "grad_norm": 1.5470572710037231, + "learning_rate": 0.00013351504548133166, + "loss": 1.3018, + "step": 11434 + }, + { + "epoch": 0.40951170161333644, + "grad_norm": 2.2223410606384277, + "learning_rate": 0.00013350411707628153, + "loss": 1.2905, + "step": 11435 + }, + { + "epoch": 0.4095475137429047, + "grad_norm": 1.3507447242736816, + "learning_rate": 0.00013349318822047674, + "loss": 1.1255, + "step": 11436 + }, + { + "epoch": 0.409583325872473, + "grad_norm": 1.3746464252471924, + "learning_rate": 0.00013348225891406432, + "loss": 1.0984, + "step": 11437 + }, + { + "epoch": 0.4096191380020413, + "grad_norm": 2.2940104007720947, + "learning_rate": 0.00013347132915719127, + "loss": 1.4146, + "step": 11438 + }, + { + "epoch": 0.4096549501316096, + "grad_norm": 1.6638449430465698, + "learning_rate": 0.0001334603989500047, + "loss": 1.1003, + "step": 11439 + }, + { + "epoch": 0.40969076226117784, + "grad_norm": 1.4991264343261719, + "learning_rate": 0.00013344946829265157, + "loss": 1.1708, + "step": 11440 + }, + { + "epoch": 0.40972657439074617, + "grad_norm": 2.2569453716278076, + "learning_rate": 0.000133438537185279, + "loss": 1.3268, + "step": 11441 + }, + { + "epoch": 0.40976238652031444, + "grad_norm": 1.4841902256011963, + "learning_rate": 0.00013342760562803406, + "loss": 1.1892, + "step": 11442 + }, + { + "epoch": 0.4097981986498827, + "grad_norm": 2.1264946460723877, + "learning_rate": 0.0001334166736210638, + "loss": 1.222, + "step": 11443 + }, + { + "epoch": 0.409834010779451, + "grad_norm": 1.4543956518173218, + "learning_rate": 0.00013340574116451533, + "loss": 1.0253, + "step": 11444 + }, + { + "epoch": 0.4098698229090193, + "grad_norm": 1.5149650573730469, + "learning_rate": 0.0001333948082585357, + "loss": 1.1051, + "step": 11445 + }, + { + "epoch": 0.40990563503858757, + "grad_norm": 1.6119897365570068, + "learning_rate": 0.00013338387490327195, + "loss": 1.1161, + "step": 11446 + }, + { + "epoch": 0.40994144716815584, + "grad_norm": 1.290800929069519, + "learning_rate": 0.00013337294109887123, + "loss": 1.0933, + "step": 11447 + }, + { + "epoch": 0.40997725929772416, + "grad_norm": 1.4090884923934937, + "learning_rate": 0.0001333620068454807, + "loss": 1.0999, + "step": 11448 + }, + { + "epoch": 0.41001307142729243, + "grad_norm": 1.7297462224960327, + "learning_rate": 0.00013335107214324733, + "loss": 1.3487, + "step": 11449 + }, + { + "epoch": 0.4100488835568607, + "grad_norm": 1.687380075454712, + "learning_rate": 0.00013334013699231836, + "loss": 1.0679, + "step": 11450 + }, + { + "epoch": 0.41008469568642897, + "grad_norm": 1.810742974281311, + "learning_rate": 0.0001333292013928408, + "loss": 1.3049, + "step": 11451 + }, + { + "epoch": 0.4101205078159973, + "grad_norm": 1.6850972175598145, + "learning_rate": 0.00013331826534496188, + "loss": 1.2078, + "step": 11452 + }, + { + "epoch": 0.41015631994556556, + "grad_norm": 1.6230109930038452, + "learning_rate": 0.00013330732884882866, + "loss": 1.1203, + "step": 11453 + }, + { + "epoch": 0.41019213207513383, + "grad_norm": 2.221991777420044, + "learning_rate": 0.0001332963919045883, + "loss": 1.3049, + "step": 11454 + }, + { + "epoch": 0.41022794420470216, + "grad_norm": 1.5486077070236206, + "learning_rate": 0.00013328545451238793, + "loss": 1.1256, + "step": 11455 + }, + { + "epoch": 0.4102637563342704, + "grad_norm": 1.7231965065002441, + "learning_rate": 0.00013327451667237468, + "loss": 1.0252, + "step": 11456 + }, + { + "epoch": 0.4102995684638387, + "grad_norm": 1.2533962726593018, + "learning_rate": 0.00013326357838469574, + "loss": 1.2298, + "step": 11457 + }, + { + "epoch": 0.41033538059340696, + "grad_norm": 1.3686883449554443, + "learning_rate": 0.0001332526396494983, + "loss": 1.1158, + "step": 11458 + }, + { + "epoch": 0.4103711927229753, + "grad_norm": 1.558911919593811, + "learning_rate": 0.00013324170046692942, + "loss": 1.1949, + "step": 11459 + }, + { + "epoch": 0.41040700485254356, + "grad_norm": 1.6708871126174927, + "learning_rate": 0.00013323076083713637, + "loss": 1.0893, + "step": 11460 + }, + { + "epoch": 0.4104428169821118, + "grad_norm": 2.0062458515167236, + "learning_rate": 0.00013321982076026632, + "loss": 1.2773, + "step": 11461 + }, + { + "epoch": 0.41047862911168015, + "grad_norm": 1.3943290710449219, + "learning_rate": 0.0001332088802364664, + "loss": 1.2233, + "step": 11462 + }, + { + "epoch": 0.4105144412412484, + "grad_norm": 1.5180110931396484, + "learning_rate": 0.00013319793926588387, + "loss": 0.9158, + "step": 11463 + }, + { + "epoch": 0.4105502533708167, + "grad_norm": 1.658469796180725, + "learning_rate": 0.00013318699784866585, + "loss": 1.2123, + "step": 11464 + }, + { + "epoch": 0.41058606550038496, + "grad_norm": 1.3616617918014526, + "learning_rate": 0.0001331760559849596, + "loss": 0.933, + "step": 11465 + }, + { + "epoch": 0.4106218776299533, + "grad_norm": 1.4099726676940918, + "learning_rate": 0.0001331651136749123, + "loss": 1.0247, + "step": 11466 + }, + { + "epoch": 0.41065768975952155, + "grad_norm": 1.6402201652526855, + "learning_rate": 0.0001331541709186712, + "loss": 1.0643, + "step": 11467 + }, + { + "epoch": 0.4106935018890898, + "grad_norm": 1.6219416856765747, + "learning_rate": 0.00013314322771638346, + "loss": 1.4402, + "step": 11468 + }, + { + "epoch": 0.41072931401865814, + "grad_norm": 1.3757586479187012, + "learning_rate": 0.00013313228406819637, + "loss": 1.1439, + "step": 11469 + }, + { + "epoch": 0.4107651261482264, + "grad_norm": 1.4983913898468018, + "learning_rate": 0.00013312133997425712, + "loss": 1.2458, + "step": 11470 + }, + { + "epoch": 0.4108009382777947, + "grad_norm": 1.5508441925048828, + "learning_rate": 0.00013311039543471297, + "loss": 1.1539, + "step": 11471 + }, + { + "epoch": 0.41083675040736295, + "grad_norm": 1.4219183921813965, + "learning_rate": 0.00013309945044971116, + "loss": 1.0581, + "step": 11472 + }, + { + "epoch": 0.4108725625369313, + "grad_norm": 1.5791375637054443, + "learning_rate": 0.00013308850501939892, + "loss": 1.1459, + "step": 11473 + }, + { + "epoch": 0.41090837466649954, + "grad_norm": 1.3305336236953735, + "learning_rate": 0.00013307755914392357, + "loss": 0.9805, + "step": 11474 + }, + { + "epoch": 0.4109441867960678, + "grad_norm": 1.5876463651657104, + "learning_rate": 0.0001330666128234323, + "loss": 1.2901, + "step": 11475 + }, + { + "epoch": 0.41097999892563614, + "grad_norm": 1.4341071844100952, + "learning_rate": 0.0001330556660580724, + "loss": 0.9474, + "step": 11476 + }, + { + "epoch": 0.4110158110552044, + "grad_norm": 1.4274110794067383, + "learning_rate": 0.00013304471884799116, + "loss": 1.2432, + "step": 11477 + }, + { + "epoch": 0.4110516231847727, + "grad_norm": 1.3938705921173096, + "learning_rate": 0.00013303377119333587, + "loss": 1.2196, + "step": 11478 + }, + { + "epoch": 0.41108743531434094, + "grad_norm": 1.3960814476013184, + "learning_rate": 0.0001330228230942538, + "loss": 1.004, + "step": 11479 + }, + { + "epoch": 0.41112324744390927, + "grad_norm": 1.7635775804519653, + "learning_rate": 0.00013301187455089223, + "loss": 1.456, + "step": 11480 + }, + { + "epoch": 0.41115905957347754, + "grad_norm": 1.5366168022155762, + "learning_rate": 0.00013300092556339847, + "loss": 1.3588, + "step": 11481 + }, + { + "epoch": 0.4111948717030458, + "grad_norm": 1.4439380168914795, + "learning_rate": 0.00013298997613191978, + "loss": 1.229, + "step": 11482 + }, + { + "epoch": 0.41123068383261413, + "grad_norm": 1.4736065864562988, + "learning_rate": 0.00013297902625660358, + "loss": 0.9901, + "step": 11483 + }, + { + "epoch": 0.4112664959621824, + "grad_norm": 1.3806006908416748, + "learning_rate": 0.00013296807593759708, + "loss": 1.1811, + "step": 11484 + }, + { + "epoch": 0.41130230809175067, + "grad_norm": 1.4885565042495728, + "learning_rate": 0.0001329571251750477, + "loss": 1.2177, + "step": 11485 + }, + { + "epoch": 0.41133812022131894, + "grad_norm": 1.386436939239502, + "learning_rate": 0.00013294617396910266, + "loss": 1.2739, + "step": 11486 + }, + { + "epoch": 0.41137393235088726, + "grad_norm": 1.332343578338623, + "learning_rate": 0.00013293522231990935, + "loss": 1.2033, + "step": 11487 + }, + { + "epoch": 0.41140974448045553, + "grad_norm": 2.0568583011627197, + "learning_rate": 0.00013292427022761514, + "loss": 1.1906, + "step": 11488 + }, + { + "epoch": 0.4114455566100238, + "grad_norm": 1.2601057291030884, + "learning_rate": 0.0001329133176923673, + "loss": 1.1203, + "step": 11489 + }, + { + "epoch": 0.4114813687395921, + "grad_norm": 1.692683219909668, + "learning_rate": 0.00013290236471431326, + "loss": 1.1718, + "step": 11490 + }, + { + "epoch": 0.4115171808691604, + "grad_norm": 1.6077358722686768, + "learning_rate": 0.00013289141129360033, + "loss": 1.1577, + "step": 11491 + }, + { + "epoch": 0.41155299299872866, + "grad_norm": 2.251427173614502, + "learning_rate": 0.0001328804574303759, + "loss": 1.4328, + "step": 11492 + }, + { + "epoch": 0.41158880512829693, + "grad_norm": 1.6564087867736816, + "learning_rate": 0.0001328695031247873, + "loss": 1.2708, + "step": 11493 + }, + { + "epoch": 0.41162461725786526, + "grad_norm": 1.649369716644287, + "learning_rate": 0.00013285854837698195, + "loss": 1.1661, + "step": 11494 + }, + { + "epoch": 0.4116604293874335, + "grad_norm": 1.6267011165618896, + "learning_rate": 0.0001328475931871072, + "loss": 1.1666, + "step": 11495 + }, + { + "epoch": 0.4116962415170018, + "grad_norm": 1.3765634298324585, + "learning_rate": 0.0001328366375553105, + "loss": 0.9334, + "step": 11496 + }, + { + "epoch": 0.4117320536465701, + "grad_norm": 2.1156256198883057, + "learning_rate": 0.00013282568148173917, + "loss": 1.1472, + "step": 11497 + }, + { + "epoch": 0.4117678657761384, + "grad_norm": 1.6489038467407227, + "learning_rate": 0.00013281472496654064, + "loss": 1.0292, + "step": 11498 + }, + { + "epoch": 0.41180367790570666, + "grad_norm": 1.5967295169830322, + "learning_rate": 0.0001328037680098623, + "loss": 1.1207, + "step": 11499 + }, + { + "epoch": 0.4118394900352749, + "grad_norm": 1.444140076637268, + "learning_rate": 0.00013279281061185158, + "loss": 1.0746, + "step": 11500 + }, + { + "epoch": 0.41187530216484325, + "grad_norm": 1.6105090379714966, + "learning_rate": 0.0001327818527726559, + "loss": 1.1029, + "step": 11501 + }, + { + "epoch": 0.4119111142944115, + "grad_norm": 1.2947196960449219, + "learning_rate": 0.00013277089449242267, + "loss": 0.9659, + "step": 11502 + }, + { + "epoch": 0.4119469264239798, + "grad_norm": 1.5854629278182983, + "learning_rate": 0.00013275993577129932, + "loss": 1.3291, + "step": 11503 + }, + { + "epoch": 0.4119827385535481, + "grad_norm": 1.539975643157959, + "learning_rate": 0.0001327489766094333, + "loss": 0.9682, + "step": 11504 + }, + { + "epoch": 0.4120185506831164, + "grad_norm": 1.5488711595535278, + "learning_rate": 0.00013273801700697206, + "loss": 1.3484, + "step": 11505 + }, + { + "epoch": 0.41205436281268465, + "grad_norm": 1.3688145875930786, + "learning_rate": 0.00013272705696406302, + "loss": 1.1089, + "step": 11506 + }, + { + "epoch": 0.4120901749422529, + "grad_norm": 1.357352375984192, + "learning_rate": 0.00013271609648085367, + "loss": 1.1706, + "step": 11507 + }, + { + "epoch": 0.41212598707182124, + "grad_norm": 1.6051174402236938, + "learning_rate": 0.0001327051355574914, + "loss": 1.369, + "step": 11508 + }, + { + "epoch": 0.4121617992013895, + "grad_norm": 1.3780649900436401, + "learning_rate": 0.0001326941741941237, + "loss": 1.042, + "step": 11509 + }, + { + "epoch": 0.4121976113309578, + "grad_norm": 1.1763402223587036, + "learning_rate": 0.00013268321239089809, + "loss": 1.1052, + "step": 11510 + }, + { + "epoch": 0.4122334234605261, + "grad_norm": 1.6881073713302612, + "learning_rate": 0.00013267225014796202, + "loss": 1.0863, + "step": 11511 + }, + { + "epoch": 0.4122692355900944, + "grad_norm": 1.4142866134643555, + "learning_rate": 0.00013266128746546296, + "loss": 0.9352, + "step": 11512 + }, + { + "epoch": 0.41230504771966264, + "grad_norm": 1.3058613538742065, + "learning_rate": 0.0001326503243435484, + "loss": 1.102, + "step": 11513 + }, + { + "epoch": 0.4123408598492309, + "grad_norm": 1.3735226392745972, + "learning_rate": 0.00013263936078236586, + "loss": 0.9772, + "step": 11514 + }, + { + "epoch": 0.41237667197879924, + "grad_norm": 1.8561128377914429, + "learning_rate": 0.00013262839678206283, + "loss": 1.2501, + "step": 11515 + }, + { + "epoch": 0.4124124841083675, + "grad_norm": 1.3571009635925293, + "learning_rate": 0.00013261743234278678, + "loss": 1.2091, + "step": 11516 + }, + { + "epoch": 0.4124482962379358, + "grad_norm": 1.3356913328170776, + "learning_rate": 0.00013260646746468527, + "loss": 0.9806, + "step": 11517 + }, + { + "epoch": 0.4124841083675041, + "grad_norm": 1.7658920288085938, + "learning_rate": 0.0001325955021479058, + "loss": 1.2397, + "step": 11518 + }, + { + "epoch": 0.41251992049707237, + "grad_norm": 1.476793646812439, + "learning_rate": 0.00013258453639259586, + "loss": 1.0427, + "step": 11519 + }, + { + "epoch": 0.41255573262664064, + "grad_norm": 1.5970097780227661, + "learning_rate": 0.00013257357019890307, + "loss": 1.1106, + "step": 11520 + }, + { + "epoch": 0.4125915447562089, + "grad_norm": 1.63698410987854, + "learning_rate": 0.00013256260356697485, + "loss": 1.2785, + "step": 11521 + }, + { + "epoch": 0.41262735688577723, + "grad_norm": 1.5096495151519775, + "learning_rate": 0.00013255163649695886, + "loss": 1.2455, + "step": 11522 + }, + { + "epoch": 0.4126631690153455, + "grad_norm": 1.9204319715499878, + "learning_rate": 0.00013254066898900257, + "loss": 1.1747, + "step": 11523 + }, + { + "epoch": 0.41269898114491377, + "grad_norm": 1.670980453491211, + "learning_rate": 0.00013252970104325352, + "loss": 1.0265, + "step": 11524 + }, + { + "epoch": 0.4127347932744821, + "grad_norm": 1.3308067321777344, + "learning_rate": 0.00013251873265985936, + "loss": 1.1729, + "step": 11525 + }, + { + "epoch": 0.41277060540405036, + "grad_norm": 1.4035162925720215, + "learning_rate": 0.00013250776383896752, + "loss": 1.0967, + "step": 11526 + }, + { + "epoch": 0.41280641753361863, + "grad_norm": 1.2844692468643188, + "learning_rate": 0.00013249679458072572, + "loss": 1.1116, + "step": 11527 + }, + { + "epoch": 0.4128422296631869, + "grad_norm": 1.4011739492416382, + "learning_rate": 0.00013248582488528142, + "loss": 0.942, + "step": 11528 + }, + { + "epoch": 0.4128780417927552, + "grad_norm": 1.4617398977279663, + "learning_rate": 0.0001324748547527823, + "loss": 1.074, + "step": 11529 + }, + { + "epoch": 0.4129138539223235, + "grad_norm": 1.3982661962509155, + "learning_rate": 0.00013246388418337586, + "loss": 0.9966, + "step": 11530 + }, + { + "epoch": 0.41294966605189176, + "grad_norm": 1.5889053344726562, + "learning_rate": 0.00013245291317720974, + "loss": 1.0678, + "step": 11531 + }, + { + "epoch": 0.4129854781814601, + "grad_norm": 1.6692841053009033, + "learning_rate": 0.00013244194173443155, + "loss": 1.0412, + "step": 11532 + }, + { + "epoch": 0.41302129031102836, + "grad_norm": 1.5403544902801514, + "learning_rate": 0.00013243096985518887, + "loss": 1.0698, + "step": 11533 + }, + { + "epoch": 0.4130571024405966, + "grad_norm": 1.5184966325759888, + "learning_rate": 0.00013241999753962932, + "loss": 0.9627, + "step": 11534 + }, + { + "epoch": 0.4130929145701649, + "grad_norm": 1.5912410020828247, + "learning_rate": 0.00013240902478790052, + "loss": 1.0866, + "step": 11535 + }, + { + "epoch": 0.4131287266997332, + "grad_norm": 1.4641891717910767, + "learning_rate": 0.0001323980516001501, + "loss": 1.0962, + "step": 11536 + }, + { + "epoch": 0.4131645388293015, + "grad_norm": 1.5833688974380493, + "learning_rate": 0.00013238707797652569, + "loss": 1.2698, + "step": 11537 + }, + { + "epoch": 0.41320035095886976, + "grad_norm": 1.3427938222885132, + "learning_rate": 0.0001323761039171749, + "loss": 0.9449, + "step": 11538 + }, + { + "epoch": 0.4132361630884381, + "grad_norm": 1.50185227394104, + "learning_rate": 0.00013236512942224545, + "loss": 1.1329, + "step": 11539 + }, + { + "epoch": 0.41327197521800635, + "grad_norm": 1.7839784622192383, + "learning_rate": 0.0001323541544918849, + "loss": 1.3662, + "step": 11540 + }, + { + "epoch": 0.4133077873475746, + "grad_norm": 2.031745195388794, + "learning_rate": 0.00013234317912624093, + "loss": 1.1083, + "step": 11541 + }, + { + "epoch": 0.4133435994771429, + "grad_norm": 1.714531421661377, + "learning_rate": 0.0001323322033254612, + "loss": 1.1951, + "step": 11542 + }, + { + "epoch": 0.4133794116067112, + "grad_norm": 1.3913650512695312, + "learning_rate": 0.00013232122708969337, + "loss": 1.0555, + "step": 11543 + }, + { + "epoch": 0.4134152237362795, + "grad_norm": 1.5446761846542358, + "learning_rate": 0.00013231025041908514, + "loss": 1.3245, + "step": 11544 + }, + { + "epoch": 0.41345103586584775, + "grad_norm": 1.6801691055297852, + "learning_rate": 0.00013229927331378418, + "loss": 1.1235, + "step": 11545 + }, + { + "epoch": 0.4134868479954161, + "grad_norm": 1.7469172477722168, + "learning_rate": 0.0001322882957739381, + "loss": 1.1258, + "step": 11546 + }, + { + "epoch": 0.41352266012498434, + "grad_norm": 1.358953833580017, + "learning_rate": 0.00013227731779969472, + "loss": 0.9453, + "step": 11547 + }, + { + "epoch": 0.4135584722545526, + "grad_norm": 1.838191032409668, + "learning_rate": 0.00013226633939120164, + "loss": 1.0082, + "step": 11548 + }, + { + "epoch": 0.4135942843841209, + "grad_norm": 1.3360694646835327, + "learning_rate": 0.00013225536054860658, + "loss": 1.0804, + "step": 11549 + }, + { + "epoch": 0.4136300965136892, + "grad_norm": 1.883376121520996, + "learning_rate": 0.00013224438127205725, + "loss": 1.0047, + "step": 11550 + }, + { + "epoch": 0.4136659086432575, + "grad_norm": 1.3639518022537231, + "learning_rate": 0.0001322334015617014, + "loss": 1.1685, + "step": 11551 + }, + { + "epoch": 0.41370172077282574, + "grad_norm": 1.2572153806686401, + "learning_rate": 0.00013222242141768664, + "loss": 0.7138, + "step": 11552 + }, + { + "epoch": 0.41373753290239407, + "grad_norm": 1.3216936588287354, + "learning_rate": 0.00013221144084016082, + "loss": 1.0712, + "step": 11553 + }, + { + "epoch": 0.41377334503196234, + "grad_norm": 1.7282087802886963, + "learning_rate": 0.00013220045982927157, + "loss": 1.13, + "step": 11554 + }, + { + "epoch": 0.4138091571615306, + "grad_norm": 1.613604187965393, + "learning_rate": 0.00013218947838516672, + "loss": 1.2574, + "step": 11555 + }, + { + "epoch": 0.4138449692910989, + "grad_norm": 1.8538426160812378, + "learning_rate": 0.0001321784965079939, + "loss": 1.3474, + "step": 11556 + }, + { + "epoch": 0.4138807814206672, + "grad_norm": 1.4775124788284302, + "learning_rate": 0.00013216751419790096, + "loss": 1.0665, + "step": 11557 + }, + { + "epoch": 0.41391659355023547, + "grad_norm": 1.270676851272583, + "learning_rate": 0.00013215653145503558, + "loss": 1.1212, + "step": 11558 + }, + { + "epoch": 0.41395240567980374, + "grad_norm": 1.418536901473999, + "learning_rate": 0.00013214554827954556, + "loss": 1.0971, + "step": 11559 + }, + { + "epoch": 0.41398821780937206, + "grad_norm": 1.6598337888717651, + "learning_rate": 0.00013213456467157868, + "loss": 1.1371, + "step": 11560 + }, + { + "epoch": 0.41402402993894033, + "grad_norm": 1.4634809494018555, + "learning_rate": 0.00013212358063128266, + "loss": 1.0079, + "step": 11561 + }, + { + "epoch": 0.4140598420685086, + "grad_norm": 1.3802311420440674, + "learning_rate": 0.0001321125961588053, + "loss": 1.1373, + "step": 11562 + }, + { + "epoch": 0.41409565419807687, + "grad_norm": 1.7655688524246216, + "learning_rate": 0.00013210161125429436, + "loss": 1.2045, + "step": 11563 + }, + { + "epoch": 0.4141314663276452, + "grad_norm": 1.6903401613235474, + "learning_rate": 0.0001320906259178977, + "loss": 1.1994, + "step": 11564 + }, + { + "epoch": 0.41416727845721346, + "grad_norm": 1.460293173789978, + "learning_rate": 0.00013207964014976299, + "loss": 1.2527, + "step": 11565 + }, + { + "epoch": 0.41420309058678173, + "grad_norm": 1.6231828927993774, + "learning_rate": 0.00013206865395003816, + "loss": 0.9474, + "step": 11566 + }, + { + "epoch": 0.41423890271635005, + "grad_norm": 1.2558178901672363, + "learning_rate": 0.00013205766731887094, + "loss": 1.0396, + "step": 11567 + }, + { + "epoch": 0.4142747148459183, + "grad_norm": 2.3413069248199463, + "learning_rate": 0.00013204668025640915, + "loss": 1.2418, + "step": 11568 + }, + { + "epoch": 0.4143105269754866, + "grad_norm": 1.4645905494689941, + "learning_rate": 0.00013203569276280062, + "loss": 1.2103, + "step": 11569 + }, + { + "epoch": 0.41434633910505486, + "grad_norm": 1.2511111497879028, + "learning_rate": 0.00013202470483819316, + "loss": 1.0105, + "step": 11570 + }, + { + "epoch": 0.4143821512346232, + "grad_norm": 1.5456643104553223, + "learning_rate": 0.00013201371648273463, + "loss": 1.2599, + "step": 11571 + }, + { + "epoch": 0.41441796336419146, + "grad_norm": 1.7673511505126953, + "learning_rate": 0.00013200272769657283, + "loss": 1.1706, + "step": 11572 + }, + { + "epoch": 0.4144537754937597, + "grad_norm": 1.3965378999710083, + "learning_rate": 0.00013199173847985559, + "loss": 1.0385, + "step": 11573 + }, + { + "epoch": 0.41448958762332805, + "grad_norm": 1.5094232559204102, + "learning_rate": 0.0001319807488327308, + "loss": 1.0882, + "step": 11574 + }, + { + "epoch": 0.4145253997528963, + "grad_norm": 1.458139181137085, + "learning_rate": 0.00013196975875534624, + "loss": 1.2635, + "step": 11575 + }, + { + "epoch": 0.4145612118824646, + "grad_norm": 1.6929209232330322, + "learning_rate": 0.00013195876824784988, + "loss": 1.119, + "step": 11576 + }, + { + "epoch": 0.41459702401203286, + "grad_norm": 1.3912441730499268, + "learning_rate": 0.00013194777731038946, + "loss": 1.1068, + "step": 11577 + }, + { + "epoch": 0.4146328361416012, + "grad_norm": 1.9373853206634521, + "learning_rate": 0.00013193678594311295, + "loss": 1.1646, + "step": 11578 + }, + { + "epoch": 0.41466864827116945, + "grad_norm": 1.6081831455230713, + "learning_rate": 0.00013192579414616815, + "loss": 1.0199, + "step": 11579 + }, + { + "epoch": 0.4147044604007377, + "grad_norm": 2.2927463054656982, + "learning_rate": 0.000131914801919703, + "loss": 1.1478, + "step": 11580 + }, + { + "epoch": 0.41474027253030604, + "grad_norm": 1.5124168395996094, + "learning_rate": 0.0001319038092638653, + "loss": 1.2161, + "step": 11581 + }, + { + "epoch": 0.4147760846598743, + "grad_norm": 1.6365100145339966, + "learning_rate": 0.00013189281617880308, + "loss": 1.0049, + "step": 11582 + }, + { + "epoch": 0.4148118967894426, + "grad_norm": 1.3843350410461426, + "learning_rate": 0.0001318818226646641, + "loss": 1.0639, + "step": 11583 + }, + { + "epoch": 0.41484770891901085, + "grad_norm": 1.6839985847473145, + "learning_rate": 0.00013187082872159636, + "loss": 1.153, + "step": 11584 + }, + { + "epoch": 0.4148835210485792, + "grad_norm": 1.1824469566345215, + "learning_rate": 0.0001318598343497477, + "loss": 0.8799, + "step": 11585 + }, + { + "epoch": 0.41491933317814744, + "grad_norm": 1.4404369592666626, + "learning_rate": 0.00013184883954926607, + "loss": 0.9356, + "step": 11586 + }, + { + "epoch": 0.4149551453077157, + "grad_norm": 1.3034342527389526, + "learning_rate": 0.0001318378443202994, + "loss": 1.1133, + "step": 11587 + }, + { + "epoch": 0.414990957437284, + "grad_norm": 1.894585132598877, + "learning_rate": 0.00013182684866299557, + "loss": 1.1587, + "step": 11588 + }, + { + "epoch": 0.4150267695668523, + "grad_norm": 1.8299773931503296, + "learning_rate": 0.00013181585257750257, + "loss": 1.1173, + "step": 11589 + }, + { + "epoch": 0.4150625816964206, + "grad_norm": 1.5166057348251343, + "learning_rate": 0.0001318048560639683, + "loss": 1.0386, + "step": 11590 + }, + { + "epoch": 0.41509839382598884, + "grad_norm": 1.3933980464935303, + "learning_rate": 0.00013179385912254072, + "loss": 1.1601, + "step": 11591 + }, + { + "epoch": 0.41513420595555717, + "grad_norm": 2.0097463130950928, + "learning_rate": 0.00013178286175336777, + "loss": 1.0419, + "step": 11592 + }, + { + "epoch": 0.41517001808512544, + "grad_norm": 1.6696621179580688, + "learning_rate": 0.00013177186395659743, + "loss": 1.408, + "step": 11593 + }, + { + "epoch": 0.4152058302146937, + "grad_norm": 1.7444169521331787, + "learning_rate": 0.00013176086573237766, + "loss": 1.2467, + "step": 11594 + }, + { + "epoch": 0.415241642344262, + "grad_norm": 1.7135603427886963, + "learning_rate": 0.0001317498670808564, + "loss": 1.2124, + "step": 11595 + }, + { + "epoch": 0.4152774544738303, + "grad_norm": 1.535563349723816, + "learning_rate": 0.0001317388680021816, + "loss": 1.2933, + "step": 11596 + }, + { + "epoch": 0.41531326660339857, + "grad_norm": 1.5895172357559204, + "learning_rate": 0.00013172786849650133, + "loss": 1.2193, + "step": 11597 + }, + { + "epoch": 0.41534907873296684, + "grad_norm": 1.7656563520431519, + "learning_rate": 0.00013171686856396344, + "loss": 1.3269, + "step": 11598 + }, + { + "epoch": 0.41538489086253516, + "grad_norm": 1.8231850862503052, + "learning_rate": 0.00013170586820471605, + "loss": 1.1055, + "step": 11599 + }, + { + "epoch": 0.41542070299210343, + "grad_norm": 1.403834581375122, + "learning_rate": 0.00013169486741890706, + "loss": 1.155, + "step": 11600 + }, + { + "epoch": 0.4154565151216717, + "grad_norm": 1.7092188596725464, + "learning_rate": 0.0001316838662066845, + "loss": 1.0628, + "step": 11601 + }, + { + "epoch": 0.41549232725123997, + "grad_norm": 1.3617042303085327, + "learning_rate": 0.00013167286456819646, + "loss": 1.2255, + "step": 11602 + }, + { + "epoch": 0.4155281393808083, + "grad_norm": 1.271972417831421, + "learning_rate": 0.00013166186250359086, + "loss": 1.2229, + "step": 11603 + }, + { + "epoch": 0.41556395151037656, + "grad_norm": 1.6256684064865112, + "learning_rate": 0.00013165086001301575, + "loss": 1.1462, + "step": 11604 + }, + { + "epoch": 0.41559976363994483, + "grad_norm": 1.3217943906784058, + "learning_rate": 0.0001316398570966191, + "loss": 1.0848, + "step": 11605 + }, + { + "epoch": 0.41563557576951315, + "grad_norm": 1.414151906967163, + "learning_rate": 0.000131628853754549, + "loss": 1.3002, + "step": 11606 + }, + { + "epoch": 0.4156713878990814, + "grad_norm": 1.4055739641189575, + "learning_rate": 0.00013161784998695349, + "loss": 1.1388, + "step": 11607 + }, + { + "epoch": 0.4157072000286497, + "grad_norm": 1.993319034576416, + "learning_rate": 0.00013160684579398057, + "loss": 1.1512, + "step": 11608 + }, + { + "epoch": 0.41574301215821796, + "grad_norm": 1.316646933555603, + "learning_rate": 0.00013159584117577831, + "loss": 0.9858, + "step": 11609 + }, + { + "epoch": 0.4157788242877863, + "grad_norm": 1.346179723739624, + "learning_rate": 0.0001315848361324948, + "loss": 1.2374, + "step": 11610 + }, + { + "epoch": 0.41581463641735456, + "grad_norm": 1.5440714359283447, + "learning_rate": 0.000131573830664278, + "loss": 1.2878, + "step": 11611 + }, + { + "epoch": 0.4158504485469228, + "grad_norm": 1.486095905303955, + "learning_rate": 0.0001315628247712761, + "loss": 1.1565, + "step": 11612 + }, + { + "epoch": 0.41588626067649115, + "grad_norm": 1.540124535560608, + "learning_rate": 0.0001315518184536371, + "loss": 1.235, + "step": 11613 + }, + { + "epoch": 0.4159220728060594, + "grad_norm": 1.8534979820251465, + "learning_rate": 0.00013154081171150902, + "loss": 1.3698, + "step": 11614 + }, + { + "epoch": 0.4159578849356277, + "grad_norm": 1.4915058612823486, + "learning_rate": 0.00013152980454504007, + "loss": 1.2985, + "step": 11615 + }, + { + "epoch": 0.41599369706519596, + "grad_norm": 1.7693334817886353, + "learning_rate": 0.00013151879695437823, + "loss": 1.3801, + "step": 11616 + }, + { + "epoch": 0.4160295091947643, + "grad_norm": 1.5690200328826904, + "learning_rate": 0.00013150778893967165, + "loss": 1.4159, + "step": 11617 + }, + { + "epoch": 0.41606532132433255, + "grad_norm": 1.764380931854248, + "learning_rate": 0.0001314967805010684, + "loss": 1.2898, + "step": 11618 + }, + { + "epoch": 0.4161011334539008, + "grad_norm": 1.869052767753601, + "learning_rate": 0.0001314857716387166, + "loss": 1.2764, + "step": 11619 + }, + { + "epoch": 0.41613694558346914, + "grad_norm": 1.4806019067764282, + "learning_rate": 0.00013147476235276438, + "loss": 1.1403, + "step": 11620 + }, + { + "epoch": 0.4161727577130374, + "grad_norm": 1.197623372077942, + "learning_rate": 0.00013146375264335978, + "loss": 1.1664, + "step": 11621 + }, + { + "epoch": 0.4162085698426057, + "grad_norm": 1.8583018779754639, + "learning_rate": 0.00013145274251065103, + "loss": 1.2222, + "step": 11622 + }, + { + "epoch": 0.41624438197217395, + "grad_norm": 1.47785222530365, + "learning_rate": 0.00013144173195478616, + "loss": 1.1004, + "step": 11623 + }, + { + "epoch": 0.4162801941017423, + "grad_norm": 2.1759250164031982, + "learning_rate": 0.0001314307209759134, + "loss": 1.1303, + "step": 11624 + }, + { + "epoch": 0.41631600623131054, + "grad_norm": 1.7111594676971436, + "learning_rate": 0.00013141970957418074, + "loss": 1.1777, + "step": 11625 + }, + { + "epoch": 0.4163518183608788, + "grad_norm": 1.3479748964309692, + "learning_rate": 0.0001314086977497365, + "loss": 0.9966, + "step": 11626 + }, + { + "epoch": 0.41638763049044714, + "grad_norm": 1.6129834651947021, + "learning_rate": 0.0001313976855027287, + "loss": 1.0854, + "step": 11627 + }, + { + "epoch": 0.4164234426200154, + "grad_norm": 2.0185141563415527, + "learning_rate": 0.00013138667283330556, + "loss": 1.2965, + "step": 11628 + }, + { + "epoch": 0.4164592547495837, + "grad_norm": 1.6106027364730835, + "learning_rate": 0.00013137565974161524, + "loss": 1.4019, + "step": 11629 + }, + { + "epoch": 0.41649506687915194, + "grad_norm": 1.8513290882110596, + "learning_rate": 0.00013136464622780583, + "loss": 1.2957, + "step": 11630 + }, + { + "epoch": 0.41653087900872027, + "grad_norm": 1.835290789604187, + "learning_rate": 0.00013135363229202564, + "loss": 1.1697, + "step": 11631 + }, + { + "epoch": 0.41656669113828854, + "grad_norm": 1.8392423391342163, + "learning_rate": 0.0001313426179344227, + "loss": 1.0157, + "step": 11632 + }, + { + "epoch": 0.4166025032678568, + "grad_norm": 1.4039641618728638, + "learning_rate": 0.0001313316031551453, + "loss": 1.133, + "step": 11633 + }, + { + "epoch": 0.41663831539742513, + "grad_norm": 1.610543966293335, + "learning_rate": 0.00013132058795434158, + "loss": 1.2412, + "step": 11634 + }, + { + "epoch": 0.4166741275269934, + "grad_norm": 1.4131901264190674, + "learning_rate": 0.0001313095723321598, + "loss": 1.0816, + "step": 11635 + }, + { + "epoch": 0.41670993965656167, + "grad_norm": 1.3769910335540771, + "learning_rate": 0.00013129855628874805, + "loss": 1.201, + "step": 11636 + }, + { + "epoch": 0.41674575178612994, + "grad_norm": 1.6554574966430664, + "learning_rate": 0.0001312875398242546, + "loss": 1.2156, + "step": 11637 + }, + { + "epoch": 0.41678156391569826, + "grad_norm": 1.7545665502548218, + "learning_rate": 0.0001312765229388277, + "loss": 1.114, + "step": 11638 + }, + { + "epoch": 0.41681737604526653, + "grad_norm": 1.6350696086883545, + "learning_rate": 0.00013126550563261551, + "loss": 1.1454, + "step": 11639 + }, + { + "epoch": 0.4168531881748348, + "grad_norm": 1.474371075630188, + "learning_rate": 0.00013125448790576627, + "loss": 1.0729, + "step": 11640 + }, + { + "epoch": 0.4168890003044031, + "grad_norm": 1.5029237270355225, + "learning_rate": 0.00013124346975842822, + "loss": 1.1346, + "step": 11641 + }, + { + "epoch": 0.4169248124339714, + "grad_norm": 2.1627821922302246, + "learning_rate": 0.00013123245119074956, + "loss": 1.1638, + "step": 11642 + }, + { + "epoch": 0.41696062456353966, + "grad_norm": 2.3314969539642334, + "learning_rate": 0.00013122143220287854, + "loss": 1.1214, + "step": 11643 + }, + { + "epoch": 0.41699643669310793, + "grad_norm": 1.479791283607483, + "learning_rate": 0.00013121041279496348, + "loss": 1.2065, + "step": 11644 + }, + { + "epoch": 0.41703224882267625, + "grad_norm": 1.499226450920105, + "learning_rate": 0.00013119939296715253, + "loss": 1.2045, + "step": 11645 + }, + { + "epoch": 0.4170680609522445, + "grad_norm": 1.903215765953064, + "learning_rate": 0.00013118837271959403, + "loss": 1.3017, + "step": 11646 + }, + { + "epoch": 0.4171038730818128, + "grad_norm": 1.899121880531311, + "learning_rate": 0.0001311773520524362, + "loss": 1.2747, + "step": 11647 + }, + { + "epoch": 0.4171396852113811, + "grad_norm": 2.0419399738311768, + "learning_rate": 0.00013116633096582728, + "loss": 0.977, + "step": 11648 + }, + { + "epoch": 0.4171754973409494, + "grad_norm": 1.777565360069275, + "learning_rate": 0.0001311553094599156, + "loss": 1.1166, + "step": 11649 + }, + { + "epoch": 0.41721130947051766, + "grad_norm": 1.6996486186981201, + "learning_rate": 0.00013114428753484942, + "loss": 1.1087, + "step": 11650 + }, + { + "epoch": 0.4172471216000859, + "grad_norm": 1.5545101165771484, + "learning_rate": 0.00013113326519077702, + "loss": 1.4441, + "step": 11651 + }, + { + "epoch": 0.41728293372965425, + "grad_norm": 1.6305791139602661, + "learning_rate": 0.0001311222424278467, + "loss": 1.1194, + "step": 11652 + }, + { + "epoch": 0.4173187458592225, + "grad_norm": 1.5684545040130615, + "learning_rate": 0.00013111121924620672, + "loss": 0.9854, + "step": 11653 + }, + { + "epoch": 0.4173545579887908, + "grad_norm": 1.7287007570266724, + "learning_rate": 0.00013110019564600546, + "loss": 1.1941, + "step": 11654 + }, + { + "epoch": 0.4173903701183591, + "grad_norm": 1.325722098350525, + "learning_rate": 0.00013108917162739115, + "loss": 1.2121, + "step": 11655 + }, + { + "epoch": 0.4174261822479274, + "grad_norm": 1.395890235900879, + "learning_rate": 0.00013107814719051216, + "loss": 1.0824, + "step": 11656 + }, + { + "epoch": 0.41746199437749565, + "grad_norm": 1.2975422143936157, + "learning_rate": 0.0001310671223355168, + "loss": 1.2099, + "step": 11657 + }, + { + "epoch": 0.4174978065070639, + "grad_norm": 1.8570709228515625, + "learning_rate": 0.00013105609706255336, + "loss": 1.2463, + "step": 11658 + }, + { + "epoch": 0.41753361863663224, + "grad_norm": 1.605630874633789, + "learning_rate": 0.00013104507137177022, + "loss": 1.1627, + "step": 11659 + }, + { + "epoch": 0.4175694307662005, + "grad_norm": 1.566985011100769, + "learning_rate": 0.00013103404526331564, + "loss": 1.0976, + "step": 11660 + }, + { + "epoch": 0.4176052428957688, + "grad_norm": 1.7084866762161255, + "learning_rate": 0.00013102301873733807, + "loss": 1.1922, + "step": 11661 + }, + { + "epoch": 0.4176410550253371, + "grad_norm": 1.855980634689331, + "learning_rate": 0.00013101199179398572, + "loss": 0.9658, + "step": 11662 + }, + { + "epoch": 0.4176768671549054, + "grad_norm": 1.6894290447235107, + "learning_rate": 0.0001310009644334071, + "loss": 1.0862, + "step": 11663 + }, + { + "epoch": 0.41771267928447364, + "grad_norm": 1.5273185968399048, + "learning_rate": 0.00013098993665575047, + "loss": 1.2473, + "step": 11664 + }, + { + "epoch": 0.4177484914140419, + "grad_norm": 1.605176329612732, + "learning_rate": 0.0001309789084611642, + "loss": 1.1744, + "step": 11665 + }, + { + "epoch": 0.41778430354361024, + "grad_norm": 1.54000985622406, + "learning_rate": 0.00013096787984979673, + "loss": 1.4378, + "step": 11666 + }, + { + "epoch": 0.4178201156731785, + "grad_norm": 1.6394402980804443, + "learning_rate": 0.00013095685082179632, + "loss": 1.1644, + "step": 11667 + }, + { + "epoch": 0.4178559278027468, + "grad_norm": 1.5409258604049683, + "learning_rate": 0.00013094582137731145, + "loss": 1.0112, + "step": 11668 + }, + { + "epoch": 0.4178917399323151, + "grad_norm": 1.493111252784729, + "learning_rate": 0.00013093479151649043, + "loss": 1.2253, + "step": 11669 + }, + { + "epoch": 0.41792755206188337, + "grad_norm": 1.333396315574646, + "learning_rate": 0.00013092376123948174, + "loss": 1.0237, + "step": 11670 + }, + { + "epoch": 0.41796336419145164, + "grad_norm": 1.4776397943496704, + "learning_rate": 0.0001309127305464337, + "loss": 0.8383, + "step": 11671 + }, + { + "epoch": 0.4179991763210199, + "grad_norm": 1.50295889377594, + "learning_rate": 0.00013090169943749476, + "loss": 1.1536, + "step": 11672 + }, + { + "epoch": 0.41803498845058823, + "grad_norm": 1.5422708988189697, + "learning_rate": 0.00013089066791281332, + "loss": 1.1413, + "step": 11673 + }, + { + "epoch": 0.4180708005801565, + "grad_norm": 2.0429883003234863, + "learning_rate": 0.00013087963597253777, + "loss": 1.1516, + "step": 11674 + }, + { + "epoch": 0.41810661270972477, + "grad_norm": 1.3694952726364136, + "learning_rate": 0.00013086860361681657, + "loss": 1.0354, + "step": 11675 + }, + { + "epoch": 0.4181424248392931, + "grad_norm": 2.0671257972717285, + "learning_rate": 0.00013085757084579808, + "loss": 1.2497, + "step": 11676 + }, + { + "epoch": 0.41817823696886136, + "grad_norm": 1.665863037109375, + "learning_rate": 0.00013084653765963085, + "loss": 1.28, + "step": 11677 + }, + { + "epoch": 0.41821404909842963, + "grad_norm": 1.3761825561523438, + "learning_rate": 0.0001308355040584632, + "loss": 1.0127, + "step": 11678 + }, + { + "epoch": 0.4182498612279979, + "grad_norm": 1.4200778007507324, + "learning_rate": 0.0001308244700424436, + "loss": 1.1499, + "step": 11679 + }, + { + "epoch": 0.4182856733575662, + "grad_norm": 1.415906310081482, + "learning_rate": 0.00013081343561172055, + "loss": 1.1409, + "step": 11680 + }, + { + "epoch": 0.4183214854871345, + "grad_norm": 1.4685412645339966, + "learning_rate": 0.00013080240076644245, + "loss": 1.1883, + "step": 11681 + }, + { + "epoch": 0.41835729761670276, + "grad_norm": 1.5382176637649536, + "learning_rate": 0.0001307913655067578, + "loss": 1.0407, + "step": 11682 + }, + { + "epoch": 0.4183931097462711, + "grad_norm": 1.829534888267517, + "learning_rate": 0.00013078032983281505, + "loss": 1.3427, + "step": 11683 + }, + { + "epoch": 0.41842892187583935, + "grad_norm": 1.604785442352295, + "learning_rate": 0.00013076929374476265, + "loss": 1.2824, + "step": 11684 + }, + { + "epoch": 0.4184647340054076, + "grad_norm": 1.260918378829956, + "learning_rate": 0.00013075825724274907, + "loss": 1.106, + "step": 11685 + }, + { + "epoch": 0.4185005461349759, + "grad_norm": 1.4999088048934937, + "learning_rate": 0.00013074722032692287, + "loss": 1.0929, + "step": 11686 + }, + { + "epoch": 0.4185363582645442, + "grad_norm": 2.036437511444092, + "learning_rate": 0.00013073618299743242, + "loss": 1.0852, + "step": 11687 + }, + { + "epoch": 0.4185721703941125, + "grad_norm": 1.4207277297973633, + "learning_rate": 0.00013072514525442632, + "loss": 1.1677, + "step": 11688 + }, + { + "epoch": 0.41860798252368076, + "grad_norm": 1.6188565492630005, + "learning_rate": 0.000130714107098053, + "loss": 1.1123, + "step": 11689 + }, + { + "epoch": 0.4186437946532491, + "grad_norm": 1.4034252166748047, + "learning_rate": 0.000130703068528461, + "loss": 1.0997, + "step": 11690 + }, + { + "epoch": 0.41867960678281735, + "grad_norm": 1.7574976682662964, + "learning_rate": 0.00013069202954579882, + "loss": 1.1803, + "step": 11691 + }, + { + "epoch": 0.4187154189123856, + "grad_norm": 1.7111691236495972, + "learning_rate": 0.00013068099015021498, + "loss": 1.0729, + "step": 11692 + }, + { + "epoch": 0.4187512310419539, + "grad_norm": 1.9808944463729858, + "learning_rate": 0.00013066995034185798, + "loss": 1.2994, + "step": 11693 + }, + { + "epoch": 0.4187870431715222, + "grad_norm": 1.6209832429885864, + "learning_rate": 0.00013065891012087634, + "loss": 1.3335, + "step": 11694 + }, + { + "epoch": 0.4188228553010905, + "grad_norm": 1.3969697952270508, + "learning_rate": 0.00013064786948741863, + "loss": 1.1659, + "step": 11695 + }, + { + "epoch": 0.41885866743065875, + "grad_norm": 1.5253546237945557, + "learning_rate": 0.00013063682844163338, + "loss": 1.0224, + "step": 11696 + }, + { + "epoch": 0.4188944795602271, + "grad_norm": 1.8310890197753906, + "learning_rate": 0.00013062578698366909, + "loss": 1.1585, + "step": 11697 + }, + { + "epoch": 0.41893029168979534, + "grad_norm": 1.4371905326843262, + "learning_rate": 0.00013061474511367435, + "loss": 1.2103, + "step": 11698 + }, + { + "epoch": 0.4189661038193636, + "grad_norm": 1.555946946144104, + "learning_rate": 0.00013060370283179772, + "loss": 1.3186, + "step": 11699 + }, + { + "epoch": 0.4190019159489319, + "grad_norm": 1.7758899927139282, + "learning_rate": 0.00013059266013818775, + "loss": 1.1351, + "step": 11700 + }, + { + "epoch": 0.4190377280785002, + "grad_norm": 1.7256242036819458, + "learning_rate": 0.000130581617032993, + "loss": 1.2299, + "step": 11701 + }, + { + "epoch": 0.4190735402080685, + "grad_norm": 1.638013482093811, + "learning_rate": 0.00013057057351636205, + "loss": 0.9007, + "step": 11702 + }, + { + "epoch": 0.41910935233763674, + "grad_norm": 1.5221067667007446, + "learning_rate": 0.00013055952958844345, + "loss": 1.2211, + "step": 11703 + }, + { + "epoch": 0.41914516446720507, + "grad_norm": 1.4915080070495605, + "learning_rate": 0.00013054848524938576, + "loss": 1.1551, + "step": 11704 + }, + { + "epoch": 0.41918097659677334, + "grad_norm": 1.2175612449645996, + "learning_rate": 0.00013053744049933765, + "loss": 1.2117, + "step": 11705 + }, + { + "epoch": 0.4192167887263416, + "grad_norm": 1.5752780437469482, + "learning_rate": 0.00013052639533844766, + "loss": 1.0101, + "step": 11706 + }, + { + "epoch": 0.4192526008559099, + "grad_norm": 1.290915846824646, + "learning_rate": 0.0001305153497668644, + "loss": 1.0896, + "step": 11707 + }, + { + "epoch": 0.4192884129854782, + "grad_norm": 1.908077597618103, + "learning_rate": 0.00013050430378473647, + "loss": 0.9904, + "step": 11708 + }, + { + "epoch": 0.41932422511504647, + "grad_norm": 1.643808364868164, + "learning_rate": 0.00013049325739221247, + "loss": 1.28, + "step": 11709 + }, + { + "epoch": 0.41936003724461474, + "grad_norm": 1.6328022480010986, + "learning_rate": 0.00013048221058944103, + "loss": 1.1613, + "step": 11710 + }, + { + "epoch": 0.41939584937418306, + "grad_norm": 1.6733269691467285, + "learning_rate": 0.00013047116337657077, + "loss": 1.3181, + "step": 11711 + }, + { + "epoch": 0.41943166150375133, + "grad_norm": 1.3676923513412476, + "learning_rate": 0.00013046011575375033, + "loss": 1.0447, + "step": 11712 + }, + { + "epoch": 0.4194674736333196, + "grad_norm": 1.7669490575790405, + "learning_rate": 0.00013044906772112828, + "loss": 1.1433, + "step": 11713 + }, + { + "epoch": 0.41950328576288787, + "grad_norm": 1.501710057258606, + "learning_rate": 0.00013043801927885334, + "loss": 1.2017, + "step": 11714 + }, + { + "epoch": 0.4195390978924562, + "grad_norm": 1.8594566583633423, + "learning_rate": 0.00013042697042707407, + "loss": 0.9821, + "step": 11715 + }, + { + "epoch": 0.41957491002202446, + "grad_norm": 1.593291163444519, + "learning_rate": 0.0001304159211659392, + "loss": 1.154, + "step": 11716 + }, + { + "epoch": 0.41961072215159273, + "grad_norm": 1.7115195989608765, + "learning_rate": 0.00013040487149559735, + "loss": 1.1188, + "step": 11717 + }, + { + "epoch": 0.41964653428116105, + "grad_norm": 1.5338294506072998, + "learning_rate": 0.00013039382141619713, + "loss": 1.2359, + "step": 11718 + }, + { + "epoch": 0.4196823464107293, + "grad_norm": 1.5830397605895996, + "learning_rate": 0.0001303827709278873, + "loss": 1.3152, + "step": 11719 + }, + { + "epoch": 0.4197181585402976, + "grad_norm": 1.7221488952636719, + "learning_rate": 0.0001303717200308164, + "loss": 1.1428, + "step": 11720 + }, + { + "epoch": 0.41975397066986586, + "grad_norm": 1.6585030555725098, + "learning_rate": 0.00013036066872513326, + "loss": 1.1977, + "step": 11721 + }, + { + "epoch": 0.4197897827994342, + "grad_norm": 1.648769736289978, + "learning_rate": 0.00013034961701098645, + "loss": 1.1253, + "step": 11722 + }, + { + "epoch": 0.41982559492900245, + "grad_norm": 1.7773696184158325, + "learning_rate": 0.0001303385648885247, + "loss": 1.402, + "step": 11723 + }, + { + "epoch": 0.4198614070585707, + "grad_norm": 1.5465151071548462, + "learning_rate": 0.00013032751235789668, + "loss": 1.0693, + "step": 11724 + }, + { + "epoch": 0.41989721918813905, + "grad_norm": 1.575225830078125, + "learning_rate": 0.00013031645941925115, + "loss": 1.2376, + "step": 11725 + }, + { + "epoch": 0.4199330313177073, + "grad_norm": 1.548642635345459, + "learning_rate": 0.00013030540607273674, + "loss": 1.2564, + "step": 11726 + }, + { + "epoch": 0.4199688434472756, + "grad_norm": 1.5824639797210693, + "learning_rate": 0.00013029435231850215, + "loss": 1.0552, + "step": 11727 + }, + { + "epoch": 0.42000465557684385, + "grad_norm": 1.5621235370635986, + "learning_rate": 0.00013028329815669616, + "loss": 1.0951, + "step": 11728 + }, + { + "epoch": 0.4200404677064122, + "grad_norm": 1.3959424495697021, + "learning_rate": 0.00013027224358746743, + "loss": 1.0406, + "step": 11729 + }, + { + "epoch": 0.42007627983598045, + "grad_norm": 1.477270483970642, + "learning_rate": 0.00013026118861096472, + "loss": 0.992, + "step": 11730 + }, + { + "epoch": 0.4201120919655487, + "grad_norm": 1.5409213304519653, + "learning_rate": 0.00013025013322733674, + "loss": 1.1525, + "step": 11731 + }, + { + "epoch": 0.42014790409511704, + "grad_norm": 1.4198755025863647, + "learning_rate": 0.00013023907743673228, + "loss": 1.2003, + "step": 11732 + }, + { + "epoch": 0.4201837162246853, + "grad_norm": 1.241610050201416, + "learning_rate": 0.0001302280212393, + "loss": 1.0802, + "step": 11733 + }, + { + "epoch": 0.4202195283542536, + "grad_norm": 1.834234595298767, + "learning_rate": 0.0001302169646351887, + "loss": 1.2346, + "step": 11734 + }, + { + "epoch": 0.42025534048382185, + "grad_norm": 1.412363886833191, + "learning_rate": 0.00013020590762454713, + "loss": 1.1117, + "step": 11735 + }, + { + "epoch": 0.4202911526133902, + "grad_norm": 1.6652770042419434, + "learning_rate": 0.00013019485020752402, + "loss": 1.0401, + "step": 11736 + }, + { + "epoch": 0.42032696474295844, + "grad_norm": 1.4638367891311646, + "learning_rate": 0.00013018379238426814, + "loss": 1.2776, + "step": 11737 + }, + { + "epoch": 0.4203627768725267, + "grad_norm": 1.7045844793319702, + "learning_rate": 0.0001301727341549283, + "loss": 1.1821, + "step": 11738 + }, + { + "epoch": 0.42039858900209504, + "grad_norm": 1.4038751125335693, + "learning_rate": 0.0001301616755196532, + "loss": 1.1872, + "step": 11739 + }, + { + "epoch": 0.4204344011316633, + "grad_norm": 1.5050945281982422, + "learning_rate": 0.0001301506164785917, + "loss": 1.1665, + "step": 11740 + }, + { + "epoch": 0.4204702132612316, + "grad_norm": 1.63808012008667, + "learning_rate": 0.00013013955703189252, + "loss": 1.306, + "step": 11741 + }, + { + "epoch": 0.42050602539079984, + "grad_norm": 1.7118072509765625, + "learning_rate": 0.00013012849717970447, + "loss": 1.0065, + "step": 11742 + }, + { + "epoch": 0.42054183752036817, + "grad_norm": 1.8297134637832642, + "learning_rate": 0.00013011743692217638, + "loss": 1.3391, + "step": 11743 + }, + { + "epoch": 0.42057764964993644, + "grad_norm": 1.243053674697876, + "learning_rate": 0.000130106376259457, + "loss": 1.0332, + "step": 11744 + }, + { + "epoch": 0.4206134617795047, + "grad_norm": 1.507420301437378, + "learning_rate": 0.0001300953151916952, + "loss": 1.3101, + "step": 11745 + }, + { + "epoch": 0.42064927390907303, + "grad_norm": 2.4357011318206787, + "learning_rate": 0.0001300842537190397, + "loss": 1.1608, + "step": 11746 + }, + { + "epoch": 0.4206850860386413, + "grad_norm": 2.6648166179656982, + "learning_rate": 0.00013007319184163944, + "loss": 1.2299, + "step": 11747 + }, + { + "epoch": 0.42072089816820957, + "grad_norm": 1.8018635511398315, + "learning_rate": 0.00013006212955964311, + "loss": 1.2454, + "step": 11748 + }, + { + "epoch": 0.42075671029777784, + "grad_norm": 1.4373723268508911, + "learning_rate": 0.00013005106687319966, + "loss": 1.3313, + "step": 11749 + }, + { + "epoch": 0.42079252242734616, + "grad_norm": 3.1203184127807617, + "learning_rate": 0.00013004000378245782, + "loss": 1.2703, + "step": 11750 + }, + { + "epoch": 0.42082833455691443, + "grad_norm": 1.3604038953781128, + "learning_rate": 0.00013002894028756653, + "loss": 1.0107, + "step": 11751 + }, + { + "epoch": 0.4208641466864827, + "grad_norm": 1.4670329093933105, + "learning_rate": 0.0001300178763886745, + "loss": 1.1424, + "step": 11752 + }, + { + "epoch": 0.420899958816051, + "grad_norm": 1.3965401649475098, + "learning_rate": 0.00013000681208593073, + "loss": 1.2596, + "step": 11753 + }, + { + "epoch": 0.4209357709456193, + "grad_norm": 1.5106005668640137, + "learning_rate": 0.000129995747379484, + "loss": 1.1671, + "step": 11754 + }, + { + "epoch": 0.42097158307518756, + "grad_norm": 1.5465129613876343, + "learning_rate": 0.00012998468226948316, + "loss": 1.2613, + "step": 11755 + }, + { + "epoch": 0.42100739520475583, + "grad_norm": 1.7292498350143433, + "learning_rate": 0.00012997361675607714, + "loss": 0.9785, + "step": 11756 + }, + { + "epoch": 0.42104320733432415, + "grad_norm": 2.0267140865325928, + "learning_rate": 0.0001299625508394147, + "loss": 1.2196, + "step": 11757 + }, + { + "epoch": 0.4210790194638924, + "grad_norm": 1.2140575647354126, + "learning_rate": 0.00012995148451964487, + "loss": 1.1604, + "step": 11758 + }, + { + "epoch": 0.4211148315934607, + "grad_norm": 1.4750107526779175, + "learning_rate": 0.00012994041779691639, + "loss": 1.1909, + "step": 11759 + }, + { + "epoch": 0.421150643723029, + "grad_norm": 1.3854986429214478, + "learning_rate": 0.00012992935067137823, + "loss": 1.0438, + "step": 11760 + }, + { + "epoch": 0.4211864558525973, + "grad_norm": 1.6139713525772095, + "learning_rate": 0.00012991828314317923, + "loss": 1.1056, + "step": 11761 + }, + { + "epoch": 0.42122226798216555, + "grad_norm": 1.5734633207321167, + "learning_rate": 0.00012990721521246839, + "loss": 1.1952, + "step": 11762 + }, + { + "epoch": 0.4212580801117338, + "grad_norm": 1.5052772760391235, + "learning_rate": 0.00012989614687939453, + "loss": 1.1163, + "step": 11763 + }, + { + "epoch": 0.42129389224130215, + "grad_norm": 1.3392757177352905, + "learning_rate": 0.00012988507814410652, + "loss": 1.2822, + "step": 11764 + }, + { + "epoch": 0.4213297043708704, + "grad_norm": 1.1865421533584595, + "learning_rate": 0.0001298740090067534, + "loss": 1.1305, + "step": 11765 + }, + { + "epoch": 0.4213655165004387, + "grad_norm": 2.171365261077881, + "learning_rate": 0.000129862939467484, + "loss": 1.3206, + "step": 11766 + }, + { + "epoch": 0.421401328630007, + "grad_norm": 1.5784958600997925, + "learning_rate": 0.00012985186952644724, + "loss": 1.1923, + "step": 11767 + }, + { + "epoch": 0.4214371407595753, + "grad_norm": 2.3312020301818848, + "learning_rate": 0.0001298407991837921, + "loss": 1.0823, + "step": 11768 + }, + { + "epoch": 0.42147295288914355, + "grad_norm": 1.40310800075531, + "learning_rate": 0.0001298297284396675, + "loss": 1.2475, + "step": 11769 + }, + { + "epoch": 0.4215087650187118, + "grad_norm": 1.631093144416809, + "learning_rate": 0.0001298186572942224, + "loss": 1.1222, + "step": 11770 + }, + { + "epoch": 0.42154457714828014, + "grad_norm": 1.622137427330017, + "learning_rate": 0.00012980758574760573, + "loss": 1.3773, + "step": 11771 + }, + { + "epoch": 0.4215803892778484, + "grad_norm": 1.7617299556732178, + "learning_rate": 0.00012979651379996642, + "loss": 1.2272, + "step": 11772 + }, + { + "epoch": 0.4216162014074167, + "grad_norm": 2.1361560821533203, + "learning_rate": 0.00012978544145145343, + "loss": 1.131, + "step": 11773 + }, + { + "epoch": 0.421652013536985, + "grad_norm": 1.6445908546447754, + "learning_rate": 0.0001297743687022158, + "loss": 1.205, + "step": 11774 + }, + { + "epoch": 0.4216878256665533, + "grad_norm": 1.3495659828186035, + "learning_rate": 0.0001297632955524024, + "loss": 0.9439, + "step": 11775 + }, + { + "epoch": 0.42172363779612154, + "grad_norm": 1.7994189262390137, + "learning_rate": 0.00012975222200216227, + "loss": 1.0365, + "step": 11776 + }, + { + "epoch": 0.4217594499256898, + "grad_norm": 1.586832046508789, + "learning_rate": 0.00012974114805164438, + "loss": 1.1985, + "step": 11777 + }, + { + "epoch": 0.42179526205525814, + "grad_norm": 1.1652613878250122, + "learning_rate": 0.0001297300737009977, + "loss": 1.0438, + "step": 11778 + }, + { + "epoch": 0.4218310741848264, + "grad_norm": 1.7819578647613525, + "learning_rate": 0.00012971899895037123, + "loss": 1.0933, + "step": 11779 + }, + { + "epoch": 0.4218668863143947, + "grad_norm": 1.7958059310913086, + "learning_rate": 0.00012970792379991396, + "loss": 1.1805, + "step": 11780 + }, + { + "epoch": 0.421902698443963, + "grad_norm": 1.4553848505020142, + "learning_rate": 0.00012969684824977492, + "loss": 1.1244, + "step": 11781 + }, + { + "epoch": 0.42193851057353127, + "grad_norm": 1.6552484035491943, + "learning_rate": 0.00012968577230010304, + "loss": 1.2602, + "step": 11782 + }, + { + "epoch": 0.42197432270309954, + "grad_norm": 1.2699856758117676, + "learning_rate": 0.0001296746959510474, + "loss": 1.1096, + "step": 11783 + }, + { + "epoch": 0.4220101348326678, + "grad_norm": 1.5168498754501343, + "learning_rate": 0.00012966361920275702, + "loss": 1.0249, + "step": 11784 + }, + { + "epoch": 0.42204594696223613, + "grad_norm": 1.564701795578003, + "learning_rate": 0.0001296525420553809, + "loss": 1.1112, + "step": 11785 + }, + { + "epoch": 0.4220817590918044, + "grad_norm": 2.6146018505096436, + "learning_rate": 0.00012964146450906807, + "loss": 1.1208, + "step": 11786 + }, + { + "epoch": 0.42211757122137267, + "grad_norm": 1.8049308061599731, + "learning_rate": 0.0001296303865639676, + "loss": 1.2037, + "step": 11787 + }, + { + "epoch": 0.42215338335094094, + "grad_norm": 1.3820269107818604, + "learning_rate": 0.00012961930822022848, + "loss": 1.3316, + "step": 11788 + }, + { + "epoch": 0.42218919548050926, + "grad_norm": 1.4484199285507202, + "learning_rate": 0.00012960822947799978, + "loss": 1.045, + "step": 11789 + }, + { + "epoch": 0.42222500761007753, + "grad_norm": 1.2755193710327148, + "learning_rate": 0.0001295971503374305, + "loss": 1.1364, + "step": 11790 + }, + { + "epoch": 0.4222608197396458, + "grad_norm": 1.5563523769378662, + "learning_rate": 0.0001295860707986698, + "loss": 1.1741, + "step": 11791 + }, + { + "epoch": 0.4222966318692141, + "grad_norm": 2.501993179321289, + "learning_rate": 0.00012957499086186665, + "loss": 1.2095, + "step": 11792 + }, + { + "epoch": 0.4223324439987824, + "grad_norm": 1.8584046363830566, + "learning_rate": 0.00012956391052717017, + "loss": 1.0907, + "step": 11793 + }, + { + "epoch": 0.42236825612835066, + "grad_norm": 1.2970452308654785, + "learning_rate": 0.00012955282979472938, + "loss": 0.915, + "step": 11794 + }, + { + "epoch": 0.42240406825791893, + "grad_norm": 1.6354880332946777, + "learning_rate": 0.00012954174866469336, + "loss": 1.3637, + "step": 11795 + }, + { + "epoch": 0.42243988038748725, + "grad_norm": 1.6531819105148315, + "learning_rate": 0.0001295306671372113, + "loss": 1.2064, + "step": 11796 + }, + { + "epoch": 0.4224756925170555, + "grad_norm": 1.5636409521102905, + "learning_rate": 0.00012951958521243215, + "loss": 1.2572, + "step": 11797 + }, + { + "epoch": 0.4225115046466238, + "grad_norm": 1.1275839805603027, + "learning_rate": 0.00012950850289050508, + "loss": 1.1237, + "step": 11798 + }, + { + "epoch": 0.4225473167761921, + "grad_norm": 1.5606886148452759, + "learning_rate": 0.00012949742017157915, + "loss": 1.0881, + "step": 11799 + }, + { + "epoch": 0.4225831289057604, + "grad_norm": 1.3861061334609985, + "learning_rate": 0.0001294863370558035, + "loss": 1.0648, + "step": 11800 + }, + { + "epoch": 0.42261894103532865, + "grad_norm": 1.6571664810180664, + "learning_rate": 0.0001294752535433272, + "loss": 1.0705, + "step": 11801 + }, + { + "epoch": 0.4226547531648969, + "grad_norm": 1.6081892251968384, + "learning_rate": 0.0001294641696342994, + "loss": 1.2594, + "step": 11802 + }, + { + "epoch": 0.42269056529446525, + "grad_norm": 1.679232120513916, + "learning_rate": 0.00012945308532886918, + "loss": 0.9613, + "step": 11803 + }, + { + "epoch": 0.4227263774240335, + "grad_norm": 1.954422950744629, + "learning_rate": 0.0001294420006271857, + "loss": 1.1413, + "step": 11804 + }, + { + "epoch": 0.4227621895536018, + "grad_norm": 1.2991750240325928, + "learning_rate": 0.00012943091552939807, + "loss": 0.959, + "step": 11805 + }, + { + "epoch": 0.4227980016831701, + "grad_norm": 1.704436182975769, + "learning_rate": 0.00012941983003565544, + "loss": 1.0509, + "step": 11806 + }, + { + "epoch": 0.4228338138127384, + "grad_norm": 1.3164840936660767, + "learning_rate": 0.00012940874414610698, + "loss": 1.1751, + "step": 11807 + }, + { + "epoch": 0.42286962594230665, + "grad_norm": 1.7544970512390137, + "learning_rate": 0.00012939765786090178, + "loss": 0.9988, + "step": 11808 + }, + { + "epoch": 0.4229054380718749, + "grad_norm": 1.5989761352539062, + "learning_rate": 0.000129386571180189, + "loss": 1.2207, + "step": 11809 + }, + { + "epoch": 0.42294125020144324, + "grad_norm": 1.691055178642273, + "learning_rate": 0.00012937548410411778, + "loss": 1.1526, + "step": 11810 + }, + { + "epoch": 0.4229770623310115, + "grad_norm": 1.4714614152908325, + "learning_rate": 0.0001293643966328374, + "loss": 1.0671, + "step": 11811 + }, + { + "epoch": 0.4230128744605798, + "grad_norm": 1.7106091976165771, + "learning_rate": 0.00012935330876649687, + "loss": 1.237, + "step": 11812 + }, + { + "epoch": 0.4230486865901481, + "grad_norm": 1.661245584487915, + "learning_rate": 0.00012934222050524547, + "loss": 1.242, + "step": 11813 + }, + { + "epoch": 0.4230844987197164, + "grad_norm": 1.4279000759124756, + "learning_rate": 0.00012933113184923232, + "loss": 1.1571, + "step": 11814 + }, + { + "epoch": 0.42312031084928464, + "grad_norm": 1.3381149768829346, + "learning_rate": 0.00012932004279860663, + "loss": 1.0822, + "step": 11815 + }, + { + "epoch": 0.4231561229788529, + "grad_norm": 1.947373390197754, + "learning_rate": 0.0001293089533535176, + "loss": 1.3607, + "step": 11816 + }, + { + "epoch": 0.42319193510842124, + "grad_norm": 1.8138532638549805, + "learning_rate": 0.00012929786351411439, + "loss": 1.0036, + "step": 11817 + }, + { + "epoch": 0.4232277472379895, + "grad_norm": 1.5226013660430908, + "learning_rate": 0.00012928677328054623, + "loss": 1.1791, + "step": 11818 + }, + { + "epoch": 0.4232635593675578, + "grad_norm": 2.1852755546569824, + "learning_rate": 0.00012927568265296227, + "loss": 1.1774, + "step": 11819 + }, + { + "epoch": 0.4232993714971261, + "grad_norm": 1.4067823886871338, + "learning_rate": 0.00012926459163151182, + "loss": 1.2744, + "step": 11820 + }, + { + "epoch": 0.42333518362669437, + "grad_norm": 1.5838507413864136, + "learning_rate": 0.000129253500216344, + "loss": 1.2192, + "step": 11821 + }, + { + "epoch": 0.42337099575626264, + "grad_norm": 1.2762587070465088, + "learning_rate": 0.00012924240840760811, + "loss": 1.1218, + "step": 11822 + }, + { + "epoch": 0.4234068078858309, + "grad_norm": 1.4634974002838135, + "learning_rate": 0.0001292313162054533, + "loss": 1.1344, + "step": 11823 + }, + { + "epoch": 0.42344262001539923, + "grad_norm": 1.5123440027236938, + "learning_rate": 0.00012922022361002886, + "loss": 1.2617, + "step": 11824 + }, + { + "epoch": 0.4234784321449675, + "grad_norm": 2.1210169792175293, + "learning_rate": 0.00012920913062148398, + "loss": 1.4718, + "step": 11825 + }, + { + "epoch": 0.42351424427453577, + "grad_norm": 1.5584173202514648, + "learning_rate": 0.00012919803723996794, + "loss": 1.27, + "step": 11826 + }, + { + "epoch": 0.4235500564041041, + "grad_norm": 1.4072414636611938, + "learning_rate": 0.00012918694346562997, + "loss": 1.3699, + "step": 11827 + }, + { + "epoch": 0.42358586853367236, + "grad_norm": 1.3503265380859375, + "learning_rate": 0.0001291758492986193, + "loss": 1.1874, + "step": 11828 + }, + { + "epoch": 0.42362168066324063, + "grad_norm": 1.4352205991744995, + "learning_rate": 0.00012916475473908525, + "loss": 0.9379, + "step": 11829 + }, + { + "epoch": 0.4236574927928089, + "grad_norm": 1.6634399890899658, + "learning_rate": 0.00012915365978717703, + "loss": 1.0672, + "step": 11830 + }, + { + "epoch": 0.4236933049223772, + "grad_norm": 1.357803225517273, + "learning_rate": 0.0001291425644430439, + "loss": 1.1745, + "step": 11831 + }, + { + "epoch": 0.4237291170519455, + "grad_norm": 1.6219903230667114, + "learning_rate": 0.0001291314687068352, + "loss": 1.0766, + "step": 11832 + }, + { + "epoch": 0.42376492918151376, + "grad_norm": 1.2493711709976196, + "learning_rate": 0.00012912037257870016, + "loss": 1.2566, + "step": 11833 + }, + { + "epoch": 0.4238007413110821, + "grad_norm": 1.392769694328308, + "learning_rate": 0.00012910927605878803, + "loss": 1.0437, + "step": 11834 + }, + { + "epoch": 0.42383655344065035, + "grad_norm": 1.5115296840667725, + "learning_rate": 0.0001290981791472482, + "loss": 1.3632, + "step": 11835 + }, + { + "epoch": 0.4238723655702186, + "grad_norm": 1.431099772453308, + "learning_rate": 0.00012908708184422983, + "loss": 1.0864, + "step": 11836 + }, + { + "epoch": 0.4239081776997869, + "grad_norm": 1.4179503917694092, + "learning_rate": 0.0001290759841498823, + "loss": 1.2093, + "step": 11837 + }, + { + "epoch": 0.4239439898293552, + "grad_norm": 1.5140700340270996, + "learning_rate": 0.00012906488606435497, + "loss": 1.1521, + "step": 11838 + }, + { + "epoch": 0.4239798019589235, + "grad_norm": 1.4787898063659668, + "learning_rate": 0.00012905378758779702, + "loss": 0.9973, + "step": 11839 + }, + { + "epoch": 0.42401561408849175, + "grad_norm": 1.4888122081756592, + "learning_rate": 0.00012904268872035787, + "loss": 0.9615, + "step": 11840 + }, + { + "epoch": 0.4240514262180601, + "grad_norm": 2.0386226177215576, + "learning_rate": 0.00012903158946218682, + "loss": 1.1928, + "step": 11841 + }, + { + "epoch": 0.42408723834762835, + "grad_norm": 1.6471748352050781, + "learning_rate": 0.0001290204898134332, + "loss": 1.1639, + "step": 11842 + }, + { + "epoch": 0.4241230504771966, + "grad_norm": 2.104900360107422, + "learning_rate": 0.00012900938977424627, + "loss": 1.2192, + "step": 11843 + }, + { + "epoch": 0.4241588626067649, + "grad_norm": 1.7008192539215088, + "learning_rate": 0.00012899828934477545, + "loss": 1.3092, + "step": 11844 + }, + { + "epoch": 0.4241946747363332, + "grad_norm": 1.3381187915802002, + "learning_rate": 0.00012898718852517003, + "loss": 1.1391, + "step": 11845 + }, + { + "epoch": 0.4242304868659015, + "grad_norm": 1.5413808822631836, + "learning_rate": 0.0001289760873155794, + "loss": 1.2513, + "step": 11846 + }, + { + "epoch": 0.42426629899546975, + "grad_norm": 1.4519031047821045, + "learning_rate": 0.00012896498571615287, + "loss": 1.1682, + "step": 11847 + }, + { + "epoch": 0.4243021111250381, + "grad_norm": 1.5907670259475708, + "learning_rate": 0.00012895388372703985, + "loss": 1.2676, + "step": 11848 + }, + { + "epoch": 0.42433792325460634, + "grad_norm": 1.653930902481079, + "learning_rate": 0.00012894278134838963, + "loss": 1.2022, + "step": 11849 + }, + { + "epoch": 0.4243737353841746, + "grad_norm": 1.5742918252944946, + "learning_rate": 0.00012893167858035168, + "loss": 1.1856, + "step": 11850 + }, + { + "epoch": 0.4244095475137429, + "grad_norm": 1.3244616985321045, + "learning_rate": 0.00012892057542307527, + "loss": 1.299, + "step": 11851 + }, + { + "epoch": 0.4244453596433112, + "grad_norm": 1.7068554162979126, + "learning_rate": 0.00012890947187670982, + "loss": 1.2527, + "step": 11852 + }, + { + "epoch": 0.4244811717728795, + "grad_norm": 1.3202046155929565, + "learning_rate": 0.00012889836794140473, + "loss": 1.2631, + "step": 11853 + }, + { + "epoch": 0.42451698390244774, + "grad_norm": 1.4349133968353271, + "learning_rate": 0.00012888726361730935, + "loss": 1.2426, + "step": 11854 + }, + { + "epoch": 0.42455279603201607, + "grad_norm": 1.4341559410095215, + "learning_rate": 0.00012887615890457314, + "loss": 1.0912, + "step": 11855 + }, + { + "epoch": 0.42458860816158434, + "grad_norm": 1.269834280014038, + "learning_rate": 0.00012886505380334544, + "loss": 1.0407, + "step": 11856 + }, + { + "epoch": 0.4246244202911526, + "grad_norm": 1.296610713005066, + "learning_rate": 0.00012885394831377569, + "loss": 1.1929, + "step": 11857 + }, + { + "epoch": 0.4246602324207209, + "grad_norm": 1.3436713218688965, + "learning_rate": 0.00012884284243601325, + "loss": 1.1181, + "step": 11858 + }, + { + "epoch": 0.4246960445502892, + "grad_norm": 1.2461183071136475, + "learning_rate": 0.0001288317361702076, + "loss": 1.0315, + "step": 11859 + }, + { + "epoch": 0.42473185667985747, + "grad_norm": 1.2549728155136108, + "learning_rate": 0.00012882062951650815, + "loss": 1.1049, + "step": 11860 + }, + { + "epoch": 0.42476766880942574, + "grad_norm": 2.0624868869781494, + "learning_rate": 0.00012880952247506426, + "loss": 1.1207, + "step": 11861 + }, + { + "epoch": 0.42480348093899406, + "grad_norm": 1.8010036945343018, + "learning_rate": 0.00012879841504602544, + "loss": 1.2532, + "step": 11862 + }, + { + "epoch": 0.42483929306856233, + "grad_norm": 1.4449591636657715, + "learning_rate": 0.00012878730722954104, + "loss": 1.1232, + "step": 11863 + }, + { + "epoch": 0.4248751051981306, + "grad_norm": 1.4986565113067627, + "learning_rate": 0.00012877619902576062, + "loss": 1.1453, + "step": 11864 + }, + { + "epoch": 0.42491091732769887, + "grad_norm": 1.6120518445968628, + "learning_rate": 0.0001287650904348335, + "loss": 0.8364, + "step": 11865 + }, + { + "epoch": 0.4249467294572672, + "grad_norm": 1.6766959428787231, + "learning_rate": 0.00012875398145690924, + "loss": 1.0321, + "step": 11866 + }, + { + "epoch": 0.42498254158683546, + "grad_norm": 1.5280537605285645, + "learning_rate": 0.00012874287209213724, + "loss": 1.0454, + "step": 11867 + }, + { + "epoch": 0.42501835371640373, + "grad_norm": 1.2292038202285767, + "learning_rate": 0.00012873176234066698, + "loss": 1.1502, + "step": 11868 + }, + { + "epoch": 0.42505416584597205, + "grad_norm": 1.6718661785125732, + "learning_rate": 0.0001287206522026479, + "loss": 1.29, + "step": 11869 + }, + { + "epoch": 0.4250899779755403, + "grad_norm": 1.4056791067123413, + "learning_rate": 0.0001287095416782295, + "loss": 1.1195, + "step": 11870 + }, + { + "epoch": 0.4251257901051086, + "grad_norm": 1.6021045446395874, + "learning_rate": 0.00012869843076756125, + "loss": 1.1307, + "step": 11871 + }, + { + "epoch": 0.42516160223467686, + "grad_norm": 1.8065776824951172, + "learning_rate": 0.0001286873194707926, + "loss": 1.5, + "step": 11872 + }, + { + "epoch": 0.4251974143642452, + "grad_norm": 1.552996277809143, + "learning_rate": 0.0001286762077880731, + "loss": 1.0331, + "step": 11873 + }, + { + "epoch": 0.42523322649381345, + "grad_norm": 2.0668625831604004, + "learning_rate": 0.00012866509571955221, + "loss": 1.4064, + "step": 11874 + }, + { + "epoch": 0.4252690386233817, + "grad_norm": 1.330610752105713, + "learning_rate": 0.00012865398326537944, + "loss": 1.0734, + "step": 11875 + }, + { + "epoch": 0.42530485075295005, + "grad_norm": 1.5055532455444336, + "learning_rate": 0.0001286428704257043, + "loss": 1.1291, + "step": 11876 + }, + { + "epoch": 0.4253406628825183, + "grad_norm": 1.72886323928833, + "learning_rate": 0.00012863175720067627, + "loss": 1.2814, + "step": 11877 + }, + { + "epoch": 0.4253764750120866, + "grad_norm": 1.3864039182662964, + "learning_rate": 0.00012862064359044485, + "loss": 1.2225, + "step": 11878 + }, + { + "epoch": 0.42541228714165485, + "grad_norm": 1.6477689743041992, + "learning_rate": 0.00012860952959515962, + "loss": 1.1299, + "step": 11879 + }, + { + "epoch": 0.4254480992712232, + "grad_norm": 2.0613982677459717, + "learning_rate": 0.00012859841521497008, + "loss": 1.0705, + "step": 11880 + }, + { + "epoch": 0.42548391140079145, + "grad_norm": 1.9266636371612549, + "learning_rate": 0.00012858730045002572, + "loss": 1.3249, + "step": 11881 + }, + { + "epoch": 0.4255197235303597, + "grad_norm": 1.564435362815857, + "learning_rate": 0.00012857618530047615, + "loss": 1.1293, + "step": 11882 + }, + { + "epoch": 0.42555553565992804, + "grad_norm": 1.4691474437713623, + "learning_rate": 0.0001285650697664708, + "loss": 1.182, + "step": 11883 + }, + { + "epoch": 0.4255913477894963, + "grad_norm": 1.7709897756576538, + "learning_rate": 0.00012855395384815937, + "loss": 1.3503, + "step": 11884 + }, + { + "epoch": 0.4256271599190646, + "grad_norm": 1.6525561809539795, + "learning_rate": 0.00012854283754569127, + "loss": 1.1021, + "step": 11885 + }, + { + "epoch": 0.42566297204863285, + "grad_norm": 1.54116952419281, + "learning_rate": 0.00012853172085921613, + "loss": 1.0721, + "step": 11886 + }, + { + "epoch": 0.4256987841782012, + "grad_norm": 1.35331130027771, + "learning_rate": 0.00012852060378888347, + "loss": 1.2173, + "step": 11887 + }, + { + "epoch": 0.42573459630776944, + "grad_norm": 1.623600959777832, + "learning_rate": 0.00012850948633484288, + "loss": 1.0621, + "step": 11888 + }, + { + "epoch": 0.4257704084373377, + "grad_norm": 1.758484125137329, + "learning_rate": 0.00012849836849724392, + "loss": 1.0024, + "step": 11889 + }, + { + "epoch": 0.42580622056690604, + "grad_norm": 1.5154815912246704, + "learning_rate": 0.0001284872502762362, + "loss": 1.231, + "step": 11890 + }, + { + "epoch": 0.4258420326964743, + "grad_norm": 1.5676137208938599, + "learning_rate": 0.00012847613167196923, + "loss": 1.1234, + "step": 11891 + }, + { + "epoch": 0.4258778448260426, + "grad_norm": 1.3782830238342285, + "learning_rate": 0.00012846501268459266, + "loss": 1.0113, + "step": 11892 + }, + { + "epoch": 0.42591365695561084, + "grad_norm": 1.3881540298461914, + "learning_rate": 0.00012845389331425606, + "loss": 0.9569, + "step": 11893 + }, + { + "epoch": 0.42594946908517917, + "grad_norm": 1.2706384658813477, + "learning_rate": 0.00012844277356110906, + "loss": 1.216, + "step": 11894 + }, + { + "epoch": 0.42598528121474744, + "grad_norm": 1.359503149986267, + "learning_rate": 0.0001284316534253012, + "loss": 1.0341, + "step": 11895 + }, + { + "epoch": 0.4260210933443157, + "grad_norm": 2.1669983863830566, + "learning_rate": 0.0001284205329069821, + "loss": 1.1775, + "step": 11896 + }, + { + "epoch": 0.42605690547388403, + "grad_norm": 1.3205825090408325, + "learning_rate": 0.00012840941200630143, + "loss": 1.1607, + "step": 11897 + }, + { + "epoch": 0.4260927176034523, + "grad_norm": 1.544364333152771, + "learning_rate": 0.00012839829072340875, + "loss": 1.0903, + "step": 11898 + }, + { + "epoch": 0.42612852973302057, + "grad_norm": 1.492375373840332, + "learning_rate": 0.0001283871690584537, + "loss": 1.2211, + "step": 11899 + }, + { + "epoch": 0.42616434186258884, + "grad_norm": 1.433387041091919, + "learning_rate": 0.00012837604701158587, + "loss": 1.2203, + "step": 11900 + }, + { + "epoch": 0.42620015399215716, + "grad_norm": 1.4169259071350098, + "learning_rate": 0.000128364924582955, + "loss": 1.2053, + "step": 11901 + }, + { + "epoch": 0.42623596612172543, + "grad_norm": 1.642189383506775, + "learning_rate": 0.00012835380177271058, + "loss": 1.0463, + "step": 11902 + }, + { + "epoch": 0.4262717782512937, + "grad_norm": 1.7615892887115479, + "learning_rate": 0.00012834267858100238, + "loss": 0.9561, + "step": 11903 + }, + { + "epoch": 0.426307590380862, + "grad_norm": 1.6017496585845947, + "learning_rate": 0.00012833155500798003, + "loss": 1.1166, + "step": 11904 + }, + { + "epoch": 0.4263434025104303, + "grad_norm": 1.5360126495361328, + "learning_rate": 0.0001283204310537931, + "loss": 1.2381, + "step": 11905 + }, + { + "epoch": 0.42637921463999856, + "grad_norm": 1.519460678100586, + "learning_rate": 0.00012830930671859132, + "loss": 1.0437, + "step": 11906 + }, + { + "epoch": 0.42641502676956683, + "grad_norm": 1.6984070539474487, + "learning_rate": 0.00012829818200252432, + "loss": 1.2182, + "step": 11907 + }, + { + "epoch": 0.42645083889913515, + "grad_norm": 1.3625787496566772, + "learning_rate": 0.0001282870569057418, + "loss": 1.1724, + "step": 11908 + }, + { + "epoch": 0.4264866510287034, + "grad_norm": 1.347603678703308, + "learning_rate": 0.0001282759314283934, + "loss": 1.0381, + "step": 11909 + }, + { + "epoch": 0.4265224631582717, + "grad_norm": 1.2929747104644775, + "learning_rate": 0.00012826480557062884, + "loss": 1.0217, + "step": 11910 + }, + { + "epoch": 0.42655827528784, + "grad_norm": 1.6029220819473267, + "learning_rate": 0.00012825367933259774, + "loss": 1.1117, + "step": 11911 + }, + { + "epoch": 0.4265940874174083, + "grad_norm": 1.4148749113082886, + "learning_rate": 0.00012824255271444987, + "loss": 1.0715, + "step": 11912 + }, + { + "epoch": 0.42662989954697655, + "grad_norm": 1.6217290163040161, + "learning_rate": 0.00012823142571633488, + "loss": 1.1782, + "step": 11913 + }, + { + "epoch": 0.4266657116765448, + "grad_norm": 1.7803683280944824, + "learning_rate": 0.00012822029833840245, + "loss": 1.0402, + "step": 11914 + }, + { + "epoch": 0.42670152380611315, + "grad_norm": 1.653348445892334, + "learning_rate": 0.00012820917058080234, + "loss": 1.3044, + "step": 11915 + }, + { + "epoch": 0.4267373359356814, + "grad_norm": 1.8677040338516235, + "learning_rate": 0.0001281980424436842, + "loss": 1.2323, + "step": 11916 + }, + { + "epoch": 0.4267731480652497, + "grad_norm": 1.5772522687911987, + "learning_rate": 0.00012818691392719778, + "loss": 1.1585, + "step": 11917 + }, + { + "epoch": 0.426808960194818, + "grad_norm": 1.3561416864395142, + "learning_rate": 0.00012817578503149276, + "loss": 1.2104, + "step": 11918 + }, + { + "epoch": 0.4268447723243863, + "grad_norm": 1.3809889554977417, + "learning_rate": 0.00012816465575671895, + "loss": 1.052, + "step": 11919 + }, + { + "epoch": 0.42688058445395455, + "grad_norm": 1.260909914970398, + "learning_rate": 0.000128153526103026, + "loss": 1.1623, + "step": 11920 + }, + { + "epoch": 0.4269163965835228, + "grad_norm": 1.7103805541992188, + "learning_rate": 0.00012814239607056367, + "loss": 1.0529, + "step": 11921 + }, + { + "epoch": 0.42695220871309114, + "grad_norm": 2.4545953273773193, + "learning_rate": 0.00012813126565948171, + "loss": 0.9889, + "step": 11922 + }, + { + "epoch": 0.4269880208426594, + "grad_norm": 1.9225283861160278, + "learning_rate": 0.00012812013486992985, + "loss": 1.3039, + "step": 11923 + }, + { + "epoch": 0.4270238329722277, + "grad_norm": 1.5499284267425537, + "learning_rate": 0.00012810900370205784, + "loss": 1.3113, + "step": 11924 + }, + { + "epoch": 0.427059645101796, + "grad_norm": 1.4276931285858154, + "learning_rate": 0.00012809787215601546, + "loss": 1.1463, + "step": 11925 + }, + { + "epoch": 0.4270954572313643, + "grad_norm": 1.7363452911376953, + "learning_rate": 0.00012808674023195244, + "loss": 1.1277, + "step": 11926 + }, + { + "epoch": 0.42713126936093254, + "grad_norm": 1.2459523677825928, + "learning_rate": 0.00012807560793001856, + "loss": 1.0775, + "step": 11927 + }, + { + "epoch": 0.4271670814905008, + "grad_norm": 1.4627254009246826, + "learning_rate": 0.0001280644752503636, + "loss": 1.102, + "step": 11928 + }, + { + "epoch": 0.42720289362006914, + "grad_norm": 1.2726306915283203, + "learning_rate": 0.00012805334219313734, + "loss": 1.2244, + "step": 11929 + }, + { + "epoch": 0.4272387057496374, + "grad_norm": 1.8107647895812988, + "learning_rate": 0.00012804220875848953, + "loss": 1.1245, + "step": 11930 + }, + { + "epoch": 0.4272745178792057, + "grad_norm": 1.808623194694519, + "learning_rate": 0.00012803107494657, + "loss": 1.013, + "step": 11931 + }, + { + "epoch": 0.427310330008774, + "grad_norm": 1.741328477859497, + "learning_rate": 0.0001280199407575285, + "loss": 1.2102, + "step": 11932 + }, + { + "epoch": 0.42734614213834227, + "grad_norm": 1.5402311086654663, + "learning_rate": 0.0001280088061915148, + "loss": 1.0812, + "step": 11933 + }, + { + "epoch": 0.42738195426791054, + "grad_norm": 2.117802143096924, + "learning_rate": 0.00012799767124867874, + "loss": 1.131, + "step": 11934 + }, + { + "epoch": 0.4274177663974788, + "grad_norm": 1.5810493230819702, + "learning_rate": 0.00012798653592917017, + "loss": 1.0914, + "step": 11935 + }, + { + "epoch": 0.42745357852704713, + "grad_norm": 1.389101505279541, + "learning_rate": 0.00012797540023313882, + "loss": 1.1105, + "step": 11936 + }, + { + "epoch": 0.4274893906566154, + "grad_norm": 1.6564030647277832, + "learning_rate": 0.0001279642641607346, + "loss": 1.1689, + "step": 11937 + }, + { + "epoch": 0.42752520278618367, + "grad_norm": 2.0278706550598145, + "learning_rate": 0.00012795312771210726, + "loss": 1.2235, + "step": 11938 + }, + { + "epoch": 0.427561014915752, + "grad_norm": 1.6320174932479858, + "learning_rate": 0.00012794199088740665, + "loss": 1.08, + "step": 11939 + }, + { + "epoch": 0.42759682704532026, + "grad_norm": 1.3594666719436646, + "learning_rate": 0.00012793085368678254, + "loss": 1.1658, + "step": 11940 + }, + { + "epoch": 0.42763263917488853, + "grad_norm": 1.4265018701553345, + "learning_rate": 0.00012791971611038488, + "loss": 0.9057, + "step": 11941 + }, + { + "epoch": 0.4276684513044568, + "grad_norm": 1.5651917457580566, + "learning_rate": 0.00012790857815836342, + "loss": 1.0277, + "step": 11942 + }, + { + "epoch": 0.4277042634340251, + "grad_norm": 1.6135685443878174, + "learning_rate": 0.00012789743983086807, + "loss": 1.0977, + "step": 11943 + }, + { + "epoch": 0.4277400755635934, + "grad_norm": 1.2563422918319702, + "learning_rate": 0.00012788630112804862, + "loss": 1.1162, + "step": 11944 + }, + { + "epoch": 0.42777588769316166, + "grad_norm": 1.8598573207855225, + "learning_rate": 0.000127875162050055, + "loss": 1.0146, + "step": 11945 + }, + { + "epoch": 0.42781169982273, + "grad_norm": 1.5711168050765991, + "learning_rate": 0.00012786402259703697, + "loss": 0.9078, + "step": 11946 + }, + { + "epoch": 0.42784751195229825, + "grad_norm": 1.9462863206863403, + "learning_rate": 0.0001278528827691445, + "loss": 1.1172, + "step": 11947 + }, + { + "epoch": 0.4278833240818665, + "grad_norm": 1.9544185400009155, + "learning_rate": 0.00012784174256652743, + "loss": 1.1276, + "step": 11948 + }, + { + "epoch": 0.4279191362114348, + "grad_norm": 1.9037967920303345, + "learning_rate": 0.0001278306019893356, + "loss": 0.9081, + "step": 11949 + }, + { + "epoch": 0.4279549483410031, + "grad_norm": 1.4194331169128418, + "learning_rate": 0.00012781946103771892, + "loss": 0.9957, + "step": 11950 + }, + { + "epoch": 0.4279907604705714, + "grad_norm": 1.6048306226730347, + "learning_rate": 0.0001278083197118273, + "loss": 1.2458, + "step": 11951 + }, + { + "epoch": 0.42802657260013965, + "grad_norm": 1.4966739416122437, + "learning_rate": 0.00012779717801181058, + "loss": 1.2572, + "step": 11952 + }, + { + "epoch": 0.428062384729708, + "grad_norm": 1.6944504976272583, + "learning_rate": 0.0001277860359378187, + "loss": 1.1912, + "step": 11953 + }, + { + "epoch": 0.42809819685927625, + "grad_norm": 1.9216766357421875, + "learning_rate": 0.00012777489349000156, + "loss": 1.0764, + "step": 11954 + }, + { + "epoch": 0.4281340089888445, + "grad_norm": 1.4232109785079956, + "learning_rate": 0.00012776375066850902, + "loss": 1.1349, + "step": 11955 + }, + { + "epoch": 0.4281698211184128, + "grad_norm": 1.5270382165908813, + "learning_rate": 0.00012775260747349107, + "loss": 0.9965, + "step": 11956 + }, + { + "epoch": 0.4282056332479811, + "grad_norm": 1.501751184463501, + "learning_rate": 0.0001277414639050976, + "loss": 1.1999, + "step": 11957 + }, + { + "epoch": 0.4282414453775494, + "grad_norm": 1.6532460451126099, + "learning_rate": 0.00012773031996347845, + "loss": 1.072, + "step": 11958 + }, + { + "epoch": 0.42827725750711765, + "grad_norm": 1.338843584060669, + "learning_rate": 0.00012771917564878367, + "loss": 0.9676, + "step": 11959 + }, + { + "epoch": 0.428313069636686, + "grad_norm": 1.3295155763626099, + "learning_rate": 0.00012770803096116308, + "loss": 1.0458, + "step": 11960 + }, + { + "epoch": 0.42834888176625424, + "grad_norm": 1.462594985961914, + "learning_rate": 0.00012769688590076673, + "loss": 1.0054, + "step": 11961 + }, + { + "epoch": 0.4283846938958225, + "grad_norm": 1.5064935684204102, + "learning_rate": 0.00012768574046774446, + "loss": 1.1782, + "step": 11962 + }, + { + "epoch": 0.4284205060253908, + "grad_norm": 1.665350317955017, + "learning_rate": 0.00012767459466224632, + "loss": 1.1904, + "step": 11963 + }, + { + "epoch": 0.4284563181549591, + "grad_norm": 1.6753569841384888, + "learning_rate": 0.00012766344848442218, + "loss": 1.3812, + "step": 11964 + }, + { + "epoch": 0.4284921302845274, + "grad_norm": 1.589890718460083, + "learning_rate": 0.00012765230193442198, + "loss": 1.0745, + "step": 11965 + }, + { + "epoch": 0.42852794241409564, + "grad_norm": 1.8453665971755981, + "learning_rate": 0.0001276411550123958, + "loss": 1.1337, + "step": 11966 + }, + { + "epoch": 0.42856375454366397, + "grad_norm": 1.5958977937698364, + "learning_rate": 0.00012763000771849348, + "loss": 1.2815, + "step": 11967 + }, + { + "epoch": 0.42859956667323224, + "grad_norm": 2.1262028217315674, + "learning_rate": 0.00012761886005286508, + "loss": 1.1308, + "step": 11968 + }, + { + "epoch": 0.4286353788028005, + "grad_norm": 1.3893120288848877, + "learning_rate": 0.0001276077120156605, + "loss": 0.9605, + "step": 11969 + }, + { + "epoch": 0.4286711909323688, + "grad_norm": 1.4262772798538208, + "learning_rate": 0.0001275965636070298, + "loss": 0.9472, + "step": 11970 + }, + { + "epoch": 0.4287070030619371, + "grad_norm": 2.265742778778076, + "learning_rate": 0.00012758541482712295, + "loss": 1.0027, + "step": 11971 + }, + { + "epoch": 0.42874281519150537, + "grad_norm": 1.320253610610962, + "learning_rate": 0.0001275742656760899, + "loss": 1.0108, + "step": 11972 + }, + { + "epoch": 0.42877862732107364, + "grad_norm": 1.3877204656600952, + "learning_rate": 0.00012756311615408068, + "loss": 0.9915, + "step": 11973 + }, + { + "epoch": 0.42881443945064196, + "grad_norm": 1.2809312343597412, + "learning_rate": 0.0001275519662612453, + "loss": 1.0546, + "step": 11974 + }, + { + "epoch": 0.42885025158021023, + "grad_norm": 1.618111491203308, + "learning_rate": 0.00012754081599773373, + "loss": 1.2734, + "step": 11975 + }, + { + "epoch": 0.4288860637097785, + "grad_norm": 1.4016692638397217, + "learning_rate": 0.00012752966536369603, + "loss": 0.9897, + "step": 11976 + }, + { + "epoch": 0.42892187583934677, + "grad_norm": 1.4637268781661987, + "learning_rate": 0.00012751851435928216, + "loss": 1.2154, + "step": 11977 + }, + { + "epoch": 0.4289576879689151, + "grad_norm": 1.2364965677261353, + "learning_rate": 0.00012750736298464216, + "loss": 0.9827, + "step": 11978 + }, + { + "epoch": 0.42899350009848336, + "grad_norm": 1.760238528251648, + "learning_rate": 0.00012749621123992613, + "loss": 1.1369, + "step": 11979 + }, + { + "epoch": 0.42902931222805163, + "grad_norm": 1.4084523916244507, + "learning_rate": 0.000127485059125284, + "loss": 0.9928, + "step": 11980 + }, + { + "epoch": 0.42906512435761995, + "grad_norm": 1.3623006343841553, + "learning_rate": 0.0001274739066408659, + "loss": 1.1911, + "step": 11981 + }, + { + "epoch": 0.4291009364871882, + "grad_norm": 1.5282193422317505, + "learning_rate": 0.0001274627537868218, + "loss": 1.1724, + "step": 11982 + }, + { + "epoch": 0.4291367486167565, + "grad_norm": 1.409812331199646, + "learning_rate": 0.00012745160056330178, + "loss": 1.2191, + "step": 11983 + }, + { + "epoch": 0.42917256074632476, + "grad_norm": 1.85293710231781, + "learning_rate": 0.00012744044697045586, + "loss": 1.3423, + "step": 11984 + }, + { + "epoch": 0.4292083728758931, + "grad_norm": 1.8642857074737549, + "learning_rate": 0.00012742929300843417, + "loss": 1.2433, + "step": 11985 + }, + { + "epoch": 0.42924418500546135, + "grad_norm": 2.0763304233551025, + "learning_rate": 0.00012741813867738665, + "loss": 1.5046, + "step": 11986 + }, + { + "epoch": 0.4292799971350296, + "grad_norm": 2.0532264709472656, + "learning_rate": 0.00012740698397746352, + "loss": 1.1162, + "step": 11987 + }, + { + "epoch": 0.4293158092645979, + "grad_norm": 1.5968438386917114, + "learning_rate": 0.0001273958289088147, + "loss": 1.175, + "step": 11988 + }, + { + "epoch": 0.4293516213941662, + "grad_norm": 1.4182860851287842, + "learning_rate": 0.0001273846734715904, + "loss": 1.0338, + "step": 11989 + }, + { + "epoch": 0.4293874335237345, + "grad_norm": 1.4828894138336182, + "learning_rate": 0.0001273735176659406, + "loss": 1.032, + "step": 11990 + }, + { + "epoch": 0.42942324565330275, + "grad_norm": 1.503562569618225, + "learning_rate": 0.00012736236149201547, + "loss": 1.1717, + "step": 11991 + }, + { + "epoch": 0.4294590577828711, + "grad_norm": 1.7701627016067505, + "learning_rate": 0.000127351204949965, + "loss": 1.1748, + "step": 11992 + }, + { + "epoch": 0.42949486991243935, + "grad_norm": 1.7342123985290527, + "learning_rate": 0.0001273400480399394, + "loss": 1.0648, + "step": 11993 + }, + { + "epoch": 0.4295306820420076, + "grad_norm": 1.3372910022735596, + "learning_rate": 0.00012732889076208872, + "loss": 1.0736, + "step": 11994 + }, + { + "epoch": 0.4295664941715759, + "grad_norm": 1.4422963857650757, + "learning_rate": 0.00012731773311656304, + "loss": 1.2077, + "step": 11995 + }, + { + "epoch": 0.4296023063011442, + "grad_norm": 1.4276336431503296, + "learning_rate": 0.00012730657510351252, + "loss": 1.1354, + "step": 11996 + }, + { + "epoch": 0.4296381184307125, + "grad_norm": 1.283072829246521, + "learning_rate": 0.00012729541672308722, + "loss": 0.9949, + "step": 11997 + }, + { + "epoch": 0.42967393056028075, + "grad_norm": 1.682037115097046, + "learning_rate": 0.00012728425797543731, + "loss": 1.1766, + "step": 11998 + }, + { + "epoch": 0.4297097426898491, + "grad_norm": 1.4509061574935913, + "learning_rate": 0.00012727309886071292, + "loss": 0.9262, + "step": 11999 + }, + { + "epoch": 0.42974555481941734, + "grad_norm": 1.2935198545455933, + "learning_rate": 0.00012726193937906416, + "loss": 1.0954, + "step": 12000 + }, + { + "epoch": 0.4297813669489856, + "grad_norm": 1.8740746974945068, + "learning_rate": 0.00012725077953064119, + "loss": 1.1735, + "step": 12001 + }, + { + "epoch": 0.4298171790785539, + "grad_norm": 1.5180059671401978, + "learning_rate": 0.0001272396193155941, + "loss": 1.0453, + "step": 12002 + }, + { + "epoch": 0.4298529912081222, + "grad_norm": 1.8639497756958008, + "learning_rate": 0.0001272284587340731, + "loss": 1.2336, + "step": 12003 + }, + { + "epoch": 0.4298888033376905, + "grad_norm": 1.4643055200576782, + "learning_rate": 0.00012721729778622826, + "loss": 1.2387, + "step": 12004 + }, + { + "epoch": 0.42992461546725874, + "grad_norm": 1.2881048917770386, + "learning_rate": 0.00012720613647220985, + "loss": 0.9671, + "step": 12005 + }, + { + "epoch": 0.42996042759682707, + "grad_norm": 2.986626625061035, + "learning_rate": 0.0001271949747921679, + "loss": 1.1902, + "step": 12006 + }, + { + "epoch": 0.42999623972639534, + "grad_norm": 1.6741094589233398, + "learning_rate": 0.0001271838127462527, + "loss": 1.1786, + "step": 12007 + }, + { + "epoch": 0.4300320518559636, + "grad_norm": 1.5032238960266113, + "learning_rate": 0.0001271726503346143, + "loss": 0.9425, + "step": 12008 + }, + { + "epoch": 0.4300678639855319, + "grad_norm": 1.423807978630066, + "learning_rate": 0.00012716148755740302, + "loss": 1.2023, + "step": 12009 + }, + { + "epoch": 0.4301036761151002, + "grad_norm": 1.4366247653961182, + "learning_rate": 0.00012715032441476892, + "loss": 1.2139, + "step": 12010 + }, + { + "epoch": 0.43013948824466847, + "grad_norm": 1.8112841844558716, + "learning_rate": 0.00012713916090686223, + "loss": 1.1918, + "step": 12011 + }, + { + "epoch": 0.43017530037423674, + "grad_norm": 1.5400820970535278, + "learning_rate": 0.00012712799703383314, + "loss": 1.087, + "step": 12012 + }, + { + "epoch": 0.43021111250380506, + "grad_norm": 1.7920022010803223, + "learning_rate": 0.00012711683279583181, + "loss": 1.1796, + "step": 12013 + }, + { + "epoch": 0.43024692463337333, + "grad_norm": 1.4640105962753296, + "learning_rate": 0.00012710566819300854, + "loss": 1.2334, + "step": 12014 + }, + { + "epoch": 0.4302827367629416, + "grad_norm": 1.347864031791687, + "learning_rate": 0.00012709450322551338, + "loss": 1.1392, + "step": 12015 + }, + { + "epoch": 0.43031854889250987, + "grad_norm": 1.9746415615081787, + "learning_rate": 0.00012708333789349671, + "loss": 1.2905, + "step": 12016 + }, + { + "epoch": 0.4303543610220782, + "grad_norm": 1.5557894706726074, + "learning_rate": 0.00012707217219710864, + "loss": 1.0879, + "step": 12017 + }, + { + "epoch": 0.43039017315164646, + "grad_norm": 1.4314581155776978, + "learning_rate": 0.0001270610061364994, + "loss": 1.067, + "step": 12018 + }, + { + "epoch": 0.43042598528121473, + "grad_norm": 1.764589786529541, + "learning_rate": 0.00012704983971181924, + "loss": 1.1738, + "step": 12019 + }, + { + "epoch": 0.43046179741078305, + "grad_norm": 1.726563572883606, + "learning_rate": 0.00012703867292321837, + "loss": 1.3665, + "step": 12020 + }, + { + "epoch": 0.4304976095403513, + "grad_norm": 1.371132254600525, + "learning_rate": 0.00012702750577084704, + "loss": 1.1682, + "step": 12021 + }, + { + "epoch": 0.4305334216699196, + "grad_norm": 1.3668429851531982, + "learning_rate": 0.00012701633825485547, + "loss": 1.2205, + "step": 12022 + }, + { + "epoch": 0.43056923379948786, + "grad_norm": 1.7097301483154297, + "learning_rate": 0.00012700517037539394, + "loss": 1.3523, + "step": 12023 + }, + { + "epoch": 0.4306050459290562, + "grad_norm": 1.7673914432525635, + "learning_rate": 0.00012699400213261262, + "loss": 1.0962, + "step": 12024 + }, + { + "epoch": 0.43064085805862445, + "grad_norm": 1.2555028200149536, + "learning_rate": 0.0001269828335266619, + "loss": 1.0039, + "step": 12025 + }, + { + "epoch": 0.4306766701881927, + "grad_norm": 1.7378467321395874, + "learning_rate": 0.00012697166455769192, + "loss": 1.2688, + "step": 12026 + }, + { + "epoch": 0.43071248231776105, + "grad_norm": 1.5047364234924316, + "learning_rate": 0.000126960495225853, + "loss": 1.057, + "step": 12027 + }, + { + "epoch": 0.4307482944473293, + "grad_norm": 1.7605352401733398, + "learning_rate": 0.00012694932553129537, + "loss": 1.1417, + "step": 12028 + }, + { + "epoch": 0.4307841065768976, + "grad_norm": 1.597182035446167, + "learning_rate": 0.00012693815547416934, + "loss": 1.2956, + "step": 12029 + }, + { + "epoch": 0.43081991870646585, + "grad_norm": 1.5960837602615356, + "learning_rate": 0.00012692698505462516, + "loss": 0.9923, + "step": 12030 + }, + { + "epoch": 0.4308557308360342, + "grad_norm": 1.3937773704528809, + "learning_rate": 0.00012691581427281317, + "loss": 1.0779, + "step": 12031 + }, + { + "epoch": 0.43089154296560245, + "grad_norm": 1.3601535558700562, + "learning_rate": 0.00012690464312888357, + "loss": 1.0335, + "step": 12032 + }, + { + "epoch": 0.4309273550951707, + "grad_norm": 1.715883493423462, + "learning_rate": 0.0001268934716229867, + "loss": 1.3024, + "step": 12033 + }, + { + "epoch": 0.43096316722473904, + "grad_norm": 1.8960275650024414, + "learning_rate": 0.0001268822997552729, + "loss": 1.1924, + "step": 12034 + }, + { + "epoch": 0.4309989793543073, + "grad_norm": 1.358450174331665, + "learning_rate": 0.00012687112752589243, + "loss": 1.1852, + "step": 12035 + }, + { + "epoch": 0.4310347914838756, + "grad_norm": 1.5556000471115112, + "learning_rate": 0.00012685995493499558, + "loss": 1.0296, + "step": 12036 + }, + { + "epoch": 0.43107060361344385, + "grad_norm": 1.383671760559082, + "learning_rate": 0.00012684878198273268, + "loss": 1.0536, + "step": 12037 + }, + { + "epoch": 0.4311064157430122, + "grad_norm": 1.4083285331726074, + "learning_rate": 0.00012683760866925408, + "loss": 1.1433, + "step": 12038 + }, + { + "epoch": 0.43114222787258044, + "grad_norm": 1.5231084823608398, + "learning_rate": 0.00012682643499471003, + "loss": 1.0049, + "step": 12039 + }, + { + "epoch": 0.4311780400021487, + "grad_norm": 1.5813905000686646, + "learning_rate": 0.00012681526095925094, + "loss": 1.0352, + "step": 12040 + }, + { + "epoch": 0.43121385213171703, + "grad_norm": 1.320574402809143, + "learning_rate": 0.00012680408656302707, + "loss": 1.107, + "step": 12041 + }, + { + "epoch": 0.4312496642612853, + "grad_norm": 1.4749572277069092, + "learning_rate": 0.00012679291180618882, + "loss": 1.2093, + "step": 12042 + }, + { + "epoch": 0.4312854763908536, + "grad_norm": 1.5012037754058838, + "learning_rate": 0.00012678173668888645, + "loss": 1.1304, + "step": 12043 + }, + { + "epoch": 0.43132128852042184, + "grad_norm": 1.5687150955200195, + "learning_rate": 0.00012677056121127042, + "loss": 1.3852, + "step": 12044 + }, + { + "epoch": 0.43135710064999017, + "grad_norm": 1.7889206409454346, + "learning_rate": 0.000126759385373491, + "loss": 1.1769, + "step": 12045 + }, + { + "epoch": 0.43139291277955844, + "grad_norm": 1.206360936164856, + "learning_rate": 0.00012674820917569856, + "loss": 0.8128, + "step": 12046 + }, + { + "epoch": 0.4314287249091267, + "grad_norm": 1.5568093061447144, + "learning_rate": 0.0001267370326180435, + "loss": 1.0773, + "step": 12047 + }, + { + "epoch": 0.43146453703869503, + "grad_norm": 1.8913822174072266, + "learning_rate": 0.0001267258557006761, + "loss": 1.2284, + "step": 12048 + }, + { + "epoch": 0.4315003491682633, + "grad_norm": 1.5910898447036743, + "learning_rate": 0.00012671467842374683, + "loss": 1.3514, + "step": 12049 + }, + { + "epoch": 0.43153616129783157, + "grad_norm": 1.8703441619873047, + "learning_rate": 0.000126703500787406, + "loss": 1.1008, + "step": 12050 + }, + { + "epoch": 0.43157197342739984, + "grad_norm": 1.7725284099578857, + "learning_rate": 0.000126692322791804, + "loss": 1.1671, + "step": 12051 + }, + { + "epoch": 0.43160778555696816, + "grad_norm": 1.6113708019256592, + "learning_rate": 0.00012668114443709124, + "loss": 1.4467, + "step": 12052 + }, + { + "epoch": 0.43164359768653643, + "grad_norm": 1.868439793586731, + "learning_rate": 0.0001266699657234181, + "loss": 1.3149, + "step": 12053 + }, + { + "epoch": 0.4316794098161047, + "grad_norm": 1.6550803184509277, + "learning_rate": 0.000126658786650935, + "loss": 1.14, + "step": 12054 + }, + { + "epoch": 0.431715221945673, + "grad_norm": 1.5625454187393188, + "learning_rate": 0.00012664760721979227, + "loss": 1.0442, + "step": 12055 + }, + { + "epoch": 0.4317510340752413, + "grad_norm": 1.9151405096054077, + "learning_rate": 0.00012663642743014037, + "loss": 1.0141, + "step": 12056 + }, + { + "epoch": 0.43178684620480956, + "grad_norm": 1.8465876579284668, + "learning_rate": 0.0001266252472821297, + "loss": 1.1547, + "step": 12057 + }, + { + "epoch": 0.43182265833437783, + "grad_norm": 1.5397660732269287, + "learning_rate": 0.00012661406677591067, + "loss": 1.0006, + "step": 12058 + }, + { + "epoch": 0.43185847046394615, + "grad_norm": 1.710680603981018, + "learning_rate": 0.00012660288591163373, + "loss": 1.1838, + "step": 12059 + }, + { + "epoch": 0.4318942825935144, + "grad_norm": 1.4740351438522339, + "learning_rate": 0.00012659170468944924, + "loss": 1.0405, + "step": 12060 + }, + { + "epoch": 0.4319300947230827, + "grad_norm": 1.722485065460205, + "learning_rate": 0.00012658052310950767, + "loss": 1.2602, + "step": 12061 + }, + { + "epoch": 0.431965906852651, + "grad_norm": 1.3565417528152466, + "learning_rate": 0.00012656934117195946, + "loss": 1.1818, + "step": 12062 + }, + { + "epoch": 0.4320017189822193, + "grad_norm": 1.7569645643234253, + "learning_rate": 0.00012655815887695503, + "loss": 1.0823, + "step": 12063 + }, + { + "epoch": 0.43203753111178755, + "grad_norm": 1.5453790426254272, + "learning_rate": 0.00012654697622464483, + "loss": 1.2144, + "step": 12064 + }, + { + "epoch": 0.4320733432413558, + "grad_norm": 1.6100525856018066, + "learning_rate": 0.0001265357932151793, + "loss": 1.1924, + "step": 12065 + }, + { + "epoch": 0.43210915537092415, + "grad_norm": 1.2449604272842407, + "learning_rate": 0.0001265246098487089, + "loss": 0.9969, + "step": 12066 + }, + { + "epoch": 0.4321449675004924, + "grad_norm": 1.8932164907455444, + "learning_rate": 0.0001265134261253841, + "loss": 1.0864, + "step": 12067 + }, + { + "epoch": 0.4321807796300607, + "grad_norm": 1.6156278848648071, + "learning_rate": 0.00012650224204535535, + "loss": 1.1769, + "step": 12068 + }, + { + "epoch": 0.432216591759629, + "grad_norm": 1.7889131307601929, + "learning_rate": 0.00012649105760877312, + "loss": 1.0433, + "step": 12069 + }, + { + "epoch": 0.4322524038891973, + "grad_norm": 1.9466263055801392, + "learning_rate": 0.00012647987281578789, + "loss": 1.2756, + "step": 12070 + }, + { + "epoch": 0.43228821601876555, + "grad_norm": 1.4881592988967896, + "learning_rate": 0.0001264686876665501, + "loss": 0.9446, + "step": 12071 + }, + { + "epoch": 0.4323240281483338, + "grad_norm": 1.4642778635025024, + "learning_rate": 0.00012645750216121028, + "loss": 1.0548, + "step": 12072 + }, + { + "epoch": 0.43235984027790214, + "grad_norm": 1.7014602422714233, + "learning_rate": 0.0001264463162999189, + "loss": 1.1453, + "step": 12073 + }, + { + "epoch": 0.4323956524074704, + "grad_norm": 1.4722245931625366, + "learning_rate": 0.00012643513008282645, + "loss": 1.1582, + "step": 12074 + }, + { + "epoch": 0.4324314645370387, + "grad_norm": 1.3436403274536133, + "learning_rate": 0.00012642394351008337, + "loss": 1.1364, + "step": 12075 + }, + { + "epoch": 0.432467276666607, + "grad_norm": 1.41680908203125, + "learning_rate": 0.00012641275658184026, + "loss": 1.1521, + "step": 12076 + }, + { + "epoch": 0.4325030887961753, + "grad_norm": 1.5432193279266357, + "learning_rate": 0.00012640156929824757, + "loss": 1.2266, + "step": 12077 + }, + { + "epoch": 0.43253890092574354, + "grad_norm": 1.6962043046951294, + "learning_rate": 0.00012639038165945584, + "loss": 1.1377, + "step": 12078 + }, + { + "epoch": 0.4325747130553118, + "grad_norm": 1.274756669998169, + "learning_rate": 0.00012637919366561556, + "loss": 1.0657, + "step": 12079 + }, + { + "epoch": 0.43261052518488013, + "grad_norm": 1.5036288499832153, + "learning_rate": 0.00012636800531687728, + "loss": 1.1428, + "step": 12080 + }, + { + "epoch": 0.4326463373144484, + "grad_norm": 2.287994623184204, + "learning_rate": 0.00012635681661339146, + "loss": 1.4619, + "step": 12081 + }, + { + "epoch": 0.4326821494440167, + "grad_norm": 1.8444747924804688, + "learning_rate": 0.00012634562755530867, + "loss": 1.2762, + "step": 12082 + }, + { + "epoch": 0.432717961573585, + "grad_norm": 3.32460880279541, + "learning_rate": 0.00012633443814277946, + "loss": 1.0209, + "step": 12083 + }, + { + "epoch": 0.43275377370315327, + "grad_norm": 1.4329911470413208, + "learning_rate": 0.00012632324837595434, + "loss": 1.1006, + "step": 12084 + }, + { + "epoch": 0.43278958583272154, + "grad_norm": 1.4814823865890503, + "learning_rate": 0.00012631205825498388, + "loss": 1.113, + "step": 12085 + }, + { + "epoch": 0.4328253979622898, + "grad_norm": 1.5493429899215698, + "learning_rate": 0.0001263008677800186, + "loss": 0.9772, + "step": 12086 + }, + { + "epoch": 0.43286121009185813, + "grad_norm": 1.4923303127288818, + "learning_rate": 0.0001262896769512091, + "loss": 1.0035, + "step": 12087 + }, + { + "epoch": 0.4328970222214264, + "grad_norm": 1.8145009279251099, + "learning_rate": 0.0001262784857687059, + "loss": 1.2612, + "step": 12088 + }, + { + "epoch": 0.43293283435099467, + "grad_norm": 1.2811377048492432, + "learning_rate": 0.00012626729423265956, + "loss": 1.0663, + "step": 12089 + }, + { + "epoch": 0.432968646480563, + "grad_norm": 1.6236488819122314, + "learning_rate": 0.00012625610234322064, + "loss": 1.1486, + "step": 12090 + }, + { + "epoch": 0.43300445861013126, + "grad_norm": 1.3819528818130493, + "learning_rate": 0.00012624491010053976, + "loss": 1.1155, + "step": 12091 + }, + { + "epoch": 0.43304027073969953, + "grad_norm": 3.9016404151916504, + "learning_rate": 0.00012623371750476747, + "loss": 1.3384, + "step": 12092 + }, + { + "epoch": 0.4330760828692678, + "grad_norm": 1.4269132614135742, + "learning_rate": 0.00012622252455605435, + "loss": 1.069, + "step": 12093 + }, + { + "epoch": 0.4331118949988361, + "grad_norm": 1.6111977100372314, + "learning_rate": 0.00012621133125455093, + "loss": 1.1802, + "step": 12094 + }, + { + "epoch": 0.4331477071284044, + "grad_norm": 1.3000625371932983, + "learning_rate": 0.0001262001376004079, + "loss": 0.9895, + "step": 12095 + }, + { + "epoch": 0.43318351925797266, + "grad_norm": 1.468554139137268, + "learning_rate": 0.00012618894359377585, + "loss": 0.95, + "step": 12096 + }, + { + "epoch": 0.433219331387541, + "grad_norm": 1.6764171123504639, + "learning_rate": 0.0001261777492348053, + "loss": 1.1482, + "step": 12097 + }, + { + "epoch": 0.43325514351710925, + "grad_norm": 1.3410978317260742, + "learning_rate": 0.00012616655452364693, + "loss": 1.2004, + "step": 12098 + }, + { + "epoch": 0.4332909556466775, + "grad_norm": 1.5551608800888062, + "learning_rate": 0.0001261553594604513, + "loss": 1.1476, + "step": 12099 + }, + { + "epoch": 0.4333267677762458, + "grad_norm": 1.729219913482666, + "learning_rate": 0.00012614416404536905, + "loss": 1.285, + "step": 12100 + }, + { + "epoch": 0.4333625799058141, + "grad_norm": 1.7555440664291382, + "learning_rate": 0.00012613296827855078, + "loss": 1.1214, + "step": 12101 + }, + { + "epoch": 0.4333983920353824, + "grad_norm": 1.534983515739441, + "learning_rate": 0.0001261217721601472, + "loss": 1.2998, + "step": 12102 + }, + { + "epoch": 0.43343420416495065, + "grad_norm": 1.5093518495559692, + "learning_rate": 0.00012611057569030876, + "loss": 1.0244, + "step": 12103 + }, + { + "epoch": 0.433470016294519, + "grad_norm": 1.8163046836853027, + "learning_rate": 0.0001260993788691863, + "loss": 1.2388, + "step": 12104 + }, + { + "epoch": 0.43350582842408725, + "grad_norm": 1.836379885673523, + "learning_rate": 0.00012608818169693032, + "loss": 1.241, + "step": 12105 + }, + { + "epoch": 0.4335416405536555, + "grad_norm": 1.6795183420181274, + "learning_rate": 0.00012607698417369152, + "loss": 1.0621, + "step": 12106 + }, + { + "epoch": 0.4335774526832238, + "grad_norm": 1.7558332681655884, + "learning_rate": 0.00012606578629962054, + "loss": 1.3499, + "step": 12107 + }, + { + "epoch": 0.4336132648127921, + "grad_norm": 1.4459127187728882, + "learning_rate": 0.00012605458807486797, + "loss": 1.0028, + "step": 12108 + }, + { + "epoch": 0.4336490769423604, + "grad_norm": 1.3628969192504883, + "learning_rate": 0.0001260433894995846, + "loss": 0.886, + "step": 12109 + }, + { + "epoch": 0.43368488907192865, + "grad_norm": 1.5039305686950684, + "learning_rate": 0.00012603219057392097, + "loss": 1.0085, + "step": 12110 + }, + { + "epoch": 0.43372070120149697, + "grad_norm": 1.4241583347320557, + "learning_rate": 0.0001260209912980278, + "loss": 1.2859, + "step": 12111 + }, + { + "epoch": 0.43375651333106524, + "grad_norm": 1.3069486618041992, + "learning_rate": 0.0001260097916720558, + "loss": 0.7792, + "step": 12112 + }, + { + "epoch": 0.4337923254606335, + "grad_norm": 1.4672473669052124, + "learning_rate": 0.00012599859169615558, + "loss": 1.5047, + "step": 12113 + }, + { + "epoch": 0.4338281375902018, + "grad_norm": 1.4877936840057373, + "learning_rate": 0.00012598739137047784, + "loss": 1.1179, + "step": 12114 + }, + { + "epoch": 0.4338639497197701, + "grad_norm": 2.0998411178588867, + "learning_rate": 0.00012597619069517328, + "loss": 1.4363, + "step": 12115 + }, + { + "epoch": 0.4338997618493384, + "grad_norm": 1.4779173135757446, + "learning_rate": 0.00012596498967039257, + "loss": 1.1563, + "step": 12116 + }, + { + "epoch": 0.43393557397890664, + "grad_norm": 1.5543497800827026, + "learning_rate": 0.0001259537882962864, + "loss": 1.0483, + "step": 12117 + }, + { + "epoch": 0.43397138610847497, + "grad_norm": 1.6216239929199219, + "learning_rate": 0.0001259425865730055, + "loss": 1.0076, + "step": 12118 + }, + { + "epoch": 0.43400719823804323, + "grad_norm": 1.1966079473495483, + "learning_rate": 0.00012593138450070056, + "loss": 1.0072, + "step": 12119 + }, + { + "epoch": 0.4340430103676115, + "grad_norm": 1.3650846481323242, + "learning_rate": 0.0001259201820795223, + "loss": 0.982, + "step": 12120 + }, + { + "epoch": 0.4340788224971798, + "grad_norm": 1.6548525094985962, + "learning_rate": 0.00012590897930962142, + "loss": 1.0873, + "step": 12121 + }, + { + "epoch": 0.4341146346267481, + "grad_norm": 1.6631560325622559, + "learning_rate": 0.00012589777619114863, + "loss": 1.2544, + "step": 12122 + }, + { + "epoch": 0.43415044675631637, + "grad_norm": 1.8366447687149048, + "learning_rate": 0.0001258865727242547, + "loss": 1.0337, + "step": 12123 + }, + { + "epoch": 0.43418625888588464, + "grad_norm": 1.978233814239502, + "learning_rate": 0.00012587536890909033, + "loss": 1.2967, + "step": 12124 + }, + { + "epoch": 0.43422207101545296, + "grad_norm": 1.4815369844436646, + "learning_rate": 0.0001258641647458062, + "loss": 1.1345, + "step": 12125 + }, + { + "epoch": 0.43425788314502123, + "grad_norm": 1.526599645614624, + "learning_rate": 0.00012585296023455314, + "loss": 1.2267, + "step": 12126 + }, + { + "epoch": 0.4342936952745895, + "grad_norm": 1.5526117086410522, + "learning_rate": 0.00012584175537548183, + "loss": 1.0226, + "step": 12127 + }, + { + "epoch": 0.43432950740415777, + "grad_norm": 1.6320558786392212, + "learning_rate": 0.00012583055016874303, + "loss": 1.2114, + "step": 12128 + }, + { + "epoch": 0.4343653195337261, + "grad_norm": 1.4262217283248901, + "learning_rate": 0.00012581934461448747, + "loss": 1.1957, + "step": 12129 + }, + { + "epoch": 0.43440113166329436, + "grad_norm": 1.75495183467865, + "learning_rate": 0.00012580813871286597, + "loss": 1.1066, + "step": 12130 + }, + { + "epoch": 0.43443694379286263, + "grad_norm": 1.7564650774002075, + "learning_rate": 0.00012579693246402924, + "loss": 1.2171, + "step": 12131 + }, + { + "epoch": 0.43447275592243095, + "grad_norm": 1.715489149093628, + "learning_rate": 0.00012578572586812806, + "loss": 1.0259, + "step": 12132 + }, + { + "epoch": 0.4345085680519992, + "grad_norm": 1.9639018774032593, + "learning_rate": 0.00012577451892531322, + "loss": 1.1435, + "step": 12133 + }, + { + "epoch": 0.4345443801815675, + "grad_norm": 1.6358572244644165, + "learning_rate": 0.00012576331163573548, + "loss": 1.0592, + "step": 12134 + }, + { + "epoch": 0.43458019231113576, + "grad_norm": 2.70592999458313, + "learning_rate": 0.00012575210399954557, + "loss": 1.1354, + "step": 12135 + }, + { + "epoch": 0.4346160044407041, + "grad_norm": 1.4940510988235474, + "learning_rate": 0.00012574089601689433, + "loss": 0.94, + "step": 12136 + }, + { + "epoch": 0.43465181657027235, + "grad_norm": 2.715761423110962, + "learning_rate": 0.00012572968768793257, + "loss": 1.2564, + "step": 12137 + }, + { + "epoch": 0.4346876286998406, + "grad_norm": 2.117523670196533, + "learning_rate": 0.00012571847901281103, + "loss": 1.1202, + "step": 12138 + }, + { + "epoch": 0.43472344082940895, + "grad_norm": 1.8870277404785156, + "learning_rate": 0.0001257072699916805, + "loss": 1.1001, + "step": 12139 + }, + { + "epoch": 0.4347592529589772, + "grad_norm": 1.4602434635162354, + "learning_rate": 0.00012569606062469186, + "loss": 1.1628, + "step": 12140 + }, + { + "epoch": 0.4347950650885455, + "grad_norm": 1.7424436807632446, + "learning_rate": 0.00012568485091199585, + "loss": 0.9867, + "step": 12141 + }, + { + "epoch": 0.43483087721811375, + "grad_norm": 1.822213053703308, + "learning_rate": 0.0001256736408537433, + "loss": 1.2594, + "step": 12142 + }, + { + "epoch": 0.4348666893476821, + "grad_norm": 1.6843619346618652, + "learning_rate": 0.00012566243045008504, + "loss": 1.3187, + "step": 12143 + }, + { + "epoch": 0.43490250147725035, + "grad_norm": 1.9607222080230713, + "learning_rate": 0.0001256512197011719, + "loss": 1.3361, + "step": 12144 + }, + { + "epoch": 0.4349383136068186, + "grad_norm": 1.5660905838012695, + "learning_rate": 0.00012564000860715464, + "loss": 1.2577, + "step": 12145 + }, + { + "epoch": 0.43497412573638694, + "grad_norm": 3.187577724456787, + "learning_rate": 0.00012562879716818416, + "loss": 1.429, + "step": 12146 + }, + { + "epoch": 0.4350099378659552, + "grad_norm": 1.3576406240463257, + "learning_rate": 0.00012561758538441126, + "loss": 1.0637, + "step": 12147 + }, + { + "epoch": 0.4350457499955235, + "grad_norm": 1.4991236925125122, + "learning_rate": 0.0001256063732559868, + "loss": 1.0831, + "step": 12148 + }, + { + "epoch": 0.43508156212509175, + "grad_norm": 1.4257315397262573, + "learning_rate": 0.0001255951607830616, + "loss": 1.0345, + "step": 12149 + }, + { + "epoch": 0.43511737425466007, + "grad_norm": 1.357177495956421, + "learning_rate": 0.00012558394796578656, + "loss": 1.0448, + "step": 12150 + }, + { + "epoch": 0.43515318638422834, + "grad_norm": 1.3740768432617188, + "learning_rate": 0.0001255727348043125, + "loss": 1.299, + "step": 12151 + }, + { + "epoch": 0.4351889985137966, + "grad_norm": 1.393744707107544, + "learning_rate": 0.00012556152129879027, + "loss": 1.0882, + "step": 12152 + }, + { + "epoch": 0.43522481064336493, + "grad_norm": 1.7662427425384521, + "learning_rate": 0.00012555030744937075, + "loss": 0.9431, + "step": 12153 + }, + { + "epoch": 0.4352606227729332, + "grad_norm": 1.9019646644592285, + "learning_rate": 0.0001255390932562048, + "loss": 1.1183, + "step": 12154 + }, + { + "epoch": 0.4352964349025015, + "grad_norm": 1.5904232263565063, + "learning_rate": 0.00012552787871944327, + "loss": 1.3046, + "step": 12155 + }, + { + "epoch": 0.43533224703206974, + "grad_norm": 2.2260119915008545, + "learning_rate": 0.00012551666383923705, + "loss": 1.1122, + "step": 12156 + }, + { + "epoch": 0.43536805916163807, + "grad_norm": 1.6540255546569824, + "learning_rate": 0.00012550544861573707, + "loss": 0.9744, + "step": 12157 + }, + { + "epoch": 0.43540387129120633, + "grad_norm": 1.41975998878479, + "learning_rate": 0.0001254942330490942, + "loss": 1.2384, + "step": 12158 + }, + { + "epoch": 0.4354396834207746, + "grad_norm": 1.9200338125228882, + "learning_rate": 0.00012548301713945925, + "loss": 1.1594, + "step": 12159 + }, + { + "epoch": 0.43547549555034293, + "grad_norm": 1.273733377456665, + "learning_rate": 0.00012547180088698322, + "loss": 1.073, + "step": 12160 + }, + { + "epoch": 0.4355113076799112, + "grad_norm": 2.0867300033569336, + "learning_rate": 0.00012546058429181692, + "loss": 1.3962, + "step": 12161 + }, + { + "epoch": 0.43554711980947947, + "grad_norm": 2.0041027069091797, + "learning_rate": 0.00012544936735411135, + "loss": 1.1082, + "step": 12162 + }, + { + "epoch": 0.43558293193904773, + "grad_norm": 1.7353641986846924, + "learning_rate": 0.00012543815007401733, + "loss": 1.1811, + "step": 12163 + }, + { + "epoch": 0.43561874406861606, + "grad_norm": 1.5971543788909912, + "learning_rate": 0.00012542693245168584, + "loss": 1.2716, + "step": 12164 + }, + { + "epoch": 0.43565455619818433, + "grad_norm": 2.1535098552703857, + "learning_rate": 0.00012541571448726775, + "loss": 1.1329, + "step": 12165 + }, + { + "epoch": 0.4356903683277526, + "grad_norm": 1.6712000370025635, + "learning_rate": 0.00012540449618091403, + "loss": 1.1655, + "step": 12166 + }, + { + "epoch": 0.4357261804573209, + "grad_norm": 1.4329904317855835, + "learning_rate": 0.00012539327753277555, + "loss": 1.292, + "step": 12167 + }, + { + "epoch": 0.4357619925868892, + "grad_norm": 1.668187141418457, + "learning_rate": 0.00012538205854300334, + "loss": 1.1591, + "step": 12168 + }, + { + "epoch": 0.43579780471645746, + "grad_norm": 1.3478697538375854, + "learning_rate": 0.00012537083921174822, + "loss": 1.1572, + "step": 12169 + }, + { + "epoch": 0.43583361684602573, + "grad_norm": 1.505539059638977, + "learning_rate": 0.0001253596195391612, + "loss": 1.0548, + "step": 12170 + }, + { + "epoch": 0.43586942897559405, + "grad_norm": 1.3353190422058105, + "learning_rate": 0.0001253483995253932, + "loss": 1.2524, + "step": 12171 + }, + { + "epoch": 0.4359052411051623, + "grad_norm": 1.5034992694854736, + "learning_rate": 0.00012533717917059516, + "loss": 1.0227, + "step": 12172 + }, + { + "epoch": 0.4359410532347306, + "grad_norm": 1.452196717262268, + "learning_rate": 0.0001253259584749181, + "loss": 0.9852, + "step": 12173 + }, + { + "epoch": 0.4359768653642989, + "grad_norm": 1.5961917638778687, + "learning_rate": 0.0001253147374385129, + "loss": 1.0824, + "step": 12174 + }, + { + "epoch": 0.4360126774938672, + "grad_norm": 1.535973072052002, + "learning_rate": 0.0001253035160615306, + "loss": 1.2408, + "step": 12175 + }, + { + "epoch": 0.43604848962343545, + "grad_norm": 1.2998002767562866, + "learning_rate": 0.00012529229434412212, + "loss": 1.1744, + "step": 12176 + }, + { + "epoch": 0.4360843017530037, + "grad_norm": 1.9616923332214355, + "learning_rate": 0.00012528107228643843, + "loss": 1.1688, + "step": 12177 + }, + { + "epoch": 0.43612011388257205, + "grad_norm": 1.4870790243148804, + "learning_rate": 0.00012526984988863054, + "loss": 1.1684, + "step": 12178 + }, + { + "epoch": 0.4361559260121403, + "grad_norm": 1.7569197416305542, + "learning_rate": 0.0001252586271508494, + "loss": 1.2402, + "step": 12179 + }, + { + "epoch": 0.4361917381417086, + "grad_norm": 1.8350422382354736, + "learning_rate": 0.000125247404073246, + "loss": 1.1547, + "step": 12180 + }, + { + "epoch": 0.43622755027127685, + "grad_norm": 1.6148453950881958, + "learning_rate": 0.0001252361806559714, + "loss": 1.1603, + "step": 12181 + }, + { + "epoch": 0.4362633624008452, + "grad_norm": 1.913171648979187, + "learning_rate": 0.00012522495689917647, + "loss": 1.1287, + "step": 12182 + }, + { + "epoch": 0.43629917453041345, + "grad_norm": 1.6710659265518188, + "learning_rate": 0.00012521373280301233, + "loss": 1.2227, + "step": 12183 + }, + { + "epoch": 0.4363349866599817, + "grad_norm": 1.2710306644439697, + "learning_rate": 0.0001252025083676299, + "loss": 0.8884, + "step": 12184 + }, + { + "epoch": 0.43637079878955004, + "grad_norm": 1.3959388732910156, + "learning_rate": 0.00012519128359318027, + "loss": 0.9852, + "step": 12185 + }, + { + "epoch": 0.4364066109191183, + "grad_norm": 1.573169469833374, + "learning_rate": 0.0001251800584798144, + "loss": 1.1237, + "step": 12186 + }, + { + "epoch": 0.4364424230486866, + "grad_norm": 1.5965335369110107, + "learning_rate": 0.0001251688330276833, + "loss": 1.2066, + "step": 12187 + }, + { + "epoch": 0.43647823517825485, + "grad_norm": 1.358051061630249, + "learning_rate": 0.00012515760723693807, + "loss": 1.2832, + "step": 12188 + }, + { + "epoch": 0.43651404730782317, + "grad_norm": 1.6009246110916138, + "learning_rate": 0.00012514638110772963, + "loss": 0.9278, + "step": 12189 + }, + { + "epoch": 0.43654985943739144, + "grad_norm": 1.4397426843643188, + "learning_rate": 0.0001251351546402091, + "loss": 1.3015, + "step": 12190 + }, + { + "epoch": 0.4365856715669597, + "grad_norm": 1.3567500114440918, + "learning_rate": 0.00012512392783452746, + "loss": 1.1912, + "step": 12191 + }, + { + "epoch": 0.43662148369652803, + "grad_norm": 1.8230180740356445, + "learning_rate": 0.00012511270069083582, + "loss": 1.1051, + "step": 12192 + }, + { + "epoch": 0.4366572958260963, + "grad_norm": 1.2205828428268433, + "learning_rate": 0.00012510147320928515, + "loss": 0.9823, + "step": 12193 + }, + { + "epoch": 0.4366931079556646, + "grad_norm": 1.3983910083770752, + "learning_rate": 0.00012509024539002653, + "loss": 0.9032, + "step": 12194 + }, + { + "epoch": 0.43672892008523284, + "grad_norm": 1.587350845336914, + "learning_rate": 0.00012507901723321106, + "loss": 1.209, + "step": 12195 + }, + { + "epoch": 0.43676473221480117, + "grad_norm": 1.4987655878067017, + "learning_rate": 0.00012506778873898976, + "loss": 1.1556, + "step": 12196 + }, + { + "epoch": 0.43680054434436943, + "grad_norm": 2.0972630977630615, + "learning_rate": 0.00012505655990751368, + "loss": 1.4369, + "step": 12197 + }, + { + "epoch": 0.4368363564739377, + "grad_norm": 1.6574863195419312, + "learning_rate": 0.0001250453307389339, + "loss": 1.2464, + "step": 12198 + }, + { + "epoch": 0.43687216860350603, + "grad_norm": 1.4147732257843018, + "learning_rate": 0.0001250341012334015, + "loss": 1.2181, + "step": 12199 + }, + { + "epoch": 0.4369079807330743, + "grad_norm": 1.4186499118804932, + "learning_rate": 0.00012502287139106756, + "loss": 0.8707, + "step": 12200 + }, + { + "epoch": 0.43694379286264257, + "grad_norm": 1.617843747138977, + "learning_rate": 0.0001250116412120832, + "loss": 1.2475, + "step": 12201 + }, + { + "epoch": 0.43697960499221083, + "grad_norm": 1.8066749572753906, + "learning_rate": 0.00012500041069659943, + "loss": 1.0294, + "step": 12202 + }, + { + "epoch": 0.43701541712177916, + "grad_norm": 1.485886812210083, + "learning_rate": 0.00012498917984476738, + "loss": 1.2035, + "step": 12203 + }, + { + "epoch": 0.43705122925134743, + "grad_norm": 1.886085867881775, + "learning_rate": 0.00012497794865673817, + "loss": 1.0318, + "step": 12204 + }, + { + "epoch": 0.4370870413809157, + "grad_norm": 1.6786655187606812, + "learning_rate": 0.0001249667171326629, + "loss": 1.2964, + "step": 12205 + }, + { + "epoch": 0.437122853510484, + "grad_norm": 1.2648266553878784, + "learning_rate": 0.0001249554852726926, + "loss": 0.9994, + "step": 12206 + }, + { + "epoch": 0.4371586656400523, + "grad_norm": 1.620659351348877, + "learning_rate": 0.00012494425307697847, + "loss": 1.1761, + "step": 12207 + }, + { + "epoch": 0.43719447776962056, + "grad_norm": 1.2568840980529785, + "learning_rate": 0.0001249330205456716, + "loss": 1.2354, + "step": 12208 + }, + { + "epoch": 0.43723028989918883, + "grad_norm": 1.4191741943359375, + "learning_rate": 0.00012492178767892307, + "loss": 1.1247, + "step": 12209 + }, + { + "epoch": 0.43726610202875715, + "grad_norm": 1.4551115036010742, + "learning_rate": 0.00012491055447688405, + "loss": 1.1196, + "step": 12210 + }, + { + "epoch": 0.4373019141583254, + "grad_norm": 1.698448657989502, + "learning_rate": 0.00012489932093970568, + "loss": 1.2019, + "step": 12211 + }, + { + "epoch": 0.4373377262878937, + "grad_norm": 1.6398769617080688, + "learning_rate": 0.00012488808706753902, + "loss": 1.2431, + "step": 12212 + }, + { + "epoch": 0.437373538417462, + "grad_norm": 1.4033682346343994, + "learning_rate": 0.00012487685286053526, + "loss": 1.289, + "step": 12213 + }, + { + "epoch": 0.4374093505470303, + "grad_norm": 1.2576318979263306, + "learning_rate": 0.00012486561831884552, + "loss": 0.9432, + "step": 12214 + }, + { + "epoch": 0.43744516267659855, + "grad_norm": 1.4915738105773926, + "learning_rate": 0.000124854383442621, + "loss": 1.2148, + "step": 12215 + }, + { + "epoch": 0.4374809748061668, + "grad_norm": 1.284921646118164, + "learning_rate": 0.00012484314823201276, + "loss": 1.0058, + "step": 12216 + }, + { + "epoch": 0.43751678693573515, + "grad_norm": 1.8424102067947388, + "learning_rate": 0.00012483191268717207, + "loss": 1.1827, + "step": 12217 + }, + { + "epoch": 0.4375525990653034, + "grad_norm": 1.5602511167526245, + "learning_rate": 0.00012482067680824998, + "loss": 1.0996, + "step": 12218 + }, + { + "epoch": 0.4375884111948717, + "grad_norm": 1.4309464693069458, + "learning_rate": 0.0001248094405953977, + "loss": 1.1484, + "step": 12219 + }, + { + "epoch": 0.43762422332444, + "grad_norm": 1.8141108751296997, + "learning_rate": 0.00012479820404876643, + "loss": 1.1741, + "step": 12220 + }, + { + "epoch": 0.4376600354540083, + "grad_norm": 1.304327368736267, + "learning_rate": 0.0001247869671685073, + "loss": 0.9806, + "step": 12221 + }, + { + "epoch": 0.43769584758357655, + "grad_norm": 1.5284442901611328, + "learning_rate": 0.0001247757299547715, + "loss": 0.9385, + "step": 12222 + }, + { + "epoch": 0.4377316597131448, + "grad_norm": 1.8902809619903564, + "learning_rate": 0.00012476449240771023, + "loss": 1.2027, + "step": 12223 + }, + { + "epoch": 0.43776747184271314, + "grad_norm": 1.7086906433105469, + "learning_rate": 0.0001247532545274746, + "loss": 1.2799, + "step": 12224 + }, + { + "epoch": 0.4378032839722814, + "grad_norm": 1.481526494026184, + "learning_rate": 0.00012474201631421588, + "loss": 0.9956, + "step": 12225 + }, + { + "epoch": 0.4378390961018497, + "grad_norm": 1.3143192529678345, + "learning_rate": 0.00012473077776808527, + "loss": 1.1873, + "step": 12226 + }, + { + "epoch": 0.437874908231418, + "grad_norm": 2.1252799034118652, + "learning_rate": 0.00012471953888923393, + "loss": 1.1323, + "step": 12227 + }, + { + "epoch": 0.43791072036098627, + "grad_norm": 1.3626461029052734, + "learning_rate": 0.00012470829967781307, + "loss": 1.0067, + "step": 12228 + }, + { + "epoch": 0.43794653249055454, + "grad_norm": 2.006732225418091, + "learning_rate": 0.00012469706013397395, + "loss": 1.2512, + "step": 12229 + }, + { + "epoch": 0.4379823446201228, + "grad_norm": 1.3411105871200562, + "learning_rate": 0.00012468582025786774, + "loss": 1.1896, + "step": 12230 + }, + { + "epoch": 0.43801815674969113, + "grad_norm": 1.3323489427566528, + "learning_rate": 0.0001246745800496456, + "loss": 1.0672, + "step": 12231 + }, + { + "epoch": 0.4380539688792594, + "grad_norm": 1.5205799341201782, + "learning_rate": 0.00012466333950945889, + "loss": 1.171, + "step": 12232 + }, + { + "epoch": 0.4380897810088277, + "grad_norm": 2.228858709335327, + "learning_rate": 0.0001246520986374587, + "loss": 0.998, + "step": 12233 + }, + { + "epoch": 0.438125593138396, + "grad_norm": 1.5177810192108154, + "learning_rate": 0.00012464085743379635, + "loss": 1.0803, + "step": 12234 + }, + { + "epoch": 0.43816140526796427, + "grad_norm": 1.2607026100158691, + "learning_rate": 0.000124629615898623, + "loss": 1.1118, + "step": 12235 + }, + { + "epoch": 0.43819721739753253, + "grad_norm": 1.6762537956237793, + "learning_rate": 0.00012461837403209, + "loss": 1.268, + "step": 12236 + }, + { + "epoch": 0.4382330295271008, + "grad_norm": 1.481716513633728, + "learning_rate": 0.0001246071318343485, + "loss": 0.9934, + "step": 12237 + }, + { + "epoch": 0.43826884165666913, + "grad_norm": 2.333979606628418, + "learning_rate": 0.0001245958893055498, + "loss": 1.0683, + "step": 12238 + }, + { + "epoch": 0.4383046537862374, + "grad_norm": 1.3189713954925537, + "learning_rate": 0.00012458464644584516, + "loss": 0.9762, + "step": 12239 + }, + { + "epoch": 0.43834046591580567, + "grad_norm": 1.3026782274246216, + "learning_rate": 0.00012457340325538576, + "loss": 1.0762, + "step": 12240 + }, + { + "epoch": 0.438376278045374, + "grad_norm": 1.5998269319534302, + "learning_rate": 0.00012456215973432295, + "loss": 0.9515, + "step": 12241 + }, + { + "epoch": 0.43841209017494226, + "grad_norm": 1.38333261013031, + "learning_rate": 0.00012455091588280793, + "loss": 1.3003, + "step": 12242 + }, + { + "epoch": 0.43844790230451053, + "grad_norm": 1.4339632987976074, + "learning_rate": 0.00012453967170099204, + "loss": 0.9921, + "step": 12243 + }, + { + "epoch": 0.4384837144340788, + "grad_norm": 1.7056729793548584, + "learning_rate": 0.00012452842718902647, + "loss": 1.2294, + "step": 12244 + }, + { + "epoch": 0.4385195265636471, + "grad_norm": 1.6594828367233276, + "learning_rate": 0.00012451718234706262, + "loss": 1.0934, + "step": 12245 + }, + { + "epoch": 0.4385553386932154, + "grad_norm": 1.4561437368392944, + "learning_rate": 0.00012450593717525167, + "loss": 1.0356, + "step": 12246 + }, + { + "epoch": 0.43859115082278366, + "grad_norm": 2.1342005729675293, + "learning_rate": 0.00012449469167374498, + "loss": 1.3991, + "step": 12247 + }, + { + "epoch": 0.438626962952352, + "grad_norm": 1.8364615440368652, + "learning_rate": 0.00012448344584269379, + "loss": 1.2797, + "step": 12248 + }, + { + "epoch": 0.43866277508192025, + "grad_norm": 1.3999775648117065, + "learning_rate": 0.0001244721996822494, + "loss": 1.0617, + "step": 12249 + }, + { + "epoch": 0.4386985872114885, + "grad_norm": 1.3803467750549316, + "learning_rate": 0.00012446095319256314, + "loss": 1.1224, + "step": 12250 + }, + { + "epoch": 0.4387343993410568, + "grad_norm": 1.4045273065567017, + "learning_rate": 0.00012444970637378631, + "loss": 1.0483, + "step": 12251 + }, + { + "epoch": 0.4387702114706251, + "grad_norm": 1.6266058683395386, + "learning_rate": 0.0001244384592260702, + "loss": 1.2351, + "step": 12252 + }, + { + "epoch": 0.4388060236001934, + "grad_norm": 1.3974958658218384, + "learning_rate": 0.00012442721174956616, + "loss": 1.1967, + "step": 12253 + }, + { + "epoch": 0.43884183572976165, + "grad_norm": 1.3655990362167358, + "learning_rate": 0.0001244159639444255, + "loss": 1.1257, + "step": 12254 + }, + { + "epoch": 0.43887764785933, + "grad_norm": 1.6743096113204956, + "learning_rate": 0.00012440471581079952, + "loss": 1.351, + "step": 12255 + }, + { + "epoch": 0.43891345998889825, + "grad_norm": 1.2926181554794312, + "learning_rate": 0.0001243934673488396, + "loss": 1.0818, + "step": 12256 + }, + { + "epoch": 0.4389492721184665, + "grad_norm": 1.4278771877288818, + "learning_rate": 0.00012438221855869702, + "loss": 1.1327, + "step": 12257 + }, + { + "epoch": 0.4389850842480348, + "grad_norm": 1.536197543144226, + "learning_rate": 0.00012437096944052317, + "loss": 1.2523, + "step": 12258 + }, + { + "epoch": 0.4390208963776031, + "grad_norm": 1.3945857286453247, + "learning_rate": 0.0001243597199944693, + "loss": 0.989, + "step": 12259 + }, + { + "epoch": 0.4390567085071714, + "grad_norm": 1.7818763256072998, + "learning_rate": 0.0001243484702206869, + "loss": 1.0314, + "step": 12260 + }, + { + "epoch": 0.43909252063673965, + "grad_norm": 1.4189872741699219, + "learning_rate": 0.00012433722011932717, + "loss": 1.023, + "step": 12261 + }, + { + "epoch": 0.43912833276630797, + "grad_norm": 1.5881117582321167, + "learning_rate": 0.00012432596969054157, + "loss": 1.3157, + "step": 12262 + }, + { + "epoch": 0.43916414489587624, + "grad_norm": 1.5793838500976562, + "learning_rate": 0.0001243147189344814, + "loss": 1.1827, + "step": 12263 + }, + { + "epoch": 0.4391999570254445, + "grad_norm": 1.2466567754745483, + "learning_rate": 0.0001243034678512981, + "loss": 1.1147, + "step": 12264 + }, + { + "epoch": 0.4392357691550128, + "grad_norm": 1.5679523944854736, + "learning_rate": 0.00012429221644114294, + "loss": 1.2779, + "step": 12265 + }, + { + "epoch": 0.4392715812845811, + "grad_norm": 1.8140414953231812, + "learning_rate": 0.00012428096470416738, + "loss": 1.0391, + "step": 12266 + }, + { + "epoch": 0.43930739341414937, + "grad_norm": 1.8956023454666138, + "learning_rate": 0.00012426971264052275, + "loss": 1.1223, + "step": 12267 + }, + { + "epoch": 0.43934320554371764, + "grad_norm": 1.5351024866104126, + "learning_rate": 0.00012425846025036042, + "loss": 1.0947, + "step": 12268 + }, + { + "epoch": 0.43937901767328597, + "grad_norm": 1.5215864181518555, + "learning_rate": 0.0001242472075338318, + "loss": 1.0747, + "step": 12269 + }, + { + "epoch": 0.43941482980285423, + "grad_norm": 1.4782159328460693, + "learning_rate": 0.0001242359544910883, + "loss": 1.1874, + "step": 12270 + }, + { + "epoch": 0.4394506419324225, + "grad_norm": 1.3398631811141968, + "learning_rate": 0.00012422470112228125, + "loss": 1.2322, + "step": 12271 + }, + { + "epoch": 0.43948645406199077, + "grad_norm": 1.5604110956192017, + "learning_rate": 0.00012421344742756215, + "loss": 0.9945, + "step": 12272 + }, + { + "epoch": 0.4395222661915591, + "grad_norm": 1.7148844003677368, + "learning_rate": 0.00012420219340708236, + "loss": 1.1619, + "step": 12273 + }, + { + "epoch": 0.43955807832112737, + "grad_norm": 1.4472572803497314, + "learning_rate": 0.00012419093906099323, + "loss": 1.1375, + "step": 12274 + }, + { + "epoch": 0.43959389045069563, + "grad_norm": 1.3941391706466675, + "learning_rate": 0.00012417968438944622, + "loss": 1.0379, + "step": 12275 + }, + { + "epoch": 0.43962970258026396, + "grad_norm": 2.0516011714935303, + "learning_rate": 0.0001241684293925928, + "loss": 1.1115, + "step": 12276 + }, + { + "epoch": 0.43966551470983223, + "grad_norm": 1.495137333869934, + "learning_rate": 0.00012415717407058427, + "loss": 0.9809, + "step": 12277 + }, + { + "epoch": 0.4397013268394005, + "grad_norm": 1.3164634704589844, + "learning_rate": 0.00012414591842357215, + "loss": 0.9653, + "step": 12278 + }, + { + "epoch": 0.43973713896896877, + "grad_norm": 1.2221169471740723, + "learning_rate": 0.00012413466245170783, + "loss": 0.9183, + "step": 12279 + }, + { + "epoch": 0.4397729510985371, + "grad_norm": 1.5365604162216187, + "learning_rate": 0.0001241234061551428, + "loss": 1.1307, + "step": 12280 + }, + { + "epoch": 0.43980876322810536, + "grad_norm": 1.7028917074203491, + "learning_rate": 0.00012411214953402842, + "loss": 1.164, + "step": 12281 + }, + { + "epoch": 0.43984457535767363, + "grad_norm": 1.5497665405273438, + "learning_rate": 0.00012410089258851618, + "loss": 1.1996, + "step": 12282 + }, + { + "epoch": 0.43988038748724195, + "grad_norm": 1.3928496837615967, + "learning_rate": 0.00012408963531875753, + "loss": 1.2322, + "step": 12283 + }, + { + "epoch": 0.4399161996168102, + "grad_norm": 1.1123722791671753, + "learning_rate": 0.00012407837772490389, + "loss": 1.1044, + "step": 12284 + }, + { + "epoch": 0.4399520117463785, + "grad_norm": 1.4161484241485596, + "learning_rate": 0.00012406711980710676, + "loss": 1.1977, + "step": 12285 + }, + { + "epoch": 0.43998782387594676, + "grad_norm": 1.407277226448059, + "learning_rate": 0.00012405586156551753, + "loss": 1.115, + "step": 12286 + }, + { + "epoch": 0.4400236360055151, + "grad_norm": 1.5001519918441772, + "learning_rate": 0.00012404460300028774, + "loss": 0.9029, + "step": 12287 + }, + { + "epoch": 0.44005944813508335, + "grad_norm": 1.4687556028366089, + "learning_rate": 0.00012403334411156884, + "loss": 1.185, + "step": 12288 + }, + { + "epoch": 0.4400952602646516, + "grad_norm": 1.6131868362426758, + "learning_rate": 0.0001240220848995123, + "loss": 1.1969, + "step": 12289 + }, + { + "epoch": 0.44013107239421995, + "grad_norm": 1.584001898765564, + "learning_rate": 0.00012401082536426958, + "loss": 1.1534, + "step": 12290 + }, + { + "epoch": 0.4401668845237882, + "grad_norm": 1.6101514101028442, + "learning_rate": 0.00012399956550599218, + "loss": 1.2468, + "step": 12291 + }, + { + "epoch": 0.4402026966533565, + "grad_norm": 1.5584014654159546, + "learning_rate": 0.0001239883053248316, + "loss": 1.1554, + "step": 12292 + }, + { + "epoch": 0.44023850878292475, + "grad_norm": 1.7444429397583008, + "learning_rate": 0.0001239770448209393, + "loss": 1.2868, + "step": 12293 + }, + { + "epoch": 0.4402743209124931, + "grad_norm": 1.6582573652267456, + "learning_rate": 0.00012396578399446678, + "loss": 1.1519, + "step": 12294 + }, + { + "epoch": 0.44031013304206135, + "grad_norm": 1.7564115524291992, + "learning_rate": 0.00012395452284556558, + "loss": 1.3002, + "step": 12295 + }, + { + "epoch": 0.4403459451716296, + "grad_norm": 1.995479702949524, + "learning_rate": 0.00012394326137438714, + "loss": 1.0327, + "step": 12296 + }, + { + "epoch": 0.44038175730119794, + "grad_norm": 1.375566005706787, + "learning_rate": 0.000123931999581083, + "loss": 0.9413, + "step": 12297 + }, + { + "epoch": 0.4404175694307662, + "grad_norm": 1.4806358814239502, + "learning_rate": 0.00012392073746580472, + "loss": 1.1796, + "step": 12298 + }, + { + "epoch": 0.4404533815603345, + "grad_norm": 1.346580147743225, + "learning_rate": 0.00012390947502870375, + "loss": 0.9904, + "step": 12299 + }, + { + "epoch": 0.44048919368990275, + "grad_norm": 1.587193489074707, + "learning_rate": 0.00012389821226993164, + "loss": 0.9299, + "step": 12300 + }, + { + "epoch": 0.44052500581947107, + "grad_norm": 1.4852557182312012, + "learning_rate": 0.0001238869491896399, + "loss": 1.187, + "step": 12301 + }, + { + "epoch": 0.44056081794903934, + "grad_norm": 1.4641693830490112, + "learning_rate": 0.00012387568578798005, + "loss": 1.0319, + "step": 12302 + }, + { + "epoch": 0.4405966300786076, + "grad_norm": 1.8615264892578125, + "learning_rate": 0.00012386442206510368, + "loss": 1.1933, + "step": 12303 + }, + { + "epoch": 0.44063244220817593, + "grad_norm": 1.3708473443984985, + "learning_rate": 0.00012385315802116226, + "loss": 1.1667, + "step": 12304 + }, + { + "epoch": 0.4406682543377442, + "grad_norm": 1.658268928527832, + "learning_rate": 0.0001238418936563074, + "loss": 1.0945, + "step": 12305 + }, + { + "epoch": 0.44070406646731247, + "grad_norm": 1.76737380027771, + "learning_rate": 0.0001238306289706906, + "loss": 1.1382, + "step": 12306 + }, + { + "epoch": 0.44073987859688074, + "grad_norm": 1.8121154308319092, + "learning_rate": 0.00012381936396446344, + "loss": 0.9982, + "step": 12307 + }, + { + "epoch": 0.44077569072644907, + "grad_norm": 1.4220837354660034, + "learning_rate": 0.00012380809863777746, + "loss": 1.1116, + "step": 12308 + }, + { + "epoch": 0.44081150285601733, + "grad_norm": 2.541186571121216, + "learning_rate": 0.00012379683299078422, + "loss": 1.4073, + "step": 12309 + }, + { + "epoch": 0.4408473149855856, + "grad_norm": 1.5617212057113647, + "learning_rate": 0.00012378556702363527, + "loss": 1.0542, + "step": 12310 + }, + { + "epoch": 0.4408831271151539, + "grad_norm": 1.4194533824920654, + "learning_rate": 0.00012377430073648218, + "loss": 1.0928, + "step": 12311 + }, + { + "epoch": 0.4409189392447222, + "grad_norm": 1.897386074066162, + "learning_rate": 0.0001237630341294766, + "loss": 1.2446, + "step": 12312 + }, + { + "epoch": 0.44095475137429047, + "grad_norm": 1.7641335725784302, + "learning_rate": 0.00012375176720277002, + "loss": 1.1046, + "step": 12313 + }, + { + "epoch": 0.44099056350385873, + "grad_norm": 1.8271814584732056, + "learning_rate": 0.00012374049995651405, + "loss": 1.1575, + "step": 12314 + }, + { + "epoch": 0.44102637563342706, + "grad_norm": 1.8333297967910767, + "learning_rate": 0.00012372923239086024, + "loss": 1.1219, + "step": 12315 + }, + { + "epoch": 0.44106218776299533, + "grad_norm": 1.4991517066955566, + "learning_rate": 0.00012371796450596028, + "loss": 0.933, + "step": 12316 + }, + { + "epoch": 0.4410979998925636, + "grad_norm": 1.6211124658584595, + "learning_rate": 0.00012370669630196567, + "loss": 1.2268, + "step": 12317 + }, + { + "epoch": 0.4411338120221319, + "grad_norm": 1.6104605197906494, + "learning_rate": 0.00012369542777902805, + "loss": 1.0983, + "step": 12318 + }, + { + "epoch": 0.4411696241517002, + "grad_norm": 1.475265383720398, + "learning_rate": 0.00012368415893729902, + "loss": 1.2128, + "step": 12319 + }, + { + "epoch": 0.44120543628126846, + "grad_norm": 1.4736744165420532, + "learning_rate": 0.00012367288977693016, + "loss": 1.1205, + "step": 12320 + }, + { + "epoch": 0.44124124841083673, + "grad_norm": 1.3254427909851074, + "learning_rate": 0.0001236616202980731, + "loss": 1.0517, + "step": 12321 + }, + { + "epoch": 0.44127706054040505, + "grad_norm": 1.4528627395629883, + "learning_rate": 0.0001236503505008795, + "loss": 0.945, + "step": 12322 + }, + { + "epoch": 0.4413128726699733, + "grad_norm": 1.9350597858428955, + "learning_rate": 0.0001236390803855009, + "loss": 1.3479, + "step": 12323 + }, + { + "epoch": 0.4413486847995416, + "grad_norm": 1.3891699314117432, + "learning_rate": 0.00012362780995208895, + "loss": 1.0924, + "step": 12324 + }, + { + "epoch": 0.4413844969291099, + "grad_norm": 1.5208957195281982, + "learning_rate": 0.00012361653920079534, + "loss": 1.2591, + "step": 12325 + }, + { + "epoch": 0.4414203090586782, + "grad_norm": 1.5128856897354126, + "learning_rate": 0.00012360526813177163, + "loss": 1.0203, + "step": 12326 + }, + { + "epoch": 0.44145612118824645, + "grad_norm": 1.5610188245773315, + "learning_rate": 0.00012359399674516955, + "loss": 1.0421, + "step": 12327 + }, + { + "epoch": 0.4414919333178147, + "grad_norm": 1.372241735458374, + "learning_rate": 0.00012358272504114058, + "loss": 1.1144, + "step": 12328 + }, + { + "epoch": 0.44152774544738305, + "grad_norm": 1.9234801530838013, + "learning_rate": 0.00012357145301983651, + "loss": 1.2727, + "step": 12329 + }, + { + "epoch": 0.4415635575769513, + "grad_norm": 1.3964424133300781, + "learning_rate": 0.00012356018068140895, + "loss": 1.2931, + "step": 12330 + }, + { + "epoch": 0.4415993697065196, + "grad_norm": 1.7438509464263916, + "learning_rate": 0.00012354890802600957, + "loss": 1.1746, + "step": 12331 + }, + { + "epoch": 0.4416351818360879, + "grad_norm": 1.7742996215820312, + "learning_rate": 0.00012353763505378997, + "loss": 1.141, + "step": 12332 + }, + { + "epoch": 0.4416709939656562, + "grad_norm": 1.853426218032837, + "learning_rate": 0.00012352636176490186, + "loss": 1.4558, + "step": 12333 + }, + { + "epoch": 0.44170680609522445, + "grad_norm": 1.2490230798721313, + "learning_rate": 0.00012351508815949691, + "loss": 1.1532, + "step": 12334 + }, + { + "epoch": 0.4417426182247927, + "grad_norm": 1.8085836172103882, + "learning_rate": 0.00012350381423772676, + "loss": 1.1017, + "step": 12335 + }, + { + "epoch": 0.44177843035436104, + "grad_norm": 1.3237571716308594, + "learning_rate": 0.00012349253999974314, + "loss": 0.8352, + "step": 12336 + }, + { + "epoch": 0.4418142424839293, + "grad_norm": 1.1856634616851807, + "learning_rate": 0.00012348126544569767, + "loss": 1.1192, + "step": 12337 + }, + { + "epoch": 0.4418500546134976, + "grad_norm": 1.2362232208251953, + "learning_rate": 0.00012346999057574209, + "loss": 1.231, + "step": 12338 + }, + { + "epoch": 0.4418858667430659, + "grad_norm": 1.8815280199050903, + "learning_rate": 0.00012345871539002801, + "loss": 1.2422, + "step": 12339 + }, + { + "epoch": 0.44192167887263417, + "grad_norm": 1.6170735359191895, + "learning_rate": 0.00012344743988870722, + "loss": 1.1256, + "step": 12340 + }, + { + "epoch": 0.44195749100220244, + "grad_norm": 1.5714569091796875, + "learning_rate": 0.00012343616407193135, + "loss": 1.1741, + "step": 12341 + }, + { + "epoch": 0.4419933031317707, + "grad_norm": 1.6041383743286133, + "learning_rate": 0.00012342488793985214, + "loss": 1.1743, + "step": 12342 + }, + { + "epoch": 0.44202911526133903, + "grad_norm": 1.9145398139953613, + "learning_rate": 0.00012341361149262125, + "loss": 1.1208, + "step": 12343 + }, + { + "epoch": 0.4420649273909073, + "grad_norm": 1.8837475776672363, + "learning_rate": 0.00012340233473039045, + "loss": 0.9591, + "step": 12344 + }, + { + "epoch": 0.44210073952047557, + "grad_norm": 1.736660122871399, + "learning_rate": 0.00012339105765331142, + "loss": 0.9091, + "step": 12345 + }, + { + "epoch": 0.4421365516500439, + "grad_norm": 1.7238366603851318, + "learning_rate": 0.00012337978026153587, + "loss": 1.0146, + "step": 12346 + }, + { + "epoch": 0.44217236377961217, + "grad_norm": 1.6077978610992432, + "learning_rate": 0.00012336850255521554, + "loss": 1.1612, + "step": 12347 + }, + { + "epoch": 0.44220817590918043, + "grad_norm": 1.3025940656661987, + "learning_rate": 0.00012335722453450215, + "loss": 1.1563, + "step": 12348 + }, + { + "epoch": 0.4422439880387487, + "grad_norm": 1.8151426315307617, + "learning_rate": 0.00012334594619954742, + "loss": 1.0028, + "step": 12349 + }, + { + "epoch": 0.442279800168317, + "grad_norm": 1.721496820449829, + "learning_rate": 0.0001233346675505031, + "loss": 1.2421, + "step": 12350 + }, + { + "epoch": 0.4423156122978853, + "grad_norm": 1.526387333869934, + "learning_rate": 0.00012332338858752094, + "loss": 1.2101, + "step": 12351 + }, + { + "epoch": 0.44235142442745357, + "grad_norm": 1.8356789350509644, + "learning_rate": 0.0001233121093107527, + "loss": 1.1535, + "step": 12352 + }, + { + "epoch": 0.4423872365570219, + "grad_norm": 1.2904812097549438, + "learning_rate": 0.00012330082972035006, + "loss": 0.947, + "step": 12353 + }, + { + "epoch": 0.44242304868659016, + "grad_norm": 2.180267810821533, + "learning_rate": 0.00012328954981646482, + "loss": 1.3845, + "step": 12354 + }, + { + "epoch": 0.44245886081615843, + "grad_norm": 1.4502116441726685, + "learning_rate": 0.0001232782695992487, + "loss": 1.03, + "step": 12355 + }, + { + "epoch": 0.4424946729457267, + "grad_norm": 1.867507815361023, + "learning_rate": 0.00012326698906885353, + "loss": 1.2972, + "step": 12356 + }, + { + "epoch": 0.442530485075295, + "grad_norm": 1.5641686916351318, + "learning_rate": 0.00012325570822543103, + "loss": 1.1228, + "step": 12357 + }, + { + "epoch": 0.4425662972048633, + "grad_norm": 1.7332501411437988, + "learning_rate": 0.00012324442706913296, + "loss": 1.1989, + "step": 12358 + }, + { + "epoch": 0.44260210933443156, + "grad_norm": 1.6651654243469238, + "learning_rate": 0.0001232331456001111, + "loss": 1.2133, + "step": 12359 + }, + { + "epoch": 0.4426379214639999, + "grad_norm": 1.8220444917678833, + "learning_rate": 0.00012322186381851725, + "loss": 1.2483, + "step": 12360 + }, + { + "epoch": 0.44267373359356815, + "grad_norm": 1.5816841125488281, + "learning_rate": 0.00012321058172450318, + "loss": 1.2744, + "step": 12361 + }, + { + "epoch": 0.4427095457231364, + "grad_norm": 1.4576380252838135, + "learning_rate": 0.0001231992993182207, + "loss": 1.1824, + "step": 12362 + }, + { + "epoch": 0.4427453578527047, + "grad_norm": 1.4059758186340332, + "learning_rate": 0.00012318801659982152, + "loss": 1.2157, + "step": 12363 + }, + { + "epoch": 0.442781169982273, + "grad_norm": 1.4871861934661865, + "learning_rate": 0.00012317673356945753, + "loss": 0.9123, + "step": 12364 + }, + { + "epoch": 0.4428169821118413, + "grad_norm": 1.6971583366394043, + "learning_rate": 0.00012316545022728043, + "loss": 1.0937, + "step": 12365 + }, + { + "epoch": 0.44285279424140955, + "grad_norm": 3.2435426712036133, + "learning_rate": 0.00012315416657344213, + "loss": 1.3011, + "step": 12366 + }, + { + "epoch": 0.4428886063709779, + "grad_norm": 2.038350820541382, + "learning_rate": 0.00012314288260809435, + "loss": 1.3193, + "step": 12367 + }, + { + "epoch": 0.44292441850054615, + "grad_norm": 1.537414789199829, + "learning_rate": 0.00012313159833138892, + "loss": 1.3118, + "step": 12368 + }, + { + "epoch": 0.4429602306301144, + "grad_norm": 1.4312251806259155, + "learning_rate": 0.00012312031374347773, + "loss": 1.0977, + "step": 12369 + }, + { + "epoch": 0.4429960427596827, + "grad_norm": 1.7003660202026367, + "learning_rate": 0.00012310902884451252, + "loss": 1.2468, + "step": 12370 + }, + { + "epoch": 0.443031854889251, + "grad_norm": 1.6146223545074463, + "learning_rate": 0.00012309774363464514, + "loss": 1.2685, + "step": 12371 + }, + { + "epoch": 0.4430676670188193, + "grad_norm": 1.5490837097167969, + "learning_rate": 0.00012308645811402738, + "loss": 1.1212, + "step": 12372 + }, + { + "epoch": 0.44310347914838755, + "grad_norm": 1.8557878732681274, + "learning_rate": 0.00012307517228281117, + "loss": 1.2936, + "step": 12373 + }, + { + "epoch": 0.44313929127795587, + "grad_norm": 1.4592067003250122, + "learning_rate": 0.00012306388614114822, + "loss": 1.0759, + "step": 12374 + }, + { + "epoch": 0.44317510340752414, + "grad_norm": 1.482511043548584, + "learning_rate": 0.00012305259968919046, + "loss": 1.2058, + "step": 12375 + }, + { + "epoch": 0.4432109155370924, + "grad_norm": 1.4782817363739014, + "learning_rate": 0.00012304131292708968, + "loss": 1.0639, + "step": 12376 + }, + { + "epoch": 0.4432467276666607, + "grad_norm": 1.4971710443496704, + "learning_rate": 0.0001230300258549978, + "loss": 1.1218, + "step": 12377 + }, + { + "epoch": 0.443282539796229, + "grad_norm": 1.6666786670684814, + "learning_rate": 0.00012301873847306657, + "loss": 1.0629, + "step": 12378 + }, + { + "epoch": 0.44331835192579727, + "grad_norm": 1.720961570739746, + "learning_rate": 0.00012300745078144796, + "loss": 1.2414, + "step": 12379 + }, + { + "epoch": 0.44335416405536554, + "grad_norm": 1.6083269119262695, + "learning_rate": 0.00012299616278029375, + "loss": 1.1746, + "step": 12380 + }, + { + "epoch": 0.4433899761849338, + "grad_norm": 1.2974921464920044, + "learning_rate": 0.00012298487446975583, + "loss": 1.1903, + "step": 12381 + }, + { + "epoch": 0.44342578831450213, + "grad_norm": 1.9176331758499146, + "learning_rate": 0.0001229735858499861, + "loss": 1.1142, + "step": 12382 + }, + { + "epoch": 0.4434616004440704, + "grad_norm": 1.7194279432296753, + "learning_rate": 0.0001229622969211364, + "loss": 1.1882, + "step": 12383 + }, + { + "epoch": 0.44349741257363867, + "grad_norm": 1.3748211860656738, + "learning_rate": 0.00012295100768335858, + "loss": 1.2856, + "step": 12384 + }, + { + "epoch": 0.443533224703207, + "grad_norm": 1.768463373184204, + "learning_rate": 0.00012293971813680458, + "loss": 1.1466, + "step": 12385 + }, + { + "epoch": 0.44356903683277527, + "grad_norm": 2.8963534832000732, + "learning_rate": 0.00012292842828162627, + "loss": 1.31, + "step": 12386 + }, + { + "epoch": 0.44360484896234353, + "grad_norm": 1.657067894935608, + "learning_rate": 0.00012291713811797553, + "loss": 1.0902, + "step": 12387 + }, + { + "epoch": 0.4436406610919118, + "grad_norm": 1.3932774066925049, + "learning_rate": 0.00012290584764600425, + "loss": 1.1274, + "step": 12388 + }, + { + "epoch": 0.4436764732214801, + "grad_norm": 1.420223355293274, + "learning_rate": 0.00012289455686586434, + "loss": 1.0704, + "step": 12389 + }, + { + "epoch": 0.4437122853510484, + "grad_norm": 1.4247347116470337, + "learning_rate": 0.0001228832657777077, + "loss": 1.2279, + "step": 12390 + }, + { + "epoch": 0.44374809748061667, + "grad_norm": 1.757333517074585, + "learning_rate": 0.00012287197438168624, + "loss": 1.4082, + "step": 12391 + }, + { + "epoch": 0.443783909610185, + "grad_norm": 1.3725390434265137, + "learning_rate": 0.00012286068267795185, + "loss": 0.9219, + "step": 12392 + }, + { + "epoch": 0.44381972173975326, + "grad_norm": 1.2452943325042725, + "learning_rate": 0.00012284939066665648, + "loss": 1.1034, + "step": 12393 + }, + { + "epoch": 0.44385553386932153, + "grad_norm": 1.518505573272705, + "learning_rate": 0.00012283809834795202, + "loss": 1.2708, + "step": 12394 + }, + { + "epoch": 0.4438913459988898, + "grad_norm": 1.2730510234832764, + "learning_rate": 0.00012282680572199043, + "loss": 1.1488, + "step": 12395 + }, + { + "epoch": 0.4439271581284581, + "grad_norm": 1.2109757661819458, + "learning_rate": 0.00012281551278892357, + "loss": 0.9071, + "step": 12396 + }, + { + "epoch": 0.4439629702580264, + "grad_norm": 1.5206644535064697, + "learning_rate": 0.00012280421954890346, + "loss": 1.1264, + "step": 12397 + }, + { + "epoch": 0.44399878238759466, + "grad_norm": 2.343061685562134, + "learning_rate": 0.000122792926002082, + "loss": 1.0636, + "step": 12398 + }, + { + "epoch": 0.444034594517163, + "grad_norm": 1.421325445175171, + "learning_rate": 0.00012278163214861107, + "loss": 1.0526, + "step": 12399 + }, + { + "epoch": 0.44407040664673125, + "grad_norm": 1.696638822555542, + "learning_rate": 0.00012277033798864268, + "loss": 1.2547, + "step": 12400 + }, + { + "epoch": 0.4441062187762995, + "grad_norm": 2.145021915435791, + "learning_rate": 0.00012275904352232876, + "loss": 1.1681, + "step": 12401 + }, + { + "epoch": 0.4441420309058678, + "grad_norm": 1.6175189018249512, + "learning_rate": 0.00012274774874982132, + "loss": 1.3106, + "step": 12402 + }, + { + "epoch": 0.4441778430354361, + "grad_norm": 1.4442263841629028, + "learning_rate": 0.0001227364536712722, + "loss": 1.1443, + "step": 12403 + }, + { + "epoch": 0.4442136551650044, + "grad_norm": 1.422917366027832, + "learning_rate": 0.00012272515828683344, + "loss": 1.0745, + "step": 12404 + }, + { + "epoch": 0.44424946729457265, + "grad_norm": 1.7158435583114624, + "learning_rate": 0.00012271386259665701, + "loss": 1.0687, + "step": 12405 + }, + { + "epoch": 0.444285279424141, + "grad_norm": 1.9762165546417236, + "learning_rate": 0.00012270256660089484, + "loss": 1.3855, + "step": 12406 + }, + { + "epoch": 0.44432109155370925, + "grad_norm": 1.564367413520813, + "learning_rate": 0.00012269127029969893, + "loss": 1.247, + "step": 12407 + }, + { + "epoch": 0.4443569036832775, + "grad_norm": 1.5321643352508545, + "learning_rate": 0.00012267997369322126, + "loss": 1.18, + "step": 12408 + }, + { + "epoch": 0.4443927158128458, + "grad_norm": 1.7934608459472656, + "learning_rate": 0.00012266867678161375, + "loss": 1.0522, + "step": 12409 + }, + { + "epoch": 0.4444285279424141, + "grad_norm": 1.7875019311904907, + "learning_rate": 0.00012265737956502847, + "loss": 1.0026, + "step": 12410 + }, + { + "epoch": 0.4444643400719824, + "grad_norm": 1.5321128368377686, + "learning_rate": 0.0001226460820436174, + "loss": 1.2526, + "step": 12411 + }, + { + "epoch": 0.44450015220155065, + "grad_norm": 1.8733384609222412, + "learning_rate": 0.00012263478421753243, + "loss": 1.1033, + "step": 12412 + }, + { + "epoch": 0.44453596433111897, + "grad_norm": 1.7352384328842163, + "learning_rate": 0.0001226234860869257, + "loss": 1.0648, + "step": 12413 + }, + { + "epoch": 0.44457177646068724, + "grad_norm": 1.3600826263427734, + "learning_rate": 0.00012261218765194913, + "loss": 1.1583, + "step": 12414 + }, + { + "epoch": 0.4446075885902555, + "grad_norm": 1.380501627922058, + "learning_rate": 0.00012260088891275476, + "loss": 1.1789, + "step": 12415 + }, + { + "epoch": 0.4446434007198238, + "grad_norm": 1.670594573020935, + "learning_rate": 0.00012258958986949455, + "loss": 1.1691, + "step": 12416 + }, + { + "epoch": 0.4446792128493921, + "grad_norm": 1.3334637880325317, + "learning_rate": 0.00012257829052232056, + "loss": 1.3762, + "step": 12417 + }, + { + "epoch": 0.44471502497896037, + "grad_norm": 1.3068169355392456, + "learning_rate": 0.00012256699087138479, + "loss": 1.0847, + "step": 12418 + }, + { + "epoch": 0.44475083710852864, + "grad_norm": 1.7769193649291992, + "learning_rate": 0.0001225556909168393, + "loss": 1.2364, + "step": 12419 + }, + { + "epoch": 0.44478664923809696, + "grad_norm": 1.6938310861587524, + "learning_rate": 0.00012254439065883602, + "loss": 1.0949, + "step": 12420 + }, + { + "epoch": 0.44482246136766523, + "grad_norm": 1.5441943407058716, + "learning_rate": 0.0001225330900975271, + "loss": 1.1472, + "step": 12421 + }, + { + "epoch": 0.4448582734972335, + "grad_norm": 1.538325309753418, + "learning_rate": 0.00012252178923306448, + "loss": 0.9859, + "step": 12422 + }, + { + "epoch": 0.44489408562680177, + "grad_norm": 1.3765649795532227, + "learning_rate": 0.00012251048806560027, + "loss": 0.9701, + "step": 12423 + }, + { + "epoch": 0.4449298977563701, + "grad_norm": 1.6351853609085083, + "learning_rate": 0.00012249918659528648, + "loss": 1.1377, + "step": 12424 + }, + { + "epoch": 0.44496570988593837, + "grad_norm": 2.273077964782715, + "learning_rate": 0.0001224878848222751, + "loss": 1.2673, + "step": 12425 + }, + { + "epoch": 0.44500152201550663, + "grad_norm": 1.3749414682388306, + "learning_rate": 0.0001224765827467183, + "loss": 0.9253, + "step": 12426 + }, + { + "epoch": 0.44503733414507496, + "grad_norm": 1.2590607404708862, + "learning_rate": 0.00012246528036876807, + "loss": 1.1371, + "step": 12427 + }, + { + "epoch": 0.4450731462746432, + "grad_norm": 1.4776325225830078, + "learning_rate": 0.00012245397768857646, + "loss": 1.0846, + "step": 12428 + }, + { + "epoch": 0.4451089584042115, + "grad_norm": 1.4460406303405762, + "learning_rate": 0.0001224426747062955, + "loss": 1.0288, + "step": 12429 + }, + { + "epoch": 0.44514477053377977, + "grad_norm": 1.753621220588684, + "learning_rate": 0.00012243137142207733, + "loss": 1.287, + "step": 12430 + }, + { + "epoch": 0.4451805826633481, + "grad_norm": 1.708176612854004, + "learning_rate": 0.000122420067836074, + "loss": 1.0955, + "step": 12431 + }, + { + "epoch": 0.44521639479291636, + "grad_norm": 1.4334949254989624, + "learning_rate": 0.0001224087639484376, + "loss": 1.1371, + "step": 12432 + }, + { + "epoch": 0.44525220692248463, + "grad_norm": 1.8988333940505981, + "learning_rate": 0.00012239745975932016, + "loss": 0.9708, + "step": 12433 + }, + { + "epoch": 0.44528801905205295, + "grad_norm": 2.3437340259552, + "learning_rate": 0.00012238615526887378, + "loss": 1.5707, + "step": 12434 + }, + { + "epoch": 0.4453238311816212, + "grad_norm": 1.7657283544540405, + "learning_rate": 0.00012237485047725057, + "loss": 1.2045, + "step": 12435 + }, + { + "epoch": 0.4453596433111895, + "grad_norm": 1.4898440837860107, + "learning_rate": 0.00012236354538460259, + "loss": 1.1989, + "step": 12436 + }, + { + "epoch": 0.44539545544075776, + "grad_norm": 2.024409532546997, + "learning_rate": 0.000122352239991082, + "loss": 1.1554, + "step": 12437 + }, + { + "epoch": 0.4454312675703261, + "grad_norm": 1.6294277906417847, + "learning_rate": 0.0001223409342968408, + "loss": 0.9793, + "step": 12438 + }, + { + "epoch": 0.44546707969989435, + "grad_norm": 1.386597752571106, + "learning_rate": 0.00012232962830203116, + "loss": 1.0991, + "step": 12439 + }, + { + "epoch": 0.4455028918294626, + "grad_norm": 1.7375671863555908, + "learning_rate": 0.00012231832200680518, + "loss": 1.096, + "step": 12440 + }, + { + "epoch": 0.44553870395903095, + "grad_norm": 1.6691917181015015, + "learning_rate": 0.00012230701541131499, + "loss": 1.0459, + "step": 12441 + }, + { + "epoch": 0.4455745160885992, + "grad_norm": 1.6814359426498413, + "learning_rate": 0.00012229570851571265, + "loss": 1.2301, + "step": 12442 + }, + { + "epoch": 0.4456103282181675, + "grad_norm": 2.615305185317993, + "learning_rate": 0.00012228440132015033, + "loss": 1.132, + "step": 12443 + }, + { + "epoch": 0.44564614034773575, + "grad_norm": 2.356613874435425, + "learning_rate": 0.0001222730938247801, + "loss": 1.3833, + "step": 12444 + }, + { + "epoch": 0.4456819524773041, + "grad_norm": 1.504423975944519, + "learning_rate": 0.00012226178602975417, + "loss": 1.1534, + "step": 12445 + }, + { + "epoch": 0.44571776460687235, + "grad_norm": 2.1521339416503906, + "learning_rate": 0.00012225047793522462, + "loss": 1.2085, + "step": 12446 + }, + { + "epoch": 0.4457535767364406, + "grad_norm": 1.614080548286438, + "learning_rate": 0.00012223916954134356, + "loss": 1.3138, + "step": 12447 + }, + { + "epoch": 0.44578938886600894, + "grad_norm": 1.3703036308288574, + "learning_rate": 0.00012222786084826318, + "loss": 1.1405, + "step": 12448 + }, + { + "epoch": 0.4458252009955772, + "grad_norm": 1.5243009328842163, + "learning_rate": 0.00012221655185613557, + "loss": 1.0602, + "step": 12449 + }, + { + "epoch": 0.4458610131251455, + "grad_norm": 1.4858012199401855, + "learning_rate": 0.00012220524256511297, + "loss": 1.1002, + "step": 12450 + }, + { + "epoch": 0.44589682525471375, + "grad_norm": 1.5569677352905273, + "learning_rate": 0.00012219393297534744, + "loss": 0.9416, + "step": 12451 + }, + { + "epoch": 0.44593263738428207, + "grad_norm": 2.0757768154144287, + "learning_rate": 0.00012218262308699119, + "loss": 1.0468, + "step": 12452 + }, + { + "epoch": 0.44596844951385034, + "grad_norm": 1.4580016136169434, + "learning_rate": 0.00012217131290019633, + "loss": 1.1733, + "step": 12453 + }, + { + "epoch": 0.4460042616434186, + "grad_norm": 1.7790756225585938, + "learning_rate": 0.00012216000241511507, + "loss": 1.2666, + "step": 12454 + }, + { + "epoch": 0.44604007377298693, + "grad_norm": 1.8171170949935913, + "learning_rate": 0.00012214869163189958, + "loss": 1.2014, + "step": 12455 + }, + { + "epoch": 0.4460758859025552, + "grad_norm": 1.6195200681686401, + "learning_rate": 0.00012213738055070195, + "loss": 1.2713, + "step": 12456 + }, + { + "epoch": 0.44611169803212347, + "grad_norm": 1.3785194158554077, + "learning_rate": 0.0001221260691716745, + "loss": 0.995, + "step": 12457 + }, + { + "epoch": 0.44614751016169174, + "grad_norm": 1.8208757638931274, + "learning_rate": 0.0001221147574949693, + "loss": 1.1104, + "step": 12458 + }, + { + "epoch": 0.44618332229126006, + "grad_norm": 1.5884209871292114, + "learning_rate": 0.00012210344552073855, + "loss": 1.326, + "step": 12459 + }, + { + "epoch": 0.44621913442082833, + "grad_norm": 1.2369102239608765, + "learning_rate": 0.00012209213324913446, + "loss": 1.0626, + "step": 12460 + }, + { + "epoch": 0.4462549465503966, + "grad_norm": 1.9501622915267944, + "learning_rate": 0.00012208082068030924, + "loss": 1.2678, + "step": 12461 + }, + { + "epoch": 0.4462907586799649, + "grad_norm": 1.549985647201538, + "learning_rate": 0.00012206950781441502, + "loss": 0.9737, + "step": 12462 + }, + { + "epoch": 0.4463265708095332, + "grad_norm": 1.7209018468856812, + "learning_rate": 0.00012205819465160407, + "loss": 1.3926, + "step": 12463 + }, + { + "epoch": 0.44636238293910147, + "grad_norm": 1.7333745956420898, + "learning_rate": 0.00012204688119202852, + "loss": 1.2358, + "step": 12464 + }, + { + "epoch": 0.44639819506866973, + "grad_norm": 1.3693087100982666, + "learning_rate": 0.00012203556743584063, + "loss": 1.136, + "step": 12465 + }, + { + "epoch": 0.44643400719823806, + "grad_norm": 2.078516721725464, + "learning_rate": 0.00012202425338319265, + "loss": 1.313, + "step": 12466 + }, + { + "epoch": 0.4464698193278063, + "grad_norm": 1.3904060125350952, + "learning_rate": 0.00012201293903423675, + "loss": 1.0436, + "step": 12467 + }, + { + "epoch": 0.4465056314573746, + "grad_norm": 1.644364833831787, + "learning_rate": 0.00012200162438912512, + "loss": 1.2594, + "step": 12468 + }, + { + "epoch": 0.4465414435869429, + "grad_norm": 1.8082462549209595, + "learning_rate": 0.00012199030944801, + "loss": 1.2005, + "step": 12469 + }, + { + "epoch": 0.4465772557165112, + "grad_norm": 1.8631588220596313, + "learning_rate": 0.00012197899421104367, + "loss": 1.2417, + "step": 12470 + }, + { + "epoch": 0.44661306784607946, + "grad_norm": 1.279868721961975, + "learning_rate": 0.00012196767867837829, + "loss": 1.1325, + "step": 12471 + }, + { + "epoch": 0.44664887997564773, + "grad_norm": 1.4452197551727295, + "learning_rate": 0.00012195636285016614, + "loss": 1.1361, + "step": 12472 + }, + { + "epoch": 0.44668469210521605, + "grad_norm": 1.2737966775894165, + "learning_rate": 0.00012194504672655944, + "loss": 1.0391, + "step": 12473 + }, + { + "epoch": 0.4467205042347843, + "grad_norm": 1.5810718536376953, + "learning_rate": 0.00012193373030771046, + "loss": 1.1742, + "step": 12474 + }, + { + "epoch": 0.4467563163643526, + "grad_norm": 1.5260132551193237, + "learning_rate": 0.00012192241359377143, + "loss": 1.2291, + "step": 12475 + }, + { + "epoch": 0.4467921284939209, + "grad_norm": 1.4997831583023071, + "learning_rate": 0.00012191109658489462, + "loss": 1.0162, + "step": 12476 + }, + { + "epoch": 0.4468279406234892, + "grad_norm": 2.0820484161376953, + "learning_rate": 0.00012189977928123224, + "loss": 1.2665, + "step": 12477 + }, + { + "epoch": 0.44686375275305745, + "grad_norm": 1.7406705617904663, + "learning_rate": 0.0001218884616829366, + "loss": 1.1504, + "step": 12478 + }, + { + "epoch": 0.4468995648826257, + "grad_norm": 2.2255356311798096, + "learning_rate": 0.00012187714379015993, + "loss": 1.3426, + "step": 12479 + }, + { + "epoch": 0.44693537701219405, + "grad_norm": 1.8903247117996216, + "learning_rate": 0.00012186582560305448, + "loss": 1.158, + "step": 12480 + }, + { + "epoch": 0.4469711891417623, + "grad_norm": 1.442824125289917, + "learning_rate": 0.0001218545071217726, + "loss": 1.2001, + "step": 12481 + }, + { + "epoch": 0.4470070012713306, + "grad_norm": 1.7077813148498535, + "learning_rate": 0.00012184318834646648, + "loss": 1.2301, + "step": 12482 + }, + { + "epoch": 0.4470428134008989, + "grad_norm": 1.6543728113174438, + "learning_rate": 0.00012183186927728846, + "loss": 1.1624, + "step": 12483 + }, + { + "epoch": 0.4470786255304672, + "grad_norm": 1.2811168432235718, + "learning_rate": 0.00012182054991439078, + "loss": 0.9748, + "step": 12484 + }, + { + "epoch": 0.44711443766003545, + "grad_norm": 1.4927005767822266, + "learning_rate": 0.00012180923025792579, + "loss": 1.2618, + "step": 12485 + }, + { + "epoch": 0.4471502497896037, + "grad_norm": 1.3238179683685303, + "learning_rate": 0.00012179791030804573, + "loss": 1.1108, + "step": 12486 + }, + { + "epoch": 0.44718606191917204, + "grad_norm": 1.466036319732666, + "learning_rate": 0.00012178659006490285, + "loss": 1.1535, + "step": 12487 + }, + { + "epoch": 0.4472218740487403, + "grad_norm": 1.620058298110962, + "learning_rate": 0.00012177526952864955, + "loss": 1.2901, + "step": 12488 + }, + { + "epoch": 0.4472576861783086, + "grad_norm": 1.5878959894180298, + "learning_rate": 0.00012176394869943805, + "loss": 1.3319, + "step": 12489 + }, + { + "epoch": 0.4472934983078769, + "grad_norm": 1.2906824350357056, + "learning_rate": 0.00012175262757742074, + "loss": 1.0879, + "step": 12490 + }, + { + "epoch": 0.44732931043744517, + "grad_norm": 1.5493402481079102, + "learning_rate": 0.00012174130616274985, + "loss": 1.0911, + "step": 12491 + }, + { + "epoch": 0.44736512256701344, + "grad_norm": 1.6099927425384521, + "learning_rate": 0.00012172998445557775, + "loss": 1.1017, + "step": 12492 + }, + { + "epoch": 0.4474009346965817, + "grad_norm": 1.9119503498077393, + "learning_rate": 0.00012171866245605671, + "loss": 1.2546, + "step": 12493 + }, + { + "epoch": 0.44743674682615003, + "grad_norm": 1.4432005882263184, + "learning_rate": 0.00012170734016433914, + "loss": 1.1076, + "step": 12494 + }, + { + "epoch": 0.4474725589557183, + "grad_norm": 1.369800329208374, + "learning_rate": 0.00012169601758057727, + "loss": 1.2857, + "step": 12495 + }, + { + "epoch": 0.44750837108528657, + "grad_norm": 1.686331033706665, + "learning_rate": 0.00012168469470492345, + "loss": 1.1496, + "step": 12496 + }, + { + "epoch": 0.4475441832148549, + "grad_norm": 1.4869205951690674, + "learning_rate": 0.00012167337153753007, + "loss": 1.4604, + "step": 12497 + }, + { + "epoch": 0.44757999534442316, + "grad_norm": 1.785766363143921, + "learning_rate": 0.00012166204807854942, + "loss": 1.18, + "step": 12498 + }, + { + "epoch": 0.44761580747399143, + "grad_norm": 1.7866418361663818, + "learning_rate": 0.00012165072432813385, + "loss": 1.1859, + "step": 12499 + }, + { + "epoch": 0.4476516196035597, + "grad_norm": 1.5866912603378296, + "learning_rate": 0.0001216394002864357, + "loss": 1.0745, + "step": 12500 + }, + { + "epoch": 0.447687431733128, + "grad_norm": 1.328317642211914, + "learning_rate": 0.00012162807595360737, + "loss": 1.1839, + "step": 12501 + }, + { + "epoch": 0.4477232438626963, + "grad_norm": 1.5510601997375488, + "learning_rate": 0.00012161675132980114, + "loss": 1.1826, + "step": 12502 + }, + { + "epoch": 0.44775905599226457, + "grad_norm": 1.8580487966537476, + "learning_rate": 0.00012160542641516945, + "loss": 1.342, + "step": 12503 + }, + { + "epoch": 0.4477948681218329, + "grad_norm": 1.547385334968567, + "learning_rate": 0.00012159410120986456, + "loss": 1.1946, + "step": 12504 + }, + { + "epoch": 0.44783068025140116, + "grad_norm": 1.5124621391296387, + "learning_rate": 0.00012158277571403893, + "loss": 1.0338, + "step": 12505 + }, + { + "epoch": 0.4478664923809694, + "grad_norm": 1.5745869874954224, + "learning_rate": 0.00012157144992784486, + "loss": 1.1382, + "step": 12506 + }, + { + "epoch": 0.4479023045105377, + "grad_norm": 1.3825337886810303, + "learning_rate": 0.00012156012385143479, + "loss": 1.3445, + "step": 12507 + }, + { + "epoch": 0.447938116640106, + "grad_norm": 1.7341787815093994, + "learning_rate": 0.00012154879748496104, + "loss": 1.1834, + "step": 12508 + }, + { + "epoch": 0.4479739287696743, + "grad_norm": 1.734142541885376, + "learning_rate": 0.00012153747082857601, + "loss": 1.2716, + "step": 12509 + }, + { + "epoch": 0.44800974089924256, + "grad_norm": 2.097318172454834, + "learning_rate": 0.00012152614388243213, + "loss": 1.263, + "step": 12510 + }, + { + "epoch": 0.4480455530288109, + "grad_norm": 1.8025460243225098, + "learning_rate": 0.00012151481664668175, + "loss": 1.185, + "step": 12511 + }, + { + "epoch": 0.44808136515837915, + "grad_norm": 1.6939913034439087, + "learning_rate": 0.00012150348912147723, + "loss": 1.1251, + "step": 12512 + }, + { + "epoch": 0.4481171772879474, + "grad_norm": 1.2808101177215576, + "learning_rate": 0.000121492161306971, + "loss": 1.1623, + "step": 12513 + }, + { + "epoch": 0.4481529894175157, + "grad_norm": 1.6158297061920166, + "learning_rate": 0.00012148083320331549, + "loss": 1.2583, + "step": 12514 + }, + { + "epoch": 0.448188801547084, + "grad_norm": 2.2238380908966064, + "learning_rate": 0.00012146950481066304, + "loss": 1.1863, + "step": 12515 + }, + { + "epoch": 0.4482246136766523, + "grad_norm": 1.6233168840408325, + "learning_rate": 0.00012145817612916612, + "loss": 1.1634, + "step": 12516 + }, + { + "epoch": 0.44826042580622055, + "grad_norm": 1.641796588897705, + "learning_rate": 0.00012144684715897711, + "loss": 0.9306, + "step": 12517 + }, + { + "epoch": 0.4482962379357889, + "grad_norm": 1.2944047451019287, + "learning_rate": 0.00012143551790024848, + "loss": 1.1671, + "step": 12518 + }, + { + "epoch": 0.44833205006535715, + "grad_norm": 1.6376656293869019, + "learning_rate": 0.00012142418835313254, + "loss": 1.1649, + "step": 12519 + }, + { + "epoch": 0.4483678621949254, + "grad_norm": 1.7315990924835205, + "learning_rate": 0.00012141285851778183, + "loss": 1.1751, + "step": 12520 + }, + { + "epoch": 0.4484036743244937, + "grad_norm": 1.9899976253509521, + "learning_rate": 0.00012140152839434873, + "loss": 1.282, + "step": 12521 + }, + { + "epoch": 0.448439486454062, + "grad_norm": 2.134491205215454, + "learning_rate": 0.00012139019798298563, + "loss": 1.3084, + "step": 12522 + }, + { + "epoch": 0.4484752985836303, + "grad_norm": 1.722747564315796, + "learning_rate": 0.00012137886728384504, + "loss": 1.1944, + "step": 12523 + }, + { + "epoch": 0.44851111071319855, + "grad_norm": 1.3310046195983887, + "learning_rate": 0.00012136753629707936, + "loss": 1.2131, + "step": 12524 + }, + { + "epoch": 0.44854692284276687, + "grad_norm": 1.8476234674453735, + "learning_rate": 0.00012135620502284104, + "loss": 1.1749, + "step": 12525 + }, + { + "epoch": 0.44858273497233514, + "grad_norm": 1.601874828338623, + "learning_rate": 0.00012134487346128252, + "loss": 1.1566, + "step": 12526 + }, + { + "epoch": 0.4486185471019034, + "grad_norm": 1.6920515298843384, + "learning_rate": 0.00012133354161255628, + "loss": 1.0676, + "step": 12527 + }, + { + "epoch": 0.4486543592314717, + "grad_norm": 1.4055428504943848, + "learning_rate": 0.00012132220947681472, + "loss": 0.9638, + "step": 12528 + }, + { + "epoch": 0.44869017136104, + "grad_norm": 1.644868016242981, + "learning_rate": 0.00012131087705421036, + "loss": 1.2258, + "step": 12529 + }, + { + "epoch": 0.44872598349060827, + "grad_norm": 1.5205330848693848, + "learning_rate": 0.00012129954434489566, + "loss": 1.1461, + "step": 12530 + }, + { + "epoch": 0.44876179562017654, + "grad_norm": 1.379572868347168, + "learning_rate": 0.00012128821134902302, + "loss": 1.0694, + "step": 12531 + }, + { + "epoch": 0.44879760774974486, + "grad_norm": 2.0367019176483154, + "learning_rate": 0.00012127687806674499, + "loss": 1.235, + "step": 12532 + }, + { + "epoch": 0.44883341987931313, + "grad_norm": 1.559360384941101, + "learning_rate": 0.00012126554449821399, + "loss": 1.2456, + "step": 12533 + }, + { + "epoch": 0.4488692320088814, + "grad_norm": 1.4897093772888184, + "learning_rate": 0.00012125421064358253, + "loss": 1.0555, + "step": 12534 + }, + { + "epoch": 0.44890504413844967, + "grad_norm": 1.3973147869110107, + "learning_rate": 0.00012124287650300307, + "loss": 1.2146, + "step": 12535 + }, + { + "epoch": 0.448940856268018, + "grad_norm": 1.781774640083313, + "learning_rate": 0.00012123154207662815, + "loss": 1.0499, + "step": 12536 + }, + { + "epoch": 0.44897666839758626, + "grad_norm": 1.4050662517547607, + "learning_rate": 0.00012122020736461018, + "loss": 1.1091, + "step": 12537 + }, + { + "epoch": 0.44901248052715453, + "grad_norm": 1.4097473621368408, + "learning_rate": 0.0001212088723671017, + "loss": 1.1912, + "step": 12538 + }, + { + "epoch": 0.44904829265672286, + "grad_norm": 1.574893832206726, + "learning_rate": 0.0001211975370842552, + "loss": 1.1998, + "step": 12539 + }, + { + "epoch": 0.4490841047862911, + "grad_norm": 1.9536290168762207, + "learning_rate": 0.00012118620151622317, + "loss": 1.0657, + "step": 12540 + }, + { + "epoch": 0.4491199169158594, + "grad_norm": 1.4366233348846436, + "learning_rate": 0.00012117486566315814, + "loss": 1.0335, + "step": 12541 + }, + { + "epoch": 0.44915572904542767, + "grad_norm": 1.4448251724243164, + "learning_rate": 0.0001211635295252126, + "loss": 1.0433, + "step": 12542 + }, + { + "epoch": 0.449191541174996, + "grad_norm": 1.4079573154449463, + "learning_rate": 0.0001211521931025391, + "loss": 1.1441, + "step": 12543 + }, + { + "epoch": 0.44922735330456426, + "grad_norm": 1.4167492389678955, + "learning_rate": 0.00012114085639529007, + "loss": 1.0848, + "step": 12544 + }, + { + "epoch": 0.4492631654341325, + "grad_norm": 1.396400809288025, + "learning_rate": 0.00012112951940361812, + "loss": 1.1597, + "step": 12545 + }, + { + "epoch": 0.44929897756370085, + "grad_norm": 2.625183582305908, + "learning_rate": 0.00012111818212767572, + "loss": 1.3118, + "step": 12546 + }, + { + "epoch": 0.4493347896932691, + "grad_norm": 1.5239216089248657, + "learning_rate": 0.00012110684456761547, + "loss": 0.9323, + "step": 12547 + }, + { + "epoch": 0.4493706018228374, + "grad_norm": 1.4539698362350464, + "learning_rate": 0.0001210955067235898, + "loss": 1.1087, + "step": 12548 + }, + { + "epoch": 0.44940641395240566, + "grad_norm": 1.5330661535263062, + "learning_rate": 0.00012108416859575131, + "loss": 1.2029, + "step": 12549 + }, + { + "epoch": 0.449442226081974, + "grad_norm": 1.9225578308105469, + "learning_rate": 0.00012107283018425256, + "loss": 1.4235, + "step": 12550 + }, + { + "epoch": 0.44947803821154225, + "grad_norm": 1.3893022537231445, + "learning_rate": 0.00012106149148924602, + "loss": 1.2391, + "step": 12551 + }, + { + "epoch": 0.4495138503411105, + "grad_norm": 1.6601532697677612, + "learning_rate": 0.0001210501525108843, + "loss": 1.3626, + "step": 12552 + }, + { + "epoch": 0.44954966247067885, + "grad_norm": 1.638514757156372, + "learning_rate": 0.00012103881324931991, + "loss": 1.0319, + "step": 12553 + }, + { + "epoch": 0.4495854746002471, + "grad_norm": 1.3087632656097412, + "learning_rate": 0.00012102747370470546, + "loss": 0.9624, + "step": 12554 + }, + { + "epoch": 0.4496212867298154, + "grad_norm": 1.4438117742538452, + "learning_rate": 0.00012101613387719348, + "loss": 1.2921, + "step": 12555 + }, + { + "epoch": 0.44965709885938365, + "grad_norm": 2.1373212337493896, + "learning_rate": 0.00012100479376693652, + "loss": 1.2227, + "step": 12556 + }, + { + "epoch": 0.449692910988952, + "grad_norm": 1.5930893421173096, + "learning_rate": 0.00012099345337408712, + "loss": 1.0428, + "step": 12557 + }, + { + "epoch": 0.44972872311852025, + "grad_norm": 1.4732933044433594, + "learning_rate": 0.00012098211269879791, + "loss": 0.9676, + "step": 12558 + }, + { + "epoch": 0.4497645352480885, + "grad_norm": 1.4296464920043945, + "learning_rate": 0.00012097077174122143, + "loss": 1.2487, + "step": 12559 + }, + { + "epoch": 0.44980034737765684, + "grad_norm": 1.430499792098999, + "learning_rate": 0.00012095943050151026, + "loss": 1.2371, + "step": 12560 + }, + { + "epoch": 0.4498361595072251, + "grad_norm": 1.656430959701538, + "learning_rate": 0.000120948088979817, + "loss": 1.0067, + "step": 12561 + }, + { + "epoch": 0.4498719716367934, + "grad_norm": 1.2820757627487183, + "learning_rate": 0.00012093674717629419, + "loss": 1.1412, + "step": 12562 + }, + { + "epoch": 0.44990778376636165, + "grad_norm": 1.5507267713546753, + "learning_rate": 0.00012092540509109451, + "loss": 1.1879, + "step": 12563 + }, + { + "epoch": 0.44994359589592997, + "grad_norm": 1.7770248651504517, + "learning_rate": 0.00012091406272437049, + "loss": 1.1494, + "step": 12564 + }, + { + "epoch": 0.44997940802549824, + "grad_norm": 1.5923206806182861, + "learning_rate": 0.00012090272007627472, + "loss": 1.2441, + "step": 12565 + }, + { + "epoch": 0.4500152201550665, + "grad_norm": 1.392187237739563, + "learning_rate": 0.0001208913771469598, + "loss": 1.0206, + "step": 12566 + }, + { + "epoch": 0.45005103228463483, + "grad_norm": 1.5764999389648438, + "learning_rate": 0.00012088003393657837, + "loss": 1.0857, + "step": 12567 + }, + { + "epoch": 0.4500868444142031, + "grad_norm": 1.289261817932129, + "learning_rate": 0.00012086869044528297, + "loss": 1.1729, + "step": 12568 + }, + { + "epoch": 0.45012265654377137, + "grad_norm": 1.85535728931427, + "learning_rate": 0.0001208573466732263, + "loss": 1.2009, + "step": 12569 + }, + { + "epoch": 0.45015846867333964, + "grad_norm": 1.353985071182251, + "learning_rate": 0.00012084600262056094, + "loss": 1.2246, + "step": 12570 + }, + { + "epoch": 0.45019428080290796, + "grad_norm": 1.5238908529281616, + "learning_rate": 0.0001208346582874395, + "loss": 1.1092, + "step": 12571 + }, + { + "epoch": 0.45023009293247623, + "grad_norm": 1.6882758140563965, + "learning_rate": 0.00012082331367401458, + "loss": 1.1509, + "step": 12572 + }, + { + "epoch": 0.4502659050620445, + "grad_norm": 1.625630497932434, + "learning_rate": 0.00012081196878043885, + "loss": 1.013, + "step": 12573 + }, + { + "epoch": 0.4503017171916128, + "grad_norm": 1.979264259338379, + "learning_rate": 0.00012080062360686495, + "loss": 1.2867, + "step": 12574 + }, + { + "epoch": 0.4503375293211811, + "grad_norm": 1.2346901893615723, + "learning_rate": 0.00012078927815344545, + "loss": 0.9827, + "step": 12575 + }, + { + "epoch": 0.45037334145074936, + "grad_norm": 1.7264797687530518, + "learning_rate": 0.00012077793242033307, + "loss": 1.3226, + "step": 12576 + }, + { + "epoch": 0.45040915358031763, + "grad_norm": 1.8244925737380981, + "learning_rate": 0.00012076658640768036, + "loss": 1.1525, + "step": 12577 + }, + { + "epoch": 0.45044496570988596, + "grad_norm": 1.3330841064453125, + "learning_rate": 0.00012075524011564005, + "loss": 1.0101, + "step": 12578 + }, + { + "epoch": 0.4504807778394542, + "grad_norm": 1.4295392036437988, + "learning_rate": 0.00012074389354436475, + "loss": 1.1828, + "step": 12579 + }, + { + "epoch": 0.4505165899690225, + "grad_norm": 1.509832501411438, + "learning_rate": 0.00012073254669400713, + "loss": 1.1911, + "step": 12580 + }, + { + "epoch": 0.45055240209859077, + "grad_norm": 1.5433253049850464, + "learning_rate": 0.00012072119956471981, + "loss": 1.1939, + "step": 12581 + }, + { + "epoch": 0.4505882142281591, + "grad_norm": 1.3402963876724243, + "learning_rate": 0.00012070985215665551, + "loss": 1.248, + "step": 12582 + }, + { + "epoch": 0.45062402635772736, + "grad_norm": 1.6793440580368042, + "learning_rate": 0.00012069850446996686, + "loss": 1.2587, + "step": 12583 + }, + { + "epoch": 0.4506598384872956, + "grad_norm": 1.4481250047683716, + "learning_rate": 0.00012068715650480653, + "loss": 1.0959, + "step": 12584 + }, + { + "epoch": 0.45069565061686395, + "grad_norm": 2.025334119796753, + "learning_rate": 0.00012067580826132718, + "loss": 1.5275, + "step": 12585 + }, + { + "epoch": 0.4507314627464322, + "grad_norm": 1.5476757287979126, + "learning_rate": 0.0001206644597396815, + "loss": 1.175, + "step": 12586 + }, + { + "epoch": 0.4507672748760005, + "grad_norm": 1.2327238321304321, + "learning_rate": 0.00012065311094002218, + "loss": 0.8824, + "step": 12587 + }, + { + "epoch": 0.45080308700556876, + "grad_norm": 1.5207105875015259, + "learning_rate": 0.00012064176186250189, + "loss": 1.2558, + "step": 12588 + }, + { + "epoch": 0.4508388991351371, + "grad_norm": 1.4308329820632935, + "learning_rate": 0.00012063041250727331, + "loss": 1.0276, + "step": 12589 + }, + { + "epoch": 0.45087471126470535, + "grad_norm": 1.476004958152771, + "learning_rate": 0.00012061906287448914, + "loss": 1.2142, + "step": 12590 + }, + { + "epoch": 0.4509105233942736, + "grad_norm": 1.9685579538345337, + "learning_rate": 0.00012060771296430209, + "loss": 0.9884, + "step": 12591 + }, + { + "epoch": 0.45094633552384195, + "grad_norm": 1.4679992198944092, + "learning_rate": 0.00012059636277686486, + "loss": 0.826, + "step": 12592 + }, + { + "epoch": 0.4509821476534102, + "grad_norm": 1.4565513134002686, + "learning_rate": 0.00012058501231233011, + "loss": 1.0827, + "step": 12593 + }, + { + "epoch": 0.4510179597829785, + "grad_norm": 1.6132372617721558, + "learning_rate": 0.00012057366157085058, + "loss": 1.0577, + "step": 12594 + }, + { + "epoch": 0.45105377191254675, + "grad_norm": 1.3969491720199585, + "learning_rate": 0.00012056231055257896, + "loss": 1.0457, + "step": 12595 + }, + { + "epoch": 0.4510895840421151, + "grad_norm": 1.3557456731796265, + "learning_rate": 0.000120550959257668, + "loss": 1.0784, + "step": 12596 + }, + { + "epoch": 0.45112539617168335, + "grad_norm": 1.346323847770691, + "learning_rate": 0.00012053960768627036, + "loss": 1.1238, + "step": 12597 + }, + { + "epoch": 0.4511612083012516, + "grad_norm": 1.9579453468322754, + "learning_rate": 0.00012052825583853881, + "loss": 1.0547, + "step": 12598 + }, + { + "epoch": 0.45119702043081994, + "grad_norm": 1.5690759420394897, + "learning_rate": 0.00012051690371462608, + "loss": 0.9381, + "step": 12599 + }, + { + "epoch": 0.4512328325603882, + "grad_norm": 1.413905382156372, + "learning_rate": 0.00012050555131468484, + "loss": 1.0858, + "step": 12600 + }, + { + "epoch": 0.4512686446899565, + "grad_norm": 1.5249675512313843, + "learning_rate": 0.00012049419863886786, + "loss": 1.2339, + "step": 12601 + }, + { + "epoch": 0.45130445681952475, + "grad_norm": 1.4267899990081787, + "learning_rate": 0.00012048284568732791, + "loss": 1.4671, + "step": 12602 + }, + { + "epoch": 0.45134026894909307, + "grad_norm": 1.9088222980499268, + "learning_rate": 0.00012047149246021763, + "loss": 1.1657, + "step": 12603 + }, + { + "epoch": 0.45137608107866134, + "grad_norm": 1.7153000831604004, + "learning_rate": 0.00012046013895768986, + "loss": 1.1476, + "step": 12604 + }, + { + "epoch": 0.4514118932082296, + "grad_norm": 1.432915210723877, + "learning_rate": 0.0001204487851798973, + "loss": 1.2379, + "step": 12605 + }, + { + "epoch": 0.45144770533779793, + "grad_norm": 2.000365734100342, + "learning_rate": 0.00012043743112699273, + "loss": 1.0212, + "step": 12606 + }, + { + "epoch": 0.4514835174673662, + "grad_norm": 1.5059118270874023, + "learning_rate": 0.0001204260767991289, + "loss": 1.0567, + "step": 12607 + }, + { + "epoch": 0.45151932959693447, + "grad_norm": 1.5345405340194702, + "learning_rate": 0.00012041472219645854, + "loss": 1.0711, + "step": 12608 + }, + { + "epoch": 0.45155514172650274, + "grad_norm": 2.1510567665100098, + "learning_rate": 0.00012040336731913442, + "loss": 1.3559, + "step": 12609 + }, + { + "epoch": 0.45159095385607106, + "grad_norm": 1.6141160726547241, + "learning_rate": 0.00012039201216730931, + "loss": 1.2082, + "step": 12610 + }, + { + "epoch": 0.45162676598563933, + "grad_norm": 1.8446906805038452, + "learning_rate": 0.00012038065674113598, + "loss": 1.3998, + "step": 12611 + }, + { + "epoch": 0.4516625781152076, + "grad_norm": 1.9221606254577637, + "learning_rate": 0.0001203693010407672, + "loss": 1.3212, + "step": 12612 + }, + { + "epoch": 0.4516983902447759, + "grad_norm": 1.5370228290557861, + "learning_rate": 0.00012035794506635575, + "loss": 1.2083, + "step": 12613 + }, + { + "epoch": 0.4517342023743442, + "grad_norm": 1.4541479349136353, + "learning_rate": 0.0001203465888180544, + "loss": 1.3762, + "step": 12614 + }, + { + "epoch": 0.45177001450391246, + "grad_norm": 2.1586248874664307, + "learning_rate": 0.00012033523229601598, + "loss": 1.0423, + "step": 12615 + }, + { + "epoch": 0.45180582663348073, + "grad_norm": 1.7716866731643677, + "learning_rate": 0.00012032387550039319, + "loss": 1.3044, + "step": 12616 + }, + { + "epoch": 0.45184163876304906, + "grad_norm": 1.4703259468078613, + "learning_rate": 0.00012031251843133891, + "loss": 1.1959, + "step": 12617 + }, + { + "epoch": 0.4518774508926173, + "grad_norm": 2.0271787643432617, + "learning_rate": 0.00012030116108900589, + "loss": 1.258, + "step": 12618 + }, + { + "epoch": 0.4519132630221856, + "grad_norm": 1.5699498653411865, + "learning_rate": 0.00012028980347354692, + "loss": 1.0554, + "step": 12619 + }, + { + "epoch": 0.4519490751517539, + "grad_norm": 1.2851762771606445, + "learning_rate": 0.00012027844558511483, + "loss": 1.2182, + "step": 12620 + }, + { + "epoch": 0.4519848872813222, + "grad_norm": 1.5179264545440674, + "learning_rate": 0.00012026708742386239, + "loss": 0.9942, + "step": 12621 + }, + { + "epoch": 0.45202069941089046, + "grad_norm": 1.7149531841278076, + "learning_rate": 0.00012025572898994246, + "loss": 1.1203, + "step": 12622 + }, + { + "epoch": 0.4520565115404587, + "grad_norm": 1.465275526046753, + "learning_rate": 0.00012024437028350779, + "loss": 1.1239, + "step": 12623 + }, + { + "epoch": 0.45209232367002705, + "grad_norm": 1.74788236618042, + "learning_rate": 0.00012023301130471128, + "loss": 1.1442, + "step": 12624 + }, + { + "epoch": 0.4521281357995953, + "grad_norm": 1.5974164009094238, + "learning_rate": 0.00012022165205370565, + "loss": 1.2863, + "step": 12625 + }, + { + "epoch": 0.4521639479291636, + "grad_norm": 1.5057660341262817, + "learning_rate": 0.00012021029253064382, + "loss": 1.2951, + "step": 12626 + }, + { + "epoch": 0.4521997600587319, + "grad_norm": 1.810327172279358, + "learning_rate": 0.00012019893273567855, + "loss": 1.0309, + "step": 12627 + }, + { + "epoch": 0.4522355721883002, + "grad_norm": 1.7372148036956787, + "learning_rate": 0.00012018757266896267, + "loss": 1.229, + "step": 12628 + }, + { + "epoch": 0.45227138431786845, + "grad_norm": 2.261345148086548, + "learning_rate": 0.00012017621233064908, + "loss": 1.5333, + "step": 12629 + }, + { + "epoch": 0.4523071964474367, + "grad_norm": 1.7907792329788208, + "learning_rate": 0.00012016485172089056, + "loss": 1.1257, + "step": 12630 + }, + { + "epoch": 0.45234300857700505, + "grad_norm": 1.8179367780685425, + "learning_rate": 0.00012015349083983998, + "loss": 1.0164, + "step": 12631 + }, + { + "epoch": 0.4523788207065733, + "grad_norm": 1.3435375690460205, + "learning_rate": 0.00012014212968765018, + "loss": 1.1227, + "step": 12632 + }, + { + "epoch": 0.4524146328361416, + "grad_norm": 2.0875468254089355, + "learning_rate": 0.000120130768264474, + "loss": 1.0631, + "step": 12633 + }, + { + "epoch": 0.4524504449657099, + "grad_norm": 1.3956944942474365, + "learning_rate": 0.00012011940657046427, + "loss": 1.2142, + "step": 12634 + }, + { + "epoch": 0.4524862570952782, + "grad_norm": 1.455458641052246, + "learning_rate": 0.00012010804460577395, + "loss": 1.2433, + "step": 12635 + }, + { + "epoch": 0.45252206922484645, + "grad_norm": 1.586660623550415, + "learning_rate": 0.00012009668237055578, + "loss": 1.3595, + "step": 12636 + }, + { + "epoch": 0.4525578813544147, + "grad_norm": 1.557891607284546, + "learning_rate": 0.00012008531986496266, + "loss": 1.1057, + "step": 12637 + }, + { + "epoch": 0.45259369348398304, + "grad_norm": 1.4575579166412354, + "learning_rate": 0.0001200739570891475, + "loss": 1.1226, + "step": 12638 + }, + { + "epoch": 0.4526295056135513, + "grad_norm": 1.1997109651565552, + "learning_rate": 0.0001200625940432631, + "loss": 0.9224, + "step": 12639 + }, + { + "epoch": 0.4526653177431196, + "grad_norm": 1.1531322002410889, + "learning_rate": 0.00012005123072746242, + "loss": 1.1603, + "step": 12640 + }, + { + "epoch": 0.4527011298726879, + "grad_norm": 1.6054075956344604, + "learning_rate": 0.00012003986714189825, + "loss": 0.9623, + "step": 12641 + }, + { + "epoch": 0.45273694200225617, + "grad_norm": 1.5790528059005737, + "learning_rate": 0.00012002850328672357, + "loss": 1.2927, + "step": 12642 + }, + { + "epoch": 0.45277275413182444, + "grad_norm": 1.3929443359375, + "learning_rate": 0.00012001713916209117, + "loss": 1.02, + "step": 12643 + }, + { + "epoch": 0.4528085662613927, + "grad_norm": 1.5798028707504272, + "learning_rate": 0.00012000577476815402, + "loss": 1.0643, + "step": 12644 + }, + { + "epoch": 0.45284437839096103, + "grad_norm": 1.7282578945159912, + "learning_rate": 0.00011999441010506496, + "loss": 1.2046, + "step": 12645 + }, + { + "epoch": 0.4528801905205293, + "grad_norm": 1.3155224323272705, + "learning_rate": 0.00011998304517297687, + "loss": 1.0401, + "step": 12646 + }, + { + "epoch": 0.45291600265009757, + "grad_norm": 1.6361740827560425, + "learning_rate": 0.00011997167997204272, + "loss": 1.1229, + "step": 12647 + }, + { + "epoch": 0.4529518147796659, + "grad_norm": 1.8119382858276367, + "learning_rate": 0.00011996031450241536, + "loss": 1.0957, + "step": 12648 + }, + { + "epoch": 0.45298762690923416, + "grad_norm": 1.9988112449645996, + "learning_rate": 0.00011994894876424773, + "loss": 1.2059, + "step": 12649 + }, + { + "epoch": 0.45302343903880243, + "grad_norm": 1.481033205986023, + "learning_rate": 0.00011993758275769273, + "loss": 1.1238, + "step": 12650 + }, + { + "epoch": 0.4530592511683707, + "grad_norm": 1.7344914674758911, + "learning_rate": 0.00011992621648290328, + "loss": 1.1162, + "step": 12651 + }, + { + "epoch": 0.453095063297939, + "grad_norm": 1.7236862182617188, + "learning_rate": 0.00011991484994003226, + "loss": 1.2063, + "step": 12652 + }, + { + "epoch": 0.4531308754275073, + "grad_norm": 1.7620609998703003, + "learning_rate": 0.00011990348312923266, + "loss": 1.2431, + "step": 12653 + }, + { + "epoch": 0.45316668755707556, + "grad_norm": 2.21829891204834, + "learning_rate": 0.00011989211605065733, + "loss": 1.2121, + "step": 12654 + }, + { + "epoch": 0.4532024996866439, + "grad_norm": 1.8722586631774902, + "learning_rate": 0.00011988074870445927, + "loss": 1.2859, + "step": 12655 + }, + { + "epoch": 0.45323831181621216, + "grad_norm": 1.598936676979065, + "learning_rate": 0.00011986938109079133, + "loss": 1.1973, + "step": 12656 + }, + { + "epoch": 0.4532741239457804, + "grad_norm": 1.4841498136520386, + "learning_rate": 0.00011985801320980654, + "loss": 1.2826, + "step": 12657 + }, + { + "epoch": 0.4533099360753487, + "grad_norm": 1.2111271619796753, + "learning_rate": 0.00011984664506165777, + "loss": 1.0795, + "step": 12658 + }, + { + "epoch": 0.453345748204917, + "grad_norm": 1.4879316091537476, + "learning_rate": 0.00011983527664649801, + "loss": 1.2785, + "step": 12659 + }, + { + "epoch": 0.4533815603344853, + "grad_norm": 1.481628656387329, + "learning_rate": 0.0001198239079644802, + "loss": 1.0596, + "step": 12660 + }, + { + "epoch": 0.45341737246405356, + "grad_norm": 1.732614278793335, + "learning_rate": 0.00011981253901575726, + "loss": 1.2392, + "step": 12661 + }, + { + "epoch": 0.4534531845936219, + "grad_norm": 1.5143285989761353, + "learning_rate": 0.00011980116980048217, + "loss": 1.2385, + "step": 12662 + }, + { + "epoch": 0.45348899672319015, + "grad_norm": 1.8499023914337158, + "learning_rate": 0.00011978980031880789, + "loss": 1.3439, + "step": 12663 + }, + { + "epoch": 0.4535248088527584, + "grad_norm": 1.3746498823165894, + "learning_rate": 0.00011977843057088735, + "loss": 1.2774, + "step": 12664 + }, + { + "epoch": 0.4535606209823267, + "grad_norm": 1.6918343305587769, + "learning_rate": 0.00011976706055687357, + "loss": 1.2015, + "step": 12665 + }, + { + "epoch": 0.453596433111895, + "grad_norm": 1.3563802242279053, + "learning_rate": 0.00011975569027691947, + "loss": 1.1013, + "step": 12666 + }, + { + "epoch": 0.4536322452414633, + "grad_norm": 1.9149479866027832, + "learning_rate": 0.00011974431973117804, + "loss": 1.3665, + "step": 12667 + }, + { + "epoch": 0.45366805737103155, + "grad_norm": 1.4888674020767212, + "learning_rate": 0.00011973294891980224, + "loss": 1.1594, + "step": 12668 + }, + { + "epoch": 0.4537038695005999, + "grad_norm": 1.5367547273635864, + "learning_rate": 0.00011972157784294508, + "loss": 1.1542, + "step": 12669 + }, + { + "epoch": 0.45373968163016815, + "grad_norm": 1.4564636945724487, + "learning_rate": 0.00011971020650075954, + "loss": 0.9664, + "step": 12670 + }, + { + "epoch": 0.4537754937597364, + "grad_norm": 1.601654291152954, + "learning_rate": 0.00011969883489339862, + "loss": 1.0473, + "step": 12671 + }, + { + "epoch": 0.4538113058893047, + "grad_norm": 1.651910662651062, + "learning_rate": 0.00011968746302101523, + "loss": 1.0443, + "step": 12672 + }, + { + "epoch": 0.453847118018873, + "grad_norm": 1.5802216529846191, + "learning_rate": 0.00011967609088376245, + "loss": 1.1459, + "step": 12673 + }, + { + "epoch": 0.4538829301484413, + "grad_norm": 1.3020273447036743, + "learning_rate": 0.00011966471848179324, + "loss": 1.0977, + "step": 12674 + }, + { + "epoch": 0.45391874227800955, + "grad_norm": 1.6387485265731812, + "learning_rate": 0.00011965334581526062, + "loss": 0.9971, + "step": 12675 + }, + { + "epoch": 0.45395455440757787, + "grad_norm": 1.4913272857666016, + "learning_rate": 0.00011964197288431756, + "loss": 1.088, + "step": 12676 + }, + { + "epoch": 0.45399036653714614, + "grad_norm": 1.5308340787887573, + "learning_rate": 0.00011963059968911712, + "loss": 1.3311, + "step": 12677 + }, + { + "epoch": 0.4540261786667144, + "grad_norm": 1.7332077026367188, + "learning_rate": 0.00011961922622981225, + "loss": 1.1259, + "step": 12678 + }, + { + "epoch": 0.4540619907962827, + "grad_norm": 1.7556313276290894, + "learning_rate": 0.00011960785250655604, + "loss": 1.1908, + "step": 12679 + }, + { + "epoch": 0.454097802925851, + "grad_norm": 1.4490129947662354, + "learning_rate": 0.00011959647851950145, + "loss": 1.093, + "step": 12680 + }, + { + "epoch": 0.45413361505541927, + "grad_norm": 1.5579735040664673, + "learning_rate": 0.0001195851042688015, + "loss": 1.3733, + "step": 12681 + }, + { + "epoch": 0.45416942718498754, + "grad_norm": 1.360239863395691, + "learning_rate": 0.00011957372975460925, + "loss": 1.0437, + "step": 12682 + }, + { + "epoch": 0.45420523931455586, + "grad_norm": 1.1063517332077026, + "learning_rate": 0.00011956235497707771, + "loss": 1.0031, + "step": 12683 + }, + { + "epoch": 0.45424105144412413, + "grad_norm": 1.1417086124420166, + "learning_rate": 0.00011955097993635991, + "loss": 0.831, + "step": 12684 + }, + { + "epoch": 0.4542768635736924, + "grad_norm": 1.401847004890442, + "learning_rate": 0.0001195396046326089, + "loss": 1.2285, + "step": 12685 + }, + { + "epoch": 0.45431267570326067, + "grad_norm": 1.219774842262268, + "learning_rate": 0.00011952822906597773, + "loss": 1.0319, + "step": 12686 + }, + { + "epoch": 0.454348487832829, + "grad_norm": 1.4537009000778198, + "learning_rate": 0.0001195168532366194, + "loss": 1.0048, + "step": 12687 + }, + { + "epoch": 0.45438429996239726, + "grad_norm": 1.6552225351333618, + "learning_rate": 0.000119505477144687, + "loss": 1.1109, + "step": 12688 + }, + { + "epoch": 0.45442011209196553, + "grad_norm": 1.4951599836349487, + "learning_rate": 0.00011949410079033359, + "loss": 1.0945, + "step": 12689 + }, + { + "epoch": 0.45445592422153386, + "grad_norm": 1.3992465734481812, + "learning_rate": 0.00011948272417371216, + "loss": 1.1797, + "step": 12690 + }, + { + "epoch": 0.4544917363511021, + "grad_norm": 1.3094408512115479, + "learning_rate": 0.00011947134729497583, + "loss": 1.2604, + "step": 12691 + }, + { + "epoch": 0.4545275484806704, + "grad_norm": 1.5370755195617676, + "learning_rate": 0.00011945997015427761, + "loss": 1.2256, + "step": 12692 + }, + { + "epoch": 0.45456336061023866, + "grad_norm": 1.866200566291809, + "learning_rate": 0.00011944859275177063, + "loss": 1.2959, + "step": 12693 + }, + { + "epoch": 0.454599172739807, + "grad_norm": 1.5158616304397583, + "learning_rate": 0.00011943721508760788, + "loss": 1.1563, + "step": 12694 + }, + { + "epoch": 0.45463498486937526, + "grad_norm": 1.3448257446289062, + "learning_rate": 0.00011942583716194251, + "loss": 1.0793, + "step": 12695 + }, + { + "epoch": 0.4546707969989435, + "grad_norm": 1.4682682752609253, + "learning_rate": 0.00011941445897492755, + "loss": 1.2102, + "step": 12696 + }, + { + "epoch": 0.45470660912851185, + "grad_norm": 1.4156224727630615, + "learning_rate": 0.0001194030805267161, + "loss": 1.1602, + "step": 12697 + }, + { + "epoch": 0.4547424212580801, + "grad_norm": 1.6115443706512451, + "learning_rate": 0.0001193917018174612, + "loss": 0.9595, + "step": 12698 + }, + { + "epoch": 0.4547782333876484, + "grad_norm": 2.2058093547821045, + "learning_rate": 0.00011938032284731599, + "loss": 1.196, + "step": 12699 + }, + { + "epoch": 0.45481404551721666, + "grad_norm": 1.5436464548110962, + "learning_rate": 0.00011936894361643351, + "loss": 0.9863, + "step": 12700 + }, + { + "epoch": 0.454849857646785, + "grad_norm": 1.509790062904358, + "learning_rate": 0.00011935756412496688, + "loss": 1.2253, + "step": 12701 + }, + { + "epoch": 0.45488566977635325, + "grad_norm": 1.5870587825775146, + "learning_rate": 0.00011934618437306921, + "loss": 1.0848, + "step": 12702 + }, + { + "epoch": 0.4549214819059215, + "grad_norm": 1.2906001806259155, + "learning_rate": 0.00011933480436089357, + "loss": 1.0713, + "step": 12703 + }, + { + "epoch": 0.45495729403548985, + "grad_norm": 1.3536715507507324, + "learning_rate": 0.0001193234240885931, + "loss": 0.9905, + "step": 12704 + }, + { + "epoch": 0.4549931061650581, + "grad_norm": 1.896675944328308, + "learning_rate": 0.00011931204355632089, + "loss": 1.1774, + "step": 12705 + }, + { + "epoch": 0.4550289182946264, + "grad_norm": 1.9894356727600098, + "learning_rate": 0.00011930066276423003, + "loss": 1.2173, + "step": 12706 + }, + { + "epoch": 0.45506473042419465, + "grad_norm": 2.037440538406372, + "learning_rate": 0.00011928928171247362, + "loss": 1.0778, + "step": 12707 + }, + { + "epoch": 0.455100542553763, + "grad_norm": 1.27736234664917, + "learning_rate": 0.00011927790040120484, + "loss": 1.0735, + "step": 12708 + }, + { + "epoch": 0.45513635468333125, + "grad_norm": 1.3397629261016846, + "learning_rate": 0.00011926651883057676, + "loss": 0.9825, + "step": 12709 + }, + { + "epoch": 0.4551721668128995, + "grad_norm": 1.5195871591567993, + "learning_rate": 0.00011925513700074253, + "loss": 1.0636, + "step": 12710 + }, + { + "epoch": 0.45520797894246784, + "grad_norm": 1.575372576713562, + "learning_rate": 0.00011924375491185526, + "loss": 1.0373, + "step": 12711 + }, + { + "epoch": 0.4552437910720361, + "grad_norm": 1.3513984680175781, + "learning_rate": 0.00011923237256406812, + "loss": 1.2544, + "step": 12712 + }, + { + "epoch": 0.4552796032016044, + "grad_norm": 1.373196005821228, + "learning_rate": 0.00011922098995753417, + "loss": 1.3546, + "step": 12713 + }, + { + "epoch": 0.45531541533117265, + "grad_norm": 1.412304162979126, + "learning_rate": 0.00011920960709240662, + "loss": 1.1806, + "step": 12714 + }, + { + "epoch": 0.45535122746074097, + "grad_norm": 1.6181211471557617, + "learning_rate": 0.0001191982239688386, + "loss": 0.9886, + "step": 12715 + }, + { + "epoch": 0.45538703959030924, + "grad_norm": 1.5537728071212769, + "learning_rate": 0.00011918684058698319, + "loss": 1.0711, + "step": 12716 + }, + { + "epoch": 0.4554228517198775, + "grad_norm": 1.4900199174880981, + "learning_rate": 0.00011917545694699365, + "loss": 1.0825, + "step": 12717 + }, + { + "epoch": 0.45545866384944583, + "grad_norm": 1.9259567260742188, + "learning_rate": 0.00011916407304902302, + "loss": 1.326, + "step": 12718 + }, + { + "epoch": 0.4554944759790141, + "grad_norm": 1.7112880945205688, + "learning_rate": 0.00011915268889322456, + "loss": 1.0466, + "step": 12719 + }, + { + "epoch": 0.45553028810858237, + "grad_norm": 1.717335820198059, + "learning_rate": 0.00011914130447975131, + "loss": 1.324, + "step": 12720 + }, + { + "epoch": 0.45556610023815064, + "grad_norm": 1.4079755544662476, + "learning_rate": 0.00011912991980875654, + "loss": 1.1596, + "step": 12721 + }, + { + "epoch": 0.45560191236771896, + "grad_norm": 1.45822012424469, + "learning_rate": 0.00011911853488039337, + "loss": 1.0729, + "step": 12722 + }, + { + "epoch": 0.45563772449728723, + "grad_norm": 1.8277711868286133, + "learning_rate": 0.00011910714969481498, + "loss": 1.2225, + "step": 12723 + }, + { + "epoch": 0.4556735366268555, + "grad_norm": 1.6497169733047485, + "learning_rate": 0.00011909576425217455, + "loss": 1.4093, + "step": 12724 + }, + { + "epoch": 0.4557093487564238, + "grad_norm": 1.5189549922943115, + "learning_rate": 0.0001190843785526252, + "loss": 1.1864, + "step": 12725 + }, + { + "epoch": 0.4557451608859921, + "grad_norm": 1.4255785942077637, + "learning_rate": 0.00011907299259632019, + "loss": 1.1687, + "step": 12726 + }, + { + "epoch": 0.45578097301556036, + "grad_norm": 1.348806619644165, + "learning_rate": 0.00011906160638341264, + "loss": 1.058, + "step": 12727 + }, + { + "epoch": 0.45581678514512863, + "grad_norm": 2.269932985305786, + "learning_rate": 0.00011905021991405578, + "loss": 1.3673, + "step": 12728 + }, + { + "epoch": 0.45585259727469696, + "grad_norm": 1.6271024942398071, + "learning_rate": 0.00011903883318840279, + "loss": 1.3138, + "step": 12729 + }, + { + "epoch": 0.4558884094042652, + "grad_norm": 1.4600716829299927, + "learning_rate": 0.00011902744620660686, + "loss": 1.0792, + "step": 12730 + }, + { + "epoch": 0.4559242215338335, + "grad_norm": 1.665052056312561, + "learning_rate": 0.00011901605896882116, + "loss": 1.2172, + "step": 12731 + }, + { + "epoch": 0.4559600336634018, + "grad_norm": 1.3798357248306274, + "learning_rate": 0.00011900467147519893, + "loss": 0.9671, + "step": 12732 + }, + { + "epoch": 0.4559958457929701, + "grad_norm": 1.4627691507339478, + "learning_rate": 0.00011899328372589338, + "loss": 1.1026, + "step": 12733 + }, + { + "epoch": 0.45603165792253836, + "grad_norm": 1.7417699098587036, + "learning_rate": 0.00011898189572105767, + "loss": 1.2225, + "step": 12734 + }, + { + "epoch": 0.4560674700521066, + "grad_norm": 1.567580223083496, + "learning_rate": 0.00011897050746084504, + "loss": 0.9933, + "step": 12735 + }, + { + "epoch": 0.45610328218167495, + "grad_norm": 1.5049108266830444, + "learning_rate": 0.0001189591189454087, + "loss": 1.047, + "step": 12736 + }, + { + "epoch": 0.4561390943112432, + "grad_norm": 1.3394869565963745, + "learning_rate": 0.00011894773017490189, + "loss": 1.0008, + "step": 12737 + }, + { + "epoch": 0.4561749064408115, + "grad_norm": 1.6732635498046875, + "learning_rate": 0.00011893634114947778, + "loss": 1.0088, + "step": 12738 + }, + { + "epoch": 0.4562107185703798, + "grad_norm": 1.563500165939331, + "learning_rate": 0.00011892495186928966, + "loss": 1.1214, + "step": 12739 + }, + { + "epoch": 0.4562465306999481, + "grad_norm": 1.6413320302963257, + "learning_rate": 0.00011891356233449069, + "loss": 1.1866, + "step": 12740 + }, + { + "epoch": 0.45628234282951635, + "grad_norm": 1.90431809425354, + "learning_rate": 0.0001189021725452342, + "loss": 1.203, + "step": 12741 + }, + { + "epoch": 0.4563181549590846, + "grad_norm": 1.6276259422302246, + "learning_rate": 0.00011889078250167329, + "loss": 1.0722, + "step": 12742 + }, + { + "epoch": 0.45635396708865295, + "grad_norm": 1.4253326654434204, + "learning_rate": 0.0001188793922039613, + "loss": 0.8892, + "step": 12743 + }, + { + "epoch": 0.4563897792182212, + "grad_norm": 1.9478073120117188, + "learning_rate": 0.00011886800165225143, + "loss": 1.2316, + "step": 12744 + }, + { + "epoch": 0.4564255913477895, + "grad_norm": 1.5627261400222778, + "learning_rate": 0.00011885661084669693, + "loss": 1.0628, + "step": 12745 + }, + { + "epoch": 0.4564614034773578, + "grad_norm": 1.3624008893966675, + "learning_rate": 0.00011884521978745106, + "loss": 1.138, + "step": 12746 + }, + { + "epoch": 0.4564972156069261, + "grad_norm": 1.5063775777816772, + "learning_rate": 0.00011883382847466706, + "loss": 1.2535, + "step": 12747 + }, + { + "epoch": 0.45653302773649435, + "grad_norm": 2.174795627593994, + "learning_rate": 0.00011882243690849824, + "loss": 1.0658, + "step": 12748 + }, + { + "epoch": 0.4565688398660626, + "grad_norm": 1.2083498239517212, + "learning_rate": 0.00011881104508909778, + "loss": 1.1123, + "step": 12749 + }, + { + "epoch": 0.45660465199563094, + "grad_norm": 1.5392276048660278, + "learning_rate": 0.00011879965301661897, + "loss": 1.2259, + "step": 12750 + }, + { + "epoch": 0.4566404641251992, + "grad_norm": 2.1384968757629395, + "learning_rate": 0.00011878826069121505, + "loss": 1.1208, + "step": 12751 + }, + { + "epoch": 0.4566762762547675, + "grad_norm": 1.2740819454193115, + "learning_rate": 0.00011877686811303937, + "loss": 1.065, + "step": 12752 + }, + { + "epoch": 0.4567120883843358, + "grad_norm": 1.4035472869873047, + "learning_rate": 0.00011876547528224511, + "loss": 1.2436, + "step": 12753 + }, + { + "epoch": 0.45674790051390407, + "grad_norm": 1.3804525136947632, + "learning_rate": 0.00011875408219898561, + "loss": 1.2091, + "step": 12754 + }, + { + "epoch": 0.45678371264347234, + "grad_norm": 1.5598496198654175, + "learning_rate": 0.00011874268886341409, + "loss": 1.2104, + "step": 12755 + }, + { + "epoch": 0.4568195247730406, + "grad_norm": 1.877087116241455, + "learning_rate": 0.00011873129527568388, + "loss": 1.2951, + "step": 12756 + }, + { + "epoch": 0.45685533690260893, + "grad_norm": 1.5937604904174805, + "learning_rate": 0.00011871990143594827, + "loss": 1.1451, + "step": 12757 + }, + { + "epoch": 0.4568911490321772, + "grad_norm": 1.8455636501312256, + "learning_rate": 0.00011870850734436054, + "loss": 1.1478, + "step": 12758 + }, + { + "epoch": 0.45692696116174547, + "grad_norm": 1.361972451210022, + "learning_rate": 0.00011869711300107398, + "loss": 1.0039, + "step": 12759 + }, + { + "epoch": 0.4569627732913138, + "grad_norm": 1.4559046030044556, + "learning_rate": 0.00011868571840624185, + "loss": 1.1705, + "step": 12760 + }, + { + "epoch": 0.45699858542088206, + "grad_norm": 1.7028831243515015, + "learning_rate": 0.0001186743235600175, + "loss": 1.2045, + "step": 12761 + }, + { + "epoch": 0.45703439755045033, + "grad_norm": 1.853192687034607, + "learning_rate": 0.0001186629284625542, + "loss": 1.2222, + "step": 12762 + }, + { + "epoch": 0.4570702096800186, + "grad_norm": 2.0500569343566895, + "learning_rate": 0.00011865153311400529, + "loss": 1.1089, + "step": 12763 + }, + { + "epoch": 0.4571060218095869, + "grad_norm": 1.5838327407836914, + "learning_rate": 0.00011864013751452405, + "loss": 1.1431, + "step": 12764 + }, + { + "epoch": 0.4571418339391552, + "grad_norm": 1.7606602907180786, + "learning_rate": 0.00011862874166426381, + "loss": 1.1221, + "step": 12765 + }, + { + "epoch": 0.45717764606872346, + "grad_norm": 1.503161907196045, + "learning_rate": 0.00011861734556337787, + "loss": 1.1596, + "step": 12766 + }, + { + "epoch": 0.4572134581982918, + "grad_norm": 1.28075110912323, + "learning_rate": 0.00011860594921201958, + "loss": 1.1226, + "step": 12767 + }, + { + "epoch": 0.45724927032786006, + "grad_norm": 1.850751519203186, + "learning_rate": 0.00011859455261034225, + "loss": 1.1983, + "step": 12768 + }, + { + "epoch": 0.4572850824574283, + "grad_norm": 1.2723721265792847, + "learning_rate": 0.00011858315575849914, + "loss": 1.0609, + "step": 12769 + }, + { + "epoch": 0.4573208945869966, + "grad_norm": 1.3242632150650024, + "learning_rate": 0.00011857175865664372, + "loss": 1.1474, + "step": 12770 + }, + { + "epoch": 0.4573567067165649, + "grad_norm": 1.6644277572631836, + "learning_rate": 0.00011856036130492917, + "loss": 1.358, + "step": 12771 + }, + { + "epoch": 0.4573925188461332, + "grad_norm": 1.4639084339141846, + "learning_rate": 0.00011854896370350894, + "loss": 1.0976, + "step": 12772 + }, + { + "epoch": 0.45742833097570146, + "grad_norm": 1.2741413116455078, + "learning_rate": 0.00011853756585253633, + "loss": 0.9659, + "step": 12773 + }, + { + "epoch": 0.4574641431052697, + "grad_norm": 1.412558913230896, + "learning_rate": 0.00011852616775216467, + "loss": 1.1946, + "step": 12774 + }, + { + "epoch": 0.45749995523483805, + "grad_norm": 1.372417688369751, + "learning_rate": 0.00011851476940254733, + "loss": 1.1625, + "step": 12775 + }, + { + "epoch": 0.4575357673644063, + "grad_norm": 1.4582303762435913, + "learning_rate": 0.00011850337080383764, + "loss": 1.1636, + "step": 12776 + }, + { + "epoch": 0.4575715794939746, + "grad_norm": 1.6482785940170288, + "learning_rate": 0.000118491971956189, + "loss": 1.0543, + "step": 12777 + }, + { + "epoch": 0.4576073916235429, + "grad_norm": 1.8477686643600464, + "learning_rate": 0.00011848057285975467, + "loss": 1.0698, + "step": 12778 + }, + { + "epoch": 0.4576432037531112, + "grad_norm": 1.4389973878860474, + "learning_rate": 0.00011846917351468811, + "loss": 0.8828, + "step": 12779 + }, + { + "epoch": 0.45767901588267945, + "grad_norm": 1.4612510204315186, + "learning_rate": 0.00011845777392114263, + "loss": 1.1288, + "step": 12780 + }, + { + "epoch": 0.4577148280122477, + "grad_norm": 1.6306989192962646, + "learning_rate": 0.00011844637407927161, + "loss": 0.9615, + "step": 12781 + }, + { + "epoch": 0.45775064014181605, + "grad_norm": 1.2744097709655762, + "learning_rate": 0.00011843497398922842, + "loss": 1.028, + "step": 12782 + }, + { + "epoch": 0.4577864522713843, + "grad_norm": 1.2008180618286133, + "learning_rate": 0.00011842357365116645, + "loss": 1.2495, + "step": 12783 + }, + { + "epoch": 0.4578222644009526, + "grad_norm": 2.0195956230163574, + "learning_rate": 0.00011841217306523904, + "loss": 1.2433, + "step": 12784 + }, + { + "epoch": 0.4578580765305209, + "grad_norm": 1.267651081085205, + "learning_rate": 0.00011840077223159965, + "loss": 1.1972, + "step": 12785 + }, + { + "epoch": 0.4578938886600892, + "grad_norm": 1.955867886543274, + "learning_rate": 0.00011838937115040154, + "loss": 1.2681, + "step": 12786 + }, + { + "epoch": 0.45792970078965745, + "grad_norm": 1.3317457437515259, + "learning_rate": 0.00011837796982179817, + "loss": 1.0699, + "step": 12787 + }, + { + "epoch": 0.4579655129192257, + "grad_norm": 2.45865797996521, + "learning_rate": 0.00011836656824594295, + "loss": 1.3084, + "step": 12788 + }, + { + "epoch": 0.45800132504879404, + "grad_norm": 1.632346272468567, + "learning_rate": 0.0001183551664229892, + "loss": 1.2941, + "step": 12789 + }, + { + "epoch": 0.4580371371783623, + "grad_norm": 1.7907370328903198, + "learning_rate": 0.0001183437643530904, + "loss": 1.2315, + "step": 12790 + }, + { + "epoch": 0.4580729493079306, + "grad_norm": 1.7416836023330688, + "learning_rate": 0.00011833236203639987, + "loss": 1.1555, + "step": 12791 + }, + { + "epoch": 0.4581087614374989, + "grad_norm": 1.608353614807129, + "learning_rate": 0.00011832095947307111, + "loss": 1.456, + "step": 12792 + }, + { + "epoch": 0.45814457356706717, + "grad_norm": 1.9044525623321533, + "learning_rate": 0.00011830955666325748, + "loss": 0.951, + "step": 12793 + }, + { + "epoch": 0.45818038569663544, + "grad_norm": 1.3248863220214844, + "learning_rate": 0.00011829815360711234, + "loss": 1.1373, + "step": 12794 + }, + { + "epoch": 0.4582161978262037, + "grad_norm": 1.6109696626663208, + "learning_rate": 0.00011828675030478915, + "loss": 1.1969, + "step": 12795 + }, + { + "epoch": 0.45825200995577203, + "grad_norm": 1.3437201976776123, + "learning_rate": 0.00011827534675644134, + "loss": 1.0607, + "step": 12796 + }, + { + "epoch": 0.4582878220853403, + "grad_norm": 1.5887658596038818, + "learning_rate": 0.00011826394296222229, + "loss": 1.1536, + "step": 12797 + }, + { + "epoch": 0.45832363421490857, + "grad_norm": 1.8441067934036255, + "learning_rate": 0.00011825253892228547, + "loss": 1.1507, + "step": 12798 + }, + { + "epoch": 0.4583594463444769, + "grad_norm": 1.7843148708343506, + "learning_rate": 0.00011824113463678427, + "loss": 1.2245, + "step": 12799 + }, + { + "epoch": 0.45839525847404516, + "grad_norm": 1.4131927490234375, + "learning_rate": 0.00011822973010587213, + "loss": 1.1889, + "step": 12800 + }, + { + "epoch": 0.45843107060361343, + "grad_norm": 1.4636985063552856, + "learning_rate": 0.0001182183253297025, + "loss": 1.0467, + "step": 12801 + }, + { + "epoch": 0.4584668827331817, + "grad_norm": 3.4810869693756104, + "learning_rate": 0.00011820692030842879, + "loss": 1.1045, + "step": 12802 + }, + { + "epoch": 0.45850269486275, + "grad_norm": 1.8076516389846802, + "learning_rate": 0.00011819551504220447, + "loss": 1.1362, + "step": 12803 + }, + { + "epoch": 0.4585385069923183, + "grad_norm": 1.6499656438827515, + "learning_rate": 0.00011818410953118296, + "loss": 1.0487, + "step": 12804 + }, + { + "epoch": 0.45857431912188656, + "grad_norm": 1.0858861207962036, + "learning_rate": 0.0001181727037755177, + "loss": 0.6789, + "step": 12805 + }, + { + "epoch": 0.4586101312514549, + "grad_norm": 1.3140679597854614, + "learning_rate": 0.00011816129777536216, + "loss": 0.9949, + "step": 12806 + }, + { + "epoch": 0.45864594338102316, + "grad_norm": 1.804017424583435, + "learning_rate": 0.00011814989153086977, + "loss": 1.1623, + "step": 12807 + }, + { + "epoch": 0.4586817555105914, + "grad_norm": 1.4117541313171387, + "learning_rate": 0.00011813848504219403, + "loss": 1.0292, + "step": 12808 + }, + { + "epoch": 0.4587175676401597, + "grad_norm": 1.6409507989883423, + "learning_rate": 0.00011812707830948835, + "loss": 1.2363, + "step": 12809 + }, + { + "epoch": 0.458753379769728, + "grad_norm": 1.5405890941619873, + "learning_rate": 0.0001181156713329062, + "loss": 1.0819, + "step": 12810 + }, + { + "epoch": 0.4587891918992963, + "grad_norm": 1.5508774518966675, + "learning_rate": 0.0001181042641126011, + "loss": 1.1142, + "step": 12811 + }, + { + "epoch": 0.45882500402886456, + "grad_norm": 1.3877745866775513, + "learning_rate": 0.00011809285664872645, + "loss": 0.9941, + "step": 12812 + }, + { + "epoch": 0.4588608161584329, + "grad_norm": 1.5650204420089722, + "learning_rate": 0.00011808144894143575, + "loss": 1.1975, + "step": 12813 + }, + { + "epoch": 0.45889662828800115, + "grad_norm": 1.595494270324707, + "learning_rate": 0.00011807004099088251, + "loss": 1.0959, + "step": 12814 + }, + { + "epoch": 0.4589324404175694, + "grad_norm": 1.5125110149383545, + "learning_rate": 0.00011805863279722014, + "loss": 1.2138, + "step": 12815 + }, + { + "epoch": 0.4589682525471377, + "grad_norm": 1.917132019996643, + "learning_rate": 0.00011804722436060218, + "loss": 1.2301, + "step": 12816 + }, + { + "epoch": 0.459004064676706, + "grad_norm": 1.5241032838821411, + "learning_rate": 0.00011803581568118207, + "loss": 1.0055, + "step": 12817 + }, + { + "epoch": 0.4590398768062743, + "grad_norm": 1.2011103630065918, + "learning_rate": 0.00011802440675911335, + "loss": 1.116, + "step": 12818 + }, + { + "epoch": 0.45907568893584255, + "grad_norm": 1.2361558675765991, + "learning_rate": 0.00011801299759454947, + "loss": 0.9764, + "step": 12819 + }, + { + "epoch": 0.4591115010654109, + "grad_norm": 1.676230549812317, + "learning_rate": 0.00011800158818764395, + "loss": 1.3599, + "step": 12820 + }, + { + "epoch": 0.45914731319497915, + "grad_norm": 1.378448486328125, + "learning_rate": 0.0001179901785385503, + "loss": 1.1084, + "step": 12821 + }, + { + "epoch": 0.4591831253245474, + "grad_norm": 2.1776928901672363, + "learning_rate": 0.00011797876864742198, + "loss": 1.242, + "step": 12822 + }, + { + "epoch": 0.4592189374541157, + "grad_norm": 1.3742048740386963, + "learning_rate": 0.00011796735851441254, + "loss": 0.983, + "step": 12823 + }, + { + "epoch": 0.459254749583684, + "grad_norm": 2.1170430183410645, + "learning_rate": 0.00011795594813967543, + "loss": 1.4399, + "step": 12824 + }, + { + "epoch": 0.4592905617132523, + "grad_norm": 1.350805401802063, + "learning_rate": 0.00011794453752336425, + "loss": 0.9938, + "step": 12825 + }, + { + "epoch": 0.45932637384282055, + "grad_norm": 2.3694815635681152, + "learning_rate": 0.00011793312666563241, + "loss": 1.2562, + "step": 12826 + }, + { + "epoch": 0.45936218597238887, + "grad_norm": 1.2787245512008667, + "learning_rate": 0.00011792171556663353, + "loss": 0.8625, + "step": 12827 + }, + { + "epoch": 0.45939799810195714, + "grad_norm": 1.5022379159927368, + "learning_rate": 0.00011791030422652105, + "loss": 0.9644, + "step": 12828 + }, + { + "epoch": 0.4594338102315254, + "grad_norm": 1.6127203702926636, + "learning_rate": 0.00011789889264544855, + "loss": 1.0366, + "step": 12829 + }, + { + "epoch": 0.4594696223610937, + "grad_norm": 1.3590091466903687, + "learning_rate": 0.00011788748082356955, + "loss": 0.9917, + "step": 12830 + }, + { + "epoch": 0.459505434490662, + "grad_norm": 1.8000396490097046, + "learning_rate": 0.00011787606876103753, + "loss": 1.251, + "step": 12831 + }, + { + "epoch": 0.45954124662023027, + "grad_norm": 1.5068577527999878, + "learning_rate": 0.00011786465645800609, + "loss": 1.1794, + "step": 12832 + }, + { + "epoch": 0.45957705874979854, + "grad_norm": 1.3951047658920288, + "learning_rate": 0.00011785324391462873, + "loss": 1.071, + "step": 12833 + }, + { + "epoch": 0.45961287087936686, + "grad_norm": 1.2631100416183472, + "learning_rate": 0.000117841831131059, + "loss": 1.2554, + "step": 12834 + }, + { + "epoch": 0.45964868300893513, + "grad_norm": 1.5704069137573242, + "learning_rate": 0.00011783041810745045, + "loss": 0.9219, + "step": 12835 + }, + { + "epoch": 0.4596844951385034, + "grad_norm": 1.465108871459961, + "learning_rate": 0.00011781900484395665, + "loss": 1.2055, + "step": 12836 + }, + { + "epoch": 0.45972030726807167, + "grad_norm": 1.4236599206924438, + "learning_rate": 0.00011780759134073107, + "loss": 1.4263, + "step": 12837 + }, + { + "epoch": 0.45975611939764, + "grad_norm": 1.7680556774139404, + "learning_rate": 0.00011779617759792738, + "loss": 1.127, + "step": 12838 + }, + { + "epoch": 0.45979193152720826, + "grad_norm": 1.7550184726715088, + "learning_rate": 0.00011778476361569903, + "loss": 1.0714, + "step": 12839 + }, + { + "epoch": 0.45982774365677653, + "grad_norm": 1.3634366989135742, + "learning_rate": 0.00011777334939419966, + "loss": 1.0375, + "step": 12840 + }, + { + "epoch": 0.45986355578634486, + "grad_norm": 1.5067209005355835, + "learning_rate": 0.00011776193493358278, + "loss": 1.2325, + "step": 12841 + }, + { + "epoch": 0.4598993679159131, + "grad_norm": 1.511208415031433, + "learning_rate": 0.00011775052023400197, + "loss": 1.1483, + "step": 12842 + }, + { + "epoch": 0.4599351800454814, + "grad_norm": 1.2370412349700928, + "learning_rate": 0.0001177391052956108, + "loss": 0.9368, + "step": 12843 + }, + { + "epoch": 0.45997099217504966, + "grad_norm": 1.3870946168899536, + "learning_rate": 0.00011772769011856286, + "loss": 1.1121, + "step": 12844 + }, + { + "epoch": 0.460006804304618, + "grad_norm": 1.6251839399337769, + "learning_rate": 0.00011771627470301174, + "loss": 1.1633, + "step": 12845 + }, + { + "epoch": 0.46004261643418626, + "grad_norm": 1.2918386459350586, + "learning_rate": 0.00011770485904911099, + "loss": 1.1337, + "step": 12846 + }, + { + "epoch": 0.4600784285637545, + "grad_norm": 1.4459412097930908, + "learning_rate": 0.0001176934431570142, + "loss": 1.0846, + "step": 12847 + }, + { + "epoch": 0.46011424069332285, + "grad_norm": 1.660775065422058, + "learning_rate": 0.00011768202702687492, + "loss": 1.0976, + "step": 12848 + }, + { + "epoch": 0.4601500528228911, + "grad_norm": 1.793946623802185, + "learning_rate": 0.00011767061065884682, + "loss": 1.2836, + "step": 12849 + }, + { + "epoch": 0.4601858649524594, + "grad_norm": 1.9801634550094604, + "learning_rate": 0.00011765919405308341, + "loss": 1.1691, + "step": 12850 + }, + { + "epoch": 0.46022167708202766, + "grad_norm": 1.478566288948059, + "learning_rate": 0.00011764777720973835, + "loss": 1.1165, + "step": 12851 + }, + { + "epoch": 0.460257489211596, + "grad_norm": 1.3731679916381836, + "learning_rate": 0.00011763636012896518, + "loss": 1.182, + "step": 12852 + }, + { + "epoch": 0.46029330134116425, + "grad_norm": 1.5502210855484009, + "learning_rate": 0.00011762494281091756, + "loss": 1.1924, + "step": 12853 + }, + { + "epoch": 0.4603291134707325, + "grad_norm": 1.672896146774292, + "learning_rate": 0.00011761352525574905, + "loss": 0.9635, + "step": 12854 + }, + { + "epoch": 0.46036492560030084, + "grad_norm": 1.9026237726211548, + "learning_rate": 0.00011760210746361329, + "loss": 1.2414, + "step": 12855 + }, + { + "epoch": 0.4604007377298691, + "grad_norm": 1.5120714902877808, + "learning_rate": 0.00011759068943466389, + "loss": 1.0656, + "step": 12856 + }, + { + "epoch": 0.4604365498594374, + "grad_norm": 1.2824711799621582, + "learning_rate": 0.00011757927116905442, + "loss": 1.0202, + "step": 12857 + }, + { + "epoch": 0.46047236198900565, + "grad_norm": 1.4918677806854248, + "learning_rate": 0.00011756785266693857, + "loss": 1.017, + "step": 12858 + }, + { + "epoch": 0.460508174118574, + "grad_norm": 1.6860451698303223, + "learning_rate": 0.00011755643392846991, + "loss": 1.3141, + "step": 12859 + }, + { + "epoch": 0.46054398624814225, + "grad_norm": 1.7015498876571655, + "learning_rate": 0.00011754501495380209, + "loss": 1.2353, + "step": 12860 + }, + { + "epoch": 0.4605797983777105, + "grad_norm": 1.6875501871109009, + "learning_rate": 0.00011753359574308869, + "loss": 1.1288, + "step": 12861 + }, + { + "epoch": 0.46061561050727884, + "grad_norm": 1.5361196994781494, + "learning_rate": 0.00011752217629648341, + "loss": 1.0369, + "step": 12862 + }, + { + "epoch": 0.4606514226368471, + "grad_norm": 1.9981660842895508, + "learning_rate": 0.00011751075661413982, + "loss": 1.2899, + "step": 12863 + }, + { + "epoch": 0.4606872347664154, + "grad_norm": 1.378311038017273, + "learning_rate": 0.00011749933669621161, + "loss": 1.2132, + "step": 12864 + }, + { + "epoch": 0.46072304689598365, + "grad_norm": 1.4558876752853394, + "learning_rate": 0.0001174879165428524, + "loss": 1.4192, + "step": 12865 + }, + { + "epoch": 0.46075885902555197, + "grad_norm": 1.6041709184646606, + "learning_rate": 0.00011747649615421581, + "loss": 1.1504, + "step": 12866 + }, + { + "epoch": 0.46079467115512024, + "grad_norm": 1.2337498664855957, + "learning_rate": 0.00011746507553045552, + "loss": 1.045, + "step": 12867 + }, + { + "epoch": 0.4608304832846885, + "grad_norm": 1.4614715576171875, + "learning_rate": 0.00011745365467172516, + "loss": 1.1161, + "step": 12868 + }, + { + "epoch": 0.46086629541425683, + "grad_norm": 1.7865307331085205, + "learning_rate": 0.00011744223357817841, + "loss": 1.302, + "step": 12869 + }, + { + "epoch": 0.4609021075438251, + "grad_norm": 1.8010350465774536, + "learning_rate": 0.00011743081224996888, + "loss": 1.2139, + "step": 12870 + }, + { + "epoch": 0.46093791967339337, + "grad_norm": 1.7880151271820068, + "learning_rate": 0.00011741939068725027, + "loss": 1.172, + "step": 12871 + }, + { + "epoch": 0.46097373180296164, + "grad_norm": 1.5827863216400146, + "learning_rate": 0.00011740796889017623, + "loss": 1.0758, + "step": 12872 + }, + { + "epoch": 0.46100954393252996, + "grad_norm": 1.520300269126892, + "learning_rate": 0.0001173965468589004, + "loss": 1.0876, + "step": 12873 + }, + { + "epoch": 0.46104535606209823, + "grad_norm": 1.5395957231521606, + "learning_rate": 0.0001173851245935765, + "loss": 1.1595, + "step": 12874 + }, + { + "epoch": 0.4610811681916665, + "grad_norm": 1.5589860677719116, + "learning_rate": 0.00011737370209435816, + "loss": 0.966, + "step": 12875 + }, + { + "epoch": 0.4611169803212348, + "grad_norm": 2.0641140937805176, + "learning_rate": 0.00011736227936139908, + "loss": 1.0309, + "step": 12876 + }, + { + "epoch": 0.4611527924508031, + "grad_norm": 1.7185477018356323, + "learning_rate": 0.00011735085639485291, + "loss": 1.1443, + "step": 12877 + }, + { + "epoch": 0.46118860458037136, + "grad_norm": 1.7839844226837158, + "learning_rate": 0.00011733943319487337, + "loss": 1.2662, + "step": 12878 + }, + { + "epoch": 0.46122441670993963, + "grad_norm": 1.5835062265396118, + "learning_rate": 0.00011732800976161408, + "loss": 1.1712, + "step": 12879 + }, + { + "epoch": 0.46126022883950796, + "grad_norm": 1.7096202373504639, + "learning_rate": 0.00011731658609522881, + "loss": 0.9401, + "step": 12880 + }, + { + "epoch": 0.4612960409690762, + "grad_norm": 1.5889978408813477, + "learning_rate": 0.0001173051621958712, + "loss": 1.4699, + "step": 12881 + }, + { + "epoch": 0.4613318530986445, + "grad_norm": 1.4253778457641602, + "learning_rate": 0.00011729373806369499, + "loss": 1.2103, + "step": 12882 + }, + { + "epoch": 0.4613676652282128, + "grad_norm": 1.3450847864151, + "learning_rate": 0.0001172823136988538, + "loss": 0.9568, + "step": 12883 + }, + { + "epoch": 0.4614034773577811, + "grad_norm": 2.1633834838867188, + "learning_rate": 0.00011727088910150137, + "loss": 1.1124, + "step": 12884 + }, + { + "epoch": 0.46143928948734936, + "grad_norm": 1.8221745491027832, + "learning_rate": 0.00011725946427179142, + "loss": 1.4241, + "step": 12885 + }, + { + "epoch": 0.4614751016169176, + "grad_norm": 1.8547481298446655, + "learning_rate": 0.00011724803920987761, + "loss": 1.2727, + "step": 12886 + }, + { + "epoch": 0.46151091374648595, + "grad_norm": 1.765018343925476, + "learning_rate": 0.00011723661391591371, + "loss": 1.2582, + "step": 12887 + }, + { + "epoch": 0.4615467258760542, + "grad_norm": 1.492621660232544, + "learning_rate": 0.00011722518839005341, + "loss": 1.1316, + "step": 12888 + }, + { + "epoch": 0.4615825380056225, + "grad_norm": 1.305405616760254, + "learning_rate": 0.00011721376263245041, + "loss": 1.2098, + "step": 12889 + }, + { + "epoch": 0.4616183501351908, + "grad_norm": 1.6089801788330078, + "learning_rate": 0.00011720233664325846, + "loss": 1.028, + "step": 12890 + }, + { + "epoch": 0.4616541622647591, + "grad_norm": 1.543152928352356, + "learning_rate": 0.00011719091042263124, + "loss": 1.2431, + "step": 12891 + }, + { + "epoch": 0.46168997439432735, + "grad_norm": 1.8039484024047852, + "learning_rate": 0.00011717948397072246, + "loss": 1.306, + "step": 12892 + }, + { + "epoch": 0.4617257865238956, + "grad_norm": 1.5564934015274048, + "learning_rate": 0.00011716805728768593, + "loss": 1.1387, + "step": 12893 + }, + { + "epoch": 0.46176159865346394, + "grad_norm": 1.6431461572647095, + "learning_rate": 0.00011715663037367532, + "loss": 1.1318, + "step": 12894 + }, + { + "epoch": 0.4617974107830322, + "grad_norm": 1.648577094078064, + "learning_rate": 0.00011714520322884439, + "loss": 0.9988, + "step": 12895 + }, + { + "epoch": 0.4618332229126005, + "grad_norm": 1.7138605117797852, + "learning_rate": 0.00011713377585334684, + "loss": 1.1276, + "step": 12896 + }, + { + "epoch": 0.4618690350421688, + "grad_norm": 1.637879490852356, + "learning_rate": 0.00011712234824733644, + "loss": 1.2587, + "step": 12897 + }, + { + "epoch": 0.4619048471717371, + "grad_norm": 1.4242366552352905, + "learning_rate": 0.00011711092041096693, + "loss": 1.0088, + "step": 12898 + }, + { + "epoch": 0.46194065930130535, + "grad_norm": 1.937127709388733, + "learning_rate": 0.0001170994923443921, + "loss": 1.1888, + "step": 12899 + }, + { + "epoch": 0.4619764714308736, + "grad_norm": 1.5222843885421753, + "learning_rate": 0.00011708806404776563, + "loss": 1.1402, + "step": 12900 + }, + { + "epoch": 0.46201228356044194, + "grad_norm": 1.4184266328811646, + "learning_rate": 0.00011707663552124128, + "loss": 1.3039, + "step": 12901 + }, + { + "epoch": 0.4620480956900102, + "grad_norm": 1.7487369775772095, + "learning_rate": 0.00011706520676497285, + "loss": 1.3075, + "step": 12902 + }, + { + "epoch": 0.4620839078195785, + "grad_norm": 1.6045348644256592, + "learning_rate": 0.00011705377777911406, + "loss": 1.1353, + "step": 12903 + }, + { + "epoch": 0.4621197199491468, + "grad_norm": 2.7303502559661865, + "learning_rate": 0.0001170423485638187, + "loss": 1.3173, + "step": 12904 + }, + { + "epoch": 0.46215553207871507, + "grad_norm": 1.2515852451324463, + "learning_rate": 0.00011703091911924051, + "loss": 1.0451, + "step": 12905 + }, + { + "epoch": 0.46219134420828334, + "grad_norm": 1.3844231367111206, + "learning_rate": 0.0001170194894455333, + "loss": 0.9562, + "step": 12906 + }, + { + "epoch": 0.4622271563378516, + "grad_norm": 1.640581727027893, + "learning_rate": 0.0001170080595428508, + "loss": 1.0944, + "step": 12907 + }, + { + "epoch": 0.46226296846741993, + "grad_norm": 1.4188117980957031, + "learning_rate": 0.00011699662941134679, + "loss": 1.005, + "step": 12908 + }, + { + "epoch": 0.4622987805969882, + "grad_norm": 1.7253326177597046, + "learning_rate": 0.00011698519905117507, + "loss": 1.102, + "step": 12909 + }, + { + "epoch": 0.46233459272655647, + "grad_norm": 1.5747073888778687, + "learning_rate": 0.00011697376846248937, + "loss": 1.351, + "step": 12910 + }, + { + "epoch": 0.4623704048561248, + "grad_norm": 1.5391029119491577, + "learning_rate": 0.00011696233764544353, + "loss": 1.0081, + "step": 12911 + }, + { + "epoch": 0.46240621698569306, + "grad_norm": 1.6457544565200806, + "learning_rate": 0.00011695090660019132, + "loss": 1.1252, + "step": 12912 + }, + { + "epoch": 0.46244202911526133, + "grad_norm": 1.8388975858688354, + "learning_rate": 0.00011693947532688653, + "loss": 1.1842, + "step": 12913 + }, + { + "epoch": 0.4624778412448296, + "grad_norm": 1.7194340229034424, + "learning_rate": 0.00011692804382568294, + "loss": 1.208, + "step": 12914 + }, + { + "epoch": 0.4625136533743979, + "grad_norm": 1.5401338338851929, + "learning_rate": 0.00011691661209673437, + "loss": 1.1712, + "step": 12915 + }, + { + "epoch": 0.4625494655039662, + "grad_norm": 1.5445610284805298, + "learning_rate": 0.00011690518014019458, + "loss": 1.0082, + "step": 12916 + }, + { + "epoch": 0.46258527763353446, + "grad_norm": 1.5305681228637695, + "learning_rate": 0.00011689374795621744, + "loss": 1.0655, + "step": 12917 + }, + { + "epoch": 0.4626210897631028, + "grad_norm": 1.3602449893951416, + "learning_rate": 0.00011688231554495668, + "loss": 1.0631, + "step": 12918 + }, + { + "epoch": 0.46265690189267106, + "grad_norm": 1.5425820350646973, + "learning_rate": 0.00011687088290656613, + "loss": 1.0184, + "step": 12919 + }, + { + "epoch": 0.4626927140222393, + "grad_norm": 1.4511734247207642, + "learning_rate": 0.00011685945004119965, + "loss": 1.2059, + "step": 12920 + }, + { + "epoch": 0.4627285261518076, + "grad_norm": 1.8196427822113037, + "learning_rate": 0.00011684801694901099, + "loss": 1.1081, + "step": 12921 + }, + { + "epoch": 0.4627643382813759, + "grad_norm": 1.8480161428451538, + "learning_rate": 0.00011683658363015402, + "loss": 1.0604, + "step": 12922 + }, + { + "epoch": 0.4628001504109442, + "grad_norm": 1.71962308883667, + "learning_rate": 0.0001168251500847825, + "loss": 1.1579, + "step": 12923 + }, + { + "epoch": 0.46283596254051246, + "grad_norm": 1.7273693084716797, + "learning_rate": 0.00011681371631305032, + "loss": 1.2409, + "step": 12924 + }, + { + "epoch": 0.4628717746700808, + "grad_norm": 1.956167221069336, + "learning_rate": 0.00011680228231511123, + "loss": 1.0203, + "step": 12925 + }, + { + "epoch": 0.46290758679964905, + "grad_norm": 1.4352366924285889, + "learning_rate": 0.00011679084809111915, + "loss": 1.0438, + "step": 12926 + }, + { + "epoch": 0.4629433989292173, + "grad_norm": 1.5460764169692993, + "learning_rate": 0.00011677941364122787, + "loss": 1.102, + "step": 12927 + }, + { + "epoch": 0.4629792110587856, + "grad_norm": 1.423625111579895, + "learning_rate": 0.0001167679789655912, + "loss": 1.0547, + "step": 12928 + }, + { + "epoch": 0.4630150231883539, + "grad_norm": 1.561253309249878, + "learning_rate": 0.00011675654406436301, + "loss": 0.9999, + "step": 12929 + }, + { + "epoch": 0.4630508353179222, + "grad_norm": 2.5965397357940674, + "learning_rate": 0.00011674510893769713, + "loss": 1.1644, + "step": 12930 + }, + { + "epoch": 0.46308664744749045, + "grad_norm": 1.4719523191452026, + "learning_rate": 0.00011673367358574741, + "loss": 0.8278, + "step": 12931 + }, + { + "epoch": 0.4631224595770588, + "grad_norm": 1.579182744026184, + "learning_rate": 0.00011672223800866768, + "loss": 1.2618, + "step": 12932 + }, + { + "epoch": 0.46315827170662704, + "grad_norm": 1.9788135290145874, + "learning_rate": 0.00011671080220661183, + "loss": 1.175, + "step": 12933 + }, + { + "epoch": 0.4631940838361953, + "grad_norm": 1.722080945968628, + "learning_rate": 0.00011669936617973367, + "loss": 1.1423, + "step": 12934 + }, + { + "epoch": 0.4632298959657636, + "grad_norm": 1.573500156402588, + "learning_rate": 0.00011668792992818714, + "loss": 1.0766, + "step": 12935 + }, + { + "epoch": 0.4632657080953319, + "grad_norm": 1.314070224761963, + "learning_rate": 0.000116676493452126, + "loss": 1.2931, + "step": 12936 + }, + { + "epoch": 0.4633015202249002, + "grad_norm": 1.5557459592819214, + "learning_rate": 0.00011666505675170413, + "loss": 1.2613, + "step": 12937 + }, + { + "epoch": 0.46333733235446845, + "grad_norm": 1.5434309244155884, + "learning_rate": 0.00011665361982707543, + "loss": 1.2088, + "step": 12938 + }, + { + "epoch": 0.46337314448403677, + "grad_norm": 1.306604027748108, + "learning_rate": 0.00011664218267839375, + "loss": 0.9974, + "step": 12939 + }, + { + "epoch": 0.46340895661360504, + "grad_norm": 1.8093574047088623, + "learning_rate": 0.000116630745305813, + "loss": 1.141, + "step": 12940 + }, + { + "epoch": 0.4634447687431733, + "grad_norm": 1.531246304512024, + "learning_rate": 0.00011661930770948699, + "loss": 1.0673, + "step": 12941 + }, + { + "epoch": 0.4634805808727416, + "grad_norm": 1.4207147359848022, + "learning_rate": 0.00011660786988956964, + "loss": 0.9889, + "step": 12942 + }, + { + "epoch": 0.4635163930023099, + "grad_norm": 1.5306403636932373, + "learning_rate": 0.00011659643184621485, + "loss": 1.1722, + "step": 12943 + }, + { + "epoch": 0.46355220513187817, + "grad_norm": 1.7014849185943604, + "learning_rate": 0.00011658499357957646, + "loss": 1.3024, + "step": 12944 + }, + { + "epoch": 0.46358801726144644, + "grad_norm": 1.685126543045044, + "learning_rate": 0.00011657355508980836, + "loss": 1.0802, + "step": 12945 + }, + { + "epoch": 0.46362382939101476, + "grad_norm": 1.6655924320220947, + "learning_rate": 0.00011656211637706449, + "loss": 1.2917, + "step": 12946 + }, + { + "epoch": 0.46365964152058303, + "grad_norm": 1.6543450355529785, + "learning_rate": 0.00011655067744149865, + "loss": 1.1111, + "step": 12947 + }, + { + "epoch": 0.4636954536501513, + "grad_norm": 1.5443823337554932, + "learning_rate": 0.00011653923828326485, + "loss": 1.0083, + "step": 12948 + }, + { + "epoch": 0.46373126577971957, + "grad_norm": 1.3118226528167725, + "learning_rate": 0.0001165277989025169, + "loss": 1.0277, + "step": 12949 + }, + { + "epoch": 0.4637670779092879, + "grad_norm": 1.3588968515396118, + "learning_rate": 0.00011651635929940874, + "loss": 0.9267, + "step": 12950 + }, + { + "epoch": 0.46380289003885616, + "grad_norm": 1.8623311519622803, + "learning_rate": 0.00011650491947409427, + "loss": 1.2937, + "step": 12951 + }, + { + "epoch": 0.46383870216842443, + "grad_norm": 1.5189707279205322, + "learning_rate": 0.00011649347942672741, + "loss": 1.2066, + "step": 12952 + }, + { + "epoch": 0.46387451429799276, + "grad_norm": 1.5750439167022705, + "learning_rate": 0.00011648203915746208, + "loss": 1.1654, + "step": 12953 + }, + { + "epoch": 0.463910326427561, + "grad_norm": 1.558652639389038, + "learning_rate": 0.00011647059866645213, + "loss": 1.1217, + "step": 12954 + }, + { + "epoch": 0.4639461385571293, + "grad_norm": 1.2012755870819092, + "learning_rate": 0.00011645915795385154, + "loss": 1.2067, + "step": 12955 + }, + { + "epoch": 0.46398195068669756, + "grad_norm": 1.5334841012954712, + "learning_rate": 0.0001164477170198142, + "loss": 1.0633, + "step": 12956 + }, + { + "epoch": 0.4640177628162659, + "grad_norm": 1.573493480682373, + "learning_rate": 0.00011643627586449406, + "loss": 1.1789, + "step": 12957 + }, + { + "epoch": 0.46405357494583416, + "grad_norm": 2.0128133296966553, + "learning_rate": 0.000116424834488045, + "loss": 1.1399, + "step": 12958 + }, + { + "epoch": 0.4640893870754024, + "grad_norm": 1.897806167602539, + "learning_rate": 0.00011641339289062101, + "loss": 1.2366, + "step": 12959 + }, + { + "epoch": 0.46412519920497075, + "grad_norm": 1.8216419219970703, + "learning_rate": 0.00011640195107237596, + "loss": 0.9974, + "step": 12960 + }, + { + "epoch": 0.464161011334539, + "grad_norm": 1.4891393184661865, + "learning_rate": 0.00011639050903346387, + "loss": 1.0776, + "step": 12961 + }, + { + "epoch": 0.4641968234641073, + "grad_norm": 1.773279070854187, + "learning_rate": 0.00011637906677403859, + "loss": 1.0383, + "step": 12962 + }, + { + "epoch": 0.46423263559367556, + "grad_norm": 1.2488069534301758, + "learning_rate": 0.00011636762429425407, + "loss": 0.8826, + "step": 12963 + }, + { + "epoch": 0.4642684477232439, + "grad_norm": 2.354229688644409, + "learning_rate": 0.0001163561815942643, + "loss": 1.1379, + "step": 12964 + }, + { + "epoch": 0.46430425985281215, + "grad_norm": 1.767897129058838, + "learning_rate": 0.00011634473867422322, + "loss": 1.1275, + "step": 12965 + }, + { + "epoch": 0.4643400719823804, + "grad_norm": 1.5435453653335571, + "learning_rate": 0.00011633329553428476, + "loss": 0.9603, + "step": 12966 + }, + { + "epoch": 0.46437588411194874, + "grad_norm": 1.3649662733078003, + "learning_rate": 0.00011632185217460283, + "loss": 1.2017, + "step": 12967 + }, + { + "epoch": 0.464411696241517, + "grad_norm": 1.6677131652832031, + "learning_rate": 0.00011631040859533148, + "loss": 0.9938, + "step": 12968 + }, + { + "epoch": 0.4644475083710853, + "grad_norm": 1.8299020528793335, + "learning_rate": 0.00011629896479662461, + "loss": 1.2143, + "step": 12969 + }, + { + "epoch": 0.46448332050065355, + "grad_norm": 1.3128690719604492, + "learning_rate": 0.0001162875207786362, + "loss": 1.0569, + "step": 12970 + }, + { + "epoch": 0.4645191326302219, + "grad_norm": 1.1798003911972046, + "learning_rate": 0.00011627607654152022, + "loss": 1.1881, + "step": 12971 + }, + { + "epoch": 0.46455494475979014, + "grad_norm": 1.727105975151062, + "learning_rate": 0.0001162646320854306, + "loss": 1.0217, + "step": 12972 + }, + { + "epoch": 0.4645907568893584, + "grad_norm": 1.904604196548462, + "learning_rate": 0.00011625318741052133, + "loss": 1.3547, + "step": 12973 + }, + { + "epoch": 0.4646265690189267, + "grad_norm": 2.3534579277038574, + "learning_rate": 0.0001162417425169464, + "loss": 0.9925, + "step": 12974 + }, + { + "epoch": 0.464662381148495, + "grad_norm": 1.6690727472305298, + "learning_rate": 0.00011623029740485978, + "loss": 1.0823, + "step": 12975 + }, + { + "epoch": 0.4646981932780633, + "grad_norm": 1.2566229104995728, + "learning_rate": 0.00011621885207441541, + "loss": 1.1837, + "step": 12976 + }, + { + "epoch": 0.46473400540763155, + "grad_norm": 1.3291035890579224, + "learning_rate": 0.00011620740652576736, + "loss": 0.9458, + "step": 12977 + }, + { + "epoch": 0.46476981753719987, + "grad_norm": 1.6138832569122314, + "learning_rate": 0.0001161959607590695, + "loss": 1.059, + "step": 12978 + }, + { + "epoch": 0.46480562966676814, + "grad_norm": 1.9339349269866943, + "learning_rate": 0.00011618451477447596, + "loss": 1.1284, + "step": 12979 + }, + { + "epoch": 0.4648414417963364, + "grad_norm": 2.044161319732666, + "learning_rate": 0.00011617306857214059, + "loss": 1.1889, + "step": 12980 + }, + { + "epoch": 0.4648772539259047, + "grad_norm": 1.6172083616256714, + "learning_rate": 0.00011616162215221744, + "loss": 1.0243, + "step": 12981 + }, + { + "epoch": 0.464913066055473, + "grad_norm": 1.6563080549240112, + "learning_rate": 0.00011615017551486054, + "loss": 1.3731, + "step": 12982 + }, + { + "epoch": 0.46494887818504127, + "grad_norm": 1.5973105430603027, + "learning_rate": 0.00011613872866022384, + "loss": 1.1401, + "step": 12983 + }, + { + "epoch": 0.46498469031460954, + "grad_norm": 1.5693509578704834, + "learning_rate": 0.00011612728158846138, + "loss": 1.1924, + "step": 12984 + }, + { + "epoch": 0.46502050244417786, + "grad_norm": 1.9536948204040527, + "learning_rate": 0.00011611583429972715, + "loss": 1.1042, + "step": 12985 + }, + { + "epoch": 0.46505631457374613, + "grad_norm": 1.6566978693008423, + "learning_rate": 0.00011610438679417515, + "loss": 1.1502, + "step": 12986 + }, + { + "epoch": 0.4650921267033144, + "grad_norm": 1.5025293827056885, + "learning_rate": 0.0001160929390719594, + "loss": 1.2177, + "step": 12987 + }, + { + "epoch": 0.46512793883288267, + "grad_norm": 3.031271457672119, + "learning_rate": 0.00011608149113323392, + "loss": 1.1303, + "step": 12988 + }, + { + "epoch": 0.465163750962451, + "grad_norm": 2.1164743900299072, + "learning_rate": 0.00011607004297815271, + "loss": 1.0103, + "step": 12989 + }, + { + "epoch": 0.46519956309201926, + "grad_norm": 1.353331446647644, + "learning_rate": 0.00011605859460686981, + "loss": 1.1582, + "step": 12990 + }, + { + "epoch": 0.46523537522158753, + "grad_norm": 1.5107905864715576, + "learning_rate": 0.00011604714601953922, + "loss": 1.2001, + "step": 12991 + }, + { + "epoch": 0.46527118735115586, + "grad_norm": 1.0393067598342896, + "learning_rate": 0.00011603569721631499, + "loss": 0.7417, + "step": 12992 + }, + { + "epoch": 0.4653069994807241, + "grad_norm": 1.4743016958236694, + "learning_rate": 0.00011602424819735111, + "loss": 1.2249, + "step": 12993 + }, + { + "epoch": 0.4653428116102924, + "grad_norm": 1.5858477354049683, + "learning_rate": 0.00011601279896280167, + "loss": 1.3263, + "step": 12994 + }, + { + "epoch": 0.46537862373986066, + "grad_norm": 1.328994870185852, + "learning_rate": 0.00011600134951282067, + "loss": 1.2662, + "step": 12995 + }, + { + "epoch": 0.465414435869429, + "grad_norm": 1.7247636318206787, + "learning_rate": 0.00011598989984756216, + "loss": 1.2039, + "step": 12996 + }, + { + "epoch": 0.46545024799899726, + "grad_norm": 1.5173218250274658, + "learning_rate": 0.0001159784499671802, + "loss": 1.4346, + "step": 12997 + }, + { + "epoch": 0.4654860601285655, + "grad_norm": 1.4862293004989624, + "learning_rate": 0.00011596699987182873, + "loss": 1.1026, + "step": 12998 + }, + { + "epoch": 0.46552187225813385, + "grad_norm": 1.6942188739776611, + "learning_rate": 0.00011595554956166195, + "loss": 1.1035, + "step": 12999 + }, + { + "epoch": 0.4655576843877021, + "grad_norm": 1.3264648914337158, + "learning_rate": 0.00011594409903683376, + "loss": 1.1319, + "step": 13000 + }, + { + "epoch": 0.4655934965172704, + "grad_norm": 1.5225495100021362, + "learning_rate": 0.00011593264829749835, + "loss": 1.1041, + "step": 13001 + }, + { + "epoch": 0.46562930864683866, + "grad_norm": 1.3096891641616821, + "learning_rate": 0.00011592119734380966, + "loss": 1.3282, + "step": 13002 + }, + { + "epoch": 0.465665120776407, + "grad_norm": 1.6344177722930908, + "learning_rate": 0.00011590974617592182, + "loss": 1.2728, + "step": 13003 + }, + { + "epoch": 0.46570093290597525, + "grad_norm": 1.7194674015045166, + "learning_rate": 0.00011589829479398886, + "loss": 1.398, + "step": 13004 + }, + { + "epoch": 0.4657367450355435, + "grad_norm": 1.7166026830673218, + "learning_rate": 0.00011588684319816485, + "loss": 1.0528, + "step": 13005 + }, + { + "epoch": 0.46577255716511184, + "grad_norm": 1.5571637153625488, + "learning_rate": 0.00011587539138860388, + "loss": 1.1496, + "step": 13006 + }, + { + "epoch": 0.4658083692946801, + "grad_norm": 2.0319111347198486, + "learning_rate": 0.00011586393936545995, + "loss": 1.2148, + "step": 13007 + }, + { + "epoch": 0.4658441814242484, + "grad_norm": 1.4007229804992676, + "learning_rate": 0.00011585248712888724, + "loss": 1.2989, + "step": 13008 + }, + { + "epoch": 0.46587999355381665, + "grad_norm": 1.6871718168258667, + "learning_rate": 0.0001158410346790397, + "loss": 1.095, + "step": 13009 + }, + { + "epoch": 0.465915805683385, + "grad_norm": 1.4719923734664917, + "learning_rate": 0.00011582958201607152, + "loss": 1.2291, + "step": 13010 + }, + { + "epoch": 0.46595161781295324, + "grad_norm": 1.3594721555709839, + "learning_rate": 0.0001158181291401367, + "loss": 1.1602, + "step": 13011 + }, + { + "epoch": 0.4659874299425215, + "grad_norm": 1.2007436752319336, + "learning_rate": 0.00011580667605138937, + "loss": 1.0616, + "step": 13012 + }, + { + "epoch": 0.46602324207208984, + "grad_norm": 1.4416711330413818, + "learning_rate": 0.0001157952227499836, + "loss": 1.4419, + "step": 13013 + }, + { + "epoch": 0.4660590542016581, + "grad_norm": 1.7252696752548218, + "learning_rate": 0.0001157837692360735, + "loss": 1.0215, + "step": 13014 + }, + { + "epoch": 0.4660948663312264, + "grad_norm": 1.8440748453140259, + "learning_rate": 0.00011577231550981313, + "loss": 1.3111, + "step": 13015 + }, + { + "epoch": 0.46613067846079465, + "grad_norm": 1.8201357126235962, + "learning_rate": 0.00011576086157135659, + "loss": 1.1366, + "step": 13016 + }, + { + "epoch": 0.46616649059036297, + "grad_norm": 1.581466555595398, + "learning_rate": 0.00011574940742085803, + "loss": 1.2044, + "step": 13017 + }, + { + "epoch": 0.46620230271993124, + "grad_norm": 1.7736271619796753, + "learning_rate": 0.00011573795305847146, + "loss": 1.2101, + "step": 13018 + }, + { + "epoch": 0.4662381148494995, + "grad_norm": 1.6567494869232178, + "learning_rate": 0.00011572649848435104, + "loss": 1.3597, + "step": 13019 + }, + { + "epoch": 0.46627392697906783, + "grad_norm": 1.4900007247924805, + "learning_rate": 0.00011571504369865087, + "loss": 1.0331, + "step": 13020 + }, + { + "epoch": 0.4663097391086361, + "grad_norm": 2.256582260131836, + "learning_rate": 0.0001157035887015251, + "loss": 1.0375, + "step": 13021 + }, + { + "epoch": 0.46634555123820437, + "grad_norm": 1.5757055282592773, + "learning_rate": 0.00011569213349312773, + "loss": 1.4944, + "step": 13022 + }, + { + "epoch": 0.46638136336777264, + "grad_norm": 2.9196863174438477, + "learning_rate": 0.000115680678073613, + "loss": 1.3384, + "step": 13023 + }, + { + "epoch": 0.46641717549734096, + "grad_norm": 1.7478206157684326, + "learning_rate": 0.00011566922244313496, + "loss": 1.1045, + "step": 13024 + }, + { + "epoch": 0.46645298762690923, + "grad_norm": 1.5183589458465576, + "learning_rate": 0.00011565776660184772, + "loss": 1.0929, + "step": 13025 + }, + { + "epoch": 0.4664887997564775, + "grad_norm": 1.5575188398361206, + "learning_rate": 0.00011564631054990546, + "loss": 1.1352, + "step": 13026 + }, + { + "epoch": 0.4665246118860458, + "grad_norm": 1.2238153219223022, + "learning_rate": 0.00011563485428746226, + "loss": 1.0937, + "step": 13027 + }, + { + "epoch": 0.4665604240156141, + "grad_norm": 1.4309768676757812, + "learning_rate": 0.00011562339781467226, + "loss": 1.123, + "step": 13028 + }, + { + "epoch": 0.46659623614518236, + "grad_norm": 1.7498738765716553, + "learning_rate": 0.00011561194113168958, + "loss": 1.0771, + "step": 13029 + }, + { + "epoch": 0.46663204827475063, + "grad_norm": 1.5791773796081543, + "learning_rate": 0.0001156004842386684, + "loss": 1.2011, + "step": 13030 + }, + { + "epoch": 0.46666786040431896, + "grad_norm": 1.2736420631408691, + "learning_rate": 0.0001155890271357628, + "loss": 0.8811, + "step": 13031 + }, + { + "epoch": 0.4667036725338872, + "grad_norm": 1.5224515199661255, + "learning_rate": 0.00011557756982312699, + "loss": 1.2047, + "step": 13032 + }, + { + "epoch": 0.4667394846634555, + "grad_norm": 1.5326662063598633, + "learning_rate": 0.00011556611230091502, + "loss": 1.2858, + "step": 13033 + }, + { + "epoch": 0.4667752967930238, + "grad_norm": 1.477596402168274, + "learning_rate": 0.00011555465456928114, + "loss": 1.0111, + "step": 13034 + }, + { + "epoch": 0.4668111089225921, + "grad_norm": 1.570785641670227, + "learning_rate": 0.0001155431966283794, + "loss": 0.8891, + "step": 13035 + }, + { + "epoch": 0.46684692105216036, + "grad_norm": 1.4675427675247192, + "learning_rate": 0.00011553173847836403, + "loss": 1.0595, + "step": 13036 + }, + { + "epoch": 0.4668827331817286, + "grad_norm": 2.1790549755096436, + "learning_rate": 0.00011552028011938913, + "loss": 1.0335, + "step": 13037 + }, + { + "epoch": 0.46691854531129695, + "grad_norm": 1.5573476552963257, + "learning_rate": 0.0001155088215516089, + "loss": 1.0037, + "step": 13038 + }, + { + "epoch": 0.4669543574408652, + "grad_norm": 1.36250638961792, + "learning_rate": 0.00011549736277517746, + "loss": 1.196, + "step": 13039 + }, + { + "epoch": 0.4669901695704335, + "grad_norm": 1.5724226236343384, + "learning_rate": 0.00011548590379024904, + "loss": 1.1008, + "step": 13040 + }, + { + "epoch": 0.4670259817000018, + "grad_norm": 2.246070384979248, + "learning_rate": 0.00011547444459697772, + "loss": 1.1523, + "step": 13041 + }, + { + "epoch": 0.4670617938295701, + "grad_norm": 1.434496283531189, + "learning_rate": 0.00011546298519551771, + "loss": 1.124, + "step": 13042 + }, + { + "epoch": 0.46709760595913835, + "grad_norm": 1.558634638786316, + "learning_rate": 0.00011545152558602319, + "loss": 1.3087, + "step": 13043 + }, + { + "epoch": 0.4671334180887066, + "grad_norm": 1.4036660194396973, + "learning_rate": 0.00011544006576864832, + "loss": 1.1104, + "step": 13044 + }, + { + "epoch": 0.46716923021827494, + "grad_norm": 1.3529795408248901, + "learning_rate": 0.00011542860574354727, + "loss": 1.2548, + "step": 13045 + }, + { + "epoch": 0.4672050423478432, + "grad_norm": 1.5631482601165771, + "learning_rate": 0.00011541714551087423, + "loss": 1.2017, + "step": 13046 + }, + { + "epoch": 0.4672408544774115, + "grad_norm": 1.480277419090271, + "learning_rate": 0.00011540568507078342, + "loss": 1.1678, + "step": 13047 + }, + { + "epoch": 0.4672766666069798, + "grad_norm": 1.515482783317566, + "learning_rate": 0.00011539422442342895, + "loss": 1.0791, + "step": 13048 + }, + { + "epoch": 0.4673124787365481, + "grad_norm": 1.386048436164856, + "learning_rate": 0.00011538276356896507, + "loss": 1.0473, + "step": 13049 + }, + { + "epoch": 0.46734829086611634, + "grad_norm": 1.5620633363723755, + "learning_rate": 0.00011537130250754595, + "loss": 1.1933, + "step": 13050 + }, + { + "epoch": 0.4673841029956846, + "grad_norm": 1.4490035772323608, + "learning_rate": 0.00011535984123932578, + "loss": 0.8334, + "step": 13051 + }, + { + "epoch": 0.46741991512525294, + "grad_norm": 1.4469952583312988, + "learning_rate": 0.00011534837976445875, + "loss": 0.9308, + "step": 13052 + }, + { + "epoch": 0.4674557272548212, + "grad_norm": 1.8207350969314575, + "learning_rate": 0.00011533691808309905, + "loss": 1.3549, + "step": 13053 + }, + { + "epoch": 0.4674915393843895, + "grad_norm": 1.5325818061828613, + "learning_rate": 0.00011532545619540094, + "loss": 0.9304, + "step": 13054 + }, + { + "epoch": 0.4675273515139578, + "grad_norm": 1.340889811515808, + "learning_rate": 0.00011531399410151855, + "loss": 1.0804, + "step": 13055 + }, + { + "epoch": 0.46756316364352607, + "grad_norm": 1.3504128456115723, + "learning_rate": 0.00011530253180160614, + "loss": 1.143, + "step": 13056 + }, + { + "epoch": 0.46759897577309434, + "grad_norm": 1.2155507802963257, + "learning_rate": 0.00011529106929581792, + "loss": 0.792, + "step": 13057 + }, + { + "epoch": 0.4676347879026626, + "grad_norm": 1.3814351558685303, + "learning_rate": 0.00011527960658430807, + "loss": 1.0386, + "step": 13058 + }, + { + "epoch": 0.46767060003223093, + "grad_norm": 1.511411190032959, + "learning_rate": 0.00011526814366723084, + "loss": 1.1026, + "step": 13059 + }, + { + "epoch": 0.4677064121617992, + "grad_norm": 1.784761667251587, + "learning_rate": 0.00011525668054474039, + "loss": 0.9124, + "step": 13060 + }, + { + "epoch": 0.46774222429136747, + "grad_norm": 1.8599694967269897, + "learning_rate": 0.00011524521721699102, + "loss": 1.0388, + "step": 13061 + }, + { + "epoch": 0.4677780364209358, + "grad_norm": 1.6281448602676392, + "learning_rate": 0.0001152337536841369, + "loss": 1.1383, + "step": 13062 + }, + { + "epoch": 0.46781384855050406, + "grad_norm": 1.9994699954986572, + "learning_rate": 0.00011522228994633229, + "loss": 1.1771, + "step": 13063 + }, + { + "epoch": 0.46784966068007233, + "grad_norm": 1.670890212059021, + "learning_rate": 0.00011521082600373136, + "loss": 1.0998, + "step": 13064 + }, + { + "epoch": 0.4678854728096406, + "grad_norm": 1.407305359840393, + "learning_rate": 0.00011519936185648842, + "loss": 0.9614, + "step": 13065 + }, + { + "epoch": 0.4679212849392089, + "grad_norm": 1.4768052101135254, + "learning_rate": 0.0001151878975047577, + "loss": 1.0736, + "step": 13066 + }, + { + "epoch": 0.4679570970687772, + "grad_norm": 1.415632963180542, + "learning_rate": 0.00011517643294869339, + "loss": 1.0796, + "step": 13067 + }, + { + "epoch": 0.46799290919834546, + "grad_norm": 1.5101070404052734, + "learning_rate": 0.00011516496818844972, + "loss": 1.2232, + "step": 13068 + }, + { + "epoch": 0.4680287213279138, + "grad_norm": 1.445245385169983, + "learning_rate": 0.000115153503224181, + "loss": 1.2324, + "step": 13069 + }, + { + "epoch": 0.46806453345748206, + "grad_norm": 1.5152003765106201, + "learning_rate": 0.00011514203805604142, + "loss": 0.9402, + "step": 13070 + }, + { + "epoch": 0.4681003455870503, + "grad_norm": 1.5949697494506836, + "learning_rate": 0.00011513057268418526, + "loss": 1.2424, + "step": 13071 + }, + { + "epoch": 0.4681361577166186, + "grad_norm": 1.606095790863037, + "learning_rate": 0.00011511910710876677, + "loss": 1.275, + "step": 13072 + }, + { + "epoch": 0.4681719698461869, + "grad_norm": 2.0154895782470703, + "learning_rate": 0.00011510764132994016, + "loss": 1.2088, + "step": 13073 + }, + { + "epoch": 0.4682077819757552, + "grad_norm": 1.3013864755630493, + "learning_rate": 0.00011509617534785976, + "loss": 1.1589, + "step": 13074 + }, + { + "epoch": 0.46824359410532346, + "grad_norm": 1.6700361967086792, + "learning_rate": 0.00011508470916267978, + "loss": 1.1901, + "step": 13075 + }, + { + "epoch": 0.4682794062348918, + "grad_norm": 1.497582197189331, + "learning_rate": 0.00011507324277455452, + "loss": 0.9721, + "step": 13076 + }, + { + "epoch": 0.46831521836446005, + "grad_norm": 1.8265101909637451, + "learning_rate": 0.00011506177618363818, + "loss": 1.2027, + "step": 13077 + }, + { + "epoch": 0.4683510304940283, + "grad_norm": 1.5780168771743774, + "learning_rate": 0.00011505030939008508, + "loss": 1.1673, + "step": 13078 + }, + { + "epoch": 0.4683868426235966, + "grad_norm": 1.447563648223877, + "learning_rate": 0.0001150388423940495, + "loss": 0.9685, + "step": 13079 + }, + { + "epoch": 0.4684226547531649, + "grad_norm": 1.4150625467300415, + "learning_rate": 0.00011502737519568567, + "loss": 1.3522, + "step": 13080 + }, + { + "epoch": 0.4684584668827332, + "grad_norm": 1.308445692062378, + "learning_rate": 0.00011501590779514793, + "loss": 1.1127, + "step": 13081 + }, + { + "epoch": 0.46849427901230145, + "grad_norm": 2.1089770793914795, + "learning_rate": 0.00011500444019259047, + "loss": 1.1563, + "step": 13082 + }, + { + "epoch": 0.4685300911418698, + "grad_norm": 1.4753447771072388, + "learning_rate": 0.00011499297238816767, + "loss": 1.2129, + "step": 13083 + }, + { + "epoch": 0.46856590327143804, + "grad_norm": 1.3524881601333618, + "learning_rate": 0.00011498150438203373, + "loss": 1.2248, + "step": 13084 + }, + { + "epoch": 0.4686017154010063, + "grad_norm": 1.41084623336792, + "learning_rate": 0.00011497003617434301, + "loss": 1.0598, + "step": 13085 + }, + { + "epoch": 0.4686375275305746, + "grad_norm": 1.7879208326339722, + "learning_rate": 0.00011495856776524971, + "loss": 0.8876, + "step": 13086 + }, + { + "epoch": 0.4686733396601429, + "grad_norm": 1.4174429178237915, + "learning_rate": 0.00011494709915490822, + "loss": 1.1073, + "step": 13087 + }, + { + "epoch": 0.4687091517897112, + "grad_norm": 1.6471564769744873, + "learning_rate": 0.00011493563034347277, + "loss": 1.2155, + "step": 13088 + }, + { + "epoch": 0.46874496391927944, + "grad_norm": 1.477708101272583, + "learning_rate": 0.00011492416133109769, + "loss": 1.0113, + "step": 13089 + }, + { + "epoch": 0.46878077604884777, + "grad_norm": 1.2760748863220215, + "learning_rate": 0.00011491269211793725, + "loss": 1.0339, + "step": 13090 + }, + { + "epoch": 0.46881658817841604, + "grad_norm": 1.4721603393554688, + "learning_rate": 0.00011490122270414578, + "loss": 1.0684, + "step": 13091 + }, + { + "epoch": 0.4688524003079843, + "grad_norm": 1.6857784986495972, + "learning_rate": 0.0001148897530898776, + "loss": 1.0048, + "step": 13092 + }, + { + "epoch": 0.4688882124375526, + "grad_norm": 1.42212975025177, + "learning_rate": 0.000114878283275287, + "loss": 1.0647, + "step": 13093 + }, + { + "epoch": 0.4689240245671209, + "grad_norm": 1.4569406509399414, + "learning_rate": 0.00011486681326052828, + "loss": 1.2137, + "step": 13094 + }, + { + "epoch": 0.46895983669668917, + "grad_norm": 1.7208428382873535, + "learning_rate": 0.00011485534304575575, + "loss": 1.1841, + "step": 13095 + }, + { + "epoch": 0.46899564882625744, + "grad_norm": 1.346204400062561, + "learning_rate": 0.00011484387263112377, + "loss": 1.0461, + "step": 13096 + }, + { + "epoch": 0.46903146095582576, + "grad_norm": 1.7594654560089111, + "learning_rate": 0.0001148324020167866, + "loss": 0.7861, + "step": 13097 + }, + { + "epoch": 0.46906727308539403, + "grad_norm": 1.72853422164917, + "learning_rate": 0.0001148209312028986, + "loss": 1.0068, + "step": 13098 + }, + { + "epoch": 0.4691030852149623, + "grad_norm": 1.6795216798782349, + "learning_rate": 0.0001148094601896141, + "loss": 1.0694, + "step": 13099 + }, + { + "epoch": 0.46913889734453057, + "grad_norm": 2.2161102294921875, + "learning_rate": 0.00011479798897708742, + "loss": 1.5077, + "step": 13100 + }, + { + "epoch": 0.4691747094740989, + "grad_norm": 1.8130117654800415, + "learning_rate": 0.00011478651756547287, + "loss": 1.2511, + "step": 13101 + }, + { + "epoch": 0.46921052160366716, + "grad_norm": 1.584208369255066, + "learning_rate": 0.00011477504595492481, + "loss": 0.9047, + "step": 13102 + }, + { + "epoch": 0.46924633373323543, + "grad_norm": 1.2994951009750366, + "learning_rate": 0.00011476357414559757, + "loss": 1.1845, + "step": 13103 + }, + { + "epoch": 0.46928214586280376, + "grad_norm": 2.745906114578247, + "learning_rate": 0.00011475210213764547, + "loss": 1.2927, + "step": 13104 + }, + { + "epoch": 0.469317957992372, + "grad_norm": 1.4162306785583496, + "learning_rate": 0.00011474062993122288, + "loss": 1.1397, + "step": 13105 + }, + { + "epoch": 0.4693537701219403, + "grad_norm": 1.5343674421310425, + "learning_rate": 0.0001147291575264841, + "loss": 1.3064, + "step": 13106 + }, + { + "epoch": 0.46938958225150856, + "grad_norm": 1.2541799545288086, + "learning_rate": 0.00011471768492358354, + "loss": 0.9893, + "step": 13107 + }, + { + "epoch": 0.4694253943810769, + "grad_norm": 1.459567904472351, + "learning_rate": 0.00011470621212267547, + "loss": 1.04, + "step": 13108 + }, + { + "epoch": 0.46946120651064516, + "grad_norm": 1.497878909111023, + "learning_rate": 0.00011469473912391433, + "loss": 1.0984, + "step": 13109 + }, + { + "epoch": 0.4694970186402134, + "grad_norm": 1.7075334787368774, + "learning_rate": 0.0001146832659274544, + "loss": 1.2978, + "step": 13110 + }, + { + "epoch": 0.46953283076978175, + "grad_norm": 1.6028001308441162, + "learning_rate": 0.00011467179253345008, + "loss": 1.1966, + "step": 13111 + }, + { + "epoch": 0.46956864289935, + "grad_norm": 1.4871248006820679, + "learning_rate": 0.00011466031894205574, + "loss": 1.1043, + "step": 13112 + }, + { + "epoch": 0.4696044550289183, + "grad_norm": 1.5182209014892578, + "learning_rate": 0.00011464884515342568, + "loss": 1.078, + "step": 13113 + }, + { + "epoch": 0.46964026715848656, + "grad_norm": 1.3288993835449219, + "learning_rate": 0.00011463737116771434, + "loss": 1.0013, + "step": 13114 + }, + { + "epoch": 0.4696760792880549, + "grad_norm": 1.6297810077667236, + "learning_rate": 0.00011462589698507603, + "loss": 1.2384, + "step": 13115 + }, + { + "epoch": 0.46971189141762315, + "grad_norm": 1.4532698392868042, + "learning_rate": 0.00011461442260566513, + "loss": 0.9572, + "step": 13116 + }, + { + "epoch": 0.4697477035471914, + "grad_norm": 1.2834807634353638, + "learning_rate": 0.00011460294802963602, + "loss": 0.9897, + "step": 13117 + }, + { + "epoch": 0.46978351567675974, + "grad_norm": 2.3404085636138916, + "learning_rate": 0.00011459147325714312, + "loss": 1.1299, + "step": 13118 + }, + { + "epoch": 0.469819327806328, + "grad_norm": 1.2640380859375, + "learning_rate": 0.00011457999828834073, + "loss": 1.1846, + "step": 13119 + }, + { + "epoch": 0.4698551399358963, + "grad_norm": 1.848799467086792, + "learning_rate": 0.00011456852312338331, + "loss": 0.9905, + "step": 13120 + }, + { + "epoch": 0.46989095206546455, + "grad_norm": 1.5246397256851196, + "learning_rate": 0.00011455704776242517, + "loss": 1.2392, + "step": 13121 + }, + { + "epoch": 0.4699267641950329, + "grad_norm": 1.4486843347549438, + "learning_rate": 0.00011454557220562074, + "loss": 1.0169, + "step": 13122 + }, + { + "epoch": 0.46996257632460114, + "grad_norm": 1.508795976638794, + "learning_rate": 0.0001145340964531244, + "loss": 1.0813, + "step": 13123 + }, + { + "epoch": 0.4699983884541694, + "grad_norm": 1.4145293235778809, + "learning_rate": 0.00011452262050509053, + "loss": 0.9961, + "step": 13124 + }, + { + "epoch": 0.47003420058373774, + "grad_norm": 1.9010560512542725, + "learning_rate": 0.00011451114436167356, + "loss": 1.0959, + "step": 13125 + }, + { + "epoch": 0.470070012713306, + "grad_norm": 1.4643597602844238, + "learning_rate": 0.00011449966802302783, + "loss": 1.2706, + "step": 13126 + }, + { + "epoch": 0.4701058248428743, + "grad_norm": 1.9646052122116089, + "learning_rate": 0.0001144881914893078, + "loss": 0.8977, + "step": 13127 + }, + { + "epoch": 0.47014163697244254, + "grad_norm": 1.348831295967102, + "learning_rate": 0.00011447671476066781, + "loss": 1.0517, + "step": 13128 + }, + { + "epoch": 0.47017744910201087, + "grad_norm": 1.1717110872268677, + "learning_rate": 0.00011446523783726235, + "loss": 1.0466, + "step": 13129 + }, + { + "epoch": 0.47021326123157914, + "grad_norm": 1.6493659019470215, + "learning_rate": 0.00011445376071924572, + "loss": 1.1113, + "step": 13130 + }, + { + "epoch": 0.4702490733611474, + "grad_norm": 1.3358803987503052, + "learning_rate": 0.00011444228340677241, + "loss": 1.2571, + "step": 13131 + }, + { + "epoch": 0.47028488549071573, + "grad_norm": 1.8322831392288208, + "learning_rate": 0.00011443080589999677, + "loss": 1.2846, + "step": 13132 + }, + { + "epoch": 0.470320697620284, + "grad_norm": 1.7873013019561768, + "learning_rate": 0.00011441932819907328, + "loss": 1.1605, + "step": 13133 + }, + { + "epoch": 0.47035650974985227, + "grad_norm": 2.6587204933166504, + "learning_rate": 0.00011440785030415633, + "loss": 0.9467, + "step": 13134 + }, + { + "epoch": 0.47039232187942054, + "grad_norm": 1.681336522102356, + "learning_rate": 0.00011439637221540031, + "loss": 1.3349, + "step": 13135 + }, + { + "epoch": 0.47042813400898886, + "grad_norm": 2.1128151416778564, + "learning_rate": 0.00011438489393295973, + "loss": 1.2571, + "step": 13136 + }, + { + "epoch": 0.47046394613855713, + "grad_norm": 1.4800351858139038, + "learning_rate": 0.00011437341545698892, + "loss": 1.0866, + "step": 13137 + }, + { + "epoch": 0.4704997582681254, + "grad_norm": 1.4117008447647095, + "learning_rate": 0.00011436193678764236, + "loss": 1.0393, + "step": 13138 + }, + { + "epoch": 0.4705355703976937, + "grad_norm": 1.3519508838653564, + "learning_rate": 0.00011435045792507443, + "loss": 1.1312, + "step": 13139 + }, + { + "epoch": 0.470571382527262, + "grad_norm": 1.6711409091949463, + "learning_rate": 0.00011433897886943965, + "loss": 1.3706, + "step": 13140 + }, + { + "epoch": 0.47060719465683026, + "grad_norm": 1.6909233331680298, + "learning_rate": 0.00011432749962089235, + "loss": 0.9899, + "step": 13141 + }, + { + "epoch": 0.47064300678639853, + "grad_norm": 1.780092716217041, + "learning_rate": 0.00011431602017958707, + "loss": 1.0629, + "step": 13142 + }, + { + "epoch": 0.47067881891596686, + "grad_norm": 1.8257427215576172, + "learning_rate": 0.00011430454054567819, + "loss": 1.2929, + "step": 13143 + }, + { + "epoch": 0.4707146310455351, + "grad_norm": 1.5151581764221191, + "learning_rate": 0.00011429306071932018, + "loss": 0.9524, + "step": 13144 + }, + { + "epoch": 0.4707504431751034, + "grad_norm": 1.5287777185440063, + "learning_rate": 0.00011428158070066743, + "loss": 1.2039, + "step": 13145 + }, + { + "epoch": 0.4707862553046717, + "grad_norm": 1.8172742128372192, + "learning_rate": 0.00011427010048987448, + "loss": 1.3458, + "step": 13146 + }, + { + "epoch": 0.47082206743424, + "grad_norm": 1.6816797256469727, + "learning_rate": 0.00011425862008709574, + "loss": 1.0626, + "step": 13147 + }, + { + "epoch": 0.47085787956380826, + "grad_norm": 1.387195110321045, + "learning_rate": 0.00011424713949248562, + "loss": 1.0355, + "step": 13148 + }, + { + "epoch": 0.4708936916933765, + "grad_norm": 1.7677918672561646, + "learning_rate": 0.00011423565870619863, + "loss": 1.0848, + "step": 13149 + }, + { + "epoch": 0.47092950382294485, + "grad_norm": 1.3408262729644775, + "learning_rate": 0.00011422417772838923, + "loss": 1.0646, + "step": 13150 + }, + { + "epoch": 0.4709653159525131, + "grad_norm": 1.7501624822616577, + "learning_rate": 0.00011421269655921185, + "loss": 1.0353, + "step": 13151 + }, + { + "epoch": 0.4710011280820814, + "grad_norm": 1.3313394784927368, + "learning_rate": 0.00011420121519882096, + "loss": 1.124, + "step": 13152 + }, + { + "epoch": 0.4710369402116497, + "grad_norm": 1.465660810470581, + "learning_rate": 0.00011418973364737107, + "loss": 1.2059, + "step": 13153 + }, + { + "epoch": 0.471072752341218, + "grad_norm": 1.8459510803222656, + "learning_rate": 0.00011417825190501658, + "loss": 1.0004, + "step": 13154 + }, + { + "epoch": 0.47110856447078625, + "grad_norm": 1.397580862045288, + "learning_rate": 0.00011416676997191205, + "loss": 1.1669, + "step": 13155 + }, + { + "epoch": 0.4711443766003545, + "grad_norm": 1.7250826358795166, + "learning_rate": 0.00011415528784821188, + "loss": 1.213, + "step": 13156 + }, + { + "epoch": 0.47118018872992284, + "grad_norm": 1.573739767074585, + "learning_rate": 0.00011414380553407055, + "loss": 0.9744, + "step": 13157 + }, + { + "epoch": 0.4712160008594911, + "grad_norm": 1.5985220670700073, + "learning_rate": 0.00011413232302964258, + "loss": 1.2067, + "step": 13158 + }, + { + "epoch": 0.4712518129890594, + "grad_norm": 1.748016357421875, + "learning_rate": 0.00011412084033508242, + "loss": 1.1427, + "step": 13159 + }, + { + "epoch": 0.4712876251186277, + "grad_norm": 1.6714134216308594, + "learning_rate": 0.00011410935745054459, + "loss": 1.1027, + "step": 13160 + }, + { + "epoch": 0.471323437248196, + "grad_norm": 1.3990130424499512, + "learning_rate": 0.00011409787437618353, + "loss": 1.1542, + "step": 13161 + }, + { + "epoch": 0.47135924937776424, + "grad_norm": 1.5751721858978271, + "learning_rate": 0.00011408639111215378, + "loss": 1.151, + "step": 13162 + }, + { + "epoch": 0.4713950615073325, + "grad_norm": 1.506591796875, + "learning_rate": 0.00011407490765860978, + "loss": 1.0462, + "step": 13163 + }, + { + "epoch": 0.47143087363690084, + "grad_norm": 1.4010014533996582, + "learning_rate": 0.00011406342401570609, + "loss": 1.1049, + "step": 13164 + }, + { + "epoch": 0.4714666857664691, + "grad_norm": 1.5352849960327148, + "learning_rate": 0.00011405194018359715, + "loss": 1.1136, + "step": 13165 + }, + { + "epoch": 0.4715024978960374, + "grad_norm": 1.5048675537109375, + "learning_rate": 0.00011404045616243745, + "loss": 1.1806, + "step": 13166 + }, + { + "epoch": 0.4715383100256057, + "grad_norm": 1.5980900526046753, + "learning_rate": 0.00011402897195238158, + "loss": 1.412, + "step": 13167 + }, + { + "epoch": 0.47157412215517397, + "grad_norm": 1.7793889045715332, + "learning_rate": 0.00011401748755358395, + "loss": 1.0673, + "step": 13168 + }, + { + "epoch": 0.47160993428474224, + "grad_norm": 1.6104648113250732, + "learning_rate": 0.00011400600296619912, + "loss": 1.2573, + "step": 13169 + }, + { + "epoch": 0.4716457464143105, + "grad_norm": 1.5893477201461792, + "learning_rate": 0.00011399451819038159, + "loss": 1.1043, + "step": 13170 + }, + { + "epoch": 0.47168155854387883, + "grad_norm": 1.4452580213546753, + "learning_rate": 0.00011398303322628585, + "loss": 1.1423, + "step": 13171 + }, + { + "epoch": 0.4717173706734471, + "grad_norm": 1.275611400604248, + "learning_rate": 0.00011397154807406645, + "loss": 0.9665, + "step": 13172 + }, + { + "epoch": 0.47175318280301537, + "grad_norm": 1.5914170742034912, + "learning_rate": 0.00011396006273387792, + "loss": 0.9926, + "step": 13173 + }, + { + "epoch": 0.47178899493258364, + "grad_norm": 1.7459657192230225, + "learning_rate": 0.0001139485772058747, + "loss": 1.287, + "step": 13174 + }, + { + "epoch": 0.47182480706215196, + "grad_norm": 1.8073300123214722, + "learning_rate": 0.0001139370914902114, + "loss": 1.2466, + "step": 13175 + }, + { + "epoch": 0.47186061919172023, + "grad_norm": 1.7775795459747314, + "learning_rate": 0.00011392560558704249, + "loss": 1.1071, + "step": 13176 + }, + { + "epoch": 0.4718964313212885, + "grad_norm": 1.4809566736221313, + "learning_rate": 0.00011391411949652253, + "loss": 1.1408, + "step": 13177 + }, + { + "epoch": 0.4719322434508568, + "grad_norm": 1.8120696544647217, + "learning_rate": 0.00011390263321880605, + "loss": 1.0641, + "step": 13178 + }, + { + "epoch": 0.4719680555804251, + "grad_norm": 1.797071933746338, + "learning_rate": 0.00011389114675404755, + "loss": 1.2282, + "step": 13179 + }, + { + "epoch": 0.47200386770999336, + "grad_norm": 1.6525871753692627, + "learning_rate": 0.00011387966010240161, + "loss": 1.2082, + "step": 13180 + }, + { + "epoch": 0.47203967983956163, + "grad_norm": 2.8168704509735107, + "learning_rate": 0.00011386817326402273, + "loss": 1.2265, + "step": 13181 + }, + { + "epoch": 0.47207549196912996, + "grad_norm": 1.6814237833023071, + "learning_rate": 0.00011385668623906551, + "loss": 1.1218, + "step": 13182 + }, + { + "epoch": 0.4721113040986982, + "grad_norm": 1.5657607316970825, + "learning_rate": 0.00011384519902768441, + "loss": 1.161, + "step": 13183 + }, + { + "epoch": 0.4721471162282665, + "grad_norm": 1.3563648462295532, + "learning_rate": 0.00011383371163003403, + "loss": 0.9198, + "step": 13184 + }, + { + "epoch": 0.4721829283578348, + "grad_norm": 2.4804775714874268, + "learning_rate": 0.00011382222404626888, + "loss": 1.097, + "step": 13185 + }, + { + "epoch": 0.4722187404874031, + "grad_norm": 1.6743710041046143, + "learning_rate": 0.00011381073627654357, + "loss": 1.219, + "step": 13186 + }, + { + "epoch": 0.47225455261697136, + "grad_norm": 1.268153429031372, + "learning_rate": 0.00011379924832101258, + "loss": 1.2048, + "step": 13187 + }, + { + "epoch": 0.4722903647465396, + "grad_norm": 1.453240990638733, + "learning_rate": 0.00011378776017983053, + "loss": 1.2211, + "step": 13188 + }, + { + "epoch": 0.47232617687610795, + "grad_norm": 1.4463844299316406, + "learning_rate": 0.00011377627185315194, + "loss": 1.0933, + "step": 13189 + }, + { + "epoch": 0.4723619890056762, + "grad_norm": 1.428719162940979, + "learning_rate": 0.00011376478334113139, + "loss": 1.019, + "step": 13190 + }, + { + "epoch": 0.4723978011352445, + "grad_norm": 1.6830168962478638, + "learning_rate": 0.00011375329464392343, + "loss": 1.2212, + "step": 13191 + }, + { + "epoch": 0.4724336132648128, + "grad_norm": 1.5284507274627686, + "learning_rate": 0.00011374180576168263, + "loss": 1.0, + "step": 13192 + }, + { + "epoch": 0.4724694253943811, + "grad_norm": 1.6400150060653687, + "learning_rate": 0.00011373031669456358, + "loss": 1.023, + "step": 13193 + }, + { + "epoch": 0.47250523752394935, + "grad_norm": 1.1968716382980347, + "learning_rate": 0.0001137188274427208, + "loss": 1.0897, + "step": 13194 + }, + { + "epoch": 0.4725410496535176, + "grad_norm": 1.6990662813186646, + "learning_rate": 0.00011370733800630892, + "loss": 1.2017, + "step": 13195 + }, + { + "epoch": 0.47257686178308594, + "grad_norm": 1.2250382900238037, + "learning_rate": 0.00011369584838548246, + "loss": 0.943, + "step": 13196 + }, + { + "epoch": 0.4726126739126542, + "grad_norm": 1.2994049787521362, + "learning_rate": 0.00011368435858039605, + "loss": 0.9731, + "step": 13197 + }, + { + "epoch": 0.4726484860422225, + "grad_norm": 1.3781278133392334, + "learning_rate": 0.00011367286859120423, + "loss": 1.0193, + "step": 13198 + }, + { + "epoch": 0.4726842981717908, + "grad_norm": 1.3713338375091553, + "learning_rate": 0.00011366137841806161, + "loss": 1.1764, + "step": 13199 + }, + { + "epoch": 0.4727201103013591, + "grad_norm": 2.1737077236175537, + "learning_rate": 0.00011364988806112278, + "loss": 1.2006, + "step": 13200 + }, + { + "epoch": 0.47275592243092734, + "grad_norm": 1.576979160308838, + "learning_rate": 0.00011363839752054228, + "loss": 1.2636, + "step": 13201 + }, + { + "epoch": 0.4727917345604956, + "grad_norm": 1.4682579040527344, + "learning_rate": 0.00011362690679647477, + "loss": 1.28, + "step": 13202 + }, + { + "epoch": 0.47282754669006394, + "grad_norm": 1.425987720489502, + "learning_rate": 0.00011361541588907477, + "loss": 1.1294, + "step": 13203 + }, + { + "epoch": 0.4728633588196322, + "grad_norm": 1.383626937866211, + "learning_rate": 0.00011360392479849693, + "loss": 0.9778, + "step": 13204 + }, + { + "epoch": 0.4728991709492005, + "grad_norm": 1.47225821018219, + "learning_rate": 0.00011359243352489581, + "loss": 1.2947, + "step": 13205 + }, + { + "epoch": 0.4729349830787688, + "grad_norm": 1.3016465902328491, + "learning_rate": 0.00011358094206842607, + "loss": 1.0871, + "step": 13206 + }, + { + "epoch": 0.47297079520833707, + "grad_norm": 1.7463663816452026, + "learning_rate": 0.00011356945042924223, + "loss": 1.2171, + "step": 13207 + }, + { + "epoch": 0.47300660733790534, + "grad_norm": 1.7142484188079834, + "learning_rate": 0.00011355795860749899, + "loss": 1.106, + "step": 13208 + }, + { + "epoch": 0.4730424194674736, + "grad_norm": 1.3530813455581665, + "learning_rate": 0.00011354646660335086, + "loss": 1.1042, + "step": 13209 + }, + { + "epoch": 0.47307823159704193, + "grad_norm": 2.172330856323242, + "learning_rate": 0.00011353497441695251, + "loss": 1.0448, + "step": 13210 + }, + { + "epoch": 0.4731140437266102, + "grad_norm": 1.8205782175064087, + "learning_rate": 0.00011352348204845853, + "loss": 0.9927, + "step": 13211 + }, + { + "epoch": 0.47314985585617847, + "grad_norm": 1.3855170011520386, + "learning_rate": 0.00011351198949802355, + "loss": 0.8001, + "step": 13212 + }, + { + "epoch": 0.4731856679857468, + "grad_norm": 2.1836347579956055, + "learning_rate": 0.0001135004967658022, + "loss": 1.2637, + "step": 13213 + }, + { + "epoch": 0.47322148011531506, + "grad_norm": 1.835919976234436, + "learning_rate": 0.00011348900385194903, + "loss": 1.2176, + "step": 13214 + }, + { + "epoch": 0.47325729224488333, + "grad_norm": 1.473170280456543, + "learning_rate": 0.00011347751075661876, + "loss": 1.1062, + "step": 13215 + }, + { + "epoch": 0.4732931043744516, + "grad_norm": 1.898541808128357, + "learning_rate": 0.00011346601747996595, + "loss": 1.1748, + "step": 13216 + }, + { + "epoch": 0.4733289165040199, + "grad_norm": 1.5008395910263062, + "learning_rate": 0.00011345452402214527, + "loss": 1.1596, + "step": 13217 + }, + { + "epoch": 0.4733647286335882, + "grad_norm": 1.6906217336654663, + "learning_rate": 0.0001134430303833113, + "loss": 1.2158, + "step": 13218 + }, + { + "epoch": 0.47340054076315646, + "grad_norm": 1.4230514764785767, + "learning_rate": 0.00011343153656361867, + "loss": 1.0381, + "step": 13219 + }, + { + "epoch": 0.4734363528927248, + "grad_norm": 1.6835334300994873, + "learning_rate": 0.00011342004256322208, + "loss": 1.1084, + "step": 13220 + }, + { + "epoch": 0.47347216502229306, + "grad_norm": 1.4248347282409668, + "learning_rate": 0.00011340854838227611, + "loss": 1.2787, + "step": 13221 + }, + { + "epoch": 0.4735079771518613, + "grad_norm": 1.5454216003417969, + "learning_rate": 0.00011339705402093543, + "loss": 1.364, + "step": 13222 + }, + { + "epoch": 0.4735437892814296, + "grad_norm": 1.698466420173645, + "learning_rate": 0.00011338555947935465, + "loss": 1.1243, + "step": 13223 + }, + { + "epoch": 0.4735796014109979, + "grad_norm": 1.6555436849594116, + "learning_rate": 0.00011337406475768846, + "loss": 1.3429, + "step": 13224 + }, + { + "epoch": 0.4736154135405662, + "grad_norm": 1.447961688041687, + "learning_rate": 0.00011336256985609144, + "loss": 1.2121, + "step": 13225 + }, + { + "epoch": 0.47365122567013446, + "grad_norm": 1.2412384748458862, + "learning_rate": 0.00011335107477471834, + "loss": 1.0327, + "step": 13226 + }, + { + "epoch": 0.4736870377997028, + "grad_norm": 1.8188989162445068, + "learning_rate": 0.00011333957951372372, + "loss": 0.9406, + "step": 13227 + }, + { + "epoch": 0.47372284992927105, + "grad_norm": 1.5691769123077393, + "learning_rate": 0.00011332808407326225, + "loss": 1.0137, + "step": 13228 + }, + { + "epoch": 0.4737586620588393, + "grad_norm": 1.2612189054489136, + "learning_rate": 0.0001133165884534886, + "loss": 1.312, + "step": 13229 + }, + { + "epoch": 0.4737944741884076, + "grad_norm": 1.4939329624176025, + "learning_rate": 0.00011330509265455745, + "loss": 1.1703, + "step": 13230 + }, + { + "epoch": 0.4738302863179759, + "grad_norm": 1.6485625505447388, + "learning_rate": 0.00011329359667662342, + "loss": 0.9537, + "step": 13231 + }, + { + "epoch": 0.4738660984475442, + "grad_norm": 1.564612627029419, + "learning_rate": 0.00011328210051984118, + "loss": 1.2548, + "step": 13232 + }, + { + "epoch": 0.47390191057711245, + "grad_norm": 1.3637051582336426, + "learning_rate": 0.00011327060418436545, + "loss": 1.2011, + "step": 13233 + }, + { + "epoch": 0.4739377227066808, + "grad_norm": 1.3996367454528809, + "learning_rate": 0.00011325910767035086, + "loss": 0.9915, + "step": 13234 + }, + { + "epoch": 0.47397353483624904, + "grad_norm": 1.9566293954849243, + "learning_rate": 0.00011324761097795206, + "loss": 1.2839, + "step": 13235 + }, + { + "epoch": 0.4740093469658173, + "grad_norm": 1.5586789846420288, + "learning_rate": 0.00011323611410732375, + "loss": 1.1509, + "step": 13236 + }, + { + "epoch": 0.4740451590953856, + "grad_norm": 1.5373153686523438, + "learning_rate": 0.0001132246170586206, + "loss": 1.1522, + "step": 13237 + }, + { + "epoch": 0.4740809712249539, + "grad_norm": 1.4659110307693481, + "learning_rate": 0.00011321311983199727, + "loss": 1.2495, + "step": 13238 + }, + { + "epoch": 0.4741167833545222, + "grad_norm": 1.6281468868255615, + "learning_rate": 0.00011320162242760848, + "loss": 1.1969, + "step": 13239 + }, + { + "epoch": 0.47415259548409044, + "grad_norm": 1.6179927587509155, + "learning_rate": 0.00011319012484560885, + "loss": 1.204, + "step": 13240 + }, + { + "epoch": 0.47418840761365877, + "grad_norm": 1.5544389486312866, + "learning_rate": 0.00011317862708615314, + "loss": 1.1664, + "step": 13241 + }, + { + "epoch": 0.47422421974322704, + "grad_norm": 1.700594186782837, + "learning_rate": 0.00011316712914939598, + "loss": 1.2164, + "step": 13242 + }, + { + "epoch": 0.4742600318727953, + "grad_norm": 1.3233191967010498, + "learning_rate": 0.00011315563103549211, + "loss": 0.858, + "step": 13243 + }, + { + "epoch": 0.4742958440023636, + "grad_norm": 1.6957342624664307, + "learning_rate": 0.00011314413274459618, + "loss": 1.0394, + "step": 13244 + }, + { + "epoch": 0.4743316561319319, + "grad_norm": 2.0930676460266113, + "learning_rate": 0.0001131326342768629, + "loss": 1.2122, + "step": 13245 + }, + { + "epoch": 0.47436746826150017, + "grad_norm": 1.542482614517212, + "learning_rate": 0.00011312113563244695, + "loss": 1.2567, + "step": 13246 + }, + { + "epoch": 0.47440328039106844, + "grad_norm": 2.433912992477417, + "learning_rate": 0.00011310963681150304, + "loss": 1.2927, + "step": 13247 + }, + { + "epoch": 0.47443909252063676, + "grad_norm": 1.4719980955123901, + "learning_rate": 0.0001130981378141859, + "loss": 1.3613, + "step": 13248 + }, + { + "epoch": 0.47447490465020503, + "grad_norm": 1.3247307538986206, + "learning_rate": 0.0001130866386406502, + "loss": 1.0471, + "step": 13249 + }, + { + "epoch": 0.4745107167797733, + "grad_norm": 1.3975105285644531, + "learning_rate": 0.00011307513929105067, + "loss": 1.0693, + "step": 13250 + }, + { + "epoch": 0.47454652890934157, + "grad_norm": 1.7878659963607788, + "learning_rate": 0.000113063639765542, + "loss": 1.117, + "step": 13251 + }, + { + "epoch": 0.4745823410389099, + "grad_norm": 1.416111946105957, + "learning_rate": 0.00011305214006427892, + "loss": 1.0461, + "step": 13252 + }, + { + "epoch": 0.47461815316847816, + "grad_norm": 1.5923138856887817, + "learning_rate": 0.00011304064018741612, + "loss": 1.1195, + "step": 13253 + }, + { + "epoch": 0.47465396529804643, + "grad_norm": 1.464121699333191, + "learning_rate": 0.0001130291401351083, + "loss": 1.1903, + "step": 13254 + }, + { + "epoch": 0.47468977742761476, + "grad_norm": 1.3929232358932495, + "learning_rate": 0.00011301763990751025, + "loss": 1.1842, + "step": 13255 + }, + { + "epoch": 0.474725589557183, + "grad_norm": 1.5744590759277344, + "learning_rate": 0.00011300613950477661, + "loss": 1.3164, + "step": 13256 + }, + { + "epoch": 0.4747614016867513, + "grad_norm": 1.5659172534942627, + "learning_rate": 0.00011299463892706217, + "loss": 1.2958, + "step": 13257 + }, + { + "epoch": 0.47479721381631956, + "grad_norm": 1.4969274997711182, + "learning_rate": 0.0001129831381745216, + "loss": 1.0463, + "step": 13258 + }, + { + "epoch": 0.4748330259458879, + "grad_norm": 1.4695470333099365, + "learning_rate": 0.00011297163724730968, + "loss": 1.0971, + "step": 13259 + }, + { + "epoch": 0.47486883807545616, + "grad_norm": 1.3337295055389404, + "learning_rate": 0.00011296013614558107, + "loss": 0.8386, + "step": 13260 + }, + { + "epoch": 0.4749046502050244, + "grad_norm": 1.4586668014526367, + "learning_rate": 0.00011294863486949059, + "loss": 0.8805, + "step": 13261 + }, + { + "epoch": 0.47494046233459275, + "grad_norm": 1.7930049896240234, + "learning_rate": 0.00011293713341919292, + "loss": 1.1518, + "step": 13262 + }, + { + "epoch": 0.474976274464161, + "grad_norm": 1.892699956893921, + "learning_rate": 0.0001129256317948428, + "loss": 1.2121, + "step": 13263 + }, + { + "epoch": 0.4750120865937293, + "grad_norm": 1.6002060174942017, + "learning_rate": 0.00011291412999659499, + "loss": 1.196, + "step": 13264 + }, + { + "epoch": 0.47504789872329756, + "grad_norm": 1.6040444374084473, + "learning_rate": 0.00011290262802460419, + "loss": 0.9912, + "step": 13265 + }, + { + "epoch": 0.4750837108528659, + "grad_norm": 1.594225525856018, + "learning_rate": 0.0001128911258790252, + "loss": 1.1534, + "step": 13266 + }, + { + "epoch": 0.47511952298243415, + "grad_norm": 1.4237980842590332, + "learning_rate": 0.00011287962356001272, + "loss": 1.1467, + "step": 13267 + }, + { + "epoch": 0.4751553351120024, + "grad_norm": 1.4427103996276855, + "learning_rate": 0.00011286812106772153, + "loss": 1.1144, + "step": 13268 + }, + { + "epoch": 0.47519114724157074, + "grad_norm": 1.441011667251587, + "learning_rate": 0.00011285661840230636, + "loss": 0.9627, + "step": 13269 + }, + { + "epoch": 0.475226959371139, + "grad_norm": 1.5911169052124023, + "learning_rate": 0.000112845115563922, + "loss": 1.2551, + "step": 13270 + }, + { + "epoch": 0.4752627715007073, + "grad_norm": 1.4997774362564087, + "learning_rate": 0.00011283361255272315, + "loss": 1.044, + "step": 13271 + }, + { + "epoch": 0.47529858363027555, + "grad_norm": 1.5268415212631226, + "learning_rate": 0.00011282210936886463, + "loss": 1.1181, + "step": 13272 + }, + { + "epoch": 0.4753343957598439, + "grad_norm": 1.444248914718628, + "learning_rate": 0.00011281060601250113, + "loss": 0.8589, + "step": 13273 + }, + { + "epoch": 0.47537020788941214, + "grad_norm": 1.747397780418396, + "learning_rate": 0.00011279910248378746, + "loss": 1.2129, + "step": 13274 + }, + { + "epoch": 0.4754060200189804, + "grad_norm": 1.3930426836013794, + "learning_rate": 0.00011278759878287839, + "loss": 1.2931, + "step": 13275 + }, + { + "epoch": 0.47544183214854874, + "grad_norm": 1.581139087677002, + "learning_rate": 0.00011277609490992866, + "loss": 1.1896, + "step": 13276 + }, + { + "epoch": 0.475477644278117, + "grad_norm": 1.5072437524795532, + "learning_rate": 0.00011276459086509305, + "loss": 1.0955, + "step": 13277 + }, + { + "epoch": 0.4755134564076853, + "grad_norm": 1.6542000770568848, + "learning_rate": 0.00011275308664852635, + "loss": 1.1701, + "step": 13278 + }, + { + "epoch": 0.47554926853725354, + "grad_norm": 1.5862518548965454, + "learning_rate": 0.00011274158226038334, + "loss": 1.1787, + "step": 13279 + }, + { + "epoch": 0.47558508066682187, + "grad_norm": 1.9717316627502441, + "learning_rate": 0.00011273007770081873, + "loss": 1.2044, + "step": 13280 + }, + { + "epoch": 0.47562089279639014, + "grad_norm": 1.5090327262878418, + "learning_rate": 0.00011271857296998737, + "loss": 1.0244, + "step": 13281 + }, + { + "epoch": 0.4756567049259584, + "grad_norm": 1.531905174255371, + "learning_rate": 0.000112707068068044, + "loss": 1.2819, + "step": 13282 + }, + { + "epoch": 0.47569251705552673, + "grad_norm": 1.5229064226150513, + "learning_rate": 0.00011269556299514346, + "loss": 1.0339, + "step": 13283 + }, + { + "epoch": 0.475728329185095, + "grad_norm": 1.5408467054367065, + "learning_rate": 0.00011268405775144044, + "loss": 1.0765, + "step": 13284 + }, + { + "epoch": 0.47576414131466327, + "grad_norm": 1.5429962873458862, + "learning_rate": 0.00011267255233708982, + "loss": 1.0758, + "step": 13285 + }, + { + "epoch": 0.47579995344423154, + "grad_norm": 1.5943351984024048, + "learning_rate": 0.00011266104675224633, + "loss": 1.0905, + "step": 13286 + }, + { + "epoch": 0.47583576557379986, + "grad_norm": 1.9126296043395996, + "learning_rate": 0.00011264954099706481, + "loss": 1.1765, + "step": 13287 + }, + { + "epoch": 0.47587157770336813, + "grad_norm": 1.6893173456192017, + "learning_rate": 0.00011263803507170005, + "loss": 1.0387, + "step": 13288 + }, + { + "epoch": 0.4759073898329364, + "grad_norm": 1.7411541938781738, + "learning_rate": 0.00011262652897630678, + "loss": 1.1368, + "step": 13289 + }, + { + "epoch": 0.4759432019625047, + "grad_norm": 1.531998872756958, + "learning_rate": 0.0001126150227110399, + "loss": 1.0302, + "step": 13290 + }, + { + "epoch": 0.475979014092073, + "grad_norm": 1.4740115404129028, + "learning_rate": 0.00011260351627605413, + "loss": 0.9353, + "step": 13291 + }, + { + "epoch": 0.47601482622164126, + "grad_norm": 1.4992389678955078, + "learning_rate": 0.00011259200967150432, + "loss": 1.0333, + "step": 13292 + }, + { + "epoch": 0.47605063835120953, + "grad_norm": 1.7253696918487549, + "learning_rate": 0.00011258050289754524, + "loss": 1.3193, + "step": 13293 + }, + { + "epoch": 0.47608645048077786, + "grad_norm": 1.786910891532898, + "learning_rate": 0.00011256899595433175, + "loss": 1.1798, + "step": 13294 + }, + { + "epoch": 0.4761222626103461, + "grad_norm": 1.976694941520691, + "learning_rate": 0.0001125574888420186, + "loss": 1.1128, + "step": 13295 + }, + { + "epoch": 0.4761580747399144, + "grad_norm": 1.6314566135406494, + "learning_rate": 0.00011254598156076066, + "loss": 1.2684, + "step": 13296 + }, + { + "epoch": 0.4761938868694827, + "grad_norm": 1.7294286489486694, + "learning_rate": 0.00011253447411071274, + "loss": 1.2984, + "step": 13297 + }, + { + "epoch": 0.476229698999051, + "grad_norm": 1.3166548013687134, + "learning_rate": 0.00011252296649202957, + "loss": 1.0405, + "step": 13298 + }, + { + "epoch": 0.47626551112861926, + "grad_norm": 1.7676652669906616, + "learning_rate": 0.00011251145870486612, + "loss": 1.0154, + "step": 13299 + }, + { + "epoch": 0.4763013232581875, + "grad_norm": 1.4194016456604004, + "learning_rate": 0.00011249995074937708, + "loss": 1.2118, + "step": 13300 + }, + { + "epoch": 0.47633713538775585, + "grad_norm": 1.4153170585632324, + "learning_rate": 0.00011248844262571737, + "loss": 1.0301, + "step": 13301 + }, + { + "epoch": 0.4763729475173241, + "grad_norm": 1.2630383968353271, + "learning_rate": 0.00011247693433404172, + "loss": 0.7973, + "step": 13302 + }, + { + "epoch": 0.4764087596468924, + "grad_norm": 1.464370608329773, + "learning_rate": 0.00011246542587450504, + "loss": 1.2104, + "step": 13303 + }, + { + "epoch": 0.4764445717764607, + "grad_norm": 1.3724743127822876, + "learning_rate": 0.00011245391724726213, + "loss": 1.1235, + "step": 13304 + }, + { + "epoch": 0.476480383906029, + "grad_norm": 2.693183183670044, + "learning_rate": 0.00011244240845246783, + "loss": 1.2352, + "step": 13305 + }, + { + "epoch": 0.47651619603559725, + "grad_norm": 1.8039917945861816, + "learning_rate": 0.00011243089949027699, + "loss": 1.2054, + "step": 13306 + }, + { + "epoch": 0.4765520081651655, + "grad_norm": 1.8824361562728882, + "learning_rate": 0.0001124193903608444, + "loss": 1.1206, + "step": 13307 + }, + { + "epoch": 0.47658782029473384, + "grad_norm": 1.7749863862991333, + "learning_rate": 0.00011240788106432496, + "loss": 1.2814, + "step": 13308 + }, + { + "epoch": 0.4766236324243021, + "grad_norm": 1.9892925024032593, + "learning_rate": 0.00011239637160087346, + "loss": 1.0299, + "step": 13309 + }, + { + "epoch": 0.4766594445538704, + "grad_norm": 1.5921730995178223, + "learning_rate": 0.00011238486197064479, + "loss": 1.03, + "step": 13310 + }, + { + "epoch": 0.4766952566834387, + "grad_norm": 1.3648608922958374, + "learning_rate": 0.00011237335217379377, + "loss": 1.1145, + "step": 13311 + }, + { + "epoch": 0.476731068813007, + "grad_norm": 1.8401013612747192, + "learning_rate": 0.00011236184221047526, + "loss": 1.3331, + "step": 13312 + }, + { + "epoch": 0.47676688094257524, + "grad_norm": 1.354565978050232, + "learning_rate": 0.00011235033208084411, + "loss": 1.1754, + "step": 13313 + }, + { + "epoch": 0.4768026930721435, + "grad_norm": 1.4627219438552856, + "learning_rate": 0.00011233882178505519, + "loss": 1.1062, + "step": 13314 + }, + { + "epoch": 0.47683850520171184, + "grad_norm": 1.747267723083496, + "learning_rate": 0.00011232731132326331, + "loss": 1.1709, + "step": 13315 + }, + { + "epoch": 0.4768743173312801, + "grad_norm": 1.4019356966018677, + "learning_rate": 0.00011231580069562335, + "loss": 0.9828, + "step": 13316 + }, + { + "epoch": 0.4769101294608484, + "grad_norm": 2.014709949493408, + "learning_rate": 0.0001123042899022902, + "loss": 1.2297, + "step": 13317 + }, + { + "epoch": 0.4769459415904167, + "grad_norm": 1.6941988468170166, + "learning_rate": 0.00011229277894341869, + "loss": 1.2458, + "step": 13318 + }, + { + "epoch": 0.47698175371998497, + "grad_norm": 1.555740475654602, + "learning_rate": 0.0001122812678191637, + "loss": 1.2305, + "step": 13319 + }, + { + "epoch": 0.47701756584955324, + "grad_norm": 1.8459281921386719, + "learning_rate": 0.00011226975652968011, + "loss": 1.2243, + "step": 13320 + }, + { + "epoch": 0.4770533779791215, + "grad_norm": 2.1005523204803467, + "learning_rate": 0.00011225824507512275, + "loss": 1.0598, + "step": 13321 + }, + { + "epoch": 0.47708919010868983, + "grad_norm": 1.5528249740600586, + "learning_rate": 0.00011224673345564651, + "loss": 1.2459, + "step": 13322 + }, + { + "epoch": 0.4771250022382581, + "grad_norm": 1.4385509490966797, + "learning_rate": 0.0001122352216714063, + "loss": 1.1921, + "step": 13323 + }, + { + "epoch": 0.47716081436782637, + "grad_norm": 1.769011378288269, + "learning_rate": 0.00011222370972255694, + "loss": 1.1349, + "step": 13324 + }, + { + "epoch": 0.4771966264973947, + "grad_norm": 1.4096726179122925, + "learning_rate": 0.00011221219760925334, + "loss": 1.0752, + "step": 13325 + }, + { + "epoch": 0.47723243862696296, + "grad_norm": 1.847143292427063, + "learning_rate": 0.00011220068533165036, + "loss": 1.1723, + "step": 13326 + }, + { + "epoch": 0.47726825075653123, + "grad_norm": 1.6316720247268677, + "learning_rate": 0.00011218917288990292, + "loss": 1.0958, + "step": 13327 + }, + { + "epoch": 0.4773040628860995, + "grad_norm": 1.7619483470916748, + "learning_rate": 0.00011217766028416585, + "loss": 1.2083, + "step": 13328 + }, + { + "epoch": 0.4773398750156678, + "grad_norm": 1.8005644083023071, + "learning_rate": 0.00011216614751459408, + "loss": 1.316, + "step": 13329 + }, + { + "epoch": 0.4773756871452361, + "grad_norm": 1.5118699073791504, + "learning_rate": 0.00011215463458134252, + "loss": 1.2281, + "step": 13330 + }, + { + "epoch": 0.47741149927480436, + "grad_norm": 1.6903514862060547, + "learning_rate": 0.000112143121484566, + "loss": 1.2359, + "step": 13331 + }, + { + "epoch": 0.4774473114043727, + "grad_norm": 1.2139511108398438, + "learning_rate": 0.00011213160822441948, + "loss": 1.2247, + "step": 13332 + }, + { + "epoch": 0.47748312353394096, + "grad_norm": 1.571465253829956, + "learning_rate": 0.00011212009480105777, + "loss": 1.1811, + "step": 13333 + }, + { + "epoch": 0.4775189356635092, + "grad_norm": 1.7870073318481445, + "learning_rate": 0.00011210858121463586, + "loss": 1.006, + "step": 13334 + }, + { + "epoch": 0.4775547477930775, + "grad_norm": 1.6256396770477295, + "learning_rate": 0.00011209706746530858, + "loss": 1.1762, + "step": 13335 + }, + { + "epoch": 0.4775905599226458, + "grad_norm": 1.5142242908477783, + "learning_rate": 0.00011208555355323088, + "loss": 1.105, + "step": 13336 + }, + { + "epoch": 0.4776263720522141, + "grad_norm": 1.2706642150878906, + "learning_rate": 0.00011207403947855761, + "loss": 0.9789, + "step": 13337 + }, + { + "epoch": 0.47766218418178236, + "grad_norm": 1.6766189336776733, + "learning_rate": 0.00011206252524144373, + "loss": 1.0924, + "step": 13338 + }, + { + "epoch": 0.4776979963113507, + "grad_norm": 1.2671113014221191, + "learning_rate": 0.00011205101084204414, + "loss": 1.1386, + "step": 13339 + }, + { + "epoch": 0.47773380844091895, + "grad_norm": 1.6820855140686035, + "learning_rate": 0.00011203949628051376, + "loss": 1.1236, + "step": 13340 + }, + { + "epoch": 0.4777696205704872, + "grad_norm": 1.478029727935791, + "learning_rate": 0.00011202798155700748, + "loss": 1.1914, + "step": 13341 + }, + { + "epoch": 0.4778054327000555, + "grad_norm": 1.7106707096099854, + "learning_rate": 0.0001120164666716802, + "loss": 1.0732, + "step": 13342 + }, + { + "epoch": 0.4778412448296238, + "grad_norm": 1.5674809217453003, + "learning_rate": 0.0001120049516246869, + "loss": 1.1372, + "step": 13343 + }, + { + "epoch": 0.4778770569591921, + "grad_norm": 1.4241355657577515, + "learning_rate": 0.0001119934364161824, + "loss": 1.0693, + "step": 13344 + }, + { + "epoch": 0.47791286908876035, + "grad_norm": 1.398541808128357, + "learning_rate": 0.00011198192104632174, + "loss": 1.0172, + "step": 13345 + }, + { + "epoch": 0.4779486812183287, + "grad_norm": 1.727357029914856, + "learning_rate": 0.00011197040551525977, + "loss": 1.0946, + "step": 13346 + }, + { + "epoch": 0.47798449334789694, + "grad_norm": 1.4165022373199463, + "learning_rate": 0.00011195888982315144, + "loss": 1.0951, + "step": 13347 + }, + { + "epoch": 0.4780203054774652, + "grad_norm": 1.916843056678772, + "learning_rate": 0.00011194737397015164, + "loss": 1.2584, + "step": 13348 + }, + { + "epoch": 0.4780561176070335, + "grad_norm": 1.6048541069030762, + "learning_rate": 0.00011193585795641539, + "loss": 1.0105, + "step": 13349 + }, + { + "epoch": 0.4780919297366018, + "grad_norm": 1.7340539693832397, + "learning_rate": 0.00011192434178209755, + "loss": 1.1387, + "step": 13350 + }, + { + "epoch": 0.4781277418661701, + "grad_norm": 1.3482859134674072, + "learning_rate": 0.00011191282544735304, + "loss": 1.2674, + "step": 13351 + }, + { + "epoch": 0.47816355399573834, + "grad_norm": 1.5692615509033203, + "learning_rate": 0.00011190130895233686, + "loss": 1.1763, + "step": 13352 + }, + { + "epoch": 0.47819936612530667, + "grad_norm": 1.2984085083007812, + "learning_rate": 0.00011188979229720389, + "loss": 1.1544, + "step": 13353 + }, + { + "epoch": 0.47823517825487494, + "grad_norm": 1.4715293645858765, + "learning_rate": 0.00011187827548210915, + "loss": 1.1825, + "step": 13354 + }, + { + "epoch": 0.4782709903844432, + "grad_norm": 3.9700679779052734, + "learning_rate": 0.00011186675850720749, + "loss": 1.0615, + "step": 13355 + }, + { + "epoch": 0.4783068025140115, + "grad_norm": 1.3262308835983276, + "learning_rate": 0.00011185524137265393, + "loss": 1.281, + "step": 13356 + }, + { + "epoch": 0.4783426146435798, + "grad_norm": 2.119497537612915, + "learning_rate": 0.00011184372407860336, + "loss": 1.4318, + "step": 13357 + }, + { + "epoch": 0.47837842677314807, + "grad_norm": 1.2910892963409424, + "learning_rate": 0.00011183220662521079, + "loss": 1.2604, + "step": 13358 + }, + { + "epoch": 0.47841423890271634, + "grad_norm": 1.3199374675750732, + "learning_rate": 0.00011182068901263114, + "loss": 1.0855, + "step": 13359 + }, + { + "epoch": 0.47845005103228466, + "grad_norm": 1.9800810813903809, + "learning_rate": 0.00011180917124101936, + "loss": 1.3192, + "step": 13360 + }, + { + "epoch": 0.47848586316185293, + "grad_norm": 1.3459769487380981, + "learning_rate": 0.0001117976533105304, + "loss": 1.1585, + "step": 13361 + }, + { + "epoch": 0.4785216752914212, + "grad_norm": 2.036860704421997, + "learning_rate": 0.00011178613522131924, + "loss": 1.0147, + "step": 13362 + }, + { + "epoch": 0.47855748742098947, + "grad_norm": 1.7726057767868042, + "learning_rate": 0.00011177461697354084, + "loss": 1.2403, + "step": 13363 + }, + { + "epoch": 0.4785932995505578, + "grad_norm": 1.463556170463562, + "learning_rate": 0.00011176309856735014, + "loss": 1.0351, + "step": 13364 + }, + { + "epoch": 0.47862911168012606, + "grad_norm": 1.5336294174194336, + "learning_rate": 0.00011175158000290216, + "loss": 1.1719, + "step": 13365 + }, + { + "epoch": 0.47866492380969433, + "grad_norm": 1.488568663597107, + "learning_rate": 0.00011174006128035178, + "loss": 1.2007, + "step": 13366 + }, + { + "epoch": 0.47870073593926266, + "grad_norm": 1.3855433464050293, + "learning_rate": 0.00011172854239985409, + "loss": 1.0761, + "step": 13367 + }, + { + "epoch": 0.4787365480688309, + "grad_norm": 1.452728271484375, + "learning_rate": 0.0001117170233615639, + "loss": 1.0453, + "step": 13368 + }, + { + "epoch": 0.4787723601983992, + "grad_norm": 1.4849971532821655, + "learning_rate": 0.00011170550416563634, + "loss": 1.0565, + "step": 13369 + }, + { + "epoch": 0.47880817232796746, + "grad_norm": 1.650675654411316, + "learning_rate": 0.0001116939848122263, + "loss": 1.1879, + "step": 13370 + }, + { + "epoch": 0.4788439844575358, + "grad_norm": 1.9150779247283936, + "learning_rate": 0.00011168246530148876, + "loss": 1.2328, + "step": 13371 + }, + { + "epoch": 0.47887979658710406, + "grad_norm": 2.6559290885925293, + "learning_rate": 0.00011167094563357876, + "loss": 1.3037, + "step": 13372 + }, + { + "epoch": 0.4789156087166723, + "grad_norm": 1.9172574281692505, + "learning_rate": 0.00011165942580865118, + "loss": 1.1027, + "step": 13373 + }, + { + "epoch": 0.4789514208462406, + "grad_norm": 1.3469539880752563, + "learning_rate": 0.00011164790582686113, + "loss": 1.0602, + "step": 13374 + }, + { + "epoch": 0.4789872329758089, + "grad_norm": 1.5371955633163452, + "learning_rate": 0.0001116363856883635, + "loss": 1.0811, + "step": 13375 + }, + { + "epoch": 0.4790230451053772, + "grad_norm": 1.8341792821884155, + "learning_rate": 0.00011162486539331334, + "loss": 1.1296, + "step": 13376 + }, + { + "epoch": 0.47905885723494546, + "grad_norm": 1.6308156251907349, + "learning_rate": 0.00011161334494186557, + "loss": 1.2397, + "step": 13377 + }, + { + "epoch": 0.4790946693645138, + "grad_norm": 1.8313729763031006, + "learning_rate": 0.00011160182433417525, + "loss": 1.1511, + "step": 13378 + }, + { + "epoch": 0.47913048149408205, + "grad_norm": 1.5219786167144775, + "learning_rate": 0.00011159030357039733, + "loss": 1.0281, + "step": 13379 + }, + { + "epoch": 0.4791662936236503, + "grad_norm": 1.3452204465866089, + "learning_rate": 0.00011157878265068685, + "loss": 1.1092, + "step": 13380 + }, + { + "epoch": 0.4792021057532186, + "grad_norm": 1.9030005931854248, + "learning_rate": 0.00011156726157519877, + "loss": 1.2665, + "step": 13381 + }, + { + "epoch": 0.4792379178827869, + "grad_norm": 1.5922068357467651, + "learning_rate": 0.00011155574034408812, + "loss": 1.1237, + "step": 13382 + }, + { + "epoch": 0.4792737300123552, + "grad_norm": 1.6517162322998047, + "learning_rate": 0.00011154421895750984, + "loss": 1.1471, + "step": 13383 + }, + { + "epoch": 0.47930954214192345, + "grad_norm": 1.6683616638183594, + "learning_rate": 0.00011153269741561905, + "loss": 1.0645, + "step": 13384 + }, + { + "epoch": 0.4793453542714918, + "grad_norm": 1.4788893461227417, + "learning_rate": 0.00011152117571857065, + "loss": 0.8479, + "step": 13385 + }, + { + "epoch": 0.47938116640106004, + "grad_norm": 1.5334571599960327, + "learning_rate": 0.0001115096538665197, + "loss": 1.0968, + "step": 13386 + }, + { + "epoch": 0.4794169785306283, + "grad_norm": 1.7347112894058228, + "learning_rate": 0.00011149813185962124, + "loss": 1.0328, + "step": 13387 + }, + { + "epoch": 0.4794527906601966, + "grad_norm": 1.4822018146514893, + "learning_rate": 0.00011148660969803019, + "loss": 1.5445, + "step": 13388 + }, + { + "epoch": 0.4794886027897649, + "grad_norm": 1.299217939376831, + "learning_rate": 0.00011147508738190167, + "loss": 1.1613, + "step": 13389 + }, + { + "epoch": 0.4795244149193332, + "grad_norm": 1.494076132774353, + "learning_rate": 0.0001114635649113906, + "loss": 1.1318, + "step": 13390 + }, + { + "epoch": 0.47956022704890144, + "grad_norm": 1.6734445095062256, + "learning_rate": 0.00011145204228665209, + "loss": 0.9817, + "step": 13391 + }, + { + "epoch": 0.47959603917846977, + "grad_norm": 1.418591856956482, + "learning_rate": 0.00011144051950784111, + "loss": 1.0529, + "step": 13392 + }, + { + "epoch": 0.47963185130803804, + "grad_norm": 1.4990030527114868, + "learning_rate": 0.00011142899657511272, + "loss": 1.1877, + "step": 13393 + }, + { + "epoch": 0.4796676634376063, + "grad_norm": 1.4920377731323242, + "learning_rate": 0.00011141747348862191, + "loss": 1.1736, + "step": 13394 + }, + { + "epoch": 0.4797034755671746, + "grad_norm": 1.446656346321106, + "learning_rate": 0.00011140595024852369, + "loss": 1.1981, + "step": 13395 + }, + { + "epoch": 0.4797392876967429, + "grad_norm": 1.521122694015503, + "learning_rate": 0.00011139442685497317, + "loss": 1.3347, + "step": 13396 + }, + { + "epoch": 0.47977509982631117, + "grad_norm": 1.8120923042297363, + "learning_rate": 0.0001113829033081253, + "loss": 1.0355, + "step": 13397 + }, + { + "epoch": 0.47981091195587944, + "grad_norm": 1.2067068815231323, + "learning_rate": 0.00011137137960813517, + "loss": 1.0047, + "step": 13398 + }, + { + "epoch": 0.47984672408544776, + "grad_norm": 1.4034299850463867, + "learning_rate": 0.00011135985575515778, + "loss": 1.194, + "step": 13399 + }, + { + "epoch": 0.47988253621501603, + "grad_norm": 1.2919951677322388, + "learning_rate": 0.0001113483317493482, + "loss": 1.1307, + "step": 13400 + }, + { + "epoch": 0.4799183483445843, + "grad_norm": 1.4513535499572754, + "learning_rate": 0.00011133680759086145, + "loss": 1.3192, + "step": 13401 + }, + { + "epoch": 0.47995416047415257, + "grad_norm": 1.422686219215393, + "learning_rate": 0.00011132528327985256, + "loss": 1.3059, + "step": 13402 + }, + { + "epoch": 0.4799899726037209, + "grad_norm": 1.2010931968688965, + "learning_rate": 0.00011131375881647664, + "loss": 0.8969, + "step": 13403 + }, + { + "epoch": 0.48002578473328916, + "grad_norm": 1.3424878120422363, + "learning_rate": 0.00011130223420088864, + "loss": 0.9886, + "step": 13404 + }, + { + "epoch": 0.48006159686285743, + "grad_norm": 1.3302596807479858, + "learning_rate": 0.00011129070943324366, + "loss": 1.0575, + "step": 13405 + }, + { + "epoch": 0.48009740899242576, + "grad_norm": 1.3142274618148804, + "learning_rate": 0.00011127918451369676, + "loss": 0.9823, + "step": 13406 + }, + { + "epoch": 0.480133221121994, + "grad_norm": 1.4335311651229858, + "learning_rate": 0.00011126765944240298, + "loss": 1.1267, + "step": 13407 + }, + { + "epoch": 0.4801690332515623, + "grad_norm": 1.4862662553787231, + "learning_rate": 0.00011125613421951737, + "loss": 1.0662, + "step": 13408 + }, + { + "epoch": 0.48020484538113056, + "grad_norm": 1.1656019687652588, + "learning_rate": 0.00011124460884519503, + "loss": 1.0353, + "step": 13409 + }, + { + "epoch": 0.4802406575106989, + "grad_norm": 1.3259000778198242, + "learning_rate": 0.00011123308331959093, + "loss": 1.1345, + "step": 13410 + }, + { + "epoch": 0.48027646964026716, + "grad_norm": 1.296142339706421, + "learning_rate": 0.00011122155764286024, + "loss": 1.0016, + "step": 13411 + }, + { + "epoch": 0.4803122817698354, + "grad_norm": 1.631473422050476, + "learning_rate": 0.00011121003181515792, + "loss": 1.0892, + "step": 13412 + }, + { + "epoch": 0.48034809389940375, + "grad_norm": 2.1840009689331055, + "learning_rate": 0.00011119850583663908, + "loss": 1.2281, + "step": 13413 + }, + { + "epoch": 0.480383906028972, + "grad_norm": 2.3816256523132324, + "learning_rate": 0.00011118697970745881, + "loss": 1.2656, + "step": 13414 + }, + { + "epoch": 0.4804197181585403, + "grad_norm": 1.377570629119873, + "learning_rate": 0.00011117545342777215, + "loss": 1.067, + "step": 13415 + }, + { + "epoch": 0.48045553028810856, + "grad_norm": 1.5153108835220337, + "learning_rate": 0.0001111639269977342, + "loss": 1.2028, + "step": 13416 + }, + { + "epoch": 0.4804913424176769, + "grad_norm": 1.2187294960021973, + "learning_rate": 0.00011115240041749999, + "loss": 1.119, + "step": 13417 + }, + { + "epoch": 0.48052715454724515, + "grad_norm": 1.9057780504226685, + "learning_rate": 0.00011114087368722463, + "loss": 0.9783, + "step": 13418 + }, + { + "epoch": 0.4805629666768134, + "grad_norm": 1.8609756231307983, + "learning_rate": 0.00011112934680706317, + "loss": 1.2037, + "step": 13419 + }, + { + "epoch": 0.48059877880638174, + "grad_norm": 1.4491907358169556, + "learning_rate": 0.00011111781977717075, + "loss": 0.9749, + "step": 13420 + }, + { + "epoch": 0.48063459093595, + "grad_norm": 1.5604594945907593, + "learning_rate": 0.00011110629259770235, + "loss": 1.2885, + "step": 13421 + }, + { + "epoch": 0.4806704030655183, + "grad_norm": 1.4888091087341309, + "learning_rate": 0.00011109476526881313, + "loss": 1.1005, + "step": 13422 + }, + { + "epoch": 0.48070621519508655, + "grad_norm": 1.844168782234192, + "learning_rate": 0.00011108323779065814, + "loss": 1.1954, + "step": 13423 + }, + { + "epoch": 0.4807420273246549, + "grad_norm": 1.4329942464828491, + "learning_rate": 0.00011107171016339251, + "loss": 1.1967, + "step": 13424 + }, + { + "epoch": 0.48077783945422314, + "grad_norm": 1.493247151374817, + "learning_rate": 0.00011106018238717128, + "loss": 1.0508, + "step": 13425 + }, + { + "epoch": 0.4808136515837914, + "grad_norm": 1.4442464113235474, + "learning_rate": 0.00011104865446214957, + "loss": 1.0535, + "step": 13426 + }, + { + "epoch": 0.48084946371335974, + "grad_norm": 1.3069480657577515, + "learning_rate": 0.00011103712638848244, + "loss": 1.1319, + "step": 13427 + }, + { + "epoch": 0.480885275842928, + "grad_norm": 1.8055490255355835, + "learning_rate": 0.00011102559816632507, + "loss": 1.326, + "step": 13428 + }, + { + "epoch": 0.4809210879724963, + "grad_norm": 1.4109548330307007, + "learning_rate": 0.00011101406979583246, + "loss": 0.9924, + "step": 13429 + }, + { + "epoch": 0.48095690010206454, + "grad_norm": 1.9183076620101929, + "learning_rate": 0.00011100254127715975, + "loss": 1.0261, + "step": 13430 + }, + { + "epoch": 0.48099271223163287, + "grad_norm": 1.4959547519683838, + "learning_rate": 0.00011099101261046205, + "loss": 1.1089, + "step": 13431 + }, + { + "epoch": 0.48102852436120114, + "grad_norm": 1.4701334238052368, + "learning_rate": 0.00011097948379589444, + "loss": 1.1006, + "step": 13432 + }, + { + "epoch": 0.4810643364907694, + "grad_norm": 1.4918694496154785, + "learning_rate": 0.00011096795483361205, + "loss": 1.1137, + "step": 13433 + }, + { + "epoch": 0.48110014862033773, + "grad_norm": 1.3022524118423462, + "learning_rate": 0.00011095642572376996, + "loss": 1.0539, + "step": 13434 + }, + { + "epoch": 0.481135960749906, + "grad_norm": 1.3946529626846313, + "learning_rate": 0.0001109448964665233, + "loss": 1.0351, + "step": 13435 + }, + { + "epoch": 0.48117177287947427, + "grad_norm": 1.4722555875778198, + "learning_rate": 0.00011093336706202717, + "loss": 1.3201, + "step": 13436 + }, + { + "epoch": 0.48120758500904254, + "grad_norm": 1.3908361196517944, + "learning_rate": 0.00011092183751043672, + "loss": 1.2246, + "step": 13437 + }, + { + "epoch": 0.48124339713861086, + "grad_norm": 1.3624634742736816, + "learning_rate": 0.000110910307811907, + "loss": 0.9561, + "step": 13438 + }, + { + "epoch": 0.48127920926817913, + "grad_norm": 1.5555710792541504, + "learning_rate": 0.00011089877796659319, + "loss": 1.1115, + "step": 13439 + }, + { + "epoch": 0.4813150213977474, + "grad_norm": 1.5120277404785156, + "learning_rate": 0.00011088724797465036, + "loss": 1.1674, + "step": 13440 + }, + { + "epoch": 0.4813508335273157, + "grad_norm": 1.980964183807373, + "learning_rate": 0.00011087571783623365, + "loss": 1.377, + "step": 13441 + }, + { + "epoch": 0.481386645656884, + "grad_norm": 1.546349287033081, + "learning_rate": 0.0001108641875514982, + "loss": 1.0126, + "step": 13442 + }, + { + "epoch": 0.48142245778645226, + "grad_norm": 2.1496264934539795, + "learning_rate": 0.00011085265712059909, + "loss": 1.3099, + "step": 13443 + }, + { + "epoch": 0.48145826991602053, + "grad_norm": 1.5759695768356323, + "learning_rate": 0.00011084112654369152, + "loss": 0.9919, + "step": 13444 + }, + { + "epoch": 0.48149408204558886, + "grad_norm": 1.3351014852523804, + "learning_rate": 0.00011082959582093055, + "loss": 1.0418, + "step": 13445 + }, + { + "epoch": 0.4815298941751571, + "grad_norm": 1.4423069953918457, + "learning_rate": 0.00011081806495247136, + "loss": 1.2413, + "step": 13446 + }, + { + "epoch": 0.4815657063047254, + "grad_norm": 1.717212438583374, + "learning_rate": 0.00011080653393846905, + "loss": 1.1826, + "step": 13447 + }, + { + "epoch": 0.4816015184342937, + "grad_norm": 2.171905517578125, + "learning_rate": 0.00011079500277907875, + "loss": 1.0139, + "step": 13448 + }, + { + "epoch": 0.481637330563862, + "grad_norm": 1.4971842765808105, + "learning_rate": 0.00011078347147445563, + "loss": 1.2303, + "step": 13449 + }, + { + "epoch": 0.48167314269343026, + "grad_norm": 1.6386743783950806, + "learning_rate": 0.0001107719400247548, + "loss": 1.0022, + "step": 13450 + }, + { + "epoch": 0.4817089548229985, + "grad_norm": 1.7289009094238281, + "learning_rate": 0.00011076040843013141, + "loss": 0.9385, + "step": 13451 + }, + { + "epoch": 0.48174476695256685, + "grad_norm": 1.7023710012435913, + "learning_rate": 0.00011074887669074058, + "loss": 1.0257, + "step": 13452 + }, + { + "epoch": 0.4817805790821351, + "grad_norm": 1.3476769924163818, + "learning_rate": 0.00011073734480673754, + "loss": 1.0988, + "step": 13453 + }, + { + "epoch": 0.4818163912117034, + "grad_norm": 1.5797438621520996, + "learning_rate": 0.00011072581277827732, + "loss": 1.1535, + "step": 13454 + }, + { + "epoch": 0.4818522033412717, + "grad_norm": 1.8280997276306152, + "learning_rate": 0.00011071428060551517, + "loss": 1.3071, + "step": 13455 + }, + { + "epoch": 0.48188801547084, + "grad_norm": 1.3282864093780518, + "learning_rate": 0.00011070274828860618, + "loss": 1.0255, + "step": 13456 + }, + { + "epoch": 0.48192382760040825, + "grad_norm": 1.3676060438156128, + "learning_rate": 0.0001106912158277055, + "loss": 1.0563, + "step": 13457 + }, + { + "epoch": 0.4819596397299765, + "grad_norm": 1.656654953956604, + "learning_rate": 0.00011067968322296831, + "loss": 1.2599, + "step": 13458 + }, + { + "epoch": 0.48199545185954484, + "grad_norm": 1.533772349357605, + "learning_rate": 0.00011066815047454975, + "loss": 1.1464, + "step": 13459 + }, + { + "epoch": 0.4820312639891131, + "grad_norm": 1.4413704872131348, + "learning_rate": 0.00011065661758260502, + "loss": 1.119, + "step": 13460 + }, + { + "epoch": 0.4820670761186814, + "grad_norm": 1.4658193588256836, + "learning_rate": 0.00011064508454728921, + "loss": 1.0167, + "step": 13461 + }, + { + "epoch": 0.4821028882482497, + "grad_norm": 1.9024556875228882, + "learning_rate": 0.00011063355136875753, + "loss": 1.2281, + "step": 13462 + }, + { + "epoch": 0.482138700377818, + "grad_norm": 1.9936647415161133, + "learning_rate": 0.00011062201804716512, + "loss": 1.2828, + "step": 13463 + }, + { + "epoch": 0.48217451250738624, + "grad_norm": 1.6744608879089355, + "learning_rate": 0.0001106104845826672, + "loss": 0.9449, + "step": 13464 + }, + { + "epoch": 0.4822103246369545, + "grad_norm": 1.5455191135406494, + "learning_rate": 0.00011059895097541888, + "loss": 1.3559, + "step": 13465 + }, + { + "epoch": 0.48224613676652284, + "grad_norm": 1.805795669555664, + "learning_rate": 0.00011058741722557533, + "loss": 1.0709, + "step": 13466 + }, + { + "epoch": 0.4822819488960911, + "grad_norm": 1.7361233234405518, + "learning_rate": 0.00011057588333329174, + "loss": 1.2202, + "step": 13467 + }, + { + "epoch": 0.4823177610256594, + "grad_norm": 1.837822675704956, + "learning_rate": 0.00011056434929872325, + "loss": 1.3368, + "step": 13468 + }, + { + "epoch": 0.4823535731552277, + "grad_norm": 1.4370440244674683, + "learning_rate": 0.00011055281512202513, + "loss": 1.1704, + "step": 13469 + }, + { + "epoch": 0.48238938528479597, + "grad_norm": 1.892727255821228, + "learning_rate": 0.00011054128080335246, + "loss": 1.2064, + "step": 13470 + }, + { + "epoch": 0.48242519741436424, + "grad_norm": 1.8708908557891846, + "learning_rate": 0.00011052974634286046, + "loss": 1.3126, + "step": 13471 + }, + { + "epoch": 0.4824610095439325, + "grad_norm": 1.4989069700241089, + "learning_rate": 0.00011051821174070429, + "loss": 1.3109, + "step": 13472 + }, + { + "epoch": 0.48249682167350083, + "grad_norm": 1.253719449043274, + "learning_rate": 0.0001105066769970392, + "loss": 1.1934, + "step": 13473 + }, + { + "epoch": 0.4825326338030691, + "grad_norm": 1.7423723936080933, + "learning_rate": 0.00011049514211202028, + "loss": 1.4499, + "step": 13474 + }, + { + "epoch": 0.48256844593263737, + "grad_norm": 1.6558876037597656, + "learning_rate": 0.00011048360708580279, + "loss": 1.2231, + "step": 13475 + }, + { + "epoch": 0.4826042580622057, + "grad_norm": 1.8655120134353638, + "learning_rate": 0.00011047207191854185, + "loss": 1.1954, + "step": 13476 + }, + { + "epoch": 0.48264007019177396, + "grad_norm": 1.3506633043289185, + "learning_rate": 0.00011046053661039273, + "loss": 1.158, + "step": 13477 + }, + { + "epoch": 0.48267588232134223, + "grad_norm": 1.5025112628936768, + "learning_rate": 0.00011044900116151053, + "loss": 1.1554, + "step": 13478 + }, + { + "epoch": 0.4827116944509105, + "grad_norm": 1.441322684288025, + "learning_rate": 0.00011043746557205054, + "loss": 1.2225, + "step": 13479 + }, + { + "epoch": 0.4827475065804788, + "grad_norm": 1.6892601251602173, + "learning_rate": 0.0001104259298421679, + "loss": 1.2409, + "step": 13480 + }, + { + "epoch": 0.4827833187100471, + "grad_norm": 1.4608616828918457, + "learning_rate": 0.00011041439397201785, + "loss": 1.2773, + "step": 13481 + }, + { + "epoch": 0.48281913083961536, + "grad_norm": 1.553572654724121, + "learning_rate": 0.00011040285796175553, + "loss": 0.9207, + "step": 13482 + }, + { + "epoch": 0.4828549429691837, + "grad_norm": 1.295596718788147, + "learning_rate": 0.00011039132181153618, + "loss": 1.1044, + "step": 13483 + }, + { + "epoch": 0.48289075509875196, + "grad_norm": 1.550323247909546, + "learning_rate": 0.00011037978552151502, + "loss": 1.0403, + "step": 13484 + }, + { + "epoch": 0.4829265672283202, + "grad_norm": 2.06510066986084, + "learning_rate": 0.0001103682490918472, + "loss": 1.3761, + "step": 13485 + }, + { + "epoch": 0.4829623793578885, + "grad_norm": 1.7233411073684692, + "learning_rate": 0.000110356712522688, + "loss": 1.1924, + "step": 13486 + }, + { + "epoch": 0.4829981914874568, + "grad_norm": 1.6466723680496216, + "learning_rate": 0.00011034517581419255, + "loss": 1.2942, + "step": 13487 + }, + { + "epoch": 0.4830340036170251, + "grad_norm": 1.5437216758728027, + "learning_rate": 0.00011033363896651613, + "loss": 1.0063, + "step": 13488 + }, + { + "epoch": 0.48306981574659336, + "grad_norm": 1.4454154968261719, + "learning_rate": 0.00011032210197981392, + "loss": 0.9491, + "step": 13489 + }, + { + "epoch": 0.4831056278761617, + "grad_norm": 1.3720722198486328, + "learning_rate": 0.00011031056485424116, + "loss": 1.0652, + "step": 13490 + }, + { + "epoch": 0.48314144000572995, + "grad_norm": 1.401532530784607, + "learning_rate": 0.00011029902758995304, + "loss": 1.0619, + "step": 13491 + }, + { + "epoch": 0.4831772521352982, + "grad_norm": 1.3647633790969849, + "learning_rate": 0.00011028749018710478, + "loss": 1.069, + "step": 13492 + }, + { + "epoch": 0.4832130642648665, + "grad_norm": 1.4075431823730469, + "learning_rate": 0.00011027595264585162, + "loss": 0.9072, + "step": 13493 + }, + { + "epoch": 0.4832488763944348, + "grad_norm": 1.9266712665557861, + "learning_rate": 0.00011026441496634874, + "loss": 1.1476, + "step": 13494 + }, + { + "epoch": 0.4832846885240031, + "grad_norm": 1.231419324874878, + "learning_rate": 0.00011025287714875143, + "loss": 1.0415, + "step": 13495 + }, + { + "epoch": 0.48332050065357135, + "grad_norm": 1.6093156337738037, + "learning_rate": 0.00011024133919321486, + "loss": 1.22, + "step": 13496 + }, + { + "epoch": 0.4833563127831397, + "grad_norm": 1.5191514492034912, + "learning_rate": 0.00011022980109989431, + "loss": 1.1785, + "step": 13497 + }, + { + "epoch": 0.48339212491270794, + "grad_norm": 1.6625779867172241, + "learning_rate": 0.00011021826286894496, + "loss": 1.2317, + "step": 13498 + }, + { + "epoch": 0.4834279370422762, + "grad_norm": 1.4718612432479858, + "learning_rate": 0.0001102067245005221, + "loss": 1.0831, + "step": 13499 + }, + { + "epoch": 0.4834637491718445, + "grad_norm": 1.6687569618225098, + "learning_rate": 0.0001101951859947809, + "loss": 1.3335, + "step": 13500 + }, + { + "epoch": 0.4834995613014128, + "grad_norm": 1.7661077976226807, + "learning_rate": 0.00011018364735187661, + "loss": 1.0364, + "step": 13501 + }, + { + "epoch": 0.4835353734309811, + "grad_norm": 1.406534194946289, + "learning_rate": 0.00011017210857196449, + "loss": 1.249, + "step": 13502 + }, + { + "epoch": 0.48357118556054934, + "grad_norm": 1.920100212097168, + "learning_rate": 0.00011016056965519979, + "loss": 1.2702, + "step": 13503 + }, + { + "epoch": 0.48360699769011767, + "grad_norm": 1.3170056343078613, + "learning_rate": 0.00011014903060173772, + "loss": 1.2199, + "step": 13504 + }, + { + "epoch": 0.48364280981968594, + "grad_norm": 1.3242385387420654, + "learning_rate": 0.00011013749141173351, + "loss": 1.1058, + "step": 13505 + }, + { + "epoch": 0.4836786219492542, + "grad_norm": 1.719358205795288, + "learning_rate": 0.00011012595208534247, + "loss": 1.4073, + "step": 13506 + }, + { + "epoch": 0.4837144340788225, + "grad_norm": 1.4371877908706665, + "learning_rate": 0.00011011441262271975, + "loss": 1.0636, + "step": 13507 + }, + { + "epoch": 0.4837502462083908, + "grad_norm": 1.5140326023101807, + "learning_rate": 0.00011010287302402073, + "loss": 0.9129, + "step": 13508 + }, + { + "epoch": 0.48378605833795907, + "grad_norm": 1.3420625925064087, + "learning_rate": 0.00011009133328940053, + "loss": 1.2117, + "step": 13509 + }, + { + "epoch": 0.48382187046752734, + "grad_norm": 1.7382162809371948, + "learning_rate": 0.00011007979341901446, + "loss": 1.2488, + "step": 13510 + }, + { + "epoch": 0.48385768259709566, + "grad_norm": 1.3646080493927002, + "learning_rate": 0.0001100682534130178, + "loss": 1.1218, + "step": 13511 + }, + { + "epoch": 0.48389349472666393, + "grad_norm": 1.637351155281067, + "learning_rate": 0.00011005671327156574, + "loss": 1.0971, + "step": 13512 + }, + { + "epoch": 0.4839293068562322, + "grad_norm": 1.4559919834136963, + "learning_rate": 0.00011004517299481363, + "loss": 1.0449, + "step": 13513 + }, + { + "epoch": 0.48396511898580047, + "grad_norm": 1.5911407470703125, + "learning_rate": 0.00011003363258291664, + "loss": 1.2439, + "step": 13514 + }, + { + "epoch": 0.4840009311153688, + "grad_norm": 1.9142132997512817, + "learning_rate": 0.00011002209203603007, + "loss": 1.0473, + "step": 13515 + }, + { + "epoch": 0.48403674324493706, + "grad_norm": 1.5649126768112183, + "learning_rate": 0.00011001055135430916, + "loss": 1.0007, + "step": 13516 + }, + { + "epoch": 0.48407255537450533, + "grad_norm": 1.246883511543274, + "learning_rate": 0.00010999901053790924, + "loss": 1.127, + "step": 13517 + }, + { + "epoch": 0.48410836750407366, + "grad_norm": 2.1026463508605957, + "learning_rate": 0.0001099874695869855, + "loss": 1.0953, + "step": 13518 + }, + { + "epoch": 0.4841441796336419, + "grad_norm": 1.7997781038284302, + "learning_rate": 0.00010997592850169325, + "loss": 1.2636, + "step": 13519 + }, + { + "epoch": 0.4841799917632102, + "grad_norm": 1.3178631067276, + "learning_rate": 0.00010996438728218772, + "loss": 1.0312, + "step": 13520 + }, + { + "epoch": 0.48421580389277846, + "grad_norm": 1.3864692449569702, + "learning_rate": 0.00010995284592862425, + "loss": 1.3066, + "step": 13521 + }, + { + "epoch": 0.4842516160223468, + "grad_norm": 2.1044559478759766, + "learning_rate": 0.00010994130444115804, + "loss": 1.1993, + "step": 13522 + }, + { + "epoch": 0.48428742815191506, + "grad_norm": 1.6085728406906128, + "learning_rate": 0.00010992976281994443, + "loss": 1.029, + "step": 13523 + }, + { + "epoch": 0.4843232402814833, + "grad_norm": 1.9298253059387207, + "learning_rate": 0.00010991822106513867, + "loss": 1.0665, + "step": 13524 + }, + { + "epoch": 0.48435905241105165, + "grad_norm": 1.766041874885559, + "learning_rate": 0.00010990667917689603, + "loss": 1.2284, + "step": 13525 + }, + { + "epoch": 0.4843948645406199, + "grad_norm": 1.5864484310150146, + "learning_rate": 0.00010989513715537184, + "loss": 1.2481, + "step": 13526 + }, + { + "epoch": 0.4844306766701882, + "grad_norm": 1.5396007299423218, + "learning_rate": 0.00010988359500072128, + "loss": 1.0059, + "step": 13527 + }, + { + "epoch": 0.48446648879975646, + "grad_norm": 1.6521434783935547, + "learning_rate": 0.00010987205271309972, + "loss": 1.2131, + "step": 13528 + }, + { + "epoch": 0.4845023009293248, + "grad_norm": 1.651788592338562, + "learning_rate": 0.00010986051029266242, + "loss": 1.2327, + "step": 13529 + }, + { + "epoch": 0.48453811305889305, + "grad_norm": 2.0011723041534424, + "learning_rate": 0.0001098489677395647, + "loss": 1.4035, + "step": 13530 + }, + { + "epoch": 0.4845739251884613, + "grad_norm": 1.7925331592559814, + "learning_rate": 0.0001098374250539618, + "loss": 1.034, + "step": 13531 + }, + { + "epoch": 0.48460973731802964, + "grad_norm": 1.6934716701507568, + "learning_rate": 0.00010982588223600905, + "loss": 0.9533, + "step": 13532 + }, + { + "epoch": 0.4846455494475979, + "grad_norm": 1.4759881496429443, + "learning_rate": 0.00010981433928586168, + "loss": 1.2533, + "step": 13533 + }, + { + "epoch": 0.4846813615771662, + "grad_norm": 1.516919732093811, + "learning_rate": 0.00010980279620367511, + "loss": 1.1349, + "step": 13534 + }, + { + "epoch": 0.48471717370673445, + "grad_norm": 1.2256269454956055, + "learning_rate": 0.00010979125298960453, + "loss": 1.1453, + "step": 13535 + }, + { + "epoch": 0.4847529858363028, + "grad_norm": 1.6631349325180054, + "learning_rate": 0.00010977970964380526, + "loss": 1.195, + "step": 13536 + }, + { + "epoch": 0.48478879796587104, + "grad_norm": 1.641527771949768, + "learning_rate": 0.00010976816616643262, + "loss": 1.0921, + "step": 13537 + }, + { + "epoch": 0.4848246100954393, + "grad_norm": 1.6185647249221802, + "learning_rate": 0.0001097566225576419, + "loss": 1.1829, + "step": 13538 + }, + { + "epoch": 0.48486042222500764, + "grad_norm": 1.3002452850341797, + "learning_rate": 0.00010974507881758842, + "loss": 1.233, + "step": 13539 + }, + { + "epoch": 0.4848962343545759, + "grad_norm": 1.8615164756774902, + "learning_rate": 0.00010973353494642745, + "loss": 1.1946, + "step": 13540 + }, + { + "epoch": 0.4849320464841442, + "grad_norm": 1.655421257019043, + "learning_rate": 0.00010972199094431435, + "loss": 1.2979, + "step": 13541 + }, + { + "epoch": 0.48496785861371244, + "grad_norm": 1.4540029764175415, + "learning_rate": 0.00010971044681140437, + "loss": 1.2803, + "step": 13542 + }, + { + "epoch": 0.48500367074328077, + "grad_norm": 1.258224368095398, + "learning_rate": 0.0001096989025478529, + "loss": 1.1856, + "step": 13543 + }, + { + "epoch": 0.48503948287284904, + "grad_norm": 1.7053686380386353, + "learning_rate": 0.0001096873581538152, + "loss": 1.348, + "step": 13544 + }, + { + "epoch": 0.4850752950024173, + "grad_norm": 1.5888937711715698, + "learning_rate": 0.00010967581362944654, + "loss": 1.2493, + "step": 13545 + }, + { + "epoch": 0.48511110713198563, + "grad_norm": 1.2039732933044434, + "learning_rate": 0.00010966426897490234, + "loss": 1.1605, + "step": 13546 + }, + { + "epoch": 0.4851469192615539, + "grad_norm": 1.4466387033462524, + "learning_rate": 0.00010965272419033782, + "loss": 1.0952, + "step": 13547 + }, + { + "epoch": 0.48518273139112217, + "grad_norm": 1.4169111251831055, + "learning_rate": 0.0001096411792759084, + "loss": 0.9436, + "step": 13548 + }, + { + "epoch": 0.48521854352069044, + "grad_norm": 1.399491548538208, + "learning_rate": 0.0001096296342317693, + "loss": 1.0156, + "step": 13549 + }, + { + "epoch": 0.48525435565025876, + "grad_norm": 1.5092034339904785, + "learning_rate": 0.00010961808905807593, + "loss": 1.05, + "step": 13550 + }, + { + "epoch": 0.48529016777982703, + "grad_norm": 1.7407118082046509, + "learning_rate": 0.00010960654375498357, + "loss": 1.2804, + "step": 13551 + }, + { + "epoch": 0.4853259799093953, + "grad_norm": 1.4531168937683105, + "learning_rate": 0.00010959499832264754, + "loss": 1.1821, + "step": 13552 + }, + { + "epoch": 0.4853617920389636, + "grad_norm": 1.5959516763687134, + "learning_rate": 0.00010958345276122322, + "loss": 0.8853, + "step": 13553 + }, + { + "epoch": 0.4853976041685319, + "grad_norm": 1.5151937007904053, + "learning_rate": 0.00010957190707086586, + "loss": 0.9631, + "step": 13554 + }, + { + "epoch": 0.48543341629810016, + "grad_norm": 1.3685595989227295, + "learning_rate": 0.00010956036125173088, + "loss": 1.2216, + "step": 13555 + }, + { + "epoch": 0.48546922842766843, + "grad_norm": 1.8619862794876099, + "learning_rate": 0.00010954881530397352, + "loss": 1.1656, + "step": 13556 + }, + { + "epoch": 0.48550504055723676, + "grad_norm": 1.4332129955291748, + "learning_rate": 0.0001095372692277492, + "loss": 1.0452, + "step": 13557 + }, + { + "epoch": 0.485540852686805, + "grad_norm": 1.5847800970077515, + "learning_rate": 0.00010952572302321322, + "loss": 1.0888, + "step": 13558 + }, + { + "epoch": 0.4855766648163733, + "grad_norm": 2.256859540939331, + "learning_rate": 0.00010951417669052093, + "loss": 1.2354, + "step": 13559 + }, + { + "epoch": 0.4856124769459416, + "grad_norm": 1.4158436059951782, + "learning_rate": 0.00010950263022982766, + "loss": 1.1908, + "step": 13560 + }, + { + "epoch": 0.4856482890755099, + "grad_norm": 2.1029551029205322, + "learning_rate": 0.0001094910836412888, + "loss": 1.2025, + "step": 13561 + }, + { + "epoch": 0.48568410120507816, + "grad_norm": 1.5242172479629517, + "learning_rate": 0.00010947953692505959, + "loss": 1.3488, + "step": 13562 + }, + { + "epoch": 0.4857199133346464, + "grad_norm": 1.702043056488037, + "learning_rate": 0.00010946799008129547, + "loss": 0.9098, + "step": 13563 + }, + { + "epoch": 0.48575572546421475, + "grad_norm": 1.655885934829712, + "learning_rate": 0.00010945644311015172, + "loss": 1.3838, + "step": 13564 + }, + { + "epoch": 0.485791537593783, + "grad_norm": 2.8926753997802734, + "learning_rate": 0.00010944489601178373, + "loss": 1.3567, + "step": 13565 + }, + { + "epoch": 0.4858273497233513, + "grad_norm": 2.5971570014953613, + "learning_rate": 0.0001094333487863469, + "loss": 1.2396, + "step": 13566 + }, + { + "epoch": 0.48586316185291956, + "grad_norm": 1.55757737159729, + "learning_rate": 0.00010942180143399647, + "loss": 1.1118, + "step": 13567 + }, + { + "epoch": 0.4858989739824879, + "grad_norm": 1.5241539478302002, + "learning_rate": 0.0001094102539548879, + "loss": 1.1332, + "step": 13568 + }, + { + "epoch": 0.48593478611205615, + "grad_norm": 2.7219879627227783, + "learning_rate": 0.00010939870634917647, + "loss": 1.4127, + "step": 13569 + }, + { + "epoch": 0.4859705982416244, + "grad_norm": 1.734186053276062, + "learning_rate": 0.00010938715861701762, + "loss": 1.2587, + "step": 13570 + }, + { + "epoch": 0.48600641037119274, + "grad_norm": 1.3821349143981934, + "learning_rate": 0.00010937561075856662, + "loss": 1.1972, + "step": 13571 + }, + { + "epoch": 0.486042222500761, + "grad_norm": 1.9475135803222656, + "learning_rate": 0.00010936406277397888, + "loss": 1.2741, + "step": 13572 + }, + { + "epoch": 0.4860780346303293, + "grad_norm": 1.369322657585144, + "learning_rate": 0.00010935251466340973, + "loss": 1.0543, + "step": 13573 + }, + { + "epoch": 0.48611384675989755, + "grad_norm": 1.1941146850585938, + "learning_rate": 0.0001093409664270146, + "loss": 1.0395, + "step": 13574 + }, + { + "epoch": 0.4861496588894659, + "grad_norm": 1.777272343635559, + "learning_rate": 0.00010932941806494876, + "loss": 1.2675, + "step": 13575 + }, + { + "epoch": 0.48618547101903414, + "grad_norm": 1.508747935295105, + "learning_rate": 0.0001093178695773677, + "loss": 0.9746, + "step": 13576 + }, + { + "epoch": 0.4862212831486024, + "grad_norm": 1.4938126802444458, + "learning_rate": 0.00010930632096442665, + "loss": 1.2304, + "step": 13577 + }, + { + "epoch": 0.48625709527817074, + "grad_norm": 1.5385394096374512, + "learning_rate": 0.00010929477222628113, + "loss": 1.0881, + "step": 13578 + }, + { + "epoch": 0.486292907407739, + "grad_norm": 2.66666579246521, + "learning_rate": 0.00010928322336308641, + "loss": 1.1486, + "step": 13579 + }, + { + "epoch": 0.4863287195373073, + "grad_norm": 1.3286529779434204, + "learning_rate": 0.00010927167437499788, + "loss": 1.1538, + "step": 13580 + }, + { + "epoch": 0.48636453166687554, + "grad_norm": 1.3992087841033936, + "learning_rate": 0.00010926012526217095, + "loss": 1.0976, + "step": 13581 + }, + { + "epoch": 0.48640034379644387, + "grad_norm": 1.4375295639038086, + "learning_rate": 0.00010924857602476095, + "loss": 1.0619, + "step": 13582 + }, + { + "epoch": 0.48643615592601214, + "grad_norm": 1.679450273513794, + "learning_rate": 0.00010923702666292333, + "loss": 1.2917, + "step": 13583 + }, + { + "epoch": 0.4864719680555804, + "grad_norm": 1.773956298828125, + "learning_rate": 0.0001092254771768134, + "loss": 1.1093, + "step": 13584 + }, + { + "epoch": 0.48650778018514873, + "grad_norm": 1.414607048034668, + "learning_rate": 0.0001092139275665866, + "loss": 1.1849, + "step": 13585 + }, + { + "epoch": 0.486543592314717, + "grad_norm": 1.5769665241241455, + "learning_rate": 0.00010920237783239824, + "loss": 1.0556, + "step": 13586 + }, + { + "epoch": 0.48657940444428527, + "grad_norm": 1.576771855354309, + "learning_rate": 0.0001091908279744038, + "loss": 1.0432, + "step": 13587 + }, + { + "epoch": 0.48661521657385354, + "grad_norm": 1.5803637504577637, + "learning_rate": 0.00010917927799275865, + "loss": 1.1165, + "step": 13588 + }, + { + "epoch": 0.48665102870342186, + "grad_norm": 1.849291205406189, + "learning_rate": 0.00010916772788761809, + "loss": 1.1623, + "step": 13589 + }, + { + "epoch": 0.48668684083299013, + "grad_norm": 1.733810305595398, + "learning_rate": 0.00010915617765913761, + "loss": 1.0518, + "step": 13590 + }, + { + "epoch": 0.4867226529625584, + "grad_norm": 2.402871608734131, + "learning_rate": 0.00010914462730747257, + "loss": 1.0808, + "step": 13591 + }, + { + "epoch": 0.4867584650921267, + "grad_norm": 1.3536838293075562, + "learning_rate": 0.00010913307683277838, + "loss": 1.1056, + "step": 13592 + }, + { + "epoch": 0.486794277221695, + "grad_norm": 1.9166343212127686, + "learning_rate": 0.0001091215262352104, + "loss": 1.0425, + "step": 13593 + }, + { + "epoch": 0.48683008935126326, + "grad_norm": 1.5082917213439941, + "learning_rate": 0.00010910997551492405, + "loss": 0.9978, + "step": 13594 + }, + { + "epoch": 0.48686590148083153, + "grad_norm": 1.6690934896469116, + "learning_rate": 0.00010909842467207472, + "loss": 1.1438, + "step": 13595 + }, + { + "epoch": 0.48690171361039986, + "grad_norm": 1.9303202629089355, + "learning_rate": 0.00010908687370681785, + "loss": 1.2771, + "step": 13596 + }, + { + "epoch": 0.4869375257399681, + "grad_norm": 1.7338279485702515, + "learning_rate": 0.00010907532261930881, + "loss": 1.1396, + "step": 13597 + }, + { + "epoch": 0.4869733378695364, + "grad_norm": 2.1860504150390625, + "learning_rate": 0.00010906377140970301, + "loss": 1.2542, + "step": 13598 + }, + { + "epoch": 0.4870091499991047, + "grad_norm": 1.9841210842132568, + "learning_rate": 0.00010905222007815585, + "loss": 1.156, + "step": 13599 + }, + { + "epoch": 0.487044962128673, + "grad_norm": 1.708571434020996, + "learning_rate": 0.00010904066862482274, + "loss": 1.1619, + "step": 13600 + }, + { + "epoch": 0.48708077425824126, + "grad_norm": 1.3575469255447388, + "learning_rate": 0.00010902911704985912, + "loss": 1.1795, + "step": 13601 + }, + { + "epoch": 0.4871165863878095, + "grad_norm": 2.009396553039551, + "learning_rate": 0.00010901756535342033, + "loss": 1.3069, + "step": 13602 + }, + { + "epoch": 0.48715239851737785, + "grad_norm": 1.3931678533554077, + "learning_rate": 0.00010900601353566188, + "loss": 1.1939, + "step": 13603 + }, + { + "epoch": 0.4871882106469461, + "grad_norm": 1.6581696271896362, + "learning_rate": 0.0001089944615967391, + "loss": 1.0429, + "step": 13604 + }, + { + "epoch": 0.4872240227765144, + "grad_norm": 1.4625452756881714, + "learning_rate": 0.0001089829095368075, + "loss": 1.0445, + "step": 13605 + }, + { + "epoch": 0.4872598349060827, + "grad_norm": 1.8381484746932983, + "learning_rate": 0.00010897135735602238, + "loss": 0.9966, + "step": 13606 + }, + { + "epoch": 0.487295647035651, + "grad_norm": 1.2835732698440552, + "learning_rate": 0.00010895980505453924, + "loss": 0.9607, + "step": 13607 + }, + { + "epoch": 0.48733145916521925, + "grad_norm": 1.4389845132827759, + "learning_rate": 0.00010894825263251345, + "loss": 1.3206, + "step": 13608 + }, + { + "epoch": 0.4873672712947875, + "grad_norm": 1.6921988725662231, + "learning_rate": 0.00010893670009010049, + "loss": 1.0923, + "step": 13609 + }, + { + "epoch": 0.48740308342435584, + "grad_norm": 1.4719619750976562, + "learning_rate": 0.00010892514742745576, + "loss": 0.9559, + "step": 13610 + }, + { + "epoch": 0.4874388955539241, + "grad_norm": 1.7326616048812866, + "learning_rate": 0.00010891359464473468, + "loss": 1.2924, + "step": 13611 + }, + { + "epoch": 0.4874747076834924, + "grad_norm": 1.4594309329986572, + "learning_rate": 0.00010890204174209269, + "loss": 1.133, + "step": 13612 + }, + { + "epoch": 0.4875105198130607, + "grad_norm": 1.3080930709838867, + "learning_rate": 0.00010889048871968517, + "loss": 1.0684, + "step": 13613 + }, + { + "epoch": 0.487546331942629, + "grad_norm": 1.6015368700027466, + "learning_rate": 0.00010887893557766766, + "loss": 1.1831, + "step": 13614 + }, + { + "epoch": 0.48758214407219724, + "grad_norm": 1.6761972904205322, + "learning_rate": 0.00010886738231619549, + "loss": 1.2327, + "step": 13615 + }, + { + "epoch": 0.4876179562017655, + "grad_norm": 1.5404202938079834, + "learning_rate": 0.00010885582893542411, + "loss": 1.2339, + "step": 13616 + }, + { + "epoch": 0.48765376833133384, + "grad_norm": 1.5589219331741333, + "learning_rate": 0.00010884427543550899, + "loss": 1.2968, + "step": 13617 + }, + { + "epoch": 0.4876895804609021, + "grad_norm": 1.5487886667251587, + "learning_rate": 0.00010883272181660558, + "loss": 0.9647, + "step": 13618 + }, + { + "epoch": 0.4877253925904704, + "grad_norm": 1.5582998991012573, + "learning_rate": 0.00010882116807886924, + "loss": 1.1134, + "step": 13619 + }, + { + "epoch": 0.4877612047200387, + "grad_norm": 1.5490996837615967, + "learning_rate": 0.0001088096142224555, + "loss": 1.1092, + "step": 13620 + }, + { + "epoch": 0.48779701684960697, + "grad_norm": 1.2582415342330933, + "learning_rate": 0.00010879806024751975, + "loss": 1.0251, + "step": 13621 + }, + { + "epoch": 0.48783282897917524, + "grad_norm": 2.328996419906616, + "learning_rate": 0.00010878650615421744, + "loss": 1.1403, + "step": 13622 + }, + { + "epoch": 0.4878686411087435, + "grad_norm": 1.5997244119644165, + "learning_rate": 0.00010877495194270407, + "loss": 1.1048, + "step": 13623 + }, + { + "epoch": 0.48790445323831183, + "grad_norm": 1.8059452772140503, + "learning_rate": 0.00010876339761313499, + "loss": 1.0858, + "step": 13624 + }, + { + "epoch": 0.4879402653678801, + "grad_norm": 1.8181126117706299, + "learning_rate": 0.00010875184316566571, + "loss": 1.2295, + "step": 13625 + }, + { + "epoch": 0.48797607749744837, + "grad_norm": 1.7691401243209839, + "learning_rate": 0.00010874028860045166, + "loss": 1.1739, + "step": 13626 + }, + { + "epoch": 0.4880118896270167, + "grad_norm": 1.5051873922348022, + "learning_rate": 0.00010872873391764833, + "loss": 1.2038, + "step": 13627 + }, + { + "epoch": 0.48804770175658496, + "grad_norm": 1.581182837486267, + "learning_rate": 0.00010871717911741113, + "loss": 1.0632, + "step": 13628 + }, + { + "epoch": 0.48808351388615323, + "grad_norm": 1.3757565021514893, + "learning_rate": 0.00010870562419989552, + "loss": 1.0559, + "step": 13629 + }, + { + "epoch": 0.4881193260157215, + "grad_norm": 1.7169995307922363, + "learning_rate": 0.00010869406916525698, + "loss": 1.2186, + "step": 13630 + }, + { + "epoch": 0.4881551381452898, + "grad_norm": 1.8103995323181152, + "learning_rate": 0.00010868251401365095, + "loss": 1.1012, + "step": 13631 + }, + { + "epoch": 0.4881909502748581, + "grad_norm": 2.1158833503723145, + "learning_rate": 0.0001086709587452329, + "loss": 1.0105, + "step": 13632 + }, + { + "epoch": 0.48822676240442636, + "grad_norm": 1.7309494018554688, + "learning_rate": 0.00010865940336015828, + "loss": 1.0984, + "step": 13633 + }, + { + "epoch": 0.4882625745339947, + "grad_norm": 1.4152594804763794, + "learning_rate": 0.00010864784785858256, + "loss": 1.016, + "step": 13634 + }, + { + "epoch": 0.48829838666356296, + "grad_norm": 1.3700488805770874, + "learning_rate": 0.00010863629224066116, + "loss": 1.1476, + "step": 13635 + }, + { + "epoch": 0.4883341987931312, + "grad_norm": 1.5846494436264038, + "learning_rate": 0.00010862473650654965, + "loss": 1.2505, + "step": 13636 + }, + { + "epoch": 0.4883700109226995, + "grad_norm": 1.431643009185791, + "learning_rate": 0.00010861318065640338, + "loss": 0.9913, + "step": 13637 + }, + { + "epoch": 0.4884058230522678, + "grad_norm": 2.2476370334625244, + "learning_rate": 0.00010860162469037792, + "loss": 1.214, + "step": 13638 + }, + { + "epoch": 0.4884416351818361, + "grad_norm": 1.7909153699874878, + "learning_rate": 0.00010859006860862865, + "loss": 1.1927, + "step": 13639 + }, + { + "epoch": 0.48847744731140436, + "grad_norm": 1.4699407815933228, + "learning_rate": 0.00010857851241131114, + "loss": 1.1294, + "step": 13640 + }, + { + "epoch": 0.4885132594409727, + "grad_norm": 1.738048791885376, + "learning_rate": 0.0001085669560985808, + "loss": 1.1547, + "step": 13641 + }, + { + "epoch": 0.48854907157054095, + "grad_norm": 1.6849509477615356, + "learning_rate": 0.0001085553996705931, + "loss": 1.3517, + "step": 13642 + }, + { + "epoch": 0.4885848837001092, + "grad_norm": 1.5248969793319702, + "learning_rate": 0.00010854384312750354, + "loss": 1.0469, + "step": 13643 + }, + { + "epoch": 0.4886206958296775, + "grad_norm": 1.4559988975524902, + "learning_rate": 0.00010853228646946758, + "loss": 0.9669, + "step": 13644 + }, + { + "epoch": 0.4886565079592458, + "grad_norm": 1.6015793085098267, + "learning_rate": 0.00010852072969664073, + "loss": 1.1225, + "step": 13645 + }, + { + "epoch": 0.4886923200888141, + "grad_norm": 1.364424705505371, + "learning_rate": 0.00010850917280917843, + "loss": 0.8683, + "step": 13646 + }, + { + "epoch": 0.48872813221838235, + "grad_norm": 1.4856380224227905, + "learning_rate": 0.0001084976158072362, + "loss": 1.2404, + "step": 13647 + }, + { + "epoch": 0.4887639443479507, + "grad_norm": 1.847459077835083, + "learning_rate": 0.0001084860586909695, + "loss": 0.9144, + "step": 13648 + }, + { + "epoch": 0.48879975647751894, + "grad_norm": 1.4285037517547607, + "learning_rate": 0.00010847450146053386, + "loss": 1.0627, + "step": 13649 + }, + { + "epoch": 0.4888355686070872, + "grad_norm": 1.752026081085205, + "learning_rate": 0.0001084629441160847, + "loss": 1.1644, + "step": 13650 + }, + { + "epoch": 0.4888713807366555, + "grad_norm": 1.725614309310913, + "learning_rate": 0.00010845138665777754, + "loss": 1.3263, + "step": 13651 + }, + { + "epoch": 0.4889071928662238, + "grad_norm": 1.2794320583343506, + "learning_rate": 0.0001084398290857679, + "loss": 1.0716, + "step": 13652 + }, + { + "epoch": 0.4889430049957921, + "grad_norm": 1.8726156949996948, + "learning_rate": 0.00010842827140021121, + "loss": 1.1421, + "step": 13653 + }, + { + "epoch": 0.48897881712536034, + "grad_norm": 1.1841576099395752, + "learning_rate": 0.00010841671360126304, + "loss": 1.0654, + "step": 13654 + }, + { + "epoch": 0.48901462925492867, + "grad_norm": 1.4840983152389526, + "learning_rate": 0.0001084051556890788, + "loss": 1.1126, + "step": 13655 + }, + { + "epoch": 0.48905044138449694, + "grad_norm": 1.5560499429702759, + "learning_rate": 0.0001083935976638141, + "loss": 1.0851, + "step": 13656 + }, + { + "epoch": 0.4890862535140652, + "grad_norm": 1.5501261949539185, + "learning_rate": 0.00010838203952562432, + "loss": 1.1547, + "step": 13657 + }, + { + "epoch": 0.4891220656436335, + "grad_norm": 1.6434682607650757, + "learning_rate": 0.00010837048127466505, + "loss": 0.9354, + "step": 13658 + }, + { + "epoch": 0.4891578777732018, + "grad_norm": 1.777979850769043, + "learning_rate": 0.00010835892291109169, + "loss": 1.134, + "step": 13659 + }, + { + "epoch": 0.48919368990277007, + "grad_norm": 1.8393515348434448, + "learning_rate": 0.00010834736443505986, + "loss": 1.2318, + "step": 13660 + }, + { + "epoch": 0.48922950203233834, + "grad_norm": 1.6024129390716553, + "learning_rate": 0.00010833580584672496, + "loss": 0.9457, + "step": 13661 + }, + { + "epoch": 0.48926531416190666, + "grad_norm": 1.3166911602020264, + "learning_rate": 0.00010832424714624259, + "loss": 1.0534, + "step": 13662 + }, + { + "epoch": 0.48930112629147493, + "grad_norm": 1.4069452285766602, + "learning_rate": 0.00010831268833376817, + "loss": 1.0519, + "step": 13663 + }, + { + "epoch": 0.4893369384210432, + "grad_norm": 1.5995296239852905, + "learning_rate": 0.00010830112940945726, + "loss": 1.1725, + "step": 13664 + }, + { + "epoch": 0.48937275055061147, + "grad_norm": 1.8690534830093384, + "learning_rate": 0.00010828957037346538, + "loss": 1.0439, + "step": 13665 + }, + { + "epoch": 0.4894085626801798, + "grad_norm": 1.7238473892211914, + "learning_rate": 0.00010827801122594802, + "loss": 1.0219, + "step": 13666 + }, + { + "epoch": 0.48944437480974806, + "grad_norm": 1.4883167743682861, + "learning_rate": 0.00010826645196706074, + "loss": 1.0399, + "step": 13667 + }, + { + "epoch": 0.48948018693931633, + "grad_norm": 1.5552512407302856, + "learning_rate": 0.00010825489259695894, + "loss": 1.1532, + "step": 13668 + }, + { + "epoch": 0.48951599906888466, + "grad_norm": 1.6254520416259766, + "learning_rate": 0.00010824333311579824, + "loss": 1.202, + "step": 13669 + }, + { + "epoch": 0.4895518111984529, + "grad_norm": 1.7817859649658203, + "learning_rate": 0.00010823177352373412, + "loss": 1.1759, + "step": 13670 + }, + { + "epoch": 0.4895876233280212, + "grad_norm": 1.5781725645065308, + "learning_rate": 0.00010822021382092211, + "loss": 1.1251, + "step": 13671 + }, + { + "epoch": 0.48962343545758946, + "grad_norm": 1.5567145347595215, + "learning_rate": 0.00010820865400751772, + "loss": 1.2103, + "step": 13672 + }, + { + "epoch": 0.4896592475871578, + "grad_norm": 1.6633527278900146, + "learning_rate": 0.00010819709408367649, + "loss": 1.1626, + "step": 13673 + }, + { + "epoch": 0.48969505971672606, + "grad_norm": 1.5877212285995483, + "learning_rate": 0.00010818553404955391, + "loss": 1.172, + "step": 13674 + }, + { + "epoch": 0.4897308718462943, + "grad_norm": 1.6351540088653564, + "learning_rate": 0.00010817397390530555, + "loss": 1.2719, + "step": 13675 + }, + { + "epoch": 0.48976668397586265, + "grad_norm": 1.5289798974990845, + "learning_rate": 0.00010816241365108692, + "loss": 1.2911, + "step": 13676 + }, + { + "epoch": 0.4898024961054309, + "grad_norm": 1.5273357629776, + "learning_rate": 0.00010815085328705352, + "loss": 1.0643, + "step": 13677 + }, + { + "epoch": 0.4898383082349992, + "grad_norm": 1.5226749181747437, + "learning_rate": 0.00010813929281336092, + "loss": 1.0531, + "step": 13678 + }, + { + "epoch": 0.48987412036456746, + "grad_norm": 1.6454253196716309, + "learning_rate": 0.00010812773223016461, + "loss": 1.0594, + "step": 13679 + }, + { + "epoch": 0.4899099324941358, + "grad_norm": 1.5007514953613281, + "learning_rate": 0.00010811617153762017, + "loss": 1.175, + "step": 13680 + }, + { + "epoch": 0.48994574462370405, + "grad_norm": 1.4983201026916504, + "learning_rate": 0.0001081046107358831, + "loss": 1.0517, + "step": 13681 + }, + { + "epoch": 0.4899815567532723, + "grad_norm": 1.280237078666687, + "learning_rate": 0.00010809304982510897, + "loss": 1.1056, + "step": 13682 + }, + { + "epoch": 0.49001736888284064, + "grad_norm": 1.88190495967865, + "learning_rate": 0.00010808148880545325, + "loss": 0.9938, + "step": 13683 + }, + { + "epoch": 0.4900531810124089, + "grad_norm": 1.5418071746826172, + "learning_rate": 0.00010806992767707155, + "loss": 1.2699, + "step": 13684 + }, + { + "epoch": 0.4900889931419772, + "grad_norm": 1.6499145030975342, + "learning_rate": 0.00010805836644011939, + "loss": 1.2235, + "step": 13685 + }, + { + "epoch": 0.49012480527154545, + "grad_norm": 1.8860747814178467, + "learning_rate": 0.00010804680509475229, + "loss": 1.1021, + "step": 13686 + }, + { + "epoch": 0.4901606174011138, + "grad_norm": 1.3685340881347656, + "learning_rate": 0.00010803524364112583, + "loss": 1.0733, + "step": 13687 + }, + { + "epoch": 0.49019642953068204, + "grad_norm": 1.800211787223816, + "learning_rate": 0.0001080236820793955, + "loss": 1.1777, + "step": 13688 + }, + { + "epoch": 0.4902322416602503, + "grad_norm": 1.4521242380142212, + "learning_rate": 0.00010801212040971691, + "loss": 1.1666, + "step": 13689 + }, + { + "epoch": 0.49026805378981864, + "grad_norm": 1.315740942955017, + "learning_rate": 0.00010800055863224555, + "loss": 1.0755, + "step": 13690 + }, + { + "epoch": 0.4903038659193869, + "grad_norm": 1.543819785118103, + "learning_rate": 0.00010798899674713699, + "loss": 1.0701, + "step": 13691 + }, + { + "epoch": 0.4903396780489552, + "grad_norm": 1.7105821371078491, + "learning_rate": 0.00010797743475454678, + "loss": 1.2515, + "step": 13692 + }, + { + "epoch": 0.49037549017852344, + "grad_norm": 1.332040548324585, + "learning_rate": 0.0001079658726546305, + "loss": 1.101, + "step": 13693 + }, + { + "epoch": 0.49041130230809177, + "grad_norm": 1.7984873056411743, + "learning_rate": 0.00010795431044754367, + "loss": 0.9045, + "step": 13694 + }, + { + "epoch": 0.49044711443766004, + "grad_norm": 1.7146726846694946, + "learning_rate": 0.00010794274813344185, + "loss": 1.2369, + "step": 13695 + }, + { + "epoch": 0.4904829265672283, + "grad_norm": 1.3818751573562622, + "learning_rate": 0.0001079311857124806, + "loss": 0.986, + "step": 13696 + }, + { + "epoch": 0.49051873869679663, + "grad_norm": 1.8305761814117432, + "learning_rate": 0.00010791962318481547, + "loss": 1.3086, + "step": 13697 + }, + { + "epoch": 0.4905545508263649, + "grad_norm": 2.028808832168579, + "learning_rate": 0.00010790806055060205, + "loss": 1.3752, + "step": 13698 + }, + { + "epoch": 0.49059036295593317, + "grad_norm": 1.581603765487671, + "learning_rate": 0.00010789649780999585, + "loss": 1.2136, + "step": 13699 + }, + { + "epoch": 0.49062617508550144, + "grad_norm": 1.3912142515182495, + "learning_rate": 0.00010788493496315246, + "loss": 0.924, + "step": 13700 + }, + { + "epoch": 0.49066198721506976, + "grad_norm": 1.3956928253173828, + "learning_rate": 0.00010787337201022745, + "loss": 1.192, + "step": 13701 + }, + { + "epoch": 0.49069779934463803, + "grad_norm": 1.4678109884262085, + "learning_rate": 0.00010786180895137639, + "loss": 0.9301, + "step": 13702 + }, + { + "epoch": 0.4907336114742063, + "grad_norm": 1.5654054880142212, + "learning_rate": 0.0001078502457867548, + "loss": 1.1371, + "step": 13703 + }, + { + "epoch": 0.4907694236037746, + "grad_norm": 1.323202133178711, + "learning_rate": 0.00010783868251651833, + "loss": 1.0571, + "step": 13704 + }, + { + "epoch": 0.4908052357333429, + "grad_norm": 1.3957830667495728, + "learning_rate": 0.00010782711914082242, + "loss": 1.0419, + "step": 13705 + }, + { + "epoch": 0.49084104786291116, + "grad_norm": 1.2681523561477661, + "learning_rate": 0.00010781555565982276, + "loss": 1.0054, + "step": 13706 + }, + { + "epoch": 0.49087685999247943, + "grad_norm": 1.405045509338379, + "learning_rate": 0.00010780399207367489, + "loss": 0.9282, + "step": 13707 + }, + { + "epoch": 0.49091267212204776, + "grad_norm": 1.8807651996612549, + "learning_rate": 0.00010779242838253433, + "loss": 1.1817, + "step": 13708 + }, + { + "epoch": 0.490948484251616, + "grad_norm": 1.490771770477295, + "learning_rate": 0.00010778086458655677, + "loss": 1.1538, + "step": 13709 + }, + { + "epoch": 0.4909842963811843, + "grad_norm": 1.9286694526672363, + "learning_rate": 0.00010776930068589764, + "loss": 1.2747, + "step": 13710 + }, + { + "epoch": 0.4910201085107526, + "grad_norm": 2.1040401458740234, + "learning_rate": 0.00010775773668071265, + "loss": 1.175, + "step": 13711 + }, + { + "epoch": 0.4910559206403209, + "grad_norm": 1.8172919750213623, + "learning_rate": 0.00010774617257115728, + "loss": 0.8975, + "step": 13712 + }, + { + "epoch": 0.49109173276988916, + "grad_norm": 1.7321276664733887, + "learning_rate": 0.00010773460835738718, + "loss": 1.0959, + "step": 13713 + }, + { + "epoch": 0.4911275448994574, + "grad_norm": 1.5406532287597656, + "learning_rate": 0.00010772304403955789, + "loss": 1.1317, + "step": 13714 + }, + { + "epoch": 0.49116335702902575, + "grad_norm": 1.3440672159194946, + "learning_rate": 0.000107711479617825, + "loss": 0.9703, + "step": 13715 + }, + { + "epoch": 0.491199169158594, + "grad_norm": 1.6215044260025024, + "learning_rate": 0.00010769991509234408, + "loss": 1.2797, + "step": 13716 + }, + { + "epoch": 0.4912349812881623, + "grad_norm": 1.54573655128479, + "learning_rate": 0.00010768835046327077, + "loss": 0.9221, + "step": 13717 + }, + { + "epoch": 0.4912707934177306, + "grad_norm": 1.886520504951477, + "learning_rate": 0.00010767678573076058, + "loss": 1.252, + "step": 13718 + }, + { + "epoch": 0.4913066055472989, + "grad_norm": 1.7035260200500488, + "learning_rate": 0.00010766522089496915, + "loss": 1.1973, + "step": 13719 + }, + { + "epoch": 0.49134241767686715, + "grad_norm": 1.425410270690918, + "learning_rate": 0.00010765365595605212, + "loss": 1.0829, + "step": 13720 + }, + { + "epoch": 0.4913782298064354, + "grad_norm": 1.5487481355667114, + "learning_rate": 0.00010764209091416497, + "loss": 1.103, + "step": 13721 + }, + { + "epoch": 0.49141404193600374, + "grad_norm": 1.3672025203704834, + "learning_rate": 0.00010763052576946335, + "loss": 1.1668, + "step": 13722 + }, + { + "epoch": 0.491449854065572, + "grad_norm": 1.7599130868911743, + "learning_rate": 0.00010761896052210285, + "loss": 1.0355, + "step": 13723 + }, + { + "epoch": 0.4914856661951403, + "grad_norm": 1.5551373958587646, + "learning_rate": 0.00010760739517223908, + "loss": 1.1077, + "step": 13724 + }, + { + "epoch": 0.4915214783247086, + "grad_norm": 2.0578601360321045, + "learning_rate": 0.00010759582972002758, + "loss": 1.1921, + "step": 13725 + }, + { + "epoch": 0.4915572904542769, + "grad_norm": 1.5632405281066895, + "learning_rate": 0.00010758426416562402, + "loss": 1.2645, + "step": 13726 + }, + { + "epoch": 0.49159310258384514, + "grad_norm": 1.864395260810852, + "learning_rate": 0.00010757269850918394, + "loss": 1.0203, + "step": 13727 + }, + { + "epoch": 0.4916289147134134, + "grad_norm": 1.634012222290039, + "learning_rate": 0.00010756113275086302, + "loss": 1.24, + "step": 13728 + }, + { + "epoch": 0.49166472684298174, + "grad_norm": 1.8203890323638916, + "learning_rate": 0.00010754956689081678, + "loss": 1.3023, + "step": 13729 + }, + { + "epoch": 0.49170053897255, + "grad_norm": 1.607223629951477, + "learning_rate": 0.00010753800092920086, + "loss": 1.2365, + "step": 13730 + }, + { + "epoch": 0.4917363511021183, + "grad_norm": 1.593701720237732, + "learning_rate": 0.00010752643486617086, + "loss": 1.264, + "step": 13731 + }, + { + "epoch": 0.4917721632316866, + "grad_norm": 1.859411358833313, + "learning_rate": 0.00010751486870188239, + "loss": 1.1116, + "step": 13732 + }, + { + "epoch": 0.49180797536125487, + "grad_norm": 1.9117140769958496, + "learning_rate": 0.00010750330243649104, + "loss": 1.2042, + "step": 13733 + }, + { + "epoch": 0.49184378749082314, + "grad_norm": 1.9476282596588135, + "learning_rate": 0.00010749173607015247, + "loss": 1.0105, + "step": 13734 + }, + { + "epoch": 0.4918795996203914, + "grad_norm": 1.6268504858016968, + "learning_rate": 0.00010748016960302223, + "loss": 1.1326, + "step": 13735 + }, + { + "epoch": 0.49191541174995973, + "grad_norm": 1.2941741943359375, + "learning_rate": 0.00010746860303525595, + "loss": 1.2617, + "step": 13736 + }, + { + "epoch": 0.491951223879528, + "grad_norm": 1.607442021369934, + "learning_rate": 0.00010745703636700926, + "loss": 1.2234, + "step": 13737 + }, + { + "epoch": 0.49198703600909627, + "grad_norm": 1.4233944416046143, + "learning_rate": 0.00010744546959843777, + "loss": 1.2341, + "step": 13738 + }, + { + "epoch": 0.4920228481386646, + "grad_norm": 1.193726897239685, + "learning_rate": 0.00010743390272969706, + "loss": 0.9833, + "step": 13739 + }, + { + "epoch": 0.49205866026823286, + "grad_norm": 1.2590339183807373, + "learning_rate": 0.00010742233576094283, + "loss": 1.0655, + "step": 13740 + }, + { + "epoch": 0.49209447239780113, + "grad_norm": 1.8017699718475342, + "learning_rate": 0.0001074107686923306, + "loss": 1.1889, + "step": 13741 + }, + { + "epoch": 0.4921302845273694, + "grad_norm": 1.3432918787002563, + "learning_rate": 0.00010739920152401605, + "loss": 1.36, + "step": 13742 + }, + { + "epoch": 0.4921660966569377, + "grad_norm": 1.426053524017334, + "learning_rate": 0.00010738763425615479, + "loss": 0.9787, + "step": 13743 + }, + { + "epoch": 0.492201908786506, + "grad_norm": 1.2098139524459839, + "learning_rate": 0.00010737606688890245, + "loss": 1.1136, + "step": 13744 + }, + { + "epoch": 0.49223772091607426, + "grad_norm": 1.290090560913086, + "learning_rate": 0.00010736449942241465, + "loss": 1.0707, + "step": 13745 + }, + { + "epoch": 0.4922735330456426, + "grad_norm": 1.488801121711731, + "learning_rate": 0.000107352931856847, + "loss": 1.0833, + "step": 13746 + }, + { + "epoch": 0.49230934517521086, + "grad_norm": 1.6005643606185913, + "learning_rate": 0.00010734136419235512, + "loss": 1.1291, + "step": 13747 + }, + { + "epoch": 0.4923451573047791, + "grad_norm": 2.502366781234741, + "learning_rate": 0.00010732979642909466, + "loss": 1.1522, + "step": 13748 + }, + { + "epoch": 0.4923809694343474, + "grad_norm": 1.3307945728302002, + "learning_rate": 0.00010731822856722127, + "loss": 1.1994, + "step": 13749 + }, + { + "epoch": 0.4924167815639157, + "grad_norm": 1.735573172569275, + "learning_rate": 0.00010730666060689053, + "loss": 1.3834, + "step": 13750 + }, + { + "epoch": 0.492452593693484, + "grad_norm": 1.5113552808761597, + "learning_rate": 0.00010729509254825811, + "loss": 1.1548, + "step": 13751 + }, + { + "epoch": 0.49248840582305226, + "grad_norm": 1.7052990198135376, + "learning_rate": 0.00010728352439147959, + "loss": 1.3382, + "step": 13752 + }, + { + "epoch": 0.4925242179526206, + "grad_norm": 1.742197871208191, + "learning_rate": 0.00010727195613671071, + "loss": 1.1707, + "step": 13753 + }, + { + "epoch": 0.49256003008218885, + "grad_norm": 1.4157462120056152, + "learning_rate": 0.00010726038778410699, + "loss": 1.0746, + "step": 13754 + }, + { + "epoch": 0.4925958422117571, + "grad_norm": 1.4352636337280273, + "learning_rate": 0.00010724881933382416, + "loss": 1.2786, + "step": 13755 + }, + { + "epoch": 0.4926316543413254, + "grad_norm": 1.7616106271743774, + "learning_rate": 0.00010723725078601778, + "loss": 1.2101, + "step": 13756 + }, + { + "epoch": 0.4926674664708937, + "grad_norm": 1.556312084197998, + "learning_rate": 0.00010722568214084354, + "loss": 1.3369, + "step": 13757 + }, + { + "epoch": 0.492703278600462, + "grad_norm": 1.359554648399353, + "learning_rate": 0.00010721411339845707, + "loss": 1.1169, + "step": 13758 + }, + { + "epoch": 0.49273909073003025, + "grad_norm": 1.3362705707550049, + "learning_rate": 0.00010720254455901399, + "loss": 1.0712, + "step": 13759 + }, + { + "epoch": 0.4927749028595986, + "grad_norm": 2.0283734798431396, + "learning_rate": 0.00010719097562266998, + "loss": 1.0248, + "step": 13760 + }, + { + "epoch": 0.49281071498916684, + "grad_norm": 1.2557852268218994, + "learning_rate": 0.00010717940658958066, + "loss": 1.1316, + "step": 13761 + }, + { + "epoch": 0.4928465271187351, + "grad_norm": 2.631258726119995, + "learning_rate": 0.00010716783745990169, + "loss": 1.1171, + "step": 13762 + }, + { + "epoch": 0.4928823392483034, + "grad_norm": 1.4781328439712524, + "learning_rate": 0.0001071562682337887, + "loss": 1.1865, + "step": 13763 + }, + { + "epoch": 0.4929181513778717, + "grad_norm": 1.4690414667129517, + "learning_rate": 0.0001071446989113974, + "loss": 1.0459, + "step": 13764 + }, + { + "epoch": 0.49295396350744, + "grad_norm": 1.4121190309524536, + "learning_rate": 0.00010713312949288334, + "loss": 1.0635, + "step": 13765 + }, + { + "epoch": 0.49298977563700824, + "grad_norm": 1.4876601696014404, + "learning_rate": 0.00010712155997840225, + "loss": 1.1006, + "step": 13766 + }, + { + "epoch": 0.4930255877665765, + "grad_norm": 1.3167495727539062, + "learning_rate": 0.00010710999036810975, + "loss": 0.8325, + "step": 13767 + }, + { + "epoch": 0.49306139989614484, + "grad_norm": 1.399275302886963, + "learning_rate": 0.00010709842066216151, + "loss": 0.9345, + "step": 13768 + }, + { + "epoch": 0.4930972120257131, + "grad_norm": 1.4271806478500366, + "learning_rate": 0.00010708685086071316, + "loss": 1.0093, + "step": 13769 + }, + { + "epoch": 0.4931330241552814, + "grad_norm": 2.4268131256103516, + "learning_rate": 0.00010707528096392038, + "loss": 1.0875, + "step": 13770 + }, + { + "epoch": 0.4931688362848497, + "grad_norm": 1.622310757637024, + "learning_rate": 0.00010706371097193881, + "loss": 1.3032, + "step": 13771 + }, + { + "epoch": 0.49320464841441797, + "grad_norm": 1.7864476442337036, + "learning_rate": 0.00010705214088492415, + "loss": 1.0674, + "step": 13772 + }, + { + "epoch": 0.49324046054398624, + "grad_norm": 1.587343692779541, + "learning_rate": 0.00010704057070303201, + "loss": 1.072, + "step": 13773 + }, + { + "epoch": 0.4932762726735545, + "grad_norm": 1.6379622220993042, + "learning_rate": 0.00010702900042641806, + "loss": 1.1576, + "step": 13774 + }, + { + "epoch": 0.49331208480312283, + "grad_norm": 1.692352294921875, + "learning_rate": 0.00010701743005523801, + "loss": 1.0916, + "step": 13775 + }, + { + "epoch": 0.4933478969326911, + "grad_norm": 1.9300954341888428, + "learning_rate": 0.00010700585958964744, + "loss": 1.2834, + "step": 13776 + }, + { + "epoch": 0.49338370906225937, + "grad_norm": 1.7432961463928223, + "learning_rate": 0.00010699428902980211, + "loss": 1.3523, + "step": 13777 + }, + { + "epoch": 0.4934195211918277, + "grad_norm": 1.5679579973220825, + "learning_rate": 0.00010698271837585762, + "loss": 1.0926, + "step": 13778 + }, + { + "epoch": 0.49345533332139596, + "grad_norm": 1.2336353063583374, + "learning_rate": 0.0001069711476279697, + "loss": 1.1424, + "step": 13779 + }, + { + "epoch": 0.49349114545096423, + "grad_norm": 1.3836613893508911, + "learning_rate": 0.00010695957678629391, + "loss": 1.0735, + "step": 13780 + }, + { + "epoch": 0.4935269575805325, + "grad_norm": 1.6974273920059204, + "learning_rate": 0.00010694800585098606, + "loss": 1.0574, + "step": 13781 + }, + { + "epoch": 0.4935627697101008, + "grad_norm": 1.4904338121414185, + "learning_rate": 0.00010693643482220173, + "loss": 1.2383, + "step": 13782 + }, + { + "epoch": 0.4935985818396691, + "grad_norm": 1.6585134267807007, + "learning_rate": 0.0001069248637000966, + "loss": 1.0588, + "step": 13783 + }, + { + "epoch": 0.49363439396923736, + "grad_norm": 1.4678122997283936, + "learning_rate": 0.0001069132924848264, + "loss": 1.0494, + "step": 13784 + }, + { + "epoch": 0.4936702060988057, + "grad_norm": 2.3628203868865967, + "learning_rate": 0.00010690172117654672, + "loss": 1.2959, + "step": 13785 + }, + { + "epoch": 0.49370601822837396, + "grad_norm": 1.3129981756210327, + "learning_rate": 0.00010689014977541332, + "loss": 1.0732, + "step": 13786 + }, + { + "epoch": 0.4937418303579422, + "grad_norm": 1.2995208501815796, + "learning_rate": 0.00010687857828158182, + "loss": 1.1829, + "step": 13787 + }, + { + "epoch": 0.4937776424875105, + "grad_norm": 1.3268799781799316, + "learning_rate": 0.00010686700669520792, + "loss": 1.1719, + "step": 13788 + }, + { + "epoch": 0.4938134546170788, + "grad_norm": 1.3405041694641113, + "learning_rate": 0.00010685543501644732, + "loss": 0.9801, + "step": 13789 + }, + { + "epoch": 0.4938492667466471, + "grad_norm": 1.9763669967651367, + "learning_rate": 0.00010684386324545567, + "loss": 1.2626, + "step": 13790 + }, + { + "epoch": 0.49388507887621536, + "grad_norm": 1.2797378301620483, + "learning_rate": 0.0001068322913823887, + "loss": 0.8953, + "step": 13791 + }, + { + "epoch": 0.4939208910057837, + "grad_norm": 1.3723903894424438, + "learning_rate": 0.00010682071942740202, + "loss": 0.9117, + "step": 13792 + }, + { + "epoch": 0.49395670313535195, + "grad_norm": 1.5404770374298096, + "learning_rate": 0.0001068091473806514, + "loss": 1.1026, + "step": 13793 + }, + { + "epoch": 0.4939925152649202, + "grad_norm": 1.4437459707260132, + "learning_rate": 0.00010679757524229244, + "loss": 1.0259, + "step": 13794 + }, + { + "epoch": 0.4940283273944885, + "grad_norm": 1.4981592893600464, + "learning_rate": 0.0001067860030124809, + "loss": 1.0324, + "step": 13795 + }, + { + "epoch": 0.4940641395240568, + "grad_norm": 1.4598103761672974, + "learning_rate": 0.00010677443069137242, + "loss": 0.9854, + "step": 13796 + }, + { + "epoch": 0.4940999516536251, + "grad_norm": 2.172757863998413, + "learning_rate": 0.00010676285827912276, + "loss": 0.984, + "step": 13797 + }, + { + "epoch": 0.49413576378319335, + "grad_norm": 1.3860734701156616, + "learning_rate": 0.00010675128577588751, + "loss": 1.2231, + "step": 13798 + }, + { + "epoch": 0.4941715759127617, + "grad_norm": 1.3211885690689087, + "learning_rate": 0.00010673971318182247, + "loss": 1.2113, + "step": 13799 + }, + { + "epoch": 0.49420738804232994, + "grad_norm": 1.5944465398788452, + "learning_rate": 0.00010672814049708326, + "loss": 1.158, + "step": 13800 + }, + { + "epoch": 0.4942432001718982, + "grad_norm": 1.7542108297348022, + "learning_rate": 0.0001067165677218256, + "loss": 0.9705, + "step": 13801 + }, + { + "epoch": 0.4942790123014665, + "grad_norm": 1.7157632112503052, + "learning_rate": 0.00010670499485620517, + "loss": 1.1118, + "step": 13802 + }, + { + "epoch": 0.4943148244310348, + "grad_norm": 1.512017011642456, + "learning_rate": 0.0001066934219003777, + "loss": 1.0031, + "step": 13803 + }, + { + "epoch": 0.4943506365606031, + "grad_norm": 1.414271593093872, + "learning_rate": 0.00010668184885449886, + "loss": 1.1335, + "step": 13804 + }, + { + "epoch": 0.49438644869017134, + "grad_norm": 1.433200716972351, + "learning_rate": 0.00010667027571872436, + "loss": 0.9065, + "step": 13805 + }, + { + "epoch": 0.49442226081973967, + "grad_norm": 1.6589115858078003, + "learning_rate": 0.00010665870249320993, + "loss": 1.206, + "step": 13806 + }, + { + "epoch": 0.49445807294930794, + "grad_norm": 1.7840197086334229, + "learning_rate": 0.00010664712917811121, + "loss": 1.2437, + "step": 13807 + }, + { + "epoch": 0.4944938850788762, + "grad_norm": 1.614095687866211, + "learning_rate": 0.000106635555773584, + "loss": 1.0269, + "step": 13808 + }, + { + "epoch": 0.4945296972084445, + "grad_norm": 1.9239699840545654, + "learning_rate": 0.00010662398227978389, + "loss": 0.9168, + "step": 13809 + }, + { + "epoch": 0.4945655093380128, + "grad_norm": 1.2627544403076172, + "learning_rate": 0.00010661240869686669, + "loss": 1.1032, + "step": 13810 + }, + { + "epoch": 0.49460132146758107, + "grad_norm": 1.559301733970642, + "learning_rate": 0.00010660083502498801, + "loss": 1.0322, + "step": 13811 + }, + { + "epoch": 0.49463713359714934, + "grad_norm": 1.6419615745544434, + "learning_rate": 0.00010658926126430364, + "loss": 1.1977, + "step": 13812 + }, + { + "epoch": 0.49467294572671766, + "grad_norm": 1.5492384433746338, + "learning_rate": 0.00010657768741496923, + "loss": 1.0677, + "step": 13813 + }, + { + "epoch": 0.49470875785628593, + "grad_norm": 1.644034504890442, + "learning_rate": 0.00010656611347714056, + "loss": 1.2693, + "step": 13814 + }, + { + "epoch": 0.4947445699858542, + "grad_norm": 1.886256456375122, + "learning_rate": 0.00010655453945097327, + "loss": 1.3287, + "step": 13815 + }, + { + "epoch": 0.49478038211542247, + "grad_norm": 1.7566969394683838, + "learning_rate": 0.0001065429653366231, + "loss": 1.1559, + "step": 13816 + }, + { + "epoch": 0.4948161942449908, + "grad_norm": 1.7946645021438599, + "learning_rate": 0.00010653139113424581, + "loss": 1.254, + "step": 13817 + }, + { + "epoch": 0.49485200637455906, + "grad_norm": 1.6090731620788574, + "learning_rate": 0.00010651981684399705, + "loss": 1.0035, + "step": 13818 + }, + { + "epoch": 0.49488781850412733, + "grad_norm": 1.5503422021865845, + "learning_rate": 0.0001065082424660326, + "loss": 1.0019, + "step": 13819 + }, + { + "epoch": 0.49492363063369565, + "grad_norm": 1.3699657917022705, + "learning_rate": 0.00010649666800050808, + "loss": 1.0308, + "step": 13820 + }, + { + "epoch": 0.4949594427632639, + "grad_norm": 1.932310700416565, + "learning_rate": 0.00010648509344757933, + "loss": 1.0713, + "step": 13821 + }, + { + "epoch": 0.4949952548928322, + "grad_norm": 1.5666706562042236, + "learning_rate": 0.00010647351880740197, + "loss": 1.0824, + "step": 13822 + }, + { + "epoch": 0.49503106702240046, + "grad_norm": 1.57487154006958, + "learning_rate": 0.00010646194408013179, + "loss": 1.1394, + "step": 13823 + }, + { + "epoch": 0.4950668791519688, + "grad_norm": 1.4516996145248413, + "learning_rate": 0.00010645036926592449, + "loss": 0.9901, + "step": 13824 + }, + { + "epoch": 0.49510269128153706, + "grad_norm": 1.5141478776931763, + "learning_rate": 0.00010643879436493578, + "loss": 1.2143, + "step": 13825 + }, + { + "epoch": 0.4951385034111053, + "grad_norm": 1.5703903436660767, + "learning_rate": 0.0001064272193773214, + "loss": 1.0349, + "step": 13826 + }, + { + "epoch": 0.49517431554067365, + "grad_norm": 1.6609444618225098, + "learning_rate": 0.00010641564430323707, + "loss": 1.1853, + "step": 13827 + }, + { + "epoch": 0.4952101276702419, + "grad_norm": 1.761574149131775, + "learning_rate": 0.00010640406914283854, + "loss": 0.9975, + "step": 13828 + }, + { + "epoch": 0.4952459397998102, + "grad_norm": 1.5595309734344482, + "learning_rate": 0.00010639249389628149, + "loss": 1.1995, + "step": 13829 + }, + { + "epoch": 0.49528175192937846, + "grad_norm": 1.7487800121307373, + "learning_rate": 0.00010638091856372172, + "loss": 1.0928, + "step": 13830 + }, + { + "epoch": 0.4953175640589468, + "grad_norm": 1.3622695207595825, + "learning_rate": 0.00010636934314531488, + "loss": 1.2705, + "step": 13831 + }, + { + "epoch": 0.49535337618851505, + "grad_norm": 1.5660585165023804, + "learning_rate": 0.00010635776764121677, + "loss": 1.1772, + "step": 13832 + }, + { + "epoch": 0.4953891883180833, + "grad_norm": 1.435826301574707, + "learning_rate": 0.00010634619205158307, + "loss": 1.1988, + "step": 13833 + }, + { + "epoch": 0.49542500044765164, + "grad_norm": 1.415701985359192, + "learning_rate": 0.00010633461637656958, + "loss": 1.1656, + "step": 13834 + }, + { + "epoch": 0.4954608125772199, + "grad_norm": 1.7686851024627686, + "learning_rate": 0.00010632304061633199, + "loss": 1.1999, + "step": 13835 + }, + { + "epoch": 0.4954966247067882, + "grad_norm": 1.2012666463851929, + "learning_rate": 0.00010631146477102602, + "loss": 0.9184, + "step": 13836 + }, + { + "epoch": 0.49553243683635645, + "grad_norm": 1.3070536851882935, + "learning_rate": 0.00010629988884080745, + "loss": 1.1138, + "step": 13837 + }, + { + "epoch": 0.4955682489659248, + "grad_norm": 1.6814719438552856, + "learning_rate": 0.00010628831282583201, + "loss": 0.9529, + "step": 13838 + }, + { + "epoch": 0.49560406109549304, + "grad_norm": 1.6987860202789307, + "learning_rate": 0.00010627673672625542, + "loss": 1.1551, + "step": 13839 + }, + { + "epoch": 0.4956398732250613, + "grad_norm": 1.3387701511383057, + "learning_rate": 0.00010626516054223341, + "loss": 1.1361, + "step": 13840 + }, + { + "epoch": 0.49567568535462964, + "grad_norm": 1.4807765483856201, + "learning_rate": 0.0001062535842739218, + "loss": 1.0792, + "step": 13841 + }, + { + "epoch": 0.4957114974841979, + "grad_norm": 1.2418360710144043, + "learning_rate": 0.00010624200792147622, + "loss": 1.2822, + "step": 13842 + }, + { + "epoch": 0.4957473096137662, + "grad_norm": 1.647182583808899, + "learning_rate": 0.00010623043148505254, + "loss": 1.044, + "step": 13843 + }, + { + "epoch": 0.49578312174333444, + "grad_norm": 2.0033323764801025, + "learning_rate": 0.00010621885496480641, + "loss": 1.1442, + "step": 13844 + }, + { + "epoch": 0.49581893387290277, + "grad_norm": 1.5948082208633423, + "learning_rate": 0.00010620727836089359, + "loss": 1.2407, + "step": 13845 + }, + { + "epoch": 0.49585474600247104, + "grad_norm": 1.5178366899490356, + "learning_rate": 0.00010619570167346987, + "loss": 1.001, + "step": 13846 + }, + { + "epoch": 0.4958905581320393, + "grad_norm": 1.4396636486053467, + "learning_rate": 0.00010618412490269096, + "loss": 1.1096, + "step": 13847 + }, + { + "epoch": 0.49592637026160763, + "grad_norm": 1.2894892692565918, + "learning_rate": 0.00010617254804871264, + "loss": 1.0191, + "step": 13848 + }, + { + "epoch": 0.4959621823911759, + "grad_norm": 1.3107260465621948, + "learning_rate": 0.00010616097111169063, + "loss": 1.1076, + "step": 13849 + }, + { + "epoch": 0.49599799452074417, + "grad_norm": 1.699028491973877, + "learning_rate": 0.00010614939409178072, + "loss": 1.0835, + "step": 13850 + }, + { + "epoch": 0.49603380665031244, + "grad_norm": 1.8200846910476685, + "learning_rate": 0.00010613781698913863, + "loss": 1.0093, + "step": 13851 + }, + { + "epoch": 0.49606961877988076, + "grad_norm": 1.2825819253921509, + "learning_rate": 0.00010612623980392016, + "loss": 0.8726, + "step": 13852 + }, + { + "epoch": 0.49610543090944903, + "grad_norm": 1.6639866828918457, + "learning_rate": 0.00010611466253628101, + "loss": 1.1175, + "step": 13853 + }, + { + "epoch": 0.4961412430390173, + "grad_norm": 1.3772714138031006, + "learning_rate": 0.00010610308518637697, + "loss": 1.0589, + "step": 13854 + }, + { + "epoch": 0.4961770551685856, + "grad_norm": 1.4199450016021729, + "learning_rate": 0.00010609150775436378, + "loss": 1.1405, + "step": 13855 + }, + { + "epoch": 0.4962128672981539, + "grad_norm": 1.8559718132019043, + "learning_rate": 0.00010607993024039722, + "loss": 0.978, + "step": 13856 + }, + { + "epoch": 0.49624867942772216, + "grad_norm": 1.2508903741836548, + "learning_rate": 0.00010606835264463305, + "loss": 1.066, + "step": 13857 + }, + { + "epoch": 0.49628449155729043, + "grad_norm": 2.5277106761932373, + "learning_rate": 0.00010605677496722699, + "loss": 1.4136, + "step": 13858 + }, + { + "epoch": 0.49632030368685875, + "grad_norm": 1.7025820016860962, + "learning_rate": 0.00010604519720833486, + "loss": 1.2563, + "step": 13859 + }, + { + "epoch": 0.496356115816427, + "grad_norm": 1.4113322496414185, + "learning_rate": 0.00010603361936811239, + "loss": 1.1471, + "step": 13860 + }, + { + "epoch": 0.4963919279459953, + "grad_norm": 1.5839418172836304, + "learning_rate": 0.00010602204144671539, + "loss": 1.1731, + "step": 13861 + }, + { + "epoch": 0.4964277400755636, + "grad_norm": 2.1083381175994873, + "learning_rate": 0.00010601046344429955, + "loss": 1.1302, + "step": 13862 + }, + { + "epoch": 0.4964635522051319, + "grad_norm": 1.9151639938354492, + "learning_rate": 0.0001059988853610207, + "loss": 1.001, + "step": 13863 + }, + { + "epoch": 0.49649936433470016, + "grad_norm": 1.3262253999710083, + "learning_rate": 0.00010598730719703456, + "loss": 1.288, + "step": 13864 + }, + { + "epoch": 0.4965351764642684, + "grad_norm": 1.4913891553878784, + "learning_rate": 0.00010597572895249694, + "loss": 0.9304, + "step": 13865 + }, + { + "epoch": 0.49657098859383675, + "grad_norm": 1.7758768796920776, + "learning_rate": 0.00010596415062756358, + "loss": 1.093, + "step": 13866 + }, + { + "epoch": 0.496606800723405, + "grad_norm": 1.65043044090271, + "learning_rate": 0.0001059525722223903, + "loss": 1.1542, + "step": 13867 + }, + { + "epoch": 0.4966426128529733, + "grad_norm": 1.4599688053131104, + "learning_rate": 0.0001059409937371328, + "loss": 1.1542, + "step": 13868 + }, + { + "epoch": 0.4966784249825416, + "grad_norm": 1.4354685544967651, + "learning_rate": 0.00010592941517194692, + "loss": 1.0471, + "step": 13869 + }, + { + "epoch": 0.4967142371121099, + "grad_norm": 1.6551918983459473, + "learning_rate": 0.00010591783652698841, + "loss": 1.0399, + "step": 13870 + }, + { + "epoch": 0.49675004924167815, + "grad_norm": 1.6226146221160889, + "learning_rate": 0.00010590625780241302, + "loss": 0.9958, + "step": 13871 + }, + { + "epoch": 0.4967858613712464, + "grad_norm": 1.299118161201477, + "learning_rate": 0.00010589467899837657, + "loss": 0.9314, + "step": 13872 + }, + { + "epoch": 0.49682167350081474, + "grad_norm": 1.7053788900375366, + "learning_rate": 0.0001058831001150348, + "loss": 1.1331, + "step": 13873 + }, + { + "epoch": 0.496857485630383, + "grad_norm": 1.5373817682266235, + "learning_rate": 0.00010587152115254353, + "loss": 1.2187, + "step": 13874 + }, + { + "epoch": 0.4968932977599513, + "grad_norm": 1.6143414974212646, + "learning_rate": 0.0001058599421110585, + "loss": 1.0099, + "step": 13875 + }, + { + "epoch": 0.4969291098895196, + "grad_norm": 1.7369681596755981, + "learning_rate": 0.0001058483629907355, + "loss": 1.1596, + "step": 13876 + }, + { + "epoch": 0.4969649220190879, + "grad_norm": 1.8071568012237549, + "learning_rate": 0.00010583678379173032, + "loss": 1.3059, + "step": 13877 + }, + { + "epoch": 0.49700073414865614, + "grad_norm": 2.3906731605529785, + "learning_rate": 0.00010582520451419877, + "loss": 1.168, + "step": 13878 + }, + { + "epoch": 0.4970365462782244, + "grad_norm": 1.2538001537322998, + "learning_rate": 0.0001058136251582966, + "loss": 1.1954, + "step": 13879 + }, + { + "epoch": 0.49707235840779274, + "grad_norm": 1.2857609987258911, + "learning_rate": 0.00010580204572417957, + "loss": 1.2057, + "step": 13880 + }, + { + "epoch": 0.497108170537361, + "grad_norm": 1.9411721229553223, + "learning_rate": 0.00010579046621200355, + "loss": 1.3382, + "step": 13881 + }, + { + "epoch": 0.4971439826669293, + "grad_norm": 1.607828140258789, + "learning_rate": 0.00010577888662192424, + "loss": 1.2554, + "step": 13882 + }, + { + "epoch": 0.4971797947964976, + "grad_norm": 1.4239394664764404, + "learning_rate": 0.00010576730695409747, + "loss": 1.0555, + "step": 13883 + }, + { + "epoch": 0.49721560692606587, + "grad_norm": 1.85844087600708, + "learning_rate": 0.00010575572720867901, + "loss": 1.322, + "step": 13884 + }, + { + "epoch": 0.49725141905563414, + "grad_norm": 1.6631128787994385, + "learning_rate": 0.0001057441473858247, + "loss": 1.0701, + "step": 13885 + }, + { + "epoch": 0.4972872311852024, + "grad_norm": 1.548978567123413, + "learning_rate": 0.00010573256748569027, + "loss": 1.1452, + "step": 13886 + }, + { + "epoch": 0.49732304331477073, + "grad_norm": 1.7792048454284668, + "learning_rate": 0.00010572098750843155, + "loss": 1.1631, + "step": 13887 + }, + { + "epoch": 0.497358855444339, + "grad_norm": 1.3964602947235107, + "learning_rate": 0.00010570940745420433, + "loss": 1.2638, + "step": 13888 + }, + { + "epoch": 0.49739466757390727, + "grad_norm": 1.4696050882339478, + "learning_rate": 0.00010569782732316438, + "loss": 1.0954, + "step": 13889 + }, + { + "epoch": 0.4974304797034756, + "grad_norm": 1.6925228834152222, + "learning_rate": 0.00010568624711546752, + "loss": 1.0818, + "step": 13890 + }, + { + "epoch": 0.49746629183304386, + "grad_norm": 1.4731111526489258, + "learning_rate": 0.00010567466683126952, + "loss": 1.2408, + "step": 13891 + }, + { + "epoch": 0.49750210396261213, + "grad_norm": 2.1797070503234863, + "learning_rate": 0.00010566308647072624, + "loss": 1.2455, + "step": 13892 + }, + { + "epoch": 0.4975379160921804, + "grad_norm": 1.7988015413284302, + "learning_rate": 0.0001056515060339934, + "loss": 1.3439, + "step": 13893 + }, + { + "epoch": 0.4975737282217487, + "grad_norm": 1.4910448789596558, + "learning_rate": 0.00010563992552122686, + "loss": 0.9366, + "step": 13894 + }, + { + "epoch": 0.497609540351317, + "grad_norm": 1.533177137374878, + "learning_rate": 0.00010562834493258237, + "loss": 1.0234, + "step": 13895 + }, + { + "epoch": 0.49764535248088526, + "grad_norm": 1.8107879161834717, + "learning_rate": 0.00010561676426821581, + "loss": 1.1813, + "step": 13896 + }, + { + "epoch": 0.4976811646104536, + "grad_norm": 1.4899659156799316, + "learning_rate": 0.00010560518352828288, + "loss": 1.2787, + "step": 13897 + }, + { + "epoch": 0.49771697674002185, + "grad_norm": 2.279397964477539, + "learning_rate": 0.00010559360271293947, + "loss": 1.1392, + "step": 13898 + }, + { + "epoch": 0.4977527888695901, + "grad_norm": 1.67399001121521, + "learning_rate": 0.00010558202182234132, + "loss": 1.1891, + "step": 13899 + }, + { + "epoch": 0.4977886009991584, + "grad_norm": 1.595414161682129, + "learning_rate": 0.00010557044085664428, + "loss": 1.144, + "step": 13900 + }, + { + "epoch": 0.4978244131287267, + "grad_norm": 1.4120904207229614, + "learning_rate": 0.00010555885981600416, + "loss": 0.9224, + "step": 13901 + }, + { + "epoch": 0.497860225258295, + "grad_norm": 1.6231253147125244, + "learning_rate": 0.00010554727870057671, + "loss": 1.2343, + "step": 13902 + }, + { + "epoch": 0.49789603738786326, + "grad_norm": 1.6243733167648315, + "learning_rate": 0.00010553569751051782, + "loss": 1.0368, + "step": 13903 + }, + { + "epoch": 0.4979318495174316, + "grad_norm": 1.8923441171646118, + "learning_rate": 0.00010552411624598325, + "loss": 1.0315, + "step": 13904 + }, + { + "epoch": 0.49796766164699985, + "grad_norm": 1.9084713459014893, + "learning_rate": 0.00010551253490712882, + "loss": 1.1781, + "step": 13905 + }, + { + "epoch": 0.4980034737765681, + "grad_norm": 1.4363248348236084, + "learning_rate": 0.00010550095349411033, + "loss": 1.0423, + "step": 13906 + }, + { + "epoch": 0.4980392859061364, + "grad_norm": 1.4505283832550049, + "learning_rate": 0.00010548937200708365, + "loss": 1.0809, + "step": 13907 + }, + { + "epoch": 0.4980750980357047, + "grad_norm": 1.9849387407302856, + "learning_rate": 0.0001054777904462045, + "loss": 1.1657, + "step": 13908 + }, + { + "epoch": 0.498110910165273, + "grad_norm": 1.4838151931762695, + "learning_rate": 0.00010546620881162876, + "loss": 0.9567, + "step": 13909 + }, + { + "epoch": 0.49814672229484125, + "grad_norm": 1.4156662225723267, + "learning_rate": 0.00010545462710351224, + "loss": 1.1412, + "step": 13910 + }, + { + "epoch": 0.4981825344244096, + "grad_norm": 1.489721655845642, + "learning_rate": 0.00010544304532201075, + "loss": 0.8814, + "step": 13911 + }, + { + "epoch": 0.49821834655397784, + "grad_norm": 1.4600305557250977, + "learning_rate": 0.0001054314634672801, + "loss": 1.2966, + "step": 13912 + }, + { + "epoch": 0.4982541586835461, + "grad_norm": 1.3036302328109741, + "learning_rate": 0.00010541988153947609, + "loss": 1.1192, + "step": 13913 + }, + { + "epoch": 0.4982899708131144, + "grad_norm": 1.498739242553711, + "learning_rate": 0.00010540829953875462, + "loss": 1.0574, + "step": 13914 + }, + { + "epoch": 0.4983257829426827, + "grad_norm": 1.5034939050674438, + "learning_rate": 0.00010539671746527142, + "loss": 1.1223, + "step": 13915 + }, + { + "epoch": 0.498361595072251, + "grad_norm": 1.9260262250900269, + "learning_rate": 0.00010538513531918237, + "loss": 1.059, + "step": 13916 + }, + { + "epoch": 0.49839740720181924, + "grad_norm": 1.69242525100708, + "learning_rate": 0.00010537355310064323, + "loss": 1.0896, + "step": 13917 + }, + { + "epoch": 0.49843321933138757, + "grad_norm": 1.1525650024414062, + "learning_rate": 0.00010536197080980991, + "loss": 1.1464, + "step": 13918 + }, + { + "epoch": 0.49846903146095584, + "grad_norm": 1.7765785455703735, + "learning_rate": 0.00010535038844683816, + "loss": 1.2372, + "step": 13919 + }, + { + "epoch": 0.4985048435905241, + "grad_norm": 1.3321387767791748, + "learning_rate": 0.00010533880601188384, + "loss": 1.1367, + "step": 13920 + }, + { + "epoch": 0.4985406557200924, + "grad_norm": 1.755483865737915, + "learning_rate": 0.00010532722350510277, + "loss": 1.1011, + "step": 13921 + }, + { + "epoch": 0.4985764678496607, + "grad_norm": 1.3745454549789429, + "learning_rate": 0.00010531564092665079, + "loss": 1.1466, + "step": 13922 + }, + { + "epoch": 0.49861227997922897, + "grad_norm": 1.6150037050247192, + "learning_rate": 0.00010530405827668372, + "loss": 0.992, + "step": 13923 + }, + { + "epoch": 0.49864809210879724, + "grad_norm": 1.3955692052841187, + "learning_rate": 0.00010529247555535738, + "loss": 1.2364, + "step": 13924 + }, + { + "epoch": 0.49868390423836556, + "grad_norm": 1.4054596424102783, + "learning_rate": 0.00010528089276282762, + "loss": 1.2169, + "step": 13925 + }, + { + "epoch": 0.49871971636793383, + "grad_norm": 1.4992358684539795, + "learning_rate": 0.00010526930989925023, + "loss": 1.2276, + "step": 13926 + }, + { + "epoch": 0.4987555284975021, + "grad_norm": 1.4871578216552734, + "learning_rate": 0.0001052577269647811, + "loss": 1.1181, + "step": 13927 + }, + { + "epoch": 0.49879134062707037, + "grad_norm": 1.1421825885772705, + "learning_rate": 0.00010524614395957602, + "loss": 0.9658, + "step": 13928 + }, + { + "epoch": 0.4988271527566387, + "grad_norm": 2.0803539752960205, + "learning_rate": 0.00010523456088379084, + "loss": 1.1188, + "step": 13929 + }, + { + "epoch": 0.49886296488620696, + "grad_norm": 1.5746747255325317, + "learning_rate": 0.00010522297773758141, + "loss": 1.1812, + "step": 13930 + }, + { + "epoch": 0.49889877701577523, + "grad_norm": 1.3626153469085693, + "learning_rate": 0.00010521139452110354, + "loss": 1.204, + "step": 13931 + }, + { + "epoch": 0.49893458914534355, + "grad_norm": 1.9161467552185059, + "learning_rate": 0.0001051998112345131, + "loss": 0.9503, + "step": 13932 + }, + { + "epoch": 0.4989704012749118, + "grad_norm": 1.588303804397583, + "learning_rate": 0.00010518822787796587, + "loss": 1.2246, + "step": 13933 + }, + { + "epoch": 0.4990062134044801, + "grad_norm": 1.537840485572815, + "learning_rate": 0.00010517664445161775, + "loss": 1.0364, + "step": 13934 + }, + { + "epoch": 0.49904202553404836, + "grad_norm": 1.8107130527496338, + "learning_rate": 0.00010516506095562455, + "loss": 1.0451, + "step": 13935 + }, + { + "epoch": 0.4990778376636167, + "grad_norm": 1.7555510997772217, + "learning_rate": 0.00010515347739014212, + "loss": 1.0871, + "step": 13936 + }, + { + "epoch": 0.49911364979318495, + "grad_norm": 1.4125629663467407, + "learning_rate": 0.00010514189375532629, + "loss": 0.9314, + "step": 13937 + }, + { + "epoch": 0.4991494619227532, + "grad_norm": 1.5124410390853882, + "learning_rate": 0.00010513031005133293, + "loss": 0.9259, + "step": 13938 + }, + { + "epoch": 0.49918527405232155, + "grad_norm": 1.5038713216781616, + "learning_rate": 0.00010511872627831785, + "loss": 1.2553, + "step": 13939 + }, + { + "epoch": 0.4992210861818898, + "grad_norm": 1.1069097518920898, + "learning_rate": 0.00010510714243643693, + "loss": 0.9894, + "step": 13940 + }, + { + "epoch": 0.4992568983114581, + "grad_norm": 1.6683385372161865, + "learning_rate": 0.00010509555852584598, + "loss": 1.0605, + "step": 13941 + }, + { + "epoch": 0.49929271044102636, + "grad_norm": 1.382008671760559, + "learning_rate": 0.00010508397454670085, + "loss": 1.1837, + "step": 13942 + }, + { + "epoch": 0.4993285225705947, + "grad_norm": 1.8648359775543213, + "learning_rate": 0.00010507239049915742, + "loss": 1.4162, + "step": 13943 + }, + { + "epoch": 0.49936433470016295, + "grad_norm": 1.2795735597610474, + "learning_rate": 0.00010506080638337152, + "loss": 1.1074, + "step": 13944 + }, + { + "epoch": 0.4994001468297312, + "grad_norm": 1.8956758975982666, + "learning_rate": 0.000105049222199499, + "loss": 1.1999, + "step": 13945 + }, + { + "epoch": 0.49943595895929954, + "grad_norm": 1.3950077295303345, + "learning_rate": 0.0001050376379476957, + "loss": 0.9895, + "step": 13946 + }, + { + "epoch": 0.4994717710888678, + "grad_norm": 1.6319326162338257, + "learning_rate": 0.00010502605362811748, + "loss": 1.0322, + "step": 13947 + }, + { + "epoch": 0.4995075832184361, + "grad_norm": 1.4448676109313965, + "learning_rate": 0.00010501446924092018, + "loss": 1.0394, + "step": 13948 + }, + { + "epoch": 0.49954339534800435, + "grad_norm": 1.6936979293823242, + "learning_rate": 0.0001050028847862597, + "loss": 1.1927, + "step": 13949 + }, + { + "epoch": 0.4995792074775727, + "grad_norm": 1.5797710418701172, + "learning_rate": 0.00010499130026429182, + "loss": 1.1663, + "step": 13950 + }, + { + "epoch": 0.49961501960714094, + "grad_norm": 1.82871675491333, + "learning_rate": 0.00010497971567517246, + "loss": 1.0928, + "step": 13951 + }, + { + "epoch": 0.4996508317367092, + "grad_norm": 1.9071637392044067, + "learning_rate": 0.00010496813101905745, + "loss": 1.0283, + "step": 13952 + }, + { + "epoch": 0.49968664386627754, + "grad_norm": 1.473973035812378, + "learning_rate": 0.00010495654629610264, + "loss": 0.9557, + "step": 13953 + }, + { + "epoch": 0.4997224559958458, + "grad_norm": 2.1685428619384766, + "learning_rate": 0.00010494496150646387, + "loss": 1.2045, + "step": 13954 + }, + { + "epoch": 0.4997582681254141, + "grad_norm": 1.7883613109588623, + "learning_rate": 0.000104933376650297, + "loss": 1.2242, + "step": 13955 + }, + { + "epoch": 0.49979408025498234, + "grad_norm": 2.3001811504364014, + "learning_rate": 0.00010492179172775797, + "loss": 1.1037, + "step": 13956 + }, + { + "epoch": 0.49982989238455067, + "grad_norm": 1.552876591682434, + "learning_rate": 0.00010491020673900256, + "loss": 1.187, + "step": 13957 + }, + { + "epoch": 0.49986570451411894, + "grad_norm": 1.4268643856048584, + "learning_rate": 0.00010489862168418667, + "loss": 0.9256, + "step": 13958 + }, + { + "epoch": 0.4999015166436872, + "grad_norm": 1.6487617492675781, + "learning_rate": 0.00010488703656346612, + "loss": 1.2722, + "step": 13959 + }, + { + "epoch": 0.49993732877325553, + "grad_norm": 1.7078641653060913, + "learning_rate": 0.00010487545137699682, + "loss": 1.0063, + "step": 13960 + }, + { + "epoch": 0.4999731409028238, + "grad_norm": 1.756568431854248, + "learning_rate": 0.00010486386612493458, + "loss": 1.1325, + "step": 13961 + }, + { + "epoch": 0.5000089530323921, + "grad_norm": 1.5911824703216553, + "learning_rate": 0.00010485228080743532, + "loss": 0.9361, + "step": 13962 + }, + { + "epoch": 0.5000447651619604, + "grad_norm": 1.6170697212219238, + "learning_rate": 0.00010484069542465484, + "loss": 1.0464, + "step": 13963 + }, + { + "epoch": 0.5000805772915287, + "grad_norm": 1.4787074327468872, + "learning_rate": 0.00010482910997674911, + "loss": 1.1312, + "step": 13964 + }, + { + "epoch": 0.5001163894210969, + "grad_norm": 1.7114598751068115, + "learning_rate": 0.00010481752446387387, + "loss": 1.1562, + "step": 13965 + }, + { + "epoch": 0.5001522015506652, + "grad_norm": 1.432037115097046, + "learning_rate": 0.0001048059388861851, + "loss": 1.1672, + "step": 13966 + }, + { + "epoch": 0.5001880136802335, + "grad_norm": 1.6915920972824097, + "learning_rate": 0.00010479435324383861, + "loss": 1.0083, + "step": 13967 + }, + { + "epoch": 0.5002238258098017, + "grad_norm": 1.5433189868927002, + "learning_rate": 0.00010478276753699028, + "loss": 1.0964, + "step": 13968 + }, + { + "epoch": 0.5002596379393701, + "grad_norm": 1.4231839179992676, + "learning_rate": 0.00010477118176579597, + "loss": 1.062, + "step": 13969 + }, + { + "epoch": 0.5002954500689384, + "grad_norm": 1.637089490890503, + "learning_rate": 0.00010475959593041156, + "loss": 1.128, + "step": 13970 + }, + { + "epoch": 0.5003312621985067, + "grad_norm": 1.9502356052398682, + "learning_rate": 0.00010474801003099294, + "loss": 1.2173, + "step": 13971 + }, + { + "epoch": 0.5003670743280749, + "grad_norm": 1.6084601879119873, + "learning_rate": 0.00010473642406769597, + "loss": 0.965, + "step": 13972 + }, + { + "epoch": 0.5004028864576432, + "grad_norm": 1.452130675315857, + "learning_rate": 0.00010472483804067652, + "loss": 0.8237, + "step": 13973 + }, + { + "epoch": 0.5004386985872115, + "grad_norm": 1.3558834791183472, + "learning_rate": 0.00010471325195009047, + "loss": 1.2375, + "step": 13974 + }, + { + "epoch": 0.5004745107167797, + "grad_norm": 1.2988653182983398, + "learning_rate": 0.00010470166579609371, + "loss": 1.142, + "step": 13975 + }, + { + "epoch": 0.5005103228463481, + "grad_norm": 1.252532720565796, + "learning_rate": 0.0001046900795788421, + "loss": 1.057, + "step": 13976 + }, + { + "epoch": 0.5005461349759164, + "grad_norm": 1.6426969766616821, + "learning_rate": 0.00010467849329849148, + "loss": 1.1975, + "step": 13977 + }, + { + "epoch": 0.5005819471054846, + "grad_norm": 1.3974807262420654, + "learning_rate": 0.00010466690695519781, + "loss": 1.1387, + "step": 13978 + }, + { + "epoch": 0.5006177592350529, + "grad_norm": 1.5427067279815674, + "learning_rate": 0.00010465532054911689, + "loss": 1.2199, + "step": 13979 + }, + { + "epoch": 0.5006535713646212, + "grad_norm": 1.437906265258789, + "learning_rate": 0.00010464373408040467, + "loss": 1.2763, + "step": 13980 + }, + { + "epoch": 0.5006893834941895, + "grad_norm": 1.8036398887634277, + "learning_rate": 0.00010463214754921697, + "loss": 1.1656, + "step": 13981 + }, + { + "epoch": 0.5007251956237577, + "grad_norm": 2.2507805824279785, + "learning_rate": 0.00010462056095570974, + "loss": 1.2482, + "step": 13982 + }, + { + "epoch": 0.5007610077533261, + "grad_norm": 1.344923734664917, + "learning_rate": 0.00010460897430003877, + "loss": 1.0552, + "step": 13983 + }, + { + "epoch": 0.5007968198828944, + "grad_norm": 2.0101068019866943, + "learning_rate": 0.00010459738758236006, + "loss": 1.5306, + "step": 13984 + }, + { + "epoch": 0.5008326320124626, + "grad_norm": 1.3688249588012695, + "learning_rate": 0.00010458580080282938, + "loss": 1.2003, + "step": 13985 + }, + { + "epoch": 0.5008684441420309, + "grad_norm": 1.4398082494735718, + "learning_rate": 0.00010457421396160265, + "loss": 0.9946, + "step": 13986 + }, + { + "epoch": 0.5009042562715992, + "grad_norm": 1.8611685037612915, + "learning_rate": 0.00010456262705883581, + "loss": 0.9, + "step": 13987 + }, + { + "epoch": 0.5009400684011674, + "grad_norm": 1.5403368473052979, + "learning_rate": 0.0001045510400946847, + "loss": 1.1601, + "step": 13988 + }, + { + "epoch": 0.5009758805307357, + "grad_norm": 1.2801231145858765, + "learning_rate": 0.00010453945306930521, + "loss": 1.1046, + "step": 13989 + }, + { + "epoch": 0.5010116926603041, + "grad_norm": 1.5863529443740845, + "learning_rate": 0.00010452786598285323, + "loss": 1.2424, + "step": 13990 + }, + { + "epoch": 0.5010475047898724, + "grad_norm": 1.3631693124771118, + "learning_rate": 0.00010451627883548468, + "loss": 1.0477, + "step": 13991 + }, + { + "epoch": 0.5010833169194406, + "grad_norm": 1.2655340433120728, + "learning_rate": 0.00010450469162735539, + "loss": 1.0483, + "step": 13992 + }, + { + "epoch": 0.5011191290490089, + "grad_norm": 1.491312026977539, + "learning_rate": 0.00010449310435862134, + "loss": 1.1173, + "step": 13993 + }, + { + "epoch": 0.5011549411785772, + "grad_norm": 1.3643749952316284, + "learning_rate": 0.00010448151702943831, + "loss": 1.1138, + "step": 13994 + }, + { + "epoch": 0.5011907533081454, + "grad_norm": 1.64271080493927, + "learning_rate": 0.00010446992963996227, + "loss": 1.204, + "step": 13995 + }, + { + "epoch": 0.5012265654377137, + "grad_norm": 1.701359748840332, + "learning_rate": 0.00010445834219034909, + "loss": 1.2885, + "step": 13996 + }, + { + "epoch": 0.5012623775672821, + "grad_norm": 1.3747706413269043, + "learning_rate": 0.00010444675468075467, + "loss": 1.0658, + "step": 13997 + }, + { + "epoch": 0.5012981896968504, + "grad_norm": 1.234751582145691, + "learning_rate": 0.00010443516711133487, + "loss": 1.0785, + "step": 13998 + }, + { + "epoch": 0.5013340018264186, + "grad_norm": 1.779242992401123, + "learning_rate": 0.00010442357948224564, + "loss": 1.1942, + "step": 13999 + }, + { + "epoch": 0.5013698139559869, + "grad_norm": 1.492415428161621, + "learning_rate": 0.00010441199179364287, + "loss": 0.8559, + "step": 14000 + }, + { + "epoch": 0.5014056260855552, + "grad_norm": 2.090595006942749, + "learning_rate": 0.00010440040404568241, + "loss": 1.2214, + "step": 14001 + }, + { + "epoch": 0.5014414382151234, + "grad_norm": 1.3735376596450806, + "learning_rate": 0.00010438881623852026, + "loss": 1.2131, + "step": 14002 + }, + { + "epoch": 0.5014772503446917, + "grad_norm": 1.6405901908874512, + "learning_rate": 0.00010437722837231218, + "loss": 1.27, + "step": 14003 + }, + { + "epoch": 0.5015130624742601, + "grad_norm": 1.4393515586853027, + "learning_rate": 0.00010436564044721415, + "loss": 1.291, + "step": 14004 + }, + { + "epoch": 0.5015488746038284, + "grad_norm": 1.330583095550537, + "learning_rate": 0.00010435405246338205, + "loss": 1.0696, + "step": 14005 + }, + { + "epoch": 0.5015846867333966, + "grad_norm": 1.2545677423477173, + "learning_rate": 0.00010434246442097184, + "loss": 0.9991, + "step": 14006 + }, + { + "epoch": 0.5016204988629649, + "grad_norm": 1.9968000650405884, + "learning_rate": 0.00010433087632013931, + "loss": 1.1015, + "step": 14007 + }, + { + "epoch": 0.5016563109925332, + "grad_norm": 1.7296053171157837, + "learning_rate": 0.00010431928816104048, + "loss": 1.0451, + "step": 14008 + }, + { + "epoch": 0.5016921231221014, + "grad_norm": 1.9910703897476196, + "learning_rate": 0.00010430769994383116, + "loss": 1.1066, + "step": 14009 + }, + { + "epoch": 0.5017279352516697, + "grad_norm": 1.3906586170196533, + "learning_rate": 0.0001042961116686673, + "loss": 1.1145, + "step": 14010 + }, + { + "epoch": 0.5017637473812381, + "grad_norm": 2.102825403213501, + "learning_rate": 0.00010428452333570482, + "loss": 1.4164, + "step": 14011 + }, + { + "epoch": 0.5017995595108063, + "grad_norm": 1.8860812187194824, + "learning_rate": 0.0001042729349450996, + "loss": 1.1264, + "step": 14012 + }, + { + "epoch": 0.5018353716403746, + "grad_norm": 1.751267910003662, + "learning_rate": 0.00010426134649700754, + "loss": 1.0316, + "step": 14013 + }, + { + "epoch": 0.5018711837699429, + "grad_norm": 1.8104509115219116, + "learning_rate": 0.00010424975799158456, + "loss": 1.0405, + "step": 14014 + }, + { + "epoch": 0.5019069958995112, + "grad_norm": 1.4212226867675781, + "learning_rate": 0.00010423816942898659, + "loss": 1.2283, + "step": 14015 + }, + { + "epoch": 0.5019428080290794, + "grad_norm": 1.729678750038147, + "learning_rate": 0.00010422658080936947, + "loss": 1.248, + "step": 14016 + }, + { + "epoch": 0.5019786201586477, + "grad_norm": 1.5218816995620728, + "learning_rate": 0.00010421499213288919, + "loss": 1.1532, + "step": 14017 + }, + { + "epoch": 0.5020144322882161, + "grad_norm": 1.6274086236953735, + "learning_rate": 0.00010420340339970163, + "loss": 1.0987, + "step": 14018 + }, + { + "epoch": 0.5020502444177843, + "grad_norm": 1.7174677848815918, + "learning_rate": 0.0001041918146099627, + "loss": 1.2017, + "step": 14019 + }, + { + "epoch": 0.5020860565473526, + "grad_norm": 1.3492730855941772, + "learning_rate": 0.00010418022576382831, + "loss": 0.9116, + "step": 14020 + }, + { + "epoch": 0.5021218686769209, + "grad_norm": 2.415907621383667, + "learning_rate": 0.00010416863686145434, + "loss": 0.9904, + "step": 14021 + }, + { + "epoch": 0.5021576808064891, + "grad_norm": 1.4291274547576904, + "learning_rate": 0.00010415704790299678, + "loss": 1.0186, + "step": 14022 + }, + { + "epoch": 0.5021934929360574, + "grad_norm": 1.4777354001998901, + "learning_rate": 0.00010414545888861149, + "loss": 1.1277, + "step": 14023 + }, + { + "epoch": 0.5022293050656257, + "grad_norm": 1.5617023706436157, + "learning_rate": 0.0001041338698184544, + "loss": 1.0898, + "step": 14024 + }, + { + "epoch": 0.5022651171951941, + "grad_norm": 2.1644699573516846, + "learning_rate": 0.00010412228069268142, + "loss": 1.1152, + "step": 14025 + }, + { + "epoch": 0.5023009293247623, + "grad_norm": 1.4704514741897583, + "learning_rate": 0.00010411069151144848, + "loss": 1.2251, + "step": 14026 + }, + { + "epoch": 0.5023367414543306, + "grad_norm": 3.220775604248047, + "learning_rate": 0.00010409910227491146, + "loss": 1.3075, + "step": 14027 + }, + { + "epoch": 0.5023725535838989, + "grad_norm": 1.4341130256652832, + "learning_rate": 0.00010408751298322634, + "loss": 1.0173, + "step": 14028 + }, + { + "epoch": 0.5024083657134671, + "grad_norm": 1.6665929555892944, + "learning_rate": 0.00010407592363654901, + "loss": 1.1079, + "step": 14029 + }, + { + "epoch": 0.5024441778430354, + "grad_norm": 1.9801605939865112, + "learning_rate": 0.00010406433423503534, + "loss": 1.1702, + "step": 14030 + }, + { + "epoch": 0.5024799899726037, + "grad_norm": 1.5248323678970337, + "learning_rate": 0.00010405274477884135, + "loss": 0.9648, + "step": 14031 + }, + { + "epoch": 0.5025158021021721, + "grad_norm": 1.2400537729263306, + "learning_rate": 0.00010404115526812286, + "loss": 1.0932, + "step": 14032 + }, + { + "epoch": 0.5025516142317403, + "grad_norm": 1.5683361291885376, + "learning_rate": 0.00010402956570303586, + "loss": 1.1525, + "step": 14033 + }, + { + "epoch": 0.5025874263613086, + "grad_norm": 1.511968970298767, + "learning_rate": 0.00010401797608373625, + "loss": 1.2581, + "step": 14034 + }, + { + "epoch": 0.5026232384908769, + "grad_norm": 2.7152516841888428, + "learning_rate": 0.00010400638641037996, + "loss": 1.3002, + "step": 14035 + }, + { + "epoch": 0.5026590506204451, + "grad_norm": 1.4157695770263672, + "learning_rate": 0.00010399479668312288, + "loss": 1.1583, + "step": 14036 + }, + { + "epoch": 0.5026948627500134, + "grad_norm": 1.153918981552124, + "learning_rate": 0.00010398320690212102, + "loss": 1.0522, + "step": 14037 + }, + { + "epoch": 0.5027306748795817, + "grad_norm": 1.5124598741531372, + "learning_rate": 0.00010397161706753021, + "loss": 1.0381, + "step": 14038 + }, + { + "epoch": 0.50276648700915, + "grad_norm": 1.4917701482772827, + "learning_rate": 0.00010396002717950644, + "loss": 1.0775, + "step": 14039 + }, + { + "epoch": 0.5028022991387183, + "grad_norm": 1.8849823474884033, + "learning_rate": 0.00010394843723820558, + "loss": 1.0774, + "step": 14040 + }, + { + "epoch": 0.5028381112682866, + "grad_norm": 2.127854824066162, + "learning_rate": 0.00010393684724378358, + "loss": 1.0409, + "step": 14041 + }, + { + "epoch": 0.5028739233978549, + "grad_norm": 1.5723485946655273, + "learning_rate": 0.00010392525719639642, + "loss": 0.8244, + "step": 14042 + }, + { + "epoch": 0.5029097355274231, + "grad_norm": 1.4239248037338257, + "learning_rate": 0.00010391366709619994, + "loss": 1.054, + "step": 14043 + }, + { + "epoch": 0.5029455476569914, + "grad_norm": 1.4589346647262573, + "learning_rate": 0.00010390207694335017, + "loss": 1.1341, + "step": 14044 + }, + { + "epoch": 0.5029813597865597, + "grad_norm": 1.4287631511688232, + "learning_rate": 0.00010389048673800294, + "loss": 1.0458, + "step": 14045 + }, + { + "epoch": 0.503017171916128, + "grad_norm": 1.7384763956069946, + "learning_rate": 0.00010387889648031428, + "loss": 1.1807, + "step": 14046 + }, + { + "epoch": 0.5030529840456963, + "grad_norm": 1.4769114255905151, + "learning_rate": 0.00010386730617044005, + "loss": 1.0364, + "step": 14047 + }, + { + "epoch": 0.5030887961752646, + "grad_norm": 2.576192617416382, + "learning_rate": 0.0001038557158085362, + "loss": 1.2042, + "step": 14048 + }, + { + "epoch": 0.5031246083048329, + "grad_norm": 1.369301438331604, + "learning_rate": 0.00010384412539475865, + "loss": 0.926, + "step": 14049 + }, + { + "epoch": 0.5031604204344011, + "grad_norm": 1.5365663766860962, + "learning_rate": 0.00010383253492926339, + "loss": 1.2588, + "step": 14050 + }, + { + "epoch": 0.5031962325639694, + "grad_norm": 1.5549591779708862, + "learning_rate": 0.00010382094441220627, + "loss": 1.0712, + "step": 14051 + }, + { + "epoch": 0.5032320446935377, + "grad_norm": 1.7101424932479858, + "learning_rate": 0.00010380935384374331, + "loss": 1.2432, + "step": 14052 + }, + { + "epoch": 0.503267856823106, + "grad_norm": 1.5787028074264526, + "learning_rate": 0.00010379776322403039, + "loss": 1.0642, + "step": 14053 + }, + { + "epoch": 0.5033036689526743, + "grad_norm": 1.8156802654266357, + "learning_rate": 0.00010378617255322344, + "loss": 0.9165, + "step": 14054 + }, + { + "epoch": 0.5033394810822426, + "grad_norm": 1.867934226989746, + "learning_rate": 0.00010377458183147848, + "loss": 1.2475, + "step": 14055 + }, + { + "epoch": 0.5033752932118108, + "grad_norm": 1.3702327013015747, + "learning_rate": 0.00010376299105895135, + "loss": 1.0919, + "step": 14056 + }, + { + "epoch": 0.5034111053413791, + "grad_norm": 1.7061177492141724, + "learning_rate": 0.00010375140023579805, + "loss": 1.1632, + "step": 14057 + }, + { + "epoch": 0.5034469174709474, + "grad_norm": 1.6515291929244995, + "learning_rate": 0.0001037398093621745, + "loss": 1.2785, + "step": 14058 + }, + { + "epoch": 0.5034827296005157, + "grad_norm": 2.1499595642089844, + "learning_rate": 0.00010372821843823661, + "loss": 1.1468, + "step": 14059 + }, + { + "epoch": 0.503518541730084, + "grad_norm": 1.4388549327850342, + "learning_rate": 0.00010371662746414037, + "loss": 1.1339, + "step": 14060 + }, + { + "epoch": 0.5035543538596523, + "grad_norm": 1.5654526948928833, + "learning_rate": 0.00010370503644004171, + "loss": 1.1744, + "step": 14061 + }, + { + "epoch": 0.5035901659892206, + "grad_norm": 1.501944661140442, + "learning_rate": 0.00010369344536609653, + "loss": 1.236, + "step": 14062 + }, + { + "epoch": 0.5036259781187888, + "grad_norm": 1.454038143157959, + "learning_rate": 0.00010368185424246084, + "loss": 1.278, + "step": 14063 + }, + { + "epoch": 0.5036617902483571, + "grad_norm": 1.7251337766647339, + "learning_rate": 0.00010367026306929056, + "loss": 1.2814, + "step": 14064 + }, + { + "epoch": 0.5036976023779254, + "grad_norm": 1.7214398384094238, + "learning_rate": 0.00010365867184674159, + "loss": 1.0644, + "step": 14065 + }, + { + "epoch": 0.5037334145074936, + "grad_norm": 1.472713589668274, + "learning_rate": 0.00010364708057496992, + "loss": 1.1865, + "step": 14066 + }, + { + "epoch": 0.5037692266370619, + "grad_norm": 1.370149850845337, + "learning_rate": 0.00010363548925413149, + "loss": 1.0075, + "step": 14067 + }, + { + "epoch": 0.5038050387666303, + "grad_norm": 2.0751030445098877, + "learning_rate": 0.00010362389788438225, + "loss": 0.949, + "step": 14068 + }, + { + "epoch": 0.5038408508961986, + "grad_norm": 1.625069260597229, + "learning_rate": 0.00010361230646587812, + "loss": 1.06, + "step": 14069 + }, + { + "epoch": 0.5038766630257668, + "grad_norm": 1.6068646907806396, + "learning_rate": 0.00010360071499877508, + "loss": 1.0805, + "step": 14070 + }, + { + "epoch": 0.5039124751553351, + "grad_norm": 1.34037446975708, + "learning_rate": 0.00010358912348322904, + "loss": 0.9304, + "step": 14071 + }, + { + "epoch": 0.5039482872849034, + "grad_norm": 2.181539535522461, + "learning_rate": 0.00010357753191939601, + "loss": 1.3281, + "step": 14072 + }, + { + "epoch": 0.5039840994144716, + "grad_norm": 1.542353868484497, + "learning_rate": 0.0001035659403074319, + "loss": 1.3563, + "step": 14073 + }, + { + "epoch": 0.5040199115440399, + "grad_norm": 1.6201198101043701, + "learning_rate": 0.00010355434864749262, + "loss": 1.0406, + "step": 14074 + }, + { + "epoch": 0.5040557236736083, + "grad_norm": 1.5884416103363037, + "learning_rate": 0.0001035427569397342, + "loss": 0.9996, + "step": 14075 + }, + { + "epoch": 0.5040915358031766, + "grad_norm": 1.5923162698745728, + "learning_rate": 0.00010353116518431254, + "loss": 1.0844, + "step": 14076 + }, + { + "epoch": 0.5041273479327448, + "grad_norm": 1.290151596069336, + "learning_rate": 0.00010351957338138363, + "loss": 1.2632, + "step": 14077 + }, + { + "epoch": 0.5041631600623131, + "grad_norm": 1.7232643365859985, + "learning_rate": 0.00010350798153110337, + "loss": 0.983, + "step": 14078 + }, + { + "epoch": 0.5041989721918814, + "grad_norm": 1.5125858783721924, + "learning_rate": 0.00010349638963362777, + "loss": 1.1778, + "step": 14079 + }, + { + "epoch": 0.5042347843214496, + "grad_norm": 2.0402116775512695, + "learning_rate": 0.00010348479768911272, + "loss": 0.9573, + "step": 14080 + }, + { + "epoch": 0.5042705964510179, + "grad_norm": 1.8277109861373901, + "learning_rate": 0.00010347320569771428, + "loss": 1.3003, + "step": 14081 + }, + { + "epoch": 0.5043064085805863, + "grad_norm": 1.942257285118103, + "learning_rate": 0.00010346161365958829, + "loss": 0.926, + "step": 14082 + }, + { + "epoch": 0.5043422207101546, + "grad_norm": 1.4808621406555176, + "learning_rate": 0.00010345002157489074, + "loss": 1.0005, + "step": 14083 + }, + { + "epoch": 0.5043780328397228, + "grad_norm": 1.3611432313919067, + "learning_rate": 0.00010343842944377764, + "loss": 1.3611, + "step": 14084 + }, + { + "epoch": 0.5044138449692911, + "grad_norm": 1.3842544555664062, + "learning_rate": 0.00010342683726640487, + "loss": 1.1585, + "step": 14085 + }, + { + "epoch": 0.5044496570988594, + "grad_norm": 1.4685643911361694, + "learning_rate": 0.00010341524504292845, + "loss": 1.1868, + "step": 14086 + }, + { + "epoch": 0.5044854692284276, + "grad_norm": 1.605909824371338, + "learning_rate": 0.00010340365277350428, + "loss": 1.1373, + "step": 14087 + }, + { + "epoch": 0.5045212813579959, + "grad_norm": 1.4864774942398071, + "learning_rate": 0.0001033920604582884, + "loss": 1.0533, + "step": 14088 + }, + { + "epoch": 0.5045570934875643, + "grad_norm": 1.4439876079559326, + "learning_rate": 0.00010338046809743668, + "loss": 0.7921, + "step": 14089 + }, + { + "epoch": 0.5045929056171325, + "grad_norm": 1.587330937385559, + "learning_rate": 0.00010336887569110518, + "loss": 1.0673, + "step": 14090 + }, + { + "epoch": 0.5046287177467008, + "grad_norm": 1.4597392082214355, + "learning_rate": 0.00010335728323944974, + "loss": 0.944, + "step": 14091 + }, + { + "epoch": 0.5046645298762691, + "grad_norm": 1.5467630624771118, + "learning_rate": 0.00010334569074262641, + "loss": 1.1402, + "step": 14092 + }, + { + "epoch": 0.5047003420058374, + "grad_norm": 1.8656717538833618, + "learning_rate": 0.00010333409820079112, + "loss": 1.1644, + "step": 14093 + }, + { + "epoch": 0.5047361541354056, + "grad_norm": 1.416191816329956, + "learning_rate": 0.00010332250561409986, + "loss": 1.1458, + "step": 14094 + }, + { + "epoch": 0.5047719662649739, + "grad_norm": 1.323682188987732, + "learning_rate": 0.00010331091298270854, + "loss": 1.1007, + "step": 14095 + }, + { + "epoch": 0.5048077783945423, + "grad_norm": 1.8224788904190063, + "learning_rate": 0.00010329932030677316, + "loss": 1.1085, + "step": 14096 + }, + { + "epoch": 0.5048435905241105, + "grad_norm": 1.3796321153640747, + "learning_rate": 0.00010328772758644971, + "loss": 1.1146, + "step": 14097 + }, + { + "epoch": 0.5048794026536788, + "grad_norm": 1.3897795677185059, + "learning_rate": 0.00010327613482189409, + "loss": 0.9847, + "step": 14098 + }, + { + "epoch": 0.5049152147832471, + "grad_norm": 1.6908252239227295, + "learning_rate": 0.00010326454201326236, + "loss": 1.0478, + "step": 14099 + }, + { + "epoch": 0.5049510269128153, + "grad_norm": 1.652565598487854, + "learning_rate": 0.00010325294916071038, + "loss": 1.0596, + "step": 14100 + }, + { + "epoch": 0.5049868390423836, + "grad_norm": 1.516002893447876, + "learning_rate": 0.00010324135626439419, + "loss": 1.1095, + "step": 14101 + }, + { + "epoch": 0.5050226511719519, + "grad_norm": 1.6441426277160645, + "learning_rate": 0.0001032297633244697, + "loss": 1.1433, + "step": 14102 + }, + { + "epoch": 0.5050584633015203, + "grad_norm": 1.414609670639038, + "learning_rate": 0.00010321817034109293, + "loss": 0.9494, + "step": 14103 + }, + { + "epoch": 0.5050942754310885, + "grad_norm": 1.462463617324829, + "learning_rate": 0.00010320657731441982, + "loss": 1.1027, + "step": 14104 + }, + { + "epoch": 0.5051300875606568, + "grad_norm": 1.9046497344970703, + "learning_rate": 0.00010319498424460636, + "loss": 1.0104, + "step": 14105 + }, + { + "epoch": 0.5051658996902251, + "grad_norm": 2.114089012145996, + "learning_rate": 0.0001031833911318085, + "loss": 1.0506, + "step": 14106 + }, + { + "epoch": 0.5052017118197933, + "grad_norm": 1.5854692459106445, + "learning_rate": 0.00010317179797618223, + "loss": 1.1309, + "step": 14107 + }, + { + "epoch": 0.5052375239493616, + "grad_norm": 1.1583586931228638, + "learning_rate": 0.00010316020477788353, + "loss": 1.2831, + "step": 14108 + }, + { + "epoch": 0.5052733360789299, + "grad_norm": 1.5149216651916504, + "learning_rate": 0.0001031486115370683, + "loss": 1.3019, + "step": 14109 + }, + { + "epoch": 0.5053091482084983, + "grad_norm": 1.4688607454299927, + "learning_rate": 0.00010313701825389259, + "loss": 1.0868, + "step": 14110 + }, + { + "epoch": 0.5053449603380665, + "grad_norm": 1.9249656200408936, + "learning_rate": 0.00010312542492851234, + "loss": 0.9087, + "step": 14111 + }, + { + "epoch": 0.5053807724676348, + "grad_norm": 1.5690891742706299, + "learning_rate": 0.00010311383156108354, + "loss": 1.3511, + "step": 14112 + }, + { + "epoch": 0.5054165845972031, + "grad_norm": 1.5411872863769531, + "learning_rate": 0.00010310223815176215, + "loss": 1.0737, + "step": 14113 + }, + { + "epoch": 0.5054523967267713, + "grad_norm": 1.584245204925537, + "learning_rate": 0.00010309064470070414, + "loss": 1.1356, + "step": 14114 + }, + { + "epoch": 0.5054882088563396, + "grad_norm": 1.413081169128418, + "learning_rate": 0.00010307905120806549, + "loss": 1.0481, + "step": 14115 + }, + { + "epoch": 0.5055240209859079, + "grad_norm": 1.8090509176254272, + "learning_rate": 0.00010306745767400219, + "loss": 1.1306, + "step": 14116 + }, + { + "epoch": 0.5055598331154763, + "grad_norm": 1.9769338369369507, + "learning_rate": 0.0001030558640986702, + "loss": 1.212, + "step": 14117 + }, + { + "epoch": 0.5055956452450445, + "grad_norm": 1.7152245044708252, + "learning_rate": 0.0001030442704822255, + "loss": 0.9967, + "step": 14118 + }, + { + "epoch": 0.5056314573746128, + "grad_norm": 1.356695532798767, + "learning_rate": 0.00010303267682482405, + "loss": 1.0885, + "step": 14119 + }, + { + "epoch": 0.5056672695041811, + "grad_norm": 1.8967856168746948, + "learning_rate": 0.00010302108312662184, + "loss": 1.0453, + "step": 14120 + }, + { + "epoch": 0.5057030816337493, + "grad_norm": 1.8035964965820312, + "learning_rate": 0.00010300948938777491, + "loss": 1.2881, + "step": 14121 + }, + { + "epoch": 0.5057388937633176, + "grad_norm": 1.5946409702301025, + "learning_rate": 0.00010299789560843911, + "loss": 0.9896, + "step": 14122 + }, + { + "epoch": 0.5057747058928859, + "grad_norm": 1.7676112651824951, + "learning_rate": 0.00010298630178877053, + "loss": 1.1681, + "step": 14123 + }, + { + "epoch": 0.5058105180224542, + "grad_norm": 1.8820717334747314, + "learning_rate": 0.00010297470792892512, + "loss": 0.9495, + "step": 14124 + }, + { + "epoch": 0.5058463301520225, + "grad_norm": 1.7525123357772827, + "learning_rate": 0.00010296311402905884, + "loss": 1.1484, + "step": 14125 + }, + { + "epoch": 0.5058821422815908, + "grad_norm": 1.2692043781280518, + "learning_rate": 0.0001029515200893277, + "loss": 1.046, + "step": 14126 + }, + { + "epoch": 0.505917954411159, + "grad_norm": 1.581471562385559, + "learning_rate": 0.00010293992610988763, + "loss": 1.0429, + "step": 14127 + }, + { + "epoch": 0.5059537665407273, + "grad_norm": 1.7381147146224976, + "learning_rate": 0.00010292833209089467, + "loss": 1.0194, + "step": 14128 + }, + { + "epoch": 0.5059895786702956, + "grad_norm": 1.7257658243179321, + "learning_rate": 0.00010291673803250477, + "loss": 1.1332, + "step": 14129 + }, + { + "epoch": 0.5060253907998639, + "grad_norm": 1.3776700496673584, + "learning_rate": 0.00010290514393487391, + "loss": 1.1336, + "step": 14130 + }, + { + "epoch": 0.5060612029294322, + "grad_norm": 1.9883538484573364, + "learning_rate": 0.00010289354979815811, + "loss": 1.2253, + "step": 14131 + }, + { + "epoch": 0.5060970150590005, + "grad_norm": 1.7947614192962646, + "learning_rate": 0.00010288195562251332, + "loss": 1.1201, + "step": 14132 + }, + { + "epoch": 0.5061328271885688, + "grad_norm": 1.7032139301300049, + "learning_rate": 0.00010287036140809552, + "loss": 1.1715, + "step": 14133 + }, + { + "epoch": 0.506168639318137, + "grad_norm": 1.626717209815979, + "learning_rate": 0.00010285876715506076, + "loss": 1.1223, + "step": 14134 + }, + { + "epoch": 0.5062044514477053, + "grad_norm": 1.4833048582077026, + "learning_rate": 0.00010284717286356493, + "loss": 1.0101, + "step": 14135 + }, + { + "epoch": 0.5062402635772736, + "grad_norm": 1.1928553581237793, + "learning_rate": 0.00010283557853376408, + "loss": 1.0866, + "step": 14136 + }, + { + "epoch": 0.5062760757068419, + "grad_norm": 2.0788662433624268, + "learning_rate": 0.00010282398416581415, + "loss": 1.2645, + "step": 14137 + }, + { + "epoch": 0.5063118878364102, + "grad_norm": 1.4563791751861572, + "learning_rate": 0.00010281238975987118, + "loss": 1.1096, + "step": 14138 + }, + { + "epoch": 0.5063476999659785, + "grad_norm": 1.7770742177963257, + "learning_rate": 0.00010280079531609112, + "loss": 1.0125, + "step": 14139 + }, + { + "epoch": 0.5063835120955468, + "grad_norm": 1.6008529663085938, + "learning_rate": 0.00010278920083462997, + "loss": 1.2593, + "step": 14140 + }, + { + "epoch": 0.506419324225115, + "grad_norm": 1.6806246042251587, + "learning_rate": 0.00010277760631564375, + "loss": 0.9671, + "step": 14141 + }, + { + "epoch": 0.5064551363546833, + "grad_norm": 1.5059146881103516, + "learning_rate": 0.00010276601175928839, + "loss": 1.141, + "step": 14142 + }, + { + "epoch": 0.5064909484842516, + "grad_norm": 1.5139508247375488, + "learning_rate": 0.00010275441716571996, + "loss": 1.0601, + "step": 14143 + }, + { + "epoch": 0.5065267606138198, + "grad_norm": 1.4623081684112549, + "learning_rate": 0.00010274282253509436, + "loss": 1.1449, + "step": 14144 + }, + { + "epoch": 0.5065625727433882, + "grad_norm": 1.344497561454773, + "learning_rate": 0.00010273122786756762, + "loss": 1.2451, + "step": 14145 + }, + { + "epoch": 0.5065983848729565, + "grad_norm": 1.6853904724121094, + "learning_rate": 0.00010271963316329571, + "loss": 1.0351, + "step": 14146 + }, + { + "epoch": 0.5066341970025248, + "grad_norm": 1.5053244829177856, + "learning_rate": 0.00010270803842243469, + "loss": 1.3293, + "step": 14147 + }, + { + "epoch": 0.506670009132093, + "grad_norm": 1.6021615266799927, + "learning_rate": 0.00010269644364514046, + "loss": 1.1453, + "step": 14148 + }, + { + "epoch": 0.5067058212616613, + "grad_norm": 1.645701289176941, + "learning_rate": 0.0001026848488315691, + "loss": 1.1744, + "step": 14149 + }, + { + "epoch": 0.5067416333912296, + "grad_norm": 1.4829243421554565, + "learning_rate": 0.00010267325398187653, + "loss": 1.043, + "step": 14150 + }, + { + "epoch": 0.5067774455207978, + "grad_norm": 1.7403429746627808, + "learning_rate": 0.00010266165909621879, + "loss": 1.2747, + "step": 14151 + }, + { + "epoch": 0.5068132576503662, + "grad_norm": 1.60703706741333, + "learning_rate": 0.00010265006417475189, + "loss": 1.1819, + "step": 14152 + }, + { + "epoch": 0.5068490697799345, + "grad_norm": 1.5514870882034302, + "learning_rate": 0.00010263846921763174, + "loss": 1.0362, + "step": 14153 + }, + { + "epoch": 0.5068848819095028, + "grad_norm": 1.6850626468658447, + "learning_rate": 0.00010262687422501442, + "loss": 0.9781, + "step": 14154 + }, + { + "epoch": 0.506920694039071, + "grad_norm": 1.5973395109176636, + "learning_rate": 0.00010261527919705589, + "loss": 1.3848, + "step": 14155 + }, + { + "epoch": 0.5069565061686393, + "grad_norm": 2.270339012145996, + "learning_rate": 0.00010260368413391217, + "loss": 1.2933, + "step": 14156 + }, + { + "epoch": 0.5069923182982076, + "grad_norm": 1.3871304988861084, + "learning_rate": 0.0001025920890357392, + "loss": 1.0886, + "step": 14157 + }, + { + "epoch": 0.5070281304277758, + "grad_norm": 1.6101248264312744, + "learning_rate": 0.00010258049390269305, + "loss": 1.0504, + "step": 14158 + }, + { + "epoch": 0.5070639425573442, + "grad_norm": 1.4631417989730835, + "learning_rate": 0.00010256889873492966, + "loss": 1.2269, + "step": 14159 + }, + { + "epoch": 0.5070997546869125, + "grad_norm": 1.764601230621338, + "learning_rate": 0.00010255730353260507, + "loss": 0.9554, + "step": 14160 + }, + { + "epoch": 0.5071355668164808, + "grad_norm": 1.6833672523498535, + "learning_rate": 0.00010254570829587527, + "loss": 1.0194, + "step": 14161 + }, + { + "epoch": 0.507171378946049, + "grad_norm": 1.8467636108398438, + "learning_rate": 0.00010253411302489622, + "loss": 1.1358, + "step": 14162 + }, + { + "epoch": 0.5072071910756173, + "grad_norm": 1.9342877864837646, + "learning_rate": 0.00010252251771982395, + "loss": 1.2508, + "step": 14163 + }, + { + "epoch": 0.5072430032051856, + "grad_norm": 1.495714545249939, + "learning_rate": 0.00010251092238081446, + "loss": 1.2615, + "step": 14164 + }, + { + "epoch": 0.5072788153347538, + "grad_norm": 1.4256242513656616, + "learning_rate": 0.00010249932700802376, + "loss": 1.2498, + "step": 14165 + }, + { + "epoch": 0.5073146274643222, + "grad_norm": 1.680510401725769, + "learning_rate": 0.00010248773160160782, + "loss": 1.1918, + "step": 14166 + }, + { + "epoch": 0.5073504395938905, + "grad_norm": 1.3784092664718628, + "learning_rate": 0.0001024761361617227, + "loss": 1.2659, + "step": 14167 + }, + { + "epoch": 0.5073862517234587, + "grad_norm": 1.365929365158081, + "learning_rate": 0.00010246454068852431, + "loss": 1.046, + "step": 14168 + }, + { + "epoch": 0.507422063853027, + "grad_norm": 1.9217628240585327, + "learning_rate": 0.00010245294518216875, + "loss": 1.0821, + "step": 14169 + }, + { + "epoch": 0.5074578759825953, + "grad_norm": 1.7301764488220215, + "learning_rate": 0.00010244134964281195, + "loss": 1.0646, + "step": 14170 + }, + { + "epoch": 0.5074936881121636, + "grad_norm": 2.199875593185425, + "learning_rate": 0.00010242975407060995, + "loss": 0.998, + "step": 14171 + }, + { + "epoch": 0.5075295002417318, + "grad_norm": 1.2445343732833862, + "learning_rate": 0.00010241815846571874, + "loss": 1.2912, + "step": 14172 + }, + { + "epoch": 0.5075653123713002, + "grad_norm": 1.251937747001648, + "learning_rate": 0.00010240656282829433, + "loss": 1.1192, + "step": 14173 + }, + { + "epoch": 0.5076011245008685, + "grad_norm": 1.4737675189971924, + "learning_rate": 0.00010239496715849273, + "loss": 0.8773, + "step": 14174 + }, + { + "epoch": 0.5076369366304367, + "grad_norm": 1.473730444908142, + "learning_rate": 0.0001023833714564699, + "loss": 1.08, + "step": 14175 + }, + { + "epoch": 0.507672748760005, + "grad_norm": 1.5507493019104004, + "learning_rate": 0.00010237177572238192, + "loss": 1.1782, + "step": 14176 + }, + { + "epoch": 0.5077085608895733, + "grad_norm": 1.5707058906555176, + "learning_rate": 0.00010236017995638472, + "loss": 1.201, + "step": 14177 + }, + { + "epoch": 0.5077443730191415, + "grad_norm": 2.161930799484253, + "learning_rate": 0.00010234858415863439, + "loss": 1.0719, + "step": 14178 + }, + { + "epoch": 0.5077801851487098, + "grad_norm": 1.5692583322525024, + "learning_rate": 0.00010233698832928686, + "loss": 1.0204, + "step": 14179 + }, + { + "epoch": 0.5078159972782782, + "grad_norm": 1.542769193649292, + "learning_rate": 0.00010232539246849818, + "loss": 1.0543, + "step": 14180 + }, + { + "epoch": 0.5078518094078465, + "grad_norm": 1.5170994997024536, + "learning_rate": 0.00010231379657642432, + "loss": 1.0206, + "step": 14181 + }, + { + "epoch": 0.5078876215374147, + "grad_norm": 2.232719659805298, + "learning_rate": 0.00010230220065322132, + "loss": 1.0841, + "step": 14182 + }, + { + "epoch": 0.507923433666983, + "grad_norm": 1.5860161781311035, + "learning_rate": 0.00010229060469904519, + "loss": 0.9329, + "step": 14183 + }, + { + "epoch": 0.5079592457965513, + "grad_norm": 1.8027839660644531, + "learning_rate": 0.00010227900871405191, + "loss": 1.2182, + "step": 14184 + }, + { + "epoch": 0.5079950579261195, + "grad_norm": 1.5158839225769043, + "learning_rate": 0.00010226741269839755, + "loss": 1.0255, + "step": 14185 + }, + { + "epoch": 0.5080308700556878, + "grad_norm": 1.5110230445861816, + "learning_rate": 0.00010225581665223802, + "loss": 1.1112, + "step": 14186 + }, + { + "epoch": 0.5080666821852562, + "grad_norm": 1.6202472448349, + "learning_rate": 0.00010224422057572947, + "loss": 1.2593, + "step": 14187 + }, + { + "epoch": 0.5081024943148245, + "grad_norm": 1.4197348356246948, + "learning_rate": 0.00010223262446902775, + "loss": 1.2349, + "step": 14188 + }, + { + "epoch": 0.5081383064443927, + "grad_norm": 1.5767993927001953, + "learning_rate": 0.00010222102833228897, + "loss": 1.085, + "step": 14189 + }, + { + "epoch": 0.508174118573961, + "grad_norm": 1.43828284740448, + "learning_rate": 0.00010220943216566912, + "loss": 1.02, + "step": 14190 + }, + { + "epoch": 0.5082099307035293, + "grad_norm": 1.606485366821289, + "learning_rate": 0.00010219783596932421, + "loss": 1.0803, + "step": 14191 + }, + { + "epoch": 0.5082457428330975, + "grad_norm": 1.3409172296524048, + "learning_rate": 0.00010218623974341024, + "loss": 1.119, + "step": 14192 + }, + { + "epoch": 0.5082815549626658, + "grad_norm": 1.831608533859253, + "learning_rate": 0.00010217464348808323, + "loss": 1.3094, + "step": 14193 + }, + { + "epoch": 0.5083173670922342, + "grad_norm": 1.8562854528427124, + "learning_rate": 0.00010216304720349922, + "loss": 1.199, + "step": 14194 + }, + { + "epoch": 0.5083531792218025, + "grad_norm": 1.946392297744751, + "learning_rate": 0.00010215145088981419, + "loss": 1.2237, + "step": 14195 + }, + { + "epoch": 0.5083889913513707, + "grad_norm": 1.6409062147140503, + "learning_rate": 0.0001021398545471842, + "loss": 1.0058, + "step": 14196 + }, + { + "epoch": 0.508424803480939, + "grad_norm": 1.638924241065979, + "learning_rate": 0.00010212825817576519, + "loss": 1.24, + "step": 14197 + }, + { + "epoch": 0.5084606156105073, + "grad_norm": 1.2281758785247803, + "learning_rate": 0.00010211666177571322, + "loss": 1.1415, + "step": 14198 + }, + { + "epoch": 0.5084964277400755, + "grad_norm": 1.483260989189148, + "learning_rate": 0.00010210506534718427, + "loss": 1.0315, + "step": 14199 + }, + { + "epoch": 0.5085322398696438, + "grad_norm": 1.3291429281234741, + "learning_rate": 0.00010209346889033442, + "loss": 1.1244, + "step": 14200 + }, + { + "epoch": 0.5085680519992122, + "grad_norm": 1.580917477607727, + "learning_rate": 0.00010208187240531962, + "loss": 1.0043, + "step": 14201 + }, + { + "epoch": 0.5086038641287804, + "grad_norm": 1.8104311227798462, + "learning_rate": 0.00010207027589229594, + "loss": 1.28, + "step": 14202 + }, + { + "epoch": 0.5086396762583487, + "grad_norm": 1.44049072265625, + "learning_rate": 0.00010205867935141933, + "loss": 1.2157, + "step": 14203 + }, + { + "epoch": 0.508675488387917, + "grad_norm": 1.639949083328247, + "learning_rate": 0.00010204708278284587, + "loss": 1.1421, + "step": 14204 + }, + { + "epoch": 0.5087113005174853, + "grad_norm": 1.2622960805892944, + "learning_rate": 0.00010203548618673155, + "loss": 1.1307, + "step": 14205 + }, + { + "epoch": 0.5087471126470535, + "grad_norm": 1.619931697845459, + "learning_rate": 0.00010202388956323238, + "loss": 1.0652, + "step": 14206 + }, + { + "epoch": 0.5087829247766218, + "grad_norm": 1.3301764726638794, + "learning_rate": 0.0001020122929125044, + "loss": 1.142, + "step": 14207 + }, + { + "epoch": 0.5088187369061902, + "grad_norm": 1.9107455015182495, + "learning_rate": 0.00010200069623470358, + "loss": 1.0759, + "step": 14208 + }, + { + "epoch": 0.5088545490357584, + "grad_norm": 1.789570689201355, + "learning_rate": 0.00010198909952998603, + "loss": 1.2673, + "step": 14209 + }, + { + "epoch": 0.5088903611653267, + "grad_norm": 1.2914844751358032, + "learning_rate": 0.00010197750279850767, + "loss": 1.1439, + "step": 14210 + }, + { + "epoch": 0.508926173294895, + "grad_norm": 1.6152604818344116, + "learning_rate": 0.00010196590604042457, + "loss": 0.9914, + "step": 14211 + }, + { + "epoch": 0.5089619854244632, + "grad_norm": 1.8375219106674194, + "learning_rate": 0.00010195430925589274, + "loss": 1.1941, + "step": 14212 + }, + { + "epoch": 0.5089977975540315, + "grad_norm": 1.5907158851623535, + "learning_rate": 0.00010194271244506821, + "loss": 1.1939, + "step": 14213 + }, + { + "epoch": 0.5090336096835998, + "grad_norm": 1.4120042324066162, + "learning_rate": 0.00010193111560810697, + "loss": 1.1914, + "step": 14214 + }, + { + "epoch": 0.5090694218131682, + "grad_norm": 1.7162079811096191, + "learning_rate": 0.00010191951874516508, + "loss": 0.9418, + "step": 14215 + }, + { + "epoch": 0.5091052339427364, + "grad_norm": 1.7538617849349976, + "learning_rate": 0.00010190792185639855, + "loss": 1.155, + "step": 14216 + }, + { + "epoch": 0.5091410460723047, + "grad_norm": 1.5106538534164429, + "learning_rate": 0.00010189632494196335, + "loss": 1.0406, + "step": 14217 + }, + { + "epoch": 0.509176858201873, + "grad_norm": 1.6115390062332153, + "learning_rate": 0.00010188472800201558, + "loss": 1.1013, + "step": 14218 + }, + { + "epoch": 0.5092126703314412, + "grad_norm": 1.3911632299423218, + "learning_rate": 0.00010187313103671122, + "loss": 1.0984, + "step": 14219 + }, + { + "epoch": 0.5092484824610095, + "grad_norm": 1.3721168041229248, + "learning_rate": 0.00010186153404620628, + "loss": 1.1673, + "step": 14220 + }, + { + "epoch": 0.5092842945905778, + "grad_norm": 1.4348002672195435, + "learning_rate": 0.00010184993703065682, + "loss": 1.045, + "step": 14221 + }, + { + "epoch": 0.5093201067201462, + "grad_norm": 1.4781640768051147, + "learning_rate": 0.00010183833999021884, + "loss": 1.108, + "step": 14222 + }, + { + "epoch": 0.5093559188497144, + "grad_norm": 2.2156734466552734, + "learning_rate": 0.00010182674292504837, + "loss": 1.1553, + "step": 14223 + }, + { + "epoch": 0.5093917309792827, + "grad_norm": 1.8053174018859863, + "learning_rate": 0.00010181514583530141, + "loss": 1.088, + "step": 14224 + }, + { + "epoch": 0.509427543108851, + "grad_norm": 1.9322000741958618, + "learning_rate": 0.00010180354872113403, + "loss": 1.323, + "step": 14225 + }, + { + "epoch": 0.5094633552384192, + "grad_norm": 1.4627728462219238, + "learning_rate": 0.0001017919515827022, + "loss": 1.1444, + "step": 14226 + }, + { + "epoch": 0.5094991673679875, + "grad_norm": 1.6191974878311157, + "learning_rate": 0.000101780354420162, + "loss": 1.1873, + "step": 14227 + }, + { + "epoch": 0.5095349794975558, + "grad_norm": 1.135184407234192, + "learning_rate": 0.00010176875723366941, + "loss": 1.0574, + "step": 14228 + }, + { + "epoch": 0.5095707916271242, + "grad_norm": 1.5819789171218872, + "learning_rate": 0.00010175716002338049, + "loss": 1.0735, + "step": 14229 + }, + { + "epoch": 0.5096066037566924, + "grad_norm": 1.469929814338684, + "learning_rate": 0.00010174556278945123, + "loss": 0.9757, + "step": 14230 + }, + { + "epoch": 0.5096424158862607, + "grad_norm": 1.5239301919937134, + "learning_rate": 0.00010173396553203771, + "loss": 1.0869, + "step": 14231 + }, + { + "epoch": 0.509678228015829, + "grad_norm": 2.6175153255462646, + "learning_rate": 0.00010172236825129588, + "loss": 1.1216, + "step": 14232 + }, + { + "epoch": 0.5097140401453972, + "grad_norm": 1.601203203201294, + "learning_rate": 0.00010171077094738183, + "loss": 1.1719, + "step": 14233 + }, + { + "epoch": 0.5097498522749655, + "grad_norm": 1.6799286603927612, + "learning_rate": 0.00010169917362045154, + "loss": 1.0126, + "step": 14234 + }, + { + "epoch": 0.5097856644045338, + "grad_norm": 1.4862207174301147, + "learning_rate": 0.00010168757627066105, + "loss": 1.1029, + "step": 14235 + }, + { + "epoch": 0.5098214765341021, + "grad_norm": 1.7457611560821533, + "learning_rate": 0.00010167597889816644, + "loss": 1.2637, + "step": 14236 + }, + { + "epoch": 0.5098572886636704, + "grad_norm": 1.3447980880737305, + "learning_rate": 0.00010166438150312367, + "loss": 1.2362, + "step": 14237 + }, + { + "epoch": 0.5098931007932387, + "grad_norm": 1.7206381559371948, + "learning_rate": 0.00010165278408568881, + "loss": 1.0886, + "step": 14238 + }, + { + "epoch": 0.509928912922807, + "grad_norm": 1.4297466278076172, + "learning_rate": 0.00010164118664601785, + "loss": 1.1906, + "step": 14239 + }, + { + "epoch": 0.5099647250523752, + "grad_norm": 1.5093291997909546, + "learning_rate": 0.0001016295891842669, + "loss": 1.2033, + "step": 14240 + }, + { + "epoch": 0.5100005371819435, + "grad_norm": 1.2976919412612915, + "learning_rate": 0.00010161799170059187, + "loss": 1.1199, + "step": 14241 + }, + { + "epoch": 0.5100363493115118, + "grad_norm": 1.4606363773345947, + "learning_rate": 0.00010160639419514888, + "loss": 0.9139, + "step": 14242 + }, + { + "epoch": 0.5100721614410801, + "grad_norm": 1.2436596155166626, + "learning_rate": 0.00010159479666809388, + "loss": 1.039, + "step": 14243 + }, + { + "epoch": 0.5101079735706484, + "grad_norm": 1.3766981363296509, + "learning_rate": 0.00010158319911958301, + "loss": 1.1144, + "step": 14244 + }, + { + "epoch": 0.5101437857002167, + "grad_norm": 2.0251688957214355, + "learning_rate": 0.00010157160154977219, + "loss": 1.0489, + "step": 14245 + }, + { + "epoch": 0.510179597829785, + "grad_norm": 1.5959560871124268, + "learning_rate": 0.00010156000395881752, + "loss": 1.1921, + "step": 14246 + }, + { + "epoch": 0.5102154099593532, + "grad_norm": 1.4763717651367188, + "learning_rate": 0.000101548406346875, + "loss": 1.1551, + "step": 14247 + }, + { + "epoch": 0.5102512220889215, + "grad_norm": 2.1402125358581543, + "learning_rate": 0.00010153680871410065, + "loss": 1.1653, + "step": 14248 + }, + { + "epoch": 0.5102870342184898, + "grad_norm": 1.5426734685897827, + "learning_rate": 0.00010152521106065058, + "loss": 1.0774, + "step": 14249 + }, + { + "epoch": 0.5103228463480581, + "grad_norm": 1.8861578702926636, + "learning_rate": 0.00010151361338668072, + "loss": 1.3145, + "step": 14250 + }, + { + "epoch": 0.5103586584776264, + "grad_norm": 1.5839565992355347, + "learning_rate": 0.00010150201569234717, + "loss": 0.9835, + "step": 14251 + }, + { + "epoch": 0.5103944706071947, + "grad_norm": 1.905541181564331, + "learning_rate": 0.0001014904179778059, + "loss": 1.2697, + "step": 14252 + }, + { + "epoch": 0.5104302827367629, + "grad_norm": 1.6010770797729492, + "learning_rate": 0.000101478820243213, + "loss": 1.36, + "step": 14253 + }, + { + "epoch": 0.5104660948663312, + "grad_norm": 1.7368193864822388, + "learning_rate": 0.00010146722248872446, + "loss": 1.1369, + "step": 14254 + }, + { + "epoch": 0.5105019069958995, + "grad_norm": 1.6837491989135742, + "learning_rate": 0.00010145562471449638, + "loss": 1.0778, + "step": 14255 + }, + { + "epoch": 0.5105377191254677, + "grad_norm": 1.4082506895065308, + "learning_rate": 0.00010144402692068472, + "loss": 1.1571, + "step": 14256 + }, + { + "epoch": 0.5105735312550361, + "grad_norm": 1.9294952154159546, + "learning_rate": 0.00010143242910744555, + "loss": 1.2056, + "step": 14257 + }, + { + "epoch": 0.5106093433846044, + "grad_norm": 1.6119173765182495, + "learning_rate": 0.00010142083127493489, + "loss": 1.1514, + "step": 14258 + }, + { + "epoch": 0.5106451555141727, + "grad_norm": 1.625075340270996, + "learning_rate": 0.00010140923342330875, + "loss": 1.1627, + "step": 14259 + }, + { + "epoch": 0.5106809676437409, + "grad_norm": 1.9540650844573975, + "learning_rate": 0.00010139763555272323, + "loss": 1.4076, + "step": 14260 + }, + { + "epoch": 0.5107167797733092, + "grad_norm": 1.539080023765564, + "learning_rate": 0.0001013860376633343, + "loss": 0.8856, + "step": 14261 + }, + { + "epoch": 0.5107525919028775, + "grad_norm": 1.356128215789795, + "learning_rate": 0.00010137443975529804, + "loss": 0.9629, + "step": 14262 + }, + { + "epoch": 0.5107884040324457, + "grad_norm": 1.4488691091537476, + "learning_rate": 0.00010136284182877045, + "loss": 1.0145, + "step": 14263 + }, + { + "epoch": 0.5108242161620141, + "grad_norm": 1.6027289628982544, + "learning_rate": 0.0001013512438839076, + "loss": 1.1901, + "step": 14264 + }, + { + "epoch": 0.5108600282915824, + "grad_norm": 1.2773905992507935, + "learning_rate": 0.00010133964592086547, + "loss": 1.1218, + "step": 14265 + }, + { + "epoch": 0.5108958404211507, + "grad_norm": 1.9166256189346313, + "learning_rate": 0.00010132804793980018, + "loss": 1.0232, + "step": 14266 + }, + { + "epoch": 0.5109316525507189, + "grad_norm": 1.3467785120010376, + "learning_rate": 0.0001013164499408677, + "loss": 1.251, + "step": 14267 + }, + { + "epoch": 0.5109674646802872, + "grad_norm": 1.5313332080841064, + "learning_rate": 0.00010130485192422408, + "loss": 1.0057, + "step": 14268 + }, + { + "epoch": 0.5110032768098555, + "grad_norm": 1.4223132133483887, + "learning_rate": 0.00010129325389002536, + "loss": 1.1182, + "step": 14269 + }, + { + "epoch": 0.5110390889394237, + "grad_norm": 1.4805747270584106, + "learning_rate": 0.00010128165583842757, + "loss": 1.1727, + "step": 14270 + }, + { + "epoch": 0.5110749010689921, + "grad_norm": 1.5605409145355225, + "learning_rate": 0.00010127005776958676, + "loss": 1.0973, + "step": 14271 + }, + { + "epoch": 0.5111107131985604, + "grad_norm": 1.4496806859970093, + "learning_rate": 0.00010125845968365895, + "loss": 1.0319, + "step": 14272 + }, + { + "epoch": 0.5111465253281287, + "grad_norm": 1.8770891427993774, + "learning_rate": 0.00010124686158080021, + "loss": 1.111, + "step": 14273 + }, + { + "epoch": 0.5111823374576969, + "grad_norm": 1.4584888219833374, + "learning_rate": 0.00010123526346116654, + "loss": 1.2029, + "step": 14274 + }, + { + "epoch": 0.5112181495872652, + "grad_norm": 1.3978583812713623, + "learning_rate": 0.00010122366532491403, + "loss": 0.9542, + "step": 14275 + }, + { + "epoch": 0.5112539617168335, + "grad_norm": 1.1734704971313477, + "learning_rate": 0.00010121206717219865, + "loss": 1.1161, + "step": 14276 + }, + { + "epoch": 0.5112897738464017, + "grad_norm": 1.757991075515747, + "learning_rate": 0.00010120046900317646, + "loss": 1.2426, + "step": 14277 + }, + { + "epoch": 0.5113255859759701, + "grad_norm": 1.3305968046188354, + "learning_rate": 0.00010118887081800352, + "loss": 1.1464, + "step": 14278 + }, + { + "epoch": 0.5113613981055384, + "grad_norm": 1.473546028137207, + "learning_rate": 0.00010117727261683585, + "loss": 1.0466, + "step": 14279 + }, + { + "epoch": 0.5113972102351066, + "grad_norm": 2.280439615249634, + "learning_rate": 0.00010116567439982952, + "loss": 1.3931, + "step": 14280 + }, + { + "epoch": 0.5114330223646749, + "grad_norm": 1.4052388668060303, + "learning_rate": 0.0001011540761671405, + "loss": 0.8854, + "step": 14281 + }, + { + "epoch": 0.5114688344942432, + "grad_norm": 1.4222018718719482, + "learning_rate": 0.00010114247791892491, + "loss": 1.08, + "step": 14282 + }, + { + "epoch": 0.5115046466238115, + "grad_norm": 1.7985014915466309, + "learning_rate": 0.00010113087965533874, + "loss": 1.1542, + "step": 14283 + }, + { + "epoch": 0.5115404587533797, + "grad_norm": 1.5433368682861328, + "learning_rate": 0.00010111928137653808, + "loss": 1.2601, + "step": 14284 + }, + { + "epoch": 0.5115762708829481, + "grad_norm": 1.4170889854431152, + "learning_rate": 0.00010110768308267889, + "loss": 0.9437, + "step": 14285 + }, + { + "epoch": 0.5116120830125164, + "grad_norm": 1.5162243843078613, + "learning_rate": 0.00010109608477391725, + "loss": 1.1659, + "step": 14286 + }, + { + "epoch": 0.5116478951420846, + "grad_norm": 2.0926756858825684, + "learning_rate": 0.00010108448645040919, + "loss": 1.3099, + "step": 14287 + }, + { + "epoch": 0.5116837072716529, + "grad_norm": 1.6155922412872314, + "learning_rate": 0.00010107288811231081, + "loss": 0.9747, + "step": 14288 + }, + { + "epoch": 0.5117195194012212, + "grad_norm": 1.6054718494415283, + "learning_rate": 0.00010106128975977809, + "loss": 1.0835, + "step": 14289 + }, + { + "epoch": 0.5117553315307894, + "grad_norm": 1.6929752826690674, + "learning_rate": 0.00010104969139296705, + "loss": 1.2466, + "step": 14290 + }, + { + "epoch": 0.5117911436603577, + "grad_norm": 1.4249944686889648, + "learning_rate": 0.00010103809301203382, + "loss": 1.0221, + "step": 14291 + }, + { + "epoch": 0.5118269557899261, + "grad_norm": 1.4383646249771118, + "learning_rate": 0.00010102649461713434, + "loss": 1.112, + "step": 14292 + }, + { + "epoch": 0.5118627679194944, + "grad_norm": 1.3567750453948975, + "learning_rate": 0.00010101489620842475, + "loss": 1.0545, + "step": 14293 + }, + { + "epoch": 0.5118985800490626, + "grad_norm": 1.6957590579986572, + "learning_rate": 0.00010100329778606101, + "loss": 1.2492, + "step": 14294 + }, + { + "epoch": 0.5119343921786309, + "grad_norm": 1.4024572372436523, + "learning_rate": 0.0001009916993501992, + "loss": 1.0217, + "step": 14295 + }, + { + "epoch": 0.5119702043081992, + "grad_norm": 1.5625803470611572, + "learning_rate": 0.00010098010090099532, + "loss": 1.0471, + "step": 14296 + }, + { + "epoch": 0.5120060164377674, + "grad_norm": 1.468648910522461, + "learning_rate": 0.00010096850243860549, + "loss": 0.9707, + "step": 14297 + }, + { + "epoch": 0.5120418285673357, + "grad_norm": 1.3958230018615723, + "learning_rate": 0.00010095690396318569, + "loss": 1.1415, + "step": 14298 + }, + { + "epoch": 0.5120776406969041, + "grad_norm": 2.020313262939453, + "learning_rate": 0.00010094530547489201, + "loss": 1.3445, + "step": 14299 + }, + { + "epoch": 0.5121134528264724, + "grad_norm": 1.5911704301834106, + "learning_rate": 0.0001009337069738804, + "loss": 1.1294, + "step": 14300 + }, + { + "epoch": 0.5121492649560406, + "grad_norm": 1.4705065488815308, + "learning_rate": 0.00010092210846030703, + "loss": 1.0685, + "step": 14301 + }, + { + "epoch": 0.5121850770856089, + "grad_norm": 1.452013611793518, + "learning_rate": 0.00010091050993432787, + "loss": 1.1117, + "step": 14302 + }, + { + "epoch": 0.5122208892151772, + "grad_norm": 2.0908918380737305, + "learning_rate": 0.00010089891139609895, + "loss": 1.1176, + "step": 14303 + }, + { + "epoch": 0.5122567013447454, + "grad_norm": 1.5008336305618286, + "learning_rate": 0.00010088731284577636, + "loss": 1.1627, + "step": 14304 + }, + { + "epoch": 0.5122925134743137, + "grad_norm": 1.3626724481582642, + "learning_rate": 0.0001008757142835161, + "loss": 0.9733, + "step": 14305 + }, + { + "epoch": 0.5123283256038821, + "grad_norm": 1.4199378490447998, + "learning_rate": 0.00010086411570947424, + "loss": 1.3152, + "step": 14306 + }, + { + "epoch": 0.5123641377334504, + "grad_norm": 1.2120038270950317, + "learning_rate": 0.0001008525171238068, + "loss": 1.2125, + "step": 14307 + }, + { + "epoch": 0.5123999498630186, + "grad_norm": 1.5177483558654785, + "learning_rate": 0.00010084091852666988, + "loss": 0.9787, + "step": 14308 + }, + { + "epoch": 0.5124357619925869, + "grad_norm": 1.4882084131240845, + "learning_rate": 0.00010082931991821945, + "loss": 1.083, + "step": 14309 + }, + { + "epoch": 0.5124715741221552, + "grad_norm": 1.3016273975372314, + "learning_rate": 0.00010081772129861163, + "loss": 1.1898, + "step": 14310 + }, + { + "epoch": 0.5125073862517234, + "grad_norm": 1.659900188446045, + "learning_rate": 0.00010080612266800241, + "loss": 1.1417, + "step": 14311 + }, + { + "epoch": 0.5125431983812917, + "grad_norm": 1.3952877521514893, + "learning_rate": 0.0001007945240265478, + "loss": 1.2935, + "step": 14312 + }, + { + "epoch": 0.5125790105108601, + "grad_norm": 1.2920141220092773, + "learning_rate": 0.00010078292537440397, + "loss": 1.4783, + "step": 14313 + }, + { + "epoch": 0.5126148226404283, + "grad_norm": 1.6401501893997192, + "learning_rate": 0.00010077132671172685, + "loss": 1.116, + "step": 14314 + }, + { + "epoch": 0.5126506347699966, + "grad_norm": 1.43374764919281, + "learning_rate": 0.00010075972803867254, + "loss": 1.0944, + "step": 14315 + }, + { + "epoch": 0.5126864468995649, + "grad_norm": 1.3668228387832642, + "learning_rate": 0.00010074812935539703, + "loss": 1.0844, + "step": 14316 + }, + { + "epoch": 0.5127222590291332, + "grad_norm": 1.370374321937561, + "learning_rate": 0.00010073653066205644, + "loss": 1.1999, + "step": 14317 + }, + { + "epoch": 0.5127580711587014, + "grad_norm": 2.7051987648010254, + "learning_rate": 0.00010072493195880676, + "loss": 0.9767, + "step": 14318 + }, + { + "epoch": 0.5127938832882697, + "grad_norm": 1.2578970193862915, + "learning_rate": 0.00010071333324580408, + "loss": 1.0557, + "step": 14319 + }, + { + "epoch": 0.5128296954178381, + "grad_norm": 1.2708408832550049, + "learning_rate": 0.00010070173452320442, + "loss": 0.9545, + "step": 14320 + }, + { + "epoch": 0.5128655075474063, + "grad_norm": 1.7913739681243896, + "learning_rate": 0.0001006901357911638, + "loss": 1.0425, + "step": 14321 + }, + { + "epoch": 0.5129013196769746, + "grad_norm": 1.4483264684677124, + "learning_rate": 0.00010067853704983832, + "loss": 1.0114, + "step": 14322 + }, + { + "epoch": 0.5129371318065429, + "grad_norm": 1.5723859071731567, + "learning_rate": 0.00010066693829938398, + "loss": 1.239, + "step": 14323 + }, + { + "epoch": 0.5129729439361111, + "grad_norm": 1.5404258966445923, + "learning_rate": 0.00010065533953995688, + "loss": 1.1666, + "step": 14324 + }, + { + "epoch": 0.5130087560656794, + "grad_norm": 2.333956003189087, + "learning_rate": 0.00010064374077171296, + "loss": 1.3022, + "step": 14325 + }, + { + "epoch": 0.5130445681952477, + "grad_norm": 1.6418137550354004, + "learning_rate": 0.00010063214199480842, + "loss": 1.1543, + "step": 14326 + }, + { + "epoch": 0.5130803803248161, + "grad_norm": 1.4245193004608154, + "learning_rate": 0.00010062054320939916, + "loss": 0.8534, + "step": 14327 + }, + { + "epoch": 0.5131161924543843, + "grad_norm": 1.3180842399597168, + "learning_rate": 0.00010060894441564135, + "loss": 1.2247, + "step": 14328 + }, + { + "epoch": 0.5131520045839526, + "grad_norm": 1.5490387678146362, + "learning_rate": 0.00010059734561369095, + "loss": 1.1321, + "step": 14329 + }, + { + "epoch": 0.5131878167135209, + "grad_norm": 1.6262933015823364, + "learning_rate": 0.00010058574680370403, + "loss": 1.1523, + "step": 14330 + }, + { + "epoch": 0.5132236288430891, + "grad_norm": 1.4297395944595337, + "learning_rate": 0.00010057414798583664, + "loss": 1.1294, + "step": 14331 + }, + { + "epoch": 0.5132594409726574, + "grad_norm": 1.4127672910690308, + "learning_rate": 0.00010056254916024483, + "loss": 1.026, + "step": 14332 + }, + { + "epoch": 0.5132952531022257, + "grad_norm": 1.8824166059494019, + "learning_rate": 0.00010055095032708466, + "loss": 1.1356, + "step": 14333 + }, + { + "epoch": 0.5133310652317941, + "grad_norm": 1.5422590970993042, + "learning_rate": 0.00010053935148651214, + "loss": 1.0512, + "step": 14334 + }, + { + "epoch": 0.5133668773613623, + "grad_norm": 1.7909934520721436, + "learning_rate": 0.00010052775263868337, + "loss": 0.9891, + "step": 14335 + }, + { + "epoch": 0.5134026894909306, + "grad_norm": 1.578864574432373, + "learning_rate": 0.00010051615378375434, + "loss": 1.2264, + "step": 14336 + }, + { + "epoch": 0.5134385016204989, + "grad_norm": 1.3180867433547974, + "learning_rate": 0.00010050455492188118, + "loss": 1.2879, + "step": 14337 + }, + { + "epoch": 0.5134743137500671, + "grad_norm": 1.8989768028259277, + "learning_rate": 0.00010049295605321984, + "loss": 0.9364, + "step": 14338 + }, + { + "epoch": 0.5135101258796354, + "grad_norm": 1.759603500366211, + "learning_rate": 0.00010048135717792641, + "loss": 0.9802, + "step": 14339 + }, + { + "epoch": 0.5135459380092037, + "grad_norm": 1.9950302839279175, + "learning_rate": 0.00010046975829615695, + "loss": 1.2218, + "step": 14340 + }, + { + "epoch": 0.513581750138772, + "grad_norm": 1.511412501335144, + "learning_rate": 0.00010045815940806751, + "loss": 1.1363, + "step": 14341 + }, + { + "epoch": 0.5136175622683403, + "grad_norm": 1.414462685585022, + "learning_rate": 0.00010044656051381411, + "loss": 1.0621, + "step": 14342 + }, + { + "epoch": 0.5136533743979086, + "grad_norm": 1.74225652217865, + "learning_rate": 0.00010043496161355282, + "loss": 1.3284, + "step": 14343 + }, + { + "epoch": 0.5136891865274769, + "grad_norm": 1.4025025367736816, + "learning_rate": 0.00010042336270743968, + "loss": 1.2726, + "step": 14344 + }, + { + "epoch": 0.5137249986570451, + "grad_norm": 1.659119963645935, + "learning_rate": 0.00010041176379563073, + "loss": 1.1097, + "step": 14345 + }, + { + "epoch": 0.5137608107866134, + "grad_norm": 1.4132609367370605, + "learning_rate": 0.00010040016487828208, + "loss": 1.1372, + "step": 14346 + }, + { + "epoch": 0.5137966229161817, + "grad_norm": 1.7515811920166016, + "learning_rate": 0.00010038856595554967, + "loss": 1.0772, + "step": 14347 + }, + { + "epoch": 0.51383243504575, + "grad_norm": 1.716480016708374, + "learning_rate": 0.00010037696702758963, + "loss": 1.0635, + "step": 14348 + }, + { + "epoch": 0.5138682471753183, + "grad_norm": 1.3252458572387695, + "learning_rate": 0.00010036536809455796, + "loss": 1.0412, + "step": 14349 + }, + { + "epoch": 0.5139040593048866, + "grad_norm": 2.4141955375671387, + "learning_rate": 0.00010035376915661076, + "loss": 1.3803, + "step": 14350 + }, + { + "epoch": 0.5139398714344549, + "grad_norm": 1.6653963327407837, + "learning_rate": 0.00010034217021390404, + "loss": 1.0237, + "step": 14351 + }, + { + "epoch": 0.5139756835640231, + "grad_norm": 1.8547066450119019, + "learning_rate": 0.00010033057126659388, + "loss": 1.3181, + "step": 14352 + }, + { + "epoch": 0.5140114956935914, + "grad_norm": 1.5042412281036377, + "learning_rate": 0.0001003189723148363, + "loss": 1.1128, + "step": 14353 + }, + { + "epoch": 0.5140473078231597, + "grad_norm": 1.383740782737732, + "learning_rate": 0.00010030737335878735, + "loss": 1.3609, + "step": 14354 + }, + { + "epoch": 0.514083119952728, + "grad_norm": 1.5293803215026855, + "learning_rate": 0.00010029577439860312, + "loss": 1.0144, + "step": 14355 + }, + { + "epoch": 0.5141189320822963, + "grad_norm": 1.5700100660324097, + "learning_rate": 0.00010028417543443958, + "loss": 1.1929, + "step": 14356 + }, + { + "epoch": 0.5141547442118646, + "grad_norm": 1.588182806968689, + "learning_rate": 0.00010027257646645285, + "loss": 1.2463, + "step": 14357 + }, + { + "epoch": 0.5141905563414328, + "grad_norm": 1.4542715549468994, + "learning_rate": 0.00010026097749479895, + "loss": 1.0279, + "step": 14358 + }, + { + "epoch": 0.5142263684710011, + "grad_norm": 2.728816509246826, + "learning_rate": 0.00010024937851963394, + "loss": 0.9738, + "step": 14359 + }, + { + "epoch": 0.5142621806005694, + "grad_norm": 1.5493226051330566, + "learning_rate": 0.00010023777954111384, + "loss": 0.9046, + "step": 14360 + }, + { + "epoch": 0.5142979927301377, + "grad_norm": 1.6187260150909424, + "learning_rate": 0.00010022618055939477, + "loss": 1.2508, + "step": 14361 + }, + { + "epoch": 0.514333804859706, + "grad_norm": 1.5221320390701294, + "learning_rate": 0.00010021458157463268, + "loss": 1.1193, + "step": 14362 + }, + { + "epoch": 0.5143696169892743, + "grad_norm": 1.465072512626648, + "learning_rate": 0.0001002029825869837, + "loss": 1.2489, + "step": 14363 + }, + { + "epoch": 0.5144054291188426, + "grad_norm": 1.472633719444275, + "learning_rate": 0.00010019138359660387, + "loss": 1.2512, + "step": 14364 + }, + { + "epoch": 0.5144412412484108, + "grad_norm": 2.5336129665374756, + "learning_rate": 0.00010017978460364919, + "loss": 1.3397, + "step": 14365 + }, + { + "epoch": 0.5144770533779791, + "grad_norm": 1.540315866470337, + "learning_rate": 0.00010016818560827577, + "loss": 1.0898, + "step": 14366 + }, + { + "epoch": 0.5145128655075474, + "grad_norm": 1.4402724504470825, + "learning_rate": 0.00010015658661063957, + "loss": 1.107, + "step": 14367 + }, + { + "epoch": 0.5145486776371156, + "grad_norm": 1.5323071479797363, + "learning_rate": 0.00010014498761089677, + "loss": 1.3256, + "step": 14368 + }, + { + "epoch": 0.514584489766684, + "grad_norm": 1.475831151008606, + "learning_rate": 0.0001001333886092033, + "loss": 1.0398, + "step": 14369 + }, + { + "epoch": 0.5146203018962523, + "grad_norm": 1.486037254333496, + "learning_rate": 0.00010012178960571527, + "loss": 1.3743, + "step": 14370 + }, + { + "epoch": 0.5146561140258206, + "grad_norm": 1.263537049293518, + "learning_rate": 0.00010011019060058873, + "loss": 1.1908, + "step": 14371 + }, + { + "epoch": 0.5146919261553888, + "grad_norm": 1.2636455297470093, + "learning_rate": 0.00010009859159397974, + "loss": 1.1623, + "step": 14372 + }, + { + "epoch": 0.5147277382849571, + "grad_norm": 1.4388104677200317, + "learning_rate": 0.00010008699258604429, + "loss": 0.9357, + "step": 14373 + }, + { + "epoch": 0.5147635504145254, + "grad_norm": 1.4411470890045166, + "learning_rate": 0.00010007539357693845, + "loss": 1.0953, + "step": 14374 + }, + { + "epoch": 0.5147993625440936, + "grad_norm": 1.5150206089019775, + "learning_rate": 0.00010006379456681834, + "loss": 1.052, + "step": 14375 + }, + { + "epoch": 0.514835174673662, + "grad_norm": 1.4655386209487915, + "learning_rate": 0.00010005219555583991, + "loss": 1.3039, + "step": 14376 + }, + { + "epoch": 0.5148709868032303, + "grad_norm": 1.2776072025299072, + "learning_rate": 0.00010004059654415927, + "loss": 1.2674, + "step": 14377 + }, + { + "epoch": 0.5149067989327986, + "grad_norm": 1.67658269405365, + "learning_rate": 0.00010002899753193246, + "loss": 1.1422, + "step": 14378 + }, + { + "epoch": 0.5149426110623668, + "grad_norm": 1.7283084392547607, + "learning_rate": 0.00010001739851931553, + "loss": 1.2852, + "step": 14379 + }, + { + "epoch": 0.5149784231919351, + "grad_norm": 1.5214954614639282, + "learning_rate": 0.00010000579950646452, + "loss": 1.235, + "step": 14380 + }, + { + "epoch": 0.5150142353215034, + "grad_norm": 1.5363459587097168, + "learning_rate": 9.999420049353549e-05, + "loss": 1.2294, + "step": 14381 + }, + { + "epoch": 0.5150500474510716, + "grad_norm": 1.6676472425460815, + "learning_rate": 9.998260148068449e-05, + "loss": 0.9709, + "step": 14382 + }, + { + "epoch": 0.51508585958064, + "grad_norm": 1.831216812133789, + "learning_rate": 9.997100246806755e-05, + "loss": 1.3192, + "step": 14383 + }, + { + "epoch": 0.5151216717102083, + "grad_norm": 1.8973549604415894, + "learning_rate": 9.995940345584074e-05, + "loss": 1.349, + "step": 14384 + }, + { + "epoch": 0.5151574838397766, + "grad_norm": 1.3466126918792725, + "learning_rate": 9.994780444416013e-05, + "loss": 1.1782, + "step": 14385 + }, + { + "epoch": 0.5151932959693448, + "grad_norm": 1.5544013977050781, + "learning_rate": 9.99362054331817e-05, + "loss": 1.3132, + "step": 14386 + }, + { + "epoch": 0.5152291080989131, + "grad_norm": 1.3416900634765625, + "learning_rate": 9.992460642306156e-05, + "loss": 0.9439, + "step": 14387 + }, + { + "epoch": 0.5152649202284814, + "grad_norm": 1.5664063692092896, + "learning_rate": 9.991300741395574e-05, + "loss": 0.9731, + "step": 14388 + }, + { + "epoch": 0.5153007323580496, + "grad_norm": 1.5682909488677979, + "learning_rate": 9.99014084060203e-05, + "loss": 1.1063, + "step": 14389 + }, + { + "epoch": 0.515336544487618, + "grad_norm": 1.7446788549423218, + "learning_rate": 9.988980939941127e-05, + "loss": 1.1793, + "step": 14390 + }, + { + "epoch": 0.5153723566171863, + "grad_norm": 1.5136044025421143, + "learning_rate": 9.987821039428474e-05, + "loss": 0.8327, + "step": 14391 + }, + { + "epoch": 0.5154081687467545, + "grad_norm": 1.685994267463684, + "learning_rate": 9.986661139079671e-05, + "loss": 0.9321, + "step": 14392 + }, + { + "epoch": 0.5154439808763228, + "grad_norm": 1.6079216003417969, + "learning_rate": 9.985501238910325e-05, + "loss": 1.2235, + "step": 14393 + }, + { + "epoch": 0.5154797930058911, + "grad_norm": 1.4003196954727173, + "learning_rate": 9.984341338936043e-05, + "loss": 1.1122, + "step": 14394 + }, + { + "epoch": 0.5155156051354594, + "grad_norm": 1.8247429132461548, + "learning_rate": 9.983181439172426e-05, + "loss": 1.2084, + "step": 14395 + }, + { + "epoch": 0.5155514172650276, + "grad_norm": 1.5212409496307373, + "learning_rate": 9.982021539635084e-05, + "loss": 1.1408, + "step": 14396 + }, + { + "epoch": 0.515587229394596, + "grad_norm": 1.2178709506988525, + "learning_rate": 9.980861640339614e-05, + "loss": 0.8841, + "step": 14397 + }, + { + "epoch": 0.5156230415241643, + "grad_norm": 1.3305203914642334, + "learning_rate": 9.979701741301631e-05, + "loss": 0.9803, + "step": 14398 + }, + { + "epoch": 0.5156588536537325, + "grad_norm": 1.3392548561096191, + "learning_rate": 9.978541842536732e-05, + "loss": 1.1539, + "step": 14399 + }, + { + "epoch": 0.5156946657833008, + "grad_norm": 1.544443964958191, + "learning_rate": 9.977381944060525e-05, + "loss": 1.0934, + "step": 14400 + }, + { + "epoch": 0.5157304779128691, + "grad_norm": 1.916001319885254, + "learning_rate": 9.976222045888614e-05, + "loss": 1.2095, + "step": 14401 + }, + { + "epoch": 0.5157662900424373, + "grad_norm": 1.436069130897522, + "learning_rate": 9.975062148036608e-05, + "loss": 1.1546, + "step": 14402 + }, + { + "epoch": 0.5158021021720056, + "grad_norm": 1.2246285676956177, + "learning_rate": 9.97390225052011e-05, + "loss": 1.1753, + "step": 14403 + }, + { + "epoch": 0.515837914301574, + "grad_norm": 1.8794838190078735, + "learning_rate": 9.972742353354717e-05, + "loss": 1.2602, + "step": 14404 + }, + { + "epoch": 0.5158737264311423, + "grad_norm": 1.4341288805007935, + "learning_rate": 9.971582456556045e-05, + "loss": 1.0666, + "step": 14405 + }, + { + "epoch": 0.5159095385607105, + "grad_norm": 1.599692940711975, + "learning_rate": 9.970422560139692e-05, + "loss": 1.1381, + "step": 14406 + }, + { + "epoch": 0.5159453506902788, + "grad_norm": 1.5814013481140137, + "learning_rate": 9.969262664121267e-05, + "loss": 1.2093, + "step": 14407 + }, + { + "epoch": 0.5159811628198471, + "grad_norm": 1.6640212535858154, + "learning_rate": 9.968102768516371e-05, + "loss": 1.2921, + "step": 14408 + }, + { + "epoch": 0.5160169749494153, + "grad_norm": 1.3741905689239502, + "learning_rate": 9.966942873340614e-05, + "loss": 1.0759, + "step": 14409 + }, + { + "epoch": 0.5160527870789836, + "grad_norm": 1.2880216836929321, + "learning_rate": 9.965782978609595e-05, + "loss": 0.9918, + "step": 14410 + }, + { + "epoch": 0.516088599208552, + "grad_norm": 1.5255967378616333, + "learning_rate": 9.964623084338926e-05, + "loss": 1.2135, + "step": 14411 + }, + { + "epoch": 0.5161244113381203, + "grad_norm": 1.6262407302856445, + "learning_rate": 9.963463190544208e-05, + "loss": 1.1055, + "step": 14412 + }, + { + "epoch": 0.5161602234676885, + "grad_norm": 1.6371119022369385, + "learning_rate": 9.96230329724104e-05, + "loss": 1.2582, + "step": 14413 + }, + { + "epoch": 0.5161960355972568, + "grad_norm": 1.2978711128234863, + "learning_rate": 9.961143404445038e-05, + "loss": 1.0463, + "step": 14414 + }, + { + "epoch": 0.5162318477268251, + "grad_norm": 1.3225048780441284, + "learning_rate": 9.959983512171796e-05, + "loss": 1.0061, + "step": 14415 + }, + { + "epoch": 0.5162676598563933, + "grad_norm": 1.3775001764297485, + "learning_rate": 9.95882362043693e-05, + "loss": 1.0724, + "step": 14416 + }, + { + "epoch": 0.5163034719859616, + "grad_norm": 1.7028815746307373, + "learning_rate": 9.957663729256033e-05, + "loss": 1.021, + "step": 14417 + }, + { + "epoch": 0.51633928411553, + "grad_norm": 1.2594863176345825, + "learning_rate": 9.956503838644719e-05, + "loss": 1.0703, + "step": 14418 + }, + { + "epoch": 0.5163750962450983, + "grad_norm": 1.4428684711456299, + "learning_rate": 9.95534394861859e-05, + "loss": 1.1184, + "step": 14419 + }, + { + "epoch": 0.5164109083746665, + "grad_norm": 1.3784018754959106, + "learning_rate": 9.954184059193251e-05, + "loss": 1.1523, + "step": 14420 + }, + { + "epoch": 0.5164467205042348, + "grad_norm": 1.794216513633728, + "learning_rate": 9.953024170384309e-05, + "loss": 1.055, + "step": 14421 + }, + { + "epoch": 0.5164825326338031, + "grad_norm": 1.8349350690841675, + "learning_rate": 9.95186428220736e-05, + "loss": 1.0497, + "step": 14422 + }, + { + "epoch": 0.5165183447633713, + "grad_norm": 1.7286760807037354, + "learning_rate": 9.950704394678021e-05, + "loss": 1.0432, + "step": 14423 + }, + { + "epoch": 0.5165541568929396, + "grad_norm": 1.8481872081756592, + "learning_rate": 9.949544507811885e-05, + "loss": 1.1307, + "step": 14424 + }, + { + "epoch": 0.516589969022508, + "grad_norm": 1.5801844596862793, + "learning_rate": 9.948384621624569e-05, + "loss": 1.252, + "step": 14425 + }, + { + "epoch": 0.5166257811520762, + "grad_norm": 1.9419158697128296, + "learning_rate": 9.947224736131662e-05, + "loss": 1.1572, + "step": 14426 + }, + { + "epoch": 0.5166615932816445, + "grad_norm": 2.5042877197265625, + "learning_rate": 9.946064851348788e-05, + "loss": 1.15, + "step": 14427 + }, + { + "epoch": 0.5166974054112128, + "grad_norm": 1.7281066179275513, + "learning_rate": 9.944904967291533e-05, + "loss": 0.9223, + "step": 14428 + }, + { + "epoch": 0.516733217540781, + "grad_norm": 1.5733715295791626, + "learning_rate": 9.94374508397552e-05, + "loss": 0.9679, + "step": 14429 + }, + { + "epoch": 0.5167690296703493, + "grad_norm": 1.3959956169128418, + "learning_rate": 9.94258520141634e-05, + "loss": 0.9988, + "step": 14430 + }, + { + "epoch": 0.5168048417999176, + "grad_norm": 1.4489237070083618, + "learning_rate": 9.941425319629598e-05, + "loss": 1.2058, + "step": 14431 + }, + { + "epoch": 0.516840653929486, + "grad_norm": 1.323080062866211, + "learning_rate": 9.94026543863091e-05, + "loss": 1.0477, + "step": 14432 + }, + { + "epoch": 0.5168764660590542, + "grad_norm": 1.461216926574707, + "learning_rate": 9.939105558435866e-05, + "loss": 1.3566, + "step": 14433 + }, + { + "epoch": 0.5169122781886225, + "grad_norm": 1.685442566871643, + "learning_rate": 9.937945679060085e-05, + "loss": 1.1209, + "step": 14434 + }, + { + "epoch": 0.5169480903181908, + "grad_norm": 1.9545384645462036, + "learning_rate": 9.93678580051916e-05, + "loss": 1.1252, + "step": 14435 + }, + { + "epoch": 0.516983902447759, + "grad_norm": 1.3872956037521362, + "learning_rate": 9.935625922828705e-05, + "loss": 1.2562, + "step": 14436 + }, + { + "epoch": 0.5170197145773273, + "grad_norm": 1.5295852422714233, + "learning_rate": 9.934466046004313e-05, + "loss": 1.0854, + "step": 14437 + }, + { + "epoch": 0.5170555267068956, + "grad_norm": 1.5698922872543335, + "learning_rate": 9.933306170061604e-05, + "loss": 1.1101, + "step": 14438 + }, + { + "epoch": 0.517091338836464, + "grad_norm": 1.5765758752822876, + "learning_rate": 9.932146295016172e-05, + "loss": 1.1604, + "step": 14439 + }, + { + "epoch": 0.5171271509660322, + "grad_norm": 1.5390046834945679, + "learning_rate": 9.930986420883623e-05, + "loss": 1.056, + "step": 14440 + }, + { + "epoch": 0.5171629630956005, + "grad_norm": 2.2003536224365234, + "learning_rate": 9.929826547679563e-05, + "loss": 1.2052, + "step": 14441 + }, + { + "epoch": 0.5171987752251688, + "grad_norm": 1.4147140979766846, + "learning_rate": 9.928666675419595e-05, + "loss": 1.2216, + "step": 14442 + }, + { + "epoch": 0.517234587354737, + "grad_norm": 1.8758071660995483, + "learning_rate": 9.927506804119326e-05, + "loss": 0.9644, + "step": 14443 + }, + { + "epoch": 0.5172703994843053, + "grad_norm": 1.4190477132797241, + "learning_rate": 9.926346933794357e-05, + "loss": 1.0668, + "step": 14444 + }, + { + "epoch": 0.5173062116138736, + "grad_norm": 1.4404675960540771, + "learning_rate": 9.925187064460299e-05, + "loss": 1.0722, + "step": 14445 + }, + { + "epoch": 0.517342023743442, + "grad_norm": 1.2811132669448853, + "learning_rate": 9.924027196132747e-05, + "loss": 0.9705, + "step": 14446 + }, + { + "epoch": 0.5173778358730102, + "grad_norm": 2.2081570625305176, + "learning_rate": 9.922867328827319e-05, + "loss": 0.9584, + "step": 14447 + }, + { + "epoch": 0.5174136480025785, + "grad_norm": 1.433885931968689, + "learning_rate": 9.921707462559608e-05, + "loss": 1.2227, + "step": 14448 + }, + { + "epoch": 0.5174494601321468, + "grad_norm": 1.5207877159118652, + "learning_rate": 9.92054759734522e-05, + "loss": 0.965, + "step": 14449 + }, + { + "epoch": 0.517485272261715, + "grad_norm": 1.7111574411392212, + "learning_rate": 9.919387733199764e-05, + "loss": 1.0327, + "step": 14450 + }, + { + "epoch": 0.5175210843912833, + "grad_norm": 1.5109091997146606, + "learning_rate": 9.91822787013884e-05, + "loss": 1.2343, + "step": 14451 + }, + { + "epoch": 0.5175568965208516, + "grad_norm": 1.348147988319397, + "learning_rate": 9.917068008178056e-05, + "loss": 1.0034, + "step": 14452 + }, + { + "epoch": 0.51759270865042, + "grad_norm": 1.5185731649398804, + "learning_rate": 9.915908147333013e-05, + "loss": 1.0239, + "step": 14453 + }, + { + "epoch": 0.5176285207799882, + "grad_norm": 1.4994758367538452, + "learning_rate": 9.91474828761932e-05, + "loss": 0.9784, + "step": 14454 + }, + { + "epoch": 0.5176643329095565, + "grad_norm": 1.3558013439178467, + "learning_rate": 9.913588429052578e-05, + "loss": 1.0439, + "step": 14455 + }, + { + "epoch": 0.5177001450391248, + "grad_norm": 1.5605019330978394, + "learning_rate": 9.912428571648393e-05, + "loss": 1.1076, + "step": 14456 + }, + { + "epoch": 0.517735957168693, + "grad_norm": 1.559006690979004, + "learning_rate": 9.91126871542237e-05, + "loss": 1.0952, + "step": 14457 + }, + { + "epoch": 0.5177717692982613, + "grad_norm": 2.184288501739502, + "learning_rate": 9.910108860390107e-05, + "loss": 1.3049, + "step": 14458 + }, + { + "epoch": 0.5178075814278296, + "grad_norm": 2.5202887058258057, + "learning_rate": 9.908949006567218e-05, + "loss": 1.2652, + "step": 14459 + }, + { + "epoch": 0.5178433935573978, + "grad_norm": 1.6941872835159302, + "learning_rate": 9.9077891539693e-05, + "loss": 1.3152, + "step": 14460 + }, + { + "epoch": 0.5178792056869662, + "grad_norm": 2.2801270484924316, + "learning_rate": 9.906629302611961e-05, + "loss": 1.2404, + "step": 14461 + }, + { + "epoch": 0.5179150178165345, + "grad_norm": 1.7260040044784546, + "learning_rate": 9.905469452510803e-05, + "loss": 1.3752, + "step": 14462 + }, + { + "epoch": 0.5179508299461028, + "grad_norm": 1.6573331356048584, + "learning_rate": 9.904309603681433e-05, + "loss": 1.1288, + "step": 14463 + }, + { + "epoch": 0.517986642075671, + "grad_norm": 1.3300433158874512, + "learning_rate": 9.903149756139453e-05, + "loss": 1.1291, + "step": 14464 + }, + { + "epoch": 0.5180224542052393, + "grad_norm": 1.844744086265564, + "learning_rate": 9.901989909900469e-05, + "loss": 1.2934, + "step": 14465 + }, + { + "epoch": 0.5180582663348076, + "grad_norm": 1.5951581001281738, + "learning_rate": 9.900830064980084e-05, + "loss": 1.1674, + "step": 14466 + }, + { + "epoch": 0.5180940784643758, + "grad_norm": 1.589449167251587, + "learning_rate": 9.899670221393901e-05, + "loss": 1.3046, + "step": 14467 + }, + { + "epoch": 0.5181298905939442, + "grad_norm": 1.3778828382492065, + "learning_rate": 9.89851037915753e-05, + "loss": 1.1027, + "step": 14468 + }, + { + "epoch": 0.5181657027235125, + "grad_norm": 1.4439127445220947, + "learning_rate": 9.897350538286566e-05, + "loss": 1.0936, + "step": 14469 + }, + { + "epoch": 0.5182015148530807, + "grad_norm": 1.4038081169128418, + "learning_rate": 9.896190698796621e-05, + "loss": 1.0401, + "step": 14470 + }, + { + "epoch": 0.518237326982649, + "grad_norm": 1.5642751455307007, + "learning_rate": 9.895030860703295e-05, + "loss": 1.2011, + "step": 14471 + }, + { + "epoch": 0.5182731391122173, + "grad_norm": 1.5355178117752075, + "learning_rate": 9.893871024022195e-05, + "loss": 1.0065, + "step": 14472 + }, + { + "epoch": 0.5183089512417856, + "grad_norm": 1.502003788948059, + "learning_rate": 9.89271118876892e-05, + "loss": 1.1756, + "step": 14473 + }, + { + "epoch": 0.5183447633713538, + "grad_norm": 1.560584306716919, + "learning_rate": 9.891551354959082e-05, + "loss": 1.1253, + "step": 14474 + }, + { + "epoch": 0.5183805755009222, + "grad_norm": 1.8209328651428223, + "learning_rate": 9.890391522608278e-05, + "loss": 1.2315, + "step": 14475 + }, + { + "epoch": 0.5184163876304905, + "grad_norm": 1.5170873403549194, + "learning_rate": 9.889231691732115e-05, + "loss": 1.1206, + "step": 14476 + }, + { + "epoch": 0.5184521997600587, + "grad_norm": 1.511955976486206, + "learning_rate": 9.888071862346198e-05, + "loss": 1.2125, + "step": 14477 + }, + { + "epoch": 0.518488011889627, + "grad_norm": 1.4469146728515625, + "learning_rate": 9.886912034466127e-05, + "loss": 0.815, + "step": 14478 + }, + { + "epoch": 0.5185238240191953, + "grad_norm": 1.616801142692566, + "learning_rate": 9.885752208107511e-05, + "loss": 1.2293, + "step": 14479 + }, + { + "epoch": 0.5185596361487635, + "grad_norm": 1.2158396244049072, + "learning_rate": 9.88459238328595e-05, + "loss": 1.063, + "step": 14480 + }, + { + "epoch": 0.5185954482783318, + "grad_norm": 2.8228752613067627, + "learning_rate": 9.883432560017052e-05, + "loss": 1.3482, + "step": 14481 + }, + { + "epoch": 0.5186312604079002, + "grad_norm": 1.9728548526763916, + "learning_rate": 9.882272738316418e-05, + "loss": 1.3419, + "step": 14482 + }, + { + "epoch": 0.5186670725374685, + "grad_norm": 1.5694702863693237, + "learning_rate": 9.88111291819965e-05, + "loss": 1.1978, + "step": 14483 + }, + { + "epoch": 0.5187028846670367, + "grad_norm": 1.4086335897445679, + "learning_rate": 9.879953099682358e-05, + "loss": 1.0886, + "step": 14484 + }, + { + "epoch": 0.518738696796605, + "grad_norm": 1.5919445753097534, + "learning_rate": 9.878793282780137e-05, + "loss": 1.0946, + "step": 14485 + }, + { + "epoch": 0.5187745089261733, + "grad_norm": 1.8335378170013428, + "learning_rate": 9.877633467508602e-05, + "loss": 1.044, + "step": 14486 + }, + { + "epoch": 0.5188103210557415, + "grad_norm": 1.5374647378921509, + "learning_rate": 9.876473653883346e-05, + "loss": 1.0241, + "step": 14487 + }, + { + "epoch": 0.5188461331853098, + "grad_norm": 1.544538140296936, + "learning_rate": 9.87531384191998e-05, + "loss": 1.0875, + "step": 14488 + }, + { + "epoch": 0.5188819453148782, + "grad_norm": 1.3704016208648682, + "learning_rate": 9.874154031634103e-05, + "loss": 1.1254, + "step": 14489 + }, + { + "epoch": 0.5189177574444465, + "grad_norm": 1.932550072669983, + "learning_rate": 9.872994223041325e-05, + "loss": 1.2099, + "step": 14490 + }, + { + "epoch": 0.5189535695740147, + "grad_norm": 1.6212451457977295, + "learning_rate": 9.871834416157246e-05, + "loss": 0.8832, + "step": 14491 + }, + { + "epoch": 0.518989381703583, + "grad_norm": 1.5062719583511353, + "learning_rate": 9.870674610997467e-05, + "loss": 1.2194, + "step": 14492 + }, + { + "epoch": 0.5190251938331513, + "grad_norm": 1.4456133842468262, + "learning_rate": 9.869514807577595e-05, + "loss": 0.8536, + "step": 14493 + }, + { + "epoch": 0.5190610059627195, + "grad_norm": 1.7290223836898804, + "learning_rate": 9.868355005913232e-05, + "loss": 0.9686, + "step": 14494 + }, + { + "epoch": 0.5190968180922878, + "grad_norm": 1.368811011314392, + "learning_rate": 9.867195206019985e-05, + "loss": 1.2054, + "step": 14495 + }, + { + "epoch": 0.5191326302218562, + "grad_norm": 1.9264616966247559, + "learning_rate": 9.866035407913452e-05, + "loss": 1.1456, + "step": 14496 + }, + { + "epoch": 0.5191684423514245, + "grad_norm": 1.4454907178878784, + "learning_rate": 9.864875611609243e-05, + "loss": 1.1438, + "step": 14497 + }, + { + "epoch": 0.5192042544809927, + "grad_norm": 1.5798288583755493, + "learning_rate": 9.863715817122956e-05, + "loss": 1.0709, + "step": 14498 + }, + { + "epoch": 0.519240066610561, + "grad_norm": 1.4949496984481812, + "learning_rate": 9.862556024470199e-05, + "loss": 1.1819, + "step": 14499 + }, + { + "epoch": 0.5192758787401293, + "grad_norm": 1.4381492137908936, + "learning_rate": 9.861396233666574e-05, + "loss": 1.1911, + "step": 14500 + }, + { + "epoch": 0.5193116908696975, + "grad_norm": 1.3816219568252563, + "learning_rate": 9.860236444727679e-05, + "loss": 1.1938, + "step": 14501 + }, + { + "epoch": 0.5193475029992658, + "grad_norm": 1.7325705289840698, + "learning_rate": 9.859076657669127e-05, + "loss": 1.049, + "step": 14502 + }, + { + "epoch": 0.5193833151288342, + "grad_norm": 1.8109650611877441, + "learning_rate": 9.857916872506513e-05, + "loss": 1.138, + "step": 14503 + }, + { + "epoch": 0.5194191272584024, + "grad_norm": 1.5046424865722656, + "learning_rate": 9.856757089255448e-05, + "loss": 1.1166, + "step": 14504 + }, + { + "epoch": 0.5194549393879707, + "grad_norm": 1.0976074934005737, + "learning_rate": 9.85559730793153e-05, + "loss": 1.0165, + "step": 14505 + }, + { + "epoch": 0.519490751517539, + "grad_norm": 1.5461210012435913, + "learning_rate": 9.854437528550364e-05, + "loss": 1.0587, + "step": 14506 + }, + { + "epoch": 0.5195265636471073, + "grad_norm": 1.170897364616394, + "learning_rate": 9.853277751127552e-05, + "loss": 1.1424, + "step": 14507 + }, + { + "epoch": 0.5195623757766755, + "grad_norm": 1.5151535272598267, + "learning_rate": 9.852117975678701e-05, + "loss": 1.1034, + "step": 14508 + }, + { + "epoch": 0.5195981879062438, + "grad_norm": 1.4736615419387817, + "learning_rate": 9.850958202219414e-05, + "loss": 1.141, + "step": 14509 + }, + { + "epoch": 0.5196340000358122, + "grad_norm": 1.6786185503005981, + "learning_rate": 9.849798430765286e-05, + "loss": 1.0868, + "step": 14510 + }, + { + "epoch": 0.5196698121653804, + "grad_norm": 2.022641658782959, + "learning_rate": 9.848638661331933e-05, + "loss": 1.1131, + "step": 14511 + }, + { + "epoch": 0.5197056242949487, + "grad_norm": 1.9157803058624268, + "learning_rate": 9.847478893934944e-05, + "loss": 1.1042, + "step": 14512 + }, + { + "epoch": 0.519741436424517, + "grad_norm": 1.6783722639083862, + "learning_rate": 9.846319128589936e-05, + "loss": 0.972, + "step": 14513 + }, + { + "epoch": 0.5197772485540852, + "grad_norm": 1.391257882118225, + "learning_rate": 9.845159365312501e-05, + "loss": 1.0357, + "step": 14514 + }, + { + "epoch": 0.5198130606836535, + "grad_norm": 1.3758944272994995, + "learning_rate": 9.84399960411825e-05, + "loss": 0.9257, + "step": 14515 + }, + { + "epoch": 0.5198488728132218, + "grad_norm": 1.9010987281799316, + "learning_rate": 9.842839845022781e-05, + "loss": 1.2421, + "step": 14516 + }, + { + "epoch": 0.5198846849427902, + "grad_norm": 1.4807387590408325, + "learning_rate": 9.841680088041701e-05, + "loss": 1.1357, + "step": 14517 + }, + { + "epoch": 0.5199204970723584, + "grad_norm": 1.7397339344024658, + "learning_rate": 9.840520333190615e-05, + "loss": 1.0824, + "step": 14518 + }, + { + "epoch": 0.5199563092019267, + "grad_norm": 1.4784600734710693, + "learning_rate": 9.839360580485115e-05, + "loss": 0.9682, + "step": 14519 + }, + { + "epoch": 0.519992121331495, + "grad_norm": 2.04339861869812, + "learning_rate": 9.838200829940818e-05, + "loss": 1.1582, + "step": 14520 + }, + { + "epoch": 0.5200279334610632, + "grad_norm": 1.3229148387908936, + "learning_rate": 9.837041081573312e-05, + "loss": 0.9386, + "step": 14521 + }, + { + "epoch": 0.5200637455906315, + "grad_norm": 1.4265999794006348, + "learning_rate": 9.835881335398216e-05, + "loss": 1.1419, + "step": 14522 + }, + { + "epoch": 0.5200995577201998, + "grad_norm": 1.3953197002410889, + "learning_rate": 9.834721591431118e-05, + "loss": 1.1908, + "step": 14523 + }, + { + "epoch": 0.5201353698497682, + "grad_norm": 1.6294606924057007, + "learning_rate": 9.833561849687634e-05, + "loss": 1.2378, + "step": 14524 + }, + { + "epoch": 0.5201711819793364, + "grad_norm": 1.5177521705627441, + "learning_rate": 9.832402110183355e-05, + "loss": 1.1404, + "step": 14525 + }, + { + "epoch": 0.5202069941089047, + "grad_norm": 1.7583731412887573, + "learning_rate": 9.831242372933896e-05, + "loss": 1.1682, + "step": 14526 + }, + { + "epoch": 0.520242806238473, + "grad_norm": 1.3000975847244263, + "learning_rate": 9.830082637954851e-05, + "loss": 1.0103, + "step": 14527 + }, + { + "epoch": 0.5202786183680412, + "grad_norm": 1.6342895030975342, + "learning_rate": 9.828922905261819e-05, + "loss": 0.9813, + "step": 14528 + }, + { + "epoch": 0.5203144304976095, + "grad_norm": 1.650873064994812, + "learning_rate": 9.827763174870417e-05, + "loss": 1.0886, + "step": 14529 + }, + { + "epoch": 0.5203502426271778, + "grad_norm": 1.4855977296829224, + "learning_rate": 9.826603446796231e-05, + "loss": 0.8605, + "step": 14530 + }, + { + "epoch": 0.5203860547567462, + "grad_norm": 1.830512285232544, + "learning_rate": 9.82544372105488e-05, + "loss": 1.1126, + "step": 14531 + }, + { + "epoch": 0.5204218668863144, + "grad_norm": 1.5019077062606812, + "learning_rate": 9.824283997661952e-05, + "loss": 1.2043, + "step": 14532 + }, + { + "epoch": 0.5204576790158827, + "grad_norm": 1.5482194423675537, + "learning_rate": 9.823124276633061e-05, + "loss": 1.2081, + "step": 14533 + }, + { + "epoch": 0.520493491145451, + "grad_norm": 1.6785472631454468, + "learning_rate": 9.821964557983799e-05, + "loss": 1.2183, + "step": 14534 + }, + { + "epoch": 0.5205293032750192, + "grad_norm": 1.610468864440918, + "learning_rate": 9.820804841729782e-05, + "loss": 1.0394, + "step": 14535 + }, + { + "epoch": 0.5205651154045875, + "grad_norm": 1.4153943061828613, + "learning_rate": 9.819645127886602e-05, + "loss": 1.1005, + "step": 14536 + }, + { + "epoch": 0.5206009275341558, + "grad_norm": 1.318398356437683, + "learning_rate": 9.818485416469861e-05, + "loss": 1.0821, + "step": 14537 + }, + { + "epoch": 0.5206367396637241, + "grad_norm": 1.585486650466919, + "learning_rate": 9.817325707495167e-05, + "loss": 1.1861, + "step": 14538 + }, + { + "epoch": 0.5206725517932924, + "grad_norm": 2.0748746395111084, + "learning_rate": 9.816166000978119e-05, + "loss": 1.0194, + "step": 14539 + }, + { + "epoch": 0.5207083639228607, + "grad_norm": 1.5099953413009644, + "learning_rate": 9.815006296934321e-05, + "loss": 1.1892, + "step": 14540 + }, + { + "epoch": 0.520744176052429, + "grad_norm": 1.5195207595825195, + "learning_rate": 9.813846595379371e-05, + "loss": 1.1196, + "step": 14541 + }, + { + "epoch": 0.5207799881819972, + "grad_norm": 1.821628451347351, + "learning_rate": 9.812686896328882e-05, + "loss": 1.4242, + "step": 14542 + }, + { + "epoch": 0.5208158003115655, + "grad_norm": 2.342648983001709, + "learning_rate": 9.811527199798443e-05, + "loss": 1.069, + "step": 14543 + }, + { + "epoch": 0.5208516124411338, + "grad_norm": 1.8612124919891357, + "learning_rate": 9.810367505803667e-05, + "loss": 1.2292, + "step": 14544 + }, + { + "epoch": 0.5208874245707021, + "grad_norm": 1.694216012954712, + "learning_rate": 9.80920781436015e-05, + "loss": 1.2247, + "step": 14545 + }, + { + "epoch": 0.5209232367002704, + "grad_norm": 1.8888523578643799, + "learning_rate": 9.808048125483494e-05, + "loss": 1.0768, + "step": 14546 + }, + { + "epoch": 0.5209590488298387, + "grad_norm": 1.318444013595581, + "learning_rate": 9.806888439189306e-05, + "loss": 1.2273, + "step": 14547 + }, + { + "epoch": 0.520994860959407, + "grad_norm": 1.2997967004776, + "learning_rate": 9.805728755493182e-05, + "loss": 0.8969, + "step": 14548 + }, + { + "epoch": 0.5210306730889752, + "grad_norm": 1.412862777709961, + "learning_rate": 9.804569074410729e-05, + "loss": 1.0278, + "step": 14549 + }, + { + "epoch": 0.5210664852185435, + "grad_norm": 1.6801966428756714, + "learning_rate": 9.803409395957545e-05, + "loss": 1.2579, + "step": 14550 + }, + { + "epoch": 0.5211022973481118, + "grad_norm": 1.6614891290664673, + "learning_rate": 9.802249720149236e-05, + "loss": 1.1516, + "step": 14551 + }, + { + "epoch": 0.5211381094776801, + "grad_norm": 1.6651458740234375, + "learning_rate": 9.8010900470014e-05, + "loss": 0.9943, + "step": 14552 + }, + { + "epoch": 0.5211739216072484, + "grad_norm": 1.2974810600280762, + "learning_rate": 9.799930376529643e-05, + "loss": 1.1721, + "step": 14553 + }, + { + "epoch": 0.5212097337368167, + "grad_norm": 1.5659233331680298, + "learning_rate": 9.798770708749563e-05, + "loss": 1.1034, + "step": 14554 + }, + { + "epoch": 0.5212455458663849, + "grad_norm": 1.5581835508346558, + "learning_rate": 9.797611043676764e-05, + "loss": 1.2722, + "step": 14555 + }, + { + "epoch": 0.5212813579959532, + "grad_norm": 1.5489678382873535, + "learning_rate": 9.796451381326849e-05, + "loss": 1.3381, + "step": 14556 + }, + { + "epoch": 0.5213171701255215, + "grad_norm": 1.6427072286605835, + "learning_rate": 9.795291721715414e-05, + "loss": 1.3423, + "step": 14557 + }, + { + "epoch": 0.5213529822550897, + "grad_norm": 1.4574812650680542, + "learning_rate": 9.794132064858069e-05, + "loss": 1.2257, + "step": 14558 + }, + { + "epoch": 0.5213887943846581, + "grad_norm": 1.736242413520813, + "learning_rate": 9.792972410770409e-05, + "loss": 1.2764, + "step": 14559 + }, + { + "epoch": 0.5214246065142264, + "grad_norm": 1.3181074857711792, + "learning_rate": 9.791812759468039e-05, + "loss": 1.0019, + "step": 14560 + }, + { + "epoch": 0.5214604186437947, + "grad_norm": 1.3254915475845337, + "learning_rate": 9.79065311096656e-05, + "loss": 1.0402, + "step": 14561 + }, + { + "epoch": 0.5214962307733629, + "grad_norm": 1.2840526103973389, + "learning_rate": 9.789493465281574e-05, + "loss": 1.2143, + "step": 14562 + }, + { + "epoch": 0.5215320429029312, + "grad_norm": 1.7079427242279053, + "learning_rate": 9.788333822428682e-05, + "loss": 1.2524, + "step": 14563 + }, + { + "epoch": 0.5215678550324995, + "grad_norm": 1.4461078643798828, + "learning_rate": 9.787174182423484e-05, + "loss": 1.1979, + "step": 14564 + }, + { + "epoch": 0.5216036671620677, + "grad_norm": 2.038276195526123, + "learning_rate": 9.786014545281585e-05, + "loss": 1.082, + "step": 14565 + }, + { + "epoch": 0.5216394792916361, + "grad_norm": 1.333987832069397, + "learning_rate": 9.78485491101858e-05, + "loss": 1.1483, + "step": 14566 + }, + { + "epoch": 0.5216752914212044, + "grad_norm": 1.515095829963684, + "learning_rate": 9.783695279650079e-05, + "loss": 1.0169, + "step": 14567 + }, + { + "epoch": 0.5217111035507727, + "grad_norm": 1.6054552793502808, + "learning_rate": 9.782535651191676e-05, + "loss": 0.9346, + "step": 14568 + }, + { + "epoch": 0.5217469156803409, + "grad_norm": 1.815324068069458, + "learning_rate": 9.781376025658977e-05, + "loss": 1.2494, + "step": 14569 + }, + { + "epoch": 0.5217827278099092, + "grad_norm": 2.9637138843536377, + "learning_rate": 9.78021640306758e-05, + "loss": 1.0746, + "step": 14570 + }, + { + "epoch": 0.5218185399394775, + "grad_norm": 1.693361759185791, + "learning_rate": 9.77905678343309e-05, + "loss": 1.3674, + "step": 14571 + }, + { + "epoch": 0.5218543520690457, + "grad_norm": 1.3195186853408813, + "learning_rate": 9.777897166771107e-05, + "loss": 0.9733, + "step": 14572 + }, + { + "epoch": 0.5218901641986141, + "grad_norm": 1.6883633136749268, + "learning_rate": 9.776737553097227e-05, + "loss": 1.1932, + "step": 14573 + }, + { + "epoch": 0.5219259763281824, + "grad_norm": 2.0133275985717773, + "learning_rate": 9.775577942427058e-05, + "loss": 1.1189, + "step": 14574 + }, + { + "epoch": 0.5219617884577507, + "grad_norm": 1.6314830780029297, + "learning_rate": 9.774418334776196e-05, + "loss": 1.2261, + "step": 14575 + }, + { + "epoch": 0.5219976005873189, + "grad_norm": 1.4367952346801758, + "learning_rate": 9.773258730160247e-05, + "loss": 1.1214, + "step": 14576 + }, + { + "epoch": 0.5220334127168872, + "grad_norm": 1.7586597204208374, + "learning_rate": 9.772099128594808e-05, + "loss": 1.0675, + "step": 14577 + }, + { + "epoch": 0.5220692248464555, + "grad_norm": 1.5094636678695679, + "learning_rate": 9.770939530095482e-05, + "loss": 1.2469, + "step": 14578 + }, + { + "epoch": 0.5221050369760237, + "grad_norm": 1.5435549020767212, + "learning_rate": 9.769779934677869e-05, + "loss": 1.2547, + "step": 14579 + }, + { + "epoch": 0.5221408491055921, + "grad_norm": 2.1004679203033447, + "learning_rate": 9.76862034235757e-05, + "loss": 1.1247, + "step": 14580 + }, + { + "epoch": 0.5221766612351604, + "grad_norm": 1.5364216566085815, + "learning_rate": 9.767460753150186e-05, + "loss": 0.9509, + "step": 14581 + }, + { + "epoch": 0.5222124733647286, + "grad_norm": 1.9030849933624268, + "learning_rate": 9.766301167071316e-05, + "loss": 1.2363, + "step": 14582 + }, + { + "epoch": 0.5222482854942969, + "grad_norm": 1.2051597833633423, + "learning_rate": 9.765141584136565e-05, + "loss": 1.0411, + "step": 14583 + }, + { + "epoch": 0.5222840976238652, + "grad_norm": 1.6304035186767578, + "learning_rate": 9.763982004361527e-05, + "loss": 1.2167, + "step": 14584 + }, + { + "epoch": 0.5223199097534335, + "grad_norm": 1.4831441640853882, + "learning_rate": 9.76282242776181e-05, + "loss": 0.9519, + "step": 14585 + }, + { + "epoch": 0.5223557218830017, + "grad_norm": 1.4864709377288818, + "learning_rate": 9.76166285435301e-05, + "loss": 1.2163, + "step": 14586 + }, + { + "epoch": 0.5223915340125701, + "grad_norm": 1.3614163398742676, + "learning_rate": 9.76050328415073e-05, + "loss": 1.0063, + "step": 14587 + }, + { + "epoch": 0.5224273461421384, + "grad_norm": 1.1344913244247437, + "learning_rate": 9.759343717170571e-05, + "loss": 1.0038, + "step": 14588 + }, + { + "epoch": 0.5224631582717066, + "grad_norm": 1.4524916410446167, + "learning_rate": 9.758184153428126e-05, + "loss": 1.0328, + "step": 14589 + }, + { + "epoch": 0.5224989704012749, + "grad_norm": 1.352500557899475, + "learning_rate": 9.757024592939008e-05, + "loss": 1.1538, + "step": 14590 + }, + { + "epoch": 0.5225347825308432, + "grad_norm": 1.5199958086013794, + "learning_rate": 9.755865035718807e-05, + "loss": 1.1237, + "step": 14591 + }, + { + "epoch": 0.5225705946604114, + "grad_norm": 1.7416340112686157, + "learning_rate": 9.754705481783127e-05, + "loss": 1.2649, + "step": 14592 + }, + { + "epoch": 0.5226064067899797, + "grad_norm": 1.5172710418701172, + "learning_rate": 9.753545931147569e-05, + "loss": 1.2568, + "step": 14593 + }, + { + "epoch": 0.5226422189195481, + "grad_norm": 1.3305740356445312, + "learning_rate": 9.752386383827733e-05, + "loss": 1.1907, + "step": 14594 + }, + { + "epoch": 0.5226780310491164, + "grad_norm": 1.4564499855041504, + "learning_rate": 9.751226839839217e-05, + "loss": 1.1161, + "step": 14595 + }, + { + "epoch": 0.5227138431786846, + "grad_norm": 1.586652398109436, + "learning_rate": 9.750067299197625e-05, + "loss": 1.2076, + "step": 14596 + }, + { + "epoch": 0.5227496553082529, + "grad_norm": 1.5032027959823608, + "learning_rate": 9.748907761918558e-05, + "loss": 1.0445, + "step": 14597 + }, + { + "epoch": 0.5227854674378212, + "grad_norm": 1.4852052927017212, + "learning_rate": 9.747748228017606e-05, + "loss": 1.1374, + "step": 14598 + }, + { + "epoch": 0.5228212795673894, + "grad_norm": 1.4269174337387085, + "learning_rate": 9.746588697510381e-05, + "loss": 1.1651, + "step": 14599 + }, + { + "epoch": 0.5228570916969577, + "grad_norm": 1.8622736930847168, + "learning_rate": 9.745429170412476e-05, + "loss": 1.1997, + "step": 14600 + }, + { + "epoch": 0.5228929038265261, + "grad_norm": 1.2561582326889038, + "learning_rate": 9.744269646739494e-05, + "loss": 0.9408, + "step": 14601 + }, + { + "epoch": 0.5229287159560944, + "grad_norm": 1.3912198543548584, + "learning_rate": 9.743110126507034e-05, + "loss": 0.9895, + "step": 14602 + }, + { + "epoch": 0.5229645280856626, + "grad_norm": 1.5082165002822876, + "learning_rate": 9.741950609730696e-05, + "loss": 1.1752, + "step": 14603 + }, + { + "epoch": 0.5230003402152309, + "grad_norm": 1.3031065464019775, + "learning_rate": 9.74079109642608e-05, + "loss": 0.8106, + "step": 14604 + }, + { + "epoch": 0.5230361523447992, + "grad_norm": 1.4112268686294556, + "learning_rate": 9.739631586608786e-05, + "loss": 1.0594, + "step": 14605 + }, + { + "epoch": 0.5230719644743674, + "grad_norm": 1.6407636404037476, + "learning_rate": 9.738472080294415e-05, + "loss": 1.1489, + "step": 14606 + }, + { + "epoch": 0.5231077766039357, + "grad_norm": 1.7698725461959839, + "learning_rate": 9.737312577498559e-05, + "loss": 1.1702, + "step": 14607 + }, + { + "epoch": 0.5231435887335041, + "grad_norm": 1.97612726688385, + "learning_rate": 9.73615307823683e-05, + "loss": 1.0772, + "step": 14608 + }, + { + "epoch": 0.5231794008630724, + "grad_norm": 1.4889585971832275, + "learning_rate": 9.734993582524814e-05, + "loss": 1.1276, + "step": 14609 + }, + { + "epoch": 0.5232152129926406, + "grad_norm": 1.4994456768035889, + "learning_rate": 9.733834090378125e-05, + "loss": 1.002, + "step": 14610 + }, + { + "epoch": 0.5232510251222089, + "grad_norm": 1.4566965103149414, + "learning_rate": 9.732674601812347e-05, + "loss": 1.3417, + "step": 14611 + }, + { + "epoch": 0.5232868372517772, + "grad_norm": 1.5407490730285645, + "learning_rate": 9.731515116843094e-05, + "loss": 1.1552, + "step": 14612 + }, + { + "epoch": 0.5233226493813454, + "grad_norm": 1.147740364074707, + "learning_rate": 9.730355635485953e-05, + "loss": 0.9891, + "step": 14613 + }, + { + "epoch": 0.5233584615109137, + "grad_norm": 1.3656131029129028, + "learning_rate": 9.729196157756534e-05, + "loss": 1.0499, + "step": 14614 + }, + { + "epoch": 0.5233942736404821, + "grad_norm": 1.6912506818771362, + "learning_rate": 9.728036683670433e-05, + "loss": 1.1927, + "step": 14615 + }, + { + "epoch": 0.5234300857700503, + "grad_norm": 1.7761969566345215, + "learning_rate": 9.72687721324324e-05, + "loss": 1.1709, + "step": 14616 + }, + { + "epoch": 0.5234658978996186, + "grad_norm": 2.3895628452301025, + "learning_rate": 9.725717746490571e-05, + "loss": 1.1588, + "step": 14617 + }, + { + "epoch": 0.5235017100291869, + "grad_norm": 1.6711074113845825, + "learning_rate": 9.724558283428007e-05, + "loss": 1.2171, + "step": 14618 + }, + { + "epoch": 0.5235375221587552, + "grad_norm": 1.4837729930877686, + "learning_rate": 9.723398824071164e-05, + "loss": 1.1403, + "step": 14619 + }, + { + "epoch": 0.5235733342883234, + "grad_norm": 1.5715348720550537, + "learning_rate": 9.722239368435624e-05, + "loss": 1.151, + "step": 14620 + }, + { + "epoch": 0.5236091464178917, + "grad_norm": 1.8541991710662842, + "learning_rate": 9.721079916537004e-05, + "loss": 1.2078, + "step": 14621 + }, + { + "epoch": 0.5236449585474601, + "grad_norm": 1.6031315326690674, + "learning_rate": 9.719920468390888e-05, + "loss": 1.187, + "step": 14622 + }, + { + "epoch": 0.5236807706770283, + "grad_norm": 1.8043824434280396, + "learning_rate": 9.718761024012886e-05, + "loss": 1.0043, + "step": 14623 + }, + { + "epoch": 0.5237165828065966, + "grad_norm": 1.5070009231567383, + "learning_rate": 9.717601583418588e-05, + "loss": 1.1558, + "step": 14624 + }, + { + "epoch": 0.5237523949361649, + "grad_norm": 1.2841089963912964, + "learning_rate": 9.716442146623594e-05, + "loss": 0.9958, + "step": 14625 + }, + { + "epoch": 0.5237882070657331, + "grad_norm": 1.8059632778167725, + "learning_rate": 9.715282713643512e-05, + "loss": 1.1073, + "step": 14626 + }, + { + "epoch": 0.5238240191953014, + "grad_norm": 1.796873688697815, + "learning_rate": 9.714123284493925e-05, + "loss": 1.0282, + "step": 14627 + }, + { + "epoch": 0.5238598313248697, + "grad_norm": 1.7435870170593262, + "learning_rate": 9.712963859190449e-05, + "loss": 1.2485, + "step": 14628 + }, + { + "epoch": 0.5238956434544381, + "grad_norm": 1.340289831161499, + "learning_rate": 9.711804437748669e-05, + "loss": 1.172, + "step": 14629 + }, + { + "epoch": 0.5239314555840063, + "grad_norm": 1.9265645742416382, + "learning_rate": 9.710645020184193e-05, + "loss": 1.1787, + "step": 14630 + }, + { + "epoch": 0.5239672677135746, + "grad_norm": 1.7927985191345215, + "learning_rate": 9.709485606512607e-05, + "loss": 1.3034, + "step": 14631 + }, + { + "epoch": 0.5240030798431429, + "grad_norm": 1.4549213647842407, + "learning_rate": 9.708326196749527e-05, + "loss": 1.3562, + "step": 14632 + }, + { + "epoch": 0.5240388919727111, + "grad_norm": 2.3728456497192383, + "learning_rate": 9.707166790910538e-05, + "loss": 1.2045, + "step": 14633 + }, + { + "epoch": 0.5240747041022794, + "grad_norm": 1.9854947328567505, + "learning_rate": 9.70600738901124e-05, + "loss": 1.0533, + "step": 14634 + }, + { + "epoch": 0.5241105162318477, + "grad_norm": 1.523521065711975, + "learning_rate": 9.704847991067236e-05, + "loss": 1.1583, + "step": 14635 + }, + { + "epoch": 0.5241463283614161, + "grad_norm": 1.1820544004440308, + "learning_rate": 9.703688597094118e-05, + "loss": 1.068, + "step": 14636 + }, + { + "epoch": 0.5241821404909843, + "grad_norm": 1.3932892084121704, + "learning_rate": 9.702529207107491e-05, + "loss": 1.1917, + "step": 14637 + }, + { + "epoch": 0.5242179526205526, + "grad_norm": 1.3806737661361694, + "learning_rate": 9.701369821122945e-05, + "loss": 1.1313, + "step": 14638 + }, + { + "epoch": 0.5242537647501209, + "grad_norm": 1.51075279712677, + "learning_rate": 9.70021043915609e-05, + "loss": 0.9366, + "step": 14639 + }, + { + "epoch": 0.5242895768796891, + "grad_norm": 1.4872218370437622, + "learning_rate": 9.69905106122251e-05, + "loss": 1.1015, + "step": 14640 + }, + { + "epoch": 0.5243253890092574, + "grad_norm": 1.4475640058517456, + "learning_rate": 9.697891687337817e-05, + "loss": 1.0691, + "step": 14641 + }, + { + "epoch": 0.5243612011388257, + "grad_norm": 1.687768816947937, + "learning_rate": 9.696732317517599e-05, + "loss": 1.2233, + "step": 14642 + }, + { + "epoch": 0.524397013268394, + "grad_norm": 1.586735486984253, + "learning_rate": 9.695572951777454e-05, + "loss": 1.2191, + "step": 14643 + }, + { + "epoch": 0.5244328253979623, + "grad_norm": 1.493669867515564, + "learning_rate": 9.694413590132985e-05, + "loss": 0.9397, + "step": 14644 + }, + { + "epoch": 0.5244686375275306, + "grad_norm": 1.7893799543380737, + "learning_rate": 9.693254232599784e-05, + "loss": 1.1245, + "step": 14645 + }, + { + "epoch": 0.5245044496570989, + "grad_norm": 1.5697073936462402, + "learning_rate": 9.692094879193455e-05, + "loss": 1.2032, + "step": 14646 + }, + { + "epoch": 0.5245402617866671, + "grad_norm": 1.8275840282440186, + "learning_rate": 9.690935529929587e-05, + "loss": 1.4129, + "step": 14647 + }, + { + "epoch": 0.5245760739162354, + "grad_norm": 1.6693693399429321, + "learning_rate": 9.689776184823789e-05, + "loss": 1.1249, + "step": 14648 + }, + { + "epoch": 0.5246118860458037, + "grad_norm": 1.5586044788360596, + "learning_rate": 9.688616843891648e-05, + "loss": 0.8805, + "step": 14649 + }, + { + "epoch": 0.524647698175372, + "grad_norm": 1.8242470026016235, + "learning_rate": 9.687457507148768e-05, + "loss": 1.2696, + "step": 14650 + }, + { + "epoch": 0.5246835103049403, + "grad_norm": 1.530393362045288, + "learning_rate": 9.686298174610745e-05, + "loss": 0.9825, + "step": 14651 + }, + { + "epoch": 0.5247193224345086, + "grad_norm": 1.3257017135620117, + "learning_rate": 9.685138846293171e-05, + "loss": 1.0571, + "step": 14652 + }, + { + "epoch": 0.5247551345640769, + "grad_norm": 1.8393241167068481, + "learning_rate": 9.683979522211652e-05, + "loss": 1.1234, + "step": 14653 + }, + { + "epoch": 0.5247909466936451, + "grad_norm": 1.2826142311096191, + "learning_rate": 9.682820202381779e-05, + "loss": 1.125, + "step": 14654 + }, + { + "epoch": 0.5248267588232134, + "grad_norm": 1.3107086420059204, + "learning_rate": 9.681660886819152e-05, + "loss": 0.9917, + "step": 14655 + }, + { + "epoch": 0.5248625709527817, + "grad_norm": 1.598000168800354, + "learning_rate": 9.680501575539365e-05, + "loss": 1.0835, + "step": 14656 + }, + { + "epoch": 0.52489838308235, + "grad_norm": 1.369288682937622, + "learning_rate": 9.679342268558019e-05, + "loss": 1.0201, + "step": 14657 + }, + { + "epoch": 0.5249341952119183, + "grad_norm": 1.4739296436309814, + "learning_rate": 9.678182965890708e-05, + "loss": 1.1505, + "step": 14658 + }, + { + "epoch": 0.5249700073414866, + "grad_norm": 1.5775461196899414, + "learning_rate": 9.677023667553033e-05, + "loss": 1.0717, + "step": 14659 + }, + { + "epoch": 0.5250058194710548, + "grad_norm": 1.5587737560272217, + "learning_rate": 9.675864373560586e-05, + "loss": 0.9598, + "step": 14660 + }, + { + "epoch": 0.5250416316006231, + "grad_norm": 1.7463949918746948, + "learning_rate": 9.674705083928965e-05, + "loss": 1.2977, + "step": 14661 + }, + { + "epoch": 0.5250774437301914, + "grad_norm": 1.5047333240509033, + "learning_rate": 9.673545798673769e-05, + "loss": 1.0831, + "step": 14662 + }, + { + "epoch": 0.5251132558597597, + "grad_norm": 1.3778029680252075, + "learning_rate": 9.67238651781059e-05, + "loss": 1.0565, + "step": 14663 + }, + { + "epoch": 0.525149067989328, + "grad_norm": 1.522887110710144, + "learning_rate": 9.671227241355031e-05, + "loss": 1.0032, + "step": 14664 + }, + { + "epoch": 0.5251848801188963, + "grad_norm": 1.4701980352401733, + "learning_rate": 9.670067969322684e-05, + "loss": 1.175, + "step": 14665 + }, + { + "epoch": 0.5252206922484646, + "grad_norm": 1.3827142715454102, + "learning_rate": 9.668908701729148e-05, + "loss": 0.9761, + "step": 14666 + }, + { + "epoch": 0.5252565043780328, + "grad_norm": 1.4236021041870117, + "learning_rate": 9.667749438590017e-05, + "loss": 1.1486, + "step": 14667 + }, + { + "epoch": 0.5252923165076011, + "grad_norm": 1.5345367193222046, + "learning_rate": 9.66659017992089e-05, + "loss": 1.0984, + "step": 14668 + }, + { + "epoch": 0.5253281286371694, + "grad_norm": 1.5003331899642944, + "learning_rate": 9.665430925737362e-05, + "loss": 1.1808, + "step": 14669 + }, + { + "epoch": 0.5253639407667376, + "grad_norm": 1.8123353719711304, + "learning_rate": 9.664271676055027e-05, + "loss": 1.0032, + "step": 14670 + }, + { + "epoch": 0.525399752896306, + "grad_norm": 1.320603370666504, + "learning_rate": 9.663112430889487e-05, + "loss": 0.9659, + "step": 14671 + }, + { + "epoch": 0.5254355650258743, + "grad_norm": 1.515417456626892, + "learning_rate": 9.661953190256333e-05, + "loss": 1.2374, + "step": 14672 + }, + { + "epoch": 0.5254713771554426, + "grad_norm": 1.272866129875183, + "learning_rate": 9.660793954171163e-05, + "loss": 1.2902, + "step": 14673 + }, + { + "epoch": 0.5255071892850108, + "grad_norm": 1.288709044456482, + "learning_rate": 9.65963472264957e-05, + "loss": 1.1171, + "step": 14674 + }, + { + "epoch": 0.5255430014145791, + "grad_norm": 1.6877970695495605, + "learning_rate": 9.658475495707157e-05, + "loss": 1.2422, + "step": 14675 + }, + { + "epoch": 0.5255788135441474, + "grad_norm": 1.5443934202194214, + "learning_rate": 9.657316273359515e-05, + "loss": 1.0656, + "step": 14676 + }, + { + "epoch": 0.5256146256737156, + "grad_norm": 1.596598744392395, + "learning_rate": 9.65615705562224e-05, + "loss": 1.2244, + "step": 14677 + }, + { + "epoch": 0.525650437803284, + "grad_norm": 1.3520984649658203, + "learning_rate": 9.654997842510928e-05, + "loss": 1.2903, + "step": 14678 + }, + { + "epoch": 0.5256862499328523, + "grad_norm": 1.39377760887146, + "learning_rate": 9.653838634041173e-05, + "loss": 1.1595, + "step": 14679 + }, + { + "epoch": 0.5257220620624206, + "grad_norm": 1.395277976989746, + "learning_rate": 9.652679430228576e-05, + "loss": 1.2455, + "step": 14680 + }, + { + "epoch": 0.5257578741919888, + "grad_norm": 1.6562211513519287, + "learning_rate": 9.651520231088726e-05, + "loss": 1.1181, + "step": 14681 + }, + { + "epoch": 0.5257936863215571, + "grad_norm": 1.4058704376220703, + "learning_rate": 9.650361036637225e-05, + "loss": 1.1326, + "step": 14682 + }, + { + "epoch": 0.5258294984511254, + "grad_norm": 1.568258285522461, + "learning_rate": 9.649201846889663e-05, + "loss": 1.2293, + "step": 14683 + }, + { + "epoch": 0.5258653105806936, + "grad_norm": 1.5040273666381836, + "learning_rate": 9.64804266186164e-05, + "loss": 1.0811, + "step": 14684 + }, + { + "epoch": 0.525901122710262, + "grad_norm": 1.410474181175232, + "learning_rate": 9.646883481568748e-05, + "loss": 1.1359, + "step": 14685 + }, + { + "epoch": 0.5259369348398303, + "grad_norm": 1.6049667596817017, + "learning_rate": 9.645724306026582e-05, + "loss": 1.1633, + "step": 14686 + }, + { + "epoch": 0.5259727469693986, + "grad_norm": 1.5818753242492676, + "learning_rate": 9.644565135250739e-05, + "loss": 1.0337, + "step": 14687 + }, + { + "epoch": 0.5260085590989668, + "grad_norm": 2.8226253986358643, + "learning_rate": 9.643405969256814e-05, + "loss": 1.1568, + "step": 14688 + }, + { + "epoch": 0.5260443712285351, + "grad_norm": 1.843345046043396, + "learning_rate": 9.642246808060401e-05, + "loss": 1.048, + "step": 14689 + }, + { + "epoch": 0.5260801833581034, + "grad_norm": 1.5724197626113892, + "learning_rate": 9.641087651677096e-05, + "loss": 1.2367, + "step": 14690 + }, + { + "epoch": 0.5261159954876716, + "grad_norm": 1.5970796346664429, + "learning_rate": 9.639928500122495e-05, + "loss": 1.4169, + "step": 14691 + }, + { + "epoch": 0.52615180761724, + "grad_norm": 1.5083154439926147, + "learning_rate": 9.638769353412189e-05, + "loss": 1.3523, + "step": 14692 + }, + { + "epoch": 0.5261876197468083, + "grad_norm": 1.650966763496399, + "learning_rate": 9.637610211561779e-05, + "loss": 1.2269, + "step": 14693 + }, + { + "epoch": 0.5262234318763765, + "grad_norm": 1.7423144578933716, + "learning_rate": 9.636451074586856e-05, + "loss": 1.175, + "step": 14694 + }, + { + "epoch": 0.5262592440059448, + "grad_norm": 1.517987847328186, + "learning_rate": 9.63529194250301e-05, + "loss": 1.0945, + "step": 14695 + }, + { + "epoch": 0.5262950561355131, + "grad_norm": 1.6487540006637573, + "learning_rate": 9.634132815325844e-05, + "loss": 1.0514, + "step": 14696 + }, + { + "epoch": 0.5263308682650814, + "grad_norm": 1.7196379899978638, + "learning_rate": 9.632973693070947e-05, + "loss": 0.9146, + "step": 14697 + }, + { + "epoch": 0.5263666803946496, + "grad_norm": 1.511675477027893, + "learning_rate": 9.631814575753918e-05, + "loss": 1.205, + "step": 14698 + }, + { + "epoch": 0.526402492524218, + "grad_norm": 1.6503844261169434, + "learning_rate": 9.630655463390347e-05, + "loss": 0.9818, + "step": 14699 + }, + { + "epoch": 0.5264383046537863, + "grad_norm": 1.1333649158477783, + "learning_rate": 9.629496355995831e-05, + "loss": 0.9996, + "step": 14700 + }, + { + "epoch": 0.5264741167833545, + "grad_norm": 1.2889615297317505, + "learning_rate": 9.628337253585964e-05, + "loss": 1.0424, + "step": 14701 + }, + { + "epoch": 0.5265099289129228, + "grad_norm": 1.6297190189361572, + "learning_rate": 9.62717815617634e-05, + "loss": 1.1345, + "step": 14702 + }, + { + "epoch": 0.5265457410424911, + "grad_norm": 1.415145993232727, + "learning_rate": 9.626019063782557e-05, + "loss": 1.1295, + "step": 14703 + }, + { + "epoch": 0.5265815531720593, + "grad_norm": 1.1463359594345093, + "learning_rate": 9.624859976420196e-05, + "loss": 1.059, + "step": 14704 + }, + { + "epoch": 0.5266173653016276, + "grad_norm": 1.53556227684021, + "learning_rate": 9.623700894104869e-05, + "loss": 0.9816, + "step": 14705 + }, + { + "epoch": 0.526653177431196, + "grad_norm": 2.726259231567383, + "learning_rate": 9.622541816852153e-05, + "loss": 1.1427, + "step": 14706 + }, + { + "epoch": 0.5266889895607643, + "grad_norm": 2.428560256958008, + "learning_rate": 9.621382744677658e-05, + "loss": 1.38, + "step": 14707 + }, + { + "epoch": 0.5267248016903325, + "grad_norm": 1.1839935779571533, + "learning_rate": 9.620223677596962e-05, + "loss": 1.135, + "step": 14708 + }, + { + "epoch": 0.5267606138199008, + "grad_norm": 1.636155128479004, + "learning_rate": 9.619064615625671e-05, + "loss": 1.0795, + "step": 14709 + }, + { + "epoch": 0.5267964259494691, + "grad_norm": 1.9949535131454468, + "learning_rate": 9.617905558779373e-05, + "loss": 1.3008, + "step": 14710 + }, + { + "epoch": 0.5268322380790373, + "grad_norm": 1.3534977436065674, + "learning_rate": 9.616746507073664e-05, + "loss": 1.0557, + "step": 14711 + }, + { + "epoch": 0.5268680502086056, + "grad_norm": 1.8142400979995728, + "learning_rate": 9.61558746052414e-05, + "loss": 1.3297, + "step": 14712 + }, + { + "epoch": 0.526903862338174, + "grad_norm": 1.1199389696121216, + "learning_rate": 9.614428419146381e-05, + "loss": 1.1335, + "step": 14713 + }, + { + "epoch": 0.5269396744677423, + "grad_norm": 1.9609930515289307, + "learning_rate": 9.613269382956e-05, + "loss": 1.1908, + "step": 14714 + }, + { + "epoch": 0.5269754865973105, + "grad_norm": 1.3779577016830444, + "learning_rate": 9.612110351968573e-05, + "loss": 1.0293, + "step": 14715 + }, + { + "epoch": 0.5270112987268788, + "grad_norm": 1.9707410335540771, + "learning_rate": 9.610951326199707e-05, + "loss": 1.1434, + "step": 14716 + }, + { + "epoch": 0.5270471108564471, + "grad_norm": 1.904111623764038, + "learning_rate": 9.609792305664984e-05, + "loss": 1.3975, + "step": 14717 + }, + { + "epoch": 0.5270829229860153, + "grad_norm": 1.5083128213882446, + "learning_rate": 9.608633290380008e-05, + "loss": 1.205, + "step": 14718 + }, + { + "epoch": 0.5271187351155836, + "grad_norm": 1.2444195747375488, + "learning_rate": 9.60747428036036e-05, + "loss": 1.1022, + "step": 14719 + }, + { + "epoch": 0.527154547245152, + "grad_norm": 1.2170453071594238, + "learning_rate": 9.606315275621644e-05, + "loss": 1.0895, + "step": 14720 + }, + { + "epoch": 0.5271903593747203, + "grad_norm": 1.5701545476913452, + "learning_rate": 9.605156276179447e-05, + "loss": 1.2307, + "step": 14721 + }, + { + "epoch": 0.5272261715042885, + "grad_norm": 1.843735933303833, + "learning_rate": 9.60399728204936e-05, + "loss": 1.2956, + "step": 14722 + }, + { + "epoch": 0.5272619836338568, + "grad_norm": 1.293112874031067, + "learning_rate": 9.602838293246984e-05, + "loss": 1.1233, + "step": 14723 + }, + { + "epoch": 0.5272977957634251, + "grad_norm": 1.33250892162323, + "learning_rate": 9.6016793097879e-05, + "loss": 0.999, + "step": 14724 + }, + { + "epoch": 0.5273336078929933, + "grad_norm": 1.3760782480239868, + "learning_rate": 9.600520331687713e-05, + "loss": 1.2167, + "step": 14725 + }, + { + "epoch": 0.5273694200225616, + "grad_norm": 1.2930368185043335, + "learning_rate": 9.599361358962005e-05, + "loss": 1.0786, + "step": 14726 + }, + { + "epoch": 0.52740523215213, + "grad_norm": 1.4804093837738037, + "learning_rate": 9.598202391626379e-05, + "loss": 1.1373, + "step": 14727 + }, + { + "epoch": 0.5274410442816982, + "grad_norm": 1.5469719171524048, + "learning_rate": 9.597043429696413e-05, + "loss": 1.1732, + "step": 14728 + }, + { + "epoch": 0.5274768564112665, + "grad_norm": 1.5292866230010986, + "learning_rate": 9.595884473187716e-05, + "loss": 1.1957, + "step": 14729 + }, + { + "epoch": 0.5275126685408348, + "grad_norm": 1.2882428169250488, + "learning_rate": 9.594725522115871e-05, + "loss": 1.1615, + "step": 14730 + }, + { + "epoch": 0.527548480670403, + "grad_norm": 1.3895529508590698, + "learning_rate": 9.593566576496468e-05, + "loss": 1.1627, + "step": 14731 + }, + { + "epoch": 0.5275842927999713, + "grad_norm": 1.3042118549346924, + "learning_rate": 9.592407636345104e-05, + "loss": 0.9827, + "step": 14732 + }, + { + "epoch": 0.5276201049295396, + "grad_norm": 1.46929931640625, + "learning_rate": 9.591248701677368e-05, + "loss": 1.1577, + "step": 14733 + }, + { + "epoch": 0.527655917059108, + "grad_norm": 1.2370461225509644, + "learning_rate": 9.590089772508856e-05, + "loss": 1.0546, + "step": 14734 + }, + { + "epoch": 0.5276917291886762, + "grad_norm": 1.4786208868026733, + "learning_rate": 9.588930848855152e-05, + "loss": 1.3438, + "step": 14735 + }, + { + "epoch": 0.5277275413182445, + "grad_norm": 1.3040577173233032, + "learning_rate": 9.58777193073186e-05, + "loss": 1.1624, + "step": 14736 + }, + { + "epoch": 0.5277633534478128, + "grad_norm": 2.067505359649658, + "learning_rate": 9.58661301815456e-05, + "loss": 1.2683, + "step": 14737 + }, + { + "epoch": 0.527799165577381, + "grad_norm": 1.4345883131027222, + "learning_rate": 9.585454111138853e-05, + "loss": 0.9414, + "step": 14738 + }, + { + "epoch": 0.5278349777069493, + "grad_norm": 1.481715440750122, + "learning_rate": 9.584295209700326e-05, + "loss": 1.1814, + "step": 14739 + }, + { + "epoch": 0.5278707898365176, + "grad_norm": 1.5790839195251465, + "learning_rate": 9.583136313854567e-05, + "loss": 1.1685, + "step": 14740 + }, + { + "epoch": 0.527906601966086, + "grad_norm": 1.2207295894622803, + "learning_rate": 9.581977423617173e-05, + "loss": 0.8569, + "step": 14741 + }, + { + "epoch": 0.5279424140956542, + "grad_norm": 1.4446159601211548, + "learning_rate": 9.580818539003733e-05, + "loss": 1.0528, + "step": 14742 + }, + { + "epoch": 0.5279782262252225, + "grad_norm": 1.5821049213409424, + "learning_rate": 9.579659660029841e-05, + "loss": 1.1894, + "step": 14743 + }, + { + "epoch": 0.5280140383547908, + "grad_norm": 1.5513478517532349, + "learning_rate": 9.578500786711082e-05, + "loss": 1.1547, + "step": 14744 + }, + { + "epoch": 0.528049850484359, + "grad_norm": 1.4412842988967896, + "learning_rate": 9.577341919063055e-05, + "loss": 0.94, + "step": 14745 + }, + { + "epoch": 0.5280856626139273, + "grad_norm": 1.4302440881729126, + "learning_rate": 9.576183057101345e-05, + "loss": 1.0029, + "step": 14746 + }, + { + "epoch": 0.5281214747434956, + "grad_norm": 1.5424143075942993, + "learning_rate": 9.575024200841547e-05, + "loss": 1.0643, + "step": 14747 + }, + { + "epoch": 0.528157286873064, + "grad_norm": 2.198852777481079, + "learning_rate": 9.573865350299251e-05, + "loss": 1.3322, + "step": 14748 + }, + { + "epoch": 0.5281930990026322, + "grad_norm": 1.585476279258728, + "learning_rate": 9.572706505490043e-05, + "loss": 1.2429, + "step": 14749 + }, + { + "epoch": 0.5282289111322005, + "grad_norm": 1.4802234172821045, + "learning_rate": 9.571547666429521e-05, + "loss": 1.0168, + "step": 14750 + }, + { + "epoch": 0.5282647232617688, + "grad_norm": 1.3360475301742554, + "learning_rate": 9.57038883313327e-05, + "loss": 1.0679, + "step": 14751 + }, + { + "epoch": 0.528300535391337, + "grad_norm": 1.3898470401763916, + "learning_rate": 9.569230005616887e-05, + "loss": 1.2884, + "step": 14752 + }, + { + "epoch": 0.5283363475209053, + "grad_norm": 1.0816785097122192, + "learning_rate": 9.568071183895954e-05, + "loss": 1.0251, + "step": 14753 + }, + { + "epoch": 0.5283721596504736, + "grad_norm": 1.694063663482666, + "learning_rate": 9.56691236798607e-05, + "loss": 1.0682, + "step": 14754 + }, + { + "epoch": 0.528407971780042, + "grad_norm": 1.5235636234283447, + "learning_rate": 9.565753557902818e-05, + "loss": 1.2962, + "step": 14755 + }, + { + "epoch": 0.5284437839096102, + "grad_norm": 1.3337763547897339, + "learning_rate": 9.564594753661796e-05, + "loss": 1.1506, + "step": 14756 + }, + { + "epoch": 0.5284795960391785, + "grad_norm": 1.755393385887146, + "learning_rate": 9.563435955278587e-05, + "loss": 1.0477, + "step": 14757 + }, + { + "epoch": 0.5285154081687468, + "grad_norm": 1.5966068506240845, + "learning_rate": 9.562277162768785e-05, + "loss": 1.0844, + "step": 14758 + }, + { + "epoch": 0.528551220298315, + "grad_norm": 1.5450941324234009, + "learning_rate": 9.561118376147979e-05, + "loss": 1.3284, + "step": 14759 + }, + { + "epoch": 0.5285870324278833, + "grad_norm": 1.784041166305542, + "learning_rate": 9.559959595431758e-05, + "loss": 1.2194, + "step": 14760 + }, + { + "epoch": 0.5286228445574516, + "grad_norm": 1.438124418258667, + "learning_rate": 9.558800820635715e-05, + "loss": 1.1917, + "step": 14761 + }, + { + "epoch": 0.52865865668702, + "grad_norm": 1.4155910015106201, + "learning_rate": 9.557642051775436e-05, + "loss": 1.1304, + "step": 14762 + }, + { + "epoch": 0.5286944688165882, + "grad_norm": 1.3575936555862427, + "learning_rate": 9.556483288866515e-05, + "loss": 0.968, + "step": 14763 + }, + { + "epoch": 0.5287302809461565, + "grad_norm": 1.420677900314331, + "learning_rate": 9.555324531924536e-05, + "loss": 1.1924, + "step": 14764 + }, + { + "epoch": 0.5287660930757248, + "grad_norm": 1.5075576305389404, + "learning_rate": 9.554165780965095e-05, + "loss": 1.191, + "step": 14765 + }, + { + "epoch": 0.528801905205293, + "grad_norm": 1.4828130006790161, + "learning_rate": 9.553007036003777e-05, + "loss": 1.1315, + "step": 14766 + }, + { + "epoch": 0.5288377173348613, + "grad_norm": 1.805427074432373, + "learning_rate": 9.551848297056171e-05, + "loss": 1.084, + "step": 14767 + }, + { + "epoch": 0.5288735294644296, + "grad_norm": 1.2758477926254272, + "learning_rate": 9.550689564137872e-05, + "loss": 1.0839, + "step": 14768 + }, + { + "epoch": 0.5289093415939979, + "grad_norm": 1.7892613410949707, + "learning_rate": 9.54953083726446e-05, + "loss": 1.1988, + "step": 14769 + }, + { + "epoch": 0.5289451537235662, + "grad_norm": 2.602557897567749, + "learning_rate": 9.548372116451535e-05, + "loss": 1.269, + "step": 14770 + }, + { + "epoch": 0.5289809658531345, + "grad_norm": 1.4858283996582031, + "learning_rate": 9.547213401714677e-05, + "loss": 1.0395, + "step": 14771 + }, + { + "epoch": 0.5290167779827027, + "grad_norm": 1.449230670928955, + "learning_rate": 9.546054693069481e-05, + "loss": 1.2115, + "step": 14772 + }, + { + "epoch": 0.529052590112271, + "grad_norm": 1.2112559080123901, + "learning_rate": 9.544895990531532e-05, + "loss": 1.0961, + "step": 14773 + }, + { + "epoch": 0.5290884022418393, + "grad_norm": 1.3821895122528076, + "learning_rate": 9.54373729411642e-05, + "loss": 1.0069, + "step": 14774 + }, + { + "epoch": 0.5291242143714076, + "grad_norm": 1.8660240173339844, + "learning_rate": 9.542578603839736e-05, + "loss": 1.1628, + "step": 14775 + }, + { + "epoch": 0.5291600265009759, + "grad_norm": 1.7428141832351685, + "learning_rate": 9.541419919717064e-05, + "loss": 0.9216, + "step": 14776 + }, + { + "epoch": 0.5291958386305442, + "grad_norm": 1.4152214527130127, + "learning_rate": 9.540261241763999e-05, + "loss": 1.1175, + "step": 14777 + }, + { + "epoch": 0.5292316507601125, + "grad_norm": 1.4913249015808105, + "learning_rate": 9.539102569996124e-05, + "loss": 1.1178, + "step": 14778 + }, + { + "epoch": 0.5292674628896807, + "grad_norm": 1.5053510665893555, + "learning_rate": 9.53794390442903e-05, + "loss": 1.0778, + "step": 14779 + }, + { + "epoch": 0.529303275019249, + "grad_norm": 1.2755308151245117, + "learning_rate": 9.536785245078304e-05, + "loss": 0.9465, + "step": 14780 + }, + { + "epoch": 0.5293390871488173, + "grad_norm": 1.4014760255813599, + "learning_rate": 9.535626591959536e-05, + "loss": 1.2209, + "step": 14781 + }, + { + "epoch": 0.5293748992783855, + "grad_norm": 1.705365777015686, + "learning_rate": 9.534467945088313e-05, + "loss": 1.1046, + "step": 14782 + }, + { + "epoch": 0.5294107114079539, + "grad_norm": 1.4669654369354248, + "learning_rate": 9.533309304480221e-05, + "loss": 1.2551, + "step": 14783 + }, + { + "epoch": 0.5294465235375222, + "grad_norm": 1.204699993133545, + "learning_rate": 9.532150670150854e-05, + "loss": 1.0242, + "step": 14784 + }, + { + "epoch": 0.5294823356670905, + "grad_norm": 2.813769578933716, + "learning_rate": 9.530992042115794e-05, + "loss": 1.3039, + "step": 14785 + }, + { + "epoch": 0.5295181477966587, + "grad_norm": 1.4501322507858276, + "learning_rate": 9.529833420390631e-05, + "loss": 1.1293, + "step": 14786 + }, + { + "epoch": 0.529553959926227, + "grad_norm": 1.4918397665023804, + "learning_rate": 9.528674804990954e-05, + "loss": 1.1423, + "step": 14787 + }, + { + "epoch": 0.5295897720557953, + "grad_norm": 1.494348406791687, + "learning_rate": 9.527516195932349e-05, + "loss": 1.0372, + "step": 14788 + }, + { + "epoch": 0.5296255841853635, + "grad_norm": 1.7348182201385498, + "learning_rate": 9.526357593230403e-05, + "loss": 1.2385, + "step": 14789 + }, + { + "epoch": 0.5296613963149319, + "grad_norm": 2.076328754425049, + "learning_rate": 9.525198996900707e-05, + "loss": 1.1984, + "step": 14790 + }, + { + "epoch": 0.5296972084445002, + "grad_norm": 2.6297028064727783, + "learning_rate": 9.524040406958847e-05, + "loss": 1.252, + "step": 14791 + }, + { + "epoch": 0.5297330205740685, + "grad_norm": 1.4637964963912964, + "learning_rate": 9.522881823420404e-05, + "loss": 1.1711, + "step": 14792 + }, + { + "epoch": 0.5297688327036367, + "grad_norm": 1.456466555595398, + "learning_rate": 9.521723246300977e-05, + "loss": 1.1532, + "step": 14793 + }, + { + "epoch": 0.529804644833205, + "grad_norm": 1.3202946186065674, + "learning_rate": 9.520564675616141e-05, + "loss": 1.2437, + "step": 14794 + }, + { + "epoch": 0.5298404569627733, + "grad_norm": 1.62357759475708, + "learning_rate": 9.519406111381492e-05, + "loss": 1.0768, + "step": 14795 + }, + { + "epoch": 0.5298762690923415, + "grad_norm": 1.2887701988220215, + "learning_rate": 9.518247553612613e-05, + "loss": 0.983, + "step": 14796 + }, + { + "epoch": 0.5299120812219099, + "grad_norm": 1.8363299369812012, + "learning_rate": 9.517089002325093e-05, + "loss": 1.3758, + "step": 14797 + }, + { + "epoch": 0.5299478933514782, + "grad_norm": 1.5188792943954468, + "learning_rate": 9.515930457534514e-05, + "loss": 0.9393, + "step": 14798 + }, + { + "epoch": 0.5299837054810465, + "grad_norm": 1.4925857782363892, + "learning_rate": 9.514771919256472e-05, + "loss": 1.0207, + "step": 14799 + }, + { + "epoch": 0.5300195176106147, + "grad_norm": 1.5260432958602905, + "learning_rate": 9.513613387506547e-05, + "loss": 1.0858, + "step": 14800 + }, + { + "epoch": 0.530055329740183, + "grad_norm": 1.4759773015975952, + "learning_rate": 9.512454862300321e-05, + "loss": 1.1566, + "step": 14801 + }, + { + "epoch": 0.5300911418697513, + "grad_norm": 1.4902712106704712, + "learning_rate": 9.511296343653391e-05, + "loss": 0.9213, + "step": 14802 + }, + { + "epoch": 0.5301269539993195, + "grad_norm": 2.251478910446167, + "learning_rate": 9.510137831581334e-05, + "loss": 1.2585, + "step": 14803 + }, + { + "epoch": 0.5301627661288879, + "grad_norm": 1.672207236289978, + "learning_rate": 9.508979326099747e-05, + "loss": 0.851, + "step": 14804 + }, + { + "epoch": 0.5301985782584562, + "grad_norm": 1.4776350259780884, + "learning_rate": 9.507820827224202e-05, + "loss": 1.0754, + "step": 14805 + }, + { + "epoch": 0.5302343903880244, + "grad_norm": 1.4811598062515259, + "learning_rate": 9.5066623349703e-05, + "loss": 0.9693, + "step": 14806 + }, + { + "epoch": 0.5302702025175927, + "grad_norm": 1.9017791748046875, + "learning_rate": 9.505503849353613e-05, + "loss": 1.3272, + "step": 14807 + }, + { + "epoch": 0.530306014647161, + "grad_norm": 2.2934184074401855, + "learning_rate": 9.504345370389739e-05, + "loss": 1.2532, + "step": 14808 + }, + { + "epoch": 0.5303418267767293, + "grad_norm": 1.5556684732437134, + "learning_rate": 9.50318689809426e-05, + "loss": 1.2131, + "step": 14809 + }, + { + "epoch": 0.5303776389062975, + "grad_norm": 1.6197898387908936, + "learning_rate": 9.502028432482755e-05, + "loss": 1.1933, + "step": 14810 + }, + { + "epoch": 0.5304134510358659, + "grad_norm": 1.6092915534973145, + "learning_rate": 9.50086997357082e-05, + "loss": 0.9744, + "step": 14811 + }, + { + "epoch": 0.5304492631654342, + "grad_norm": 2.021225929260254, + "learning_rate": 9.499711521374031e-05, + "loss": 1.2448, + "step": 14812 + }, + { + "epoch": 0.5304850752950024, + "grad_norm": 1.8066986799240112, + "learning_rate": 9.498553075907985e-05, + "loss": 1.2802, + "step": 14813 + }, + { + "epoch": 0.5305208874245707, + "grad_norm": 1.9200677871704102, + "learning_rate": 9.497394637188251e-05, + "loss": 1.142, + "step": 14814 + }, + { + "epoch": 0.530556699554139, + "grad_norm": 1.9154771566390991, + "learning_rate": 9.496236205230433e-05, + "loss": 1.1254, + "step": 14815 + }, + { + "epoch": 0.5305925116837072, + "grad_norm": 1.4768359661102295, + "learning_rate": 9.4950777800501e-05, + "loss": 1.137, + "step": 14816 + }, + { + "epoch": 0.5306283238132755, + "grad_norm": 1.5159579515457153, + "learning_rate": 9.49391936166285e-05, + "loss": 1.0484, + "step": 14817 + }, + { + "epoch": 0.5306641359428439, + "grad_norm": 1.5183755159378052, + "learning_rate": 9.492760950084261e-05, + "loss": 1.1584, + "step": 14818 + }, + { + "epoch": 0.5306999480724122, + "grad_norm": 1.4934260845184326, + "learning_rate": 9.491602545329916e-05, + "loss": 1.1417, + "step": 14819 + }, + { + "epoch": 0.5307357602019804, + "grad_norm": 1.4749314785003662, + "learning_rate": 9.490444147415407e-05, + "loss": 1.092, + "step": 14820 + }, + { + "epoch": 0.5307715723315487, + "grad_norm": 2.190035581588745, + "learning_rate": 9.489285756356307e-05, + "loss": 1.2402, + "step": 14821 + }, + { + "epoch": 0.530807384461117, + "grad_norm": 1.4720195531845093, + "learning_rate": 9.488127372168218e-05, + "loss": 1.1673, + "step": 14822 + }, + { + "epoch": 0.5308431965906852, + "grad_norm": 2.487755060195923, + "learning_rate": 9.486968994866708e-05, + "loss": 1.2495, + "step": 14823 + }, + { + "epoch": 0.5308790087202535, + "grad_norm": 1.7835488319396973, + "learning_rate": 9.485810624467372e-05, + "loss": 1.0196, + "step": 14824 + }, + { + "epoch": 0.5309148208498219, + "grad_norm": 2.0461716651916504, + "learning_rate": 9.484652260985787e-05, + "loss": 1.1501, + "step": 14825 + }, + { + "epoch": 0.5309506329793902, + "grad_norm": 1.1799876689910889, + "learning_rate": 9.483493904437548e-05, + "loss": 0.9539, + "step": 14826 + }, + { + "epoch": 0.5309864451089584, + "grad_norm": 1.7071068286895752, + "learning_rate": 9.482335554838229e-05, + "loss": 0.9579, + "step": 14827 + }, + { + "epoch": 0.5310222572385267, + "grad_norm": 1.5523114204406738, + "learning_rate": 9.481177212203415e-05, + "loss": 1.379, + "step": 14828 + }, + { + "epoch": 0.531058069368095, + "grad_norm": 1.431685447692871, + "learning_rate": 9.480018876548695e-05, + "loss": 1.133, + "step": 14829 + }, + { + "epoch": 0.5310938814976632, + "grad_norm": 1.5374161005020142, + "learning_rate": 9.478860547889647e-05, + "loss": 1.1361, + "step": 14830 + }, + { + "epoch": 0.5311296936272315, + "grad_norm": 1.5774390697479248, + "learning_rate": 9.477702226241862e-05, + "loss": 1.0689, + "step": 14831 + }, + { + "epoch": 0.5311655057567999, + "grad_norm": 1.4005728960037231, + "learning_rate": 9.476543911620918e-05, + "loss": 1.2163, + "step": 14832 + }, + { + "epoch": 0.5312013178863682, + "grad_norm": 1.501123309135437, + "learning_rate": 9.4753856040424e-05, + "loss": 1.1008, + "step": 14833 + }, + { + "epoch": 0.5312371300159364, + "grad_norm": 1.7082552909851074, + "learning_rate": 9.47422730352189e-05, + "loss": 0.872, + "step": 14834 + }, + { + "epoch": 0.5312729421455047, + "grad_norm": 1.5187506675720215, + "learning_rate": 9.47306901007498e-05, + "loss": 1.0625, + "step": 14835 + }, + { + "epoch": 0.531308754275073, + "grad_norm": 2.07598876953125, + "learning_rate": 9.471910723717243e-05, + "loss": 1.1135, + "step": 14836 + }, + { + "epoch": 0.5313445664046412, + "grad_norm": 2.36104679107666, + "learning_rate": 9.470752444464265e-05, + "loss": 1.2309, + "step": 14837 + }, + { + "epoch": 0.5313803785342095, + "grad_norm": 1.3936169147491455, + "learning_rate": 9.469594172331631e-05, + "loss": 1.0483, + "step": 14838 + }, + { + "epoch": 0.5314161906637779, + "grad_norm": 1.5029224157333374, + "learning_rate": 9.468435907334922e-05, + "loss": 0.9468, + "step": 14839 + }, + { + "epoch": 0.5314520027933461, + "grad_norm": 1.3032505512237549, + "learning_rate": 9.467277649489725e-05, + "loss": 1.0884, + "step": 14840 + }, + { + "epoch": 0.5314878149229144, + "grad_norm": 2.508803129196167, + "learning_rate": 9.466119398811617e-05, + "loss": 1.2045, + "step": 14841 + }, + { + "epoch": 0.5315236270524827, + "grad_norm": 1.3341032266616821, + "learning_rate": 9.464961155316187e-05, + "loss": 1.1334, + "step": 14842 + }, + { + "epoch": 0.531559439182051, + "grad_norm": 1.475978970527649, + "learning_rate": 9.463802919019011e-05, + "loss": 0.9335, + "step": 14843 + }, + { + "epoch": 0.5315952513116192, + "grad_norm": 1.8344565629959106, + "learning_rate": 9.462644689935678e-05, + "loss": 1.2786, + "step": 14844 + }, + { + "epoch": 0.5316310634411875, + "grad_norm": 1.3954813480377197, + "learning_rate": 9.461486468081768e-05, + "loss": 1.1472, + "step": 14845 + }, + { + "epoch": 0.5316668755707559, + "grad_norm": 1.4746564626693726, + "learning_rate": 9.460328253472859e-05, + "loss": 1.2517, + "step": 14846 + }, + { + "epoch": 0.5317026877003241, + "grad_norm": 1.7212871313095093, + "learning_rate": 9.459170046124542e-05, + "loss": 1.2104, + "step": 14847 + }, + { + "epoch": 0.5317384998298924, + "grad_norm": 1.3669325113296509, + "learning_rate": 9.458011846052391e-05, + "loss": 1.1474, + "step": 14848 + }, + { + "epoch": 0.5317743119594607, + "grad_norm": 1.950591802597046, + "learning_rate": 9.456853653271992e-05, + "loss": 1.3107, + "step": 14849 + }, + { + "epoch": 0.531810124089029, + "grad_norm": 1.3481786251068115, + "learning_rate": 9.455695467798927e-05, + "loss": 1.0822, + "step": 14850 + }, + { + "epoch": 0.5318459362185972, + "grad_norm": 1.6258668899536133, + "learning_rate": 9.454537289648779e-05, + "loss": 1.1597, + "step": 14851 + }, + { + "epoch": 0.5318817483481655, + "grad_norm": 1.5836461782455444, + "learning_rate": 9.453379118837125e-05, + "loss": 1.0153, + "step": 14852 + }, + { + "epoch": 0.5319175604777338, + "grad_norm": 1.6212257146835327, + "learning_rate": 9.452220955379553e-05, + "loss": 1.2875, + "step": 14853 + }, + { + "epoch": 0.5319533726073021, + "grad_norm": 1.6632713079452515, + "learning_rate": 9.45106279929164e-05, + "loss": 0.953, + "step": 14854 + }, + { + "epoch": 0.5319891847368704, + "grad_norm": 1.4963592290878296, + "learning_rate": 9.449904650588968e-05, + "loss": 1.0398, + "step": 14855 + }, + { + "epoch": 0.5320249968664387, + "grad_norm": 1.6644805669784546, + "learning_rate": 9.44874650928712e-05, + "loss": 0.9373, + "step": 14856 + }, + { + "epoch": 0.5320608089960069, + "grad_norm": 1.167689561843872, + "learning_rate": 9.447588375401676e-05, + "loss": 0.9554, + "step": 14857 + }, + { + "epoch": 0.5320966211255752, + "grad_norm": 1.595596194267273, + "learning_rate": 9.44643024894822e-05, + "loss": 0.9686, + "step": 14858 + }, + { + "epoch": 0.5321324332551435, + "grad_norm": 1.454735279083252, + "learning_rate": 9.445272129942329e-05, + "loss": 1.0433, + "step": 14859 + }, + { + "epoch": 0.5321682453847117, + "grad_norm": 1.6779228448867798, + "learning_rate": 9.444114018399588e-05, + "loss": 1.0944, + "step": 14860 + }, + { + "epoch": 0.5322040575142801, + "grad_norm": 1.6790865659713745, + "learning_rate": 9.442955914335573e-05, + "loss": 1.1262, + "step": 14861 + }, + { + "epoch": 0.5322398696438484, + "grad_norm": 1.4926429986953735, + "learning_rate": 9.441797817765869e-05, + "loss": 1.0718, + "step": 14862 + }, + { + "epoch": 0.5322756817734167, + "grad_norm": 1.3747624158859253, + "learning_rate": 9.440639728706058e-05, + "loss": 0.9102, + "step": 14863 + }, + { + "epoch": 0.5323114939029849, + "grad_norm": 1.5490063428878784, + "learning_rate": 9.439481647171714e-05, + "loss": 1.139, + "step": 14864 + }, + { + "epoch": 0.5323473060325532, + "grad_norm": 1.6278611421585083, + "learning_rate": 9.438323573178424e-05, + "loss": 1.0825, + "step": 14865 + }, + { + "epoch": 0.5323831181621215, + "grad_norm": 2.1969034671783447, + "learning_rate": 9.437165506741764e-05, + "loss": 1.111, + "step": 14866 + }, + { + "epoch": 0.5324189302916897, + "grad_norm": 1.5938283205032349, + "learning_rate": 9.436007447877316e-05, + "loss": 0.9157, + "step": 14867 + }, + { + "epoch": 0.5324547424212581, + "grad_norm": 1.5000265836715698, + "learning_rate": 9.43484939660066e-05, + "loss": 0.9597, + "step": 14868 + }, + { + "epoch": 0.5324905545508264, + "grad_norm": 1.6122231483459473, + "learning_rate": 9.433691352927378e-05, + "loss": 1.1743, + "step": 14869 + }, + { + "epoch": 0.5325263666803947, + "grad_norm": 1.603891134262085, + "learning_rate": 9.43253331687305e-05, + "loss": 1.0984, + "step": 14870 + }, + { + "epoch": 0.5325621788099629, + "grad_norm": 1.8744386434555054, + "learning_rate": 9.43137528845325e-05, + "loss": 1.1748, + "step": 14871 + }, + { + "epoch": 0.5325979909395312, + "grad_norm": 1.9496805667877197, + "learning_rate": 9.430217267683566e-05, + "loss": 1.4074, + "step": 14872 + }, + { + "epoch": 0.5326338030690995, + "grad_norm": 1.1663883924484253, + "learning_rate": 9.42905925457957e-05, + "loss": 1.0782, + "step": 14873 + }, + { + "epoch": 0.5326696151986677, + "grad_norm": 1.5508592128753662, + "learning_rate": 9.427901249156847e-05, + "loss": 1.0737, + "step": 14874 + }, + { + "epoch": 0.5327054273282361, + "grad_norm": 1.6840400695800781, + "learning_rate": 9.426743251430974e-05, + "loss": 1.1442, + "step": 14875 + }, + { + "epoch": 0.5327412394578044, + "grad_norm": 1.7443331480026245, + "learning_rate": 9.425585261417533e-05, + "loss": 1.1482, + "step": 14876 + }, + { + "epoch": 0.5327770515873727, + "grad_norm": 1.7416257858276367, + "learning_rate": 9.424427279132099e-05, + "loss": 1.1321, + "step": 14877 + }, + { + "epoch": 0.5328128637169409, + "grad_norm": 1.331929326057434, + "learning_rate": 9.423269304590256e-05, + "loss": 0.8493, + "step": 14878 + }, + { + "epoch": 0.5328486758465092, + "grad_norm": 1.6781866550445557, + "learning_rate": 9.42211133780758e-05, + "loss": 1.2456, + "step": 14879 + }, + { + "epoch": 0.5328844879760775, + "grad_norm": 1.3908989429473877, + "learning_rate": 9.420953378799649e-05, + "loss": 1.1065, + "step": 14880 + }, + { + "epoch": 0.5329203001056457, + "grad_norm": 1.7545521259307861, + "learning_rate": 9.419795427582044e-05, + "loss": 1.0478, + "step": 14881 + }, + { + "epoch": 0.5329561122352141, + "grad_norm": 1.6206848621368408, + "learning_rate": 9.418637484170344e-05, + "loss": 1.1772, + "step": 14882 + }, + { + "epoch": 0.5329919243647824, + "grad_norm": 1.5065975189208984, + "learning_rate": 9.417479548580126e-05, + "loss": 1.1219, + "step": 14883 + }, + { + "epoch": 0.5330277364943506, + "grad_norm": 1.4047303199768066, + "learning_rate": 9.416321620826968e-05, + "loss": 1.356, + "step": 14884 + }, + { + "epoch": 0.5330635486239189, + "grad_norm": 1.391148328781128, + "learning_rate": 9.415163700926451e-05, + "loss": 1.0172, + "step": 14885 + }, + { + "epoch": 0.5330993607534872, + "grad_norm": 2.1556100845336914, + "learning_rate": 9.414005788894151e-05, + "loss": 1.3379, + "step": 14886 + }, + { + "epoch": 0.5331351728830555, + "grad_norm": 1.561303734779358, + "learning_rate": 9.41284788474565e-05, + "loss": 0.9909, + "step": 14887 + }, + { + "epoch": 0.5331709850126237, + "grad_norm": 1.2855076789855957, + "learning_rate": 9.411689988496526e-05, + "loss": 1.2909, + "step": 14888 + }, + { + "epoch": 0.5332067971421921, + "grad_norm": 1.6130969524383545, + "learning_rate": 9.410532100162344e-05, + "loss": 1.0946, + "step": 14889 + }, + { + "epoch": 0.5332426092717604, + "grad_norm": 1.943490982055664, + "learning_rate": 9.409374219758702e-05, + "loss": 1.0363, + "step": 14890 + }, + { + "epoch": 0.5332784214013286, + "grad_norm": 1.450260877609253, + "learning_rate": 9.408216347301161e-05, + "loss": 1.1972, + "step": 14891 + }, + { + "epoch": 0.5333142335308969, + "grad_norm": 1.7345454692840576, + "learning_rate": 9.40705848280531e-05, + "loss": 1.1496, + "step": 14892 + }, + { + "epoch": 0.5333500456604652, + "grad_norm": 1.3594058752059937, + "learning_rate": 9.40590062628672e-05, + "loss": 1.1247, + "step": 14893 + }, + { + "epoch": 0.5333858577900334, + "grad_norm": 1.457862138748169, + "learning_rate": 9.404742777760974e-05, + "loss": 1.1166, + "step": 14894 + }, + { + "epoch": 0.5334216699196017, + "grad_norm": 1.758289098739624, + "learning_rate": 9.403584937243642e-05, + "loss": 1.2164, + "step": 14895 + }, + { + "epoch": 0.5334574820491701, + "grad_norm": 2.005007743835449, + "learning_rate": 9.402427104750308e-05, + "loss": 1.1699, + "step": 14896 + }, + { + "epoch": 0.5334932941787384, + "grad_norm": 1.4692554473876953, + "learning_rate": 9.401269280296549e-05, + "loss": 0.976, + "step": 14897 + }, + { + "epoch": 0.5335291063083066, + "grad_norm": 1.4492331743240356, + "learning_rate": 9.400111463897932e-05, + "loss": 1.0834, + "step": 14898 + }, + { + "epoch": 0.5335649184378749, + "grad_norm": 1.543599009513855, + "learning_rate": 9.39895365557005e-05, + "loss": 1.0542, + "step": 14899 + }, + { + "epoch": 0.5336007305674432, + "grad_norm": 1.5897825956344604, + "learning_rate": 9.397795855328464e-05, + "loss": 1.1377, + "step": 14900 + }, + { + "epoch": 0.5336365426970114, + "grad_norm": 1.499152421951294, + "learning_rate": 9.396638063188764e-05, + "loss": 1.2449, + "step": 14901 + }, + { + "epoch": 0.5336723548265797, + "grad_norm": 1.7493963241577148, + "learning_rate": 9.395480279166514e-05, + "loss": 1.1667, + "step": 14902 + }, + { + "epoch": 0.5337081669561481, + "grad_norm": 2.0380172729492188, + "learning_rate": 9.394322503277305e-05, + "loss": 1.1337, + "step": 14903 + }, + { + "epoch": 0.5337439790857164, + "grad_norm": 1.2647238969802856, + "learning_rate": 9.393164735536696e-05, + "loss": 0.9647, + "step": 14904 + }, + { + "epoch": 0.5337797912152846, + "grad_norm": 1.5626670122146606, + "learning_rate": 9.39200697596028e-05, + "loss": 1.0136, + "step": 14905 + }, + { + "epoch": 0.5338156033448529, + "grad_norm": 1.6411333084106445, + "learning_rate": 9.390849224563627e-05, + "loss": 1.148, + "step": 14906 + }, + { + "epoch": 0.5338514154744212, + "grad_norm": 2.2838497161865234, + "learning_rate": 9.389691481362304e-05, + "loss": 1.1307, + "step": 14907 + }, + { + "epoch": 0.5338872276039894, + "grad_norm": 2.7651031017303467, + "learning_rate": 9.388533746371904e-05, + "loss": 1.046, + "step": 14908 + }, + { + "epoch": 0.5339230397335577, + "grad_norm": 1.5325324535369873, + "learning_rate": 9.387376019607985e-05, + "loss": 0.9337, + "step": 14909 + }, + { + "epoch": 0.5339588518631261, + "grad_norm": 1.2559362649917603, + "learning_rate": 9.386218301086139e-05, + "loss": 0.9559, + "step": 14910 + }, + { + "epoch": 0.5339946639926944, + "grad_norm": 1.7200263738632202, + "learning_rate": 9.385060590821929e-05, + "loss": 1.0561, + "step": 14911 + }, + { + "epoch": 0.5340304761222626, + "grad_norm": 2.039020538330078, + "learning_rate": 9.38390288883094e-05, + "loss": 1.3602, + "step": 14912 + }, + { + "epoch": 0.5340662882518309, + "grad_norm": 1.559288740158081, + "learning_rate": 9.382745195128736e-05, + "loss": 1.0199, + "step": 14913 + }, + { + "epoch": 0.5341021003813992, + "grad_norm": 1.6985760927200317, + "learning_rate": 9.381587509730907e-05, + "loss": 1.2244, + "step": 14914 + }, + { + "epoch": 0.5341379125109674, + "grad_norm": 1.4094542264938354, + "learning_rate": 9.380429832653017e-05, + "loss": 0.9829, + "step": 14915 + }, + { + "epoch": 0.5341737246405357, + "grad_norm": 1.3895437717437744, + "learning_rate": 9.379272163910643e-05, + "loss": 0.9998, + "step": 14916 + }, + { + "epoch": 0.5342095367701041, + "grad_norm": 1.7543543577194214, + "learning_rate": 9.378114503519364e-05, + "loss": 0.9976, + "step": 14917 + }, + { + "epoch": 0.5342453488996723, + "grad_norm": 1.3998806476593018, + "learning_rate": 9.376956851494747e-05, + "loss": 1.1364, + "step": 14918 + }, + { + "epoch": 0.5342811610292406, + "grad_norm": 1.3590837717056274, + "learning_rate": 9.375799207852379e-05, + "loss": 1.173, + "step": 14919 + }, + { + "epoch": 0.5343169731588089, + "grad_norm": 1.2621010541915894, + "learning_rate": 9.37464157260782e-05, + "loss": 1.1147, + "step": 14920 + }, + { + "epoch": 0.5343527852883772, + "grad_norm": 1.8373610973358154, + "learning_rate": 9.37348394577666e-05, + "loss": 1.3156, + "step": 14921 + }, + { + "epoch": 0.5343885974179454, + "grad_norm": 1.8537278175354004, + "learning_rate": 9.372326327374459e-05, + "loss": 1.0425, + "step": 14922 + }, + { + "epoch": 0.5344244095475137, + "grad_norm": 1.7958790063858032, + "learning_rate": 9.371168717416803e-05, + "loss": 1.1156, + "step": 14923 + }, + { + "epoch": 0.5344602216770821, + "grad_norm": 1.8511582612991333, + "learning_rate": 9.370011115919258e-05, + "loss": 1.2, + "step": 14924 + }, + { + "epoch": 0.5344960338066503, + "grad_norm": 1.4017881155014038, + "learning_rate": 9.368853522897399e-05, + "loss": 1.1419, + "step": 14925 + }, + { + "epoch": 0.5345318459362186, + "grad_norm": 1.441788673400879, + "learning_rate": 9.367695938366805e-05, + "loss": 0.9551, + "step": 14926 + }, + { + "epoch": 0.5345676580657869, + "grad_norm": 1.247322678565979, + "learning_rate": 9.366538362343043e-05, + "loss": 1.1239, + "step": 14927 + }, + { + "epoch": 0.5346034701953551, + "grad_norm": 1.337871789932251, + "learning_rate": 9.365380794841694e-05, + "loss": 1.1955, + "step": 14928 + }, + { + "epoch": 0.5346392823249234, + "grad_norm": 1.6291191577911377, + "learning_rate": 9.364223235878324e-05, + "loss": 1.2357, + "step": 14929 + }, + { + "epoch": 0.5346750944544917, + "grad_norm": 1.9941695928573608, + "learning_rate": 9.363065685468514e-05, + "loss": 1.241, + "step": 14930 + }, + { + "epoch": 0.5347109065840601, + "grad_norm": 1.4657315015792847, + "learning_rate": 9.361908143627829e-05, + "loss": 1.027, + "step": 14931 + }, + { + "epoch": 0.5347467187136283, + "grad_norm": 1.4103633165359497, + "learning_rate": 9.360750610371852e-05, + "loss": 1.3498, + "step": 14932 + }, + { + "epoch": 0.5347825308431966, + "grad_norm": 1.6919175386428833, + "learning_rate": 9.35959308571615e-05, + "loss": 1.1267, + "step": 14933 + }, + { + "epoch": 0.5348183429727649, + "grad_norm": 1.4615607261657715, + "learning_rate": 9.358435569676295e-05, + "loss": 1.0464, + "step": 14934 + }, + { + "epoch": 0.5348541551023331, + "grad_norm": 1.44126296043396, + "learning_rate": 9.357278062267863e-05, + "loss": 1.1647, + "step": 14935 + }, + { + "epoch": 0.5348899672319014, + "grad_norm": 1.8481472730636597, + "learning_rate": 9.356120563506424e-05, + "loss": 1.0989, + "step": 14936 + }, + { + "epoch": 0.5349257793614697, + "grad_norm": 1.4080053567886353, + "learning_rate": 9.354963073407555e-05, + "loss": 1.2661, + "step": 14937 + }, + { + "epoch": 0.5349615914910381, + "grad_norm": 1.4256287813186646, + "learning_rate": 9.353805591986822e-05, + "loss": 0.9613, + "step": 14938 + }, + { + "epoch": 0.5349974036206063, + "grad_norm": 1.6351466178894043, + "learning_rate": 9.352648119259804e-05, + "loss": 1.0169, + "step": 14939 + }, + { + "epoch": 0.5350332157501746, + "grad_norm": 1.313469648361206, + "learning_rate": 9.35149065524207e-05, + "loss": 0.9571, + "step": 14940 + }, + { + "epoch": 0.5350690278797429, + "grad_norm": 1.865895390510559, + "learning_rate": 9.350333199949193e-05, + "loss": 1.4014, + "step": 14941 + }, + { + "epoch": 0.5351048400093111, + "grad_norm": 1.8707988262176514, + "learning_rate": 9.349175753396746e-05, + "loss": 1.2102, + "step": 14942 + }, + { + "epoch": 0.5351406521388794, + "grad_norm": 2.32592511177063, + "learning_rate": 9.348018315600297e-05, + "loss": 1.1783, + "step": 14943 + }, + { + "epoch": 0.5351764642684477, + "grad_norm": 1.6084513664245605, + "learning_rate": 9.346860886575422e-05, + "loss": 1.227, + "step": 14944 + }, + { + "epoch": 0.535212276398016, + "grad_norm": 1.5620311498641968, + "learning_rate": 9.34570346633769e-05, + "loss": 1.1781, + "step": 14945 + }, + { + "epoch": 0.5352480885275843, + "grad_norm": 1.6475317478179932, + "learning_rate": 9.344546054902677e-05, + "loss": 1.2579, + "step": 14946 + }, + { + "epoch": 0.5352839006571526, + "grad_norm": 2.1068477630615234, + "learning_rate": 9.343388652285947e-05, + "loss": 1.1809, + "step": 14947 + }, + { + "epoch": 0.5353197127867209, + "grad_norm": 1.6255340576171875, + "learning_rate": 9.342231258503079e-05, + "loss": 1.382, + "step": 14948 + }, + { + "epoch": 0.5353555249162891, + "grad_norm": 1.317090630531311, + "learning_rate": 9.34107387356964e-05, + "loss": 1.0726, + "step": 14949 + }, + { + "epoch": 0.5353913370458574, + "grad_norm": 1.3818281888961792, + "learning_rate": 9.339916497501202e-05, + "loss": 1.1832, + "step": 14950 + }, + { + "epoch": 0.5354271491754257, + "grad_norm": 1.7423126697540283, + "learning_rate": 9.338759130313338e-05, + "loss": 1.3103, + "step": 14951 + }, + { + "epoch": 0.535462961304994, + "grad_norm": 1.4463268518447876, + "learning_rate": 9.337601772021612e-05, + "loss": 0.8746, + "step": 14952 + }, + { + "epoch": 0.5354987734345623, + "grad_norm": 1.5333232879638672, + "learning_rate": 9.336444422641605e-05, + "loss": 1.18, + "step": 14953 + }, + { + "epoch": 0.5355345855641306, + "grad_norm": 1.2173194885253906, + "learning_rate": 9.335287082188878e-05, + "loss": 1.1255, + "step": 14954 + }, + { + "epoch": 0.5355703976936989, + "grad_norm": 1.4789983034133911, + "learning_rate": 9.334129750679009e-05, + "loss": 1.167, + "step": 14955 + }, + { + "epoch": 0.5356062098232671, + "grad_norm": 1.7568895816802979, + "learning_rate": 9.332972428127563e-05, + "loss": 1.119, + "step": 14956 + }, + { + "epoch": 0.5356420219528354, + "grad_norm": 1.5837883949279785, + "learning_rate": 9.331815114550115e-05, + "loss": 1.1895, + "step": 14957 + }, + { + "epoch": 0.5356778340824037, + "grad_norm": 1.5895220041275024, + "learning_rate": 9.330657809962231e-05, + "loss": 1.0213, + "step": 14958 + }, + { + "epoch": 0.535713646211972, + "grad_norm": 1.5916731357574463, + "learning_rate": 9.329500514379485e-05, + "loss": 1.0195, + "step": 14959 + }, + { + "epoch": 0.5357494583415403, + "grad_norm": 1.4735480546951294, + "learning_rate": 9.328343227817443e-05, + "loss": 1.4211, + "step": 14960 + }, + { + "epoch": 0.5357852704711086, + "grad_norm": 1.6805269718170166, + "learning_rate": 9.327185950291676e-05, + "loss": 1.2356, + "step": 14961 + }, + { + "epoch": 0.5358210826006768, + "grad_norm": 2.085702896118164, + "learning_rate": 9.326028681817755e-05, + "loss": 1.196, + "step": 14962 + }, + { + "epoch": 0.5358568947302451, + "grad_norm": 1.5968742370605469, + "learning_rate": 9.324871422411248e-05, + "loss": 1.0565, + "step": 14963 + }, + { + "epoch": 0.5358927068598134, + "grad_norm": 1.1334501504898071, + "learning_rate": 9.323714172087726e-05, + "loss": 1.0635, + "step": 14964 + }, + { + "epoch": 0.5359285189893817, + "grad_norm": 1.4915575981140137, + "learning_rate": 9.322556930862757e-05, + "loss": 1.0738, + "step": 14965 + }, + { + "epoch": 0.53596433111895, + "grad_norm": 1.4046863317489624, + "learning_rate": 9.321399698751912e-05, + "loss": 0.9287, + "step": 14966 + }, + { + "epoch": 0.5360001432485183, + "grad_norm": 1.6732903718948364, + "learning_rate": 9.320242475770756e-05, + "loss": 1.1003, + "step": 14967 + }, + { + "epoch": 0.5360359553780866, + "grad_norm": 1.8782103061676025, + "learning_rate": 9.319085261934864e-05, + "loss": 1.0478, + "step": 14968 + }, + { + "epoch": 0.5360717675076548, + "grad_norm": 1.2656646966934204, + "learning_rate": 9.317928057259799e-05, + "loss": 1.168, + "step": 14969 + }, + { + "epoch": 0.5361075796372231, + "grad_norm": 1.3899558782577515, + "learning_rate": 9.316770861761132e-05, + "loss": 1.2389, + "step": 14970 + }, + { + "epoch": 0.5361433917667914, + "grad_norm": 1.5616984367370605, + "learning_rate": 9.315613675454435e-05, + "loss": 1.2718, + "step": 14971 + }, + { + "epoch": 0.5361792038963596, + "grad_norm": 1.58443284034729, + "learning_rate": 9.314456498355269e-05, + "loss": 0.968, + "step": 14972 + }, + { + "epoch": 0.536215016025928, + "grad_norm": 1.5088391304016113, + "learning_rate": 9.313299330479209e-05, + "loss": 1.005, + "step": 14973 + }, + { + "epoch": 0.5362508281554963, + "grad_norm": 1.6626956462860107, + "learning_rate": 9.31214217184182e-05, + "loss": 1.2386, + "step": 14974 + }, + { + "epoch": 0.5362866402850646, + "grad_norm": 1.7172918319702148, + "learning_rate": 9.31098502245867e-05, + "loss": 1.082, + "step": 14975 + }, + { + "epoch": 0.5363224524146328, + "grad_norm": 1.535685658454895, + "learning_rate": 9.30982788234533e-05, + "loss": 1.0663, + "step": 14976 + }, + { + "epoch": 0.5363582645442011, + "grad_norm": 1.4920706748962402, + "learning_rate": 9.308670751517363e-05, + "loss": 1.0523, + "step": 14977 + }, + { + "epoch": 0.5363940766737694, + "grad_norm": 1.943152904510498, + "learning_rate": 9.307513629990342e-05, + "loss": 1.0245, + "step": 14978 + }, + { + "epoch": 0.5364298888033376, + "grad_norm": 1.545335054397583, + "learning_rate": 9.306356517779828e-05, + "loss": 1.207, + "step": 14979 + }, + { + "epoch": 0.536465700932906, + "grad_norm": 2.005335807800293, + "learning_rate": 9.305199414901397e-05, + "loss": 1.2812, + "step": 14980 + }, + { + "epoch": 0.5365015130624743, + "grad_norm": 1.469110369682312, + "learning_rate": 9.304042321370607e-05, + "loss": 1.1015, + "step": 14981 + }, + { + "epoch": 0.5365373251920426, + "grad_norm": 2.3753278255462646, + "learning_rate": 9.302885237203034e-05, + "loss": 1.0912, + "step": 14982 + }, + { + "epoch": 0.5365731373216108, + "grad_norm": 1.391379714012146, + "learning_rate": 9.301728162414238e-05, + "loss": 0.9585, + "step": 14983 + }, + { + "epoch": 0.5366089494511791, + "grad_norm": 1.4915778636932373, + "learning_rate": 9.30057109701979e-05, + "loss": 1.2303, + "step": 14984 + }, + { + "epoch": 0.5366447615807474, + "grad_norm": 1.693599820137024, + "learning_rate": 9.299414041035259e-05, + "loss": 1.3213, + "step": 14985 + }, + { + "epoch": 0.5366805737103156, + "grad_norm": 1.4999797344207764, + "learning_rate": 9.298256994476202e-05, + "loss": 1.2223, + "step": 14986 + }, + { + "epoch": 0.536716385839884, + "grad_norm": 1.5191116333007812, + "learning_rate": 9.297099957358199e-05, + "loss": 1.2843, + "step": 14987 + }, + { + "epoch": 0.5367521979694523, + "grad_norm": 1.3215291500091553, + "learning_rate": 9.295942929696801e-05, + "loss": 0.9012, + "step": 14988 + }, + { + "epoch": 0.5367880100990206, + "grad_norm": 2.2382352352142334, + "learning_rate": 9.294785911507589e-05, + "loss": 1.0903, + "step": 14989 + }, + { + "epoch": 0.5368238222285888, + "grad_norm": 1.5963225364685059, + "learning_rate": 9.29362890280612e-05, + "loss": 1.3512, + "step": 14990 + }, + { + "epoch": 0.5368596343581571, + "grad_norm": 1.4092416763305664, + "learning_rate": 9.292471903607964e-05, + "loss": 1.2451, + "step": 14991 + }, + { + "epoch": 0.5368954464877254, + "grad_norm": 1.5784622430801392, + "learning_rate": 9.291314913928685e-05, + "loss": 1.0576, + "step": 14992 + }, + { + "epoch": 0.5369312586172936, + "grad_norm": 1.2747353315353394, + "learning_rate": 9.290157933783852e-05, + "loss": 0.942, + "step": 14993 + }, + { + "epoch": 0.536967070746862, + "grad_norm": 1.5280437469482422, + "learning_rate": 9.28900096318903e-05, + "loss": 1.0155, + "step": 14994 + }, + { + "epoch": 0.5370028828764303, + "grad_norm": 1.4269922971725464, + "learning_rate": 9.287844002159776e-05, + "loss": 1.2103, + "step": 14995 + }, + { + "epoch": 0.5370386950059985, + "grad_norm": 1.5923526287078857, + "learning_rate": 9.286687050711668e-05, + "loss": 1.1067, + "step": 14996 + }, + { + "epoch": 0.5370745071355668, + "grad_norm": 1.5695033073425293, + "learning_rate": 9.285530108860262e-05, + "loss": 1.1747, + "step": 14997 + }, + { + "epoch": 0.5371103192651351, + "grad_norm": 1.3779423236846924, + "learning_rate": 9.284373176621131e-05, + "loss": 1.2247, + "step": 14998 + }, + { + "epoch": 0.5371461313947034, + "grad_norm": 1.6922506093978882, + "learning_rate": 9.28321625400983e-05, + "loss": 1.1067, + "step": 14999 + }, + { + "epoch": 0.5371819435242716, + "grad_norm": 1.5318243503570557, + "learning_rate": 9.282059341041936e-05, + "loss": 1.1011, + "step": 15000 + }, + { + "epoch": 0.53721775565384, + "grad_norm": 1.6521897315979004, + "learning_rate": 9.280902437733003e-05, + "loss": 0.9766, + "step": 15001 + }, + { + "epoch": 0.5372535677834083, + "grad_norm": 1.7014528512954712, + "learning_rate": 9.279745544098602e-05, + "loss": 1.1137, + "step": 15002 + }, + { + "epoch": 0.5372893799129765, + "grad_norm": 1.7869797945022583, + "learning_rate": 9.278588660154298e-05, + "loss": 1.2637, + "step": 15003 + }, + { + "epoch": 0.5373251920425448, + "grad_norm": 1.478533148765564, + "learning_rate": 9.277431785915647e-05, + "loss": 0.9787, + "step": 15004 + }, + { + "epoch": 0.5373610041721131, + "grad_norm": 1.46896493434906, + "learning_rate": 9.276274921398225e-05, + "loss": 1.1904, + "step": 15005 + }, + { + "epoch": 0.5373968163016813, + "grad_norm": 1.4400699138641357, + "learning_rate": 9.275118066617585e-05, + "loss": 1.1748, + "step": 15006 + }, + { + "epoch": 0.5374326284312496, + "grad_norm": 1.4918845891952515, + "learning_rate": 9.273961221589303e-05, + "loss": 0.9812, + "step": 15007 + }, + { + "epoch": 0.537468440560818, + "grad_norm": 1.705938458442688, + "learning_rate": 9.27280438632893e-05, + "loss": 1.1427, + "step": 15008 + }, + { + "epoch": 0.5375042526903863, + "grad_norm": 1.6533147096633911, + "learning_rate": 9.271647560852042e-05, + "loss": 1.2322, + "step": 15009 + }, + { + "epoch": 0.5375400648199545, + "grad_norm": 1.4283419847488403, + "learning_rate": 9.27049074517419e-05, + "loss": 1.2704, + "step": 15010 + }, + { + "epoch": 0.5375758769495228, + "grad_norm": 2.2398550510406494, + "learning_rate": 9.26933393931095e-05, + "loss": 1.1809, + "step": 15011 + }, + { + "epoch": 0.5376116890790911, + "grad_norm": 1.7707643508911133, + "learning_rate": 9.268177143277877e-05, + "loss": 1.2569, + "step": 15012 + }, + { + "epoch": 0.5376475012086593, + "grad_norm": 1.564106822013855, + "learning_rate": 9.267020357090535e-05, + "loss": 1.0697, + "step": 15013 + }, + { + "epoch": 0.5376833133382276, + "grad_norm": 1.3939862251281738, + "learning_rate": 9.265863580764492e-05, + "loss": 1.1125, + "step": 15014 + }, + { + "epoch": 0.537719125467796, + "grad_norm": 1.8308329582214355, + "learning_rate": 9.264706814315302e-05, + "loss": 1.2036, + "step": 15015 + }, + { + "epoch": 0.5377549375973643, + "grad_norm": 1.2840163707733154, + "learning_rate": 9.263550057758539e-05, + "loss": 1.1657, + "step": 15016 + }, + { + "epoch": 0.5377907497269325, + "grad_norm": 1.4571125507354736, + "learning_rate": 9.262393311109754e-05, + "loss": 0.9175, + "step": 15017 + }, + { + "epoch": 0.5378265618565008, + "grad_norm": 1.6374567747116089, + "learning_rate": 9.261236574384523e-05, + "loss": 1.1766, + "step": 15018 + }, + { + "epoch": 0.5378623739860691, + "grad_norm": 1.583154559135437, + "learning_rate": 9.260079847598393e-05, + "loss": 1.0432, + "step": 15019 + }, + { + "epoch": 0.5378981861156373, + "grad_norm": 1.3947138786315918, + "learning_rate": 9.258923130766942e-05, + "loss": 1.1168, + "step": 15020 + }, + { + "epoch": 0.5379339982452056, + "grad_norm": 1.3604718446731567, + "learning_rate": 9.257766423905722e-05, + "loss": 1.2938, + "step": 15021 + }, + { + "epoch": 0.537969810374774, + "grad_norm": 1.473645567893982, + "learning_rate": 9.256609727030294e-05, + "loss": 1.23, + "step": 15022 + }, + { + "epoch": 0.5380056225043423, + "grad_norm": 1.704370141029358, + "learning_rate": 9.255453040156228e-05, + "loss": 1.1035, + "step": 15023 + }, + { + "epoch": 0.5380414346339105, + "grad_norm": 1.3757199048995972, + "learning_rate": 9.254296363299077e-05, + "loss": 1.1035, + "step": 15024 + }, + { + "epoch": 0.5380772467634788, + "grad_norm": 1.8258706331253052, + "learning_rate": 9.253139696474409e-05, + "loss": 1.1237, + "step": 15025 + }, + { + "epoch": 0.5381130588930471, + "grad_norm": 1.869985580444336, + "learning_rate": 9.25198303969778e-05, + "loss": 1.2156, + "step": 15026 + }, + { + "epoch": 0.5381488710226153, + "grad_norm": 2.017920732498169, + "learning_rate": 9.250826392984757e-05, + "loss": 1.0188, + "step": 15027 + }, + { + "epoch": 0.5381846831521836, + "grad_norm": 1.6475530862808228, + "learning_rate": 9.249669756350894e-05, + "loss": 1.2074, + "step": 15028 + }, + { + "epoch": 0.538220495281752, + "grad_norm": 1.2508081197738647, + "learning_rate": 9.248513129811765e-05, + "loss": 0.8955, + "step": 15029 + }, + { + "epoch": 0.5382563074113202, + "grad_norm": 1.5859547853469849, + "learning_rate": 9.247356513382917e-05, + "loss": 1.2064, + "step": 15030 + }, + { + "epoch": 0.5382921195408885, + "grad_norm": 1.617334246635437, + "learning_rate": 9.246199907079916e-05, + "loss": 1.0458, + "step": 15031 + }, + { + "epoch": 0.5383279316704568, + "grad_norm": 1.4351935386657715, + "learning_rate": 9.245043310918325e-05, + "loss": 1.1962, + "step": 15032 + }, + { + "epoch": 0.538363743800025, + "grad_norm": 2.1442339420318604, + "learning_rate": 9.2438867249137e-05, + "loss": 1.309, + "step": 15033 + }, + { + "epoch": 0.5383995559295933, + "grad_norm": 1.300506591796875, + "learning_rate": 9.242730149081606e-05, + "loss": 1.2178, + "step": 15034 + }, + { + "epoch": 0.5384353680591616, + "grad_norm": 1.5709128379821777, + "learning_rate": 9.241573583437599e-05, + "loss": 1.2025, + "step": 15035 + }, + { + "epoch": 0.53847118018873, + "grad_norm": 1.4142948389053345, + "learning_rate": 9.240417027997243e-05, + "loss": 1.0814, + "step": 15036 + }, + { + "epoch": 0.5385069923182982, + "grad_norm": 1.7247450351715088, + "learning_rate": 9.239260482776096e-05, + "loss": 1.1924, + "step": 15037 + }, + { + "epoch": 0.5385428044478665, + "grad_norm": 1.492917776107788, + "learning_rate": 9.238103947789718e-05, + "loss": 0.9711, + "step": 15038 + }, + { + "epoch": 0.5385786165774348, + "grad_norm": 1.9726094007492065, + "learning_rate": 9.236947423053669e-05, + "loss": 1.1764, + "step": 15039 + }, + { + "epoch": 0.538614428707003, + "grad_norm": 1.3594475984573364, + "learning_rate": 9.235790908583506e-05, + "loss": 1.0728, + "step": 15040 + }, + { + "epoch": 0.5386502408365713, + "grad_norm": 1.9288020133972168, + "learning_rate": 9.234634404394793e-05, + "loss": 1.1489, + "step": 15041 + }, + { + "epoch": 0.5386860529661396, + "grad_norm": 1.9058680534362793, + "learning_rate": 9.233477910503083e-05, + "loss": 1.1443, + "step": 15042 + }, + { + "epoch": 0.538721865095708, + "grad_norm": 1.8064649105072021, + "learning_rate": 9.232321426923943e-05, + "loss": 0.9991, + "step": 15043 + }, + { + "epoch": 0.5387576772252762, + "grad_norm": 1.5829215049743652, + "learning_rate": 9.231164953672926e-05, + "loss": 1.1473, + "step": 15044 + }, + { + "epoch": 0.5387934893548445, + "grad_norm": 1.7047607898712158, + "learning_rate": 9.230008490765593e-05, + "loss": 0.9475, + "step": 15045 + }, + { + "epoch": 0.5388293014844128, + "grad_norm": 1.2142118215560913, + "learning_rate": 9.228852038217502e-05, + "loss": 1.0118, + "step": 15046 + }, + { + "epoch": 0.538865113613981, + "grad_norm": 1.862554907798767, + "learning_rate": 9.227695596044215e-05, + "loss": 1.0585, + "step": 15047 + }, + { + "epoch": 0.5389009257435493, + "grad_norm": 1.4875619411468506, + "learning_rate": 9.226539164261286e-05, + "loss": 1.0224, + "step": 15048 + }, + { + "epoch": 0.5389367378731176, + "grad_norm": 1.9457651376724243, + "learning_rate": 9.225382742884273e-05, + "loss": 1.1259, + "step": 15049 + }, + { + "epoch": 0.538972550002686, + "grad_norm": 1.5172728300094604, + "learning_rate": 9.224226331928738e-05, + "loss": 1.1371, + "step": 15050 + }, + { + "epoch": 0.5390083621322542, + "grad_norm": 1.5400055646896362, + "learning_rate": 9.223069931410236e-05, + "loss": 1.0527, + "step": 15051 + }, + { + "epoch": 0.5390441742618225, + "grad_norm": 1.790693998336792, + "learning_rate": 9.221913541344327e-05, + "loss": 1.1997, + "step": 15052 + }, + { + "epoch": 0.5390799863913908, + "grad_norm": 1.5400407314300537, + "learning_rate": 9.220757161746566e-05, + "loss": 1.2061, + "step": 15053 + }, + { + "epoch": 0.539115798520959, + "grad_norm": 1.727962851524353, + "learning_rate": 9.219600792632513e-05, + "loss": 1.0975, + "step": 15054 + }, + { + "epoch": 0.5391516106505273, + "grad_norm": 1.5411056280136108, + "learning_rate": 9.218444434017724e-05, + "loss": 1.1993, + "step": 15055 + }, + { + "epoch": 0.5391874227800956, + "grad_norm": 1.4991525411605835, + "learning_rate": 9.217288085917759e-05, + "loss": 1.0041, + "step": 15056 + }, + { + "epoch": 0.539223234909664, + "grad_norm": 1.5071039199829102, + "learning_rate": 9.216131748348174e-05, + "loss": 1.1439, + "step": 15057 + }, + { + "epoch": 0.5392590470392322, + "grad_norm": 1.384921669960022, + "learning_rate": 9.21497542132452e-05, + "loss": 1.1189, + "step": 15058 + }, + { + "epoch": 0.5392948591688005, + "grad_norm": 1.6304348707199097, + "learning_rate": 9.213819104862365e-05, + "loss": 1.0662, + "step": 15059 + }, + { + "epoch": 0.5393306712983688, + "grad_norm": 1.2577582597732544, + "learning_rate": 9.212662798977256e-05, + "loss": 1.0626, + "step": 15060 + }, + { + "epoch": 0.539366483427937, + "grad_norm": 1.3741525411605835, + "learning_rate": 9.211506503684755e-05, + "loss": 0.9541, + "step": 15061 + }, + { + "epoch": 0.5394022955575053, + "grad_norm": 1.343461275100708, + "learning_rate": 9.210350219000416e-05, + "loss": 1.2467, + "step": 15062 + }, + { + "epoch": 0.5394381076870736, + "grad_norm": 1.5745233297348022, + "learning_rate": 9.209193944939798e-05, + "loss": 1.1684, + "step": 15063 + }, + { + "epoch": 0.539473919816642, + "grad_norm": 1.803763747215271, + "learning_rate": 9.208037681518454e-05, + "loss": 1.3145, + "step": 15064 + }, + { + "epoch": 0.5395097319462102, + "grad_norm": 1.4589296579360962, + "learning_rate": 9.206881428751941e-05, + "loss": 0.9406, + "step": 15065 + }, + { + "epoch": 0.5395455440757785, + "grad_norm": 1.7922730445861816, + "learning_rate": 9.205725186655817e-05, + "loss": 1.1863, + "step": 15066 + }, + { + "epoch": 0.5395813562053468, + "grad_norm": 1.2025411128997803, + "learning_rate": 9.204568955245634e-05, + "loss": 1.0817, + "step": 15067 + }, + { + "epoch": 0.539617168334915, + "grad_norm": 1.2678841352462769, + "learning_rate": 9.203412734536951e-05, + "loss": 1.0487, + "step": 15068 + }, + { + "epoch": 0.5396529804644833, + "grad_norm": 1.767777681350708, + "learning_rate": 9.202256524545322e-05, + "loss": 1.1399, + "step": 15069 + }, + { + "epoch": 0.5396887925940516, + "grad_norm": 1.3882607221603394, + "learning_rate": 9.201100325286302e-05, + "loss": 1.1268, + "step": 15070 + }, + { + "epoch": 0.5397246047236199, + "grad_norm": 1.9037710428237915, + "learning_rate": 9.199944136775446e-05, + "loss": 1.1374, + "step": 15071 + }, + { + "epoch": 0.5397604168531882, + "grad_norm": 1.7006701231002808, + "learning_rate": 9.198787959028312e-05, + "loss": 1.1647, + "step": 15072 + }, + { + "epoch": 0.5397962289827565, + "grad_norm": 1.3953346014022827, + "learning_rate": 9.197631792060453e-05, + "loss": 1.0484, + "step": 15073 + }, + { + "epoch": 0.5398320411123247, + "grad_norm": 1.4073867797851562, + "learning_rate": 9.196475635887419e-05, + "loss": 1.2346, + "step": 15074 + }, + { + "epoch": 0.539867853241893, + "grad_norm": 1.6582127809524536, + "learning_rate": 9.195319490524772e-05, + "loss": 1.067, + "step": 15075 + }, + { + "epoch": 0.5399036653714613, + "grad_norm": 1.3512868881225586, + "learning_rate": 9.194163355988062e-05, + "loss": 0.9712, + "step": 15076 + }, + { + "epoch": 0.5399394775010296, + "grad_norm": 1.9926413297653198, + "learning_rate": 9.193007232292846e-05, + "loss": 1.2098, + "step": 15077 + }, + { + "epoch": 0.5399752896305979, + "grad_norm": 1.352094054222107, + "learning_rate": 9.191851119454675e-05, + "loss": 0.9079, + "step": 15078 + }, + { + "epoch": 0.5400111017601662, + "grad_norm": 1.2376618385314941, + "learning_rate": 9.190695017489106e-05, + "loss": 1.0146, + "step": 15079 + }, + { + "epoch": 0.5400469138897345, + "grad_norm": 2.07698130607605, + "learning_rate": 9.18953892641169e-05, + "loss": 1.1542, + "step": 15080 + }, + { + "epoch": 0.5400827260193027, + "grad_norm": 1.785855770111084, + "learning_rate": 9.188382846237984e-05, + "loss": 1.1293, + "step": 15081 + }, + { + "epoch": 0.540118538148871, + "grad_norm": 1.9750093221664429, + "learning_rate": 9.187226776983543e-05, + "loss": 1.3285, + "step": 15082 + }, + { + "epoch": 0.5401543502784393, + "grad_norm": 1.806984543800354, + "learning_rate": 9.18607071866391e-05, + "loss": 1.3359, + "step": 15083 + }, + { + "epoch": 0.5401901624080075, + "grad_norm": 1.4690648317337036, + "learning_rate": 9.184914671294653e-05, + "loss": 1.0464, + "step": 15084 + }, + { + "epoch": 0.5402259745375759, + "grad_norm": 1.278173565864563, + "learning_rate": 9.18375863489131e-05, + "loss": 1.0075, + "step": 15085 + }, + { + "epoch": 0.5402617866671442, + "grad_norm": 1.5875636339187622, + "learning_rate": 9.182602609469448e-05, + "loss": 1.1485, + "step": 15086 + }, + { + "epoch": 0.5402975987967125, + "grad_norm": 2.1739277839660645, + "learning_rate": 9.18144659504461e-05, + "loss": 0.9982, + "step": 15087 + }, + { + "epoch": 0.5403334109262807, + "grad_norm": 1.587805986404419, + "learning_rate": 9.180290591632354e-05, + "loss": 1.1082, + "step": 15088 + }, + { + "epoch": 0.540369223055849, + "grad_norm": 1.6231727600097656, + "learning_rate": 9.179134599248228e-05, + "loss": 1.054, + "step": 15089 + }, + { + "epoch": 0.5404050351854173, + "grad_norm": 1.4914660453796387, + "learning_rate": 9.177978617907791e-05, + "loss": 1.1, + "step": 15090 + }, + { + "epoch": 0.5404408473149855, + "grad_norm": 1.288152813911438, + "learning_rate": 9.176822647626593e-05, + "loss": 1.2081, + "step": 15091 + }, + { + "epoch": 0.5404766594445539, + "grad_norm": 1.6197067499160767, + "learning_rate": 9.175666688420177e-05, + "loss": 1.1532, + "step": 15092 + }, + { + "epoch": 0.5405124715741222, + "grad_norm": 1.74802565574646, + "learning_rate": 9.17451074030411e-05, + "loss": 1.2091, + "step": 15093 + }, + { + "epoch": 0.5405482837036905, + "grad_norm": 1.5750722885131836, + "learning_rate": 9.17335480329393e-05, + "loss": 1.1827, + "step": 15094 + }, + { + "epoch": 0.5405840958332587, + "grad_norm": 1.775653600692749, + "learning_rate": 9.1721988774052e-05, + "loss": 0.9866, + "step": 15095 + }, + { + "epoch": 0.540619907962827, + "grad_norm": 1.3408294916152954, + "learning_rate": 9.17104296265346e-05, + "loss": 0.9777, + "step": 15096 + }, + { + "epoch": 0.5406557200923953, + "grad_norm": 1.262434482574463, + "learning_rate": 9.169887059054275e-05, + "loss": 1.0465, + "step": 15097 + }, + { + "epoch": 0.5406915322219635, + "grad_norm": 2.136880874633789, + "learning_rate": 9.168731166623182e-05, + "loss": 1.0906, + "step": 15098 + }, + { + "epoch": 0.5407273443515319, + "grad_norm": 2.087338447570801, + "learning_rate": 9.167575285375744e-05, + "loss": 1.1918, + "step": 15099 + }, + { + "epoch": 0.5407631564811002, + "grad_norm": 1.613421082496643, + "learning_rate": 9.166419415327508e-05, + "loss": 1.1916, + "step": 15100 + }, + { + "epoch": 0.5407989686106685, + "grad_norm": 1.7117291688919067, + "learning_rate": 9.165263556494016e-05, + "loss": 0.8886, + "step": 15101 + }, + { + "epoch": 0.5408347807402367, + "grad_norm": 1.2582361698150635, + "learning_rate": 9.164107708890835e-05, + "loss": 0.9322, + "step": 15102 + }, + { + "epoch": 0.540870592869805, + "grad_norm": 1.25182044506073, + "learning_rate": 9.162951872533498e-05, + "loss": 1.0402, + "step": 15103 + }, + { + "epoch": 0.5409064049993733, + "grad_norm": 1.6566601991653442, + "learning_rate": 9.161796047437572e-05, + "loss": 1.0924, + "step": 15104 + }, + { + "epoch": 0.5409422171289415, + "grad_norm": 1.3889508247375488, + "learning_rate": 9.160640233618591e-05, + "loss": 1.0078, + "step": 15105 + }, + { + "epoch": 0.5409780292585099, + "grad_norm": 1.8319240808486938, + "learning_rate": 9.15948443109212e-05, + "loss": 1.1314, + "step": 15106 + }, + { + "epoch": 0.5410138413880782, + "grad_norm": 1.5217591524124146, + "learning_rate": 9.158328639873695e-05, + "loss": 1.0912, + "step": 15107 + }, + { + "epoch": 0.5410496535176464, + "grad_norm": 1.341089129447937, + "learning_rate": 9.15717285997888e-05, + "loss": 1.0111, + "step": 15108 + }, + { + "epoch": 0.5410854656472147, + "grad_norm": 1.6160236597061157, + "learning_rate": 9.156017091423215e-05, + "loss": 1.1394, + "step": 15109 + }, + { + "epoch": 0.541121277776783, + "grad_norm": 1.2038792371749878, + "learning_rate": 9.154861334222248e-05, + "loss": 1.0958, + "step": 15110 + }, + { + "epoch": 0.5411570899063513, + "grad_norm": 1.290052056312561, + "learning_rate": 9.153705588391535e-05, + "loss": 0.8476, + "step": 15111 + }, + { + "epoch": 0.5411929020359195, + "grad_norm": 1.3892418146133423, + "learning_rate": 9.152549853946615e-05, + "loss": 0.9001, + "step": 15112 + }, + { + "epoch": 0.5412287141654879, + "grad_norm": 1.8173471689224243, + "learning_rate": 9.151394130903052e-05, + "loss": 1.2005, + "step": 15113 + }, + { + "epoch": 0.5412645262950562, + "grad_norm": 1.3971915245056152, + "learning_rate": 9.15023841927638e-05, + "loss": 1.3356, + "step": 15114 + }, + { + "epoch": 0.5413003384246244, + "grad_norm": 1.342857003211975, + "learning_rate": 9.14908271908216e-05, + "loss": 1.0617, + "step": 15115 + }, + { + "epoch": 0.5413361505541927, + "grad_norm": 1.8844618797302246, + "learning_rate": 9.147927030335928e-05, + "loss": 1.1957, + "step": 15116 + }, + { + "epoch": 0.541371962683761, + "grad_norm": 1.8254460096359253, + "learning_rate": 9.146771353053245e-05, + "loss": 0.9561, + "step": 15117 + }, + { + "epoch": 0.5414077748133292, + "grad_norm": 1.4102578163146973, + "learning_rate": 9.14561568724965e-05, + "loss": 0.937, + "step": 15118 + }, + { + "epoch": 0.5414435869428975, + "grad_norm": 1.6543982028961182, + "learning_rate": 9.144460032940693e-05, + "loss": 1.2288, + "step": 15119 + }, + { + "epoch": 0.5414793990724659, + "grad_norm": 1.2333974838256836, + "learning_rate": 9.143304390141925e-05, + "loss": 0.9055, + "step": 15120 + }, + { + "epoch": 0.5415152112020342, + "grad_norm": 1.661206841468811, + "learning_rate": 9.142148758868887e-05, + "loss": 1.1415, + "step": 15121 + }, + { + "epoch": 0.5415510233316024, + "grad_norm": 2.247873306274414, + "learning_rate": 9.140993139137135e-05, + "loss": 1.1678, + "step": 15122 + }, + { + "epoch": 0.5415868354611707, + "grad_norm": 1.6822291612625122, + "learning_rate": 9.139837530962209e-05, + "loss": 1.1744, + "step": 15123 + }, + { + "epoch": 0.541622647590739, + "grad_norm": 1.6315338611602783, + "learning_rate": 9.138681934359663e-05, + "loss": 1.1377, + "step": 15124 + }, + { + "epoch": 0.5416584597203072, + "grad_norm": 2.050593376159668, + "learning_rate": 9.137526349345036e-05, + "loss": 1.084, + "step": 15125 + }, + { + "epoch": 0.5416942718498755, + "grad_norm": 1.3584970235824585, + "learning_rate": 9.136370775933885e-05, + "loss": 1.0536, + "step": 15126 + }, + { + "epoch": 0.5417300839794439, + "grad_norm": 1.5057570934295654, + "learning_rate": 9.135215214141751e-05, + "loss": 1.0919, + "step": 15127 + }, + { + "epoch": 0.5417658961090122, + "grad_norm": 1.5349520444869995, + "learning_rate": 9.134059663984176e-05, + "loss": 1.3703, + "step": 15128 + }, + { + "epoch": 0.5418017082385804, + "grad_norm": 1.3233503103256226, + "learning_rate": 9.132904125476715e-05, + "loss": 0.9743, + "step": 15129 + }, + { + "epoch": 0.5418375203681487, + "grad_norm": 2.702543020248413, + "learning_rate": 9.131748598634907e-05, + "loss": 1.5218, + "step": 15130 + }, + { + "epoch": 0.541873332497717, + "grad_norm": 1.2615472078323364, + "learning_rate": 9.130593083474305e-05, + "loss": 0.921, + "step": 15131 + }, + { + "epoch": 0.5419091446272852, + "grad_norm": 1.4618586301803589, + "learning_rate": 9.129437580010449e-05, + "loss": 1.0963, + "step": 15132 + }, + { + "epoch": 0.5419449567568535, + "grad_norm": 2.61871600151062, + "learning_rate": 9.12828208825889e-05, + "loss": 1.0848, + "step": 15133 + }, + { + "epoch": 0.5419807688864219, + "grad_norm": 1.4406428337097168, + "learning_rate": 9.12712660823517e-05, + "loss": 1.2053, + "step": 15134 + }, + { + "epoch": 0.5420165810159902, + "grad_norm": 1.7839126586914062, + "learning_rate": 9.125971139954835e-05, + "loss": 1.1716, + "step": 15135 + }, + { + "epoch": 0.5420523931455584, + "grad_norm": 1.4385921955108643, + "learning_rate": 9.124815683433432e-05, + "loss": 0.9781, + "step": 15136 + }, + { + "epoch": 0.5420882052751267, + "grad_norm": 1.575852394104004, + "learning_rate": 9.123660238686503e-05, + "loss": 1.1702, + "step": 15137 + }, + { + "epoch": 0.542124017404695, + "grad_norm": 1.946977972984314, + "learning_rate": 9.122504805729598e-05, + "loss": 1.2571, + "step": 15138 + }, + { + "epoch": 0.5421598295342632, + "grad_norm": 1.4239073991775513, + "learning_rate": 9.121349384578255e-05, + "loss": 0.9894, + "step": 15139 + }, + { + "epoch": 0.5421956416638315, + "grad_norm": 1.534387469291687, + "learning_rate": 9.120193975248027e-05, + "loss": 1.2041, + "step": 15140 + }, + { + "epoch": 0.5422314537933999, + "grad_norm": 1.463379979133606, + "learning_rate": 9.119038577754451e-05, + "loss": 1.0054, + "step": 15141 + }, + { + "epoch": 0.5422672659229681, + "grad_norm": 1.3064643144607544, + "learning_rate": 9.117883192113077e-05, + "loss": 1.0638, + "step": 15142 + }, + { + "epoch": 0.5423030780525364, + "grad_norm": 1.208159327507019, + "learning_rate": 9.116727818339444e-05, + "loss": 1.2078, + "step": 15143 + }, + { + "epoch": 0.5423388901821047, + "grad_norm": 1.9189870357513428, + "learning_rate": 9.115572456449102e-05, + "loss": 1.2537, + "step": 15144 + }, + { + "epoch": 0.542374702311673, + "grad_norm": 1.5362211465835571, + "learning_rate": 9.114417106457591e-05, + "loss": 1.3096, + "step": 15145 + }, + { + "epoch": 0.5424105144412412, + "grad_norm": 1.466835856437683, + "learning_rate": 9.113261768380454e-05, + "loss": 1.203, + "step": 15146 + }, + { + "epoch": 0.5424463265708095, + "grad_norm": 1.726320743560791, + "learning_rate": 9.112106442233237e-05, + "loss": 1.134, + "step": 15147 + }, + { + "epoch": 0.5424821387003779, + "grad_norm": 1.403844952583313, + "learning_rate": 9.110951128031482e-05, + "loss": 0.9606, + "step": 15148 + }, + { + "epoch": 0.5425179508299461, + "grad_norm": 1.8000081777572632, + "learning_rate": 9.109795825790735e-05, + "loss": 1.0833, + "step": 15149 + }, + { + "epoch": 0.5425537629595144, + "grad_norm": 1.4693669080734253, + "learning_rate": 9.108640535526533e-05, + "loss": 1.1855, + "step": 15150 + }, + { + "epoch": 0.5425895750890827, + "grad_norm": 1.7062394618988037, + "learning_rate": 9.107485257254426e-05, + "loss": 1.2149, + "step": 15151 + }, + { + "epoch": 0.542625387218651, + "grad_norm": 1.522679328918457, + "learning_rate": 9.106329990989952e-05, + "loss": 0.9534, + "step": 15152 + }, + { + "epoch": 0.5426611993482192, + "grad_norm": 1.9270873069763184, + "learning_rate": 9.105174736748656e-05, + "loss": 1.1793, + "step": 15153 + }, + { + "epoch": 0.5426970114777875, + "grad_norm": 1.3537062406539917, + "learning_rate": 9.104019494546081e-05, + "loss": 1.041, + "step": 15154 + }, + { + "epoch": 0.5427328236073559, + "grad_norm": 1.4833788871765137, + "learning_rate": 9.102864264397765e-05, + "loss": 1.1444, + "step": 15155 + }, + { + "epoch": 0.5427686357369241, + "grad_norm": 1.7337102890014648, + "learning_rate": 9.101709046319256e-05, + "loss": 0.9693, + "step": 15156 + }, + { + "epoch": 0.5428044478664924, + "grad_norm": 1.4787192344665527, + "learning_rate": 9.10055384032609e-05, + "loss": 1.0165, + "step": 15157 + }, + { + "epoch": 0.5428402599960607, + "grad_norm": 1.7944719791412354, + "learning_rate": 9.099398646433814e-05, + "loss": 1.2666, + "step": 15158 + }, + { + "epoch": 0.5428760721256289, + "grad_norm": 1.754712700843811, + "learning_rate": 9.098243464657966e-05, + "loss": 1.3395, + "step": 15159 + }, + { + "epoch": 0.5429118842551972, + "grad_norm": 1.7062877416610718, + "learning_rate": 9.097088295014092e-05, + "loss": 1.1031, + "step": 15160 + }, + { + "epoch": 0.5429476963847655, + "grad_norm": 2.105759382247925, + "learning_rate": 9.095933137517727e-05, + "loss": 1.3885, + "step": 15161 + }, + { + "epoch": 0.5429835085143339, + "grad_norm": 1.4768275022506714, + "learning_rate": 9.094777992184417e-05, + "loss": 0.9081, + "step": 15162 + }, + { + "epoch": 0.5430193206439021, + "grad_norm": 1.552947998046875, + "learning_rate": 9.093622859029701e-05, + "loss": 1.2973, + "step": 15163 + }, + { + "epoch": 0.5430551327734704, + "grad_norm": 1.9556230306625366, + "learning_rate": 9.09246773806912e-05, + "loss": 1.0094, + "step": 15164 + }, + { + "epoch": 0.5430909449030387, + "grad_norm": 2.1414523124694824, + "learning_rate": 9.091312629318216e-05, + "loss": 1.3033, + "step": 15165 + }, + { + "epoch": 0.5431267570326069, + "grad_norm": 1.2130720615386963, + "learning_rate": 9.090157532792526e-05, + "loss": 1.0516, + "step": 15166 + }, + { + "epoch": 0.5431625691621752, + "grad_norm": 1.337594985961914, + "learning_rate": 9.089002448507596e-05, + "loss": 1.0702, + "step": 15167 + }, + { + "epoch": 0.5431983812917435, + "grad_norm": 1.3781734704971313, + "learning_rate": 9.087847376478961e-05, + "loss": 0.9436, + "step": 15168 + }, + { + "epoch": 0.5432341934213119, + "grad_norm": 1.4240977764129639, + "learning_rate": 9.086692316722166e-05, + "loss": 0.9978, + "step": 15169 + }, + { + "epoch": 0.5432700055508801, + "grad_norm": 1.448537826538086, + "learning_rate": 9.085537269252747e-05, + "loss": 1.0591, + "step": 15170 + }, + { + "epoch": 0.5433058176804484, + "grad_norm": 1.5800318717956543, + "learning_rate": 9.08438223408624e-05, + "loss": 1.0046, + "step": 15171 + }, + { + "epoch": 0.5433416298100167, + "grad_norm": 1.424264907836914, + "learning_rate": 9.083227211238192e-05, + "loss": 1.1272, + "step": 15172 + }, + { + "epoch": 0.5433774419395849, + "grad_norm": 4.3473076820373535, + "learning_rate": 9.082072200724139e-05, + "loss": 1.1713, + "step": 15173 + }, + { + "epoch": 0.5434132540691532, + "grad_norm": 1.3333090543746948, + "learning_rate": 9.08091720255962e-05, + "loss": 0.9372, + "step": 15174 + }, + { + "epoch": 0.5434490661987215, + "grad_norm": 1.7895138263702393, + "learning_rate": 9.079762216760174e-05, + "loss": 1.1613, + "step": 15175 + }, + { + "epoch": 0.5434848783282898, + "grad_norm": 2.0722320079803467, + "learning_rate": 9.078607243341344e-05, + "loss": 1.3818, + "step": 15176 + }, + { + "epoch": 0.5435206904578581, + "grad_norm": 1.2000904083251953, + "learning_rate": 9.077452282318661e-05, + "loss": 1.0663, + "step": 15177 + }, + { + "epoch": 0.5435565025874264, + "grad_norm": 1.4981296062469482, + "learning_rate": 9.076297333707669e-05, + "loss": 1.0227, + "step": 15178 + }, + { + "epoch": 0.5435923147169947, + "grad_norm": 1.676630973815918, + "learning_rate": 9.07514239752391e-05, + "loss": 1.0463, + "step": 15179 + }, + { + "epoch": 0.5436281268465629, + "grad_norm": 1.3562391996383667, + "learning_rate": 9.073987473782907e-05, + "loss": 1.2868, + "step": 15180 + }, + { + "epoch": 0.5436639389761312, + "grad_norm": 1.4200209379196167, + "learning_rate": 9.072832562500217e-05, + "loss": 1.109, + "step": 15181 + }, + { + "epoch": 0.5436997511056995, + "grad_norm": 1.3952436447143555, + "learning_rate": 9.071677663691361e-05, + "loss": 1.0441, + "step": 15182 + }, + { + "epoch": 0.5437355632352678, + "grad_norm": 1.5800141096115112, + "learning_rate": 9.070522777371892e-05, + "loss": 1.4497, + "step": 15183 + }, + { + "epoch": 0.5437713753648361, + "grad_norm": 1.4353243112564087, + "learning_rate": 9.069367903557333e-05, + "loss": 1.2116, + "step": 15184 + }, + { + "epoch": 0.5438071874944044, + "grad_norm": 1.3805155754089355, + "learning_rate": 9.068213042263234e-05, + "loss": 1.0363, + "step": 15185 + }, + { + "epoch": 0.5438429996239726, + "grad_norm": 1.5168741941452026, + "learning_rate": 9.067058193505124e-05, + "loss": 1.1755, + "step": 15186 + }, + { + "epoch": 0.5438788117535409, + "grad_norm": 1.5223033428192139, + "learning_rate": 9.065903357298544e-05, + "loss": 1.1685, + "step": 15187 + }, + { + "epoch": 0.5439146238831092, + "grad_norm": 1.4328505992889404, + "learning_rate": 9.064748533659031e-05, + "loss": 1.1811, + "step": 15188 + }, + { + "epoch": 0.5439504360126775, + "grad_norm": 1.5995750427246094, + "learning_rate": 9.063593722602115e-05, + "loss": 0.9996, + "step": 15189 + }, + { + "epoch": 0.5439862481422458, + "grad_norm": 1.936944842338562, + "learning_rate": 9.062438924143344e-05, + "loss": 1.1242, + "step": 15190 + }, + { + "epoch": 0.5440220602718141, + "grad_norm": 1.810138463973999, + "learning_rate": 9.06128413829824e-05, + "loss": 1.1608, + "step": 15191 + }, + { + "epoch": 0.5440578724013824, + "grad_norm": 1.9351588487625122, + "learning_rate": 9.060129365082354e-05, + "loss": 1.1859, + "step": 15192 + }, + { + "epoch": 0.5440936845309506, + "grad_norm": 1.714782953262329, + "learning_rate": 9.05897460451121e-05, + "loss": 1.2252, + "step": 15193 + }, + { + "epoch": 0.5441294966605189, + "grad_norm": 2.427274465560913, + "learning_rate": 9.057819856600355e-05, + "loss": 1.1468, + "step": 15194 + }, + { + "epoch": 0.5441653087900872, + "grad_norm": 3.03759765625, + "learning_rate": 9.056665121365311e-05, + "loss": 1.1966, + "step": 15195 + }, + { + "epoch": 0.5442011209196554, + "grad_norm": 1.5917128324508667, + "learning_rate": 9.055510398821627e-05, + "loss": 0.9558, + "step": 15196 + }, + { + "epoch": 0.5442369330492238, + "grad_norm": 1.322973370552063, + "learning_rate": 9.054355688984833e-05, + "loss": 1.2056, + "step": 15197 + }, + { + "epoch": 0.5442727451787921, + "grad_norm": 1.5414159297943115, + "learning_rate": 9.053200991870456e-05, + "loss": 0.9423, + "step": 15198 + }, + { + "epoch": 0.5443085573083604, + "grad_norm": 2.5350563526153564, + "learning_rate": 9.052046307494046e-05, + "loss": 1.2676, + "step": 15199 + }, + { + "epoch": 0.5443443694379286, + "grad_norm": 1.5244109630584717, + "learning_rate": 9.050891635871124e-05, + "loss": 0.9464, + "step": 15200 + }, + { + "epoch": 0.5443801815674969, + "grad_norm": 1.75009024143219, + "learning_rate": 9.049736977017236e-05, + "loss": 1.1918, + "step": 15201 + }, + { + "epoch": 0.5444159936970652, + "grad_norm": 1.6250227689743042, + "learning_rate": 9.048582330947906e-05, + "loss": 1.065, + "step": 15202 + }, + { + "epoch": 0.5444518058266334, + "grad_norm": 1.9863377809524536, + "learning_rate": 9.04742769767868e-05, + "loss": 1.3293, + "step": 15203 + }, + { + "epoch": 0.5444876179562018, + "grad_norm": 1.4659712314605713, + "learning_rate": 9.046273077225078e-05, + "loss": 1.1626, + "step": 15204 + }, + { + "epoch": 0.5445234300857701, + "grad_norm": 1.660841941833496, + "learning_rate": 9.045118469602649e-05, + "loss": 1.1327, + "step": 15205 + }, + { + "epoch": 0.5445592422153384, + "grad_norm": 1.7374157905578613, + "learning_rate": 9.043963874826917e-05, + "loss": 1.288, + "step": 15206 + }, + { + "epoch": 0.5445950543449066, + "grad_norm": 2.1799845695495605, + "learning_rate": 9.042809292913415e-05, + "loss": 1.0102, + "step": 15207 + }, + { + "epoch": 0.5446308664744749, + "grad_norm": 1.5899332761764526, + "learning_rate": 9.041654723877683e-05, + "loss": 1.0218, + "step": 15208 + }, + { + "epoch": 0.5446666786040432, + "grad_norm": 1.5276213884353638, + "learning_rate": 9.040500167735247e-05, + "loss": 1.3099, + "step": 15209 + }, + { + "epoch": 0.5447024907336114, + "grad_norm": 1.7700066566467285, + "learning_rate": 9.039345624501646e-05, + "loss": 1.0666, + "step": 15210 + }, + { + "epoch": 0.5447383028631798, + "grad_norm": 1.4939680099487305, + "learning_rate": 9.038191094192407e-05, + "loss": 1.007, + "step": 15211 + }, + { + "epoch": 0.5447741149927481, + "grad_norm": 1.3740752935409546, + "learning_rate": 9.037036576823072e-05, + "loss": 0.772, + "step": 15212 + }, + { + "epoch": 0.5448099271223164, + "grad_norm": 1.332777500152588, + "learning_rate": 9.035882072409161e-05, + "loss": 1.0889, + "step": 15213 + }, + { + "epoch": 0.5448457392518846, + "grad_norm": 1.719440221786499, + "learning_rate": 9.034727580966219e-05, + "loss": 1.3026, + "step": 15214 + }, + { + "epoch": 0.5448815513814529, + "grad_norm": 1.4662666320800781, + "learning_rate": 9.033573102509771e-05, + "loss": 1.0191, + "step": 15215 + }, + { + "epoch": 0.5449173635110212, + "grad_norm": 1.4038735628128052, + "learning_rate": 9.032418637055348e-05, + "loss": 1.1024, + "step": 15216 + }, + { + "epoch": 0.5449531756405894, + "grad_norm": 1.590088129043579, + "learning_rate": 9.031264184618487e-05, + "loss": 1.1509, + "step": 15217 + }, + { + "epoch": 0.5449889877701578, + "grad_norm": 1.8072872161865234, + "learning_rate": 9.030109745214713e-05, + "loss": 1.1633, + "step": 15218 + }, + { + "epoch": 0.5450247998997261, + "grad_norm": 1.7160840034484863, + "learning_rate": 9.028955318859564e-05, + "loss": 1.0058, + "step": 15219 + }, + { + "epoch": 0.5450606120292943, + "grad_norm": 1.3772549629211426, + "learning_rate": 9.027800905568568e-05, + "loss": 1.1732, + "step": 15220 + }, + { + "epoch": 0.5450964241588626, + "grad_norm": 1.7082716226577759, + "learning_rate": 9.026646505357258e-05, + "loss": 1.3006, + "step": 15221 + }, + { + "epoch": 0.5451322362884309, + "grad_norm": 1.76629638671875, + "learning_rate": 9.025492118241161e-05, + "loss": 1.397, + "step": 15222 + }, + { + "epoch": 0.5451680484179992, + "grad_norm": 1.4430381059646606, + "learning_rate": 9.024337744235814e-05, + "loss": 1.1681, + "step": 15223 + }, + { + "epoch": 0.5452038605475674, + "grad_norm": 1.5968126058578491, + "learning_rate": 9.023183383356743e-05, + "loss": 1.2, + "step": 15224 + }, + { + "epoch": 0.5452396726771358, + "grad_norm": 1.4840704202651978, + "learning_rate": 9.022029035619478e-05, + "loss": 1.1044, + "step": 15225 + }, + { + "epoch": 0.5452754848067041, + "grad_norm": 1.4371291399002075, + "learning_rate": 9.020874701039552e-05, + "loss": 1.1731, + "step": 15226 + }, + { + "epoch": 0.5453112969362723, + "grad_norm": 1.811286211013794, + "learning_rate": 9.019720379632493e-05, + "loss": 1.146, + "step": 15227 + }, + { + "epoch": 0.5453471090658406, + "grad_norm": 1.3973227739334106, + "learning_rate": 9.018566071413833e-05, + "loss": 1.1051, + "step": 15228 + }, + { + "epoch": 0.5453829211954089, + "grad_norm": 1.7144774198532104, + "learning_rate": 9.017411776399099e-05, + "loss": 1.2819, + "step": 15229 + }, + { + "epoch": 0.5454187333249771, + "grad_norm": 1.308411955833435, + "learning_rate": 9.016257494603824e-05, + "loss": 0.8347, + "step": 15230 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.7892787456512451, + "learning_rate": 9.015103226043533e-05, + "loss": 1.1888, + "step": 15231 + }, + { + "epoch": 0.5454903575841138, + "grad_norm": 1.328416109085083, + "learning_rate": 9.01394897073376e-05, + "loss": 1.0102, + "step": 15232 + }, + { + "epoch": 0.5455261697136821, + "grad_norm": 1.3169606924057007, + "learning_rate": 9.012794728690032e-05, + "loss": 1.1359, + "step": 15233 + }, + { + "epoch": 0.5455619818432503, + "grad_norm": 1.3899930715560913, + "learning_rate": 9.011640499927875e-05, + "loss": 1.2139, + "step": 15234 + }, + { + "epoch": 0.5455977939728186, + "grad_norm": 1.8527756929397583, + "learning_rate": 9.010486284462823e-05, + "loss": 1.1578, + "step": 15235 + }, + { + "epoch": 0.5456336061023869, + "grad_norm": 1.4828311204910278, + "learning_rate": 9.009332082310398e-05, + "loss": 0.9892, + "step": 15236 + }, + { + "epoch": 0.5456694182319551, + "grad_norm": 1.7948883771896362, + "learning_rate": 9.008177893486136e-05, + "loss": 1.106, + "step": 15237 + }, + { + "epoch": 0.5457052303615234, + "grad_norm": 1.3211824893951416, + "learning_rate": 9.007023718005558e-05, + "loss": 1.1542, + "step": 15238 + }, + { + "epoch": 0.5457410424910918, + "grad_norm": 1.7055917978286743, + "learning_rate": 9.005869555884197e-05, + "loss": 1.1127, + "step": 15239 + }, + { + "epoch": 0.5457768546206601, + "grad_norm": 1.511320948600769, + "learning_rate": 9.004715407137577e-05, + "loss": 1.091, + "step": 15240 + }, + { + "epoch": 0.5458126667502283, + "grad_norm": 1.2864536046981812, + "learning_rate": 9.003561271781229e-05, + "loss": 1.0188, + "step": 15241 + }, + { + "epoch": 0.5458484788797966, + "grad_norm": 1.2797027826309204, + "learning_rate": 9.002407149830679e-05, + "loss": 1.2031, + "step": 15242 + }, + { + "epoch": 0.5458842910093649, + "grad_norm": 1.799820065498352, + "learning_rate": 9.001253041301453e-05, + "loss": 1.2038, + "step": 15243 + }, + { + "epoch": 0.5459201031389331, + "grad_norm": 1.5152087211608887, + "learning_rate": 9.00009894620908e-05, + "loss": 1.142, + "step": 15244 + }, + { + "epoch": 0.5459559152685014, + "grad_norm": 1.5737677812576294, + "learning_rate": 8.998944864569084e-05, + "loss": 1.1505, + "step": 15245 + }, + { + "epoch": 0.5459917273980698, + "grad_norm": 1.8484922647476196, + "learning_rate": 8.997790796396996e-05, + "loss": 1.2786, + "step": 15246 + }, + { + "epoch": 0.546027539527638, + "grad_norm": 1.9564977884292603, + "learning_rate": 8.996636741708337e-05, + "loss": 1.3659, + "step": 15247 + }, + { + "epoch": 0.5460633516572063, + "grad_norm": 1.246519923210144, + "learning_rate": 8.995482700518639e-05, + "loss": 1.1008, + "step": 15248 + }, + { + "epoch": 0.5460991637867746, + "grad_norm": 1.3713644742965698, + "learning_rate": 8.994328672843424e-05, + "loss": 1.233, + "step": 15249 + }, + { + "epoch": 0.5461349759163429, + "grad_norm": 1.5438066720962524, + "learning_rate": 8.993174658698221e-05, + "loss": 1.0771, + "step": 15250 + }, + { + "epoch": 0.5461707880459111, + "grad_norm": 2.0066657066345215, + "learning_rate": 8.992020658098555e-05, + "loss": 1.1264, + "step": 15251 + }, + { + "epoch": 0.5462066001754794, + "grad_norm": 1.6238324642181396, + "learning_rate": 8.990866671059948e-05, + "loss": 1.1541, + "step": 15252 + }, + { + "epoch": 0.5462424123050477, + "grad_norm": 1.6169060468673706, + "learning_rate": 8.98971269759793e-05, + "loss": 1.2808, + "step": 15253 + }, + { + "epoch": 0.546278224434616, + "grad_norm": 1.6490596532821655, + "learning_rate": 8.988558737728023e-05, + "loss": 1.1687, + "step": 15254 + }, + { + "epoch": 0.5463140365641843, + "grad_norm": 1.7203807830810547, + "learning_rate": 8.987404791465757e-05, + "loss": 1.1887, + "step": 15255 + }, + { + "epoch": 0.5463498486937526, + "grad_norm": 1.4527039527893066, + "learning_rate": 8.986250858826649e-05, + "loss": 1.2625, + "step": 15256 + }, + { + "epoch": 0.5463856608233209, + "grad_norm": 1.5245307683944702, + "learning_rate": 8.985096939826231e-05, + "loss": 1.1048, + "step": 15257 + }, + { + "epoch": 0.5464214729528891, + "grad_norm": 2.1829755306243896, + "learning_rate": 8.983943034480022e-05, + "loss": 1.4036, + "step": 15258 + }, + { + "epoch": 0.5464572850824574, + "grad_norm": 1.4148908853530884, + "learning_rate": 8.982789142803552e-05, + "loss": 0.8409, + "step": 15259 + }, + { + "epoch": 0.5464930972120257, + "grad_norm": 1.839560627937317, + "learning_rate": 8.981635264812341e-05, + "loss": 1.1832, + "step": 15260 + }, + { + "epoch": 0.546528909341594, + "grad_norm": 1.7297101020812988, + "learning_rate": 8.980481400521911e-05, + "loss": 1.3081, + "step": 15261 + }, + { + "epoch": 0.5465647214711623, + "grad_norm": 1.514230728149414, + "learning_rate": 8.979327549947794e-05, + "loss": 1.3398, + "step": 15262 + }, + { + "epoch": 0.5466005336007306, + "grad_norm": 1.8228869438171387, + "learning_rate": 8.978173713105503e-05, + "loss": 1.2033, + "step": 15263 + }, + { + "epoch": 0.5466363457302988, + "grad_norm": 1.6967777013778687, + "learning_rate": 8.977019890010571e-05, + "loss": 1.1029, + "step": 15264 + }, + { + "epoch": 0.5466721578598671, + "grad_norm": 1.3788608312606812, + "learning_rate": 8.975866080678512e-05, + "loss": 1.0418, + "step": 15265 + }, + { + "epoch": 0.5467079699894354, + "grad_norm": 1.38814377784729, + "learning_rate": 8.974712285124858e-05, + "loss": 1.0857, + "step": 15266 + }, + { + "epoch": 0.5467437821190037, + "grad_norm": 1.4437484741210938, + "learning_rate": 8.973558503365129e-05, + "loss": 1.1573, + "step": 15267 + }, + { + "epoch": 0.546779594248572, + "grad_norm": 1.753220558166504, + "learning_rate": 8.97240473541484e-05, + "loss": 1.2959, + "step": 15268 + }, + { + "epoch": 0.5468154063781403, + "grad_norm": 1.2198458909988403, + "learning_rate": 8.971250981289525e-05, + "loss": 1.0264, + "step": 15269 + }, + { + "epoch": 0.5468512185077086, + "grad_norm": 1.650749921798706, + "learning_rate": 8.970097241004697e-05, + "loss": 1.2003, + "step": 15270 + }, + { + "epoch": 0.5468870306372768, + "grad_norm": 1.6269488334655762, + "learning_rate": 8.968943514575888e-05, + "loss": 1.2743, + "step": 15271 + }, + { + "epoch": 0.5469228427668451, + "grad_norm": 1.372757911682129, + "learning_rate": 8.967789802018607e-05, + "loss": 1.1126, + "step": 15272 + }, + { + "epoch": 0.5469586548964134, + "grad_norm": 1.72434663772583, + "learning_rate": 8.966636103348388e-05, + "loss": 1.3868, + "step": 15273 + }, + { + "epoch": 0.5469944670259816, + "grad_norm": 1.5935224294662476, + "learning_rate": 8.965482418580746e-05, + "loss": 1.3234, + "step": 15274 + }, + { + "epoch": 0.54703027915555, + "grad_norm": 1.778422236442566, + "learning_rate": 8.964328747731204e-05, + "loss": 1.0525, + "step": 15275 + }, + { + "epoch": 0.5470660912851183, + "grad_norm": 1.6306735277175903, + "learning_rate": 8.963175090815285e-05, + "loss": 1.0522, + "step": 15276 + }, + { + "epoch": 0.5471019034146866, + "grad_norm": 1.4634606838226318, + "learning_rate": 8.9620214478485e-05, + "loss": 0.971, + "step": 15277 + }, + { + "epoch": 0.5471377155442548, + "grad_norm": 1.2894266843795776, + "learning_rate": 8.960867818846386e-05, + "loss": 0.8755, + "step": 15278 + }, + { + "epoch": 0.5471735276738231, + "grad_norm": 1.2218431234359741, + "learning_rate": 8.959714203824449e-05, + "loss": 1.0599, + "step": 15279 + }, + { + "epoch": 0.5472093398033914, + "grad_norm": 2.1513350009918213, + "learning_rate": 8.958560602798221e-05, + "loss": 1.3189, + "step": 15280 + }, + { + "epoch": 0.5472451519329596, + "grad_norm": 1.6150249242782593, + "learning_rate": 8.95740701578321e-05, + "loss": 1.1055, + "step": 15281 + }, + { + "epoch": 0.547280964062528, + "grad_norm": 1.9196244478225708, + "learning_rate": 8.956253442794948e-05, + "loss": 0.9317, + "step": 15282 + }, + { + "epoch": 0.5473167761920963, + "grad_norm": 1.4791114330291748, + "learning_rate": 8.955099883848945e-05, + "loss": 1.065, + "step": 15283 + }, + { + "epoch": 0.5473525883216646, + "grad_norm": 1.4448730945587158, + "learning_rate": 8.953946338960731e-05, + "loss": 0.9604, + "step": 15284 + }, + { + "epoch": 0.5473884004512328, + "grad_norm": 1.863365650177002, + "learning_rate": 8.952792808145819e-05, + "loss": 1.1831, + "step": 15285 + }, + { + "epoch": 0.5474242125808011, + "grad_norm": 1.3513597249984741, + "learning_rate": 8.951639291419723e-05, + "loss": 1.0789, + "step": 15286 + }, + { + "epoch": 0.5474600247103694, + "grad_norm": 1.4529943466186523, + "learning_rate": 8.950485788797976e-05, + "loss": 1.2815, + "step": 15287 + }, + { + "epoch": 0.5474958368399376, + "grad_norm": 1.8641636371612549, + "learning_rate": 8.949332300296082e-05, + "loss": 1.171, + "step": 15288 + }, + { + "epoch": 0.547531648969506, + "grad_norm": 1.7414159774780273, + "learning_rate": 8.948178825929572e-05, + "loss": 0.9788, + "step": 15289 + }, + { + "epoch": 0.5475674610990743, + "grad_norm": 1.7483055591583252, + "learning_rate": 8.947025365713953e-05, + "loss": 1.1671, + "step": 15290 + }, + { + "epoch": 0.5476032732286426, + "grad_norm": 1.2962950468063354, + "learning_rate": 8.945871919664757e-05, + "loss": 0.8636, + "step": 15291 + }, + { + "epoch": 0.5476390853582108, + "grad_norm": 1.346174716949463, + "learning_rate": 8.944718487797487e-05, + "loss": 0.9958, + "step": 15292 + }, + { + "epoch": 0.5476748974877791, + "grad_norm": 1.4316483736038208, + "learning_rate": 8.943565070127676e-05, + "loss": 1.2866, + "step": 15293 + }, + { + "epoch": 0.5477107096173474, + "grad_norm": 1.5267447233200073, + "learning_rate": 8.94241166667083e-05, + "loss": 1.0797, + "step": 15294 + }, + { + "epoch": 0.5477465217469156, + "grad_norm": 1.8406336307525635, + "learning_rate": 8.94125827744247e-05, + "loss": 1.3024, + "step": 15295 + }, + { + "epoch": 0.547782333876484, + "grad_norm": 1.2826316356658936, + "learning_rate": 8.940104902458117e-05, + "loss": 1.1829, + "step": 15296 + }, + { + "epoch": 0.5478181460060523, + "grad_norm": 1.4874602556228638, + "learning_rate": 8.938951541733282e-05, + "loss": 1.0789, + "step": 15297 + }, + { + "epoch": 0.5478539581356205, + "grad_norm": 1.405557632446289, + "learning_rate": 8.93779819528349e-05, + "loss": 1.0322, + "step": 15298 + }, + { + "epoch": 0.5478897702651888, + "grad_norm": 1.7861195802688599, + "learning_rate": 8.936644863124246e-05, + "loss": 1.1249, + "step": 15299 + }, + { + "epoch": 0.5479255823947571, + "grad_norm": 1.9867537021636963, + "learning_rate": 8.935491545271081e-05, + "loss": 1.2207, + "step": 15300 + }, + { + "epoch": 0.5479613945243254, + "grad_norm": 1.4006234407424927, + "learning_rate": 8.934338241739498e-05, + "loss": 0.8373, + "step": 15301 + }, + { + "epoch": 0.5479972066538936, + "grad_norm": 1.8507261276245117, + "learning_rate": 8.933184952545026e-05, + "loss": 1.1782, + "step": 15302 + }, + { + "epoch": 0.548033018783462, + "grad_norm": 1.5676908493041992, + "learning_rate": 8.932031677703172e-05, + "loss": 0.8978, + "step": 15303 + }, + { + "epoch": 0.5480688309130303, + "grad_norm": 1.6775554418563843, + "learning_rate": 8.930878417229453e-05, + "loss": 1.1095, + "step": 15304 + }, + { + "epoch": 0.5481046430425985, + "grad_norm": 1.7866694927215576, + "learning_rate": 8.929725171139387e-05, + "loss": 1.0027, + "step": 15305 + }, + { + "epoch": 0.5481404551721668, + "grad_norm": 2.2479827404022217, + "learning_rate": 8.928571939448486e-05, + "loss": 1.3816, + "step": 15306 + }, + { + "epoch": 0.5481762673017351, + "grad_norm": 1.4532644748687744, + "learning_rate": 8.927418722172269e-05, + "loss": 1.097, + "step": 15307 + }, + { + "epoch": 0.5482120794313033, + "grad_norm": 1.4181870222091675, + "learning_rate": 8.926265519326246e-05, + "loss": 0.9775, + "step": 15308 + }, + { + "epoch": 0.5482478915608716, + "grad_norm": 1.3710474967956543, + "learning_rate": 8.925112330925943e-05, + "loss": 1.041, + "step": 15309 + }, + { + "epoch": 0.54828370369044, + "grad_norm": 1.2639050483703613, + "learning_rate": 8.923959156986859e-05, + "loss": 1.1876, + "step": 15310 + }, + { + "epoch": 0.5483195158200083, + "grad_norm": 1.5749225616455078, + "learning_rate": 8.922805997524524e-05, + "loss": 1.179, + "step": 15311 + }, + { + "epoch": 0.5483553279495765, + "grad_norm": 1.9295090436935425, + "learning_rate": 8.921652852554442e-05, + "loss": 1.4659, + "step": 15312 + }, + { + "epoch": 0.5483911400791448, + "grad_norm": 1.3317288160324097, + "learning_rate": 8.920499722092129e-05, + "loss": 1.0369, + "step": 15313 + }, + { + "epoch": 0.5484269522087131, + "grad_norm": 2.064351797103882, + "learning_rate": 8.9193466061531e-05, + "loss": 0.9136, + "step": 15314 + }, + { + "epoch": 0.5484627643382813, + "grad_norm": 1.2697325944900513, + "learning_rate": 8.918193504752868e-05, + "loss": 1.0698, + "step": 15315 + }, + { + "epoch": 0.5484985764678496, + "grad_norm": 1.7007455825805664, + "learning_rate": 8.917040417906947e-05, + "loss": 1.0822, + "step": 15316 + }, + { + "epoch": 0.548534388597418, + "grad_norm": 1.5716891288757324, + "learning_rate": 8.91588734563085e-05, + "loss": 1.1285, + "step": 15317 + }, + { + "epoch": 0.5485702007269863, + "grad_norm": 1.8892980813980103, + "learning_rate": 8.914734287940092e-05, + "loss": 1.1974, + "step": 15318 + }, + { + "epoch": 0.5486060128565545, + "grad_norm": 1.3923014402389526, + "learning_rate": 8.913581244850182e-05, + "loss": 1.0595, + "step": 15319 + }, + { + "epoch": 0.5486418249861228, + "grad_norm": 1.4837226867675781, + "learning_rate": 8.912428216376637e-05, + "loss": 1.036, + "step": 15320 + }, + { + "epoch": 0.5486776371156911, + "grad_norm": 1.4299285411834717, + "learning_rate": 8.911275202534968e-05, + "loss": 1.0362, + "step": 15321 + }, + { + "epoch": 0.5487134492452593, + "grad_norm": 2.127638816833496, + "learning_rate": 8.910122203340684e-05, + "loss": 1.2934, + "step": 15322 + }, + { + "epoch": 0.5487492613748276, + "grad_norm": 1.591174840927124, + "learning_rate": 8.908969218809302e-05, + "loss": 0.9599, + "step": 15323 + }, + { + "epoch": 0.548785073504396, + "grad_norm": 1.661843180656433, + "learning_rate": 8.907816248956331e-05, + "loss": 1.1354, + "step": 15324 + }, + { + "epoch": 0.5488208856339643, + "grad_norm": 1.5777908563613892, + "learning_rate": 8.906663293797284e-05, + "loss": 1.2819, + "step": 15325 + }, + { + "epoch": 0.5488566977635325, + "grad_norm": 1.484073519706726, + "learning_rate": 8.905510353347671e-05, + "loss": 1.1919, + "step": 15326 + }, + { + "epoch": 0.5488925098931008, + "grad_norm": 1.50313401222229, + "learning_rate": 8.904357427623007e-05, + "loss": 1.2834, + "step": 15327 + }, + { + "epoch": 0.5489283220226691, + "grad_norm": 1.4746475219726562, + "learning_rate": 8.903204516638796e-05, + "loss": 1.1472, + "step": 15328 + }, + { + "epoch": 0.5489641341522373, + "grad_norm": 1.5484555959701538, + "learning_rate": 8.902051620410558e-05, + "loss": 1.1768, + "step": 15329 + }, + { + "epoch": 0.5489999462818056, + "grad_norm": 1.6441407203674316, + "learning_rate": 8.9008987389538e-05, + "loss": 1.2259, + "step": 15330 + }, + { + "epoch": 0.549035758411374, + "grad_norm": 1.4848072528839111, + "learning_rate": 8.899745872284026e-05, + "loss": 0.8368, + "step": 15331 + }, + { + "epoch": 0.5490715705409422, + "grad_norm": 1.1466717720031738, + "learning_rate": 8.898593020416756e-05, + "loss": 1.1335, + "step": 15332 + }, + { + "epoch": 0.5491073826705105, + "grad_norm": 1.685815453529358, + "learning_rate": 8.897440183367496e-05, + "loss": 1.2054, + "step": 15333 + }, + { + "epoch": 0.5491431948000788, + "grad_norm": 1.2962695360183716, + "learning_rate": 8.896287361151757e-05, + "loss": 1.0423, + "step": 15334 + }, + { + "epoch": 0.549179006929647, + "grad_norm": 2.099655866622925, + "learning_rate": 8.895134553785044e-05, + "loss": 1.1504, + "step": 15335 + }, + { + "epoch": 0.5492148190592153, + "grad_norm": 1.7482348680496216, + "learning_rate": 8.893981761282874e-05, + "loss": 1.1761, + "step": 15336 + }, + { + "epoch": 0.5492506311887836, + "grad_norm": 1.3339041471481323, + "learning_rate": 8.89282898366075e-05, + "loss": 0.9921, + "step": 15337 + }, + { + "epoch": 0.549286443318352, + "grad_norm": 1.425553321838379, + "learning_rate": 8.891676220934188e-05, + "loss": 0.9924, + "step": 15338 + }, + { + "epoch": 0.5493222554479202, + "grad_norm": 1.6317082643508911, + "learning_rate": 8.89052347311869e-05, + "loss": 1.1158, + "step": 15339 + }, + { + "epoch": 0.5493580675774885, + "grad_norm": 1.692675232887268, + "learning_rate": 8.889370740229767e-05, + "loss": 1.0832, + "step": 15340 + }, + { + "epoch": 0.5493938797070568, + "grad_norm": 1.5261759757995605, + "learning_rate": 8.88821802228293e-05, + "loss": 1.0583, + "step": 15341 + }, + { + "epoch": 0.549429691836625, + "grad_norm": 1.3939419984817505, + "learning_rate": 8.887065319293684e-05, + "loss": 1.1687, + "step": 15342 + }, + { + "epoch": 0.5494655039661933, + "grad_norm": 1.4253418445587158, + "learning_rate": 8.88591263127754e-05, + "loss": 1.0668, + "step": 15343 + }, + { + "epoch": 0.5495013160957616, + "grad_norm": 1.6338064670562744, + "learning_rate": 8.884759958250002e-05, + "loss": 1.2414, + "step": 15344 + }, + { + "epoch": 0.54953712822533, + "grad_norm": 1.6318001747131348, + "learning_rate": 8.883607300226581e-05, + "loss": 1.0716, + "step": 15345 + }, + { + "epoch": 0.5495729403548982, + "grad_norm": 1.7355836629867554, + "learning_rate": 8.882454657222784e-05, + "loss": 1.2547, + "step": 15346 + }, + { + "epoch": 0.5496087524844665, + "grad_norm": 1.4268702268600464, + "learning_rate": 8.88130202925412e-05, + "loss": 1.1542, + "step": 15347 + }, + { + "epoch": 0.5496445646140348, + "grad_norm": 1.973320484161377, + "learning_rate": 8.880149416336093e-05, + "loss": 1.3695, + "step": 15348 + }, + { + "epoch": 0.549680376743603, + "grad_norm": 1.790505290031433, + "learning_rate": 8.878996818484209e-05, + "loss": 1.252, + "step": 15349 + }, + { + "epoch": 0.5497161888731713, + "grad_norm": 1.4172385931015015, + "learning_rate": 8.87784423571398e-05, + "loss": 1.0552, + "step": 15350 + }, + { + "epoch": 0.5497520010027396, + "grad_norm": 1.7928088903427124, + "learning_rate": 8.876691668040907e-05, + "loss": 1.1917, + "step": 15351 + }, + { + "epoch": 0.549787813132308, + "grad_norm": 1.7191046476364136, + "learning_rate": 8.8755391154805e-05, + "loss": 1.1093, + "step": 15352 + }, + { + "epoch": 0.5498236252618762, + "grad_norm": 1.68001127243042, + "learning_rate": 8.874386578048261e-05, + "loss": 1.01, + "step": 15353 + }, + { + "epoch": 0.5498594373914445, + "grad_norm": 1.4682663679122925, + "learning_rate": 8.873234055759703e-05, + "loss": 1.191, + "step": 15354 + }, + { + "epoch": 0.5498952495210128, + "grad_norm": 1.6058992147445679, + "learning_rate": 8.872081548630325e-05, + "loss": 1.0996, + "step": 15355 + }, + { + "epoch": 0.549931061650581, + "grad_norm": 1.684417486190796, + "learning_rate": 8.870929056675636e-05, + "loss": 1.1678, + "step": 15356 + }, + { + "epoch": 0.5499668737801493, + "grad_norm": 2.0567996501922607, + "learning_rate": 8.86977657991114e-05, + "loss": 1.2574, + "step": 15357 + }, + { + "epoch": 0.5500026859097176, + "grad_norm": 1.608168363571167, + "learning_rate": 8.86862411835234e-05, + "loss": 1.1312, + "step": 15358 + }, + { + "epoch": 0.550038498039286, + "grad_norm": 1.3640879392623901, + "learning_rate": 8.867471672014745e-05, + "loss": 1.0683, + "step": 15359 + }, + { + "epoch": 0.5500743101688542, + "grad_norm": 1.4975309371948242, + "learning_rate": 8.866319240913856e-05, + "loss": 1.0531, + "step": 15360 + }, + { + "epoch": 0.5501101222984225, + "grad_norm": 1.3660075664520264, + "learning_rate": 8.865166825065182e-05, + "loss": 1.0425, + "step": 15361 + }, + { + "epoch": 0.5501459344279908, + "grad_norm": 1.2586796283721924, + "learning_rate": 8.864014424484222e-05, + "loss": 1.013, + "step": 15362 + }, + { + "epoch": 0.550181746557559, + "grad_norm": 1.6470654010772705, + "learning_rate": 8.862862039186485e-05, + "loss": 1.3145, + "step": 15363 + }, + { + "epoch": 0.5502175586871273, + "grad_norm": 1.7822515964508057, + "learning_rate": 8.861709669187474e-05, + "loss": 1.1097, + "step": 15364 + }, + { + "epoch": 0.5502533708166956, + "grad_norm": 1.3892054557800293, + "learning_rate": 8.860557314502685e-05, + "loss": 1.1458, + "step": 15365 + }, + { + "epoch": 0.5502891829462639, + "grad_norm": 1.3947480916976929, + "learning_rate": 8.859404975147632e-05, + "loss": 1.0119, + "step": 15366 + }, + { + "epoch": 0.5503249950758322, + "grad_norm": 1.658197045326233, + "learning_rate": 8.858252651137812e-05, + "loss": 1.1953, + "step": 15367 + }, + { + "epoch": 0.5503608072054005, + "grad_norm": 1.2388184070587158, + "learning_rate": 8.857100342488732e-05, + "loss": 1.0897, + "step": 15368 + }, + { + "epoch": 0.5503966193349688, + "grad_norm": 1.4461625814437866, + "learning_rate": 8.855948049215888e-05, + "loss": 1.2401, + "step": 15369 + }, + { + "epoch": 0.550432431464537, + "grad_norm": 1.3300421237945557, + "learning_rate": 8.854795771334794e-05, + "loss": 1.1425, + "step": 15370 + }, + { + "epoch": 0.5504682435941053, + "grad_norm": 1.4963167905807495, + "learning_rate": 8.85364350886094e-05, + "loss": 1.0065, + "step": 15371 + }, + { + "epoch": 0.5505040557236736, + "grad_norm": 2.0051097869873047, + "learning_rate": 8.852491261809837e-05, + "loss": 1.3316, + "step": 15372 + }, + { + "epoch": 0.5505398678532419, + "grad_norm": 1.4374182224273682, + "learning_rate": 8.851339030196986e-05, + "loss": 1.1473, + "step": 15373 + }, + { + "epoch": 0.5505756799828102, + "grad_norm": 1.5734846591949463, + "learning_rate": 8.85018681403788e-05, + "loss": 0.9992, + "step": 15374 + }, + { + "epoch": 0.5506114921123785, + "grad_norm": 1.4475334882736206, + "learning_rate": 8.849034613348035e-05, + "loss": 1.0649, + "step": 15375 + }, + { + "epoch": 0.5506473042419467, + "grad_norm": 1.3209015130996704, + "learning_rate": 8.847882428142936e-05, + "loss": 1.0211, + "step": 15376 + }, + { + "epoch": 0.550683116371515, + "grad_norm": 1.4627764225006104, + "learning_rate": 8.8467302584381e-05, + "loss": 1.1606, + "step": 15377 + }, + { + "epoch": 0.5507189285010833, + "grad_norm": 1.1622644662857056, + "learning_rate": 8.845578104249014e-05, + "loss": 0.9747, + "step": 15378 + }, + { + "epoch": 0.5507547406306516, + "grad_norm": 1.2824229001998901, + "learning_rate": 8.844425965591192e-05, + "loss": 1.1238, + "step": 15379 + }, + { + "epoch": 0.5507905527602199, + "grad_norm": 1.5075678825378418, + "learning_rate": 8.843273842480124e-05, + "loss": 1.0256, + "step": 15380 + }, + { + "epoch": 0.5508263648897882, + "grad_norm": 1.3335285186767578, + "learning_rate": 8.842121734931316e-05, + "loss": 1.2239, + "step": 15381 + }, + { + "epoch": 0.5508621770193565, + "grad_norm": 1.4890097379684448, + "learning_rate": 8.840969642960271e-05, + "loss": 1.0175, + "step": 15382 + }, + { + "epoch": 0.5508979891489247, + "grad_norm": 1.3839421272277832, + "learning_rate": 8.839817566582477e-05, + "loss": 0.9838, + "step": 15383 + }, + { + "epoch": 0.550933801278493, + "grad_norm": 2.248251438140869, + "learning_rate": 8.838665505813448e-05, + "loss": 1.0463, + "step": 15384 + }, + { + "epoch": 0.5509696134080613, + "grad_norm": 1.642835259437561, + "learning_rate": 8.837513460668668e-05, + "loss": 1.2043, + "step": 15385 + }, + { + "epoch": 0.5510054255376295, + "grad_norm": 1.751665711402893, + "learning_rate": 8.836361431163653e-05, + "loss": 1.1064, + "step": 15386 + }, + { + "epoch": 0.5510412376671979, + "grad_norm": 1.7181556224822998, + "learning_rate": 8.835209417313886e-05, + "loss": 1.3148, + "step": 15387 + }, + { + "epoch": 0.5510770497967662, + "grad_norm": 1.6236302852630615, + "learning_rate": 8.834057419134883e-05, + "loss": 1.3156, + "step": 15388 + }, + { + "epoch": 0.5511128619263345, + "grad_norm": 1.5898280143737793, + "learning_rate": 8.832905436642125e-05, + "loss": 1.1075, + "step": 15389 + }, + { + "epoch": 0.5511486740559027, + "grad_norm": 1.4033300876617432, + "learning_rate": 8.831753469851126e-05, + "loss": 1.0989, + "step": 15390 + }, + { + "epoch": 0.551184486185471, + "grad_norm": 1.5938544273376465, + "learning_rate": 8.830601518777375e-05, + "loss": 1.1372, + "step": 15391 + }, + { + "epoch": 0.5512202983150393, + "grad_norm": 1.4615094661712646, + "learning_rate": 8.829449583436367e-05, + "loss": 1.1943, + "step": 15392 + }, + { + "epoch": 0.5512561104446075, + "grad_norm": 1.3572907447814941, + "learning_rate": 8.828297663843612e-05, + "loss": 1.1524, + "step": 15393 + }, + { + "epoch": 0.5512919225741759, + "grad_norm": 1.7901415824890137, + "learning_rate": 8.827145760014595e-05, + "loss": 1.2448, + "step": 15394 + }, + { + "epoch": 0.5513277347037442, + "grad_norm": 1.5621986389160156, + "learning_rate": 8.825993871964823e-05, + "loss": 1.2881, + "step": 15395 + }, + { + "epoch": 0.5513635468333125, + "grad_norm": 1.6693061590194702, + "learning_rate": 8.824841999709785e-05, + "loss": 0.9429, + "step": 15396 + }, + { + "epoch": 0.5513993589628807, + "grad_norm": 1.7139192819595337, + "learning_rate": 8.823690143264988e-05, + "loss": 1.2685, + "step": 15397 + }, + { + "epoch": 0.551435171092449, + "grad_norm": 1.524372935295105, + "learning_rate": 8.822538302645916e-05, + "loss": 1.1982, + "step": 15398 + }, + { + "epoch": 0.5514709832220173, + "grad_norm": 1.805549144744873, + "learning_rate": 8.821386477868078e-05, + "loss": 0.9566, + "step": 15399 + }, + { + "epoch": 0.5515067953515855, + "grad_norm": 1.4262418746948242, + "learning_rate": 8.820234668946963e-05, + "loss": 1.2033, + "step": 15400 + }, + { + "epoch": 0.5515426074811539, + "grad_norm": 1.2964640855789185, + "learning_rate": 8.819082875898068e-05, + "loss": 1.1087, + "step": 15401 + }, + { + "epoch": 0.5515784196107222, + "grad_norm": 1.5346555709838867, + "learning_rate": 8.817931098736891e-05, + "loss": 1.0961, + "step": 15402 + }, + { + "epoch": 0.5516142317402905, + "grad_norm": 1.6153591871261597, + "learning_rate": 8.816779337478923e-05, + "loss": 1.0559, + "step": 15403 + }, + { + "epoch": 0.5516500438698587, + "grad_norm": 2.105076313018799, + "learning_rate": 8.815627592139665e-05, + "loss": 0.9491, + "step": 15404 + }, + { + "epoch": 0.551685855999427, + "grad_norm": 1.3224414587020874, + "learning_rate": 8.814475862734608e-05, + "loss": 1.1877, + "step": 15405 + }, + { + "epoch": 0.5517216681289953, + "grad_norm": 1.3843016624450684, + "learning_rate": 8.813324149279254e-05, + "loss": 0.9608, + "step": 15406 + }, + { + "epoch": 0.5517574802585635, + "grad_norm": 1.4735214710235596, + "learning_rate": 8.812172451789086e-05, + "loss": 0.9288, + "step": 15407 + }, + { + "epoch": 0.5517932923881319, + "grad_norm": 1.6198002099990845, + "learning_rate": 8.811020770279612e-05, + "loss": 1.0707, + "step": 15408 + }, + { + "epoch": 0.5518291045177002, + "grad_norm": 1.6581356525421143, + "learning_rate": 8.809869104766318e-05, + "loss": 1.2376, + "step": 15409 + }, + { + "epoch": 0.5518649166472684, + "grad_norm": 1.4459307193756104, + "learning_rate": 8.808717455264698e-05, + "loss": 1.3618, + "step": 15410 + }, + { + "epoch": 0.5519007287768367, + "grad_norm": 1.4482595920562744, + "learning_rate": 8.80756582179025e-05, + "loss": 1.1602, + "step": 15411 + }, + { + "epoch": 0.551936540906405, + "grad_norm": 1.3722344636917114, + "learning_rate": 8.806414204358465e-05, + "loss": 1.02, + "step": 15412 + }, + { + "epoch": 0.5519723530359733, + "grad_norm": 1.1951289176940918, + "learning_rate": 8.805262602984838e-05, + "loss": 1.0783, + "step": 15413 + }, + { + "epoch": 0.5520081651655415, + "grad_norm": 1.401026725769043, + "learning_rate": 8.804111017684858e-05, + "loss": 1.1448, + "step": 15414 + }, + { + "epoch": 0.5520439772951099, + "grad_norm": 1.902077317237854, + "learning_rate": 8.802959448474025e-05, + "loss": 1.122, + "step": 15415 + }, + { + "epoch": 0.5520797894246782, + "grad_norm": 1.4906680583953857, + "learning_rate": 8.801807895367827e-05, + "loss": 1.0939, + "step": 15416 + }, + { + "epoch": 0.5521156015542464, + "grad_norm": 1.3725534677505493, + "learning_rate": 8.80065635838176e-05, + "loss": 0.9874, + "step": 15417 + }, + { + "epoch": 0.5521514136838147, + "grad_norm": 1.7519763708114624, + "learning_rate": 8.799504837531315e-05, + "loss": 1.2309, + "step": 15418 + }, + { + "epoch": 0.552187225813383, + "grad_norm": 1.2381460666656494, + "learning_rate": 8.798353332831981e-05, + "loss": 1.0225, + "step": 15419 + }, + { + "epoch": 0.5522230379429512, + "grad_norm": 1.4687094688415527, + "learning_rate": 8.797201844299257e-05, + "loss": 1.1079, + "step": 15420 + }, + { + "epoch": 0.5522588500725195, + "grad_norm": 1.38584566116333, + "learning_rate": 8.796050371948627e-05, + "loss": 1.1605, + "step": 15421 + }, + { + "epoch": 0.5522946622020879, + "grad_norm": 1.7128081321716309, + "learning_rate": 8.794898915795588e-05, + "loss": 1.2306, + "step": 15422 + }, + { + "epoch": 0.5523304743316562, + "grad_norm": 1.7132806777954102, + "learning_rate": 8.793747475855628e-05, + "loss": 1.0095, + "step": 15423 + }, + { + "epoch": 0.5523662864612244, + "grad_norm": 1.3418469429016113, + "learning_rate": 8.792596052144242e-05, + "loss": 1.2769, + "step": 15424 + }, + { + "epoch": 0.5524020985907927, + "grad_norm": 1.301711916923523, + "learning_rate": 8.791444644676916e-05, + "loss": 1.1008, + "step": 15425 + }, + { + "epoch": 0.552437910720361, + "grad_norm": 1.3854868412017822, + "learning_rate": 8.790293253469145e-05, + "loss": 1.2885, + "step": 15426 + }, + { + "epoch": 0.5524737228499292, + "grad_norm": 1.909411072731018, + "learning_rate": 8.789141878536419e-05, + "loss": 1.0038, + "step": 15427 + }, + { + "epoch": 0.5525095349794975, + "grad_norm": 1.8623158931732178, + "learning_rate": 8.787990519894224e-05, + "loss": 1.167, + "step": 15428 + }, + { + "epoch": 0.5525453471090659, + "grad_norm": 1.436476230621338, + "learning_rate": 8.786839177558057e-05, + "loss": 1.3415, + "step": 15429 + }, + { + "epoch": 0.5525811592386342, + "grad_norm": 1.697527527809143, + "learning_rate": 8.7856878515434e-05, + "loss": 1.2337, + "step": 15430 + }, + { + "epoch": 0.5526169713682024, + "grad_norm": 1.4566400051116943, + "learning_rate": 8.784536541865752e-05, + "loss": 0.9736, + "step": 15431 + }, + { + "epoch": 0.5526527834977707, + "grad_norm": 1.4909653663635254, + "learning_rate": 8.783385248540591e-05, + "loss": 1.0148, + "step": 15432 + }, + { + "epoch": 0.552688595627339, + "grad_norm": 1.5965017080307007, + "learning_rate": 8.782233971583416e-05, + "loss": 1.2706, + "step": 15433 + }, + { + "epoch": 0.5527244077569072, + "grad_norm": 1.7278697490692139, + "learning_rate": 8.781082711009709e-05, + "loss": 1.3062, + "step": 15434 + }, + { + "epoch": 0.5527602198864755, + "grad_norm": 1.4214926958084106, + "learning_rate": 8.779931466834965e-05, + "loss": 1.0, + "step": 15435 + }, + { + "epoch": 0.5527960320160439, + "grad_norm": 1.6373084783554077, + "learning_rate": 8.778780239074669e-05, + "loss": 0.9939, + "step": 15436 + }, + { + "epoch": 0.5528318441456122, + "grad_norm": 2.079941749572754, + "learning_rate": 8.777629027744307e-05, + "loss": 1.1385, + "step": 15437 + }, + { + "epoch": 0.5528676562751804, + "grad_norm": 1.4675730466842651, + "learning_rate": 8.776477832859374e-05, + "loss": 1.1007, + "step": 15438 + }, + { + "epoch": 0.5529034684047487, + "grad_norm": 1.5732439756393433, + "learning_rate": 8.77532665443535e-05, + "loss": 1.0177, + "step": 15439 + }, + { + "epoch": 0.552939280534317, + "grad_norm": 1.5106436014175415, + "learning_rate": 8.774175492487728e-05, + "loss": 1.0918, + "step": 15440 + }, + { + "epoch": 0.5529750926638852, + "grad_norm": 1.4777730703353882, + "learning_rate": 8.77302434703199e-05, + "loss": 1.056, + "step": 15441 + }, + { + "epoch": 0.5530109047934535, + "grad_norm": 1.7764559984207153, + "learning_rate": 8.771873218083631e-05, + "loss": 1.2566, + "step": 15442 + }, + { + "epoch": 0.5530467169230219, + "grad_norm": 1.9764949083328247, + "learning_rate": 8.770722105658132e-05, + "loss": 1.1339, + "step": 15443 + }, + { + "epoch": 0.5530825290525901, + "grad_norm": 1.6454055309295654, + "learning_rate": 8.769571009770982e-05, + "loss": 1.0627, + "step": 15444 + }, + { + "epoch": 0.5531183411821584, + "grad_norm": 1.2796331644058228, + "learning_rate": 8.768419930437667e-05, + "loss": 1.224, + "step": 15445 + }, + { + "epoch": 0.5531541533117267, + "grad_norm": 1.4239739179611206, + "learning_rate": 8.767268867673671e-05, + "loss": 1.2351, + "step": 15446 + }, + { + "epoch": 0.553189965441295, + "grad_norm": 1.4398677349090576, + "learning_rate": 8.766117821494485e-05, + "loss": 0.9533, + "step": 15447 + }, + { + "epoch": 0.5532257775708632, + "grad_norm": 2.099118947982788, + "learning_rate": 8.76496679191559e-05, + "loss": 1.2852, + "step": 15448 + }, + { + "epoch": 0.5532615897004315, + "grad_norm": 1.5516459941864014, + "learning_rate": 8.763815778952475e-05, + "loss": 1.0967, + "step": 15449 + }, + { + "epoch": 0.5532974018299999, + "grad_norm": 1.4418107271194458, + "learning_rate": 8.762664782620623e-05, + "loss": 1.2671, + "step": 15450 + }, + { + "epoch": 0.5533332139595681, + "grad_norm": 1.5718954801559448, + "learning_rate": 8.761513802935523e-05, + "loss": 1.301, + "step": 15451 + }, + { + "epoch": 0.5533690260891364, + "grad_norm": 1.1993294954299927, + "learning_rate": 8.760362839912654e-05, + "loss": 1.0101, + "step": 15452 + }, + { + "epoch": 0.5534048382187047, + "grad_norm": 1.2486159801483154, + "learning_rate": 8.759211893567505e-05, + "loss": 1.1244, + "step": 15453 + }, + { + "epoch": 0.5534406503482729, + "grad_norm": 1.4573440551757812, + "learning_rate": 8.758060963915562e-05, + "loss": 1.1733, + "step": 15454 + }, + { + "epoch": 0.5534764624778412, + "grad_norm": 2.1276590824127197, + "learning_rate": 8.756910050972304e-05, + "loss": 1.0551, + "step": 15455 + }, + { + "epoch": 0.5535122746074095, + "grad_norm": 1.3186815977096558, + "learning_rate": 8.755759154753219e-05, + "loss": 1.0556, + "step": 15456 + }, + { + "epoch": 0.5535480867369779, + "grad_norm": 1.7299696207046509, + "learning_rate": 8.754608275273788e-05, + "loss": 1.066, + "step": 15457 + }, + { + "epoch": 0.5535838988665461, + "grad_norm": 1.6902259588241577, + "learning_rate": 8.753457412549497e-05, + "loss": 1.178, + "step": 15458 + }, + { + "epoch": 0.5536197109961144, + "grad_norm": 1.6074737310409546, + "learning_rate": 8.752306566595828e-05, + "loss": 1.0562, + "step": 15459 + }, + { + "epoch": 0.5536555231256827, + "grad_norm": 1.4393666982650757, + "learning_rate": 8.751155737428267e-05, + "loss": 1.2267, + "step": 15460 + }, + { + "epoch": 0.5536913352552509, + "grad_norm": 1.317095160484314, + "learning_rate": 8.750004925062296e-05, + "loss": 1.1682, + "step": 15461 + }, + { + "epoch": 0.5537271473848192, + "grad_norm": 1.566422462463379, + "learning_rate": 8.74885412951339e-05, + "loss": 1.1544, + "step": 15462 + }, + { + "epoch": 0.5537629595143875, + "grad_norm": 2.0938937664031982, + "learning_rate": 8.747703350797044e-05, + "loss": 1.081, + "step": 15463 + }, + { + "epoch": 0.5537987716439559, + "grad_norm": 1.4565075635910034, + "learning_rate": 8.74655258892873e-05, + "loss": 1.1111, + "step": 15464 + }, + { + "epoch": 0.5538345837735241, + "grad_norm": 2.374903917312622, + "learning_rate": 8.745401843923936e-05, + "loss": 1.2654, + "step": 15465 + }, + { + "epoch": 0.5538703959030924, + "grad_norm": 1.393269419670105, + "learning_rate": 8.74425111579814e-05, + "loss": 1.0326, + "step": 15466 + }, + { + "epoch": 0.5539062080326607, + "grad_norm": 1.5279608964920044, + "learning_rate": 8.743100404566828e-05, + "loss": 1.2075, + "step": 15467 + }, + { + "epoch": 0.5539420201622289, + "grad_norm": 1.2495229244232178, + "learning_rate": 8.741949710245476e-05, + "loss": 1.0272, + "step": 15468 + }, + { + "epoch": 0.5539778322917972, + "grad_norm": 1.7568047046661377, + "learning_rate": 8.740799032849572e-05, + "loss": 0.9856, + "step": 15469 + }, + { + "epoch": 0.5540136444213655, + "grad_norm": 1.7455581426620483, + "learning_rate": 8.739648372394592e-05, + "loss": 1.3846, + "step": 15470 + }, + { + "epoch": 0.5540494565509338, + "grad_norm": 1.9716076850891113, + "learning_rate": 8.738497728896013e-05, + "loss": 1.0246, + "step": 15471 + }, + { + "epoch": 0.5540852686805021, + "grad_norm": 1.6673234701156616, + "learning_rate": 8.737347102369325e-05, + "loss": 0.837, + "step": 15472 + }, + { + "epoch": 0.5541210808100704, + "grad_norm": 2.1924996376037598, + "learning_rate": 8.736196492829997e-05, + "loss": 1.2761, + "step": 15473 + }, + { + "epoch": 0.5541568929396387, + "grad_norm": 1.6973023414611816, + "learning_rate": 8.735045900293522e-05, + "loss": 0.9858, + "step": 15474 + }, + { + "epoch": 0.5541927050692069, + "grad_norm": 1.9330719709396362, + "learning_rate": 8.733895324775366e-05, + "loss": 1.1725, + "step": 15475 + }, + { + "epoch": 0.5542285171987752, + "grad_norm": 1.630349040031433, + "learning_rate": 8.73274476629102e-05, + "loss": 1.2241, + "step": 15476 + }, + { + "epoch": 0.5542643293283435, + "grad_norm": 1.425303339958191, + "learning_rate": 8.731594224855956e-05, + "loss": 1.0991, + "step": 15477 + }, + { + "epoch": 0.5543001414579118, + "grad_norm": 1.5602591037750244, + "learning_rate": 8.730443700485658e-05, + "loss": 1.0671, + "step": 15478 + }, + { + "epoch": 0.5543359535874801, + "grad_norm": 1.6299757957458496, + "learning_rate": 8.729293193195603e-05, + "loss": 1.0868, + "step": 15479 + }, + { + "epoch": 0.5543717657170484, + "grad_norm": 1.5720100402832031, + "learning_rate": 8.728142703001264e-05, + "loss": 1.1143, + "step": 15480 + }, + { + "epoch": 0.5544075778466167, + "grad_norm": 1.7321925163269043, + "learning_rate": 8.72699222991813e-05, + "loss": 1.1397, + "step": 15481 + }, + { + "epoch": 0.5544433899761849, + "grad_norm": 2.1033034324645996, + "learning_rate": 8.725841773961669e-05, + "loss": 1.1896, + "step": 15482 + }, + { + "epoch": 0.5544792021057532, + "grad_norm": 1.7887487411499023, + "learning_rate": 8.724691335147367e-05, + "loss": 1.2036, + "step": 15483 + }, + { + "epoch": 0.5545150142353215, + "grad_norm": 1.8429871797561646, + "learning_rate": 8.723540913490693e-05, + "loss": 1.1811, + "step": 15484 + }, + { + "epoch": 0.5545508263648898, + "grad_norm": 1.6714324951171875, + "learning_rate": 8.722390509007137e-05, + "loss": 1.0186, + "step": 15485 + }, + { + "epoch": 0.5545866384944581, + "grad_norm": 1.5632121562957764, + "learning_rate": 8.721240121712161e-05, + "loss": 1.0831, + "step": 15486 + }, + { + "epoch": 0.5546224506240264, + "grad_norm": 1.2174334526062012, + "learning_rate": 8.720089751621256e-05, + "loss": 1.0845, + "step": 15487 + }, + { + "epoch": 0.5546582627535946, + "grad_norm": 1.3966975212097168, + "learning_rate": 8.71893939874989e-05, + "loss": 0.9961, + "step": 15488 + }, + { + "epoch": 0.5546940748831629, + "grad_norm": 1.4218897819519043, + "learning_rate": 8.717789063113539e-05, + "loss": 1.1898, + "step": 15489 + }, + { + "epoch": 0.5547298870127312, + "grad_norm": 1.6287486553192139, + "learning_rate": 8.716638744727687e-05, + "loss": 0.9209, + "step": 15490 + }, + { + "epoch": 0.5547656991422995, + "grad_norm": 1.6031004190444946, + "learning_rate": 8.7154884436078e-05, + "loss": 0.988, + "step": 15491 + }, + { + "epoch": 0.5548015112718678, + "grad_norm": 1.4243695735931396, + "learning_rate": 8.714338159769366e-05, + "loss": 1.2057, + "step": 15492 + }, + { + "epoch": 0.5548373234014361, + "grad_norm": 1.6708341836929321, + "learning_rate": 8.713187893227847e-05, + "loss": 0.9988, + "step": 15493 + }, + { + "epoch": 0.5548731355310044, + "grad_norm": 1.5617403984069824, + "learning_rate": 8.71203764399873e-05, + "loss": 1.1072, + "step": 15494 + }, + { + "epoch": 0.5549089476605726, + "grad_norm": 1.43968665599823, + "learning_rate": 8.71088741209748e-05, + "loss": 1.0941, + "step": 15495 + }, + { + "epoch": 0.5549447597901409, + "grad_norm": 1.8314317464828491, + "learning_rate": 8.709737197539583e-05, + "loss": 1.2744, + "step": 15496 + }, + { + "epoch": 0.5549805719197092, + "grad_norm": 1.5108131170272827, + "learning_rate": 8.708587000340506e-05, + "loss": 1.1319, + "step": 15497 + }, + { + "epoch": 0.5550163840492774, + "grad_norm": 1.3511900901794434, + "learning_rate": 8.707436820515723e-05, + "loss": 1.0509, + "step": 15498 + }, + { + "epoch": 0.5550521961788458, + "grad_norm": 1.5801746845245361, + "learning_rate": 8.706286658080711e-05, + "loss": 1.1763, + "step": 15499 + }, + { + "epoch": 0.5550880083084141, + "grad_norm": 1.6493682861328125, + "learning_rate": 8.705136513050944e-05, + "loss": 0.9389, + "step": 15500 + }, + { + "epoch": 0.5551238204379824, + "grad_norm": 1.629318118095398, + "learning_rate": 8.703986385441895e-05, + "loss": 1.1723, + "step": 15501 + }, + { + "epoch": 0.5551596325675506, + "grad_norm": 1.3209196329116821, + "learning_rate": 8.702836275269033e-05, + "loss": 1.0744, + "step": 15502 + }, + { + "epoch": 0.5551954446971189, + "grad_norm": 1.4219048023223877, + "learning_rate": 8.701686182547842e-05, + "loss": 1.1215, + "step": 15503 + }, + { + "epoch": 0.5552312568266872, + "grad_norm": 2.0155749320983887, + "learning_rate": 8.700536107293784e-05, + "loss": 1.2186, + "step": 15504 + }, + { + "epoch": 0.5552670689562554, + "grad_norm": 1.4456292390823364, + "learning_rate": 8.699386049522341e-05, + "loss": 1.1878, + "step": 15505 + }, + { + "epoch": 0.5553028810858238, + "grad_norm": 1.5315937995910645, + "learning_rate": 8.69823600924898e-05, + "loss": 1.1405, + "step": 15506 + }, + { + "epoch": 0.5553386932153921, + "grad_norm": 1.1982078552246094, + "learning_rate": 8.697085986489172e-05, + "loss": 1.2514, + "step": 15507 + }, + { + "epoch": 0.5553745053449604, + "grad_norm": 1.6061995029449463, + "learning_rate": 8.695935981258394e-05, + "loss": 0.9789, + "step": 15508 + }, + { + "epoch": 0.5554103174745286, + "grad_norm": 1.5229169130325317, + "learning_rate": 8.694785993572112e-05, + "loss": 1.0227, + "step": 15509 + }, + { + "epoch": 0.5554461296040969, + "grad_norm": 1.430404782295227, + "learning_rate": 8.693636023445804e-05, + "loss": 1.0763, + "step": 15510 + }, + { + "epoch": 0.5554819417336652, + "grad_norm": 1.631230115890503, + "learning_rate": 8.692486070894935e-05, + "loss": 1.1645, + "step": 15511 + }, + { + "epoch": 0.5555177538632334, + "grad_norm": 1.929694652557373, + "learning_rate": 8.691336135934982e-05, + "loss": 1.2287, + "step": 15512 + }, + { + "epoch": 0.5555535659928018, + "grad_norm": 1.41084885597229, + "learning_rate": 8.69018621858141e-05, + "loss": 0.8276, + "step": 15513 + }, + { + "epoch": 0.5555893781223701, + "grad_norm": 1.2885822057724, + "learning_rate": 8.689036318849697e-05, + "loss": 1.0441, + "step": 15514 + }, + { + "epoch": 0.5556251902519384, + "grad_norm": 1.5272276401519775, + "learning_rate": 8.68788643675531e-05, + "loss": 1.1448, + "step": 15515 + }, + { + "epoch": 0.5556610023815066, + "grad_norm": 1.424603819847107, + "learning_rate": 8.686736572313714e-05, + "loss": 1.0185, + "step": 15516 + }, + { + "epoch": 0.5556968145110749, + "grad_norm": 1.5994365215301514, + "learning_rate": 8.685586725540387e-05, + "loss": 1.0363, + "step": 15517 + }, + { + "epoch": 0.5557326266406432, + "grad_norm": 1.7965264320373535, + "learning_rate": 8.684436896450791e-05, + "loss": 1.2448, + "step": 15518 + }, + { + "epoch": 0.5557684387702114, + "grad_norm": 1.6310009956359863, + "learning_rate": 8.683287085060404e-05, + "loss": 1.1515, + "step": 15519 + }, + { + "epoch": 0.5558042508997798, + "grad_norm": 1.672195315361023, + "learning_rate": 8.682137291384687e-05, + "loss": 1.1553, + "step": 15520 + }, + { + "epoch": 0.5558400630293481, + "grad_norm": 1.3113112449645996, + "learning_rate": 8.680987515439116e-05, + "loss": 1.0951, + "step": 15521 + }, + { + "epoch": 0.5558758751589163, + "grad_norm": 1.4277819395065308, + "learning_rate": 8.679837757239156e-05, + "loss": 1.1965, + "step": 15522 + }, + { + "epoch": 0.5559116872884846, + "grad_norm": 1.4583649635314941, + "learning_rate": 8.678688016800276e-05, + "loss": 0.9614, + "step": 15523 + }, + { + "epoch": 0.5559474994180529, + "grad_norm": 1.5158084630966187, + "learning_rate": 8.677538294137945e-05, + "loss": 1.2333, + "step": 15524 + }, + { + "epoch": 0.5559833115476212, + "grad_norm": 1.3781957626342773, + "learning_rate": 8.676388589267628e-05, + "loss": 1.0814, + "step": 15525 + }, + { + "epoch": 0.5560191236771894, + "grad_norm": 1.9023503065109253, + "learning_rate": 8.675238902204797e-05, + "loss": 1.2073, + "step": 15526 + }, + { + "epoch": 0.5560549358067578, + "grad_norm": 1.4734536409378052, + "learning_rate": 8.674089232964916e-05, + "loss": 1.2279, + "step": 15527 + }, + { + "epoch": 0.5560907479363261, + "grad_norm": 1.2270431518554688, + "learning_rate": 8.672939581563456e-05, + "loss": 1.0211, + "step": 15528 + }, + { + "epoch": 0.5561265600658943, + "grad_norm": 2.6653339862823486, + "learning_rate": 8.67178994801588e-05, + "loss": 0.995, + "step": 15529 + }, + { + "epoch": 0.5561623721954626, + "grad_norm": 1.2233597040176392, + "learning_rate": 8.67064033233766e-05, + "loss": 0.9817, + "step": 15530 + }, + { + "epoch": 0.5561981843250309, + "grad_norm": 1.7747924327850342, + "learning_rate": 8.669490734544256e-05, + "loss": 1.2116, + "step": 15531 + }, + { + "epoch": 0.5562339964545991, + "grad_norm": 1.755318284034729, + "learning_rate": 8.668341154651141e-05, + "loss": 1.1288, + "step": 15532 + }, + { + "epoch": 0.5562698085841674, + "grad_norm": 1.5162807703018188, + "learning_rate": 8.667191592673779e-05, + "loss": 1.1696, + "step": 15533 + }, + { + "epoch": 0.5563056207137358, + "grad_norm": 1.6218836307525635, + "learning_rate": 8.666042048627632e-05, + "loss": 1.2491, + "step": 15534 + }, + { + "epoch": 0.5563414328433041, + "grad_norm": 1.7209824323654175, + "learning_rate": 8.66489252252817e-05, + "loss": 1.1752, + "step": 15535 + }, + { + "epoch": 0.5563772449728723, + "grad_norm": 1.6707121133804321, + "learning_rate": 8.663743014390855e-05, + "loss": 1.1179, + "step": 15536 + }, + { + "epoch": 0.5564130571024406, + "grad_norm": 1.7304900884628296, + "learning_rate": 8.662593524231158e-05, + "loss": 0.9615, + "step": 15537 + }, + { + "epoch": 0.5564488692320089, + "grad_norm": 1.2196024656295776, + "learning_rate": 8.661444052064536e-05, + "loss": 0.8766, + "step": 15538 + }, + { + "epoch": 0.5564846813615771, + "grad_norm": 1.6205333471298218, + "learning_rate": 8.66029459790646e-05, + "loss": 1.0533, + "step": 15539 + }, + { + "epoch": 0.5565204934911454, + "grad_norm": 1.6130203008651733, + "learning_rate": 8.65914516177239e-05, + "loss": 1.1611, + "step": 15540 + }, + { + "epoch": 0.5565563056207138, + "grad_norm": 1.5165525674819946, + "learning_rate": 8.657995743677793e-05, + "loss": 1.2319, + "step": 15541 + }, + { + "epoch": 0.5565921177502821, + "grad_norm": 1.7035466432571411, + "learning_rate": 8.656846343638135e-05, + "loss": 1.2323, + "step": 15542 + }, + { + "epoch": 0.5566279298798503, + "grad_norm": 1.3465718030929565, + "learning_rate": 8.655696961668873e-05, + "loss": 1.1097, + "step": 15543 + }, + { + "epoch": 0.5566637420094186, + "grad_norm": 1.60360848903656, + "learning_rate": 8.654547597785478e-05, + "loss": 1.2612, + "step": 15544 + }, + { + "epoch": 0.5566995541389869, + "grad_norm": 1.551140546798706, + "learning_rate": 8.653398252003406e-05, + "loss": 1.1265, + "step": 15545 + }, + { + "epoch": 0.5567353662685551, + "grad_norm": 1.463932752609253, + "learning_rate": 8.652248924338126e-05, + "loss": 1.1638, + "step": 15546 + }, + { + "epoch": 0.5567711783981234, + "grad_norm": 1.6874456405639648, + "learning_rate": 8.651099614805097e-05, + "loss": 1.1661, + "step": 15547 + }, + { + "epoch": 0.5568069905276918, + "grad_norm": 1.4797452688217163, + "learning_rate": 8.649950323419783e-05, + "loss": 0.9911, + "step": 15548 + }, + { + "epoch": 0.55684280265726, + "grad_norm": 1.783853530883789, + "learning_rate": 8.648801050197646e-05, + "loss": 1.2566, + "step": 15549 + }, + { + "epoch": 0.5568786147868283, + "grad_norm": 1.7791398763656616, + "learning_rate": 8.647651795154148e-05, + "loss": 0.9901, + "step": 15550 + }, + { + "epoch": 0.5569144269163966, + "grad_norm": 1.4150989055633545, + "learning_rate": 8.646502558304751e-05, + "loss": 1.0319, + "step": 15551 + }, + { + "epoch": 0.5569502390459649, + "grad_norm": 1.677121877670288, + "learning_rate": 8.645353339664915e-05, + "loss": 1.1052, + "step": 15552 + }, + { + "epoch": 0.5569860511755331, + "grad_norm": 1.7594122886657715, + "learning_rate": 8.644204139250105e-05, + "loss": 1.1328, + "step": 15553 + }, + { + "epoch": 0.5570218633051014, + "grad_norm": 1.3713117837905884, + "learning_rate": 8.643054957075776e-05, + "loss": 1.0091, + "step": 15554 + }, + { + "epoch": 0.5570576754346698, + "grad_norm": 1.4024144411087036, + "learning_rate": 8.641905793157395e-05, + "loss": 0.9818, + "step": 15555 + }, + { + "epoch": 0.557093487564238, + "grad_norm": 1.6589620113372803, + "learning_rate": 8.640756647510417e-05, + "loss": 1.2608, + "step": 15556 + }, + { + "epoch": 0.5571292996938063, + "grad_norm": 1.304970622062683, + "learning_rate": 8.639607520150308e-05, + "loss": 1.0067, + "step": 15557 + }, + { + "epoch": 0.5571651118233746, + "grad_norm": 1.384920358657837, + "learning_rate": 8.638458411092527e-05, + "loss": 0.9544, + "step": 15558 + }, + { + "epoch": 0.5572009239529429, + "grad_norm": 1.379024863243103, + "learning_rate": 8.637309320352526e-05, + "loss": 1.2484, + "step": 15559 + }, + { + "epoch": 0.5572367360825111, + "grad_norm": 1.5823109149932861, + "learning_rate": 8.636160247945774e-05, + "loss": 1.2073, + "step": 15560 + }, + { + "epoch": 0.5572725482120794, + "grad_norm": 1.2703381776809692, + "learning_rate": 8.635011193887725e-05, + "loss": 1.0804, + "step": 15561 + }, + { + "epoch": 0.5573083603416478, + "grad_norm": 1.6642780303955078, + "learning_rate": 8.633862158193841e-05, + "loss": 1.0466, + "step": 15562 + }, + { + "epoch": 0.557344172471216, + "grad_norm": 1.5775129795074463, + "learning_rate": 8.632713140879577e-05, + "loss": 1.2285, + "step": 15563 + }, + { + "epoch": 0.5573799846007843, + "grad_norm": 1.5594664812088013, + "learning_rate": 8.631564141960397e-05, + "loss": 1.0783, + "step": 15564 + }, + { + "epoch": 0.5574157967303526, + "grad_norm": 1.4766054153442383, + "learning_rate": 8.630415161451754e-05, + "loss": 1.1377, + "step": 15565 + }, + { + "epoch": 0.5574516088599208, + "grad_norm": 1.3791030645370483, + "learning_rate": 8.62926619936911e-05, + "loss": 1.0626, + "step": 15566 + }, + { + "epoch": 0.5574874209894891, + "grad_norm": 1.3114112615585327, + "learning_rate": 8.628117255727924e-05, + "loss": 1.1333, + "step": 15567 + }, + { + "epoch": 0.5575232331190574, + "grad_norm": 1.520241618156433, + "learning_rate": 8.626968330543643e-05, + "loss": 1.1363, + "step": 15568 + }, + { + "epoch": 0.5575590452486258, + "grad_norm": 1.6289722919464111, + "learning_rate": 8.62581942383174e-05, + "loss": 1.1544, + "step": 15569 + }, + { + "epoch": 0.557594857378194, + "grad_norm": 1.3833484649658203, + "learning_rate": 8.624670535607658e-05, + "loss": 1.0525, + "step": 15570 + }, + { + "epoch": 0.5576306695077623, + "grad_norm": 2.2140953540802, + "learning_rate": 8.623521665886865e-05, + "loss": 1.2538, + "step": 15571 + }, + { + "epoch": 0.5576664816373306, + "grad_norm": 1.3486047983169556, + "learning_rate": 8.622372814684806e-05, + "loss": 1.0798, + "step": 15572 + }, + { + "epoch": 0.5577022937668988, + "grad_norm": 1.350164532661438, + "learning_rate": 8.621223982016948e-05, + "loss": 1.0595, + "step": 15573 + }, + { + "epoch": 0.5577381058964671, + "grad_norm": 1.6472748517990112, + "learning_rate": 8.620075167898743e-05, + "loss": 1.2203, + "step": 15574 + }, + { + "epoch": 0.5577739180260354, + "grad_norm": 1.3782347440719604, + "learning_rate": 8.618926372345645e-05, + "loss": 1.1951, + "step": 15575 + }, + { + "epoch": 0.5578097301556038, + "grad_norm": 1.3297303915023804, + "learning_rate": 8.617777595373117e-05, + "loss": 1.0934, + "step": 15576 + }, + { + "epoch": 0.557845542285172, + "grad_norm": 1.4043245315551758, + "learning_rate": 8.6166288369966e-05, + "loss": 0.986, + "step": 15577 + }, + { + "epoch": 0.5578813544147403, + "grad_norm": 1.4931926727294922, + "learning_rate": 8.615480097231564e-05, + "loss": 0.8628, + "step": 15578 + }, + { + "epoch": 0.5579171665443086, + "grad_norm": 1.940955400466919, + "learning_rate": 8.614331376093452e-05, + "loss": 1.3809, + "step": 15579 + }, + { + "epoch": 0.5579529786738768, + "grad_norm": 1.779113531112671, + "learning_rate": 8.613182673597729e-05, + "loss": 1.1266, + "step": 15580 + }, + { + "epoch": 0.5579887908034451, + "grad_norm": 1.820340871810913, + "learning_rate": 8.612033989759838e-05, + "loss": 1.1017, + "step": 15581 + }, + { + "epoch": 0.5580246029330134, + "grad_norm": 1.3140239715576172, + "learning_rate": 8.610885324595249e-05, + "loss": 1.1078, + "step": 15582 + }, + { + "epoch": 0.5580604150625817, + "grad_norm": 1.2726318836212158, + "learning_rate": 8.609736678119396e-05, + "loss": 1.1264, + "step": 15583 + }, + { + "epoch": 0.55809622719215, + "grad_norm": 1.8085172176361084, + "learning_rate": 8.60858805034775e-05, + "loss": 1.2683, + "step": 15584 + }, + { + "epoch": 0.5581320393217183, + "grad_norm": 1.435471534729004, + "learning_rate": 8.607439441295755e-05, + "loss": 1.1484, + "step": 15585 + }, + { + "epoch": 0.5581678514512866, + "grad_norm": 1.431082010269165, + "learning_rate": 8.606290850978862e-05, + "loss": 1.095, + "step": 15586 + }, + { + "epoch": 0.5582036635808548, + "grad_norm": 1.7177064418792725, + "learning_rate": 8.605142279412533e-05, + "loss": 1.3187, + "step": 15587 + }, + { + "epoch": 0.5582394757104231, + "grad_norm": 1.6923407316207886, + "learning_rate": 8.60399372661221e-05, + "loss": 1.3476, + "step": 15588 + }, + { + "epoch": 0.5582752878399914, + "grad_norm": 1.484975814819336, + "learning_rate": 8.602845192593359e-05, + "loss": 0.885, + "step": 15589 + }, + { + "epoch": 0.5583110999695597, + "grad_norm": 1.3940484523773193, + "learning_rate": 8.601696677371414e-05, + "loss": 1.1657, + "step": 15590 + }, + { + "epoch": 0.558346912099128, + "grad_norm": 1.5431387424468994, + "learning_rate": 8.600548180961845e-05, + "loss": 0.9657, + "step": 15591 + }, + { + "epoch": 0.5583827242286963, + "grad_norm": 1.5078051090240479, + "learning_rate": 8.599399703380087e-05, + "loss": 0.9932, + "step": 15592 + }, + { + "epoch": 0.5584185363582646, + "grad_norm": 1.41428804397583, + "learning_rate": 8.598251244641608e-05, + "loss": 1.0276, + "step": 15593 + }, + { + "epoch": 0.5584543484878328, + "grad_norm": 1.7717994451522827, + "learning_rate": 8.597102804761846e-05, + "loss": 1.3085, + "step": 15594 + }, + { + "epoch": 0.5584901606174011, + "grad_norm": 1.8217966556549072, + "learning_rate": 8.595954383756256e-05, + "loss": 1.078, + "step": 15595 + }, + { + "epoch": 0.5585259727469694, + "grad_norm": 1.5711164474487305, + "learning_rate": 8.594805981640289e-05, + "loss": 1.187, + "step": 15596 + }, + { + "epoch": 0.5585617848765377, + "grad_norm": 1.3461328744888306, + "learning_rate": 8.593657598429395e-05, + "loss": 1.0794, + "step": 15597 + }, + { + "epoch": 0.558597597006106, + "grad_norm": 1.6758302450180054, + "learning_rate": 8.592509234139023e-05, + "loss": 1.134, + "step": 15598 + }, + { + "epoch": 0.5586334091356743, + "grad_norm": 1.427232265472412, + "learning_rate": 8.591360888784622e-05, + "loss": 1.046, + "step": 15599 + }, + { + "epoch": 0.5586692212652425, + "grad_norm": 1.204676866531372, + "learning_rate": 8.590212562381649e-05, + "loss": 1.1836, + "step": 15600 + }, + { + "epoch": 0.5587050333948108, + "grad_norm": 2.1264262199401855, + "learning_rate": 8.589064254945541e-05, + "loss": 1.0813, + "step": 15601 + }, + { + "epoch": 0.5587408455243791, + "grad_norm": 1.4742547273635864, + "learning_rate": 8.58791596649176e-05, + "loss": 1.1328, + "step": 15602 + }, + { + "epoch": 0.5587766576539474, + "grad_norm": 1.5363762378692627, + "learning_rate": 8.586767697035745e-05, + "loss": 1.1643, + "step": 15603 + }, + { + "epoch": 0.5588124697835157, + "grad_norm": 1.2426013946533203, + "learning_rate": 8.585619446592947e-05, + "loss": 1.102, + "step": 15604 + }, + { + "epoch": 0.558848281913084, + "grad_norm": 1.5656614303588867, + "learning_rate": 8.584471215178817e-05, + "loss": 1.1291, + "step": 15605 + }, + { + "epoch": 0.5588840940426523, + "grad_norm": 1.840725064277649, + "learning_rate": 8.583323002808798e-05, + "loss": 1.3173, + "step": 15606 + }, + { + "epoch": 0.5589199061722205, + "grad_norm": 1.7283743619918823, + "learning_rate": 8.582174809498343e-05, + "loss": 1.2063, + "step": 15607 + }, + { + "epoch": 0.5589557183017888, + "grad_norm": 1.4188374280929565, + "learning_rate": 8.581026635262894e-05, + "loss": 1.356, + "step": 15608 + }, + { + "epoch": 0.5589915304313571, + "grad_norm": 1.4663143157958984, + "learning_rate": 8.579878480117906e-05, + "loss": 1.0345, + "step": 15609 + }, + { + "epoch": 0.5590273425609253, + "grad_norm": 1.4633209705352783, + "learning_rate": 8.578730344078817e-05, + "loss": 0.94, + "step": 15610 + }, + { + "epoch": 0.5590631546904937, + "grad_norm": 2.157926321029663, + "learning_rate": 8.577582227161081e-05, + "loss": 1.2074, + "step": 15611 + }, + { + "epoch": 0.559098966820062, + "grad_norm": 1.4904593229293823, + "learning_rate": 8.57643412938014e-05, + "loss": 1.0164, + "step": 15612 + }, + { + "epoch": 0.5591347789496303, + "grad_norm": 1.8428982496261597, + "learning_rate": 8.575286050751441e-05, + "loss": 1.165, + "step": 15613 + }, + { + "epoch": 0.5591705910791985, + "grad_norm": 1.4738954305648804, + "learning_rate": 8.574137991290432e-05, + "loss": 1.238, + "step": 15614 + }, + { + "epoch": 0.5592064032087668, + "grad_norm": 1.906456470489502, + "learning_rate": 8.572989951012554e-05, + "loss": 1.119, + "step": 15615 + }, + { + "epoch": 0.5592422153383351, + "grad_norm": 1.4663848876953125, + "learning_rate": 8.571841929933258e-05, + "loss": 1.1414, + "step": 15616 + }, + { + "epoch": 0.5592780274679033, + "grad_norm": 1.4197362661361694, + "learning_rate": 8.570693928067986e-05, + "loss": 1.0236, + "step": 15617 + }, + { + "epoch": 0.5593138395974717, + "grad_norm": 1.7357805967330933, + "learning_rate": 8.569545945432185e-05, + "loss": 1.083, + "step": 15618 + }, + { + "epoch": 0.55934965172704, + "grad_norm": 1.7206653356552124, + "learning_rate": 8.568397982041294e-05, + "loss": 1.2196, + "step": 15619 + }, + { + "epoch": 0.5593854638566083, + "grad_norm": 1.4624581336975098, + "learning_rate": 8.567250037910766e-05, + "loss": 1.2241, + "step": 15620 + }, + { + "epoch": 0.5594212759861765, + "grad_norm": 1.9137705564498901, + "learning_rate": 8.56610211305604e-05, + "loss": 1.2878, + "step": 15621 + }, + { + "epoch": 0.5594570881157448, + "grad_norm": 1.6232866048812866, + "learning_rate": 8.564954207492558e-05, + "loss": 1.1949, + "step": 15622 + }, + { + "epoch": 0.5594929002453131, + "grad_norm": 1.382311463356018, + "learning_rate": 8.56380632123577e-05, + "loss": 0.9805, + "step": 15623 + }, + { + "epoch": 0.5595287123748813, + "grad_norm": 1.7452785968780518, + "learning_rate": 8.56265845430111e-05, + "loss": 1.2539, + "step": 15624 + }, + { + "epoch": 0.5595645245044497, + "grad_norm": 1.3616284132003784, + "learning_rate": 8.561510606704031e-05, + "loss": 1.0681, + "step": 15625 + }, + { + "epoch": 0.559600336634018, + "grad_norm": 1.6745953559875488, + "learning_rate": 8.560362778459968e-05, + "loss": 1.0852, + "step": 15626 + }, + { + "epoch": 0.5596361487635862, + "grad_norm": 1.2596863508224487, + "learning_rate": 8.55921496958437e-05, + "loss": 1.0666, + "step": 15627 + }, + { + "epoch": 0.5596719608931545, + "grad_norm": 1.7874656915664673, + "learning_rate": 8.558067180092673e-05, + "loss": 1.1833, + "step": 15628 + }, + { + "epoch": 0.5597077730227228, + "grad_norm": 1.3450274467468262, + "learning_rate": 8.556919410000323e-05, + "loss": 1.0582, + "step": 15629 + }, + { + "epoch": 0.5597435851522911, + "grad_norm": 1.2480299472808838, + "learning_rate": 8.555771659322765e-05, + "loss": 1.0736, + "step": 15630 + }, + { + "epoch": 0.5597793972818593, + "grad_norm": 1.259621262550354, + "learning_rate": 8.55462392807543e-05, + "loss": 1.0064, + "step": 15631 + }, + { + "epoch": 0.5598152094114277, + "grad_norm": 1.31948721408844, + "learning_rate": 8.55347621627377e-05, + "loss": 0.9725, + "step": 15632 + }, + { + "epoch": 0.559851021540996, + "grad_norm": 1.5015435218811035, + "learning_rate": 8.55232852393322e-05, + "loss": 1.2507, + "step": 15633 + }, + { + "epoch": 0.5598868336705642, + "grad_norm": 1.462523341178894, + "learning_rate": 8.551180851069222e-05, + "loss": 1.0367, + "step": 15634 + }, + { + "epoch": 0.5599226458001325, + "grad_norm": 1.5193897485733032, + "learning_rate": 8.550033197697218e-05, + "loss": 1.0791, + "step": 15635 + }, + { + "epoch": 0.5599584579297008, + "grad_norm": 1.461478590965271, + "learning_rate": 8.548885563832646e-05, + "loss": 0.9969, + "step": 15636 + }, + { + "epoch": 0.559994270059269, + "grad_norm": 1.5515167713165283, + "learning_rate": 8.547737949490946e-05, + "loss": 1.1961, + "step": 15637 + }, + { + "epoch": 0.5600300821888373, + "grad_norm": 1.5691975355148315, + "learning_rate": 8.546590354687562e-05, + "loss": 1.2909, + "step": 15638 + }, + { + "epoch": 0.5600658943184057, + "grad_norm": 1.4537683725357056, + "learning_rate": 8.545442779437928e-05, + "loss": 1.1164, + "step": 15639 + }, + { + "epoch": 0.560101706447974, + "grad_norm": 1.4856207370758057, + "learning_rate": 8.544295223757484e-05, + "loss": 1.1992, + "step": 15640 + }, + { + "epoch": 0.5601375185775422, + "grad_norm": 1.2672502994537354, + "learning_rate": 8.543147687661673e-05, + "loss": 0.9438, + "step": 15641 + }, + { + "epoch": 0.5601733307071105, + "grad_norm": 1.3713151216506958, + "learning_rate": 8.542000171165928e-05, + "loss": 1.0925, + "step": 15642 + }, + { + "epoch": 0.5602091428366788, + "grad_norm": 1.5023694038391113, + "learning_rate": 8.540852674285691e-05, + "loss": 1.3326, + "step": 15643 + }, + { + "epoch": 0.560244954966247, + "grad_norm": 1.4865880012512207, + "learning_rate": 8.539705197036398e-05, + "loss": 0.8725, + "step": 15644 + }, + { + "epoch": 0.5602807670958153, + "grad_norm": 1.6692067384719849, + "learning_rate": 8.53855773943349e-05, + "loss": 1.2416, + "step": 15645 + }, + { + "epoch": 0.5603165792253836, + "grad_norm": 1.5065170526504517, + "learning_rate": 8.537410301492398e-05, + "loss": 1.1064, + "step": 15646 + }, + { + "epoch": 0.560352391354952, + "grad_norm": 1.4075982570648193, + "learning_rate": 8.53626288322857e-05, + "loss": 1.0428, + "step": 15647 + }, + { + "epoch": 0.5603882034845202, + "grad_norm": 1.3644088506698608, + "learning_rate": 8.535115484657434e-05, + "loss": 0.9863, + "step": 15648 + }, + { + "epoch": 0.5604240156140885, + "grad_norm": 1.4418967962265015, + "learning_rate": 8.533968105794428e-05, + "loss": 1.1197, + "step": 15649 + }, + { + "epoch": 0.5604598277436568, + "grad_norm": 1.307563066482544, + "learning_rate": 8.532820746654993e-05, + "loss": 1.2039, + "step": 15650 + }, + { + "epoch": 0.560495639873225, + "grad_norm": 1.8148720264434814, + "learning_rate": 8.53167340725456e-05, + "loss": 1.055, + "step": 15651 + }, + { + "epoch": 0.5605314520027933, + "grad_norm": 1.419418454170227, + "learning_rate": 8.530526087608569e-05, + "loss": 1.1487, + "step": 15652 + }, + { + "epoch": 0.5605672641323616, + "grad_norm": 1.3481318950653076, + "learning_rate": 8.529378787732451e-05, + "loss": 1.1513, + "step": 15653 + }, + { + "epoch": 0.56060307626193, + "grad_norm": 1.800718069076538, + "learning_rate": 8.528231507641648e-05, + "loss": 0.9909, + "step": 15654 + }, + { + "epoch": 0.5606388883914982, + "grad_norm": 1.5649234056472778, + "learning_rate": 8.527084247351595e-05, + "loss": 0.9839, + "step": 15655 + }, + { + "epoch": 0.5606747005210665, + "grad_norm": 1.5501009225845337, + "learning_rate": 8.525937006877714e-05, + "loss": 0.9691, + "step": 15656 + }, + { + "epoch": 0.5607105126506348, + "grad_norm": 1.513872742652893, + "learning_rate": 8.524789786235458e-05, + "loss": 1.0262, + "step": 15657 + }, + { + "epoch": 0.560746324780203, + "grad_norm": 1.4566904306411743, + "learning_rate": 8.523642585440245e-05, + "loss": 1.1065, + "step": 15658 + }, + { + "epoch": 0.5607821369097713, + "grad_norm": 1.577322244644165, + "learning_rate": 8.522495404507521e-05, + "loss": 1.267, + "step": 15659 + }, + { + "epoch": 0.5608179490393396, + "grad_norm": 1.4135419130325317, + "learning_rate": 8.521348243452714e-05, + "loss": 0.9182, + "step": 15660 + }, + { + "epoch": 0.560853761168908, + "grad_norm": 1.5910855531692505, + "learning_rate": 8.52020110229126e-05, + "loss": 1.1294, + "step": 15661 + }, + { + "epoch": 0.5608895732984762, + "grad_norm": 1.3877040147781372, + "learning_rate": 8.51905398103859e-05, + "loss": 0.9209, + "step": 15662 + }, + { + "epoch": 0.5609253854280445, + "grad_norm": 1.3915274143218994, + "learning_rate": 8.51790687971014e-05, + "loss": 1.0639, + "step": 15663 + }, + { + "epoch": 0.5609611975576128, + "grad_norm": 1.5202603340148926, + "learning_rate": 8.516759798321345e-05, + "loss": 1.1235, + "step": 15664 + }, + { + "epoch": 0.560997009687181, + "grad_norm": 1.448915719985962, + "learning_rate": 8.515612736887627e-05, + "loss": 1.0512, + "step": 15665 + }, + { + "epoch": 0.5610328218167493, + "grad_norm": 1.3208822011947632, + "learning_rate": 8.51446569542443e-05, + "loss": 0.8187, + "step": 15666 + }, + { + "epoch": 0.5610686339463176, + "grad_norm": 1.8526521921157837, + "learning_rate": 8.513318673947173e-05, + "loss": 1.3149, + "step": 15667 + }, + { + "epoch": 0.5611044460758859, + "grad_norm": 1.2186213731765747, + "learning_rate": 8.512171672471305e-05, + "loss": 1.0009, + "step": 15668 + }, + { + "epoch": 0.5611402582054542, + "grad_norm": 1.3198853731155396, + "learning_rate": 8.51102469101224e-05, + "loss": 0.9824, + "step": 15669 + }, + { + "epoch": 0.5611760703350225, + "grad_norm": 1.42388916015625, + "learning_rate": 8.509877729585423e-05, + "loss": 1.138, + "step": 15670 + }, + { + "epoch": 0.5612118824645907, + "grad_norm": 1.5199240446090698, + "learning_rate": 8.508730788206273e-05, + "loss": 0.9127, + "step": 15671 + }, + { + "epoch": 0.561247694594159, + "grad_norm": 1.501275658607483, + "learning_rate": 8.507583866890233e-05, + "loss": 1.2118, + "step": 15672 + }, + { + "epoch": 0.5612835067237273, + "grad_norm": 1.5535836219787598, + "learning_rate": 8.506436965652728e-05, + "loss": 0.9618, + "step": 15673 + }, + { + "epoch": 0.5613193188532956, + "grad_norm": 2.0719637870788574, + "learning_rate": 8.50529008450918e-05, + "loss": 1.1862, + "step": 15674 + }, + { + "epoch": 0.5613551309828639, + "grad_norm": 1.7503900527954102, + "learning_rate": 8.504143223475031e-05, + "loss": 1.2025, + "step": 15675 + }, + { + "epoch": 0.5613909431124322, + "grad_norm": 1.3759108781814575, + "learning_rate": 8.502996382565702e-05, + "loss": 1.2184, + "step": 15676 + }, + { + "epoch": 0.5614267552420005, + "grad_norm": 1.527438759803772, + "learning_rate": 8.501849561796631e-05, + "loss": 0.9931, + "step": 15677 + }, + { + "epoch": 0.5614625673715687, + "grad_norm": 1.631866455078125, + "learning_rate": 8.500702761183234e-05, + "loss": 1.0869, + "step": 15678 + }, + { + "epoch": 0.561498379501137, + "grad_norm": 1.642457127571106, + "learning_rate": 8.499555980740956e-05, + "loss": 1.1121, + "step": 15679 + }, + { + "epoch": 0.5615341916307053, + "grad_norm": 1.8582127094268799, + "learning_rate": 8.498409220485208e-05, + "loss": 1.2586, + "step": 15680 + }, + { + "epoch": 0.5615700037602736, + "grad_norm": 1.666581392288208, + "learning_rate": 8.497262480431435e-05, + "loss": 1.0565, + "step": 15681 + }, + { + "epoch": 0.5616058158898419, + "grad_norm": 1.4053330421447754, + "learning_rate": 8.496115760595054e-05, + "loss": 1.0922, + "step": 15682 + }, + { + "epoch": 0.5616416280194102, + "grad_norm": 1.5997240543365479, + "learning_rate": 8.494969060991493e-05, + "loss": 1.2201, + "step": 15683 + }, + { + "epoch": 0.5616774401489785, + "grad_norm": 1.2730181217193604, + "learning_rate": 8.493822381636185e-05, + "loss": 1.1077, + "step": 15684 + }, + { + "epoch": 0.5617132522785467, + "grad_norm": 1.4854378700256348, + "learning_rate": 8.49267572254455e-05, + "loss": 1.1328, + "step": 15685 + }, + { + "epoch": 0.561749064408115, + "grad_norm": 1.5116347074508667, + "learning_rate": 8.491529083732025e-05, + "loss": 1.0674, + "step": 15686 + }, + { + "epoch": 0.5617848765376833, + "grad_norm": 1.4645494222640991, + "learning_rate": 8.490382465214025e-05, + "loss": 1.1546, + "step": 15687 + }, + { + "epoch": 0.5618206886672515, + "grad_norm": 1.8048495054244995, + "learning_rate": 8.489235867005985e-05, + "loss": 1.1459, + "step": 15688 + }, + { + "epoch": 0.5618565007968199, + "grad_norm": 1.8542463779449463, + "learning_rate": 8.488089289123324e-05, + "loss": 1.0705, + "step": 15689 + }, + { + "epoch": 0.5618923129263882, + "grad_norm": 1.539218544960022, + "learning_rate": 8.486942731581478e-05, + "loss": 1.1831, + "step": 15690 + }, + { + "epoch": 0.5619281250559565, + "grad_norm": 1.4989427328109741, + "learning_rate": 8.485796194395862e-05, + "loss": 1.1307, + "step": 15691 + }, + { + "epoch": 0.5619639371855247, + "grad_norm": 1.678518533706665, + "learning_rate": 8.484649677581904e-05, + "loss": 1.0587, + "step": 15692 + }, + { + "epoch": 0.561999749315093, + "grad_norm": 1.9444313049316406, + "learning_rate": 8.483503181155031e-05, + "loss": 1.068, + "step": 15693 + }, + { + "epoch": 0.5620355614446613, + "grad_norm": 1.4432889223098755, + "learning_rate": 8.482356705130665e-05, + "loss": 1.0724, + "step": 15694 + }, + { + "epoch": 0.5620713735742295, + "grad_norm": 1.2694330215454102, + "learning_rate": 8.481210249524234e-05, + "loss": 1.1206, + "step": 15695 + }, + { + "epoch": 0.5621071857037979, + "grad_norm": 1.3971408605575562, + "learning_rate": 8.480063814351159e-05, + "loss": 1.0773, + "step": 15696 + }, + { + "epoch": 0.5621429978333662, + "grad_norm": 1.4539690017700195, + "learning_rate": 8.478917399626865e-05, + "loss": 0.9818, + "step": 15697 + }, + { + "epoch": 0.5621788099629345, + "grad_norm": 1.8231608867645264, + "learning_rate": 8.477771005366772e-05, + "loss": 0.9852, + "step": 15698 + }, + { + "epoch": 0.5622146220925027, + "grad_norm": 1.7048487663269043, + "learning_rate": 8.476624631586313e-05, + "loss": 1.1057, + "step": 15699 + }, + { + "epoch": 0.562250434222071, + "grad_norm": 1.8447545766830444, + "learning_rate": 8.475478278300902e-05, + "loss": 0.9663, + "step": 15700 + }, + { + "epoch": 0.5622862463516393, + "grad_norm": 1.2828006744384766, + "learning_rate": 8.474331945525963e-05, + "loss": 1.106, + "step": 15701 + }, + { + "epoch": 0.5623220584812075, + "grad_norm": 1.3550150394439697, + "learning_rate": 8.47318563327692e-05, + "loss": 1.0819, + "step": 15702 + }, + { + "epoch": 0.5623578706107759, + "grad_norm": 1.3338961601257324, + "learning_rate": 8.472039341569195e-05, + "loss": 1.2083, + "step": 15703 + }, + { + "epoch": 0.5623936827403442, + "grad_norm": 1.9187588691711426, + "learning_rate": 8.470893070418211e-05, + "loss": 1.2496, + "step": 15704 + }, + { + "epoch": 0.5624294948699124, + "grad_norm": 2.0648207664489746, + "learning_rate": 8.469746819839387e-05, + "loss": 1.1976, + "step": 15705 + }, + { + "epoch": 0.5624653069994807, + "grad_norm": 1.3448870182037354, + "learning_rate": 8.468600589848146e-05, + "loss": 0.7621, + "step": 15706 + }, + { + "epoch": 0.562501119129049, + "grad_norm": 1.3656319379806519, + "learning_rate": 8.467454380459907e-05, + "loss": 1.1083, + "step": 15707 + }, + { + "epoch": 0.5625369312586173, + "grad_norm": 1.5992274284362793, + "learning_rate": 8.466308191690096e-05, + "loss": 1.3938, + "step": 15708 + }, + { + "epoch": 0.5625727433881855, + "grad_norm": 1.47621750831604, + "learning_rate": 8.46516202355413e-05, + "loss": 0.9238, + "step": 15709 + }, + { + "epoch": 0.5626085555177539, + "grad_norm": 1.5478237867355347, + "learning_rate": 8.464015876067425e-05, + "loss": 1.1302, + "step": 15710 + }, + { + "epoch": 0.5626443676473222, + "grad_norm": 2.188347816467285, + "learning_rate": 8.462869749245408e-05, + "loss": 1.0874, + "step": 15711 + }, + { + "epoch": 0.5626801797768904, + "grad_norm": 1.5684709548950195, + "learning_rate": 8.461723643103494e-05, + "loss": 1.0691, + "step": 15712 + }, + { + "epoch": 0.5627159919064587, + "grad_norm": 1.8320538997650146, + "learning_rate": 8.460577557657107e-05, + "loss": 1.2156, + "step": 15713 + }, + { + "epoch": 0.562751804036027, + "grad_norm": 1.4401377439498901, + "learning_rate": 8.45943149292166e-05, + "loss": 0.9977, + "step": 15714 + }, + { + "epoch": 0.5627876161655953, + "grad_norm": 1.4209023714065552, + "learning_rate": 8.458285448912578e-05, + "loss": 1.0894, + "step": 15715 + }, + { + "epoch": 0.5628234282951635, + "grad_norm": 1.2564704418182373, + "learning_rate": 8.457139425645273e-05, + "loss": 1.0607, + "step": 15716 + }, + { + "epoch": 0.5628592404247319, + "grad_norm": 1.6421546936035156, + "learning_rate": 8.455993423135172e-05, + "loss": 1.0761, + "step": 15717 + }, + { + "epoch": 0.5628950525543002, + "grad_norm": 1.6090723276138306, + "learning_rate": 8.454847441397684e-05, + "loss": 1.1965, + "step": 15718 + }, + { + "epoch": 0.5629308646838684, + "grad_norm": 1.6271437406539917, + "learning_rate": 8.45370148044823e-05, + "loss": 1.115, + "step": 15719 + }, + { + "epoch": 0.5629666768134367, + "grad_norm": 1.9566138982772827, + "learning_rate": 8.452555540302231e-05, + "loss": 1.1256, + "step": 15720 + }, + { + "epoch": 0.563002488943005, + "grad_norm": 1.8809969425201416, + "learning_rate": 8.451409620975099e-05, + "loss": 1.1137, + "step": 15721 + }, + { + "epoch": 0.5630383010725732, + "grad_norm": 1.574520468711853, + "learning_rate": 8.450263722482255e-05, + "loss": 0.9661, + "step": 15722 + }, + { + "epoch": 0.5630741132021415, + "grad_norm": 2.3013312816619873, + "learning_rate": 8.44911784483911e-05, + "loss": 1.2958, + "step": 15723 + }, + { + "epoch": 0.5631099253317099, + "grad_norm": 1.5781933069229126, + "learning_rate": 8.447971988061088e-05, + "loss": 1.1903, + "step": 15724 + }, + { + "epoch": 0.5631457374612782, + "grad_norm": 1.603025197982788, + "learning_rate": 8.446826152163598e-05, + "loss": 1.1952, + "step": 15725 + }, + { + "epoch": 0.5631815495908464, + "grad_norm": 1.725490927696228, + "learning_rate": 8.44568033716206e-05, + "loss": 1.011, + "step": 15726 + }, + { + "epoch": 0.5632173617204147, + "grad_norm": 1.4656883478164673, + "learning_rate": 8.444534543071891e-05, + "loss": 1.1406, + "step": 15727 + }, + { + "epoch": 0.563253173849983, + "grad_norm": 1.3981794118881226, + "learning_rate": 8.443388769908498e-05, + "loss": 1.1884, + "step": 15728 + }, + { + "epoch": 0.5632889859795512, + "grad_norm": 1.791640043258667, + "learning_rate": 8.442243017687304e-05, + "loss": 1.0201, + "step": 15729 + }, + { + "epoch": 0.5633247981091195, + "grad_norm": 1.5922662019729614, + "learning_rate": 8.44109728642372e-05, + "loss": 1.1605, + "step": 15730 + }, + { + "epoch": 0.5633606102386879, + "grad_norm": 2.3586947917938232, + "learning_rate": 8.439951576133162e-05, + "loss": 1.3791, + "step": 15731 + }, + { + "epoch": 0.5633964223682562, + "grad_norm": 1.4563663005828857, + "learning_rate": 8.438805886831042e-05, + "loss": 1.1472, + "step": 15732 + }, + { + "epoch": 0.5634322344978244, + "grad_norm": 1.5225389003753662, + "learning_rate": 8.437660218532777e-05, + "loss": 1.0403, + "step": 15733 + }, + { + "epoch": 0.5634680466273927, + "grad_norm": 1.3319928646087646, + "learning_rate": 8.436514571253775e-05, + "loss": 1.1726, + "step": 15734 + }, + { + "epoch": 0.563503858756961, + "grad_norm": 1.412058711051941, + "learning_rate": 8.435368945009456e-05, + "loss": 1.1124, + "step": 15735 + }, + { + "epoch": 0.5635396708865292, + "grad_norm": 2.377305507659912, + "learning_rate": 8.434223339815229e-05, + "loss": 1.1684, + "step": 15736 + }, + { + "epoch": 0.5635754830160975, + "grad_norm": 1.7811851501464844, + "learning_rate": 8.433077755686506e-05, + "loss": 1.0736, + "step": 15737 + }, + { + "epoch": 0.5636112951456659, + "grad_norm": 1.5444527864456177, + "learning_rate": 8.431932192638703e-05, + "loss": 1.1548, + "step": 15738 + }, + { + "epoch": 0.5636471072752341, + "grad_norm": 1.4307150840759277, + "learning_rate": 8.430786650687227e-05, + "loss": 1.2004, + "step": 15739 + }, + { + "epoch": 0.5636829194048024, + "grad_norm": 1.4044791460037231, + "learning_rate": 8.429641129847494e-05, + "loss": 0.8732, + "step": 15740 + }, + { + "epoch": 0.5637187315343707, + "grad_norm": 1.905816674232483, + "learning_rate": 8.428495630134912e-05, + "loss": 1.136, + "step": 15741 + }, + { + "epoch": 0.563754543663939, + "grad_norm": 1.5171940326690674, + "learning_rate": 8.427350151564897e-05, + "loss": 1.1404, + "step": 15742 + }, + { + "epoch": 0.5637903557935072, + "grad_norm": 1.536294937133789, + "learning_rate": 8.426204694152855e-05, + "loss": 1.0189, + "step": 15743 + }, + { + "epoch": 0.5638261679230755, + "grad_norm": 1.5951581001281738, + "learning_rate": 8.425059257914201e-05, + "loss": 1.0585, + "step": 15744 + }, + { + "epoch": 0.5638619800526439, + "grad_norm": 1.5285799503326416, + "learning_rate": 8.423913842864342e-05, + "loss": 1.0667, + "step": 15745 + }, + { + "epoch": 0.5638977921822121, + "grad_norm": 1.4808416366577148, + "learning_rate": 8.422768449018688e-05, + "loss": 0.8935, + "step": 15746 + }, + { + "epoch": 0.5639336043117804, + "grad_norm": 1.510002851486206, + "learning_rate": 8.421623076392652e-05, + "loss": 1.1632, + "step": 15747 + }, + { + "epoch": 0.5639694164413487, + "grad_norm": 1.5394147634506226, + "learning_rate": 8.420477725001639e-05, + "loss": 1.2916, + "step": 15748 + }, + { + "epoch": 0.564005228570917, + "grad_norm": 1.7623001337051392, + "learning_rate": 8.419332394861064e-05, + "loss": 1.0204, + "step": 15749 + }, + { + "epoch": 0.5640410407004852, + "grad_norm": 1.3296611309051514, + "learning_rate": 8.418187085986329e-05, + "loss": 1.1004, + "step": 15750 + }, + { + "epoch": 0.5640768528300535, + "grad_norm": 1.4401593208312988, + "learning_rate": 8.41704179839285e-05, + "loss": 1.1804, + "step": 15751 + }, + { + "epoch": 0.5641126649596219, + "grad_norm": 1.6028828620910645, + "learning_rate": 8.415896532096034e-05, + "loss": 1.0526, + "step": 15752 + }, + { + "epoch": 0.5641484770891901, + "grad_norm": 1.3017866611480713, + "learning_rate": 8.41475128711128e-05, + "loss": 0.9162, + "step": 15753 + }, + { + "epoch": 0.5641842892187584, + "grad_norm": 1.5844711065292358, + "learning_rate": 8.413606063454008e-05, + "loss": 1.0849, + "step": 15754 + }, + { + "epoch": 0.5642201013483267, + "grad_norm": 1.601043701171875, + "learning_rate": 8.412460861139615e-05, + "loss": 1.2483, + "step": 15755 + }, + { + "epoch": 0.5642559134778949, + "grad_norm": 1.7951710224151611, + "learning_rate": 8.411315680183517e-05, + "loss": 1.034, + "step": 15756 + }, + { + "epoch": 0.5642917256074632, + "grad_norm": 1.2505450248718262, + "learning_rate": 8.410170520601115e-05, + "loss": 0.9717, + "step": 15757 + }, + { + "epoch": 0.5643275377370315, + "grad_norm": 1.724668264389038, + "learning_rate": 8.40902538240782e-05, + "loss": 1.0451, + "step": 15758 + }, + { + "epoch": 0.5643633498665999, + "grad_norm": 1.3676486015319824, + "learning_rate": 8.407880265619035e-05, + "loss": 1.2184, + "step": 15759 + }, + { + "epoch": 0.5643991619961681, + "grad_norm": 1.5405967235565186, + "learning_rate": 8.406735170250168e-05, + "loss": 0.9816, + "step": 15760 + }, + { + "epoch": 0.5644349741257364, + "grad_norm": 1.7886258363723755, + "learning_rate": 8.405590096316626e-05, + "loss": 0.9731, + "step": 15761 + }, + { + "epoch": 0.5644707862553047, + "grad_norm": 1.429614782333374, + "learning_rate": 8.404445043833809e-05, + "loss": 1.1669, + "step": 15762 + }, + { + "epoch": 0.5645065983848729, + "grad_norm": 1.4786744117736816, + "learning_rate": 8.40330001281713e-05, + "loss": 1.2079, + "step": 15763 + }, + { + "epoch": 0.5645424105144412, + "grad_norm": 1.3619691133499146, + "learning_rate": 8.402155003281984e-05, + "loss": 1.1905, + "step": 15764 + }, + { + "epoch": 0.5645782226440095, + "grad_norm": 1.600500464439392, + "learning_rate": 8.401010015243787e-05, + "loss": 1.2242, + "step": 15765 + }, + { + "epoch": 0.5646140347735779, + "grad_norm": 1.231204867362976, + "learning_rate": 8.399865048717932e-05, + "loss": 1.1058, + "step": 15766 + }, + { + "epoch": 0.5646498469031461, + "grad_norm": 1.6784731149673462, + "learning_rate": 8.398720103719836e-05, + "loss": 1.3039, + "step": 15767 + }, + { + "epoch": 0.5646856590327144, + "grad_norm": 1.3128911256790161, + "learning_rate": 8.397575180264887e-05, + "loss": 0.9321, + "step": 15768 + }, + { + "epoch": 0.5647214711622827, + "grad_norm": 1.2340316772460938, + "learning_rate": 8.396430278368503e-05, + "loss": 1.0113, + "step": 15769 + }, + { + "epoch": 0.5647572832918509, + "grad_norm": 1.9823815822601318, + "learning_rate": 8.395285398046084e-05, + "loss": 1.173, + "step": 15770 + }, + { + "epoch": 0.5647930954214192, + "grad_norm": 1.606147050857544, + "learning_rate": 8.394140539313021e-05, + "loss": 1.1185, + "step": 15771 + }, + { + "epoch": 0.5648289075509875, + "grad_norm": 2.0649523735046387, + "learning_rate": 8.392995702184734e-05, + "loss": 1.2097, + "step": 15772 + }, + { + "epoch": 0.5648647196805558, + "grad_norm": 1.650418758392334, + "learning_rate": 8.391850886676609e-05, + "loss": 0.8171, + "step": 15773 + }, + { + "epoch": 0.5649005318101241, + "grad_norm": 1.4239253997802734, + "learning_rate": 8.390706092804064e-05, + "loss": 1.2398, + "step": 15774 + }, + { + "epoch": 0.5649363439396924, + "grad_norm": 1.706697940826416, + "learning_rate": 8.389561320582486e-05, + "loss": 1.1042, + "step": 15775 + }, + { + "epoch": 0.5649721560692607, + "grad_norm": 1.3317902088165283, + "learning_rate": 8.388416570027289e-05, + "loss": 1.1503, + "step": 15776 + }, + { + "epoch": 0.5650079681988289, + "grad_norm": 1.4939836263656616, + "learning_rate": 8.38727184115386e-05, + "loss": 0.9153, + "step": 15777 + }, + { + "epoch": 0.5650437803283972, + "grad_norm": 1.338563084602356, + "learning_rate": 8.386127133977617e-05, + "loss": 1.1273, + "step": 15778 + }, + { + "epoch": 0.5650795924579655, + "grad_norm": 1.2285479307174683, + "learning_rate": 8.384982448513949e-05, + "loss": 1.0447, + "step": 15779 + }, + { + "epoch": 0.5651154045875338, + "grad_norm": 1.2557531595230103, + "learning_rate": 8.383837784778257e-05, + "loss": 0.9018, + "step": 15780 + }, + { + "epoch": 0.5651512167171021, + "grad_norm": 1.5041451454162598, + "learning_rate": 8.382693142785945e-05, + "loss": 1.1918, + "step": 15781 + }, + { + "epoch": 0.5651870288466704, + "grad_norm": 1.5363048315048218, + "learning_rate": 8.381548522552406e-05, + "loss": 1.1306, + "step": 15782 + }, + { + "epoch": 0.5652228409762386, + "grad_norm": 1.7626014947891235, + "learning_rate": 8.38040392409305e-05, + "loss": 1.1605, + "step": 15783 + }, + { + "epoch": 0.5652586531058069, + "grad_norm": 1.5002918243408203, + "learning_rate": 8.379259347423265e-05, + "loss": 1.0723, + "step": 15784 + }, + { + "epoch": 0.5652944652353752, + "grad_norm": 1.4729465246200562, + "learning_rate": 8.37811479255846e-05, + "loss": 1.1855, + "step": 15785 + }, + { + "epoch": 0.5653302773649435, + "grad_norm": 1.4705431461334229, + "learning_rate": 8.376970259514023e-05, + "loss": 0.8802, + "step": 15786 + }, + { + "epoch": 0.5653660894945118, + "grad_norm": 1.5235322713851929, + "learning_rate": 8.375825748305364e-05, + "loss": 1.2806, + "step": 15787 + }, + { + "epoch": 0.5654019016240801, + "grad_norm": 1.62598717212677, + "learning_rate": 8.37468125894787e-05, + "loss": 1.1393, + "step": 15788 + }, + { + "epoch": 0.5654377137536484, + "grad_norm": 1.5852702856063843, + "learning_rate": 8.373536791456944e-05, + "loss": 1.3047, + "step": 15789 + }, + { + "epoch": 0.5654735258832166, + "grad_norm": 1.4584012031555176, + "learning_rate": 8.372392345847983e-05, + "loss": 1.1797, + "step": 15790 + }, + { + "epoch": 0.5655093380127849, + "grad_norm": 2.0458812713623047, + "learning_rate": 8.371247922136383e-05, + "loss": 1.0476, + "step": 15791 + }, + { + "epoch": 0.5655451501423532, + "grad_norm": 1.5523592233657837, + "learning_rate": 8.370103520337542e-05, + "loss": 1.0462, + "step": 15792 + }, + { + "epoch": 0.5655809622719215, + "grad_norm": 1.3300533294677734, + "learning_rate": 8.368959140466853e-05, + "loss": 1.0659, + "step": 15793 + }, + { + "epoch": 0.5656167744014898, + "grad_norm": 1.7694642543792725, + "learning_rate": 8.367814782539718e-05, + "loss": 1.4006, + "step": 15794 + }, + { + "epoch": 0.5656525865310581, + "grad_norm": 2.205274820327759, + "learning_rate": 8.366670446571525e-05, + "loss": 0.9564, + "step": 15795 + }, + { + "epoch": 0.5656883986606264, + "grad_norm": 1.6204209327697754, + "learning_rate": 8.365526132577681e-05, + "loss": 1.0873, + "step": 15796 + }, + { + "epoch": 0.5657242107901946, + "grad_norm": 1.8515616655349731, + "learning_rate": 8.364381840573573e-05, + "loss": 1.1291, + "step": 15797 + }, + { + "epoch": 0.5657600229197629, + "grad_norm": 1.3840726613998413, + "learning_rate": 8.363237570574595e-05, + "loss": 0.8818, + "step": 15798 + }, + { + "epoch": 0.5657958350493312, + "grad_norm": 1.6816116571426392, + "learning_rate": 8.362093322596145e-05, + "loss": 1.1236, + "step": 15799 + }, + { + "epoch": 0.5658316471788994, + "grad_norm": 1.5603312253952026, + "learning_rate": 8.360949096653616e-05, + "loss": 1.2695, + "step": 15800 + }, + { + "epoch": 0.5658674593084678, + "grad_norm": 1.9459893703460693, + "learning_rate": 8.359804892762405e-05, + "loss": 1.2388, + "step": 15801 + }, + { + "epoch": 0.5659032714380361, + "grad_norm": 2.0206782817840576, + "learning_rate": 8.3586607109379e-05, + "loss": 1.0507, + "step": 15802 + }, + { + "epoch": 0.5659390835676044, + "grad_norm": 1.599557876586914, + "learning_rate": 8.357516551195501e-05, + "loss": 1.0968, + "step": 15803 + }, + { + "epoch": 0.5659748956971726, + "grad_norm": 1.4412713050842285, + "learning_rate": 8.356372413550597e-05, + "loss": 1.0526, + "step": 15804 + }, + { + "epoch": 0.5660107078267409, + "grad_norm": 1.7959240674972534, + "learning_rate": 8.355228298018582e-05, + "loss": 1.2922, + "step": 15805 + }, + { + "epoch": 0.5660465199563092, + "grad_norm": 1.6639463901519775, + "learning_rate": 8.354084204614851e-05, + "loss": 0.9438, + "step": 15806 + }, + { + "epoch": 0.5660823320858774, + "grad_norm": 1.4269336462020874, + "learning_rate": 8.35294013335479e-05, + "loss": 1.0787, + "step": 15807 + }, + { + "epoch": 0.5661181442154458, + "grad_norm": 1.4042185544967651, + "learning_rate": 8.351796084253797e-05, + "loss": 1.12, + "step": 15808 + }, + { + "epoch": 0.5661539563450141, + "grad_norm": 1.6841152906417847, + "learning_rate": 8.350652057327261e-05, + "loss": 1.0514, + "step": 15809 + }, + { + "epoch": 0.5661897684745824, + "grad_norm": 1.5914981365203857, + "learning_rate": 8.349508052590574e-05, + "loss": 1.1292, + "step": 15810 + }, + { + "epoch": 0.5662255806041506, + "grad_norm": 1.4403440952301025, + "learning_rate": 8.348364070059127e-05, + "loss": 0.8968, + "step": 15811 + }, + { + "epoch": 0.5662613927337189, + "grad_norm": 1.5942341089248657, + "learning_rate": 8.347220109748312e-05, + "loss": 1.1694, + "step": 15812 + }, + { + "epoch": 0.5662972048632872, + "grad_norm": 2.6784396171569824, + "learning_rate": 8.346076171673518e-05, + "loss": 1.1942, + "step": 15813 + }, + { + "epoch": 0.5663330169928554, + "grad_norm": 1.3279110193252563, + "learning_rate": 8.344932255850136e-05, + "loss": 1.0099, + "step": 15814 + }, + { + "epoch": 0.5663688291224238, + "grad_norm": 1.2857534885406494, + "learning_rate": 8.343788362293556e-05, + "loss": 1.0944, + "step": 15815 + }, + { + "epoch": 0.5664046412519921, + "grad_norm": 1.4919193983078003, + "learning_rate": 8.342644491019165e-05, + "loss": 1.0152, + "step": 15816 + }, + { + "epoch": 0.5664404533815603, + "grad_norm": 1.5392708778381348, + "learning_rate": 8.341500642042359e-05, + "loss": 0.9917, + "step": 15817 + }, + { + "epoch": 0.5664762655111286, + "grad_norm": 1.6317784786224365, + "learning_rate": 8.340356815378517e-05, + "loss": 1.2672, + "step": 15818 + }, + { + "epoch": 0.5665120776406969, + "grad_norm": 1.5023505687713623, + "learning_rate": 8.339213011043038e-05, + "loss": 0.9652, + "step": 15819 + }, + { + "epoch": 0.5665478897702652, + "grad_norm": 1.3672000169754028, + "learning_rate": 8.338069229051302e-05, + "loss": 1.0652, + "step": 15820 + }, + { + "epoch": 0.5665837018998334, + "grad_norm": 1.7916226387023926, + "learning_rate": 8.336925469418704e-05, + "loss": 1.2273, + "step": 15821 + }, + { + "epoch": 0.5666195140294018, + "grad_norm": 1.5259076356887817, + "learning_rate": 8.335781732160625e-05, + "loss": 1.1351, + "step": 15822 + }, + { + "epoch": 0.5666553261589701, + "grad_norm": 1.5006332397460938, + "learning_rate": 8.334638017292459e-05, + "loss": 1.2583, + "step": 15823 + }, + { + "epoch": 0.5666911382885383, + "grad_norm": 1.5950871706008911, + "learning_rate": 8.33349432482959e-05, + "loss": 1.0892, + "step": 15824 + }, + { + "epoch": 0.5667269504181066, + "grad_norm": 1.4289298057556152, + "learning_rate": 8.332350654787404e-05, + "loss": 1.2035, + "step": 15825 + }, + { + "epoch": 0.5667627625476749, + "grad_norm": 1.2810603380203247, + "learning_rate": 8.33120700718129e-05, + "loss": 0.9795, + "step": 15826 + }, + { + "epoch": 0.5667985746772431, + "grad_norm": 1.467345952987671, + "learning_rate": 8.330063382026631e-05, + "loss": 1.2224, + "step": 15827 + }, + { + "epoch": 0.5668343868068114, + "grad_norm": 1.7479075193405151, + "learning_rate": 8.328919779338819e-05, + "loss": 0.8257, + "step": 15828 + }, + { + "epoch": 0.5668701989363798, + "grad_norm": 1.9676378965377808, + "learning_rate": 8.327776199133232e-05, + "loss": 0.9549, + "step": 15829 + }, + { + "epoch": 0.5669060110659481, + "grad_norm": 1.3279517889022827, + "learning_rate": 8.326632641425261e-05, + "loss": 1.2187, + "step": 15830 + }, + { + "epoch": 0.5669418231955163, + "grad_norm": 1.9492452144622803, + "learning_rate": 8.325489106230288e-05, + "loss": 0.9851, + "step": 15831 + }, + { + "epoch": 0.5669776353250846, + "grad_norm": 1.4149287939071655, + "learning_rate": 8.324345593563701e-05, + "loss": 0.988, + "step": 15832 + }, + { + "epoch": 0.5670134474546529, + "grad_norm": 2.2940211296081543, + "learning_rate": 8.323202103440884e-05, + "loss": 1.1111, + "step": 15833 + }, + { + "epoch": 0.5670492595842211, + "grad_norm": 1.2716920375823975, + "learning_rate": 8.322058635877216e-05, + "loss": 0.9885, + "step": 15834 + }, + { + "epoch": 0.5670850717137894, + "grad_norm": 1.5772655010223389, + "learning_rate": 8.320915190888087e-05, + "loss": 1.2522, + "step": 15835 + }, + { + "epoch": 0.5671208838433578, + "grad_norm": 1.3707797527313232, + "learning_rate": 8.319771768488877e-05, + "loss": 1.1645, + "step": 15836 + }, + { + "epoch": 0.5671566959729261, + "grad_norm": 1.8849711418151855, + "learning_rate": 8.318628368694972e-05, + "loss": 1.0464, + "step": 15837 + }, + { + "epoch": 0.5671925081024943, + "grad_norm": 1.432862639427185, + "learning_rate": 8.317484991521751e-05, + "loss": 1.0753, + "step": 15838 + }, + { + "epoch": 0.5672283202320626, + "grad_norm": 1.3288531303405762, + "learning_rate": 8.316341636984602e-05, + "loss": 1.1546, + "step": 15839 + }, + { + "epoch": 0.5672641323616309, + "grad_norm": 1.7307305335998535, + "learning_rate": 8.315198305098902e-05, + "loss": 0.9711, + "step": 15840 + }, + { + "epoch": 0.5672999444911991, + "grad_norm": 1.6652055978775024, + "learning_rate": 8.314054995880036e-05, + "loss": 1.0629, + "step": 15841 + }, + { + "epoch": 0.5673357566207674, + "grad_norm": 1.2697495222091675, + "learning_rate": 8.312911709343388e-05, + "loss": 1.062, + "step": 15842 + }, + { + "epoch": 0.5673715687503358, + "grad_norm": 1.6329575777053833, + "learning_rate": 8.311768445504333e-05, + "loss": 1.1718, + "step": 15843 + }, + { + "epoch": 0.567407380879904, + "grad_norm": 1.3221758604049683, + "learning_rate": 8.31062520437826e-05, + "loss": 0.9532, + "step": 15844 + }, + { + "epoch": 0.5674431930094723, + "grad_norm": 1.8986481428146362, + "learning_rate": 8.309481985980541e-05, + "loss": 1.0066, + "step": 15845 + }, + { + "epoch": 0.5674790051390406, + "grad_norm": 1.9641368389129639, + "learning_rate": 8.308338790326565e-05, + "loss": 1.3131, + "step": 15846 + }, + { + "epoch": 0.5675148172686089, + "grad_norm": 1.8568509817123413, + "learning_rate": 8.307195617431707e-05, + "loss": 1.1794, + "step": 15847 + }, + { + "epoch": 0.5675506293981771, + "grad_norm": 1.7522528171539307, + "learning_rate": 8.306052467311349e-05, + "loss": 1.1719, + "step": 15848 + }, + { + "epoch": 0.5675864415277454, + "grad_norm": 1.986798882484436, + "learning_rate": 8.304909339980873e-05, + "loss": 1.2118, + "step": 15849 + }, + { + "epoch": 0.5676222536573138, + "grad_norm": 1.4623939990997314, + "learning_rate": 8.303766235455648e-05, + "loss": 1.0158, + "step": 15850 + }, + { + "epoch": 0.567658065786882, + "grad_norm": 1.3995589017868042, + "learning_rate": 8.302623153751068e-05, + "loss": 1.0908, + "step": 15851 + }, + { + "epoch": 0.5676938779164503, + "grad_norm": 1.3701303005218506, + "learning_rate": 8.301480094882497e-05, + "loss": 0.9568, + "step": 15852 + }, + { + "epoch": 0.5677296900460186, + "grad_norm": 1.5172587633132935, + "learning_rate": 8.300337058865323e-05, + "loss": 1.1246, + "step": 15853 + }, + { + "epoch": 0.5677655021755869, + "grad_norm": 1.4224636554718018, + "learning_rate": 8.299194045714921e-05, + "loss": 1.1062, + "step": 15854 + }, + { + "epoch": 0.5678013143051551, + "grad_norm": 1.4704225063323975, + "learning_rate": 8.298051055446673e-05, + "loss": 1.102, + "step": 15855 + }, + { + "epoch": 0.5678371264347234, + "grad_norm": 2.2281863689422607, + "learning_rate": 8.296908088075949e-05, + "loss": 1.4172, + "step": 15856 + }, + { + "epoch": 0.5678729385642918, + "grad_norm": 1.9898067712783813, + "learning_rate": 8.295765143618131e-05, + "loss": 1.2577, + "step": 15857 + }, + { + "epoch": 0.56790875069386, + "grad_norm": 1.8884739875793457, + "learning_rate": 8.294622222088598e-05, + "loss": 1.197, + "step": 15858 + }, + { + "epoch": 0.5679445628234283, + "grad_norm": 1.7192503213882446, + "learning_rate": 8.293479323502716e-05, + "loss": 1.0267, + "step": 15859 + }, + { + "epoch": 0.5679803749529966, + "grad_norm": 1.762299656867981, + "learning_rate": 8.292336447875876e-05, + "loss": 1.273, + "step": 15860 + }, + { + "epoch": 0.5680161870825648, + "grad_norm": 1.9354933500289917, + "learning_rate": 8.291193595223438e-05, + "loss": 1.1391, + "step": 15861 + }, + { + "epoch": 0.5680519992121331, + "grad_norm": 1.2253953218460083, + "learning_rate": 8.290050765560795e-05, + "loss": 1.0774, + "step": 15862 + }, + { + "epoch": 0.5680878113417014, + "grad_norm": 1.2484666109085083, + "learning_rate": 8.288907958903305e-05, + "loss": 1.3285, + "step": 15863 + }, + { + "epoch": 0.5681236234712698, + "grad_norm": 1.9431604146957397, + "learning_rate": 8.287765175266358e-05, + "loss": 1.0019, + "step": 15864 + }, + { + "epoch": 0.568159435600838, + "grad_norm": 2.379206895828247, + "learning_rate": 8.286622414665317e-05, + "loss": 1.6191, + "step": 15865 + }, + { + "epoch": 0.5681952477304063, + "grad_norm": 1.5179100036621094, + "learning_rate": 8.285479677115563e-05, + "loss": 1.0766, + "step": 15866 + }, + { + "epoch": 0.5682310598599746, + "grad_norm": 1.258336067199707, + "learning_rate": 8.284336962632473e-05, + "loss": 1.0967, + "step": 15867 + }, + { + "epoch": 0.5682668719895428, + "grad_norm": 1.1738905906677246, + "learning_rate": 8.283194271231408e-05, + "loss": 1.1639, + "step": 15868 + }, + { + "epoch": 0.5683026841191111, + "grad_norm": 1.6872440576553345, + "learning_rate": 8.282051602927757e-05, + "loss": 1.208, + "step": 15869 + }, + { + "epoch": 0.5683384962486794, + "grad_norm": 1.559921383857727, + "learning_rate": 8.28090895773688e-05, + "loss": 0.951, + "step": 15870 + }, + { + "epoch": 0.5683743083782478, + "grad_norm": 2.023860216140747, + "learning_rate": 8.27976633567416e-05, + "loss": 0.9635, + "step": 15871 + }, + { + "epoch": 0.568410120507816, + "grad_norm": 1.3143991231918335, + "learning_rate": 8.27862373675496e-05, + "loss": 1.0473, + "step": 15872 + }, + { + "epoch": 0.5684459326373843, + "grad_norm": 1.6772751808166504, + "learning_rate": 8.277481160994663e-05, + "loss": 0.9715, + "step": 15873 + }, + { + "epoch": 0.5684817447669526, + "grad_norm": 1.2203961610794067, + "learning_rate": 8.276338608408627e-05, + "loss": 0.9357, + "step": 15874 + }, + { + "epoch": 0.5685175568965208, + "grad_norm": 1.2517304420471191, + "learning_rate": 8.27519607901224e-05, + "loss": 0.9359, + "step": 15875 + }, + { + "epoch": 0.5685533690260891, + "grad_norm": 1.6971246004104614, + "learning_rate": 8.274053572820862e-05, + "loss": 0.907, + "step": 15876 + }, + { + "epoch": 0.5685891811556574, + "grad_norm": 1.5751737356185913, + "learning_rate": 8.272911089849866e-05, + "loss": 1.181, + "step": 15877 + }, + { + "epoch": 0.5686249932852258, + "grad_norm": 1.4002463817596436, + "learning_rate": 8.271768630114624e-05, + "loss": 1.001, + "step": 15878 + }, + { + "epoch": 0.568660805414794, + "grad_norm": 1.6424846649169922, + "learning_rate": 8.270626193630503e-05, + "loss": 1.1782, + "step": 15879 + }, + { + "epoch": 0.5686966175443623, + "grad_norm": 1.5881786346435547, + "learning_rate": 8.269483780412883e-05, + "loss": 1.0137, + "step": 15880 + }, + { + "epoch": 0.5687324296739306, + "grad_norm": 1.6730905771255493, + "learning_rate": 8.268341390477118e-05, + "loss": 1.2903, + "step": 15881 + }, + { + "epoch": 0.5687682418034988, + "grad_norm": 1.3139526844024658, + "learning_rate": 8.267199023838593e-05, + "loss": 1.2557, + "step": 15882 + }, + { + "epoch": 0.5688040539330671, + "grad_norm": 1.7175328731536865, + "learning_rate": 8.266056680512664e-05, + "loss": 1.1116, + "step": 15883 + }, + { + "epoch": 0.5688398660626354, + "grad_norm": 1.7803292274475098, + "learning_rate": 8.26491436051471e-05, + "loss": 1.181, + "step": 15884 + }, + { + "epoch": 0.5688756781922037, + "grad_norm": 1.6713216304779053, + "learning_rate": 8.263772063860096e-05, + "loss": 1.0943, + "step": 15885 + }, + { + "epoch": 0.568911490321772, + "grad_norm": 1.7399286031723022, + "learning_rate": 8.262629790564186e-05, + "loss": 1.1442, + "step": 15886 + }, + { + "epoch": 0.5689473024513403, + "grad_norm": 1.4291775226593018, + "learning_rate": 8.261487540642353e-05, + "loss": 1.2073, + "step": 15887 + }, + { + "epoch": 0.5689831145809086, + "grad_norm": 1.3795007467269897, + "learning_rate": 8.26034531410996e-05, + "loss": 1.1063, + "step": 15888 + }, + { + "epoch": 0.5690189267104768, + "grad_norm": 1.266228437423706, + "learning_rate": 8.259203110982381e-05, + "loss": 0.8919, + "step": 15889 + }, + { + "epoch": 0.5690547388400451, + "grad_norm": 1.5873726606369019, + "learning_rate": 8.258060931274976e-05, + "loss": 1.0166, + "step": 15890 + }, + { + "epoch": 0.5690905509696134, + "grad_norm": 2.033592700958252, + "learning_rate": 8.256918775003115e-05, + "loss": 1.0784, + "step": 15891 + }, + { + "epoch": 0.5691263630991817, + "grad_norm": 1.2902277708053589, + "learning_rate": 8.255776642182159e-05, + "loss": 1.0151, + "step": 15892 + }, + { + "epoch": 0.56916217522875, + "grad_norm": 1.3903424739837646, + "learning_rate": 8.254634532827487e-05, + "loss": 1.2679, + "step": 15893 + }, + { + "epoch": 0.5691979873583183, + "grad_norm": 1.4974465370178223, + "learning_rate": 8.253492446954452e-05, + "loss": 1.2779, + "step": 15894 + }, + { + "epoch": 0.5692337994878865, + "grad_norm": 2.8247134685516357, + "learning_rate": 8.252350384578421e-05, + "loss": 1.1567, + "step": 15895 + }, + { + "epoch": 0.5692696116174548, + "grad_norm": 1.8844497203826904, + "learning_rate": 8.251208345714764e-05, + "loss": 1.0933, + "step": 15896 + }, + { + "epoch": 0.5693054237470231, + "grad_norm": 1.3027455806732178, + "learning_rate": 8.25006633037884e-05, + "loss": 1.171, + "step": 15897 + }, + { + "epoch": 0.5693412358765914, + "grad_norm": 1.342878818511963, + "learning_rate": 8.24892433858602e-05, + "loss": 1.1519, + "step": 15898 + }, + { + "epoch": 0.5693770480061597, + "grad_norm": 1.6434634923934937, + "learning_rate": 8.247782370351663e-05, + "loss": 1.0809, + "step": 15899 + }, + { + "epoch": 0.569412860135728, + "grad_norm": 1.653043508529663, + "learning_rate": 8.246640425691133e-05, + "loss": 1.1038, + "step": 15900 + }, + { + "epoch": 0.5694486722652963, + "grad_norm": 1.4469789266586304, + "learning_rate": 8.245498504619794e-05, + "loss": 0.8757, + "step": 15901 + }, + { + "epoch": 0.5694844843948645, + "grad_norm": 1.5579166412353516, + "learning_rate": 8.244356607153011e-05, + "loss": 1.1693, + "step": 15902 + }, + { + "epoch": 0.5695202965244328, + "grad_norm": 1.2961537837982178, + "learning_rate": 8.243214733306145e-05, + "loss": 1.0792, + "step": 15903 + }, + { + "epoch": 0.5695561086540011, + "grad_norm": 1.791042685508728, + "learning_rate": 8.242072883094559e-05, + "loss": 1.1564, + "step": 15904 + }, + { + "epoch": 0.5695919207835693, + "grad_norm": 1.7629203796386719, + "learning_rate": 8.240931056533615e-05, + "loss": 0.9911, + "step": 15905 + }, + { + "epoch": 0.5696277329131377, + "grad_norm": 1.2149169445037842, + "learning_rate": 8.239789253638672e-05, + "loss": 0.903, + "step": 15906 + }, + { + "epoch": 0.569663545042706, + "grad_norm": 1.4479845762252808, + "learning_rate": 8.238647474425097e-05, + "loss": 1.1763, + "step": 15907 + }, + { + "epoch": 0.5696993571722743, + "grad_norm": 1.458418369293213, + "learning_rate": 8.237505718908246e-05, + "loss": 1.187, + "step": 15908 + }, + { + "epoch": 0.5697351693018425, + "grad_norm": 1.6113213300704956, + "learning_rate": 8.236363987103483e-05, + "loss": 1.0656, + "step": 15909 + }, + { + "epoch": 0.5697709814314108, + "grad_norm": 1.5695337057113647, + "learning_rate": 8.235222279026168e-05, + "loss": 0.9069, + "step": 15910 + }, + { + "epoch": 0.5698067935609791, + "grad_norm": 1.5443652868270874, + "learning_rate": 8.234080594691663e-05, + "loss": 1.1606, + "step": 15911 + }, + { + "epoch": 0.5698426056905473, + "grad_norm": 1.9313530921936035, + "learning_rate": 8.232938934115323e-05, + "loss": 1.1541, + "step": 15912 + }, + { + "epoch": 0.5698784178201157, + "grad_norm": 1.402065634727478, + "learning_rate": 8.231797297312509e-05, + "loss": 1.2358, + "step": 15913 + }, + { + "epoch": 0.569914229949684, + "grad_norm": 1.2920514345169067, + "learning_rate": 8.230655684298585e-05, + "loss": 1.0749, + "step": 15914 + }, + { + "epoch": 0.5699500420792523, + "grad_norm": 1.7636195421218872, + "learning_rate": 8.229514095088903e-05, + "loss": 1.1458, + "step": 15915 + }, + { + "epoch": 0.5699858542088205, + "grad_norm": 1.5535978078842163, + "learning_rate": 8.228372529698828e-05, + "loss": 1.2178, + "step": 15916 + }, + { + "epoch": 0.5700216663383888, + "grad_norm": 2.236968994140625, + "learning_rate": 8.227230988143712e-05, + "loss": 1.0205, + "step": 15917 + }, + { + "epoch": 0.5700574784679571, + "grad_norm": 1.3512672185897827, + "learning_rate": 8.22608947043892e-05, + "loss": 1.1071, + "step": 15918 + }, + { + "epoch": 0.5700932905975253, + "grad_norm": 1.983895182609558, + "learning_rate": 8.224947976599804e-05, + "loss": 1.2737, + "step": 15919 + }, + { + "epoch": 0.5701291027270937, + "grad_norm": 1.4636794328689575, + "learning_rate": 8.223806506641724e-05, + "loss": 1.1319, + "step": 15920 + }, + { + "epoch": 0.570164914856662, + "grad_norm": 1.6659859418869019, + "learning_rate": 8.222665060580038e-05, + "loss": 1.031, + "step": 15921 + }, + { + "epoch": 0.5702007269862303, + "grad_norm": 1.1429609060287476, + "learning_rate": 8.221523638430098e-05, + "loss": 0.9131, + "step": 15922 + }, + { + "epoch": 0.5702365391157985, + "grad_norm": 1.417000412940979, + "learning_rate": 8.220382240207266e-05, + "loss": 1.1666, + "step": 15923 + }, + { + "epoch": 0.5702723512453668, + "grad_norm": 1.6724640130996704, + "learning_rate": 8.219240865926892e-05, + "loss": 1.0991, + "step": 15924 + }, + { + "epoch": 0.5703081633749351, + "grad_norm": 1.6757370233535767, + "learning_rate": 8.218099515604339e-05, + "loss": 1.105, + "step": 15925 + }, + { + "epoch": 0.5703439755045033, + "grad_norm": 1.6430224180221558, + "learning_rate": 8.216958189254956e-05, + "loss": 1.2592, + "step": 15926 + }, + { + "epoch": 0.5703797876340717, + "grad_norm": 1.4557440280914307, + "learning_rate": 8.215816886894102e-05, + "loss": 1.0223, + "step": 15927 + }, + { + "epoch": 0.57041559976364, + "grad_norm": 1.2693382501602173, + "learning_rate": 8.214675608537128e-05, + "loss": 0.9098, + "step": 15928 + }, + { + "epoch": 0.5704514118932082, + "grad_norm": 1.8709537982940674, + "learning_rate": 8.213534354199392e-05, + "loss": 1.3665, + "step": 15929 + }, + { + "epoch": 0.5704872240227765, + "grad_norm": 1.2823262214660645, + "learning_rate": 8.212393123896249e-05, + "loss": 1.1692, + "step": 15930 + }, + { + "epoch": 0.5705230361523448, + "grad_norm": 1.1912400722503662, + "learning_rate": 8.211251917643047e-05, + "loss": 1.1437, + "step": 15931 + }, + { + "epoch": 0.5705588482819131, + "grad_norm": 1.4434318542480469, + "learning_rate": 8.210110735455147e-05, + "loss": 0.9729, + "step": 15932 + }, + { + "epoch": 0.5705946604114813, + "grad_norm": 1.3838657140731812, + "learning_rate": 8.208969577347894e-05, + "loss": 0.9861, + "step": 15933 + }, + { + "epoch": 0.5706304725410497, + "grad_norm": 1.52186119556427, + "learning_rate": 8.207828443336649e-05, + "loss": 0.9828, + "step": 15934 + }, + { + "epoch": 0.570666284670618, + "grad_norm": 2.398559093475342, + "learning_rate": 8.206687333436758e-05, + "loss": 1.1672, + "step": 15935 + }, + { + "epoch": 0.5707020968001862, + "grad_norm": 1.6427830457687378, + "learning_rate": 8.205546247663578e-05, + "loss": 1.191, + "step": 15936 + }, + { + "epoch": 0.5707379089297545, + "grad_norm": 1.6300913095474243, + "learning_rate": 8.204405186032455e-05, + "loss": 1.2587, + "step": 15937 + }, + { + "epoch": 0.5707737210593228, + "grad_norm": 1.6349276304244995, + "learning_rate": 8.203264148558749e-05, + "loss": 1.0646, + "step": 15938 + }, + { + "epoch": 0.570809533188891, + "grad_norm": 1.3950893878936768, + "learning_rate": 8.202123135257804e-05, + "loss": 1.1122, + "step": 15939 + }, + { + "epoch": 0.5708453453184593, + "grad_norm": 1.488065481185913, + "learning_rate": 8.20098214614497e-05, + "loss": 1.2146, + "step": 15940 + }, + { + "epoch": 0.5708811574480277, + "grad_norm": 1.4913020133972168, + "learning_rate": 8.199841181235606e-05, + "loss": 1.0953, + "step": 15941 + }, + { + "epoch": 0.570916969577596, + "grad_norm": 1.9569392204284668, + "learning_rate": 8.198700240545053e-05, + "loss": 1.1035, + "step": 15942 + }, + { + "epoch": 0.5709527817071642, + "grad_norm": 1.4951149225234985, + "learning_rate": 8.197559324088666e-05, + "loss": 1.2416, + "step": 15943 + }, + { + "epoch": 0.5709885938367325, + "grad_norm": 1.5944353342056274, + "learning_rate": 8.196418431881793e-05, + "loss": 1.2547, + "step": 15944 + }, + { + "epoch": 0.5710244059663008, + "grad_norm": 1.745341181755066, + "learning_rate": 8.195277563939785e-05, + "loss": 1.2056, + "step": 15945 + }, + { + "epoch": 0.571060218095869, + "grad_norm": 1.5657789707183838, + "learning_rate": 8.194136720277992e-05, + "loss": 0.9723, + "step": 15946 + }, + { + "epoch": 0.5710960302254373, + "grad_norm": 2.0764811038970947, + "learning_rate": 8.192995900911751e-05, + "loss": 1.0487, + "step": 15947 + }, + { + "epoch": 0.5711318423550057, + "grad_norm": 1.60002601146698, + "learning_rate": 8.191855105856428e-05, + "loss": 1.0391, + "step": 15948 + }, + { + "epoch": 0.571167654484574, + "grad_norm": 1.3885066509246826, + "learning_rate": 8.190714335127356e-05, + "loss": 1.1119, + "step": 15949 + }, + { + "epoch": 0.5712034666141422, + "grad_norm": 1.663482666015625, + "learning_rate": 8.189573588739892e-05, + "loss": 1.1202, + "step": 15950 + }, + { + "epoch": 0.5712392787437105, + "grad_norm": 1.5528663396835327, + "learning_rate": 8.188432866709379e-05, + "loss": 1.0834, + "step": 15951 + }, + { + "epoch": 0.5712750908732788, + "grad_norm": 1.7179583311080933, + "learning_rate": 8.187292169051168e-05, + "loss": 1.0026, + "step": 15952 + }, + { + "epoch": 0.571310903002847, + "grad_norm": 1.4757609367370605, + "learning_rate": 8.186151495780598e-05, + "loss": 0.9424, + "step": 15953 + }, + { + "epoch": 0.5713467151324153, + "grad_norm": 3.2358667850494385, + "learning_rate": 8.185010846913024e-05, + "loss": 1.0782, + "step": 15954 + }, + { + "epoch": 0.5713825272619837, + "grad_norm": 1.5905388593673706, + "learning_rate": 8.183870222463789e-05, + "loss": 1.0982, + "step": 15955 + }, + { + "epoch": 0.571418339391552, + "grad_norm": 1.6274991035461426, + "learning_rate": 8.182729622448231e-05, + "loss": 1.1814, + "step": 15956 + }, + { + "epoch": 0.5714541515211202, + "grad_norm": 1.6686424016952515, + "learning_rate": 8.181589046881709e-05, + "loss": 1.0595, + "step": 15957 + }, + { + "epoch": 0.5714899636506885, + "grad_norm": 1.192423701286316, + "learning_rate": 8.180448495779554e-05, + "loss": 1.1339, + "step": 15958 + }, + { + "epoch": 0.5715257757802568, + "grad_norm": 1.5388637781143188, + "learning_rate": 8.179307969157123e-05, + "loss": 1.0154, + "step": 15959 + }, + { + "epoch": 0.571561587909825, + "grad_norm": 1.4867104291915894, + "learning_rate": 8.17816746702975e-05, + "loss": 1.2414, + "step": 15960 + }, + { + "epoch": 0.5715974000393933, + "grad_norm": 1.8304091691970825, + "learning_rate": 8.177026989412789e-05, + "loss": 0.9707, + "step": 15961 + }, + { + "epoch": 0.5716332121689617, + "grad_norm": 1.512740969657898, + "learning_rate": 8.175886536321574e-05, + "loss": 0.9971, + "step": 15962 + }, + { + "epoch": 0.57166902429853, + "grad_norm": 1.5291078090667725, + "learning_rate": 8.174746107771454e-05, + "loss": 1.3193, + "step": 15963 + }, + { + "epoch": 0.5717048364280982, + "grad_norm": 1.6666287183761597, + "learning_rate": 8.173605703777774e-05, + "loss": 1.1636, + "step": 15964 + }, + { + "epoch": 0.5717406485576665, + "grad_norm": 1.7788920402526855, + "learning_rate": 8.172465324355868e-05, + "loss": 1.0155, + "step": 15965 + }, + { + "epoch": 0.5717764606872348, + "grad_norm": 1.5048021078109741, + "learning_rate": 8.171324969521089e-05, + "loss": 0.95, + "step": 15966 + }, + { + "epoch": 0.571812272816803, + "grad_norm": 1.479804515838623, + "learning_rate": 8.170184639288767e-05, + "loss": 1.2548, + "step": 15967 + }, + { + "epoch": 0.5718480849463713, + "grad_norm": 1.6503392457962036, + "learning_rate": 8.169044333674259e-05, + "loss": 1.2181, + "step": 15968 + }, + { + "epoch": 0.5718838970759397, + "grad_norm": 1.6143442392349243, + "learning_rate": 8.167904052692889e-05, + "loss": 1.0424, + "step": 15969 + }, + { + "epoch": 0.5719197092055079, + "grad_norm": 1.581048607826233, + "learning_rate": 8.166763796360014e-05, + "loss": 0.9457, + "step": 15970 + }, + { + "epoch": 0.5719555213350762, + "grad_norm": 1.2326099872589111, + "learning_rate": 8.165623564690961e-05, + "loss": 1.1783, + "step": 15971 + }, + { + "epoch": 0.5719913334646445, + "grad_norm": 1.5945137739181519, + "learning_rate": 8.164483357701082e-05, + "loss": 1.1243, + "step": 15972 + }, + { + "epoch": 0.5720271455942127, + "grad_norm": 1.5612177848815918, + "learning_rate": 8.163343175405712e-05, + "loss": 0.9766, + "step": 15973 + }, + { + "epoch": 0.572062957723781, + "grad_norm": 1.5588704347610474, + "learning_rate": 8.162203017820186e-05, + "loss": 1.1184, + "step": 15974 + }, + { + "epoch": 0.5720987698533493, + "grad_norm": 1.6560430526733398, + "learning_rate": 8.161062884959852e-05, + "loss": 1.1888, + "step": 15975 + }, + { + "epoch": 0.5721345819829177, + "grad_norm": 1.5610874891281128, + "learning_rate": 8.159922776840039e-05, + "loss": 1.1328, + "step": 15976 + }, + { + "epoch": 0.5721703941124859, + "grad_norm": 1.8194422721862793, + "learning_rate": 8.158782693476099e-05, + "loss": 1.1206, + "step": 15977 + }, + { + "epoch": 0.5722062062420542, + "grad_norm": 1.4679369926452637, + "learning_rate": 8.157642634883355e-05, + "loss": 0.947, + "step": 15978 + }, + { + "epoch": 0.5722420183716225, + "grad_norm": 1.5380741357803345, + "learning_rate": 8.156502601077159e-05, + "loss": 1.059, + "step": 15979 + }, + { + "epoch": 0.5722778305011907, + "grad_norm": 1.3476871252059937, + "learning_rate": 8.155362592072837e-05, + "loss": 1.1268, + "step": 15980 + }, + { + "epoch": 0.572313642630759, + "grad_norm": 1.3051488399505615, + "learning_rate": 8.15422260788574e-05, + "loss": 1.1417, + "step": 15981 + }, + { + "epoch": 0.5723494547603273, + "grad_norm": 1.3089959621429443, + "learning_rate": 8.153082648531192e-05, + "loss": 1.2369, + "step": 15982 + }, + { + "epoch": 0.5723852668898957, + "grad_norm": 1.2229418754577637, + "learning_rate": 8.151942714024534e-05, + "loss": 0.9212, + "step": 15983 + }, + { + "epoch": 0.5724210790194639, + "grad_norm": 1.9415431022644043, + "learning_rate": 8.150802804381105e-05, + "loss": 1.1132, + "step": 15984 + }, + { + "epoch": 0.5724568911490322, + "grad_norm": 1.7959752082824707, + "learning_rate": 8.149662919616238e-05, + "loss": 1.1561, + "step": 15985 + }, + { + "epoch": 0.5724927032786005, + "grad_norm": 1.7921006679534912, + "learning_rate": 8.14852305974527e-05, + "loss": 1.1947, + "step": 15986 + }, + { + "epoch": 0.5725285154081687, + "grad_norm": 2.067929267883301, + "learning_rate": 8.147383224783534e-05, + "loss": 1.0972, + "step": 15987 + }, + { + "epoch": 0.572564327537737, + "grad_norm": 1.6399362087249756, + "learning_rate": 8.146243414746371e-05, + "loss": 1.0957, + "step": 15988 + }, + { + "epoch": 0.5726001396673053, + "grad_norm": 1.4565380811691284, + "learning_rate": 8.145103629649104e-05, + "loss": 1.0806, + "step": 15989 + }, + { + "epoch": 0.5726359517968737, + "grad_norm": 1.8207594156265259, + "learning_rate": 8.143963869507085e-05, + "loss": 0.9699, + "step": 15990 + }, + { + "epoch": 0.5726717639264419, + "grad_norm": 1.731702208518982, + "learning_rate": 8.142824134335633e-05, + "loss": 1.0396, + "step": 15991 + }, + { + "epoch": 0.5727075760560102, + "grad_norm": 1.245056390762329, + "learning_rate": 8.141684424150087e-05, + "loss": 1.2532, + "step": 15992 + }, + { + "epoch": 0.5727433881855785, + "grad_norm": 1.474345326423645, + "learning_rate": 8.14054473896578e-05, + "loss": 1.1939, + "step": 15993 + }, + { + "epoch": 0.5727792003151467, + "grad_norm": 1.346092700958252, + "learning_rate": 8.139405078798044e-05, + "loss": 1.206, + "step": 15994 + }, + { + "epoch": 0.572815012444715, + "grad_norm": 1.516502857208252, + "learning_rate": 8.138265443662215e-05, + "loss": 1.2896, + "step": 15995 + }, + { + "epoch": 0.5728508245742833, + "grad_norm": 1.5755329132080078, + "learning_rate": 8.137125833573622e-05, + "loss": 1.0535, + "step": 15996 + }, + { + "epoch": 0.5728866367038516, + "grad_norm": 1.8652448654174805, + "learning_rate": 8.135986248547597e-05, + "loss": 1.2705, + "step": 15997 + }, + { + "epoch": 0.5729224488334199, + "grad_norm": 1.5238237380981445, + "learning_rate": 8.134846688599473e-05, + "loss": 1.1383, + "step": 15998 + }, + { + "epoch": 0.5729582609629882, + "grad_norm": 1.5418792963027954, + "learning_rate": 8.133707153744582e-05, + "loss": 1.0484, + "step": 15999 + }, + { + "epoch": 0.5729940730925565, + "grad_norm": 1.6462653875350952, + "learning_rate": 8.132567643998254e-05, + "loss": 1.1057, + "step": 16000 + }, + { + "epoch": 0.5730298852221247, + "grad_norm": 1.6208840608596802, + "learning_rate": 8.131428159375817e-05, + "loss": 1.1459, + "step": 16001 + }, + { + "epoch": 0.573065697351693, + "grad_norm": 2.0962953567504883, + "learning_rate": 8.130288699892608e-05, + "loss": 1.2332, + "step": 16002 + }, + { + "epoch": 0.5731015094812613, + "grad_norm": 1.5112433433532715, + "learning_rate": 8.129149265563947e-05, + "loss": 1.0327, + "step": 16003 + }, + { + "epoch": 0.5731373216108296, + "grad_norm": 1.742034912109375, + "learning_rate": 8.128009856405174e-05, + "loss": 1.137, + "step": 16004 + }, + { + "epoch": 0.5731731337403979, + "grad_norm": 1.382654070854187, + "learning_rate": 8.126870472431613e-05, + "loss": 1.073, + "step": 16005 + }, + { + "epoch": 0.5732089458699662, + "grad_norm": 1.5228787660598755, + "learning_rate": 8.125731113658594e-05, + "loss": 1.0369, + "step": 16006 + }, + { + "epoch": 0.5732447579995344, + "grad_norm": 1.5594245195388794, + "learning_rate": 8.124591780101443e-05, + "loss": 1.0899, + "step": 16007 + }, + { + "epoch": 0.5732805701291027, + "grad_norm": 1.3821804523468018, + "learning_rate": 8.123452471775493e-05, + "loss": 1.0583, + "step": 16008 + }, + { + "epoch": 0.573316382258671, + "grad_norm": 1.566444754600525, + "learning_rate": 8.122313188696068e-05, + "loss": 1.0005, + "step": 16009 + }, + { + "epoch": 0.5733521943882393, + "grad_norm": 1.575561285018921, + "learning_rate": 8.121173930878496e-05, + "loss": 1.1093, + "step": 16010 + }, + { + "epoch": 0.5733880065178076, + "grad_norm": 1.6822260618209839, + "learning_rate": 8.120034698338108e-05, + "loss": 1.2449, + "step": 16011 + }, + { + "epoch": 0.5734238186473759, + "grad_norm": 1.429179310798645, + "learning_rate": 8.118895491090225e-05, + "loss": 1.0143, + "step": 16012 + }, + { + "epoch": 0.5734596307769442, + "grad_norm": 1.690493106842041, + "learning_rate": 8.11775630915018e-05, + "loss": 1.2349, + "step": 16013 + }, + { + "epoch": 0.5734954429065124, + "grad_norm": 1.423476219177246, + "learning_rate": 8.116617152533292e-05, + "loss": 1.2017, + "step": 16014 + }, + { + "epoch": 0.5735312550360807, + "grad_norm": 1.723628044128418, + "learning_rate": 8.115478021254895e-05, + "loss": 1.1773, + "step": 16015 + }, + { + "epoch": 0.573567067165649, + "grad_norm": 1.3129101991653442, + "learning_rate": 8.114338915330307e-05, + "loss": 0.7691, + "step": 16016 + }, + { + "epoch": 0.5736028792952172, + "grad_norm": 1.4901083707809448, + "learning_rate": 8.113199834774858e-05, + "loss": 1.0334, + "step": 16017 + }, + { + "epoch": 0.5736386914247856, + "grad_norm": 1.5161164999008179, + "learning_rate": 8.112060779603873e-05, + "loss": 1.0559, + "step": 16018 + }, + { + "epoch": 0.5736745035543539, + "grad_norm": 1.5625216960906982, + "learning_rate": 8.110921749832672e-05, + "loss": 1.2629, + "step": 16019 + }, + { + "epoch": 0.5737103156839222, + "grad_norm": 1.807887315750122, + "learning_rate": 8.109782745476585e-05, + "loss": 1.3395, + "step": 16020 + }, + { + "epoch": 0.5737461278134904, + "grad_norm": 1.6624199151992798, + "learning_rate": 8.108643766550929e-05, + "loss": 1.0382, + "step": 16021 + }, + { + "epoch": 0.5737819399430587, + "grad_norm": 1.4088783264160156, + "learning_rate": 8.107504813071036e-05, + "loss": 1.0951, + "step": 16022 + }, + { + "epoch": 0.573817752072627, + "grad_norm": 1.6283400058746338, + "learning_rate": 8.10636588505222e-05, + "loss": 1.0736, + "step": 16023 + }, + { + "epoch": 0.5738535642021952, + "grad_norm": 1.6283156871795654, + "learning_rate": 8.105226982509812e-05, + "loss": 1.2751, + "step": 16024 + }, + { + "epoch": 0.5738893763317636, + "grad_norm": 1.1809659004211426, + "learning_rate": 8.10408810545913e-05, + "loss": 1.0113, + "step": 16025 + }, + { + "epoch": 0.5739251884613319, + "grad_norm": 1.7412725687026978, + "learning_rate": 8.102949253915497e-05, + "loss": 1.0907, + "step": 16026 + }, + { + "epoch": 0.5739610005909002, + "grad_norm": 1.5226879119873047, + "learning_rate": 8.101810427894236e-05, + "loss": 1.2843, + "step": 16027 + }, + { + "epoch": 0.5739968127204684, + "grad_norm": 1.4044219255447388, + "learning_rate": 8.100671627410664e-05, + "loss": 1.1112, + "step": 16028 + }, + { + "epoch": 0.5740326248500367, + "grad_norm": 1.2727383375167847, + "learning_rate": 8.099532852480108e-05, + "loss": 1.0628, + "step": 16029 + }, + { + "epoch": 0.574068436979605, + "grad_norm": 1.4141556024551392, + "learning_rate": 8.098394103117885e-05, + "loss": 1.1459, + "step": 16030 + }, + { + "epoch": 0.5741042491091732, + "grad_norm": 1.6233789920806885, + "learning_rate": 8.097255379339317e-05, + "loss": 1.0378, + "step": 16031 + }, + { + "epoch": 0.5741400612387416, + "grad_norm": 1.4315518140792847, + "learning_rate": 8.096116681159722e-05, + "loss": 1.0073, + "step": 16032 + }, + { + "epoch": 0.5741758733683099, + "grad_norm": 1.8208550214767456, + "learning_rate": 8.094978008594423e-05, + "loss": 1.2079, + "step": 16033 + }, + { + "epoch": 0.5742116854978782, + "grad_norm": 1.8410422801971436, + "learning_rate": 8.093839361658735e-05, + "loss": 1.1623, + "step": 16034 + }, + { + "epoch": 0.5742474976274464, + "grad_norm": 1.33992600440979, + "learning_rate": 8.092700740367983e-05, + "loss": 1.058, + "step": 16035 + }, + { + "epoch": 0.5742833097570147, + "grad_norm": 1.444412350654602, + "learning_rate": 8.091562144737481e-05, + "loss": 1.1513, + "step": 16036 + }, + { + "epoch": 0.574319121886583, + "grad_norm": 1.471505045890808, + "learning_rate": 8.090423574782549e-05, + "loss": 1.1704, + "step": 16037 + }, + { + "epoch": 0.5743549340161512, + "grad_norm": 1.3860453367233276, + "learning_rate": 8.089285030518504e-05, + "loss": 1.2301, + "step": 16038 + }, + { + "epoch": 0.5743907461457196, + "grad_norm": 1.4084025621414185, + "learning_rate": 8.088146511960663e-05, + "loss": 1.2459, + "step": 16039 + }, + { + "epoch": 0.5744265582752879, + "grad_norm": 1.6537001132965088, + "learning_rate": 8.087008019124347e-05, + "loss": 0.9399, + "step": 16040 + }, + { + "epoch": 0.5744623704048561, + "grad_norm": 1.4766860008239746, + "learning_rate": 8.085869552024869e-05, + "loss": 1.026, + "step": 16041 + }, + { + "epoch": 0.5744981825344244, + "grad_norm": 1.6993485689163208, + "learning_rate": 8.084731110677548e-05, + "loss": 1.0689, + "step": 16042 + }, + { + "epoch": 0.5745339946639927, + "grad_norm": 1.3442777395248413, + "learning_rate": 8.083592695097702e-05, + "loss": 1.104, + "step": 16043 + }, + { + "epoch": 0.574569806793561, + "grad_norm": 1.738734483718872, + "learning_rate": 8.082454305300637e-05, + "loss": 1.0397, + "step": 16044 + }, + { + "epoch": 0.5746056189231292, + "grad_norm": 1.470032811164856, + "learning_rate": 8.081315941301683e-05, + "loss": 1.2543, + "step": 16045 + }, + { + "epoch": 0.5746414310526975, + "grad_norm": 1.3153985738754272, + "learning_rate": 8.080177603116142e-05, + "loss": 1.2659, + "step": 16046 + }, + { + "epoch": 0.5746772431822659, + "grad_norm": 1.5559333562850952, + "learning_rate": 8.079039290759341e-05, + "loss": 1.2534, + "step": 16047 + }, + { + "epoch": 0.5747130553118341, + "grad_norm": 1.4609137773513794, + "learning_rate": 8.077901004246584e-05, + "loss": 1.299, + "step": 16048 + }, + { + "epoch": 0.5747488674414024, + "grad_norm": 1.4145578145980835, + "learning_rate": 8.076762743593191e-05, + "loss": 1.0772, + "step": 16049 + }, + { + "epoch": 0.5747846795709707, + "grad_norm": 1.4358549118041992, + "learning_rate": 8.075624508814474e-05, + "loss": 1.0816, + "step": 16050 + }, + { + "epoch": 0.574820491700539, + "grad_norm": 1.741385817527771, + "learning_rate": 8.074486299925749e-05, + "loss": 1.0922, + "step": 16051 + }, + { + "epoch": 0.5748563038301072, + "grad_norm": 1.235917329788208, + "learning_rate": 8.073348116942329e-05, + "loss": 1.2026, + "step": 16052 + }, + { + "epoch": 0.5748921159596755, + "grad_norm": 1.8899214267730713, + "learning_rate": 8.072209959879517e-05, + "loss": 0.9553, + "step": 16053 + }, + { + "epoch": 0.5749279280892439, + "grad_norm": 1.395994782447815, + "learning_rate": 8.071071828752643e-05, + "loss": 0.9356, + "step": 16054 + }, + { + "epoch": 0.5749637402188121, + "grad_norm": 1.590659737586975, + "learning_rate": 8.069933723577e-05, + "loss": 1.2839, + "step": 16055 + }, + { + "epoch": 0.5749995523483804, + "grad_norm": 2.1290087699890137, + "learning_rate": 8.068795644367918e-05, + "loss": 1.3082, + "step": 16056 + }, + { + "epoch": 0.5750353644779487, + "grad_norm": 1.5570316314697266, + "learning_rate": 8.06765759114069e-05, + "loss": 1.433, + "step": 16057 + }, + { + "epoch": 0.5750711766075169, + "grad_norm": 1.6988483667373657, + "learning_rate": 8.066519563910645e-05, + "loss": 1.1124, + "step": 16058 + }, + { + "epoch": 0.5751069887370852, + "grad_norm": 2.1987452507019043, + "learning_rate": 8.065381562693078e-05, + "loss": 1.3022, + "step": 16059 + }, + { + "epoch": 0.5751428008666535, + "grad_norm": 1.9525032043457031, + "learning_rate": 8.064243587503313e-05, + "loss": 1.0051, + "step": 16060 + }, + { + "epoch": 0.5751786129962219, + "grad_norm": 1.768389105796814, + "learning_rate": 8.063105638356654e-05, + "loss": 0.9732, + "step": 16061 + }, + { + "epoch": 0.5752144251257901, + "grad_norm": 1.9619250297546387, + "learning_rate": 8.061967715268403e-05, + "loss": 1.3282, + "step": 16062 + }, + { + "epoch": 0.5752502372553584, + "grad_norm": 1.3371661901474, + "learning_rate": 8.060829818253884e-05, + "loss": 1.0887, + "step": 16063 + }, + { + "epoch": 0.5752860493849267, + "grad_norm": 1.679398775100708, + "learning_rate": 8.059691947328391e-05, + "loss": 1.2368, + "step": 16064 + }, + { + "epoch": 0.5753218615144949, + "grad_norm": 1.4801357984542847, + "learning_rate": 8.058554102507248e-05, + "loss": 1.2685, + "step": 16065 + }, + { + "epoch": 0.5753576736440632, + "grad_norm": 1.3514760732650757, + "learning_rate": 8.057416283805748e-05, + "loss": 1.0976, + "step": 16066 + }, + { + "epoch": 0.5753934857736315, + "grad_norm": 1.433896541595459, + "learning_rate": 8.056278491239213e-05, + "loss": 0.9289, + "step": 16067 + }, + { + "epoch": 0.5754292979031999, + "grad_norm": 1.7242668867111206, + "learning_rate": 8.055140724822938e-05, + "loss": 1.1159, + "step": 16068 + }, + { + "epoch": 0.5754651100327681, + "grad_norm": 1.1524672508239746, + "learning_rate": 8.054002984572241e-05, + "loss": 1.0209, + "step": 16069 + }, + { + "epoch": 0.5755009221623364, + "grad_norm": 1.4651234149932861, + "learning_rate": 8.052865270502422e-05, + "loss": 1.1536, + "step": 16070 + }, + { + "epoch": 0.5755367342919047, + "grad_norm": 1.8504074811935425, + "learning_rate": 8.051727582628788e-05, + "loss": 1.0465, + "step": 16071 + }, + { + "epoch": 0.5755725464214729, + "grad_norm": 1.622083067893982, + "learning_rate": 8.050589920966647e-05, + "loss": 1.1274, + "step": 16072 + }, + { + "epoch": 0.5756083585510412, + "grad_norm": 1.5283178091049194, + "learning_rate": 8.049452285531302e-05, + "loss": 1.1689, + "step": 16073 + }, + { + "epoch": 0.5756441706806095, + "grad_norm": 1.7035788297653198, + "learning_rate": 8.048314676338062e-05, + "loss": 1.2583, + "step": 16074 + }, + { + "epoch": 0.5756799828101778, + "grad_norm": 1.377707600593567, + "learning_rate": 8.047177093402228e-05, + "loss": 1.0832, + "step": 16075 + }, + { + "epoch": 0.5757157949397461, + "grad_norm": 1.4451268911361694, + "learning_rate": 8.046039536739111e-05, + "loss": 1.0602, + "step": 16076 + }, + { + "epoch": 0.5757516070693144, + "grad_norm": 1.8257713317871094, + "learning_rate": 8.044902006364008e-05, + "loss": 1.2533, + "step": 16077 + }, + { + "epoch": 0.5757874191988827, + "grad_norm": 1.5022609233856201, + "learning_rate": 8.043764502292232e-05, + "loss": 1.2162, + "step": 16078 + }, + { + "epoch": 0.5758232313284509, + "grad_norm": 1.3683303594589233, + "learning_rate": 8.04262702453908e-05, + "loss": 1.0835, + "step": 16079 + }, + { + "epoch": 0.5758590434580192, + "grad_norm": 1.3451919555664062, + "learning_rate": 8.041489573119853e-05, + "loss": 1.0533, + "step": 16080 + }, + { + "epoch": 0.5758948555875875, + "grad_norm": 1.486217737197876, + "learning_rate": 8.04035214804986e-05, + "loss": 1.04, + "step": 16081 + }, + { + "epoch": 0.5759306677171558, + "grad_norm": 1.709287166595459, + "learning_rate": 8.0392147493444e-05, + "loss": 1.0214, + "step": 16082 + }, + { + "epoch": 0.5759664798467241, + "grad_norm": 1.442338228225708, + "learning_rate": 8.038077377018776e-05, + "loss": 0.9693, + "step": 16083 + }, + { + "epoch": 0.5760022919762924, + "grad_norm": 2.037189245223999, + "learning_rate": 8.03694003108829e-05, + "loss": 1.2377, + "step": 16084 + }, + { + "epoch": 0.5760381041058606, + "grad_norm": 1.876827359199524, + "learning_rate": 8.035802711568245e-05, + "loss": 1.2083, + "step": 16085 + }, + { + "epoch": 0.5760739162354289, + "grad_norm": 1.7081935405731201, + "learning_rate": 8.03466541847394e-05, + "loss": 1.0801, + "step": 16086 + }, + { + "epoch": 0.5761097283649972, + "grad_norm": 1.6837999820709229, + "learning_rate": 8.033528151820679e-05, + "loss": 1.2945, + "step": 16087 + }, + { + "epoch": 0.5761455404945655, + "grad_norm": 1.3350414037704468, + "learning_rate": 8.032390911623758e-05, + "loss": 1.214, + "step": 16088 + }, + { + "epoch": 0.5761813526241338, + "grad_norm": 1.4586760997772217, + "learning_rate": 8.031253697898478e-05, + "loss": 1.1987, + "step": 16089 + }, + { + "epoch": 0.5762171647537021, + "grad_norm": 1.562679648399353, + "learning_rate": 8.030116510660143e-05, + "loss": 0.8278, + "step": 16090 + }, + { + "epoch": 0.5762529768832704, + "grad_norm": 1.699184536933899, + "learning_rate": 8.028979349924048e-05, + "loss": 1.2181, + "step": 16091 + }, + { + "epoch": 0.5762887890128386, + "grad_norm": 1.5016953945159912, + "learning_rate": 8.027842215705494e-05, + "loss": 0.9502, + "step": 16092 + }, + { + "epoch": 0.5763246011424069, + "grad_norm": 1.6930948495864868, + "learning_rate": 8.026705108019777e-05, + "loss": 1.1542, + "step": 16093 + }, + { + "epoch": 0.5763604132719752, + "grad_norm": 1.8150936365127563, + "learning_rate": 8.0255680268822e-05, + "loss": 0.9857, + "step": 16094 + }, + { + "epoch": 0.5763962254015434, + "grad_norm": 1.380172848701477, + "learning_rate": 8.024430972308056e-05, + "loss": 1.0008, + "step": 16095 + }, + { + "epoch": 0.5764320375311118, + "grad_norm": 1.3091981410980225, + "learning_rate": 8.023293944312647e-05, + "loss": 1.1643, + "step": 16096 + }, + { + "epoch": 0.5764678496606801, + "grad_norm": 1.586610198020935, + "learning_rate": 8.022156942911267e-05, + "loss": 1.0217, + "step": 16097 + }, + { + "epoch": 0.5765036617902484, + "grad_norm": 1.493849754333496, + "learning_rate": 8.021019968119215e-05, + "loss": 1.3845, + "step": 16098 + }, + { + "epoch": 0.5765394739198166, + "grad_norm": 1.8900524377822876, + "learning_rate": 8.019883019951786e-05, + "loss": 0.9481, + "step": 16099 + }, + { + "epoch": 0.5765752860493849, + "grad_norm": 1.7205283641815186, + "learning_rate": 8.018746098424276e-05, + "loss": 1.2973, + "step": 16100 + }, + { + "epoch": 0.5766110981789532, + "grad_norm": 1.5752482414245605, + "learning_rate": 8.017609203551983e-05, + "loss": 1.1239, + "step": 16101 + }, + { + "epoch": 0.5766469103085214, + "grad_norm": 1.716202974319458, + "learning_rate": 8.0164723353502e-05, + "loss": 1.0892, + "step": 16102 + }, + { + "epoch": 0.5766827224380898, + "grad_norm": 1.7306480407714844, + "learning_rate": 8.015335493834224e-05, + "loss": 1.2706, + "step": 16103 + }, + { + "epoch": 0.5767185345676581, + "grad_norm": 1.7084978818893433, + "learning_rate": 8.014198679019348e-05, + "loss": 1.2743, + "step": 16104 + }, + { + "epoch": 0.5767543466972264, + "grad_norm": 1.2483420372009277, + "learning_rate": 8.01306189092087e-05, + "loss": 1.1635, + "step": 16105 + }, + { + "epoch": 0.5767901588267946, + "grad_norm": 1.413422703742981, + "learning_rate": 8.011925129554078e-05, + "loss": 0.945, + "step": 16106 + }, + { + "epoch": 0.5768259709563629, + "grad_norm": 1.7724981307983398, + "learning_rate": 8.01078839493427e-05, + "loss": 0.9319, + "step": 16107 + }, + { + "epoch": 0.5768617830859312, + "grad_norm": 1.4209845066070557, + "learning_rate": 8.009651687076739e-05, + "loss": 1.0842, + "step": 16108 + }, + { + "epoch": 0.5768975952154994, + "grad_norm": 1.363034963607788, + "learning_rate": 8.008515005996775e-05, + "loss": 0.9126, + "step": 16109 + }, + { + "epoch": 0.5769334073450678, + "grad_norm": 1.5302677154541016, + "learning_rate": 8.007378351709676e-05, + "loss": 1.217, + "step": 16110 + }, + { + "epoch": 0.5769692194746361, + "grad_norm": 1.3356093168258667, + "learning_rate": 8.006241724230728e-05, + "loss": 1.2595, + "step": 16111 + }, + { + "epoch": 0.5770050316042044, + "grad_norm": 1.4793031215667725, + "learning_rate": 8.005105123575228e-05, + "loss": 1.0919, + "step": 16112 + }, + { + "epoch": 0.5770408437337726, + "grad_norm": 1.8133876323699951, + "learning_rate": 8.003968549758462e-05, + "loss": 1.2169, + "step": 16113 + }, + { + "epoch": 0.5770766558633409, + "grad_norm": 2.0642037391662598, + "learning_rate": 8.002832002795729e-05, + "loss": 0.8623, + "step": 16114 + }, + { + "epoch": 0.5771124679929092, + "grad_norm": 1.3148818016052246, + "learning_rate": 8.001695482702314e-05, + "loss": 1.0298, + "step": 16115 + }, + { + "epoch": 0.5771482801224774, + "grad_norm": 1.5083178281784058, + "learning_rate": 8.000558989493507e-05, + "loss": 1.2058, + "step": 16116 + }, + { + "epoch": 0.5771840922520458, + "grad_norm": 1.272042155265808, + "learning_rate": 7.999422523184601e-05, + "loss": 0.9975, + "step": 16117 + }, + { + "epoch": 0.5772199043816141, + "grad_norm": 1.4193263053894043, + "learning_rate": 7.998286083790883e-05, + "loss": 0.973, + "step": 16118 + }, + { + "epoch": 0.5772557165111823, + "grad_norm": 1.4870365858078003, + "learning_rate": 7.997149671327646e-05, + "loss": 1.1015, + "step": 16119 + }, + { + "epoch": 0.5772915286407506, + "grad_norm": 1.7273595333099365, + "learning_rate": 7.996013285810173e-05, + "loss": 1.2092, + "step": 16120 + }, + { + "epoch": 0.5773273407703189, + "grad_norm": 1.1539326906204224, + "learning_rate": 7.99487692725376e-05, + "loss": 0.8948, + "step": 16121 + }, + { + "epoch": 0.5773631528998872, + "grad_norm": 1.6219431161880493, + "learning_rate": 7.993740595673689e-05, + "loss": 1.0487, + "step": 16122 + }, + { + "epoch": 0.5773989650294554, + "grad_norm": 1.3661656379699707, + "learning_rate": 7.992604291085253e-05, + "loss": 1.2368, + "step": 16123 + }, + { + "epoch": 0.5774347771590238, + "grad_norm": 2.1174206733703613, + "learning_rate": 7.991468013503735e-05, + "loss": 1.1791, + "step": 16124 + }, + { + "epoch": 0.5774705892885921, + "grad_norm": 1.5479967594146729, + "learning_rate": 7.990331762944426e-05, + "loss": 1.0968, + "step": 16125 + }, + { + "epoch": 0.5775064014181603, + "grad_norm": 1.5775372982025146, + "learning_rate": 7.989195539422609e-05, + "loss": 1.148, + "step": 16126 + }, + { + "epoch": 0.5775422135477286, + "grad_norm": 2.0361859798431396, + "learning_rate": 7.988059342953571e-05, + "loss": 1.3323, + "step": 16127 + }, + { + "epoch": 0.5775780256772969, + "grad_norm": 1.5411615371704102, + "learning_rate": 7.986923173552602e-05, + "loss": 1.1994, + "step": 16128 + }, + { + "epoch": 0.5776138378068651, + "grad_norm": 1.2847652435302734, + "learning_rate": 7.985787031234983e-05, + "loss": 1.0428, + "step": 16129 + }, + { + "epoch": 0.5776496499364334, + "grad_norm": 1.688653588294983, + "learning_rate": 7.984650916016003e-05, + "loss": 1.1533, + "step": 16130 + }, + { + "epoch": 0.5776854620660018, + "grad_norm": 1.3335559368133545, + "learning_rate": 7.983514827910943e-05, + "loss": 1.0714, + "step": 16131 + }, + { + "epoch": 0.5777212741955701, + "grad_norm": 1.2565114498138428, + "learning_rate": 7.982378766935092e-05, + "loss": 1.1287, + "step": 16132 + }, + { + "epoch": 0.5777570863251383, + "grad_norm": 1.3227931261062622, + "learning_rate": 7.981242733103734e-05, + "loss": 1.1879, + "step": 16133 + }, + { + "epoch": 0.5777928984547066, + "grad_norm": 1.6585204601287842, + "learning_rate": 7.980106726432148e-05, + "loss": 0.9822, + "step": 16134 + }, + { + "epoch": 0.5778287105842749, + "grad_norm": 1.475830316543579, + "learning_rate": 7.978970746935621e-05, + "loss": 1.099, + "step": 16135 + }, + { + "epoch": 0.5778645227138431, + "grad_norm": 1.9946439266204834, + "learning_rate": 7.977834794629436e-05, + "loss": 1.1871, + "step": 16136 + }, + { + "epoch": 0.5779003348434114, + "grad_norm": 1.4533398151397705, + "learning_rate": 7.976698869528876e-05, + "loss": 1.112, + "step": 16137 + }, + { + "epoch": 0.5779361469729798, + "grad_norm": 1.3465690612792969, + "learning_rate": 7.975562971649219e-05, + "loss": 1.2095, + "step": 16138 + }, + { + "epoch": 0.5779719591025481, + "grad_norm": 1.533443570137024, + "learning_rate": 7.974427101005756e-05, + "loss": 0.9627, + "step": 16139 + }, + { + "epoch": 0.5780077712321163, + "grad_norm": 1.4571571350097656, + "learning_rate": 7.973291257613761e-05, + "loss": 1.1035, + "step": 16140 + }, + { + "epoch": 0.5780435833616846, + "grad_norm": 1.3042056560516357, + "learning_rate": 7.97215544148852e-05, + "loss": 1.0877, + "step": 16141 + }, + { + "epoch": 0.5780793954912529, + "grad_norm": 1.477419376373291, + "learning_rate": 7.971019652645313e-05, + "loss": 1.086, + "step": 16142 + }, + { + "epoch": 0.5781152076208211, + "grad_norm": 2.672039747238159, + "learning_rate": 7.969883891099412e-05, + "loss": 1.1425, + "step": 16143 + }, + { + "epoch": 0.5781510197503894, + "grad_norm": 1.8422902822494507, + "learning_rate": 7.968748156866113e-05, + "loss": 0.9882, + "step": 16144 + }, + { + "epoch": 0.5781868318799578, + "grad_norm": 1.8300706148147583, + "learning_rate": 7.967612449960679e-05, + "loss": 1.3045, + "step": 16145 + }, + { + "epoch": 0.578222644009526, + "grad_norm": 1.5001460313796997, + "learning_rate": 7.966476770398404e-05, + "loss": 1.145, + "step": 16146 + }, + { + "epoch": 0.5782584561390943, + "grad_norm": 1.5337493419647217, + "learning_rate": 7.965341118194559e-05, + "loss": 1.0281, + "step": 16147 + }, + { + "epoch": 0.5782942682686626, + "grad_norm": 1.266029715538025, + "learning_rate": 7.964205493364426e-05, + "loss": 1.2088, + "step": 16148 + }, + { + "epoch": 0.5783300803982309, + "grad_norm": 1.8624821901321411, + "learning_rate": 7.963069895923285e-05, + "loss": 1.1899, + "step": 16149 + }, + { + "epoch": 0.5783658925277991, + "grad_norm": 1.7304322719573975, + "learning_rate": 7.961934325886404e-05, + "loss": 0.9292, + "step": 16150 + }, + { + "epoch": 0.5784017046573674, + "grad_norm": 1.3733140230178833, + "learning_rate": 7.960798783269074e-05, + "loss": 1.1455, + "step": 16151 + }, + { + "epoch": 0.5784375167869358, + "grad_norm": 1.8078744411468506, + "learning_rate": 7.95966326808656e-05, + "loss": 1.03, + "step": 16152 + }, + { + "epoch": 0.578473328916504, + "grad_norm": 1.3615671396255493, + "learning_rate": 7.958527780354151e-05, + "loss": 1.0085, + "step": 16153 + }, + { + "epoch": 0.5785091410460723, + "grad_norm": 1.307497501373291, + "learning_rate": 7.957392320087112e-05, + "loss": 1.4743, + "step": 16154 + }, + { + "epoch": 0.5785449531756406, + "grad_norm": 1.2150083780288696, + "learning_rate": 7.956256887300729e-05, + "loss": 1.085, + "step": 16155 + }, + { + "epoch": 0.5785807653052089, + "grad_norm": 1.653429388999939, + "learning_rate": 7.955121482010268e-05, + "loss": 1.0103, + "step": 16156 + }, + { + "epoch": 0.5786165774347771, + "grad_norm": 1.7205766439437866, + "learning_rate": 7.953986104231018e-05, + "loss": 1.0599, + "step": 16157 + }, + { + "epoch": 0.5786523895643454, + "grad_norm": 1.5431851148605347, + "learning_rate": 7.95285075397824e-05, + "loss": 1.2184, + "step": 16158 + }, + { + "epoch": 0.5786882016939138, + "grad_norm": 1.6440786123275757, + "learning_rate": 7.951715431267213e-05, + "loss": 1.2018, + "step": 16159 + }, + { + "epoch": 0.578724013823482, + "grad_norm": 1.3163930177688599, + "learning_rate": 7.950580136113219e-05, + "loss": 1.0687, + "step": 16160 + }, + { + "epoch": 0.5787598259530503, + "grad_norm": 1.4953440427780151, + "learning_rate": 7.949444868531517e-05, + "loss": 1.1071, + "step": 16161 + }, + { + "epoch": 0.5787956380826186, + "grad_norm": 1.3451327085494995, + "learning_rate": 7.948309628537399e-05, + "loss": 1.0186, + "step": 16162 + }, + { + "epoch": 0.5788314502121868, + "grad_norm": 1.5791611671447754, + "learning_rate": 7.94717441614612e-05, + "loss": 1.1955, + "step": 16163 + }, + { + "epoch": 0.5788672623417551, + "grad_norm": 1.2314120531082153, + "learning_rate": 7.946039231372967e-05, + "loss": 1.106, + "step": 16164 + }, + { + "epoch": 0.5789030744713234, + "grad_norm": 1.539177656173706, + "learning_rate": 7.944904074233201e-05, + "loss": 1.1091, + "step": 16165 + }, + { + "epoch": 0.5789388866008918, + "grad_norm": 1.2953722476959229, + "learning_rate": 7.943768944742107e-05, + "loss": 1.0479, + "step": 16166 + }, + { + "epoch": 0.57897469873046, + "grad_norm": 1.413205862045288, + "learning_rate": 7.942633842914946e-05, + "loss": 1.1145, + "step": 16167 + }, + { + "epoch": 0.5790105108600283, + "grad_norm": 1.2743622064590454, + "learning_rate": 7.941498768766991e-05, + "loss": 1.0468, + "step": 16168 + }, + { + "epoch": 0.5790463229895966, + "grad_norm": 2.594268798828125, + "learning_rate": 7.940363722313519e-05, + "loss": 1.067, + "step": 16169 + }, + { + "epoch": 0.5790821351191648, + "grad_norm": 1.6604175567626953, + "learning_rate": 7.939228703569792e-05, + "loss": 1.2263, + "step": 16170 + }, + { + "epoch": 0.5791179472487331, + "grad_norm": 1.6052683591842651, + "learning_rate": 7.938093712551087e-05, + "loss": 1.2477, + "step": 16171 + }, + { + "epoch": 0.5791537593783014, + "grad_norm": 1.9785220623016357, + "learning_rate": 7.936958749272669e-05, + "loss": 1.3219, + "step": 16172 + }, + { + "epoch": 0.5791895715078698, + "grad_norm": 1.6394307613372803, + "learning_rate": 7.935823813749815e-05, + "loss": 1.1522, + "step": 16173 + }, + { + "epoch": 0.579225383637438, + "grad_norm": 2.0350303649902344, + "learning_rate": 7.934688905997781e-05, + "loss": 1.2489, + "step": 16174 + }, + { + "epoch": 0.5792611957670063, + "grad_norm": 1.4184038639068604, + "learning_rate": 7.933554026031852e-05, + "loss": 0.9481, + "step": 16175 + }, + { + "epoch": 0.5792970078965746, + "grad_norm": 1.3952240943908691, + "learning_rate": 7.932419173867286e-05, + "loss": 1.1566, + "step": 16176 + }, + { + "epoch": 0.5793328200261428, + "grad_norm": 1.7443636655807495, + "learning_rate": 7.93128434951935e-05, + "loss": 1.0272, + "step": 16177 + }, + { + "epoch": 0.5793686321557111, + "grad_norm": 1.7372013330459595, + "learning_rate": 7.930149553003318e-05, + "loss": 1.2435, + "step": 16178 + }, + { + "epoch": 0.5794044442852794, + "grad_norm": 1.5241985321044922, + "learning_rate": 7.92901478433445e-05, + "loss": 1.0494, + "step": 16179 + }, + { + "epoch": 0.5794402564148478, + "grad_norm": 1.4107056856155396, + "learning_rate": 7.92788004352802e-05, + "loss": 1.2316, + "step": 16180 + }, + { + "epoch": 0.579476068544416, + "grad_norm": 1.9451042413711548, + "learning_rate": 7.926745330599289e-05, + "loss": 1.2176, + "step": 16181 + }, + { + "epoch": 0.5795118806739843, + "grad_norm": 1.6000522375106812, + "learning_rate": 7.925610645563527e-05, + "loss": 1.1133, + "step": 16182 + }, + { + "epoch": 0.5795476928035526, + "grad_norm": 1.7063552141189575, + "learning_rate": 7.924475988435996e-05, + "loss": 1.2497, + "step": 16183 + }, + { + "epoch": 0.5795835049331208, + "grad_norm": 1.62648344039917, + "learning_rate": 7.923341359231965e-05, + "loss": 1.1884, + "step": 16184 + }, + { + "epoch": 0.5796193170626891, + "grad_norm": 1.4733234643936157, + "learning_rate": 7.922206757966698e-05, + "loss": 0.9423, + "step": 16185 + }, + { + "epoch": 0.5796551291922574, + "grad_norm": 1.859519600868225, + "learning_rate": 7.921072184655457e-05, + "loss": 1.048, + "step": 16186 + }, + { + "epoch": 0.5796909413218257, + "grad_norm": 1.4553723335266113, + "learning_rate": 7.91993763931351e-05, + "loss": 1.1132, + "step": 16187 + }, + { + "epoch": 0.579726753451394, + "grad_norm": 1.3949248790740967, + "learning_rate": 7.918803121956117e-05, + "loss": 1.1785, + "step": 16188 + }, + { + "epoch": 0.5797625655809623, + "grad_norm": 1.6098324060440063, + "learning_rate": 7.917668632598545e-05, + "loss": 1.0102, + "step": 16189 + }, + { + "epoch": 0.5797983777105306, + "grad_norm": 1.6656395196914673, + "learning_rate": 7.916534171256054e-05, + "loss": 0.8961, + "step": 16190 + }, + { + "epoch": 0.5798341898400988, + "grad_norm": 1.413787841796875, + "learning_rate": 7.91539973794391e-05, + "loss": 1.3301, + "step": 16191 + }, + { + "epoch": 0.5798700019696671, + "grad_norm": 1.598262071609497, + "learning_rate": 7.914265332677371e-05, + "loss": 1.1723, + "step": 16192 + }, + { + "epoch": 0.5799058140992354, + "grad_norm": 1.479655385017395, + "learning_rate": 7.913130955471704e-05, + "loss": 1.3146, + "step": 16193 + }, + { + "epoch": 0.5799416262288037, + "grad_norm": 1.4385954141616821, + "learning_rate": 7.911996606342168e-05, + "loss": 1.0431, + "step": 16194 + }, + { + "epoch": 0.579977438358372, + "grad_norm": 1.6300036907196045, + "learning_rate": 7.910862285304022e-05, + "loss": 1.0501, + "step": 16195 + }, + { + "epoch": 0.5800132504879403, + "grad_norm": 1.5643138885498047, + "learning_rate": 7.909727992372533e-05, + "loss": 1.0939, + "step": 16196 + }, + { + "epoch": 0.5800490626175085, + "grad_norm": 1.2582428455352783, + "learning_rate": 7.908593727562954e-05, + "loss": 1.0083, + "step": 16197 + }, + { + "epoch": 0.5800848747470768, + "grad_norm": 1.3980600833892822, + "learning_rate": 7.907459490890551e-05, + "loss": 0.9629, + "step": 16198 + }, + { + "epoch": 0.5801206868766451, + "grad_norm": 1.2810264825820923, + "learning_rate": 7.906325282370579e-05, + "loss": 1.2117, + "step": 16199 + }, + { + "epoch": 0.5801564990062134, + "grad_norm": 1.6594032049179077, + "learning_rate": 7.905191102018302e-05, + "loss": 1.0752, + "step": 16200 + }, + { + "epoch": 0.5801923111357817, + "grad_norm": 1.6338378190994263, + "learning_rate": 7.904056949848975e-05, + "loss": 1.1085, + "step": 16201 + }, + { + "epoch": 0.58022812326535, + "grad_norm": 1.6635088920593262, + "learning_rate": 7.90292282587786e-05, + "loss": 1.1086, + "step": 16202 + }, + { + "epoch": 0.5802639353949183, + "grad_norm": 1.4075121879577637, + "learning_rate": 7.901788730120214e-05, + "loss": 0.8672, + "step": 16203 + }, + { + "epoch": 0.5802997475244865, + "grad_norm": 1.6078139543533325, + "learning_rate": 7.90065466259129e-05, + "loss": 1.0643, + "step": 16204 + }, + { + "epoch": 0.5803355596540548, + "grad_norm": 1.5319492816925049, + "learning_rate": 7.899520623306353e-05, + "loss": 0.9728, + "step": 16205 + }, + { + "epoch": 0.5803713717836231, + "grad_norm": 1.7444231510162354, + "learning_rate": 7.898386612280654e-05, + "loss": 1.0026, + "step": 16206 + }, + { + "epoch": 0.5804071839131913, + "grad_norm": 1.4153860807418823, + "learning_rate": 7.897252629529455e-05, + "loss": 1.1314, + "step": 16207 + }, + { + "epoch": 0.5804429960427597, + "grad_norm": 1.732682466506958, + "learning_rate": 7.896118675068007e-05, + "loss": 1.1595, + "step": 16208 + }, + { + "epoch": 0.580478808172328, + "grad_norm": 1.5258427858352661, + "learning_rate": 7.894984748911572e-05, + "loss": 1.0826, + "step": 16209 + }, + { + "epoch": 0.5805146203018963, + "grad_norm": 1.9799833297729492, + "learning_rate": 7.893850851075398e-05, + "loss": 1.0977, + "step": 16210 + }, + { + "epoch": 0.5805504324314645, + "grad_norm": 2.227360963821411, + "learning_rate": 7.892716981574747e-05, + "loss": 1.2352, + "step": 16211 + }, + { + "epoch": 0.5805862445610328, + "grad_norm": 1.383536696434021, + "learning_rate": 7.89158314042487e-05, + "loss": 1.1502, + "step": 16212 + }, + { + "epoch": 0.5806220566906011, + "grad_norm": 1.5642067193984985, + "learning_rate": 7.890449327641021e-05, + "loss": 1.3177, + "step": 16213 + }, + { + "epoch": 0.5806578688201693, + "grad_norm": 1.5186715126037598, + "learning_rate": 7.889315543238457e-05, + "loss": 1.2498, + "step": 16214 + }, + { + "epoch": 0.5806936809497377, + "grad_norm": 1.4101649522781372, + "learning_rate": 7.888181787232427e-05, + "loss": 1.2024, + "step": 16215 + }, + { + "epoch": 0.580729493079306, + "grad_norm": 1.615905523300171, + "learning_rate": 7.88704805963819e-05, + "loss": 1.1479, + "step": 16216 + }, + { + "epoch": 0.5807653052088743, + "grad_norm": 1.61942720413208, + "learning_rate": 7.885914360470992e-05, + "loss": 1.1628, + "step": 16217 + }, + { + "epoch": 0.5808011173384425, + "grad_norm": 1.6427838802337646, + "learning_rate": 7.884780689746094e-05, + "loss": 1.1449, + "step": 16218 + }, + { + "epoch": 0.5808369294680108, + "grad_norm": 1.4408960342407227, + "learning_rate": 7.88364704747874e-05, + "loss": 1.1256, + "step": 16219 + }, + { + "epoch": 0.5808727415975791, + "grad_norm": 1.4035592079162598, + "learning_rate": 7.882513433684188e-05, + "loss": 1.1286, + "step": 16220 + }, + { + "epoch": 0.5809085537271473, + "grad_norm": 1.7351101636886597, + "learning_rate": 7.881379848377685e-05, + "loss": 1.2078, + "step": 16221 + }, + { + "epoch": 0.5809443658567157, + "grad_norm": 1.7563477754592896, + "learning_rate": 7.880246291574482e-05, + "loss": 0.9939, + "step": 16222 + }, + { + "epoch": 0.580980177986284, + "grad_norm": 1.566747784614563, + "learning_rate": 7.879112763289833e-05, + "loss": 1.0863, + "step": 16223 + }, + { + "epoch": 0.5810159901158523, + "grad_norm": 1.4956011772155762, + "learning_rate": 7.877979263538983e-05, + "loss": 1.236, + "step": 16224 + }, + { + "epoch": 0.5810518022454205, + "grad_norm": 1.3045196533203125, + "learning_rate": 7.876845792337189e-05, + "loss": 1.0302, + "step": 16225 + }, + { + "epoch": 0.5810876143749888, + "grad_norm": 1.8649146556854248, + "learning_rate": 7.875712349699692e-05, + "loss": 0.9123, + "step": 16226 + }, + { + "epoch": 0.5811234265045571, + "grad_norm": 1.8529996871948242, + "learning_rate": 7.874578935641748e-05, + "loss": 1.1199, + "step": 16227 + }, + { + "epoch": 0.5811592386341253, + "grad_norm": 1.4653624296188354, + "learning_rate": 7.873445550178601e-05, + "loss": 1.0351, + "step": 16228 + }, + { + "epoch": 0.5811950507636937, + "grad_norm": 1.7991926670074463, + "learning_rate": 7.872312193325502e-05, + "loss": 0.9521, + "step": 16229 + }, + { + "epoch": 0.581230862893262, + "grad_norm": 1.668229103088379, + "learning_rate": 7.871178865097699e-05, + "loss": 1.1699, + "step": 16230 + }, + { + "epoch": 0.5812666750228302, + "grad_norm": 1.5644352436065674, + "learning_rate": 7.870045565510436e-05, + "loss": 1.0594, + "step": 16231 + }, + { + "epoch": 0.5813024871523985, + "grad_norm": 1.5447109937667847, + "learning_rate": 7.868912294578965e-05, + "loss": 0.9239, + "step": 16232 + }, + { + "epoch": 0.5813382992819668, + "grad_norm": 1.5226558446884155, + "learning_rate": 7.867779052318528e-05, + "loss": 1.2583, + "step": 16233 + }, + { + "epoch": 0.581374111411535, + "grad_norm": 1.5740386247634888, + "learning_rate": 7.866645838744375e-05, + "loss": 1.2214, + "step": 16234 + }, + { + "epoch": 0.5814099235411033, + "grad_norm": 2.7467257976531982, + "learning_rate": 7.865512653871749e-05, + "loss": 1.0515, + "step": 16235 + }, + { + "epoch": 0.5814457356706717, + "grad_norm": 1.8122968673706055, + "learning_rate": 7.864379497715898e-05, + "loss": 1.2159, + "step": 16236 + }, + { + "epoch": 0.58148154780024, + "grad_norm": 1.204953908920288, + "learning_rate": 7.863246370292065e-05, + "loss": 0.9554, + "step": 16237 + }, + { + "epoch": 0.5815173599298082, + "grad_norm": 1.3933370113372803, + "learning_rate": 7.862113271615499e-05, + "loss": 0.9781, + "step": 16238 + }, + { + "epoch": 0.5815531720593765, + "grad_norm": 1.2828501462936401, + "learning_rate": 7.860980201701441e-05, + "loss": 1.0399, + "step": 16239 + }, + { + "epoch": 0.5815889841889448, + "grad_norm": 1.4199169874191284, + "learning_rate": 7.859847160565131e-05, + "loss": 1.0499, + "step": 16240 + }, + { + "epoch": 0.581624796318513, + "grad_norm": 1.7954286336898804, + "learning_rate": 7.858714148221822e-05, + "loss": 1.2605, + "step": 16241 + }, + { + "epoch": 0.5816606084480813, + "grad_norm": 1.3185869455337524, + "learning_rate": 7.857581164686744e-05, + "loss": 1.3149, + "step": 16242 + }, + { + "epoch": 0.5816964205776497, + "grad_norm": 2.0654001235961914, + "learning_rate": 7.856448209975156e-05, + "loss": 1.2844, + "step": 16243 + }, + { + "epoch": 0.581732232707218, + "grad_norm": 1.5844101905822754, + "learning_rate": 7.855315284102288e-05, + "loss": 0.9056, + "step": 16244 + }, + { + "epoch": 0.5817680448367862, + "grad_norm": 1.578384518623352, + "learning_rate": 7.854182387083389e-05, + "loss": 1.0731, + "step": 16245 + }, + { + "epoch": 0.5818038569663545, + "grad_norm": 1.793883204460144, + "learning_rate": 7.8530495189337e-05, + "loss": 1.3294, + "step": 16246 + }, + { + "epoch": 0.5818396690959228, + "grad_norm": 1.4371942281723022, + "learning_rate": 7.851916679668454e-05, + "loss": 1.0881, + "step": 16247 + }, + { + "epoch": 0.581875481225491, + "grad_norm": 1.6358712911605835, + "learning_rate": 7.850783869302905e-05, + "loss": 1.1382, + "step": 16248 + }, + { + "epoch": 0.5819112933550593, + "grad_norm": 1.508277416229248, + "learning_rate": 7.849651087852278e-05, + "loss": 1.0516, + "step": 16249 + }, + { + "epoch": 0.5819471054846277, + "grad_norm": 1.5256155729293823, + "learning_rate": 7.848518335331832e-05, + "loss": 1.2132, + "step": 16250 + }, + { + "epoch": 0.581982917614196, + "grad_norm": 1.3500776290893555, + "learning_rate": 7.847385611756788e-05, + "loss": 1.0153, + "step": 16251 + }, + { + "epoch": 0.5820187297437642, + "grad_norm": 1.5089564323425293, + "learning_rate": 7.8462529171424e-05, + "loss": 0.8731, + "step": 16252 + }, + { + "epoch": 0.5820545418733325, + "grad_norm": 1.6431623697280884, + "learning_rate": 7.845120251503896e-05, + "loss": 1.339, + "step": 16253 + }, + { + "epoch": 0.5820903540029008, + "grad_norm": 1.7186222076416016, + "learning_rate": 7.843987614856525e-05, + "loss": 1.2453, + "step": 16254 + }, + { + "epoch": 0.582126166132469, + "grad_norm": 1.3663320541381836, + "learning_rate": 7.842855007215517e-05, + "loss": 0.8464, + "step": 16255 + }, + { + "epoch": 0.5821619782620373, + "grad_norm": 1.3855562210083008, + "learning_rate": 7.841722428596109e-05, + "loss": 1.1368, + "step": 16256 + }, + { + "epoch": 0.5821977903916057, + "grad_norm": 1.6392287015914917, + "learning_rate": 7.840589879013548e-05, + "loss": 1.2449, + "step": 16257 + }, + { + "epoch": 0.582233602521174, + "grad_norm": 1.4389864206314087, + "learning_rate": 7.839457358483057e-05, + "loss": 1.0522, + "step": 16258 + }, + { + "epoch": 0.5822694146507422, + "grad_norm": 1.4745234251022339, + "learning_rate": 7.838324867019888e-05, + "loss": 0.9977, + "step": 16259 + }, + { + "epoch": 0.5823052267803105, + "grad_norm": 1.451280117034912, + "learning_rate": 7.837192404639264e-05, + "loss": 1.0918, + "step": 16260 + }, + { + "epoch": 0.5823410389098788, + "grad_norm": 1.5961008071899414, + "learning_rate": 7.83605997135643e-05, + "loss": 1.1489, + "step": 16261 + }, + { + "epoch": 0.582376851039447, + "grad_norm": 1.6647436618804932, + "learning_rate": 7.834927567186614e-05, + "loss": 1.0473, + "step": 16262 + }, + { + "epoch": 0.5824126631690153, + "grad_norm": 1.4847846031188965, + "learning_rate": 7.833795192145062e-05, + "loss": 1.1126, + "step": 16263 + }, + { + "epoch": 0.5824484752985837, + "grad_norm": 1.586654543876648, + "learning_rate": 7.832662846246997e-05, + "loss": 1.1274, + "step": 16264 + }, + { + "epoch": 0.582484287428152, + "grad_norm": 1.3837648630142212, + "learning_rate": 7.831530529507656e-05, + "loss": 1.0507, + "step": 16265 + }, + { + "epoch": 0.5825200995577202, + "grad_norm": 1.8549635410308838, + "learning_rate": 7.830398241942278e-05, + "loss": 1.0978, + "step": 16266 + }, + { + "epoch": 0.5825559116872885, + "grad_norm": 1.6929441690444946, + "learning_rate": 7.829265983566088e-05, + "loss": 1.0668, + "step": 16267 + }, + { + "epoch": 0.5825917238168568, + "grad_norm": 1.330588459968567, + "learning_rate": 7.82813375439433e-05, + "loss": 1.1598, + "step": 16268 + }, + { + "epoch": 0.582627535946425, + "grad_norm": 2.3575119972229004, + "learning_rate": 7.827001554442224e-05, + "loss": 1.175, + "step": 16269 + }, + { + "epoch": 0.5826633480759933, + "grad_norm": 1.6143646240234375, + "learning_rate": 7.825869383725017e-05, + "loss": 1.084, + "step": 16270 + }, + { + "epoch": 0.5826991602055617, + "grad_norm": 1.277207374572754, + "learning_rate": 7.824737242257925e-05, + "loss": 1.149, + "step": 16271 + }, + { + "epoch": 0.5827349723351299, + "grad_norm": 1.959450602531433, + "learning_rate": 7.823605130056196e-05, + "loss": 1.2193, + "step": 16272 + }, + { + "epoch": 0.5827707844646982, + "grad_norm": 1.7212493419647217, + "learning_rate": 7.822473047135048e-05, + "loss": 1.0807, + "step": 16273 + }, + { + "epoch": 0.5828065965942665, + "grad_norm": 1.5534263849258423, + "learning_rate": 7.821340993509716e-05, + "loss": 1.1676, + "step": 16274 + }, + { + "epoch": 0.5828424087238347, + "grad_norm": 1.6398370265960693, + "learning_rate": 7.820208969195432e-05, + "loss": 1.149, + "step": 16275 + }, + { + "epoch": 0.582878220853403, + "grad_norm": 1.4506124258041382, + "learning_rate": 7.819076974207425e-05, + "loss": 1.217, + "step": 16276 + }, + { + "epoch": 0.5829140329829713, + "grad_norm": 1.4424200057983398, + "learning_rate": 7.817945008560923e-05, + "loss": 1.22, + "step": 16277 + }, + { + "epoch": 0.5829498451125397, + "grad_norm": 1.2914750576019287, + "learning_rate": 7.816813072271155e-05, + "loss": 0.9056, + "step": 16278 + }, + { + "epoch": 0.5829856572421079, + "grad_norm": 1.4945721626281738, + "learning_rate": 7.815681165353353e-05, + "loss": 0.9905, + "step": 16279 + }, + { + "epoch": 0.5830214693716762, + "grad_norm": 1.3609520196914673, + "learning_rate": 7.814549287822743e-05, + "loss": 1.3354, + "step": 16280 + }, + { + "epoch": 0.5830572815012445, + "grad_norm": 1.866223692893982, + "learning_rate": 7.813417439694553e-05, + "loss": 1.1183, + "step": 16281 + }, + { + "epoch": 0.5830930936308127, + "grad_norm": 1.6392391920089722, + "learning_rate": 7.812285620984012e-05, + "loss": 0.9216, + "step": 16282 + }, + { + "epoch": 0.583128905760381, + "grad_norm": 1.6932716369628906, + "learning_rate": 7.811153831706344e-05, + "loss": 1.1595, + "step": 16283 + }, + { + "epoch": 0.5831647178899493, + "grad_norm": 1.3069998025894165, + "learning_rate": 7.81002207187678e-05, + "loss": 1.1825, + "step": 16284 + }, + { + "epoch": 0.5832005300195177, + "grad_norm": 1.5720137357711792, + "learning_rate": 7.808890341510542e-05, + "loss": 1.089, + "step": 16285 + }, + { + "epoch": 0.5832363421490859, + "grad_norm": 2.0093538761138916, + "learning_rate": 7.80775864062286e-05, + "loss": 1.0411, + "step": 16286 + }, + { + "epoch": 0.5832721542786542, + "grad_norm": 1.4814075231552124, + "learning_rate": 7.806626969228955e-05, + "loss": 0.9769, + "step": 16287 + }, + { + "epoch": 0.5833079664082225, + "grad_norm": 1.5158705711364746, + "learning_rate": 7.805495327344058e-05, + "loss": 1.2604, + "step": 16288 + }, + { + "epoch": 0.5833437785377907, + "grad_norm": 1.3082401752471924, + "learning_rate": 7.804363714983387e-05, + "loss": 0.9706, + "step": 16289 + }, + { + "epoch": 0.583379590667359, + "grad_norm": 1.5427874326705933, + "learning_rate": 7.803232132162174e-05, + "loss": 1.2047, + "step": 16290 + }, + { + "epoch": 0.5834154027969273, + "grad_norm": 1.4106240272521973, + "learning_rate": 7.802100578895638e-05, + "loss": 1.1904, + "step": 16291 + }, + { + "epoch": 0.5834512149264957, + "grad_norm": 1.4940377473831177, + "learning_rate": 7.800969055199003e-05, + "loss": 1.1266, + "step": 16292 + }, + { + "epoch": 0.5834870270560639, + "grad_norm": 1.2310227155685425, + "learning_rate": 7.799837561087493e-05, + "loss": 0.9932, + "step": 16293 + }, + { + "epoch": 0.5835228391856322, + "grad_norm": 1.5278295278549194, + "learning_rate": 7.798706096576329e-05, + "loss": 1.1778, + "step": 16294 + }, + { + "epoch": 0.5835586513152005, + "grad_norm": 1.300963282585144, + "learning_rate": 7.797574661680737e-05, + "loss": 0.8761, + "step": 16295 + }, + { + "epoch": 0.5835944634447687, + "grad_norm": 1.3971140384674072, + "learning_rate": 7.796443256415935e-05, + "loss": 1.226, + "step": 16296 + }, + { + "epoch": 0.583630275574337, + "grad_norm": 1.5593593120574951, + "learning_rate": 7.79531188079715e-05, + "loss": 1.0731, + "step": 16297 + }, + { + "epoch": 0.5836660877039053, + "grad_norm": 1.7029342651367188, + "learning_rate": 7.794180534839597e-05, + "loss": 0.958, + "step": 16298 + }, + { + "epoch": 0.5837018998334736, + "grad_norm": 1.8596760034561157, + "learning_rate": 7.793049218558501e-05, + "loss": 1.0994, + "step": 16299 + }, + { + "epoch": 0.5837377119630419, + "grad_norm": 1.5808812379837036, + "learning_rate": 7.791917931969082e-05, + "loss": 1.2165, + "step": 16300 + }, + { + "epoch": 0.5837735240926102, + "grad_norm": 1.8265684843063354, + "learning_rate": 7.790786675086555e-05, + "loss": 1.222, + "step": 16301 + }, + { + "epoch": 0.5838093362221785, + "grad_norm": 1.652974009513855, + "learning_rate": 7.789655447926147e-05, + "loss": 1.2552, + "step": 16302 + }, + { + "epoch": 0.5838451483517467, + "grad_norm": 1.4716227054595947, + "learning_rate": 7.788524250503072e-05, + "loss": 1.19, + "step": 16303 + }, + { + "epoch": 0.583880960481315, + "grad_norm": 1.4432616233825684, + "learning_rate": 7.787393082832553e-05, + "loss": 1.1598, + "step": 16304 + }, + { + "epoch": 0.5839167726108833, + "grad_norm": 1.8494104146957397, + "learning_rate": 7.786261944929803e-05, + "loss": 1.1844, + "step": 16305 + }, + { + "epoch": 0.5839525847404516, + "grad_norm": 3.431018829345703, + "learning_rate": 7.785130836810045e-05, + "loss": 1.1052, + "step": 16306 + }, + { + "epoch": 0.5839883968700199, + "grad_norm": 1.6412378549575806, + "learning_rate": 7.783999758488492e-05, + "loss": 1.019, + "step": 16307 + }, + { + "epoch": 0.5840242089995882, + "grad_norm": 1.4380549192428589, + "learning_rate": 7.782868709980368e-05, + "loss": 1.1323, + "step": 16308 + }, + { + "epoch": 0.5840600211291564, + "grad_norm": 1.4788466691970825, + "learning_rate": 7.781737691300884e-05, + "loss": 1.2037, + "step": 16309 + }, + { + "epoch": 0.5840958332587247, + "grad_norm": 1.5595582723617554, + "learning_rate": 7.780606702465256e-05, + "loss": 1.1289, + "step": 16310 + }, + { + "epoch": 0.584131645388293, + "grad_norm": 1.4895504713058472, + "learning_rate": 7.779475743488705e-05, + "loss": 1.1047, + "step": 16311 + }, + { + "epoch": 0.5841674575178613, + "grad_norm": 1.4273128509521484, + "learning_rate": 7.778344814386441e-05, + "loss": 1.0417, + "step": 16312 + }, + { + "epoch": 0.5842032696474296, + "grad_norm": 1.4697070121765137, + "learning_rate": 7.777213915173685e-05, + "loss": 1.1705, + "step": 16313 + }, + { + "epoch": 0.5842390817769979, + "grad_norm": 1.643593430519104, + "learning_rate": 7.776083045865645e-05, + "loss": 1.1601, + "step": 16314 + }, + { + "epoch": 0.5842748939065662, + "grad_norm": 1.6340702772140503, + "learning_rate": 7.774952206477542e-05, + "loss": 1.1794, + "step": 16315 + }, + { + "epoch": 0.5843107060361344, + "grad_norm": 1.599739909172058, + "learning_rate": 7.773821397024584e-05, + "loss": 1.2423, + "step": 16316 + }, + { + "epoch": 0.5843465181657027, + "grad_norm": 1.5832418203353882, + "learning_rate": 7.77269061752199e-05, + "loss": 1.1675, + "step": 16317 + }, + { + "epoch": 0.584382330295271, + "grad_norm": 1.6511191129684448, + "learning_rate": 7.77155986798497e-05, + "loss": 1.2025, + "step": 16318 + }, + { + "epoch": 0.5844181424248392, + "grad_norm": 1.4073106050491333, + "learning_rate": 7.770429148428736e-05, + "loss": 1.0013, + "step": 16319 + }, + { + "epoch": 0.5844539545544076, + "grad_norm": 1.4695957899093628, + "learning_rate": 7.769298458868504e-05, + "loss": 0.9789, + "step": 16320 + }, + { + "epoch": 0.5844897666839759, + "grad_norm": 1.5465013980865479, + "learning_rate": 7.768167799319481e-05, + "loss": 1.0544, + "step": 16321 + }, + { + "epoch": 0.5845255788135442, + "grad_norm": 1.699486255645752, + "learning_rate": 7.767037169796885e-05, + "loss": 1.1527, + "step": 16322 + }, + { + "epoch": 0.5845613909431124, + "grad_norm": 1.4463098049163818, + "learning_rate": 7.76590657031592e-05, + "loss": 1.225, + "step": 16323 + }, + { + "epoch": 0.5845972030726807, + "grad_norm": 1.666200876235962, + "learning_rate": 7.764776000891805e-05, + "loss": 0.9962, + "step": 16324 + }, + { + "epoch": 0.584633015202249, + "grad_norm": 1.3305416107177734, + "learning_rate": 7.763645461539741e-05, + "loss": 0.8468, + "step": 16325 + }, + { + "epoch": 0.5846688273318172, + "grad_norm": 1.494014024734497, + "learning_rate": 7.762514952274945e-05, + "loss": 0.9405, + "step": 16326 + }, + { + "epoch": 0.5847046394613856, + "grad_norm": 1.8549782037734985, + "learning_rate": 7.761384473112625e-05, + "loss": 1.1581, + "step": 16327 + }, + { + "epoch": 0.5847404515909539, + "grad_norm": 1.400206208229065, + "learning_rate": 7.760254024067986e-05, + "loss": 1.0463, + "step": 16328 + }, + { + "epoch": 0.5847762637205222, + "grad_norm": 1.9139339923858643, + "learning_rate": 7.759123605156243e-05, + "loss": 1.2449, + "step": 16329 + }, + { + "epoch": 0.5848120758500904, + "grad_norm": 1.337656855583191, + "learning_rate": 7.757993216392599e-05, + "loss": 1.1009, + "step": 16330 + }, + { + "epoch": 0.5848478879796587, + "grad_norm": 1.6581767797470093, + "learning_rate": 7.756862857792268e-05, + "loss": 1.3302, + "step": 16331 + }, + { + "epoch": 0.584883700109227, + "grad_norm": 1.4378429651260376, + "learning_rate": 7.755732529370449e-05, + "loss": 1.1775, + "step": 16332 + }, + { + "epoch": 0.5849195122387952, + "grad_norm": 1.8096048831939697, + "learning_rate": 7.754602231142359e-05, + "loss": 1.2263, + "step": 16333 + }, + { + "epoch": 0.5849553243683636, + "grad_norm": 1.4682468175888062, + "learning_rate": 7.753471963123196e-05, + "loss": 0.888, + "step": 16334 + }, + { + "epoch": 0.5849911364979319, + "grad_norm": 1.56658935546875, + "learning_rate": 7.752341725328171e-05, + "loss": 1.0133, + "step": 16335 + }, + { + "epoch": 0.5850269486275002, + "grad_norm": 1.3943458795547485, + "learning_rate": 7.751211517772491e-05, + "loss": 1.0978, + "step": 16336 + }, + { + "epoch": 0.5850627607570684, + "grad_norm": 1.4037548303604126, + "learning_rate": 7.750081340471355e-05, + "loss": 1.0306, + "step": 16337 + }, + { + "epoch": 0.5850985728866367, + "grad_norm": 1.1346642971038818, + "learning_rate": 7.748951193439977e-05, + "loss": 1.1186, + "step": 16338 + }, + { + "epoch": 0.585134385016205, + "grad_norm": 1.2809089422225952, + "learning_rate": 7.747821076693551e-05, + "loss": 1.0274, + "step": 16339 + }, + { + "epoch": 0.5851701971457732, + "grad_norm": 1.233031153678894, + "learning_rate": 7.746690990247291e-05, + "loss": 1.1202, + "step": 16340 + }, + { + "epoch": 0.5852060092753416, + "grad_norm": 1.5133507251739502, + "learning_rate": 7.745560934116398e-05, + "loss": 1.0644, + "step": 16341 + }, + { + "epoch": 0.5852418214049099, + "grad_norm": 1.513038158416748, + "learning_rate": 7.744430908316074e-05, + "loss": 1.135, + "step": 16342 + }, + { + "epoch": 0.5852776335344781, + "grad_norm": 1.6401368379592896, + "learning_rate": 7.743300912861525e-05, + "loss": 0.9588, + "step": 16343 + }, + { + "epoch": 0.5853134456640464, + "grad_norm": 1.629037618637085, + "learning_rate": 7.742170947767945e-05, + "loss": 1.0906, + "step": 16344 + }, + { + "epoch": 0.5853492577936147, + "grad_norm": 1.6235685348510742, + "learning_rate": 7.741041013050549e-05, + "loss": 1.0842, + "step": 16345 + }, + { + "epoch": 0.585385069923183, + "grad_norm": 1.4643471240997314, + "learning_rate": 7.739911108724527e-05, + "loss": 1.0348, + "step": 16346 + }, + { + "epoch": 0.5854208820527512, + "grad_norm": 1.2137075662612915, + "learning_rate": 7.73878123480509e-05, + "loss": 1.0089, + "step": 16347 + }, + { + "epoch": 0.5854566941823196, + "grad_norm": 1.5439608097076416, + "learning_rate": 7.73765139130743e-05, + "loss": 1.082, + "step": 16348 + }, + { + "epoch": 0.5854925063118879, + "grad_norm": 1.4141563177108765, + "learning_rate": 7.736521578246758e-05, + "loss": 1.1572, + "step": 16349 + }, + { + "epoch": 0.5855283184414561, + "grad_norm": 1.3762705326080322, + "learning_rate": 7.735391795638262e-05, + "loss": 0.9433, + "step": 16350 + }, + { + "epoch": 0.5855641305710244, + "grad_norm": 1.3783938884735107, + "learning_rate": 7.734262043497155e-05, + "loss": 1.2247, + "step": 16351 + }, + { + "epoch": 0.5855999427005927, + "grad_norm": 2.2933945655822754, + "learning_rate": 7.733132321838628e-05, + "loss": 1.1709, + "step": 16352 + }, + { + "epoch": 0.585635754830161, + "grad_norm": 1.5362390279769897, + "learning_rate": 7.732002630677878e-05, + "loss": 0.9818, + "step": 16353 + }, + { + "epoch": 0.5856715669597292, + "grad_norm": 1.4577016830444336, + "learning_rate": 7.73087297003011e-05, + "loss": 1.3432, + "step": 16354 + }, + { + "epoch": 0.5857073790892976, + "grad_norm": 1.5395904779434204, + "learning_rate": 7.729743339910515e-05, + "loss": 0.8771, + "step": 16355 + }, + { + "epoch": 0.5857431912188659, + "grad_norm": 1.4441596269607544, + "learning_rate": 7.728613740334304e-05, + "loss": 1.0612, + "step": 16356 + }, + { + "epoch": 0.5857790033484341, + "grad_norm": 1.9904698133468628, + "learning_rate": 7.727484171316655e-05, + "loss": 1.0585, + "step": 16357 + }, + { + "epoch": 0.5858148154780024, + "grad_norm": 1.6274032592773438, + "learning_rate": 7.726354632872783e-05, + "loss": 1.2157, + "step": 16358 + }, + { + "epoch": 0.5858506276075707, + "grad_norm": 1.3898348808288574, + "learning_rate": 7.72522512501787e-05, + "loss": 1.0455, + "step": 16359 + }, + { + "epoch": 0.5858864397371389, + "grad_norm": 1.5234483480453491, + "learning_rate": 7.724095647767125e-05, + "loss": 1.2697, + "step": 16360 + }, + { + "epoch": 0.5859222518667072, + "grad_norm": 1.6170040369033813, + "learning_rate": 7.722966201135736e-05, + "loss": 1.1471, + "step": 16361 + }, + { + "epoch": 0.5859580639962756, + "grad_norm": 1.2967547178268433, + "learning_rate": 7.721836785138896e-05, + "loss": 0.8138, + "step": 16362 + }, + { + "epoch": 0.5859938761258439, + "grad_norm": 1.1792638301849365, + "learning_rate": 7.720707399791807e-05, + "loss": 1.0783, + "step": 16363 + }, + { + "epoch": 0.5860296882554121, + "grad_norm": 1.3896304368972778, + "learning_rate": 7.719578045109657e-05, + "loss": 1.055, + "step": 16364 + }, + { + "epoch": 0.5860655003849804, + "grad_norm": 1.2465053796768188, + "learning_rate": 7.718448721107645e-05, + "loss": 0.8683, + "step": 16365 + }, + { + "epoch": 0.5861013125145487, + "grad_norm": 1.7362865209579468, + "learning_rate": 7.717319427800957e-05, + "loss": 1.1399, + "step": 16366 + }, + { + "epoch": 0.5861371246441169, + "grad_norm": 1.468375325202942, + "learning_rate": 7.7161901652048e-05, + "loss": 1.0071, + "step": 16367 + }, + { + "epoch": 0.5861729367736852, + "grad_norm": 1.3368752002716064, + "learning_rate": 7.715060933334351e-05, + "loss": 1.0221, + "step": 16368 + }, + { + "epoch": 0.5862087489032536, + "grad_norm": 2.0740180015563965, + "learning_rate": 7.713931732204816e-05, + "loss": 1.2448, + "step": 16369 + }, + { + "epoch": 0.5862445610328219, + "grad_norm": 1.949501872062683, + "learning_rate": 7.712802561831381e-05, + "loss": 1.1881, + "step": 16370 + }, + { + "epoch": 0.5862803731623901, + "grad_norm": 1.3111817836761475, + "learning_rate": 7.711673422229232e-05, + "loss": 1.1108, + "step": 16371 + }, + { + "epoch": 0.5863161852919584, + "grad_norm": 1.5339329242706299, + "learning_rate": 7.71054431341357e-05, + "loss": 1.0567, + "step": 16372 + }, + { + "epoch": 0.5863519974215267, + "grad_norm": 1.471100091934204, + "learning_rate": 7.709415235399577e-05, + "loss": 1.1321, + "step": 16373 + }, + { + "epoch": 0.5863878095510949, + "grad_norm": 1.4405372142791748, + "learning_rate": 7.708286188202451e-05, + "loss": 1.109, + "step": 16374 + }, + { + "epoch": 0.5864236216806632, + "grad_norm": 1.405034065246582, + "learning_rate": 7.707157171837374e-05, + "loss": 0.9503, + "step": 16375 + }, + { + "epoch": 0.5864594338102316, + "grad_norm": 2.0258705615997314, + "learning_rate": 7.706028186319543e-05, + "loss": 1.1003, + "step": 16376 + }, + { + "epoch": 0.5864952459397998, + "grad_norm": 1.4799989461898804, + "learning_rate": 7.704899231664143e-05, + "loss": 1.1716, + "step": 16377 + }, + { + "epoch": 0.5865310580693681, + "grad_norm": 1.6002997159957886, + "learning_rate": 7.703770307886364e-05, + "loss": 0.9311, + "step": 16378 + }, + { + "epoch": 0.5865668701989364, + "grad_norm": 1.6669716835021973, + "learning_rate": 7.702641415001394e-05, + "loss": 1.3291, + "step": 16379 + }, + { + "epoch": 0.5866026823285047, + "grad_norm": 1.819976568222046, + "learning_rate": 7.701512553024418e-05, + "loss": 1.059, + "step": 16380 + }, + { + "epoch": 0.5866384944580729, + "grad_norm": 1.5530710220336914, + "learning_rate": 7.700383721970628e-05, + "loss": 1.0555, + "step": 16381 + }, + { + "epoch": 0.5866743065876412, + "grad_norm": 1.8940783739089966, + "learning_rate": 7.699254921855206e-05, + "loss": 1.3183, + "step": 16382 + }, + { + "epoch": 0.5867101187172096, + "grad_norm": 1.7237741947174072, + "learning_rate": 7.698126152693345e-05, + "loss": 1.2258, + "step": 16383 + }, + { + "epoch": 0.5867459308467778, + "grad_norm": 2.4545674324035645, + "learning_rate": 7.696997414500223e-05, + "loss": 1.1236, + "step": 16384 + }, + { + "epoch": 0.5867817429763461, + "grad_norm": 2.5279500484466553, + "learning_rate": 7.695868707291034e-05, + "loss": 1.1286, + "step": 16385 + }, + { + "epoch": 0.5868175551059144, + "grad_norm": 1.3338446617126465, + "learning_rate": 7.694740031080957e-05, + "loss": 0.9579, + "step": 16386 + }, + { + "epoch": 0.5868533672354826, + "grad_norm": 1.716612696647644, + "learning_rate": 7.693611385885181e-05, + "loss": 1.1466, + "step": 16387 + }, + { + "epoch": 0.5868891793650509, + "grad_norm": 1.474052906036377, + "learning_rate": 7.69248277171889e-05, + "loss": 1.248, + "step": 16388 + }, + { + "epoch": 0.5869249914946192, + "grad_norm": 1.948604702949524, + "learning_rate": 7.691354188597263e-05, + "loss": 1.3406, + "step": 16389 + }, + { + "epoch": 0.5869608036241876, + "grad_norm": 1.2939046621322632, + "learning_rate": 7.69022563653549e-05, + "loss": 0.9927, + "step": 16390 + }, + { + "epoch": 0.5869966157537558, + "grad_norm": 1.5282903909683228, + "learning_rate": 7.689097115548751e-05, + "loss": 1.2468, + "step": 16391 + }, + { + "epoch": 0.5870324278833241, + "grad_norm": 1.3388323783874512, + "learning_rate": 7.68796862565223e-05, + "loss": 1.0869, + "step": 16392 + }, + { + "epoch": 0.5870682400128924, + "grad_norm": 1.3795522451400757, + "learning_rate": 7.686840166861106e-05, + "loss": 1.0252, + "step": 16393 + }, + { + "epoch": 0.5871040521424606, + "grad_norm": 1.382205843925476, + "learning_rate": 7.685711739190568e-05, + "loss": 1.102, + "step": 16394 + }, + { + "epoch": 0.5871398642720289, + "grad_norm": 1.6149733066558838, + "learning_rate": 7.684583342655791e-05, + "loss": 1.2396, + "step": 16395 + }, + { + "epoch": 0.5871756764015972, + "grad_norm": 1.4417389631271362, + "learning_rate": 7.68345497727196e-05, + "loss": 1.106, + "step": 16396 + }, + { + "epoch": 0.5872114885311656, + "grad_norm": 1.2772547006607056, + "learning_rate": 7.682326643054254e-05, + "loss": 1.2242, + "step": 16397 + }, + { + "epoch": 0.5872473006607338, + "grad_norm": 1.6415934562683105, + "learning_rate": 7.681198340017852e-05, + "loss": 1.0474, + "step": 16398 + }, + { + "epoch": 0.5872831127903021, + "grad_norm": 1.638211727142334, + "learning_rate": 7.680070068177936e-05, + "loss": 0.9753, + "step": 16399 + }, + { + "epoch": 0.5873189249198704, + "grad_norm": 1.477156639099121, + "learning_rate": 7.678941827549683e-05, + "loss": 1.1464, + "step": 16400 + }, + { + "epoch": 0.5873547370494386, + "grad_norm": 1.3640670776367188, + "learning_rate": 7.677813618148276e-05, + "loss": 1.1185, + "step": 16401 + }, + { + "epoch": 0.5873905491790069, + "grad_norm": 1.8076980113983154, + "learning_rate": 7.67668543998889e-05, + "loss": 1.2541, + "step": 16402 + }, + { + "epoch": 0.5874263613085752, + "grad_norm": 1.32332444190979, + "learning_rate": 7.675557293086706e-05, + "loss": 1.0837, + "step": 16403 + }, + { + "epoch": 0.5874621734381436, + "grad_norm": 1.6109614372253418, + "learning_rate": 7.674429177456899e-05, + "loss": 1.2529, + "step": 16404 + }, + { + "epoch": 0.5874979855677118, + "grad_norm": 1.344956874847412, + "learning_rate": 7.673301093114649e-05, + "loss": 0.9383, + "step": 16405 + }, + { + "epoch": 0.5875337976972801, + "grad_norm": 1.4908525943756104, + "learning_rate": 7.672173040075131e-05, + "loss": 1.1161, + "step": 16406 + }, + { + "epoch": 0.5875696098268484, + "grad_norm": 1.4589706659317017, + "learning_rate": 7.671045018353521e-05, + "loss": 0.9589, + "step": 16407 + }, + { + "epoch": 0.5876054219564166, + "grad_norm": 1.6553797721862793, + "learning_rate": 7.669917027964998e-05, + "loss": 1.0347, + "step": 16408 + }, + { + "epoch": 0.5876412340859849, + "grad_norm": 1.6489442586898804, + "learning_rate": 7.668789068924734e-05, + "loss": 1.1666, + "step": 16409 + }, + { + "epoch": 0.5876770462155532, + "grad_norm": 1.3982115983963013, + "learning_rate": 7.667661141247907e-05, + "loss": 0.9785, + "step": 16410 + }, + { + "epoch": 0.5877128583451215, + "grad_norm": 1.628912329673767, + "learning_rate": 7.66653324494969e-05, + "loss": 1.2233, + "step": 16411 + }, + { + "epoch": 0.5877486704746898, + "grad_norm": 1.5533204078674316, + "learning_rate": 7.665405380045258e-05, + "loss": 0.9799, + "step": 16412 + }, + { + "epoch": 0.5877844826042581, + "grad_norm": 1.6855796575546265, + "learning_rate": 7.664277546549786e-05, + "loss": 1.3156, + "step": 16413 + }, + { + "epoch": 0.5878202947338264, + "grad_norm": 2.087148427963257, + "learning_rate": 7.663149744478448e-05, + "loss": 1.2886, + "step": 16414 + }, + { + "epoch": 0.5878561068633946, + "grad_norm": 1.4130611419677734, + "learning_rate": 7.662021973846415e-05, + "loss": 0.9673, + "step": 16415 + }, + { + "epoch": 0.5878919189929629, + "grad_norm": 1.6957001686096191, + "learning_rate": 7.660894234668859e-05, + "loss": 1.2004, + "step": 16416 + }, + { + "epoch": 0.5879277311225312, + "grad_norm": 1.4537783861160278, + "learning_rate": 7.659766526960957e-05, + "loss": 1.163, + "step": 16417 + }, + { + "epoch": 0.5879635432520995, + "grad_norm": 1.4287279844284058, + "learning_rate": 7.658638850737874e-05, + "loss": 0.8737, + "step": 16418 + }, + { + "epoch": 0.5879993553816678, + "grad_norm": 1.4421789646148682, + "learning_rate": 7.657511206014788e-05, + "loss": 1.1869, + "step": 16419 + }, + { + "epoch": 0.5880351675112361, + "grad_norm": 1.445476770401001, + "learning_rate": 7.656383592806865e-05, + "loss": 1.0692, + "step": 16420 + }, + { + "epoch": 0.5880709796408043, + "grad_norm": 1.8680171966552734, + "learning_rate": 7.655256011129279e-05, + "loss": 1.0873, + "step": 16421 + }, + { + "epoch": 0.5881067917703726, + "grad_norm": 1.569720983505249, + "learning_rate": 7.654128460997198e-05, + "loss": 1.0873, + "step": 16422 + }, + { + "epoch": 0.5881426038999409, + "grad_norm": 2.349985122680664, + "learning_rate": 7.653000942425794e-05, + "loss": 1.0628, + "step": 16423 + }, + { + "epoch": 0.5881784160295092, + "grad_norm": 1.6759352684020996, + "learning_rate": 7.651873455430237e-05, + "loss": 1.2371, + "step": 16424 + }, + { + "epoch": 0.5882142281590775, + "grad_norm": 1.653738021850586, + "learning_rate": 7.650746000025688e-05, + "loss": 1.2312, + "step": 16425 + }, + { + "epoch": 0.5882500402886458, + "grad_norm": 2.2893853187561035, + "learning_rate": 7.649618576227325e-05, + "loss": 1.2701, + "step": 16426 + }, + { + "epoch": 0.5882858524182141, + "grad_norm": 1.4468097686767578, + "learning_rate": 7.648491184050311e-05, + "loss": 1.1728, + "step": 16427 + }, + { + "epoch": 0.5883216645477823, + "grad_norm": 1.2771373987197876, + "learning_rate": 7.647363823509815e-05, + "loss": 0.8389, + "step": 16428 + }, + { + "epoch": 0.5883574766773506, + "grad_norm": 1.5770896673202515, + "learning_rate": 7.646236494621004e-05, + "loss": 1.165, + "step": 16429 + }, + { + "epoch": 0.5883932888069189, + "grad_norm": 1.447217345237732, + "learning_rate": 7.645109197399047e-05, + "loss": 1.0272, + "step": 16430 + }, + { + "epoch": 0.5884291009364871, + "grad_norm": 1.6045366525650024, + "learning_rate": 7.643981931859104e-05, + "loss": 1.0451, + "step": 16431 + }, + { + "epoch": 0.5884649130660555, + "grad_norm": 1.5133663415908813, + "learning_rate": 7.642854698016348e-05, + "loss": 0.9729, + "step": 16432 + }, + { + "epoch": 0.5885007251956238, + "grad_norm": 1.4489924907684326, + "learning_rate": 7.641727495885944e-05, + "loss": 1.0319, + "step": 16433 + }, + { + "epoch": 0.5885365373251921, + "grad_norm": 2.0996851921081543, + "learning_rate": 7.640600325483049e-05, + "loss": 1.1001, + "step": 16434 + }, + { + "epoch": 0.5885723494547603, + "grad_norm": 1.5185215473175049, + "learning_rate": 7.639473186822839e-05, + "loss": 1.1927, + "step": 16435 + }, + { + "epoch": 0.5886081615843286, + "grad_norm": 1.4498940706253052, + "learning_rate": 7.638346079920466e-05, + "loss": 1.2878, + "step": 16436 + }, + { + "epoch": 0.5886439737138969, + "grad_norm": 1.5447698831558228, + "learning_rate": 7.637219004791106e-05, + "loss": 1.133, + "step": 16437 + }, + { + "epoch": 0.5886797858434651, + "grad_norm": 1.2874577045440674, + "learning_rate": 7.636091961449911e-05, + "loss": 1.1369, + "step": 16438 + }, + { + "epoch": 0.5887155979730334, + "grad_norm": 1.7319135665893555, + "learning_rate": 7.634964949912054e-05, + "loss": 1.2361, + "step": 16439 + }, + { + "epoch": 0.5887514101026018, + "grad_norm": 1.7135026454925537, + "learning_rate": 7.633837970192694e-05, + "loss": 1.1913, + "step": 16440 + }, + { + "epoch": 0.5887872222321701, + "grad_norm": 1.704420566558838, + "learning_rate": 7.632711022306985e-05, + "loss": 1.0524, + "step": 16441 + }, + { + "epoch": 0.5888230343617383, + "grad_norm": 1.544579029083252, + "learning_rate": 7.631584106270103e-05, + "loss": 1.0755, + "step": 16442 + }, + { + "epoch": 0.5888588464913066, + "grad_norm": 1.4825361967086792, + "learning_rate": 7.630457222097196e-05, + "loss": 1.0713, + "step": 16443 + }, + { + "epoch": 0.5888946586208749, + "grad_norm": 1.4225976467132568, + "learning_rate": 7.629330369803435e-05, + "loss": 1.0231, + "step": 16444 + }, + { + "epoch": 0.5889304707504431, + "grad_norm": 1.4592609405517578, + "learning_rate": 7.628203549403971e-05, + "loss": 1.2285, + "step": 16445 + }, + { + "epoch": 0.5889662828800114, + "grad_norm": 2.54262113571167, + "learning_rate": 7.627076760913976e-05, + "loss": 1.0625, + "step": 16446 + }, + { + "epoch": 0.5890020950095798, + "grad_norm": 1.3013726472854614, + "learning_rate": 7.625950004348595e-05, + "loss": 1.1046, + "step": 16447 + }, + { + "epoch": 0.589037907139148, + "grad_norm": 1.6431635618209839, + "learning_rate": 7.624823279723001e-05, + "loss": 1.0208, + "step": 16448 + }, + { + "epoch": 0.5890737192687163, + "grad_norm": 1.7138049602508545, + "learning_rate": 7.623696587052343e-05, + "loss": 1.2127, + "step": 16449 + }, + { + "epoch": 0.5891095313982846, + "grad_norm": 2.1448330879211426, + "learning_rate": 7.622569926351781e-05, + "loss": 1.0652, + "step": 16450 + }, + { + "epoch": 0.5891453435278529, + "grad_norm": 1.462355613708496, + "learning_rate": 7.621443297636478e-05, + "loss": 1.1266, + "step": 16451 + }, + { + "epoch": 0.5891811556574211, + "grad_norm": 1.3384770154953003, + "learning_rate": 7.62031670092158e-05, + "loss": 1.0486, + "step": 16452 + }, + { + "epoch": 0.5892169677869894, + "grad_norm": 1.4144552946090698, + "learning_rate": 7.619190136222259e-05, + "loss": 1.0479, + "step": 16453 + }, + { + "epoch": 0.5892527799165578, + "grad_norm": 2.0976130962371826, + "learning_rate": 7.618063603553655e-05, + "loss": 1.0925, + "step": 16454 + }, + { + "epoch": 0.589288592046126, + "grad_norm": 1.362608551979065, + "learning_rate": 7.616937102930942e-05, + "loss": 1.0508, + "step": 16455 + }, + { + "epoch": 0.5893244041756943, + "grad_norm": 1.7794855833053589, + "learning_rate": 7.61581063436926e-05, + "loss": 1.0075, + "step": 16456 + }, + { + "epoch": 0.5893602163052626, + "grad_norm": 1.4570624828338623, + "learning_rate": 7.614684197883775e-05, + "loss": 0.9952, + "step": 16457 + }, + { + "epoch": 0.5893960284348309, + "grad_norm": 1.9963051080703735, + "learning_rate": 7.613557793489637e-05, + "loss": 1.0616, + "step": 16458 + }, + { + "epoch": 0.5894318405643991, + "grad_norm": 1.4518359899520874, + "learning_rate": 7.612431421201996e-05, + "loss": 1.0638, + "step": 16459 + }, + { + "epoch": 0.5894676526939674, + "grad_norm": 1.2913861274719238, + "learning_rate": 7.611305081036015e-05, + "loss": 0.8898, + "step": 16460 + }, + { + "epoch": 0.5895034648235358, + "grad_norm": 1.396765947341919, + "learning_rate": 7.61017877300684e-05, + "loss": 1.0745, + "step": 16461 + }, + { + "epoch": 0.589539276953104, + "grad_norm": 2.2976233959198, + "learning_rate": 7.609052497129629e-05, + "loss": 1.1262, + "step": 16462 + }, + { + "epoch": 0.5895750890826723, + "grad_norm": 1.7043719291687012, + "learning_rate": 7.607926253419531e-05, + "loss": 1.1923, + "step": 16463 + }, + { + "epoch": 0.5896109012122406, + "grad_norm": 1.4271223545074463, + "learning_rate": 7.606800041891701e-05, + "loss": 1.1433, + "step": 16464 + }, + { + "epoch": 0.5896467133418088, + "grad_norm": 1.5332694053649902, + "learning_rate": 7.605673862561284e-05, + "loss": 1.1501, + "step": 16465 + }, + { + "epoch": 0.5896825254713771, + "grad_norm": 1.6435952186584473, + "learning_rate": 7.604547715443445e-05, + "loss": 1.2137, + "step": 16466 + }, + { + "epoch": 0.5897183376009454, + "grad_norm": 1.218113899230957, + "learning_rate": 7.603421600553324e-05, + "loss": 1.2457, + "step": 16467 + }, + { + "epoch": 0.5897541497305138, + "grad_norm": 1.4541798830032349, + "learning_rate": 7.602295517906072e-05, + "loss": 1.1243, + "step": 16468 + }, + { + "epoch": 0.589789961860082, + "grad_norm": 1.593093752861023, + "learning_rate": 7.601169467516844e-05, + "loss": 1.1466, + "step": 16469 + }, + { + "epoch": 0.5898257739896503, + "grad_norm": 1.4701374769210815, + "learning_rate": 7.600043449400782e-05, + "loss": 1.1975, + "step": 16470 + }, + { + "epoch": 0.5898615861192186, + "grad_norm": 1.5787769556045532, + "learning_rate": 7.598917463573044e-05, + "loss": 1.3128, + "step": 16471 + }, + { + "epoch": 0.5898973982487868, + "grad_norm": 1.5381447076797485, + "learning_rate": 7.59779151004877e-05, + "loss": 1.0326, + "step": 16472 + }, + { + "epoch": 0.5899332103783551, + "grad_norm": 1.6926288604736328, + "learning_rate": 7.596665588843117e-05, + "loss": 1.2828, + "step": 16473 + }, + { + "epoch": 0.5899690225079234, + "grad_norm": 1.6697108745574951, + "learning_rate": 7.595539699971225e-05, + "loss": 1.0954, + "step": 16474 + }, + { + "epoch": 0.5900048346374918, + "grad_norm": 1.4888554811477661, + "learning_rate": 7.594413843448248e-05, + "loss": 1.1206, + "step": 16475 + }, + { + "epoch": 0.59004064676706, + "grad_norm": 1.3029729127883911, + "learning_rate": 7.593288019289329e-05, + "loss": 1.0998, + "step": 16476 + }, + { + "epoch": 0.5900764588966283, + "grad_norm": 1.5667431354522705, + "learning_rate": 7.592162227509614e-05, + "loss": 1.1268, + "step": 16477 + }, + { + "epoch": 0.5901122710261966, + "grad_norm": 1.487425684928894, + "learning_rate": 7.591036468124252e-05, + "loss": 1.1, + "step": 16478 + }, + { + "epoch": 0.5901480831557648, + "grad_norm": 1.4879480600357056, + "learning_rate": 7.589910741148384e-05, + "loss": 0.9352, + "step": 16479 + }, + { + "epoch": 0.5901838952853331, + "grad_norm": 1.4413948059082031, + "learning_rate": 7.588785046597161e-05, + "loss": 1.0221, + "step": 16480 + }, + { + "epoch": 0.5902197074149014, + "grad_norm": 2.2404651641845703, + "learning_rate": 7.587659384485723e-05, + "loss": 1.2976, + "step": 16481 + }, + { + "epoch": 0.5902555195444698, + "grad_norm": 1.2131046056747437, + "learning_rate": 7.586533754829218e-05, + "loss": 1.0679, + "step": 16482 + }, + { + "epoch": 0.590291331674038, + "grad_norm": 1.4764553308486938, + "learning_rate": 7.585408157642786e-05, + "loss": 0.9808, + "step": 16483 + }, + { + "epoch": 0.5903271438036063, + "grad_norm": 1.5803104639053345, + "learning_rate": 7.584282592941574e-05, + "loss": 0.9361, + "step": 16484 + }, + { + "epoch": 0.5903629559331746, + "grad_norm": 1.6536612510681152, + "learning_rate": 7.583157060740727e-05, + "loss": 1.2823, + "step": 16485 + }, + { + "epoch": 0.5903987680627428, + "grad_norm": 1.5659980773925781, + "learning_rate": 7.582031561055378e-05, + "loss": 1.1019, + "step": 16486 + }, + { + "epoch": 0.5904345801923111, + "grad_norm": 1.3671454191207886, + "learning_rate": 7.58090609390068e-05, + "loss": 1.1723, + "step": 16487 + }, + { + "epoch": 0.5904703923218794, + "grad_norm": 1.8069736957550049, + "learning_rate": 7.579780659291768e-05, + "loss": 1.1046, + "step": 16488 + }, + { + "epoch": 0.5905062044514477, + "grad_norm": 1.3924870491027832, + "learning_rate": 7.578655257243786e-05, + "loss": 1.172, + "step": 16489 + }, + { + "epoch": 0.590542016581016, + "grad_norm": 1.7094745635986328, + "learning_rate": 7.577529887771873e-05, + "loss": 1.0872, + "step": 16490 + }, + { + "epoch": 0.5905778287105843, + "grad_norm": 1.5161018371582031, + "learning_rate": 7.576404550891172e-05, + "loss": 1.1739, + "step": 16491 + }, + { + "epoch": 0.5906136408401526, + "grad_norm": 1.6317038536071777, + "learning_rate": 7.57527924661682e-05, + "loss": 1.0421, + "step": 16492 + }, + { + "epoch": 0.5906494529697208, + "grad_norm": 1.7005877494812012, + "learning_rate": 7.57415397496396e-05, + "loss": 1.2039, + "step": 16493 + }, + { + "epoch": 0.5906852650992891, + "grad_norm": 1.4223685264587402, + "learning_rate": 7.57302873594773e-05, + "loss": 1.0488, + "step": 16494 + }, + { + "epoch": 0.5907210772288574, + "grad_norm": 1.5446349382400513, + "learning_rate": 7.571903529583265e-05, + "loss": 1.1558, + "step": 16495 + }, + { + "epoch": 0.5907568893584257, + "grad_norm": 1.3081774711608887, + "learning_rate": 7.570778355885708e-05, + "loss": 1.2212, + "step": 16496 + }, + { + "epoch": 0.590792701487994, + "grad_norm": 1.3895182609558105, + "learning_rate": 7.569653214870192e-05, + "loss": 1.1878, + "step": 16497 + }, + { + "epoch": 0.5908285136175623, + "grad_norm": 1.6893161535263062, + "learning_rate": 7.568528106551862e-05, + "loss": 0.9021, + "step": 16498 + }, + { + "epoch": 0.5908643257471305, + "grad_norm": 1.737159252166748, + "learning_rate": 7.567403030945844e-05, + "loss": 1.105, + "step": 16499 + }, + { + "epoch": 0.5909001378766988, + "grad_norm": 1.4903146028518677, + "learning_rate": 7.566277988067285e-05, + "loss": 1.0619, + "step": 16500 + }, + { + "epoch": 0.5909359500062671, + "grad_norm": 1.4458473920822144, + "learning_rate": 7.565152977931314e-05, + "loss": 1.1967, + "step": 16501 + }, + { + "epoch": 0.5909717621358354, + "grad_norm": 1.772502064704895, + "learning_rate": 7.56402800055307e-05, + "loss": 1.0936, + "step": 16502 + }, + { + "epoch": 0.5910075742654037, + "grad_norm": 1.5502326488494873, + "learning_rate": 7.562903055947688e-05, + "loss": 0.9723, + "step": 16503 + }, + { + "epoch": 0.591043386394972, + "grad_norm": 1.3715310096740723, + "learning_rate": 7.561778144130299e-05, + "loss": 1.1206, + "step": 16504 + }, + { + "epoch": 0.5910791985245403, + "grad_norm": 1.777698278427124, + "learning_rate": 7.560653265116042e-05, + "loss": 1.1156, + "step": 16505 + }, + { + "epoch": 0.5911150106541085, + "grad_norm": 1.5552839040756226, + "learning_rate": 7.559528418920048e-05, + "loss": 1.0721, + "step": 16506 + }, + { + "epoch": 0.5911508227836768, + "grad_norm": 1.697983980178833, + "learning_rate": 7.558403605557453e-05, + "loss": 1.074, + "step": 16507 + }, + { + "epoch": 0.5911866349132451, + "grad_norm": 1.5447120666503906, + "learning_rate": 7.557278825043385e-05, + "loss": 1.0185, + "step": 16508 + }, + { + "epoch": 0.5912224470428133, + "grad_norm": 1.4042669534683228, + "learning_rate": 7.556154077392982e-05, + "loss": 1.0193, + "step": 16509 + }, + { + "epoch": 0.5912582591723817, + "grad_norm": 1.4626420736312866, + "learning_rate": 7.555029362621371e-05, + "loss": 1.1013, + "step": 16510 + }, + { + "epoch": 0.59129407130195, + "grad_norm": 1.2087204456329346, + "learning_rate": 7.553904680743688e-05, + "loss": 1.0328, + "step": 16511 + }, + { + "epoch": 0.5913298834315183, + "grad_norm": 1.3369914293289185, + "learning_rate": 7.552780031775064e-05, + "loss": 1.0776, + "step": 16512 + }, + { + "epoch": 0.5913656955610865, + "grad_norm": 1.4269461631774902, + "learning_rate": 7.551655415730624e-05, + "loss": 1.2592, + "step": 16513 + }, + { + "epoch": 0.5914015076906548, + "grad_norm": 1.601727843284607, + "learning_rate": 7.550530832625505e-05, + "loss": 1.2242, + "step": 16514 + }, + { + "epoch": 0.5914373198202231, + "grad_norm": 2.4519026279449463, + "learning_rate": 7.549406282474833e-05, + "loss": 1.5358, + "step": 16515 + }, + { + "epoch": 0.5914731319497913, + "grad_norm": 1.488142490386963, + "learning_rate": 7.548281765293739e-05, + "loss": 1.2219, + "step": 16516 + }, + { + "epoch": 0.5915089440793597, + "grad_norm": 1.5203652381896973, + "learning_rate": 7.54715728109735e-05, + "loss": 0.9927, + "step": 16517 + }, + { + "epoch": 0.591544756208928, + "grad_norm": 1.6286001205444336, + "learning_rate": 7.546032829900797e-05, + "loss": 1.137, + "step": 16518 + }, + { + "epoch": 0.5915805683384963, + "grad_norm": 1.3674184083938599, + "learning_rate": 7.544908411719207e-05, + "loss": 1.0254, + "step": 16519 + }, + { + "epoch": 0.5916163804680645, + "grad_norm": 1.3812589645385742, + "learning_rate": 7.543784026567708e-05, + "loss": 1.1576, + "step": 16520 + }, + { + "epoch": 0.5916521925976328, + "grad_norm": 2.009861469268799, + "learning_rate": 7.542659674461429e-05, + "loss": 1.15, + "step": 16521 + }, + { + "epoch": 0.5916880047272011, + "grad_norm": 1.2723745107650757, + "learning_rate": 7.541535355415487e-05, + "loss": 1.1406, + "step": 16522 + }, + { + "epoch": 0.5917238168567693, + "grad_norm": 1.200943946838379, + "learning_rate": 7.540411069445021e-05, + "loss": 1.0728, + "step": 16523 + }, + { + "epoch": 0.5917596289863377, + "grad_norm": 1.3709125518798828, + "learning_rate": 7.53928681656515e-05, + "loss": 0.9337, + "step": 16524 + }, + { + "epoch": 0.591795441115906, + "grad_norm": 1.4159477949142456, + "learning_rate": 7.538162596791002e-05, + "loss": 1.3464, + "step": 16525 + }, + { + "epoch": 0.5918312532454743, + "grad_norm": 1.562605381011963, + "learning_rate": 7.537038410137698e-05, + "loss": 0.9838, + "step": 16526 + }, + { + "epoch": 0.5918670653750425, + "grad_norm": 1.5335460901260376, + "learning_rate": 7.535914256620368e-05, + "loss": 1.2238, + "step": 16527 + }, + { + "epoch": 0.5919028775046108, + "grad_norm": 2.1071224212646484, + "learning_rate": 7.534790136254132e-05, + "loss": 0.919, + "step": 16528 + }, + { + "epoch": 0.5919386896341791, + "grad_norm": 1.459559679031372, + "learning_rate": 7.533666049054115e-05, + "loss": 1.1276, + "step": 16529 + }, + { + "epoch": 0.5919745017637473, + "grad_norm": 1.5219274759292603, + "learning_rate": 7.532541995035444e-05, + "loss": 1.2742, + "step": 16530 + }, + { + "epoch": 0.5920103138933157, + "grad_norm": 1.7083683013916016, + "learning_rate": 7.53141797421323e-05, + "loss": 0.9637, + "step": 16531 + }, + { + "epoch": 0.592046126022884, + "grad_norm": 1.3881704807281494, + "learning_rate": 7.53029398660261e-05, + "loss": 1.1918, + "step": 16532 + }, + { + "epoch": 0.5920819381524522, + "grad_norm": 1.3630694150924683, + "learning_rate": 7.529170032218691e-05, + "loss": 1.1675, + "step": 16533 + }, + { + "epoch": 0.5921177502820205, + "grad_norm": 1.7746031284332275, + "learning_rate": 7.52804611107661e-05, + "loss": 1.2042, + "step": 16534 + }, + { + "epoch": 0.5921535624115888, + "grad_norm": 1.6976503133773804, + "learning_rate": 7.526922223191473e-05, + "loss": 1.295, + "step": 16535 + }, + { + "epoch": 0.592189374541157, + "grad_norm": 2.411592483520508, + "learning_rate": 7.525798368578412e-05, + "loss": 1.1975, + "step": 16536 + }, + { + "epoch": 0.5922251866707253, + "grad_norm": 1.449521541595459, + "learning_rate": 7.524674547252544e-05, + "loss": 1.0173, + "step": 16537 + }, + { + "epoch": 0.5922609988002937, + "grad_norm": 1.1906367540359497, + "learning_rate": 7.523550759228981e-05, + "loss": 1.0942, + "step": 16538 + }, + { + "epoch": 0.592296810929862, + "grad_norm": 1.3463809490203857, + "learning_rate": 7.522427004522855e-05, + "loss": 1.0781, + "step": 16539 + }, + { + "epoch": 0.5923326230594302, + "grad_norm": 2.970275402069092, + "learning_rate": 7.52130328314927e-05, + "loss": 1.2222, + "step": 16540 + }, + { + "epoch": 0.5923684351889985, + "grad_norm": 1.5858279466629028, + "learning_rate": 7.52017959512336e-05, + "loss": 1.0618, + "step": 16541 + }, + { + "epoch": 0.5924042473185668, + "grad_norm": 1.4226422309875488, + "learning_rate": 7.519055940460227e-05, + "loss": 0.9762, + "step": 16542 + }, + { + "epoch": 0.592440059448135, + "grad_norm": 1.3731257915496826, + "learning_rate": 7.517932319175003e-05, + "loss": 1.016, + "step": 16543 + }, + { + "epoch": 0.5924758715777033, + "grad_norm": 1.6529723405838013, + "learning_rate": 7.516808731282793e-05, + "loss": 1.2248, + "step": 16544 + }, + { + "epoch": 0.5925116837072717, + "grad_norm": 1.824446201324463, + "learning_rate": 7.515685176798723e-05, + "loss": 1.2982, + "step": 16545 + }, + { + "epoch": 0.59254749583684, + "grad_norm": 1.569880485534668, + "learning_rate": 7.514561655737904e-05, + "loss": 1.2799, + "step": 16546 + }, + { + "epoch": 0.5925833079664082, + "grad_norm": 1.5365476608276367, + "learning_rate": 7.513438168115449e-05, + "loss": 1.0204, + "step": 16547 + }, + { + "epoch": 0.5926191200959765, + "grad_norm": 1.813703179359436, + "learning_rate": 7.512314713946478e-05, + "loss": 1.244, + "step": 16548 + }, + { + "epoch": 0.5926549322255448, + "grad_norm": 2.0680086612701416, + "learning_rate": 7.5111912932461e-05, + "loss": 0.896, + "step": 16549 + }, + { + "epoch": 0.592690744355113, + "grad_norm": 3.903815507888794, + "learning_rate": 7.510067906029437e-05, + "loss": 1.3117, + "step": 16550 + }, + { + "epoch": 0.5927265564846813, + "grad_norm": 1.9635398387908936, + "learning_rate": 7.508944552311594e-05, + "loss": 1.0748, + "step": 16551 + }, + { + "epoch": 0.5927623686142497, + "grad_norm": 2.047942876815796, + "learning_rate": 7.507821232107695e-05, + "loss": 1.1559, + "step": 16552 + }, + { + "epoch": 0.592798180743818, + "grad_norm": 1.2524052858352661, + "learning_rate": 7.506697945432841e-05, + "loss": 1.2305, + "step": 16553 + }, + { + "epoch": 0.5928339928733862, + "grad_norm": 1.3465474843978882, + "learning_rate": 7.505574692302155e-05, + "loss": 1.0149, + "step": 16554 + }, + { + "epoch": 0.5928698050029545, + "grad_norm": 1.73092782497406, + "learning_rate": 7.504451472730743e-05, + "loss": 1.0591, + "step": 16555 + }, + { + "epoch": 0.5929056171325228, + "grad_norm": 1.3717879056930542, + "learning_rate": 7.503328286733715e-05, + "loss": 1.0888, + "step": 16556 + }, + { + "epoch": 0.592941429262091, + "grad_norm": 1.7002003192901611, + "learning_rate": 7.502205134326185e-05, + "loss": 1.0337, + "step": 16557 + }, + { + "epoch": 0.5929772413916593, + "grad_norm": 1.933418869972229, + "learning_rate": 7.501082015523263e-05, + "loss": 1.2994, + "step": 16558 + }, + { + "epoch": 0.5930130535212277, + "grad_norm": 2.3155393600463867, + "learning_rate": 7.499958930340061e-05, + "loss": 1.1793, + "step": 16559 + }, + { + "epoch": 0.593048865650796, + "grad_norm": 1.4647012948989868, + "learning_rate": 7.498835878791684e-05, + "loss": 1.2392, + "step": 16560 + }, + { + "epoch": 0.5930846777803642, + "grad_norm": 1.5616271495819092, + "learning_rate": 7.497712860893245e-05, + "loss": 0.9268, + "step": 16561 + }, + { + "epoch": 0.5931204899099325, + "grad_norm": 1.359679937362671, + "learning_rate": 7.49658987665985e-05, + "loss": 1.188, + "step": 16562 + }, + { + "epoch": 0.5931563020395008, + "grad_norm": 1.520154595375061, + "learning_rate": 7.495466926106614e-05, + "loss": 0.9937, + "step": 16563 + }, + { + "epoch": 0.593192114169069, + "grad_norm": 1.8116689920425415, + "learning_rate": 7.494344009248637e-05, + "loss": 1.1709, + "step": 16564 + }, + { + "epoch": 0.5932279262986373, + "grad_norm": 1.7655317783355713, + "learning_rate": 7.493221126101028e-05, + "loss": 1.2712, + "step": 16565 + }, + { + "epoch": 0.5932637384282057, + "grad_norm": 1.5348551273345947, + "learning_rate": 7.492098276678898e-05, + "loss": 1.0296, + "step": 16566 + }, + { + "epoch": 0.593299550557774, + "grad_norm": 1.3086992502212524, + "learning_rate": 7.490975460997348e-05, + "loss": 0.8862, + "step": 16567 + }, + { + "epoch": 0.5933353626873422, + "grad_norm": 1.3107188940048218, + "learning_rate": 7.489852679071488e-05, + "loss": 1.1145, + "step": 16568 + }, + { + "epoch": 0.5933711748169105, + "grad_norm": 1.5821149349212646, + "learning_rate": 7.488729930916421e-05, + "loss": 1.0431, + "step": 16569 + }, + { + "epoch": 0.5934069869464788, + "grad_norm": 1.5785022974014282, + "learning_rate": 7.487607216547255e-05, + "loss": 1.2571, + "step": 16570 + }, + { + "epoch": 0.593442799076047, + "grad_norm": 1.3384541273117065, + "learning_rate": 7.486484535979092e-05, + "loss": 1.1508, + "step": 16571 + }, + { + "epoch": 0.5934786112056153, + "grad_norm": 1.3617057800292969, + "learning_rate": 7.485361889227038e-05, + "loss": 1.0479, + "step": 16572 + }, + { + "epoch": 0.5935144233351837, + "grad_norm": 1.7522705793380737, + "learning_rate": 7.484239276306198e-05, + "loss": 1.1693, + "step": 16573 + }, + { + "epoch": 0.5935502354647519, + "grad_norm": 1.654096007347107, + "learning_rate": 7.483116697231671e-05, + "loss": 1.1024, + "step": 16574 + }, + { + "epoch": 0.5935860475943202, + "grad_norm": 1.7699416875839233, + "learning_rate": 7.481994152018563e-05, + "loss": 0.8708, + "step": 16575 + }, + { + "epoch": 0.5936218597238885, + "grad_norm": 1.3684507608413696, + "learning_rate": 7.480871640681975e-05, + "loss": 1.1106, + "step": 16576 + }, + { + "epoch": 0.5936576718534567, + "grad_norm": 1.5881329774856567, + "learning_rate": 7.479749163237012e-05, + "loss": 1.029, + "step": 16577 + }, + { + "epoch": 0.593693483983025, + "grad_norm": 1.509615421295166, + "learning_rate": 7.47862671969877e-05, + "loss": 1.2514, + "step": 16578 + }, + { + "epoch": 0.5937292961125933, + "grad_norm": 1.404259443283081, + "learning_rate": 7.477504310082354e-05, + "loss": 1.0895, + "step": 16579 + }, + { + "epoch": 0.5937651082421617, + "grad_norm": 1.8409932851791382, + "learning_rate": 7.476381934402865e-05, + "loss": 0.9274, + "step": 16580 + }, + { + "epoch": 0.5938009203717299, + "grad_norm": 1.5380122661590576, + "learning_rate": 7.475259592675402e-05, + "loss": 1.0873, + "step": 16581 + }, + { + "epoch": 0.5938367325012982, + "grad_norm": 1.7285351753234863, + "learning_rate": 7.474137284915065e-05, + "loss": 1.1903, + "step": 16582 + }, + { + "epoch": 0.5938725446308665, + "grad_norm": 2.0874874591827393, + "learning_rate": 7.47301501113695e-05, + "loss": 1.0721, + "step": 16583 + }, + { + "epoch": 0.5939083567604347, + "grad_norm": 1.4247885942459106, + "learning_rate": 7.471892771356161e-05, + "loss": 1.0926, + "step": 16584 + }, + { + "epoch": 0.593944168890003, + "grad_norm": 1.458268165588379, + "learning_rate": 7.470770565587792e-05, + "loss": 1.063, + "step": 16585 + }, + { + "epoch": 0.5939799810195713, + "grad_norm": 1.4853607416152954, + "learning_rate": 7.469648393846943e-05, + "loss": 1.0289, + "step": 16586 + }, + { + "epoch": 0.5940157931491397, + "grad_norm": 1.3692439794540405, + "learning_rate": 7.46852625614871e-05, + "loss": 1.187, + "step": 16587 + }, + { + "epoch": 0.5940516052787079, + "grad_norm": 1.7902936935424805, + "learning_rate": 7.467404152508193e-05, + "loss": 1.2927, + "step": 16588 + }, + { + "epoch": 0.5940874174082762, + "grad_norm": 1.969712734222412, + "learning_rate": 7.466282082940484e-05, + "loss": 1.1567, + "step": 16589 + }, + { + "epoch": 0.5941232295378445, + "grad_norm": 1.6857328414916992, + "learning_rate": 7.465160047460685e-05, + "loss": 1.2852, + "step": 16590 + }, + { + "epoch": 0.5941590416674127, + "grad_norm": 1.5760482549667358, + "learning_rate": 7.464038046083885e-05, + "loss": 1.1965, + "step": 16591 + }, + { + "epoch": 0.594194853796981, + "grad_norm": 1.3767951726913452, + "learning_rate": 7.462916078825182e-05, + "loss": 1.122, + "step": 16592 + }, + { + "epoch": 0.5942306659265493, + "grad_norm": 1.4382380247116089, + "learning_rate": 7.46179414569967e-05, + "loss": 1.1796, + "step": 16593 + }, + { + "epoch": 0.5942664780561177, + "grad_norm": 1.452130913734436, + "learning_rate": 7.460672246722444e-05, + "loss": 1.2519, + "step": 16594 + }, + { + "epoch": 0.5943022901856859, + "grad_norm": 1.5826442241668701, + "learning_rate": 7.4595503819086e-05, + "loss": 1.0326, + "step": 16595 + }, + { + "epoch": 0.5943381023152542, + "grad_norm": 1.3899954557418823, + "learning_rate": 7.458428551273226e-05, + "loss": 1.0913, + "step": 16596 + }, + { + "epoch": 0.5943739144448225, + "grad_norm": 2.425168991088867, + "learning_rate": 7.45730675483142e-05, + "loss": 1.1597, + "step": 16597 + }, + { + "epoch": 0.5944097265743907, + "grad_norm": 1.5296515226364136, + "learning_rate": 7.456184992598267e-05, + "loss": 1.1349, + "step": 16598 + }, + { + "epoch": 0.594445538703959, + "grad_norm": 1.3887624740600586, + "learning_rate": 7.455063264588869e-05, + "loss": 1.0359, + "step": 16599 + }, + { + "epoch": 0.5944813508335273, + "grad_norm": 1.4296931028366089, + "learning_rate": 7.453941570818309e-05, + "loss": 1.0653, + "step": 16600 + }, + { + "epoch": 0.5945171629630956, + "grad_norm": 1.4489874839782715, + "learning_rate": 7.452819911301681e-05, + "loss": 0.9072, + "step": 16601 + }, + { + "epoch": 0.5945529750926639, + "grad_norm": 2.070192575454712, + "learning_rate": 7.451698286054076e-05, + "loss": 1.441, + "step": 16602 + }, + { + "epoch": 0.5945887872222322, + "grad_norm": 1.465872883796692, + "learning_rate": 7.450576695090583e-05, + "loss": 0.9457, + "step": 16603 + }, + { + "epoch": 0.5946245993518005, + "grad_norm": 1.7126672267913818, + "learning_rate": 7.449455138426294e-05, + "loss": 1.0602, + "step": 16604 + }, + { + "epoch": 0.5946604114813687, + "grad_norm": 1.692246675491333, + "learning_rate": 7.448333616076293e-05, + "loss": 1.0926, + "step": 16605 + }, + { + "epoch": 0.594696223610937, + "grad_norm": 1.972288727760315, + "learning_rate": 7.447212128055675e-05, + "loss": 1.1165, + "step": 16606 + }, + { + "epoch": 0.5947320357405053, + "grad_norm": 1.2526748180389404, + "learning_rate": 7.446090674379522e-05, + "loss": 1.1807, + "step": 16607 + }, + { + "epoch": 0.5947678478700736, + "grad_norm": 2.0812716484069824, + "learning_rate": 7.444969255062928e-05, + "loss": 1.2424, + "step": 16608 + }, + { + "epoch": 0.5948036599996419, + "grad_norm": 1.728135585784912, + "learning_rate": 7.443847870120976e-05, + "loss": 1.0164, + "step": 16609 + }, + { + "epoch": 0.5948394721292102, + "grad_norm": 1.6061909198760986, + "learning_rate": 7.442726519568751e-05, + "loss": 1.1739, + "step": 16610 + }, + { + "epoch": 0.5948752842587784, + "grad_norm": 1.7019603252410889, + "learning_rate": 7.441605203421345e-05, + "loss": 1.0898, + "step": 16611 + }, + { + "epoch": 0.5949110963883467, + "grad_norm": 1.4623057842254639, + "learning_rate": 7.440483921693839e-05, + "loss": 1.1703, + "step": 16612 + }, + { + "epoch": 0.594946908517915, + "grad_norm": 1.2646340131759644, + "learning_rate": 7.439362674401322e-05, + "loss": 1.0603, + "step": 16613 + }, + { + "epoch": 0.5949827206474833, + "grad_norm": 1.463969349861145, + "learning_rate": 7.438241461558875e-05, + "loss": 1.0956, + "step": 16614 + }, + { + "epoch": 0.5950185327770516, + "grad_norm": 1.2596853971481323, + "learning_rate": 7.437120283181586e-05, + "loss": 1.0656, + "step": 16615 + }, + { + "epoch": 0.5950543449066199, + "grad_norm": 1.4573384523391724, + "learning_rate": 7.435999139284538e-05, + "loss": 1.1429, + "step": 16616 + }, + { + "epoch": 0.5950901570361882, + "grad_norm": 1.6083552837371826, + "learning_rate": 7.434878029882814e-05, + "loss": 1.0759, + "step": 16617 + }, + { + "epoch": 0.5951259691657564, + "grad_norm": 1.5958276987075806, + "learning_rate": 7.433756954991499e-05, + "loss": 1.4566, + "step": 16618 + }, + { + "epoch": 0.5951617812953247, + "grad_norm": 1.6874815225601196, + "learning_rate": 7.43263591462567e-05, + "loss": 1.1546, + "step": 16619 + }, + { + "epoch": 0.595197593424893, + "grad_norm": 1.6134421825408936, + "learning_rate": 7.431514908800417e-05, + "loss": 1.1294, + "step": 16620 + }, + { + "epoch": 0.5952334055544612, + "grad_norm": 1.885606050491333, + "learning_rate": 7.430393937530815e-05, + "loss": 1.3201, + "step": 16621 + }, + { + "epoch": 0.5952692176840296, + "grad_norm": 1.4560624361038208, + "learning_rate": 7.429273000831949e-05, + "loss": 1.1397, + "step": 16622 + }, + { + "epoch": 0.5953050298135979, + "grad_norm": 1.2364659309387207, + "learning_rate": 7.4281520987189e-05, + "loss": 1.1113, + "step": 16623 + }, + { + "epoch": 0.5953408419431662, + "grad_norm": 1.3224976062774658, + "learning_rate": 7.427031231206745e-05, + "loss": 0.9799, + "step": 16624 + }, + { + "epoch": 0.5953766540727344, + "grad_norm": 1.5551447868347168, + "learning_rate": 7.425910398310566e-05, + "loss": 1.0356, + "step": 16625 + }, + { + "epoch": 0.5954124662023027, + "grad_norm": 1.8845038414001465, + "learning_rate": 7.424789600045444e-05, + "loss": 1.1686, + "step": 16626 + }, + { + "epoch": 0.595448278331871, + "grad_norm": 1.4367215633392334, + "learning_rate": 7.423668836426458e-05, + "loss": 1.2407, + "step": 16627 + }, + { + "epoch": 0.5954840904614392, + "grad_norm": 1.4504125118255615, + "learning_rate": 7.422548107468679e-05, + "loss": 0.958, + "step": 16628 + }, + { + "epoch": 0.5955199025910076, + "grad_norm": 1.6199951171875, + "learning_rate": 7.421427413187197e-05, + "loss": 1.2977, + "step": 16629 + }, + { + "epoch": 0.5955557147205759, + "grad_norm": 1.4562246799468994, + "learning_rate": 7.420306753597076e-05, + "loss": 1.1064, + "step": 16630 + }, + { + "epoch": 0.5955915268501442, + "grad_norm": 1.3805123567581177, + "learning_rate": 7.419186128713407e-05, + "loss": 0.9674, + "step": 16631 + }, + { + "epoch": 0.5956273389797124, + "grad_norm": 1.8634122610092163, + "learning_rate": 7.418065538551253e-05, + "loss": 1.1388, + "step": 16632 + }, + { + "epoch": 0.5956631511092807, + "grad_norm": 2.5252442359924316, + "learning_rate": 7.4169449831257e-05, + "loss": 1.3767, + "step": 16633 + }, + { + "epoch": 0.595698963238849, + "grad_norm": 1.2870774269104004, + "learning_rate": 7.415824462451824e-05, + "loss": 0.9683, + "step": 16634 + }, + { + "epoch": 0.5957347753684172, + "grad_norm": 1.412084698677063, + "learning_rate": 7.41470397654469e-05, + "loss": 1.1145, + "step": 16635 + }, + { + "epoch": 0.5957705874979856, + "grad_norm": 1.5890315771102905, + "learning_rate": 7.413583525419385e-05, + "loss": 0.9547, + "step": 16636 + }, + { + "epoch": 0.5958063996275539, + "grad_norm": 1.6661990880966187, + "learning_rate": 7.41246310909097e-05, + "loss": 1.1391, + "step": 16637 + }, + { + "epoch": 0.5958422117571222, + "grad_norm": 1.5502920150756836, + "learning_rate": 7.411342727574533e-05, + "loss": 1.164, + "step": 16638 + }, + { + "epoch": 0.5958780238866904, + "grad_norm": 1.1021459102630615, + "learning_rate": 7.410222380885135e-05, + "loss": 0.9206, + "step": 16639 + }, + { + "epoch": 0.5959138360162587, + "grad_norm": 1.7319692373275757, + "learning_rate": 7.409102069037862e-05, + "loss": 0.8494, + "step": 16640 + }, + { + "epoch": 0.595949648145827, + "grad_norm": 2.4575610160827637, + "learning_rate": 7.407981792047769e-05, + "loss": 1.1642, + "step": 16641 + }, + { + "epoch": 0.5959854602753952, + "grad_norm": 1.5768848657608032, + "learning_rate": 7.406861549929946e-05, + "loss": 1.0345, + "step": 16642 + }, + { + "epoch": 0.5960212724049636, + "grad_norm": 1.5232549905776978, + "learning_rate": 7.405741342699453e-05, + "loss": 0.9729, + "step": 16643 + }, + { + "epoch": 0.5960570845345319, + "grad_norm": 1.5909981727600098, + "learning_rate": 7.404621170371362e-05, + "loss": 1.1867, + "step": 16644 + }, + { + "epoch": 0.5960928966641001, + "grad_norm": 1.2444266080856323, + "learning_rate": 7.403501032960748e-05, + "loss": 1.1457, + "step": 16645 + }, + { + "epoch": 0.5961287087936684, + "grad_norm": 1.4978617429733276, + "learning_rate": 7.402380930482673e-05, + "loss": 0.9479, + "step": 16646 + }, + { + "epoch": 0.5961645209232367, + "grad_norm": 1.5129646062850952, + "learning_rate": 7.40126086295222e-05, + "loss": 1.1732, + "step": 16647 + }, + { + "epoch": 0.596200333052805, + "grad_norm": 1.7241145372390747, + "learning_rate": 7.400140830384443e-05, + "loss": 1.1303, + "step": 16648 + }, + { + "epoch": 0.5962361451823732, + "grad_norm": 1.763044834136963, + "learning_rate": 7.399020832794424e-05, + "loss": 1.1041, + "step": 16649 + }, + { + "epoch": 0.5962719573119416, + "grad_norm": 1.3296271562576294, + "learning_rate": 7.397900870197216e-05, + "loss": 1.1091, + "step": 16650 + }, + { + "epoch": 0.5963077694415099, + "grad_norm": 1.2558962106704712, + "learning_rate": 7.396780942607904e-05, + "loss": 0.9741, + "step": 16651 + }, + { + "epoch": 0.5963435815710781, + "grad_norm": 1.6921117305755615, + "learning_rate": 7.395661050041545e-05, + "loss": 1.1826, + "step": 16652 + }, + { + "epoch": 0.5963793937006464, + "grad_norm": 1.268607497215271, + "learning_rate": 7.394541192513202e-05, + "loss": 1.1677, + "step": 16653 + }, + { + "epoch": 0.5964152058302147, + "grad_norm": 1.4300259351730347, + "learning_rate": 7.393421370037952e-05, + "loss": 1.1235, + "step": 16654 + }, + { + "epoch": 0.596451017959783, + "grad_norm": 1.1318427324295044, + "learning_rate": 7.392301582630852e-05, + "loss": 0.7502, + "step": 16655 + }, + { + "epoch": 0.5964868300893512, + "grad_norm": 1.2888429164886475, + "learning_rate": 7.391181830306972e-05, + "loss": 0.9195, + "step": 16656 + }, + { + "epoch": 0.5965226422189196, + "grad_norm": 1.3453487157821655, + "learning_rate": 7.390062113081373e-05, + "loss": 1.1918, + "step": 16657 + }, + { + "epoch": 0.5965584543484879, + "grad_norm": 1.3502614498138428, + "learning_rate": 7.388942430969123e-05, + "loss": 1.2396, + "step": 16658 + }, + { + "epoch": 0.5965942664780561, + "grad_norm": 1.5551799535751343, + "learning_rate": 7.387822783985283e-05, + "loss": 0.9857, + "step": 16659 + }, + { + "epoch": 0.5966300786076244, + "grad_norm": 1.4688916206359863, + "learning_rate": 7.386703172144921e-05, + "loss": 0.882, + "step": 16660 + }, + { + "epoch": 0.5966658907371927, + "grad_norm": 1.6939948797225952, + "learning_rate": 7.385583595463099e-05, + "loss": 1.2823, + "step": 16661 + }, + { + "epoch": 0.5967017028667609, + "grad_norm": 1.4405410289764404, + "learning_rate": 7.384464053954872e-05, + "loss": 1.1919, + "step": 16662 + }, + { + "epoch": 0.5967375149963292, + "grad_norm": 1.8701403141021729, + "learning_rate": 7.383344547635311e-05, + "loss": 1.1248, + "step": 16663 + }, + { + "epoch": 0.5967733271258976, + "grad_norm": 1.4019732475280762, + "learning_rate": 7.382225076519471e-05, + "loss": 0.9493, + "step": 16664 + }, + { + "epoch": 0.5968091392554659, + "grad_norm": 1.7999697923660278, + "learning_rate": 7.381105640622419e-05, + "loss": 1.1208, + "step": 16665 + }, + { + "epoch": 0.5968449513850341, + "grad_norm": 1.5434532165527344, + "learning_rate": 7.379986239959209e-05, + "loss": 1.1312, + "step": 16666 + }, + { + "epoch": 0.5968807635146024, + "grad_norm": 1.472904086112976, + "learning_rate": 7.378866874544908e-05, + "loss": 0.8375, + "step": 16667 + }, + { + "epoch": 0.5969165756441707, + "grad_norm": 2.0676944255828857, + "learning_rate": 7.377747544394568e-05, + "loss": 1.0052, + "step": 16668 + }, + { + "epoch": 0.5969523877737389, + "grad_norm": 1.3928327560424805, + "learning_rate": 7.376628249523257e-05, + "loss": 1.0846, + "step": 16669 + }, + { + "epoch": 0.5969881999033072, + "grad_norm": 1.6722140312194824, + "learning_rate": 7.375508989946027e-05, + "loss": 1.152, + "step": 16670 + }, + { + "epoch": 0.5970240120328756, + "grad_norm": 1.6492947340011597, + "learning_rate": 7.374389765677938e-05, + "loss": 1.0508, + "step": 16671 + }, + { + "epoch": 0.5970598241624439, + "grad_norm": 1.578833818435669, + "learning_rate": 7.373270576734048e-05, + "loss": 1.3028, + "step": 16672 + }, + { + "epoch": 0.5970956362920121, + "grad_norm": 1.7074806690216064, + "learning_rate": 7.372151423129414e-05, + "loss": 1.1455, + "step": 16673 + }, + { + "epoch": 0.5971314484215804, + "grad_norm": 1.6367493867874146, + "learning_rate": 7.371032304879094e-05, + "loss": 1.1471, + "step": 16674 + }, + { + "epoch": 0.5971672605511487, + "grad_norm": 1.334701418876648, + "learning_rate": 7.369913221998141e-05, + "loss": 1.0582, + "step": 16675 + }, + { + "epoch": 0.5972030726807169, + "grad_norm": 1.8822965621948242, + "learning_rate": 7.368794174501615e-05, + "loss": 1.1278, + "step": 16676 + }, + { + "epoch": 0.5972388848102852, + "grad_norm": 1.6436257362365723, + "learning_rate": 7.367675162404567e-05, + "loss": 1.2741, + "step": 16677 + }, + { + "epoch": 0.5972746969398536, + "grad_norm": 1.646235704421997, + "learning_rate": 7.366556185722056e-05, + "loss": 1.0668, + "step": 16678 + }, + { + "epoch": 0.5973105090694218, + "grad_norm": 1.9649919271469116, + "learning_rate": 7.365437244469135e-05, + "loss": 1.0781, + "step": 16679 + }, + { + "epoch": 0.5973463211989901, + "grad_norm": 1.3301258087158203, + "learning_rate": 7.364318338660858e-05, + "loss": 1.3269, + "step": 16680 + }, + { + "epoch": 0.5973821333285584, + "grad_norm": 1.5248535871505737, + "learning_rate": 7.363199468312277e-05, + "loss": 1.0642, + "step": 16681 + }, + { + "epoch": 0.5974179454581267, + "grad_norm": 1.950636625289917, + "learning_rate": 7.362080633438445e-05, + "loss": 1.2385, + "step": 16682 + }, + { + "epoch": 0.5974537575876949, + "grad_norm": 1.4209928512573242, + "learning_rate": 7.360961834054418e-05, + "loss": 1.3173, + "step": 16683 + }, + { + "epoch": 0.5974895697172632, + "grad_norm": 1.3877484798431396, + "learning_rate": 7.359843070175242e-05, + "loss": 1.0915, + "step": 16684 + }, + { + "epoch": 0.5975253818468316, + "grad_norm": 1.4664891958236694, + "learning_rate": 7.358724341815975e-05, + "loss": 1.0017, + "step": 16685 + }, + { + "epoch": 0.5975611939763998, + "grad_norm": 1.675077199935913, + "learning_rate": 7.357605648991661e-05, + "loss": 0.9856, + "step": 16686 + }, + { + "epoch": 0.5975970061059681, + "grad_norm": 1.6495978832244873, + "learning_rate": 7.356486991717359e-05, + "loss": 1.0636, + "step": 16687 + }, + { + "epoch": 0.5976328182355364, + "grad_norm": 2.3154358863830566, + "learning_rate": 7.355368370008113e-05, + "loss": 1.144, + "step": 16688 + }, + { + "epoch": 0.5976686303651046, + "grad_norm": 1.5075019598007202, + "learning_rate": 7.354249783878973e-05, + "loss": 1.1313, + "step": 16689 + }, + { + "epoch": 0.5977044424946729, + "grad_norm": 1.2919858694076538, + "learning_rate": 7.353131233344991e-05, + "loss": 1.1778, + "step": 16690 + }, + { + "epoch": 0.5977402546242412, + "grad_norm": 1.211946725845337, + "learning_rate": 7.352012718421212e-05, + "loss": 0.9376, + "step": 16691 + }, + { + "epoch": 0.5977760667538096, + "grad_norm": 1.919247031211853, + "learning_rate": 7.350894239122689e-05, + "loss": 1.4163, + "step": 16692 + }, + { + "epoch": 0.5978118788833778, + "grad_norm": 1.6272977590560913, + "learning_rate": 7.349775795464466e-05, + "loss": 1.0168, + "step": 16693 + }, + { + "epoch": 0.5978476910129461, + "grad_norm": 1.8381385803222656, + "learning_rate": 7.348657387461591e-05, + "loss": 1.155, + "step": 16694 + }, + { + "epoch": 0.5978835031425144, + "grad_norm": 1.5542246103286743, + "learning_rate": 7.34753901512911e-05, + "loss": 1.1688, + "step": 16695 + }, + { + "epoch": 0.5979193152720826, + "grad_norm": 1.556941270828247, + "learning_rate": 7.346420678482071e-05, + "loss": 0.9822, + "step": 16696 + }, + { + "epoch": 0.5979551274016509, + "grad_norm": 1.621666669845581, + "learning_rate": 7.345302377535521e-05, + "loss": 1.1428, + "step": 16697 + }, + { + "epoch": 0.5979909395312192, + "grad_norm": 1.8533540964126587, + "learning_rate": 7.3441841123045e-05, + "loss": 1.2357, + "step": 16698 + }, + { + "epoch": 0.5980267516607876, + "grad_norm": 1.6680175065994263, + "learning_rate": 7.343065882804056e-05, + "loss": 0.8484, + "step": 16699 + }, + { + "epoch": 0.5980625637903558, + "grad_norm": 1.5751537084579468, + "learning_rate": 7.341947689049233e-05, + "loss": 1.0385, + "step": 16700 + }, + { + "epoch": 0.5980983759199241, + "grad_norm": 1.4048066139221191, + "learning_rate": 7.340829531055078e-05, + "loss": 0.8155, + "step": 16701 + }, + { + "epoch": 0.5981341880494924, + "grad_norm": 2.4669911861419678, + "learning_rate": 7.339711408836629e-05, + "loss": 1.2588, + "step": 16702 + }, + { + "epoch": 0.5981700001790606, + "grad_norm": 1.5244214534759521, + "learning_rate": 7.338593322408933e-05, + "loss": 1.2028, + "step": 16703 + }, + { + "epoch": 0.5982058123086289, + "grad_norm": 2.0181779861450195, + "learning_rate": 7.33747527178703e-05, + "loss": 1.2662, + "step": 16704 + }, + { + "epoch": 0.5982416244381972, + "grad_norm": 1.781026840209961, + "learning_rate": 7.336357256985964e-05, + "loss": 1.1977, + "step": 16705 + }, + { + "epoch": 0.5982774365677656, + "grad_norm": 1.3270107507705688, + "learning_rate": 7.335239278020776e-05, + "loss": 1.1264, + "step": 16706 + }, + { + "epoch": 0.5983132486973338, + "grad_norm": 1.310183048248291, + "learning_rate": 7.334121334906503e-05, + "loss": 1.0543, + "step": 16707 + }, + { + "epoch": 0.5983490608269021, + "grad_norm": 1.4394696950912476, + "learning_rate": 7.333003427658192e-05, + "loss": 1.2477, + "step": 16708 + }, + { + "epoch": 0.5983848729564704, + "grad_norm": 2.091416835784912, + "learning_rate": 7.331885556290876e-05, + "loss": 1.317, + "step": 16709 + }, + { + "epoch": 0.5984206850860386, + "grad_norm": 1.560626745223999, + "learning_rate": 7.330767720819601e-05, + "loss": 1.27, + "step": 16710 + }, + { + "epoch": 0.5984564972156069, + "grad_norm": 1.8529149293899536, + "learning_rate": 7.329649921259402e-05, + "loss": 1.1872, + "step": 16711 + }, + { + "epoch": 0.5984923093451752, + "grad_norm": 1.688112735748291, + "learning_rate": 7.32853215762532e-05, + "loss": 1.0275, + "step": 16712 + }, + { + "epoch": 0.5985281214747435, + "grad_norm": 1.622776985168457, + "learning_rate": 7.32741442993239e-05, + "loss": 1.109, + "step": 16713 + }, + { + "epoch": 0.5985639336043118, + "grad_norm": 1.286012053489685, + "learning_rate": 7.326296738195654e-05, + "loss": 1.1007, + "step": 16714 + }, + { + "epoch": 0.5985997457338801, + "grad_norm": 1.5076383352279663, + "learning_rate": 7.325179082430148e-05, + "loss": 1.0209, + "step": 16715 + }, + { + "epoch": 0.5986355578634484, + "grad_norm": 1.9651471376419067, + "learning_rate": 7.324061462650901e-05, + "loss": 1.1799, + "step": 16716 + }, + { + "epoch": 0.5986713699930166, + "grad_norm": 1.6139397621154785, + "learning_rate": 7.32294387887296e-05, + "loss": 1.0413, + "step": 16717 + }, + { + "epoch": 0.5987071821225849, + "grad_norm": 1.583439588546753, + "learning_rate": 7.321826331111353e-05, + "loss": 1.3496, + "step": 16718 + }, + { + "epoch": 0.5987429942521532, + "grad_norm": 1.3197593688964844, + "learning_rate": 7.320708819381121e-05, + "loss": 1.1658, + "step": 16719 + }, + { + "epoch": 0.5987788063817215, + "grad_norm": 1.805470585823059, + "learning_rate": 7.319591343697293e-05, + "loss": 1.1287, + "step": 16720 + }, + { + "epoch": 0.5988146185112898, + "grad_norm": 1.320380449295044, + "learning_rate": 7.31847390407491e-05, + "loss": 1.0983, + "step": 16721 + }, + { + "epoch": 0.5988504306408581, + "grad_norm": 1.6520822048187256, + "learning_rate": 7.317356500528996e-05, + "loss": 1.1585, + "step": 16722 + }, + { + "epoch": 0.5988862427704263, + "grad_norm": 1.3093012571334839, + "learning_rate": 7.316239133074595e-05, + "loss": 1.023, + "step": 16723 + }, + { + "epoch": 0.5989220548999946, + "grad_norm": 1.5085985660552979, + "learning_rate": 7.315121801726737e-05, + "loss": 1.0212, + "step": 16724 + }, + { + "epoch": 0.5989578670295629, + "grad_norm": 1.5003893375396729, + "learning_rate": 7.314004506500443e-05, + "loss": 1.3056, + "step": 16725 + }, + { + "epoch": 0.5989936791591312, + "grad_norm": 1.3567959070205688, + "learning_rate": 7.312887247410762e-05, + "loss": 1.0265, + "step": 16726 + }, + { + "epoch": 0.5990294912886995, + "grad_norm": 1.3241362571716309, + "learning_rate": 7.311770024472711e-05, + "loss": 1.0948, + "step": 16727 + }, + { + "epoch": 0.5990653034182678, + "grad_norm": 1.557084560394287, + "learning_rate": 7.31065283770133e-05, + "loss": 1.2496, + "step": 16728 + }, + { + "epoch": 0.5991011155478361, + "grad_norm": 1.4404871463775635, + "learning_rate": 7.309535687111644e-05, + "loss": 1.1107, + "step": 16729 + }, + { + "epoch": 0.5991369276774043, + "grad_norm": 1.6622673273086548, + "learning_rate": 7.308418572718687e-05, + "loss": 1.27, + "step": 16730 + }, + { + "epoch": 0.5991727398069726, + "grad_norm": 1.5029243230819702, + "learning_rate": 7.307301494537489e-05, + "loss": 0.9793, + "step": 16731 + }, + { + "epoch": 0.5992085519365409, + "grad_norm": 1.2828078269958496, + "learning_rate": 7.306184452583067e-05, + "loss": 1.2668, + "step": 16732 + }, + { + "epoch": 0.5992443640661091, + "grad_norm": 1.6315340995788574, + "learning_rate": 7.305067446870468e-05, + "loss": 1.114, + "step": 16733 + }, + { + "epoch": 0.5992801761956775, + "grad_norm": 1.5127677917480469, + "learning_rate": 7.303950477414703e-05, + "loss": 1.1514, + "step": 16734 + }, + { + "epoch": 0.5993159883252458, + "grad_norm": 1.5997281074523926, + "learning_rate": 7.302833544230812e-05, + "loss": 0.9381, + "step": 16735 + }, + { + "epoch": 0.5993518004548141, + "grad_norm": 1.3070341348648071, + "learning_rate": 7.301716647333812e-05, + "loss": 0.8126, + "step": 16736 + }, + { + "epoch": 0.5993876125843823, + "grad_norm": 1.700245976448059, + "learning_rate": 7.300599786738739e-05, + "loss": 1.2503, + "step": 16737 + }, + { + "epoch": 0.5994234247139506, + "grad_norm": 1.7274832725524902, + "learning_rate": 7.299482962460607e-05, + "loss": 1.1462, + "step": 16738 + }, + { + "epoch": 0.5994592368435189, + "grad_norm": 1.5433671474456787, + "learning_rate": 7.298366174514456e-05, + "loss": 1.2156, + "step": 16739 + }, + { + "epoch": 0.5994950489730871, + "grad_norm": 1.2287781238555908, + "learning_rate": 7.297249422915301e-05, + "loss": 1.1336, + "step": 16740 + }, + { + "epoch": 0.5995308611026555, + "grad_norm": 1.7015236616134644, + "learning_rate": 7.296132707678166e-05, + "loss": 1.0823, + "step": 16741 + }, + { + "epoch": 0.5995666732322238, + "grad_norm": 1.3886137008666992, + "learning_rate": 7.29501602881808e-05, + "loss": 1.2197, + "step": 16742 + }, + { + "epoch": 0.5996024853617921, + "grad_norm": 1.9678038358688354, + "learning_rate": 7.29389938635006e-05, + "loss": 1.3363, + "step": 16743 + }, + { + "epoch": 0.5996382974913603, + "grad_norm": 1.7130216360092163, + "learning_rate": 7.292782780289141e-05, + "loss": 1.2059, + "step": 16744 + }, + { + "epoch": 0.5996741096209286, + "grad_norm": 1.4317939281463623, + "learning_rate": 7.291666210650328e-05, + "loss": 1.0151, + "step": 16745 + }, + { + "epoch": 0.5997099217504969, + "grad_norm": 1.4680081605911255, + "learning_rate": 7.290549677448661e-05, + "loss": 1.1125, + "step": 16746 + }, + { + "epoch": 0.5997457338800651, + "grad_norm": 1.7309690713882446, + "learning_rate": 7.289433180699148e-05, + "loss": 0.9669, + "step": 16747 + }, + { + "epoch": 0.5997815460096335, + "grad_norm": 1.5347802639007568, + "learning_rate": 7.28831672041682e-05, + "loss": 1.065, + "step": 16748 + }, + { + "epoch": 0.5998173581392018, + "grad_norm": 1.3534760475158691, + "learning_rate": 7.287200296616689e-05, + "loss": 1.1606, + "step": 16749 + }, + { + "epoch": 0.59985317026877, + "grad_norm": 1.9568572044372559, + "learning_rate": 7.286083909313779e-05, + "loss": 1.0905, + "step": 16750 + }, + { + "epoch": 0.5998889823983383, + "grad_norm": 1.6495352983474731, + "learning_rate": 7.284967558523112e-05, + "loss": 1.1483, + "step": 16751 + }, + { + "epoch": 0.5999247945279066, + "grad_norm": 1.6002119779586792, + "learning_rate": 7.2838512442597e-05, + "loss": 0.9184, + "step": 16752 + }, + { + "epoch": 0.5999606066574749, + "grad_norm": 1.3831512928009033, + "learning_rate": 7.282734966538569e-05, + "loss": 1.2585, + "step": 16753 + }, + { + "epoch": 0.5999964187870431, + "grad_norm": 1.921460509300232, + "learning_rate": 7.281618725374733e-05, + "loss": 1.0507, + "step": 16754 + }, + { + "epoch": 0.6000322309166115, + "grad_norm": 1.8081692457199097, + "learning_rate": 7.28050252078321e-05, + "loss": 1.1709, + "step": 16755 + }, + { + "epoch": 0.6000680430461798, + "grad_norm": 1.3428138494491577, + "learning_rate": 7.279386352779016e-05, + "loss": 1.0425, + "step": 16756 + }, + { + "epoch": 0.600103855175748, + "grad_norm": 1.3264707326889038, + "learning_rate": 7.278270221377174e-05, + "loss": 0.9726, + "step": 16757 + }, + { + "epoch": 0.6001396673053163, + "grad_norm": 1.753374695777893, + "learning_rate": 7.277154126592695e-05, + "loss": 1.1303, + "step": 16758 + }, + { + "epoch": 0.6001754794348846, + "grad_norm": 1.9683525562286377, + "learning_rate": 7.276038068440592e-05, + "loss": 0.9786, + "step": 16759 + }, + { + "epoch": 0.6002112915644529, + "grad_norm": 1.4969631433486938, + "learning_rate": 7.274922046935885e-05, + "loss": 1.1503, + "step": 16760 + }, + { + "epoch": 0.6002471036940211, + "grad_norm": 1.7623289823532104, + "learning_rate": 7.273806062093585e-05, + "loss": 1.1847, + "step": 16761 + }, + { + "epoch": 0.6002829158235895, + "grad_norm": 1.260407567024231, + "learning_rate": 7.27269011392871e-05, + "loss": 0.9785, + "step": 16762 + }, + { + "epoch": 0.6003187279531578, + "grad_norm": 1.3157106637954712, + "learning_rate": 7.271574202456268e-05, + "loss": 1.2248, + "step": 16763 + }, + { + "epoch": 0.600354540082726, + "grad_norm": 1.3142259120941162, + "learning_rate": 7.27045832769128e-05, + "loss": 1.1343, + "step": 16764 + }, + { + "epoch": 0.6003903522122943, + "grad_norm": 2.192157745361328, + "learning_rate": 7.269342489648752e-05, + "loss": 1.2237, + "step": 16765 + }, + { + "epoch": 0.6004261643418626, + "grad_norm": 1.7974677085876465, + "learning_rate": 7.268226688343699e-05, + "loss": 0.9522, + "step": 16766 + }, + { + "epoch": 0.6004619764714308, + "grad_norm": 1.5701184272766113, + "learning_rate": 7.267110923791133e-05, + "loss": 1.1793, + "step": 16767 + }, + { + "epoch": 0.6004977886009991, + "grad_norm": 1.3508986234664917, + "learning_rate": 7.265995196006062e-05, + "loss": 1.2116, + "step": 16768 + }, + { + "epoch": 0.6005336007305675, + "grad_norm": 1.4172941446304321, + "learning_rate": 7.264879505003502e-05, + "loss": 1.0019, + "step": 16769 + }, + { + "epoch": 0.6005694128601358, + "grad_norm": 1.4996039867401123, + "learning_rate": 7.263763850798458e-05, + "loss": 1.2303, + "step": 16770 + }, + { + "epoch": 0.600605224989704, + "grad_norm": 1.3710349798202515, + "learning_rate": 7.262648233405942e-05, + "loss": 0.8772, + "step": 16771 + }, + { + "epoch": 0.6006410371192723, + "grad_norm": 1.420803189277649, + "learning_rate": 7.261532652840964e-05, + "loss": 1.0727, + "step": 16772 + }, + { + "epoch": 0.6006768492488406, + "grad_norm": 1.551202416419983, + "learning_rate": 7.260417109118531e-05, + "loss": 0.9523, + "step": 16773 + }, + { + "epoch": 0.6007126613784088, + "grad_norm": 1.6732124090194702, + "learning_rate": 7.259301602253652e-05, + "loss": 1.2918, + "step": 16774 + }, + { + "epoch": 0.6007484735079771, + "grad_norm": 1.4631482362747192, + "learning_rate": 7.258186132261336e-05, + "loss": 1.2072, + "step": 16775 + }, + { + "epoch": 0.6007842856375455, + "grad_norm": 1.6843785047531128, + "learning_rate": 7.25707069915659e-05, + "loss": 1.0525, + "step": 16776 + }, + { + "epoch": 0.6008200977671138, + "grad_norm": 1.2571018934249878, + "learning_rate": 7.255955302954416e-05, + "loss": 0.94, + "step": 16777 + }, + { + "epoch": 0.600855909896682, + "grad_norm": 1.2619589567184448, + "learning_rate": 7.254839943669826e-05, + "loss": 0.9756, + "step": 16778 + }, + { + "epoch": 0.6008917220262503, + "grad_norm": 1.6737258434295654, + "learning_rate": 7.253724621317822e-05, + "loss": 1.1678, + "step": 16779 + }, + { + "epoch": 0.6009275341558186, + "grad_norm": 1.2509838342666626, + "learning_rate": 7.252609335913413e-05, + "loss": 1.0161, + "step": 16780 + }, + { + "epoch": 0.6009633462853868, + "grad_norm": 1.7090873718261719, + "learning_rate": 7.251494087471599e-05, + "loss": 1.2021, + "step": 16781 + }, + { + "epoch": 0.6009991584149551, + "grad_norm": 1.7607430219650269, + "learning_rate": 7.250378876007389e-05, + "loss": 0.9802, + "step": 16782 + }, + { + "epoch": 0.6010349705445235, + "grad_norm": 1.572011947631836, + "learning_rate": 7.249263701535782e-05, + "loss": 1.2654, + "step": 16783 + }, + { + "epoch": 0.6010707826740918, + "grad_norm": 1.8164092302322388, + "learning_rate": 7.248148564071787e-05, + "loss": 1.1037, + "step": 16784 + }, + { + "epoch": 0.60110659480366, + "grad_norm": 1.9459737539291382, + "learning_rate": 7.247033463630402e-05, + "loss": 1.2841, + "step": 16785 + }, + { + "epoch": 0.6011424069332283, + "grad_norm": 1.6291147470474243, + "learning_rate": 7.24591840022663e-05, + "loss": 1.1173, + "step": 16786 + }, + { + "epoch": 0.6011782190627966, + "grad_norm": 1.361190915107727, + "learning_rate": 7.244803373875475e-05, + "loss": 1.1272, + "step": 16787 + }, + { + "epoch": 0.6012140311923648, + "grad_norm": 1.7873187065124512, + "learning_rate": 7.243688384591934e-05, + "loss": 1.0325, + "step": 16788 + }, + { + "epoch": 0.6012498433219331, + "grad_norm": 1.5364787578582764, + "learning_rate": 7.242573432391012e-05, + "loss": 1.183, + "step": 16789 + }, + { + "epoch": 0.6012856554515015, + "grad_norm": 1.6824859380722046, + "learning_rate": 7.241458517287708e-05, + "loss": 1.2408, + "step": 16790 + }, + { + "epoch": 0.6013214675810697, + "grad_norm": 1.3340696096420288, + "learning_rate": 7.24034363929702e-05, + "loss": 1.2423, + "step": 16791 + }, + { + "epoch": 0.601357279710638, + "grad_norm": 1.4803149700164795, + "learning_rate": 7.23922879843395e-05, + "loss": 1.1817, + "step": 16792 + }, + { + "epoch": 0.6013930918402063, + "grad_norm": 1.7396416664123535, + "learning_rate": 7.238113994713495e-05, + "loss": 1.0993, + "step": 16793 + }, + { + "epoch": 0.6014289039697746, + "grad_norm": 1.3535008430480957, + "learning_rate": 7.236999228150654e-05, + "loss": 1.1702, + "step": 16794 + }, + { + "epoch": 0.6014647160993428, + "grad_norm": 1.472606897354126, + "learning_rate": 7.235884498760423e-05, + "loss": 1.07, + "step": 16795 + }, + { + "epoch": 0.6015005282289111, + "grad_norm": 1.4425619840621948, + "learning_rate": 7.234769806557802e-05, + "loss": 1.1458, + "step": 16796 + }, + { + "epoch": 0.6015363403584795, + "grad_norm": 1.3478314876556396, + "learning_rate": 7.233655151557786e-05, + "loss": 1.1597, + "step": 16797 + }, + { + "epoch": 0.6015721524880477, + "grad_norm": 1.570472002029419, + "learning_rate": 7.232540533775371e-05, + "loss": 1.2099, + "step": 16798 + }, + { + "epoch": 0.601607964617616, + "grad_norm": 1.6198346614837646, + "learning_rate": 7.231425953225552e-05, + "loss": 1.3774, + "step": 16799 + }, + { + "epoch": 0.6016437767471843, + "grad_norm": 1.3015438318252563, + "learning_rate": 7.230311409923329e-05, + "loss": 1.2253, + "step": 16800 + }, + { + "epoch": 0.6016795888767525, + "grad_norm": 2.189030170440674, + "learning_rate": 7.22919690388369e-05, + "loss": 1.1703, + "step": 16801 + }, + { + "epoch": 0.6017154010063208, + "grad_norm": 1.4778295755386353, + "learning_rate": 7.228082435121636e-05, + "loss": 1.3624, + "step": 16802 + }, + { + "epoch": 0.6017512131358891, + "grad_norm": 1.5985124111175537, + "learning_rate": 7.226968003652157e-05, + "loss": 1.0631, + "step": 16803 + }, + { + "epoch": 0.6017870252654575, + "grad_norm": 1.5401537418365479, + "learning_rate": 7.225853609490244e-05, + "loss": 0.9596, + "step": 16804 + }, + { + "epoch": 0.6018228373950257, + "grad_norm": 1.2753900289535522, + "learning_rate": 7.224739252650894e-05, + "loss": 0.9484, + "step": 16805 + }, + { + "epoch": 0.601858649524594, + "grad_norm": 1.465820074081421, + "learning_rate": 7.223624933149095e-05, + "loss": 0.9962, + "step": 16806 + }, + { + "epoch": 0.6018944616541623, + "grad_norm": 1.8187929391860962, + "learning_rate": 7.222510650999845e-05, + "loss": 1.1446, + "step": 16807 + }, + { + "epoch": 0.6019302737837305, + "grad_norm": 1.544900894165039, + "learning_rate": 7.221396406218129e-05, + "loss": 0.9977, + "step": 16808 + }, + { + "epoch": 0.6019660859132988, + "grad_norm": 1.3603068590164185, + "learning_rate": 7.220282198818941e-05, + "loss": 1.1573, + "step": 16809 + }, + { + "epoch": 0.6020018980428671, + "grad_norm": 1.3337068557739258, + "learning_rate": 7.21916802881727e-05, + "loss": 1.0803, + "step": 16810 + }, + { + "epoch": 0.6020377101724355, + "grad_norm": 1.382495403289795, + "learning_rate": 7.218053896228107e-05, + "loss": 0.9165, + "step": 16811 + }, + { + "epoch": 0.6020735223020037, + "grad_norm": 1.9610458612442017, + "learning_rate": 7.216939801066444e-05, + "loss": 1.1108, + "step": 16812 + }, + { + "epoch": 0.602109334431572, + "grad_norm": 1.315255045890808, + "learning_rate": 7.215825743347259e-05, + "loss": 0.9339, + "step": 16813 + }, + { + "epoch": 0.6021451465611403, + "grad_norm": 1.5602140426635742, + "learning_rate": 7.214711723085553e-05, + "loss": 1.0035, + "step": 16814 + }, + { + "epoch": 0.6021809586907085, + "grad_norm": 1.592604637145996, + "learning_rate": 7.213597740296304e-05, + "loss": 1.0036, + "step": 16815 + }, + { + "epoch": 0.6022167708202768, + "grad_norm": 1.5579335689544678, + "learning_rate": 7.212483794994503e-05, + "loss": 0.8893, + "step": 16816 + }, + { + "epoch": 0.6022525829498451, + "grad_norm": 1.2511059045791626, + "learning_rate": 7.211369887195139e-05, + "loss": 0.9842, + "step": 16817 + }, + { + "epoch": 0.6022883950794135, + "grad_norm": 1.6539254188537598, + "learning_rate": 7.210256016913195e-05, + "loss": 1.1136, + "step": 16818 + }, + { + "epoch": 0.6023242072089817, + "grad_norm": 1.5189464092254639, + "learning_rate": 7.209142184163657e-05, + "loss": 1.2867, + "step": 16819 + }, + { + "epoch": 0.60236001933855, + "grad_norm": 1.3212767839431763, + "learning_rate": 7.208028388961515e-05, + "loss": 1.0323, + "step": 16820 + }, + { + "epoch": 0.6023958314681183, + "grad_norm": 1.2843971252441406, + "learning_rate": 7.206914631321749e-05, + "loss": 1.046, + "step": 16821 + }, + { + "epoch": 0.6024316435976865, + "grad_norm": 1.1936440467834473, + "learning_rate": 7.205800911259338e-05, + "loss": 1.0441, + "step": 16822 + }, + { + "epoch": 0.6024674557272548, + "grad_norm": 2.2734787464141846, + "learning_rate": 7.204687228789279e-05, + "loss": 1.0907, + "step": 16823 + }, + { + "epoch": 0.6025032678568231, + "grad_norm": 1.466583490371704, + "learning_rate": 7.20357358392654e-05, + "loss": 1.1689, + "step": 16824 + }, + { + "epoch": 0.6025390799863914, + "grad_norm": 1.4894063472747803, + "learning_rate": 7.202459976686118e-05, + "loss": 1.0249, + "step": 16825 + }, + { + "epoch": 0.6025748921159597, + "grad_norm": 1.458000659942627, + "learning_rate": 7.201346407082982e-05, + "loss": 1.0594, + "step": 16826 + }, + { + "epoch": 0.602610704245528, + "grad_norm": 1.4174597263336182, + "learning_rate": 7.200232875132127e-05, + "loss": 1.2483, + "step": 16827 + }, + { + "epoch": 0.6026465163750963, + "grad_norm": 1.4583168029785156, + "learning_rate": 7.199119380848525e-05, + "loss": 1.0659, + "step": 16828 + }, + { + "epoch": 0.6026823285046645, + "grad_norm": 1.6944799423217773, + "learning_rate": 7.198005924247155e-05, + "loss": 1.2459, + "step": 16829 + }, + { + "epoch": 0.6027181406342328, + "grad_norm": 1.4535255432128906, + "learning_rate": 7.196892505343007e-05, + "loss": 0.9072, + "step": 16830 + }, + { + "epoch": 0.6027539527638011, + "grad_norm": 1.545196294784546, + "learning_rate": 7.195779124151048e-05, + "loss": 1.0771, + "step": 16831 + }, + { + "epoch": 0.6027897648933694, + "grad_norm": 1.8761937618255615, + "learning_rate": 7.19466578068627e-05, + "loss": 1.0827, + "step": 16832 + }, + { + "epoch": 0.6028255770229377, + "grad_norm": 1.4740140438079834, + "learning_rate": 7.193552474963638e-05, + "loss": 1.1853, + "step": 16833 + }, + { + "epoch": 0.602861389152506, + "grad_norm": 1.651989221572876, + "learning_rate": 7.192439206998146e-05, + "loss": 1.0449, + "step": 16834 + }, + { + "epoch": 0.6028972012820742, + "grad_norm": 1.60991632938385, + "learning_rate": 7.191325976804754e-05, + "loss": 0.9971, + "step": 16835 + }, + { + "epoch": 0.6029330134116425, + "grad_norm": 1.713376760482788, + "learning_rate": 7.190212784398458e-05, + "loss": 1.2565, + "step": 16836 + }, + { + "epoch": 0.6029688255412108, + "grad_norm": 1.4880640506744385, + "learning_rate": 7.18909962979422e-05, + "loss": 1.0403, + "step": 16837 + }, + { + "epoch": 0.603004637670779, + "grad_norm": 1.4176832437515259, + "learning_rate": 7.187986513007018e-05, + "loss": 1.1698, + "step": 16838 + }, + { + "epoch": 0.6030404498003473, + "grad_norm": 1.485244870185852, + "learning_rate": 7.186873434051832e-05, + "loss": 1.1621, + "step": 16839 + }, + { + "epoch": 0.6030762619299157, + "grad_norm": 1.814705729484558, + "learning_rate": 7.185760392943637e-05, + "loss": 1.087, + "step": 16840 + }, + { + "epoch": 0.603112074059484, + "grad_norm": 1.3405308723449707, + "learning_rate": 7.184647389697405e-05, + "loss": 1.2147, + "step": 16841 + }, + { + "epoch": 0.6031478861890522, + "grad_norm": 1.8385274410247803, + "learning_rate": 7.183534424328106e-05, + "loss": 1.1637, + "step": 16842 + }, + { + "epoch": 0.6031836983186205, + "grad_norm": 1.4081361293792725, + "learning_rate": 7.182421496850726e-05, + "loss": 1.0886, + "step": 16843 + }, + { + "epoch": 0.6032195104481888, + "grad_norm": 1.513899564743042, + "learning_rate": 7.181308607280223e-05, + "loss": 1.1679, + "step": 16844 + }, + { + "epoch": 0.603255322577757, + "grad_norm": 1.1515123844146729, + "learning_rate": 7.180195755631584e-05, + "loss": 0.8992, + "step": 16845 + }, + { + "epoch": 0.6032911347073253, + "grad_norm": 1.7720803022384644, + "learning_rate": 7.179082941919773e-05, + "loss": 1.0486, + "step": 16846 + }, + { + "epoch": 0.6033269468368937, + "grad_norm": 1.3789103031158447, + "learning_rate": 7.177970166159758e-05, + "loss": 1.1024, + "step": 16847 + }, + { + "epoch": 0.603362758966462, + "grad_norm": 1.1347465515136719, + "learning_rate": 7.176857428366517e-05, + "loss": 0.8187, + "step": 16848 + }, + { + "epoch": 0.6033985710960302, + "grad_norm": 1.440251111984253, + "learning_rate": 7.175744728555016e-05, + "loss": 1.1101, + "step": 16849 + }, + { + "epoch": 0.6034343832255985, + "grad_norm": 1.6373993158340454, + "learning_rate": 7.174632066740227e-05, + "loss": 1.5028, + "step": 16850 + }, + { + "epoch": 0.6034701953551668, + "grad_norm": 1.413166880607605, + "learning_rate": 7.17351944293712e-05, + "loss": 1.2564, + "step": 16851 + }, + { + "epoch": 0.603506007484735, + "grad_norm": 1.6343622207641602, + "learning_rate": 7.172406857160662e-05, + "loss": 1.2244, + "step": 16852 + }, + { + "epoch": 0.6035418196143033, + "grad_norm": 1.4239211082458496, + "learning_rate": 7.171294309425823e-05, + "loss": 1.1689, + "step": 16853 + }, + { + "epoch": 0.6035776317438717, + "grad_norm": 1.4419493675231934, + "learning_rate": 7.17018179974757e-05, + "loss": 1.1922, + "step": 16854 + }, + { + "epoch": 0.60361344387344, + "grad_norm": 1.8069806098937988, + "learning_rate": 7.169069328140872e-05, + "loss": 1.122, + "step": 16855 + }, + { + "epoch": 0.6036492560030082, + "grad_norm": 1.5104387998580933, + "learning_rate": 7.167956894620694e-05, + "loss": 0.9073, + "step": 16856 + }, + { + "epoch": 0.6036850681325765, + "grad_norm": 1.5714432001113892, + "learning_rate": 7.166844499202002e-05, + "loss": 1.2157, + "step": 16857 + }, + { + "epoch": 0.6037208802621448, + "grad_norm": 1.305506944656372, + "learning_rate": 7.165732141899761e-05, + "loss": 1.1059, + "step": 16858 + }, + { + "epoch": 0.603756692391713, + "grad_norm": 1.7464324235916138, + "learning_rate": 7.164619822728941e-05, + "loss": 1.2253, + "step": 16859 + }, + { + "epoch": 0.6037925045212813, + "grad_norm": 1.2962830066680908, + "learning_rate": 7.163507541704503e-05, + "loss": 1.1983, + "step": 16860 + }, + { + "epoch": 0.6038283166508497, + "grad_norm": 1.4401884078979492, + "learning_rate": 7.162395298841414e-05, + "loss": 1.1955, + "step": 16861 + }, + { + "epoch": 0.603864128780418, + "grad_norm": 1.2994046211242676, + "learning_rate": 7.161283094154633e-05, + "loss": 1.1246, + "step": 16862 + }, + { + "epoch": 0.6038999409099862, + "grad_norm": 1.6821424961090088, + "learning_rate": 7.160170927659128e-05, + "loss": 1.1416, + "step": 16863 + }, + { + "epoch": 0.6039357530395545, + "grad_norm": 1.5007191896438599, + "learning_rate": 7.159058799369861e-05, + "loss": 0.9609, + "step": 16864 + }, + { + "epoch": 0.6039715651691228, + "grad_norm": 1.355319857597351, + "learning_rate": 7.157946709301791e-05, + "loss": 0.7923, + "step": 16865 + }, + { + "epoch": 0.604007377298691, + "grad_norm": 1.284794569015503, + "learning_rate": 7.156834657469885e-05, + "loss": 1.1745, + "step": 16866 + }, + { + "epoch": 0.6040431894282593, + "grad_norm": 1.3548787832260132, + "learning_rate": 7.155722643889097e-05, + "loss": 1.1533, + "step": 16867 + }, + { + "epoch": 0.6040790015578277, + "grad_norm": 1.5986820459365845, + "learning_rate": 7.154610668574395e-05, + "loss": 0.8217, + "step": 16868 + }, + { + "epoch": 0.604114813687396, + "grad_norm": 1.2537776231765747, + "learning_rate": 7.153498731540735e-05, + "loss": 0.9311, + "step": 16869 + }, + { + "epoch": 0.6041506258169642, + "grad_norm": 1.7374112606048584, + "learning_rate": 7.15238683280308e-05, + "loss": 1.2532, + "step": 16870 + }, + { + "epoch": 0.6041864379465325, + "grad_norm": 1.527652382850647, + "learning_rate": 7.151274972376383e-05, + "loss": 0.9101, + "step": 16871 + }, + { + "epoch": 0.6042222500761008, + "grad_norm": 1.3566856384277344, + "learning_rate": 7.15016315027561e-05, + "loss": 0.9515, + "step": 16872 + }, + { + "epoch": 0.604258062205669, + "grad_norm": 1.6317296028137207, + "learning_rate": 7.149051366515716e-05, + "loss": 1.2292, + "step": 16873 + }, + { + "epoch": 0.6042938743352373, + "grad_norm": 1.5155797004699707, + "learning_rate": 7.147939621111655e-05, + "loss": 1.2309, + "step": 16874 + }, + { + "epoch": 0.6043296864648057, + "grad_norm": 1.3400853872299194, + "learning_rate": 7.146827914078391e-05, + "loss": 1.1232, + "step": 16875 + }, + { + "epoch": 0.6043654985943739, + "grad_norm": 1.5751407146453857, + "learning_rate": 7.145716245430876e-05, + "loss": 1.0793, + "step": 16876 + }, + { + "epoch": 0.6044013107239422, + "grad_norm": 1.4069135189056396, + "learning_rate": 7.144604615184067e-05, + "loss": 1.275, + "step": 16877 + }, + { + "epoch": 0.6044371228535105, + "grad_norm": 1.505393385887146, + "learning_rate": 7.143493023352918e-05, + "loss": 1.1579, + "step": 16878 + }, + { + "epoch": 0.6044729349830787, + "grad_norm": 2.0905754566192627, + "learning_rate": 7.142381469952388e-05, + "loss": 1.1886, + "step": 16879 + }, + { + "epoch": 0.604508747112647, + "grad_norm": 1.5361121892929077, + "learning_rate": 7.141269954997428e-05, + "loss": 1.1305, + "step": 16880 + }, + { + "epoch": 0.6045445592422153, + "grad_norm": 1.5752135515213013, + "learning_rate": 7.140158478502995e-05, + "loss": 1.087, + "step": 16881 + }, + { + "epoch": 0.6045803713717837, + "grad_norm": 1.438521146774292, + "learning_rate": 7.13904704048404e-05, + "loss": 0.9806, + "step": 16882 + }, + { + "epoch": 0.6046161835013519, + "grad_norm": 1.4798104763031006, + "learning_rate": 7.137935640955516e-05, + "loss": 1.1602, + "step": 16883 + }, + { + "epoch": 0.6046519956309202, + "grad_norm": 2.370622396469116, + "learning_rate": 7.136824279932378e-05, + "loss": 1.0558, + "step": 16884 + }, + { + "epoch": 0.6046878077604885, + "grad_norm": 1.5011303424835205, + "learning_rate": 7.135712957429573e-05, + "loss": 0.9221, + "step": 16885 + }, + { + "epoch": 0.6047236198900567, + "grad_norm": 1.8432420492172241, + "learning_rate": 7.134601673462058e-05, + "loss": 1.1782, + "step": 16886 + }, + { + "epoch": 0.604759432019625, + "grad_norm": 1.3845798969268799, + "learning_rate": 7.133490428044778e-05, + "loss": 0.8881, + "step": 16887 + }, + { + "epoch": 0.6047952441491933, + "grad_norm": 1.5779119729995728, + "learning_rate": 7.132379221192691e-05, + "loss": 0.9351, + "step": 16888 + }, + { + "epoch": 0.6048310562787617, + "grad_norm": 1.7235053777694702, + "learning_rate": 7.131268052920739e-05, + "loss": 1.0003, + "step": 16889 + }, + { + "epoch": 0.6048668684083299, + "grad_norm": 1.2910956144332886, + "learning_rate": 7.130156923243879e-05, + "loss": 1.1393, + "step": 16890 + }, + { + "epoch": 0.6049026805378982, + "grad_norm": 1.2798341512680054, + "learning_rate": 7.129045832177054e-05, + "loss": 1.0417, + "step": 16891 + }, + { + "epoch": 0.6049384926674665, + "grad_norm": 1.5975556373596191, + "learning_rate": 7.127934779735212e-05, + "loss": 1.2199, + "step": 16892 + }, + { + "epoch": 0.6049743047970347, + "grad_norm": 1.5113551616668701, + "learning_rate": 7.126823765933306e-05, + "loss": 1.0965, + "step": 16893 + }, + { + "epoch": 0.605010116926603, + "grad_norm": 1.7206227779388428, + "learning_rate": 7.125712790786277e-05, + "loss": 1.0985, + "step": 16894 + }, + { + "epoch": 0.6050459290561713, + "grad_norm": 1.5747525691986084, + "learning_rate": 7.124601854309077e-05, + "loss": 1.1458, + "step": 16895 + }, + { + "epoch": 0.6050817411857397, + "grad_norm": 2.1660988330841064, + "learning_rate": 7.123490956516649e-05, + "loss": 1.2407, + "step": 16896 + }, + { + "epoch": 0.6051175533153079, + "grad_norm": 1.4720816612243652, + "learning_rate": 7.122380097423941e-05, + "loss": 1.4492, + "step": 16897 + }, + { + "epoch": 0.6051533654448762, + "grad_norm": 1.5015136003494263, + "learning_rate": 7.121269277045894e-05, + "loss": 1.0466, + "step": 16898 + }, + { + "epoch": 0.6051891775744445, + "grad_norm": 1.583584189414978, + "learning_rate": 7.120158495397459e-05, + "loss": 1.3076, + "step": 16899 + }, + { + "epoch": 0.6052249897040127, + "grad_norm": 1.419739007949829, + "learning_rate": 7.119047752493576e-05, + "loss": 0.9678, + "step": 16900 + }, + { + "epoch": 0.605260801833581, + "grad_norm": 1.2484102249145508, + "learning_rate": 7.117937048349188e-05, + "loss": 1.1874, + "step": 16901 + }, + { + "epoch": 0.6052966139631493, + "grad_norm": 1.539975881576538, + "learning_rate": 7.11682638297924e-05, + "loss": 1.3393, + "step": 16902 + }, + { + "epoch": 0.6053324260927176, + "grad_norm": 1.4303542375564575, + "learning_rate": 7.115715756398674e-05, + "loss": 1.1393, + "step": 16903 + }, + { + "epoch": 0.6053682382222859, + "grad_norm": 1.4974884986877441, + "learning_rate": 7.114605168622432e-05, + "loss": 1.3568, + "step": 16904 + }, + { + "epoch": 0.6054040503518542, + "grad_norm": 1.6270791292190552, + "learning_rate": 7.113494619665456e-05, + "loss": 1.126, + "step": 16905 + }, + { + "epoch": 0.6054398624814225, + "grad_norm": 1.9134361743927002, + "learning_rate": 7.112384109542687e-05, + "loss": 0.9888, + "step": 16906 + }, + { + "epoch": 0.6054756746109907, + "grad_norm": 1.6632472276687622, + "learning_rate": 7.111273638269063e-05, + "loss": 1.211, + "step": 16907 + }, + { + "epoch": 0.605511486740559, + "grad_norm": 1.5220134258270264, + "learning_rate": 7.110163205859528e-05, + "loss": 1.1139, + "step": 16908 + }, + { + "epoch": 0.6055472988701273, + "grad_norm": 1.4163203239440918, + "learning_rate": 7.109052812329023e-05, + "loss": 0.904, + "step": 16909 + }, + { + "epoch": 0.6055831109996956, + "grad_norm": 2.069554090499878, + "learning_rate": 7.107942457692475e-05, + "loss": 1.2628, + "step": 16910 + }, + { + "epoch": 0.6056189231292639, + "grad_norm": 1.2999815940856934, + "learning_rate": 7.106832141964839e-05, + "loss": 1.0222, + "step": 16911 + }, + { + "epoch": 0.6056547352588322, + "grad_norm": 1.4514079093933105, + "learning_rate": 7.105721865161037e-05, + "loss": 1.1006, + "step": 16912 + }, + { + "epoch": 0.6056905473884004, + "grad_norm": 1.5512773990631104, + "learning_rate": 7.104611627296018e-05, + "loss": 1.059, + "step": 16913 + }, + { + "epoch": 0.6057263595179687, + "grad_norm": 1.4401503801345825, + "learning_rate": 7.103501428384714e-05, + "loss": 1.1179, + "step": 16914 + }, + { + "epoch": 0.605762171647537, + "grad_norm": 1.6256104707717896, + "learning_rate": 7.102391268442062e-05, + "loss": 1.0505, + "step": 16915 + }, + { + "epoch": 0.6057979837771053, + "grad_norm": 1.3716914653778076, + "learning_rate": 7.101281147482996e-05, + "loss": 1.0917, + "step": 16916 + }, + { + "epoch": 0.6058337959066736, + "grad_norm": 1.3700370788574219, + "learning_rate": 7.100171065522457e-05, + "loss": 1.0189, + "step": 16917 + }, + { + "epoch": 0.6058696080362419, + "grad_norm": 1.492182731628418, + "learning_rate": 7.099061022575377e-05, + "loss": 1.1089, + "step": 16918 + }, + { + "epoch": 0.6059054201658102, + "grad_norm": 1.3733375072479248, + "learning_rate": 7.097951018656683e-05, + "loss": 1.1361, + "step": 16919 + }, + { + "epoch": 0.6059412322953784, + "grad_norm": 1.5312672853469849, + "learning_rate": 7.09684105378132e-05, + "loss": 1.1425, + "step": 16920 + }, + { + "epoch": 0.6059770444249467, + "grad_norm": 1.4700000286102295, + "learning_rate": 7.095731127964211e-05, + "loss": 1.1031, + "step": 16921 + }, + { + "epoch": 0.606012856554515, + "grad_norm": 1.4739962816238403, + "learning_rate": 7.0946212412203e-05, + "loss": 1.4204, + "step": 16922 + }, + { + "epoch": 0.6060486686840832, + "grad_norm": 1.489272952079773, + "learning_rate": 7.093511393564504e-05, + "loss": 1.231, + "step": 16923 + }, + { + "epoch": 0.6060844808136516, + "grad_norm": 1.8832443952560425, + "learning_rate": 7.092401585011771e-05, + "loss": 1.0163, + "step": 16924 + }, + { + "epoch": 0.6061202929432199, + "grad_norm": 1.3813743591308594, + "learning_rate": 7.091291815577022e-05, + "loss": 1.2018, + "step": 16925 + }, + { + "epoch": 0.6061561050727882, + "grad_norm": 1.5330426692962646, + "learning_rate": 7.090182085275185e-05, + "loss": 1.1446, + "step": 16926 + }, + { + "epoch": 0.6061919172023564, + "grad_norm": 1.4137961864471436, + "learning_rate": 7.089072394121201e-05, + "loss": 1.1019, + "step": 16927 + }, + { + "epoch": 0.6062277293319247, + "grad_norm": 1.5793870687484741, + "learning_rate": 7.087962742129988e-05, + "loss": 1.0983, + "step": 16928 + }, + { + "epoch": 0.606263541461493, + "grad_norm": 1.3170424699783325, + "learning_rate": 7.086853129316484e-05, + "loss": 1.1126, + "step": 16929 + }, + { + "epoch": 0.6062993535910612, + "grad_norm": 1.5165075063705444, + "learning_rate": 7.085743555695609e-05, + "loss": 0.8941, + "step": 16930 + }, + { + "epoch": 0.6063351657206296, + "grad_norm": 1.7465806007385254, + "learning_rate": 7.084634021282301e-05, + "loss": 1.2718, + "step": 16931 + }, + { + "epoch": 0.6063709778501979, + "grad_norm": 1.4588038921356201, + "learning_rate": 7.083524526091475e-05, + "loss": 1.1108, + "step": 16932 + }, + { + "epoch": 0.6064067899797662, + "grad_norm": 1.6344404220581055, + "learning_rate": 7.082415070138071e-05, + "loss": 1.154, + "step": 16933 + }, + { + "epoch": 0.6064426021093344, + "grad_norm": 1.5430370569229126, + "learning_rate": 7.081305653437007e-05, + "loss": 1.0821, + "step": 16934 + }, + { + "epoch": 0.6064784142389027, + "grad_norm": 1.4800138473510742, + "learning_rate": 7.080196276003209e-05, + "loss": 1.0395, + "step": 16935 + }, + { + "epoch": 0.606514226368471, + "grad_norm": 1.6253000497817993, + "learning_rate": 7.079086937851604e-05, + "loss": 1.0265, + "step": 16936 + }, + { + "epoch": 0.6065500384980392, + "grad_norm": 1.4506663084030151, + "learning_rate": 7.077977638997117e-05, + "loss": 1.1767, + "step": 16937 + }, + { + "epoch": 0.6065858506276076, + "grad_norm": 1.8696798086166382, + "learning_rate": 7.076868379454673e-05, + "loss": 1.1823, + "step": 16938 + }, + { + "epoch": 0.6066216627571759, + "grad_norm": 1.4949595928192139, + "learning_rate": 7.07575915923919e-05, + "loss": 1.0591, + "step": 16939 + }, + { + "epoch": 0.6066574748867442, + "grad_norm": 1.6158320903778076, + "learning_rate": 7.074649978365602e-05, + "loss": 0.9395, + "step": 16940 + }, + { + "epoch": 0.6066932870163124, + "grad_norm": 1.7895283699035645, + "learning_rate": 7.073540836848817e-05, + "loss": 1.1486, + "step": 16941 + }, + { + "epoch": 0.6067290991458807, + "grad_norm": 1.4930275678634644, + "learning_rate": 7.072431734703772e-05, + "loss": 1.363, + "step": 16942 + }, + { + "epoch": 0.606764911275449, + "grad_norm": 1.845636248588562, + "learning_rate": 7.071322671945382e-05, + "loss": 1.1575, + "step": 16943 + }, + { + "epoch": 0.6068007234050172, + "grad_norm": 1.5221506357192993, + "learning_rate": 7.070213648588564e-05, + "loss": 1.1726, + "step": 16944 + }, + { + "epoch": 0.6068365355345856, + "grad_norm": 1.5016177892684937, + "learning_rate": 7.069104664648244e-05, + "loss": 1.0682, + "step": 16945 + }, + { + "epoch": 0.6068723476641539, + "grad_norm": 1.708628535270691, + "learning_rate": 7.06799572013934e-05, + "loss": 1.2354, + "step": 16946 + }, + { + "epoch": 0.6069081597937221, + "grad_norm": 1.4104335308074951, + "learning_rate": 7.066886815076771e-05, + "loss": 1.0644, + "step": 16947 + }, + { + "epoch": 0.6069439719232904, + "grad_norm": 1.4835561513900757, + "learning_rate": 7.065777949475456e-05, + "loss": 1.1228, + "step": 16948 + }, + { + "epoch": 0.6069797840528587, + "grad_norm": 1.363732099533081, + "learning_rate": 7.064669123350316e-05, + "loss": 1.0456, + "step": 16949 + }, + { + "epoch": 0.607015596182427, + "grad_norm": 1.3670978546142578, + "learning_rate": 7.063560336716263e-05, + "loss": 1.0165, + "step": 16950 + }, + { + "epoch": 0.6070514083119952, + "grad_norm": 1.549312949180603, + "learning_rate": 7.062451589588221e-05, + "loss": 0.9289, + "step": 16951 + }, + { + "epoch": 0.6070872204415636, + "grad_norm": 1.6258875131607056, + "learning_rate": 7.061342881981105e-05, + "loss": 1.1047, + "step": 16952 + }, + { + "epoch": 0.6071230325711319, + "grad_norm": 1.1335524320602417, + "learning_rate": 7.060234213909826e-05, + "loss": 1.0649, + "step": 16953 + }, + { + "epoch": 0.6071588447007001, + "grad_norm": 1.4229786396026611, + "learning_rate": 7.059125585389306e-05, + "loss": 1.1678, + "step": 16954 + }, + { + "epoch": 0.6071946568302684, + "grad_norm": 1.4328641891479492, + "learning_rate": 7.058016996434455e-05, + "loss": 1.2425, + "step": 16955 + }, + { + "epoch": 0.6072304689598367, + "grad_norm": 1.6336426734924316, + "learning_rate": 7.056908447060195e-05, + "loss": 1.1704, + "step": 16956 + }, + { + "epoch": 0.607266281089405, + "grad_norm": 1.256283164024353, + "learning_rate": 7.055799937281432e-05, + "loss": 1.25, + "step": 16957 + }, + { + "epoch": 0.6073020932189732, + "grad_norm": 1.5210916996002197, + "learning_rate": 7.054691467113085e-05, + "loss": 1.1149, + "step": 16958 + }, + { + "epoch": 0.6073379053485416, + "grad_norm": 1.453985571861267, + "learning_rate": 7.053583036570064e-05, + "loss": 1.0083, + "step": 16959 + }, + { + "epoch": 0.6073737174781099, + "grad_norm": 1.5923434495925903, + "learning_rate": 7.052474645667283e-05, + "loss": 1.1629, + "step": 16960 + }, + { + "epoch": 0.6074095296076781, + "grad_norm": 1.5836217403411865, + "learning_rate": 7.051366294419655e-05, + "loss": 0.9803, + "step": 16961 + }, + { + "epoch": 0.6074453417372464, + "grad_norm": 1.5548338890075684, + "learning_rate": 7.050257982842088e-05, + "loss": 1.1771, + "step": 16962 + }, + { + "epoch": 0.6074811538668147, + "grad_norm": 1.4686635732650757, + "learning_rate": 7.049149710949497e-05, + "loss": 1.114, + "step": 16963 + }, + { + "epoch": 0.6075169659963829, + "grad_norm": 1.2782706022262573, + "learning_rate": 7.048041478756786e-05, + "loss": 1.1761, + "step": 16964 + }, + { + "epoch": 0.6075527781259512, + "grad_norm": 1.6733824014663696, + "learning_rate": 7.046933286278874e-05, + "loss": 0.9587, + "step": 16965 + }, + { + "epoch": 0.6075885902555196, + "grad_norm": 1.275881052017212, + "learning_rate": 7.04582513353066e-05, + "loss": 1.0334, + "step": 16966 + }, + { + "epoch": 0.6076244023850879, + "grad_norm": 1.34718918800354, + "learning_rate": 7.044717020527065e-05, + "loss": 1.159, + "step": 16967 + }, + { + "epoch": 0.6076602145146561, + "grad_norm": 1.476212739944458, + "learning_rate": 7.043608947282985e-05, + "loss": 1.2614, + "step": 16968 + }, + { + "epoch": 0.6076960266442244, + "grad_norm": 1.7217010259628296, + "learning_rate": 7.042500913813337e-05, + "loss": 1.2254, + "step": 16969 + }, + { + "epoch": 0.6077318387737927, + "grad_norm": 1.3594708442687988, + "learning_rate": 7.041392920133024e-05, + "loss": 1.0035, + "step": 16970 + }, + { + "epoch": 0.6077676509033609, + "grad_norm": 1.458707332611084, + "learning_rate": 7.040284966256949e-05, + "loss": 1.2069, + "step": 16971 + }, + { + "epoch": 0.6078034630329292, + "grad_norm": 1.5520296096801758, + "learning_rate": 7.039177052200026e-05, + "loss": 1.0697, + "step": 16972 + }, + { + "epoch": 0.6078392751624976, + "grad_norm": 1.6134346723556519, + "learning_rate": 7.038069177977153e-05, + "loss": 1.0001, + "step": 16973 + }, + { + "epoch": 0.6078750872920659, + "grad_norm": 2.009611129760742, + "learning_rate": 7.036961343603243e-05, + "loss": 1.1811, + "step": 16974 + }, + { + "epoch": 0.6079108994216341, + "grad_norm": 1.5405566692352295, + "learning_rate": 7.035853549093192e-05, + "loss": 1.1223, + "step": 16975 + }, + { + "epoch": 0.6079467115512024, + "grad_norm": 1.516474962234497, + "learning_rate": 7.034745794461912e-05, + "loss": 1.1977, + "step": 16976 + }, + { + "epoch": 0.6079825236807707, + "grad_norm": 1.5350171327590942, + "learning_rate": 7.033638079724298e-05, + "loss": 1.2292, + "step": 16977 + }, + { + "epoch": 0.6080183358103389, + "grad_norm": 1.8312493562698364, + "learning_rate": 7.032530404895262e-05, + "loss": 1.2009, + "step": 16978 + }, + { + "epoch": 0.6080541479399072, + "grad_norm": 1.484458565711975, + "learning_rate": 7.0314227699897e-05, + "loss": 1.0741, + "step": 16979 + }, + { + "epoch": 0.6080899600694756, + "grad_norm": 1.3698830604553223, + "learning_rate": 7.030315175022513e-05, + "loss": 1.0251, + "step": 16980 + }, + { + "epoch": 0.6081257721990438, + "grad_norm": 1.3422988653182983, + "learning_rate": 7.029207620008606e-05, + "loss": 1.207, + "step": 16981 + }, + { + "epoch": 0.6081615843286121, + "grad_norm": 1.2976946830749512, + "learning_rate": 7.028100104962878e-05, + "loss": 1.0465, + "step": 16982 + }, + { + "epoch": 0.6081973964581804, + "grad_norm": 1.6317991018295288, + "learning_rate": 7.026992629900232e-05, + "loss": 1.1547, + "step": 16983 + }, + { + "epoch": 0.6082332085877487, + "grad_norm": 1.4380658864974976, + "learning_rate": 7.025885194835562e-05, + "loss": 1.0108, + "step": 16984 + }, + { + "epoch": 0.6082690207173169, + "grad_norm": 2.551555871963501, + "learning_rate": 7.024777799783774e-05, + "loss": 1.2843, + "step": 16985 + }, + { + "epoch": 0.6083048328468852, + "grad_norm": 1.5001152753829956, + "learning_rate": 7.02367044475976e-05, + "loss": 0.9256, + "step": 16986 + }, + { + "epoch": 0.6083406449764536, + "grad_norm": 1.546624779701233, + "learning_rate": 7.022563129778422e-05, + "loss": 1.2399, + "step": 16987 + }, + { + "epoch": 0.6083764571060218, + "grad_norm": 1.7054249048233032, + "learning_rate": 7.021455854854657e-05, + "loss": 1.2746, + "step": 16988 + }, + { + "epoch": 0.6084122692355901, + "grad_norm": 1.2321168184280396, + "learning_rate": 7.020348620003361e-05, + "loss": 1.2208, + "step": 16989 + }, + { + "epoch": 0.6084480813651584, + "grad_norm": 1.2917568683624268, + "learning_rate": 7.019241425239432e-05, + "loss": 1.0938, + "step": 16990 + }, + { + "epoch": 0.6084838934947266, + "grad_norm": 1.5280476808547974, + "learning_rate": 7.018134270577761e-05, + "loss": 1.0886, + "step": 16991 + }, + { + "epoch": 0.6085197056242949, + "grad_norm": 1.3356908559799194, + "learning_rate": 7.017027156033252e-05, + "loss": 1.2868, + "step": 16992 + }, + { + "epoch": 0.6085555177538632, + "grad_norm": 1.6619542837142944, + "learning_rate": 7.01592008162079e-05, + "loss": 1.0014, + "step": 16993 + }, + { + "epoch": 0.6085913298834316, + "grad_norm": 1.7739572525024414, + "learning_rate": 7.014813047355277e-05, + "loss": 1.0315, + "step": 16994 + }, + { + "epoch": 0.6086271420129998, + "grad_norm": 1.7023760080337524, + "learning_rate": 7.013706053251603e-05, + "loss": 1.1564, + "step": 16995 + }, + { + "epoch": 0.6086629541425681, + "grad_norm": 1.4887381792068481, + "learning_rate": 7.012599099324662e-05, + "loss": 1.0083, + "step": 16996 + }, + { + "epoch": 0.6086987662721364, + "grad_norm": 1.6408871412277222, + "learning_rate": 7.011492185589349e-05, + "loss": 1.1796, + "step": 16997 + }, + { + "epoch": 0.6087345784017046, + "grad_norm": 1.3905456066131592, + "learning_rate": 7.01038531206055e-05, + "loss": 1.1741, + "step": 16998 + }, + { + "epoch": 0.6087703905312729, + "grad_norm": 1.3378123044967651, + "learning_rate": 7.009278478753162e-05, + "loss": 0.9748, + "step": 16999 + }, + { + "epoch": 0.6088062026608412, + "grad_norm": 1.6208394765853882, + "learning_rate": 7.008171685682074e-05, + "loss": 0.8285, + "step": 17000 + }, + { + "epoch": 0.6088420147904096, + "grad_norm": 1.8872987031936646, + "learning_rate": 7.007064932862178e-05, + "loss": 1.0601, + "step": 17001 + }, + { + "epoch": 0.6088778269199778, + "grad_norm": 1.4856820106506348, + "learning_rate": 7.005958220308362e-05, + "loss": 1.1385, + "step": 17002 + }, + { + "epoch": 0.6089136390495461, + "grad_norm": 1.4458017349243164, + "learning_rate": 7.004851548035516e-05, + "loss": 0.9065, + "step": 17003 + }, + { + "epoch": 0.6089494511791144, + "grad_norm": 1.7324843406677246, + "learning_rate": 7.003744916058528e-05, + "loss": 1.0059, + "step": 17004 + }, + { + "epoch": 0.6089852633086826, + "grad_norm": 1.7534056901931763, + "learning_rate": 7.00263832439229e-05, + "loss": 1.0494, + "step": 17005 + }, + { + "epoch": 0.6090210754382509, + "grad_norm": 1.473981499671936, + "learning_rate": 7.001531773051688e-05, + "loss": 0.9222, + "step": 17006 + }, + { + "epoch": 0.6090568875678192, + "grad_norm": 1.573520302772522, + "learning_rate": 7.000425262051602e-05, + "loss": 1.3379, + "step": 17007 + }, + { + "epoch": 0.6090926996973876, + "grad_norm": 1.5718022584915161, + "learning_rate": 6.999318791406931e-05, + "loss": 1.0539, + "step": 17008 + }, + { + "epoch": 0.6091285118269558, + "grad_norm": 1.4498018026351929, + "learning_rate": 6.998212361132549e-05, + "loss": 1.0478, + "step": 17009 + }, + { + "epoch": 0.6091643239565241, + "grad_norm": 1.382986068725586, + "learning_rate": 6.997105971243352e-05, + "loss": 1.1553, + "step": 17010 + }, + { + "epoch": 0.6092001360860924, + "grad_norm": 1.5328130722045898, + "learning_rate": 6.995999621754219e-05, + "loss": 1.0979, + "step": 17011 + }, + { + "epoch": 0.6092359482156606, + "grad_norm": 1.443288803100586, + "learning_rate": 6.994893312680037e-05, + "loss": 1.1997, + "step": 17012 + }, + { + "epoch": 0.6092717603452289, + "grad_norm": 1.407687783241272, + "learning_rate": 6.99378704403569e-05, + "loss": 1.1641, + "step": 17013 + }, + { + "epoch": 0.6093075724747972, + "grad_norm": 1.2965140342712402, + "learning_rate": 6.99268081583606e-05, + "loss": 1.3079, + "step": 17014 + }, + { + "epoch": 0.6093433846043655, + "grad_norm": 1.3295387029647827, + "learning_rate": 6.991574628096033e-05, + "loss": 1.0896, + "step": 17015 + }, + { + "epoch": 0.6093791967339338, + "grad_norm": 1.6502915620803833, + "learning_rate": 6.990468480830482e-05, + "loss": 0.9583, + "step": 17016 + }, + { + "epoch": 0.6094150088635021, + "grad_norm": 1.870396614074707, + "learning_rate": 6.989362374054302e-05, + "loss": 1.072, + "step": 17017 + }, + { + "epoch": 0.6094508209930704, + "grad_norm": 1.905776023864746, + "learning_rate": 6.988256307782363e-05, + "loss": 1.0914, + "step": 17018 + }, + { + "epoch": 0.6094866331226386, + "grad_norm": 1.276781678199768, + "learning_rate": 6.987150282029555e-05, + "loss": 0.9568, + "step": 17019 + }, + { + "epoch": 0.6095224452522069, + "grad_norm": 1.97832453250885, + "learning_rate": 6.986044296810749e-05, + "loss": 1.1905, + "step": 17020 + }, + { + "epoch": 0.6095582573817752, + "grad_norm": 1.5194975137710571, + "learning_rate": 6.984938352140835e-05, + "loss": 1.1885, + "step": 17021 + }, + { + "epoch": 0.6095940695113435, + "grad_norm": 1.608293890953064, + "learning_rate": 6.983832448034684e-05, + "loss": 1.0199, + "step": 17022 + }, + { + "epoch": 0.6096298816409118, + "grad_norm": 1.9031078815460205, + "learning_rate": 6.982726584507173e-05, + "loss": 1.1501, + "step": 17023 + }, + { + "epoch": 0.6096656937704801, + "grad_norm": 1.3913733959197998, + "learning_rate": 6.981620761573188e-05, + "loss": 1.034, + "step": 17024 + }, + { + "epoch": 0.6097015059000483, + "grad_norm": 1.547910213470459, + "learning_rate": 6.980514979247599e-05, + "loss": 1.2468, + "step": 17025 + }, + { + "epoch": 0.6097373180296166, + "grad_norm": 2.4710254669189453, + "learning_rate": 6.979409237545291e-05, + "loss": 1.0557, + "step": 17026 + }, + { + "epoch": 0.6097731301591849, + "grad_norm": 1.622068166732788, + "learning_rate": 6.97830353648113e-05, + "loss": 1.2588, + "step": 17027 + }, + { + "epoch": 0.6098089422887532, + "grad_norm": 1.5548193454742432, + "learning_rate": 6.977197876070003e-05, + "loss": 1.0124, + "step": 17028 + }, + { + "epoch": 0.6098447544183215, + "grad_norm": 1.3400119543075562, + "learning_rate": 6.976092256326772e-05, + "loss": 0.9726, + "step": 17029 + }, + { + "epoch": 0.6098805665478898, + "grad_norm": 2.0034866333007812, + "learning_rate": 6.974986677266326e-05, + "loss": 1.0628, + "step": 17030 + }, + { + "epoch": 0.6099163786774581, + "grad_norm": 1.6469800472259521, + "learning_rate": 6.973881138903531e-05, + "loss": 1.1954, + "step": 17031 + }, + { + "epoch": 0.6099521908070263, + "grad_norm": 1.5626568794250488, + "learning_rate": 6.972775641253259e-05, + "loss": 1.0278, + "step": 17032 + }, + { + "epoch": 0.6099880029365946, + "grad_norm": 1.681544542312622, + "learning_rate": 6.971670184330389e-05, + "loss": 1.1278, + "step": 17033 + }, + { + "epoch": 0.6100238150661629, + "grad_norm": 1.5140455961227417, + "learning_rate": 6.970564768149788e-05, + "loss": 1.0229, + "step": 17034 + }, + { + "epoch": 0.6100596271957311, + "grad_norm": 1.481037974357605, + "learning_rate": 6.969459392726331e-05, + "loss": 1.0988, + "step": 17035 + }, + { + "epoch": 0.6100954393252995, + "grad_norm": 1.4333049058914185, + "learning_rate": 6.968354058074887e-05, + "loss": 0.9414, + "step": 17036 + }, + { + "epoch": 0.6101312514548678, + "grad_norm": 2.0365042686462402, + "learning_rate": 6.967248764210333e-05, + "loss": 1.0568, + "step": 17037 + }, + { + "epoch": 0.6101670635844361, + "grad_norm": 1.5593065023422241, + "learning_rate": 6.966143511147529e-05, + "loss": 0.9567, + "step": 17038 + }, + { + "epoch": 0.6102028757140043, + "grad_norm": 1.9651585817337036, + "learning_rate": 6.965038298901356e-05, + "loss": 1.0788, + "step": 17039 + }, + { + "epoch": 0.6102386878435726, + "grad_norm": 2.3008310794830322, + "learning_rate": 6.963933127486677e-05, + "loss": 1.0666, + "step": 17040 + }, + { + "epoch": 0.6102744999731409, + "grad_norm": 1.5836769342422485, + "learning_rate": 6.96282799691836e-05, + "loss": 1.1122, + "step": 17041 + }, + { + "epoch": 0.6103103121027091, + "grad_norm": 2.0701847076416016, + "learning_rate": 6.961722907211277e-05, + "loss": 1.1086, + "step": 17042 + }, + { + "epoch": 0.6103461242322775, + "grad_norm": 1.5872652530670166, + "learning_rate": 6.96061785838029e-05, + "loss": 1.0925, + "step": 17043 + }, + { + "epoch": 0.6103819363618458, + "grad_norm": 1.5449596643447876, + "learning_rate": 6.95951285044027e-05, + "loss": 1.1968, + "step": 17044 + }, + { + "epoch": 0.6104177484914141, + "grad_norm": 1.301332950592041, + "learning_rate": 6.958407883406082e-05, + "loss": 1.2035, + "step": 17045 + }, + { + "epoch": 0.6104535606209823, + "grad_norm": 1.2667808532714844, + "learning_rate": 6.957302957292596e-05, + "loss": 1.0425, + "step": 17046 + }, + { + "epoch": 0.6104893727505506, + "grad_norm": 1.4576541185379028, + "learning_rate": 6.956198072114669e-05, + "loss": 1.0441, + "step": 17047 + }, + { + "epoch": 0.6105251848801189, + "grad_norm": 1.458867073059082, + "learning_rate": 6.955093227887175e-05, + "loss": 1.0882, + "step": 17048 + }, + { + "epoch": 0.6105609970096871, + "grad_norm": 1.38541579246521, + "learning_rate": 6.953988424624973e-05, + "loss": 1.1449, + "step": 17049 + }, + { + "epoch": 0.6105968091392555, + "grad_norm": 1.8590021133422852, + "learning_rate": 6.952883662342926e-05, + "loss": 1.0606, + "step": 17050 + }, + { + "epoch": 0.6106326212688238, + "grad_norm": 1.678748369216919, + "learning_rate": 6.9517789410559e-05, + "loss": 1.2088, + "step": 17051 + }, + { + "epoch": 0.610668433398392, + "grad_norm": 1.499155879020691, + "learning_rate": 6.950674260778755e-05, + "loss": 1.0652, + "step": 17052 + }, + { + "epoch": 0.6107042455279603, + "grad_norm": 1.377507209777832, + "learning_rate": 6.949569621526357e-05, + "loss": 1.0732, + "step": 17053 + }, + { + "epoch": 0.6107400576575286, + "grad_norm": 1.4800876379013062, + "learning_rate": 6.948465023313562e-05, + "loss": 1.0501, + "step": 17054 + }, + { + "epoch": 0.6107758697870969, + "grad_norm": 1.4101285934448242, + "learning_rate": 6.947360466155237e-05, + "loss": 1.1103, + "step": 17055 + }, + { + "epoch": 0.6108116819166651, + "grad_norm": 1.7818716764450073, + "learning_rate": 6.946255950066236e-05, + "loss": 1.1977, + "step": 17056 + }, + { + "epoch": 0.6108474940462335, + "grad_norm": 1.5294722318649292, + "learning_rate": 6.945151475061425e-05, + "loss": 1.1837, + "step": 17057 + }, + { + "epoch": 0.6108833061758018, + "grad_norm": 1.7690221071243286, + "learning_rate": 6.944047041155662e-05, + "loss": 1.3266, + "step": 17058 + }, + { + "epoch": 0.61091911830537, + "grad_norm": 1.357926607131958, + "learning_rate": 6.9429426483638e-05, + "loss": 1.0513, + "step": 17059 + }, + { + "epoch": 0.6109549304349383, + "grad_norm": 1.5396602153778076, + "learning_rate": 6.941838296700703e-05, + "loss": 1.1489, + "step": 17060 + }, + { + "epoch": 0.6109907425645066, + "grad_norm": 1.7319899797439575, + "learning_rate": 6.940733986181226e-05, + "loss": 1.0958, + "step": 17061 + }, + { + "epoch": 0.6110265546940749, + "grad_norm": 1.590110182762146, + "learning_rate": 6.939629716820229e-05, + "loss": 1.0516, + "step": 17062 + }, + { + "epoch": 0.6110623668236431, + "grad_norm": 1.502112627029419, + "learning_rate": 6.938525488632563e-05, + "loss": 1.2458, + "step": 17063 + }, + { + "epoch": 0.6110981789532115, + "grad_norm": 1.6913044452667236, + "learning_rate": 6.937421301633091e-05, + "loss": 0.9642, + "step": 17064 + }, + { + "epoch": 0.6111339910827798, + "grad_norm": 1.652740478515625, + "learning_rate": 6.936317155836664e-05, + "loss": 1.0294, + "step": 17065 + }, + { + "epoch": 0.611169803212348, + "grad_norm": 1.4555799961090088, + "learning_rate": 6.935213051258138e-05, + "loss": 0.9437, + "step": 17066 + }, + { + "epoch": 0.6112056153419163, + "grad_norm": 1.5171189308166504, + "learning_rate": 6.934108987912369e-05, + "loss": 1.1371, + "step": 17067 + }, + { + "epoch": 0.6112414274714846, + "grad_norm": 1.6243114471435547, + "learning_rate": 6.933004965814205e-05, + "loss": 1.316, + "step": 17068 + }, + { + "epoch": 0.6112772396010528, + "grad_norm": 1.5315829515457153, + "learning_rate": 6.931900984978506e-05, + "loss": 0.8964, + "step": 17069 + }, + { + "epoch": 0.6113130517306211, + "grad_norm": 1.3494787216186523, + "learning_rate": 6.930797045420119e-05, + "loss": 1.0924, + "step": 17070 + }, + { + "epoch": 0.6113488638601895, + "grad_norm": 1.6444087028503418, + "learning_rate": 6.929693147153902e-05, + "loss": 1.0943, + "step": 17071 + }, + { + "epoch": 0.6113846759897578, + "grad_norm": 1.2709184885025024, + "learning_rate": 6.9285892901947e-05, + "loss": 1.0604, + "step": 17072 + }, + { + "epoch": 0.611420488119326, + "grad_norm": 1.4541912078857422, + "learning_rate": 6.927485474557369e-05, + "loss": 1.2096, + "step": 17073 + }, + { + "epoch": 0.6114563002488943, + "grad_norm": 1.6561377048492432, + "learning_rate": 6.926381700256757e-05, + "loss": 1.2387, + "step": 17074 + }, + { + "epoch": 0.6114921123784626, + "grad_norm": 1.4413856267929077, + "learning_rate": 6.925277967307717e-05, + "loss": 1.2654, + "step": 17075 + }, + { + "epoch": 0.6115279245080308, + "grad_norm": 1.4587289094924927, + "learning_rate": 6.924174275725094e-05, + "loss": 1.1655, + "step": 17076 + }, + { + "epoch": 0.6115637366375991, + "grad_norm": 1.4309983253479004, + "learning_rate": 6.923070625523737e-05, + "loss": 1.257, + "step": 17077 + }, + { + "epoch": 0.6115995487671675, + "grad_norm": 1.2323195934295654, + "learning_rate": 6.921967016718499e-05, + "loss": 0.8654, + "step": 17078 + }, + { + "epoch": 0.6116353608967358, + "grad_norm": 1.3314993381500244, + "learning_rate": 6.920863449324221e-05, + "loss": 1.1153, + "step": 17079 + }, + { + "epoch": 0.611671173026304, + "grad_norm": 1.7772375345230103, + "learning_rate": 6.919759923355756e-05, + "loss": 1.1173, + "step": 17080 + }, + { + "epoch": 0.6117069851558723, + "grad_norm": 2.3861215114593506, + "learning_rate": 6.918656438827946e-05, + "loss": 1.1117, + "step": 17081 + }, + { + "epoch": 0.6117427972854406, + "grad_norm": 1.4399361610412598, + "learning_rate": 6.917552995755641e-05, + "loss": 1.0891, + "step": 17082 + }, + { + "epoch": 0.6117786094150088, + "grad_norm": 1.8172500133514404, + "learning_rate": 6.916449594153682e-05, + "loss": 1.0697, + "step": 17083 + }, + { + "epoch": 0.6118144215445771, + "grad_norm": 1.3348246812820435, + "learning_rate": 6.915346234036919e-05, + "loss": 0.8241, + "step": 17084 + }, + { + "epoch": 0.6118502336741455, + "grad_norm": 1.305233359336853, + "learning_rate": 6.914242915420193e-05, + "loss": 0.917, + "step": 17085 + }, + { + "epoch": 0.6118860458037138, + "grad_norm": 1.447711706161499, + "learning_rate": 6.913139638318346e-05, + "loss": 1.077, + "step": 17086 + }, + { + "epoch": 0.611921857933282, + "grad_norm": 1.4804404973983765, + "learning_rate": 6.912036402746227e-05, + "loss": 0.9798, + "step": 17087 + }, + { + "epoch": 0.6119576700628503, + "grad_norm": 1.8317739963531494, + "learning_rate": 6.910933208718671e-05, + "loss": 1.094, + "step": 17088 + }, + { + "epoch": 0.6119934821924186, + "grad_norm": 1.2557854652404785, + "learning_rate": 6.909830056250527e-05, + "loss": 1.0832, + "step": 17089 + }, + { + "epoch": 0.6120292943219868, + "grad_norm": 1.7507680654525757, + "learning_rate": 6.908726945356632e-05, + "loss": 1.1024, + "step": 17090 + }, + { + "epoch": 0.6120651064515551, + "grad_norm": 1.5094844102859497, + "learning_rate": 6.90762387605183e-05, + "loss": 1.2191, + "step": 17091 + }, + { + "epoch": 0.6121009185811235, + "grad_norm": 1.8460692167282104, + "learning_rate": 6.906520848350957e-05, + "loss": 1.0973, + "step": 17092 + }, + { + "epoch": 0.6121367307106917, + "grad_norm": 1.7898225784301758, + "learning_rate": 6.905417862268859e-05, + "loss": 1.2088, + "step": 17093 + }, + { + "epoch": 0.61217254284026, + "grad_norm": 1.6092700958251953, + "learning_rate": 6.904314917820371e-05, + "loss": 1.0668, + "step": 17094 + }, + { + "epoch": 0.6122083549698283, + "grad_norm": 1.5845518112182617, + "learning_rate": 6.90321201502033e-05, + "loss": 1.1278, + "step": 17095 + }, + { + "epoch": 0.6122441670993966, + "grad_norm": 2.034120559692383, + "learning_rate": 6.90210915388358e-05, + "loss": 1.0976, + "step": 17096 + }, + { + "epoch": 0.6122799792289648, + "grad_norm": 1.315474271774292, + "learning_rate": 6.901006334424953e-05, + "loss": 0.9568, + "step": 17097 + }, + { + "epoch": 0.6123157913585331, + "grad_norm": 1.4102768898010254, + "learning_rate": 6.89990355665929e-05, + "loss": 1.0499, + "step": 17098 + }, + { + "epoch": 0.6123516034881015, + "grad_norm": 1.4900600910186768, + "learning_rate": 6.898800820601425e-05, + "loss": 1.1141, + "step": 17099 + }, + { + "epoch": 0.6123874156176697, + "grad_norm": 1.5335450172424316, + "learning_rate": 6.897698126266197e-05, + "loss": 1.2314, + "step": 17100 + }, + { + "epoch": 0.612423227747238, + "grad_norm": 1.343018889427185, + "learning_rate": 6.896595473668435e-05, + "loss": 1.1522, + "step": 17101 + }, + { + "epoch": 0.6124590398768063, + "grad_norm": 1.325927495956421, + "learning_rate": 6.89549286282298e-05, + "loss": 0.8616, + "step": 17102 + }, + { + "epoch": 0.6124948520063745, + "grad_norm": 1.2181131839752197, + "learning_rate": 6.894390293744668e-05, + "loss": 1.0675, + "step": 17103 + }, + { + "epoch": 0.6125306641359428, + "grad_norm": 1.493636131286621, + "learning_rate": 6.893287766448321e-05, + "loss": 1.1241, + "step": 17104 + }, + { + "epoch": 0.6125664762655111, + "grad_norm": 1.4189082384109497, + "learning_rate": 6.892185280948786e-05, + "loss": 0.9734, + "step": 17105 + }, + { + "epoch": 0.6126022883950795, + "grad_norm": 1.2516324520111084, + "learning_rate": 6.891082837260885e-05, + "loss": 1.0388, + "step": 17106 + }, + { + "epoch": 0.6126381005246477, + "grad_norm": 1.8084056377410889, + "learning_rate": 6.889980435399456e-05, + "loss": 1.3419, + "step": 17107 + }, + { + "epoch": 0.612673912654216, + "grad_norm": 1.347111463546753, + "learning_rate": 6.888878075379326e-05, + "loss": 1.0138, + "step": 17108 + }, + { + "epoch": 0.6127097247837843, + "grad_norm": 1.6068328619003296, + "learning_rate": 6.887775757215334e-05, + "loss": 1.3149, + "step": 17109 + }, + { + "epoch": 0.6127455369133525, + "grad_norm": 1.6326169967651367, + "learning_rate": 6.886673480922299e-05, + "loss": 1.1571, + "step": 17110 + }, + { + "epoch": 0.6127813490429208, + "grad_norm": 1.471596598625183, + "learning_rate": 6.88557124651506e-05, + "loss": 1.0592, + "step": 17111 + }, + { + "epoch": 0.6128171611724891, + "grad_norm": 1.6426836252212524, + "learning_rate": 6.884469054008444e-05, + "loss": 0.9945, + "step": 17112 + }, + { + "epoch": 0.6128529733020575, + "grad_norm": 1.4090949296951294, + "learning_rate": 6.883366903417273e-05, + "loss": 1.0391, + "step": 17113 + }, + { + "epoch": 0.6128887854316257, + "grad_norm": 1.441649079322815, + "learning_rate": 6.882264794756386e-05, + "loss": 0.9654, + "step": 17114 + }, + { + "epoch": 0.612924597561194, + "grad_norm": 1.8479949235916138, + "learning_rate": 6.881162728040598e-05, + "loss": 1.1714, + "step": 17115 + }, + { + "epoch": 0.6129604096907623, + "grad_norm": 1.4353041648864746, + "learning_rate": 6.880060703284748e-05, + "loss": 1.1625, + "step": 17116 + }, + { + "epoch": 0.6129962218203305, + "grad_norm": 1.5371201038360596, + "learning_rate": 6.878958720503652e-05, + "loss": 1.2768, + "step": 17117 + }, + { + "epoch": 0.6130320339498988, + "grad_norm": 1.7115511894226074, + "learning_rate": 6.877856779712147e-05, + "loss": 1.0464, + "step": 17118 + }, + { + "epoch": 0.6130678460794671, + "grad_norm": 1.3894851207733154, + "learning_rate": 6.876754880925049e-05, + "loss": 1.0863, + "step": 17119 + }, + { + "epoch": 0.6131036582090355, + "grad_norm": 1.3971489667892456, + "learning_rate": 6.87565302415718e-05, + "loss": 1.1279, + "step": 17120 + }, + { + "epoch": 0.6131394703386037, + "grad_norm": 1.3409106731414795, + "learning_rate": 6.874551209423376e-05, + "loss": 1.095, + "step": 17121 + }, + { + "epoch": 0.613175282468172, + "grad_norm": 1.4250898361206055, + "learning_rate": 6.873449436738451e-05, + "loss": 1.0591, + "step": 17122 + }, + { + "epoch": 0.6132110945977403, + "grad_norm": 1.4324144124984741, + "learning_rate": 6.872347706117233e-05, + "loss": 1.0549, + "step": 17123 + }, + { + "epoch": 0.6132469067273085, + "grad_norm": 1.2053589820861816, + "learning_rate": 6.871246017574537e-05, + "loss": 1.0728, + "step": 17124 + }, + { + "epoch": 0.6132827188568768, + "grad_norm": 2.2028231620788574, + "learning_rate": 6.870144371125198e-05, + "loss": 0.9648, + "step": 17125 + }, + { + "epoch": 0.6133185309864451, + "grad_norm": 1.6044602394104004, + "learning_rate": 6.869042766784022e-05, + "loss": 1.0678, + "step": 17126 + }, + { + "epoch": 0.6133543431160134, + "grad_norm": 1.383193016052246, + "learning_rate": 6.867941204565843e-05, + "loss": 0.8681, + "step": 17127 + }, + { + "epoch": 0.6133901552455817, + "grad_norm": 1.4773218631744385, + "learning_rate": 6.866839684485473e-05, + "loss": 1.032, + "step": 17128 + }, + { + "epoch": 0.61342596737515, + "grad_norm": 1.5860134363174438, + "learning_rate": 6.865738206557731e-05, + "loss": 0.9672, + "step": 17129 + }, + { + "epoch": 0.6134617795047183, + "grad_norm": 1.5263041257858276, + "learning_rate": 6.864636770797441e-05, + "loss": 1.1983, + "step": 17130 + }, + { + "epoch": 0.6134975916342865, + "grad_norm": 1.5801773071289062, + "learning_rate": 6.863535377219417e-05, + "loss": 1.1019, + "step": 17131 + }, + { + "epoch": 0.6135334037638548, + "grad_norm": 1.6543699502944946, + "learning_rate": 6.862434025838481e-05, + "loss": 1.2279, + "step": 17132 + }, + { + "epoch": 0.6135692158934231, + "grad_norm": 2.0593678951263428, + "learning_rate": 6.861332716669444e-05, + "loss": 1.2732, + "step": 17133 + }, + { + "epoch": 0.6136050280229914, + "grad_norm": 1.6615407466888428, + "learning_rate": 6.860231449727133e-05, + "loss": 1.1028, + "step": 17134 + }, + { + "epoch": 0.6136408401525597, + "grad_norm": 1.4223542213439941, + "learning_rate": 6.859130225026351e-05, + "loss": 1.051, + "step": 17135 + }, + { + "epoch": 0.613676652282128, + "grad_norm": 1.5264923572540283, + "learning_rate": 6.858029042581926e-05, + "loss": 0.8646, + "step": 17136 + }, + { + "epoch": 0.6137124644116962, + "grad_norm": 1.2369579076766968, + "learning_rate": 6.856927902408666e-05, + "loss": 1.1074, + "step": 17137 + }, + { + "epoch": 0.6137482765412645, + "grad_norm": 1.2843048572540283, + "learning_rate": 6.855826804521386e-05, + "loss": 1.0641, + "step": 17138 + }, + { + "epoch": 0.6137840886708328, + "grad_norm": 1.4180241823196411, + "learning_rate": 6.854725748934901e-05, + "loss": 1.2057, + "step": 17139 + }, + { + "epoch": 0.613819900800401, + "grad_norm": 1.7782917022705078, + "learning_rate": 6.853624735664021e-05, + "loss": 0.907, + "step": 17140 + }, + { + "epoch": 0.6138557129299694, + "grad_norm": 1.6314409971237183, + "learning_rate": 6.852523764723566e-05, + "loss": 1.3029, + "step": 17141 + }, + { + "epoch": 0.6138915250595377, + "grad_norm": 1.2198690176010132, + "learning_rate": 6.85142283612834e-05, + "loss": 1.0881, + "step": 17142 + }, + { + "epoch": 0.613927337189106, + "grad_norm": 1.9340394735336304, + "learning_rate": 6.850321949893162e-05, + "loss": 0.9234, + "step": 17143 + }, + { + "epoch": 0.6139631493186742, + "grad_norm": 1.5240039825439453, + "learning_rate": 6.849221106032837e-05, + "loss": 1.076, + "step": 17144 + }, + { + "epoch": 0.6139989614482425, + "grad_norm": 1.255825161933899, + "learning_rate": 6.84812030456218e-05, + "loss": 1.1215, + "step": 17145 + }, + { + "epoch": 0.6140347735778108, + "grad_norm": 1.55277681350708, + "learning_rate": 6.847019545495998e-05, + "loss": 1.1805, + "step": 17146 + }, + { + "epoch": 0.614070585707379, + "grad_norm": 1.355106234550476, + "learning_rate": 6.845918828849099e-05, + "loss": 0.8344, + "step": 17147 + }, + { + "epoch": 0.6141063978369474, + "grad_norm": 1.7007380723953247, + "learning_rate": 6.844818154636295e-05, + "loss": 1.0379, + "step": 17148 + }, + { + "epoch": 0.6141422099665157, + "grad_norm": 1.3888174295425415, + "learning_rate": 6.843717522872393e-05, + "loss": 1.1021, + "step": 17149 + }, + { + "epoch": 0.614178022096084, + "grad_norm": 1.5486217737197876, + "learning_rate": 6.8426169335722e-05, + "loss": 1.1103, + "step": 17150 + }, + { + "epoch": 0.6142138342256522, + "grad_norm": 1.3495296239852905, + "learning_rate": 6.841516386750523e-05, + "loss": 0.993, + "step": 17151 + }, + { + "epoch": 0.6142496463552205, + "grad_norm": 1.4281216859817505, + "learning_rate": 6.84041588242217e-05, + "loss": 1.3256, + "step": 17152 + }, + { + "epoch": 0.6142854584847888, + "grad_norm": 1.4926255941390991, + "learning_rate": 6.839315420601943e-05, + "loss": 1.1766, + "step": 17153 + }, + { + "epoch": 0.614321270614357, + "grad_norm": 1.7760971784591675, + "learning_rate": 6.838215001304654e-05, + "loss": 1.1595, + "step": 17154 + }, + { + "epoch": 0.6143570827439254, + "grad_norm": 1.916040062904358, + "learning_rate": 6.837114624545102e-05, + "loss": 1.2047, + "step": 17155 + }, + { + "epoch": 0.6143928948734937, + "grad_norm": 1.502561092376709, + "learning_rate": 6.836014290338093e-05, + "loss": 1.0956, + "step": 17156 + }, + { + "epoch": 0.614428707003062, + "grad_norm": 1.6886619329452515, + "learning_rate": 6.834913998698432e-05, + "loss": 1.2171, + "step": 17157 + }, + { + "epoch": 0.6144645191326302, + "grad_norm": 2.0964033603668213, + "learning_rate": 6.833813749640916e-05, + "loss": 0.7937, + "step": 17158 + }, + { + "epoch": 0.6145003312621985, + "grad_norm": 3.099285364151001, + "learning_rate": 6.832713543180356e-05, + "loss": 0.8929, + "step": 17159 + }, + { + "epoch": 0.6145361433917668, + "grad_norm": 1.4503071308135986, + "learning_rate": 6.831613379331547e-05, + "loss": 1.0698, + "step": 17160 + }, + { + "epoch": 0.614571955521335, + "grad_norm": 1.5188974142074585, + "learning_rate": 6.830513258109296e-05, + "loss": 1.353, + "step": 17161 + }, + { + "epoch": 0.6146077676509034, + "grad_norm": 1.3794962167739868, + "learning_rate": 6.829413179528398e-05, + "loss": 0.9722, + "step": 17162 + }, + { + "epoch": 0.6146435797804717, + "grad_norm": 1.257172703742981, + "learning_rate": 6.828313143603657e-05, + "loss": 0.8439, + "step": 17163 + }, + { + "epoch": 0.61467939191004, + "grad_norm": 1.4549365043640137, + "learning_rate": 6.827213150349874e-05, + "loss": 1.2743, + "step": 17164 + }, + { + "epoch": 0.6147152040396082, + "grad_norm": 1.2732396125793457, + "learning_rate": 6.826113199781841e-05, + "loss": 0.9604, + "step": 17165 + }, + { + "epoch": 0.6147510161691765, + "grad_norm": 1.4711189270019531, + "learning_rate": 6.825013291914363e-05, + "loss": 1.1051, + "step": 17166 + }, + { + "epoch": 0.6147868282987448, + "grad_norm": 1.2430073022842407, + "learning_rate": 6.823913426762237e-05, + "loss": 1.1303, + "step": 17167 + }, + { + "epoch": 0.614822640428313, + "grad_norm": 1.3939324617385864, + "learning_rate": 6.822813604340257e-05, + "loss": 1.0391, + "step": 17168 + }, + { + "epoch": 0.6148584525578814, + "grad_norm": 1.3994308710098267, + "learning_rate": 6.821713824663221e-05, + "loss": 1.0335, + "step": 17169 + }, + { + "epoch": 0.6148942646874497, + "grad_norm": 1.4535987377166748, + "learning_rate": 6.820614087745929e-05, + "loss": 1.0844, + "step": 17170 + }, + { + "epoch": 0.614930076817018, + "grad_norm": 1.577688455581665, + "learning_rate": 6.81951439360317e-05, + "loss": 0.9739, + "step": 17171 + }, + { + "epoch": 0.6149658889465862, + "grad_norm": 1.3488571643829346, + "learning_rate": 6.818414742249745e-05, + "loss": 0.9616, + "step": 17172 + }, + { + "epoch": 0.6150017010761545, + "grad_norm": 1.454085111618042, + "learning_rate": 6.817315133700446e-05, + "loss": 1.1058, + "step": 17173 + }, + { + "epoch": 0.6150375132057228, + "grad_norm": 1.3661667108535767, + "learning_rate": 6.816215567970063e-05, + "loss": 1.1297, + "step": 17174 + }, + { + "epoch": 0.615073325335291, + "grad_norm": 1.4698147773742676, + "learning_rate": 6.815116045073396e-05, + "loss": 1.0238, + "step": 17175 + }, + { + "epoch": 0.6151091374648594, + "grad_norm": 1.855130672454834, + "learning_rate": 6.814016565025231e-05, + "loss": 1.2795, + "step": 17176 + }, + { + "epoch": 0.6151449495944277, + "grad_norm": 1.6171393394470215, + "learning_rate": 6.812917127840368e-05, + "loss": 1.2908, + "step": 17177 + }, + { + "epoch": 0.6151807617239959, + "grad_norm": 1.6919502019882202, + "learning_rate": 6.81181773353359e-05, + "loss": 1.2383, + "step": 17178 + }, + { + "epoch": 0.6152165738535642, + "grad_norm": 1.6983447074890137, + "learning_rate": 6.810718382119694e-05, + "loss": 1.1417, + "step": 17179 + }, + { + "epoch": 0.6152523859831325, + "grad_norm": 1.1962636709213257, + "learning_rate": 6.809619073613467e-05, + "loss": 1.0316, + "step": 17180 + }, + { + "epoch": 0.6152881981127007, + "grad_norm": 1.871355652809143, + "learning_rate": 6.808519808029703e-05, + "loss": 1.1116, + "step": 17181 + }, + { + "epoch": 0.615324010242269, + "grad_norm": 1.39137864112854, + "learning_rate": 6.807420585383186e-05, + "loss": 1.1996, + "step": 17182 + }, + { + "epoch": 0.6153598223718374, + "grad_norm": 1.4659600257873535, + "learning_rate": 6.806321405688707e-05, + "loss": 1.2699, + "step": 17183 + }, + { + "epoch": 0.6153956345014057, + "grad_norm": 1.389594554901123, + "learning_rate": 6.805222268961054e-05, + "loss": 0.9658, + "step": 17184 + }, + { + "epoch": 0.6154314466309739, + "grad_norm": 1.554174780845642, + "learning_rate": 6.804123175215014e-05, + "loss": 1.126, + "step": 17185 + }, + { + "epoch": 0.6154672587605422, + "grad_norm": 1.5513404607772827, + "learning_rate": 6.803024124465375e-05, + "loss": 1.1036, + "step": 17186 + }, + { + "epoch": 0.6155030708901105, + "grad_norm": 1.4797865152359009, + "learning_rate": 6.801925116726922e-05, + "loss": 1.0597, + "step": 17187 + }, + { + "epoch": 0.6155388830196787, + "grad_norm": 1.4774601459503174, + "learning_rate": 6.800826152014442e-05, + "loss": 0.9413, + "step": 17188 + }, + { + "epoch": 0.615574695149247, + "grad_norm": 1.7489150762557983, + "learning_rate": 6.799727230342718e-05, + "loss": 1.1054, + "step": 17189 + }, + { + "epoch": 0.6156105072788154, + "grad_norm": 1.446613073348999, + "learning_rate": 6.798628351726539e-05, + "loss": 0.9552, + "step": 17190 + }, + { + "epoch": 0.6156463194083837, + "grad_norm": 1.5867663621902466, + "learning_rate": 6.797529516180687e-05, + "loss": 1.0056, + "step": 17191 + }, + { + "epoch": 0.6156821315379519, + "grad_norm": 1.4495728015899658, + "learning_rate": 6.796430723719939e-05, + "loss": 0.9235, + "step": 17192 + }, + { + "epoch": 0.6157179436675202, + "grad_norm": 1.6282011270523071, + "learning_rate": 6.795331974359088e-05, + "loss": 1.2537, + "step": 17193 + }, + { + "epoch": 0.6157537557970885, + "grad_norm": 1.7240582704544067, + "learning_rate": 6.794233268112907e-05, + "loss": 1.3158, + "step": 17194 + }, + { + "epoch": 0.6157895679266567, + "grad_norm": 1.5754265785217285, + "learning_rate": 6.793134604996185e-05, + "loss": 0.9826, + "step": 17195 + }, + { + "epoch": 0.615825380056225, + "grad_norm": 1.704257607460022, + "learning_rate": 6.7920359850237e-05, + "loss": 1.1017, + "step": 17196 + }, + { + "epoch": 0.6158611921857934, + "grad_norm": 2.2727925777435303, + "learning_rate": 6.790937408210233e-05, + "loss": 1.4201, + "step": 17197 + }, + { + "epoch": 0.6158970043153617, + "grad_norm": 1.503523588180542, + "learning_rate": 6.789838874570565e-05, + "loss": 1.1994, + "step": 17198 + }, + { + "epoch": 0.6159328164449299, + "grad_norm": 1.5014415979385376, + "learning_rate": 6.788740384119472e-05, + "loss": 1.1276, + "step": 17199 + }, + { + "epoch": 0.6159686285744982, + "grad_norm": 1.521148443222046, + "learning_rate": 6.787641936871739e-05, + "loss": 1.101, + "step": 17200 + }, + { + "epoch": 0.6160044407040665, + "grad_norm": 1.2035166025161743, + "learning_rate": 6.786543532842133e-05, + "loss": 0.8993, + "step": 17201 + }, + { + "epoch": 0.6160402528336347, + "grad_norm": 2.044792652130127, + "learning_rate": 6.785445172045448e-05, + "loss": 1.1913, + "step": 17202 + }, + { + "epoch": 0.616076064963203, + "grad_norm": 1.3773694038391113, + "learning_rate": 6.784346854496442e-05, + "loss": 0.9695, + "step": 17203 + }, + { + "epoch": 0.6161118770927714, + "grad_norm": 1.5464904308319092, + "learning_rate": 6.78324858020991e-05, + "loss": 1.1828, + "step": 17204 + }, + { + "epoch": 0.6161476892223396, + "grad_norm": 1.287036657333374, + "learning_rate": 6.78215034920061e-05, + "loss": 1.1393, + "step": 17205 + }, + { + "epoch": 0.6161835013519079, + "grad_norm": 1.3609594106674194, + "learning_rate": 6.781052161483332e-05, + "loss": 1.0524, + "step": 17206 + }, + { + "epoch": 0.6162193134814762, + "grad_norm": 1.4721897840499878, + "learning_rate": 6.779954017072842e-05, + "loss": 1.1621, + "step": 17207 + }, + { + "epoch": 0.6162551256110445, + "grad_norm": 1.5973131656646729, + "learning_rate": 6.778855915983921e-05, + "loss": 0.8608, + "step": 17208 + }, + { + "epoch": 0.6162909377406127, + "grad_norm": 2.0371222496032715, + "learning_rate": 6.777757858231339e-05, + "loss": 0.9789, + "step": 17209 + }, + { + "epoch": 0.616326749870181, + "grad_norm": 1.5609517097473145, + "learning_rate": 6.776659843829863e-05, + "loss": 1.0619, + "step": 17210 + }, + { + "epoch": 0.6163625619997494, + "grad_norm": 1.3980780839920044, + "learning_rate": 6.775561872794279e-05, + "loss": 1.0564, + "step": 17211 + }, + { + "epoch": 0.6163983741293176, + "grad_norm": 1.4813936948776245, + "learning_rate": 6.774463945139343e-05, + "loss": 1.2082, + "step": 17212 + }, + { + "epoch": 0.6164341862588859, + "grad_norm": 1.3831640481948853, + "learning_rate": 6.77336606087984e-05, + "loss": 1.0945, + "step": 17213 + }, + { + "epoch": 0.6164699983884542, + "grad_norm": 1.5501035451889038, + "learning_rate": 6.772268220030528e-05, + "loss": 0.9648, + "step": 17214 + }, + { + "epoch": 0.6165058105180224, + "grad_norm": 1.7275744676589966, + "learning_rate": 6.77117042260619e-05, + "loss": 1.2073, + "step": 17215 + }, + { + "epoch": 0.6165416226475907, + "grad_norm": 1.2965168952941895, + "learning_rate": 6.770072668621583e-05, + "loss": 1.0705, + "step": 17216 + }, + { + "epoch": 0.616577434777159, + "grad_norm": 1.8317406177520752, + "learning_rate": 6.768974958091488e-05, + "loss": 0.994, + "step": 17217 + }, + { + "epoch": 0.6166132469067274, + "grad_norm": 1.3410755395889282, + "learning_rate": 6.767877291030666e-05, + "loss": 1.2713, + "step": 17218 + }, + { + "epoch": 0.6166490590362956, + "grad_norm": 1.2520502805709839, + "learning_rate": 6.766779667453881e-05, + "loss": 1.0608, + "step": 17219 + }, + { + "epoch": 0.6166848711658639, + "grad_norm": 1.1794263124465942, + "learning_rate": 6.765682087375912e-05, + "loss": 0.8668, + "step": 17220 + }, + { + "epoch": 0.6167206832954322, + "grad_norm": 1.7733445167541504, + "learning_rate": 6.764584550811512e-05, + "loss": 1.0973, + "step": 17221 + }, + { + "epoch": 0.6167564954250004, + "grad_norm": 1.6885708570480347, + "learning_rate": 6.763487057775459e-05, + "loss": 1.0436, + "step": 17222 + }, + { + "epoch": 0.6167923075545687, + "grad_norm": 1.4252759218215942, + "learning_rate": 6.762389608282507e-05, + "loss": 0.9431, + "step": 17223 + }, + { + "epoch": 0.616828119684137, + "grad_norm": 1.351881742477417, + "learning_rate": 6.761292202347434e-05, + "loss": 1.1977, + "step": 17224 + }, + { + "epoch": 0.6168639318137054, + "grad_norm": 1.3289697170257568, + "learning_rate": 6.760194839984994e-05, + "loss": 0.9864, + "step": 17225 + }, + { + "epoch": 0.6168997439432736, + "grad_norm": 1.5494847297668457, + "learning_rate": 6.75909752120995e-05, + "loss": 1.3385, + "step": 17226 + }, + { + "epoch": 0.6169355560728419, + "grad_norm": 1.7324856519699097, + "learning_rate": 6.758000246037072e-05, + "loss": 1.4162, + "step": 17227 + }, + { + "epoch": 0.6169713682024102, + "grad_norm": 1.5823395252227783, + "learning_rate": 6.756903014481116e-05, + "loss": 0.9645, + "step": 17228 + }, + { + "epoch": 0.6170071803319784, + "grad_norm": 1.7649339437484741, + "learning_rate": 6.75580582655685e-05, + "loss": 1.3839, + "step": 17229 + }, + { + "epoch": 0.6170429924615467, + "grad_norm": 1.4271622896194458, + "learning_rate": 6.754708682279027e-05, + "loss": 0.9315, + "step": 17230 + }, + { + "epoch": 0.617078804591115, + "grad_norm": 1.692091703414917, + "learning_rate": 6.753611581662418e-05, + "loss": 1.2237, + "step": 17231 + }, + { + "epoch": 0.6171146167206832, + "grad_norm": 1.5875385999679565, + "learning_rate": 6.752514524721771e-05, + "loss": 1.2195, + "step": 17232 + }, + { + "epoch": 0.6171504288502516, + "grad_norm": 1.245118260383606, + "learning_rate": 6.751417511471859e-05, + "loss": 1.1486, + "step": 17233 + }, + { + "epoch": 0.6171862409798199, + "grad_norm": 1.3979976177215576, + "learning_rate": 6.750320541927433e-05, + "loss": 1.1897, + "step": 17234 + }, + { + "epoch": 0.6172220531093882, + "grad_norm": 1.3449925184249878, + "learning_rate": 6.749223616103249e-05, + "loss": 1.0846, + "step": 17235 + }, + { + "epoch": 0.6172578652389564, + "grad_norm": 1.3016109466552734, + "learning_rate": 6.74812673401407e-05, + "loss": 0.8514, + "step": 17236 + }, + { + "epoch": 0.6172936773685247, + "grad_norm": 1.5791857242584229, + "learning_rate": 6.74702989567465e-05, + "loss": 1.1263, + "step": 17237 + }, + { + "epoch": 0.617329489498093, + "grad_norm": 1.5226393938064575, + "learning_rate": 6.745933101099748e-05, + "loss": 1.1289, + "step": 17238 + }, + { + "epoch": 0.6173653016276612, + "grad_norm": 1.3543570041656494, + "learning_rate": 6.744836350304118e-05, + "loss": 0.9159, + "step": 17239 + }, + { + "epoch": 0.6174011137572296, + "grad_norm": 1.3707491159439087, + "learning_rate": 6.743739643302516e-05, + "loss": 1.0933, + "step": 17240 + }, + { + "epoch": 0.6174369258867979, + "grad_norm": 1.2860974073410034, + "learning_rate": 6.742642980109696e-05, + "loss": 1.1493, + "step": 17241 + }, + { + "epoch": 0.6174727380163662, + "grad_norm": 1.5797522068023682, + "learning_rate": 6.741546360740415e-05, + "loss": 1.2041, + "step": 17242 + }, + { + "epoch": 0.6175085501459344, + "grad_norm": 1.5136586427688599, + "learning_rate": 6.740449785209425e-05, + "loss": 0.9985, + "step": 17243 + }, + { + "epoch": 0.6175443622755027, + "grad_norm": 1.280689001083374, + "learning_rate": 6.739353253531475e-05, + "loss": 1.2006, + "step": 17244 + }, + { + "epoch": 0.617580174405071, + "grad_norm": 2.120088815689087, + "learning_rate": 6.738256765721324e-05, + "loss": 1.0907, + "step": 17245 + }, + { + "epoch": 0.6176159865346392, + "grad_norm": 1.5861181020736694, + "learning_rate": 6.73716032179372e-05, + "loss": 1.0217, + "step": 17246 + }, + { + "epoch": 0.6176517986642076, + "grad_norm": 1.5528883934020996, + "learning_rate": 6.736063921763415e-05, + "loss": 1.0237, + "step": 17247 + }, + { + "epoch": 0.6176876107937759, + "grad_norm": 1.2317280769348145, + "learning_rate": 6.73496756564516e-05, + "loss": 0.9361, + "step": 17248 + }, + { + "epoch": 0.6177234229233441, + "grad_norm": 1.6637369394302368, + "learning_rate": 6.733871253453707e-05, + "loss": 1.2178, + "step": 17249 + }, + { + "epoch": 0.6177592350529124, + "grad_norm": 1.6135258674621582, + "learning_rate": 6.7327749852038e-05, + "loss": 0.9977, + "step": 17250 + }, + { + "epoch": 0.6177950471824807, + "grad_norm": 1.6781256198883057, + "learning_rate": 6.731678760910192e-05, + "loss": 1.2417, + "step": 17251 + }, + { + "epoch": 0.617830859312049, + "grad_norm": 1.4958810806274414, + "learning_rate": 6.730582580587632e-05, + "loss": 1.0688, + "step": 17252 + }, + { + "epoch": 0.6178666714416172, + "grad_norm": 1.6201955080032349, + "learning_rate": 6.729486444250863e-05, + "loss": 1.1468, + "step": 17253 + }, + { + "epoch": 0.6179024835711856, + "grad_norm": 1.4236791133880615, + "learning_rate": 6.72839035191464e-05, + "loss": 0.8741, + "step": 17254 + }, + { + "epoch": 0.6179382957007539, + "grad_norm": 1.4538766145706177, + "learning_rate": 6.7272943035937e-05, + "loss": 1.2115, + "step": 17255 + }, + { + "epoch": 0.6179741078303221, + "grad_norm": 1.3188456296920776, + "learning_rate": 6.726198299302796e-05, + "loss": 1.1125, + "step": 17256 + }, + { + "epoch": 0.6180099199598904, + "grad_norm": 1.8033595085144043, + "learning_rate": 6.72510233905667e-05, + "loss": 1.1061, + "step": 17257 + }, + { + "epoch": 0.6180457320894587, + "grad_norm": 1.4324743747711182, + "learning_rate": 6.724006422870069e-05, + "loss": 1.0174, + "step": 17258 + }, + { + "epoch": 0.618081544219027, + "grad_norm": 1.2778135538101196, + "learning_rate": 6.722910550757734e-05, + "loss": 1.1147, + "step": 17259 + }, + { + "epoch": 0.6181173563485952, + "grad_norm": 1.7193419933319092, + "learning_rate": 6.721814722734412e-05, + "loss": 1.13, + "step": 17260 + }, + { + "epoch": 0.6181531684781636, + "grad_norm": 1.4165301322937012, + "learning_rate": 6.720718938814846e-05, + "loss": 1.1591, + "step": 17261 + }, + { + "epoch": 0.6181889806077319, + "grad_norm": 1.720771074295044, + "learning_rate": 6.719623199013771e-05, + "loss": 1.0238, + "step": 17262 + }, + { + "epoch": 0.6182247927373001, + "grad_norm": 1.3553985357284546, + "learning_rate": 6.718527503345939e-05, + "loss": 1.0382, + "step": 17263 + }, + { + "epoch": 0.6182606048668684, + "grad_norm": 1.1648540496826172, + "learning_rate": 6.717431851826086e-05, + "loss": 0.8043, + "step": 17264 + }, + { + "epoch": 0.6182964169964367, + "grad_norm": 1.2319858074188232, + "learning_rate": 6.716336244468954e-05, + "loss": 1.1608, + "step": 17265 + }, + { + "epoch": 0.6183322291260049, + "grad_norm": 1.83073890209198, + "learning_rate": 6.715240681289279e-05, + "loss": 1.2094, + "step": 17266 + }, + { + "epoch": 0.6183680412555732, + "grad_norm": 1.3320426940917969, + "learning_rate": 6.714145162301808e-05, + "loss": 0.9476, + "step": 17267 + }, + { + "epoch": 0.6184038533851416, + "grad_norm": 1.5358762741088867, + "learning_rate": 6.713049687521272e-05, + "loss": 1.1178, + "step": 17268 + }, + { + "epoch": 0.6184396655147099, + "grad_norm": 1.3307050466537476, + "learning_rate": 6.711954256962414e-05, + "loss": 1.1846, + "step": 17269 + }, + { + "epoch": 0.6184754776442781, + "grad_norm": 1.3061953783035278, + "learning_rate": 6.71085887063997e-05, + "loss": 1.2625, + "step": 17270 + }, + { + "epoch": 0.6185112897738464, + "grad_norm": 1.3990588188171387, + "learning_rate": 6.709763528568677e-05, + "loss": 0.9966, + "step": 17271 + }, + { + "epoch": 0.6185471019034147, + "grad_norm": 1.35612154006958, + "learning_rate": 6.708668230763272e-05, + "loss": 1.0786, + "step": 17272 + }, + { + "epoch": 0.6185829140329829, + "grad_norm": 1.4605796337127686, + "learning_rate": 6.707572977238489e-05, + "loss": 1.0098, + "step": 17273 + }, + { + "epoch": 0.6186187261625512, + "grad_norm": 1.6184886693954468, + "learning_rate": 6.706477768009067e-05, + "loss": 1.2905, + "step": 17274 + }, + { + "epoch": 0.6186545382921196, + "grad_norm": 1.4201867580413818, + "learning_rate": 6.705382603089737e-05, + "loss": 1.2083, + "step": 17275 + }, + { + "epoch": 0.6186903504216879, + "grad_norm": 1.436824917793274, + "learning_rate": 6.704287482495233e-05, + "loss": 1.0691, + "step": 17276 + }, + { + "epoch": 0.6187261625512561, + "grad_norm": 1.2271078824996948, + "learning_rate": 6.70319240624029e-05, + "loss": 0.9742, + "step": 17277 + }, + { + "epoch": 0.6187619746808244, + "grad_norm": 1.424075961112976, + "learning_rate": 6.702097374339644e-05, + "loss": 0.9382, + "step": 17278 + }, + { + "epoch": 0.6187977868103927, + "grad_norm": 1.2425166368484497, + "learning_rate": 6.701002386808021e-05, + "loss": 0.9574, + "step": 17279 + }, + { + "epoch": 0.6188335989399609, + "grad_norm": 1.631792426109314, + "learning_rate": 6.699907443660156e-05, + "loss": 1.0371, + "step": 17280 + }, + { + "epoch": 0.6188694110695292, + "grad_norm": 2.297302007675171, + "learning_rate": 6.698812544910781e-05, + "loss": 1.1614, + "step": 17281 + }, + { + "epoch": 0.6189052231990976, + "grad_norm": 1.487149715423584, + "learning_rate": 6.697717690574623e-05, + "loss": 1.1262, + "step": 17282 + }, + { + "epoch": 0.6189410353286658, + "grad_norm": 1.8003212213516235, + "learning_rate": 6.696622880666415e-05, + "loss": 0.9668, + "step": 17283 + }, + { + "epoch": 0.6189768474582341, + "grad_norm": 2.1249547004699707, + "learning_rate": 6.695528115200883e-05, + "loss": 1.3491, + "step": 17284 + }, + { + "epoch": 0.6190126595878024, + "grad_norm": 1.5309944152832031, + "learning_rate": 6.69443339419276e-05, + "loss": 1.0325, + "step": 17285 + }, + { + "epoch": 0.6190484717173707, + "grad_norm": 1.3707504272460938, + "learning_rate": 6.69333871765677e-05, + "loss": 1.2671, + "step": 17286 + }, + { + "epoch": 0.6190842838469389, + "grad_norm": 1.375831127166748, + "learning_rate": 6.692244085607644e-05, + "loss": 1.0108, + "step": 17287 + }, + { + "epoch": 0.6191200959765072, + "grad_norm": 2.070051908493042, + "learning_rate": 6.69114949806011e-05, + "loss": 1.2495, + "step": 17288 + }, + { + "epoch": 0.6191559081060756, + "grad_norm": 1.477064847946167, + "learning_rate": 6.690054955028885e-05, + "loss": 1.1207, + "step": 17289 + }, + { + "epoch": 0.6191917202356438, + "grad_norm": 1.666117787361145, + "learning_rate": 6.688960456528705e-05, + "loss": 1.1824, + "step": 17290 + }, + { + "epoch": 0.6192275323652121, + "grad_norm": 1.7278785705566406, + "learning_rate": 6.687866002574289e-05, + "loss": 1.1479, + "step": 17291 + }, + { + "epoch": 0.6192633444947804, + "grad_norm": 1.5327531099319458, + "learning_rate": 6.686771593180365e-05, + "loss": 0.9774, + "step": 17292 + }, + { + "epoch": 0.6192991566243486, + "grad_norm": 1.3155736923217773, + "learning_rate": 6.685677228361654e-05, + "loss": 1.1206, + "step": 17293 + }, + { + "epoch": 0.6193349687539169, + "grad_norm": 1.6657445430755615, + "learning_rate": 6.684582908132883e-05, + "loss": 1.1635, + "step": 17294 + }, + { + "epoch": 0.6193707808834852, + "grad_norm": 2.0607569217681885, + "learning_rate": 6.68348863250877e-05, + "loss": 1.1204, + "step": 17295 + }, + { + "epoch": 0.6194065930130536, + "grad_norm": 1.3751312494277954, + "learning_rate": 6.682394401504042e-05, + "loss": 1.2359, + "step": 17296 + }, + { + "epoch": 0.6194424051426218, + "grad_norm": 1.6363130807876587, + "learning_rate": 6.681300215133419e-05, + "loss": 1.1471, + "step": 17297 + }, + { + "epoch": 0.6194782172721901, + "grad_norm": 1.4747228622436523, + "learning_rate": 6.680206073411616e-05, + "loss": 1.1118, + "step": 17298 + }, + { + "epoch": 0.6195140294017584, + "grad_norm": 1.4866904020309448, + "learning_rate": 6.679111976353362e-05, + "loss": 1.1638, + "step": 17299 + }, + { + "epoch": 0.6195498415313266, + "grad_norm": 1.2866430282592773, + "learning_rate": 6.67801792397337e-05, + "loss": 1.0097, + "step": 17300 + }, + { + "epoch": 0.6195856536608949, + "grad_norm": 1.7985392808914185, + "learning_rate": 6.676923916286365e-05, + "loss": 1.1311, + "step": 17301 + }, + { + "epoch": 0.6196214657904632, + "grad_norm": 1.3421016931533813, + "learning_rate": 6.675829953307057e-05, + "loss": 1.0032, + "step": 17302 + }, + { + "epoch": 0.6196572779200316, + "grad_norm": 1.9027384519577026, + "learning_rate": 6.674736035050173e-05, + "loss": 1.0407, + "step": 17303 + }, + { + "epoch": 0.6196930900495998, + "grad_norm": 1.3758383989334106, + "learning_rate": 6.673642161530424e-05, + "loss": 0.9035, + "step": 17304 + }, + { + "epoch": 0.6197289021791681, + "grad_norm": 1.4399315118789673, + "learning_rate": 6.672548332762533e-05, + "loss": 1.2451, + "step": 17305 + }, + { + "epoch": 0.6197647143087364, + "grad_norm": 1.6151666641235352, + "learning_rate": 6.671454548761212e-05, + "loss": 1.0311, + "step": 17306 + }, + { + "epoch": 0.6198005264383046, + "grad_norm": 1.3214454650878906, + "learning_rate": 6.670360809541171e-05, + "loss": 1.1211, + "step": 17307 + }, + { + "epoch": 0.6198363385678729, + "grad_norm": 1.5536776781082153, + "learning_rate": 6.669267115117137e-05, + "loss": 0.8954, + "step": 17308 + }, + { + "epoch": 0.6198721506974412, + "grad_norm": 1.7099733352661133, + "learning_rate": 6.66817346550381e-05, + "loss": 1.1378, + "step": 17309 + }, + { + "epoch": 0.6199079628270096, + "grad_norm": 1.6056582927703857, + "learning_rate": 6.66707986071592e-05, + "loss": 1.0915, + "step": 17310 + }, + { + "epoch": 0.6199437749565778, + "grad_norm": 1.6147549152374268, + "learning_rate": 6.665986300768163e-05, + "loss": 1.1663, + "step": 17311 + }, + { + "epoch": 0.6199795870861461, + "grad_norm": 1.5044317245483398, + "learning_rate": 6.664892785675267e-05, + "loss": 1.0848, + "step": 17312 + }, + { + "epoch": 0.6200153992157144, + "grad_norm": 1.5314769744873047, + "learning_rate": 6.663799315451931e-05, + "loss": 1.2747, + "step": 17313 + }, + { + "epoch": 0.6200512113452826, + "grad_norm": 1.3315447568893433, + "learning_rate": 6.662705890112876e-05, + "loss": 1.1942, + "step": 17314 + }, + { + "epoch": 0.6200870234748509, + "grad_norm": 1.4222993850708008, + "learning_rate": 6.661612509672808e-05, + "loss": 1.1605, + "step": 17315 + }, + { + "epoch": 0.6201228356044192, + "grad_norm": 1.3154159784317017, + "learning_rate": 6.660519174146433e-05, + "loss": 1.2039, + "step": 17316 + }, + { + "epoch": 0.6201586477339875, + "grad_norm": 2.4648406505584717, + "learning_rate": 6.659425883548471e-05, + "loss": 1.275, + "step": 17317 + }, + { + "epoch": 0.6201944598635558, + "grad_norm": 1.4660054445266724, + "learning_rate": 6.658332637893619e-05, + "loss": 1.2318, + "step": 17318 + }, + { + "epoch": 0.6202302719931241, + "grad_norm": 1.7762740850448608, + "learning_rate": 6.657239437196596e-05, + "loss": 1.2126, + "step": 17319 + }, + { + "epoch": 0.6202660841226924, + "grad_norm": 1.5627379417419434, + "learning_rate": 6.656146281472098e-05, + "loss": 1.2001, + "step": 17320 + }, + { + "epoch": 0.6203018962522606, + "grad_norm": 1.498029351234436, + "learning_rate": 6.655053170734846e-05, + "loss": 1.0133, + "step": 17321 + }, + { + "epoch": 0.6203377083818289, + "grad_norm": 1.791383981704712, + "learning_rate": 6.653960104999537e-05, + "loss": 0.9551, + "step": 17322 + }, + { + "epoch": 0.6203735205113972, + "grad_norm": 1.9001439809799194, + "learning_rate": 6.652867084280876e-05, + "loss": 1.0421, + "step": 17323 + }, + { + "epoch": 0.6204093326409655, + "grad_norm": 1.653993010520935, + "learning_rate": 6.651774108593574e-05, + "loss": 1.0345, + "step": 17324 + }, + { + "epoch": 0.6204451447705338, + "grad_norm": 1.6144237518310547, + "learning_rate": 6.650681177952328e-05, + "loss": 1.05, + "step": 17325 + }, + { + "epoch": 0.6204809569001021, + "grad_norm": 1.720801591873169, + "learning_rate": 6.64958829237185e-05, + "loss": 1.0325, + "step": 17326 + }, + { + "epoch": 0.6205167690296703, + "grad_norm": 1.457018494606018, + "learning_rate": 6.648495451866838e-05, + "loss": 1.1753, + "step": 17327 + }, + { + "epoch": 0.6205525811592386, + "grad_norm": 1.6318995952606201, + "learning_rate": 6.647402656451998e-05, + "loss": 1.29, + "step": 17328 + }, + { + "epoch": 0.6205883932888069, + "grad_norm": 1.4776705503463745, + "learning_rate": 6.646309906142027e-05, + "loss": 1.2593, + "step": 17329 + }, + { + "epoch": 0.6206242054183752, + "grad_norm": 1.4625039100646973, + "learning_rate": 6.645217200951636e-05, + "loss": 1.2521, + "step": 17330 + }, + { + "epoch": 0.6206600175479435, + "grad_norm": 1.5368890762329102, + "learning_rate": 6.644124540895518e-05, + "loss": 0.9881, + "step": 17331 + }, + { + "epoch": 0.6206958296775118, + "grad_norm": 1.42987060546875, + "learning_rate": 6.643031925988375e-05, + "loss": 1.1112, + "step": 17332 + }, + { + "epoch": 0.6207316418070801, + "grad_norm": 1.5652403831481934, + "learning_rate": 6.641939356244908e-05, + "loss": 0.9401, + "step": 17333 + }, + { + "epoch": 0.6207674539366483, + "grad_norm": 1.6085370779037476, + "learning_rate": 6.640846831679815e-05, + "loss": 0.9281, + "step": 17334 + }, + { + "epoch": 0.6208032660662166, + "grad_norm": 1.3913638591766357, + "learning_rate": 6.639754352307794e-05, + "loss": 0.9269, + "step": 17335 + }, + { + "epoch": 0.6208390781957849, + "grad_norm": 1.5679676532745361, + "learning_rate": 6.638661918143542e-05, + "loss": 1.0461, + "step": 17336 + }, + { + "epoch": 0.6208748903253531, + "grad_norm": 1.5403779745101929, + "learning_rate": 6.637569529201763e-05, + "loss": 1.0962, + "step": 17337 + }, + { + "epoch": 0.6209107024549215, + "grad_norm": 1.3444945812225342, + "learning_rate": 6.636477185497145e-05, + "loss": 1.095, + "step": 17338 + }, + { + "epoch": 0.6209465145844898, + "grad_norm": 2.0731663703918457, + "learning_rate": 6.63538488704439e-05, + "loss": 1.0562, + "step": 17339 + }, + { + "epoch": 0.6209823267140581, + "grad_norm": 1.4650297164916992, + "learning_rate": 6.634292633858191e-05, + "loss": 1.1364, + "step": 17340 + }, + { + "epoch": 0.6210181388436263, + "grad_norm": 1.6964592933654785, + "learning_rate": 6.633200425953241e-05, + "loss": 1.1222, + "step": 17341 + }, + { + "epoch": 0.6210539509731946, + "grad_norm": 1.3277407884597778, + "learning_rate": 6.632108263344238e-05, + "loss": 0.9034, + "step": 17342 + }, + { + "epoch": 0.6210897631027629, + "grad_norm": 1.481891393661499, + "learning_rate": 6.631016146045874e-05, + "loss": 0.9549, + "step": 17343 + }, + { + "epoch": 0.6211255752323311, + "grad_norm": 1.6159526109695435, + "learning_rate": 6.629924074072844e-05, + "loss": 1.137, + "step": 17344 + }, + { + "epoch": 0.6211613873618995, + "grad_norm": 1.647426962852478, + "learning_rate": 6.628832047439835e-05, + "loss": 1.1585, + "step": 17345 + }, + { + "epoch": 0.6211971994914678, + "grad_norm": 1.9616785049438477, + "learning_rate": 6.627740066161545e-05, + "loss": 1.1351, + "step": 17346 + }, + { + "epoch": 0.6212330116210361, + "grad_norm": 1.47096586227417, + "learning_rate": 6.62664813025266e-05, + "loss": 1.0545, + "step": 17347 + }, + { + "epoch": 0.6212688237506043, + "grad_norm": 1.4473979473114014, + "learning_rate": 6.625556239727875e-05, + "loss": 1.218, + "step": 17348 + }, + { + "epoch": 0.6213046358801726, + "grad_norm": 1.7817524671554565, + "learning_rate": 6.624464394601879e-05, + "loss": 0.9253, + "step": 17349 + }, + { + "epoch": 0.6213404480097409, + "grad_norm": 1.3707118034362793, + "learning_rate": 6.623372594889358e-05, + "loss": 1.1476, + "step": 17350 + }, + { + "epoch": 0.6213762601393091, + "grad_norm": 1.3729976415634155, + "learning_rate": 6.622280840605005e-05, + "loss": 1.0144, + "step": 17351 + }, + { + "epoch": 0.6214120722688775, + "grad_norm": 1.7663838863372803, + "learning_rate": 6.621189131763505e-05, + "loss": 0.966, + "step": 17352 + }, + { + "epoch": 0.6214478843984458, + "grad_norm": 1.5180517435073853, + "learning_rate": 6.620097468379548e-05, + "loss": 0.8706, + "step": 17353 + }, + { + "epoch": 0.621483696528014, + "grad_norm": 1.4022183418273926, + "learning_rate": 6.619005850467818e-05, + "loss": 1.1623, + "step": 17354 + }, + { + "epoch": 0.6215195086575823, + "grad_norm": 1.6287455558776855, + "learning_rate": 6.617914278043005e-05, + "loss": 1.0645, + "step": 17355 + }, + { + "epoch": 0.6215553207871506, + "grad_norm": 1.3617403507232666, + "learning_rate": 6.616822751119792e-05, + "loss": 1.0924, + "step": 17356 + }, + { + "epoch": 0.6215911329167189, + "grad_norm": 1.5869144201278687, + "learning_rate": 6.615731269712864e-05, + "loss": 1.0659, + "step": 17357 + }, + { + "epoch": 0.6216269450462871, + "grad_norm": 1.538798213005066, + "learning_rate": 6.614639833836908e-05, + "loss": 1.0478, + "step": 17358 + }, + { + "epoch": 0.6216627571758555, + "grad_norm": 1.6445503234863281, + "learning_rate": 6.613548443506605e-05, + "loss": 1.1899, + "step": 17359 + }, + { + "epoch": 0.6216985693054238, + "grad_norm": 1.7626723051071167, + "learning_rate": 6.612457098736642e-05, + "loss": 1.141, + "step": 17360 + }, + { + "epoch": 0.621734381434992, + "grad_norm": 1.484185814857483, + "learning_rate": 6.611365799541695e-05, + "loss": 1.1785, + "step": 17361 + }, + { + "epoch": 0.6217701935645603, + "grad_norm": 1.4958595037460327, + "learning_rate": 6.610274545936455e-05, + "loss": 1.1633, + "step": 17362 + }, + { + "epoch": 0.6218060056941286, + "grad_norm": 1.5437242984771729, + "learning_rate": 6.609183337935594e-05, + "loss": 1.2091, + "step": 17363 + }, + { + "epoch": 0.6218418178236969, + "grad_norm": 1.3212611675262451, + "learning_rate": 6.6080921755538e-05, + "loss": 1.1335, + "step": 17364 + }, + { + "epoch": 0.6218776299532651, + "grad_norm": 1.5150173902511597, + "learning_rate": 6.607001058805749e-05, + "loss": 1.169, + "step": 17365 + }, + { + "epoch": 0.6219134420828335, + "grad_norm": 1.4760512113571167, + "learning_rate": 6.605909987706125e-05, + "loss": 0.9316, + "step": 17366 + }, + { + "epoch": 0.6219492542124018, + "grad_norm": 1.8137097358703613, + "learning_rate": 6.604818962269602e-05, + "loss": 0.8478, + "step": 17367 + }, + { + "epoch": 0.62198506634197, + "grad_norm": 1.4556853771209717, + "learning_rate": 6.603727982510859e-05, + "loss": 1.0277, + "step": 17368 + }, + { + "epoch": 0.6220208784715383, + "grad_norm": 1.6170275211334229, + "learning_rate": 6.602637048444578e-05, + "loss": 1.2046, + "step": 17369 + }, + { + "epoch": 0.6220566906011066, + "grad_norm": 1.476260781288147, + "learning_rate": 6.60154616008543e-05, + "loss": 0.9483, + "step": 17370 + }, + { + "epoch": 0.6220925027306748, + "grad_norm": 1.5791767835617065, + "learning_rate": 6.600455317448098e-05, + "loss": 1.0413, + "step": 17371 + }, + { + "epoch": 0.6221283148602431, + "grad_norm": 1.9819813966751099, + "learning_rate": 6.599364520547251e-05, + "loss": 1.2863, + "step": 17372 + }, + { + "epoch": 0.6221641269898115, + "grad_norm": 1.2301522493362427, + "learning_rate": 6.598273769397572e-05, + "loss": 1.1856, + "step": 17373 + }, + { + "epoch": 0.6221999391193798, + "grad_norm": 1.9182711839675903, + "learning_rate": 6.597183064013728e-05, + "loss": 1.0236, + "step": 17374 + }, + { + "epoch": 0.622235751248948, + "grad_norm": 1.3951267004013062, + "learning_rate": 6.5960924044104e-05, + "loss": 1.0008, + "step": 17375 + }, + { + "epoch": 0.6222715633785163, + "grad_norm": 1.2577179670333862, + "learning_rate": 6.595001790602255e-05, + "loss": 1.039, + "step": 17376 + }, + { + "epoch": 0.6223073755080846, + "grad_norm": 1.2814295291900635, + "learning_rate": 6.593911222603969e-05, + "loss": 0.9693, + "step": 17377 + }, + { + "epoch": 0.6223431876376528, + "grad_norm": 1.4760758876800537, + "learning_rate": 6.592820700430215e-05, + "loss": 1.2434, + "step": 17378 + }, + { + "epoch": 0.6223789997672211, + "grad_norm": 1.687957525253296, + "learning_rate": 6.591730224095663e-05, + "loss": 1.1317, + "step": 17379 + }, + { + "epoch": 0.6224148118967895, + "grad_norm": 1.3351826667785645, + "learning_rate": 6.590639793614985e-05, + "loss": 1.0024, + "step": 17380 + }, + { + "epoch": 0.6224506240263578, + "grad_norm": 1.7775379419326782, + "learning_rate": 6.589549409002851e-05, + "loss": 1.2166, + "step": 17381 + }, + { + "epoch": 0.622486436155926, + "grad_norm": 1.548425555229187, + "learning_rate": 6.588459070273931e-05, + "loss": 1.0333, + "step": 17382 + }, + { + "epoch": 0.6225222482854943, + "grad_norm": 1.7317639589309692, + "learning_rate": 6.58736877744289e-05, + "loss": 1.3566, + "step": 17383 + }, + { + "epoch": 0.6225580604150626, + "grad_norm": 1.5776512622833252, + "learning_rate": 6.586278530524405e-05, + "loss": 1.1142, + "step": 17384 + }, + { + "epoch": 0.6225938725446308, + "grad_norm": 1.3974690437316895, + "learning_rate": 6.58518832953314e-05, + "loss": 1.0852, + "step": 17385 + }, + { + "epoch": 0.6226296846741991, + "grad_norm": 1.5789600610733032, + "learning_rate": 6.584098174483754e-05, + "loss": 0.9141, + "step": 17386 + }, + { + "epoch": 0.6226654968037675, + "grad_norm": 1.4443835020065308, + "learning_rate": 6.583008065390925e-05, + "loss": 1.085, + "step": 17387 + }, + { + "epoch": 0.6227013089333358, + "grad_norm": 1.7709486484527588, + "learning_rate": 6.581918002269315e-05, + "loss": 1.1708, + "step": 17388 + }, + { + "epoch": 0.622737121062904, + "grad_norm": 1.5965380668640137, + "learning_rate": 6.58082798513359e-05, + "loss": 1.1293, + "step": 17389 + }, + { + "epoch": 0.6227729331924723, + "grad_norm": 1.3549669981002808, + "learning_rate": 6.579738013998411e-05, + "loss": 1.2258, + "step": 17390 + }, + { + "epoch": 0.6228087453220406, + "grad_norm": 1.507879614830017, + "learning_rate": 6.578648088878449e-05, + "loss": 1.1856, + "step": 17391 + }, + { + "epoch": 0.6228445574516088, + "grad_norm": 1.7591848373413086, + "learning_rate": 6.577558209788362e-05, + "loss": 1.2306, + "step": 17392 + }, + { + "epoch": 0.6228803695811771, + "grad_norm": 1.503336787223816, + "learning_rate": 6.576468376742815e-05, + "loss": 1.1154, + "step": 17393 + }, + { + "epoch": 0.6229161817107455, + "grad_norm": 1.320571780204773, + "learning_rate": 6.575378589756472e-05, + "loss": 1.0484, + "step": 17394 + }, + { + "epoch": 0.6229519938403137, + "grad_norm": 1.4773751497268677, + "learning_rate": 6.574288848843988e-05, + "loss": 0.9767, + "step": 17395 + }, + { + "epoch": 0.622987805969882, + "grad_norm": 1.7531578540802002, + "learning_rate": 6.573199154020033e-05, + "loss": 1.2424, + "step": 17396 + }, + { + "epoch": 0.6230236180994503, + "grad_norm": 1.4749597311019897, + "learning_rate": 6.57210950529926e-05, + "loss": 1.1299, + "step": 17397 + }, + { + "epoch": 0.6230594302290186, + "grad_norm": 1.2679396867752075, + "learning_rate": 6.571019902696335e-05, + "loss": 1.0046, + "step": 17398 + }, + { + "epoch": 0.6230952423585868, + "grad_norm": 1.4932007789611816, + "learning_rate": 6.569930346225909e-05, + "loss": 1.1725, + "step": 17399 + }, + { + "epoch": 0.6231310544881551, + "grad_norm": 1.3751888275146484, + "learning_rate": 6.56884083590265e-05, + "loss": 0.9891, + "step": 17400 + }, + { + "epoch": 0.6231668666177235, + "grad_norm": 1.5039820671081543, + "learning_rate": 6.567751371741209e-05, + "loss": 1.2708, + "step": 17401 + }, + { + "epoch": 0.6232026787472917, + "grad_norm": 1.4117289781570435, + "learning_rate": 6.566661953756248e-05, + "loss": 0.9972, + "step": 17402 + }, + { + "epoch": 0.62323849087686, + "grad_norm": 2.0289134979248047, + "learning_rate": 6.565572581962425e-05, + "loss": 1.2019, + "step": 17403 + }, + { + "epoch": 0.6232743030064283, + "grad_norm": 1.772194743156433, + "learning_rate": 6.564483256374386e-05, + "loss": 1.0106, + "step": 17404 + }, + { + "epoch": 0.6233101151359965, + "grad_norm": 1.4302353858947754, + "learning_rate": 6.5633939770068e-05, + "loss": 1.2852, + "step": 17405 + }, + { + "epoch": 0.6233459272655648, + "grad_norm": 1.7831939458847046, + "learning_rate": 6.562304743874308e-05, + "loss": 1.0387, + "step": 17406 + }, + { + "epoch": 0.6233817393951331, + "grad_norm": 1.5451738834381104, + "learning_rate": 6.561215556991578e-05, + "loss": 1.0154, + "step": 17407 + }, + { + "epoch": 0.6234175515247015, + "grad_norm": 1.2665656805038452, + "learning_rate": 6.56012641637325e-05, + "loss": 1.1551, + "step": 17408 + }, + { + "epoch": 0.6234533636542697, + "grad_norm": 1.2942179441452026, + "learning_rate": 6.559037322033991e-05, + "loss": 1.107, + "step": 17409 + }, + { + "epoch": 0.623489175783838, + "grad_norm": 1.4400897026062012, + "learning_rate": 6.55794827398844e-05, + "loss": 0.8768, + "step": 17410 + }, + { + "epoch": 0.6235249879134063, + "grad_norm": 1.3193923234939575, + "learning_rate": 6.556859272251261e-05, + "loss": 1.2713, + "step": 17411 + }, + { + "epoch": 0.6235608000429745, + "grad_norm": 1.5188593864440918, + "learning_rate": 6.555770316837098e-05, + "loss": 1.0913, + "step": 17412 + }, + { + "epoch": 0.6235966121725428, + "grad_norm": 1.6120672225952148, + "learning_rate": 6.554681407760598e-05, + "loss": 1.1163, + "step": 17413 + }, + { + "epoch": 0.6236324243021111, + "grad_norm": 1.4402945041656494, + "learning_rate": 6.553592545036421e-05, + "loss": 1.1592, + "step": 17414 + }, + { + "epoch": 0.6236682364316795, + "grad_norm": 2.0097761154174805, + "learning_rate": 6.552503728679204e-05, + "loss": 1.009, + "step": 17415 + }, + { + "epoch": 0.6237040485612477, + "grad_norm": 1.356035590171814, + "learning_rate": 6.551414958703611e-05, + "loss": 0.9851, + "step": 17416 + }, + { + "epoch": 0.623739860690816, + "grad_norm": 1.2909555435180664, + "learning_rate": 6.550326235124274e-05, + "loss": 1.1376, + "step": 17417 + }, + { + "epoch": 0.6237756728203843, + "grad_norm": 1.4276764392852783, + "learning_rate": 6.549237557955854e-05, + "loss": 1.3477, + "step": 17418 + }, + { + "epoch": 0.6238114849499525, + "grad_norm": 1.4177600145339966, + "learning_rate": 6.54814892721299e-05, + "loss": 0.9141, + "step": 17419 + }, + { + "epoch": 0.6238472970795208, + "grad_norm": 1.7249501943588257, + "learning_rate": 6.547060342910324e-05, + "loss": 1.3781, + "step": 17420 + }, + { + "epoch": 0.6238831092090891, + "grad_norm": 1.886185884475708, + "learning_rate": 6.545971805062514e-05, + "loss": 1.2563, + "step": 17421 + }, + { + "epoch": 0.6239189213386575, + "grad_norm": 1.5878061056137085, + "learning_rate": 6.544883313684193e-05, + "loss": 1.0267, + "step": 17422 + }, + { + "epoch": 0.6239547334682257, + "grad_norm": 1.397748589515686, + "learning_rate": 6.543794868790015e-05, + "loss": 0.9345, + "step": 17423 + }, + { + "epoch": 0.623990545597794, + "grad_norm": 1.486702799797058, + "learning_rate": 6.542706470394614e-05, + "loss": 1.195, + "step": 17424 + }, + { + "epoch": 0.6240263577273623, + "grad_norm": 2.329179525375366, + "learning_rate": 6.54161811851264e-05, + "loss": 1.4294, + "step": 17425 + }, + { + "epoch": 0.6240621698569305, + "grad_norm": 1.5831080675125122, + "learning_rate": 6.540529813158732e-05, + "loss": 1.1936, + "step": 17426 + }, + { + "epoch": 0.6240979819864988, + "grad_norm": 1.5669142007827759, + "learning_rate": 6.539441554347537e-05, + "loss": 1.1184, + "step": 17427 + }, + { + "epoch": 0.6241337941160671, + "grad_norm": 1.9169379472732544, + "learning_rate": 6.538353342093689e-05, + "loss": 1.2311, + "step": 17428 + }, + { + "epoch": 0.6241696062456354, + "grad_norm": 1.5071868896484375, + "learning_rate": 6.537265176411831e-05, + "loss": 1.1897, + "step": 17429 + }, + { + "epoch": 0.6242054183752037, + "grad_norm": 1.3631478548049927, + "learning_rate": 6.536177057316605e-05, + "loss": 1.0593, + "step": 17430 + }, + { + "epoch": 0.624241230504772, + "grad_norm": 1.4648616313934326, + "learning_rate": 6.535088984822647e-05, + "loss": 1.2052, + "step": 17431 + }, + { + "epoch": 0.6242770426343403, + "grad_norm": 1.623153567314148, + "learning_rate": 6.5340009589446e-05, + "loss": 1.1343, + "step": 17432 + }, + { + "epoch": 0.6243128547639085, + "grad_norm": 1.6296943426132202, + "learning_rate": 6.532912979697095e-05, + "loss": 1.2858, + "step": 17433 + }, + { + "epoch": 0.6243486668934768, + "grad_norm": 1.3386585712432861, + "learning_rate": 6.531825047094778e-05, + "loss": 1.1021, + "step": 17434 + }, + { + "epoch": 0.6243844790230451, + "grad_norm": 1.4078309535980225, + "learning_rate": 6.530737161152278e-05, + "loss": 1.2515, + "step": 17435 + }, + { + "epoch": 0.6244202911526134, + "grad_norm": 1.6339778900146484, + "learning_rate": 6.529649321884237e-05, + "loss": 1.1607, + "step": 17436 + }, + { + "epoch": 0.6244561032821817, + "grad_norm": 1.481066107749939, + "learning_rate": 6.528561529305289e-05, + "loss": 1.0212, + "step": 17437 + }, + { + "epoch": 0.62449191541175, + "grad_norm": 1.1559150218963623, + "learning_rate": 6.527473783430064e-05, + "loss": 0.94, + "step": 17438 + }, + { + "epoch": 0.6245277275413182, + "grad_norm": 1.6909540891647339, + "learning_rate": 6.526386084273202e-05, + "loss": 1.081, + "step": 17439 + }, + { + "epoch": 0.6245635396708865, + "grad_norm": 1.6901590824127197, + "learning_rate": 6.525298431849334e-05, + "loss": 1.18, + "step": 17440 + }, + { + "epoch": 0.6245993518004548, + "grad_norm": 1.5826209783554077, + "learning_rate": 6.524210826173094e-05, + "loss": 1.0348, + "step": 17441 + }, + { + "epoch": 0.624635163930023, + "grad_norm": 1.4234706163406372, + "learning_rate": 6.523123267259113e-05, + "loss": 1.0251, + "step": 17442 + }, + { + "epoch": 0.6246709760595914, + "grad_norm": 2.0401418209075928, + "learning_rate": 6.522035755122024e-05, + "loss": 1.2323, + "step": 17443 + }, + { + "epoch": 0.6247067881891597, + "grad_norm": 1.674841046333313, + "learning_rate": 6.520948289776459e-05, + "loss": 1.2779, + "step": 17444 + }, + { + "epoch": 0.624742600318728, + "grad_norm": 1.2511893510818481, + "learning_rate": 6.519860871237046e-05, + "loss": 1.0495, + "step": 17445 + }, + { + "epoch": 0.6247784124482962, + "grad_norm": 1.808090329170227, + "learning_rate": 6.518773499518418e-05, + "loss": 1.1085, + "step": 17446 + }, + { + "epoch": 0.6248142245778645, + "grad_norm": 1.6183868646621704, + "learning_rate": 6.517686174635198e-05, + "loss": 1.097, + "step": 17447 + }, + { + "epoch": 0.6248500367074328, + "grad_norm": 1.403356909751892, + "learning_rate": 6.516598896602022e-05, + "loss": 1.0445, + "step": 17448 + }, + { + "epoch": 0.624885848837001, + "grad_norm": 1.3158543109893799, + "learning_rate": 6.515511665433513e-05, + "loss": 1.155, + "step": 17449 + }, + { + "epoch": 0.6249216609665694, + "grad_norm": 1.8171954154968262, + "learning_rate": 6.514424481144301e-05, + "loss": 1.1416, + "step": 17450 + }, + { + "epoch": 0.6249574730961377, + "grad_norm": 1.4023410081863403, + "learning_rate": 6.513337343749008e-05, + "loss": 1.266, + "step": 17451 + }, + { + "epoch": 0.624993285225706, + "grad_norm": 1.667371153831482, + "learning_rate": 6.512250253262268e-05, + "loss": 1.0534, + "step": 17452 + }, + { + "epoch": 0.6250290973552742, + "grad_norm": 1.7990913391113281, + "learning_rate": 6.511163209698701e-05, + "loss": 1.2033, + "step": 17453 + }, + { + "epoch": 0.6250649094848425, + "grad_norm": 1.6218280792236328, + "learning_rate": 6.510076213072932e-05, + "loss": 1.0941, + "step": 17454 + }, + { + "epoch": 0.6251007216144108, + "grad_norm": 1.7224005460739136, + "learning_rate": 6.508989263399588e-05, + "loss": 0.9991, + "step": 17455 + }, + { + "epoch": 0.625136533743979, + "grad_norm": 1.3533166646957397, + "learning_rate": 6.507902360693286e-05, + "loss": 1.097, + "step": 17456 + }, + { + "epoch": 0.6251723458735474, + "grad_norm": 1.6629915237426758, + "learning_rate": 6.506815504968657e-05, + "loss": 1.136, + "step": 17457 + }, + { + "epoch": 0.6252081580031157, + "grad_norm": 1.4154002666473389, + "learning_rate": 6.505728696240316e-05, + "loss": 1.0661, + "step": 17458 + }, + { + "epoch": 0.625243970132684, + "grad_norm": 2.086585760116577, + "learning_rate": 6.504641934522892e-05, + "loss": 0.9812, + "step": 17459 + }, + { + "epoch": 0.6252797822622522, + "grad_norm": 1.4624428749084473, + "learning_rate": 6.503555219830999e-05, + "loss": 0.8998, + "step": 17460 + }, + { + "epoch": 0.6253155943918205, + "grad_norm": 2.048004388809204, + "learning_rate": 6.502468552179263e-05, + "loss": 1.2847, + "step": 17461 + }, + { + "epoch": 0.6253514065213888, + "grad_norm": 1.313262701034546, + "learning_rate": 6.501381931582297e-05, + "loss": 1.1486, + "step": 17462 + }, + { + "epoch": 0.625387218650957, + "grad_norm": 2.2275705337524414, + "learning_rate": 6.500295358054729e-05, + "loss": 1.1396, + "step": 17463 + }, + { + "epoch": 0.6254230307805254, + "grad_norm": 1.5003199577331543, + "learning_rate": 6.499208831611172e-05, + "loss": 0.9625, + "step": 17464 + }, + { + "epoch": 0.6254588429100937, + "grad_norm": 1.8990800380706787, + "learning_rate": 6.498122352266242e-05, + "loss": 1.109, + "step": 17465 + }, + { + "epoch": 0.625494655039662, + "grad_norm": 1.3190957307815552, + "learning_rate": 6.497035920034561e-05, + "loss": 1.1075, + "step": 17466 + }, + { + "epoch": 0.6255304671692302, + "grad_norm": 2.0659713745117188, + "learning_rate": 6.49594953493074e-05, + "loss": 1.1576, + "step": 17467 + }, + { + "epoch": 0.6255662792987985, + "grad_norm": 1.619398832321167, + "learning_rate": 6.494863196969403e-05, + "loss": 1.2727, + "step": 17468 + }, + { + "epoch": 0.6256020914283668, + "grad_norm": 1.3242329359054565, + "learning_rate": 6.493776906165155e-05, + "loss": 1.1821, + "step": 17469 + }, + { + "epoch": 0.625637903557935, + "grad_norm": 1.7054651975631714, + "learning_rate": 6.49269066253262e-05, + "loss": 1.2365, + "step": 17470 + }, + { + "epoch": 0.6256737156875034, + "grad_norm": 1.5708777904510498, + "learning_rate": 6.491604466086405e-05, + "loss": 1.1318, + "step": 17471 + }, + { + "epoch": 0.6257095278170717, + "grad_norm": 1.492213487625122, + "learning_rate": 6.49051831684113e-05, + "loss": 0.9678, + "step": 17472 + }, + { + "epoch": 0.62574533994664, + "grad_norm": 1.4990956783294678, + "learning_rate": 6.489432214811403e-05, + "loss": 0.9659, + "step": 17473 + }, + { + "epoch": 0.6257811520762082, + "grad_norm": 1.7470909357070923, + "learning_rate": 6.488346160011835e-05, + "loss": 0.9916, + "step": 17474 + }, + { + "epoch": 0.6258169642057765, + "grad_norm": 1.5456887483596802, + "learning_rate": 6.487260152457041e-05, + "loss": 1.1314, + "step": 17475 + }, + { + "epoch": 0.6258527763353448, + "grad_norm": 1.6094509363174438, + "learning_rate": 6.486174192161632e-05, + "loss": 1.0929, + "step": 17476 + }, + { + "epoch": 0.625888588464913, + "grad_norm": 1.63298499584198, + "learning_rate": 6.485088279140214e-05, + "loss": 1.2919, + "step": 17477 + }, + { + "epoch": 0.6259244005944814, + "grad_norm": 1.5021857023239136, + "learning_rate": 6.484002413407401e-05, + "loss": 1.1329, + "step": 17478 + }, + { + "epoch": 0.6259602127240497, + "grad_norm": 1.5097153186798096, + "learning_rate": 6.4829165949778e-05, + "loss": 1.0996, + "step": 17479 + }, + { + "epoch": 0.6259960248536179, + "grad_norm": 1.4720555543899536, + "learning_rate": 6.481830823866018e-05, + "loss": 0.9611, + "step": 17480 + }, + { + "epoch": 0.6260318369831862, + "grad_norm": 1.3998948335647583, + "learning_rate": 6.480745100086668e-05, + "loss": 1.15, + "step": 17481 + }, + { + "epoch": 0.6260676491127545, + "grad_norm": 1.3621288537979126, + "learning_rate": 6.479659423654352e-05, + "loss": 1.042, + "step": 17482 + }, + { + "epoch": 0.6261034612423227, + "grad_norm": 1.4046883583068848, + "learning_rate": 6.478573794583673e-05, + "loss": 1.0129, + "step": 17483 + }, + { + "epoch": 0.626139273371891, + "grad_norm": 1.303134560585022, + "learning_rate": 6.477488212889246e-05, + "loss": 1.1448, + "step": 17484 + }, + { + "epoch": 0.6261750855014594, + "grad_norm": 1.7044483423233032, + "learning_rate": 6.476402678585669e-05, + "loss": 0.937, + "step": 17485 + }, + { + "epoch": 0.6262108976310277, + "grad_norm": 1.4585167169570923, + "learning_rate": 6.47531719168755e-05, + "loss": 1.1489, + "step": 17486 + }, + { + "epoch": 0.6262467097605959, + "grad_norm": 1.6175090074539185, + "learning_rate": 6.474231752209492e-05, + "loss": 1.2107, + "step": 17487 + }, + { + "epoch": 0.6262825218901642, + "grad_norm": 1.5941641330718994, + "learning_rate": 6.473146360166098e-05, + "loss": 1.0935, + "step": 17488 + }, + { + "epoch": 0.6263183340197325, + "grad_norm": 1.4100631475448608, + "learning_rate": 6.472061015571968e-05, + "loss": 1.1183, + "step": 17489 + }, + { + "epoch": 0.6263541461493007, + "grad_norm": 1.2519474029541016, + "learning_rate": 6.47097571844171e-05, + "loss": 1.0969, + "step": 17490 + }, + { + "epoch": 0.626389958278869, + "grad_norm": 1.358107089996338, + "learning_rate": 6.469890468789922e-05, + "loss": 1.2418, + "step": 17491 + }, + { + "epoch": 0.6264257704084374, + "grad_norm": 1.5383974313735962, + "learning_rate": 6.468805266631199e-05, + "loss": 1.1436, + "step": 17492 + }, + { + "epoch": 0.6264615825380057, + "grad_norm": 1.8543046712875366, + "learning_rate": 6.467720111980151e-05, + "loss": 1.2391, + "step": 17493 + }, + { + "epoch": 0.6264973946675739, + "grad_norm": 1.677651047706604, + "learning_rate": 6.466635004851367e-05, + "loss": 1.018, + "step": 17494 + }, + { + "epoch": 0.6265332067971422, + "grad_norm": 1.699271321296692, + "learning_rate": 6.46554994525946e-05, + "loss": 1.2369, + "step": 17495 + }, + { + "epoch": 0.6265690189267105, + "grad_norm": 1.3383865356445312, + "learning_rate": 6.46446493321901e-05, + "loss": 0.9505, + "step": 17496 + }, + { + "epoch": 0.6266048310562787, + "grad_norm": 1.283347249031067, + "learning_rate": 6.46337996874463e-05, + "loss": 1.0652, + "step": 17497 + }, + { + "epoch": 0.626640643185847, + "grad_norm": 1.4231702089309692, + "learning_rate": 6.462295051850907e-05, + "loss": 1.0054, + "step": 17498 + }, + { + "epoch": 0.6266764553154154, + "grad_norm": 1.472180724143982, + "learning_rate": 6.461210182552444e-05, + "loss": 1.1886, + "step": 17499 + }, + { + "epoch": 0.6267122674449837, + "grad_norm": 1.3132495880126953, + "learning_rate": 6.460125360863835e-05, + "loss": 0.8473, + "step": 17500 + }, + { + "epoch": 0.6267480795745519, + "grad_norm": 1.751433253288269, + "learning_rate": 6.459040586799666e-05, + "loss": 1.2711, + "step": 17501 + }, + { + "epoch": 0.6267838917041202, + "grad_norm": 1.8405518531799316, + "learning_rate": 6.457955860374545e-05, + "loss": 1.0957, + "step": 17502 + }, + { + "epoch": 0.6268197038336885, + "grad_norm": 1.6240873336791992, + "learning_rate": 6.456871181603054e-05, + "loss": 1.2634, + "step": 17503 + }, + { + "epoch": 0.6268555159632567, + "grad_norm": 1.458640694618225, + "learning_rate": 6.455786550499796e-05, + "loss": 1.0207, + "step": 17504 + }, + { + "epoch": 0.626891328092825, + "grad_norm": 1.4554731845855713, + "learning_rate": 6.454701967079354e-05, + "loss": 1.1032, + "step": 17505 + }, + { + "epoch": 0.6269271402223934, + "grad_norm": 2.025439739227295, + "learning_rate": 6.453617431356327e-05, + "loss": 1.3271, + "step": 17506 + }, + { + "epoch": 0.6269629523519616, + "grad_norm": 1.2582290172576904, + "learning_rate": 6.452532943345298e-05, + "loss": 1.1986, + "step": 17507 + }, + { + "epoch": 0.6269987644815299, + "grad_norm": 1.2079954147338867, + "learning_rate": 6.451448503060868e-05, + "loss": 1.0611, + "step": 17508 + }, + { + "epoch": 0.6270345766110982, + "grad_norm": 1.5749090909957886, + "learning_rate": 6.45036411051762e-05, + "loss": 0.9657, + "step": 17509 + }, + { + "epoch": 0.6270703887406665, + "grad_norm": 1.3532439470291138, + "learning_rate": 6.449279765730141e-05, + "loss": 1.2178, + "step": 17510 + }, + { + "epoch": 0.6271062008702347, + "grad_norm": 1.4125399589538574, + "learning_rate": 6.448195468713028e-05, + "loss": 0.967, + "step": 17511 + }, + { + "epoch": 0.627142012999803, + "grad_norm": 1.1541063785552979, + "learning_rate": 6.447111219480857e-05, + "loss": 1.0758, + "step": 17512 + }, + { + "epoch": 0.6271778251293714, + "grad_norm": 1.4404067993164062, + "learning_rate": 6.446027018048228e-05, + "loss": 1.1651, + "step": 17513 + }, + { + "epoch": 0.6272136372589396, + "grad_norm": 2.20259165763855, + "learning_rate": 6.444942864429713e-05, + "loss": 0.9773, + "step": 17514 + }, + { + "epoch": 0.6272494493885079, + "grad_norm": 1.4553864002227783, + "learning_rate": 6.443858758639916e-05, + "loss": 1.1197, + "step": 17515 + }, + { + "epoch": 0.6272852615180762, + "grad_norm": 1.784186840057373, + "learning_rate": 6.442774700693408e-05, + "loss": 1.0431, + "step": 17516 + }, + { + "epoch": 0.6273210736476444, + "grad_norm": 1.5066829919815063, + "learning_rate": 6.441690690604775e-05, + "loss": 1.4175, + "step": 17517 + }, + { + "epoch": 0.6273568857772127, + "grad_norm": 1.3638231754302979, + "learning_rate": 6.440606728388607e-05, + "loss": 1.1092, + "step": 17518 + }, + { + "epoch": 0.627392697906781, + "grad_norm": 1.8059946298599243, + "learning_rate": 6.439522814059483e-05, + "loss": 1.1574, + "step": 17519 + }, + { + "epoch": 0.6274285100363494, + "grad_norm": 1.788379192352295, + "learning_rate": 6.438438947631989e-05, + "loss": 1.0827, + "step": 17520 + }, + { + "epoch": 0.6274643221659176, + "grad_norm": 1.4672609567642212, + "learning_rate": 6.437355129120701e-05, + "loss": 0.9703, + "step": 17521 + }, + { + "epoch": 0.6275001342954859, + "grad_norm": 1.6264480352401733, + "learning_rate": 6.436271358540206e-05, + "loss": 0.9005, + "step": 17522 + }, + { + "epoch": 0.6275359464250542, + "grad_norm": 1.6059805154800415, + "learning_rate": 6.435187635905082e-05, + "loss": 0.825, + "step": 17523 + }, + { + "epoch": 0.6275717585546224, + "grad_norm": 1.73818838596344, + "learning_rate": 6.434103961229913e-05, + "loss": 1.3197, + "step": 17524 + }, + { + "epoch": 0.6276075706841907, + "grad_norm": 1.161342740058899, + "learning_rate": 6.433020334529275e-05, + "loss": 1.0332, + "step": 17525 + }, + { + "epoch": 0.627643382813759, + "grad_norm": 1.4771836996078491, + "learning_rate": 6.431936755817746e-05, + "loss": 1.1826, + "step": 17526 + }, + { + "epoch": 0.6276791949433274, + "grad_norm": 1.668797254562378, + "learning_rate": 6.430853225109908e-05, + "loss": 1.3284, + "step": 17527 + }, + { + "epoch": 0.6277150070728956, + "grad_norm": 1.3769892454147339, + "learning_rate": 6.42976974242033e-05, + "loss": 1.1815, + "step": 17528 + }, + { + "epoch": 0.6277508192024639, + "grad_norm": 2.030296564102173, + "learning_rate": 6.428686307763601e-05, + "loss": 1.0278, + "step": 17529 + }, + { + "epoch": 0.6277866313320322, + "grad_norm": 1.483023762702942, + "learning_rate": 6.427602921154287e-05, + "loss": 1.175, + "step": 17530 + }, + { + "epoch": 0.6278224434616004, + "grad_norm": 1.4083759784698486, + "learning_rate": 6.426519582606971e-05, + "loss": 0.9333, + "step": 17531 + }, + { + "epoch": 0.6278582555911687, + "grad_norm": 1.2792644500732422, + "learning_rate": 6.42543629213622e-05, + "loss": 1.1343, + "step": 17532 + }, + { + "epoch": 0.627894067720737, + "grad_norm": 1.6789478063583374, + "learning_rate": 6.424353049756618e-05, + "loss": 1.0857, + "step": 17533 + }, + { + "epoch": 0.6279298798503054, + "grad_norm": 1.8127111196517944, + "learning_rate": 6.423269855482732e-05, + "loss": 1.2577, + "step": 17534 + }, + { + "epoch": 0.6279656919798736, + "grad_norm": 1.3044772148132324, + "learning_rate": 6.422186709329134e-05, + "loss": 1.0976, + "step": 17535 + }, + { + "epoch": 0.6280015041094419, + "grad_norm": 1.8108503818511963, + "learning_rate": 6.421103611310402e-05, + "loss": 1.0114, + "step": 17536 + }, + { + "epoch": 0.6280373162390102, + "grad_norm": 1.399266242980957, + "learning_rate": 6.420020561441101e-05, + "loss": 1.0964, + "step": 17537 + }, + { + "epoch": 0.6280731283685784, + "grad_norm": 1.4934076070785522, + "learning_rate": 6.41893755973581e-05, + "loss": 1.1577, + "step": 17538 + }, + { + "epoch": 0.6281089404981467, + "grad_norm": 1.3891797065734863, + "learning_rate": 6.417854606209091e-05, + "loss": 1.0868, + "step": 17539 + }, + { + "epoch": 0.628144752627715, + "grad_norm": 1.6390074491500854, + "learning_rate": 6.41677170087552e-05, + "loss": 1.2989, + "step": 17540 + }, + { + "epoch": 0.6281805647572833, + "grad_norm": 1.6437740325927734, + "learning_rate": 6.41568884374966e-05, + "loss": 1.0723, + "step": 17541 + }, + { + "epoch": 0.6282163768868516, + "grad_norm": 1.6368166208267212, + "learning_rate": 6.414606034846087e-05, + "loss": 1.125, + "step": 17542 + }, + { + "epoch": 0.6282521890164199, + "grad_norm": 1.50877046585083, + "learning_rate": 6.413523274179365e-05, + "loss": 1.0442, + "step": 17543 + }, + { + "epoch": 0.6282880011459882, + "grad_norm": 1.7273821830749512, + "learning_rate": 6.412440561764059e-05, + "loss": 1.2084, + "step": 17544 + }, + { + "epoch": 0.6283238132755564, + "grad_norm": 1.5456620454788208, + "learning_rate": 6.411357897614738e-05, + "loss": 1.0166, + "step": 17545 + }, + { + "epoch": 0.6283596254051247, + "grad_norm": 1.421291470527649, + "learning_rate": 6.410275281745967e-05, + "loss": 1.0544, + "step": 17546 + }, + { + "epoch": 0.628395437534693, + "grad_norm": 1.562516212463379, + "learning_rate": 6.409192714172314e-05, + "loss": 1.0937, + "step": 17547 + }, + { + "epoch": 0.6284312496642613, + "grad_norm": 1.5629721879959106, + "learning_rate": 6.408110194908338e-05, + "loss": 1.127, + "step": 17548 + }, + { + "epoch": 0.6284670617938296, + "grad_norm": 1.3844603300094604, + "learning_rate": 6.407027723968611e-05, + "loss": 0.9503, + "step": 17549 + }, + { + "epoch": 0.6285028739233979, + "grad_norm": 1.4322032928466797, + "learning_rate": 6.405945301367687e-05, + "loss": 1.179, + "step": 17550 + }, + { + "epoch": 0.6285386860529661, + "grad_norm": 1.77381432056427, + "learning_rate": 6.404862927120134e-05, + "loss": 1.092, + "step": 17551 + }, + { + "epoch": 0.6285744981825344, + "grad_norm": 1.4107915163040161, + "learning_rate": 6.403780601240514e-05, + "loss": 0.9702, + "step": 17552 + }, + { + "epoch": 0.6286103103121027, + "grad_norm": 1.36702561378479, + "learning_rate": 6.402698323743385e-05, + "loss": 1.0629, + "step": 17553 + }, + { + "epoch": 0.628646122441671, + "grad_norm": 1.4845046997070312, + "learning_rate": 6.401616094643312e-05, + "loss": 1.1789, + "step": 17554 + }, + { + "epoch": 0.6286819345712393, + "grad_norm": 1.528497338294983, + "learning_rate": 6.400533913954851e-05, + "loss": 1.1011, + "step": 17555 + }, + { + "epoch": 0.6287177467008076, + "grad_norm": 1.3023006916046143, + "learning_rate": 6.399451781692567e-05, + "loss": 0.9694, + "step": 17556 + }, + { + "epoch": 0.6287535588303759, + "grad_norm": 1.4440717697143555, + "learning_rate": 6.398369697871011e-05, + "loss": 1.2338, + "step": 17557 + }, + { + "epoch": 0.6287893709599441, + "grad_norm": 1.7396231889724731, + "learning_rate": 6.397287662504747e-05, + "loss": 1.0911, + "step": 17558 + }, + { + "epoch": 0.6288251830895124, + "grad_norm": 1.7658004760742188, + "learning_rate": 6.39620567560833e-05, + "loss": 1.1495, + "step": 17559 + }, + { + "epoch": 0.6288609952190807, + "grad_norm": 1.7276005744934082, + "learning_rate": 6.395123737196316e-05, + "loss": 1.0703, + "step": 17560 + }, + { + "epoch": 0.628896807348649, + "grad_norm": 1.4718931913375854, + "learning_rate": 6.394041847283263e-05, + "loss": 1.0921, + "step": 17561 + }, + { + "epoch": 0.6289326194782173, + "grad_norm": 1.4478129148483276, + "learning_rate": 6.392960005883726e-05, + "loss": 1.0592, + "step": 17562 + }, + { + "epoch": 0.6289684316077856, + "grad_norm": 1.8870482444763184, + "learning_rate": 6.391878213012258e-05, + "loss": 1.2066, + "step": 17563 + }, + { + "epoch": 0.6290042437373539, + "grad_norm": 1.5883607864379883, + "learning_rate": 6.390796468683416e-05, + "loss": 1.2306, + "step": 17564 + }, + { + "epoch": 0.6290400558669221, + "grad_norm": 1.2060528993606567, + "learning_rate": 6.389714772911751e-05, + "loss": 1.0467, + "step": 17565 + }, + { + "epoch": 0.6290758679964904, + "grad_norm": 1.3669483661651611, + "learning_rate": 6.388633125711816e-05, + "loss": 1.1836, + "step": 17566 + }, + { + "epoch": 0.6291116801260587, + "grad_norm": 1.413088083267212, + "learning_rate": 6.387551527098165e-05, + "loss": 1.1424, + "step": 17567 + }, + { + "epoch": 0.6291474922556269, + "grad_norm": 1.5295591354370117, + "learning_rate": 6.386469977085348e-05, + "loss": 0.9098, + "step": 17568 + }, + { + "epoch": 0.6291833043851953, + "grad_norm": 1.7230175733566284, + "learning_rate": 6.385388475687918e-05, + "loss": 1.1998, + "step": 17569 + }, + { + "epoch": 0.6292191165147636, + "grad_norm": 1.5619099140167236, + "learning_rate": 6.384307022920424e-05, + "loss": 1.2225, + "step": 17570 + }, + { + "epoch": 0.6292549286443319, + "grad_norm": 1.362798810005188, + "learning_rate": 6.383225618797412e-05, + "loss": 1.0953, + "step": 17571 + }, + { + "epoch": 0.6292907407739001, + "grad_norm": 1.6707860231399536, + "learning_rate": 6.382144263333436e-05, + "loss": 1.2281, + "step": 17572 + }, + { + "epoch": 0.6293265529034684, + "grad_norm": 1.7024197578430176, + "learning_rate": 6.381062956543041e-05, + "loss": 1.2918, + "step": 17573 + }, + { + "epoch": 0.6293623650330367, + "grad_norm": 1.3044421672821045, + "learning_rate": 6.379981698440778e-05, + "loss": 1.0238, + "step": 17574 + }, + { + "epoch": 0.6293981771626049, + "grad_norm": 1.4740287065505981, + "learning_rate": 6.378900489041188e-05, + "loss": 1.1702, + "step": 17575 + }, + { + "epoch": 0.6294339892921733, + "grad_norm": 1.731438398361206, + "learning_rate": 6.377819328358826e-05, + "loss": 1.1302, + "step": 17576 + }, + { + "epoch": 0.6294698014217416, + "grad_norm": 1.6326874494552612, + "learning_rate": 6.37673821640823e-05, + "loss": 1.2219, + "step": 17577 + }, + { + "epoch": 0.6295056135513099, + "grad_norm": 1.4809300899505615, + "learning_rate": 6.375657153203947e-05, + "loss": 1.1835, + "step": 17578 + }, + { + "epoch": 0.6295414256808781, + "grad_norm": 1.4860923290252686, + "learning_rate": 6.374576138760525e-05, + "loss": 1.0912, + "step": 17579 + }, + { + "epoch": 0.6295772378104464, + "grad_norm": 1.5550262928009033, + "learning_rate": 6.3734951730925e-05, + "loss": 0.8505, + "step": 17580 + }, + { + "epoch": 0.6296130499400147, + "grad_norm": 1.5568283796310425, + "learning_rate": 6.372414256214423e-05, + "loss": 1.0362, + "step": 17581 + }, + { + "epoch": 0.6296488620695829, + "grad_norm": 1.8579840660095215, + "learning_rate": 6.37133338814083e-05, + "loss": 1.0778, + "step": 17582 + }, + { + "epoch": 0.6296846741991513, + "grad_norm": 1.6520122289657593, + "learning_rate": 6.370252568886267e-05, + "loss": 1.0823, + "step": 17583 + }, + { + "epoch": 0.6297204863287196, + "grad_norm": 2.386019706726074, + "learning_rate": 6.369171798465274e-05, + "loss": 1.1138, + "step": 17584 + }, + { + "epoch": 0.6297562984582878, + "grad_norm": 1.6266319751739502, + "learning_rate": 6.368091076892392e-05, + "loss": 1.2478, + "step": 17585 + }, + { + "epoch": 0.6297921105878561, + "grad_norm": 1.8989784717559814, + "learning_rate": 6.367010404182158e-05, + "loss": 1.1577, + "step": 17586 + }, + { + "epoch": 0.6298279227174244, + "grad_norm": 1.6200668811798096, + "learning_rate": 6.365929780349113e-05, + "loss": 1.1205, + "step": 17587 + }, + { + "epoch": 0.6298637348469927, + "grad_norm": 1.8046419620513916, + "learning_rate": 6.3648492054078e-05, + "loss": 1.178, + "step": 17588 + }, + { + "epoch": 0.6298995469765609, + "grad_norm": 1.9491394758224487, + "learning_rate": 6.363768679372744e-05, + "loss": 1.0225, + "step": 17589 + }, + { + "epoch": 0.6299353591061293, + "grad_norm": 1.6388294696807861, + "learning_rate": 6.362688202258496e-05, + "loss": 1.0511, + "step": 17590 + }, + { + "epoch": 0.6299711712356976, + "grad_norm": 2.0971603393554688, + "learning_rate": 6.361607774079581e-05, + "loss": 1.167, + "step": 17591 + }, + { + "epoch": 0.6300069833652658, + "grad_norm": 1.4311262369155884, + "learning_rate": 6.360527394850547e-05, + "loss": 1.0472, + "step": 17592 + }, + { + "epoch": 0.6300427954948341, + "grad_norm": 1.3620226383209229, + "learning_rate": 6.359447064585915e-05, + "loss": 1.0132, + "step": 17593 + }, + { + "epoch": 0.6300786076244024, + "grad_norm": 1.6939667463302612, + "learning_rate": 6.358366783300231e-05, + "loss": 0.9115, + "step": 17594 + }, + { + "epoch": 0.6301144197539706, + "grad_norm": 1.2496892213821411, + "learning_rate": 6.357286551008024e-05, + "loss": 1.0809, + "step": 17595 + }, + { + "epoch": 0.6301502318835389, + "grad_norm": 2.013871431350708, + "learning_rate": 6.356206367723829e-05, + "loss": 1.1897, + "step": 17596 + }, + { + "epoch": 0.6301860440131073, + "grad_norm": 1.562288761138916, + "learning_rate": 6.355126233462179e-05, + "loss": 1.1986, + "step": 17597 + }, + { + "epoch": 0.6302218561426756, + "grad_norm": 1.5255281925201416, + "learning_rate": 6.354046148237597e-05, + "loss": 1.1383, + "step": 17598 + }, + { + "epoch": 0.6302576682722438, + "grad_norm": 1.5681835412979126, + "learning_rate": 6.352966112064627e-05, + "loss": 1.2274, + "step": 17599 + }, + { + "epoch": 0.6302934804018121, + "grad_norm": 1.499267339706421, + "learning_rate": 6.351886124957789e-05, + "loss": 1.1452, + "step": 17600 + }, + { + "epoch": 0.6303292925313804, + "grad_norm": 1.491477608680725, + "learning_rate": 6.350806186931623e-05, + "loss": 1.1477, + "step": 17601 + }, + { + "epoch": 0.6303651046609486, + "grad_norm": 1.5275958776474, + "learning_rate": 6.349726298000647e-05, + "loss": 1.2565, + "step": 17602 + }, + { + "epoch": 0.6304009167905169, + "grad_norm": 1.3120532035827637, + "learning_rate": 6.3486464581794e-05, + "loss": 1.0993, + "step": 17603 + }, + { + "epoch": 0.6304367289200853, + "grad_norm": 1.3929855823516846, + "learning_rate": 6.347566667482401e-05, + "loss": 1.0828, + "step": 17604 + }, + { + "epoch": 0.6304725410496536, + "grad_norm": 1.7253235578536987, + "learning_rate": 6.346486925924184e-05, + "loss": 1.2027, + "step": 17605 + }, + { + "epoch": 0.6305083531792218, + "grad_norm": 1.7304437160491943, + "learning_rate": 6.345407233519273e-05, + "loss": 1.0705, + "step": 17606 + }, + { + "epoch": 0.6305441653087901, + "grad_norm": 1.8172622919082642, + "learning_rate": 6.344327590282189e-05, + "loss": 1.1699, + "step": 17607 + }, + { + "epoch": 0.6305799774383584, + "grad_norm": 2.0956363677978516, + "learning_rate": 6.343247996227469e-05, + "loss": 1.211, + "step": 17608 + }, + { + "epoch": 0.6306157895679266, + "grad_norm": 1.660258173942566, + "learning_rate": 6.342168451369623e-05, + "loss": 1.014, + "step": 17609 + }, + { + "epoch": 0.6306516016974949, + "grad_norm": 1.3153445720672607, + "learning_rate": 6.341088955723189e-05, + "loss": 0.805, + "step": 17610 + }, + { + "epoch": 0.6306874138270633, + "grad_norm": 1.374284267425537, + "learning_rate": 6.340009509302676e-05, + "loss": 1.1197, + "step": 17611 + }, + { + "epoch": 0.6307232259566316, + "grad_norm": 1.4103161096572876, + "learning_rate": 6.338930112122622e-05, + "loss": 1.042, + "step": 17612 + }, + { + "epoch": 0.6307590380861998, + "grad_norm": 1.7454252243041992, + "learning_rate": 6.337850764197539e-05, + "loss": 1.1561, + "step": 17613 + }, + { + "epoch": 0.6307948502157681, + "grad_norm": 1.2057628631591797, + "learning_rate": 6.336771465541947e-05, + "loss": 0.9141, + "step": 17614 + }, + { + "epoch": 0.6308306623453364, + "grad_norm": 1.7051236629486084, + "learning_rate": 6.33569221617037e-05, + "loss": 1.1808, + "step": 17615 + }, + { + "epoch": 0.6308664744749046, + "grad_norm": 1.608251690864563, + "learning_rate": 6.334613016097328e-05, + "loss": 1.0367, + "step": 17616 + }, + { + "epoch": 0.6309022866044729, + "grad_norm": 1.4963264465332031, + "learning_rate": 6.333533865337343e-05, + "loss": 1.2687, + "step": 17617 + }, + { + "epoch": 0.6309380987340413, + "grad_norm": 1.5541778802871704, + "learning_rate": 6.332454763904925e-05, + "loss": 1.1468, + "step": 17618 + }, + { + "epoch": 0.6309739108636095, + "grad_norm": 1.5394108295440674, + "learning_rate": 6.3313757118146e-05, + "loss": 0.9223, + "step": 17619 + }, + { + "epoch": 0.6310097229931778, + "grad_norm": 1.8266786336898804, + "learning_rate": 6.330296709080881e-05, + "loss": 1.0481, + "step": 17620 + }, + { + "epoch": 0.6310455351227461, + "grad_norm": 1.9095871448516846, + "learning_rate": 6.329217755718291e-05, + "loss": 1.3449, + "step": 17621 + }, + { + "epoch": 0.6310813472523144, + "grad_norm": 1.248246192932129, + "learning_rate": 6.328138851741338e-05, + "loss": 1.1396, + "step": 17622 + }, + { + "epoch": 0.6311171593818826, + "grad_norm": 1.2373238801956177, + "learning_rate": 6.32705999716454e-05, + "loss": 0.7754, + "step": 17623 + }, + { + "epoch": 0.6311529715114509, + "grad_norm": 1.5389076471328735, + "learning_rate": 6.325981192002413e-05, + "loss": 1.285, + "step": 17624 + }, + { + "epoch": 0.6311887836410193, + "grad_norm": 1.3713810443878174, + "learning_rate": 6.324902436269469e-05, + "loss": 1.0714, + "step": 17625 + }, + { + "epoch": 0.6312245957705875, + "grad_norm": 2.1228528022766113, + "learning_rate": 6.323823729980222e-05, + "loss": 1.3706, + "step": 17626 + }, + { + "epoch": 0.6312604079001558, + "grad_norm": 2.0005850791931152, + "learning_rate": 6.322745073149185e-05, + "loss": 1.2405, + "step": 17627 + }, + { + "epoch": 0.6312962200297241, + "grad_norm": 1.8052325248718262, + "learning_rate": 6.321666465790872e-05, + "loss": 1.2337, + "step": 17628 + }, + { + "epoch": 0.6313320321592923, + "grad_norm": 1.5439038276672363, + "learning_rate": 6.320587907919788e-05, + "loss": 1.269, + "step": 17629 + }, + { + "epoch": 0.6313678442888606, + "grad_norm": 1.5840282440185547, + "learning_rate": 6.319509399550452e-05, + "loss": 1.1114, + "step": 17630 + }, + { + "epoch": 0.6314036564184289, + "grad_norm": 1.5271720886230469, + "learning_rate": 6.318430940697367e-05, + "loss": 1.1901, + "step": 17631 + }, + { + "epoch": 0.6314394685479972, + "grad_norm": 1.325752854347229, + "learning_rate": 6.317352531375045e-05, + "loss": 1.0362, + "step": 17632 + }, + { + "epoch": 0.6314752806775655, + "grad_norm": 1.3515899181365967, + "learning_rate": 6.316274171597995e-05, + "loss": 1.1007, + "step": 17633 + }, + { + "epoch": 0.6315110928071338, + "grad_norm": 1.577736258506775, + "learning_rate": 6.315195861380722e-05, + "loss": 1.0871, + "step": 17634 + }, + { + "epoch": 0.6315469049367021, + "grad_norm": 1.6936144828796387, + "learning_rate": 6.314117600737738e-05, + "loss": 1.1607, + "step": 17635 + }, + { + "epoch": 0.6315827170662703, + "grad_norm": 1.4333863258361816, + "learning_rate": 6.313039389683546e-05, + "loss": 1.0032, + "step": 17636 + }, + { + "epoch": 0.6316185291958386, + "grad_norm": 1.5689942836761475, + "learning_rate": 6.311961228232654e-05, + "loss": 1.345, + "step": 17637 + }, + { + "epoch": 0.6316543413254069, + "grad_norm": 1.343542218208313, + "learning_rate": 6.310883116399567e-05, + "loss": 0.986, + "step": 17638 + }, + { + "epoch": 0.6316901534549751, + "grad_norm": 1.6481982469558716, + "learning_rate": 6.309805054198787e-05, + "loss": 1.1006, + "step": 17639 + }, + { + "epoch": 0.6317259655845435, + "grad_norm": 1.3164188861846924, + "learning_rate": 6.308727041644824e-05, + "loss": 1.2883, + "step": 17640 + }, + { + "epoch": 0.6317617777141118, + "grad_norm": 1.3897929191589355, + "learning_rate": 6.307649078752174e-05, + "loss": 1.2196, + "step": 17641 + }, + { + "epoch": 0.6317975898436801, + "grad_norm": 2.0298240184783936, + "learning_rate": 6.306571165535343e-05, + "loss": 1.1474, + "step": 17642 + }, + { + "epoch": 0.6318334019732483, + "grad_norm": 1.3218047618865967, + "learning_rate": 6.305493302008832e-05, + "loss": 1.2902, + "step": 17643 + }, + { + "epoch": 0.6318692141028166, + "grad_norm": 1.5286496877670288, + "learning_rate": 6.304415488187145e-05, + "loss": 1.1618, + "step": 17644 + }, + { + "epoch": 0.6319050262323849, + "grad_norm": 1.6577259302139282, + "learning_rate": 6.303337724084779e-05, + "loss": 1.2825, + "step": 17645 + }, + { + "epoch": 0.6319408383619531, + "grad_norm": 1.2250860929489136, + "learning_rate": 6.302260009716237e-05, + "loss": 1.0617, + "step": 17646 + }, + { + "epoch": 0.6319766504915215, + "grad_norm": 1.7770565748214722, + "learning_rate": 6.301182345096017e-05, + "loss": 1.1125, + "step": 17647 + }, + { + "epoch": 0.6320124626210898, + "grad_norm": 1.889599323272705, + "learning_rate": 6.300104730238616e-05, + "loss": 1.1593, + "step": 17648 + }, + { + "epoch": 0.6320482747506581, + "grad_norm": 1.678222417831421, + "learning_rate": 6.299027165158536e-05, + "loss": 1.1216, + "step": 17649 + }, + { + "epoch": 0.6320840868802263, + "grad_norm": 1.2463219165802002, + "learning_rate": 6.297949649870267e-05, + "loss": 0.8472, + "step": 17650 + }, + { + "epoch": 0.6321198990097946, + "grad_norm": 1.3579977750778198, + "learning_rate": 6.296872184388315e-05, + "loss": 1.074, + "step": 17651 + }, + { + "epoch": 0.6321557111393629, + "grad_norm": 1.5503835678100586, + "learning_rate": 6.295794768727168e-05, + "loss": 1.0536, + "step": 17652 + }, + { + "epoch": 0.6321915232689311, + "grad_norm": 1.631861686706543, + "learning_rate": 6.294717402901325e-05, + "loss": 1.1158, + "step": 17653 + }, + { + "epoch": 0.6322273353984995, + "grad_norm": 1.7681516408920288, + "learning_rate": 6.293640086925279e-05, + "loss": 1.1961, + "step": 17654 + }, + { + "epoch": 0.6322631475280678, + "grad_norm": 1.3225212097167969, + "learning_rate": 6.292562820813528e-05, + "loss": 1.0836, + "step": 17655 + }, + { + "epoch": 0.632298959657636, + "grad_norm": 1.5466558933258057, + "learning_rate": 6.291485604580559e-05, + "loss": 1.194, + "step": 17656 + }, + { + "epoch": 0.6323347717872043, + "grad_norm": 1.4557281732559204, + "learning_rate": 6.290408438240869e-05, + "loss": 1.1818, + "step": 17657 + }, + { + "epoch": 0.6323705839167726, + "grad_norm": 1.5945738554000854, + "learning_rate": 6.289331321808948e-05, + "loss": 1.0656, + "step": 17658 + }, + { + "epoch": 0.6324063960463409, + "grad_norm": 1.8639625310897827, + "learning_rate": 6.288254255299286e-05, + "loss": 1.2076, + "step": 17659 + }, + { + "epoch": 0.6324422081759091, + "grad_norm": 1.2105695009231567, + "learning_rate": 6.287177238726378e-05, + "loss": 1.1518, + "step": 17660 + }, + { + "epoch": 0.6324780203054775, + "grad_norm": 1.6630611419677734, + "learning_rate": 6.28610027210471e-05, + "loss": 1.2578, + "step": 17661 + }, + { + "epoch": 0.6325138324350458, + "grad_norm": 1.5363808870315552, + "learning_rate": 6.285023355448772e-05, + "loss": 1.0927, + "step": 17662 + }, + { + "epoch": 0.632549644564614, + "grad_norm": 1.9364310503005981, + "learning_rate": 6.283946488773051e-05, + "loss": 1.1854, + "step": 17663 + }, + { + "epoch": 0.6325854566941823, + "grad_norm": 1.4557451009750366, + "learning_rate": 6.282869672092039e-05, + "loss": 1.0643, + "step": 17664 + }, + { + "epoch": 0.6326212688237506, + "grad_norm": 1.3345681428909302, + "learning_rate": 6.281792905420219e-05, + "loss": 1.0662, + "step": 17665 + }, + { + "epoch": 0.6326570809533189, + "grad_norm": 1.4253777265548706, + "learning_rate": 6.280716188772082e-05, + "loss": 1.0264, + "step": 17666 + }, + { + "epoch": 0.6326928930828871, + "grad_norm": 2.196429967880249, + "learning_rate": 6.279639522162111e-05, + "loss": 1.1489, + "step": 17667 + }, + { + "epoch": 0.6327287052124555, + "grad_norm": 1.4568952322006226, + "learning_rate": 6.278562905604788e-05, + "loss": 1.0431, + "step": 17668 + }, + { + "epoch": 0.6327645173420238, + "grad_norm": 1.9677149057388306, + "learning_rate": 6.277486339114605e-05, + "loss": 1.087, + "step": 17669 + }, + { + "epoch": 0.632800329471592, + "grad_norm": 1.6410224437713623, + "learning_rate": 6.276409822706038e-05, + "loss": 1.1837, + "step": 17670 + }, + { + "epoch": 0.6328361416011603, + "grad_norm": 1.8572241067886353, + "learning_rate": 6.275333356393575e-05, + "loss": 1.0884, + "step": 17671 + }, + { + "epoch": 0.6328719537307286, + "grad_norm": 1.8470784425735474, + "learning_rate": 6.274256940191696e-05, + "loss": 1.1302, + "step": 17672 + }, + { + "epoch": 0.6329077658602968, + "grad_norm": 1.5259283781051636, + "learning_rate": 6.273180574114887e-05, + "loss": 1.2988, + "step": 17673 + }, + { + "epoch": 0.6329435779898651, + "grad_norm": 1.4060050249099731, + "learning_rate": 6.272104258177622e-05, + "loss": 0.8703, + "step": 17674 + }, + { + "epoch": 0.6329793901194335, + "grad_norm": 1.5998953580856323, + "learning_rate": 6.271027992394389e-05, + "loss": 1.4556, + "step": 17675 + }, + { + "epoch": 0.6330152022490018, + "grad_norm": 1.6549357175827026, + "learning_rate": 6.269951776779667e-05, + "loss": 1.0703, + "step": 17676 + }, + { + "epoch": 0.63305101437857, + "grad_norm": 1.3561474084854126, + "learning_rate": 6.268875611347925e-05, + "loss": 1.0085, + "step": 17677 + }, + { + "epoch": 0.6330868265081383, + "grad_norm": 1.9145816564559937, + "learning_rate": 6.267799496113656e-05, + "loss": 1.1003, + "step": 17678 + }, + { + "epoch": 0.6331226386377066, + "grad_norm": 1.3590693473815918, + "learning_rate": 6.266723431091323e-05, + "loss": 1.1114, + "step": 17679 + }, + { + "epoch": 0.6331584507672748, + "grad_norm": 2.024812698364258, + "learning_rate": 6.265647416295417e-05, + "loss": 1.1958, + "step": 17680 + }, + { + "epoch": 0.6331942628968431, + "grad_norm": 1.3147846460342407, + "learning_rate": 6.264571451740405e-05, + "loss": 1.146, + "step": 17681 + }, + { + "epoch": 0.6332300750264115, + "grad_norm": 1.284769892692566, + "learning_rate": 6.263495537440766e-05, + "loss": 1.1513, + "step": 17682 + }, + { + "epoch": 0.6332658871559798, + "grad_norm": 1.8035635948181152, + "learning_rate": 6.262419673410976e-05, + "loss": 1.2047, + "step": 17683 + }, + { + "epoch": 0.633301699285548, + "grad_norm": 1.4228464365005493, + "learning_rate": 6.261343859665507e-05, + "loss": 1.0476, + "step": 17684 + }, + { + "epoch": 0.6333375114151163, + "grad_norm": 1.6565254926681519, + "learning_rate": 6.260268096218838e-05, + "loss": 1.1227, + "step": 17685 + }, + { + "epoch": 0.6333733235446846, + "grad_norm": 1.6008625030517578, + "learning_rate": 6.259192383085432e-05, + "loss": 1.0799, + "step": 17686 + }, + { + "epoch": 0.6334091356742528, + "grad_norm": 1.3039864301681519, + "learning_rate": 6.258116720279773e-05, + "loss": 0.8753, + "step": 17687 + }, + { + "epoch": 0.6334449478038211, + "grad_norm": 1.7347725629806519, + "learning_rate": 6.257041107816319e-05, + "loss": 1.2041, + "step": 17688 + }, + { + "epoch": 0.6334807599333895, + "grad_norm": 1.4404733180999756, + "learning_rate": 6.255965545709556e-05, + "loss": 0.9865, + "step": 17689 + }, + { + "epoch": 0.6335165720629578, + "grad_norm": 1.3351554870605469, + "learning_rate": 6.254890033973942e-05, + "loss": 1.2504, + "step": 17690 + }, + { + "epoch": 0.633552384192526, + "grad_norm": 1.3336395025253296, + "learning_rate": 6.253814572623958e-05, + "loss": 1.1588, + "step": 17691 + }, + { + "epoch": 0.6335881963220943, + "grad_norm": 1.2151726484298706, + "learning_rate": 6.252739161674059e-05, + "loss": 0.9852, + "step": 17692 + }, + { + "epoch": 0.6336240084516626, + "grad_norm": 1.3406100273132324, + "learning_rate": 6.251663801138725e-05, + "loss": 1.1363, + "step": 17693 + }, + { + "epoch": 0.6336598205812308, + "grad_norm": 1.333701729774475, + "learning_rate": 6.250588491032421e-05, + "loss": 1.1971, + "step": 17694 + }, + { + "epoch": 0.6336956327107991, + "grad_norm": 1.3213213682174683, + "learning_rate": 6.249513231369608e-05, + "loss": 1.0874, + "step": 17695 + }, + { + "epoch": 0.6337314448403675, + "grad_norm": 1.3899720907211304, + "learning_rate": 6.248438022164763e-05, + "loss": 1.0108, + "step": 17696 + }, + { + "epoch": 0.6337672569699357, + "grad_norm": 1.560310959815979, + "learning_rate": 6.247362863432337e-05, + "loss": 1.0809, + "step": 17697 + }, + { + "epoch": 0.633803069099504, + "grad_norm": 1.2601875066757202, + "learning_rate": 6.246287755186813e-05, + "loss": 1.0005, + "step": 17698 + }, + { + "epoch": 0.6338388812290723, + "grad_norm": 1.4635531902313232, + "learning_rate": 6.245212697442637e-05, + "loss": 1.0579, + "step": 17699 + }, + { + "epoch": 0.6338746933586406, + "grad_norm": 1.428498387336731, + "learning_rate": 6.244137690214287e-05, + "loss": 0.9211, + "step": 17700 + }, + { + "epoch": 0.6339105054882088, + "grad_norm": 1.8131414651870728, + "learning_rate": 6.243062733516211e-05, + "loss": 1.2256, + "step": 17701 + }, + { + "epoch": 0.6339463176177771, + "grad_norm": 1.3344218730926514, + "learning_rate": 6.24198782736289e-05, + "loss": 0.9896, + "step": 17702 + }, + { + "epoch": 0.6339821297473455, + "grad_norm": 1.3612680435180664, + "learning_rate": 6.240912971768771e-05, + "loss": 0.9672, + "step": 17703 + }, + { + "epoch": 0.6340179418769137, + "grad_norm": 1.796761393547058, + "learning_rate": 6.239838166748318e-05, + "loss": 0.9168, + "step": 17704 + }, + { + "epoch": 0.634053754006482, + "grad_norm": 1.6213324069976807, + "learning_rate": 6.238763412315993e-05, + "loss": 1.0023, + "step": 17705 + }, + { + "epoch": 0.6340895661360503, + "grad_norm": 2.623112678527832, + "learning_rate": 6.237688708486252e-05, + "loss": 1.1999, + "step": 17706 + }, + { + "epoch": 0.6341253782656185, + "grad_norm": 1.5201163291931152, + "learning_rate": 6.236614055273562e-05, + "loss": 1.0291, + "step": 17707 + }, + { + "epoch": 0.6341611903951868, + "grad_norm": 1.6977946758270264, + "learning_rate": 6.23553945269237e-05, + "loss": 1.1869, + "step": 17708 + }, + { + "epoch": 0.6341970025247551, + "grad_norm": 1.2454872131347656, + "learning_rate": 6.234464900757144e-05, + "loss": 1.0883, + "step": 17709 + }, + { + "epoch": 0.6342328146543235, + "grad_norm": 1.2695297002792358, + "learning_rate": 6.233390399482334e-05, + "loss": 1.0191, + "step": 17710 + }, + { + "epoch": 0.6342686267838917, + "grad_norm": 1.7306476831436157, + "learning_rate": 6.232315948882394e-05, + "loss": 1.0617, + "step": 17711 + }, + { + "epoch": 0.63430443891346, + "grad_norm": 1.4219025373458862, + "learning_rate": 6.231241548971788e-05, + "loss": 1.0827, + "step": 17712 + }, + { + "epoch": 0.6343402510430283, + "grad_norm": 1.7102876901626587, + "learning_rate": 6.230167199764962e-05, + "loss": 1.2238, + "step": 17713 + }, + { + "epoch": 0.6343760631725965, + "grad_norm": 1.553200125694275, + "learning_rate": 6.229092901276376e-05, + "loss": 1.2028, + "step": 17714 + }, + { + "epoch": 0.6344118753021648, + "grad_norm": 1.5640007257461548, + "learning_rate": 6.228018653520477e-05, + "loss": 1.0478, + "step": 17715 + }, + { + "epoch": 0.6344476874317331, + "grad_norm": 1.5186718702316284, + "learning_rate": 6.226944456511725e-05, + "loss": 1.1984, + "step": 17716 + }, + { + "epoch": 0.6344834995613015, + "grad_norm": 1.3609168529510498, + "learning_rate": 6.225870310264567e-05, + "loss": 1.0476, + "step": 17717 + }, + { + "epoch": 0.6345193116908697, + "grad_norm": 1.7173393964767456, + "learning_rate": 6.224796214793458e-05, + "loss": 1.2759, + "step": 17718 + }, + { + "epoch": 0.634555123820438, + "grad_norm": 1.430355429649353, + "learning_rate": 6.223722170112845e-05, + "loss": 1.1051, + "step": 17719 + }, + { + "epoch": 0.6345909359500063, + "grad_norm": 2.180199146270752, + "learning_rate": 6.222648176237179e-05, + "loss": 1.2797, + "step": 17720 + }, + { + "epoch": 0.6346267480795745, + "grad_norm": 1.472545862197876, + "learning_rate": 6.221574233180907e-05, + "loss": 1.1473, + "step": 17721 + }, + { + "epoch": 0.6346625602091428, + "grad_norm": 1.4164681434631348, + "learning_rate": 6.220500340958482e-05, + "loss": 0.9889, + "step": 17722 + }, + { + "epoch": 0.6346983723387111, + "grad_norm": 1.6178150177001953, + "learning_rate": 6.219426499584351e-05, + "loss": 1.1756, + "step": 17723 + }, + { + "epoch": 0.6347341844682794, + "grad_norm": 1.6591317653656006, + "learning_rate": 6.218352709072957e-05, + "loss": 1.1539, + "step": 17724 + }, + { + "epoch": 0.6347699965978477, + "grad_norm": 1.4133738279342651, + "learning_rate": 6.21727896943875e-05, + "loss": 1.0072, + "step": 17725 + }, + { + "epoch": 0.634805808727416, + "grad_norm": 1.5840389728546143, + "learning_rate": 6.216205280696177e-05, + "loss": 1.0057, + "step": 17726 + }, + { + "epoch": 0.6348416208569843, + "grad_norm": 1.6789524555206299, + "learning_rate": 6.21513164285968e-05, + "loss": 1.2645, + "step": 17727 + }, + { + "epoch": 0.6348774329865525, + "grad_norm": 1.6570730209350586, + "learning_rate": 6.214058055943706e-05, + "loss": 1.2078, + "step": 17728 + }, + { + "epoch": 0.6349132451161208, + "grad_norm": 1.2244049310684204, + "learning_rate": 6.212984519962695e-05, + "loss": 1.0861, + "step": 17729 + }, + { + "epoch": 0.6349490572456891, + "grad_norm": 1.422406792640686, + "learning_rate": 6.211911034931094e-05, + "loss": 1.1291, + "step": 17730 + }, + { + "epoch": 0.6349848693752574, + "grad_norm": 1.6300442218780518, + "learning_rate": 6.210837600863342e-05, + "loss": 1.0161, + "step": 17731 + }, + { + "epoch": 0.6350206815048257, + "grad_norm": 1.4994882345199585, + "learning_rate": 6.209764217773884e-05, + "loss": 1.106, + "step": 17732 + }, + { + "epoch": 0.635056493634394, + "grad_norm": 1.6083308458328247, + "learning_rate": 6.208690885677158e-05, + "loss": 0.9997, + "step": 17733 + }, + { + "epoch": 0.6350923057639623, + "grad_norm": 2.172194719314575, + "learning_rate": 6.207617604587607e-05, + "loss": 1.1652, + "step": 17734 + }, + { + "epoch": 0.6351281178935305, + "grad_norm": 1.6909304857254028, + "learning_rate": 6.20654437451967e-05, + "loss": 1.2221, + "step": 17735 + }, + { + "epoch": 0.6351639300230988, + "grad_norm": 1.457528829574585, + "learning_rate": 6.205471195487784e-05, + "loss": 1.1909, + "step": 17736 + }, + { + "epoch": 0.6351997421526671, + "grad_norm": 1.5661876201629639, + "learning_rate": 6.204398067506389e-05, + "loss": 1.188, + "step": 17737 + }, + { + "epoch": 0.6352355542822354, + "grad_norm": 1.4519020318984985, + "learning_rate": 6.203324990589922e-05, + "loss": 1.1217, + "step": 17738 + }, + { + "epoch": 0.6352713664118037, + "grad_norm": 1.4689581394195557, + "learning_rate": 6.20225196475282e-05, + "loss": 1.0565, + "step": 17739 + }, + { + "epoch": 0.635307178541372, + "grad_norm": 1.3696770668029785, + "learning_rate": 6.201178990009518e-05, + "loss": 1.1559, + "step": 17740 + }, + { + "epoch": 0.6353429906709402, + "grad_norm": 1.7151514291763306, + "learning_rate": 6.200106066374454e-05, + "loss": 1.1489, + "step": 17741 + }, + { + "epoch": 0.6353788028005085, + "grad_norm": 1.8854966163635254, + "learning_rate": 6.199033193862059e-05, + "loss": 1.1086, + "step": 17742 + }, + { + "epoch": 0.6354146149300768, + "grad_norm": 1.5919781923294067, + "learning_rate": 6.197960372486772e-05, + "loss": 1.1376, + "step": 17743 + }, + { + "epoch": 0.635450427059645, + "grad_norm": 1.5457743406295776, + "learning_rate": 6.196887602263022e-05, + "loss": 1.0395, + "step": 17744 + }, + { + "epoch": 0.6354862391892134, + "grad_norm": 1.6208319664001465, + "learning_rate": 6.195814883205245e-05, + "loss": 1.1761, + "step": 17745 + }, + { + "epoch": 0.6355220513187817, + "grad_norm": 2.0025105476379395, + "learning_rate": 6.194742215327873e-05, + "loss": 0.8997, + "step": 17746 + }, + { + "epoch": 0.63555786344835, + "grad_norm": 1.4292160272598267, + "learning_rate": 6.193669598645334e-05, + "loss": 1.067, + "step": 17747 + }, + { + "epoch": 0.6355936755779182, + "grad_norm": 1.5340646505355835, + "learning_rate": 6.19259703317206e-05, + "loss": 1.045, + "step": 17748 + }, + { + "epoch": 0.6356294877074865, + "grad_norm": 1.9685146808624268, + "learning_rate": 6.191524518922482e-05, + "loss": 1.0993, + "step": 17749 + }, + { + "epoch": 0.6356652998370548, + "grad_norm": 1.3495169878005981, + "learning_rate": 6.190452055911031e-05, + "loss": 1.0811, + "step": 17750 + }, + { + "epoch": 0.635701111966623, + "grad_norm": 1.2784981727600098, + "learning_rate": 6.189379644152132e-05, + "loss": 1.0583, + "step": 17751 + }, + { + "epoch": 0.6357369240961914, + "grad_norm": 1.4263583421707153, + "learning_rate": 6.188307283660216e-05, + "loss": 1.1493, + "step": 17752 + }, + { + "epoch": 0.6357727362257597, + "grad_norm": 1.6826074123382568, + "learning_rate": 6.187234974449707e-05, + "loss": 1.0696, + "step": 17753 + }, + { + "epoch": 0.635808548355328, + "grad_norm": 1.4222332239151, + "learning_rate": 6.186162716535036e-05, + "loss": 1.0362, + "step": 17754 + }, + { + "epoch": 0.6358443604848962, + "grad_norm": 1.4730768203735352, + "learning_rate": 6.185090509930624e-05, + "loss": 1.1405, + "step": 17755 + }, + { + "epoch": 0.6358801726144645, + "grad_norm": 1.372645616531372, + "learning_rate": 6.184018354650898e-05, + "loss": 1.0129, + "step": 17756 + }, + { + "epoch": 0.6359159847440328, + "grad_norm": 1.5796617269515991, + "learning_rate": 6.182946250710284e-05, + "loss": 1.2186, + "step": 17757 + }, + { + "epoch": 0.635951796873601, + "grad_norm": 1.6991322040557861, + "learning_rate": 6.181874198123203e-05, + "loss": 1.3749, + "step": 17758 + }, + { + "epoch": 0.6359876090031694, + "grad_norm": 1.431535243988037, + "learning_rate": 6.18080219690408e-05, + "loss": 0.9923, + "step": 17759 + }, + { + "epoch": 0.6360234211327377, + "grad_norm": 1.4452301263809204, + "learning_rate": 6.179730247067336e-05, + "loss": 1.0389, + "step": 17760 + }, + { + "epoch": 0.636059233262306, + "grad_norm": 1.2557216882705688, + "learning_rate": 6.178658348627398e-05, + "loss": 1.0207, + "step": 17761 + }, + { + "epoch": 0.6360950453918742, + "grad_norm": 1.4305990934371948, + "learning_rate": 6.177586501598679e-05, + "loss": 0.9949, + "step": 17762 + }, + { + "epoch": 0.6361308575214425, + "grad_norm": 1.6065775156021118, + "learning_rate": 6.176514705995604e-05, + "loss": 1.3421, + "step": 17763 + }, + { + "epoch": 0.6361666696510108, + "grad_norm": 1.3639861345291138, + "learning_rate": 6.175442961832593e-05, + "loss": 1.0607, + "step": 17764 + }, + { + "epoch": 0.636202481780579, + "grad_norm": 1.6258851289749146, + "learning_rate": 6.174371269124061e-05, + "loss": 1.2158, + "step": 17765 + }, + { + "epoch": 0.6362382939101474, + "grad_norm": 1.2850364446640015, + "learning_rate": 6.173299627884432e-05, + "loss": 1.0476, + "step": 17766 + }, + { + "epoch": 0.6362741060397157, + "grad_norm": 1.2771657705307007, + "learning_rate": 6.172228038128118e-05, + "loss": 1.0467, + "step": 17767 + }, + { + "epoch": 0.636309918169284, + "grad_norm": 1.1859009265899658, + "learning_rate": 6.171156499869539e-05, + "loss": 1.0551, + "step": 17768 + }, + { + "epoch": 0.6363457302988522, + "grad_norm": 1.3238558769226074, + "learning_rate": 6.17008501312311e-05, + "loss": 0.852, + "step": 17769 + }, + { + "epoch": 0.6363815424284205, + "grad_norm": 1.5301845073699951, + "learning_rate": 6.169013577903248e-05, + "loss": 1.2288, + "step": 17770 + }, + { + "epoch": 0.6364173545579888, + "grad_norm": 1.97567880153656, + "learning_rate": 6.167942194224365e-05, + "loss": 1.0726, + "step": 17771 + }, + { + "epoch": 0.636453166687557, + "grad_norm": 1.5739493370056152, + "learning_rate": 6.16687086210088e-05, + "loss": 1.2386, + "step": 17772 + }, + { + "epoch": 0.6364889788171254, + "grad_norm": 1.577728033065796, + "learning_rate": 6.165799581547203e-05, + "loss": 1.1319, + "step": 17773 + }, + { + "epoch": 0.6365247909466937, + "grad_norm": 1.7608850002288818, + "learning_rate": 6.164728352577743e-05, + "loss": 1.1354, + "step": 17774 + }, + { + "epoch": 0.6365606030762619, + "grad_norm": 1.6601589918136597, + "learning_rate": 6.16365717520692e-05, + "loss": 1.0181, + "step": 17775 + }, + { + "epoch": 0.6365964152058302, + "grad_norm": 1.6378363370895386, + "learning_rate": 6.162586049449136e-05, + "loss": 1.104, + "step": 17776 + }, + { + "epoch": 0.6366322273353985, + "grad_norm": 1.4905824661254883, + "learning_rate": 6.161514975318809e-05, + "loss": 1.2106, + "step": 17777 + }, + { + "epoch": 0.6366680394649668, + "grad_norm": 1.6253085136413574, + "learning_rate": 6.160443952830347e-05, + "loss": 1.0793, + "step": 17778 + }, + { + "epoch": 0.636703851594535, + "grad_norm": 1.5669382810592651, + "learning_rate": 6.159372981998161e-05, + "loss": 1.0872, + "step": 17779 + }, + { + "epoch": 0.6367396637241034, + "grad_norm": 1.626028299331665, + "learning_rate": 6.158302062836654e-05, + "loss": 1.2178, + "step": 17780 + }, + { + "epoch": 0.6367754758536717, + "grad_norm": 2.099911689758301, + "learning_rate": 6.157231195360241e-05, + "loss": 1.2327, + "step": 17781 + }, + { + "epoch": 0.6368112879832399, + "grad_norm": 1.5650746822357178, + "learning_rate": 6.156160379583325e-05, + "loss": 1.0113, + "step": 17782 + }, + { + "epoch": 0.6368471001128082, + "grad_norm": 1.9487766027450562, + "learning_rate": 6.155089615520308e-05, + "loss": 1.187, + "step": 17783 + }, + { + "epoch": 0.6368829122423765, + "grad_norm": 1.7307301759719849, + "learning_rate": 6.154018903185608e-05, + "loss": 0.9656, + "step": 17784 + }, + { + "epoch": 0.6369187243719447, + "grad_norm": 1.540452480316162, + "learning_rate": 6.152948242593615e-05, + "loss": 1.1466, + "step": 17785 + }, + { + "epoch": 0.636954536501513, + "grad_norm": 1.4585967063903809, + "learning_rate": 6.15187763375875e-05, + "loss": 1.1828, + "step": 17786 + }, + { + "epoch": 0.6369903486310814, + "grad_norm": 1.7153079509735107, + "learning_rate": 6.150807076695399e-05, + "loss": 1.0014, + "step": 17787 + }, + { + "epoch": 0.6370261607606497, + "grad_norm": 1.269416332244873, + "learning_rate": 6.149736571417979e-05, + "loss": 1.0375, + "step": 17788 + }, + { + "epoch": 0.6370619728902179, + "grad_norm": 1.6048264503479004, + "learning_rate": 6.148666117940882e-05, + "loss": 1.1917, + "step": 17789 + }, + { + "epoch": 0.6370977850197862, + "grad_norm": 1.2753033638000488, + "learning_rate": 6.147595716278519e-05, + "loss": 1.0077, + "step": 17790 + }, + { + "epoch": 0.6371335971493545, + "grad_norm": 1.3785865306854248, + "learning_rate": 6.146525366445288e-05, + "loss": 1.1571, + "step": 17791 + }, + { + "epoch": 0.6371694092789227, + "grad_norm": 1.5807839632034302, + "learning_rate": 6.145455068455583e-05, + "loss": 1.0134, + "step": 17792 + }, + { + "epoch": 0.637205221408491, + "grad_norm": 1.764299988746643, + "learning_rate": 6.144384822323812e-05, + "loss": 1.0454, + "step": 17793 + }, + { + "epoch": 0.6372410335380594, + "grad_norm": 1.3979873657226562, + "learning_rate": 6.143314628064365e-05, + "loss": 1.1794, + "step": 17794 + }, + { + "epoch": 0.6372768456676277, + "grad_norm": 1.2107250690460205, + "learning_rate": 6.14224448569165e-05, + "loss": 1.0934, + "step": 17795 + }, + { + "epoch": 0.6373126577971959, + "grad_norm": 1.5645512342453003, + "learning_rate": 6.141174395220053e-05, + "loss": 1.1336, + "step": 17796 + }, + { + "epoch": 0.6373484699267642, + "grad_norm": 1.5787421464920044, + "learning_rate": 6.140104356663984e-05, + "loss": 1.1046, + "step": 17797 + }, + { + "epoch": 0.6373842820563325, + "grad_norm": 1.7438632249832153, + "learning_rate": 6.139034370037826e-05, + "loss": 1.1753, + "step": 17798 + }, + { + "epoch": 0.6374200941859007, + "grad_norm": 1.3033710718154907, + "learning_rate": 6.137964435355984e-05, + "loss": 1.312, + "step": 17799 + }, + { + "epoch": 0.637455906315469, + "grad_norm": 1.3306900262832642, + "learning_rate": 6.13689455263285e-05, + "loss": 0.9219, + "step": 17800 + }, + { + "epoch": 0.6374917184450374, + "grad_norm": 1.89306640625, + "learning_rate": 6.135824721882815e-05, + "loss": 1.1457, + "step": 17801 + }, + { + "epoch": 0.6375275305746056, + "grad_norm": 1.3574854135513306, + "learning_rate": 6.134754943120273e-05, + "loss": 1.3005, + "step": 17802 + }, + { + "epoch": 0.6375633427041739, + "grad_norm": 1.4426542520523071, + "learning_rate": 6.133685216359615e-05, + "loss": 1.0105, + "step": 17803 + }, + { + "epoch": 0.6375991548337422, + "grad_norm": 1.838086485862732, + "learning_rate": 6.132615541615242e-05, + "loss": 1.153, + "step": 17804 + }, + { + "epoch": 0.6376349669633105, + "grad_norm": 1.229029893875122, + "learning_rate": 6.131545918901531e-05, + "loss": 1.0881, + "step": 17805 + }, + { + "epoch": 0.6376707790928787, + "grad_norm": 1.4614439010620117, + "learning_rate": 6.130476348232887e-05, + "loss": 1.0018, + "step": 17806 + }, + { + "epoch": 0.637706591222447, + "grad_norm": 1.3628621101379395, + "learning_rate": 6.12940682962369e-05, + "loss": 1.0044, + "step": 17807 + }, + { + "epoch": 0.6377424033520154, + "grad_norm": 1.6438707113265991, + "learning_rate": 6.128337363088327e-05, + "loss": 1.162, + "step": 17808 + }, + { + "epoch": 0.6377782154815836, + "grad_norm": 1.2645467519760132, + "learning_rate": 6.127267948641195e-05, + "loss": 0.9671, + "step": 17809 + }, + { + "epoch": 0.6378140276111519, + "grad_norm": 1.438874363899231, + "learning_rate": 6.126198586296676e-05, + "loss": 1.0027, + "step": 17810 + }, + { + "epoch": 0.6378498397407202, + "grad_norm": 1.567765235900879, + "learning_rate": 6.12512927606916e-05, + "loss": 1.109, + "step": 17811 + }, + { + "epoch": 0.6378856518702885, + "grad_norm": 1.675814151763916, + "learning_rate": 6.124060017973027e-05, + "loss": 1.4189, + "step": 17812 + }, + { + "epoch": 0.6379214639998567, + "grad_norm": 1.5570563077926636, + "learning_rate": 6.122990812022671e-05, + "loss": 1.0338, + "step": 17813 + }, + { + "epoch": 0.637957276129425, + "grad_norm": 1.4989540576934814, + "learning_rate": 6.12192165823247e-05, + "loss": 0.9989, + "step": 17814 + }, + { + "epoch": 0.6379930882589934, + "grad_norm": 1.9421133995056152, + "learning_rate": 6.120852556616811e-05, + "loss": 1.2141, + "step": 17815 + }, + { + "epoch": 0.6380289003885616, + "grad_norm": 1.4809482097625732, + "learning_rate": 6.11978350719008e-05, + "loss": 1.0163, + "step": 17816 + }, + { + "epoch": 0.6380647125181299, + "grad_norm": 1.9231292009353638, + "learning_rate": 6.118714509966654e-05, + "loss": 0.9846, + "step": 17817 + }, + { + "epoch": 0.6381005246476982, + "grad_norm": 1.6868020296096802, + "learning_rate": 6.117645564960919e-05, + "loss": 0.9184, + "step": 17818 + }, + { + "epoch": 0.6381363367772664, + "grad_norm": 1.1999170780181885, + "learning_rate": 6.116576672187254e-05, + "loss": 1.1912, + "step": 17819 + }, + { + "epoch": 0.6381721489068347, + "grad_norm": 1.325439214706421, + "learning_rate": 6.115507831660042e-05, + "loss": 1.1471, + "step": 17820 + }, + { + "epoch": 0.638207961036403, + "grad_norm": 1.4599874019622803, + "learning_rate": 6.11443904339366e-05, + "loss": 1.3892, + "step": 17821 + }, + { + "epoch": 0.6382437731659714, + "grad_norm": 1.7021801471710205, + "learning_rate": 6.11337030740249e-05, + "loss": 1.1431, + "step": 17822 + }, + { + "epoch": 0.6382795852955396, + "grad_norm": 1.6416183710098267, + "learning_rate": 6.112301623700907e-05, + "loss": 1.055, + "step": 17823 + }, + { + "epoch": 0.6383153974251079, + "grad_norm": 1.2838189601898193, + "learning_rate": 6.111232992303292e-05, + "loss": 1.105, + "step": 17824 + }, + { + "epoch": 0.6383512095546762, + "grad_norm": 1.7090554237365723, + "learning_rate": 6.110164413224025e-05, + "loss": 1.1877, + "step": 17825 + }, + { + "epoch": 0.6383870216842444, + "grad_norm": 1.8541678190231323, + "learning_rate": 6.109095886477472e-05, + "loss": 1.1756, + "step": 17826 + }, + { + "epoch": 0.6384228338138127, + "grad_norm": 1.5368341207504272, + "learning_rate": 6.108027412078018e-05, + "loss": 1.1583, + "step": 17827 + }, + { + "epoch": 0.638458645943381, + "grad_norm": 1.5716185569763184, + "learning_rate": 6.106958990040033e-05, + "loss": 1.1525, + "step": 17828 + }, + { + "epoch": 0.6384944580729494, + "grad_norm": 1.2657278776168823, + "learning_rate": 6.105890620377897e-05, + "loss": 1.1494, + "step": 17829 + }, + { + "epoch": 0.6385302702025176, + "grad_norm": 1.8653218746185303, + "learning_rate": 6.104822303105974e-05, + "loss": 1.1697, + "step": 17830 + }, + { + "epoch": 0.6385660823320859, + "grad_norm": 1.4122967720031738, + "learning_rate": 6.103754038238648e-05, + "loss": 1.0381, + "step": 17831 + }, + { + "epoch": 0.6386018944616542, + "grad_norm": 2.0000977516174316, + "learning_rate": 6.102685825790282e-05, + "loss": 1.0401, + "step": 17832 + }, + { + "epoch": 0.6386377065912224, + "grad_norm": 1.8011425733566284, + "learning_rate": 6.1016176657752534e-05, + "loss": 1.176, + "step": 17833 + }, + { + "epoch": 0.6386735187207907, + "grad_norm": 1.946900486946106, + "learning_rate": 6.100549558207931e-05, + "loss": 1.0791, + "step": 17834 + }, + { + "epoch": 0.638709330850359, + "grad_norm": 1.7054377794265747, + "learning_rate": 6.099481503102682e-05, + "loss": 1.1263, + "step": 17835 + }, + { + "epoch": 0.6387451429799273, + "grad_norm": 2.2092440128326416, + "learning_rate": 6.0984135004738784e-05, + "loss": 1.3473, + "step": 17836 + }, + { + "epoch": 0.6387809551094956, + "grad_norm": 1.4350874423980713, + "learning_rate": 6.097345550335889e-05, + "loss": 0.9738, + "step": 17837 + }, + { + "epoch": 0.6388167672390639, + "grad_norm": 1.235260248184204, + "learning_rate": 6.096277652703082e-05, + "loss": 1.2387, + "step": 17838 + }, + { + "epoch": 0.6388525793686322, + "grad_norm": 1.7148393392562866, + "learning_rate": 6.0952098075898214e-05, + "loss": 1.1809, + "step": 17839 + }, + { + "epoch": 0.6388883914982004, + "grad_norm": 1.2960397005081177, + "learning_rate": 6.0941420150104776e-05, + "loss": 1.0234, + "step": 17840 + }, + { + "epoch": 0.6389242036277687, + "grad_norm": 1.6526730060577393, + "learning_rate": 6.0930742749794145e-05, + "loss": 1.0298, + "step": 17841 + }, + { + "epoch": 0.638960015757337, + "grad_norm": 1.289764642715454, + "learning_rate": 6.0920065875109986e-05, + "loss": 1.0893, + "step": 17842 + }, + { + "epoch": 0.6389958278869053, + "grad_norm": 2.346815824508667, + "learning_rate": 6.0909389526195935e-05, + "loss": 0.9881, + "step": 17843 + }, + { + "epoch": 0.6390316400164736, + "grad_norm": 1.4239896535873413, + "learning_rate": 6.0898713703195595e-05, + "loss": 1.154, + "step": 17844 + }, + { + "epoch": 0.6390674521460419, + "grad_norm": 2.337937831878662, + "learning_rate": 6.0888038406252656e-05, + "loss": 1.0053, + "step": 17845 + }, + { + "epoch": 0.6391032642756102, + "grad_norm": 1.6611075401306152, + "learning_rate": 6.087736363551069e-05, + "loss": 1.1057, + "step": 17846 + }, + { + "epoch": 0.6391390764051784, + "grad_norm": 1.8550505638122559, + "learning_rate": 6.086668939111333e-05, + "loss": 1.1355, + "step": 17847 + }, + { + "epoch": 0.6391748885347467, + "grad_norm": 1.6447622776031494, + "learning_rate": 6.085601567320418e-05, + "loss": 1.0979, + "step": 17848 + }, + { + "epoch": 0.639210700664315, + "grad_norm": 1.2703522443771362, + "learning_rate": 6.084534248192688e-05, + "loss": 1.0078, + "step": 17849 + }, + { + "epoch": 0.6392465127938833, + "grad_norm": 1.5002200603485107, + "learning_rate": 6.083466981742496e-05, + "loss": 1.1704, + "step": 17850 + }, + { + "epoch": 0.6392823249234516, + "grad_norm": 1.4768236875534058, + "learning_rate": 6.082399767984206e-05, + "loss": 1.1362, + "step": 17851 + }, + { + "epoch": 0.6393181370530199, + "grad_norm": 1.8254790306091309, + "learning_rate": 6.081332606932173e-05, + "loss": 1.0467, + "step": 17852 + }, + { + "epoch": 0.6393539491825881, + "grad_norm": 1.415112018585205, + "learning_rate": 6.0802654986007534e-05, + "loss": 1.1231, + "step": 17853 + }, + { + "epoch": 0.6393897613121564, + "grad_norm": 1.5703070163726807, + "learning_rate": 6.079198443004308e-05, + "loss": 1.1951, + "step": 17854 + }, + { + "epoch": 0.6394255734417247, + "grad_norm": 1.324666142463684, + "learning_rate": 6.0781314401571875e-05, + "loss": 1.0823, + "step": 17855 + }, + { + "epoch": 0.639461385571293, + "grad_norm": 1.8740752935409546, + "learning_rate": 6.077064490073752e-05, + "loss": 1.1481, + "step": 17856 + }, + { + "epoch": 0.6394971977008613, + "grad_norm": 1.554121732711792, + "learning_rate": 6.075997592768352e-05, + "loss": 1.3236, + "step": 17857 + }, + { + "epoch": 0.6395330098304296, + "grad_norm": 1.400647759437561, + "learning_rate": 6.074930748255343e-05, + "loss": 1.0281, + "step": 17858 + }, + { + "epoch": 0.6395688219599979, + "grad_norm": 1.6525428295135498, + "learning_rate": 6.073863956549077e-05, + "loss": 1.1476, + "step": 17859 + }, + { + "epoch": 0.6396046340895661, + "grad_norm": 1.8111622333526611, + "learning_rate": 6.07279721766391e-05, + "loss": 1.1792, + "step": 17860 + }, + { + "epoch": 0.6396404462191344, + "grad_norm": 1.6366755962371826, + "learning_rate": 6.071730531614189e-05, + "loss": 1.0942, + "step": 17861 + }, + { + "epoch": 0.6396762583487027, + "grad_norm": 1.3475604057312012, + "learning_rate": 6.070663898414266e-05, + "loss": 1.0301, + "step": 17862 + }, + { + "epoch": 0.6397120704782709, + "grad_norm": 1.749776840209961, + "learning_rate": 6.069597318078493e-05, + "loss": 1.0892, + "step": 17863 + }, + { + "epoch": 0.6397478826078393, + "grad_norm": 1.4881023168563843, + "learning_rate": 6.0685307906212163e-05, + "loss": 0.9289, + "step": 17864 + }, + { + "epoch": 0.6397836947374076, + "grad_norm": 1.4482237100601196, + "learning_rate": 6.067464316056789e-05, + "loss": 0.9946, + "step": 17865 + }, + { + "epoch": 0.6398195068669759, + "grad_norm": 1.4271551370620728, + "learning_rate": 6.066397894399553e-05, + "loss": 1.1399, + "step": 17866 + }, + { + "epoch": 0.6398553189965441, + "grad_norm": 1.4578005075454712, + "learning_rate": 6.065331525663864e-05, + "loss": 1.3153, + "step": 17867 + }, + { + "epoch": 0.6398911311261124, + "grad_norm": 1.383047103881836, + "learning_rate": 6.064265209864061e-05, + "loss": 1.0033, + "step": 17868 + }, + { + "epoch": 0.6399269432556807, + "grad_norm": 1.6535923480987549, + "learning_rate": 6.063198947014495e-05, + "loss": 1.4395, + "step": 17869 + }, + { + "epoch": 0.6399627553852489, + "grad_norm": 1.316644310951233, + "learning_rate": 6.06213273712951e-05, + "loss": 0.8568, + "step": 17870 + }, + { + "epoch": 0.6399985675148173, + "grad_norm": 1.423403263092041, + "learning_rate": 6.061066580223445e-05, + "loss": 1.0857, + "step": 17871 + }, + { + "epoch": 0.6400343796443856, + "grad_norm": 1.6615808010101318, + "learning_rate": 6.0600004763106524e-05, + "loss": 0.9913, + "step": 17872 + }, + { + "epoch": 0.6400701917739539, + "grad_norm": 1.5388213396072388, + "learning_rate": 6.058934425405467e-05, + "loss": 1.2217, + "step": 17873 + }, + { + "epoch": 0.6401060039035221, + "grad_norm": 1.6015368700027466, + "learning_rate": 6.0578684275222376e-05, + "loss": 1.0051, + "step": 17874 + }, + { + "epoch": 0.6401418160330904, + "grad_norm": 1.3236775398254395, + "learning_rate": 6.056802482675303e-05, + "loss": 1.1092, + "step": 17875 + }, + { + "epoch": 0.6401776281626587, + "grad_norm": 1.62136709690094, + "learning_rate": 6.055736590879007e-05, + "loss": 1.3149, + "step": 17876 + }, + { + "epoch": 0.6402134402922269, + "grad_norm": 1.514784336090088, + "learning_rate": 6.0546707521476844e-05, + "loss": 1.2039, + "step": 17877 + }, + { + "epoch": 0.6402492524217953, + "grad_norm": 1.421582579612732, + "learning_rate": 6.0536049664956797e-05, + "loss": 1.081, + "step": 17878 + }, + { + "epoch": 0.6402850645513636, + "grad_norm": 1.722392201423645, + "learning_rate": 6.052539233937331e-05, + "loss": 0.9792, + "step": 17879 + }, + { + "epoch": 0.6403208766809318, + "grad_norm": 1.9384857416152954, + "learning_rate": 6.0514735544869706e-05, + "loss": 1.2324, + "step": 17880 + }, + { + "epoch": 0.6403566888105001, + "grad_norm": 1.6945345401763916, + "learning_rate": 6.0504079281589454e-05, + "loss": 1.1136, + "step": 17881 + }, + { + "epoch": 0.6403925009400684, + "grad_norm": 1.141910433769226, + "learning_rate": 6.049342354967581e-05, + "loss": 1.1806, + "step": 17882 + }, + { + "epoch": 0.6404283130696367, + "grad_norm": 1.621665358543396, + "learning_rate": 6.0482768349272256e-05, + "loss": 0.99, + "step": 17883 + }, + { + "epoch": 0.6404641251992049, + "grad_norm": 1.5541270971298218, + "learning_rate": 6.047211368052201e-05, + "loss": 1.1989, + "step": 17884 + }, + { + "epoch": 0.6404999373287733, + "grad_norm": 1.5709550380706787, + "learning_rate": 6.0461459543568566e-05, + "loss": 1.1713, + "step": 17885 + }, + { + "epoch": 0.6405357494583416, + "grad_norm": 2.917595386505127, + "learning_rate": 6.04508059385551e-05, + "loss": 1.1577, + "step": 17886 + }, + { + "epoch": 0.6405715615879098, + "grad_norm": 1.242363691329956, + "learning_rate": 6.0440152865625076e-05, + "loss": 1.1524, + "step": 17887 + }, + { + "epoch": 0.6406073737174781, + "grad_norm": 1.949920892715454, + "learning_rate": 6.042950032492179e-05, + "loss": 0.9983, + "step": 17888 + }, + { + "epoch": 0.6406431858470464, + "grad_norm": 1.473326563835144, + "learning_rate": 6.041884831658848e-05, + "loss": 1.0767, + "step": 17889 + }, + { + "epoch": 0.6406789979766147, + "grad_norm": 1.4960598945617676, + "learning_rate": 6.040819684076856e-05, + "loss": 1.3445, + "step": 17890 + }, + { + "epoch": 0.6407148101061829, + "grad_norm": 1.4897023439407349, + "learning_rate": 6.039754589760522e-05, + "loss": 1.0897, + "step": 17891 + }, + { + "epoch": 0.6407506222357513, + "grad_norm": 1.380122184753418, + "learning_rate": 6.038689548724189e-05, + "loss": 1.0017, + "step": 17892 + }, + { + "epoch": 0.6407864343653196, + "grad_norm": 1.7141797542572021, + "learning_rate": 6.037624560982171e-05, + "loss": 1.0549, + "step": 17893 + }, + { + "epoch": 0.6408222464948878, + "grad_norm": 1.6468604803085327, + "learning_rate": 6.03655962654881e-05, + "loss": 1.2286, + "step": 17894 + }, + { + "epoch": 0.6408580586244561, + "grad_norm": 1.4656063318252563, + "learning_rate": 6.035494745438421e-05, + "loss": 1.1337, + "step": 17895 + }, + { + "epoch": 0.6408938707540244, + "grad_norm": 1.6563538312911987, + "learning_rate": 6.034429917665342e-05, + "loss": 1.3094, + "step": 17896 + }, + { + "epoch": 0.6409296828835926, + "grad_norm": 1.6408495903015137, + "learning_rate": 6.033365143243891e-05, + "loss": 1.2932, + "step": 17897 + }, + { + "epoch": 0.6409654950131609, + "grad_norm": 1.4344727993011475, + "learning_rate": 6.0323004221883936e-05, + "loss": 0.9005, + "step": 17898 + }, + { + "epoch": 0.6410013071427293, + "grad_norm": 1.8076906204223633, + "learning_rate": 6.031235754513178e-05, + "loss": 1.117, + "step": 17899 + }, + { + "epoch": 0.6410371192722976, + "grad_norm": 1.3575937747955322, + "learning_rate": 6.030171140232562e-05, + "loss": 1.1507, + "step": 17900 + }, + { + "epoch": 0.6410729314018658, + "grad_norm": 1.5005685091018677, + "learning_rate": 6.029106579360879e-05, + "loss": 0.9134, + "step": 17901 + }, + { + "epoch": 0.6411087435314341, + "grad_norm": 2.289950370788574, + "learning_rate": 6.028042071912439e-05, + "loss": 1.2689, + "step": 17902 + }, + { + "epoch": 0.6411445556610024, + "grad_norm": 2.0998892784118652, + "learning_rate": 6.026977617901575e-05, + "loss": 1.1946, + "step": 17903 + }, + { + "epoch": 0.6411803677905706, + "grad_norm": 1.6019940376281738, + "learning_rate": 6.0259132173426006e-05, + "loss": 1.1531, + "step": 17904 + }, + { + "epoch": 0.6412161799201389, + "grad_norm": 2.6947035789489746, + "learning_rate": 6.0248488702498353e-05, + "loss": 1.1059, + "step": 17905 + }, + { + "epoch": 0.6412519920497073, + "grad_norm": 1.8094669580459595, + "learning_rate": 6.0237845766376035e-05, + "loss": 1.2298, + "step": 17906 + }, + { + "epoch": 0.6412878041792756, + "grad_norm": 1.3145873546600342, + "learning_rate": 6.022720336520218e-05, + "loss": 1.2527, + "step": 17907 + }, + { + "epoch": 0.6413236163088438, + "grad_norm": 1.6735002994537354, + "learning_rate": 6.021656149912003e-05, + "loss": 1.3314, + "step": 17908 + }, + { + "epoch": 0.6413594284384121, + "grad_norm": 1.5391485691070557, + "learning_rate": 6.020592016827271e-05, + "loss": 1.0677, + "step": 17909 + }, + { + "epoch": 0.6413952405679804, + "grad_norm": 1.4945452213287354, + "learning_rate": 6.019527937280342e-05, + "loss": 1.0157, + "step": 17910 + }, + { + "epoch": 0.6414310526975486, + "grad_norm": 2.049710988998413, + "learning_rate": 6.018463911285528e-05, + "loss": 1.0478, + "step": 17911 + }, + { + "epoch": 0.6414668648271169, + "grad_norm": 2.7168591022491455, + "learning_rate": 6.0173999388571486e-05, + "loss": 1.1546, + "step": 17912 + }, + { + "epoch": 0.6415026769566853, + "grad_norm": 1.3811497688293457, + "learning_rate": 6.0163360200095153e-05, + "loss": 1.0646, + "step": 17913 + }, + { + "epoch": 0.6415384890862535, + "grad_norm": 1.2397733926773071, + "learning_rate": 6.015272154756941e-05, + "loss": 1.1553, + "step": 17914 + }, + { + "epoch": 0.6415743012158218, + "grad_norm": 1.3227331638336182, + "learning_rate": 6.014208343113741e-05, + "loss": 1.0398, + "step": 17915 + }, + { + "epoch": 0.6416101133453901, + "grad_norm": 1.6126662492752075, + "learning_rate": 6.0131445850942256e-05, + "loss": 1.0978, + "step": 17916 + }, + { + "epoch": 0.6416459254749584, + "grad_norm": 2.190612554550171, + "learning_rate": 6.012080880712708e-05, + "loss": 0.9113, + "step": 17917 + }, + { + "epoch": 0.6416817376045266, + "grad_norm": 1.31301748752594, + "learning_rate": 6.011017229983497e-05, + "loss": 1.019, + "step": 17918 + }, + { + "epoch": 0.6417175497340949, + "grad_norm": 1.6182866096496582, + "learning_rate": 6.0099536329209046e-05, + "loss": 1.1891, + "step": 17919 + }, + { + "epoch": 0.6417533618636633, + "grad_norm": 1.2367134094238281, + "learning_rate": 6.008890089539239e-05, + "loss": 1.0434, + "step": 17920 + }, + { + "epoch": 0.6417891739932315, + "grad_norm": 1.4034374952316284, + "learning_rate": 6.0078265998528105e-05, + "loss": 0.969, + "step": 17921 + }, + { + "epoch": 0.6418249861227998, + "grad_norm": 1.2852174043655396, + "learning_rate": 6.006763163875925e-05, + "loss": 0.9583, + "step": 17922 + }, + { + "epoch": 0.6418607982523681, + "grad_norm": 2.008580446243286, + "learning_rate": 6.005699781622889e-05, + "loss": 1.1447, + "step": 17923 + }, + { + "epoch": 0.6418966103819364, + "grad_norm": 1.171587586402893, + "learning_rate": 6.00463645310801e-05, + "loss": 1.014, + "step": 17924 + }, + { + "epoch": 0.6419324225115046, + "grad_norm": 1.3918204307556152, + "learning_rate": 6.003573178345594e-05, + "loss": 1.0845, + "step": 17925 + }, + { + "epoch": 0.6419682346410729, + "grad_norm": 1.4979771375656128, + "learning_rate": 6.002509957349948e-05, + "loss": 1.0835, + "step": 17926 + }, + { + "epoch": 0.6420040467706413, + "grad_norm": 1.5120536088943481, + "learning_rate": 6.001446790135371e-05, + "loss": 1.0004, + "step": 17927 + }, + { + "epoch": 0.6420398589002095, + "grad_norm": 1.539136528968811, + "learning_rate": 6.0003836767161726e-05, + "loss": 1.0002, + "step": 17928 + }, + { + "epoch": 0.6420756710297778, + "grad_norm": 1.7289297580718994, + "learning_rate": 5.999320617106649e-05, + "loss": 1.2738, + "step": 17929 + }, + { + "epoch": 0.6421114831593461, + "grad_norm": 1.383885145187378, + "learning_rate": 5.9982576113211095e-05, + "loss": 0.9821, + "step": 17930 + }, + { + "epoch": 0.6421472952889143, + "grad_norm": 1.5390619039535522, + "learning_rate": 5.9971946593738525e-05, + "loss": 1.0454, + "step": 17931 + }, + { + "epoch": 0.6421831074184826, + "grad_norm": 1.453811764717102, + "learning_rate": 5.996131761279176e-05, + "loss": 1.2347, + "step": 17932 + }, + { + "epoch": 0.6422189195480509, + "grad_norm": 1.6120399236679077, + "learning_rate": 5.995068917051383e-05, + "loss": 1.0815, + "step": 17933 + }, + { + "epoch": 0.6422547316776193, + "grad_norm": 1.420595407485962, + "learning_rate": 5.9940061267047695e-05, + "loss": 1.146, + "step": 17934 + }, + { + "epoch": 0.6422905438071875, + "grad_norm": 1.9402658939361572, + "learning_rate": 5.992943390253639e-05, + "loss": 1.2697, + "step": 17935 + }, + { + "epoch": 0.6423263559367558, + "grad_norm": 1.215364933013916, + "learning_rate": 5.991880707712284e-05, + "loss": 1.0883, + "step": 17936 + }, + { + "epoch": 0.6423621680663241, + "grad_norm": 1.5722124576568604, + "learning_rate": 5.9908180790950064e-05, + "loss": 1.0389, + "step": 17937 + }, + { + "epoch": 0.6423979801958923, + "grad_norm": 1.4650877714157104, + "learning_rate": 5.989755504416098e-05, + "loss": 1.1291, + "step": 17938 + }, + { + "epoch": 0.6424337923254606, + "grad_norm": 1.6007808446884155, + "learning_rate": 5.988692983689859e-05, + "loss": 1.2965, + "step": 17939 + }, + { + "epoch": 0.6424696044550289, + "grad_norm": 1.307968258857727, + "learning_rate": 5.98763051693058e-05, + "loss": 0.981, + "step": 17940 + }, + { + "epoch": 0.6425054165845973, + "grad_norm": 1.7561975717544556, + "learning_rate": 5.9865681041525566e-05, + "loss": 1.2843, + "step": 17941 + }, + { + "epoch": 0.6425412287141655, + "grad_norm": 1.5296156406402588, + "learning_rate": 5.9855057453700836e-05, + "loss": 1.2587, + "step": 17942 + }, + { + "epoch": 0.6425770408437338, + "grad_norm": 1.9537322521209717, + "learning_rate": 5.98444344059745e-05, + "loss": 1.0661, + "step": 17943 + }, + { + "epoch": 0.6426128529733021, + "grad_norm": 1.4717364311218262, + "learning_rate": 5.9833811898489534e-05, + "loss": 1.098, + "step": 17944 + }, + { + "epoch": 0.6426486651028703, + "grad_norm": 1.6198629140853882, + "learning_rate": 5.982318993138879e-05, + "loss": 1.2074, + "step": 17945 + }, + { + "epoch": 0.6426844772324386, + "grad_norm": 1.3999427556991577, + "learning_rate": 5.981256850481523e-05, + "loss": 1.0618, + "step": 17946 + }, + { + "epoch": 0.6427202893620069, + "grad_norm": 1.5737831592559814, + "learning_rate": 5.980194761891169e-05, + "loss": 1.1914, + "step": 17947 + }, + { + "epoch": 0.6427561014915752, + "grad_norm": 1.4247572422027588, + "learning_rate": 5.9791327273821105e-05, + "loss": 1.1196, + "step": 17948 + }, + { + "epoch": 0.6427919136211435, + "grad_norm": 1.6196041107177734, + "learning_rate": 5.978070746968637e-05, + "loss": 1.0719, + "step": 17949 + }, + { + "epoch": 0.6428277257507118, + "grad_norm": 1.6522960662841797, + "learning_rate": 5.977008820665031e-05, + "loss": 1.3014, + "step": 17950 + }, + { + "epoch": 0.6428635378802801, + "grad_norm": 1.6688178777694702, + "learning_rate": 5.975946948485583e-05, + "loss": 1.2006, + "step": 17951 + }, + { + "epoch": 0.6428993500098483, + "grad_norm": 2.34222674369812, + "learning_rate": 5.974885130444577e-05, + "loss": 1.2018, + "step": 17952 + }, + { + "epoch": 0.6429351621394166, + "grad_norm": 1.2980868816375732, + "learning_rate": 5.9738233665563017e-05, + "loss": 0.9844, + "step": 17953 + }, + { + "epoch": 0.6429709742689849, + "grad_norm": 1.3329238891601562, + "learning_rate": 5.972761656835038e-05, + "loss": 0.939, + "step": 17954 + }, + { + "epoch": 0.6430067863985532, + "grad_norm": 1.481392502784729, + "learning_rate": 5.971700001295072e-05, + "loss": 1.0998, + "step": 17955 + }, + { + "epoch": 0.6430425985281215, + "grad_norm": 1.5144456624984741, + "learning_rate": 5.9706383999506855e-05, + "loss": 1.0723, + "step": 17956 + }, + { + "epoch": 0.6430784106576898, + "grad_norm": 1.591515064239502, + "learning_rate": 5.969576852816163e-05, + "loss": 1.1456, + "step": 17957 + }, + { + "epoch": 0.643114222787258, + "grad_norm": 1.7955058813095093, + "learning_rate": 5.968515359905785e-05, + "loss": 1.0429, + "step": 17958 + }, + { + "epoch": 0.6431500349168263, + "grad_norm": 2.487260103225708, + "learning_rate": 5.967453921233832e-05, + "loss": 1.2473, + "step": 17959 + }, + { + "epoch": 0.6431858470463946, + "grad_norm": 1.938642144203186, + "learning_rate": 5.966392536814585e-05, + "loss": 1.0436, + "step": 17960 + }, + { + "epoch": 0.6432216591759629, + "grad_norm": 1.946384072303772, + "learning_rate": 5.9653312066623234e-05, + "loss": 1.3621, + "step": 17961 + }, + { + "epoch": 0.6432574713055312, + "grad_norm": 1.5162551403045654, + "learning_rate": 5.964269930791326e-05, + "loss": 1.0666, + "step": 17962 + }, + { + "epoch": 0.6432932834350995, + "grad_norm": 1.4482696056365967, + "learning_rate": 5.963208709215871e-05, + "loss": 1.0373, + "step": 17963 + }, + { + "epoch": 0.6433290955646678, + "grad_norm": 1.5157848596572876, + "learning_rate": 5.962147541950236e-05, + "loss": 1.132, + "step": 17964 + }, + { + "epoch": 0.643364907694236, + "grad_norm": 1.6332885026931763, + "learning_rate": 5.961086429008696e-05, + "loss": 0.984, + "step": 17965 + }, + { + "epoch": 0.6434007198238043, + "grad_norm": 1.5445870161056519, + "learning_rate": 5.960025370405531e-05, + "loss": 1.0278, + "step": 17966 + }, + { + "epoch": 0.6434365319533726, + "grad_norm": 1.3605259656906128, + "learning_rate": 5.958964366155014e-05, + "loss": 1.0272, + "step": 17967 + }, + { + "epoch": 0.6434723440829409, + "grad_norm": 1.628433346748352, + "learning_rate": 5.957903416271414e-05, + "loss": 1.22, + "step": 17968 + }, + { + "epoch": 0.6435081562125092, + "grad_norm": 1.6366662979125977, + "learning_rate": 5.9568425207690146e-05, + "loss": 1.1, + "step": 17969 + }, + { + "epoch": 0.6435439683420775, + "grad_norm": 1.5633574724197388, + "learning_rate": 5.9557816796620804e-05, + "loss": 1.26, + "step": 17970 + }, + { + "epoch": 0.6435797804716458, + "grad_norm": 1.9171397686004639, + "learning_rate": 5.954720892964889e-05, + "loss": 1.0558, + "step": 17971 + }, + { + "epoch": 0.643615592601214, + "grad_norm": 1.4796490669250488, + "learning_rate": 5.9536601606917075e-05, + "loss": 1.0884, + "step": 17972 + }, + { + "epoch": 0.6436514047307823, + "grad_norm": 1.6093597412109375, + "learning_rate": 5.952599482856811e-05, + "loss": 1.2254, + "step": 17973 + }, + { + "epoch": 0.6436872168603506, + "grad_norm": 1.5068507194519043, + "learning_rate": 5.951538859474467e-05, + "loss": 1.1639, + "step": 17974 + }, + { + "epoch": 0.6437230289899188, + "grad_norm": 1.5001311302185059, + "learning_rate": 5.950478290558947e-05, + "loss": 1.1663, + "step": 17975 + }, + { + "epoch": 0.6437588411194872, + "grad_norm": 1.7065972089767456, + "learning_rate": 5.9494177761245194e-05, + "loss": 1.1245, + "step": 17976 + }, + { + "epoch": 0.6437946532490555, + "grad_norm": 1.45173180103302, + "learning_rate": 5.9483573161854464e-05, + "loss": 1.1795, + "step": 17977 + }, + { + "epoch": 0.6438304653786238, + "grad_norm": 1.3293328285217285, + "learning_rate": 5.947296910756004e-05, + "loss": 0.9546, + "step": 17978 + }, + { + "epoch": 0.643866277508192, + "grad_norm": 1.4761844873428345, + "learning_rate": 5.946236559850449e-05, + "loss": 1.0747, + "step": 17979 + }, + { + "epoch": 0.6439020896377603, + "grad_norm": 1.569812297821045, + "learning_rate": 5.945176263483057e-05, + "loss": 1.2538, + "step": 17980 + }, + { + "epoch": 0.6439379017673286, + "grad_norm": 1.5008883476257324, + "learning_rate": 5.9441160216680826e-05, + "loss": 1.2917, + "step": 17981 + }, + { + "epoch": 0.6439737138968968, + "grad_norm": 1.635762333869934, + "learning_rate": 5.9430558344198016e-05, + "loss": 1.1554, + "step": 17982 + }, + { + "epoch": 0.6440095260264652, + "grad_norm": 1.3335555791854858, + "learning_rate": 5.941995701752465e-05, + "loss": 1.0875, + "step": 17983 + }, + { + "epoch": 0.6440453381560335, + "grad_norm": 1.6645458936691284, + "learning_rate": 5.9409356236803456e-05, + "loss": 1.2426, + "step": 17984 + }, + { + "epoch": 0.6440811502856018, + "grad_norm": 2.047480583190918, + "learning_rate": 5.9398756002177035e-05, + "loss": 1.1057, + "step": 17985 + }, + { + "epoch": 0.64411696241517, + "grad_norm": 1.4342308044433594, + "learning_rate": 5.938815631378794e-05, + "loss": 1.0609, + "step": 17986 + }, + { + "epoch": 0.6441527745447383, + "grad_norm": 1.5981781482696533, + "learning_rate": 5.937755717177885e-05, + "loss": 1.0315, + "step": 17987 + }, + { + "epoch": 0.6441885866743066, + "grad_norm": 1.661577820777893, + "learning_rate": 5.9366958576292284e-05, + "loss": 1.1783, + "step": 17988 + }, + { + "epoch": 0.6442243988038748, + "grad_norm": 1.544746994972229, + "learning_rate": 5.9356360527470934e-05, + "loss": 1.0649, + "step": 17989 + }, + { + "epoch": 0.6442602109334432, + "grad_norm": 1.5781947374343872, + "learning_rate": 5.9345763025457266e-05, + "loss": 1.2522, + "step": 17990 + }, + { + "epoch": 0.6442960230630115, + "grad_norm": 1.2638951539993286, + "learning_rate": 5.9335166070393975e-05, + "loss": 1.1457, + "step": 17991 + }, + { + "epoch": 0.6443318351925797, + "grad_norm": 1.573810338973999, + "learning_rate": 5.93245696624235e-05, + "loss": 1.179, + "step": 17992 + }, + { + "epoch": 0.644367647322148, + "grad_norm": 1.2695993185043335, + "learning_rate": 5.931397380168855e-05, + "loss": 0.9941, + "step": 17993 + }, + { + "epoch": 0.6444034594517163, + "grad_norm": 2.2069716453552246, + "learning_rate": 5.9303378488331576e-05, + "loss": 1.2644, + "step": 17994 + }, + { + "epoch": 0.6444392715812846, + "grad_norm": 1.6211819648742676, + "learning_rate": 5.9292783722495126e-05, + "loss": 1.0678, + "step": 17995 + }, + { + "epoch": 0.6444750837108528, + "grad_norm": 1.396362543106079, + "learning_rate": 5.928218950432179e-05, + "loss": 0.9382, + "step": 17996 + }, + { + "epoch": 0.6445108958404212, + "grad_norm": 1.3180031776428223, + "learning_rate": 5.927159583395403e-05, + "loss": 1.0089, + "step": 17997 + }, + { + "epoch": 0.6445467079699895, + "grad_norm": 1.4755123853683472, + "learning_rate": 5.926100271153446e-05, + "loss": 1.0795, + "step": 17998 + }, + { + "epoch": 0.6445825200995577, + "grad_norm": 1.3932567834854126, + "learning_rate": 5.9250410137205506e-05, + "loss": 1.0337, + "step": 17999 + }, + { + "epoch": 0.644618332229126, + "grad_norm": 1.5334804058074951, + "learning_rate": 5.923981811110977e-05, + "loss": 1.2845, + "step": 18000 + }, + { + "epoch": 0.6446541443586943, + "grad_norm": 1.7325814962387085, + "learning_rate": 5.922922663338969e-05, + "loss": 1.1999, + "step": 18001 + }, + { + "epoch": 0.6446899564882625, + "grad_norm": 1.3228874206542969, + "learning_rate": 5.921863570418775e-05, + "loss": 1.1414, + "step": 18002 + }, + { + "epoch": 0.6447257686178308, + "grad_norm": 1.2044744491577148, + "learning_rate": 5.9208045323646474e-05, + "loss": 1.0246, + "step": 18003 + }, + { + "epoch": 0.6447615807473992, + "grad_norm": 1.8773850202560425, + "learning_rate": 5.919745549190834e-05, + "loss": 1.0858, + "step": 18004 + }, + { + "epoch": 0.6447973928769675, + "grad_norm": 1.3871314525604248, + "learning_rate": 5.91868662091158e-05, + "loss": 1.1082, + "step": 18005 + }, + { + "epoch": 0.6448332050065357, + "grad_norm": 1.3976218700408936, + "learning_rate": 5.9176277475411324e-05, + "loss": 1.0179, + "step": 18006 + }, + { + "epoch": 0.644869017136104, + "grad_norm": 1.6874144077301025, + "learning_rate": 5.91656892909374e-05, + "loss": 0.9615, + "step": 18007 + }, + { + "epoch": 0.6449048292656723, + "grad_norm": 1.3603365421295166, + "learning_rate": 5.915510165583642e-05, + "loss": 0.9883, + "step": 18008 + }, + { + "epoch": 0.6449406413952405, + "grad_norm": 1.3469597101211548, + "learning_rate": 5.91445145702509e-05, + "loss": 1.004, + "step": 18009 + }, + { + "epoch": 0.6449764535248088, + "grad_norm": 1.4085674285888672, + "learning_rate": 5.9133928034323215e-05, + "loss": 1.2003, + "step": 18010 + }, + { + "epoch": 0.6450122656543772, + "grad_norm": 1.8905491828918457, + "learning_rate": 5.912334204819581e-05, + "loss": 1.2429, + "step": 18011 + }, + { + "epoch": 0.6450480777839455, + "grad_norm": 1.5442070960998535, + "learning_rate": 5.911275661201112e-05, + "loss": 1.1735, + "step": 18012 + }, + { + "epoch": 0.6450838899135137, + "grad_norm": 1.5276068449020386, + "learning_rate": 5.910217172591155e-05, + "loss": 1.1121, + "step": 18013 + }, + { + "epoch": 0.645119702043082, + "grad_norm": 1.5478541851043701, + "learning_rate": 5.90915873900395e-05, + "loss": 0.937, + "step": 18014 + }, + { + "epoch": 0.6451555141726503, + "grad_norm": 1.6498082876205444, + "learning_rate": 5.908100360453737e-05, + "loss": 1.2166, + "step": 18015 + }, + { + "epoch": 0.6451913263022185, + "grad_norm": 1.2209423780441284, + "learning_rate": 5.9070420369547564e-05, + "loss": 0.9801, + "step": 18016 + }, + { + "epoch": 0.6452271384317868, + "grad_norm": 1.6045557260513306, + "learning_rate": 5.905983768521244e-05, + "loss": 0.9999, + "step": 18017 + }, + { + "epoch": 0.6452629505613552, + "grad_norm": 1.2900501489639282, + "learning_rate": 5.904925555167442e-05, + "loss": 0.9344, + "step": 18018 + }, + { + "epoch": 0.6452987626909235, + "grad_norm": 2.214294195175171, + "learning_rate": 5.903867396907583e-05, + "loss": 0.8918, + "step": 18019 + }, + { + "epoch": 0.6453345748204917, + "grad_norm": 1.4780917167663574, + "learning_rate": 5.9028092937559034e-05, + "loss": 1.1964, + "step": 18020 + }, + { + "epoch": 0.64537038695006, + "grad_norm": 1.559254765510559, + "learning_rate": 5.901751245726641e-05, + "loss": 1.2692, + "step": 18021 + }, + { + "epoch": 0.6454061990796283, + "grad_norm": 1.5446901321411133, + "learning_rate": 5.9006932528340284e-05, + "loss": 1.124, + "step": 18022 + }, + { + "epoch": 0.6454420112091965, + "grad_norm": 2.288935899734497, + "learning_rate": 5.899635315092301e-05, + "loss": 1.2055, + "step": 18023 + }, + { + "epoch": 0.6454778233387648, + "grad_norm": 1.7113425731658936, + "learning_rate": 5.89857743251569e-05, + "loss": 1.0306, + "step": 18024 + }, + { + "epoch": 0.6455136354683331, + "grad_norm": 1.4857239723205566, + "learning_rate": 5.897519605118431e-05, + "loss": 1.0235, + "step": 18025 + }, + { + "epoch": 0.6455494475979014, + "grad_norm": 1.7002958059310913, + "learning_rate": 5.896461832914753e-05, + "loss": 1.2004, + "step": 18026 + }, + { + "epoch": 0.6455852597274697, + "grad_norm": 1.4802786111831665, + "learning_rate": 5.8954041159188876e-05, + "loss": 1.3069, + "step": 18027 + }, + { + "epoch": 0.645621071857038, + "grad_norm": 1.5230262279510498, + "learning_rate": 5.894346454145068e-05, + "loss": 1.0496, + "step": 18028 + }, + { + "epoch": 0.6456568839866063, + "grad_norm": 1.4920628070831299, + "learning_rate": 5.8932888476075166e-05, + "loss": 1.1612, + "step": 18029 + }, + { + "epoch": 0.6456926961161745, + "grad_norm": 1.6635183095932007, + "learning_rate": 5.89223129632047e-05, + "loss": 1.0406, + "step": 18030 + }, + { + "epoch": 0.6457285082457428, + "grad_norm": 1.6536755561828613, + "learning_rate": 5.8911738002981506e-05, + "loss": 1.1211, + "step": 18031 + }, + { + "epoch": 0.6457643203753111, + "grad_norm": 1.520553469657898, + "learning_rate": 5.890116359554789e-05, + "loss": 1.049, + "step": 18032 + }, + { + "epoch": 0.6458001325048794, + "grad_norm": 1.3916207551956177, + "learning_rate": 5.8890589741046084e-05, + "loss": 1.2142, + "step": 18033 + }, + { + "epoch": 0.6458359446344477, + "grad_norm": 1.508259654045105, + "learning_rate": 5.888001643961839e-05, + "loss": 1.0807, + "step": 18034 + }, + { + "epoch": 0.645871756764016, + "grad_norm": 1.71613609790802, + "learning_rate": 5.886944369140701e-05, + "loss": 1.3353, + "step": 18035 + }, + { + "epoch": 0.6459075688935842, + "grad_norm": 1.4770381450653076, + "learning_rate": 5.8858871496554235e-05, + "loss": 1.0553, + "step": 18036 + }, + { + "epoch": 0.6459433810231525, + "grad_norm": 2.1125714778900146, + "learning_rate": 5.884829985520227e-05, + "loss": 1.2427, + "step": 18037 + }, + { + "epoch": 0.6459791931527208, + "grad_norm": 1.6731557846069336, + "learning_rate": 5.883772876749334e-05, + "loss": 0.9933, + "step": 18038 + }, + { + "epoch": 0.6460150052822891, + "grad_norm": 1.464522123336792, + "learning_rate": 5.882715823356968e-05, + "loss": 1.0148, + "step": 18039 + }, + { + "epoch": 0.6460508174118574, + "grad_norm": 2.428391933441162, + "learning_rate": 5.881658825357348e-05, + "loss": 1.2799, + "step": 18040 + }, + { + "epoch": 0.6460866295414257, + "grad_norm": 1.4142602682113647, + "learning_rate": 5.8806018827646994e-05, + "loss": 1.1015, + "step": 18041 + }, + { + "epoch": 0.646122441670994, + "grad_norm": 1.8987945318222046, + "learning_rate": 5.879544995593236e-05, + "loss": 1.2465, + "step": 18042 + }, + { + "epoch": 0.6461582538005622, + "grad_norm": 1.3141913414001465, + "learning_rate": 5.878488163857181e-05, + "loss": 1.0035, + "step": 18043 + }, + { + "epoch": 0.6461940659301305, + "grad_norm": 1.5781632661819458, + "learning_rate": 5.87743138757075e-05, + "loss": 1.164, + "step": 18044 + }, + { + "epoch": 0.6462298780596988, + "grad_norm": 1.471757173538208, + "learning_rate": 5.8763746667481634e-05, + "loss": 1.0197, + "step": 18045 + }, + { + "epoch": 0.646265690189267, + "grad_norm": 1.5663573741912842, + "learning_rate": 5.8753180014036377e-05, + "loss": 1.1148, + "step": 18046 + }, + { + "epoch": 0.6463015023188354, + "grad_norm": 1.5609619617462158, + "learning_rate": 5.874261391551386e-05, + "loss": 1.1797, + "step": 18047 + }, + { + "epoch": 0.6463373144484037, + "grad_norm": 1.5940922498703003, + "learning_rate": 5.873204837205626e-05, + "loss": 1.1537, + "step": 18048 + }, + { + "epoch": 0.646373126577972, + "grad_norm": 1.546802282333374, + "learning_rate": 5.8721483383805696e-05, + "loss": 1.1103, + "step": 18049 + }, + { + "epoch": 0.6464089387075402, + "grad_norm": 1.68024742603302, + "learning_rate": 5.871091895090437e-05, + "loss": 0.9629, + "step": 18050 + }, + { + "epoch": 0.6464447508371085, + "grad_norm": 1.7670350074768066, + "learning_rate": 5.870035507349434e-05, + "loss": 1.3847, + "step": 18051 + }, + { + "epoch": 0.6464805629666768, + "grad_norm": 1.2105357646942139, + "learning_rate": 5.8689791751717757e-05, + "loss": 0.9835, + "step": 18052 + }, + { + "epoch": 0.646516375096245, + "grad_norm": 1.3851059675216675, + "learning_rate": 5.867922898571675e-05, + "loss": 1.2643, + "step": 18053 + }, + { + "epoch": 0.6465521872258134, + "grad_norm": 1.9058839082717896, + "learning_rate": 5.8668666775633426e-05, + "loss": 1.3283, + "step": 18054 + }, + { + "epoch": 0.6465879993553817, + "grad_norm": 1.8175537586212158, + "learning_rate": 5.8658105121609896e-05, + "loss": 1.1012, + "step": 18055 + }, + { + "epoch": 0.64662381148495, + "grad_norm": 1.5975227355957031, + "learning_rate": 5.864754402378818e-05, + "loss": 1.1544, + "step": 18056 + }, + { + "epoch": 0.6466596236145182, + "grad_norm": 1.3838930130004883, + "learning_rate": 5.863698348231045e-05, + "loss": 1.0935, + "step": 18057 + }, + { + "epoch": 0.6466954357440865, + "grad_norm": 1.3896175622940063, + "learning_rate": 5.862642349731874e-05, + "loss": 1.1004, + "step": 18058 + }, + { + "epoch": 0.6467312478736548, + "grad_norm": 1.4262526035308838, + "learning_rate": 5.861586406895514e-05, + "loss": 0.9632, + "step": 18059 + }, + { + "epoch": 0.646767060003223, + "grad_norm": 1.2550119161605835, + "learning_rate": 5.8605305197361705e-05, + "loss": 1.1288, + "step": 18060 + }, + { + "epoch": 0.6468028721327914, + "grad_norm": 1.3308850526809692, + "learning_rate": 5.859474688268051e-05, + "loss": 1.1746, + "step": 18061 + }, + { + "epoch": 0.6468386842623597, + "grad_norm": 1.7711286544799805, + "learning_rate": 5.8584189125053556e-05, + "loss": 0.9966, + "step": 18062 + }, + { + "epoch": 0.646874496391928, + "grad_norm": 1.3883211612701416, + "learning_rate": 5.857363192462294e-05, + "loss": 1.1582, + "step": 18063 + }, + { + "epoch": 0.6469103085214962, + "grad_norm": 1.3589227199554443, + "learning_rate": 5.8563075281530685e-05, + "loss": 0.879, + "step": 18064 + }, + { + "epoch": 0.6469461206510645, + "grad_norm": 1.5748209953308105, + "learning_rate": 5.855251919591875e-05, + "loss": 1.1609, + "step": 18065 + }, + { + "epoch": 0.6469819327806328, + "grad_norm": 1.6474965810775757, + "learning_rate": 5.8541963667929276e-05, + "loss": 1.1727, + "step": 18066 + }, + { + "epoch": 0.647017744910201, + "grad_norm": 1.4741188287734985, + "learning_rate": 5.8531408697704124e-05, + "loss": 0.9578, + "step": 18067 + }, + { + "epoch": 0.6470535570397694, + "grad_norm": 1.5819783210754395, + "learning_rate": 5.852085428538545e-05, + "loss": 1.1396, + "step": 18068 + }, + { + "epoch": 0.6470893691693377, + "grad_norm": 1.6077594757080078, + "learning_rate": 5.851030043111512e-05, + "loss": 1.2439, + "step": 18069 + }, + { + "epoch": 0.647125181298906, + "grad_norm": 1.5684207677841187, + "learning_rate": 5.849974713503521e-05, + "loss": 1.1562, + "step": 18070 + }, + { + "epoch": 0.6471609934284742, + "grad_norm": 1.4573549032211304, + "learning_rate": 5.848919439728765e-05, + "loss": 1.0016, + "step": 18071 + }, + { + "epoch": 0.6471968055580425, + "grad_norm": 1.4751511812210083, + "learning_rate": 5.847864221801446e-05, + "loss": 1.1344, + "step": 18072 + }, + { + "epoch": 0.6472326176876108, + "grad_norm": 1.4860271215438843, + "learning_rate": 5.8468090597357595e-05, + "loss": 1.191, + "step": 18073 + }, + { + "epoch": 0.647268429817179, + "grad_norm": 1.687484860420227, + "learning_rate": 5.845753953545894e-05, + "loss": 1.0689, + "step": 18074 + }, + { + "epoch": 0.6473042419467474, + "grad_norm": 1.9489156007766724, + "learning_rate": 5.8446989032460574e-05, + "loss": 1.1854, + "step": 18075 + }, + { + "epoch": 0.6473400540763157, + "grad_norm": 1.3509007692337036, + "learning_rate": 5.84364390885043e-05, + "loss": 0.8616, + "step": 18076 + }, + { + "epoch": 0.6473758662058839, + "grad_norm": 1.5171451568603516, + "learning_rate": 5.8425889703732193e-05, + "loss": 1.0884, + "step": 18077 + }, + { + "epoch": 0.6474116783354522, + "grad_norm": 1.3863856792449951, + "learning_rate": 5.841534087828604e-05, + "loss": 0.9769, + "step": 18078 + }, + { + "epoch": 0.6474474904650205, + "grad_norm": 1.6990084648132324, + "learning_rate": 5.840479261230791e-05, + "loss": 1.1546, + "step": 18079 + }, + { + "epoch": 0.6474833025945887, + "grad_norm": 1.3314852714538574, + "learning_rate": 5.839424490593957e-05, + "loss": 1.1662, + "step": 18080 + }, + { + "epoch": 0.647519114724157, + "grad_norm": 2.514158010482788, + "learning_rate": 5.8383697759323045e-05, + "loss": 0.9643, + "step": 18081 + }, + { + "epoch": 0.6475549268537254, + "grad_norm": 1.3921856880187988, + "learning_rate": 5.8373151172600207e-05, + "loss": 0.9657, + "step": 18082 + }, + { + "epoch": 0.6475907389832937, + "grad_norm": 1.329740285873413, + "learning_rate": 5.836260514591287e-05, + "loss": 1.2992, + "step": 18083 + }, + { + "epoch": 0.6476265511128619, + "grad_norm": 1.500257134437561, + "learning_rate": 5.8352059679402994e-05, + "loss": 1.1314, + "step": 18084 + }, + { + "epoch": 0.6476623632424302, + "grad_norm": 1.4567893743515015, + "learning_rate": 5.834151477321242e-05, + "loss": 1.0395, + "step": 18085 + }, + { + "epoch": 0.6476981753719985, + "grad_norm": 1.857908010482788, + "learning_rate": 5.833097042748308e-05, + "loss": 1.1224, + "step": 18086 + }, + { + "epoch": 0.6477339875015667, + "grad_norm": 1.5113495588302612, + "learning_rate": 5.832042664235673e-05, + "loss": 0.9987, + "step": 18087 + }, + { + "epoch": 0.647769799631135, + "grad_norm": 1.4113749265670776, + "learning_rate": 5.8309883417975275e-05, + "loss": 1.052, + "step": 18088 + }, + { + "epoch": 0.6478056117607034, + "grad_norm": 1.2010306119918823, + "learning_rate": 5.829934075448058e-05, + "loss": 0.9977, + "step": 18089 + }, + { + "epoch": 0.6478414238902717, + "grad_norm": 1.4691741466522217, + "learning_rate": 5.8288798652014485e-05, + "loss": 1.1546, + "step": 18090 + }, + { + "epoch": 0.6478772360198399, + "grad_norm": 1.2451509237289429, + "learning_rate": 5.827825711071877e-05, + "loss": 1.0044, + "step": 18091 + }, + { + "epoch": 0.6479130481494082, + "grad_norm": 1.5325182676315308, + "learning_rate": 5.8267716130735295e-05, + "loss": 1.0837, + "step": 18092 + }, + { + "epoch": 0.6479488602789765, + "grad_norm": 1.7997934818267822, + "learning_rate": 5.82571757122059e-05, + "loss": 1.1036, + "step": 18093 + }, + { + "epoch": 0.6479846724085447, + "grad_norm": 1.43526029586792, + "learning_rate": 5.824663585527232e-05, + "loss": 1.214, + "step": 18094 + }, + { + "epoch": 0.648020484538113, + "grad_norm": 1.321129322052002, + "learning_rate": 5.8236096560076405e-05, + "loss": 0.9853, + "step": 18095 + }, + { + "epoch": 0.6480562966676814, + "grad_norm": 1.4673653841018677, + "learning_rate": 5.8225557826759935e-05, + "loss": 1.149, + "step": 18096 + }, + { + "epoch": 0.6480921087972497, + "grad_norm": 1.4697198867797852, + "learning_rate": 5.821501965546474e-05, + "loss": 1.0903, + "step": 18097 + }, + { + "epoch": 0.6481279209268179, + "grad_norm": 1.4593859910964966, + "learning_rate": 5.820448204633251e-05, + "loss": 1.1353, + "step": 18098 + }, + { + "epoch": 0.6481637330563862, + "grad_norm": 1.4053341150283813, + "learning_rate": 5.819394499950508e-05, + "loss": 1.1282, + "step": 18099 + }, + { + "epoch": 0.6481995451859545, + "grad_norm": 1.530137538909912, + "learning_rate": 5.8183408515124216e-05, + "loss": 1.132, + "step": 18100 + }, + { + "epoch": 0.6482353573155227, + "grad_norm": 1.8855100870132446, + "learning_rate": 5.817287259333162e-05, + "loss": 1.2824, + "step": 18101 + }, + { + "epoch": 0.648271169445091, + "grad_norm": 1.6849615573883057, + "learning_rate": 5.816233723426907e-05, + "loss": 1.0748, + "step": 18102 + }, + { + "epoch": 0.6483069815746594, + "grad_norm": 1.4272104501724243, + "learning_rate": 5.81518024380783e-05, + "loss": 1.417, + "step": 18103 + }, + { + "epoch": 0.6483427937042276, + "grad_norm": 1.2410340309143066, + "learning_rate": 5.814126820490109e-05, + "loss": 1.1851, + "step": 18104 + }, + { + "epoch": 0.6483786058337959, + "grad_norm": 1.7238903045654297, + "learning_rate": 5.8130734534879075e-05, + "loss": 1.2333, + "step": 18105 + }, + { + "epoch": 0.6484144179633642, + "grad_norm": 1.909754991531372, + "learning_rate": 5.812020142815403e-05, + "loss": 1.1583, + "step": 18106 + }, + { + "epoch": 0.6484502300929325, + "grad_norm": 1.469934105873108, + "learning_rate": 5.810966888486768e-05, + "loss": 0.8334, + "step": 18107 + }, + { + "epoch": 0.6484860422225007, + "grad_norm": 1.4874470233917236, + "learning_rate": 5.809913690516169e-05, + "loss": 0.9384, + "step": 18108 + }, + { + "epoch": 0.648521854352069, + "grad_norm": 1.5261298418045044, + "learning_rate": 5.808860548917778e-05, + "loss": 1.0868, + "step": 18109 + }, + { + "epoch": 0.6485576664816374, + "grad_norm": 1.7725774049758911, + "learning_rate": 5.807807463705754e-05, + "loss": 1.0703, + "step": 18110 + }, + { + "epoch": 0.6485934786112056, + "grad_norm": 1.516332983970642, + "learning_rate": 5.8067544348942825e-05, + "loss": 1.3553, + "step": 18111 + }, + { + "epoch": 0.6486292907407739, + "grad_norm": 1.3919403553009033, + "learning_rate": 5.805701462497517e-05, + "loss": 1.259, + "step": 18112 + }, + { + "epoch": 0.6486651028703422, + "grad_norm": 1.6061723232269287, + "learning_rate": 5.804648546529627e-05, + "loss": 1.3479, + "step": 18113 + }, + { + "epoch": 0.6487009149999104, + "grad_norm": 1.4668267965316772, + "learning_rate": 5.803595687004779e-05, + "loss": 1.1306, + "step": 18114 + }, + { + "epoch": 0.6487367271294787, + "grad_norm": 2.0939507484436035, + "learning_rate": 5.802542883937143e-05, + "loss": 1.3079, + "step": 18115 + }, + { + "epoch": 0.648772539259047, + "grad_norm": 1.6359663009643555, + "learning_rate": 5.801490137340879e-05, + "loss": 0.9416, + "step": 18116 + }, + { + "epoch": 0.6488083513886154, + "grad_norm": 1.309651494026184, + "learning_rate": 5.80043744723014e-05, + "loss": 1.0107, + "step": 18117 + }, + { + "epoch": 0.6488441635181836, + "grad_norm": 1.473883032798767, + "learning_rate": 5.7993848136191065e-05, + "loss": 1.1717, + "step": 18118 + }, + { + "epoch": 0.6488799756477519, + "grad_norm": 2.037435531616211, + "learning_rate": 5.7983322365219287e-05, + "loss": 1.1044, + "step": 18119 + }, + { + "epoch": 0.6489157877773202, + "grad_norm": 1.4827653169631958, + "learning_rate": 5.797279715952774e-05, + "loss": 1.224, + "step": 18120 + }, + { + "epoch": 0.6489515999068884, + "grad_norm": 1.8447620868682861, + "learning_rate": 5.796227251925792e-05, + "loss": 1.2139, + "step": 18121 + }, + { + "epoch": 0.6489874120364567, + "grad_norm": 1.7194759845733643, + "learning_rate": 5.795174844455157e-05, + "loss": 1.2311, + "step": 18122 + }, + { + "epoch": 0.649023224166025, + "grad_norm": 1.3712283372879028, + "learning_rate": 5.7941224935550166e-05, + "loss": 1.1055, + "step": 18123 + }, + { + "epoch": 0.6490590362955934, + "grad_norm": 1.6845616102218628, + "learning_rate": 5.793070199239534e-05, + "loss": 0.9803, + "step": 18124 + }, + { + "epoch": 0.6490948484251616, + "grad_norm": 1.5726016759872437, + "learning_rate": 5.7920179615228684e-05, + "loss": 1.0967, + "step": 18125 + }, + { + "epoch": 0.6491306605547299, + "grad_norm": 1.3062965869903564, + "learning_rate": 5.790965780419171e-05, + "loss": 0.8943, + "step": 18126 + }, + { + "epoch": 0.6491664726842982, + "grad_norm": 1.57351553440094, + "learning_rate": 5.7899136559426015e-05, + "loss": 0.9472, + "step": 18127 + }, + { + "epoch": 0.6492022848138664, + "grad_norm": 1.5236603021621704, + "learning_rate": 5.788861588107306e-05, + "loss": 1.3017, + "step": 18128 + }, + { + "epoch": 0.6492380969434347, + "grad_norm": 1.8247721195220947, + "learning_rate": 5.787809576927454e-05, + "loss": 1.0336, + "step": 18129 + }, + { + "epoch": 0.649273909073003, + "grad_norm": 1.4638160467147827, + "learning_rate": 5.786757622417187e-05, + "loss": 1.1916, + "step": 18130 + }, + { + "epoch": 0.6493097212025714, + "grad_norm": 1.5376218557357788, + "learning_rate": 5.7857057245906656e-05, + "loss": 1.2032, + "step": 18131 + }, + { + "epoch": 0.6493455333321396, + "grad_norm": 1.364381194114685, + "learning_rate": 5.784653883462029e-05, + "loss": 0.9933, + "step": 18132 + }, + { + "epoch": 0.6493813454617079, + "grad_norm": 1.2848522663116455, + "learning_rate": 5.7836020990454444e-05, + "loss": 1.0431, + "step": 18133 + }, + { + "epoch": 0.6494171575912762, + "grad_norm": 1.4732685089111328, + "learning_rate": 5.7825503713550555e-05, + "loss": 1.0373, + "step": 18134 + }, + { + "epoch": 0.6494529697208444, + "grad_norm": 1.3221296072006226, + "learning_rate": 5.7814987004050084e-05, + "loss": 0.9552, + "step": 18135 + }, + { + "epoch": 0.6494887818504127, + "grad_norm": 1.5082221031188965, + "learning_rate": 5.780447086209453e-05, + "loss": 0.902, + "step": 18136 + }, + { + "epoch": 0.649524593979981, + "grad_norm": 1.7047462463378906, + "learning_rate": 5.779395528782541e-05, + "loss": 1.2155, + "step": 18137 + }, + { + "epoch": 0.6495604061095493, + "grad_norm": 1.5106382369995117, + "learning_rate": 5.7783440281384205e-05, + "loss": 1.0242, + "step": 18138 + }, + { + "epoch": 0.6495962182391176, + "grad_norm": 1.2710767984390259, + "learning_rate": 5.777292584291227e-05, + "loss": 1.1355, + "step": 18139 + }, + { + "epoch": 0.6496320303686859, + "grad_norm": 1.337153434753418, + "learning_rate": 5.7762411972551254e-05, + "loss": 0.9697, + "step": 18140 + }, + { + "epoch": 0.6496678424982542, + "grad_norm": 1.432822346687317, + "learning_rate": 5.775189867044244e-05, + "loss": 1.1501, + "step": 18141 + }, + { + "epoch": 0.6497036546278224, + "grad_norm": 1.5895346403121948, + "learning_rate": 5.7741385936727375e-05, + "loss": 1.2435, + "step": 18142 + }, + { + "epoch": 0.6497394667573907, + "grad_norm": 1.7064470052719116, + "learning_rate": 5.7730873771547423e-05, + "loss": 1.1293, + "step": 18143 + }, + { + "epoch": 0.649775278886959, + "grad_norm": 1.9728573560714722, + "learning_rate": 5.772036217504404e-05, + "loss": 1.1164, + "step": 18144 + }, + { + "epoch": 0.6498110910165273, + "grad_norm": 1.36774480342865, + "learning_rate": 5.770985114735868e-05, + "loss": 1.0725, + "step": 18145 + }, + { + "epoch": 0.6498469031460956, + "grad_norm": 1.5648484230041504, + "learning_rate": 5.76993406886327e-05, + "loss": 1.0846, + "step": 18146 + }, + { + "epoch": 0.6498827152756639, + "grad_norm": 1.4013751745224, + "learning_rate": 5.768883079900751e-05, + "loss": 1.1277, + "step": 18147 + }, + { + "epoch": 0.6499185274052321, + "grad_norm": 1.6552801132202148, + "learning_rate": 5.767832147862452e-05, + "loss": 1.1634, + "step": 18148 + }, + { + "epoch": 0.6499543395348004, + "grad_norm": 2.055565357208252, + "learning_rate": 5.7667812727625184e-05, + "loss": 0.9312, + "step": 18149 + }, + { + "epoch": 0.6499901516643687, + "grad_norm": 1.5856667757034302, + "learning_rate": 5.765730454615072e-05, + "loss": 1.1895, + "step": 18150 + }, + { + "epoch": 0.650025963793937, + "grad_norm": 2.0048606395721436, + "learning_rate": 5.764679693434269e-05, + "loss": 1.1968, + "step": 18151 + }, + { + "epoch": 0.6500617759235053, + "grad_norm": 1.5393142700195312, + "learning_rate": 5.763628989234238e-05, + "loss": 1.084, + "step": 18152 + }, + { + "epoch": 0.6500975880530736, + "grad_norm": 1.7248783111572266, + "learning_rate": 5.76257834202911e-05, + "loss": 1.0659, + "step": 18153 + }, + { + "epoch": 0.6501334001826419, + "grad_norm": 1.4424023628234863, + "learning_rate": 5.761527751833026e-05, + "loss": 1.1311, + "step": 18154 + }, + { + "epoch": 0.6501692123122101, + "grad_norm": 1.855944037437439, + "learning_rate": 5.760477218660119e-05, + "loss": 1.164, + "step": 18155 + }, + { + "epoch": 0.6502050244417784, + "grad_norm": 1.2953473329544067, + "learning_rate": 5.759426742524524e-05, + "loss": 0.9481, + "step": 18156 + }, + { + "epoch": 0.6502408365713467, + "grad_norm": 1.411978006362915, + "learning_rate": 5.75837632344037e-05, + "loss": 1.1956, + "step": 18157 + }, + { + "epoch": 0.650276648700915, + "grad_norm": 1.5663447380065918, + "learning_rate": 5.757325961421791e-05, + "loss": 1.0565, + "step": 18158 + }, + { + "epoch": 0.6503124608304833, + "grad_norm": 1.5021251440048218, + "learning_rate": 5.756275656482918e-05, + "loss": 1.0145, + "step": 18159 + }, + { + "epoch": 0.6503482729600516, + "grad_norm": 1.3206936120986938, + "learning_rate": 5.7552254086378863e-05, + "loss": 1.0908, + "step": 18160 + }, + { + "epoch": 0.6503840850896199, + "grad_norm": 1.601036548614502, + "learning_rate": 5.754175217900817e-05, + "loss": 1.3665, + "step": 18161 + }, + { + "epoch": 0.6504198972191881, + "grad_norm": 1.6462470293045044, + "learning_rate": 5.753125084285844e-05, + "loss": 1.1802, + "step": 18162 + }, + { + "epoch": 0.6504557093487564, + "grad_norm": 1.3472328186035156, + "learning_rate": 5.752075007807098e-05, + "loss": 1.2193, + "step": 18163 + }, + { + "epoch": 0.6504915214783247, + "grad_norm": 1.5754928588867188, + "learning_rate": 5.751024988478701e-05, + "loss": 1.078, + "step": 18164 + }, + { + "epoch": 0.6505273336078929, + "grad_norm": 1.4419444799423218, + "learning_rate": 5.749975026314781e-05, + "loss": 1.1347, + "step": 18165 + }, + { + "epoch": 0.6505631457374613, + "grad_norm": 2.000070333480835, + "learning_rate": 5.748925121329465e-05, + "loss": 1.2187, + "step": 18166 + }, + { + "epoch": 0.6505989578670296, + "grad_norm": 1.1892942190170288, + "learning_rate": 5.747875273536882e-05, + "loss": 0.9687, + "step": 18167 + }, + { + "epoch": 0.6506347699965979, + "grad_norm": 1.3104664087295532, + "learning_rate": 5.746825482951148e-05, + "loss": 1.0778, + "step": 18168 + }, + { + "epoch": 0.6506705821261661, + "grad_norm": 1.8642090559005737, + "learning_rate": 5.7457757495863916e-05, + "loss": 0.9745, + "step": 18169 + }, + { + "epoch": 0.6507063942557344, + "grad_norm": 1.1717488765716553, + "learning_rate": 5.744726073456739e-05, + "loss": 1.2532, + "step": 18170 + }, + { + "epoch": 0.6507422063853027, + "grad_norm": 1.5052732229232788, + "learning_rate": 5.7436764545763034e-05, + "loss": 1.1634, + "step": 18171 + }, + { + "epoch": 0.6507780185148709, + "grad_norm": 1.6477077007293701, + "learning_rate": 5.7426268929592105e-05, + "loss": 1.1875, + "step": 18172 + }, + { + "epoch": 0.6508138306444393, + "grad_norm": 1.6695951223373413, + "learning_rate": 5.7415773886195834e-05, + "loss": 1.1976, + "step": 18173 + }, + { + "epoch": 0.6508496427740076, + "grad_norm": 1.5946446657180786, + "learning_rate": 5.740527941571541e-05, + "loss": 1.0637, + "step": 18174 + }, + { + "epoch": 0.6508854549035759, + "grad_norm": 1.519273042678833, + "learning_rate": 5.739478551829198e-05, + "loss": 1.0481, + "step": 18175 + }, + { + "epoch": 0.6509212670331441, + "grad_norm": 1.4790496826171875, + "learning_rate": 5.738429219406676e-05, + "loss": 0.9238, + "step": 18176 + }, + { + "epoch": 0.6509570791627124, + "grad_norm": 1.4336857795715332, + "learning_rate": 5.7373799443180906e-05, + "loss": 1.1698, + "step": 18177 + }, + { + "epoch": 0.6509928912922807, + "grad_norm": 1.6628520488739014, + "learning_rate": 5.7363307265775635e-05, + "loss": 0.9818, + "step": 18178 + }, + { + "epoch": 0.6510287034218489, + "grad_norm": 1.5396432876586914, + "learning_rate": 5.7352815661992046e-05, + "loss": 1.0865, + "step": 18179 + }, + { + "epoch": 0.6510645155514173, + "grad_norm": 1.2634320259094238, + "learning_rate": 5.734232463197129e-05, + "loss": 0.8616, + "step": 18180 + }, + { + "epoch": 0.6511003276809856, + "grad_norm": 1.364245891571045, + "learning_rate": 5.7331834175854596e-05, + "loss": 1.0759, + "step": 18181 + }, + { + "epoch": 0.6511361398105538, + "grad_norm": 1.311144471168518, + "learning_rate": 5.732134429378297e-05, + "loss": 1.0427, + "step": 18182 + }, + { + "epoch": 0.6511719519401221, + "grad_norm": 1.490212082862854, + "learning_rate": 5.731085498589761e-05, + "loss": 0.9623, + "step": 18183 + }, + { + "epoch": 0.6512077640696904, + "grad_norm": 1.6617379188537598, + "learning_rate": 5.730036625233963e-05, + "loss": 1.4816, + "step": 18184 + }, + { + "epoch": 0.6512435761992587, + "grad_norm": 1.408026099205017, + "learning_rate": 5.728987809325019e-05, + "loss": 1.1399, + "step": 18185 + }, + { + "epoch": 0.6512793883288269, + "grad_norm": 1.4994069337844849, + "learning_rate": 5.727939050877031e-05, + "loss": 1.1742, + "step": 18186 + }, + { + "epoch": 0.6513152004583953, + "grad_norm": 1.7237820625305176, + "learning_rate": 5.726890349904113e-05, + "loss": 1.0879, + "step": 18187 + }, + { + "epoch": 0.6513510125879636, + "grad_norm": 1.8047149181365967, + "learning_rate": 5.725841706420376e-05, + "loss": 1.0527, + "step": 18188 + }, + { + "epoch": 0.6513868247175318, + "grad_norm": 1.5610398054122925, + "learning_rate": 5.724793120439923e-05, + "loss": 1.2814, + "step": 18189 + }, + { + "epoch": 0.6514226368471001, + "grad_norm": 1.5089954137802124, + "learning_rate": 5.723744591976863e-05, + "loss": 0.991, + "step": 18190 + }, + { + "epoch": 0.6514584489766684, + "grad_norm": 1.7508310079574585, + "learning_rate": 5.722696121045303e-05, + "loss": 1.1297, + "step": 18191 + }, + { + "epoch": 0.6514942611062366, + "grad_norm": 1.5843431949615479, + "learning_rate": 5.7216477076593544e-05, + "loss": 1.0649, + "step": 18192 + }, + { + "epoch": 0.6515300732358049, + "grad_norm": 1.3654165267944336, + "learning_rate": 5.7205993518331134e-05, + "loss": 0.9462, + "step": 18193 + }, + { + "epoch": 0.6515658853653733, + "grad_norm": 1.756610631942749, + "learning_rate": 5.719551053580687e-05, + "loss": 1.108, + "step": 18194 + }, + { + "epoch": 0.6516016974949416, + "grad_norm": 1.3827564716339111, + "learning_rate": 5.718502812916186e-05, + "loss": 0.9131, + "step": 18195 + }, + { + "epoch": 0.6516375096245098, + "grad_norm": 1.4523929357528687, + "learning_rate": 5.7174546298537005e-05, + "loss": 1.1239, + "step": 18196 + }, + { + "epoch": 0.6516733217540781, + "grad_norm": 1.679711937904358, + "learning_rate": 5.71640650440734e-05, + "loss": 1.3468, + "step": 18197 + }, + { + "epoch": 0.6517091338836464, + "grad_norm": 1.8090288639068604, + "learning_rate": 5.715358436591205e-05, + "loss": 1.3909, + "step": 18198 + }, + { + "epoch": 0.6517449460132146, + "grad_norm": 1.7773468494415283, + "learning_rate": 5.7143104264193984e-05, + "loss": 1.1019, + "step": 18199 + }, + { + "epoch": 0.6517807581427829, + "grad_norm": 1.3046724796295166, + "learning_rate": 5.7132624739060134e-05, + "loss": 0.8467, + "step": 18200 + }, + { + "epoch": 0.6518165702723513, + "grad_norm": 1.7284849882125854, + "learning_rate": 5.712214579065152e-05, + "loss": 1.0581, + "step": 18201 + }, + { + "epoch": 0.6518523824019196, + "grad_norm": 1.5448046922683716, + "learning_rate": 5.711166741910912e-05, + "loss": 1.0782, + "step": 18202 + }, + { + "epoch": 0.6518881945314878, + "grad_norm": 1.6723790168762207, + "learning_rate": 5.710118962457396e-05, + "loss": 1.0728, + "step": 18203 + }, + { + "epoch": 0.6519240066610561, + "grad_norm": 1.7004867792129517, + "learning_rate": 5.709071240718695e-05, + "loss": 0.9641, + "step": 18204 + }, + { + "epoch": 0.6519598187906244, + "grad_norm": 1.5316555500030518, + "learning_rate": 5.7080235767088994e-05, + "loss": 1.1961, + "step": 18205 + }, + { + "epoch": 0.6519956309201926, + "grad_norm": 1.3226354122161865, + "learning_rate": 5.706975970442117e-05, + "loss": 1.0338, + "step": 18206 + }, + { + "epoch": 0.6520314430497609, + "grad_norm": 1.4853153228759766, + "learning_rate": 5.7059284219324315e-05, + "loss": 1.2199, + "step": 18207 + }, + { + "epoch": 0.6520672551793293, + "grad_norm": 1.5018583536148071, + "learning_rate": 5.7048809311939446e-05, + "loss": 1.0562, + "step": 18208 + }, + { + "epoch": 0.6521030673088976, + "grad_norm": 1.6608357429504395, + "learning_rate": 5.703833498240736e-05, + "loss": 1.2688, + "step": 18209 + }, + { + "epoch": 0.6521388794384658, + "grad_norm": 1.6303260326385498, + "learning_rate": 5.702786123086914e-05, + "loss": 0.9873, + "step": 18210 + }, + { + "epoch": 0.6521746915680341, + "grad_norm": 1.4096593856811523, + "learning_rate": 5.701738805746558e-05, + "loss": 1.1698, + "step": 18211 + }, + { + "epoch": 0.6522105036976024, + "grad_norm": 1.4079763889312744, + "learning_rate": 5.700691546233762e-05, + "loss": 1.0739, + "step": 18212 + }, + { + "epoch": 0.6522463158271706, + "grad_norm": 1.1942148208618164, + "learning_rate": 5.699644344562619e-05, + "loss": 1.0335, + "step": 18213 + }, + { + "epoch": 0.6522821279567389, + "grad_norm": 1.3781803846359253, + "learning_rate": 5.698597200747211e-05, + "loss": 0.9024, + "step": 18214 + }, + { + "epoch": 0.6523179400863073, + "grad_norm": 1.8995766639709473, + "learning_rate": 5.697550114801633e-05, + "loss": 1.0903, + "step": 18215 + }, + { + "epoch": 0.6523537522158755, + "grad_norm": 1.3426880836486816, + "learning_rate": 5.696503086739961e-05, + "loss": 0.9722, + "step": 18216 + }, + { + "epoch": 0.6523895643454438, + "grad_norm": 2.1629528999328613, + "learning_rate": 5.695456116576296e-05, + "loss": 1.0786, + "step": 18217 + }, + { + "epoch": 0.6524253764750121, + "grad_norm": 1.4186469316482544, + "learning_rate": 5.6944092043247124e-05, + "loss": 1.0734, + "step": 18218 + }, + { + "epoch": 0.6524611886045804, + "grad_norm": 1.3649635314941406, + "learning_rate": 5.693362349999303e-05, + "loss": 1.0955, + "step": 18219 + }, + { + "epoch": 0.6524970007341486, + "grad_norm": 1.3997613191604614, + "learning_rate": 5.6923155536141404e-05, + "loss": 1.1116, + "step": 18220 + }, + { + "epoch": 0.6525328128637169, + "grad_norm": 1.5940011739730835, + "learning_rate": 5.691268815183324e-05, + "loss": 0.9291, + "step": 18221 + }, + { + "epoch": 0.6525686249932853, + "grad_norm": 2.014465570449829, + "learning_rate": 5.690222134720927e-05, + "loss": 1.1785, + "step": 18222 + }, + { + "epoch": 0.6526044371228535, + "grad_norm": 1.4349157810211182, + "learning_rate": 5.6891755122410254e-05, + "loss": 0.7923, + "step": 18223 + }, + { + "epoch": 0.6526402492524218, + "grad_norm": 1.6132543087005615, + "learning_rate": 5.688128947757713e-05, + "loss": 1.1507, + "step": 18224 + }, + { + "epoch": 0.6526760613819901, + "grad_norm": 1.4701554775238037, + "learning_rate": 5.687082441285061e-05, + "loss": 1.1864, + "step": 18225 + }, + { + "epoch": 0.6527118735115583, + "grad_norm": 1.347537636756897, + "learning_rate": 5.6860359928371546e-05, + "loss": 1.3165, + "step": 18226 + }, + { + "epoch": 0.6527476856411266, + "grad_norm": 1.5028345584869385, + "learning_rate": 5.6849896024280614e-05, + "loss": 0.9718, + "step": 18227 + }, + { + "epoch": 0.6527834977706949, + "grad_norm": 1.4781365394592285, + "learning_rate": 5.6839432700718743e-05, + "loss": 0.9102, + "step": 18228 + }, + { + "epoch": 0.6528193099002633, + "grad_norm": 1.7104123830795288, + "learning_rate": 5.682896995782661e-05, + "loss": 1.1382, + "step": 18229 + }, + { + "epoch": 0.6528551220298315, + "grad_norm": 1.9335577487945557, + "learning_rate": 5.6818507795745025e-05, + "loss": 0.9999, + "step": 18230 + }, + { + "epoch": 0.6528909341593998, + "grad_norm": 1.5605158805847168, + "learning_rate": 5.6808046214614684e-05, + "loss": 1.1683, + "step": 18231 + }, + { + "epoch": 0.6529267462889681, + "grad_norm": 1.857777714729309, + "learning_rate": 5.679758521457637e-05, + "loss": 1.2042, + "step": 18232 + }, + { + "epoch": 0.6529625584185363, + "grad_norm": 1.3767361640930176, + "learning_rate": 5.678712479577086e-05, + "loss": 0.9387, + "step": 18233 + }, + { + "epoch": 0.6529983705481046, + "grad_norm": 1.4004113674163818, + "learning_rate": 5.67766649583388e-05, + "loss": 0.9365, + "step": 18234 + }, + { + "epoch": 0.6530341826776729, + "grad_norm": 1.276894211769104, + "learning_rate": 5.676620570242097e-05, + "loss": 1.0917, + "step": 18235 + }, + { + "epoch": 0.6530699948072413, + "grad_norm": 1.8269165754318237, + "learning_rate": 5.675574702815807e-05, + "loss": 1.2415, + "step": 18236 + }, + { + "epoch": 0.6531058069368095, + "grad_norm": 2.3485701084136963, + "learning_rate": 5.674528893569084e-05, + "loss": 0.9583, + "step": 18237 + }, + { + "epoch": 0.6531416190663778, + "grad_norm": 1.619513988494873, + "learning_rate": 5.673483142515988e-05, + "loss": 0.9989, + "step": 18238 + }, + { + "epoch": 0.6531774311959461, + "grad_norm": 1.4129502773284912, + "learning_rate": 5.672437449670605e-05, + "loss": 1.1318, + "step": 18239 + }, + { + "epoch": 0.6532132433255143, + "grad_norm": 1.6692594289779663, + "learning_rate": 5.6713918150469916e-05, + "loss": 0.9428, + "step": 18240 + }, + { + "epoch": 0.6532490554550826, + "grad_norm": 2.173628568649292, + "learning_rate": 5.6703462386592145e-05, + "loss": 1.2765, + "step": 18241 + }, + { + "epoch": 0.6532848675846509, + "grad_norm": 1.4047907590866089, + "learning_rate": 5.6693007205213444e-05, + "loss": 1.0548, + "step": 18242 + }, + { + "epoch": 0.6533206797142193, + "grad_norm": 1.4361077547073364, + "learning_rate": 5.668255260647447e-05, + "loss": 1.2338, + "step": 18243 + }, + { + "epoch": 0.6533564918437875, + "grad_norm": 1.6777318716049194, + "learning_rate": 5.667209859051592e-05, + "loss": 1.071, + "step": 18244 + }, + { + "epoch": 0.6533923039733558, + "grad_norm": 1.494209885597229, + "learning_rate": 5.6661645157478336e-05, + "loss": 0.8587, + "step": 18245 + }, + { + "epoch": 0.6534281161029241, + "grad_norm": 1.2133922576904297, + "learning_rate": 5.665119230750243e-05, + "loss": 1.0638, + "step": 18246 + }, + { + "epoch": 0.6534639282324923, + "grad_norm": 1.5508954524993896, + "learning_rate": 5.664074004072881e-05, + "loss": 0.9891, + "step": 18247 + }, + { + "epoch": 0.6534997403620606, + "grad_norm": 1.635770320892334, + "learning_rate": 5.663028835729815e-05, + "loss": 1.1682, + "step": 18248 + }, + { + "epoch": 0.6535355524916289, + "grad_norm": 1.489905834197998, + "learning_rate": 5.661983725735096e-05, + "loss": 1.1622, + "step": 18249 + }, + { + "epoch": 0.6535713646211972, + "grad_norm": 2.044356346130371, + "learning_rate": 5.6609386741027915e-05, + "loss": 1.302, + "step": 18250 + }, + { + "epoch": 0.6536071767507655, + "grad_norm": 1.2606227397918701, + "learning_rate": 5.659893680846965e-05, + "loss": 1.0846, + "step": 18251 + }, + { + "epoch": 0.6536429888803338, + "grad_norm": 1.457284688949585, + "learning_rate": 5.658848745981667e-05, + "loss": 1.202, + "step": 18252 + }, + { + "epoch": 0.653678801009902, + "grad_norm": 1.3601882457733154, + "learning_rate": 5.6578038695209566e-05, + "loss": 0.9374, + "step": 18253 + }, + { + "epoch": 0.6537146131394703, + "grad_norm": 2.0343713760375977, + "learning_rate": 5.656759051478897e-05, + "loss": 1.1104, + "step": 18254 + }, + { + "epoch": 0.6537504252690386, + "grad_norm": 1.545202374458313, + "learning_rate": 5.655714291869544e-05, + "loss": 1.2511, + "step": 18255 + }, + { + "epoch": 0.6537862373986069, + "grad_norm": 1.3898216485977173, + "learning_rate": 5.654669590706948e-05, + "loss": 0.9846, + "step": 18256 + }, + { + "epoch": 0.6538220495281752, + "grad_norm": 1.6678663492202759, + "learning_rate": 5.653624948005167e-05, + "loss": 1.0835, + "step": 18257 + }, + { + "epoch": 0.6538578616577435, + "grad_norm": 1.482773780822754, + "learning_rate": 5.6525803637782614e-05, + "loss": 1.268, + "step": 18258 + }, + { + "epoch": 0.6538936737873118, + "grad_norm": 1.5436135530471802, + "learning_rate": 5.651535838040275e-05, + "loss": 1.1725, + "step": 18259 + }, + { + "epoch": 0.65392948591688, + "grad_norm": 1.3468005657196045, + "learning_rate": 5.6504913708052646e-05, + "loss": 1.2391, + "step": 18260 + }, + { + "epoch": 0.6539652980464483, + "grad_norm": 1.472586750984192, + "learning_rate": 5.6494469620872814e-05, + "loss": 1.0281, + "step": 18261 + }, + { + "epoch": 0.6540011101760166, + "grad_norm": 1.504853367805481, + "learning_rate": 5.648402611900383e-05, + "loss": 0.8917, + "step": 18262 + }, + { + "epoch": 0.6540369223055849, + "grad_norm": 1.3752305507659912, + "learning_rate": 5.647358320258609e-05, + "loss": 1.0709, + "step": 18263 + }, + { + "epoch": 0.6540727344351532, + "grad_norm": 1.2458851337432861, + "learning_rate": 5.6463140871760144e-05, + "loss": 1.1039, + "step": 18264 + }, + { + "epoch": 0.6541085465647215, + "grad_norm": 1.3727893829345703, + "learning_rate": 5.6452699126666486e-05, + "loss": 1.2105, + "step": 18265 + }, + { + "epoch": 0.6541443586942898, + "grad_norm": 1.3874515295028687, + "learning_rate": 5.644225796744562e-05, + "loss": 0.8881, + "step": 18266 + }, + { + "epoch": 0.654180170823858, + "grad_norm": 1.520760178565979, + "learning_rate": 5.6431817394237964e-05, + "loss": 1.1745, + "step": 18267 + }, + { + "epoch": 0.6542159829534263, + "grad_norm": 2.0031039714813232, + "learning_rate": 5.6421377407183997e-05, + "loss": 1.1854, + "step": 18268 + }, + { + "epoch": 0.6542517950829946, + "grad_norm": 1.4920557737350464, + "learning_rate": 5.641093800642423e-05, + "loss": 0.9672, + "step": 18269 + }, + { + "epoch": 0.6542876072125628, + "grad_norm": 1.9773911237716675, + "learning_rate": 5.640049919209902e-05, + "loss": 1.177, + "step": 18270 + }, + { + "epoch": 0.6543234193421312, + "grad_norm": 1.3734891414642334, + "learning_rate": 5.6390060964348845e-05, + "loss": 1.1212, + "step": 18271 + }, + { + "epoch": 0.6543592314716995, + "grad_norm": 1.3447600603103638, + "learning_rate": 5.637962332331416e-05, + "loss": 0.9385, + "step": 18272 + }, + { + "epoch": 0.6543950436012678, + "grad_norm": 1.7558389902114868, + "learning_rate": 5.636918626913541e-05, + "loss": 1.2327, + "step": 18273 + }, + { + "epoch": 0.654430855730836, + "grad_norm": 1.5853533744812012, + "learning_rate": 5.6358749801952946e-05, + "loss": 1.3161, + "step": 18274 + }, + { + "epoch": 0.6544666678604043, + "grad_norm": 1.452418565750122, + "learning_rate": 5.63483139219072e-05, + "loss": 1.113, + "step": 18275 + }, + { + "epoch": 0.6545024799899726, + "grad_norm": 1.4960219860076904, + "learning_rate": 5.633787862913864e-05, + "loss": 0.9735, + "step": 18276 + }, + { + "epoch": 0.6545382921195408, + "grad_norm": 1.5038752555847168, + "learning_rate": 5.6327443923787546e-05, + "loss": 1.0066, + "step": 18277 + }, + { + "epoch": 0.6545741042491092, + "grad_norm": 1.2912575006484985, + "learning_rate": 5.631700980599437e-05, + "loss": 1.1477, + "step": 18278 + }, + { + "epoch": 0.6546099163786775, + "grad_norm": 1.5286219120025635, + "learning_rate": 5.630657627589948e-05, + "loss": 1.2611, + "step": 18279 + }, + { + "epoch": 0.6546457285082458, + "grad_norm": 1.528832197189331, + "learning_rate": 5.629614333364328e-05, + "loss": 1.0113, + "step": 18280 + }, + { + "epoch": 0.654681540637814, + "grad_norm": 1.5058423280715942, + "learning_rate": 5.628571097936606e-05, + "loss": 0.9976, + "step": 18281 + }, + { + "epoch": 0.6547173527673823, + "grad_norm": 1.920159935951233, + "learning_rate": 5.627527921320821e-05, + "loss": 1.0406, + "step": 18282 + }, + { + "epoch": 0.6547531648969506, + "grad_norm": 1.4209619760513306, + "learning_rate": 5.626484803531008e-05, + "loss": 1.1276, + "step": 18283 + }, + { + "epoch": 0.6547889770265188, + "grad_norm": 1.4660595655441284, + "learning_rate": 5.625441744581205e-05, + "loss": 0.9276, + "step": 18284 + }, + { + "epoch": 0.6548247891560872, + "grad_norm": 1.4220246076583862, + "learning_rate": 5.624398744485435e-05, + "loss": 1.0074, + "step": 18285 + }, + { + "epoch": 0.6548606012856555, + "grad_norm": 1.464608907699585, + "learning_rate": 5.623355803257737e-05, + "loss": 1.211, + "step": 18286 + }, + { + "epoch": 0.6548964134152238, + "grad_norm": 1.522158145904541, + "learning_rate": 5.622312920912145e-05, + "loss": 1.1703, + "step": 18287 + }, + { + "epoch": 0.654932225544792, + "grad_norm": 1.938324213027954, + "learning_rate": 5.621270097462682e-05, + "loss": 1.2403, + "step": 18288 + }, + { + "epoch": 0.6549680376743603, + "grad_norm": 2.051780939102173, + "learning_rate": 5.620227332923382e-05, + "loss": 0.9127, + "step": 18289 + }, + { + "epoch": 0.6550038498039286, + "grad_norm": 1.4343621730804443, + "learning_rate": 5.619184627308273e-05, + "loss": 1.0542, + "step": 18290 + }, + { + "epoch": 0.6550396619334968, + "grad_norm": 1.5744025707244873, + "learning_rate": 5.618141980631389e-05, + "loss": 1.2846, + "step": 18291 + }, + { + "epoch": 0.6550754740630652, + "grad_norm": 2.062464475631714, + "learning_rate": 5.617099392906751e-05, + "loss": 1.031, + "step": 18292 + }, + { + "epoch": 0.6551112861926335, + "grad_norm": 1.8694204092025757, + "learning_rate": 5.61605686414838e-05, + "loss": 1.2552, + "step": 18293 + }, + { + "epoch": 0.6551470983222017, + "grad_norm": 1.2973264455795288, + "learning_rate": 5.615014394370317e-05, + "loss": 0.727, + "step": 18294 + }, + { + "epoch": 0.65518291045177, + "grad_norm": 1.5379126071929932, + "learning_rate": 5.6139719835865745e-05, + "loss": 1.1039, + "step": 18295 + }, + { + "epoch": 0.6552187225813383, + "grad_norm": 1.5487474203109741, + "learning_rate": 5.612929631811181e-05, + "loss": 1.0458, + "step": 18296 + }, + { + "epoch": 0.6552545347109066, + "grad_norm": 1.5333656072616577, + "learning_rate": 5.611887339058162e-05, + "loss": 1.1756, + "step": 18297 + }, + { + "epoch": 0.6552903468404748, + "grad_norm": 1.5937635898590088, + "learning_rate": 5.610845105341542e-05, + "loss": 1.2881, + "step": 18298 + }, + { + "epoch": 0.6553261589700432, + "grad_norm": 1.794584035873413, + "learning_rate": 5.609802930675335e-05, + "loss": 1.1379, + "step": 18299 + }, + { + "epoch": 0.6553619710996115, + "grad_norm": 1.7434029579162598, + "learning_rate": 5.608760815073567e-05, + "loss": 1.0391, + "step": 18300 + }, + { + "epoch": 0.6553977832291797, + "grad_norm": 1.3134011030197144, + "learning_rate": 5.6077187585502624e-05, + "loss": 0.9786, + "step": 18301 + }, + { + "epoch": 0.655433595358748, + "grad_norm": 1.2578046321868896, + "learning_rate": 5.6066767611194316e-05, + "loss": 1.0824, + "step": 18302 + }, + { + "epoch": 0.6554694074883163, + "grad_norm": 1.737139105796814, + "learning_rate": 5.6056348227951025e-05, + "loss": 1.0272, + "step": 18303 + }, + { + "epoch": 0.6555052196178845, + "grad_norm": 1.6793577671051025, + "learning_rate": 5.6045929435912805e-05, + "loss": 1.1471, + "step": 18304 + }, + { + "epoch": 0.6555410317474528, + "grad_norm": 1.929254174232483, + "learning_rate": 5.603551123521997e-05, + "loss": 0.9619, + "step": 18305 + }, + { + "epoch": 0.6555768438770212, + "grad_norm": 1.2995744943618774, + "learning_rate": 5.60250936260126e-05, + "loss": 1.1376, + "step": 18306 + }, + { + "epoch": 0.6556126560065895, + "grad_norm": 1.4636249542236328, + "learning_rate": 5.601467660843087e-05, + "loss": 1.0056, + "step": 18307 + }, + { + "epoch": 0.6556484681361577, + "grad_norm": 1.483574628829956, + "learning_rate": 5.600426018261493e-05, + "loss": 1.2519, + "step": 18308 + }, + { + "epoch": 0.655684280265726, + "grad_norm": 1.2328214645385742, + "learning_rate": 5.599384434870496e-05, + "loss": 0.9389, + "step": 18309 + }, + { + "epoch": 0.6557200923952943, + "grad_norm": 1.3996974229812622, + "learning_rate": 5.5983429106841046e-05, + "loss": 1.0993, + "step": 18310 + }, + { + "epoch": 0.6557559045248625, + "grad_norm": 1.7274751663208008, + "learning_rate": 5.597301445716323e-05, + "loss": 1.2765, + "step": 18311 + }, + { + "epoch": 0.6557917166544308, + "grad_norm": 1.5634864568710327, + "learning_rate": 5.59626003998118e-05, + "loss": 1.1075, + "step": 18312 + }, + { + "epoch": 0.6558275287839992, + "grad_norm": 1.5229415893554688, + "learning_rate": 5.595218693492674e-05, + "loss": 1.0116, + "step": 18313 + }, + { + "epoch": 0.6558633409135675, + "grad_norm": 1.5635732412338257, + "learning_rate": 5.594177406264822e-05, + "loss": 1.1649, + "step": 18314 + }, + { + "epoch": 0.6558991530431357, + "grad_norm": 1.4804202318191528, + "learning_rate": 5.593136178311622e-05, + "loss": 0.908, + "step": 18315 + }, + { + "epoch": 0.655934965172704, + "grad_norm": 1.6723803281784058, + "learning_rate": 5.592095009647099e-05, + "loss": 1.1226, + "step": 18316 + }, + { + "epoch": 0.6559707773022723, + "grad_norm": 1.2613295316696167, + "learning_rate": 5.591053900285248e-05, + "loss": 1.034, + "step": 18317 + }, + { + "epoch": 0.6560065894318405, + "grad_norm": 1.6946399211883545, + "learning_rate": 5.590012850240083e-05, + "loss": 1.198, + "step": 18318 + }, + { + "epoch": 0.6560424015614088, + "grad_norm": 2.3277335166931152, + "learning_rate": 5.5889718595256026e-05, + "loss": 1.2077, + "step": 18319 + }, + { + "epoch": 0.6560782136909772, + "grad_norm": 1.765190601348877, + "learning_rate": 5.587930928155816e-05, + "loss": 0.9841, + "step": 18320 + }, + { + "epoch": 0.6561140258205455, + "grad_norm": 1.3800567388534546, + "learning_rate": 5.586890056144732e-05, + "loss": 1.2047, + "step": 18321 + }, + { + "epoch": 0.6561498379501137, + "grad_norm": 1.4170769453048706, + "learning_rate": 5.585849243506342e-05, + "loss": 1.0222, + "step": 18322 + }, + { + "epoch": 0.656185650079682, + "grad_norm": 1.5350074768066406, + "learning_rate": 5.584808490254664e-05, + "loss": 1.2095, + "step": 18323 + }, + { + "epoch": 0.6562214622092503, + "grad_norm": 1.5550000667572021, + "learning_rate": 5.5837677964036894e-05, + "loss": 1.1886, + "step": 18324 + }, + { + "epoch": 0.6562572743388185, + "grad_norm": 1.7131010293960571, + "learning_rate": 5.582727161967425e-05, + "loss": 1.0399, + "step": 18325 + }, + { + "epoch": 0.6562930864683868, + "grad_norm": 1.4737012386322021, + "learning_rate": 5.5816865869598625e-05, + "loss": 1.0675, + "step": 18326 + }, + { + "epoch": 0.6563288985979552, + "grad_norm": 1.2521746158599854, + "learning_rate": 5.5806460713950145e-05, + "loss": 1.031, + "step": 18327 + }, + { + "epoch": 0.6563647107275234, + "grad_norm": 1.5823129415512085, + "learning_rate": 5.579605615286874e-05, + "loss": 1.1291, + "step": 18328 + }, + { + "epoch": 0.6564005228570917, + "grad_norm": 1.3167494535446167, + "learning_rate": 5.578565218649433e-05, + "loss": 1.1079, + "step": 18329 + }, + { + "epoch": 0.65643633498666, + "grad_norm": 1.8586516380310059, + "learning_rate": 5.577524881496694e-05, + "loss": 1.0283, + "step": 18330 + }, + { + "epoch": 0.6564721471162283, + "grad_norm": 1.5786036252975464, + "learning_rate": 5.5764846038426535e-05, + "loss": 1.0118, + "step": 18331 + }, + { + "epoch": 0.6565079592457965, + "grad_norm": 2.0745975971221924, + "learning_rate": 5.57544438570131e-05, + "loss": 1.2368, + "step": 18332 + }, + { + "epoch": 0.6565437713753648, + "grad_norm": 1.4081240892410278, + "learning_rate": 5.574404227086648e-05, + "loss": 1.0648, + "step": 18333 + }, + { + "epoch": 0.6565795835049332, + "grad_norm": 1.480193853378296, + "learning_rate": 5.573364128012677e-05, + "loss": 0.9516, + "step": 18334 + }, + { + "epoch": 0.6566153956345014, + "grad_norm": 1.3269102573394775, + "learning_rate": 5.572324088493377e-05, + "loss": 0.9986, + "step": 18335 + }, + { + "epoch": 0.6566512077640697, + "grad_norm": 1.532527208328247, + "learning_rate": 5.571284108542748e-05, + "loss": 0.9822, + "step": 18336 + }, + { + "epoch": 0.656687019893638, + "grad_norm": 1.3667702674865723, + "learning_rate": 5.5702441881747755e-05, + "loss": 0.9242, + "step": 18337 + }, + { + "epoch": 0.6567228320232062, + "grad_norm": 1.9809811115264893, + "learning_rate": 5.5692043274034544e-05, + "loss": 1.2409, + "step": 18338 + }, + { + "epoch": 0.6567586441527745, + "grad_norm": 1.634358286857605, + "learning_rate": 5.568164526242776e-05, + "loss": 1.0771, + "step": 18339 + }, + { + "epoch": 0.6567944562823428, + "grad_norm": 1.477860450744629, + "learning_rate": 5.5671247847067254e-05, + "loss": 1.3132, + "step": 18340 + }, + { + "epoch": 0.6568302684119112, + "grad_norm": 1.3309438228607178, + "learning_rate": 5.566085102809291e-05, + "loss": 0.968, + "step": 18341 + }, + { + "epoch": 0.6568660805414794, + "grad_norm": 1.39901864528656, + "learning_rate": 5.565045480564463e-05, + "loss": 1.0051, + "step": 18342 + }, + { + "epoch": 0.6569018926710477, + "grad_norm": 1.4491405487060547, + "learning_rate": 5.5640059179862314e-05, + "loss": 1.1377, + "step": 18343 + }, + { + "epoch": 0.656937704800616, + "grad_norm": 1.5958607196807861, + "learning_rate": 5.562966415088574e-05, + "loss": 1.1771, + "step": 18344 + }, + { + "epoch": 0.6569735169301842, + "grad_norm": 1.5347596406936646, + "learning_rate": 5.5619269718854805e-05, + "loss": 1.3707, + "step": 18345 + }, + { + "epoch": 0.6570093290597525, + "grad_norm": 1.386752963066101, + "learning_rate": 5.560887588390938e-05, + "loss": 0.9123, + "step": 18346 + }, + { + "epoch": 0.6570451411893208, + "grad_norm": 1.6635810136795044, + "learning_rate": 5.559848264618923e-05, + "loss": 0.9055, + "step": 18347 + }, + { + "epoch": 0.6570809533188892, + "grad_norm": 1.7693718671798706, + "learning_rate": 5.5588090005834224e-05, + "loss": 1.0581, + "step": 18348 + }, + { + "epoch": 0.6571167654484574, + "grad_norm": 1.6496647596359253, + "learning_rate": 5.5577697962984195e-05, + "loss": 1.0861, + "step": 18349 + }, + { + "epoch": 0.6571525775780257, + "grad_norm": 1.3422603607177734, + "learning_rate": 5.556730651777897e-05, + "loss": 1.0616, + "step": 18350 + }, + { + "epoch": 0.657188389707594, + "grad_norm": 1.4172223806381226, + "learning_rate": 5.555691567035828e-05, + "loss": 1.1192, + "step": 18351 + }, + { + "epoch": 0.6572242018371622, + "grad_norm": 1.4541542530059814, + "learning_rate": 5.554652542086196e-05, + "loss": 1.1356, + "step": 18352 + }, + { + "epoch": 0.6572600139667305, + "grad_norm": 1.5432208776474, + "learning_rate": 5.5536135769429795e-05, + "loss": 1.0192, + "step": 18353 + }, + { + "epoch": 0.6572958260962988, + "grad_norm": 2.2056257724761963, + "learning_rate": 5.552574671620161e-05, + "loss": 1.155, + "step": 18354 + }, + { + "epoch": 0.6573316382258672, + "grad_norm": 2.1645114421844482, + "learning_rate": 5.551535826131711e-05, + "loss": 1.019, + "step": 18355 + }, + { + "epoch": 0.6573674503554354, + "grad_norm": 2.1277658939361572, + "learning_rate": 5.5504970404916066e-05, + "loss": 0.9677, + "step": 18356 + }, + { + "epoch": 0.6574032624850037, + "grad_norm": 1.3596346378326416, + "learning_rate": 5.54945831471383e-05, + "loss": 1.1588, + "step": 18357 + }, + { + "epoch": 0.657439074614572, + "grad_norm": 1.8428336381912231, + "learning_rate": 5.548419648812346e-05, + "loss": 1.0994, + "step": 18358 + }, + { + "epoch": 0.6574748867441402, + "grad_norm": 1.3079296350479126, + "learning_rate": 5.547381042801135e-05, + "loss": 1.228, + "step": 18359 + }, + { + "epoch": 0.6575106988737085, + "grad_norm": 1.5180878639221191, + "learning_rate": 5.5463424966941676e-05, + "loss": 0.8754, + "step": 18360 + }, + { + "epoch": 0.6575465110032768, + "grad_norm": 1.4948545694351196, + "learning_rate": 5.545304010505421e-05, + "loss": 1.1171, + "step": 18361 + }, + { + "epoch": 0.6575823231328451, + "grad_norm": 1.4272316694259644, + "learning_rate": 5.54426558424886e-05, + "loss": 1.1675, + "step": 18362 + }, + { + "epoch": 0.6576181352624134, + "grad_norm": 1.9268628358840942, + "learning_rate": 5.543227217938457e-05, + "loss": 1.1615, + "step": 18363 + }, + { + "epoch": 0.6576539473919817, + "grad_norm": 1.320315957069397, + "learning_rate": 5.5421889115881875e-05, + "loss": 0.9855, + "step": 18364 + }, + { + "epoch": 0.65768975952155, + "grad_norm": 1.3576568365097046, + "learning_rate": 5.5411506652120115e-05, + "loss": 1.109, + "step": 18365 + }, + { + "epoch": 0.6577255716511182, + "grad_norm": 1.1837900876998901, + "learning_rate": 5.540112478823902e-05, + "loss": 1.0286, + "step": 18366 + }, + { + "epoch": 0.6577613837806865, + "grad_norm": 1.4112797975540161, + "learning_rate": 5.5390743524378266e-05, + "loss": 1.2267, + "step": 18367 + }, + { + "epoch": 0.6577971959102548, + "grad_norm": 1.305327296257019, + "learning_rate": 5.538036286067756e-05, + "loss": 0.9808, + "step": 18368 + }, + { + "epoch": 0.6578330080398231, + "grad_norm": 1.833990216255188, + "learning_rate": 5.5369982797276454e-05, + "loss": 1.1777, + "step": 18369 + }, + { + "epoch": 0.6578688201693914, + "grad_norm": 1.7496569156646729, + "learning_rate": 5.5359603334314695e-05, + "loss": 1.0773, + "step": 18370 + }, + { + "epoch": 0.6579046322989597, + "grad_norm": 1.6340532302856445, + "learning_rate": 5.534922447193187e-05, + "loss": 1.2972, + "step": 18371 + }, + { + "epoch": 0.657940444428528, + "grad_norm": 1.2719128131866455, + "learning_rate": 5.533884621026767e-05, + "loss": 0.9554, + "step": 18372 + }, + { + "epoch": 0.6579762565580962, + "grad_norm": 1.3992173671722412, + "learning_rate": 5.5328468549461657e-05, + "loss": 1.2282, + "step": 18373 + }, + { + "epoch": 0.6580120686876645, + "grad_norm": 1.3444710969924927, + "learning_rate": 5.531809148965347e-05, + "loss": 1.1751, + "step": 18374 + }, + { + "epoch": 0.6580478808172328, + "grad_norm": 1.4329817295074463, + "learning_rate": 5.530771503098278e-05, + "loss": 1.0783, + "step": 18375 + }, + { + "epoch": 0.6580836929468011, + "grad_norm": 1.3897101879119873, + "learning_rate": 5.529733917358908e-05, + "loss": 1.1826, + "step": 18376 + }, + { + "epoch": 0.6581195050763694, + "grad_norm": 1.4492679834365845, + "learning_rate": 5.528696391761201e-05, + "loss": 1.1935, + "step": 18377 + }, + { + "epoch": 0.6581553172059377, + "grad_norm": 1.3895550966262817, + "learning_rate": 5.527658926319119e-05, + "loss": 1.1332, + "step": 18378 + }, + { + "epoch": 0.6581911293355059, + "grad_norm": 1.4840306043624878, + "learning_rate": 5.52662152104662e-05, + "loss": 0.9482, + "step": 18379 + }, + { + "epoch": 0.6582269414650742, + "grad_norm": 1.7069804668426514, + "learning_rate": 5.5255841759576544e-05, + "loss": 0.9677, + "step": 18380 + }, + { + "epoch": 0.6582627535946425, + "grad_norm": 1.4013601541519165, + "learning_rate": 5.524546891066182e-05, + "loss": 1.1544, + "step": 18381 + }, + { + "epoch": 0.6582985657242107, + "grad_norm": 1.5793122053146362, + "learning_rate": 5.5235096663861617e-05, + "loss": 1.3661, + "step": 18382 + }, + { + "epoch": 0.6583343778537791, + "grad_norm": 1.3272737264633179, + "learning_rate": 5.5224725019315416e-05, + "loss": 1.1324, + "step": 18383 + }, + { + "epoch": 0.6583701899833474, + "grad_norm": 1.8925758600234985, + "learning_rate": 5.521435397716278e-05, + "loss": 1.137, + "step": 18384 + }, + { + "epoch": 0.6584060021129157, + "grad_norm": 1.2608404159545898, + "learning_rate": 5.520398353754324e-05, + "loss": 1.0429, + "step": 18385 + }, + { + "epoch": 0.6584418142424839, + "grad_norm": 1.501800298690796, + "learning_rate": 5.519361370059637e-05, + "loss": 0.9513, + "step": 18386 + }, + { + "epoch": 0.6584776263720522, + "grad_norm": 1.5214550495147705, + "learning_rate": 5.518324446646157e-05, + "loss": 1.2385, + "step": 18387 + }, + { + "epoch": 0.6585134385016205, + "grad_norm": 1.4434940814971924, + "learning_rate": 5.517287583527843e-05, + "loss": 1.0507, + "step": 18388 + }, + { + "epoch": 0.6585492506311887, + "grad_norm": 1.6266754865646362, + "learning_rate": 5.51625078071864e-05, + "loss": 1.0184, + "step": 18389 + }, + { + "epoch": 0.6585850627607571, + "grad_norm": 1.495844841003418, + "learning_rate": 5.5152140382325044e-05, + "loss": 1.0987, + "step": 18390 + }, + { + "epoch": 0.6586208748903254, + "grad_norm": 1.5164891481399536, + "learning_rate": 5.5141773560833756e-05, + "loss": 1.277, + "step": 18391 + }, + { + "epoch": 0.6586566870198937, + "grad_norm": 1.512982964515686, + "learning_rate": 5.5131407342852026e-05, + "loss": 0.9517, + "step": 18392 + }, + { + "epoch": 0.6586924991494619, + "grad_norm": 1.5049121379852295, + "learning_rate": 5.5121041728519386e-05, + "loss": 1.1116, + "step": 18393 + }, + { + "epoch": 0.6587283112790302, + "grad_norm": 1.388453483581543, + "learning_rate": 5.5110676717975194e-05, + "loss": 0.9121, + "step": 18394 + }, + { + "epoch": 0.6587641234085985, + "grad_norm": 1.2856383323669434, + "learning_rate": 5.510031231135895e-05, + "loss": 1.2043, + "step": 18395 + }, + { + "epoch": 0.6587999355381667, + "grad_norm": 1.5987050533294678, + "learning_rate": 5.508994850881008e-05, + "loss": 1.0834, + "step": 18396 + }, + { + "epoch": 0.6588357476677351, + "grad_norm": 1.9622114896774292, + "learning_rate": 5.507958531046806e-05, + "loss": 1.0543, + "step": 18397 + }, + { + "epoch": 0.6588715597973034, + "grad_norm": 1.3600480556488037, + "learning_rate": 5.506922271647228e-05, + "loss": 1.2956, + "step": 18398 + }, + { + "epoch": 0.6589073719268717, + "grad_norm": 1.7969027757644653, + "learning_rate": 5.505886072696208e-05, + "loss": 1.0757, + "step": 18399 + }, + { + "epoch": 0.6589431840564399, + "grad_norm": 1.4916549921035767, + "learning_rate": 5.504849934207701e-05, + "loss": 1.0777, + "step": 18400 + }, + { + "epoch": 0.6589789961860082, + "grad_norm": 1.4490392208099365, + "learning_rate": 5.503813856195637e-05, + "loss": 1.005, + "step": 18401 + }, + { + "epoch": 0.6590148083155765, + "grad_norm": 1.4333384037017822, + "learning_rate": 5.5027778386739606e-05, + "loss": 1.1057, + "step": 18402 + }, + { + "epoch": 0.6590506204451447, + "grad_norm": 1.3539772033691406, + "learning_rate": 5.5017418816565994e-05, + "loss": 1.1501, + "step": 18403 + }, + { + "epoch": 0.6590864325747131, + "grad_norm": 2.176102876663208, + "learning_rate": 5.500705985157508e-05, + "loss": 1.1854, + "step": 18404 + }, + { + "epoch": 0.6591222447042814, + "grad_norm": 1.3929228782653809, + "learning_rate": 5.499670149190609e-05, + "loss": 0.961, + "step": 18405 + }, + { + "epoch": 0.6591580568338496, + "grad_norm": 1.4723964929580688, + "learning_rate": 5.498634373769843e-05, + "loss": 1.3084, + "step": 18406 + }, + { + "epoch": 0.6591938689634179, + "grad_norm": 1.5582033395767212, + "learning_rate": 5.497598658909149e-05, + "loss": 1.1587, + "step": 18407 + }, + { + "epoch": 0.6592296810929862, + "grad_norm": 1.26312255859375, + "learning_rate": 5.496563004622455e-05, + "loss": 1.0575, + "step": 18408 + }, + { + "epoch": 0.6592654932225545, + "grad_norm": 1.835058569908142, + "learning_rate": 5.495527410923699e-05, + "loss": 1.1347, + "step": 18409 + }, + { + "epoch": 0.6593013053521227, + "grad_norm": 1.7792950868606567, + "learning_rate": 5.494491877826804e-05, + "loss": 0.9637, + "step": 18410 + }, + { + "epoch": 0.6593371174816911, + "grad_norm": 1.6124378442764282, + "learning_rate": 5.493456405345716e-05, + "loss": 1.0882, + "step": 18411 + }, + { + "epoch": 0.6593729296112594, + "grad_norm": 1.4512195587158203, + "learning_rate": 5.492420993494357e-05, + "loss": 1.0611, + "step": 18412 + }, + { + "epoch": 0.6594087417408276, + "grad_norm": 1.5000754594802856, + "learning_rate": 5.491385642286662e-05, + "loss": 0.9012, + "step": 18413 + }, + { + "epoch": 0.6594445538703959, + "grad_norm": 1.4879530668258667, + "learning_rate": 5.49035035173655e-05, + "loss": 1.0324, + "step": 18414 + }, + { + "epoch": 0.6594803659999642, + "grad_norm": 1.580729603767395, + "learning_rate": 5.4893151218579655e-05, + "loss": 1.205, + "step": 18415 + }, + { + "epoch": 0.6595161781295324, + "grad_norm": 1.6071738004684448, + "learning_rate": 5.488279952664826e-05, + "loss": 1.0652, + "step": 18416 + }, + { + "epoch": 0.6595519902591007, + "grad_norm": 2.3391757011413574, + "learning_rate": 5.4872448441710536e-05, + "loss": 1.2124, + "step": 18417 + }, + { + "epoch": 0.659587802388669, + "grad_norm": 1.6058940887451172, + "learning_rate": 5.4862097963905865e-05, + "loss": 1.3175, + "step": 18418 + }, + { + "epoch": 0.6596236145182374, + "grad_norm": 1.104469656944275, + "learning_rate": 5.485174809337342e-05, + "loss": 1.0054, + "step": 18419 + }, + { + "epoch": 0.6596594266478056, + "grad_norm": 1.6655898094177246, + "learning_rate": 5.484139883025251e-05, + "loss": 1.1308, + "step": 18420 + }, + { + "epoch": 0.6596952387773739, + "grad_norm": 1.5326205492019653, + "learning_rate": 5.4831050174682243e-05, + "loss": 1.1657, + "step": 18421 + }, + { + "epoch": 0.6597310509069422, + "grad_norm": 1.6571145057678223, + "learning_rate": 5.482070212680201e-05, + "loss": 1.0485, + "step": 18422 + }, + { + "epoch": 0.6597668630365104, + "grad_norm": 2.307868480682373, + "learning_rate": 5.481035468675092e-05, + "loss": 1.0724, + "step": 18423 + }, + { + "epoch": 0.6598026751660787, + "grad_norm": 1.407983660697937, + "learning_rate": 5.4800007854668254e-05, + "loss": 0.9857, + "step": 18424 + }, + { + "epoch": 0.659838487295647, + "grad_norm": 2.099175453186035, + "learning_rate": 5.478966163069313e-05, + "loss": 1.2261, + "step": 18425 + }, + { + "epoch": 0.6598742994252154, + "grad_norm": 1.7716071605682373, + "learning_rate": 5.47793160149648e-05, + "loss": 1.2027, + "step": 18426 + }, + { + "epoch": 0.6599101115547836, + "grad_norm": 1.641731858253479, + "learning_rate": 5.476897100762248e-05, + "loss": 1.2639, + "step": 18427 + }, + { + "epoch": 0.6599459236843519, + "grad_norm": 1.4149317741394043, + "learning_rate": 5.475862660880529e-05, + "loss": 0.8778, + "step": 18428 + }, + { + "epoch": 0.6599817358139202, + "grad_norm": 1.5069407224655151, + "learning_rate": 5.4748282818652386e-05, + "loss": 1.071, + "step": 18429 + }, + { + "epoch": 0.6600175479434884, + "grad_norm": 1.469603180885315, + "learning_rate": 5.473793963730299e-05, + "loss": 1.0531, + "step": 18430 + }, + { + "epoch": 0.6600533600730567, + "grad_norm": 1.3424152135849, + "learning_rate": 5.4727597064896276e-05, + "loss": 1.1031, + "step": 18431 + }, + { + "epoch": 0.660089172202625, + "grad_norm": 1.5385997295379639, + "learning_rate": 5.4717255101571253e-05, + "loss": 1.0258, + "step": 18432 + }, + { + "epoch": 0.6601249843321934, + "grad_norm": 1.7189658880233765, + "learning_rate": 5.470691374746724e-05, + "loss": 1.1926, + "step": 18433 + }, + { + "epoch": 0.6601607964617616, + "grad_norm": 1.7740979194641113, + "learning_rate": 5.469657300272326e-05, + "loss": 1.0846, + "step": 18434 + }, + { + "epoch": 0.6601966085913299, + "grad_norm": 1.5048547983169556, + "learning_rate": 5.468623286747844e-05, + "loss": 1.0152, + "step": 18435 + }, + { + "epoch": 0.6602324207208982, + "grad_norm": 1.419838786125183, + "learning_rate": 5.4675893341871886e-05, + "loss": 1.1222, + "step": 18436 + }, + { + "epoch": 0.6602682328504664, + "grad_norm": 1.1325900554656982, + "learning_rate": 5.4665554426042734e-05, + "loss": 0.7439, + "step": 18437 + }, + { + "epoch": 0.6603040449800347, + "grad_norm": 1.5532561540603638, + "learning_rate": 5.465521612013012e-05, + "loss": 1.0732, + "step": 18438 + }, + { + "epoch": 0.660339857109603, + "grad_norm": 1.786676287651062, + "learning_rate": 5.464487842427302e-05, + "loss": 1.1802, + "step": 18439 + }, + { + "epoch": 0.6603756692391713, + "grad_norm": 1.3403334617614746, + "learning_rate": 5.463454133861059e-05, + "loss": 0.9879, + "step": 18440 + }, + { + "epoch": 0.6604114813687396, + "grad_norm": 1.2948917150497437, + "learning_rate": 5.462420486328188e-05, + "loss": 0.9624, + "step": 18441 + }, + { + "epoch": 0.6604472934983079, + "grad_norm": 1.3447091579437256, + "learning_rate": 5.461386899842601e-05, + "loss": 0.8906, + "step": 18442 + }, + { + "epoch": 0.6604831056278762, + "grad_norm": 1.4649485349655151, + "learning_rate": 5.460353374418195e-05, + "loss": 1.1334, + "step": 18443 + }, + { + "epoch": 0.6605189177574444, + "grad_norm": 1.4757870435714722, + "learning_rate": 5.459319910068879e-05, + "loss": 1.0392, + "step": 18444 + }, + { + "epoch": 0.6605547298870127, + "grad_norm": 1.7399227619171143, + "learning_rate": 5.4582865068085585e-05, + "loss": 1.1335, + "step": 18445 + }, + { + "epoch": 0.660590542016581, + "grad_norm": 1.9548158645629883, + "learning_rate": 5.4572531646511325e-05, + "loss": 1.2238, + "step": 18446 + }, + { + "epoch": 0.6606263541461493, + "grad_norm": 2.3486692905426025, + "learning_rate": 5.456219883610505e-05, + "loss": 1.1332, + "step": 18447 + }, + { + "epoch": 0.6606621662757176, + "grad_norm": 1.3052996397018433, + "learning_rate": 5.455186663700578e-05, + "loss": 1.1527, + "step": 18448 + }, + { + "epoch": 0.6606979784052859, + "grad_norm": 1.574581265449524, + "learning_rate": 5.4541535049352566e-05, + "loss": 1.2269, + "step": 18449 + }, + { + "epoch": 0.6607337905348541, + "grad_norm": 1.7830984592437744, + "learning_rate": 5.4531204073284316e-05, + "loss": 1.2944, + "step": 18450 + }, + { + "epoch": 0.6607696026644224, + "grad_norm": 1.4104132652282715, + "learning_rate": 5.4520873708940056e-05, + "loss": 1.0453, + "step": 18451 + }, + { + "epoch": 0.6608054147939907, + "grad_norm": 1.69120454788208, + "learning_rate": 5.451054395645883e-05, + "loss": 1.2539, + "step": 18452 + }, + { + "epoch": 0.660841226923559, + "grad_norm": 1.5550456047058105, + "learning_rate": 5.450021481597951e-05, + "loss": 1.0303, + "step": 18453 + }, + { + "epoch": 0.6608770390531273, + "grad_norm": 1.445150375366211, + "learning_rate": 5.448988628764111e-05, + "loss": 1.0131, + "step": 18454 + }, + { + "epoch": 0.6609128511826956, + "grad_norm": 1.6260451078414917, + "learning_rate": 5.4479558371582584e-05, + "loss": 1.0234, + "step": 18455 + }, + { + "epoch": 0.6609486633122639, + "grad_norm": 1.7272660732269287, + "learning_rate": 5.446923106794293e-05, + "loss": 1.1762, + "step": 18456 + }, + { + "epoch": 0.6609844754418321, + "grad_norm": 1.512112021446228, + "learning_rate": 5.4458904376860997e-05, + "loss": 1.144, + "step": 18457 + }, + { + "epoch": 0.6610202875714004, + "grad_norm": 1.3669382333755493, + "learning_rate": 5.444857829847576e-05, + "loss": 0.9027, + "step": 18458 + }, + { + "epoch": 0.6610560997009687, + "grad_norm": 1.3794848918914795, + "learning_rate": 5.443825283292615e-05, + "loss": 1.0483, + "step": 18459 + }, + { + "epoch": 0.661091911830537, + "grad_norm": 1.496660590171814, + "learning_rate": 5.4427927980351124e-05, + "loss": 1.0186, + "step": 18460 + }, + { + "epoch": 0.6611277239601053, + "grad_norm": 1.541720986366272, + "learning_rate": 5.441760374088949e-05, + "loss": 0.9184, + "step": 18461 + }, + { + "epoch": 0.6611635360896736, + "grad_norm": 1.2969837188720703, + "learning_rate": 5.4407280114680206e-05, + "loss": 0.9283, + "step": 18462 + }, + { + "epoch": 0.6611993482192419, + "grad_norm": 1.5762553215026855, + "learning_rate": 5.439695710186219e-05, + "loss": 1.2459, + "step": 18463 + }, + { + "epoch": 0.6612351603488101, + "grad_norm": 1.360571265220642, + "learning_rate": 5.4386634702574255e-05, + "loss": 1.131, + "step": 18464 + }, + { + "epoch": 0.6612709724783784, + "grad_norm": 1.4218459129333496, + "learning_rate": 5.437631291695533e-05, + "loss": 1.1512, + "step": 18465 + }, + { + "epoch": 0.6613067846079467, + "grad_norm": 1.3147660493850708, + "learning_rate": 5.436599174514425e-05, + "loss": 1.0685, + "step": 18466 + }, + { + "epoch": 0.6613425967375149, + "grad_norm": 1.5185575485229492, + "learning_rate": 5.435567118727993e-05, + "loss": 1.0175, + "step": 18467 + }, + { + "epoch": 0.6613784088670833, + "grad_norm": 1.4610904455184937, + "learning_rate": 5.434535124350113e-05, + "loss": 1.2456, + "step": 18468 + }, + { + "epoch": 0.6614142209966516, + "grad_norm": 1.7700928449630737, + "learning_rate": 5.433503191394675e-05, + "loss": 1.1408, + "step": 18469 + }, + { + "epoch": 0.6614500331262199, + "grad_norm": 1.4224052429199219, + "learning_rate": 5.432471319875565e-05, + "loss": 0.9961, + "step": 18470 + }, + { + "epoch": 0.6614858452557881, + "grad_norm": 1.4033277034759521, + "learning_rate": 5.431439509806657e-05, + "loss": 0.9778, + "step": 18471 + }, + { + "epoch": 0.6615216573853564, + "grad_norm": 1.9221237897872925, + "learning_rate": 5.4304077612018375e-05, + "loss": 1.2438, + "step": 18472 + }, + { + "epoch": 0.6615574695149247, + "grad_norm": 1.667985200881958, + "learning_rate": 5.429376074074988e-05, + "loss": 1.1463, + "step": 18473 + }, + { + "epoch": 0.6615932816444929, + "grad_norm": 1.5099172592163086, + "learning_rate": 5.4283444484399904e-05, + "loss": 0.9611, + "step": 18474 + }, + { + "epoch": 0.6616290937740613, + "grad_norm": 2.3868532180786133, + "learning_rate": 5.427312884310718e-05, + "loss": 1.151, + "step": 18475 + }, + { + "epoch": 0.6616649059036296, + "grad_norm": 1.4513636827468872, + "learning_rate": 5.426281381701053e-05, + "loss": 1.0492, + "step": 18476 + }, + { + "epoch": 0.6617007180331979, + "grad_norm": 1.3454926013946533, + "learning_rate": 5.4252499406248724e-05, + "loss": 1.031, + "step": 18477 + }, + { + "epoch": 0.6617365301627661, + "grad_norm": 1.5649527311325073, + "learning_rate": 5.424218561096055e-05, + "loss": 1.2961, + "step": 18478 + }, + { + "epoch": 0.6617723422923344, + "grad_norm": 1.4537311792373657, + "learning_rate": 5.423187243128472e-05, + "loss": 1.0194, + "step": 18479 + }, + { + "epoch": 0.6618081544219027, + "grad_norm": 1.5628637075424194, + "learning_rate": 5.4221559867360014e-05, + "loss": 1.1336, + "step": 18480 + }, + { + "epoch": 0.6618439665514709, + "grad_norm": 1.2771893739700317, + "learning_rate": 5.4211247919325206e-05, + "loss": 1.135, + "step": 18481 + }, + { + "epoch": 0.6618797786810393, + "grad_norm": 1.369863748550415, + "learning_rate": 5.4200936587318954e-05, + "loss": 1.0796, + "step": 18482 + }, + { + "epoch": 0.6619155908106076, + "grad_norm": 2.4346745014190674, + "learning_rate": 5.4190625871480016e-05, + "loss": 1.2691, + "step": 18483 + }, + { + "epoch": 0.6619514029401758, + "grad_norm": 1.2996374368667603, + "learning_rate": 5.4180315771947123e-05, + "loss": 1.0197, + "step": 18484 + }, + { + "epoch": 0.6619872150697441, + "grad_norm": 1.5432522296905518, + "learning_rate": 5.417000628885902e-05, + "loss": 1.2229, + "step": 18485 + }, + { + "epoch": 0.6620230271993124, + "grad_norm": 1.7987451553344727, + "learning_rate": 5.415969742235432e-05, + "loss": 1.3288, + "step": 18486 + }, + { + "epoch": 0.6620588393288807, + "grad_norm": 2.6872942447662354, + "learning_rate": 5.414938917257177e-05, + "loss": 1.1855, + "step": 18487 + }, + { + "epoch": 0.6620946514584489, + "grad_norm": 1.494395136833191, + "learning_rate": 5.4139081539650084e-05, + "loss": 1.1199, + "step": 18488 + }, + { + "epoch": 0.6621304635880173, + "grad_norm": 1.1656666994094849, + "learning_rate": 5.412877452372784e-05, + "loss": 1.0777, + "step": 18489 + }, + { + "epoch": 0.6621662757175856, + "grad_norm": 1.4290134906768799, + "learning_rate": 5.411846812494379e-05, + "loss": 1.1053, + "step": 18490 + }, + { + "epoch": 0.6622020878471538, + "grad_norm": 1.87358558177948, + "learning_rate": 5.410816234343656e-05, + "loss": 1.1484, + "step": 18491 + }, + { + "epoch": 0.6622378999767221, + "grad_norm": 1.794234037399292, + "learning_rate": 5.4097857179344846e-05, + "loss": 1.1149, + "step": 18492 + }, + { + "epoch": 0.6622737121062904, + "grad_norm": 1.6008050441741943, + "learning_rate": 5.4087552632807225e-05, + "loss": 1.0473, + "step": 18493 + }, + { + "epoch": 0.6623095242358586, + "grad_norm": 1.4798280000686646, + "learning_rate": 5.407724870396235e-05, + "loss": 1.1452, + "step": 18494 + }, + { + "epoch": 0.6623453363654269, + "grad_norm": 1.3584426641464233, + "learning_rate": 5.4066945392948896e-05, + "loss": 0.8443, + "step": 18495 + }, + { + "epoch": 0.6623811484949953, + "grad_norm": 1.5057393312454224, + "learning_rate": 5.40566426999054e-05, + "loss": 1.1199, + "step": 18496 + }, + { + "epoch": 0.6624169606245636, + "grad_norm": 1.63643217086792, + "learning_rate": 5.404634062497057e-05, + "loss": 1.1193, + "step": 18497 + }, + { + "epoch": 0.6624527727541318, + "grad_norm": 1.6695643663406372, + "learning_rate": 5.403603916828286e-05, + "loss": 1.2489, + "step": 18498 + }, + { + "epoch": 0.6624885848837001, + "grad_norm": 1.499293565750122, + "learning_rate": 5.4025738329981035e-05, + "loss": 1.0457, + "step": 18499 + }, + { + "epoch": 0.6625243970132684, + "grad_norm": 1.5231025218963623, + "learning_rate": 5.401543811020356e-05, + "loss": 0.9986, + "step": 18500 + }, + { + "epoch": 0.6625602091428366, + "grad_norm": 1.740864872932434, + "learning_rate": 5.400513850908905e-05, + "loss": 1.0953, + "step": 18501 + }, + { + "epoch": 0.6625960212724049, + "grad_norm": 1.8209627866744995, + "learning_rate": 5.3994839526776065e-05, + "loss": 1.1989, + "step": 18502 + }, + { + "epoch": 0.6626318334019733, + "grad_norm": 1.291778326034546, + "learning_rate": 5.398454116340322e-05, + "loss": 1.063, + "step": 18503 + }, + { + "epoch": 0.6626676455315416, + "grad_norm": 1.3225425481796265, + "learning_rate": 5.3974243419109016e-05, + "loss": 0.8634, + "step": 18504 + }, + { + "epoch": 0.6627034576611098, + "grad_norm": 1.6983028650283813, + "learning_rate": 5.396394629403192e-05, + "loss": 1.223, + "step": 18505 + }, + { + "epoch": 0.6627392697906781, + "grad_norm": 1.3330436944961548, + "learning_rate": 5.395364978831061e-05, + "loss": 1.1823, + "step": 18506 + }, + { + "epoch": 0.6627750819202464, + "grad_norm": 1.4920791387557983, + "learning_rate": 5.394335390208352e-05, + "loss": 0.941, + "step": 18507 + }, + { + "epoch": 0.6628108940498146, + "grad_norm": 1.2776877880096436, + "learning_rate": 5.393305863548924e-05, + "loss": 1.0403, + "step": 18508 + }, + { + "epoch": 0.6628467061793829, + "grad_norm": 1.5033775568008423, + "learning_rate": 5.392276398866615e-05, + "loss": 1.1051, + "step": 18509 + }, + { + "epoch": 0.6628825183089513, + "grad_norm": 1.7318774461746216, + "learning_rate": 5.391246996175291e-05, + "loss": 1.2117, + "step": 18510 + }, + { + "epoch": 0.6629183304385196, + "grad_norm": 1.9151196479797363, + "learning_rate": 5.39021765548879e-05, + "loss": 1.2755, + "step": 18511 + }, + { + "epoch": 0.6629541425680878, + "grad_norm": 1.3497365713119507, + "learning_rate": 5.3891883768209686e-05, + "loss": 1.0651, + "step": 18512 + }, + { + "epoch": 0.6629899546976561, + "grad_norm": 1.6987885236740112, + "learning_rate": 5.388159160185665e-05, + "loss": 1.1132, + "step": 18513 + }, + { + "epoch": 0.6630257668272244, + "grad_norm": 1.483980417251587, + "learning_rate": 5.387130005596732e-05, + "loss": 1.1761, + "step": 18514 + }, + { + "epoch": 0.6630615789567926, + "grad_norm": 2.1761763095855713, + "learning_rate": 5.386100913068017e-05, + "loss": 1.1961, + "step": 18515 + }, + { + "epoch": 0.6630973910863609, + "grad_norm": 1.4752002954483032, + "learning_rate": 5.385071882613357e-05, + "loss": 0.9217, + "step": 18516 + }, + { + "epoch": 0.6631332032159293, + "grad_norm": 1.5494556427001953, + "learning_rate": 5.3840429142466096e-05, + "loss": 1.3383, + "step": 18517 + }, + { + "epoch": 0.6631690153454975, + "grad_norm": 1.4992222785949707, + "learning_rate": 5.383014007981606e-05, + "loss": 1.1244, + "step": 18518 + }, + { + "epoch": 0.6632048274750658, + "grad_norm": 1.4771902561187744, + "learning_rate": 5.381985163832197e-05, + "loss": 1.1688, + "step": 18519 + }, + { + "epoch": 0.6632406396046341, + "grad_norm": 1.775254726409912, + "learning_rate": 5.380956381812213e-05, + "loss": 1.2149, + "step": 18520 + }, + { + "epoch": 0.6632764517342024, + "grad_norm": 1.4852595329284668, + "learning_rate": 5.379927661935511e-05, + "loss": 0.96, + "step": 18521 + }, + { + "epoch": 0.6633122638637706, + "grad_norm": 1.3732534646987915, + "learning_rate": 5.3788990042159224e-05, + "loss": 1.0637, + "step": 18522 + }, + { + "epoch": 0.6633480759933389, + "grad_norm": 1.27687406539917, + "learning_rate": 5.377870408667285e-05, + "loss": 1.0546, + "step": 18523 + }, + { + "epoch": 0.6633838881229073, + "grad_norm": 1.3258260488510132, + "learning_rate": 5.3768418753034375e-05, + "loss": 1.1757, + "step": 18524 + }, + { + "epoch": 0.6634197002524755, + "grad_norm": 1.5699853897094727, + "learning_rate": 5.375813404138219e-05, + "loss": 1.0714, + "step": 18525 + }, + { + "epoch": 0.6634555123820438, + "grad_norm": 1.7335506677627563, + "learning_rate": 5.37478499518547e-05, + "loss": 1.0338, + "step": 18526 + }, + { + "epoch": 0.6634913245116121, + "grad_norm": 1.2418124675750732, + "learning_rate": 5.3737566484590164e-05, + "loss": 1.0338, + "step": 18527 + }, + { + "epoch": 0.6635271366411803, + "grad_norm": 1.5092723369598389, + "learning_rate": 5.372728363972706e-05, + "loss": 1.0801, + "step": 18528 + }, + { + "epoch": 0.6635629487707486, + "grad_norm": 1.633535385131836, + "learning_rate": 5.371700141740364e-05, + "loss": 1.4599, + "step": 18529 + }, + { + "epoch": 0.6635987609003169, + "grad_norm": 1.3785096406936646, + "learning_rate": 5.3706719817758286e-05, + "loss": 1.1273, + "step": 18530 + }, + { + "epoch": 0.6636345730298853, + "grad_norm": 2.2319834232330322, + "learning_rate": 5.3696438840929276e-05, + "loss": 1.1656, + "step": 18531 + }, + { + "epoch": 0.6636703851594535, + "grad_norm": 1.5463837385177612, + "learning_rate": 5.368615848705496e-05, + "loss": 1.3502, + "step": 18532 + }, + { + "epoch": 0.6637061972890218, + "grad_norm": 1.384433388710022, + "learning_rate": 5.367587875627367e-05, + "loss": 0.9771, + "step": 18533 + }, + { + "epoch": 0.6637420094185901, + "grad_norm": 1.7168207168579102, + "learning_rate": 5.366559964872364e-05, + "loss": 1.0415, + "step": 18534 + }, + { + "epoch": 0.6637778215481583, + "grad_norm": 1.4393194913864136, + "learning_rate": 5.36553211645432e-05, + "loss": 1.0659, + "step": 18535 + }, + { + "epoch": 0.6638136336777266, + "grad_norm": 1.7489509582519531, + "learning_rate": 5.3645043303870634e-05, + "loss": 1.1185, + "step": 18536 + }, + { + "epoch": 0.6638494458072949, + "grad_norm": 1.3783447742462158, + "learning_rate": 5.363476606684425e-05, + "loss": 1.1217, + "step": 18537 + }, + { + "epoch": 0.6638852579368633, + "grad_norm": 1.692649483680725, + "learning_rate": 5.3624489453602255e-05, + "loss": 1.1438, + "step": 18538 + }, + { + "epoch": 0.6639210700664315, + "grad_norm": 1.9972944259643555, + "learning_rate": 5.361421346428294e-05, + "loss": 1.1808, + "step": 18539 + }, + { + "epoch": 0.6639568821959998, + "grad_norm": 1.4380379915237427, + "learning_rate": 5.3603938099024576e-05, + "loss": 1.0541, + "step": 18540 + }, + { + "epoch": 0.6639926943255681, + "grad_norm": 1.415486216545105, + "learning_rate": 5.359366335796534e-05, + "loss": 1.0485, + "step": 18541 + }, + { + "epoch": 0.6640285064551363, + "grad_norm": 1.3865599632263184, + "learning_rate": 5.35833892412435e-05, + "loss": 1.2005, + "step": 18542 + }, + { + "epoch": 0.6640643185847046, + "grad_norm": 1.6615639925003052, + "learning_rate": 5.3573115748997284e-05, + "loss": 1.0779, + "step": 18543 + }, + { + "epoch": 0.6641001307142729, + "grad_norm": 1.366500735282898, + "learning_rate": 5.356284288136496e-05, + "loss": 0.9646, + "step": 18544 + }, + { + "epoch": 0.6641359428438413, + "grad_norm": 1.48382568359375, + "learning_rate": 5.3552570638484644e-05, + "loss": 0.9698, + "step": 18545 + }, + { + "epoch": 0.6641717549734095, + "grad_norm": 1.7583261728286743, + "learning_rate": 5.3542299020494567e-05, + "loss": 1.092, + "step": 18546 + }, + { + "epoch": 0.6642075671029778, + "grad_norm": 1.5075451135635376, + "learning_rate": 5.3532028027532947e-05, + "loss": 1.2283, + "step": 18547 + }, + { + "epoch": 0.6642433792325461, + "grad_norm": 1.8192150592803955, + "learning_rate": 5.352175765973797e-05, + "loss": 1.1821, + "step": 18548 + }, + { + "epoch": 0.6642791913621143, + "grad_norm": 1.5484038591384888, + "learning_rate": 5.351148791724776e-05, + "loss": 1.0683, + "step": 18549 + }, + { + "epoch": 0.6643150034916826, + "grad_norm": 1.4940142631530762, + "learning_rate": 5.3501218800200514e-05, + "loss": 1.1228, + "step": 18550 + }, + { + "epoch": 0.6643508156212509, + "grad_norm": 1.3943732976913452, + "learning_rate": 5.349095030873443e-05, + "loss": 1.0993, + "step": 18551 + }, + { + "epoch": 0.6643866277508192, + "grad_norm": 1.586138367652893, + "learning_rate": 5.348068244298758e-05, + "loss": 1.0312, + "step": 18552 + }, + { + "epoch": 0.6644224398803875, + "grad_norm": 1.4804174900054932, + "learning_rate": 5.347041520309815e-05, + "loss": 1.1396, + "step": 18553 + }, + { + "epoch": 0.6644582520099558, + "grad_norm": 1.2784844636917114, + "learning_rate": 5.346014858920425e-05, + "loss": 1.1233, + "step": 18554 + }, + { + "epoch": 0.664494064139524, + "grad_norm": 1.7303351163864136, + "learning_rate": 5.3449882601444054e-05, + "loss": 0.9111, + "step": 18555 + }, + { + "epoch": 0.6645298762690923, + "grad_norm": 2.047459363937378, + "learning_rate": 5.343961723995561e-05, + "loss": 1.2276, + "step": 18556 + }, + { + "epoch": 0.6645656883986606, + "grad_norm": 1.264215350151062, + "learning_rate": 5.342935250487706e-05, + "loss": 1.0442, + "step": 18557 + }, + { + "epoch": 0.6646015005282289, + "grad_norm": 1.328842282295227, + "learning_rate": 5.341908839634654e-05, + "loss": 1.2149, + "step": 18558 + }, + { + "epoch": 0.6646373126577972, + "grad_norm": 1.4805550575256348, + "learning_rate": 5.340882491450205e-05, + "loss": 1.0857, + "step": 18559 + }, + { + "epoch": 0.6646731247873655, + "grad_norm": 1.5711193084716797, + "learning_rate": 5.339856205948175e-05, + "loss": 1.2415, + "step": 18560 + }, + { + "epoch": 0.6647089369169338, + "grad_norm": 1.5882036685943604, + "learning_rate": 5.338829983142366e-05, + "loss": 1.3734, + "step": 18561 + }, + { + "epoch": 0.664744749046502, + "grad_norm": 1.537758469581604, + "learning_rate": 5.337803823046592e-05, + "loss": 1.0652, + "step": 18562 + }, + { + "epoch": 0.6647805611760703, + "grad_norm": 1.6673702001571655, + "learning_rate": 5.33677772567465e-05, + "loss": 1.1128, + "step": 18563 + }, + { + "epoch": 0.6648163733056386, + "grad_norm": 1.8126939535140991, + "learning_rate": 5.335751691040348e-05, + "loss": 1.3232, + "step": 18564 + }, + { + "epoch": 0.6648521854352069, + "grad_norm": 1.7400511503219604, + "learning_rate": 5.334725719157492e-05, + "loss": 1.0872, + "step": 18565 + }, + { + "epoch": 0.6648879975647752, + "grad_norm": 1.4815597534179688, + "learning_rate": 5.333699810039885e-05, + "loss": 1.1756, + "step": 18566 + }, + { + "epoch": 0.6649238096943435, + "grad_norm": 1.466475009918213, + "learning_rate": 5.3326739637013255e-05, + "loss": 1.173, + "step": 18567 + }, + { + "epoch": 0.6649596218239118, + "grad_norm": 1.476586103439331, + "learning_rate": 5.3316481801556173e-05, + "loss": 1.1812, + "step": 18568 + }, + { + "epoch": 0.66499543395348, + "grad_norm": 1.4618239402770996, + "learning_rate": 5.3306224594165654e-05, + "loss": 1.1282, + "step": 18569 + }, + { + "epoch": 0.6650312460830483, + "grad_norm": 1.51393723487854, + "learning_rate": 5.3295968014979613e-05, + "loss": 1.2619, + "step": 18570 + }, + { + "epoch": 0.6650670582126166, + "grad_norm": 1.3129875659942627, + "learning_rate": 5.328571206413607e-05, + "loss": 0.8838, + "step": 18571 + }, + { + "epoch": 0.6651028703421848, + "grad_norm": 1.495316982269287, + "learning_rate": 5.3275456741773025e-05, + "loss": 1.2334, + "step": 18572 + }, + { + "epoch": 0.6651386824717532, + "grad_norm": 1.7674304246902466, + "learning_rate": 5.3265202048028474e-05, + "loss": 1.3237, + "step": 18573 + }, + { + "epoch": 0.6651744946013215, + "grad_norm": 1.518075942993164, + "learning_rate": 5.32549479830403e-05, + "loss": 0.9684, + "step": 18574 + }, + { + "epoch": 0.6652103067308898, + "grad_norm": 1.425788164138794, + "learning_rate": 5.324469454694651e-05, + "loss": 1.0285, + "step": 18575 + }, + { + "epoch": 0.665246118860458, + "grad_norm": 1.3974822759628296, + "learning_rate": 5.323444173988509e-05, + "loss": 1.1068, + "step": 18576 + }, + { + "epoch": 0.6652819309900263, + "grad_norm": 1.7130515575408936, + "learning_rate": 5.3224189561993886e-05, + "loss": 1.2086, + "step": 18577 + }, + { + "epoch": 0.6653177431195946, + "grad_norm": 1.6248695850372314, + "learning_rate": 5.321393801341088e-05, + "loss": 0.8891, + "step": 18578 + }, + { + "epoch": 0.6653535552491628, + "grad_norm": 1.699680209159851, + "learning_rate": 5.320368709427399e-05, + "loss": 1.1559, + "step": 18579 + }, + { + "epoch": 0.6653893673787312, + "grad_norm": 1.5290061235427856, + "learning_rate": 5.3193436804721154e-05, + "loss": 1.117, + "step": 18580 + }, + { + "epoch": 0.6654251795082995, + "grad_norm": 1.2987916469573975, + "learning_rate": 5.318318714489021e-05, + "loss": 1.1938, + "step": 18581 + }, + { + "epoch": 0.6654609916378678, + "grad_norm": 1.3020135164260864, + "learning_rate": 5.317293811491911e-05, + "loss": 0.9511, + "step": 18582 + }, + { + "epoch": 0.665496803767436, + "grad_norm": 1.5631312131881714, + "learning_rate": 5.316268971494571e-05, + "loss": 1.0555, + "step": 18583 + }, + { + "epoch": 0.6655326158970043, + "grad_norm": 1.2770154476165771, + "learning_rate": 5.315244194510795e-05, + "loss": 1.156, + "step": 18584 + }, + { + "epoch": 0.6655684280265726, + "grad_norm": 1.46540105342865, + "learning_rate": 5.3142194805543625e-05, + "loss": 0.9943, + "step": 18585 + }, + { + "epoch": 0.6656042401561408, + "grad_norm": 1.3069692850112915, + "learning_rate": 5.313194829639061e-05, + "loss": 1.0788, + "step": 18586 + }, + { + "epoch": 0.6656400522857092, + "grad_norm": 1.4490855932235718, + "learning_rate": 5.312170241778682e-05, + "loss": 1.1539, + "step": 18587 + }, + { + "epoch": 0.6656758644152775, + "grad_norm": 1.3520973920822144, + "learning_rate": 5.311145716987003e-05, + "loss": 1.0918, + "step": 18588 + }, + { + "epoch": 0.6657116765448458, + "grad_norm": 1.36529541015625, + "learning_rate": 5.310121255277809e-05, + "loss": 0.9495, + "step": 18589 + }, + { + "epoch": 0.665747488674414, + "grad_norm": 1.6337894201278687, + "learning_rate": 5.3090968566648836e-05, + "loss": 0.9942, + "step": 18590 + }, + { + "epoch": 0.6657833008039823, + "grad_norm": 1.3431097269058228, + "learning_rate": 5.308072521162013e-05, + "loss": 1.0675, + "step": 18591 + }, + { + "epoch": 0.6658191129335506, + "grad_norm": 1.510436773300171, + "learning_rate": 5.307048248782975e-05, + "loss": 1.0725, + "step": 18592 + }, + { + "epoch": 0.6658549250631188, + "grad_norm": 1.4562135934829712, + "learning_rate": 5.306024039541542e-05, + "loss": 1.0633, + "step": 18593 + }, + { + "epoch": 0.6658907371926872, + "grad_norm": 1.399817705154419, + "learning_rate": 5.3049998934515076e-05, + "loss": 1.0726, + "step": 18594 + }, + { + "epoch": 0.6659265493222555, + "grad_norm": 1.3983818292617798, + "learning_rate": 5.30397581052664e-05, + "loss": 0.9668, + "step": 18595 + }, + { + "epoch": 0.6659623614518237, + "grad_norm": 1.7343345880508423, + "learning_rate": 5.302951790780725e-05, + "loss": 1.2694, + "step": 18596 + }, + { + "epoch": 0.665998173581392, + "grad_norm": 1.7100926637649536, + "learning_rate": 5.3019278342275256e-05, + "loss": 1.0266, + "step": 18597 + }, + { + "epoch": 0.6660339857109603, + "grad_norm": 1.4932270050048828, + "learning_rate": 5.300903940880837e-05, + "loss": 1.1716, + "step": 18598 + }, + { + "epoch": 0.6660697978405286, + "grad_norm": 1.2972445487976074, + "learning_rate": 5.299880110754418e-05, + "loss": 1.1589, + "step": 18599 + }, + { + "epoch": 0.6661056099700968, + "grad_norm": 1.436098337173462, + "learning_rate": 5.298856343862051e-05, + "loss": 1.0194, + "step": 18600 + }, + { + "epoch": 0.6661414220996652, + "grad_norm": 2.0076236724853516, + "learning_rate": 5.2978326402175125e-05, + "loss": 1.0048, + "step": 18601 + }, + { + "epoch": 0.6661772342292335, + "grad_norm": 1.3750168085098267, + "learning_rate": 5.296808999834565e-05, + "loss": 0.8912, + "step": 18602 + }, + { + "epoch": 0.6662130463588017, + "grad_norm": 1.6760252714157104, + "learning_rate": 5.295785422726991e-05, + "loss": 1.2182, + "step": 18603 + }, + { + "epoch": 0.66624885848837, + "grad_norm": 1.3809266090393066, + "learning_rate": 5.2947619089085463e-05, + "loss": 0.9642, + "step": 18604 + }, + { + "epoch": 0.6662846706179383, + "grad_norm": 1.3613613843917847, + "learning_rate": 5.2937384583930204e-05, + "loss": 1.1325, + "step": 18605 + }, + { + "epoch": 0.6663204827475065, + "grad_norm": 1.504321813583374, + "learning_rate": 5.2927150711941675e-05, + "loss": 0.9255, + "step": 18606 + }, + { + "epoch": 0.6663562948770748, + "grad_norm": 1.4037843942642212, + "learning_rate": 5.2916917473257665e-05, + "loss": 1.2071, + "step": 18607 + }, + { + "epoch": 0.6663921070066432, + "grad_norm": 1.1252381801605225, + "learning_rate": 5.2906684868015724e-05, + "loss": 1.1964, + "step": 18608 + }, + { + "epoch": 0.6664279191362115, + "grad_norm": 1.3836299180984497, + "learning_rate": 5.2896452896353656e-05, + "loss": 1.0082, + "step": 18609 + }, + { + "epoch": 0.6664637312657797, + "grad_norm": 1.8429192304611206, + "learning_rate": 5.2886221558409065e-05, + "loss": 1.0661, + "step": 18610 + }, + { + "epoch": 0.666499543395348, + "grad_norm": 1.5939115285873413, + "learning_rate": 5.287599085431951e-05, + "loss": 1.3033, + "step": 18611 + }, + { + "epoch": 0.6665353555249163, + "grad_norm": 1.4291563034057617, + "learning_rate": 5.2865760784222786e-05, + "loss": 1.0357, + "step": 18612 + }, + { + "epoch": 0.6665711676544845, + "grad_norm": 1.9966652393341064, + "learning_rate": 5.2855531348256424e-05, + "loss": 1.2544, + "step": 18613 + }, + { + "epoch": 0.6666069797840528, + "grad_norm": 1.4189540147781372, + "learning_rate": 5.2845302546558105e-05, + "loss": 1.1431, + "step": 18614 + }, + { + "epoch": 0.6666427919136212, + "grad_norm": 1.7503957748413086, + "learning_rate": 5.283507437926534e-05, + "loss": 1.2074, + "step": 18615 + }, + { + "epoch": 0.6666786040431895, + "grad_norm": 2.0085127353668213, + "learning_rate": 5.2824846846515886e-05, + "loss": 1.0472, + "step": 18616 + }, + { + "epoch": 0.6667144161727577, + "grad_norm": 1.2629061937332153, + "learning_rate": 5.281461994844723e-05, + "loss": 0.9751, + "step": 18617 + }, + { + "epoch": 0.666750228302326, + "grad_norm": 1.8417394161224365, + "learning_rate": 5.280439368519703e-05, + "loss": 1.1453, + "step": 18618 + }, + { + "epoch": 0.6667860404318943, + "grad_norm": 1.6843594312667847, + "learning_rate": 5.27941680569028e-05, + "loss": 1.1799, + "step": 18619 + }, + { + "epoch": 0.6668218525614625, + "grad_norm": 1.5870686769485474, + "learning_rate": 5.2783943063702155e-05, + "loss": 1.2747, + "step": 18620 + }, + { + "epoch": 0.6668576646910308, + "grad_norm": 1.7671478986740112, + "learning_rate": 5.277371870573269e-05, + "loss": 1.2076, + "step": 18621 + }, + { + "epoch": 0.6668934768205992, + "grad_norm": 1.3245611190795898, + "learning_rate": 5.276349498313188e-05, + "loss": 1.0558, + "step": 18622 + }, + { + "epoch": 0.6669292889501675, + "grad_norm": 1.5661931037902832, + "learning_rate": 5.2753271896037316e-05, + "loss": 1.0426, + "step": 18623 + }, + { + "epoch": 0.6669651010797357, + "grad_norm": 1.0920370817184448, + "learning_rate": 5.274304944458652e-05, + "loss": 1.064, + "step": 18624 + }, + { + "epoch": 0.667000913209304, + "grad_norm": 1.7455086708068848, + "learning_rate": 5.273282762891709e-05, + "loss": 1.1122, + "step": 18625 + }, + { + "epoch": 0.6670367253388723, + "grad_norm": 1.8705031871795654, + "learning_rate": 5.2722606449166426e-05, + "loss": 0.9378, + "step": 18626 + }, + { + "epoch": 0.6670725374684405, + "grad_norm": 1.4019595384597778, + "learning_rate": 5.271238590547216e-05, + "loss": 1.2289, + "step": 18627 + }, + { + "epoch": 0.6671083495980088, + "grad_norm": 1.4902230501174927, + "learning_rate": 5.270216599797176e-05, + "loss": 1.0233, + "step": 18628 + }, + { + "epoch": 0.6671441617275772, + "grad_norm": 1.3395354747772217, + "learning_rate": 5.269194672680267e-05, + "loss": 1.0981, + "step": 18629 + }, + { + "epoch": 0.6671799738571454, + "grad_norm": 1.3309260606765747, + "learning_rate": 5.268172809210241e-05, + "loss": 0.9449, + "step": 18630 + }, + { + "epoch": 0.6672157859867137, + "grad_norm": 1.6592530012130737, + "learning_rate": 5.267151009400846e-05, + "loss": 1.1628, + "step": 18631 + }, + { + "epoch": 0.667251598116282, + "grad_norm": 1.4192795753479004, + "learning_rate": 5.266129273265834e-05, + "loss": 1.1637, + "step": 18632 + }, + { + "epoch": 0.6672874102458503, + "grad_norm": 1.7511637210845947, + "learning_rate": 5.2651076008189415e-05, + "loss": 1.0263, + "step": 18633 + }, + { + "epoch": 0.6673232223754185, + "grad_norm": 1.699882984161377, + "learning_rate": 5.2640859920739194e-05, + "loss": 1.2694, + "step": 18634 + }, + { + "epoch": 0.6673590345049868, + "grad_norm": 1.6140600442886353, + "learning_rate": 5.263064447044511e-05, + "loss": 1.2481, + "step": 18635 + }, + { + "epoch": 0.6673948466345552, + "grad_norm": 1.4193347692489624, + "learning_rate": 5.262042965744465e-05, + "loss": 1.0773, + "step": 18636 + }, + { + "epoch": 0.6674306587641234, + "grad_norm": 1.4779380559921265, + "learning_rate": 5.261021548187515e-05, + "loss": 0.9583, + "step": 18637 + }, + { + "epoch": 0.6674664708936917, + "grad_norm": 1.3881362676620483, + "learning_rate": 5.260000194387407e-05, + "loss": 0.9244, + "step": 18638 + }, + { + "epoch": 0.66750228302326, + "grad_norm": 1.3384313583374023, + "learning_rate": 5.2589789043578855e-05, + "loss": 1.0126, + "step": 18639 + }, + { + "epoch": 0.6675380951528282, + "grad_norm": 1.4837567806243896, + "learning_rate": 5.257957678112684e-05, + "loss": 0.966, + "step": 18640 + }, + { + "epoch": 0.6675739072823965, + "grad_norm": 1.5075520277023315, + "learning_rate": 5.2569365156655446e-05, + "loss": 1.3272, + "step": 18641 + }, + { + "epoch": 0.6676097194119648, + "grad_norm": 1.361773133277893, + "learning_rate": 5.255915417030206e-05, + "loss": 1.053, + "step": 18642 + }, + { + "epoch": 0.6676455315415332, + "grad_norm": 1.188388705253601, + "learning_rate": 5.254894382220412e-05, + "loss": 0.9691, + "step": 18643 + }, + { + "epoch": 0.6676813436711014, + "grad_norm": 1.5707200765609741, + "learning_rate": 5.2538734112498876e-05, + "loss": 1.1829, + "step": 18644 + }, + { + "epoch": 0.6677171558006697, + "grad_norm": 1.6064763069152832, + "learning_rate": 5.252852504132375e-05, + "loss": 1.061, + "step": 18645 + }, + { + "epoch": 0.667752967930238, + "grad_norm": 1.8150862455368042, + "learning_rate": 5.251831660881612e-05, + "loss": 1.1122, + "step": 18646 + }, + { + "epoch": 0.6677887800598062, + "grad_norm": 1.5770410299301147, + "learning_rate": 5.2508108815113264e-05, + "loss": 1.0678, + "step": 18647 + }, + { + "epoch": 0.6678245921893745, + "grad_norm": 1.8293339014053345, + "learning_rate": 5.249790166035253e-05, + "loss": 0.9381, + "step": 18648 + }, + { + "epoch": 0.6678604043189428, + "grad_norm": 1.4187695980072021, + "learning_rate": 5.2487695144671264e-05, + "loss": 1.2278, + "step": 18649 + }, + { + "epoch": 0.6678962164485112, + "grad_norm": 1.4541594982147217, + "learning_rate": 5.247748926820683e-05, + "loss": 0.9478, + "step": 18650 + }, + { + "epoch": 0.6679320285780794, + "grad_norm": 1.3882440328598022, + "learning_rate": 5.246728403109642e-05, + "loss": 1.2546, + "step": 18651 + }, + { + "epoch": 0.6679678407076477, + "grad_norm": 1.9255290031433105, + "learning_rate": 5.245707943347738e-05, + "loss": 0.9499, + "step": 18652 + }, + { + "epoch": 0.668003652837216, + "grad_norm": 1.6289005279541016, + "learning_rate": 5.244687547548703e-05, + "loss": 0.8971, + "step": 18653 + }, + { + "epoch": 0.6680394649667842, + "grad_norm": 1.6965276002883911, + "learning_rate": 5.243667215726267e-05, + "loss": 1.1836, + "step": 18654 + }, + { + "epoch": 0.6680752770963525, + "grad_norm": 1.5805102586746216, + "learning_rate": 5.242646947894148e-05, + "loss": 0.9393, + "step": 18655 + }, + { + "epoch": 0.6681110892259208, + "grad_norm": 1.4305185079574585, + "learning_rate": 5.241626744066079e-05, + "loss": 1.2184, + "step": 18656 + }, + { + "epoch": 0.6681469013554892, + "grad_norm": 1.3084726333618164, + "learning_rate": 5.240606604255787e-05, + "loss": 0.9298, + "step": 18657 + }, + { + "epoch": 0.6681827134850574, + "grad_norm": 1.4808506965637207, + "learning_rate": 5.239586528476992e-05, + "loss": 1.0909, + "step": 18658 + }, + { + "epoch": 0.6682185256146257, + "grad_norm": 1.6126905679702759, + "learning_rate": 5.2385665167434175e-05, + "loss": 1.048, + "step": 18659 + }, + { + "epoch": 0.668254337744194, + "grad_norm": 1.5076686143875122, + "learning_rate": 5.2375465690687895e-05, + "loss": 1.0235, + "step": 18660 + }, + { + "epoch": 0.6682901498737622, + "grad_norm": 1.4185060262680054, + "learning_rate": 5.236526685466834e-05, + "loss": 1.1788, + "step": 18661 + }, + { + "epoch": 0.6683259620033305, + "grad_norm": 1.3801301717758179, + "learning_rate": 5.235506865951263e-05, + "loss": 1.0821, + "step": 18662 + }, + { + "epoch": 0.6683617741328988, + "grad_norm": 1.4840028285980225, + "learning_rate": 5.234487110535802e-05, + "loss": 1.0406, + "step": 18663 + }, + { + "epoch": 0.6683975862624671, + "grad_norm": 1.2509026527404785, + "learning_rate": 5.233467419234173e-05, + "loss": 1.0585, + "step": 18664 + }, + { + "epoch": 0.6684333983920354, + "grad_norm": 1.497056484222412, + "learning_rate": 5.2324477920600876e-05, + "loss": 0.9768, + "step": 18665 + }, + { + "epoch": 0.6684692105216037, + "grad_norm": 1.5788006782531738, + "learning_rate": 5.231428229027269e-05, + "loss": 1.0457, + "step": 18666 + }, + { + "epoch": 0.668505022651172, + "grad_norm": 1.5258945226669312, + "learning_rate": 5.23040873014943e-05, + "loss": 1.048, + "step": 18667 + }, + { + "epoch": 0.6685408347807402, + "grad_norm": 1.4885551929473877, + "learning_rate": 5.229389295440295e-05, + "loss": 1.217, + "step": 18668 + }, + { + "epoch": 0.6685766469103085, + "grad_norm": 1.4547051191329956, + "learning_rate": 5.228369924913567e-05, + "loss": 0.881, + "step": 18669 + }, + { + "epoch": 0.6686124590398768, + "grad_norm": 2.255619525909424, + "learning_rate": 5.22735061858297e-05, + "loss": 1.032, + "step": 18670 + }, + { + "epoch": 0.6686482711694451, + "grad_norm": 1.2502721548080444, + "learning_rate": 5.2263313764622124e-05, + "loss": 0.8687, + "step": 18671 + }, + { + "epoch": 0.6686840832990134, + "grad_norm": 1.3420237302780151, + "learning_rate": 5.225312198565013e-05, + "loss": 1.143, + "step": 18672 + }, + { + "epoch": 0.6687198954285817, + "grad_norm": 1.4458361864089966, + "learning_rate": 5.224293084905074e-05, + "loss": 1.0446, + "step": 18673 + }, + { + "epoch": 0.66875570755815, + "grad_norm": 1.4120969772338867, + "learning_rate": 5.223274035496113e-05, + "loss": 1.1579, + "step": 18674 + }, + { + "epoch": 0.6687915196877182, + "grad_norm": 1.2632777690887451, + "learning_rate": 5.222255050351841e-05, + "loss": 1.1595, + "step": 18675 + }, + { + "epoch": 0.6688273318172865, + "grad_norm": 1.3815758228302002, + "learning_rate": 5.221236129485961e-05, + "loss": 0.8145, + "step": 18676 + }, + { + "epoch": 0.6688631439468548, + "grad_norm": 1.1857318878173828, + "learning_rate": 5.2202172729121844e-05, + "loss": 1.0715, + "step": 18677 + }, + { + "epoch": 0.6688989560764231, + "grad_norm": 1.7823607921600342, + "learning_rate": 5.219198480644221e-05, + "loss": 1.24, + "step": 18678 + }, + { + "epoch": 0.6689347682059914, + "grad_norm": 1.468261957168579, + "learning_rate": 5.2181797526957764e-05, + "loss": 1.1165, + "step": 18679 + }, + { + "epoch": 0.6689705803355597, + "grad_norm": 1.6573960781097412, + "learning_rate": 5.2171610890805524e-05, + "loss": 0.9808, + "step": 18680 + }, + { + "epoch": 0.6690063924651279, + "grad_norm": 1.4681601524353027, + "learning_rate": 5.216142489812256e-05, + "loss": 1.0897, + "step": 18681 + }, + { + "epoch": 0.6690422045946962, + "grad_norm": 1.7345912456512451, + "learning_rate": 5.215123954904596e-05, + "loss": 1.0899, + "step": 18682 + }, + { + "epoch": 0.6690780167242645, + "grad_norm": 1.251541018486023, + "learning_rate": 5.2141054843712675e-05, + "loss": 1.067, + "step": 18683 + }, + { + "epoch": 0.6691138288538327, + "grad_norm": 1.2250401973724365, + "learning_rate": 5.213087078225975e-05, + "loss": 1.175, + "step": 18684 + }, + { + "epoch": 0.6691496409834011, + "grad_norm": 1.5026860237121582, + "learning_rate": 5.212068736482423e-05, + "loss": 1.3494, + "step": 18685 + }, + { + "epoch": 0.6691854531129694, + "grad_norm": 1.2857197523117065, + "learning_rate": 5.211050459154313e-05, + "loss": 1.1533, + "step": 18686 + }, + { + "epoch": 0.6692212652425377, + "grad_norm": 1.7913817167282104, + "learning_rate": 5.210032246255338e-05, + "loss": 1.0498, + "step": 18687 + }, + { + "epoch": 0.6692570773721059, + "grad_norm": 1.7068698406219482, + "learning_rate": 5.209014097799201e-05, + "loss": 1.1362, + "step": 18688 + }, + { + "epoch": 0.6692928895016742, + "grad_norm": 1.7611156702041626, + "learning_rate": 5.207996013799603e-05, + "loss": 1.1116, + "step": 18689 + }, + { + "epoch": 0.6693287016312425, + "grad_norm": 2.1859631538391113, + "learning_rate": 5.206977994270233e-05, + "loss": 1.22, + "step": 18690 + }, + { + "epoch": 0.6693645137608107, + "grad_norm": 1.886218786239624, + "learning_rate": 5.205960039224795e-05, + "loss": 1.1942, + "step": 18691 + }, + { + "epoch": 0.6694003258903791, + "grad_norm": 1.7146146297454834, + "learning_rate": 5.2049421486769744e-05, + "loss": 1.2008, + "step": 18692 + }, + { + "epoch": 0.6694361380199474, + "grad_norm": 1.3550589084625244, + "learning_rate": 5.203924322640479e-05, + "loss": 0.9513, + "step": 18693 + }, + { + "epoch": 0.6694719501495157, + "grad_norm": 1.2934885025024414, + "learning_rate": 5.2029065611289926e-05, + "loss": 1.1736, + "step": 18694 + }, + { + "epoch": 0.6695077622790839, + "grad_norm": 1.3973313570022583, + "learning_rate": 5.2018888641562126e-05, + "loss": 0.9498, + "step": 18695 + }, + { + "epoch": 0.6695435744086522, + "grad_norm": 1.4840930700302124, + "learning_rate": 5.200871231735822e-05, + "loss": 1.084, + "step": 18696 + }, + { + "epoch": 0.6695793865382205, + "grad_norm": 1.4668978452682495, + "learning_rate": 5.1998536638815266e-05, + "loss": 1.1517, + "step": 18697 + }, + { + "epoch": 0.6696151986677887, + "grad_norm": 1.2886923551559448, + "learning_rate": 5.198836160607008e-05, + "loss": 1.0905, + "step": 18698 + }, + { + "epoch": 0.6696510107973571, + "grad_norm": 1.4364475011825562, + "learning_rate": 5.197818721925949e-05, + "loss": 1.1292, + "step": 18699 + }, + { + "epoch": 0.6696868229269254, + "grad_norm": 1.3651903867721558, + "learning_rate": 5.196801347852051e-05, + "loss": 0.9775, + "step": 18700 + }, + { + "epoch": 0.6697226350564937, + "grad_norm": 1.4661824703216553, + "learning_rate": 5.195784038398992e-05, + "loss": 1.1738, + "step": 18701 + }, + { + "epoch": 0.6697584471860619, + "grad_norm": 1.1258591413497925, + "learning_rate": 5.194766793580466e-05, + "loss": 0.8086, + "step": 18702 + }, + { + "epoch": 0.6697942593156302, + "grad_norm": 1.5824363231658936, + "learning_rate": 5.193749613410146e-05, + "loss": 0.9887, + "step": 18703 + }, + { + "epoch": 0.6698300714451985, + "grad_norm": 1.5720601081848145, + "learning_rate": 5.1927324979017335e-05, + "loss": 1.0943, + "step": 18704 + }, + { + "epoch": 0.6698658835747667, + "grad_norm": 1.5334759950637817, + "learning_rate": 5.191715447068901e-05, + "loss": 0.8846, + "step": 18705 + }, + { + "epoch": 0.6699016957043351, + "grad_norm": 1.900829792022705, + "learning_rate": 5.190698460925338e-05, + "loss": 1.2143, + "step": 18706 + }, + { + "epoch": 0.6699375078339034, + "grad_norm": 1.2251733541488647, + "learning_rate": 5.1896815394847195e-05, + "loss": 1.1434, + "step": 18707 + }, + { + "epoch": 0.6699733199634716, + "grad_norm": 1.8243491649627686, + "learning_rate": 5.188664682760731e-05, + "loss": 1.2712, + "step": 18708 + }, + { + "epoch": 0.6700091320930399, + "grad_norm": 1.3230359554290771, + "learning_rate": 5.1876478907670576e-05, + "loss": 1.116, + "step": 18709 + }, + { + "epoch": 0.6700449442226082, + "grad_norm": 1.3189493417739868, + "learning_rate": 5.186631163517367e-05, + "loss": 0.9052, + "step": 18710 + }, + { + "epoch": 0.6700807563521765, + "grad_norm": 1.348238229751587, + "learning_rate": 5.185614501025353e-05, + "loss": 1.0331, + "step": 18711 + }, + { + "epoch": 0.6701165684817447, + "grad_norm": 1.3121615648269653, + "learning_rate": 5.184597903304681e-05, + "loss": 1.0798, + "step": 18712 + }, + { + "epoch": 0.6701523806113131, + "grad_norm": 1.9334417581558228, + "learning_rate": 5.183581370369037e-05, + "loss": 1.3115, + "step": 18713 + }, + { + "epoch": 0.6701881927408814, + "grad_norm": 1.4889501333236694, + "learning_rate": 5.182564902232086e-05, + "loss": 1.0928, + "step": 18714 + }, + { + "epoch": 0.6702240048704496, + "grad_norm": 1.8466887474060059, + "learning_rate": 5.1815484989075157e-05, + "loss": 1.0959, + "step": 18715 + }, + { + "epoch": 0.6702598170000179, + "grad_norm": 1.4551985263824463, + "learning_rate": 5.1805321604089974e-05, + "loss": 1.1443, + "step": 18716 + }, + { + "epoch": 0.6702956291295862, + "grad_norm": 1.6550787687301636, + "learning_rate": 5.1795158867501966e-05, + "loss": 1.0181, + "step": 18717 + }, + { + "epoch": 0.6703314412591544, + "grad_norm": 1.57337486743927, + "learning_rate": 5.1784996779447926e-05, + "loss": 1.0087, + "step": 18718 + }, + { + "epoch": 0.6703672533887227, + "grad_norm": 1.8528954982757568, + "learning_rate": 5.177483534006455e-05, + "loss": 1.2163, + "step": 18719 + }, + { + "epoch": 0.6704030655182911, + "grad_norm": 1.966099500656128, + "learning_rate": 5.1764674549488614e-05, + "loss": 0.8283, + "step": 18720 + }, + { + "epoch": 0.6704388776478594, + "grad_norm": 1.4578207731246948, + "learning_rate": 5.175451440785671e-05, + "loss": 1.2352, + "step": 18721 + }, + { + "epoch": 0.6704746897774276, + "grad_norm": 1.8122076988220215, + "learning_rate": 5.174435491530559e-05, + "loss": 1.2174, + "step": 18722 + }, + { + "epoch": 0.6705105019069959, + "grad_norm": 1.942992091178894, + "learning_rate": 5.173419607197193e-05, + "loss": 0.9214, + "step": 18723 + }, + { + "epoch": 0.6705463140365642, + "grad_norm": 1.5003920793533325, + "learning_rate": 5.172403787799245e-05, + "loss": 0.8448, + "step": 18724 + }, + { + "epoch": 0.6705821261661324, + "grad_norm": 1.4884060621261597, + "learning_rate": 5.1713880333503704e-05, + "loss": 1.2087, + "step": 18725 + }, + { + "epoch": 0.6706179382957007, + "grad_norm": 1.8997490406036377, + "learning_rate": 5.1703723438642436e-05, + "loss": 1.3114, + "step": 18726 + }, + { + "epoch": 0.6706537504252691, + "grad_norm": 1.6259870529174805, + "learning_rate": 5.16935671935453e-05, + "loss": 1.1036, + "step": 18727 + }, + { + "epoch": 0.6706895625548374, + "grad_norm": 2.4235270023345947, + "learning_rate": 5.1683411598348876e-05, + "loss": 1.1863, + "step": 18728 + }, + { + "epoch": 0.6707253746844056, + "grad_norm": 1.48825204372406, + "learning_rate": 5.167325665318983e-05, + "loss": 1.1955, + "step": 18729 + }, + { + "epoch": 0.6707611868139739, + "grad_norm": 1.6042503118515015, + "learning_rate": 5.1663102358204754e-05, + "loss": 1.2391, + "step": 18730 + }, + { + "epoch": 0.6707969989435422, + "grad_norm": 1.4304777383804321, + "learning_rate": 5.165294871353035e-05, + "loss": 1.0122, + "step": 18731 + }, + { + "epoch": 0.6708328110731104, + "grad_norm": 1.3864485025405884, + "learning_rate": 5.16427957193031e-05, + "loss": 1.0639, + "step": 18732 + }, + { + "epoch": 0.6708686232026787, + "grad_norm": 1.5695353746414185, + "learning_rate": 5.163264337565967e-05, + "loss": 1.2265, + "step": 18733 + }, + { + "epoch": 0.6709044353322471, + "grad_norm": 1.3843544721603394, + "learning_rate": 5.1622491682736675e-05, + "loss": 0.921, + "step": 18734 + }, + { + "epoch": 0.6709402474618154, + "grad_norm": 1.7114338874816895, + "learning_rate": 5.16123406406706e-05, + "loss": 1.3109, + "step": 18735 + }, + { + "epoch": 0.6709760595913836, + "grad_norm": 1.7240734100341797, + "learning_rate": 5.160219024959807e-05, + "loss": 1.3556, + "step": 18736 + }, + { + "epoch": 0.6710118717209519, + "grad_norm": 1.3847373723983765, + "learning_rate": 5.159204050965565e-05, + "loss": 0.875, + "step": 18737 + }, + { + "epoch": 0.6710476838505202, + "grad_norm": 1.3642833232879639, + "learning_rate": 5.158189142097991e-05, + "loss": 0.9878, + "step": 18738 + }, + { + "epoch": 0.6710834959800884, + "grad_norm": 1.2971513271331787, + "learning_rate": 5.157174298370734e-05, + "loss": 0.9623, + "step": 18739 + }, + { + "epoch": 0.6711193081096567, + "grad_norm": 1.653354287147522, + "learning_rate": 5.15615951979745e-05, + "loss": 0.9118, + "step": 18740 + }, + { + "epoch": 0.6711551202392251, + "grad_norm": 1.5033047199249268, + "learning_rate": 5.155144806391789e-05, + "loss": 1.0416, + "step": 18741 + }, + { + "epoch": 0.6711909323687933, + "grad_norm": 1.417401909828186, + "learning_rate": 5.154130158167412e-05, + "loss": 1.1373, + "step": 18742 + }, + { + "epoch": 0.6712267444983616, + "grad_norm": 1.2631627321243286, + "learning_rate": 5.153115575137959e-05, + "loss": 0.9035, + "step": 18743 + }, + { + "epoch": 0.6712625566279299, + "grad_norm": 1.8462613821029663, + "learning_rate": 5.152101057317082e-05, + "loss": 1.0692, + "step": 18744 + }, + { + "epoch": 0.6712983687574982, + "grad_norm": 2.2978386878967285, + "learning_rate": 5.151086604718438e-05, + "loss": 0.8445, + "step": 18745 + }, + { + "epoch": 0.6713341808870664, + "grad_norm": 1.283471941947937, + "learning_rate": 5.150072217355664e-05, + "loss": 0.8638, + "step": 18746 + }, + { + "epoch": 0.6713699930166347, + "grad_norm": 1.4029357433319092, + "learning_rate": 5.149057895242412e-05, + "loss": 1.1056, + "step": 18747 + }, + { + "epoch": 0.6714058051462031, + "grad_norm": 2.2969179153442383, + "learning_rate": 5.148043638392329e-05, + "loss": 0.9747, + "step": 18748 + }, + { + "epoch": 0.6714416172757713, + "grad_norm": 1.6791678667068481, + "learning_rate": 5.147029446819065e-05, + "loss": 1.1477, + "step": 18749 + }, + { + "epoch": 0.6714774294053396, + "grad_norm": 1.6851333379745483, + "learning_rate": 5.146015320536255e-05, + "loss": 0.905, + "step": 18750 + }, + { + "epoch": 0.6715132415349079, + "grad_norm": 1.450018048286438, + "learning_rate": 5.145001259557548e-05, + "loss": 1.0331, + "step": 18751 + }, + { + "epoch": 0.6715490536644761, + "grad_norm": 1.424946665763855, + "learning_rate": 5.14398726389659e-05, + "loss": 0.8618, + "step": 18752 + }, + { + "epoch": 0.6715848657940444, + "grad_norm": 1.694478988647461, + "learning_rate": 5.142973333567016e-05, + "loss": 1.0033, + "step": 18753 + }, + { + "epoch": 0.6716206779236127, + "grad_norm": 1.827938437461853, + "learning_rate": 5.141959468582471e-05, + "loss": 1.1625, + "step": 18754 + }, + { + "epoch": 0.6716564900531811, + "grad_norm": 1.4590281248092651, + "learning_rate": 5.140945668956595e-05, + "loss": 1.146, + "step": 18755 + }, + { + "epoch": 0.6716923021827493, + "grad_norm": 1.4775478839874268, + "learning_rate": 5.1399319347030306e-05, + "loss": 1.2671, + "step": 18756 + }, + { + "epoch": 0.6717281143123176, + "grad_norm": 1.737074851989746, + "learning_rate": 5.1389182658354105e-05, + "loss": 1.3872, + "step": 18757 + }, + { + "epoch": 0.6717639264418859, + "grad_norm": 1.5571104288101196, + "learning_rate": 5.137904662367373e-05, + "loss": 1.0731, + "step": 18758 + }, + { + "epoch": 0.6717997385714541, + "grad_norm": 1.5040961503982544, + "learning_rate": 5.136891124312557e-05, + "loss": 0.9967, + "step": 18759 + }, + { + "epoch": 0.6718355507010224, + "grad_norm": 1.8494561910629272, + "learning_rate": 5.135877651684603e-05, + "loss": 1.0928, + "step": 18760 + }, + { + "epoch": 0.6718713628305907, + "grad_norm": 1.651287317276001, + "learning_rate": 5.1348642444971364e-05, + "loss": 1.0824, + "step": 18761 + }, + { + "epoch": 0.6719071749601591, + "grad_norm": 1.448621153831482, + "learning_rate": 5.133850902763795e-05, + "loss": 1.0972, + "step": 18762 + }, + { + "epoch": 0.6719429870897273, + "grad_norm": 1.6723077297210693, + "learning_rate": 5.132837626498217e-05, + "loss": 1.1153, + "step": 18763 + }, + { + "epoch": 0.6719787992192956, + "grad_norm": 1.5176984071731567, + "learning_rate": 5.1318244157140285e-05, + "loss": 1.0182, + "step": 18764 + }, + { + "epoch": 0.6720146113488639, + "grad_norm": 1.300607681274414, + "learning_rate": 5.13081127042486e-05, + "loss": 0.8886, + "step": 18765 + }, + { + "epoch": 0.6720504234784321, + "grad_norm": 1.5084418058395386, + "learning_rate": 5.129798190644348e-05, + "loss": 1.0065, + "step": 18766 + }, + { + "epoch": 0.6720862356080004, + "grad_norm": 1.4171243906021118, + "learning_rate": 5.128785176386122e-05, + "loss": 1.0548, + "step": 18767 + }, + { + "epoch": 0.6721220477375687, + "grad_norm": 2.294034481048584, + "learning_rate": 5.127772227663803e-05, + "loss": 1.0868, + "step": 18768 + }, + { + "epoch": 0.672157859867137, + "grad_norm": 1.5722886323928833, + "learning_rate": 5.1267593444910254e-05, + "loss": 1.1449, + "step": 18769 + }, + { + "epoch": 0.6721936719967053, + "grad_norm": 1.5798593759536743, + "learning_rate": 5.125746526881417e-05, + "loss": 1.0436, + "step": 18770 + }, + { + "epoch": 0.6722294841262736, + "grad_norm": 1.297400951385498, + "learning_rate": 5.1247337748486005e-05, + "loss": 1.0405, + "step": 18771 + }, + { + "epoch": 0.6722652962558419, + "grad_norm": 1.6094695329666138, + "learning_rate": 5.1237210884061994e-05, + "loss": 1.0776, + "step": 18772 + }, + { + "epoch": 0.6723011083854101, + "grad_norm": 1.7391507625579834, + "learning_rate": 5.1227084675678425e-05, + "loss": 0.9404, + "step": 18773 + }, + { + "epoch": 0.6723369205149784, + "grad_norm": 2.0703444480895996, + "learning_rate": 5.121695912347156e-05, + "loss": 1.1235, + "step": 18774 + }, + { + "epoch": 0.6723727326445467, + "grad_norm": 1.3871036767959595, + "learning_rate": 5.120683422757755e-05, + "loss": 1.0932, + "step": 18775 + }, + { + "epoch": 0.672408544774115, + "grad_norm": 1.4377059936523438, + "learning_rate": 5.119670998813264e-05, + "loss": 1.0627, + "step": 18776 + }, + { + "epoch": 0.6724443569036833, + "grad_norm": 1.7169320583343506, + "learning_rate": 5.1186586405273055e-05, + "loss": 1.1322, + "step": 18777 + }, + { + "epoch": 0.6724801690332516, + "grad_norm": 3.5163705348968506, + "learning_rate": 5.117646347913501e-05, + "loss": 1.0074, + "step": 18778 + }, + { + "epoch": 0.6725159811628199, + "grad_norm": 1.452775001525879, + "learning_rate": 5.116634120985467e-05, + "loss": 1.0048, + "step": 18779 + }, + { + "epoch": 0.6725517932923881, + "grad_norm": 1.4036084413528442, + "learning_rate": 5.115621959756815e-05, + "loss": 0.938, + "step": 18780 + }, + { + "epoch": 0.6725876054219564, + "grad_norm": 1.3410178422927856, + "learning_rate": 5.1146098642411765e-05, + "loss": 1.0624, + "step": 18781 + }, + { + "epoch": 0.6726234175515247, + "grad_norm": 1.2313449382781982, + "learning_rate": 5.113597834452157e-05, + "loss": 0.9613, + "step": 18782 + }, + { + "epoch": 0.672659229681093, + "grad_norm": 1.3407132625579834, + "learning_rate": 5.1125858704033745e-05, + "loss": 1.255, + "step": 18783 + }, + { + "epoch": 0.6726950418106613, + "grad_norm": 1.6899969577789307, + "learning_rate": 5.111573972108446e-05, + "loss": 1.2876, + "step": 18784 + }, + { + "epoch": 0.6727308539402296, + "grad_norm": 1.1120318174362183, + "learning_rate": 5.1105621395809875e-05, + "loss": 0.9779, + "step": 18785 + }, + { + "epoch": 0.6727666660697978, + "grad_norm": 1.3777810335159302, + "learning_rate": 5.1095503728346095e-05, + "loss": 1.0732, + "step": 18786 + }, + { + "epoch": 0.6728024781993661, + "grad_norm": 1.8452224731445312, + "learning_rate": 5.108538671882914e-05, + "loss": 1.1976, + "step": 18787 + }, + { + "epoch": 0.6728382903289344, + "grad_norm": 1.4947082996368408, + "learning_rate": 5.10752703673953e-05, + "loss": 1.1318, + "step": 18788 + }, + { + "epoch": 0.6728741024585027, + "grad_norm": 1.6771275997161865, + "learning_rate": 5.106515467418054e-05, + "loss": 0.9911, + "step": 18789 + }, + { + "epoch": 0.672909914588071, + "grad_norm": 1.715796709060669, + "learning_rate": 5.1055039639321046e-05, + "loss": 0.9778, + "step": 18790 + }, + { + "epoch": 0.6729457267176393, + "grad_norm": 1.6228349208831787, + "learning_rate": 5.104492526295278e-05, + "loss": 1.2326, + "step": 18791 + }, + { + "epoch": 0.6729815388472076, + "grad_norm": 1.4022117853164673, + "learning_rate": 5.103481154521197e-05, + "loss": 0.8041, + "step": 18792 + }, + { + "epoch": 0.6730173509767758, + "grad_norm": 1.6268398761749268, + "learning_rate": 5.102469848623459e-05, + "loss": 0.8399, + "step": 18793 + }, + { + "epoch": 0.6730531631063441, + "grad_norm": 1.5040704011917114, + "learning_rate": 5.10145860861567e-05, + "loss": 1.0231, + "step": 18794 + }, + { + "epoch": 0.6730889752359124, + "grad_norm": 1.6484442949295044, + "learning_rate": 5.1004474345114404e-05, + "loss": 1.2006, + "step": 18795 + }, + { + "epoch": 0.6731247873654806, + "grad_norm": 1.577577829360962, + "learning_rate": 5.099436326324367e-05, + "loss": 1.1136, + "step": 18796 + }, + { + "epoch": 0.673160599495049, + "grad_norm": 1.5009734630584717, + "learning_rate": 5.098425284068062e-05, + "loss": 0.9803, + "step": 18797 + }, + { + "epoch": 0.6731964116246173, + "grad_norm": 1.3161181211471558, + "learning_rate": 5.0974143077561135e-05, + "loss": 1.082, + "step": 18798 + }, + { + "epoch": 0.6732322237541856, + "grad_norm": 1.3983811140060425, + "learning_rate": 5.0964033974021386e-05, + "loss": 1.0815, + "step": 18799 + }, + { + "epoch": 0.6732680358837538, + "grad_norm": 1.3388439416885376, + "learning_rate": 5.095392553019728e-05, + "loss": 1.1295, + "step": 18800 + }, + { + "epoch": 0.6733038480133221, + "grad_norm": 1.9327589273452759, + "learning_rate": 5.094381774622488e-05, + "loss": 1.165, + "step": 18801 + }, + { + "epoch": 0.6733396601428904, + "grad_norm": 1.995801329612732, + "learning_rate": 5.0933710622240036e-05, + "loss": 1.1961, + "step": 18802 + }, + { + "epoch": 0.6733754722724586, + "grad_norm": 1.8676629066467285, + "learning_rate": 5.0923604158378924e-05, + "loss": 1.2056, + "step": 18803 + }, + { + "epoch": 0.673411284402027, + "grad_norm": 1.6736146211624146, + "learning_rate": 5.091349835477741e-05, + "loss": 1.1102, + "step": 18804 + }, + { + "epoch": 0.6734470965315953, + "grad_norm": 1.5446724891662598, + "learning_rate": 5.0903393211571414e-05, + "loss": 1.2403, + "step": 18805 + }, + { + "epoch": 0.6734829086611636, + "grad_norm": 2.0372581481933594, + "learning_rate": 5.089328872889694e-05, + "loss": 1.0858, + "step": 18806 + }, + { + "epoch": 0.6735187207907318, + "grad_norm": 1.5590873956680298, + "learning_rate": 5.0883184906889924e-05, + "loss": 1.0245, + "step": 18807 + }, + { + "epoch": 0.6735545329203001, + "grad_norm": 1.6735327243804932, + "learning_rate": 5.087308174568632e-05, + "loss": 1.2552, + "step": 18808 + }, + { + "epoch": 0.6735903450498684, + "grad_norm": 1.4615215063095093, + "learning_rate": 5.086297924542198e-05, + "loss": 0.9411, + "step": 18809 + }, + { + "epoch": 0.6736261571794366, + "grad_norm": 1.261224627494812, + "learning_rate": 5.085287740623292e-05, + "loss": 1.149, + "step": 18810 + }, + { + "epoch": 0.673661969309005, + "grad_norm": 1.915242314338684, + "learning_rate": 5.0842776228255e-05, + "loss": 1.0285, + "step": 18811 + }, + { + "epoch": 0.6736977814385733, + "grad_norm": 1.4283499717712402, + "learning_rate": 5.083267571162412e-05, + "loss": 1.1871, + "step": 18812 + }, + { + "epoch": 0.6737335935681416, + "grad_norm": 2.207976818084717, + "learning_rate": 5.082257585647614e-05, + "loss": 1.2343, + "step": 18813 + }, + { + "epoch": 0.6737694056977098, + "grad_norm": 2.171072006225586, + "learning_rate": 5.0812476662946975e-05, + "loss": 1.0677, + "step": 18814 + }, + { + "epoch": 0.6738052178272781, + "grad_norm": 1.687248945236206, + "learning_rate": 5.0802378131172525e-05, + "loss": 1.4033, + "step": 18815 + }, + { + "epoch": 0.6738410299568464, + "grad_norm": 1.8631763458251953, + "learning_rate": 5.079228026128857e-05, + "loss": 0.9943, + "step": 18816 + }, + { + "epoch": 0.6738768420864146, + "grad_norm": 1.3755419254302979, + "learning_rate": 5.078218305343102e-05, + "loss": 1.2068, + "step": 18817 + }, + { + "epoch": 0.6739126542159829, + "grad_norm": 1.8507195711135864, + "learning_rate": 5.07720865077357e-05, + "loss": 1.0728, + "step": 18818 + }, + { + "epoch": 0.6739484663455513, + "grad_norm": 1.3623566627502441, + "learning_rate": 5.0761990624338504e-05, + "loss": 1.1979, + "step": 18819 + }, + { + "epoch": 0.6739842784751195, + "grad_norm": 1.270957112312317, + "learning_rate": 5.075189540337514e-05, + "loss": 1.1755, + "step": 18820 + }, + { + "epoch": 0.6740200906046878, + "grad_norm": 1.8727366924285889, + "learning_rate": 5.074180084498157e-05, + "loss": 1.1728, + "step": 18821 + }, + { + "epoch": 0.6740559027342561, + "grad_norm": 1.672404408454895, + "learning_rate": 5.0731706949293525e-05, + "loss": 1.1119, + "step": 18822 + }, + { + "epoch": 0.6740917148638244, + "grad_norm": 1.413801670074463, + "learning_rate": 5.072161371644677e-05, + "loss": 1.0339, + "step": 18823 + }, + { + "epoch": 0.6741275269933926, + "grad_norm": 1.5326871871948242, + "learning_rate": 5.0711521146577156e-05, + "loss": 1.046, + "step": 18824 + }, + { + "epoch": 0.6741633391229609, + "grad_norm": 1.5472455024719238, + "learning_rate": 5.070142923982043e-05, + "loss": 1.0327, + "step": 18825 + }, + { + "epoch": 0.6741991512525293, + "grad_norm": 1.6669819355010986, + "learning_rate": 5.069133799631243e-05, + "loss": 1.0919, + "step": 18826 + }, + { + "epoch": 0.6742349633820975, + "grad_norm": 1.9660276174545288, + "learning_rate": 5.0681247416188826e-05, + "loss": 1.205, + "step": 18827 + }, + { + "epoch": 0.6742707755116658, + "grad_norm": 1.569909691810608, + "learning_rate": 5.067115749958543e-05, + "loss": 1.0373, + "step": 18828 + }, + { + "epoch": 0.6743065876412341, + "grad_norm": 1.4357339143753052, + "learning_rate": 5.066106824663798e-05, + "loss": 1.0711, + "step": 18829 + }, + { + "epoch": 0.6743423997708023, + "grad_norm": 1.1876213550567627, + "learning_rate": 5.065097965748224e-05, + "loss": 1.1741, + "step": 18830 + }, + { + "epoch": 0.6743782119003706, + "grad_norm": 1.5430779457092285, + "learning_rate": 5.0640891732253905e-05, + "loss": 1.0405, + "step": 18831 + }, + { + "epoch": 0.6744140240299389, + "grad_norm": 1.4803388118743896, + "learning_rate": 5.063080447108868e-05, + "loss": 1.1579, + "step": 18832 + }, + { + "epoch": 0.6744498361595073, + "grad_norm": 1.7319990396499634, + "learning_rate": 5.0620717874122336e-05, + "loss": 1.1319, + "step": 18833 + }, + { + "epoch": 0.6744856482890755, + "grad_norm": 1.7321690320968628, + "learning_rate": 5.06106319414905e-05, + "loss": 1.4987, + "step": 18834 + }, + { + "epoch": 0.6745214604186438, + "grad_norm": 1.5797735452651978, + "learning_rate": 5.0600546673328916e-05, + "loss": 1.0711, + "step": 18835 + }, + { + "epoch": 0.6745572725482121, + "grad_norm": 1.392364740371704, + "learning_rate": 5.059046206977325e-05, + "loss": 1.4235, + "step": 18836 + }, + { + "epoch": 0.6745930846777803, + "grad_norm": 1.5466675758361816, + "learning_rate": 5.0580378130959216e-05, + "loss": 1.339, + "step": 18837 + }, + { + "epoch": 0.6746288968073486, + "grad_norm": 1.4273608922958374, + "learning_rate": 5.05702948570224e-05, + "loss": 0.9708, + "step": 18838 + }, + { + "epoch": 0.6746647089369169, + "grad_norm": 1.6830672025680542, + "learning_rate": 5.056021224809853e-05, + "loss": 1.0482, + "step": 18839 + }, + { + "epoch": 0.6747005210664853, + "grad_norm": 1.391306757926941, + "learning_rate": 5.055013030432326e-05, + "loss": 0.9047, + "step": 18840 + }, + { + "epoch": 0.6747363331960535, + "grad_norm": 2.151766300201416, + "learning_rate": 5.054004902583216e-05, + "loss": 1.109, + "step": 18841 + }, + { + "epoch": 0.6747721453256218, + "grad_norm": 2.065199613571167, + "learning_rate": 5.052996841276091e-05, + "loss": 1.0998, + "step": 18842 + }, + { + "epoch": 0.6748079574551901, + "grad_norm": 1.4419647455215454, + "learning_rate": 5.0519888465245116e-05, + "loss": 1.1785, + "step": 18843 + }, + { + "epoch": 0.6748437695847583, + "grad_norm": 1.9207993745803833, + "learning_rate": 5.050980918342043e-05, + "loss": 1.2528, + "step": 18844 + }, + { + "epoch": 0.6748795817143266, + "grad_norm": 1.422327995300293, + "learning_rate": 5.04997305674224e-05, + "loss": 1.0893, + "step": 18845 + }, + { + "epoch": 0.6749153938438949, + "grad_norm": 2.362302780151367, + "learning_rate": 5.048965261738664e-05, + "loss": 1.1656, + "step": 18846 + }, + { + "epoch": 0.6749512059734633, + "grad_norm": 1.4899520874023438, + "learning_rate": 5.047957533344874e-05, + "loss": 0.9715, + "step": 18847 + }, + { + "epoch": 0.6749870181030315, + "grad_norm": 1.3583420515060425, + "learning_rate": 5.0469498715744314e-05, + "loss": 1.0491, + "step": 18848 + }, + { + "epoch": 0.6750228302325998, + "grad_norm": 1.3858389854431152, + "learning_rate": 5.045942276440885e-05, + "loss": 1.2499, + "step": 18849 + }, + { + "epoch": 0.6750586423621681, + "grad_norm": 1.5263091325759888, + "learning_rate": 5.0449347479577946e-05, + "loss": 0.8965, + "step": 18850 + }, + { + "epoch": 0.6750944544917363, + "grad_norm": 1.9451074600219727, + "learning_rate": 5.043927286138721e-05, + "loss": 1.1574, + "step": 18851 + }, + { + "epoch": 0.6751302666213046, + "grad_norm": 1.7569549083709717, + "learning_rate": 5.0429198909972086e-05, + "loss": 1.0796, + "step": 18852 + }, + { + "epoch": 0.6751660787508729, + "grad_norm": 2.3248791694641113, + "learning_rate": 5.041912562546813e-05, + "loss": 1.1819, + "step": 18853 + }, + { + "epoch": 0.6752018908804412, + "grad_norm": 1.3482085466384888, + "learning_rate": 5.040905300801091e-05, + "loss": 0.94, + "step": 18854 + }, + { + "epoch": 0.6752377030100095, + "grad_norm": 1.7388145923614502, + "learning_rate": 5.039898105773594e-05, + "loss": 1.1815, + "step": 18855 + }, + { + "epoch": 0.6752735151395778, + "grad_norm": 1.3372715711593628, + "learning_rate": 5.038890977477866e-05, + "loss": 0.9474, + "step": 18856 + }, + { + "epoch": 0.675309327269146, + "grad_norm": 1.7970608472824097, + "learning_rate": 5.037883915927462e-05, + "loss": 1.0885, + "step": 18857 + }, + { + "epoch": 0.6753451393987143, + "grad_norm": 1.5723299980163574, + "learning_rate": 5.036876921135931e-05, + "loss": 0.9021, + "step": 18858 + }, + { + "epoch": 0.6753809515282826, + "grad_norm": 1.5603972673416138, + "learning_rate": 5.035869993116816e-05, + "loss": 1.1802, + "step": 18859 + }, + { + "epoch": 0.6754167636578509, + "grad_norm": 1.4115105867385864, + "learning_rate": 5.034863131883667e-05, + "loss": 1.0692, + "step": 18860 + }, + { + "epoch": 0.6754525757874192, + "grad_norm": 1.5618422031402588, + "learning_rate": 5.03385633745003e-05, + "loss": 1.1349, + "step": 18861 + }, + { + "epoch": 0.6754883879169875, + "grad_norm": 1.5184533596038818, + "learning_rate": 5.032849609829454e-05, + "loss": 1.1154, + "step": 18862 + }, + { + "epoch": 0.6755242000465558, + "grad_norm": 1.9580198526382446, + "learning_rate": 5.0318429490354754e-05, + "loss": 1.0126, + "step": 18863 + }, + { + "epoch": 0.675560012176124, + "grad_norm": 1.3815008401870728, + "learning_rate": 5.030836355081643e-05, + "loss": 1.04, + "step": 18864 + }, + { + "epoch": 0.6755958243056923, + "grad_norm": 1.8554223775863647, + "learning_rate": 5.0298298279814956e-05, + "loss": 1.146, + "step": 18865 + }, + { + "epoch": 0.6756316364352606, + "grad_norm": 1.5165122747421265, + "learning_rate": 5.0288233677485806e-05, + "loss": 1.099, + "step": 18866 + }, + { + "epoch": 0.6756674485648289, + "grad_norm": 1.5365660190582275, + "learning_rate": 5.027816974396432e-05, + "loss": 0.9337, + "step": 18867 + }, + { + "epoch": 0.6757032606943972, + "grad_norm": 1.5714460611343384, + "learning_rate": 5.0268106479385924e-05, + "loss": 0.9942, + "step": 18868 + }, + { + "epoch": 0.6757390728239655, + "grad_norm": 1.59028160572052, + "learning_rate": 5.025804388388604e-05, + "loss": 1.3152, + "step": 18869 + }, + { + "epoch": 0.6757748849535338, + "grad_norm": 1.6934605836868286, + "learning_rate": 5.024798195759998e-05, + "loss": 1.255, + "step": 18870 + }, + { + "epoch": 0.675810697083102, + "grad_norm": 1.4592260122299194, + "learning_rate": 5.023792070066313e-05, + "loss": 1.1922, + "step": 18871 + }, + { + "epoch": 0.6758465092126703, + "grad_norm": 1.4688720703125, + "learning_rate": 5.022786011321089e-05, + "loss": 0.9349, + "step": 18872 + }, + { + "epoch": 0.6758823213422386, + "grad_norm": 1.4715315103530884, + "learning_rate": 5.021780019537862e-05, + "loss": 1.1563, + "step": 18873 + }, + { + "epoch": 0.6759181334718068, + "grad_norm": 1.4326170682907104, + "learning_rate": 5.02077409473016e-05, + "loss": 1.2633, + "step": 18874 + }, + { + "epoch": 0.6759539456013752, + "grad_norm": 1.6059203147888184, + "learning_rate": 5.019768236911519e-05, + "loss": 1.1806, + "step": 18875 + }, + { + "epoch": 0.6759897577309435, + "grad_norm": 1.3588365316390991, + "learning_rate": 5.018762446095476e-05, + "loss": 0.9971, + "step": 18876 + }, + { + "epoch": 0.6760255698605118, + "grad_norm": 1.5859851837158203, + "learning_rate": 5.017756722295557e-05, + "loss": 0.9111, + "step": 18877 + }, + { + "epoch": 0.67606138199008, + "grad_norm": 1.6490812301635742, + "learning_rate": 5.016751065525292e-05, + "loss": 1.0308, + "step": 18878 + }, + { + "epoch": 0.6760971941196483, + "grad_norm": 1.5959761142730713, + "learning_rate": 5.015745475798215e-05, + "loss": 1.0651, + "step": 18879 + }, + { + "epoch": 0.6761330062492166, + "grad_norm": 1.3000450134277344, + "learning_rate": 5.014739953127857e-05, + "loss": 0.8701, + "step": 18880 + }, + { + "epoch": 0.6761688183787848, + "grad_norm": 1.4500625133514404, + "learning_rate": 5.013734497527739e-05, + "loss": 0.9784, + "step": 18881 + }, + { + "epoch": 0.6762046305083532, + "grad_norm": 1.4868762493133545, + "learning_rate": 5.0127291090113917e-05, + "loss": 1.1001, + "step": 18882 + }, + { + "epoch": 0.6762404426379215, + "grad_norm": 1.9462308883666992, + "learning_rate": 5.011723787592344e-05, + "loss": 1.1375, + "step": 18883 + }, + { + "epoch": 0.6762762547674898, + "grad_norm": 1.4143962860107422, + "learning_rate": 5.0107185332841155e-05, + "loss": 0.9519, + "step": 18884 + }, + { + "epoch": 0.676312066897058, + "grad_norm": 1.7359864711761475, + "learning_rate": 5.009713346100235e-05, + "loss": 1.5251, + "step": 18885 + }, + { + "epoch": 0.6763478790266263, + "grad_norm": 1.5614783763885498, + "learning_rate": 5.008708226054219e-05, + "loss": 1.3046, + "step": 18886 + }, + { + "epoch": 0.6763836911561946, + "grad_norm": 1.854400873184204, + "learning_rate": 5.007703173159604e-05, + "loss": 0.9291, + "step": 18887 + }, + { + "epoch": 0.6764195032857628, + "grad_norm": 1.8594270944595337, + "learning_rate": 5.0066981874298967e-05, + "loss": 1.1194, + "step": 18888 + }, + { + "epoch": 0.6764553154153312, + "grad_norm": 1.5475573539733887, + "learning_rate": 5.0056932688786294e-05, + "loss": 1.1783, + "step": 18889 + }, + { + "epoch": 0.6764911275448995, + "grad_norm": 1.6319698095321655, + "learning_rate": 5.00468841751931e-05, + "loss": 1.1672, + "step": 18890 + }, + { + "epoch": 0.6765269396744678, + "grad_norm": 1.4897651672363281, + "learning_rate": 5.0036836333654715e-05, + "loss": 0.9521, + "step": 18891 + }, + { + "epoch": 0.676562751804036, + "grad_norm": 1.4013245105743408, + "learning_rate": 5.0026789164306255e-05, + "loss": 1.1516, + "step": 18892 + }, + { + "epoch": 0.6765985639336043, + "grad_norm": 1.5549391508102417, + "learning_rate": 5.00167426672828e-05, + "loss": 0.9989, + "step": 18893 + }, + { + "epoch": 0.6766343760631726, + "grad_norm": 1.3875082731246948, + "learning_rate": 5.000669684271968e-05, + "loss": 1.0537, + "step": 18894 + }, + { + "epoch": 0.6766701881927408, + "grad_norm": 1.5135225057601929, + "learning_rate": 4.999665169075193e-05, + "loss": 0.9805, + "step": 18895 + }, + { + "epoch": 0.6767060003223092, + "grad_norm": 1.1904200315475464, + "learning_rate": 4.998660721151476e-05, + "loss": 0.7961, + "step": 18896 + }, + { + "epoch": 0.6767418124518775, + "grad_norm": 1.4424492120742798, + "learning_rate": 4.997656340514321e-05, + "loss": 1.0932, + "step": 18897 + }, + { + "epoch": 0.6767776245814457, + "grad_norm": 1.6106114387512207, + "learning_rate": 4.996652027177255e-05, + "loss": 1.0554, + "step": 18898 + }, + { + "epoch": 0.676813436711014, + "grad_norm": 1.820142149925232, + "learning_rate": 4.995647781153778e-05, + "loss": 1.1906, + "step": 18899 + }, + { + "epoch": 0.6768492488405823, + "grad_norm": 1.9480156898498535, + "learning_rate": 4.99464360245741e-05, + "loss": 1.1763, + "step": 18900 + }, + { + "epoch": 0.6768850609701506, + "grad_norm": 1.8290568590164185, + "learning_rate": 4.9936394911016504e-05, + "loss": 1.2559, + "step": 18901 + }, + { + "epoch": 0.6769208730997188, + "grad_norm": 1.4510433673858643, + "learning_rate": 4.992635447100015e-05, + "loss": 1.2564, + "step": 18902 + }, + { + "epoch": 0.6769566852292872, + "grad_norm": 1.7101916074752808, + "learning_rate": 4.9916314704660126e-05, + "loss": 1.049, + "step": 18903 + }, + { + "epoch": 0.6769924973588555, + "grad_norm": 1.4846951961517334, + "learning_rate": 4.9906275612131424e-05, + "loss": 1.1628, + "step": 18904 + }, + { + "epoch": 0.6770283094884237, + "grad_norm": 2.1924397945404053, + "learning_rate": 4.9896237193549244e-05, + "loss": 1.1903, + "step": 18905 + }, + { + "epoch": 0.677064121617992, + "grad_norm": 1.8705377578735352, + "learning_rate": 4.988619944904852e-05, + "loss": 1.1365, + "step": 18906 + }, + { + "epoch": 0.6770999337475603, + "grad_norm": 1.3093010187149048, + "learning_rate": 4.987616237876438e-05, + "loss": 0.9766, + "step": 18907 + }, + { + "epoch": 0.6771357458771285, + "grad_norm": 1.3773871660232544, + "learning_rate": 4.9866125982831745e-05, + "loss": 1.1091, + "step": 18908 + }, + { + "epoch": 0.6771715580066968, + "grad_norm": 1.8917542695999146, + "learning_rate": 4.9856090261385793e-05, + "loss": 0.9854, + "step": 18909 + }, + { + "epoch": 0.6772073701362652, + "grad_norm": 1.4388055801391602, + "learning_rate": 4.984605521456146e-05, + "loss": 1.0081, + "step": 18910 + }, + { + "epoch": 0.6772431822658335, + "grad_norm": 1.4424718618392944, + "learning_rate": 4.983602084249372e-05, + "loss": 1.265, + "step": 18911 + }, + { + "epoch": 0.6772789943954017, + "grad_norm": 1.3846657276153564, + "learning_rate": 4.982598714531762e-05, + "loss": 0.98, + "step": 18912 + }, + { + "epoch": 0.67731480652497, + "grad_norm": 1.5070819854736328, + "learning_rate": 4.981595412316815e-05, + "loss": 1.1701, + "step": 18913 + }, + { + "epoch": 0.6773506186545383, + "grad_norm": 1.2920361757278442, + "learning_rate": 4.980592177618031e-05, + "loss": 0.9548, + "step": 18914 + }, + { + "epoch": 0.6773864307841065, + "grad_norm": 2.704608678817749, + "learning_rate": 4.979589010448902e-05, + "loss": 1.0066, + "step": 18915 + }, + { + "epoch": 0.6774222429136748, + "grad_norm": 1.680281400680542, + "learning_rate": 4.978585910822926e-05, + "loss": 0.9851, + "step": 18916 + }, + { + "epoch": 0.6774580550432432, + "grad_norm": 1.5228948593139648, + "learning_rate": 4.977582878753599e-05, + "loss": 1.2052, + "step": 18917 + }, + { + "epoch": 0.6774938671728115, + "grad_norm": 1.2401868104934692, + "learning_rate": 4.9765799142544215e-05, + "loss": 1.0946, + "step": 18918 + }, + { + "epoch": 0.6775296793023797, + "grad_norm": 1.3843302726745605, + "learning_rate": 4.975577017338876e-05, + "loss": 0.9915, + "step": 18919 + }, + { + "epoch": 0.677565491431948, + "grad_norm": 1.3833767175674438, + "learning_rate": 4.9745741880204613e-05, + "loss": 1.0793, + "step": 18920 + }, + { + "epoch": 0.6776013035615163, + "grad_norm": 1.3466390371322632, + "learning_rate": 4.973571426312673e-05, + "loss": 0.9843, + "step": 18921 + }, + { + "epoch": 0.6776371156910845, + "grad_norm": 1.5702260732650757, + "learning_rate": 4.9725687322289926e-05, + "loss": 1.1364, + "step": 18922 + }, + { + "epoch": 0.6776729278206528, + "grad_norm": 1.421081304550171, + "learning_rate": 4.971566105782916e-05, + "loss": 1.2302, + "step": 18923 + }, + { + "epoch": 0.6777087399502212, + "grad_norm": 2.1599485874176025, + "learning_rate": 4.9705635469879306e-05, + "loss": 1.0531, + "step": 18924 + }, + { + "epoch": 0.6777445520797895, + "grad_norm": 1.3732640743255615, + "learning_rate": 4.969561055857529e-05, + "loss": 0.9393, + "step": 18925 + }, + { + "epoch": 0.6777803642093577, + "grad_norm": 1.4820538759231567, + "learning_rate": 4.9685586324051915e-05, + "loss": 1.0836, + "step": 18926 + }, + { + "epoch": 0.677816176338926, + "grad_norm": 1.535851240158081, + "learning_rate": 4.967556276644406e-05, + "loss": 1.006, + "step": 18927 + }, + { + "epoch": 0.6778519884684943, + "grad_norm": 1.3564882278442383, + "learning_rate": 4.966553988588665e-05, + "loss": 1.0316, + "step": 18928 + }, + { + "epoch": 0.6778878005980625, + "grad_norm": 1.526377558708191, + "learning_rate": 4.965551768251442e-05, + "loss": 0.981, + "step": 18929 + }, + { + "epoch": 0.6779236127276308, + "grad_norm": 1.5620564222335815, + "learning_rate": 4.9645496156462266e-05, + "loss": 1.1226, + "step": 18930 + }, + { + "epoch": 0.6779594248571992, + "grad_norm": 1.3387274742126465, + "learning_rate": 4.963547530786501e-05, + "loss": 1.1284, + "step": 18931 + }, + { + "epoch": 0.6779952369867674, + "grad_norm": 1.5250431299209595, + "learning_rate": 4.962545513685751e-05, + "loss": 1.0599, + "step": 18932 + }, + { + "epoch": 0.6780310491163357, + "grad_norm": 1.2125842571258545, + "learning_rate": 4.961543564357449e-05, + "loss": 0.9304, + "step": 18933 + }, + { + "epoch": 0.678066861245904, + "grad_norm": 1.34574556350708, + "learning_rate": 4.9605416828150795e-05, + "loss": 1.1542, + "step": 18934 + }, + { + "epoch": 0.6781026733754723, + "grad_norm": 1.2656079530715942, + "learning_rate": 4.959539869072121e-05, + "loss": 1.1858, + "step": 18935 + }, + { + "epoch": 0.6781384855050405, + "grad_norm": 1.4896721839904785, + "learning_rate": 4.958538123142056e-05, + "loss": 0.895, + "step": 18936 + }, + { + "epoch": 0.6781742976346088, + "grad_norm": 1.5530081987380981, + "learning_rate": 4.957536445038353e-05, + "loss": 1.0457, + "step": 18937 + }, + { + "epoch": 0.6782101097641772, + "grad_norm": 1.6623810529708862, + "learning_rate": 4.9565348347744934e-05, + "loss": 1.133, + "step": 18938 + }, + { + "epoch": 0.6782459218937454, + "grad_norm": 1.6524816751480103, + "learning_rate": 4.955533292363955e-05, + "loss": 1.0475, + "step": 18939 + }, + { + "epoch": 0.6782817340233137, + "grad_norm": 1.343307614326477, + "learning_rate": 4.954531817820206e-05, + "loss": 1.1672, + "step": 18940 + }, + { + "epoch": 0.678317546152882, + "grad_norm": 1.5181938409805298, + "learning_rate": 4.953530411156724e-05, + "loss": 1.0374, + "step": 18941 + }, + { + "epoch": 0.6783533582824502, + "grad_norm": 1.5941907167434692, + "learning_rate": 4.95252907238698e-05, + "loss": 1.0827, + "step": 18942 + }, + { + "epoch": 0.6783891704120185, + "grad_norm": 1.569088339805603, + "learning_rate": 4.95152780152445e-05, + "loss": 1.1803, + "step": 18943 + }, + { + "epoch": 0.6784249825415868, + "grad_norm": 1.7710870504379272, + "learning_rate": 4.9505265985825976e-05, + "loss": 1.2437, + "step": 18944 + }, + { + "epoch": 0.6784607946711552, + "grad_norm": 1.6206064224243164, + "learning_rate": 4.9495254635748975e-05, + "loss": 1.0519, + "step": 18945 + }, + { + "epoch": 0.6784966068007234, + "grad_norm": 1.493377685546875, + "learning_rate": 4.948524396514821e-05, + "loss": 1.0938, + "step": 18946 + }, + { + "epoch": 0.6785324189302917, + "grad_norm": 1.3704679012298584, + "learning_rate": 4.947523397415829e-05, + "loss": 1.1368, + "step": 18947 + }, + { + "epoch": 0.67856823105986, + "grad_norm": 1.3636376857757568, + "learning_rate": 4.9465224662913925e-05, + "loss": 1.0177, + "step": 18948 + }, + { + "epoch": 0.6786040431894282, + "grad_norm": 1.5713893175125122, + "learning_rate": 4.9455216031549766e-05, + "loss": 1.2124, + "step": 18949 + }, + { + "epoch": 0.6786398553189965, + "grad_norm": 1.3589427471160889, + "learning_rate": 4.9445208080200536e-05, + "loss": 1.0561, + "step": 18950 + }, + { + "epoch": 0.6786756674485648, + "grad_norm": 1.754638671875, + "learning_rate": 4.943520080900076e-05, + "loss": 1.2603, + "step": 18951 + }, + { + "epoch": 0.6787114795781332, + "grad_norm": 1.3987901210784912, + "learning_rate": 4.9425194218085145e-05, + "loss": 1.0907, + "step": 18952 + }, + { + "epoch": 0.6787472917077014, + "grad_norm": 1.3915766477584839, + "learning_rate": 4.94151883075883e-05, + "loss": 1.1923, + "step": 18953 + }, + { + "epoch": 0.6787831038372697, + "grad_norm": 1.6878238916397095, + "learning_rate": 4.940518307764489e-05, + "loss": 1.0328, + "step": 18954 + }, + { + "epoch": 0.678818915966838, + "grad_norm": 1.7761831283569336, + "learning_rate": 4.939517852838944e-05, + "loss": 0.9204, + "step": 18955 + }, + { + "epoch": 0.6788547280964062, + "grad_norm": 1.482427716255188, + "learning_rate": 4.938517465995659e-05, + "loss": 1.1975, + "step": 18956 + }, + { + "epoch": 0.6788905402259745, + "grad_norm": 1.5041158199310303, + "learning_rate": 4.937517147248096e-05, + "loss": 1.3207, + "step": 18957 + }, + { + "epoch": 0.6789263523555428, + "grad_norm": 1.836581826210022, + "learning_rate": 4.936516896609707e-05, + "loss": 1.1715, + "step": 18958 + }, + { + "epoch": 0.6789621644851112, + "grad_norm": 1.7969814538955688, + "learning_rate": 4.9355167140939494e-05, + "loss": 1.1178, + "step": 18959 + }, + { + "epoch": 0.6789979766146794, + "grad_norm": 1.7930432558059692, + "learning_rate": 4.934516599714284e-05, + "loss": 1.0649, + "step": 18960 + }, + { + "epoch": 0.6790337887442477, + "grad_norm": 1.4731900691986084, + "learning_rate": 4.933516553484167e-05, + "loss": 1.408, + "step": 18961 + }, + { + "epoch": 0.679069600873816, + "grad_norm": 1.3859164714813232, + "learning_rate": 4.9325165754170446e-05, + "loss": 0.8797, + "step": 18962 + }, + { + "epoch": 0.6791054130033842, + "grad_norm": 1.794975996017456, + "learning_rate": 4.931516665526376e-05, + "loss": 1.1855, + "step": 18963 + }, + { + "epoch": 0.6791412251329525, + "grad_norm": 1.6050777435302734, + "learning_rate": 4.930516823825616e-05, + "loss": 1.1595, + "step": 18964 + }, + { + "epoch": 0.6791770372625208, + "grad_norm": 1.298718810081482, + "learning_rate": 4.9295170503282095e-05, + "loss": 1.1253, + "step": 18965 + }, + { + "epoch": 0.6792128493920891, + "grad_norm": 1.7988742589950562, + "learning_rate": 4.928517345047611e-05, + "loss": 1.051, + "step": 18966 + }, + { + "epoch": 0.6792486615216574, + "grad_norm": 1.6896175146102905, + "learning_rate": 4.927517707997269e-05, + "loss": 1.0455, + "step": 18967 + }, + { + "epoch": 0.6792844736512257, + "grad_norm": 1.6045440435409546, + "learning_rate": 4.926518139190638e-05, + "loss": 1.0813, + "step": 18968 + }, + { + "epoch": 0.679320285780794, + "grad_norm": 1.797660231590271, + "learning_rate": 4.925518638641157e-05, + "loss": 1.4228, + "step": 18969 + }, + { + "epoch": 0.6793560979103622, + "grad_norm": 1.4052842855453491, + "learning_rate": 4.924519206362276e-05, + "loss": 1.0196, + "step": 18970 + }, + { + "epoch": 0.6793919100399305, + "grad_norm": 1.6116230487823486, + "learning_rate": 4.9235198423674435e-05, + "loss": 1.1139, + "step": 18971 + }, + { + "epoch": 0.6794277221694988, + "grad_norm": 1.5468086004257202, + "learning_rate": 4.9225205466701064e-05, + "loss": 1.1736, + "step": 18972 + }, + { + "epoch": 0.6794635342990671, + "grad_norm": 1.6572281122207642, + "learning_rate": 4.9215213192837064e-05, + "loss": 1.2973, + "step": 18973 + }, + { + "epoch": 0.6794993464286354, + "grad_norm": 1.2575677633285522, + "learning_rate": 4.920522160221679e-05, + "loss": 0.9793, + "step": 18974 + }, + { + "epoch": 0.6795351585582037, + "grad_norm": 1.7619749307632446, + "learning_rate": 4.91952306949748e-05, + "loss": 0.8904, + "step": 18975 + }, + { + "epoch": 0.679570970687772, + "grad_norm": 1.6427042484283447, + "learning_rate": 4.918524047124543e-05, + "loss": 1.314, + "step": 18976 + }, + { + "epoch": 0.6796067828173402, + "grad_norm": 1.218179702758789, + "learning_rate": 4.9175250931163085e-05, + "loss": 1.1155, + "step": 18977 + }, + { + "epoch": 0.6796425949469085, + "grad_norm": 1.2490037679672241, + "learning_rate": 4.916526207486219e-05, + "loss": 1.1293, + "step": 18978 + }, + { + "epoch": 0.6796784070764768, + "grad_norm": 1.9429277181625366, + "learning_rate": 4.915527390247716e-05, + "loss": 1.2455, + "step": 18979 + }, + { + "epoch": 0.6797142192060451, + "grad_norm": 2.8928332328796387, + "learning_rate": 4.914528641414233e-05, + "loss": 1.4339, + "step": 18980 + }, + { + "epoch": 0.6797500313356134, + "grad_norm": 1.4784467220306396, + "learning_rate": 4.9135299609992004e-05, + "loss": 1.017, + "step": 18981 + }, + { + "epoch": 0.6797858434651817, + "grad_norm": 1.4824602603912354, + "learning_rate": 4.912531349016067e-05, + "loss": 1.0974, + "step": 18982 + }, + { + "epoch": 0.6798216555947499, + "grad_norm": 1.4509050846099854, + "learning_rate": 4.911532805478259e-05, + "loss": 0.9632, + "step": 18983 + }, + { + "epoch": 0.6798574677243182, + "grad_norm": 1.3256309032440186, + "learning_rate": 4.910534330399219e-05, + "loss": 1.0442, + "step": 18984 + }, + { + "epoch": 0.6798932798538865, + "grad_norm": 1.608473539352417, + "learning_rate": 4.909535923792365e-05, + "loss": 1.1014, + "step": 18985 + }, + { + "epoch": 0.6799290919834547, + "grad_norm": 1.459423542022705, + "learning_rate": 4.9085375856711465e-05, + "loss": 1.2462, + "step": 18986 + }, + { + "epoch": 0.6799649041130231, + "grad_norm": 1.5557119846343994, + "learning_rate": 4.907539316048985e-05, + "loss": 1.1013, + "step": 18987 + }, + { + "epoch": 0.6800007162425914, + "grad_norm": 2.46038556098938, + "learning_rate": 4.906541114939313e-05, + "loss": 1.2274, + "step": 18988 + }, + { + "epoch": 0.6800365283721597, + "grad_norm": 1.734032154083252, + "learning_rate": 4.9055429823555624e-05, + "loss": 1.1541, + "step": 18989 + }, + { + "epoch": 0.6800723405017279, + "grad_norm": 1.8033968210220337, + "learning_rate": 4.9045449183111566e-05, + "loss": 1.0413, + "step": 18990 + }, + { + "epoch": 0.6801081526312962, + "grad_norm": 1.7498400211334229, + "learning_rate": 4.903546922819531e-05, + "loss": 1.1309, + "step": 18991 + }, + { + "epoch": 0.6801439647608645, + "grad_norm": 1.5260525941848755, + "learning_rate": 4.9025489958940985e-05, + "loss": 1.092, + "step": 18992 + }, + { + "epoch": 0.6801797768904327, + "grad_norm": 2.0719809532165527, + "learning_rate": 4.9015511375483026e-05, + "loss": 1.0518, + "step": 18993 + }, + { + "epoch": 0.6802155890200011, + "grad_norm": 1.5206432342529297, + "learning_rate": 4.900553347795556e-05, + "loss": 0.9994, + "step": 18994 + }, + { + "epoch": 0.6802514011495694, + "grad_norm": 1.6279164552688599, + "learning_rate": 4.899555626649289e-05, + "loss": 0.8254, + "step": 18995 + }, + { + "epoch": 0.6802872132791377, + "grad_norm": 1.7660795450210571, + "learning_rate": 4.898557974122915e-05, + "loss": 1.0876, + "step": 18996 + }, + { + "epoch": 0.6803230254087059, + "grad_norm": 2.1396026611328125, + "learning_rate": 4.8975603902298704e-05, + "loss": 1.1168, + "step": 18997 + }, + { + "epoch": 0.6803588375382742, + "grad_norm": 1.5211700201034546, + "learning_rate": 4.896562874983569e-05, + "loss": 0.9312, + "step": 18998 + }, + { + "epoch": 0.6803946496678425, + "grad_norm": 1.6477652788162231, + "learning_rate": 4.8955654283974284e-05, + "loss": 1.174, + "step": 18999 + }, + { + "epoch": 0.6804304617974107, + "grad_norm": 1.4225066900253296, + "learning_rate": 4.89456805048487e-05, + "loss": 1.1731, + "step": 19000 + }, + { + "epoch": 0.6804662739269791, + "grad_norm": 1.9299967288970947, + "learning_rate": 4.893570741259312e-05, + "loss": 1.3977, + "step": 19001 + }, + { + "epoch": 0.6805020860565474, + "grad_norm": 1.7080508470535278, + "learning_rate": 4.892573500734179e-05, + "loss": 1.0638, + "step": 19002 + }, + { + "epoch": 0.6805378981861157, + "grad_norm": 1.4910908937454224, + "learning_rate": 4.891576328922872e-05, + "loss": 1.0774, + "step": 19003 + }, + { + "epoch": 0.6805737103156839, + "grad_norm": 1.630881428718567, + "learning_rate": 4.890579225838824e-05, + "loss": 1.1292, + "step": 19004 + }, + { + "epoch": 0.6806095224452522, + "grad_norm": 1.4570605754852295, + "learning_rate": 4.8895821914954376e-05, + "loss": 1.0693, + "step": 19005 + }, + { + "epoch": 0.6806453345748205, + "grad_norm": 1.4446775913238525, + "learning_rate": 4.888585225906136e-05, + "loss": 1.0765, + "step": 19006 + }, + { + "epoch": 0.6806811467043887, + "grad_norm": 1.4425846338272095, + "learning_rate": 4.8875883290843214e-05, + "loss": 1.0231, + "step": 19007 + }, + { + "epoch": 0.6807169588339571, + "grad_norm": 1.9239712953567505, + "learning_rate": 4.886591501043413e-05, + "loss": 1.3231, + "step": 19008 + }, + { + "epoch": 0.6807527709635254, + "grad_norm": 1.3756269216537476, + "learning_rate": 4.885594741796823e-05, + "loss": 1.0482, + "step": 19009 + }, + { + "epoch": 0.6807885830930936, + "grad_norm": 1.3874776363372803, + "learning_rate": 4.884598051357955e-05, + "loss": 1.1681, + "step": 19010 + }, + { + "epoch": 0.6808243952226619, + "grad_norm": 1.5552643537521362, + "learning_rate": 4.883601429740222e-05, + "loss": 1.0343, + "step": 19011 + }, + { + "epoch": 0.6808602073522302, + "grad_norm": 1.4772554636001587, + "learning_rate": 4.882604876957032e-05, + "loss": 1.1818, + "step": 19012 + }, + { + "epoch": 0.6808960194817985, + "grad_norm": 1.518031120300293, + "learning_rate": 4.881608393021796e-05, + "loss": 1.0747, + "step": 19013 + }, + { + "epoch": 0.6809318316113667, + "grad_norm": 1.4783027172088623, + "learning_rate": 4.880611977947909e-05, + "loss": 1.0747, + "step": 19014 + }, + { + "epoch": 0.6809676437409351, + "grad_norm": 1.4851529598236084, + "learning_rate": 4.879615631748793e-05, + "loss": 1.2887, + "step": 19015 + }, + { + "epoch": 0.6810034558705034, + "grad_norm": 1.4566441774368286, + "learning_rate": 4.8786193544378424e-05, + "loss": 1.138, + "step": 19016 + }, + { + "epoch": 0.6810392680000716, + "grad_norm": 1.375792145729065, + "learning_rate": 4.8776231460284595e-05, + "loss": 1.2702, + "step": 19017 + }, + { + "epoch": 0.6810750801296399, + "grad_norm": 1.2668476104736328, + "learning_rate": 4.876627006534049e-05, + "loss": 1.0898, + "step": 19018 + }, + { + "epoch": 0.6811108922592082, + "grad_norm": 1.3284797668457031, + "learning_rate": 4.8756309359680145e-05, + "loss": 1.0326, + "step": 19019 + }, + { + "epoch": 0.6811467043887764, + "grad_norm": 1.432564377784729, + "learning_rate": 4.874634934343759e-05, + "loss": 1.1307, + "step": 19020 + }, + { + "epoch": 0.6811825165183447, + "grad_norm": 1.3784430027008057, + "learning_rate": 4.873639001674676e-05, + "loss": 0.9825, + "step": 19021 + }, + { + "epoch": 0.6812183286479131, + "grad_norm": 1.5218783617019653, + "learning_rate": 4.872643137974167e-05, + "loss": 1.1276, + "step": 19022 + }, + { + "epoch": 0.6812541407774814, + "grad_norm": 1.3602169752120972, + "learning_rate": 4.87164734325563e-05, + "loss": 1.0441, + "step": 19023 + }, + { + "epoch": 0.6812899529070496, + "grad_norm": 1.6695135831832886, + "learning_rate": 4.870651617532468e-05, + "loss": 1.1153, + "step": 19024 + }, + { + "epoch": 0.6813257650366179, + "grad_norm": 1.29396390914917, + "learning_rate": 4.869655960818068e-05, + "loss": 0.8934, + "step": 19025 + }, + { + "epoch": 0.6813615771661862, + "grad_norm": 1.811947226524353, + "learning_rate": 4.868660373125829e-05, + "loss": 1.1363, + "step": 19026 + }, + { + "epoch": 0.6813973892957544, + "grad_norm": 1.7001943588256836, + "learning_rate": 4.8676648544691495e-05, + "loss": 1.1498, + "step": 19027 + }, + { + "epoch": 0.6814332014253227, + "grad_norm": 1.4276317358016968, + "learning_rate": 4.866669404861416e-05, + "loss": 1.07, + "step": 19028 + }, + { + "epoch": 0.6814690135548911, + "grad_norm": 2.3490567207336426, + "learning_rate": 4.8656740243160236e-05, + "loss": 1.4119, + "step": 19029 + }, + { + "epoch": 0.6815048256844594, + "grad_norm": 1.4504139423370361, + "learning_rate": 4.864678712846365e-05, + "loss": 1.0234, + "step": 19030 + }, + { + "epoch": 0.6815406378140276, + "grad_norm": 1.7839081287384033, + "learning_rate": 4.863683470465833e-05, + "loss": 0.9701, + "step": 19031 + }, + { + "epoch": 0.6815764499435959, + "grad_norm": 1.3022617101669312, + "learning_rate": 4.862688297187812e-05, + "loss": 1.2076, + "step": 19032 + }, + { + "epoch": 0.6816122620731642, + "grad_norm": 2.07409930229187, + "learning_rate": 4.8616931930256926e-05, + "loss": 1.1922, + "step": 19033 + }, + { + "epoch": 0.6816480742027324, + "grad_norm": 1.4471510648727417, + "learning_rate": 4.860698157992867e-05, + "loss": 0.8464, + "step": 19034 + }, + { + "epoch": 0.6816838863323007, + "grad_norm": 2.0566225051879883, + "learning_rate": 4.859703192102715e-05, + "loss": 0.9592, + "step": 19035 + }, + { + "epoch": 0.6817196984618691, + "grad_norm": 1.3216887712478638, + "learning_rate": 4.858708295368626e-05, + "loss": 1.1512, + "step": 19036 + }, + { + "epoch": 0.6817555105914374, + "grad_norm": 1.5106048583984375, + "learning_rate": 4.857713467803985e-05, + "loss": 1.2259, + "step": 19037 + }, + { + "epoch": 0.6817913227210056, + "grad_norm": 1.6490708589553833, + "learning_rate": 4.85671870942218e-05, + "loss": 1.1818, + "step": 19038 + }, + { + "epoch": 0.6818271348505739, + "grad_norm": 1.5176066160202026, + "learning_rate": 4.855724020236586e-05, + "loss": 1.2075, + "step": 19039 + }, + { + "epoch": 0.6818629469801422, + "grad_norm": 1.277696967124939, + "learning_rate": 4.854729400260591e-05, + "loss": 1.1848, + "step": 19040 + }, + { + "epoch": 0.6818987591097104, + "grad_norm": 1.3481560945510864, + "learning_rate": 4.853734849507574e-05, + "loss": 1.1808, + "step": 19041 + }, + { + "epoch": 0.6819345712392787, + "grad_norm": 1.3642854690551758, + "learning_rate": 4.8527403679909214e-05, + "loss": 0.8976, + "step": 19042 + }, + { + "epoch": 0.6819703833688471, + "grad_norm": 1.3045084476470947, + "learning_rate": 4.851745955724002e-05, + "loss": 1.2131, + "step": 19043 + }, + { + "epoch": 0.6820061954984153, + "grad_norm": 1.3259011507034302, + "learning_rate": 4.8507516127202014e-05, + "loss": 0.9022, + "step": 19044 + }, + { + "epoch": 0.6820420076279836, + "grad_norm": 1.5721471309661865, + "learning_rate": 4.849757338992898e-05, + "loss": 1.1268, + "step": 19045 + }, + { + "epoch": 0.6820778197575519, + "grad_norm": 1.0886293649673462, + "learning_rate": 4.848763134555465e-05, + "loss": 0.8658, + "step": 19046 + }, + { + "epoch": 0.6821136318871202, + "grad_norm": 1.6803598403930664, + "learning_rate": 4.847768999421277e-05, + "loss": 1.0414, + "step": 19047 + }, + { + "epoch": 0.6821494440166884, + "grad_norm": 2.5486490726470947, + "learning_rate": 4.8467749336037124e-05, + "loss": 1.0369, + "step": 19048 + }, + { + "epoch": 0.6821852561462567, + "grad_norm": 1.3840068578720093, + "learning_rate": 4.8457809371161476e-05, + "loss": 0.963, + "step": 19049 + }, + { + "epoch": 0.6822210682758251, + "grad_norm": 1.4168719053268433, + "learning_rate": 4.844787009971949e-05, + "loss": 1.0705, + "step": 19050 + }, + { + "epoch": 0.6822568804053933, + "grad_norm": 1.3377608060836792, + "learning_rate": 4.8437931521844894e-05, + "loss": 1.0137, + "step": 19051 + }, + { + "epoch": 0.6822926925349616, + "grad_norm": 2.0232741832733154, + "learning_rate": 4.8427993637671474e-05, + "loss": 1.479, + "step": 19052 + }, + { + "epoch": 0.6823285046645299, + "grad_norm": 1.3748822212219238, + "learning_rate": 4.841805644733283e-05, + "loss": 1.1855, + "step": 19053 + }, + { + "epoch": 0.6823643167940981, + "grad_norm": 1.4964277744293213, + "learning_rate": 4.8408119950962704e-05, + "loss": 1.231, + "step": 19054 + }, + { + "epoch": 0.6824001289236664, + "grad_norm": 1.3874918222427368, + "learning_rate": 4.839818414869477e-05, + "loss": 1.0453, + "step": 19055 + }, + { + "epoch": 0.6824359410532347, + "grad_norm": 1.7473300695419312, + "learning_rate": 4.8388249040662744e-05, + "loss": 1.1868, + "step": 19056 + }, + { + "epoch": 0.6824717531828031, + "grad_norm": 1.6654947996139526, + "learning_rate": 4.8378314627000224e-05, + "loss": 1.4346, + "step": 19057 + }, + { + "epoch": 0.6825075653123713, + "grad_norm": 1.9552180767059326, + "learning_rate": 4.836838090784088e-05, + "loss": 1.1078, + "step": 19058 + }, + { + "epoch": 0.6825433774419396, + "grad_norm": 1.2628244161605835, + "learning_rate": 4.835844788331839e-05, + "loss": 1.1154, + "step": 19059 + }, + { + "epoch": 0.6825791895715079, + "grad_norm": 1.3114804029464722, + "learning_rate": 4.8348515553566396e-05, + "loss": 0.9493, + "step": 19060 + }, + { + "epoch": 0.6826150017010761, + "grad_norm": 1.2782714366912842, + "learning_rate": 4.833858391871846e-05, + "loss": 1.0976, + "step": 19061 + }, + { + "epoch": 0.6826508138306444, + "grad_norm": 1.7248166799545288, + "learning_rate": 4.832865297890825e-05, + "loss": 1.157, + "step": 19062 + }, + { + "epoch": 0.6826866259602127, + "grad_norm": 1.5076838731765747, + "learning_rate": 4.83187227342694e-05, + "loss": 0.9905, + "step": 19063 + }, + { + "epoch": 0.6827224380897811, + "grad_norm": 1.2206947803497314, + "learning_rate": 4.830879318493542e-05, + "loss": 1.0017, + "step": 19064 + }, + { + "epoch": 0.6827582502193493, + "grad_norm": 1.2888615131378174, + "learning_rate": 4.829886433103995e-05, + "loss": 1.0745, + "step": 19065 + }, + { + "epoch": 0.6827940623489176, + "grad_norm": 1.2866395711898804, + "learning_rate": 4.828893617271658e-05, + "loss": 0.9895, + "step": 19066 + }, + { + "epoch": 0.6828298744784859, + "grad_norm": 1.4222431182861328, + "learning_rate": 4.8279008710098916e-05, + "loss": 1.2628, + "step": 19067 + }, + { + "epoch": 0.6828656866080541, + "grad_norm": 1.4168401956558228, + "learning_rate": 4.8269081943320424e-05, + "loss": 1.0808, + "step": 19068 + }, + { + "epoch": 0.6829014987376224, + "grad_norm": 1.7078373432159424, + "learning_rate": 4.825915587251472e-05, + "loss": 1.1291, + "step": 19069 + }, + { + "epoch": 0.6829373108671907, + "grad_norm": 1.4532835483551025, + "learning_rate": 4.824923049781536e-05, + "loss": 0.9057, + "step": 19070 + }, + { + "epoch": 0.682973122996759, + "grad_norm": 1.8301759958267212, + "learning_rate": 4.8239305819355805e-05, + "loss": 1.3594, + "step": 19071 + }, + { + "epoch": 0.6830089351263273, + "grad_norm": 1.3628432750701904, + "learning_rate": 4.822938183726967e-05, + "loss": 1.022, + "step": 19072 + }, + { + "epoch": 0.6830447472558956, + "grad_norm": 1.4188514947891235, + "learning_rate": 4.821945855169035e-05, + "loss": 1.2375, + "step": 19073 + }, + { + "epoch": 0.6830805593854639, + "grad_norm": 1.5197452306747437, + "learning_rate": 4.8209535962751494e-05, + "loss": 1.2136, + "step": 19074 + }, + { + "epoch": 0.6831163715150321, + "grad_norm": 1.5406081676483154, + "learning_rate": 4.81996140705865e-05, + "loss": 1.1163, + "step": 19075 + }, + { + "epoch": 0.6831521836446004, + "grad_norm": 1.752833604812622, + "learning_rate": 4.8189692875328864e-05, + "loss": 1.2077, + "step": 19076 + }, + { + "epoch": 0.6831879957741687, + "grad_norm": 1.6233603954315186, + "learning_rate": 4.817977237711213e-05, + "loss": 1.1554, + "step": 19077 + }, + { + "epoch": 0.683223807903737, + "grad_norm": 1.2887494564056396, + "learning_rate": 4.816985257606967e-05, + "loss": 0.9959, + "step": 19078 + }, + { + "epoch": 0.6832596200333053, + "grad_norm": 2.0337367057800293, + "learning_rate": 4.815993347233503e-05, + "loss": 1.3292, + "step": 19079 + }, + { + "epoch": 0.6832954321628736, + "grad_norm": 1.5795265436172485, + "learning_rate": 4.8150015066041545e-05, + "loss": 1.0759, + "step": 19080 + }, + { + "epoch": 0.6833312442924419, + "grad_norm": 1.4086941480636597, + "learning_rate": 4.814009735732279e-05, + "loss": 1.047, + "step": 19081 + }, + { + "epoch": 0.6833670564220101, + "grad_norm": 1.3063175678253174, + "learning_rate": 4.8130180346312105e-05, + "loss": 1.1259, + "step": 19082 + }, + { + "epoch": 0.6834028685515784, + "grad_norm": 1.7356852293014526, + "learning_rate": 4.812026403314297e-05, + "loss": 1.0507, + "step": 19083 + }, + { + "epoch": 0.6834386806811467, + "grad_norm": 2.041297674179077, + "learning_rate": 4.811034841794868e-05, + "loss": 1.0341, + "step": 19084 + }, + { + "epoch": 0.683474492810715, + "grad_norm": 1.2792654037475586, + "learning_rate": 4.8100433500862794e-05, + "loss": 1.0563, + "step": 19085 + }, + { + "epoch": 0.6835103049402833, + "grad_norm": 1.959654688835144, + "learning_rate": 4.809051928201864e-05, + "loss": 0.948, + "step": 19086 + }, + { + "epoch": 0.6835461170698516, + "grad_norm": 1.2853175401687622, + "learning_rate": 4.808060576154951e-05, + "loss": 0.9052, + "step": 19087 + }, + { + "epoch": 0.6835819291994198, + "grad_norm": 1.429190754890442, + "learning_rate": 4.8070692939588934e-05, + "loss": 1.0717, + "step": 19088 + }, + { + "epoch": 0.6836177413289881, + "grad_norm": 2.072711229324341, + "learning_rate": 4.8060780816270165e-05, + "loss": 1.0659, + "step": 19089 + }, + { + "epoch": 0.6836535534585564, + "grad_norm": 1.3297784328460693, + "learning_rate": 4.805086939172663e-05, + "loss": 1.1375, + "step": 19090 + }, + { + "epoch": 0.6836893655881247, + "grad_norm": 1.638877272605896, + "learning_rate": 4.804095866609156e-05, + "loss": 1.1, + "step": 19091 + }, + { + "epoch": 0.683725177717693, + "grad_norm": 1.377615213394165, + "learning_rate": 4.803104863949844e-05, + "loss": 1.1659, + "step": 19092 + }, + { + "epoch": 0.6837609898472613, + "grad_norm": 2.873516798019409, + "learning_rate": 4.80211393120805e-05, + "loss": 1.2162, + "step": 19093 + }, + { + "epoch": 0.6837968019768296, + "grad_norm": 1.4686261415481567, + "learning_rate": 4.801123068397111e-05, + "loss": 0.9728, + "step": 19094 + }, + { + "epoch": 0.6838326141063978, + "grad_norm": 4.066555976867676, + "learning_rate": 4.800132275530351e-05, + "loss": 1.0963, + "step": 19095 + }, + { + "epoch": 0.6838684262359661, + "grad_norm": 1.3321839570999146, + "learning_rate": 4.799141552621105e-05, + "loss": 1.1506, + "step": 19096 + }, + { + "epoch": 0.6839042383655344, + "grad_norm": 1.4118142127990723, + "learning_rate": 4.798150899682704e-05, + "loss": 1.099, + "step": 19097 + }, + { + "epoch": 0.6839400504951026, + "grad_norm": 1.7569352388381958, + "learning_rate": 4.79716031672847e-05, + "loss": 1.3207, + "step": 19098 + }, + { + "epoch": 0.683975862624671, + "grad_norm": 1.6463948488235474, + "learning_rate": 4.7961698037717306e-05, + "loss": 1.0002, + "step": 19099 + }, + { + "epoch": 0.6840116747542393, + "grad_norm": 1.727056622505188, + "learning_rate": 4.795179360825815e-05, + "loss": 1.07, + "step": 19100 + }, + { + "epoch": 0.6840474868838076, + "grad_norm": 1.3884820938110352, + "learning_rate": 4.794188987904051e-05, + "loss": 1.093, + "step": 19101 + }, + { + "epoch": 0.6840832990133758, + "grad_norm": 1.467398762702942, + "learning_rate": 4.793198685019753e-05, + "loss": 1.1293, + "step": 19102 + }, + { + "epoch": 0.6841191111429441, + "grad_norm": 1.3944905996322632, + "learning_rate": 4.7922084521862565e-05, + "loss": 1.061, + "step": 19103 + }, + { + "epoch": 0.6841549232725124, + "grad_norm": 1.4234199523925781, + "learning_rate": 4.791218289416879e-05, + "loss": 1.0075, + "step": 19104 + }, + { + "epoch": 0.6841907354020806, + "grad_norm": 1.3846608400344849, + "learning_rate": 4.790228196724935e-05, + "loss": 1.0622, + "step": 19105 + }, + { + "epoch": 0.684226547531649, + "grad_norm": 1.5260390043258667, + "learning_rate": 4.789238174123751e-05, + "loss": 1.071, + "step": 19106 + }, + { + "epoch": 0.6842623596612173, + "grad_norm": 2.0682780742645264, + "learning_rate": 4.788248221626647e-05, + "loss": 1.0558, + "step": 19107 + }, + { + "epoch": 0.6842981717907856, + "grad_norm": 1.981428623199463, + "learning_rate": 4.7872583392469436e-05, + "loss": 1.0865, + "step": 19108 + }, + { + "epoch": 0.6843339839203538, + "grad_norm": 1.4120668172836304, + "learning_rate": 4.786268526997951e-05, + "loss": 1.1282, + "step": 19109 + }, + { + "epoch": 0.6843697960499221, + "grad_norm": 1.682153344154358, + "learning_rate": 4.7852787848929916e-05, + "loss": 1.1745, + "step": 19110 + }, + { + "epoch": 0.6844056081794904, + "grad_norm": 1.4137015342712402, + "learning_rate": 4.7842891129453784e-05, + "loss": 1.0582, + "step": 19111 + }, + { + "epoch": 0.6844414203090586, + "grad_norm": 1.768123745918274, + "learning_rate": 4.783299511168432e-05, + "loss": 1.1305, + "step": 19112 + }, + { + "epoch": 0.684477232438627, + "grad_norm": 1.6291863918304443, + "learning_rate": 4.7823099795754566e-05, + "loss": 1.0325, + "step": 19113 + }, + { + "epoch": 0.6845130445681953, + "grad_norm": 1.3186250925064087, + "learning_rate": 4.781320518179772e-05, + "loss": 1.182, + "step": 19114 + }, + { + "epoch": 0.6845488566977636, + "grad_norm": 1.5150483846664429, + "learning_rate": 4.780331126994691e-05, + "loss": 1.2576, + "step": 19115 + }, + { + "epoch": 0.6845846688273318, + "grad_norm": 1.2902039289474487, + "learning_rate": 4.779341806033517e-05, + "loss": 0.9111, + "step": 19116 + }, + { + "epoch": 0.6846204809569001, + "grad_norm": 1.4645769596099854, + "learning_rate": 4.778352555309565e-05, + "loss": 0.9528, + "step": 19117 + }, + { + "epoch": 0.6846562930864684, + "grad_norm": 1.4349591732025146, + "learning_rate": 4.777363374836146e-05, + "loss": 1.0979, + "step": 19118 + }, + { + "epoch": 0.6846921052160366, + "grad_norm": 1.4069900512695312, + "learning_rate": 4.7763742646265674e-05, + "loss": 1.1834, + "step": 19119 + }, + { + "epoch": 0.684727917345605, + "grad_norm": 1.8663454055786133, + "learning_rate": 4.7753852246941335e-05, + "loss": 1.0966, + "step": 19120 + }, + { + "epoch": 0.6847637294751733, + "grad_norm": 1.4115465879440308, + "learning_rate": 4.774396255052151e-05, + "loss": 0.9954, + "step": 19121 + }, + { + "epoch": 0.6847995416047415, + "grad_norm": 1.9221904277801514, + "learning_rate": 4.773407355713929e-05, + "loss": 1.2435, + "step": 19122 + }, + { + "epoch": 0.6848353537343098, + "grad_norm": 1.3776674270629883, + "learning_rate": 4.7724185266927666e-05, + "loss": 1.0907, + "step": 19123 + }, + { + "epoch": 0.6848711658638781, + "grad_norm": 1.5971683263778687, + "learning_rate": 4.7714297680019704e-05, + "loss": 1.1145, + "step": 19124 + }, + { + "epoch": 0.6849069779934464, + "grad_norm": 1.4446349143981934, + "learning_rate": 4.770441079654841e-05, + "loss": 1.0263, + "step": 19125 + }, + { + "epoch": 0.6849427901230146, + "grad_norm": 1.6985043287277222, + "learning_rate": 4.7694524616646865e-05, + "loss": 1.2909, + "step": 19126 + }, + { + "epoch": 0.684978602252583, + "grad_norm": 1.168282151222229, + "learning_rate": 4.768463914044797e-05, + "loss": 1.0861, + "step": 19127 + }, + { + "epoch": 0.6850144143821513, + "grad_norm": 1.4676809310913086, + "learning_rate": 4.767475436808478e-05, + "loss": 1.1361, + "step": 19128 + }, + { + "epoch": 0.6850502265117195, + "grad_norm": 1.355637788772583, + "learning_rate": 4.766487029969028e-05, + "loss": 1.0893, + "step": 19129 + }, + { + "epoch": 0.6850860386412878, + "grad_norm": 1.4948772192001343, + "learning_rate": 4.765498693539747e-05, + "loss": 1.0071, + "step": 19130 + }, + { + "epoch": 0.6851218507708561, + "grad_norm": 1.9374393224716187, + "learning_rate": 4.764510427533926e-05, + "loss": 1.1155, + "step": 19131 + }, + { + "epoch": 0.6851576629004243, + "grad_norm": 1.4988833665847778, + "learning_rate": 4.763522231964864e-05, + "loss": 0.9191, + "step": 19132 + }, + { + "epoch": 0.6851934750299926, + "grad_norm": 1.704292893409729, + "learning_rate": 4.76253410684586e-05, + "loss": 1.2028, + "step": 19133 + }, + { + "epoch": 0.685229287159561, + "grad_norm": 1.6031951904296875, + "learning_rate": 4.761546052190199e-05, + "loss": 1.0071, + "step": 19134 + }, + { + "epoch": 0.6852650992891293, + "grad_norm": 1.3562296628952026, + "learning_rate": 4.7605580680111785e-05, + "loss": 1.0029, + "step": 19135 + }, + { + "epoch": 0.6853009114186975, + "grad_norm": 1.284392237663269, + "learning_rate": 4.7595701543220916e-05, + "loss": 1.0543, + "step": 19136 + }, + { + "epoch": 0.6853367235482658, + "grad_norm": 1.7733784914016724, + "learning_rate": 4.758582311136231e-05, + "loss": 1.4296, + "step": 19137 + }, + { + "epoch": 0.6853725356778341, + "grad_norm": 1.4087140560150146, + "learning_rate": 4.757594538466883e-05, + "loss": 1.2067, + "step": 19138 + }, + { + "epoch": 0.6854083478074023, + "grad_norm": 1.4903203248977661, + "learning_rate": 4.756606836327337e-05, + "loss": 0.9394, + "step": 19139 + }, + { + "epoch": 0.6854441599369706, + "grad_norm": 1.4471768140792847, + "learning_rate": 4.755619204730886e-05, + "loss": 1.0923, + "step": 19140 + }, + { + "epoch": 0.685479972066539, + "grad_norm": 1.5510988235473633, + "learning_rate": 4.75463164369081e-05, + "loss": 0.9797, + "step": 19141 + }, + { + "epoch": 0.6855157841961073, + "grad_norm": 1.992522120475769, + "learning_rate": 4.7536441532204e-05, + "loss": 1.2701, + "step": 19142 + }, + { + "epoch": 0.6855515963256755, + "grad_norm": 1.8772284984588623, + "learning_rate": 4.752656733332941e-05, + "loss": 1.2499, + "step": 19143 + }, + { + "epoch": 0.6855874084552438, + "grad_norm": 1.9329471588134766, + "learning_rate": 4.751669384041719e-05, + "loss": 1.1297, + "step": 19144 + }, + { + "epoch": 0.6856232205848121, + "grad_norm": 1.4067474603652954, + "learning_rate": 4.750682105360014e-05, + "loss": 1.1197, + "step": 19145 + }, + { + "epoch": 0.6856590327143803, + "grad_norm": 1.5383446216583252, + "learning_rate": 4.749694897301108e-05, + "loss": 1.143, + "step": 19146 + }, + { + "epoch": 0.6856948448439486, + "grad_norm": 1.3840171098709106, + "learning_rate": 4.7487077598782856e-05, + "loss": 1.0812, + "step": 19147 + }, + { + "epoch": 0.685730656973517, + "grad_norm": 1.3171330690383911, + "learning_rate": 4.747720693104831e-05, + "loss": 1.0536, + "step": 19148 + }, + { + "epoch": 0.6857664691030853, + "grad_norm": 1.298089861869812, + "learning_rate": 4.7467336969940156e-05, + "loss": 0.8751, + "step": 19149 + }, + { + "epoch": 0.6858022812326535, + "grad_norm": 1.84935462474823, + "learning_rate": 4.745746771559122e-05, + "loss": 0.9837, + "step": 19150 + }, + { + "epoch": 0.6858380933622218, + "grad_norm": 1.2745230197906494, + "learning_rate": 4.744759916813432e-05, + "loss": 1.136, + "step": 19151 + }, + { + "epoch": 0.6858739054917901, + "grad_norm": 1.3381037712097168, + "learning_rate": 4.743773132770214e-05, + "loss": 0.7198, + "step": 19152 + }, + { + "epoch": 0.6859097176213583, + "grad_norm": 1.61928129196167, + "learning_rate": 4.7427864194427484e-05, + "loss": 1.3019, + "step": 19153 + }, + { + "epoch": 0.6859455297509266, + "grad_norm": 1.3758790493011475, + "learning_rate": 4.74179977684431e-05, + "loss": 0.9996, + "step": 19154 + }, + { + "epoch": 0.685981341880495, + "grad_norm": 1.343704342842102, + "learning_rate": 4.740813204988178e-05, + "loss": 0.9939, + "step": 19155 + }, + { + "epoch": 0.6860171540100632, + "grad_norm": 1.596480369567871, + "learning_rate": 4.739826703887616e-05, + "loss": 1.0835, + "step": 19156 + }, + { + "epoch": 0.6860529661396315, + "grad_norm": 1.386954665184021, + "learning_rate": 4.7388402735559014e-05, + "loss": 1.041, + "step": 19157 + }, + { + "epoch": 0.6860887782691998, + "grad_norm": 1.592178225517273, + "learning_rate": 4.737853914006307e-05, + "loss": 0.9919, + "step": 19158 + }, + { + "epoch": 0.686124590398768, + "grad_norm": 1.5369133949279785, + "learning_rate": 4.736867625252097e-05, + "loss": 0.8823, + "step": 19159 + }, + { + "epoch": 0.6861604025283363, + "grad_norm": 1.7215745449066162, + "learning_rate": 4.735881407306545e-05, + "loss": 0.9417, + "step": 19160 + }, + { + "epoch": 0.6861962146579046, + "grad_norm": 2.0789146423339844, + "learning_rate": 4.734895260182918e-05, + "loss": 1.1739, + "step": 19161 + }, + { + "epoch": 0.686232026787473, + "grad_norm": 1.2600680589675903, + "learning_rate": 4.733909183894487e-05, + "loss": 0.9339, + "step": 19162 + }, + { + "epoch": 0.6862678389170412, + "grad_norm": 1.592675805091858, + "learning_rate": 4.732923178454512e-05, + "loss": 1.0788, + "step": 19163 + }, + { + "epoch": 0.6863036510466095, + "grad_norm": 1.4066606760025024, + "learning_rate": 4.731937243876262e-05, + "loss": 1.0978, + "step": 19164 + }, + { + "epoch": 0.6863394631761778, + "grad_norm": 2.2144689559936523, + "learning_rate": 4.730951380173e-05, + "loss": 1.2181, + "step": 19165 + }, + { + "epoch": 0.686375275305746, + "grad_norm": 1.384918212890625, + "learning_rate": 4.729965587357995e-05, + "loss": 0.853, + "step": 19166 + }, + { + "epoch": 0.6864110874353143, + "grad_norm": 1.5565673112869263, + "learning_rate": 4.728979865444505e-05, + "loss": 0.9512, + "step": 19167 + }, + { + "epoch": 0.6864468995648826, + "grad_norm": 1.3327524662017822, + "learning_rate": 4.7279942144457847e-05, + "loss": 1.251, + "step": 19168 + }, + { + "epoch": 0.686482711694451, + "grad_norm": 1.3942347764968872, + "learning_rate": 4.7270086343751085e-05, + "loss": 1.1628, + "step": 19169 + }, + { + "epoch": 0.6865185238240192, + "grad_norm": 1.8693327903747559, + "learning_rate": 4.7260231252457265e-05, + "loss": 1.2927, + "step": 19170 + }, + { + "epoch": 0.6865543359535875, + "grad_norm": 1.5985233783721924, + "learning_rate": 4.7250376870709e-05, + "loss": 0.9049, + "step": 19171 + }, + { + "epoch": 0.6865901480831558, + "grad_norm": 1.3760108947753906, + "learning_rate": 4.7240523198638875e-05, + "loss": 0.976, + "step": 19172 + }, + { + "epoch": 0.686625960212724, + "grad_norm": 2.0195038318634033, + "learning_rate": 4.723067023637949e-05, + "loss": 1.4887, + "step": 19173 + }, + { + "epoch": 0.6866617723422923, + "grad_norm": 1.40108323097229, + "learning_rate": 4.722081798406337e-05, + "loss": 1.2236, + "step": 19174 + }, + { + "epoch": 0.6866975844718606, + "grad_norm": 2.0296149253845215, + "learning_rate": 4.721096644182299e-05, + "loss": 1.1019, + "step": 19175 + }, + { + "epoch": 0.686733396601429, + "grad_norm": 1.6573433876037598, + "learning_rate": 4.720111560979104e-05, + "loss": 1.0863, + "step": 19176 + }, + { + "epoch": 0.6867692087309972, + "grad_norm": 1.2146193981170654, + "learning_rate": 4.719126548809993e-05, + "loss": 1.1337, + "step": 19177 + }, + { + "epoch": 0.6868050208605655, + "grad_norm": 1.4749351739883423, + "learning_rate": 4.7181416076882266e-05, + "loss": 1.0974, + "step": 19178 + }, + { + "epoch": 0.6868408329901338, + "grad_norm": 1.4906957149505615, + "learning_rate": 4.7171567376270443e-05, + "loss": 1.0833, + "step": 19179 + }, + { + "epoch": 0.686876645119702, + "grad_norm": 1.3404461145401, + "learning_rate": 4.716171938639711e-05, + "loss": 1.1582, + "step": 19180 + }, + { + "epoch": 0.6869124572492703, + "grad_norm": 1.4572272300720215, + "learning_rate": 4.715187210739466e-05, + "loss": 0.9057, + "step": 19181 + }, + { + "epoch": 0.6869482693788386, + "grad_norm": 1.476235032081604, + "learning_rate": 4.714202553939562e-05, + "loss": 1.1068, + "step": 19182 + }, + { + "epoch": 0.686984081508407, + "grad_norm": 1.381506085395813, + "learning_rate": 4.713217968253242e-05, + "loss": 1.1443, + "step": 19183 + }, + { + "epoch": 0.6870198936379752, + "grad_norm": 1.7176920175552368, + "learning_rate": 4.712233453693754e-05, + "loss": 1.1661, + "step": 19184 + }, + { + "epoch": 0.6870557057675435, + "grad_norm": 1.5768373012542725, + "learning_rate": 4.711249010274349e-05, + "loss": 1.1476, + "step": 19185 + }, + { + "epoch": 0.6870915178971118, + "grad_norm": 1.7266234159469604, + "learning_rate": 4.710264638008258e-05, + "loss": 1.214, + "step": 19186 + }, + { + "epoch": 0.68712733002668, + "grad_norm": 1.601955771446228, + "learning_rate": 4.709280336908741e-05, + "loss": 1.1317, + "step": 19187 + }, + { + "epoch": 0.6871631421562483, + "grad_norm": 1.488871693611145, + "learning_rate": 4.7082961069890284e-05, + "loss": 1.1561, + "step": 19188 + }, + { + "epoch": 0.6871989542858166, + "grad_norm": 1.61320161819458, + "learning_rate": 4.707311948262371e-05, + "loss": 1.1293, + "step": 19189 + }, + { + "epoch": 0.687234766415385, + "grad_norm": 1.3472853899002075, + "learning_rate": 4.7063278607419944e-05, + "loss": 1.1022, + "step": 19190 + }, + { + "epoch": 0.6872705785449532, + "grad_norm": 1.3796337842941284, + "learning_rate": 4.705343844441158e-05, + "loss": 1.0193, + "step": 19191 + }, + { + "epoch": 0.6873063906745215, + "grad_norm": 2.0235586166381836, + "learning_rate": 4.704359899373089e-05, + "loss": 1.0753, + "step": 19192 + }, + { + "epoch": 0.6873422028040898, + "grad_norm": 1.490079641342163, + "learning_rate": 4.703376025551023e-05, + "loss": 1.0776, + "step": 19193 + }, + { + "epoch": 0.687378014933658, + "grad_norm": 1.5553948879241943, + "learning_rate": 4.7023922229882013e-05, + "loss": 0.9859, + "step": 19194 + }, + { + "epoch": 0.6874138270632263, + "grad_norm": 1.6203155517578125, + "learning_rate": 4.701408491697859e-05, + "loss": 1.2147, + "step": 19195 + }, + { + "epoch": 0.6874496391927946, + "grad_norm": 1.7721794843673706, + "learning_rate": 4.700424831693233e-05, + "loss": 1.0287, + "step": 19196 + }, + { + "epoch": 0.6874854513223629, + "grad_norm": 1.336967945098877, + "learning_rate": 4.699441242987548e-05, + "loss": 0.9052, + "step": 19197 + }, + { + "epoch": 0.6875212634519312, + "grad_norm": 1.5182597637176514, + "learning_rate": 4.698457725594052e-05, + "loss": 1.3052, + "step": 19198 + }, + { + "epoch": 0.6875570755814995, + "grad_norm": 1.589920997619629, + "learning_rate": 4.697474279525964e-05, + "loss": 1.015, + "step": 19199 + }, + { + "epoch": 0.6875928877110677, + "grad_norm": 1.838592767715454, + "learning_rate": 4.6964909047965246e-05, + "loss": 0.9441, + "step": 19200 + }, + { + "epoch": 0.687628699840636, + "grad_norm": 1.293207049369812, + "learning_rate": 4.6955076014189545e-05, + "loss": 1.0177, + "step": 19201 + }, + { + "epoch": 0.6876645119702043, + "grad_norm": 1.48116135597229, + "learning_rate": 4.694524369406488e-05, + "loss": 1.2514, + "step": 19202 + }, + { + "epoch": 0.6877003240997726, + "grad_norm": 1.9681168794631958, + "learning_rate": 4.693541208772356e-05, + "loss": 1.1478, + "step": 19203 + }, + { + "epoch": 0.6877361362293409, + "grad_norm": 1.366044044494629, + "learning_rate": 4.692558119529778e-05, + "loss": 1.0334, + "step": 19204 + }, + { + "epoch": 0.6877719483589092, + "grad_norm": 1.3286720514297485, + "learning_rate": 4.691575101691985e-05, + "loss": 1.0025, + "step": 19205 + }, + { + "epoch": 0.6878077604884775, + "grad_norm": 1.4213588237762451, + "learning_rate": 4.6905921552722024e-05, + "loss": 0.9069, + "step": 19206 + }, + { + "epoch": 0.6878435726180457, + "grad_norm": 1.5485384464263916, + "learning_rate": 4.6896092802836555e-05, + "loss": 0.9777, + "step": 19207 + }, + { + "epoch": 0.687879384747614, + "grad_norm": 1.8563385009765625, + "learning_rate": 4.6886264767395635e-05, + "loss": 1.1429, + "step": 19208 + }, + { + "epoch": 0.6879151968771823, + "grad_norm": 1.673978328704834, + "learning_rate": 4.687643744653151e-05, + "loss": 1.067, + "step": 19209 + }, + { + "epoch": 0.6879510090067505, + "grad_norm": 1.381314992904663, + "learning_rate": 4.6866610840376424e-05, + "loss": 1.0919, + "step": 19210 + }, + { + "epoch": 0.6879868211363188, + "grad_norm": 2.6082916259765625, + "learning_rate": 4.6856784949062516e-05, + "loss": 1.3181, + "step": 19211 + }, + { + "epoch": 0.6880226332658872, + "grad_norm": 1.2934821844100952, + "learning_rate": 4.6846959772722023e-05, + "loss": 1.0275, + "step": 19212 + }, + { + "epoch": 0.6880584453954555, + "grad_norm": 1.5330066680908203, + "learning_rate": 4.6837135311487125e-05, + "loss": 0.9559, + "step": 19213 + }, + { + "epoch": 0.6880942575250237, + "grad_norm": 1.634684681892395, + "learning_rate": 4.6827311565490026e-05, + "loss": 1.2912, + "step": 19214 + }, + { + "epoch": 0.688130069654592, + "grad_norm": 1.3418571949005127, + "learning_rate": 4.681748853486283e-05, + "loss": 0.973, + "step": 19215 + }, + { + "epoch": 0.6881658817841603, + "grad_norm": 1.7568260431289673, + "learning_rate": 4.6807666219737724e-05, + "loss": 1.1455, + "step": 19216 + }, + { + "epoch": 0.6882016939137285, + "grad_norm": 1.401871681213379, + "learning_rate": 4.679784462024686e-05, + "loss": 1.2174, + "step": 19217 + }, + { + "epoch": 0.6882375060432968, + "grad_norm": 1.2066781520843506, + "learning_rate": 4.6788023736522405e-05, + "loss": 0.8245, + "step": 19218 + }, + { + "epoch": 0.6882733181728652, + "grad_norm": 1.3199925422668457, + "learning_rate": 4.677820356869641e-05, + "loss": 1.1085, + "step": 19219 + }, + { + "epoch": 0.6883091303024335, + "grad_norm": 1.8579691648483276, + "learning_rate": 4.676838411690103e-05, + "loss": 1.152, + "step": 19220 + }, + { + "epoch": 0.6883449424320017, + "grad_norm": 1.3979172706604004, + "learning_rate": 4.675856538126843e-05, + "loss": 1.0895, + "step": 19221 + }, + { + "epoch": 0.68838075456157, + "grad_norm": 1.573072910308838, + "learning_rate": 4.674874736193061e-05, + "loss": 1.2582, + "step": 19222 + }, + { + "epoch": 0.6884165666911383, + "grad_norm": 2.004122734069824, + "learning_rate": 4.67389300590197e-05, + "loss": 1.029, + "step": 19223 + }, + { + "epoch": 0.6884523788207065, + "grad_norm": 1.454297661781311, + "learning_rate": 4.67291134726678e-05, + "loss": 0.9034, + "step": 19224 + }, + { + "epoch": 0.6884881909502748, + "grad_norm": 1.6618313789367676, + "learning_rate": 4.6719297603006994e-05, + "loss": 1.2072, + "step": 19225 + }, + { + "epoch": 0.6885240030798432, + "grad_norm": 1.786848783493042, + "learning_rate": 4.6709482450169275e-05, + "loss": 1.1137, + "step": 19226 + }, + { + "epoch": 0.6885598152094115, + "grad_norm": 1.4959194660186768, + "learning_rate": 4.6699668014286724e-05, + "loss": 1.1954, + "step": 19227 + }, + { + "epoch": 0.6885956273389797, + "grad_norm": 1.2924184799194336, + "learning_rate": 4.668985429549143e-05, + "loss": 1.2269, + "step": 19228 + }, + { + "epoch": 0.688631439468548, + "grad_norm": 1.2547820806503296, + "learning_rate": 4.6680041293915336e-05, + "loss": 0.9614, + "step": 19229 + }, + { + "epoch": 0.6886672515981163, + "grad_norm": 1.177778720855713, + "learning_rate": 4.6670229009690516e-05, + "loss": 1.0797, + "step": 19230 + }, + { + "epoch": 0.6887030637276845, + "grad_norm": 1.9995381832122803, + "learning_rate": 4.666041744294898e-05, + "loss": 1.2802, + "step": 19231 + }, + { + "epoch": 0.6887388758572528, + "grad_norm": 1.4061707258224487, + "learning_rate": 4.665060659382274e-05, + "loss": 0.9146, + "step": 19232 + }, + { + "epoch": 0.6887746879868212, + "grad_norm": 2.2549071311950684, + "learning_rate": 4.664079646244376e-05, + "loss": 1.0399, + "step": 19233 + }, + { + "epoch": 0.6888105001163894, + "grad_norm": 1.2854585647583008, + "learning_rate": 4.663098704894402e-05, + "loss": 1.1542, + "step": 19234 + }, + { + "epoch": 0.6888463122459577, + "grad_norm": 1.4289201498031616, + "learning_rate": 4.662117835345552e-05, + "loss": 1.1251, + "step": 19235 + }, + { + "epoch": 0.688882124375526, + "grad_norm": 1.5167896747589111, + "learning_rate": 4.661137037611024e-05, + "loss": 0.9378, + "step": 19236 + }, + { + "epoch": 0.6889179365050943, + "grad_norm": 1.4607034921646118, + "learning_rate": 4.660156311704007e-05, + "loss": 1.3398, + "step": 19237 + }, + { + "epoch": 0.6889537486346625, + "grad_norm": 1.5534758567810059, + "learning_rate": 4.659175657637699e-05, + "loss": 0.9246, + "step": 19238 + }, + { + "epoch": 0.6889895607642308, + "grad_norm": 1.7552015781402588, + "learning_rate": 4.658195075425297e-05, + "loss": 0.936, + "step": 19239 + }, + { + "epoch": 0.6890253728937992, + "grad_norm": 1.2369990348815918, + "learning_rate": 4.657214565079986e-05, + "loss": 1.1157, + "step": 19240 + }, + { + "epoch": 0.6890611850233674, + "grad_norm": 1.9329519271850586, + "learning_rate": 4.656234126614961e-05, + "loss": 1.2277, + "step": 19241 + }, + { + "epoch": 0.6890969971529357, + "grad_norm": 1.2126009464263916, + "learning_rate": 4.655253760043413e-05, + "loss": 1.1234, + "step": 19242 + }, + { + "epoch": 0.689132809282504, + "grad_norm": 1.2901394367218018, + "learning_rate": 4.654273465378536e-05, + "loss": 0.935, + "step": 19243 + }, + { + "epoch": 0.6891686214120722, + "grad_norm": 1.4700840711593628, + "learning_rate": 4.65329324263351e-05, + "loss": 0.9429, + "step": 19244 + }, + { + "epoch": 0.6892044335416405, + "grad_norm": 1.2670001983642578, + "learning_rate": 4.652313091821526e-05, + "loss": 1.0302, + "step": 19245 + }, + { + "epoch": 0.6892402456712088, + "grad_norm": 1.1746209859848022, + "learning_rate": 4.651333012955775e-05, + "loss": 1.1594, + "step": 19246 + }, + { + "epoch": 0.6892760578007772, + "grad_norm": 1.3423157930374146, + "learning_rate": 4.650353006049436e-05, + "loss": 1.1278, + "step": 19247 + }, + { + "epoch": 0.6893118699303454, + "grad_norm": 1.8354429006576538, + "learning_rate": 4.649373071115697e-05, + "loss": 1.1054, + "step": 19248 + }, + { + "epoch": 0.6893476820599137, + "grad_norm": 1.3257935047149658, + "learning_rate": 4.6483932081677407e-05, + "loss": 0.9663, + "step": 19249 + }, + { + "epoch": 0.689383494189482, + "grad_norm": 1.290063500404358, + "learning_rate": 4.647413417218756e-05, + "loss": 0.7977, + "step": 19250 + }, + { + "epoch": 0.6894193063190502, + "grad_norm": 1.3853764533996582, + "learning_rate": 4.646433698281913e-05, + "loss": 1.1174, + "step": 19251 + }, + { + "epoch": 0.6894551184486185, + "grad_norm": 1.3864318132400513, + "learning_rate": 4.645454051370401e-05, + "loss": 0.9347, + "step": 19252 + }, + { + "epoch": 0.6894909305781868, + "grad_norm": 1.4032678604125977, + "learning_rate": 4.644474476497397e-05, + "loss": 1.0486, + "step": 19253 + }, + { + "epoch": 0.6895267427077552, + "grad_norm": 1.5500068664550781, + "learning_rate": 4.6434949736760844e-05, + "loss": 1.1608, + "step": 19254 + }, + { + "epoch": 0.6895625548373234, + "grad_norm": 1.7818868160247803, + "learning_rate": 4.642515542919635e-05, + "loss": 1.1121, + "step": 19255 + }, + { + "epoch": 0.6895983669668917, + "grad_norm": 1.8414943218231201, + "learning_rate": 4.641536184241228e-05, + "loss": 1.0857, + "step": 19256 + }, + { + "epoch": 0.68963417909646, + "grad_norm": 2.0648446083068848, + "learning_rate": 4.640556897654042e-05, + "loss": 1.1018, + "step": 19257 + }, + { + "epoch": 0.6896699912260282, + "grad_norm": 1.652227759361267, + "learning_rate": 4.639577683171248e-05, + "loss": 1.2472, + "step": 19258 + }, + { + "epoch": 0.6897058033555965, + "grad_norm": 1.3451682329177856, + "learning_rate": 4.638598540806021e-05, + "loss": 1.0056, + "step": 19259 + }, + { + "epoch": 0.6897416154851648, + "grad_norm": 1.579386591911316, + "learning_rate": 4.637619470571535e-05, + "loss": 1.2655, + "step": 19260 + }, + { + "epoch": 0.6897774276147332, + "grad_norm": 1.4569274187088013, + "learning_rate": 4.636640472480965e-05, + "loss": 1.1105, + "step": 19261 + }, + { + "epoch": 0.6898132397443014, + "grad_norm": 1.866028904914856, + "learning_rate": 4.635661546547476e-05, + "loss": 1.2525, + "step": 19262 + }, + { + "epoch": 0.6898490518738697, + "grad_norm": 1.3681896924972534, + "learning_rate": 4.634682692784241e-05, + "loss": 0.8921, + "step": 19263 + }, + { + "epoch": 0.689884864003438, + "grad_norm": 1.1308488845825195, + "learning_rate": 4.6337039112044346e-05, + "loss": 0.7496, + "step": 19264 + }, + { + "epoch": 0.6899206761330062, + "grad_norm": 1.2421830892562866, + "learning_rate": 4.632725201821215e-05, + "loss": 0.7421, + "step": 19265 + }, + { + "epoch": 0.6899564882625745, + "grad_norm": 1.3427413702011108, + "learning_rate": 4.6317465646477584e-05, + "loss": 0.9424, + "step": 19266 + }, + { + "epoch": 0.6899923003921428, + "grad_norm": 1.2528512477874756, + "learning_rate": 4.6307679996972205e-05, + "loss": 1.0012, + "step": 19267 + }, + { + "epoch": 0.6900281125217111, + "grad_norm": 1.4026352167129517, + "learning_rate": 4.62978950698278e-05, + "loss": 1.0493, + "step": 19268 + }, + { + "epoch": 0.6900639246512794, + "grad_norm": 1.4991685152053833, + "learning_rate": 4.6288110865175914e-05, + "loss": 1.0035, + "step": 19269 + }, + { + "epoch": 0.6900997367808477, + "grad_norm": 1.3362778425216675, + "learning_rate": 4.627832738314821e-05, + "loss": 1.0825, + "step": 19270 + }, + { + "epoch": 0.690135548910416, + "grad_norm": 1.2332228422164917, + "learning_rate": 4.6268544623876364e-05, + "loss": 1.0223, + "step": 19271 + }, + { + "epoch": 0.6901713610399842, + "grad_norm": 1.780925989151001, + "learning_rate": 4.625876258749189e-05, + "loss": 1.0655, + "step": 19272 + }, + { + "epoch": 0.6902071731695525, + "grad_norm": 1.5014253854751587, + "learning_rate": 4.624898127412649e-05, + "loss": 1.1242, + "step": 19273 + }, + { + "epoch": 0.6902429852991208, + "grad_norm": 2.0621819496154785, + "learning_rate": 4.623920068391163e-05, + "loss": 1.001, + "step": 19274 + }, + { + "epoch": 0.6902787974286891, + "grad_norm": 1.508083701133728, + "learning_rate": 4.622942081697906e-05, + "loss": 1.1226, + "step": 19275 + }, + { + "epoch": 0.6903146095582574, + "grad_norm": 1.4265427589416504, + "learning_rate": 4.6219641673460236e-05, + "loss": 0.9367, + "step": 19276 + }, + { + "epoch": 0.6903504216878257, + "grad_norm": 1.6382427215576172, + "learning_rate": 4.62098632534868e-05, + "loss": 1.0315, + "step": 19277 + }, + { + "epoch": 0.690386233817394, + "grad_norm": 1.3508721590042114, + "learning_rate": 4.620008555719019e-05, + "loss": 1.0757, + "step": 19278 + }, + { + "epoch": 0.6904220459469622, + "grad_norm": 1.5963983535766602, + "learning_rate": 4.619030858470211e-05, + "loss": 1.1435, + "step": 19279 + }, + { + "epoch": 0.6904578580765305, + "grad_norm": 1.1350219249725342, + "learning_rate": 4.6180532336154014e-05, + "loss": 0.8464, + "step": 19280 + }, + { + "epoch": 0.6904936702060988, + "grad_norm": 1.4710991382598877, + "learning_rate": 4.617075681167736e-05, + "loss": 1.0512, + "step": 19281 + }, + { + "epoch": 0.6905294823356671, + "grad_norm": 1.5219722986221313, + "learning_rate": 4.616098201140382e-05, + "loss": 1.1124, + "step": 19282 + }, + { + "epoch": 0.6905652944652354, + "grad_norm": 1.3436062335968018, + "learning_rate": 4.615120793546478e-05, + "loss": 1.0224, + "step": 19283 + }, + { + "epoch": 0.6906011065948037, + "grad_norm": 1.645815372467041, + "learning_rate": 4.6141434583991803e-05, + "loss": 1.1314, + "step": 19284 + }, + { + "epoch": 0.6906369187243719, + "grad_norm": 1.303968071937561, + "learning_rate": 4.613166195711629e-05, + "loss": 1.0537, + "step": 19285 + }, + { + "epoch": 0.6906727308539402, + "grad_norm": 1.5747427940368652, + "learning_rate": 4.612189005496985e-05, + "loss": 1.0281, + "step": 19286 + }, + { + "epoch": 0.6907085429835085, + "grad_norm": 1.1995570659637451, + "learning_rate": 4.611211887768384e-05, + "loss": 1.031, + "step": 19287 + }, + { + "epoch": 0.6907443551130767, + "grad_norm": 1.4982750415802002, + "learning_rate": 4.6102348425389804e-05, + "loss": 0.933, + "step": 19288 + }, + { + "epoch": 0.6907801672426451, + "grad_norm": 1.5621346235275269, + "learning_rate": 4.609257869821911e-05, + "loss": 0.9185, + "step": 19289 + }, + { + "epoch": 0.6908159793722134, + "grad_norm": 1.552982211112976, + "learning_rate": 4.608280969630323e-05, + "loss": 1.321, + "step": 19290 + }, + { + "epoch": 0.6908517915017817, + "grad_norm": 1.7749738693237305, + "learning_rate": 4.6073041419773635e-05, + "loss": 0.9627, + "step": 19291 + }, + { + "epoch": 0.6908876036313499, + "grad_norm": 1.521122694015503, + "learning_rate": 4.606327386876167e-05, + "loss": 1.0803, + "step": 19292 + }, + { + "epoch": 0.6909234157609182, + "grad_norm": 1.4514652490615845, + "learning_rate": 4.605350704339879e-05, + "loss": 1.0836, + "step": 19293 + }, + { + "epoch": 0.6909592278904865, + "grad_norm": 1.6863712072372437, + "learning_rate": 4.604374094381637e-05, + "loss": 1.2344, + "step": 19294 + }, + { + "epoch": 0.6909950400200547, + "grad_norm": 1.65874183177948, + "learning_rate": 4.603397557014587e-05, + "loss": 1.1269, + "step": 19295 + }, + { + "epoch": 0.6910308521496231, + "grad_norm": 1.6167696714401245, + "learning_rate": 4.602421092251854e-05, + "loss": 1.0415, + "step": 19296 + }, + { + "epoch": 0.6910666642791914, + "grad_norm": 1.5035051107406616, + "learning_rate": 4.60144470010659e-05, + "loss": 1.4139, + "step": 19297 + }, + { + "epoch": 0.6911024764087597, + "grad_norm": 1.4994930028915405, + "learning_rate": 4.600468380591923e-05, + "loss": 1.085, + "step": 19298 + }, + { + "epoch": 0.6911382885383279, + "grad_norm": 1.6580450534820557, + "learning_rate": 4.599492133720986e-05, + "loss": 0.9781, + "step": 19299 + }, + { + "epoch": 0.6911741006678962, + "grad_norm": 2.192168951034546, + "learning_rate": 4.598515959506917e-05, + "loss": 1.0543, + "step": 19300 + }, + { + "epoch": 0.6912099127974645, + "grad_norm": 1.4416550397872925, + "learning_rate": 4.597539857962848e-05, + "loss": 1.0231, + "step": 19301 + }, + { + "epoch": 0.6912457249270327, + "grad_norm": 1.2839525938034058, + "learning_rate": 4.5965638291019145e-05, + "loss": 1.1122, + "step": 19302 + }, + { + "epoch": 0.6912815370566011, + "grad_norm": 1.5194194316864014, + "learning_rate": 4.595587872937241e-05, + "loss": 1.225, + "step": 19303 + }, + { + "epoch": 0.6913173491861694, + "grad_norm": 1.326805830001831, + "learning_rate": 4.594611989481963e-05, + "loss": 0.8725, + "step": 19304 + }, + { + "epoch": 0.6913531613157377, + "grad_norm": 1.2696648836135864, + "learning_rate": 4.593636178749206e-05, + "loss": 1.19, + "step": 19305 + }, + { + "epoch": 0.6913889734453059, + "grad_norm": 1.387256145477295, + "learning_rate": 4.592660440752107e-05, + "loss": 1.0078, + "step": 19306 + }, + { + "epoch": 0.6914247855748742, + "grad_norm": 1.4825160503387451, + "learning_rate": 4.5916847755037806e-05, + "loss": 1.0517, + "step": 19307 + }, + { + "epoch": 0.6914605977044425, + "grad_norm": 1.8684799671173096, + "learning_rate": 4.590709183017361e-05, + "loss": 1.2268, + "step": 19308 + }, + { + "epoch": 0.6914964098340107, + "grad_norm": 1.3186171054840088, + "learning_rate": 4.5897336633059737e-05, + "loss": 1.0294, + "step": 19309 + }, + { + "epoch": 0.6915322219635791, + "grad_norm": 2.4061267375946045, + "learning_rate": 4.588758216382739e-05, + "loss": 1.2755, + "step": 19310 + }, + { + "epoch": 0.6915680340931474, + "grad_norm": 1.8978019952774048, + "learning_rate": 4.5877828422607824e-05, + "loss": 1.3282, + "step": 19311 + }, + { + "epoch": 0.6916038462227156, + "grad_norm": 1.7061275243759155, + "learning_rate": 4.5868075409532265e-05, + "loss": 0.9832, + "step": 19312 + }, + { + "epoch": 0.6916396583522839, + "grad_norm": 1.200205683708191, + "learning_rate": 4.585832312473196e-05, + "loss": 1.1723, + "step": 19313 + }, + { + "epoch": 0.6916754704818522, + "grad_norm": 1.6760982275009155, + "learning_rate": 4.584857156833804e-05, + "loss": 1.1873, + "step": 19314 + }, + { + "epoch": 0.6917112826114205, + "grad_norm": 1.9557958841323853, + "learning_rate": 4.583882074048174e-05, + "loss": 0.9646, + "step": 19315 + }, + { + "epoch": 0.6917470947409887, + "grad_norm": 1.2613691091537476, + "learning_rate": 4.582907064129428e-05, + "loss": 1.0374, + "step": 19316 + }, + { + "epoch": 0.6917829068705571, + "grad_norm": 1.5590927600860596, + "learning_rate": 4.5819321270906765e-05, + "loss": 1.1273, + "step": 19317 + }, + { + "epoch": 0.6918187190001254, + "grad_norm": 1.5656025409698486, + "learning_rate": 4.580957262945039e-05, + "loss": 0.9972, + "step": 19318 + }, + { + "epoch": 0.6918545311296936, + "grad_norm": 2.076612949371338, + "learning_rate": 4.5799824717056325e-05, + "loss": 1.3077, + "step": 19319 + }, + { + "epoch": 0.6918903432592619, + "grad_norm": 1.047538161277771, + "learning_rate": 4.579007753385573e-05, + "loss": 1.0613, + "step": 19320 + }, + { + "epoch": 0.6919261553888302, + "grad_norm": 1.4809952974319458, + "learning_rate": 4.578033107997969e-05, + "loss": 1.1674, + "step": 19321 + }, + { + "epoch": 0.6919619675183984, + "grad_norm": 1.3402507305145264, + "learning_rate": 4.577058535555935e-05, + "loss": 1.167, + "step": 19322 + }, + { + "epoch": 0.6919977796479667, + "grad_norm": 1.531411051750183, + "learning_rate": 4.576084036072584e-05, + "loss": 1.027, + "step": 19323 + }, + { + "epoch": 0.6920335917775351, + "grad_norm": 1.5957794189453125, + "learning_rate": 4.575109609561029e-05, + "loss": 1.1099, + "step": 19324 + }, + { + "epoch": 0.6920694039071034, + "grad_norm": 1.427066683769226, + "learning_rate": 4.5741352560343734e-05, + "loss": 1.0661, + "step": 19325 + }, + { + "epoch": 0.6921052160366716, + "grad_norm": 1.485115885734558, + "learning_rate": 4.5731609755057284e-05, + "loss": 1.2196, + "step": 19326 + }, + { + "epoch": 0.6921410281662399, + "grad_norm": 2.2965118885040283, + "learning_rate": 4.572186767988206e-05, + "loss": 1.1826, + "step": 19327 + }, + { + "epoch": 0.6921768402958082, + "grad_norm": 1.8453965187072754, + "learning_rate": 4.571212633494906e-05, + "loss": 1.4324, + "step": 19328 + }, + { + "epoch": 0.6922126524253764, + "grad_norm": 1.8722498416900635, + "learning_rate": 4.5702385720389376e-05, + "loss": 1.0503, + "step": 19329 + }, + { + "epoch": 0.6922484645549447, + "grad_norm": 1.5218205451965332, + "learning_rate": 4.569264583633405e-05, + "loss": 1.0334, + "step": 19330 + }, + { + "epoch": 0.6922842766845131, + "grad_norm": 1.3406298160552979, + "learning_rate": 4.568290668291416e-05, + "loss": 0.9716, + "step": 19331 + }, + { + "epoch": 0.6923200888140814, + "grad_norm": 1.3085925579071045, + "learning_rate": 4.567316826026066e-05, + "loss": 0.8857, + "step": 19332 + }, + { + "epoch": 0.6923559009436496, + "grad_norm": 1.462762713432312, + "learning_rate": 4.5663430568504603e-05, + "loss": 1.0902, + "step": 19333 + }, + { + "epoch": 0.6923917130732179, + "grad_norm": 1.3307106494903564, + "learning_rate": 4.565369360777704e-05, + "loss": 1.1581, + "step": 19334 + }, + { + "epoch": 0.6924275252027862, + "grad_norm": 1.6632307767868042, + "learning_rate": 4.564395737820888e-05, + "loss": 1.3085, + "step": 19335 + }, + { + "epoch": 0.6924633373323544, + "grad_norm": 1.6171245574951172, + "learning_rate": 4.563422187993117e-05, + "loss": 1.0284, + "step": 19336 + }, + { + "epoch": 0.6924991494619227, + "grad_norm": 1.457824945449829, + "learning_rate": 4.5624487113074874e-05, + "loss": 1.0726, + "step": 19337 + }, + { + "epoch": 0.6925349615914911, + "grad_norm": 1.5783331394195557, + "learning_rate": 4.5614753077771e-05, + "loss": 1.2372, + "step": 19338 + }, + { + "epoch": 0.6925707737210594, + "grad_norm": 1.3821158409118652, + "learning_rate": 4.560501977415044e-05, + "loss": 1.2121, + "step": 19339 + }, + { + "epoch": 0.6926065858506276, + "grad_norm": 1.726990818977356, + "learning_rate": 4.5595287202344175e-05, + "loss": 1.1676, + "step": 19340 + }, + { + "epoch": 0.6926423979801959, + "grad_norm": 1.3547080755233765, + "learning_rate": 4.558555536248313e-05, + "loss": 1.2137, + "step": 19341 + }, + { + "epoch": 0.6926782101097642, + "grad_norm": 1.3664941787719727, + "learning_rate": 4.55758242546983e-05, + "loss": 1.0691, + "step": 19342 + }, + { + "epoch": 0.6927140222393324, + "grad_norm": 1.4494554996490479, + "learning_rate": 4.5566093879120505e-05, + "loss": 1.0142, + "step": 19343 + }, + { + "epoch": 0.6927498343689007, + "grad_norm": 1.5307806730270386, + "learning_rate": 4.555636423588071e-05, + "loss": 1.0971, + "step": 19344 + }, + { + "epoch": 0.6927856464984691, + "grad_norm": 1.4545294046401978, + "learning_rate": 4.5546635325109844e-05, + "loss": 0.8657, + "step": 19345 + }, + { + "epoch": 0.6928214586280373, + "grad_norm": 1.3278801441192627, + "learning_rate": 4.553690714693872e-05, + "loss": 1.0759, + "step": 19346 + }, + { + "epoch": 0.6928572707576056, + "grad_norm": 1.6947550773620605, + "learning_rate": 4.5527179701498256e-05, + "loss": 1.0863, + "step": 19347 + }, + { + "epoch": 0.6928930828871739, + "grad_norm": 1.5879809856414795, + "learning_rate": 4.551745298891933e-05, + "loss": 1.016, + "step": 19348 + }, + { + "epoch": 0.6929288950167422, + "grad_norm": 1.6985434293746948, + "learning_rate": 4.5507727009332824e-05, + "loss": 0.9645, + "step": 19349 + }, + { + "epoch": 0.6929647071463104, + "grad_norm": 1.7676891088485718, + "learning_rate": 4.549800176286954e-05, + "loss": 1.1824, + "step": 19350 + }, + { + "epoch": 0.6930005192758787, + "grad_norm": 1.9324469566345215, + "learning_rate": 4.5488277249660325e-05, + "loss": 1.2874, + "step": 19351 + }, + { + "epoch": 0.6930363314054471, + "grad_norm": 1.7021729946136475, + "learning_rate": 4.5478553469836064e-05, + "loss": 1.0249, + "step": 19352 + }, + { + "epoch": 0.6930721435350153, + "grad_norm": 1.723177194595337, + "learning_rate": 4.546883042352751e-05, + "loss": 1.1865, + "step": 19353 + }, + { + "epoch": 0.6931079556645836, + "grad_norm": 1.8957194089889526, + "learning_rate": 4.545910811086549e-05, + "loss": 1.0832, + "step": 19354 + }, + { + "epoch": 0.6931437677941519, + "grad_norm": 1.3579533100128174, + "learning_rate": 4.544938653198082e-05, + "loss": 1.0907, + "step": 19355 + }, + { + "epoch": 0.6931795799237201, + "grad_norm": 1.5238747596740723, + "learning_rate": 4.543966568700433e-05, + "loss": 1.1345, + "step": 19356 + }, + { + "epoch": 0.6932153920532884, + "grad_norm": 1.5204659700393677, + "learning_rate": 4.542994557606672e-05, + "loss": 1.3377, + "step": 19357 + }, + { + "epoch": 0.6932512041828567, + "grad_norm": 1.1854610443115234, + "learning_rate": 4.542022619929881e-05, + "loss": 1.1411, + "step": 19358 + }, + { + "epoch": 0.6932870163124251, + "grad_norm": 1.2292548418045044, + "learning_rate": 4.541050755683135e-05, + "loss": 1.1142, + "step": 19359 + }, + { + "epoch": 0.6933228284419933, + "grad_norm": 1.7669585943222046, + "learning_rate": 4.5400789648795136e-05, + "loss": 1.1783, + "step": 19360 + }, + { + "epoch": 0.6933586405715616, + "grad_norm": 1.821512222290039, + "learning_rate": 4.539107247532086e-05, + "loss": 1.2992, + "step": 19361 + }, + { + "epoch": 0.6933944527011299, + "grad_norm": 1.6200157403945923, + "learning_rate": 4.5381356036539204e-05, + "loss": 1.0378, + "step": 19362 + }, + { + "epoch": 0.6934302648306981, + "grad_norm": 1.6562391519546509, + "learning_rate": 4.537164033258101e-05, + "loss": 1.1608, + "step": 19363 + }, + { + "epoch": 0.6934660769602664, + "grad_norm": 1.5021997690200806, + "learning_rate": 4.53619253635769e-05, + "loss": 1.1965, + "step": 19364 + }, + { + "epoch": 0.6935018890898347, + "grad_norm": 1.2842015027999878, + "learning_rate": 4.5352211129657596e-05, + "loss": 1.0506, + "step": 19365 + }, + { + "epoch": 0.6935377012194031, + "grad_norm": 1.4354788064956665, + "learning_rate": 4.5342497630953806e-05, + "loss": 1.1831, + "step": 19366 + }, + { + "epoch": 0.6935735133489713, + "grad_norm": 1.3009402751922607, + "learning_rate": 4.533278486759625e-05, + "loss": 0.9897, + "step": 19367 + }, + { + "epoch": 0.6936093254785396, + "grad_norm": 1.6655343770980835, + "learning_rate": 4.5323072839715555e-05, + "loss": 1.0438, + "step": 19368 + }, + { + "epoch": 0.6936451376081079, + "grad_norm": 1.5668153762817383, + "learning_rate": 4.531336154744231e-05, + "loss": 1.081, + "step": 19369 + }, + { + "epoch": 0.6936809497376761, + "grad_norm": 1.4079563617706299, + "learning_rate": 4.530365099090732e-05, + "loss": 1.1536, + "step": 19370 + }, + { + "epoch": 0.6937167618672444, + "grad_norm": 1.4319044351577759, + "learning_rate": 4.5293941170241116e-05, + "loss": 1.1765, + "step": 19371 + }, + { + "epoch": 0.6937525739968127, + "grad_norm": 1.8802821636199951, + "learning_rate": 4.528423208557441e-05, + "loss": 1.1075, + "step": 19372 + }, + { + "epoch": 0.693788386126381, + "grad_norm": 1.3785457611083984, + "learning_rate": 4.52745237370377e-05, + "loss": 1.0376, + "step": 19373 + }, + { + "epoch": 0.6938241982559493, + "grad_norm": 1.1537599563598633, + "learning_rate": 4.526481612476176e-05, + "loss": 0.9482, + "step": 19374 + }, + { + "epoch": 0.6938600103855176, + "grad_norm": 1.5577391386032104, + "learning_rate": 4.525510924887707e-05, + "loss": 1.2624, + "step": 19375 + }, + { + "epoch": 0.6938958225150859, + "grad_norm": 1.534879207611084, + "learning_rate": 4.524540310951432e-05, + "loss": 1.063, + "step": 19376 + }, + { + "epoch": 0.6939316346446541, + "grad_norm": 1.904039978981018, + "learning_rate": 4.5235697706804e-05, + "loss": 1.1323, + "step": 19377 + }, + { + "epoch": 0.6939674467742224, + "grad_norm": 1.4763977527618408, + "learning_rate": 4.5225993040876724e-05, + "loss": 1.2266, + "step": 19378 + }, + { + "epoch": 0.6940032589037907, + "grad_norm": 1.6407947540283203, + "learning_rate": 4.521628911186311e-05, + "loss": 1.1667, + "step": 19379 + }, + { + "epoch": 0.694039071033359, + "grad_norm": 1.3711222410202026, + "learning_rate": 4.5206585919893563e-05, + "loss": 1.1492, + "step": 19380 + }, + { + "epoch": 0.6940748831629273, + "grad_norm": 2.144192934036255, + "learning_rate": 4.519688346509881e-05, + "loss": 1.4921, + "step": 19381 + }, + { + "epoch": 0.6941106952924956, + "grad_norm": 1.4667572975158691, + "learning_rate": 4.5187181747609265e-05, + "loss": 1.1413, + "step": 19382 + }, + { + "epoch": 0.6941465074220639, + "grad_norm": 1.1410809755325317, + "learning_rate": 4.5177480767555525e-05, + "loss": 1.0875, + "step": 19383 + }, + { + "epoch": 0.6941823195516321, + "grad_norm": 1.4901574850082397, + "learning_rate": 4.516778052506798e-05, + "loss": 1.096, + "step": 19384 + }, + { + "epoch": 0.6942181316812004, + "grad_norm": 2.2640812397003174, + "learning_rate": 4.5158081020277296e-05, + "loss": 1.2132, + "step": 19385 + }, + { + "epoch": 0.6942539438107687, + "grad_norm": 1.3988977670669556, + "learning_rate": 4.5148382253313904e-05, + "loss": 1.0395, + "step": 19386 + }, + { + "epoch": 0.694289755940337, + "grad_norm": 1.4725074768066406, + "learning_rate": 4.513868422430823e-05, + "loss": 1.0523, + "step": 19387 + }, + { + "epoch": 0.6943255680699053, + "grad_norm": 1.7837117910385132, + "learning_rate": 4.5128986933390785e-05, + "loss": 0.9833, + "step": 19388 + }, + { + "epoch": 0.6943613801994736, + "grad_norm": 1.5758317708969116, + "learning_rate": 4.5119290380692046e-05, + "loss": 1.1497, + "step": 19389 + }, + { + "epoch": 0.6943971923290418, + "grad_norm": 1.3726311922073364, + "learning_rate": 4.5109594566342515e-05, + "loss": 1.1666, + "step": 19390 + }, + { + "epoch": 0.6944330044586101, + "grad_norm": 1.370651125907898, + "learning_rate": 4.50998994904725e-05, + "loss": 0.9035, + "step": 19391 + }, + { + "epoch": 0.6944688165881784, + "grad_norm": 1.2438929080963135, + "learning_rate": 4.50902051532126e-05, + "loss": 0.9015, + "step": 19392 + }, + { + "epoch": 0.6945046287177467, + "grad_norm": 3.19736647605896, + "learning_rate": 4.508051155469312e-05, + "loss": 1.3886, + "step": 19393 + }, + { + "epoch": 0.694540440847315, + "grad_norm": 1.2489383220672607, + "learning_rate": 4.5070818695044545e-05, + "loss": 1.0448, + "step": 19394 + }, + { + "epoch": 0.6945762529768833, + "grad_norm": 1.2968353033065796, + "learning_rate": 4.5061126574397225e-05, + "loss": 0.9089, + "step": 19395 + }, + { + "epoch": 0.6946120651064516, + "grad_norm": 1.1894183158874512, + "learning_rate": 4.505143519288159e-05, + "loss": 1.0104, + "step": 19396 + }, + { + "epoch": 0.6946478772360198, + "grad_norm": 1.4300931692123413, + "learning_rate": 4.504174455062803e-05, + "loss": 1.0518, + "step": 19397 + }, + { + "epoch": 0.6946836893655881, + "grad_norm": 1.4056506156921387, + "learning_rate": 4.503205464776689e-05, + "loss": 0.9446, + "step": 19398 + }, + { + "epoch": 0.6947195014951564, + "grad_norm": 1.4788559675216675, + "learning_rate": 4.502236548442853e-05, + "loss": 0.9437, + "step": 19399 + }, + { + "epoch": 0.6947553136247246, + "grad_norm": 1.2039536237716675, + "learning_rate": 4.501267706074335e-05, + "loss": 0.7686, + "step": 19400 + }, + { + "epoch": 0.694791125754293, + "grad_norm": 1.232452392578125, + "learning_rate": 4.5002989376841684e-05, + "loss": 0.9607, + "step": 19401 + }, + { + "epoch": 0.6948269378838613, + "grad_norm": 1.7734299898147583, + "learning_rate": 4.499330243285383e-05, + "loss": 1.2125, + "step": 19402 + }, + { + "epoch": 0.6948627500134296, + "grad_norm": 1.231917381286621, + "learning_rate": 4.4983616228910144e-05, + "loss": 1.0969, + "step": 19403 + }, + { + "epoch": 0.6948985621429978, + "grad_norm": 2.3687193393707275, + "learning_rate": 4.497393076514097e-05, + "loss": 0.9936, + "step": 19404 + }, + { + "epoch": 0.6949343742725661, + "grad_norm": 1.3524892330169678, + "learning_rate": 4.496424604167654e-05, + "loss": 1.2771, + "step": 19405 + }, + { + "epoch": 0.6949701864021344, + "grad_norm": 1.442974328994751, + "learning_rate": 4.4954562058647187e-05, + "loss": 1.0738, + "step": 19406 + }, + { + "epoch": 0.6950059985317026, + "grad_norm": 1.5272035598754883, + "learning_rate": 4.49448788161832e-05, + "loss": 1.176, + "step": 19407 + }, + { + "epoch": 0.695041810661271, + "grad_norm": 2.44109845161438, + "learning_rate": 4.493519631441488e-05, + "loss": 1.2972, + "step": 19408 + }, + { + "epoch": 0.6950776227908393, + "grad_norm": 1.3912558555603027, + "learning_rate": 4.492551455347245e-05, + "loss": 1.134, + "step": 19409 + }, + { + "epoch": 0.6951134349204076, + "grad_norm": 1.4845978021621704, + "learning_rate": 4.491583353348616e-05, + "loss": 1.0728, + "step": 19410 + }, + { + "epoch": 0.6951492470499758, + "grad_norm": 1.886683464050293, + "learning_rate": 4.490615325458628e-05, + "loss": 1.306, + "step": 19411 + }, + { + "epoch": 0.6951850591795441, + "grad_norm": 1.4328762292861938, + "learning_rate": 4.4896473716903085e-05, + "loss": 1.0966, + "step": 19412 + }, + { + "epoch": 0.6952208713091124, + "grad_norm": 1.4712773561477661, + "learning_rate": 4.488679492056672e-05, + "loss": 1.0339, + "step": 19413 + }, + { + "epoch": 0.6952566834386806, + "grad_norm": 1.6179975271224976, + "learning_rate": 4.487711686570744e-05, + "loss": 1.0039, + "step": 19414 + }, + { + "epoch": 0.695292495568249, + "grad_norm": 2.178757905960083, + "learning_rate": 4.4867439552455485e-05, + "loss": 1.1591, + "step": 19415 + }, + { + "epoch": 0.6953283076978173, + "grad_norm": 1.5596941709518433, + "learning_rate": 4.4857762980940974e-05, + "loss": 1.0611, + "step": 19416 + }, + { + "epoch": 0.6953641198273856, + "grad_norm": 1.399263620376587, + "learning_rate": 4.484808715129414e-05, + "loss": 1.2442, + "step": 19417 + }, + { + "epoch": 0.6953999319569538, + "grad_norm": 1.7717127799987793, + "learning_rate": 4.483841206364514e-05, + "loss": 1.2416, + "step": 19418 + }, + { + "epoch": 0.6954357440865221, + "grad_norm": 1.3948299884796143, + "learning_rate": 4.4828737718124204e-05, + "loss": 0.9275, + "step": 19419 + }, + { + "epoch": 0.6954715562160904, + "grad_norm": 1.3967127799987793, + "learning_rate": 4.481906411486139e-05, + "loss": 0.8246, + "step": 19420 + }, + { + "epoch": 0.6955073683456586, + "grad_norm": 1.738537311553955, + "learning_rate": 4.480939125398689e-05, + "loss": 1.2359, + "step": 19421 + }, + { + "epoch": 0.695543180475227, + "grad_norm": 1.5152653455734253, + "learning_rate": 4.479971913563088e-05, + "loss": 0.941, + "step": 19422 + }, + { + "epoch": 0.6955789926047953, + "grad_norm": 1.4903638362884521, + "learning_rate": 4.4790047759923406e-05, + "loss": 1.0339, + "step": 19423 + }, + { + "epoch": 0.6956148047343635, + "grad_norm": 1.485552430152893, + "learning_rate": 4.478037712699463e-05, + "loss": 1.1544, + "step": 19424 + }, + { + "epoch": 0.6956506168639318, + "grad_norm": 1.354925274848938, + "learning_rate": 4.477070723697464e-05, + "loss": 1.046, + "step": 19425 + }, + { + "epoch": 0.6956864289935001, + "grad_norm": 1.7521857023239136, + "learning_rate": 4.476103808999359e-05, + "loss": 1.0401, + "step": 19426 + }, + { + "epoch": 0.6957222411230684, + "grad_norm": 1.3570163249969482, + "learning_rate": 4.475136968618149e-05, + "loss": 1.0562, + "step": 19427 + }, + { + "epoch": 0.6957580532526366, + "grad_norm": 1.5699983835220337, + "learning_rate": 4.474170202566843e-05, + "loss": 1.0234, + "step": 19428 + }, + { + "epoch": 0.695793865382205, + "grad_norm": 1.3866153955459595, + "learning_rate": 4.47320351085845e-05, + "loss": 0.887, + "step": 19429 + }, + { + "epoch": 0.6958296775117733, + "grad_norm": 1.5429317951202393, + "learning_rate": 4.472236893505978e-05, + "loss": 1.0719, + "step": 19430 + }, + { + "epoch": 0.6958654896413415, + "grad_norm": 1.4708932638168335, + "learning_rate": 4.471270350522424e-05, + "loss": 0.9592, + "step": 19431 + }, + { + "epoch": 0.6959013017709098, + "grad_norm": 2.1242105960845947, + "learning_rate": 4.4703038819207975e-05, + "loss": 1.149, + "step": 19432 + }, + { + "epoch": 0.6959371139004781, + "grad_norm": 1.5892211198806763, + "learning_rate": 4.4693374877141015e-05, + "loss": 1.0658, + "step": 19433 + }, + { + "epoch": 0.6959729260300463, + "grad_norm": 1.3910493850708008, + "learning_rate": 4.4683711679153325e-05, + "loss": 1.1982, + "step": 19434 + }, + { + "epoch": 0.6960087381596146, + "grad_norm": 1.5041604042053223, + "learning_rate": 4.467404922537495e-05, + "loss": 1.119, + "step": 19435 + }, + { + "epoch": 0.696044550289183, + "grad_norm": 1.4446747303009033, + "learning_rate": 4.466438751593587e-05, + "loss": 1.1486, + "step": 19436 + }, + { + "epoch": 0.6960803624187513, + "grad_norm": 1.8813412189483643, + "learning_rate": 4.465472655096611e-05, + "loss": 1.1377, + "step": 19437 + }, + { + "epoch": 0.6961161745483195, + "grad_norm": 1.9060686826705933, + "learning_rate": 4.464506633059559e-05, + "loss": 1.0323, + "step": 19438 + }, + { + "epoch": 0.6961519866778878, + "grad_norm": 1.5574668645858765, + "learning_rate": 4.463540685495429e-05, + "loss": 1.1217, + "step": 19439 + }, + { + "epoch": 0.6961877988074561, + "grad_norm": 1.2416143417358398, + "learning_rate": 4.4625748124172204e-05, + "loss": 1.1959, + "step": 19440 + }, + { + "epoch": 0.6962236109370243, + "grad_norm": 1.5328162908554077, + "learning_rate": 4.461609013837923e-05, + "loss": 0.8801, + "step": 19441 + }, + { + "epoch": 0.6962594230665926, + "grad_norm": 1.7531434297561646, + "learning_rate": 4.460643289770532e-05, + "loss": 1.2506, + "step": 19442 + }, + { + "epoch": 0.696295235196161, + "grad_norm": 1.6778829097747803, + "learning_rate": 4.4596776402280396e-05, + "loss": 1.1236, + "step": 19443 + }, + { + "epoch": 0.6963310473257293, + "grad_norm": 1.2737091779708862, + "learning_rate": 4.458712065223442e-05, + "loss": 1.124, + "step": 19444 + }, + { + "epoch": 0.6963668594552975, + "grad_norm": 2.1609888076782227, + "learning_rate": 4.4577465647697223e-05, + "loss": 1.0405, + "step": 19445 + }, + { + "epoch": 0.6964026715848658, + "grad_norm": 1.3402711153030396, + "learning_rate": 4.456781138879873e-05, + "loss": 0.8492, + "step": 19446 + }, + { + "epoch": 0.6964384837144341, + "grad_norm": 1.6682100296020508, + "learning_rate": 4.455815787566884e-05, + "loss": 1.3694, + "step": 19447 + }, + { + "epoch": 0.6964742958440023, + "grad_norm": 1.367397427558899, + "learning_rate": 4.454850510843745e-05, + "loss": 1.0929, + "step": 19448 + }, + { + "epoch": 0.6965101079735706, + "grad_norm": 1.9245045185089111, + "learning_rate": 4.45388530872344e-05, + "loss": 1.0223, + "step": 19449 + }, + { + "epoch": 0.696545920103139, + "grad_norm": 1.7976493835449219, + "learning_rate": 4.452920181218947e-05, + "loss": 1.2391, + "step": 19450 + }, + { + "epoch": 0.6965817322327073, + "grad_norm": 1.4790040254592896, + "learning_rate": 4.451955128343266e-05, + "loss": 1.1616, + "step": 19451 + }, + { + "epoch": 0.6966175443622755, + "grad_norm": 1.556089162826538, + "learning_rate": 4.450990150109367e-05, + "loss": 1.127, + "step": 19452 + }, + { + "epoch": 0.6966533564918438, + "grad_norm": 1.6550817489624023, + "learning_rate": 4.4500252465302384e-05, + "loss": 1.2187, + "step": 19453 + }, + { + "epoch": 0.6966891686214121, + "grad_norm": 1.342861294746399, + "learning_rate": 4.449060417618861e-05, + "loss": 1.1208, + "step": 19454 + }, + { + "epoch": 0.6967249807509803, + "grad_norm": 1.1856237649917603, + "learning_rate": 4.44809566338822e-05, + "loss": 1.0846, + "step": 19455 + }, + { + "epoch": 0.6967607928805486, + "grad_norm": 1.2165741920471191, + "learning_rate": 4.447130983851285e-05, + "loss": 1.1542, + "step": 19456 + }, + { + "epoch": 0.696796605010117, + "grad_norm": 1.2638341188430786, + "learning_rate": 4.446166379021042e-05, + "loss": 1.126, + "step": 19457 + }, + { + "epoch": 0.6968324171396852, + "grad_norm": 1.6001887321472168, + "learning_rate": 4.4452018489104684e-05, + "loss": 1.0853, + "step": 19458 + }, + { + "epoch": 0.6968682292692535, + "grad_norm": 1.4424000978469849, + "learning_rate": 4.4442373935325364e-05, + "loss": 1.0158, + "step": 19459 + }, + { + "epoch": 0.6969040413988218, + "grad_norm": 1.5878052711486816, + "learning_rate": 4.4432730129002265e-05, + "loss": 1.159, + "step": 19460 + }, + { + "epoch": 0.69693985352839, + "grad_norm": 1.7067575454711914, + "learning_rate": 4.442308707026504e-05, + "loss": 1.08, + "step": 19461 + }, + { + "epoch": 0.6969756656579583, + "grad_norm": 1.6191654205322266, + "learning_rate": 4.4413444759243564e-05, + "loss": 1.1024, + "step": 19462 + }, + { + "epoch": 0.6970114777875266, + "grad_norm": 1.872454047203064, + "learning_rate": 4.440380319606744e-05, + "loss": 1.301, + "step": 19463 + }, + { + "epoch": 0.697047289917095, + "grad_norm": 1.3651701211929321, + "learning_rate": 4.439416238086643e-05, + "loss": 1.062, + "step": 19464 + }, + { + "epoch": 0.6970831020466632, + "grad_norm": 1.2860653400421143, + "learning_rate": 4.438452231377025e-05, + "loss": 1.0529, + "step": 19465 + }, + { + "epoch": 0.6971189141762315, + "grad_norm": 1.5489336252212524, + "learning_rate": 4.4374882994908615e-05, + "loss": 1.1635, + "step": 19466 + }, + { + "epoch": 0.6971547263057998, + "grad_norm": 1.2446002960205078, + "learning_rate": 4.436524442441118e-05, + "loss": 0.8464, + "step": 19467 + }, + { + "epoch": 0.697190538435368, + "grad_norm": 1.688114881515503, + "learning_rate": 4.435560660240754e-05, + "loss": 0.9253, + "step": 19468 + }, + { + "epoch": 0.6972263505649363, + "grad_norm": 1.4895695447921753, + "learning_rate": 4.434596952902752e-05, + "loss": 1.155, + "step": 19469 + }, + { + "epoch": 0.6972621626945046, + "grad_norm": 1.4979361295700073, + "learning_rate": 4.433633320440064e-05, + "loss": 1.0526, + "step": 19470 + }, + { + "epoch": 0.697297974824073, + "grad_norm": 1.3165128231048584, + "learning_rate": 4.432669762865664e-05, + "loss": 1.2184, + "step": 19471 + }, + { + "epoch": 0.6973337869536412, + "grad_norm": 1.4068957567214966, + "learning_rate": 4.431706280192503e-05, + "loss": 0.9619, + "step": 19472 + }, + { + "epoch": 0.6973695990832095, + "grad_norm": 1.435619592666626, + "learning_rate": 4.4307428724335595e-05, + "loss": 1.0961, + "step": 19473 + }, + { + "epoch": 0.6974054112127778, + "grad_norm": 1.5011749267578125, + "learning_rate": 4.429779539601787e-05, + "loss": 1.1604, + "step": 19474 + }, + { + "epoch": 0.697441223342346, + "grad_norm": 1.77822744846344, + "learning_rate": 4.428816281710142e-05, + "loss": 1.0611, + "step": 19475 + }, + { + "epoch": 0.6974770354719143, + "grad_norm": 1.3528623580932617, + "learning_rate": 4.427853098771587e-05, + "loss": 1.2017, + "step": 19476 + }, + { + "epoch": 0.6975128476014826, + "grad_norm": 1.1560845375061035, + "learning_rate": 4.426889990799082e-05, + "loss": 1.0941, + "step": 19477 + }, + { + "epoch": 0.697548659731051, + "grad_norm": 1.491883635520935, + "learning_rate": 4.425926957805586e-05, + "loss": 0.7715, + "step": 19478 + }, + { + "epoch": 0.6975844718606192, + "grad_norm": 1.7930172681808472, + "learning_rate": 4.424963999804046e-05, + "loss": 1.0978, + "step": 19479 + }, + { + "epoch": 0.6976202839901875, + "grad_norm": 1.7214311361312866, + "learning_rate": 4.4240011168074315e-05, + "loss": 0.9725, + "step": 19480 + }, + { + "epoch": 0.6976560961197558, + "grad_norm": 1.3757647275924683, + "learning_rate": 4.423038308828685e-05, + "loss": 1.1937, + "step": 19481 + }, + { + "epoch": 0.697691908249324, + "grad_norm": 1.3002378940582275, + "learning_rate": 4.4220755758807695e-05, + "loss": 1.0403, + "step": 19482 + }, + { + "epoch": 0.6977277203788923, + "grad_norm": 1.7265310287475586, + "learning_rate": 4.421112917976628e-05, + "loss": 1.0422, + "step": 19483 + }, + { + "epoch": 0.6977635325084606, + "grad_norm": 1.406862735748291, + "learning_rate": 4.420150335129215e-05, + "loss": 0.846, + "step": 19484 + }, + { + "epoch": 0.697799344638029, + "grad_norm": 1.594080924987793, + "learning_rate": 4.419187827351485e-05, + "loss": 1.2632, + "step": 19485 + }, + { + "epoch": 0.6978351567675972, + "grad_norm": 1.4356797933578491, + "learning_rate": 4.418225394656382e-05, + "loss": 1.1647, + "step": 19486 + }, + { + "epoch": 0.6978709688971655, + "grad_norm": 2.318751573562622, + "learning_rate": 4.417263037056856e-05, + "loss": 1.1976, + "step": 19487 + }, + { + "epoch": 0.6979067810267338, + "grad_norm": 1.5910788774490356, + "learning_rate": 4.416300754565854e-05, + "loss": 1.2751, + "step": 19488 + }, + { + "epoch": 0.697942593156302, + "grad_norm": 1.2869795560836792, + "learning_rate": 4.415338547196326e-05, + "loss": 0.9877, + "step": 19489 + }, + { + "epoch": 0.6979784052858703, + "grad_norm": 1.4110326766967773, + "learning_rate": 4.414376414961208e-05, + "loss": 1.1885, + "step": 19490 + }, + { + "epoch": 0.6980142174154386, + "grad_norm": 1.4335062503814697, + "learning_rate": 4.4134143578734576e-05, + "loss": 1.0932, + "step": 19491 + }, + { + "epoch": 0.698050029545007, + "grad_norm": 1.5857707262039185, + "learning_rate": 4.41245237594601e-05, + "loss": 0.9885, + "step": 19492 + }, + { + "epoch": 0.6980858416745752, + "grad_norm": 1.8114991188049316, + "learning_rate": 4.411490469191806e-05, + "loss": 1.4581, + "step": 19493 + }, + { + "epoch": 0.6981216538041435, + "grad_norm": 1.5089457035064697, + "learning_rate": 4.4105286376237874e-05, + "loss": 1.2167, + "step": 19494 + }, + { + "epoch": 0.6981574659337118, + "grad_norm": 1.4805001020431519, + "learning_rate": 4.409566881254897e-05, + "loss": 1.119, + "step": 19495 + }, + { + "epoch": 0.69819327806328, + "grad_norm": 2.1204679012298584, + "learning_rate": 4.408605200098077e-05, + "loss": 1.3459, + "step": 19496 + }, + { + "epoch": 0.6982290901928483, + "grad_norm": 1.5872174501419067, + "learning_rate": 4.407643594166257e-05, + "loss": 1.0461, + "step": 19497 + }, + { + "epoch": 0.6982649023224166, + "grad_norm": 1.6273488998413086, + "learning_rate": 4.4066820634723805e-05, + "loss": 1.1688, + "step": 19498 + }, + { + "epoch": 0.6983007144519849, + "grad_norm": 1.30527925491333, + "learning_rate": 4.405720608029381e-05, + "loss": 1.1515, + "step": 19499 + }, + { + "epoch": 0.6983365265815532, + "grad_norm": 1.4327833652496338, + "learning_rate": 4.404759227850198e-05, + "loss": 1.0404, + "step": 19500 + }, + { + "epoch": 0.6983723387111215, + "grad_norm": 1.537561058998108, + "learning_rate": 4.403797922947759e-05, + "loss": 1.1181, + "step": 19501 + }, + { + "epoch": 0.6984081508406897, + "grad_norm": 1.463400959968567, + "learning_rate": 4.4028366933349996e-05, + "loss": 1.1125, + "step": 19502 + }, + { + "epoch": 0.698443962970258, + "grad_norm": 1.6105587482452393, + "learning_rate": 4.4018755390248566e-05, + "loss": 1.1148, + "step": 19503 + }, + { + "epoch": 0.6984797750998263, + "grad_norm": 1.4717178344726562, + "learning_rate": 4.400914460030254e-05, + "loss": 1.2617, + "step": 19504 + }, + { + "epoch": 0.6985155872293946, + "grad_norm": 1.8511122465133667, + "learning_rate": 4.3999534563641253e-05, + "loss": 1.1229, + "step": 19505 + }, + { + "epoch": 0.6985513993589629, + "grad_norm": 1.400892972946167, + "learning_rate": 4.3989925280393986e-05, + "loss": 1.2683, + "step": 19506 + }, + { + "epoch": 0.6985872114885312, + "grad_norm": 1.5544172525405884, + "learning_rate": 4.3980316750690065e-05, + "loss": 1.0638, + "step": 19507 + }, + { + "epoch": 0.6986230236180995, + "grad_norm": 1.3173346519470215, + "learning_rate": 4.397070897465869e-05, + "loss": 1.2179, + "step": 19508 + }, + { + "epoch": 0.6986588357476677, + "grad_norm": 1.6283702850341797, + "learning_rate": 4.396110195242915e-05, + "loss": 1.2869, + "step": 19509 + }, + { + "epoch": 0.698694647877236, + "grad_norm": 1.6611295938491821, + "learning_rate": 4.395149568413073e-05, + "loss": 1.1519, + "step": 19510 + }, + { + "epoch": 0.6987304600068043, + "grad_norm": 1.1966631412506104, + "learning_rate": 4.394189016989261e-05, + "loss": 1.0178, + "step": 19511 + }, + { + "epoch": 0.6987662721363725, + "grad_norm": 1.7308156490325928, + "learning_rate": 4.3932285409844046e-05, + "loss": 1.0626, + "step": 19512 + }, + { + "epoch": 0.6988020842659409, + "grad_norm": 1.9448509216308594, + "learning_rate": 4.392268140411425e-05, + "loss": 1.0695, + "step": 19513 + }, + { + "epoch": 0.6988378963955092, + "grad_norm": 1.821269154548645, + "learning_rate": 4.391307815283249e-05, + "loss": 1.4528, + "step": 19514 + }, + { + "epoch": 0.6988737085250775, + "grad_norm": 1.686468482017517, + "learning_rate": 4.390347565612787e-05, + "loss": 1.1804, + "step": 19515 + }, + { + "epoch": 0.6989095206546457, + "grad_norm": 1.473440408706665, + "learning_rate": 4.3893873914129635e-05, + "loss": 0.8703, + "step": 19516 + }, + { + "epoch": 0.698945332784214, + "grad_norm": 1.4245141744613647, + "learning_rate": 4.388427292696695e-05, + "loss": 0.9436, + "step": 19517 + }, + { + "epoch": 0.6989811449137823, + "grad_norm": 1.3230763673782349, + "learning_rate": 4.387467269476902e-05, + "loss": 1.0093, + "step": 19518 + }, + { + "epoch": 0.6990169570433505, + "grad_norm": 2.1105844974517822, + "learning_rate": 4.3865073217664944e-05, + "loss": 1.1279, + "step": 19519 + }, + { + "epoch": 0.6990527691729189, + "grad_norm": 1.9347596168518066, + "learning_rate": 4.38554744957839e-05, + "loss": 1.2153, + "step": 19520 + }, + { + "epoch": 0.6990885813024872, + "grad_norm": 2.072237730026245, + "learning_rate": 4.384587652925506e-05, + "loss": 1.2405, + "step": 19521 + }, + { + "epoch": 0.6991243934320555, + "grad_norm": 1.6396846771240234, + "learning_rate": 4.383627931820747e-05, + "loss": 1.1393, + "step": 19522 + }, + { + "epoch": 0.6991602055616237, + "grad_norm": 2.2401843070983887, + "learning_rate": 4.382668286277031e-05, + "loss": 1.3328, + "step": 19523 + }, + { + "epoch": 0.699196017691192, + "grad_norm": 1.3530185222625732, + "learning_rate": 4.381708716307267e-05, + "loss": 0.9751, + "step": 19524 + }, + { + "epoch": 0.6992318298207603, + "grad_norm": 1.2625908851623535, + "learning_rate": 4.3807492219243686e-05, + "loss": 1.0638, + "step": 19525 + }, + { + "epoch": 0.6992676419503285, + "grad_norm": 1.5509716272354126, + "learning_rate": 4.379789803141238e-05, + "loss": 1.1662, + "step": 19526 + }, + { + "epoch": 0.6993034540798969, + "grad_norm": 1.537358045578003, + "learning_rate": 4.378830459970785e-05, + "loss": 1.1616, + "step": 19527 + }, + { + "epoch": 0.6993392662094652, + "grad_norm": 1.3314744234085083, + "learning_rate": 4.3778711924259216e-05, + "loss": 1.0966, + "step": 19528 + }, + { + "epoch": 0.6993750783390335, + "grad_norm": 1.2393282651901245, + "learning_rate": 4.3769120005195465e-05, + "loss": 0.8645, + "step": 19529 + }, + { + "epoch": 0.6994108904686017, + "grad_norm": 1.6419655084609985, + "learning_rate": 4.375952884264566e-05, + "loss": 1.0951, + "step": 19530 + }, + { + "epoch": 0.69944670259817, + "grad_norm": 1.3758407831192017, + "learning_rate": 4.374993843673886e-05, + "loss": 0.9026, + "step": 19531 + }, + { + "epoch": 0.6994825147277383, + "grad_norm": 1.3194286823272705, + "learning_rate": 4.37403487876041e-05, + "loss": 0.9813, + "step": 19532 + }, + { + "epoch": 0.6995183268573065, + "grad_norm": 1.483726143836975, + "learning_rate": 4.373075989537035e-05, + "loss": 1.0057, + "step": 19533 + }, + { + "epoch": 0.6995541389868749, + "grad_norm": 1.6564316749572754, + "learning_rate": 4.372117176016665e-05, + "loss": 1.0457, + "step": 19534 + }, + { + "epoch": 0.6995899511164432, + "grad_norm": 1.5304068326950073, + "learning_rate": 4.371158438212199e-05, + "loss": 0.9244, + "step": 19535 + }, + { + "epoch": 0.6996257632460114, + "grad_norm": 1.746561884880066, + "learning_rate": 4.370199776136538e-05, + "loss": 1.2052, + "step": 19536 + }, + { + "epoch": 0.6996615753755797, + "grad_norm": 1.9495092630386353, + "learning_rate": 4.3692411898025746e-05, + "loss": 1.1772, + "step": 19537 + }, + { + "epoch": 0.699697387505148, + "grad_norm": 1.789594054222107, + "learning_rate": 4.368282679223207e-05, + "loss": 1.3086, + "step": 19538 + }, + { + "epoch": 0.6997331996347163, + "grad_norm": 1.1959803104400635, + "learning_rate": 4.367324244411335e-05, + "loss": 0.7871, + "step": 19539 + }, + { + "epoch": 0.6997690117642845, + "grad_norm": 1.3788129091262817, + "learning_rate": 4.3663658853798476e-05, + "loss": 1.0746, + "step": 19540 + }, + { + "epoch": 0.6998048238938529, + "grad_norm": 1.313615083694458, + "learning_rate": 4.365407602141639e-05, + "loss": 1.0825, + "step": 19541 + }, + { + "epoch": 0.6998406360234212, + "grad_norm": 1.709593415260315, + "learning_rate": 4.364449394709603e-05, + "loss": 1.2, + "step": 19542 + }, + { + "epoch": 0.6998764481529894, + "grad_norm": 1.8916280269622803, + "learning_rate": 4.363491263096635e-05, + "loss": 1.3382, + "step": 19543 + }, + { + "epoch": 0.6999122602825577, + "grad_norm": 1.2318781614303589, + "learning_rate": 4.362533207315618e-05, + "loss": 1.1501, + "step": 19544 + }, + { + "epoch": 0.699948072412126, + "grad_norm": 1.7694686651229858, + "learning_rate": 4.361575227379444e-05, + "loss": 1.1418, + "step": 19545 + }, + { + "epoch": 0.6999838845416942, + "grad_norm": 1.475451111793518, + "learning_rate": 4.360617323301007e-05, + "loss": 1.1348, + "step": 19546 + }, + { + "epoch": 0.7000196966712625, + "grad_norm": 1.6778773069381714, + "learning_rate": 4.359659495093186e-05, + "loss": 1.1782, + "step": 19547 + }, + { + "epoch": 0.7000555088008309, + "grad_norm": 1.3437243700027466, + "learning_rate": 4.35870174276887e-05, + "loss": 1.1949, + "step": 19548 + }, + { + "epoch": 0.7000913209303992, + "grad_norm": 1.7136768102645874, + "learning_rate": 4.357744066340946e-05, + "loss": 1.186, + "step": 19549 + }, + { + "epoch": 0.7001271330599674, + "grad_norm": 1.4186679124832153, + "learning_rate": 4.356786465822301e-05, + "loss": 1.2781, + "step": 19550 + }, + { + "epoch": 0.7001629451895357, + "grad_norm": 1.2459039688110352, + "learning_rate": 4.3558289412258114e-05, + "loss": 1.0044, + "step": 19551 + }, + { + "epoch": 0.700198757319104, + "grad_norm": 1.7082313299179077, + "learning_rate": 4.354871492564363e-05, + "loss": 1.0754, + "step": 19552 + }, + { + "epoch": 0.7002345694486722, + "grad_norm": 1.5658841133117676, + "learning_rate": 4.353914119850837e-05, + "loss": 0.8126, + "step": 19553 + }, + { + "epoch": 0.7002703815782405, + "grad_norm": 1.5094695091247559, + "learning_rate": 4.3529568230981165e-05, + "loss": 0.909, + "step": 19554 + }, + { + "epoch": 0.7003061937078089, + "grad_norm": 1.410865068435669, + "learning_rate": 4.351999602319079e-05, + "loss": 1.1427, + "step": 19555 + }, + { + "epoch": 0.7003420058373772, + "grad_norm": 1.953940510749817, + "learning_rate": 4.351042457526594e-05, + "loss": 1.0086, + "step": 19556 + }, + { + "epoch": 0.7003778179669454, + "grad_norm": 1.6125938892364502, + "learning_rate": 4.350085388733553e-05, + "loss": 0.8998, + "step": 19557 + }, + { + "epoch": 0.7004136300965137, + "grad_norm": 1.321612000465393, + "learning_rate": 4.349128395952821e-05, + "loss": 1.1309, + "step": 19558 + }, + { + "epoch": 0.700449442226082, + "grad_norm": 1.5601085424423218, + "learning_rate": 4.3481714791972816e-05, + "loss": 1.3181, + "step": 19559 + }, + { + "epoch": 0.7004852543556502, + "grad_norm": 1.322711706161499, + "learning_rate": 4.3472146384797973e-05, + "loss": 1.0576, + "step": 19560 + }, + { + "epoch": 0.7005210664852185, + "grad_norm": 1.4656033515930176, + "learning_rate": 4.3462578738132557e-05, + "loss": 0.8851, + "step": 19561 + }, + { + "epoch": 0.7005568786147869, + "grad_norm": 1.5912991762161255, + "learning_rate": 4.345301185210517e-05, + "loss": 1.0223, + "step": 19562 + }, + { + "epoch": 0.7005926907443552, + "grad_norm": 1.7114596366882324, + "learning_rate": 4.344344572684459e-05, + "loss": 1.1321, + "step": 19563 + }, + { + "epoch": 0.7006285028739234, + "grad_norm": 1.490259051322937, + "learning_rate": 4.343388036247952e-05, + "loss": 0.9523, + "step": 19564 + }, + { + "epoch": 0.7006643150034917, + "grad_norm": 1.4175457954406738, + "learning_rate": 4.342431575913858e-05, + "loss": 1.0014, + "step": 19565 + }, + { + "epoch": 0.70070012713306, + "grad_norm": 1.8971225023269653, + "learning_rate": 4.341475191695054e-05, + "loss": 1.2309, + "step": 19566 + }, + { + "epoch": 0.7007359392626282, + "grad_norm": 1.313364028930664, + "learning_rate": 4.340518883604395e-05, + "loss": 1.0569, + "step": 19567 + }, + { + "epoch": 0.7007717513921965, + "grad_norm": 1.3286187648773193, + "learning_rate": 4.339562651654761e-05, + "loss": 0.8448, + "step": 19568 + }, + { + "epoch": 0.7008075635217649, + "grad_norm": 1.4674023389816284, + "learning_rate": 4.338606495859007e-05, + "loss": 1.2641, + "step": 19569 + }, + { + "epoch": 0.7008433756513331, + "grad_norm": 1.8015286922454834, + "learning_rate": 4.3376504162300035e-05, + "loss": 1.2402, + "step": 19570 + }, + { + "epoch": 0.7008791877809014, + "grad_norm": 1.646942138671875, + "learning_rate": 4.336694412780605e-05, + "loss": 1.0514, + "step": 19571 + }, + { + "epoch": 0.7009149999104697, + "grad_norm": 1.5919573307037354, + "learning_rate": 4.3357384855236796e-05, + "loss": 1.0844, + "step": 19572 + }, + { + "epoch": 0.700950812040038, + "grad_norm": 1.5554910898208618, + "learning_rate": 4.33478263447209e-05, + "loss": 1.0019, + "step": 19573 + }, + { + "epoch": 0.7009866241696062, + "grad_norm": 1.4144930839538574, + "learning_rate": 4.333826859638684e-05, + "loss": 1.2801, + "step": 19574 + }, + { + "epoch": 0.7010224362991745, + "grad_norm": 1.3851178884506226, + "learning_rate": 4.332871161036337e-05, + "loss": 0.8792, + "step": 19575 + }, + { + "epoch": 0.7010582484287429, + "grad_norm": 1.3328777551651, + "learning_rate": 4.331915538677894e-05, + "loss": 1.0346, + "step": 19576 + }, + { + "epoch": 0.7010940605583111, + "grad_norm": 1.365717887878418, + "learning_rate": 4.3309599925762214e-05, + "loss": 0.9785, + "step": 19577 + }, + { + "epoch": 0.7011298726878794, + "grad_norm": 1.2452305555343628, + "learning_rate": 4.330004522744161e-05, + "loss": 1.2199, + "step": 19578 + }, + { + "epoch": 0.7011656848174477, + "grad_norm": 1.485102653503418, + "learning_rate": 4.329049129194583e-05, + "loss": 1.1082, + "step": 19579 + }, + { + "epoch": 0.701201496947016, + "grad_norm": 1.7309839725494385, + "learning_rate": 4.3280938119403346e-05, + "loss": 1.1368, + "step": 19580 + }, + { + "epoch": 0.7012373090765842, + "grad_norm": 1.317213535308838, + "learning_rate": 4.3271385709942636e-05, + "loss": 1.0513, + "step": 19581 + }, + { + "epoch": 0.7012731212061525, + "grad_norm": 1.4566084146499634, + "learning_rate": 4.326183406369226e-05, + "loss": 1.0234, + "step": 19582 + }, + { + "epoch": 0.7013089333357209, + "grad_norm": 1.9967243671417236, + "learning_rate": 4.325228318078073e-05, + "loss": 1.0256, + "step": 19583 + }, + { + "epoch": 0.7013447454652891, + "grad_norm": 1.4118854999542236, + "learning_rate": 4.324273306133655e-05, + "loss": 1.2476, + "step": 19584 + }, + { + "epoch": 0.7013805575948574, + "grad_norm": 1.4073420763015747, + "learning_rate": 4.3233183705488156e-05, + "loss": 0.7839, + "step": 19585 + }, + { + "epoch": 0.7014163697244257, + "grad_norm": 1.5061050653457642, + "learning_rate": 4.322363511336405e-05, + "loss": 1.0875, + "step": 19586 + }, + { + "epoch": 0.7014521818539939, + "grad_norm": 1.7733137607574463, + "learning_rate": 4.32140872850927e-05, + "loss": 0.9972, + "step": 19587 + }, + { + "epoch": 0.7014879939835622, + "grad_norm": 1.5084199905395508, + "learning_rate": 4.320454022080259e-05, + "loss": 1.1867, + "step": 19588 + }, + { + "epoch": 0.7015238061131305, + "grad_norm": 1.3849657773971558, + "learning_rate": 4.3194993920622095e-05, + "loss": 1.1502, + "step": 19589 + }, + { + "epoch": 0.7015596182426989, + "grad_norm": 1.5608237981796265, + "learning_rate": 4.318544838467968e-05, + "loss": 1.1377, + "step": 19590 + }, + { + "epoch": 0.7015954303722671, + "grad_norm": 1.9064749479293823, + "learning_rate": 4.3175903613103815e-05, + "loss": 1.2343, + "step": 19591 + }, + { + "epoch": 0.7016312425018354, + "grad_norm": 1.7057387828826904, + "learning_rate": 4.316635960602283e-05, + "loss": 1.0351, + "step": 19592 + }, + { + "epoch": 0.7016670546314037, + "grad_norm": 1.5163687467575073, + "learning_rate": 4.3156816363565166e-05, + "loss": 1.1248, + "step": 19593 + }, + { + "epoch": 0.7017028667609719, + "grad_norm": 1.2704651355743408, + "learning_rate": 4.3147273885859215e-05, + "loss": 0.9502, + "step": 19594 + }, + { + "epoch": 0.7017386788905402, + "grad_norm": 1.2798036336898804, + "learning_rate": 4.3137732173033394e-05, + "loss": 1.1162, + "step": 19595 + }, + { + "epoch": 0.7017744910201085, + "grad_norm": 1.7599514722824097, + "learning_rate": 4.3128191225216005e-05, + "loss": 0.8574, + "step": 19596 + }, + { + "epoch": 0.7018103031496769, + "grad_norm": 1.4539357423782349, + "learning_rate": 4.3118651042535444e-05, + "loss": 1.0429, + "step": 19597 + }, + { + "epoch": 0.7018461152792451, + "grad_norm": 1.3385146856307983, + "learning_rate": 4.31091116251201e-05, + "loss": 1.0223, + "step": 19598 + }, + { + "epoch": 0.7018819274088134, + "grad_norm": 1.6778699159622192, + "learning_rate": 4.3099572973098236e-05, + "loss": 1.1079, + "step": 19599 + }, + { + "epoch": 0.7019177395383817, + "grad_norm": 1.9509786367416382, + "learning_rate": 4.309003508659822e-05, + "loss": 1.1329, + "step": 19600 + }, + { + "epoch": 0.7019535516679499, + "grad_norm": 1.3522450923919678, + "learning_rate": 4.3080497965748376e-05, + "loss": 1.2102, + "step": 19601 + }, + { + "epoch": 0.7019893637975182, + "grad_norm": 1.2519117593765259, + "learning_rate": 4.307096161067704e-05, + "loss": 0.86, + "step": 19602 + }, + { + "epoch": 0.7020251759270865, + "grad_norm": 1.422998070716858, + "learning_rate": 4.3061426021512453e-05, + "loss": 0.9275, + "step": 19603 + }, + { + "epoch": 0.7020609880566548, + "grad_norm": 1.356559157371521, + "learning_rate": 4.305189119838293e-05, + "loss": 0.9588, + "step": 19604 + }, + { + "epoch": 0.7020968001862231, + "grad_norm": 1.4489116668701172, + "learning_rate": 4.304235714141677e-05, + "loss": 1.2185, + "step": 19605 + }, + { + "epoch": 0.7021326123157914, + "grad_norm": 1.259275197982788, + "learning_rate": 4.303282385074224e-05, + "loss": 1.116, + "step": 19606 + }, + { + "epoch": 0.7021684244453597, + "grad_norm": 1.2023553848266602, + "learning_rate": 4.3023291326487556e-05, + "loss": 1.0891, + "step": 19607 + }, + { + "epoch": 0.7022042365749279, + "grad_norm": 1.3872557878494263, + "learning_rate": 4.301375956878099e-05, + "loss": 0.8081, + "step": 19608 + }, + { + "epoch": 0.7022400487044962, + "grad_norm": 1.9767264127731323, + "learning_rate": 4.300422857775081e-05, + "loss": 1.0916, + "step": 19609 + }, + { + "epoch": 0.7022758608340645, + "grad_norm": 1.3956633806228638, + "learning_rate": 4.2994698353525184e-05, + "loss": 1.255, + "step": 19610 + }, + { + "epoch": 0.7023116729636327, + "grad_norm": 1.5907262563705444, + "learning_rate": 4.298516889623238e-05, + "loss": 1.1029, + "step": 19611 + }, + { + "epoch": 0.7023474850932011, + "grad_norm": 1.9705770015716553, + "learning_rate": 4.297564020600056e-05, + "loss": 1.2573, + "step": 19612 + }, + { + "epoch": 0.7023832972227694, + "grad_norm": 1.828176736831665, + "learning_rate": 4.2966112282957985e-05, + "loss": 1.2187, + "step": 19613 + }, + { + "epoch": 0.7024191093523376, + "grad_norm": 1.6527734994888306, + "learning_rate": 4.295658512723277e-05, + "loss": 0.9851, + "step": 19614 + }, + { + "epoch": 0.7024549214819059, + "grad_norm": 1.3870441913604736, + "learning_rate": 4.2947058738953115e-05, + "loss": 1.1036, + "step": 19615 + }, + { + "epoch": 0.7024907336114742, + "grad_norm": 1.2824350595474243, + "learning_rate": 4.293753311824724e-05, + "loss": 1.0573, + "step": 19616 + }, + { + "epoch": 0.7025265457410425, + "grad_norm": 1.1591942310333252, + "learning_rate": 4.2928008265243205e-05, + "loss": 0.8758, + "step": 19617 + }, + { + "epoch": 0.7025623578706107, + "grad_norm": 1.350498080253601, + "learning_rate": 4.2918484180069205e-05, + "loss": 0.9603, + "step": 19618 + }, + { + "epoch": 0.7025981700001791, + "grad_norm": 1.1484408378601074, + "learning_rate": 4.290896086285338e-05, + "loss": 0.7947, + "step": 19619 + }, + { + "epoch": 0.7026339821297474, + "grad_norm": 1.816713571548462, + "learning_rate": 4.289943831372386e-05, + "loss": 1.1187, + "step": 19620 + }, + { + "epoch": 0.7026697942593156, + "grad_norm": 1.340672492980957, + "learning_rate": 4.2889916532808716e-05, + "loss": 1.1758, + "step": 19621 + }, + { + "epoch": 0.7027056063888839, + "grad_norm": 1.2501569986343384, + "learning_rate": 4.2880395520236086e-05, + "loss": 0.7444, + "step": 19622 + }, + { + "epoch": 0.7027414185184522, + "grad_norm": 1.2416845560073853, + "learning_rate": 4.287087527613405e-05, + "loss": 1.1203, + "step": 19623 + }, + { + "epoch": 0.7027772306480204, + "grad_norm": 1.7102099657058716, + "learning_rate": 4.2861355800630734e-05, + "loss": 1.1842, + "step": 19624 + }, + { + "epoch": 0.7028130427775887, + "grad_norm": 1.4815258979797363, + "learning_rate": 4.285183709385413e-05, + "loss": 1.2939, + "step": 19625 + }, + { + "epoch": 0.7028488549071571, + "grad_norm": 1.8995578289031982, + "learning_rate": 4.284231915593234e-05, + "loss": 0.9686, + "step": 19626 + }, + { + "epoch": 0.7028846670367254, + "grad_norm": 1.5592892169952393, + "learning_rate": 4.283280198699346e-05, + "loss": 1.1682, + "step": 19627 + }, + { + "epoch": 0.7029204791662936, + "grad_norm": 1.6670172214508057, + "learning_rate": 4.2823285587165454e-05, + "loss": 0.9078, + "step": 19628 + }, + { + "epoch": 0.7029562912958619, + "grad_norm": 1.682679295539856, + "learning_rate": 4.281376995657638e-05, + "loss": 1.3059, + "step": 19629 + }, + { + "epoch": 0.7029921034254302, + "grad_norm": 1.5927150249481201, + "learning_rate": 4.2804255095354276e-05, + "loss": 1.1996, + "step": 19630 + }, + { + "epoch": 0.7030279155549984, + "grad_norm": 1.4685436487197876, + "learning_rate": 4.279474100362717e-05, + "loss": 1.2511, + "step": 19631 + }, + { + "epoch": 0.7030637276845667, + "grad_norm": 1.4057447910308838, + "learning_rate": 4.278522768152301e-05, + "loss": 1.2029, + "step": 19632 + }, + { + "epoch": 0.7030995398141351, + "grad_norm": 1.4210416078567505, + "learning_rate": 4.27757151291698e-05, + "loss": 0.9518, + "step": 19633 + }, + { + "epoch": 0.7031353519437034, + "grad_norm": 1.490327000617981, + "learning_rate": 4.2766203346695565e-05, + "loss": 1.1964, + "step": 19634 + }, + { + "epoch": 0.7031711640732716, + "grad_norm": 1.6014256477355957, + "learning_rate": 4.27566923342282e-05, + "loss": 1.1776, + "step": 19635 + }, + { + "epoch": 0.7032069762028399, + "grad_norm": 2.211932420730591, + "learning_rate": 4.274718209189571e-05, + "loss": 1.0115, + "step": 19636 + }, + { + "epoch": 0.7032427883324082, + "grad_norm": 1.2059658765792847, + "learning_rate": 4.273767261982603e-05, + "loss": 1.0749, + "step": 19637 + }, + { + "epoch": 0.7032786004619764, + "grad_norm": 1.5097397565841675, + "learning_rate": 4.272816391814714e-05, + "loss": 1.1939, + "step": 19638 + }, + { + "epoch": 0.7033144125915447, + "grad_norm": 1.3461146354675293, + "learning_rate": 4.271865598698689e-05, + "loss": 0.8555, + "step": 19639 + }, + { + "epoch": 0.7033502247211131, + "grad_norm": 1.5005762577056885, + "learning_rate": 4.2709148826473234e-05, + "loss": 1.141, + "step": 19640 + }, + { + "epoch": 0.7033860368506814, + "grad_norm": 1.2759031057357788, + "learning_rate": 4.269964243673408e-05, + "loss": 0.9586, + "step": 19641 + }, + { + "epoch": 0.7034218489802496, + "grad_norm": 1.321832537651062, + "learning_rate": 4.2690136817897363e-05, + "loss": 0.9433, + "step": 19642 + }, + { + "epoch": 0.7034576611098179, + "grad_norm": 1.4690964221954346, + "learning_rate": 4.2680631970090935e-05, + "loss": 0.981, + "step": 19643 + }, + { + "epoch": 0.7034934732393862, + "grad_norm": 1.356557011604309, + "learning_rate": 4.2671127893442586e-05, + "loss": 1.1514, + "step": 19644 + }, + { + "epoch": 0.7035292853689544, + "grad_norm": 1.4752850532531738, + "learning_rate": 4.266162458808034e-05, + "loss": 1.1119, + "step": 19645 + }, + { + "epoch": 0.7035650974985227, + "grad_norm": 1.3479994535446167, + "learning_rate": 4.2652122054131936e-05, + "loss": 1.2631, + "step": 19646 + }, + { + "epoch": 0.7036009096280911, + "grad_norm": 1.4404785633087158, + "learning_rate": 4.264262029172527e-05, + "loss": 1.191, + "step": 19647 + }, + { + "epoch": 0.7036367217576593, + "grad_norm": 1.8989479541778564, + "learning_rate": 4.2633119300988146e-05, + "loss": 1.0592, + "step": 19648 + }, + { + "epoch": 0.7036725338872276, + "grad_norm": 1.4241245985031128, + "learning_rate": 4.262361908204844e-05, + "loss": 1.0279, + "step": 19649 + }, + { + "epoch": 0.7037083460167959, + "grad_norm": 2.5862743854522705, + "learning_rate": 4.26141196350339e-05, + "loss": 1.0321, + "step": 19650 + }, + { + "epoch": 0.7037441581463642, + "grad_norm": 1.5044251680374146, + "learning_rate": 4.260462096007235e-05, + "loss": 1.0735, + "step": 19651 + }, + { + "epoch": 0.7037799702759324, + "grad_norm": 1.452555537223816, + "learning_rate": 4.259512305729164e-05, + "loss": 1.1216, + "step": 19652 + }, + { + "epoch": 0.7038157824055007, + "grad_norm": 1.4431933164596558, + "learning_rate": 4.258562592681945e-05, + "loss": 1.043, + "step": 19653 + }, + { + "epoch": 0.7038515945350691, + "grad_norm": 1.4995111227035522, + "learning_rate": 4.2576129568783654e-05, + "loss": 1.2122, + "step": 19654 + }, + { + "epoch": 0.7038874066646373, + "grad_norm": 1.925184965133667, + "learning_rate": 4.2566633983311885e-05, + "loss": 1.2865, + "step": 19655 + }, + { + "epoch": 0.7039232187942056, + "grad_norm": 1.7544444799423218, + "learning_rate": 4.2557139170532045e-05, + "loss": 1.0557, + "step": 19656 + }, + { + "epoch": 0.7039590309237739, + "grad_norm": 1.409363031387329, + "learning_rate": 4.2547645130571764e-05, + "loss": 1.2036, + "step": 19657 + }, + { + "epoch": 0.7039948430533421, + "grad_norm": 1.5086348056793213, + "learning_rate": 4.253815186355881e-05, + "loss": 1.2057, + "step": 19658 + }, + { + "epoch": 0.7040306551829104, + "grad_norm": 1.9017088413238525, + "learning_rate": 4.2528659369620905e-05, + "loss": 1.1609, + "step": 19659 + }, + { + "epoch": 0.7040664673124787, + "grad_norm": 1.877361536026001, + "learning_rate": 4.2519167648885785e-05, + "loss": 1.2838, + "step": 19660 + }, + { + "epoch": 0.7041022794420471, + "grad_norm": 1.4408998489379883, + "learning_rate": 4.250967670148113e-05, + "loss": 1.0012, + "step": 19661 + }, + { + "epoch": 0.7041380915716153, + "grad_norm": 1.3571456670761108, + "learning_rate": 4.250018652753454e-05, + "loss": 1.2121, + "step": 19662 + }, + { + "epoch": 0.7041739037011836, + "grad_norm": 1.527081847190857, + "learning_rate": 4.2490697127173826e-05, + "loss": 0.9708, + "step": 19663 + }, + { + "epoch": 0.7042097158307519, + "grad_norm": 1.6810481548309326, + "learning_rate": 4.248120850052658e-05, + "loss": 1.006, + "step": 19664 + }, + { + "epoch": 0.7042455279603201, + "grad_norm": 1.6127232313156128, + "learning_rate": 4.247172064772053e-05, + "loss": 1.0698, + "step": 19665 + }, + { + "epoch": 0.7042813400898884, + "grad_norm": 2.438001871109009, + "learning_rate": 4.246223356888318e-05, + "loss": 1.2019, + "step": 19666 + }, + { + "epoch": 0.7043171522194567, + "grad_norm": 1.7401901483535767, + "learning_rate": 4.2452747264142335e-05, + "loss": 0.9374, + "step": 19667 + }, + { + "epoch": 0.7043529643490251, + "grad_norm": 1.72780442237854, + "learning_rate": 4.244326173362555e-05, + "loss": 1.2839, + "step": 19668 + }, + { + "epoch": 0.7043887764785933, + "grad_norm": 2.196096897125244, + "learning_rate": 4.2433776977460396e-05, + "loss": 1.2922, + "step": 19669 + }, + { + "epoch": 0.7044245886081616, + "grad_norm": 1.2271729707717896, + "learning_rate": 4.242429299577452e-05, + "loss": 0.8662, + "step": 19670 + }, + { + "epoch": 0.7044604007377299, + "grad_norm": 2.1597800254821777, + "learning_rate": 4.241480978869551e-05, + "loss": 1.0976, + "step": 19671 + }, + { + "epoch": 0.7044962128672981, + "grad_norm": 1.8050132989883423, + "learning_rate": 4.2405327356351e-05, + "loss": 1.1508, + "step": 19672 + }, + { + "epoch": 0.7045320249968664, + "grad_norm": 1.27716064453125, + "learning_rate": 4.239584569886843e-05, + "loss": 1.1021, + "step": 19673 + }, + { + "epoch": 0.7045678371264347, + "grad_norm": 1.3805392980575562, + "learning_rate": 4.2386364816375545e-05, + "loss": 1.1182, + "step": 19674 + }, + { + "epoch": 0.704603649256003, + "grad_norm": 1.834807276725769, + "learning_rate": 4.2376884708999754e-05, + "loss": 1.1879, + "step": 19675 + }, + { + "epoch": 0.7046394613855713, + "grad_norm": 1.3678481578826904, + "learning_rate": 4.23674053768687e-05, + "loss": 1.0506, + "step": 19676 + }, + { + "epoch": 0.7046752735151396, + "grad_norm": 1.6735215187072754, + "learning_rate": 4.2357926820109816e-05, + "loss": 1.0452, + "step": 19677 + }, + { + "epoch": 0.7047110856447079, + "grad_norm": 1.5208208560943604, + "learning_rate": 4.234844903885068e-05, + "loss": 1.0266, + "step": 19678 + }, + { + "epoch": 0.7047468977742761, + "grad_norm": 1.426080584526062, + "learning_rate": 4.233897203321883e-05, + "loss": 1.1666, + "step": 19679 + }, + { + "epoch": 0.7047827099038444, + "grad_norm": 1.3033174276351929, + "learning_rate": 4.23294958033417e-05, + "loss": 1.1712, + "step": 19680 + }, + { + "epoch": 0.7048185220334127, + "grad_norm": 1.663332462310791, + "learning_rate": 4.232002034934681e-05, + "loss": 1.0977, + "step": 19681 + }, + { + "epoch": 0.704854334162981, + "grad_norm": 1.5644245147705078, + "learning_rate": 4.231054567136166e-05, + "loss": 1.0572, + "step": 19682 + }, + { + "epoch": 0.7048901462925493, + "grad_norm": 1.7643237113952637, + "learning_rate": 4.230107176951372e-05, + "loss": 1.073, + "step": 19683 + }, + { + "epoch": 0.7049259584221176, + "grad_norm": 1.4821484088897705, + "learning_rate": 4.229159864393037e-05, + "loss": 1.2249, + "step": 19684 + }, + { + "epoch": 0.7049617705516859, + "grad_norm": 1.6047289371490479, + "learning_rate": 4.2282126294739186e-05, + "loss": 1.1962, + "step": 19685 + }, + { + "epoch": 0.7049975826812541, + "grad_norm": 1.4162609577178955, + "learning_rate": 4.227265472206756e-05, + "loss": 1.1671, + "step": 19686 + }, + { + "epoch": 0.7050333948108224, + "grad_norm": 1.514267086982727, + "learning_rate": 4.226318392604285e-05, + "loss": 0.9021, + "step": 19687 + }, + { + "epoch": 0.7050692069403907, + "grad_norm": 1.411516547203064, + "learning_rate": 4.225371390679254e-05, + "loss": 1.0705, + "step": 19688 + }, + { + "epoch": 0.705105019069959, + "grad_norm": 1.4504971504211426, + "learning_rate": 4.224424466444401e-05, + "loss": 1.2525, + "step": 19689 + }, + { + "epoch": 0.7051408311995273, + "grad_norm": 1.4386566877365112, + "learning_rate": 4.2234776199124705e-05, + "loss": 0.9945, + "step": 19690 + }, + { + "epoch": 0.7051766433290956, + "grad_norm": 1.4442592859268188, + "learning_rate": 4.222530851096194e-05, + "loss": 1.2579, + "step": 19691 + }, + { + "epoch": 0.7052124554586638, + "grad_norm": 2.548572540283203, + "learning_rate": 4.221584160008313e-05, + "loss": 1.1766, + "step": 19692 + }, + { + "epoch": 0.7052482675882321, + "grad_norm": 1.364974856376648, + "learning_rate": 4.220637546661562e-05, + "loss": 1.0694, + "step": 19693 + }, + { + "epoch": 0.7052840797178004, + "grad_norm": 1.3251299858093262, + "learning_rate": 4.2196910110686826e-05, + "loss": 1.1662, + "step": 19694 + }, + { + "epoch": 0.7053198918473687, + "grad_norm": 1.3597261905670166, + "learning_rate": 4.218744553242402e-05, + "loss": 1.2914, + "step": 19695 + }, + { + "epoch": 0.705355703976937, + "grad_norm": 1.977690577507019, + "learning_rate": 4.217798173195454e-05, + "loss": 1.0339, + "step": 19696 + }, + { + "epoch": 0.7053915161065053, + "grad_norm": 1.6101926565170288, + "learning_rate": 4.216851870940578e-05, + "loss": 1.202, + "step": 19697 + }, + { + "epoch": 0.7054273282360736, + "grad_norm": 1.3116800785064697, + "learning_rate": 4.2159056464904964e-05, + "loss": 1.0123, + "step": 19698 + }, + { + "epoch": 0.7054631403656418, + "grad_norm": 1.8481826782226562, + "learning_rate": 4.2149594998579445e-05, + "loss": 0.918, + "step": 19699 + }, + { + "epoch": 0.7054989524952101, + "grad_norm": 1.460764765739441, + "learning_rate": 4.214013431055649e-05, + "loss": 0.9906, + "step": 19700 + }, + { + "epoch": 0.7055347646247784, + "grad_norm": 1.3565325736999512, + "learning_rate": 4.213067440096343e-05, + "loss": 1.0979, + "step": 19701 + }, + { + "epoch": 0.7055705767543466, + "grad_norm": 1.4243299961090088, + "learning_rate": 4.212121526992747e-05, + "loss": 1.1893, + "step": 19702 + }, + { + "epoch": 0.705606388883915, + "grad_norm": 1.4048235416412354, + "learning_rate": 4.211175691757591e-05, + "loss": 1.0684, + "step": 19703 + }, + { + "epoch": 0.7056422010134833, + "grad_norm": 1.4582549333572388, + "learning_rate": 4.2102299344036014e-05, + "loss": 1.167, + "step": 19704 + }, + { + "epoch": 0.7056780131430516, + "grad_norm": 1.3932653665542603, + "learning_rate": 4.2092842549434954e-05, + "loss": 0.9253, + "step": 19705 + }, + { + "epoch": 0.7057138252726198, + "grad_norm": 1.4037946462631226, + "learning_rate": 4.208338653390002e-05, + "loss": 1.2031, + "step": 19706 + }, + { + "epoch": 0.7057496374021881, + "grad_norm": 1.329181432723999, + "learning_rate": 4.20739312975584e-05, + "loss": 1.118, + "step": 19707 + }, + { + "epoch": 0.7057854495317564, + "grad_norm": 1.3853604793548584, + "learning_rate": 4.206447684053735e-05, + "loss": 1.0817, + "step": 19708 + }, + { + "epoch": 0.7058212616613246, + "grad_norm": 1.6013215780258179, + "learning_rate": 4.2055023162964e-05, + "loss": 1.2438, + "step": 19709 + }, + { + "epoch": 0.705857073790893, + "grad_norm": 1.5667520761489868, + "learning_rate": 4.2045570264965574e-05, + "loss": 1.2636, + "step": 19710 + }, + { + "epoch": 0.7058928859204613, + "grad_norm": 1.4089823961257935, + "learning_rate": 4.203611814666925e-05, + "loss": 0.9542, + "step": 19711 + }, + { + "epoch": 0.7059286980500296, + "grad_norm": 1.565793514251709, + "learning_rate": 4.202666680820221e-05, + "loss": 1.0287, + "step": 19712 + }, + { + "epoch": 0.7059645101795978, + "grad_norm": 1.5340778827667236, + "learning_rate": 4.201721624969156e-05, + "loss": 1.0915, + "step": 19713 + }, + { + "epoch": 0.7060003223091661, + "grad_norm": 1.5194103717803955, + "learning_rate": 4.200776647126447e-05, + "loss": 1.1185, + "step": 19714 + }, + { + "epoch": 0.7060361344387344, + "grad_norm": 1.2188405990600586, + "learning_rate": 4.199831747304811e-05, + "loss": 1.1352, + "step": 19715 + }, + { + "epoch": 0.7060719465683026, + "grad_norm": 1.368438720703125, + "learning_rate": 4.198886925516954e-05, + "loss": 1.1571, + "step": 19716 + }, + { + "epoch": 0.706107758697871, + "grad_norm": 1.330841302871704, + "learning_rate": 4.19794218177559e-05, + "loss": 1.1904, + "step": 19717 + }, + { + "epoch": 0.7061435708274393, + "grad_norm": 1.5857187509536743, + "learning_rate": 4.196997516093431e-05, + "loss": 1.1057, + "step": 19718 + }, + { + "epoch": 0.7061793829570076, + "grad_norm": 1.4301191568374634, + "learning_rate": 4.196052928483188e-05, + "loss": 1.1478, + "step": 19719 + }, + { + "epoch": 0.7062151950865758, + "grad_norm": 1.43088960647583, + "learning_rate": 4.195108418957563e-05, + "loss": 1.1902, + "step": 19720 + }, + { + "epoch": 0.7062510072161441, + "grad_norm": 1.3448421955108643, + "learning_rate": 4.194163987529266e-05, + "loss": 0.9134, + "step": 19721 + }, + { + "epoch": 0.7062868193457124, + "grad_norm": 1.831264853477478, + "learning_rate": 4.1932196342110076e-05, + "loss": 1.0736, + "step": 19722 + }, + { + "epoch": 0.7063226314752806, + "grad_norm": 1.386744737625122, + "learning_rate": 4.1922753590154854e-05, + "loss": 1.1464, + "step": 19723 + }, + { + "epoch": 0.706358443604849, + "grad_norm": 1.6325037479400635, + "learning_rate": 4.1913311619554064e-05, + "loss": 1.0551, + "step": 19724 + }, + { + "epoch": 0.7063942557344173, + "grad_norm": 1.6244761943817139, + "learning_rate": 4.1903870430434736e-05, + "loss": 1.3168, + "step": 19725 + }, + { + "epoch": 0.7064300678639855, + "grad_norm": 1.2568719387054443, + "learning_rate": 4.189443002292392e-05, + "loss": 1.0822, + "step": 19726 + }, + { + "epoch": 0.7064658799935538, + "grad_norm": 1.7448468208312988, + "learning_rate": 4.1884990397148584e-05, + "loss": 0.9737, + "step": 19727 + }, + { + "epoch": 0.7065016921231221, + "grad_norm": 1.8970365524291992, + "learning_rate": 4.187555155323572e-05, + "loss": 1.243, + "step": 19728 + }, + { + "epoch": 0.7065375042526904, + "grad_norm": 1.4173967838287354, + "learning_rate": 4.186611349131235e-05, + "loss": 1.1404, + "step": 19729 + }, + { + "epoch": 0.7065733163822586, + "grad_norm": 1.322033405303955, + "learning_rate": 4.1856676211505465e-05, + "loss": 1.2551, + "step": 19730 + }, + { + "epoch": 0.706609128511827, + "grad_norm": 1.6867406368255615, + "learning_rate": 4.184723971394197e-05, + "loss": 1.1438, + "step": 19731 + }, + { + "epoch": 0.7066449406413953, + "grad_norm": 1.6642905473709106, + "learning_rate": 4.183780399874885e-05, + "loss": 1.3805, + "step": 19732 + }, + { + "epoch": 0.7066807527709635, + "grad_norm": 1.6168829202651978, + "learning_rate": 4.182836906605309e-05, + "loss": 0.9988, + "step": 19733 + }, + { + "epoch": 0.7067165649005318, + "grad_norm": 1.369585633277893, + "learning_rate": 4.1818934915981544e-05, + "loss": 1.084, + "step": 19734 + }, + { + "epoch": 0.7067523770301001, + "grad_norm": 1.4245355129241943, + "learning_rate": 4.180950154866119e-05, + "loss": 1.2818, + "step": 19735 + }, + { + "epoch": 0.7067881891596683, + "grad_norm": 1.4033023118972778, + "learning_rate": 4.180006896421893e-05, + "loss": 1.0052, + "step": 19736 + }, + { + "epoch": 0.7068240012892366, + "grad_norm": 1.5972274541854858, + "learning_rate": 4.179063716278171e-05, + "loss": 1.2385, + "step": 19737 + }, + { + "epoch": 0.706859813418805, + "grad_norm": 1.2838506698608398, + "learning_rate": 4.178120614447634e-05, + "loss": 1.1622, + "step": 19738 + }, + { + "epoch": 0.7068956255483733, + "grad_norm": 1.4538370370864868, + "learning_rate": 4.177177590942974e-05, + "loss": 1.2678, + "step": 19739 + }, + { + "epoch": 0.7069314376779415, + "grad_norm": 1.300761103630066, + "learning_rate": 4.176234645776883e-05, + "loss": 1.2642, + "step": 19740 + }, + { + "epoch": 0.7069672498075098, + "grad_norm": 1.5366344451904297, + "learning_rate": 4.1752917789620395e-05, + "loss": 1.0052, + "step": 19741 + }, + { + "epoch": 0.7070030619370781, + "grad_norm": 1.339562177658081, + "learning_rate": 4.174348990511131e-05, + "loss": 0.9873, + "step": 19742 + }, + { + "epoch": 0.7070388740666463, + "grad_norm": 1.78595769405365, + "learning_rate": 4.1734062804368426e-05, + "loss": 0.8988, + "step": 19743 + }, + { + "epoch": 0.7070746861962146, + "grad_norm": 1.6662448644638062, + "learning_rate": 4.17246364875186e-05, + "loss": 1.0811, + "step": 19744 + }, + { + "epoch": 0.707110498325783, + "grad_norm": 1.3603442907333374, + "learning_rate": 4.171521095468859e-05, + "loss": 1.2051, + "step": 19745 + }, + { + "epoch": 0.7071463104553513, + "grad_norm": 1.7230539321899414, + "learning_rate": 4.1705786206005235e-05, + "loss": 1.2663, + "step": 19746 + }, + { + "epoch": 0.7071821225849195, + "grad_norm": 1.5580500364303589, + "learning_rate": 4.169636224159533e-05, + "loss": 0.9613, + "step": 19747 + }, + { + "epoch": 0.7072179347144878, + "grad_norm": 1.6210354566574097, + "learning_rate": 4.1686939061585694e-05, + "loss": 1.3535, + "step": 19748 + }, + { + "epoch": 0.7072537468440561, + "grad_norm": 1.4496031999588013, + "learning_rate": 4.167751666610309e-05, + "loss": 1.2928, + "step": 19749 + }, + { + "epoch": 0.7072895589736243, + "grad_norm": 1.354241967201233, + "learning_rate": 4.166809505527418e-05, + "loss": 0.9549, + "step": 19750 + }, + { + "epoch": 0.7073253711031926, + "grad_norm": 1.4647928476333618, + "learning_rate": 4.165867422922589e-05, + "loss": 1.0875, + "step": 19751 + }, + { + "epoch": 0.707361183232761, + "grad_norm": 1.3889093399047852, + "learning_rate": 4.1649254188084854e-05, + "loss": 0.9574, + "step": 19752 + }, + { + "epoch": 0.7073969953623293, + "grad_norm": 1.5583852529525757, + "learning_rate": 4.1639834931977864e-05, + "loss": 1.2394, + "step": 19753 + }, + { + "epoch": 0.7074328074918975, + "grad_norm": 1.4087611436843872, + "learning_rate": 4.163041646103154e-05, + "loss": 1.0209, + "step": 19754 + }, + { + "epoch": 0.7074686196214658, + "grad_norm": 1.8186283111572266, + "learning_rate": 4.162099877537274e-05, + "loss": 0.9068, + "step": 19755 + }, + { + "epoch": 0.7075044317510341, + "grad_norm": 1.3042889833450317, + "learning_rate": 4.161158187512808e-05, + "loss": 1.2479, + "step": 19756 + }, + { + "epoch": 0.7075402438806023, + "grad_norm": 2.0913147926330566, + "learning_rate": 4.160216576042426e-05, + "loss": 1.3058, + "step": 19757 + }, + { + "epoch": 0.7075760560101706, + "grad_norm": 1.5315717458724976, + "learning_rate": 4.159275043138801e-05, + "loss": 1.1476, + "step": 19758 + }, + { + "epoch": 0.707611868139739, + "grad_norm": 1.3922169208526611, + "learning_rate": 4.1583335888145915e-05, + "loss": 1.2494, + "step": 19759 + }, + { + "epoch": 0.7076476802693072, + "grad_norm": 1.6476784944534302, + "learning_rate": 4.1573922130824725e-05, + "loss": 1.0651, + "step": 19760 + }, + { + "epoch": 0.7076834923988755, + "grad_norm": 1.3828787803649902, + "learning_rate": 4.156450915955099e-05, + "loss": 1.1255, + "step": 19761 + }, + { + "epoch": 0.7077193045284438, + "grad_norm": 2.112422227859497, + "learning_rate": 4.155509697445147e-05, + "loss": 1.311, + "step": 19762 + }, + { + "epoch": 0.707755116658012, + "grad_norm": 1.316869854927063, + "learning_rate": 4.1545685575652695e-05, + "loss": 1.2195, + "step": 19763 + }, + { + "epoch": 0.7077909287875803, + "grad_norm": 1.4996238946914673, + "learning_rate": 4.1536274963281355e-05, + "loss": 1.1553, + "step": 19764 + }, + { + "epoch": 0.7078267409171486, + "grad_norm": 1.3428183794021606, + "learning_rate": 4.152686513746399e-05, + "loss": 1.1229, + "step": 19765 + }, + { + "epoch": 0.707862553046717, + "grad_norm": 1.310902714729309, + "learning_rate": 4.151745609832722e-05, + "loss": 0.9905, + "step": 19766 + }, + { + "epoch": 0.7078983651762852, + "grad_norm": 1.5419999361038208, + "learning_rate": 4.150804784599769e-05, + "loss": 0.9495, + "step": 19767 + }, + { + "epoch": 0.7079341773058535, + "grad_norm": 1.4437793493270874, + "learning_rate": 4.149864038060185e-05, + "loss": 1.231, + "step": 19768 + }, + { + "epoch": 0.7079699894354218, + "grad_norm": 1.396967887878418, + "learning_rate": 4.148923370226642e-05, + "loss": 1.0841, + "step": 19769 + }, + { + "epoch": 0.70800580156499, + "grad_norm": 1.8347307443618774, + "learning_rate": 4.147982781111783e-05, + "loss": 1.0641, + "step": 19770 + }, + { + "epoch": 0.7080416136945583, + "grad_norm": 1.6161315441131592, + "learning_rate": 4.147042270728272e-05, + "loss": 0.914, + "step": 19771 + }, + { + "epoch": 0.7080774258241266, + "grad_norm": 1.193536400794983, + "learning_rate": 4.146101839088749e-05, + "loss": 0.9761, + "step": 19772 + }, + { + "epoch": 0.708113237953695, + "grad_norm": 1.5150943994522095, + "learning_rate": 4.145161486205883e-05, + "loss": 1.2438, + "step": 19773 + }, + { + "epoch": 0.7081490500832632, + "grad_norm": 1.3298735618591309, + "learning_rate": 4.144221212092316e-05, + "loss": 1.073, + "step": 19774 + }, + { + "epoch": 0.7081848622128315, + "grad_norm": 1.5428438186645508, + "learning_rate": 4.1432810167606964e-05, + "loss": 1.1368, + "step": 19775 + }, + { + "epoch": 0.7082206743423998, + "grad_norm": 1.4450795650482178, + "learning_rate": 4.1423409002236755e-05, + "loss": 1.0693, + "step": 19776 + }, + { + "epoch": 0.708256486471968, + "grad_norm": 1.4249036312103271, + "learning_rate": 4.141400862493903e-05, + "loss": 1.0176, + "step": 19777 + }, + { + "epoch": 0.7082922986015363, + "grad_norm": 1.6738793849945068, + "learning_rate": 4.140460903584027e-05, + "loss": 1.2536, + "step": 19778 + }, + { + "epoch": 0.7083281107311046, + "grad_norm": 1.5917853116989136, + "learning_rate": 4.139521023506688e-05, + "loss": 1.2524, + "step": 19779 + }, + { + "epoch": 0.708363922860673, + "grad_norm": 1.509375810623169, + "learning_rate": 4.1385812222745344e-05, + "loss": 1.1141, + "step": 19780 + }, + { + "epoch": 0.7083997349902412, + "grad_norm": 1.735701322555542, + "learning_rate": 4.13764149990021e-05, + "loss": 0.983, + "step": 19781 + }, + { + "epoch": 0.7084355471198095, + "grad_norm": 1.9351152181625366, + "learning_rate": 4.136701856396361e-05, + "loss": 1.0368, + "step": 19782 + }, + { + "epoch": 0.7084713592493778, + "grad_norm": 1.4475505352020264, + "learning_rate": 4.135762291775622e-05, + "loss": 1.0468, + "step": 19783 + }, + { + "epoch": 0.708507171378946, + "grad_norm": 1.3165066242218018, + "learning_rate": 4.1348228060506364e-05, + "loss": 0.9988, + "step": 19784 + }, + { + "epoch": 0.7085429835085143, + "grad_norm": 1.5439128875732422, + "learning_rate": 4.133883399234049e-05, + "loss": 1.049, + "step": 19785 + }, + { + "epoch": 0.7085787956380826, + "grad_norm": 1.4471684694290161, + "learning_rate": 4.132944071338489e-05, + "loss": 1.29, + "step": 19786 + }, + { + "epoch": 0.708614607767651, + "grad_norm": 1.327303171157837, + "learning_rate": 4.132004822376598e-05, + "loss": 0.9832, + "step": 19787 + }, + { + "epoch": 0.7086504198972192, + "grad_norm": 1.445858359336853, + "learning_rate": 4.1310656523610144e-05, + "loss": 0.9459, + "step": 19788 + }, + { + "epoch": 0.7086862320267875, + "grad_norm": 1.3263593912124634, + "learning_rate": 4.130126561304376e-05, + "loss": 1.162, + "step": 19789 + }, + { + "epoch": 0.7087220441563558, + "grad_norm": 1.9096463918685913, + "learning_rate": 4.129187549219308e-05, + "loss": 1.193, + "step": 19790 + }, + { + "epoch": 0.708757856285924, + "grad_norm": 1.6299283504486084, + "learning_rate": 4.12824861611845e-05, + "loss": 1.1499, + "step": 19791 + }, + { + "epoch": 0.7087936684154923, + "grad_norm": 1.6387724876403809, + "learning_rate": 4.127309762014435e-05, + "loss": 1.2145, + "step": 19792 + }, + { + "epoch": 0.7088294805450606, + "grad_norm": 1.6671749353408813, + "learning_rate": 4.12637098691989e-05, + "loss": 1.0821, + "step": 19793 + }, + { + "epoch": 0.7088652926746289, + "grad_norm": 1.6705737113952637, + "learning_rate": 4.125432290847446e-05, + "loss": 0.9481, + "step": 19794 + }, + { + "epoch": 0.7089011048041972, + "grad_norm": 1.4600613117218018, + "learning_rate": 4.124493673809733e-05, + "loss": 0.9184, + "step": 19795 + }, + { + "epoch": 0.7089369169337655, + "grad_norm": 2.1939454078674316, + "learning_rate": 4.123555135819382e-05, + "loss": 1.2557, + "step": 19796 + }, + { + "epoch": 0.7089727290633338, + "grad_norm": 1.4875580072402954, + "learning_rate": 4.122616676889014e-05, + "loss": 1.1489, + "step": 19797 + }, + { + "epoch": 0.709008541192902, + "grad_norm": 1.8212848901748657, + "learning_rate": 4.121678297031256e-05, + "loss": 1.1548, + "step": 19798 + }, + { + "epoch": 0.7090443533224703, + "grad_norm": 1.8699716329574585, + "learning_rate": 4.1207399962587356e-05, + "loss": 1.1352, + "step": 19799 + }, + { + "epoch": 0.7090801654520386, + "grad_norm": 1.3277360200881958, + "learning_rate": 4.119801774584077e-05, + "loss": 0.9388, + "step": 19800 + }, + { + "epoch": 0.7091159775816069, + "grad_norm": 1.724356770515442, + "learning_rate": 4.118863632019898e-05, + "loss": 1.0973, + "step": 19801 + }, + { + "epoch": 0.7091517897111752, + "grad_norm": 2.282465696334839, + "learning_rate": 4.117925568578822e-05, + "loss": 1.0501, + "step": 19802 + }, + { + "epoch": 0.7091876018407435, + "grad_norm": 1.5681428909301758, + "learning_rate": 4.116987584273474e-05, + "loss": 0.8886, + "step": 19803 + }, + { + "epoch": 0.7092234139703117, + "grad_norm": 1.36475670337677, + "learning_rate": 4.116049679116466e-05, + "loss": 0.8721, + "step": 19804 + }, + { + "epoch": 0.70925922609988, + "grad_norm": 1.510061264038086, + "learning_rate": 4.11511185312042e-05, + "loss": 1.2033, + "step": 19805 + }, + { + "epoch": 0.7092950382294483, + "grad_norm": 1.5638095140457153, + "learning_rate": 4.114174106297952e-05, + "loss": 0.931, + "step": 19806 + }, + { + "epoch": 0.7093308503590166, + "grad_norm": 1.5506243705749512, + "learning_rate": 4.113236438661684e-05, + "loss": 0.9939, + "step": 19807 + }, + { + "epoch": 0.7093666624885849, + "grad_norm": 1.5984351634979248, + "learning_rate": 4.112298850224223e-05, + "loss": 1.0833, + "step": 19808 + }, + { + "epoch": 0.7094024746181532, + "grad_norm": 1.1541565656661987, + "learning_rate": 4.111361340998186e-05, + "loss": 1.0914, + "step": 19809 + }, + { + "epoch": 0.7094382867477215, + "grad_norm": 1.8074474334716797, + "learning_rate": 4.11042391099619e-05, + "loss": 1.0608, + "step": 19810 + }, + { + "epoch": 0.7094740988772897, + "grad_norm": 1.459343671798706, + "learning_rate": 4.109486560230839e-05, + "loss": 1.1554, + "step": 19811 + }, + { + "epoch": 0.709509911006858, + "grad_norm": 1.397610068321228, + "learning_rate": 4.108549288714748e-05, + "loss": 1.1963, + "step": 19812 + }, + { + "epoch": 0.7095457231364263, + "grad_norm": 1.530643343925476, + "learning_rate": 4.107612096460528e-05, + "loss": 1.1169, + "step": 19813 + }, + { + "epoch": 0.7095815352659945, + "grad_norm": 1.386247992515564, + "learning_rate": 4.1066749834807895e-05, + "loss": 1.1023, + "step": 19814 + }, + { + "epoch": 0.7096173473955629, + "grad_norm": 1.3136731386184692, + "learning_rate": 4.105737949788133e-05, + "loss": 1.1546, + "step": 19815 + }, + { + "epoch": 0.7096531595251312, + "grad_norm": 1.1867680549621582, + "learning_rate": 4.10480099539517e-05, + "loss": 1.184, + "step": 19816 + }, + { + "epoch": 0.7096889716546995, + "grad_norm": 1.4248127937316895, + "learning_rate": 4.103864120314506e-05, + "loss": 1.0957, + "step": 19817 + }, + { + "epoch": 0.7097247837842677, + "grad_norm": 1.598416805267334, + "learning_rate": 4.1029273245587476e-05, + "loss": 1.1674, + "step": 19818 + }, + { + "epoch": 0.709760595913836, + "grad_norm": 1.6394050121307373, + "learning_rate": 4.101990608140492e-05, + "loss": 1.1384, + "step": 19819 + }, + { + "epoch": 0.7097964080434043, + "grad_norm": 1.3309564590454102, + "learning_rate": 4.101053971072345e-05, + "loss": 1.0853, + "step": 19820 + }, + { + "epoch": 0.7098322201729725, + "grad_norm": 1.7929539680480957, + "learning_rate": 4.1001174133669116e-05, + "loss": 1.211, + "step": 19821 + }, + { + "epoch": 0.7098680323025409, + "grad_norm": 1.589555025100708, + "learning_rate": 4.099180935036784e-05, + "loss": 1.1159, + "step": 19822 + }, + { + "epoch": 0.7099038444321092, + "grad_norm": 1.7943637371063232, + "learning_rate": 4.0982445360945654e-05, + "loss": 1.2465, + "step": 19823 + }, + { + "epoch": 0.7099396565616775, + "grad_norm": 1.5794798135757446, + "learning_rate": 4.097308216552854e-05, + "loss": 1.2082, + "step": 19824 + }, + { + "epoch": 0.7099754686912457, + "grad_norm": 1.1373004913330078, + "learning_rate": 4.0963719764242504e-05, + "loss": 1.1286, + "step": 19825 + }, + { + "epoch": 0.710011280820814, + "grad_norm": 1.5198605060577393, + "learning_rate": 4.0954358157213436e-05, + "loss": 0.9667, + "step": 19826 + }, + { + "epoch": 0.7100470929503823, + "grad_norm": 1.4054044485092163, + "learning_rate": 4.0944997344567304e-05, + "loss": 0.9499, + "step": 19827 + }, + { + "epoch": 0.7100829050799505, + "grad_norm": 1.3229047060012817, + "learning_rate": 4.0935637326430095e-05, + "loss": 0.9965, + "step": 19828 + }, + { + "epoch": 0.7101187172095189, + "grad_norm": 1.3342316150665283, + "learning_rate": 4.092627810292767e-05, + "loss": 1.1793, + "step": 19829 + }, + { + "epoch": 0.7101545293390872, + "grad_norm": 1.3451999425888062, + "learning_rate": 4.0916919674185974e-05, + "loss": 1.0893, + "step": 19830 + }, + { + "epoch": 0.7101903414686555, + "grad_norm": 1.4729527235031128, + "learning_rate": 4.09075620403309e-05, + "loss": 1.1313, + "step": 19831 + }, + { + "epoch": 0.7102261535982237, + "grad_norm": 1.6414110660552979, + "learning_rate": 4.0898205201488404e-05, + "loss": 1.3023, + "step": 19832 + }, + { + "epoch": 0.710261965727792, + "grad_norm": 1.5351927280426025, + "learning_rate": 4.088884915778427e-05, + "loss": 1.0496, + "step": 19833 + }, + { + "epoch": 0.7102977778573603, + "grad_norm": 1.2975846529006958, + "learning_rate": 4.087949390934443e-05, + "loss": 1.1269, + "step": 19834 + }, + { + "epoch": 0.7103335899869285, + "grad_norm": 1.6300668716430664, + "learning_rate": 4.0870139456294745e-05, + "loss": 1.0625, + "step": 19835 + }, + { + "epoch": 0.7103694021164969, + "grad_norm": 1.448746919631958, + "learning_rate": 4.0860785798761094e-05, + "loss": 1.0112, + "step": 19836 + }, + { + "epoch": 0.7104052142460652, + "grad_norm": 1.3101807832717896, + "learning_rate": 4.0851432936869296e-05, + "loss": 0.9426, + "step": 19837 + }, + { + "epoch": 0.7104410263756334, + "grad_norm": 1.557068109512329, + "learning_rate": 4.0842080870745084e-05, + "loss": 1.1715, + "step": 19838 + }, + { + "epoch": 0.7104768385052017, + "grad_norm": 1.6848877668380737, + "learning_rate": 4.083272960051444e-05, + "loss": 1.1247, + "step": 19839 + }, + { + "epoch": 0.71051265063477, + "grad_norm": 1.7432414293289185, + "learning_rate": 4.0823379126303064e-05, + "loss": 1.0091, + "step": 19840 + }, + { + "epoch": 0.7105484627643383, + "grad_norm": 1.4805221557617188, + "learning_rate": 4.0814029448236803e-05, + "loss": 1.0383, + "step": 19841 + }, + { + "epoch": 0.7105842748939065, + "grad_norm": 1.740105152130127, + "learning_rate": 4.080468056644141e-05, + "loss": 0.9328, + "step": 19842 + }, + { + "epoch": 0.7106200870234749, + "grad_norm": 1.7938693761825562, + "learning_rate": 4.0795332481042736e-05, + "loss": 0.9825, + "step": 19843 + }, + { + "epoch": 0.7106558991530432, + "grad_norm": 1.4994608163833618, + "learning_rate": 4.078598519216645e-05, + "loss": 1.1415, + "step": 19844 + }, + { + "epoch": 0.7106917112826114, + "grad_norm": 1.9219577312469482, + "learning_rate": 4.077663869993835e-05, + "loss": 1.0311, + "step": 19845 + }, + { + "epoch": 0.7107275234121797, + "grad_norm": 1.3750977516174316, + "learning_rate": 4.076729300448423e-05, + "loss": 1.0264, + "step": 19846 + }, + { + "epoch": 0.710763335541748, + "grad_norm": 1.4601980447769165, + "learning_rate": 4.075794810592973e-05, + "loss": 0.8724, + "step": 19847 + }, + { + "epoch": 0.7107991476713162, + "grad_norm": 1.6197582483291626, + "learning_rate": 4.074860400440067e-05, + "loss": 1.1034, + "step": 19848 + }, + { + "epoch": 0.7108349598008845, + "grad_norm": 1.333065152168274, + "learning_rate": 4.073926070002264e-05, + "loss": 1.0934, + "step": 19849 + }, + { + "epoch": 0.7108707719304529, + "grad_norm": 1.3134870529174805, + "learning_rate": 4.072991819292148e-05, + "loss": 1.3584, + "step": 19850 + }, + { + "epoch": 0.7109065840600212, + "grad_norm": 1.5797460079193115, + "learning_rate": 4.0720576483222795e-05, + "loss": 1.0269, + "step": 19851 + }, + { + "epoch": 0.7109423961895894, + "grad_norm": 1.437678337097168, + "learning_rate": 4.0711235571052306e-05, + "loss": 1.2089, + "step": 19852 + }, + { + "epoch": 0.7109782083191577, + "grad_norm": 1.362870454788208, + "learning_rate": 4.070189545653561e-05, + "loss": 0.9713, + "step": 19853 + }, + { + "epoch": 0.711014020448726, + "grad_norm": 1.524302363395691, + "learning_rate": 4.069255613979849e-05, + "loss": 1.1208, + "step": 19854 + }, + { + "epoch": 0.7110498325782942, + "grad_norm": 1.465962290763855, + "learning_rate": 4.068321762096652e-05, + "loss": 0.9511, + "step": 19855 + }, + { + "epoch": 0.7110856447078625, + "grad_norm": 1.8152214288711548, + "learning_rate": 4.067387990016528e-05, + "loss": 1.114, + "step": 19856 + }, + { + "epoch": 0.7111214568374309, + "grad_norm": 1.3935328722000122, + "learning_rate": 4.0664542977520526e-05, + "loss": 1.0701, + "step": 19857 + }, + { + "epoch": 0.7111572689669992, + "grad_norm": 1.341173768043518, + "learning_rate": 4.065520685315777e-05, + "loss": 0.9661, + "step": 19858 + }, + { + "epoch": 0.7111930810965674, + "grad_norm": 1.5437393188476562, + "learning_rate": 4.0645871527202695e-05, + "loss": 1.0219, + "step": 19859 + }, + { + "epoch": 0.7112288932261357, + "grad_norm": 1.4624803066253662, + "learning_rate": 4.063653699978079e-05, + "loss": 1.1185, + "step": 19860 + }, + { + "epoch": 0.711264705355704, + "grad_norm": 1.3092159032821655, + "learning_rate": 4.062720327101778e-05, + "loss": 1.2216, + "step": 19861 + }, + { + "epoch": 0.7113005174852722, + "grad_norm": 1.8240495920181274, + "learning_rate": 4.0617870341039155e-05, + "loss": 1.1417, + "step": 19862 + }, + { + "epoch": 0.7113363296148405, + "grad_norm": 1.5892084836959839, + "learning_rate": 4.060853820997046e-05, + "loss": 1.0391, + "step": 19863 + }, + { + "epoch": 0.7113721417444089, + "grad_norm": 1.4721471071243286, + "learning_rate": 4.059920687793727e-05, + "loss": 1.1487, + "step": 19864 + }, + { + "epoch": 0.7114079538739772, + "grad_norm": 1.3369191884994507, + "learning_rate": 4.058987634506514e-05, + "loss": 1.0181, + "step": 19865 + }, + { + "epoch": 0.7114437660035454, + "grad_norm": 1.4438382387161255, + "learning_rate": 4.058054661147961e-05, + "loss": 1.2104, + "step": 19866 + }, + { + "epoch": 0.7114795781331137, + "grad_norm": 1.3985077142715454, + "learning_rate": 4.057121767730612e-05, + "loss": 1.0794, + "step": 19867 + }, + { + "epoch": 0.711515390262682, + "grad_norm": 1.3013545274734497, + "learning_rate": 4.05618895426703e-05, + "loss": 1.1779, + "step": 19868 + }, + { + "epoch": 0.7115512023922502, + "grad_norm": 1.6006778478622437, + "learning_rate": 4.055256220769755e-05, + "loss": 1.0369, + "step": 19869 + }, + { + "epoch": 0.7115870145218185, + "grad_norm": 1.496996521949768, + "learning_rate": 4.0543235672513434e-05, + "loss": 1.0455, + "step": 19870 + }, + { + "epoch": 0.7116228266513869, + "grad_norm": 1.5247526168823242, + "learning_rate": 4.0533909937243365e-05, + "loss": 1.0469, + "step": 19871 + }, + { + "epoch": 0.7116586387809551, + "grad_norm": 1.4711021184921265, + "learning_rate": 4.0524585002012815e-05, + "loss": 1.0799, + "step": 19872 + }, + { + "epoch": 0.7116944509105234, + "grad_norm": 1.4971121549606323, + "learning_rate": 4.05152608669473e-05, + "loss": 1.0103, + "step": 19873 + }, + { + "epoch": 0.7117302630400917, + "grad_norm": 1.530035376548767, + "learning_rate": 4.0505937532172175e-05, + "loss": 1.0781, + "step": 19874 + }, + { + "epoch": 0.71176607516966, + "grad_norm": 1.5622495412826538, + "learning_rate": 4.049661499781293e-05, + "loss": 1.1977, + "step": 19875 + }, + { + "epoch": 0.7118018872992282, + "grad_norm": 1.3826642036437988, + "learning_rate": 4.048729326399498e-05, + "loss": 1.0218, + "step": 19876 + }, + { + "epoch": 0.7118376994287965, + "grad_norm": 1.4727274179458618, + "learning_rate": 4.047797233084375e-05, + "loss": 1.0503, + "step": 19877 + }, + { + "epoch": 0.7118735115583649, + "grad_norm": 1.4892833232879639, + "learning_rate": 4.0468652198484603e-05, + "loss": 0.7673, + "step": 19878 + }, + { + "epoch": 0.7119093236879331, + "grad_norm": 1.5200010538101196, + "learning_rate": 4.045933286704296e-05, + "loss": 1.1301, + "step": 19879 + }, + { + "epoch": 0.7119451358175014, + "grad_norm": 1.272925853729248, + "learning_rate": 4.0450014336644204e-05, + "loss": 0.9509, + "step": 19880 + }, + { + "epoch": 0.7119809479470697, + "grad_norm": 1.924553632736206, + "learning_rate": 4.0440696607413665e-05, + "loss": 1.2034, + "step": 19881 + }, + { + "epoch": 0.712016760076638, + "grad_norm": 1.5795788764953613, + "learning_rate": 4.0431379679476735e-05, + "loss": 1.191, + "step": 19882 + }, + { + "epoch": 0.7120525722062062, + "grad_norm": 1.2863924503326416, + "learning_rate": 4.042206355295875e-05, + "loss": 1.0986, + "step": 19883 + }, + { + "epoch": 0.7120883843357745, + "grad_norm": 1.2149213552474976, + "learning_rate": 4.0412748227985075e-05, + "loss": 1.0199, + "step": 19884 + }, + { + "epoch": 0.7121241964653429, + "grad_norm": 1.4094399213790894, + "learning_rate": 4.040343370468098e-05, + "loss": 1.128, + "step": 19885 + }, + { + "epoch": 0.7121600085949111, + "grad_norm": 1.3196462392807007, + "learning_rate": 4.039411998317182e-05, + "loss": 1.2018, + "step": 19886 + }, + { + "epoch": 0.7121958207244794, + "grad_norm": 1.3143614530563354, + "learning_rate": 4.038480706358287e-05, + "loss": 0.9616, + "step": 19887 + }, + { + "epoch": 0.7122316328540477, + "grad_norm": 1.2486367225646973, + "learning_rate": 4.0375494946039495e-05, + "loss": 0.8359, + "step": 19888 + }, + { + "epoch": 0.7122674449836159, + "grad_norm": 1.575297236442566, + "learning_rate": 4.0366183630666885e-05, + "loss": 1.0859, + "step": 19889 + }, + { + "epoch": 0.7123032571131842, + "grad_norm": 1.6089564561843872, + "learning_rate": 4.035687311759036e-05, + "loss": 0.9753, + "step": 19890 + }, + { + "epoch": 0.7123390692427525, + "grad_norm": 1.8845710754394531, + "learning_rate": 4.03475634069352e-05, + "loss": 0.9889, + "step": 19891 + }, + { + "epoch": 0.7123748813723209, + "grad_norm": 1.4113293886184692, + "learning_rate": 4.033825449882659e-05, + "loss": 1.1352, + "step": 19892 + }, + { + "epoch": 0.7124106935018891, + "grad_norm": 1.2770005464553833, + "learning_rate": 4.032894639338981e-05, + "loss": 1.2759, + "step": 19893 + }, + { + "epoch": 0.7124465056314574, + "grad_norm": 1.6303642988204956, + "learning_rate": 4.031963909075009e-05, + "loss": 1.0678, + "step": 19894 + }, + { + "epoch": 0.7124823177610257, + "grad_norm": 1.3850769996643066, + "learning_rate": 4.0310332591032675e-05, + "loss": 0.9948, + "step": 19895 + }, + { + "epoch": 0.7125181298905939, + "grad_norm": 1.9720654487609863, + "learning_rate": 4.030102689436271e-05, + "loss": 1.0812, + "step": 19896 + }, + { + "epoch": 0.7125539420201622, + "grad_norm": 1.3742806911468506, + "learning_rate": 4.0291722000865416e-05, + "loss": 0.9993, + "step": 19897 + }, + { + "epoch": 0.7125897541497305, + "grad_norm": 1.5753098726272583, + "learning_rate": 4.0282417910666025e-05, + "loss": 1.2159, + "step": 19898 + }, + { + "epoch": 0.7126255662792988, + "grad_norm": 1.5580188035964966, + "learning_rate": 4.027311462388964e-05, + "loss": 1.2175, + "step": 19899 + }, + { + "epoch": 0.7126613784088671, + "grad_norm": 1.685380458831787, + "learning_rate": 4.026381214066145e-05, + "loss": 1.159, + "step": 19900 + }, + { + "epoch": 0.7126971905384354, + "grad_norm": 1.677437424659729, + "learning_rate": 4.025451046110661e-05, + "loss": 1.2172, + "step": 19901 + }, + { + "epoch": 0.7127330026680037, + "grad_norm": 1.5325183868408203, + "learning_rate": 4.024520958535031e-05, + "loss": 1.2062, + "step": 19902 + }, + { + "epoch": 0.7127688147975719, + "grad_norm": 1.989314079284668, + "learning_rate": 4.023590951351759e-05, + "loss": 1.0633, + "step": 19903 + }, + { + "epoch": 0.7128046269271402, + "grad_norm": 1.3443384170532227, + "learning_rate": 4.022661024573362e-05, + "loss": 0.9604, + "step": 19904 + }, + { + "epoch": 0.7128404390567085, + "grad_norm": 1.5743073225021362, + "learning_rate": 4.0217311782123514e-05, + "loss": 1.2635, + "step": 19905 + }, + { + "epoch": 0.7128762511862768, + "grad_norm": 1.135441780090332, + "learning_rate": 4.020801412281239e-05, + "loss": 0.9204, + "step": 19906 + }, + { + "epoch": 0.7129120633158451, + "grad_norm": 1.5515044927597046, + "learning_rate": 4.019871726792528e-05, + "loss": 1.0519, + "step": 19907 + }, + { + "epoch": 0.7129478754454134, + "grad_norm": 1.452314019203186, + "learning_rate": 4.0189421217587297e-05, + "loss": 1.1597, + "step": 19908 + }, + { + "epoch": 0.7129836875749817, + "grad_norm": 1.323270559310913, + "learning_rate": 4.0180125971923524e-05, + "loss": 1.1209, + "step": 19909 + }, + { + "epoch": 0.7130194997045499, + "grad_norm": 1.625570297241211, + "learning_rate": 4.017083153105897e-05, + "loss": 1.1898, + "step": 19910 + }, + { + "epoch": 0.7130553118341182, + "grad_norm": 1.4692285060882568, + "learning_rate": 4.0161537895118695e-05, + "loss": 1.0115, + "step": 19911 + }, + { + "epoch": 0.7130911239636865, + "grad_norm": 1.4072948694229126, + "learning_rate": 4.0152245064227745e-05, + "loss": 0.8768, + "step": 19912 + }, + { + "epoch": 0.7131269360932548, + "grad_norm": 1.6781015396118164, + "learning_rate": 4.0142953038511176e-05, + "loss": 1.1146, + "step": 19913 + }, + { + "epoch": 0.7131627482228231, + "grad_norm": 1.6506657600402832, + "learning_rate": 4.013366181809393e-05, + "loss": 0.957, + "step": 19914 + }, + { + "epoch": 0.7131985603523914, + "grad_norm": 1.583111047744751, + "learning_rate": 4.0124371403101034e-05, + "loss": 1.2132, + "step": 19915 + }, + { + "epoch": 0.7132343724819596, + "grad_norm": 2.2833662033081055, + "learning_rate": 4.0115081793657525e-05, + "loss": 1.2375, + "step": 19916 + }, + { + "epoch": 0.7132701846115279, + "grad_norm": 2.1634020805358887, + "learning_rate": 4.010579298988832e-05, + "loss": 1.3188, + "step": 19917 + }, + { + "epoch": 0.7133059967410962, + "grad_norm": 1.5510650873184204, + "learning_rate": 4.00965049919184e-05, + "loss": 1.0696, + "step": 19918 + }, + { + "epoch": 0.7133418088706645, + "grad_norm": 1.8610690832138062, + "learning_rate": 4.0087217799872746e-05, + "loss": 1.111, + "step": 19919 + }, + { + "epoch": 0.7133776210002328, + "grad_norm": 1.6060444116592407, + "learning_rate": 4.007793141387633e-05, + "loss": 1.0497, + "step": 19920 + }, + { + "epoch": 0.7134134331298011, + "grad_norm": 1.3621490001678467, + "learning_rate": 4.0068645834054e-05, + "loss": 1.0316, + "step": 19921 + }, + { + "epoch": 0.7134492452593694, + "grad_norm": 1.753942847251892, + "learning_rate": 4.0059361060530755e-05, + "loss": 1.3088, + "step": 19922 + }, + { + "epoch": 0.7134850573889376, + "grad_norm": 1.7130298614501953, + "learning_rate": 4.005007709343147e-05, + "loss": 1.0066, + "step": 19923 + }, + { + "epoch": 0.7135208695185059, + "grad_norm": 1.3243564367294312, + "learning_rate": 4.004079393288112e-05, + "loss": 1.0947, + "step": 19924 + }, + { + "epoch": 0.7135566816480742, + "grad_norm": 1.3621553182601929, + "learning_rate": 4.00315115790045e-05, + "loss": 1.3134, + "step": 19925 + }, + { + "epoch": 0.7135924937776424, + "grad_norm": 1.6574366092681885, + "learning_rate": 4.002223003192654e-05, + "loss": 1.1122, + "step": 19926 + }, + { + "epoch": 0.7136283059072108, + "grad_norm": 1.4134894609451294, + "learning_rate": 4.001294929177215e-05, + "loss": 1.2988, + "step": 19927 + }, + { + "epoch": 0.7136641180367791, + "grad_norm": 1.5092947483062744, + "learning_rate": 4.0003669358666106e-05, + "loss": 1.1069, + "step": 19928 + }, + { + "epoch": 0.7136999301663474, + "grad_norm": 1.0842621326446533, + "learning_rate": 3.9994390232733304e-05, + "loss": 1.0047, + "step": 19929 + }, + { + "epoch": 0.7137357422959156, + "grad_norm": 1.5270392894744873, + "learning_rate": 3.9985111914098585e-05, + "loss": 1.0455, + "step": 19930 + }, + { + "epoch": 0.7137715544254839, + "grad_norm": 1.612499475479126, + "learning_rate": 3.99758344028868e-05, + "loss": 1.1587, + "step": 19931 + }, + { + "epoch": 0.7138073665550522, + "grad_norm": 2.107576847076416, + "learning_rate": 3.99665576992227e-05, + "loss": 1.2617, + "step": 19932 + }, + { + "epoch": 0.7138431786846204, + "grad_norm": 1.4349068403244019, + "learning_rate": 3.995728180323114e-05, + "loss": 1.0861, + "step": 19933 + }, + { + "epoch": 0.7138789908141888, + "grad_norm": 1.6064937114715576, + "learning_rate": 3.994800671503694e-05, + "loss": 1.0641, + "step": 19934 + }, + { + "epoch": 0.7139148029437571, + "grad_norm": 1.4306738376617432, + "learning_rate": 3.9938732434764805e-05, + "loss": 1.1695, + "step": 19935 + }, + { + "epoch": 0.7139506150733254, + "grad_norm": 1.450032114982605, + "learning_rate": 3.992945896253958e-05, + "loss": 1.1676, + "step": 19936 + }, + { + "epoch": 0.7139864272028936, + "grad_norm": 1.566333532333374, + "learning_rate": 3.992018629848594e-05, + "loss": 1.0385, + "step": 19937 + }, + { + "epoch": 0.7140222393324619, + "grad_norm": 1.581192970275879, + "learning_rate": 3.991091444272876e-05, + "loss": 0.9992, + "step": 19938 + }, + { + "epoch": 0.7140580514620302, + "grad_norm": 1.4058130979537964, + "learning_rate": 3.9901643395392685e-05, + "loss": 1.1639, + "step": 19939 + }, + { + "epoch": 0.7140938635915984, + "grad_norm": 1.4530644416809082, + "learning_rate": 3.989237315660248e-05, + "loss": 1.0415, + "step": 19940 + }, + { + "epoch": 0.7141296757211668, + "grad_norm": 1.7481436729431152, + "learning_rate": 3.988310372648285e-05, + "loss": 1.0976, + "step": 19941 + }, + { + "epoch": 0.7141654878507351, + "grad_norm": 1.368700623512268, + "learning_rate": 3.9873835105158564e-05, + "loss": 1.162, + "step": 19942 + }, + { + "epoch": 0.7142012999803034, + "grad_norm": 1.6358046531677246, + "learning_rate": 3.9864567292754266e-05, + "loss": 1.2911, + "step": 19943 + }, + { + "epoch": 0.7142371121098716, + "grad_norm": 1.877323031425476, + "learning_rate": 3.985530028939456e-05, + "loss": 0.9841, + "step": 19944 + }, + { + "epoch": 0.7142729242394399, + "grad_norm": 1.626046895980835, + "learning_rate": 3.9846034095204285e-05, + "loss": 1.1178, + "step": 19945 + }, + { + "epoch": 0.7143087363690082, + "grad_norm": 1.512300968170166, + "learning_rate": 3.9836768710308e-05, + "loss": 1.2719, + "step": 19946 + }, + { + "epoch": 0.7143445484985764, + "grad_norm": 1.5662834644317627, + "learning_rate": 3.982750413483043e-05, + "loss": 0.9083, + "step": 19947 + }, + { + "epoch": 0.7143803606281448, + "grad_norm": 1.4179354906082153, + "learning_rate": 3.981824036889609e-05, + "loss": 1.043, + "step": 19948 + }, + { + "epoch": 0.7144161727577131, + "grad_norm": 2.2024474143981934, + "learning_rate": 3.9808977412629764e-05, + "loss": 1.1236, + "step": 19949 + }, + { + "epoch": 0.7144519848872813, + "grad_norm": 1.3445385694503784, + "learning_rate": 3.979971526615598e-05, + "loss": 0.8832, + "step": 19950 + }, + { + "epoch": 0.7144877970168496, + "grad_norm": 1.460424542427063, + "learning_rate": 3.9790453929599384e-05, + "loss": 1.0134, + "step": 19951 + }, + { + "epoch": 0.7145236091464179, + "grad_norm": 1.563347578048706, + "learning_rate": 3.978119340308458e-05, + "loss": 1.1029, + "step": 19952 + }, + { + "epoch": 0.7145594212759862, + "grad_norm": 1.3440603017807007, + "learning_rate": 3.977193368673612e-05, + "loss": 0.9632, + "step": 19953 + }, + { + "epoch": 0.7145952334055544, + "grad_norm": 1.7457096576690674, + "learning_rate": 3.976267478067863e-05, + "loss": 1.116, + "step": 19954 + }, + { + "epoch": 0.7146310455351228, + "grad_norm": 2.040177345275879, + "learning_rate": 3.975341668503659e-05, + "loss": 1.1715, + "step": 19955 + }, + { + "epoch": 0.7146668576646911, + "grad_norm": 1.998655915260315, + "learning_rate": 3.9744159399934676e-05, + "loss": 1.1923, + "step": 19956 + }, + { + "epoch": 0.7147026697942593, + "grad_norm": 1.2471472024917603, + "learning_rate": 3.973490292549735e-05, + "loss": 1.1959, + "step": 19957 + }, + { + "epoch": 0.7147384819238276, + "grad_norm": 1.4957674741744995, + "learning_rate": 3.97256472618492e-05, + "loss": 1.1458, + "step": 19958 + }, + { + "epoch": 0.7147742940533959, + "grad_norm": 1.2919107675552368, + "learning_rate": 3.971639240911468e-05, + "loss": 0.8734, + "step": 19959 + }, + { + "epoch": 0.7148101061829641, + "grad_norm": 1.3541940450668335, + "learning_rate": 3.970713836741834e-05, + "loss": 0.9003, + "step": 19960 + }, + { + "epoch": 0.7148459183125324, + "grad_norm": 1.293745517730713, + "learning_rate": 3.9697885136884716e-05, + "loss": 1.116, + "step": 19961 + }, + { + "epoch": 0.7148817304421008, + "grad_norm": 1.2961771488189697, + "learning_rate": 3.968863271763822e-05, + "loss": 1.1033, + "step": 19962 + }, + { + "epoch": 0.7149175425716691, + "grad_norm": 1.4320566654205322, + "learning_rate": 3.967938110980338e-05, + "loss": 1.015, + "step": 19963 + }, + { + "epoch": 0.7149533547012373, + "grad_norm": 1.7321830987930298, + "learning_rate": 3.9670130313504675e-05, + "loss": 1.1916, + "step": 19964 + }, + { + "epoch": 0.7149891668308056, + "grad_norm": 1.6137233972549438, + "learning_rate": 3.9660880328866556e-05, + "loss": 0.9866, + "step": 19965 + }, + { + "epoch": 0.7150249789603739, + "grad_norm": 1.2842581272125244, + "learning_rate": 3.96516311560134e-05, + "loss": 1.158, + "step": 19966 + }, + { + "epoch": 0.7150607910899421, + "grad_norm": 1.8845502138137817, + "learning_rate": 3.964238279506979e-05, + "loss": 1.2689, + "step": 19967 + }, + { + "epoch": 0.7150966032195104, + "grad_norm": 1.4926878213882446, + "learning_rate": 3.963313524616005e-05, + "loss": 1.0664, + "step": 19968 + }, + { + "epoch": 0.7151324153490788, + "grad_norm": 1.5640817880630493, + "learning_rate": 3.962388850940857e-05, + "loss": 1.1938, + "step": 19969 + }, + { + "epoch": 0.7151682274786471, + "grad_norm": 1.6266807317733765, + "learning_rate": 3.9614642584939784e-05, + "loss": 0.9988, + "step": 19970 + }, + { + "epoch": 0.7152040396082153, + "grad_norm": 1.5353496074676514, + "learning_rate": 3.96053974728781e-05, + "loss": 1.1889, + "step": 19971 + }, + { + "epoch": 0.7152398517377836, + "grad_norm": 1.8530263900756836, + "learning_rate": 3.9596153173347925e-05, + "loss": 1.2896, + "step": 19972 + }, + { + "epoch": 0.7152756638673519, + "grad_norm": 1.3062500953674316, + "learning_rate": 3.958690968647356e-05, + "loss": 1.135, + "step": 19973 + }, + { + "epoch": 0.7153114759969201, + "grad_norm": 2.0014429092407227, + "learning_rate": 3.9577667012379395e-05, + "loss": 1.3396, + "step": 19974 + }, + { + "epoch": 0.7153472881264884, + "grad_norm": 1.42420494556427, + "learning_rate": 3.956842515118978e-05, + "loss": 1.0829, + "step": 19975 + }, + { + "epoch": 0.7153831002560568, + "grad_norm": 1.3357372283935547, + "learning_rate": 3.955918410302909e-05, + "loss": 0.9373, + "step": 19976 + }, + { + "epoch": 0.715418912385625, + "grad_norm": 1.4356225728988647, + "learning_rate": 3.954994386802158e-05, + "loss": 1.1288, + "step": 19977 + }, + { + "epoch": 0.7154547245151933, + "grad_norm": 1.960385799407959, + "learning_rate": 3.95407044462916e-05, + "loss": 1.0279, + "step": 19978 + }, + { + "epoch": 0.7154905366447616, + "grad_norm": 1.6278530359268188, + "learning_rate": 3.953146583796349e-05, + "loss": 1.0006, + "step": 19979 + }, + { + "epoch": 0.7155263487743299, + "grad_norm": 1.4973726272583008, + "learning_rate": 3.952222804316148e-05, + "loss": 1.2698, + "step": 19980 + }, + { + "epoch": 0.7155621609038981, + "grad_norm": 1.4605125188827515, + "learning_rate": 3.9512991062009874e-05, + "loss": 1.0285, + "step": 19981 + }, + { + "epoch": 0.7155979730334664, + "grad_norm": 1.2336605787277222, + "learning_rate": 3.9503754894632947e-05, + "loss": 1.1331, + "step": 19982 + }, + { + "epoch": 0.7156337851630348, + "grad_norm": 1.5223499536514282, + "learning_rate": 3.949451954115501e-05, + "loss": 1.2548, + "step": 19983 + }, + { + "epoch": 0.715669597292603, + "grad_norm": 1.6962127685546875, + "learning_rate": 3.948528500170021e-05, + "loss": 1.062, + "step": 19984 + }, + { + "epoch": 0.7157054094221713, + "grad_norm": 1.3597387075424194, + "learning_rate": 3.9476051276392853e-05, + "loss": 1.15, + "step": 19985 + }, + { + "epoch": 0.7157412215517396, + "grad_norm": 1.528100609779358, + "learning_rate": 3.946681836535721e-05, + "loss": 0.8334, + "step": 19986 + }, + { + "epoch": 0.7157770336813079, + "grad_norm": 1.7914583683013916, + "learning_rate": 3.945758626871738e-05, + "loss": 1.2342, + "step": 19987 + }, + { + "epoch": 0.7158128458108761, + "grad_norm": 1.6191867589950562, + "learning_rate": 3.9448354986597645e-05, + "loss": 1.0347, + "step": 19988 + }, + { + "epoch": 0.7158486579404444, + "grad_norm": 1.7248972654342651, + "learning_rate": 3.943912451912219e-05, + "loss": 1.1436, + "step": 19989 + }, + { + "epoch": 0.7158844700700128, + "grad_norm": 1.6661653518676758, + "learning_rate": 3.9429894866415226e-05, + "loss": 1.0639, + "step": 19990 + }, + { + "epoch": 0.715920282199581, + "grad_norm": 1.7606744766235352, + "learning_rate": 3.9420666028600874e-05, + "loss": 1.2727, + "step": 19991 + }, + { + "epoch": 0.7159560943291493, + "grad_norm": 1.413799524307251, + "learning_rate": 3.9411438005803305e-05, + "loss": 0.9709, + "step": 19992 + }, + { + "epoch": 0.7159919064587176, + "grad_norm": 1.590872883796692, + "learning_rate": 3.9402210798146686e-05, + "loss": 1.1173, + "step": 19993 + }, + { + "epoch": 0.7160277185882858, + "grad_norm": 1.3411520719528198, + "learning_rate": 3.939298440575519e-05, + "loss": 1.1845, + "step": 19994 + }, + { + "epoch": 0.7160635307178541, + "grad_norm": 1.7579081058502197, + "learning_rate": 3.9383758828752884e-05, + "loss": 1.152, + "step": 19995 + }, + { + "epoch": 0.7160993428474224, + "grad_norm": 1.273149013519287, + "learning_rate": 3.93745340672639e-05, + "loss": 1.1313, + "step": 19996 + }, + { + "epoch": 0.7161351549769908, + "grad_norm": 1.309494972229004, + "learning_rate": 3.936531012141241e-05, + "loss": 1.0316, + "step": 19997 + }, + { + "epoch": 0.716170967106559, + "grad_norm": 1.231075406074524, + "learning_rate": 3.935608699132242e-05, + "loss": 0.8883, + "step": 19998 + }, + { + "epoch": 0.7162067792361273, + "grad_norm": 1.3461781740188599, + "learning_rate": 3.9346864677118046e-05, + "loss": 1.0345, + "step": 19999 + }, + { + "epoch": 0.7162425913656956, + "grad_norm": 1.593717336654663, + "learning_rate": 3.9337643178923376e-05, + "loss": 1.0412, + "step": 20000 + }, + { + "epoch": 0.7162784034952638, + "grad_norm": 1.851452112197876, + "learning_rate": 3.932842249686251e-05, + "loss": 1.1282, + "step": 20001 + }, + { + "epoch": 0.7163142156248321, + "grad_norm": 1.434279203414917, + "learning_rate": 3.9319202631059414e-05, + "loss": 1.0715, + "step": 20002 + }, + { + "epoch": 0.7163500277544004, + "grad_norm": 1.6585959196090698, + "learning_rate": 3.9309983581638173e-05, + "loss": 1.0905, + "step": 20003 + }, + { + "epoch": 0.7163858398839686, + "grad_norm": 1.6368474960327148, + "learning_rate": 3.9300765348722854e-05, + "loss": 0.9473, + "step": 20004 + }, + { + "epoch": 0.716421652013537, + "grad_norm": 1.7079414129257202, + "learning_rate": 3.929154793243741e-05, + "loss": 1.1221, + "step": 20005 + }, + { + "epoch": 0.7164574641431053, + "grad_norm": 1.6701778173446655, + "learning_rate": 3.928233133290589e-05, + "loss": 1.1915, + "step": 20006 + }, + { + "epoch": 0.7164932762726736, + "grad_norm": 1.8792985677719116, + "learning_rate": 3.927311555025227e-05, + "loss": 1.2742, + "step": 20007 + }, + { + "epoch": 0.7165290884022418, + "grad_norm": 1.47493314743042, + "learning_rate": 3.926390058460058e-05, + "loss": 1.0447, + "step": 20008 + }, + { + "epoch": 0.7165649005318101, + "grad_norm": 1.7910220623016357, + "learning_rate": 3.925468643607473e-05, + "loss": 1.1938, + "step": 20009 + }, + { + "epoch": 0.7166007126613784, + "grad_norm": 1.3768740892410278, + "learning_rate": 3.9245473104798726e-05, + "loss": 0.9913, + "step": 20010 + }, + { + "epoch": 0.7166365247909466, + "grad_norm": 1.5578688383102417, + "learning_rate": 3.923626059089651e-05, + "loss": 1.0036, + "step": 20011 + }, + { + "epoch": 0.716672336920515, + "grad_norm": 1.5425482988357544, + "learning_rate": 3.9227048894492055e-05, + "loss": 1.1828, + "step": 20012 + }, + { + "epoch": 0.7167081490500833, + "grad_norm": 1.3758541345596313, + "learning_rate": 3.921783801570924e-05, + "loss": 1.0071, + "step": 20013 + }, + { + "epoch": 0.7167439611796516, + "grad_norm": 1.3871614933013916, + "learning_rate": 3.9208627954672014e-05, + "loss": 1.0899, + "step": 20014 + }, + { + "epoch": 0.7167797733092198, + "grad_norm": 1.6479051113128662, + "learning_rate": 3.9199418711504307e-05, + "loss": 0.9984, + "step": 20015 + }, + { + "epoch": 0.7168155854387881, + "grad_norm": 1.4085549116134644, + "learning_rate": 3.919021028632998e-05, + "loss": 1.2234, + "step": 20016 + }, + { + "epoch": 0.7168513975683564, + "grad_norm": 1.3872815370559692, + "learning_rate": 3.918100267927292e-05, + "loss": 1.1853, + "step": 20017 + }, + { + "epoch": 0.7168872096979246, + "grad_norm": 1.2027618885040283, + "learning_rate": 3.917179589045701e-05, + "loss": 0.9932, + "step": 20018 + }, + { + "epoch": 0.716923021827493, + "grad_norm": 1.5552738904953003, + "learning_rate": 3.9162589920006164e-05, + "loss": 1.1579, + "step": 20019 + }, + { + "epoch": 0.7169588339570613, + "grad_norm": 1.6187328100204468, + "learning_rate": 3.9153384768044163e-05, + "loss": 0.9243, + "step": 20020 + }, + { + "epoch": 0.7169946460866296, + "grad_norm": 1.3606513738632202, + "learning_rate": 3.9144180434694885e-05, + "loss": 1.019, + "step": 20021 + }, + { + "epoch": 0.7170304582161978, + "grad_norm": 1.4585285186767578, + "learning_rate": 3.91349769200822e-05, + "loss": 0.8797, + "step": 20022 + }, + { + "epoch": 0.7170662703457661, + "grad_norm": 1.2393627166748047, + "learning_rate": 3.9125774224329845e-05, + "loss": 1.0706, + "step": 20023 + }, + { + "epoch": 0.7171020824753344, + "grad_norm": 1.50989830493927, + "learning_rate": 3.911657234756169e-05, + "loss": 1.0632, + "step": 20024 + }, + { + "epoch": 0.7171378946049026, + "grad_norm": 1.7280333042144775, + "learning_rate": 3.9107371289901504e-05, + "loss": 1.2251, + "step": 20025 + }, + { + "epoch": 0.717173706734471, + "grad_norm": 1.6256067752838135, + "learning_rate": 3.909817105147314e-05, + "loss": 0.9632, + "step": 20026 + }, + { + "epoch": 0.7172095188640393, + "grad_norm": 1.2900855541229248, + "learning_rate": 3.9088971632400286e-05, + "loss": 0.9754, + "step": 20027 + }, + { + "epoch": 0.7172453309936075, + "grad_norm": 1.4669007062911987, + "learning_rate": 3.907977303280674e-05, + "loss": 1.2258, + "step": 20028 + }, + { + "epoch": 0.7172811431231758, + "grad_norm": 1.3805599212646484, + "learning_rate": 3.907057525281628e-05, + "loss": 1.0389, + "step": 20029 + }, + { + "epoch": 0.7173169552527441, + "grad_norm": 1.4706881046295166, + "learning_rate": 3.906137829255266e-05, + "loss": 1.0492, + "step": 20030 + }, + { + "epoch": 0.7173527673823124, + "grad_norm": 1.3978062868118286, + "learning_rate": 3.90521821521396e-05, + "loss": 0.925, + "step": 20031 + }, + { + "epoch": 0.7173885795118806, + "grad_norm": 1.2531508207321167, + "learning_rate": 3.904298683170074e-05, + "loss": 1.1512, + "step": 20032 + }, + { + "epoch": 0.717424391641449, + "grad_norm": 1.437495231628418, + "learning_rate": 3.903379233135994e-05, + "loss": 1.0028, + "step": 20033 + }, + { + "epoch": 0.7174602037710173, + "grad_norm": 1.2211227416992188, + "learning_rate": 3.9024598651240774e-05, + "loss": 1.14, + "step": 20034 + }, + { + "epoch": 0.7174960159005855, + "grad_norm": 1.370334506034851, + "learning_rate": 3.901540579146698e-05, + "loss": 1.0933, + "step": 20035 + }, + { + "epoch": 0.7175318280301538, + "grad_norm": 1.3797385692596436, + "learning_rate": 3.900621375216226e-05, + "loss": 1.165, + "step": 20036 + }, + { + "epoch": 0.7175676401597221, + "grad_norm": 1.466452717781067, + "learning_rate": 3.8997022533450264e-05, + "loss": 1.0695, + "step": 20037 + }, + { + "epoch": 0.7176034522892903, + "grad_norm": 1.6464710235595703, + "learning_rate": 3.898783213545463e-05, + "loss": 0.8625, + "step": 20038 + }, + { + "epoch": 0.7176392644188586, + "grad_norm": 1.6767024993896484, + "learning_rate": 3.8978642558298994e-05, + "loss": 1.2171, + "step": 20039 + }, + { + "epoch": 0.717675076548427, + "grad_norm": 1.629317045211792, + "learning_rate": 3.8969453802107057e-05, + "loss": 1.1147, + "step": 20040 + }, + { + "epoch": 0.7177108886779953, + "grad_norm": 1.4562366008758545, + "learning_rate": 3.8960265867002364e-05, + "loss": 1.1739, + "step": 20041 + }, + { + "epoch": 0.7177467008075635, + "grad_norm": 1.5011276006698608, + "learning_rate": 3.895107875310858e-05, + "loss": 0.9583, + "step": 20042 + }, + { + "epoch": 0.7177825129371318, + "grad_norm": 1.5171849727630615, + "learning_rate": 3.894189246054922e-05, + "loss": 1.0631, + "step": 20043 + }, + { + "epoch": 0.7178183250667001, + "grad_norm": 1.2655755281448364, + "learning_rate": 3.893270698944802e-05, + "loss": 1.0029, + "step": 20044 + }, + { + "epoch": 0.7178541371962683, + "grad_norm": 1.5503363609313965, + "learning_rate": 3.892352233992843e-05, + "loss": 1.1614, + "step": 20045 + }, + { + "epoch": 0.7178899493258366, + "grad_norm": 1.5507601499557495, + "learning_rate": 3.89143385121141e-05, + "loss": 1.1002, + "step": 20046 + }, + { + "epoch": 0.717925761455405, + "grad_norm": 1.3198314905166626, + "learning_rate": 3.8905155506128476e-05, + "loss": 1.0431, + "step": 20047 + }, + { + "epoch": 0.7179615735849733, + "grad_norm": 2.0065059661865234, + "learning_rate": 3.889597332209526e-05, + "loss": 0.8893, + "step": 20048 + }, + { + "epoch": 0.7179973857145415, + "grad_norm": 1.629863977432251, + "learning_rate": 3.888679196013789e-05, + "loss": 1.2455, + "step": 20049 + }, + { + "epoch": 0.7180331978441098, + "grad_norm": 1.4620758295059204, + "learning_rate": 3.887761142037984e-05, + "loss": 1.1261, + "step": 20050 + }, + { + "epoch": 0.7180690099736781, + "grad_norm": 1.6959420442581177, + "learning_rate": 3.886843170294475e-05, + "loss": 1.3027, + "step": 20051 + }, + { + "epoch": 0.7181048221032463, + "grad_norm": 1.6038568019866943, + "learning_rate": 3.8859252807956035e-05, + "loss": 1.0573, + "step": 20052 + }, + { + "epoch": 0.7181406342328146, + "grad_norm": 1.2269291877746582, + "learning_rate": 3.885007473553723e-05, + "loss": 0.8886, + "step": 20053 + }, + { + "epoch": 0.718176446362383, + "grad_norm": 1.3463729619979858, + "learning_rate": 3.8840897485811737e-05, + "loss": 0.9691, + "step": 20054 + }, + { + "epoch": 0.7182122584919512, + "grad_norm": 1.3638356924057007, + "learning_rate": 3.883172105890314e-05, + "loss": 1.2436, + "step": 20055 + }, + { + "epoch": 0.7182480706215195, + "grad_norm": 1.334145188331604, + "learning_rate": 3.8822545454934836e-05, + "loss": 0.9941, + "step": 20056 + }, + { + "epoch": 0.7182838827510878, + "grad_norm": 1.3977607488632202, + "learning_rate": 3.881337067403022e-05, + "loss": 1.1904, + "step": 20057 + }, + { + "epoch": 0.7183196948806561, + "grad_norm": 1.5599278211593628, + "learning_rate": 3.8804196716312805e-05, + "loss": 1.2137, + "step": 20058 + }, + { + "epoch": 0.7183555070102243, + "grad_norm": 1.4144517183303833, + "learning_rate": 3.879502358190596e-05, + "loss": 1.103, + "step": 20059 + }, + { + "epoch": 0.7183913191397926, + "grad_norm": 1.7401057481765747, + "learning_rate": 3.878585127093317e-05, + "loss": 1.1411, + "step": 20060 + }, + { + "epoch": 0.718427131269361, + "grad_norm": 1.3955765962600708, + "learning_rate": 3.877667978351772e-05, + "loss": 1.0671, + "step": 20061 + }, + { + "epoch": 0.7184629433989292, + "grad_norm": 1.618909478187561, + "learning_rate": 3.876750911978315e-05, + "loss": 1.008, + "step": 20062 + }, + { + "epoch": 0.7184987555284975, + "grad_norm": 1.3855077028274536, + "learning_rate": 3.875833927985272e-05, + "loss": 0.9763, + "step": 20063 + }, + { + "epoch": 0.7185345676580658, + "grad_norm": 1.3396745920181274, + "learning_rate": 3.8749170263849865e-05, + "loss": 0.9938, + "step": 20064 + }, + { + "epoch": 0.718570379787634, + "grad_norm": 1.339918851852417, + "learning_rate": 3.874000207189789e-05, + "loss": 1.052, + "step": 20065 + }, + { + "epoch": 0.7186061919172023, + "grad_norm": 1.6129199266433716, + "learning_rate": 3.8730834704120164e-05, + "loss": 1.3342, + "step": 20066 + }, + { + "epoch": 0.7186420040467706, + "grad_norm": 1.7343721389770508, + "learning_rate": 3.8721668160640054e-05, + "loss": 1.2087, + "step": 20067 + }, + { + "epoch": 0.718677816176339, + "grad_norm": 1.7020739316940308, + "learning_rate": 3.871250244158083e-05, + "loss": 1.205, + "step": 20068 + }, + { + "epoch": 0.7187136283059072, + "grad_norm": 1.635473370552063, + "learning_rate": 3.870333754706583e-05, + "loss": 1.2499, + "step": 20069 + }, + { + "epoch": 0.7187494404354755, + "grad_norm": 1.8514269590377808, + "learning_rate": 3.8694173477218355e-05, + "loss": 1.3137, + "step": 20070 + }, + { + "epoch": 0.7187852525650438, + "grad_norm": 1.4302315711975098, + "learning_rate": 3.8685010232161736e-05, + "loss": 0.8705, + "step": 20071 + }, + { + "epoch": 0.718821064694612, + "grad_norm": 1.8852885961532593, + "learning_rate": 3.8675847812019175e-05, + "loss": 1.0624, + "step": 20072 + }, + { + "epoch": 0.7188568768241803, + "grad_norm": 1.4131276607513428, + "learning_rate": 3.866668621691397e-05, + "loss": 1.0209, + "step": 20073 + }, + { + "epoch": 0.7188926889537486, + "grad_norm": 1.4975852966308594, + "learning_rate": 3.8657525446969436e-05, + "loss": 1.0769, + "step": 20074 + }, + { + "epoch": 0.718928501083317, + "grad_norm": 1.7516343593597412, + "learning_rate": 3.864836550230874e-05, + "loss": 1.2096, + "step": 20075 + }, + { + "epoch": 0.7189643132128852, + "grad_norm": 1.7002086639404297, + "learning_rate": 3.863920638305512e-05, + "loss": 1.0655, + "step": 20076 + }, + { + "epoch": 0.7190001253424535, + "grad_norm": 1.7094329595565796, + "learning_rate": 3.863004808933186e-05, + "loss": 1.2447, + "step": 20077 + }, + { + "epoch": 0.7190359374720218, + "grad_norm": 1.5854564905166626, + "learning_rate": 3.8620890621262164e-05, + "loss": 1.2273, + "step": 20078 + }, + { + "epoch": 0.71907174960159, + "grad_norm": 1.4068158864974976, + "learning_rate": 3.8611733978969176e-05, + "loss": 1.1094, + "step": 20079 + }, + { + "epoch": 0.7191075617311583, + "grad_norm": 1.4714149236679077, + "learning_rate": 3.860257816257612e-05, + "loss": 1.0051, + "step": 20080 + }, + { + "epoch": 0.7191433738607266, + "grad_norm": 1.4619172811508179, + "learning_rate": 3.859342317220619e-05, + "loss": 1.0214, + "step": 20081 + }, + { + "epoch": 0.719179185990295, + "grad_norm": 1.3692309856414795, + "learning_rate": 3.8584269007982565e-05, + "loss": 1.1019, + "step": 20082 + }, + { + "epoch": 0.7192149981198632, + "grad_norm": 1.6505446434020996, + "learning_rate": 3.857511567002835e-05, + "loss": 0.9792, + "step": 20083 + }, + { + "epoch": 0.7192508102494315, + "grad_norm": 1.479779839515686, + "learning_rate": 3.8565963158466714e-05, + "loss": 1.1219, + "step": 20084 + }, + { + "epoch": 0.7192866223789998, + "grad_norm": 1.5199352502822876, + "learning_rate": 3.855681147342084e-05, + "loss": 1.1237, + "step": 20085 + }, + { + "epoch": 0.719322434508568, + "grad_norm": 1.3395113945007324, + "learning_rate": 3.854766061501378e-05, + "loss": 1.2474, + "step": 20086 + }, + { + "epoch": 0.7193582466381363, + "grad_norm": 1.6321700811386108, + "learning_rate": 3.853851058336867e-05, + "loss": 1.0995, + "step": 20087 + }, + { + "epoch": 0.7193940587677046, + "grad_norm": 1.6657427549362183, + "learning_rate": 3.852936137860863e-05, + "loss": 0.9293, + "step": 20088 + }, + { + "epoch": 0.719429870897273, + "grad_norm": 1.447771430015564, + "learning_rate": 3.8520213000856763e-05, + "loss": 1.1426, + "step": 20089 + }, + { + "epoch": 0.7194656830268412, + "grad_norm": 1.4404102563858032, + "learning_rate": 3.85110654502361e-05, + "loss": 1.111, + "step": 20090 + }, + { + "epoch": 0.7195014951564095, + "grad_norm": 1.5831369161605835, + "learning_rate": 3.8501918726869744e-05, + "loss": 1.2328, + "step": 20091 + }, + { + "epoch": 0.7195373072859778, + "grad_norm": 1.541593313217163, + "learning_rate": 3.8492772830880776e-05, + "loss": 1.0331, + "step": 20092 + }, + { + "epoch": 0.719573119415546, + "grad_norm": 1.3040220737457275, + "learning_rate": 3.848362776239217e-05, + "loss": 1.1911, + "step": 20093 + }, + { + "epoch": 0.7196089315451143, + "grad_norm": 1.45150887966156, + "learning_rate": 3.847448352152701e-05, + "loss": 1.1138, + "step": 20094 + }, + { + "epoch": 0.7196447436746826, + "grad_norm": 1.3576918840408325, + "learning_rate": 3.84653401084083e-05, + "loss": 1.0875, + "step": 20095 + }, + { + "epoch": 0.7196805558042509, + "grad_norm": 1.3863797187805176, + "learning_rate": 3.8456197523159096e-05, + "loss": 1.0176, + "step": 20096 + }, + { + "epoch": 0.7197163679338192, + "grad_norm": 1.8163779973983765, + "learning_rate": 3.844705576590235e-05, + "loss": 0.9987, + "step": 20097 + }, + { + "epoch": 0.7197521800633875, + "grad_norm": 1.4338808059692383, + "learning_rate": 3.843791483676107e-05, + "loss": 1.1479, + "step": 20098 + }, + { + "epoch": 0.7197879921929558, + "grad_norm": 1.7920397520065308, + "learning_rate": 3.842877473585823e-05, + "loss": 0.9644, + "step": 20099 + }, + { + "epoch": 0.719823804322524, + "grad_norm": 1.3837891817092896, + "learning_rate": 3.841963546331684e-05, + "loss": 1.11, + "step": 20100 + }, + { + "epoch": 0.7198596164520923, + "grad_norm": 1.2323521375656128, + "learning_rate": 3.841049701925978e-05, + "loss": 1.005, + "step": 20101 + }, + { + "epoch": 0.7198954285816606, + "grad_norm": 1.3706696033477783, + "learning_rate": 3.840135940381006e-05, + "loss": 1.0181, + "step": 20102 + }, + { + "epoch": 0.7199312407112289, + "grad_norm": 1.6860246658325195, + "learning_rate": 3.839222261709061e-05, + "loss": 1.0281, + "step": 20103 + }, + { + "epoch": 0.7199670528407972, + "grad_norm": 1.6461143493652344, + "learning_rate": 3.83830866592243e-05, + "loss": 1.1014, + "step": 20104 + }, + { + "epoch": 0.7200028649703655, + "grad_norm": 1.2465269565582275, + "learning_rate": 3.8373951530334086e-05, + "loss": 0.791, + "step": 20105 + }, + { + "epoch": 0.7200386770999337, + "grad_norm": 1.79696786403656, + "learning_rate": 3.836481723054286e-05, + "loss": 1.0456, + "step": 20106 + }, + { + "epoch": 0.720074489229502, + "grad_norm": 1.6245546340942383, + "learning_rate": 3.835568375997355e-05, + "loss": 1.1362, + "step": 20107 + }, + { + "epoch": 0.7201103013590703, + "grad_norm": 1.8583903312683105, + "learning_rate": 3.8346551118748967e-05, + "loss": 1.0471, + "step": 20108 + }, + { + "epoch": 0.7201461134886386, + "grad_norm": 1.4983094930648804, + "learning_rate": 3.8337419306992e-05, + "loss": 1.0942, + "step": 20109 + }, + { + "epoch": 0.7201819256182069, + "grad_norm": 1.49557626247406, + "learning_rate": 3.8328288324825566e-05, + "loss": 1.0654, + "step": 20110 + }, + { + "epoch": 0.7202177377477752, + "grad_norm": 1.602957010269165, + "learning_rate": 3.831915817237243e-05, + "loss": 0.9678, + "step": 20111 + }, + { + "epoch": 0.7202535498773435, + "grad_norm": 1.5628355741500854, + "learning_rate": 3.831002884975544e-05, + "loss": 1.0848, + "step": 20112 + }, + { + "epoch": 0.7202893620069117, + "grad_norm": 1.2844972610473633, + "learning_rate": 3.830090035709745e-05, + "loss": 0.9842, + "step": 20113 + }, + { + "epoch": 0.72032517413648, + "grad_norm": 1.5308480262756348, + "learning_rate": 3.8291772694521285e-05, + "loss": 1.193, + "step": 20114 + }, + { + "epoch": 0.7203609862660483, + "grad_norm": 1.6285269260406494, + "learning_rate": 3.82826458621497e-05, + "loss": 1.0802, + "step": 20115 + }, + { + "epoch": 0.7203967983956165, + "grad_norm": 1.9959959983825684, + "learning_rate": 3.82735198601055e-05, + "loss": 1.2659, + "step": 20116 + }, + { + "epoch": 0.7204326105251849, + "grad_norm": 1.2765588760375977, + "learning_rate": 3.8264394688511466e-05, + "loss": 1.0106, + "step": 20117 + }, + { + "epoch": 0.7204684226547532, + "grad_norm": 1.8315763473510742, + "learning_rate": 3.82552703474904e-05, + "loss": 1.0848, + "step": 20118 + }, + { + "epoch": 0.7205042347843215, + "grad_norm": 1.8122283220291138, + "learning_rate": 3.8246146837165e-05, + "loss": 1.2187, + "step": 20119 + }, + { + "epoch": 0.7205400469138897, + "grad_norm": 1.388466238975525, + "learning_rate": 3.823702415765803e-05, + "loss": 1.0462, + "step": 20120 + }, + { + "epoch": 0.720575859043458, + "grad_norm": 1.9461040496826172, + "learning_rate": 3.822790230909227e-05, + "loss": 1.1849, + "step": 20121 + }, + { + "epoch": 0.7206116711730263, + "grad_norm": 1.5297906398773193, + "learning_rate": 3.821878129159037e-05, + "loss": 1.1775, + "step": 20122 + }, + { + "epoch": 0.7206474833025945, + "grad_norm": 1.5940086841583252, + "learning_rate": 3.8209661105275077e-05, + "loss": 1.0053, + "step": 20123 + }, + { + "epoch": 0.7206832954321629, + "grad_norm": 1.2003670930862427, + "learning_rate": 3.820054175026908e-05, + "loss": 0.8389, + "step": 20124 + }, + { + "epoch": 0.7207191075617312, + "grad_norm": 1.5059806108474731, + "learning_rate": 3.8191423226695125e-05, + "loss": 1.0213, + "step": 20125 + }, + { + "epoch": 0.7207549196912995, + "grad_norm": 1.5559701919555664, + "learning_rate": 3.81823055346758e-05, + "loss": 0.979, + "step": 20126 + }, + { + "epoch": 0.7207907318208677, + "grad_norm": 1.4039288759231567, + "learning_rate": 3.817318867433383e-05, + "loss": 1.1277, + "step": 20127 + }, + { + "epoch": 0.720826543950436, + "grad_norm": 1.3330353498458862, + "learning_rate": 3.816407264579187e-05, + "loss": 0.9511, + "step": 20128 + }, + { + "epoch": 0.7208623560800043, + "grad_norm": 1.7471060752868652, + "learning_rate": 3.8154957449172524e-05, + "loss": 0.8994, + "step": 20129 + }, + { + "epoch": 0.7208981682095725, + "grad_norm": 1.4115039110183716, + "learning_rate": 3.814584308459849e-05, + "loss": 1.0302, + "step": 20130 + }, + { + "epoch": 0.7209339803391409, + "grad_norm": 1.678914189338684, + "learning_rate": 3.8136729552192274e-05, + "loss": 1.0275, + "step": 20131 + }, + { + "epoch": 0.7209697924687092, + "grad_norm": 1.3715426921844482, + "learning_rate": 3.812761685207664e-05, + "loss": 0.9358, + "step": 20132 + }, + { + "epoch": 0.7210056045982774, + "grad_norm": 1.7719920873641968, + "learning_rate": 3.811850498437407e-05, + "loss": 0.9664, + "step": 20133 + }, + { + "epoch": 0.7210414167278457, + "grad_norm": 1.588884711265564, + "learning_rate": 3.81093939492072e-05, + "loss": 0.9902, + "step": 20134 + }, + { + "epoch": 0.721077228857414, + "grad_norm": 1.5538569688796997, + "learning_rate": 3.810028374669859e-05, + "loss": 0.9701, + "step": 20135 + }, + { + "epoch": 0.7211130409869823, + "grad_norm": 1.7236992120742798, + "learning_rate": 3.8091174376970876e-05, + "loss": 0.9895, + "step": 20136 + }, + { + "epoch": 0.7211488531165505, + "grad_norm": 1.6447242498397827, + "learning_rate": 3.808206584014653e-05, + "loss": 0.9485, + "step": 20137 + }, + { + "epoch": 0.7211846652461189, + "grad_norm": 1.5655180215835571, + "learning_rate": 3.807295813634807e-05, + "loss": 1.2145, + "step": 20138 + }, + { + "epoch": 0.7212204773756872, + "grad_norm": 1.9937689304351807, + "learning_rate": 3.8063851265698134e-05, + "loss": 1.0609, + "step": 20139 + }, + { + "epoch": 0.7212562895052554, + "grad_norm": 1.605264663696289, + "learning_rate": 3.805474522831916e-05, + "loss": 1.2301, + "step": 20140 + }, + { + "epoch": 0.7212921016348237, + "grad_norm": 1.4282954931259155, + "learning_rate": 3.804564002433371e-05, + "loss": 1.0836, + "step": 20141 + }, + { + "epoch": 0.721327913764392, + "grad_norm": 1.946874976158142, + "learning_rate": 3.8036535653864193e-05, + "loss": 1.1302, + "step": 20142 + }, + { + "epoch": 0.7213637258939603, + "grad_norm": 1.442498803138733, + "learning_rate": 3.8027432117033237e-05, + "loss": 1.2955, + "step": 20143 + }, + { + "epoch": 0.7213995380235285, + "grad_norm": 1.6579095125198364, + "learning_rate": 3.80183294139632e-05, + "loss": 1.1409, + "step": 20144 + }, + { + "epoch": 0.7214353501530969, + "grad_norm": 1.5205349922180176, + "learning_rate": 3.8009227544776595e-05, + "loss": 1.1897, + "step": 20145 + }, + { + "epoch": 0.7214711622826652, + "grad_norm": 1.2058188915252686, + "learning_rate": 3.80001265095959e-05, + "loss": 0.7327, + "step": 20146 + }, + { + "epoch": 0.7215069744122334, + "grad_norm": 1.3835397958755493, + "learning_rate": 3.799102630854351e-05, + "loss": 1.1138, + "step": 20147 + }, + { + "epoch": 0.7215427865418017, + "grad_norm": 1.6927247047424316, + "learning_rate": 3.79819269417419e-05, + "loss": 1.2351, + "step": 20148 + }, + { + "epoch": 0.72157859867137, + "grad_norm": 2.073709011077881, + "learning_rate": 3.797282840931339e-05, + "loss": 1.1288, + "step": 20149 + }, + { + "epoch": 0.7216144108009382, + "grad_norm": 1.2802492380142212, + "learning_rate": 3.796373071138054e-05, + "loss": 1.2771, + "step": 20150 + }, + { + "epoch": 0.7216502229305065, + "grad_norm": 1.8909485340118408, + "learning_rate": 3.795463384806564e-05, + "loss": 1.3085, + "step": 20151 + }, + { + "epoch": 0.7216860350600749, + "grad_norm": 1.5449550151824951, + "learning_rate": 3.794553781949114e-05, + "loss": 1.036, + "step": 20152 + }, + { + "epoch": 0.7217218471896432, + "grad_norm": 1.5110883712768555, + "learning_rate": 3.793644262577934e-05, + "loss": 0.9913, + "step": 20153 + }, + { + "epoch": 0.7217576593192114, + "grad_norm": 1.4119073152542114, + "learning_rate": 3.7927348267052666e-05, + "loss": 1.0847, + "step": 20154 + }, + { + "epoch": 0.7217934714487797, + "grad_norm": 1.543902039527893, + "learning_rate": 3.791825474343348e-05, + "loss": 1.1303, + "step": 20155 + }, + { + "epoch": 0.721829283578348, + "grad_norm": 1.5976461172103882, + "learning_rate": 3.790916205504406e-05, + "loss": 1.0269, + "step": 20156 + }, + { + "epoch": 0.7218650957079162, + "grad_norm": 1.5034534931182861, + "learning_rate": 3.7900070202006764e-05, + "loss": 0.9047, + "step": 20157 + }, + { + "epoch": 0.7219009078374845, + "grad_norm": 1.2759735584259033, + "learning_rate": 3.789097918444394e-05, + "loss": 1.0644, + "step": 20158 + }, + { + "epoch": 0.7219367199670529, + "grad_norm": 1.4047552347183228, + "learning_rate": 3.78818890024779e-05, + "loss": 0.9785, + "step": 20159 + }, + { + "epoch": 0.7219725320966212, + "grad_norm": 1.662957787513733, + "learning_rate": 3.787279965623085e-05, + "loss": 1.337, + "step": 20160 + }, + { + "epoch": 0.7220083442261894, + "grad_norm": 1.1620858907699585, + "learning_rate": 3.786371114582521e-05, + "loss": 0.8946, + "step": 20161 + }, + { + "epoch": 0.7220441563557577, + "grad_norm": 1.5347199440002441, + "learning_rate": 3.785462347138319e-05, + "loss": 1.1465, + "step": 20162 + }, + { + "epoch": 0.722079968485326, + "grad_norm": 1.422802448272705, + "learning_rate": 3.784553663302701e-05, + "loss": 1.0957, + "step": 20163 + }, + { + "epoch": 0.7221157806148942, + "grad_norm": 2.0823724269866943, + "learning_rate": 3.783645063087896e-05, + "loss": 1.0385, + "step": 20164 + }, + { + "epoch": 0.7221515927444625, + "grad_norm": 1.883963942527771, + "learning_rate": 3.782736546506128e-05, + "loss": 1.1392, + "step": 20165 + }, + { + "epoch": 0.7221874048740309, + "grad_norm": 1.4902585744857788, + "learning_rate": 3.781828113569624e-05, + "loss": 1.2789, + "step": 20166 + }, + { + "epoch": 0.7222232170035991, + "grad_norm": 1.2696096897125244, + "learning_rate": 3.780919764290599e-05, + "loss": 1.1092, + "step": 20167 + }, + { + "epoch": 0.7222590291331674, + "grad_norm": 1.1902222633361816, + "learning_rate": 3.780011498681276e-05, + "loss": 0.9215, + "step": 20168 + }, + { + "epoch": 0.7222948412627357, + "grad_norm": 1.94163179397583, + "learning_rate": 3.779103316753875e-05, + "loss": 1.1221, + "step": 20169 + }, + { + "epoch": 0.722330653392304, + "grad_norm": 1.5815603733062744, + "learning_rate": 3.778195218520618e-05, + "loss": 0.9868, + "step": 20170 + }, + { + "epoch": 0.7223664655218722, + "grad_norm": 1.562735676765442, + "learning_rate": 3.777287203993716e-05, + "loss": 1.1792, + "step": 20171 + }, + { + "epoch": 0.7224022776514405, + "grad_norm": 1.566542148590088, + "learning_rate": 3.7763792731853865e-05, + "loss": 1.2217, + "step": 20172 + }, + { + "epoch": 0.7224380897810089, + "grad_norm": 1.4204473495483398, + "learning_rate": 3.77547142610785e-05, + "loss": 0.944, + "step": 20173 + }, + { + "epoch": 0.7224739019105771, + "grad_norm": 1.3503062725067139, + "learning_rate": 3.774563662773314e-05, + "loss": 0.9727, + "step": 20174 + }, + { + "epoch": 0.7225097140401454, + "grad_norm": 1.4863388538360596, + "learning_rate": 3.773655983193992e-05, + "loss": 1.2174, + "step": 20175 + }, + { + "epoch": 0.7225455261697137, + "grad_norm": 1.8903677463531494, + "learning_rate": 3.772748387382099e-05, + "loss": 1.0296, + "step": 20176 + }, + { + "epoch": 0.722581338299282, + "grad_norm": 1.6295002698898315, + "learning_rate": 3.7718408753498456e-05, + "loss": 1.0068, + "step": 20177 + }, + { + "epoch": 0.7226171504288502, + "grad_norm": 1.4722260236740112, + "learning_rate": 3.770933447109437e-05, + "loss": 0.9156, + "step": 20178 + }, + { + "epoch": 0.7226529625584185, + "grad_norm": 1.5577926635742188, + "learning_rate": 3.7700261026730844e-05, + "loss": 0.9742, + "step": 20179 + }, + { + "epoch": 0.7226887746879869, + "grad_norm": 1.7680439949035645, + "learning_rate": 3.7691188420529974e-05, + "loss": 1.1001, + "step": 20180 + }, + { + "epoch": 0.7227245868175551, + "grad_norm": 1.5369796752929688, + "learning_rate": 3.768211665261375e-05, + "loss": 1.2089, + "step": 20181 + }, + { + "epoch": 0.7227603989471234, + "grad_norm": 1.478981614112854, + "learning_rate": 3.7673045723104275e-05, + "loss": 1.0449, + "step": 20182 + }, + { + "epoch": 0.7227962110766917, + "grad_norm": 1.3251818418502808, + "learning_rate": 3.7663975632123574e-05, + "loss": 1.082, + "step": 20183 + }, + { + "epoch": 0.7228320232062599, + "grad_norm": 1.5141563415527344, + "learning_rate": 3.76549063797937e-05, + "loss": 0.8276, + "step": 20184 + }, + { + "epoch": 0.7228678353358282, + "grad_norm": 1.6025646924972534, + "learning_rate": 3.7645837966236605e-05, + "loss": 1.0235, + "step": 20185 + }, + { + "epoch": 0.7229036474653965, + "grad_norm": 1.7082223892211914, + "learning_rate": 3.763677039157433e-05, + "loss": 1.0019, + "step": 20186 + }, + { + "epoch": 0.7229394595949649, + "grad_norm": 1.3236223459243774, + "learning_rate": 3.762770365592887e-05, + "loss": 1.0317, + "step": 20187 + }, + { + "epoch": 0.7229752717245331, + "grad_norm": 1.5434821844100952, + "learning_rate": 3.7618637759422236e-05, + "loss": 1.0993, + "step": 20188 + }, + { + "epoch": 0.7230110838541014, + "grad_norm": 1.4383658170700073, + "learning_rate": 3.760957270217633e-05, + "loss": 1.0752, + "step": 20189 + }, + { + "epoch": 0.7230468959836697, + "grad_norm": 1.450273036956787, + "learning_rate": 3.7600508484313146e-05, + "loss": 1.0607, + "step": 20190 + }, + { + "epoch": 0.7230827081132379, + "grad_norm": 1.7969293594360352, + "learning_rate": 3.759144510595467e-05, + "loss": 1.1427, + "step": 20191 + }, + { + "epoch": 0.7231185202428062, + "grad_norm": 1.2333307266235352, + "learning_rate": 3.7582382567222754e-05, + "loss": 0.7718, + "step": 20192 + }, + { + "epoch": 0.7231543323723745, + "grad_norm": 1.333814024925232, + "learning_rate": 3.757332086823937e-05, + "loss": 1.0973, + "step": 20193 + }, + { + "epoch": 0.7231901445019429, + "grad_norm": 1.3415558338165283, + "learning_rate": 3.756426000912644e-05, + "loss": 1.0961, + "step": 20194 + }, + { + "epoch": 0.7232259566315111, + "grad_norm": 1.4559359550476074, + "learning_rate": 3.7555199990005874e-05, + "loss": 0.8286, + "step": 20195 + }, + { + "epoch": 0.7232617687610794, + "grad_norm": 1.2134398221969604, + "learning_rate": 3.754614081099952e-05, + "loss": 0.9244, + "step": 20196 + }, + { + "epoch": 0.7232975808906477, + "grad_norm": 1.1446607112884521, + "learning_rate": 3.753708247222928e-05, + "loss": 0.9744, + "step": 20197 + }, + { + "epoch": 0.7233333930202159, + "grad_norm": 1.4531444311141968, + "learning_rate": 3.752802497381706e-05, + "loss": 1.1255, + "step": 20198 + }, + { + "epoch": 0.7233692051497842, + "grad_norm": 1.469375729560852, + "learning_rate": 3.751896831588464e-05, + "loss": 1.0538, + "step": 20199 + }, + { + "epoch": 0.7234050172793525, + "grad_norm": 1.5043398141860962, + "learning_rate": 3.7509912498553914e-05, + "loss": 1.1355, + "step": 20200 + }, + { + "epoch": 0.7234408294089208, + "grad_norm": 1.8141897916793823, + "learning_rate": 3.750085752194671e-05, + "loss": 0.9522, + "step": 20201 + }, + { + "epoch": 0.7234766415384891, + "grad_norm": 1.472542405128479, + "learning_rate": 3.749180338618488e-05, + "loss": 1.2246, + "step": 20202 + }, + { + "epoch": 0.7235124536680574, + "grad_norm": 1.614037036895752, + "learning_rate": 3.7482750091390176e-05, + "loss": 1.1787, + "step": 20203 + }, + { + "epoch": 0.7235482657976257, + "grad_norm": 1.3800424337387085, + "learning_rate": 3.7473697637684416e-05, + "loss": 1.0016, + "step": 20204 + }, + { + "epoch": 0.7235840779271939, + "grad_norm": 1.2877049446105957, + "learning_rate": 3.746464602518941e-05, + "loss": 1.0474, + "step": 20205 + }, + { + "epoch": 0.7236198900567622, + "grad_norm": 1.8072481155395508, + "learning_rate": 3.745559525402696e-05, + "loss": 1.29, + "step": 20206 + }, + { + "epoch": 0.7236557021863305, + "grad_norm": 1.7188340425491333, + "learning_rate": 3.744654532431876e-05, + "loss": 1.1495, + "step": 20207 + }, + { + "epoch": 0.7236915143158988, + "grad_norm": 1.8005207777023315, + "learning_rate": 3.743749623618661e-05, + "loss": 1.1002, + "step": 20208 + }, + { + "epoch": 0.7237273264454671, + "grad_norm": 1.4144973754882812, + "learning_rate": 3.742844798975229e-05, + "loss": 1.0787, + "step": 20209 + }, + { + "epoch": 0.7237631385750354, + "grad_norm": 1.9920176267623901, + "learning_rate": 3.7419400585137444e-05, + "loss": 1.2693, + "step": 20210 + }, + { + "epoch": 0.7237989507046036, + "grad_norm": 1.3573344945907593, + "learning_rate": 3.741035402246385e-05, + "loss": 1.1415, + "step": 20211 + }, + { + "epoch": 0.7238347628341719, + "grad_norm": 1.315794825553894, + "learning_rate": 3.74013083018532e-05, + "loss": 1.2117, + "step": 20212 + }, + { + "epoch": 0.7238705749637402, + "grad_norm": 1.576195478439331, + "learning_rate": 3.7392263423427234e-05, + "loss": 1.212, + "step": 20213 + }, + { + "epoch": 0.7239063870933085, + "grad_norm": 1.385349988937378, + "learning_rate": 3.738321938730758e-05, + "loss": 1.0218, + "step": 20214 + }, + { + "epoch": 0.7239421992228768, + "grad_norm": 1.4855049848556519, + "learning_rate": 3.737417619361593e-05, + "loss": 1.0278, + "step": 20215 + }, + { + "epoch": 0.7239780113524451, + "grad_norm": 1.3722401857376099, + "learning_rate": 3.7365133842473995e-05, + "loss": 1.1246, + "step": 20216 + }, + { + "epoch": 0.7240138234820134, + "grad_norm": 1.295350193977356, + "learning_rate": 3.735609233400336e-05, + "loss": 1.105, + "step": 20217 + }, + { + "epoch": 0.7240496356115816, + "grad_norm": 1.496270775794983, + "learning_rate": 3.734705166832569e-05, + "loss": 0.979, + "step": 20218 + }, + { + "epoch": 0.7240854477411499, + "grad_norm": 1.5977015495300293, + "learning_rate": 3.7338011845562624e-05, + "loss": 1.0212, + "step": 20219 + }, + { + "epoch": 0.7241212598707182, + "grad_norm": 1.7099528312683105, + "learning_rate": 3.732897286583582e-05, + "loss": 1.1789, + "step": 20220 + }, + { + "epoch": 0.7241570720002865, + "grad_norm": 1.5410704612731934, + "learning_rate": 3.7319934729266814e-05, + "loss": 1.21, + "step": 20221 + }, + { + "epoch": 0.7241928841298548, + "grad_norm": 1.6787137985229492, + "learning_rate": 3.731089743597723e-05, + "loss": 1.3365, + "step": 20222 + }, + { + "epoch": 0.7242286962594231, + "grad_norm": 1.793055772781372, + "learning_rate": 3.7301860986088666e-05, + "loss": 1.2753, + "step": 20223 + }, + { + "epoch": 0.7242645083889914, + "grad_norm": 2.1498608589172363, + "learning_rate": 3.729282537972272e-05, + "loss": 1.2161, + "step": 20224 + }, + { + "epoch": 0.7243003205185596, + "grad_norm": 1.4193267822265625, + "learning_rate": 3.728379061700091e-05, + "loss": 1.0155, + "step": 20225 + }, + { + "epoch": 0.7243361326481279, + "grad_norm": 1.8800077438354492, + "learning_rate": 3.727475669804474e-05, + "loss": 1.0871, + "step": 20226 + }, + { + "epoch": 0.7243719447776962, + "grad_norm": 1.6940672397613525, + "learning_rate": 3.726572362297588e-05, + "loss": 1.2151, + "step": 20227 + }, + { + "epoch": 0.7244077569072644, + "grad_norm": 1.368072271347046, + "learning_rate": 3.725669139191574e-05, + "loss": 1.0931, + "step": 20228 + }, + { + "epoch": 0.7244435690368328, + "grad_norm": 1.6835294961929321, + "learning_rate": 3.7247660004985897e-05, + "loss": 1.2034, + "step": 20229 + }, + { + "epoch": 0.7244793811664011, + "grad_norm": 1.4944958686828613, + "learning_rate": 3.723862946230784e-05, + "loss": 0.9792, + "step": 20230 + }, + { + "epoch": 0.7245151932959694, + "grad_norm": 1.5507893562316895, + "learning_rate": 3.7229599764003096e-05, + "loss": 1.0567, + "step": 20231 + }, + { + "epoch": 0.7245510054255376, + "grad_norm": 1.5279786586761475, + "learning_rate": 3.7220570910193096e-05, + "loss": 1.0686, + "step": 20232 + }, + { + "epoch": 0.7245868175551059, + "grad_norm": 1.2407785654067993, + "learning_rate": 3.721154290099933e-05, + "loss": 0.9636, + "step": 20233 + }, + { + "epoch": 0.7246226296846742, + "grad_norm": 1.1384119987487793, + "learning_rate": 3.7202515736543296e-05, + "loss": 0.9933, + "step": 20234 + }, + { + "epoch": 0.7246584418142424, + "grad_norm": 1.433936595916748, + "learning_rate": 3.7193489416946383e-05, + "loss": 1.102, + "step": 20235 + }, + { + "epoch": 0.7246942539438108, + "grad_norm": 1.2357282638549805, + "learning_rate": 3.718446394233007e-05, + "loss": 1.1786, + "step": 20236 + }, + { + "epoch": 0.7247300660733791, + "grad_norm": 1.453347086906433, + "learning_rate": 3.717543931281572e-05, + "loss": 1.0273, + "step": 20237 + }, + { + "epoch": 0.7247658782029474, + "grad_norm": 1.5569952726364136, + "learning_rate": 3.7166415528524854e-05, + "loss": 0.843, + "step": 20238 + }, + { + "epoch": 0.7248016903325156, + "grad_norm": 1.5447070598602295, + "learning_rate": 3.715739258957879e-05, + "loss": 1.0591, + "step": 20239 + }, + { + "epoch": 0.7248375024620839, + "grad_norm": 1.2614531517028809, + "learning_rate": 3.714837049609898e-05, + "loss": 1.1972, + "step": 20240 + }, + { + "epoch": 0.7248733145916522, + "grad_norm": 1.231037974357605, + "learning_rate": 3.71393492482067e-05, + "loss": 1.0872, + "step": 20241 + }, + { + "epoch": 0.7249091267212204, + "grad_norm": 1.5318448543548584, + "learning_rate": 3.713032884602346e-05, + "loss": 1.0839, + "step": 20242 + }, + { + "epoch": 0.7249449388507888, + "grad_norm": 1.6646864414215088, + "learning_rate": 3.712130928967056e-05, + "loss": 1.1538, + "step": 20243 + }, + { + "epoch": 0.7249807509803571, + "grad_norm": 1.2316625118255615, + "learning_rate": 3.711229057926925e-05, + "loss": 1.288, + "step": 20244 + }, + { + "epoch": 0.7250165631099253, + "grad_norm": 1.3288743495941162, + "learning_rate": 3.710327271494103e-05, + "loss": 1.1127, + "step": 20245 + }, + { + "epoch": 0.7250523752394936, + "grad_norm": 1.7972254753112793, + "learning_rate": 3.709425569680711e-05, + "loss": 1.3031, + "step": 20246 + }, + { + "epoch": 0.7250881873690619, + "grad_norm": 1.5744205713272095, + "learning_rate": 3.708523952498887e-05, + "loss": 1.095, + "step": 20247 + }, + { + "epoch": 0.7251239994986302, + "grad_norm": 1.422448992729187, + "learning_rate": 3.707622419960751e-05, + "loss": 1.114, + "step": 20248 + }, + { + "epoch": 0.7251598116281984, + "grad_norm": 1.6741845607757568, + "learning_rate": 3.7067209720784456e-05, + "loss": 0.9722, + "step": 20249 + }, + { + "epoch": 0.7251956237577668, + "grad_norm": 1.2731021642684937, + "learning_rate": 3.705819608864092e-05, + "loss": 1.0754, + "step": 20250 + }, + { + "epoch": 0.7252314358873351, + "grad_norm": 1.5772334337234497, + "learning_rate": 3.704918330329813e-05, + "loss": 1.2161, + "step": 20251 + }, + { + "epoch": 0.7252672480169033, + "grad_norm": 1.9488611221313477, + "learning_rate": 3.704017136487737e-05, + "loss": 1.1008, + "step": 20252 + }, + { + "epoch": 0.7253030601464716, + "grad_norm": 1.327275037765503, + "learning_rate": 3.70311602734999e-05, + "loss": 1.1029, + "step": 20253 + }, + { + "epoch": 0.7253388722760399, + "grad_norm": 1.5375217199325562, + "learning_rate": 3.702215002928699e-05, + "loss": 0.9721, + "step": 20254 + }, + { + "epoch": 0.7253746844056081, + "grad_norm": 2.3161253929138184, + "learning_rate": 3.701314063235972e-05, + "loss": 1.031, + "step": 20255 + }, + { + "epoch": 0.7254104965351764, + "grad_norm": 1.3651247024536133, + "learning_rate": 3.7004132082839485e-05, + "loss": 0.775, + "step": 20256 + }, + { + "epoch": 0.7254463086647448, + "grad_norm": 1.4083740711212158, + "learning_rate": 3.699512438084736e-05, + "loss": 0.932, + "step": 20257 + }, + { + "epoch": 0.7254821207943131, + "grad_norm": 1.6415237188339233, + "learning_rate": 3.6986117526504595e-05, + "loss": 1.0642, + "step": 20258 + }, + { + "epoch": 0.7255179329238813, + "grad_norm": 1.8860714435577393, + "learning_rate": 3.6977111519932295e-05, + "loss": 1.0622, + "step": 20259 + }, + { + "epoch": 0.7255537450534496, + "grad_norm": 1.5875200033187866, + "learning_rate": 3.696810636125168e-05, + "loss": 1.1915, + "step": 20260 + }, + { + "epoch": 0.7255895571830179, + "grad_norm": 1.9184483289718628, + "learning_rate": 3.69591020505839e-05, + "loss": 1.2723, + "step": 20261 + }, + { + "epoch": 0.7256253693125861, + "grad_norm": 1.3043006658554077, + "learning_rate": 3.6950098588050074e-05, + "loss": 0.9326, + "step": 20262 + }, + { + "epoch": 0.7256611814421544, + "grad_norm": 1.3949589729309082, + "learning_rate": 3.6941095973771334e-05, + "loss": 1.015, + "step": 20263 + }, + { + "epoch": 0.7256969935717228, + "grad_norm": 1.5395275354385376, + "learning_rate": 3.6932094207868806e-05, + "loss": 1.1878, + "step": 20264 + }, + { + "epoch": 0.7257328057012911, + "grad_norm": 1.6790809631347656, + "learning_rate": 3.692309329046364e-05, + "loss": 1.1523, + "step": 20265 + }, + { + "epoch": 0.7257686178308593, + "grad_norm": 1.3951787948608398, + "learning_rate": 3.691409322167685e-05, + "loss": 1.1409, + "step": 20266 + }, + { + "epoch": 0.7258044299604276, + "grad_norm": 1.3277356624603271, + "learning_rate": 3.690509400162957e-05, + "loss": 0.9221, + "step": 20267 + }, + { + "epoch": 0.7258402420899959, + "grad_norm": 1.4342652559280396, + "learning_rate": 3.689609563044288e-05, + "loss": 1.0988, + "step": 20268 + }, + { + "epoch": 0.7258760542195641, + "grad_norm": 1.4835834503173828, + "learning_rate": 3.68870981082378e-05, + "loss": 1.008, + "step": 20269 + }, + { + "epoch": 0.7259118663491324, + "grad_norm": 1.4087071418762207, + "learning_rate": 3.687810143513541e-05, + "loss": 0.9386, + "step": 20270 + }, + { + "epoch": 0.7259476784787008, + "grad_norm": 1.3782154321670532, + "learning_rate": 3.686910561125675e-05, + "loss": 1.112, + "step": 20271 + }, + { + "epoch": 0.725983490608269, + "grad_norm": 1.5693477392196655, + "learning_rate": 3.6860110636722856e-05, + "loss": 1.1702, + "step": 20272 + }, + { + "epoch": 0.7260193027378373, + "grad_norm": 1.3124364614486694, + "learning_rate": 3.6851116511654705e-05, + "loss": 0.9827, + "step": 20273 + }, + { + "epoch": 0.7260551148674056, + "grad_norm": 1.223139762878418, + "learning_rate": 3.684212323617333e-05, + "loss": 1.0568, + "step": 20274 + }, + { + "epoch": 0.7260909269969739, + "grad_norm": 1.3059138059616089, + "learning_rate": 3.683313081039971e-05, + "loss": 1.1484, + "step": 20275 + }, + { + "epoch": 0.7261267391265421, + "grad_norm": 1.3771421909332275, + "learning_rate": 3.6824139234454876e-05, + "loss": 1.1091, + "step": 20276 + }, + { + "epoch": 0.7261625512561104, + "grad_norm": 1.5163146257400513, + "learning_rate": 3.681514850845972e-05, + "loss": 1.1349, + "step": 20277 + }, + { + "epoch": 0.7261983633856788, + "grad_norm": 1.7554609775543213, + "learning_rate": 3.6806158632535235e-05, + "loss": 1.1277, + "step": 20278 + }, + { + "epoch": 0.726234175515247, + "grad_norm": 1.3747751712799072, + "learning_rate": 3.679716960680242e-05, + "loss": 1.1981, + "step": 20279 + }, + { + "epoch": 0.7262699876448153, + "grad_norm": 1.3403066396713257, + "learning_rate": 3.6788181431382106e-05, + "loss": 1.2317, + "step": 20280 + }, + { + "epoch": 0.7263057997743836, + "grad_norm": 1.3462518453598022, + "learning_rate": 3.6779194106395285e-05, + "loss": 1.1975, + "step": 20281 + }, + { + "epoch": 0.7263416119039519, + "grad_norm": 1.3301961421966553, + "learning_rate": 3.677020763196286e-05, + "loss": 1.2098, + "step": 20282 + }, + { + "epoch": 0.7263774240335201, + "grad_norm": 2.68928861618042, + "learning_rate": 3.676122200820577e-05, + "loss": 1.6141, + "step": 20283 + }, + { + "epoch": 0.7264132361630884, + "grad_norm": 1.9639216661453247, + "learning_rate": 3.6752237235244825e-05, + "loss": 1.156, + "step": 20284 + }, + { + "epoch": 0.7264490482926568, + "grad_norm": 1.154819369316101, + "learning_rate": 3.6743253313200945e-05, + "loss": 0.9207, + "step": 20285 + }, + { + "epoch": 0.726484860422225, + "grad_norm": 1.475355863571167, + "learning_rate": 3.673427024219502e-05, + "loss": 1.3696, + "step": 20286 + }, + { + "epoch": 0.7265206725517933, + "grad_norm": 1.443724513053894, + "learning_rate": 3.672528802234786e-05, + "loss": 0.8686, + "step": 20287 + }, + { + "epoch": 0.7265564846813616, + "grad_norm": 1.367855429649353, + "learning_rate": 3.671630665378033e-05, + "loss": 1.2244, + "step": 20288 + }, + { + "epoch": 0.7265922968109298, + "grad_norm": 1.3237850666046143, + "learning_rate": 3.670732613661326e-05, + "loss": 1.0663, + "step": 20289 + }, + { + "epoch": 0.7266281089404981, + "grad_norm": 1.3087369203567505, + "learning_rate": 3.669834647096752e-05, + "loss": 1.0799, + "step": 20290 + }, + { + "epoch": 0.7266639210700664, + "grad_norm": 1.769494891166687, + "learning_rate": 3.668936765696383e-05, + "loss": 1.0673, + "step": 20291 + }, + { + "epoch": 0.7266997331996348, + "grad_norm": 1.7277048826217651, + "learning_rate": 3.6680389694723025e-05, + "loss": 1.2288, + "step": 20292 + }, + { + "epoch": 0.726735545329203, + "grad_norm": 2.0763766765594482, + "learning_rate": 3.667141258436592e-05, + "loss": 1.3077, + "step": 20293 + }, + { + "epoch": 0.7267713574587713, + "grad_norm": 1.3345342874526978, + "learning_rate": 3.666243632601329e-05, + "loss": 1.2631, + "step": 20294 + }, + { + "epoch": 0.7268071695883396, + "grad_norm": 1.5232303142547607, + "learning_rate": 3.6653460919785855e-05, + "loss": 1.1319, + "step": 20295 + }, + { + "epoch": 0.7268429817179078, + "grad_norm": 1.5934008359909058, + "learning_rate": 3.6644486365804385e-05, + "loss": 0.9659, + "step": 20296 + }, + { + "epoch": 0.7268787938474761, + "grad_norm": 1.4938950538635254, + "learning_rate": 3.663551266418966e-05, + "loss": 1.1468, + "step": 20297 + }, + { + "epoch": 0.7269146059770444, + "grad_norm": 1.6983680725097656, + "learning_rate": 3.662653981506235e-05, + "loss": 1.0503, + "step": 20298 + }, + { + "epoch": 0.7269504181066128, + "grad_norm": 1.847158670425415, + "learning_rate": 3.661756781854321e-05, + "loss": 1.0633, + "step": 20299 + }, + { + "epoch": 0.726986230236181, + "grad_norm": 1.4004197120666504, + "learning_rate": 3.660859667475293e-05, + "loss": 0.9528, + "step": 20300 + }, + { + "epoch": 0.7270220423657493, + "grad_norm": 1.9773757457733154, + "learning_rate": 3.659962638381224e-05, + "loss": 1.2634, + "step": 20301 + }, + { + "epoch": 0.7270578544953176, + "grad_norm": 1.522645354270935, + "learning_rate": 3.6590656945841775e-05, + "loss": 1.082, + "step": 20302 + }, + { + "epoch": 0.7270936666248858, + "grad_norm": 1.3402354717254639, + "learning_rate": 3.6581688360962206e-05, + "loss": 1.0746, + "step": 20303 + }, + { + "epoch": 0.7271294787544541, + "grad_norm": 1.4645977020263672, + "learning_rate": 3.6572720629294276e-05, + "loss": 1.2414, + "step": 20304 + }, + { + "epoch": 0.7271652908840224, + "grad_norm": 1.4708505868911743, + "learning_rate": 3.656375375095853e-05, + "loss": 0.9907, + "step": 20305 + }, + { + "epoch": 0.7272011030135908, + "grad_norm": 1.715419888496399, + "learning_rate": 3.655478772607565e-05, + "loss": 1.1349, + "step": 20306 + }, + { + "epoch": 0.727236915143159, + "grad_norm": 1.7285668849945068, + "learning_rate": 3.654582255476626e-05, + "loss": 1.2193, + "step": 20307 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.7003909349441528, + "learning_rate": 3.6536858237151015e-05, + "loss": 0.939, + "step": 20308 + }, + { + "epoch": 0.7273085394022956, + "grad_norm": 1.54055917263031, + "learning_rate": 3.652789477335045e-05, + "loss": 1.0832, + "step": 20309 + }, + { + "epoch": 0.7273443515318638, + "grad_norm": 2.045171022415161, + "learning_rate": 3.651893216348517e-05, + "loss": 1.1205, + "step": 20310 + }, + { + "epoch": 0.7273801636614321, + "grad_norm": 1.4154284000396729, + "learning_rate": 3.65099704076758e-05, + "loss": 0.9539, + "step": 20311 + }, + { + "epoch": 0.7274159757910004, + "grad_norm": 1.7653424739837646, + "learning_rate": 3.650100950604289e-05, + "loss": 1.2127, + "step": 20312 + }, + { + "epoch": 0.7274517879205687, + "grad_norm": 1.347849726676941, + "learning_rate": 3.649204945870701e-05, + "loss": 1.1281, + "step": 20313 + }, + { + "epoch": 0.727487600050137, + "grad_norm": 1.35219407081604, + "learning_rate": 3.6483090265788614e-05, + "loss": 0.9617, + "step": 20314 + }, + { + "epoch": 0.7275234121797053, + "grad_norm": 1.4067796468734741, + "learning_rate": 3.647413192740836e-05, + "loss": 0.9356, + "step": 20315 + }, + { + "epoch": 0.7275592243092736, + "grad_norm": 1.9088937044143677, + "learning_rate": 3.64651744436867e-05, + "loss": 1.1285, + "step": 20316 + }, + { + "epoch": 0.7275950364388418, + "grad_norm": 2.5009355545043945, + "learning_rate": 3.6456217814744165e-05, + "loss": 1.2545, + "step": 20317 + }, + { + "epoch": 0.7276308485684101, + "grad_norm": 1.455925703048706, + "learning_rate": 3.644726204070125e-05, + "loss": 1.2379, + "step": 20318 + }, + { + "epoch": 0.7276666606979784, + "grad_norm": 1.3460594415664673, + "learning_rate": 3.643830712167847e-05, + "loss": 1.171, + "step": 20319 + }, + { + "epoch": 0.7277024728275467, + "grad_norm": 1.436478853225708, + "learning_rate": 3.6429353057796255e-05, + "loss": 1.0059, + "step": 20320 + }, + { + "epoch": 0.727738284957115, + "grad_norm": 1.9546873569488525, + "learning_rate": 3.642039984917509e-05, + "loss": 1.1089, + "step": 20321 + }, + { + "epoch": 0.7277740970866833, + "grad_norm": 1.6718662977218628, + "learning_rate": 3.641144749593548e-05, + "loss": 1.1322, + "step": 20322 + }, + { + "epoch": 0.7278099092162515, + "grad_norm": 1.4107675552368164, + "learning_rate": 3.640249599819777e-05, + "loss": 1.1476, + "step": 20323 + }, + { + "epoch": 0.7278457213458198, + "grad_norm": 1.3446643352508545, + "learning_rate": 3.639354535608248e-05, + "loss": 1.1044, + "step": 20324 + }, + { + "epoch": 0.7278815334753881, + "grad_norm": 1.1492470502853394, + "learning_rate": 3.638459556970993e-05, + "loss": 0.9587, + "step": 20325 + }, + { + "epoch": 0.7279173456049564, + "grad_norm": 1.6996194124221802, + "learning_rate": 3.637564663920066e-05, + "loss": 1.1607, + "step": 20326 + }, + { + "epoch": 0.7279531577345247, + "grad_norm": 1.194525957107544, + "learning_rate": 3.636669856467495e-05, + "loss": 1.1015, + "step": 20327 + }, + { + "epoch": 0.727988969864093, + "grad_norm": 1.849172592163086, + "learning_rate": 3.635775134625323e-05, + "loss": 1.1464, + "step": 20328 + }, + { + "epoch": 0.7280247819936613, + "grad_norm": 1.1057987213134766, + "learning_rate": 3.634880498405587e-05, + "loss": 0.974, + "step": 20329 + }, + { + "epoch": 0.7280605941232295, + "grad_norm": 1.579934000968933, + "learning_rate": 3.6339859478203274e-05, + "loss": 0.9514, + "step": 20330 + }, + { + "epoch": 0.7280964062527978, + "grad_norm": 2.3171098232269287, + "learning_rate": 3.6330914828815755e-05, + "loss": 1.051, + "step": 20331 + }, + { + "epoch": 0.7281322183823661, + "grad_norm": 1.602730631828308, + "learning_rate": 3.632197103601358e-05, + "loss": 0.8693, + "step": 20332 + }, + { + "epoch": 0.7281680305119343, + "grad_norm": 1.498876929283142, + "learning_rate": 3.6313028099917226e-05, + "loss": 1.0645, + "step": 20333 + }, + { + "epoch": 0.7282038426415027, + "grad_norm": 1.3784674406051636, + "learning_rate": 3.6304086020646874e-05, + "loss": 0.9857, + "step": 20334 + }, + { + "epoch": 0.728239654771071, + "grad_norm": 1.846372365951538, + "learning_rate": 3.629514479832292e-05, + "loss": 1.1941, + "step": 20335 + }, + { + "epoch": 0.7282754669006393, + "grad_norm": 1.947084903717041, + "learning_rate": 3.628620443306556e-05, + "loss": 1.1175, + "step": 20336 + }, + { + "epoch": 0.7283112790302075, + "grad_norm": 1.398413062095642, + "learning_rate": 3.62772649249952e-05, + "loss": 1.12, + "step": 20337 + }, + { + "epoch": 0.7283470911597758, + "grad_norm": 1.44347083568573, + "learning_rate": 3.626832627423201e-05, + "loss": 1.306, + "step": 20338 + }, + { + "epoch": 0.7283829032893441, + "grad_norm": 1.3068617582321167, + "learning_rate": 3.6259388480896316e-05, + "loss": 1.0723, + "step": 20339 + }, + { + "epoch": 0.7284187154189123, + "grad_norm": 1.4629755020141602, + "learning_rate": 3.62504515451083e-05, + "loss": 0.9787, + "step": 20340 + }, + { + "epoch": 0.7284545275484807, + "grad_norm": 1.3924696445465088, + "learning_rate": 3.624151546698822e-05, + "loss": 1.1536, + "step": 20341 + }, + { + "epoch": 0.728490339678049, + "grad_norm": 1.631791591644287, + "learning_rate": 3.623258024665635e-05, + "loss": 1.0943, + "step": 20342 + }, + { + "epoch": 0.7285261518076173, + "grad_norm": 2.8699097633361816, + "learning_rate": 3.6223645884232784e-05, + "loss": 1.1403, + "step": 20343 + }, + { + "epoch": 0.7285619639371855, + "grad_norm": 1.6028741598129272, + "learning_rate": 3.621471237983787e-05, + "loss": 1.1469, + "step": 20344 + }, + { + "epoch": 0.7285977760667538, + "grad_norm": 2.152329444885254, + "learning_rate": 3.620577973359168e-05, + "loss": 1.1508, + "step": 20345 + }, + { + "epoch": 0.7286335881963221, + "grad_norm": 1.0386662483215332, + "learning_rate": 3.619684794561448e-05, + "loss": 0.7588, + "step": 20346 + }, + { + "epoch": 0.7286694003258903, + "grad_norm": 1.5132412910461426, + "learning_rate": 3.618791701602635e-05, + "loss": 1.0735, + "step": 20347 + }, + { + "epoch": 0.7287052124554587, + "grad_norm": 1.5054810047149658, + "learning_rate": 3.617898694494749e-05, + "loss": 1.1394, + "step": 20348 + }, + { + "epoch": 0.728741024585027, + "grad_norm": 1.402051329612732, + "learning_rate": 3.6170057732498064e-05, + "loss": 1.2431, + "step": 20349 + }, + { + "epoch": 0.7287768367145953, + "grad_norm": 1.5448884963989258, + "learning_rate": 3.616112937879814e-05, + "loss": 1.1633, + "step": 20350 + }, + { + "epoch": 0.7288126488441635, + "grad_norm": 1.4976102113723755, + "learning_rate": 3.6152201883967885e-05, + "loss": 1.1616, + "step": 20351 + }, + { + "epoch": 0.7288484609737318, + "grad_norm": 1.179800271987915, + "learning_rate": 3.6143275248127394e-05, + "loss": 1.0933, + "step": 20352 + }, + { + "epoch": 0.7288842731033001, + "grad_norm": 1.6484434604644775, + "learning_rate": 3.61343494713968e-05, + "loss": 0.9531, + "step": 20353 + }, + { + "epoch": 0.7289200852328683, + "grad_norm": 1.574347972869873, + "learning_rate": 3.612542455389608e-05, + "loss": 1.0662, + "step": 20354 + }, + { + "epoch": 0.7289558973624367, + "grad_norm": 1.561328649520874, + "learning_rate": 3.611650049574545e-05, + "loss": 1.1223, + "step": 20355 + }, + { + "epoch": 0.728991709492005, + "grad_norm": 1.5480252504348755, + "learning_rate": 3.61075772970649e-05, + "loss": 1.0998, + "step": 20356 + }, + { + "epoch": 0.7290275216215732, + "grad_norm": 1.7608948945999146, + "learning_rate": 3.609865495797445e-05, + "loss": 1.1617, + "step": 20357 + }, + { + "epoch": 0.7290633337511415, + "grad_norm": 1.5457850694656372, + "learning_rate": 3.608973347859418e-05, + "loss": 1.0654, + "step": 20358 + }, + { + "epoch": 0.7290991458807098, + "grad_norm": 1.3617408275604248, + "learning_rate": 3.6080812859044086e-05, + "loss": 1.1216, + "step": 20359 + }, + { + "epoch": 0.7291349580102781, + "grad_norm": 2.027702569961548, + "learning_rate": 3.607189309944427e-05, + "loss": 1.0854, + "step": 20360 + }, + { + "epoch": 0.7291707701398463, + "grad_norm": 1.5921249389648438, + "learning_rate": 3.6062974199914615e-05, + "loss": 1.1543, + "step": 20361 + }, + { + "epoch": 0.7292065822694147, + "grad_norm": 1.5193920135498047, + "learning_rate": 3.6054056160575164e-05, + "loss": 1.0654, + "step": 20362 + }, + { + "epoch": 0.729242394398983, + "grad_norm": 1.4763883352279663, + "learning_rate": 3.6045138981545915e-05, + "loss": 1.079, + "step": 20363 + }, + { + "epoch": 0.7292782065285512, + "grad_norm": 1.6553826332092285, + "learning_rate": 3.603622266294686e-05, + "loss": 0.9083, + "step": 20364 + }, + { + "epoch": 0.7293140186581195, + "grad_norm": 1.5033965110778809, + "learning_rate": 3.6027307204897886e-05, + "loss": 1.0856, + "step": 20365 + }, + { + "epoch": 0.7293498307876878, + "grad_norm": 1.669795036315918, + "learning_rate": 3.601839260751897e-05, + "loss": 1.0161, + "step": 20366 + }, + { + "epoch": 0.729385642917256, + "grad_norm": 1.430585265159607, + "learning_rate": 3.600947887093009e-05, + "loss": 1.0173, + "step": 20367 + }, + { + "epoch": 0.7294214550468243, + "grad_norm": 1.563734531402588, + "learning_rate": 3.600056599525109e-05, + "loss": 1.0082, + "step": 20368 + }, + { + "epoch": 0.7294572671763927, + "grad_norm": 1.4756085872650146, + "learning_rate": 3.5991653980601926e-05, + "loss": 0.9355, + "step": 20369 + }, + { + "epoch": 0.729493079305961, + "grad_norm": 1.9010878801345825, + "learning_rate": 3.59827428271025e-05, + "loss": 1.2172, + "step": 20370 + }, + { + "epoch": 0.7295288914355292, + "grad_norm": 1.3119187355041504, + "learning_rate": 3.597383253487272e-05, + "loss": 0.9942, + "step": 20371 + }, + { + "epoch": 0.7295647035650975, + "grad_norm": 1.3256675004959106, + "learning_rate": 3.59649231040324e-05, + "loss": 1.1328, + "step": 20372 + }, + { + "epoch": 0.7296005156946658, + "grad_norm": 1.4285409450531006, + "learning_rate": 3.595601453470143e-05, + "loss": 1.1289, + "step": 20373 + }, + { + "epoch": 0.729636327824234, + "grad_norm": 1.3867480754852295, + "learning_rate": 3.594710682699972e-05, + "loss": 1.2036, + "step": 20374 + }, + { + "epoch": 0.7296721399538023, + "grad_norm": 1.8197499513626099, + "learning_rate": 3.5938199981047036e-05, + "loss": 1.206, + "step": 20375 + }, + { + "epoch": 0.7297079520833707, + "grad_norm": 1.7940119504928589, + "learning_rate": 3.592929399696323e-05, + "loss": 1.081, + "step": 20376 + }, + { + "epoch": 0.729743764212939, + "grad_norm": 1.4694833755493164, + "learning_rate": 3.592038887486813e-05, + "loss": 1.1005, + "step": 20377 + }, + { + "epoch": 0.7297795763425072, + "grad_norm": 1.572357416152954, + "learning_rate": 3.591148461488157e-05, + "loss": 1.1147, + "step": 20378 + }, + { + "epoch": 0.7298153884720755, + "grad_norm": 1.8451966047286987, + "learning_rate": 3.590258121712329e-05, + "loss": 1.3606, + "step": 20379 + }, + { + "epoch": 0.7298512006016438, + "grad_norm": 1.4736701250076294, + "learning_rate": 3.589367868171309e-05, + "loss": 1.0406, + "step": 20380 + }, + { + "epoch": 0.729887012731212, + "grad_norm": 1.5121991634368896, + "learning_rate": 3.5884777008770765e-05, + "loss": 1.1577, + "step": 20381 + }, + { + "epoch": 0.7299228248607803, + "grad_norm": 1.158320665359497, + "learning_rate": 3.587587619841609e-05, + "loss": 0.7352, + "step": 20382 + }, + { + "epoch": 0.7299586369903487, + "grad_norm": 1.294810175895691, + "learning_rate": 3.586697625076876e-05, + "loss": 1.0774, + "step": 20383 + }, + { + "epoch": 0.729994449119917, + "grad_norm": 1.6082932949066162, + "learning_rate": 3.585807716594853e-05, + "loss": 1.204, + "step": 20384 + }, + { + "epoch": 0.7300302612494852, + "grad_norm": 1.7911632061004639, + "learning_rate": 3.584917894407517e-05, + "loss": 1.1596, + "step": 20385 + }, + { + "epoch": 0.7300660733790535, + "grad_norm": 1.4667490720748901, + "learning_rate": 3.584028158526832e-05, + "loss": 1.1605, + "step": 20386 + }, + { + "epoch": 0.7301018855086218, + "grad_norm": 1.8367403745651245, + "learning_rate": 3.583138508964773e-05, + "loss": 1.2263, + "step": 20387 + }, + { + "epoch": 0.73013769763819, + "grad_norm": 1.5558415651321411, + "learning_rate": 3.582248945733307e-05, + "loss": 1.1909, + "step": 20388 + }, + { + "epoch": 0.7301735097677583, + "grad_norm": 2.331193685531616, + "learning_rate": 3.581359468844408e-05, + "loss": 1.3013, + "step": 20389 + }, + { + "epoch": 0.7302093218973267, + "grad_norm": 1.53253972530365, + "learning_rate": 3.580470078310034e-05, + "loss": 0.9568, + "step": 20390 + }, + { + "epoch": 0.730245134026895, + "grad_norm": 1.9772151708602905, + "learning_rate": 3.579580774142155e-05, + "loss": 0.9516, + "step": 20391 + }, + { + "epoch": 0.7302809461564632, + "grad_norm": 1.603550672531128, + "learning_rate": 3.5786915563527376e-05, + "loss": 0.9165, + "step": 20392 + }, + { + "epoch": 0.7303167582860315, + "grad_norm": 1.5193686485290527, + "learning_rate": 3.577802424953739e-05, + "loss": 1.3136, + "step": 20393 + }, + { + "epoch": 0.7303525704155998, + "grad_norm": 1.682896614074707, + "learning_rate": 3.576913379957125e-05, + "loss": 1.0189, + "step": 20394 + }, + { + "epoch": 0.730388382545168, + "grad_norm": 1.3522123098373413, + "learning_rate": 3.5760244213748565e-05, + "loss": 1.1039, + "step": 20395 + }, + { + "epoch": 0.7304241946747363, + "grad_norm": 1.5496586561203003, + "learning_rate": 3.575135549218895e-05, + "loss": 1.2007, + "step": 20396 + }, + { + "epoch": 0.7304600068043047, + "grad_norm": 1.913693904876709, + "learning_rate": 3.5742467635011956e-05, + "loss": 1.0982, + "step": 20397 + }, + { + "epoch": 0.7304958189338729, + "grad_norm": 1.4487324953079224, + "learning_rate": 3.5733580642337174e-05, + "loss": 1.0831, + "step": 20398 + }, + { + "epoch": 0.7305316310634412, + "grad_norm": 1.314540982246399, + "learning_rate": 3.572469451428415e-05, + "loss": 0.9467, + "step": 20399 + }, + { + "epoch": 0.7305674431930095, + "grad_norm": 1.6447612047195435, + "learning_rate": 3.57158092509725e-05, + "loss": 0.9062, + "step": 20400 + }, + { + "epoch": 0.7306032553225777, + "grad_norm": 1.4503769874572754, + "learning_rate": 3.5706924852521674e-05, + "loss": 1.1502, + "step": 20401 + }, + { + "epoch": 0.730639067452146, + "grad_norm": 1.7861517667770386, + "learning_rate": 3.5698041319051245e-05, + "loss": 1.1836, + "step": 20402 + }, + { + "epoch": 0.7306748795817143, + "grad_norm": 1.249660611152649, + "learning_rate": 3.5689158650680765e-05, + "loss": 1.1281, + "step": 20403 + }, + { + "epoch": 0.7307106917112826, + "grad_norm": 1.5247341394424438, + "learning_rate": 3.568027684752966e-05, + "loss": 1.1144, + "step": 20404 + }, + { + "epoch": 0.7307465038408509, + "grad_norm": 1.4424333572387695, + "learning_rate": 3.5671395909717477e-05, + "loss": 1.061, + "step": 20405 + }, + { + "epoch": 0.7307823159704192, + "grad_norm": 2.0061099529266357, + "learning_rate": 3.566251583736367e-05, + "loss": 1.1597, + "step": 20406 + }, + { + "epoch": 0.7308181280999875, + "grad_norm": 1.3544061183929443, + "learning_rate": 3.5653636630587764e-05, + "loss": 0.9833, + "step": 20407 + }, + { + "epoch": 0.7308539402295557, + "grad_norm": 1.5345370769500732, + "learning_rate": 3.5644758289509126e-05, + "loss": 0.9436, + "step": 20408 + }, + { + "epoch": 0.730889752359124, + "grad_norm": 1.6421613693237305, + "learning_rate": 3.563588081424727e-05, + "loss": 1.0293, + "step": 20409 + }, + { + "epoch": 0.7309255644886923, + "grad_norm": 1.5774985551834106, + "learning_rate": 3.5627004204921645e-05, + "loss": 1.2453, + "step": 20410 + }, + { + "epoch": 0.7309613766182605, + "grad_norm": 1.75455641746521, + "learning_rate": 3.561812846165161e-05, + "loss": 1.1635, + "step": 20411 + }, + { + "epoch": 0.7309971887478289, + "grad_norm": 1.6299731731414795, + "learning_rate": 3.56092535845566e-05, + "loss": 0.965, + "step": 20412 + }, + { + "epoch": 0.7310330008773972, + "grad_norm": 1.3564461469650269, + "learning_rate": 3.560037957375604e-05, + "loss": 1.0178, + "step": 20413 + }, + { + "epoch": 0.7310688130069655, + "grad_norm": 1.5104775428771973, + "learning_rate": 3.5591506429369325e-05, + "loss": 1.2197, + "step": 20414 + }, + { + "epoch": 0.7311046251365337, + "grad_norm": 1.3041579723358154, + "learning_rate": 3.558263415151578e-05, + "loss": 1.0569, + "step": 20415 + }, + { + "epoch": 0.731140437266102, + "grad_norm": 1.2647672891616821, + "learning_rate": 3.557376274031481e-05, + "loss": 0.953, + "step": 20416 + }, + { + "epoch": 0.7311762493956703, + "grad_norm": 1.4995553493499756, + "learning_rate": 3.556489219588575e-05, + "loss": 1.2827, + "step": 20417 + }, + { + "epoch": 0.7312120615252385, + "grad_norm": 1.606918215751648, + "learning_rate": 3.5556022518347975e-05, + "loss": 1.1226, + "step": 20418 + }, + { + "epoch": 0.7312478736548069, + "grad_norm": 1.2943017482757568, + "learning_rate": 3.55471537078208e-05, + "loss": 0.9466, + "step": 20419 + }, + { + "epoch": 0.7312836857843752, + "grad_norm": 1.9945589303970337, + "learning_rate": 3.553828576442346e-05, + "loss": 1.4333, + "step": 20420 + }, + { + "epoch": 0.7313194979139435, + "grad_norm": 1.5014853477478027, + "learning_rate": 3.552941868827542e-05, + "loss": 1.107, + "step": 20421 + }, + { + "epoch": 0.7313553100435117, + "grad_norm": 1.31485915184021, + "learning_rate": 3.552055247949584e-05, + "loss": 1.1822, + "step": 20422 + }, + { + "epoch": 0.73139112217308, + "grad_norm": 1.9373217821121216, + "learning_rate": 3.5511687138204097e-05, + "loss": 0.9963, + "step": 20423 + }, + { + "epoch": 0.7314269343026483, + "grad_norm": 1.375658631324768, + "learning_rate": 3.5502822664519345e-05, + "loss": 1.1595, + "step": 20424 + }, + { + "epoch": 0.7314627464322165, + "grad_norm": 1.2700563669204712, + "learning_rate": 3.549395905856099e-05, + "loss": 0.9897, + "step": 20425 + }, + { + "epoch": 0.7314985585617849, + "grad_norm": 1.4079511165618896, + "learning_rate": 3.5485096320448176e-05, + "loss": 1.1607, + "step": 20426 + }, + { + "epoch": 0.7315343706913532, + "grad_norm": 1.2766467332839966, + "learning_rate": 3.547623445030016e-05, + "loss": 0.9231, + "step": 20427 + }, + { + "epoch": 0.7315701828209215, + "grad_norm": 1.5786253213882446, + "learning_rate": 3.546737344823623e-05, + "loss": 1.1664, + "step": 20428 + }, + { + "epoch": 0.7316059949504897, + "grad_norm": 1.4184545278549194, + "learning_rate": 3.545851331437551e-05, + "loss": 1.0786, + "step": 20429 + }, + { + "epoch": 0.731641807080058, + "grad_norm": 1.6504087448120117, + "learning_rate": 3.544965404883728e-05, + "loss": 0.8537, + "step": 20430 + }, + { + "epoch": 0.7316776192096263, + "grad_norm": 1.3748869895935059, + "learning_rate": 3.544079565174061e-05, + "loss": 1.0961, + "step": 20431 + }, + { + "epoch": 0.7317134313391945, + "grad_norm": 1.7220271825790405, + "learning_rate": 3.543193812320483e-05, + "loss": 1.1294, + "step": 20432 + }, + { + "epoch": 0.7317492434687629, + "grad_norm": 1.4855073690414429, + "learning_rate": 3.542308146334901e-05, + "loss": 1.2513, + "step": 20433 + }, + { + "epoch": 0.7317850555983312, + "grad_norm": 1.7201164960861206, + "learning_rate": 3.541422567229235e-05, + "loss": 1.1165, + "step": 20434 + }, + { + "epoch": 0.7318208677278994, + "grad_norm": 1.3898001909255981, + "learning_rate": 3.540537075015393e-05, + "loss": 1.1839, + "step": 20435 + }, + { + "epoch": 0.7318566798574677, + "grad_norm": 1.5059770345687866, + "learning_rate": 3.539651669705297e-05, + "loss": 1.1341, + "step": 20436 + }, + { + "epoch": 0.731892491987036, + "grad_norm": 1.7153747081756592, + "learning_rate": 3.538766351310856e-05, + "loss": 1.1689, + "step": 20437 + }, + { + "epoch": 0.7319283041166043, + "grad_norm": 1.3963359594345093, + "learning_rate": 3.537881119843972e-05, + "loss": 0.7696, + "step": 20438 + }, + { + "epoch": 0.7319641162461725, + "grad_norm": 1.272800087928772, + "learning_rate": 3.5369959753165694e-05, + "loss": 1.0893, + "step": 20439 + }, + { + "epoch": 0.7319999283757409, + "grad_norm": 1.4487537145614624, + "learning_rate": 3.536110917740545e-05, + "loss": 1.0486, + "step": 20440 + }, + { + "epoch": 0.7320357405053092, + "grad_norm": 1.516292929649353, + "learning_rate": 3.5352259471278146e-05, + "loss": 0.9708, + "step": 20441 + }, + { + "epoch": 0.7320715526348774, + "grad_norm": 1.6411049365997314, + "learning_rate": 3.534341063490273e-05, + "loss": 1.0038, + "step": 20442 + }, + { + "epoch": 0.7321073647644457, + "grad_norm": 1.377791404724121, + "learning_rate": 3.533456266839838e-05, + "loss": 1.0217, + "step": 20443 + }, + { + "epoch": 0.732143176894014, + "grad_norm": 1.3869508504867554, + "learning_rate": 3.532571557188409e-05, + "loss": 1.0776, + "step": 20444 + }, + { + "epoch": 0.7321789890235822, + "grad_norm": 1.8353484869003296, + "learning_rate": 3.531686934547884e-05, + "loss": 0.9746, + "step": 20445 + }, + { + "epoch": 0.7322148011531505, + "grad_norm": 1.408759593963623, + "learning_rate": 3.5308023989301676e-05, + "loss": 0.9796, + "step": 20446 + }, + { + "epoch": 0.7322506132827189, + "grad_norm": 1.8675764799118042, + "learning_rate": 3.529917950347159e-05, + "loss": 1.1551, + "step": 20447 + }, + { + "epoch": 0.7322864254122872, + "grad_norm": 1.350120186805725, + "learning_rate": 3.529033588810764e-05, + "loss": 0.845, + "step": 20448 + }, + { + "epoch": 0.7323222375418554, + "grad_norm": 1.4789886474609375, + "learning_rate": 3.52814931433287e-05, + "loss": 1.1214, + "step": 20449 + }, + { + "epoch": 0.7323580496714237, + "grad_norm": 1.6340352296829224, + "learning_rate": 3.52726512692538e-05, + "loss": 1.2422, + "step": 20450 + }, + { + "epoch": 0.732393861800992, + "grad_norm": 1.4384750127792358, + "learning_rate": 3.526381026600188e-05, + "loss": 0.9699, + "step": 20451 + }, + { + "epoch": 0.7324296739305602, + "grad_norm": 1.3089923858642578, + "learning_rate": 3.5254970133691925e-05, + "loss": 0.9838, + "step": 20452 + }, + { + "epoch": 0.7324654860601285, + "grad_norm": 2.242314100265503, + "learning_rate": 3.5246130872442794e-05, + "loss": 1.0304, + "step": 20453 + }, + { + "epoch": 0.7325012981896969, + "grad_norm": 1.5570831298828125, + "learning_rate": 3.523729248237345e-05, + "loss": 0.9786, + "step": 20454 + }, + { + "epoch": 0.7325371103192652, + "grad_norm": 1.690359115600586, + "learning_rate": 3.522845496360283e-05, + "loss": 1.1792, + "step": 20455 + }, + { + "epoch": 0.7325729224488334, + "grad_norm": 1.5064706802368164, + "learning_rate": 3.5219618316249766e-05, + "loss": 1.1646, + "step": 20456 + }, + { + "epoch": 0.7326087345784017, + "grad_norm": 1.3843210935592651, + "learning_rate": 3.521078254043317e-05, + "loss": 1.2389, + "step": 20457 + }, + { + "epoch": 0.73264454670797, + "grad_norm": 1.4531301259994507, + "learning_rate": 3.5201947636271934e-05, + "loss": 0.9036, + "step": 20458 + }, + { + "epoch": 0.7326803588375382, + "grad_norm": 1.7721443176269531, + "learning_rate": 3.519311360388494e-05, + "loss": 1.1184, + "step": 20459 + }, + { + "epoch": 0.7327161709671065, + "grad_norm": 1.3174203634262085, + "learning_rate": 3.518428044339097e-05, + "loss": 1.0559, + "step": 20460 + }, + { + "epoch": 0.7327519830966749, + "grad_norm": 1.5790385007858276, + "learning_rate": 3.5175448154908895e-05, + "loss": 1.1245, + "step": 20461 + }, + { + "epoch": 0.7327877952262432, + "grad_norm": 1.4918526411056519, + "learning_rate": 3.516661673855759e-05, + "loss": 1.1397, + "step": 20462 + }, + { + "epoch": 0.7328236073558114, + "grad_norm": 1.5187240839004517, + "learning_rate": 3.51577861944558e-05, + "loss": 1.152, + "step": 20463 + }, + { + "epoch": 0.7328594194853797, + "grad_norm": 2.136610507965088, + "learning_rate": 3.5148956522722346e-05, + "loss": 0.9768, + "step": 20464 + }, + { + "epoch": 0.732895231614948, + "grad_norm": 1.784619688987732, + "learning_rate": 3.5140127723476034e-05, + "loss": 1.085, + "step": 20465 + }, + { + "epoch": 0.7329310437445162, + "grad_norm": 1.4656033515930176, + "learning_rate": 3.513129979683567e-05, + "loss": 0.9537, + "step": 20466 + }, + { + "epoch": 0.7329668558740845, + "grad_norm": 1.3607635498046875, + "learning_rate": 3.5122472742919965e-05, + "loss": 1.002, + "step": 20467 + }, + { + "epoch": 0.7330026680036529, + "grad_norm": 1.682145118713379, + "learning_rate": 3.51136465618477e-05, + "loss": 1.0718, + "step": 20468 + }, + { + "epoch": 0.7330384801332211, + "grad_norm": 1.393545150756836, + "learning_rate": 3.510482125373762e-05, + "loss": 0.8287, + "step": 20469 + }, + { + "epoch": 0.7330742922627894, + "grad_norm": 1.5103236436843872, + "learning_rate": 3.50959968187085e-05, + "loss": 0.9508, + "step": 20470 + }, + { + "epoch": 0.7331101043923577, + "grad_norm": 1.5041073560714722, + "learning_rate": 3.508717325687898e-05, + "loss": 0.9442, + "step": 20471 + }, + { + "epoch": 0.733145916521926, + "grad_norm": 1.921095371246338, + "learning_rate": 3.5078350568367825e-05, + "loss": 1.0936, + "step": 20472 + }, + { + "epoch": 0.7331817286514942, + "grad_norm": 1.689829707145691, + "learning_rate": 3.5069528753293746e-05, + "loss": 1.1713, + "step": 20473 + }, + { + "epoch": 0.7332175407810625, + "grad_norm": 1.5643707513809204, + "learning_rate": 3.506070781177537e-05, + "loss": 1.122, + "step": 20474 + }, + { + "epoch": 0.7332533529106309, + "grad_norm": 1.6106005907058716, + "learning_rate": 3.505188774393141e-05, + "loss": 1.0986, + "step": 20475 + }, + { + "epoch": 0.7332891650401991, + "grad_norm": 1.2528482675552368, + "learning_rate": 3.504306854988052e-05, + "loss": 1.0959, + "step": 20476 + }, + { + "epoch": 0.7333249771697674, + "grad_norm": 1.6176328659057617, + "learning_rate": 3.5034250229741384e-05, + "loss": 1.217, + "step": 20477 + }, + { + "epoch": 0.7333607892993357, + "grad_norm": 1.7032605409622192, + "learning_rate": 3.5025432783632585e-05, + "loss": 1.2785, + "step": 20478 + }, + { + "epoch": 0.733396601428904, + "grad_norm": 1.7889620065689087, + "learning_rate": 3.501661621167277e-05, + "loss": 1.0072, + "step": 20479 + }, + { + "epoch": 0.7334324135584722, + "grad_norm": 1.7494916915893555, + "learning_rate": 3.50078005139806e-05, + "loss": 1.1979, + "step": 20480 + }, + { + "epoch": 0.7334682256880405, + "grad_norm": 2.1243996620178223, + "learning_rate": 3.49989856906746e-05, + "loss": 1.1621, + "step": 20481 + }, + { + "epoch": 0.7335040378176089, + "grad_norm": 2.1853339672088623, + "learning_rate": 3.499017174187341e-05, + "loss": 1.3115, + "step": 20482 + }, + { + "epoch": 0.7335398499471771, + "grad_norm": 1.3893216848373413, + "learning_rate": 3.498135866769561e-05, + "loss": 1.1678, + "step": 20483 + }, + { + "epoch": 0.7335756620767454, + "grad_norm": 1.4884768724441528, + "learning_rate": 3.497254646825978e-05, + "loss": 1.0727, + "step": 20484 + }, + { + "epoch": 0.7336114742063137, + "grad_norm": 1.6442304849624634, + "learning_rate": 3.496373514368443e-05, + "loss": 1.0799, + "step": 20485 + }, + { + "epoch": 0.7336472863358819, + "grad_norm": 1.5999102592468262, + "learning_rate": 3.495492469408813e-05, + "loss": 0.8428, + "step": 20486 + }, + { + "epoch": 0.7336830984654502, + "grad_norm": 1.3655297756195068, + "learning_rate": 3.494611511958942e-05, + "loss": 0.9977, + "step": 20487 + }, + { + "epoch": 0.7337189105950185, + "grad_norm": 1.6383392810821533, + "learning_rate": 3.493730642030685e-05, + "loss": 1.1137, + "step": 20488 + }, + { + "epoch": 0.7337547227245869, + "grad_norm": 1.4986101388931274, + "learning_rate": 3.492849859635885e-05, + "loss": 0.9696, + "step": 20489 + }, + { + "epoch": 0.7337905348541551, + "grad_norm": 1.6090699434280396, + "learning_rate": 3.4919691647863984e-05, + "loss": 0.9898, + "step": 20490 + }, + { + "epoch": 0.7338263469837234, + "grad_norm": 1.371721863746643, + "learning_rate": 3.491088557494074e-05, + "loss": 1.0282, + "step": 20491 + }, + { + "epoch": 0.7338621591132917, + "grad_norm": 1.5017900466918945, + "learning_rate": 3.490208037770755e-05, + "loss": 1.1907, + "step": 20492 + }, + { + "epoch": 0.7338979712428599, + "grad_norm": 1.540077567100525, + "learning_rate": 3.4893276056282894e-05, + "loss": 1.1303, + "step": 20493 + }, + { + "epoch": 0.7339337833724282, + "grad_norm": 1.7730060815811157, + "learning_rate": 3.4884472610785224e-05, + "loss": 1.0154, + "step": 20494 + }, + { + "epoch": 0.7339695955019965, + "grad_norm": 1.499311923980713, + "learning_rate": 3.487567004133302e-05, + "loss": 1.1494, + "step": 20495 + }, + { + "epoch": 0.7340054076315649, + "grad_norm": 1.207066297531128, + "learning_rate": 3.4866868348044634e-05, + "loss": 1.0638, + "step": 20496 + }, + { + "epoch": 0.7340412197611331, + "grad_norm": 1.5986602306365967, + "learning_rate": 3.485806753103852e-05, + "loss": 1.1177, + "step": 20497 + }, + { + "epoch": 0.7340770318907014, + "grad_norm": 1.9515626430511475, + "learning_rate": 3.484926759043311e-05, + "loss": 1.1306, + "step": 20498 + }, + { + "epoch": 0.7341128440202697, + "grad_norm": 1.596950650215149, + "learning_rate": 3.484046852634674e-05, + "loss": 1.003, + "step": 20499 + }, + { + "epoch": 0.7341486561498379, + "grad_norm": 1.3334351778030396, + "learning_rate": 3.483167033889781e-05, + "loss": 1.0342, + "step": 20500 + }, + { + "epoch": 0.7341844682794062, + "grad_norm": 1.536409854888916, + "learning_rate": 3.4822873028204694e-05, + "loss": 1.0444, + "step": 20501 + }, + { + "epoch": 0.7342202804089745, + "grad_norm": 1.2843059301376343, + "learning_rate": 3.481407659438579e-05, + "loss": 1.0056, + "step": 20502 + }, + { + "epoch": 0.7342560925385428, + "grad_norm": 1.6854652166366577, + "learning_rate": 3.480528103755937e-05, + "loss": 1.2294, + "step": 20503 + }, + { + "epoch": 0.7342919046681111, + "grad_norm": 1.3259330987930298, + "learning_rate": 3.479648635784378e-05, + "loss": 0.9532, + "step": 20504 + }, + { + "epoch": 0.7343277167976794, + "grad_norm": 1.6642416715621948, + "learning_rate": 3.478769255535738e-05, + "loss": 1.1472, + "step": 20505 + }, + { + "epoch": 0.7343635289272477, + "grad_norm": 1.6232305765151978, + "learning_rate": 3.4778899630218483e-05, + "loss": 1.2467, + "step": 20506 + }, + { + "epoch": 0.7343993410568159, + "grad_norm": 2.1456515789031982, + "learning_rate": 3.4770107582545365e-05, + "loss": 1.1134, + "step": 20507 + }, + { + "epoch": 0.7344351531863842, + "grad_norm": 1.1568318605422974, + "learning_rate": 3.4761316412456235e-05, + "loss": 1.0615, + "step": 20508 + }, + { + "epoch": 0.7344709653159525, + "grad_norm": 1.386183261871338, + "learning_rate": 3.4752526120069516e-05, + "loss": 0.9399, + "step": 20509 + }, + { + "epoch": 0.7345067774455208, + "grad_norm": 1.6854443550109863, + "learning_rate": 3.474373670550336e-05, + "loss": 1.1398, + "step": 20510 + }, + { + "epoch": 0.7345425895750891, + "grad_norm": 1.6039459705352783, + "learning_rate": 3.4734948168876045e-05, + "loss": 1.2341, + "step": 20511 + }, + { + "epoch": 0.7345784017046574, + "grad_norm": 1.6891393661499023, + "learning_rate": 3.4726160510305824e-05, + "loss": 1.1611, + "step": 20512 + }, + { + "epoch": 0.7346142138342256, + "grad_norm": 1.3641082048416138, + "learning_rate": 3.471737372991095e-05, + "loss": 1.1881, + "step": 20513 + }, + { + "epoch": 0.7346500259637939, + "grad_norm": 1.9808611869812012, + "learning_rate": 3.470858782780957e-05, + "loss": 1.2816, + "step": 20514 + }, + { + "epoch": 0.7346858380933622, + "grad_norm": 1.6149871349334717, + "learning_rate": 3.469980280411992e-05, + "loss": 0.9439, + "step": 20515 + }, + { + "epoch": 0.7347216502229305, + "grad_norm": 1.647196888923645, + "learning_rate": 3.469101865896023e-05, + "loss": 1.1972, + "step": 20516 + }, + { + "epoch": 0.7347574623524988, + "grad_norm": 1.3026676177978516, + "learning_rate": 3.468223539244859e-05, + "loss": 1.0212, + "step": 20517 + }, + { + "epoch": 0.7347932744820671, + "grad_norm": 1.4178940057754517, + "learning_rate": 3.467345300470327e-05, + "loss": 1.1198, + "step": 20518 + }, + { + "epoch": 0.7348290866116354, + "grad_norm": 1.443494439125061, + "learning_rate": 3.466467149584231e-05, + "loss": 1.3486, + "step": 20519 + }, + { + "epoch": 0.7348648987412036, + "grad_norm": 1.764363408088684, + "learning_rate": 3.4655890865983975e-05, + "loss": 0.9866, + "step": 20520 + }, + { + "epoch": 0.7349007108707719, + "grad_norm": 1.7185598611831665, + "learning_rate": 3.464711111524631e-05, + "loss": 1.1004, + "step": 20521 + }, + { + "epoch": 0.7349365230003402, + "grad_norm": 1.4014559984207153, + "learning_rate": 3.4638332243747464e-05, + "loss": 0.9242, + "step": 20522 + }, + { + "epoch": 0.7349723351299084, + "grad_norm": 1.7110729217529297, + "learning_rate": 3.4629554251605545e-05, + "loss": 1.2139, + "step": 20523 + }, + { + "epoch": 0.7350081472594768, + "grad_norm": 1.5219377279281616, + "learning_rate": 3.4620777138938695e-05, + "loss": 1.1708, + "step": 20524 + }, + { + "epoch": 0.7350439593890451, + "grad_norm": 1.7370275259017944, + "learning_rate": 3.461200090586495e-05, + "loss": 1.1235, + "step": 20525 + }, + { + "epoch": 0.7350797715186134, + "grad_norm": 1.61952543258667, + "learning_rate": 3.4603225552502315e-05, + "loss": 1.089, + "step": 20526 + }, + { + "epoch": 0.7351155836481816, + "grad_norm": 1.3678897619247437, + "learning_rate": 3.4594451078969005e-05, + "loss": 0.932, + "step": 20527 + }, + { + "epoch": 0.7351513957777499, + "grad_norm": 1.4545832872390747, + "learning_rate": 3.458567748538295e-05, + "loss": 1.1501, + "step": 20528 + }, + { + "epoch": 0.7351872079073182, + "grad_norm": 1.389992356300354, + "learning_rate": 3.457690477186225e-05, + "loss": 1.1327, + "step": 20529 + }, + { + "epoch": 0.7352230200368864, + "grad_norm": 1.2716162204742432, + "learning_rate": 3.4568132938524845e-05, + "loss": 0.945, + "step": 20530 + }, + { + "epoch": 0.7352588321664548, + "grad_norm": 1.4208999872207642, + "learning_rate": 3.455936198548888e-05, + "loss": 1.0579, + "step": 20531 + }, + { + "epoch": 0.7352946442960231, + "grad_norm": 1.4567186832427979, + "learning_rate": 3.455059191287225e-05, + "loss": 1.1064, + "step": 20532 + }, + { + "epoch": 0.7353304564255914, + "grad_norm": 1.409636378288269, + "learning_rate": 3.454182272079303e-05, + "loss": 1.0658, + "step": 20533 + }, + { + "epoch": 0.7353662685551596, + "grad_norm": 1.3819599151611328, + "learning_rate": 3.45330544093691e-05, + "loss": 1.105, + "step": 20534 + }, + { + "epoch": 0.7354020806847279, + "grad_norm": 1.4109530448913574, + "learning_rate": 3.4524286978718475e-05, + "loss": 0.9921, + "step": 20535 + }, + { + "epoch": 0.7354378928142962, + "grad_norm": 1.2766833305358887, + "learning_rate": 3.451552042895916e-05, + "loss": 1.0189, + "step": 20536 + }, + { + "epoch": 0.7354737049438644, + "grad_norm": 1.474422574043274, + "learning_rate": 3.450675476020897e-05, + "loss": 1.0075, + "step": 20537 + }, + { + "epoch": 0.7355095170734328, + "grad_norm": 1.7024482488632202, + "learning_rate": 3.449798997258599e-05, + "loss": 1.0872, + "step": 20538 + }, + { + "epoch": 0.7355453292030011, + "grad_norm": 1.3471157550811768, + "learning_rate": 3.4489226066208025e-05, + "loss": 0.9284, + "step": 20539 + }, + { + "epoch": 0.7355811413325694, + "grad_norm": 1.3836915493011475, + "learning_rate": 3.448046304119306e-05, + "loss": 1.0895, + "step": 20540 + }, + { + "epoch": 0.7356169534621376, + "grad_norm": 1.5507980585098267, + "learning_rate": 3.44717008976589e-05, + "loss": 1.0775, + "step": 20541 + }, + { + "epoch": 0.7356527655917059, + "grad_norm": 1.7880961894989014, + "learning_rate": 3.446293963572349e-05, + "loss": 1.0861, + "step": 20542 + }, + { + "epoch": 0.7356885777212742, + "grad_norm": 1.3472753763198853, + "learning_rate": 3.4454179255504726e-05, + "loss": 0.9994, + "step": 20543 + }, + { + "epoch": 0.7357243898508424, + "grad_norm": 1.4191575050354004, + "learning_rate": 3.44454197571204e-05, + "loss": 1.0529, + "step": 20544 + }, + { + "epoch": 0.7357602019804108, + "grad_norm": 1.4489632844924927, + "learning_rate": 3.4436661140688386e-05, + "loss": 0.9874, + "step": 20545 + }, + { + "epoch": 0.7357960141099791, + "grad_norm": 1.2122855186462402, + "learning_rate": 3.442790340632652e-05, + "loss": 1.0067, + "step": 20546 + }, + { + "epoch": 0.7358318262395473, + "grad_norm": 1.4697158336639404, + "learning_rate": 3.441914655415268e-05, + "loss": 1.1544, + "step": 20547 + }, + { + "epoch": 0.7358676383691156, + "grad_norm": 1.5097905397415161, + "learning_rate": 3.441039058428456e-05, + "loss": 1.1351, + "step": 20548 + }, + { + "epoch": 0.7359034504986839, + "grad_norm": 1.7249486446380615, + "learning_rate": 3.440163549684009e-05, + "loss": 1.0555, + "step": 20549 + }, + { + "epoch": 0.7359392626282522, + "grad_norm": 1.4860548973083496, + "learning_rate": 3.4392881291936995e-05, + "loss": 1.1664, + "step": 20550 + }, + { + "epoch": 0.7359750747578204, + "grad_norm": 1.2687782049179077, + "learning_rate": 3.438412796969304e-05, + "loss": 1.155, + "step": 20551 + }, + { + "epoch": 0.7360108868873888, + "grad_norm": 1.2986587285995483, + "learning_rate": 3.4375375530225984e-05, + "loss": 1.0385, + "step": 20552 + }, + { + "epoch": 0.7360466990169571, + "grad_norm": 1.3403760194778442, + "learning_rate": 3.436662397365361e-05, + "loss": 1.1622, + "step": 20553 + }, + { + "epoch": 0.7360825111465253, + "grad_norm": 2.6122329235076904, + "learning_rate": 3.435787330009369e-05, + "loss": 1.006, + "step": 20554 + }, + { + "epoch": 0.7361183232760936, + "grad_norm": 1.331803560256958, + "learning_rate": 3.4349123509663874e-05, + "loss": 1.2083, + "step": 20555 + }, + { + "epoch": 0.7361541354056619, + "grad_norm": 1.2766963243484497, + "learning_rate": 3.434037460248191e-05, + "loss": 1.0954, + "step": 20556 + }, + { + "epoch": 0.7361899475352301, + "grad_norm": 1.3058379888534546, + "learning_rate": 3.433162657866552e-05, + "loss": 1.0682, + "step": 20557 + }, + { + "epoch": 0.7362257596647984, + "grad_norm": 2.232043981552124, + "learning_rate": 3.4322879438332414e-05, + "loss": 0.9395, + "step": 20558 + }, + { + "epoch": 0.7362615717943668, + "grad_norm": 1.3932244777679443, + "learning_rate": 3.431413318160022e-05, + "loss": 0.8762, + "step": 20559 + }, + { + "epoch": 0.7362973839239351, + "grad_norm": 1.602191686630249, + "learning_rate": 3.430538780858663e-05, + "loss": 1.1124, + "step": 20560 + }, + { + "epoch": 0.7363331960535033, + "grad_norm": 1.5942487716674805, + "learning_rate": 3.429664331940935e-05, + "loss": 1.0552, + "step": 20561 + }, + { + "epoch": 0.7363690081830716, + "grad_norm": 1.414975643157959, + "learning_rate": 3.4287899714185944e-05, + "loss": 0.9875, + "step": 20562 + }, + { + "epoch": 0.7364048203126399, + "grad_norm": 1.7916908264160156, + "learning_rate": 3.427915699303408e-05, + "loss": 0.8204, + "step": 20563 + }, + { + "epoch": 0.7364406324422081, + "grad_norm": 1.4615070819854736, + "learning_rate": 3.427041515607139e-05, + "loss": 1.0245, + "step": 20564 + }, + { + "epoch": 0.7364764445717764, + "grad_norm": 1.3474875688552856, + "learning_rate": 3.426167420341552e-05, + "loss": 1.0133, + "step": 20565 + }, + { + "epoch": 0.7365122567013448, + "grad_norm": 1.3775348663330078, + "learning_rate": 3.4252934135183977e-05, + "loss": 0.9469, + "step": 20566 + }, + { + "epoch": 0.7365480688309131, + "grad_norm": 1.4959841966629028, + "learning_rate": 3.4244194951494414e-05, + "loss": 1.0105, + "step": 20567 + }, + { + "epoch": 0.7365838809604813, + "grad_norm": 1.3410937786102295, + "learning_rate": 3.4235456652464405e-05, + "loss": 1.1298, + "step": 20568 + }, + { + "epoch": 0.7366196930900496, + "grad_norm": 1.3025848865509033, + "learning_rate": 3.422671923821148e-05, + "loss": 0.9146, + "step": 20569 + }, + { + "epoch": 0.7366555052196179, + "grad_norm": 1.384865403175354, + "learning_rate": 3.421798270885319e-05, + "loss": 0.8734, + "step": 20570 + }, + { + "epoch": 0.7366913173491861, + "grad_norm": 1.2285012006759644, + "learning_rate": 3.420924706450711e-05, + "loss": 1.1153, + "step": 20571 + }, + { + "epoch": 0.7367271294787544, + "grad_norm": 1.4440752267837524, + "learning_rate": 3.4200512305290764e-05, + "loss": 1.1578, + "step": 20572 + }, + { + "epoch": 0.7367629416083228, + "grad_norm": 1.2551254034042358, + "learning_rate": 3.419177843132162e-05, + "loss": 1.0607, + "step": 20573 + }, + { + "epoch": 0.736798753737891, + "grad_norm": 1.7801984548568726, + "learning_rate": 3.418304544271721e-05, + "loss": 1.0244, + "step": 20574 + }, + { + "epoch": 0.7368345658674593, + "grad_norm": 1.3915103673934937, + "learning_rate": 3.417431333959503e-05, + "loss": 1.1026, + "step": 20575 + }, + { + "epoch": 0.7368703779970276, + "grad_norm": 1.3254090547561646, + "learning_rate": 3.4165582122072594e-05, + "loss": 1.1042, + "step": 20576 + }, + { + "epoch": 0.7369061901265959, + "grad_norm": 1.7712507247924805, + "learning_rate": 3.4156851790267283e-05, + "loss": 1.0758, + "step": 20577 + }, + { + "epoch": 0.7369420022561641, + "grad_norm": 1.878022313117981, + "learning_rate": 3.4148122344296605e-05, + "loss": 1.2907, + "step": 20578 + }, + { + "epoch": 0.7369778143857324, + "grad_norm": 1.3954565525054932, + "learning_rate": 3.413939378427804e-05, + "loss": 0.9479, + "step": 20579 + }, + { + "epoch": 0.7370136265153008, + "grad_norm": 1.304923176765442, + "learning_rate": 3.413066611032894e-05, + "loss": 1.2861, + "step": 20580 + }, + { + "epoch": 0.737049438644869, + "grad_norm": 1.4049429893493652, + "learning_rate": 3.412193932256675e-05, + "loss": 1.2436, + "step": 20581 + }, + { + "epoch": 0.7370852507744373, + "grad_norm": 1.773053526878357, + "learning_rate": 3.41132134211089e-05, + "loss": 1.228, + "step": 20582 + }, + { + "epoch": 0.7371210629040056, + "grad_norm": 1.5743986368179321, + "learning_rate": 3.410448840607281e-05, + "loss": 0.9848, + "step": 20583 + }, + { + "epoch": 0.7371568750335739, + "grad_norm": 1.4033902883529663, + "learning_rate": 3.4095764277575795e-05, + "loss": 1.0023, + "step": 20584 + }, + { + "epoch": 0.7371926871631421, + "grad_norm": 1.306687831878662, + "learning_rate": 3.4087041035735256e-05, + "loss": 1.0634, + "step": 20585 + }, + { + "epoch": 0.7372284992927104, + "grad_norm": 1.825703740119934, + "learning_rate": 3.40783186806686e-05, + "loss": 1.0417, + "step": 20586 + }, + { + "epoch": 0.7372643114222788, + "grad_norm": 1.4441288709640503, + "learning_rate": 3.406959721249309e-05, + "loss": 0.9535, + "step": 20587 + }, + { + "epoch": 0.737300123551847, + "grad_norm": 1.6931155920028687, + "learning_rate": 3.406087663132611e-05, + "loss": 1.0322, + "step": 20588 + }, + { + "epoch": 0.7373359356814153, + "grad_norm": 1.6012123823165894, + "learning_rate": 3.4052156937284984e-05, + "loss": 0.8777, + "step": 20589 + }, + { + "epoch": 0.7373717478109836, + "grad_norm": 1.4128977060317993, + "learning_rate": 3.404343813048705e-05, + "loss": 0.9005, + "step": 20590 + }, + { + "epoch": 0.7374075599405518, + "grad_norm": 1.3247560262680054, + "learning_rate": 3.4034720211049544e-05, + "loss": 1.1034, + "step": 20591 + }, + { + "epoch": 0.7374433720701201, + "grad_norm": 1.4504965543746948, + "learning_rate": 3.402600317908978e-05, + "loss": 0.8455, + "step": 20592 + }, + { + "epoch": 0.7374791841996884, + "grad_norm": 1.7740833759307861, + "learning_rate": 3.401728703472505e-05, + "loss": 1.0368, + "step": 20593 + }, + { + "epoch": 0.7375149963292568, + "grad_norm": 2.1046533584594727, + "learning_rate": 3.400857177807265e-05, + "loss": 0.8493, + "step": 20594 + }, + { + "epoch": 0.737550808458825, + "grad_norm": 1.7807233333587646, + "learning_rate": 3.399985740924976e-05, + "loss": 1.0397, + "step": 20595 + }, + { + "epoch": 0.7375866205883933, + "grad_norm": 1.7649799585342407, + "learning_rate": 3.399114392837365e-05, + "loss": 0.9912, + "step": 20596 + }, + { + "epoch": 0.7376224327179616, + "grad_norm": 1.5687899589538574, + "learning_rate": 3.3982431335561596e-05, + "loss": 1.0845, + "step": 20597 + }, + { + "epoch": 0.7376582448475298, + "grad_norm": 1.896018385887146, + "learning_rate": 3.397371963093072e-05, + "loss": 0.9163, + "step": 20598 + }, + { + "epoch": 0.7376940569770981, + "grad_norm": 1.2882617712020874, + "learning_rate": 3.39650088145983e-05, + "loss": 0.9887, + "step": 20599 + }, + { + "epoch": 0.7377298691066664, + "grad_norm": 1.3571968078613281, + "learning_rate": 3.3956298886681496e-05, + "loss": 0.9909, + "step": 20600 + }, + { + "epoch": 0.7377656812362348, + "grad_norm": 2.555281400680542, + "learning_rate": 3.3947589847297537e-05, + "loss": 1.21, + "step": 20601 + }, + { + "epoch": 0.737801493365803, + "grad_norm": 1.3999184370040894, + "learning_rate": 3.393888169656351e-05, + "loss": 1.1959, + "step": 20602 + }, + { + "epoch": 0.7378373054953713, + "grad_norm": 1.5618818998336792, + "learning_rate": 3.393017443459663e-05, + "loss": 0.8878, + "step": 20603 + }, + { + "epoch": 0.7378731176249396, + "grad_norm": 1.4581372737884521, + "learning_rate": 3.392146806151405e-05, + "loss": 1.1242, + "step": 20604 + }, + { + "epoch": 0.7379089297545078, + "grad_norm": 1.3998125791549683, + "learning_rate": 3.3912762577432864e-05, + "loss": 0.999, + "step": 20605 + }, + { + "epoch": 0.7379447418840761, + "grad_norm": 1.4089491367340088, + "learning_rate": 3.3904057982470204e-05, + "loss": 1.2164, + "step": 20606 + }, + { + "epoch": 0.7379805540136444, + "grad_norm": 1.5982444286346436, + "learning_rate": 3.389535427674318e-05, + "loss": 1.008, + "step": 20607 + }, + { + "epoch": 0.7380163661432128, + "grad_norm": 1.52167809009552, + "learning_rate": 3.3886651460368934e-05, + "loss": 0.9931, + "step": 20608 + }, + { + "epoch": 0.738052178272781, + "grad_norm": 1.5092217922210693, + "learning_rate": 3.3877949533464485e-05, + "loss": 1.277, + "step": 20609 + }, + { + "epoch": 0.7380879904023493, + "grad_norm": 1.270132064819336, + "learning_rate": 3.3869248496146935e-05, + "loss": 1.1396, + "step": 20610 + }, + { + "epoch": 0.7381238025319176, + "grad_norm": 1.9975686073303223, + "learning_rate": 3.3860548348533326e-05, + "loss": 1.1649, + "step": 20611 + }, + { + "epoch": 0.7381596146614858, + "grad_norm": 1.2116508483886719, + "learning_rate": 3.385184909074077e-05, + "loss": 1.1222, + "step": 20612 + }, + { + "epoch": 0.7381954267910541, + "grad_norm": 1.6119157075881958, + "learning_rate": 3.384315072288626e-05, + "loss": 1.1678, + "step": 20613 + }, + { + "epoch": 0.7382312389206224, + "grad_norm": 1.7523964643478394, + "learning_rate": 3.383445324508676e-05, + "loss": 0.9456, + "step": 20614 + }, + { + "epoch": 0.7382670510501907, + "grad_norm": 1.267722725868225, + "learning_rate": 3.382575665745941e-05, + "loss": 0.758, + "step": 20615 + }, + { + "epoch": 0.738302863179759, + "grad_norm": 1.3376431465148926, + "learning_rate": 3.3817060960121105e-05, + "loss": 1.1607, + "step": 20616 + }, + { + "epoch": 0.7383386753093273, + "grad_norm": 1.4823296070098877, + "learning_rate": 3.380836615318891e-05, + "loss": 1.0755, + "step": 20617 + }, + { + "epoch": 0.7383744874388956, + "grad_norm": 1.630316138267517, + "learning_rate": 3.37996722367797e-05, + "loss": 1.1282, + "step": 20618 + }, + { + "epoch": 0.7384102995684638, + "grad_norm": 1.5869543552398682, + "learning_rate": 3.3790979211010576e-05, + "loss": 1.1144, + "step": 20619 + }, + { + "epoch": 0.7384461116980321, + "grad_norm": 1.6612794399261475, + "learning_rate": 3.3782287075998386e-05, + "loss": 1.1389, + "step": 20620 + }, + { + "epoch": 0.7384819238276004, + "grad_norm": 1.899888515472412, + "learning_rate": 3.377359583186012e-05, + "loss": 1.0207, + "step": 20621 + }, + { + "epoch": 0.7385177359571687, + "grad_norm": 1.6859898567199707, + "learning_rate": 3.376490547871272e-05, + "loss": 1.2861, + "step": 20622 + }, + { + "epoch": 0.738553548086737, + "grad_norm": 1.391411542892456, + "learning_rate": 3.375621601667305e-05, + "loss": 0.955, + "step": 20623 + }, + { + "epoch": 0.7385893602163053, + "grad_norm": 1.5714365243911743, + "learning_rate": 3.3747527445858074e-05, + "loss": 1.1935, + "step": 20624 + }, + { + "epoch": 0.7386251723458735, + "grad_norm": 1.3990066051483154, + "learning_rate": 3.373883976638459e-05, + "loss": 0.9716, + "step": 20625 + }, + { + "epoch": 0.7386609844754418, + "grad_norm": 1.4059674739837646, + "learning_rate": 3.3730152978369614e-05, + "loss": 1.1892, + "step": 20626 + }, + { + "epoch": 0.7386967966050101, + "grad_norm": 1.2484623193740845, + "learning_rate": 3.3721467081929914e-05, + "loss": 0.8026, + "step": 20627 + }, + { + "epoch": 0.7387326087345784, + "grad_norm": 1.4614685773849487, + "learning_rate": 3.371278207718241e-05, + "loss": 0.999, + "step": 20628 + }, + { + "epoch": 0.7387684208641467, + "grad_norm": 1.7365002632141113, + "learning_rate": 3.370409796424386e-05, + "loss": 1.2205, + "step": 20629 + }, + { + "epoch": 0.738804232993715, + "grad_norm": 1.6401481628417969, + "learning_rate": 3.369541474323122e-05, + "loss": 0.9923, + "step": 20630 + }, + { + "epoch": 0.7388400451232833, + "grad_norm": 1.3826324939727783, + "learning_rate": 3.3686732414261254e-05, + "loss": 1.1544, + "step": 20631 + }, + { + "epoch": 0.7388758572528515, + "grad_norm": 1.2165435552597046, + "learning_rate": 3.367805097745069e-05, + "loss": 1.0522, + "step": 20632 + }, + { + "epoch": 0.7389116693824198, + "grad_norm": 1.5104068517684937, + "learning_rate": 3.366937043291648e-05, + "loss": 1.0523, + "step": 20633 + }, + { + "epoch": 0.7389474815119881, + "grad_norm": 1.7905073165893555, + "learning_rate": 3.3660690780775286e-05, + "loss": 1.1343, + "step": 20634 + }, + { + "epoch": 0.7389832936415563, + "grad_norm": 1.5143111944198608, + "learning_rate": 3.3652012021143964e-05, + "loss": 1.2092, + "step": 20635 + }, + { + "epoch": 0.7390191057711247, + "grad_norm": 1.3572471141815186, + "learning_rate": 3.364333415413917e-05, + "loss": 0.9044, + "step": 20636 + }, + { + "epoch": 0.739054917900693, + "grad_norm": 1.3579710721969604, + "learning_rate": 3.363465717987778e-05, + "loss": 1.1582, + "step": 20637 + }, + { + "epoch": 0.7390907300302613, + "grad_norm": 1.6028878688812256, + "learning_rate": 3.3625981098476444e-05, + "loss": 1.1228, + "step": 20638 + }, + { + "epoch": 0.7391265421598295, + "grad_norm": 1.4177000522613525, + "learning_rate": 3.3617305910051956e-05, + "loss": 1.0249, + "step": 20639 + }, + { + "epoch": 0.7391623542893978, + "grad_norm": 1.5417803525924683, + "learning_rate": 3.3608631614720955e-05, + "loss": 1.2632, + "step": 20640 + }, + { + "epoch": 0.7391981664189661, + "grad_norm": 1.2246886491775513, + "learning_rate": 3.359995821260017e-05, + "loss": 1.0571, + "step": 20641 + }, + { + "epoch": 0.7392339785485343, + "grad_norm": 1.4783704280853271, + "learning_rate": 3.359128570380633e-05, + "loss": 0.9737, + "step": 20642 + }, + { + "epoch": 0.7392697906781027, + "grad_norm": 1.7897155284881592, + "learning_rate": 3.3582614088456055e-05, + "loss": 1.1503, + "step": 20643 + }, + { + "epoch": 0.739305602807671, + "grad_norm": 1.4034199714660645, + "learning_rate": 3.3573943366666026e-05, + "loss": 0.9812, + "step": 20644 + }, + { + "epoch": 0.7393414149372393, + "grad_norm": 1.4187742471694946, + "learning_rate": 3.356527353855291e-05, + "loss": 1.115, + "step": 20645 + }, + { + "epoch": 0.7393772270668075, + "grad_norm": 1.5743701457977295, + "learning_rate": 3.355660460423338e-05, + "loss": 1.0076, + "step": 20646 + }, + { + "epoch": 0.7394130391963758, + "grad_norm": 1.419161081314087, + "learning_rate": 3.354793656382399e-05, + "loss": 1.0253, + "step": 20647 + }, + { + "epoch": 0.7394488513259441, + "grad_norm": 1.3024054765701294, + "learning_rate": 3.35392694174414e-05, + "loss": 1.0569, + "step": 20648 + }, + { + "epoch": 0.7394846634555123, + "grad_norm": 1.462085247039795, + "learning_rate": 3.3530603165202245e-05, + "loss": 1.0181, + "step": 20649 + }, + { + "epoch": 0.7395204755850807, + "grad_norm": 1.6257284879684448, + "learning_rate": 3.352193780722306e-05, + "loss": 1.0579, + "step": 20650 + }, + { + "epoch": 0.739556287714649, + "grad_norm": 1.5361006259918213, + "learning_rate": 3.351327334362043e-05, + "loss": 1.0455, + "step": 20651 + }, + { + "epoch": 0.7395920998442173, + "grad_norm": 1.5740365982055664, + "learning_rate": 3.3504609774510964e-05, + "loss": 1.0209, + "step": 20652 + }, + { + "epoch": 0.7396279119737855, + "grad_norm": 1.4311317205429077, + "learning_rate": 3.349594710001123e-05, + "loss": 1.1016, + "step": 20653 + }, + { + "epoch": 0.7396637241033538, + "grad_norm": 1.3099589347839355, + "learning_rate": 3.3487285320237705e-05, + "loss": 0.9201, + "step": 20654 + }, + { + "epoch": 0.7396995362329221, + "grad_norm": 1.5750049352645874, + "learning_rate": 3.347862443530697e-05, + "loss": 1.1049, + "step": 20655 + }, + { + "epoch": 0.7397353483624903, + "grad_norm": 1.3816872835159302, + "learning_rate": 3.3469964445335566e-05, + "loss": 0.9271, + "step": 20656 + }, + { + "epoch": 0.7397711604920587, + "grad_norm": 1.520067572593689, + "learning_rate": 3.346130535043993e-05, + "loss": 0.9463, + "step": 20657 + }, + { + "epoch": 0.739806972621627, + "grad_norm": 1.4574400186538696, + "learning_rate": 3.3452647150736615e-05, + "loss": 0.9658, + "step": 20658 + }, + { + "epoch": 0.7398427847511952, + "grad_norm": 1.5267812013626099, + "learning_rate": 3.3443989846342084e-05, + "loss": 1.0172, + "step": 20659 + }, + { + "epoch": 0.7398785968807635, + "grad_norm": 1.30661940574646, + "learning_rate": 3.3435333437372854e-05, + "loss": 1.1661, + "step": 20660 + }, + { + "epoch": 0.7399144090103318, + "grad_norm": 1.3897415399551392, + "learning_rate": 3.3426677923945314e-05, + "loss": 1.0281, + "step": 20661 + }, + { + "epoch": 0.7399502211399, + "grad_norm": 1.582457184791565, + "learning_rate": 3.341802330617596e-05, + "loss": 1.0525, + "step": 20662 + }, + { + "epoch": 0.7399860332694683, + "grad_norm": 2.3724348545074463, + "learning_rate": 3.3409369584181216e-05, + "loss": 1.0739, + "step": 20663 + }, + { + "epoch": 0.7400218453990367, + "grad_norm": 1.5110046863555908, + "learning_rate": 3.340071675807753e-05, + "loss": 1.3185, + "step": 20664 + }, + { + "epoch": 0.740057657528605, + "grad_norm": 1.478474497795105, + "learning_rate": 3.3392064827981275e-05, + "loss": 1.2742, + "step": 20665 + }, + { + "epoch": 0.7400934696581732, + "grad_norm": 1.819725751876831, + "learning_rate": 3.338341379400885e-05, + "loss": 1.1015, + "step": 20666 + }, + { + "epoch": 0.7401292817877415, + "grad_norm": 1.3303073644638062, + "learning_rate": 3.337476365627672e-05, + "loss": 1.0392, + "step": 20667 + }, + { + "epoch": 0.7401650939173098, + "grad_norm": 1.3106850385665894, + "learning_rate": 3.336611441490115e-05, + "loss": 1.1534, + "step": 20668 + }, + { + "epoch": 0.740200906046878, + "grad_norm": 1.494230031967163, + "learning_rate": 3.335746606999858e-05, + "loss": 1.0368, + "step": 20669 + }, + { + "epoch": 0.7402367181764463, + "grad_norm": 1.62220299243927, + "learning_rate": 3.334881862168532e-05, + "loss": 1.0718, + "step": 20670 + }, + { + "epoch": 0.7402725303060147, + "grad_norm": 1.6056972742080688, + "learning_rate": 3.334017207007778e-05, + "loss": 1.0514, + "step": 20671 + }, + { + "epoch": 0.740308342435583, + "grad_norm": 1.481431245803833, + "learning_rate": 3.33315264152922e-05, + "loss": 0.9787, + "step": 20672 + }, + { + "epoch": 0.7403441545651512, + "grad_norm": 1.8540737628936768, + "learning_rate": 3.332288165744494e-05, + "loss": 1.0743, + "step": 20673 + }, + { + "epoch": 0.7403799666947195, + "grad_norm": 1.3318684101104736, + "learning_rate": 3.3314237796652324e-05, + "loss": 1.2735, + "step": 20674 + }, + { + "epoch": 0.7404157788242878, + "grad_norm": 1.46427321434021, + "learning_rate": 3.33055948330306e-05, + "loss": 1.0356, + "step": 20675 + }, + { + "epoch": 0.740451590953856, + "grad_norm": 1.9230436086654663, + "learning_rate": 3.329695276669605e-05, + "loss": 0.9615, + "step": 20676 + }, + { + "epoch": 0.7404874030834243, + "grad_norm": 1.3956642150878906, + "learning_rate": 3.3288311597764976e-05, + "loss": 1.262, + "step": 20677 + }, + { + "epoch": 0.7405232152129927, + "grad_norm": 1.8260953426361084, + "learning_rate": 3.327967132635364e-05, + "loss": 1.0365, + "step": 20678 + }, + { + "epoch": 0.740559027342561, + "grad_norm": 1.3492826223373413, + "learning_rate": 3.3271031952578245e-05, + "loss": 1.0348, + "step": 20679 + }, + { + "epoch": 0.7405948394721292, + "grad_norm": 1.8157153129577637, + "learning_rate": 3.326239347655503e-05, + "loss": 1.0593, + "step": 20680 + }, + { + "epoch": 0.7406306516016975, + "grad_norm": 1.4266165494918823, + "learning_rate": 3.325375589840023e-05, + "loss": 1.1559, + "step": 20681 + }, + { + "epoch": 0.7406664637312658, + "grad_norm": 1.9789543151855469, + "learning_rate": 3.3245119218230066e-05, + "loss": 1.059, + "step": 20682 + }, + { + "epoch": 0.740702275860834, + "grad_norm": 1.5706454515457153, + "learning_rate": 3.32364834361607e-05, + "loss": 1.0843, + "step": 20683 + }, + { + "epoch": 0.7407380879904023, + "grad_norm": 1.3699369430541992, + "learning_rate": 3.3227848552308326e-05, + "loss": 1.0182, + "step": 20684 + }, + { + "epoch": 0.7407739001199707, + "grad_norm": 1.6460565328598022, + "learning_rate": 3.321921456678915e-05, + "loss": 1.013, + "step": 20685 + }, + { + "epoch": 0.740809712249539, + "grad_norm": 1.5235124826431274, + "learning_rate": 3.321058147971927e-05, + "loss": 1.3458, + "step": 20686 + }, + { + "epoch": 0.7408455243791072, + "grad_norm": 1.4691377878189087, + "learning_rate": 3.320194929121486e-05, + "loss": 1.1665, + "step": 20687 + }, + { + "epoch": 0.7408813365086755, + "grad_norm": 1.5438727140426636, + "learning_rate": 3.319331800139207e-05, + "loss": 1.2969, + "step": 20688 + }, + { + "epoch": 0.7409171486382438, + "grad_norm": 1.3253259658813477, + "learning_rate": 3.318468761036704e-05, + "loss": 1.1469, + "step": 20689 + }, + { + "epoch": 0.740952960767812, + "grad_norm": 1.3066767454147339, + "learning_rate": 3.3176058118255816e-05, + "loss": 1.1251, + "step": 20690 + }, + { + "epoch": 0.7409887728973803, + "grad_norm": 1.3132990598678589, + "learning_rate": 3.316742952517453e-05, + "loss": 1.0027, + "step": 20691 + }, + { + "epoch": 0.7410245850269487, + "grad_norm": 1.5682939291000366, + "learning_rate": 3.3158801831239314e-05, + "loss": 0.9963, + "step": 20692 + }, + { + "epoch": 0.741060397156517, + "grad_norm": 1.452103614807129, + "learning_rate": 3.3150175036566166e-05, + "loss": 1.2146, + "step": 20693 + }, + { + "epoch": 0.7410962092860852, + "grad_norm": 1.48493230342865, + "learning_rate": 3.314154914127118e-05, + "loss": 1.0572, + "step": 20694 + }, + { + "epoch": 0.7411320214156535, + "grad_norm": 1.4022486209869385, + "learning_rate": 3.31329241454704e-05, + "loss": 1.2495, + "step": 20695 + }, + { + "epoch": 0.7411678335452218, + "grad_norm": 1.6511814594268799, + "learning_rate": 3.312430004927992e-05, + "loss": 1.0407, + "step": 20696 + }, + { + "epoch": 0.74120364567479, + "grad_norm": 2.0331497192382812, + "learning_rate": 3.311567685281568e-05, + "loss": 1.3136, + "step": 20697 + }, + { + "epoch": 0.7412394578043583, + "grad_norm": 1.2937450408935547, + "learning_rate": 3.310705455619374e-05, + "loss": 1.0222, + "step": 20698 + }, + { + "epoch": 0.7412752699339267, + "grad_norm": 1.389555811882019, + "learning_rate": 3.309843315953008e-05, + "loss": 0.9708, + "step": 20699 + }, + { + "epoch": 0.7413110820634949, + "grad_norm": 1.6208081245422363, + "learning_rate": 3.3089812662940754e-05, + "loss": 0.9505, + "step": 20700 + }, + { + "epoch": 0.7413468941930632, + "grad_norm": 1.3934910297393799, + "learning_rate": 3.308119306654168e-05, + "loss": 0.9502, + "step": 20701 + }, + { + "epoch": 0.7413827063226315, + "grad_norm": 1.7531794309616089, + "learning_rate": 3.3072574370448783e-05, + "loss": 1.1405, + "step": 20702 + }, + { + "epoch": 0.7414185184521997, + "grad_norm": 1.3753782510757446, + "learning_rate": 3.306395657477812e-05, + "loss": 1.0936, + "step": 20703 + }, + { + "epoch": 0.741454330581768, + "grad_norm": 1.655534029006958, + "learning_rate": 3.3055339679645544e-05, + "loss": 1.1176, + "step": 20704 + }, + { + "epoch": 0.7414901427113363, + "grad_norm": 1.7006791830062866, + "learning_rate": 3.304672368516704e-05, + "loss": 1.2849, + "step": 20705 + }, + { + "epoch": 0.7415259548409047, + "grad_norm": 1.2652581930160522, + "learning_rate": 3.303810859145848e-05, + "loss": 1.0927, + "step": 20706 + }, + { + "epoch": 0.7415617669704729, + "grad_norm": 1.4883095026016235, + "learning_rate": 3.302949439863584e-05, + "loss": 1.1641, + "step": 20707 + }, + { + "epoch": 0.7415975791000412, + "grad_norm": 1.5820375680923462, + "learning_rate": 3.3020881106814936e-05, + "loss": 1.2632, + "step": 20708 + }, + { + "epoch": 0.7416333912296095, + "grad_norm": 1.6992692947387695, + "learning_rate": 3.301226871611168e-05, + "loss": 0.9563, + "step": 20709 + }, + { + "epoch": 0.7416692033591777, + "grad_norm": 1.3604936599731445, + "learning_rate": 3.3003657226641974e-05, + "loss": 1.1356, + "step": 20710 + }, + { + "epoch": 0.741705015488746, + "grad_norm": 1.453790307044983, + "learning_rate": 3.2995046638521595e-05, + "loss": 0.8076, + "step": 20711 + }, + { + "epoch": 0.7417408276183143, + "grad_norm": 1.6442712545394897, + "learning_rate": 3.2986436951866486e-05, + "loss": 1.0288, + "step": 20712 + }, + { + "epoch": 0.7417766397478827, + "grad_norm": 1.4207513332366943, + "learning_rate": 3.2977828166792345e-05, + "loss": 1.3022, + "step": 20713 + }, + { + "epoch": 0.7418124518774509, + "grad_norm": 1.4587472677230835, + "learning_rate": 3.296922028341515e-05, + "loss": 1.026, + "step": 20714 + }, + { + "epoch": 0.7418482640070192, + "grad_norm": 1.3861913681030273, + "learning_rate": 3.29606133018506e-05, + "loss": 1.1225, + "step": 20715 + }, + { + "epoch": 0.7418840761365875, + "grad_norm": 1.401058554649353, + "learning_rate": 3.2952007222214545e-05, + "loss": 0.9041, + "step": 20716 + }, + { + "epoch": 0.7419198882661557, + "grad_norm": 1.3748726844787598, + "learning_rate": 3.29434020446227e-05, + "loss": 0.8507, + "step": 20717 + }, + { + "epoch": 0.741955700395724, + "grad_norm": 1.6130563020706177, + "learning_rate": 3.293479776919093e-05, + "loss": 1.1537, + "step": 20718 + }, + { + "epoch": 0.7419915125252923, + "grad_norm": 1.446271538734436, + "learning_rate": 3.292619439603495e-05, + "loss": 1.0359, + "step": 20719 + }, + { + "epoch": 0.7420273246548607, + "grad_norm": 1.5069392919540405, + "learning_rate": 3.291759192527045e-05, + "loss": 1.1474, + "step": 20720 + }, + { + "epoch": 0.7420631367844289, + "grad_norm": 1.3917800188064575, + "learning_rate": 3.290899035701328e-05, + "loss": 1.2173, + "step": 20721 + }, + { + "epoch": 0.7420989489139972, + "grad_norm": 1.602591872215271, + "learning_rate": 3.2900389691379074e-05, + "loss": 1.0696, + "step": 20722 + }, + { + "epoch": 0.7421347610435655, + "grad_norm": 1.3155691623687744, + "learning_rate": 3.2891789928483594e-05, + "loss": 0.8951, + "step": 20723 + }, + { + "epoch": 0.7421705731731337, + "grad_norm": 1.374745488166809, + "learning_rate": 3.2883191068442464e-05, + "loss": 1.1647, + "step": 20724 + }, + { + "epoch": 0.742206385302702, + "grad_norm": 1.2650012969970703, + "learning_rate": 3.287459311137149e-05, + "loss": 1.2906, + "step": 20725 + }, + { + "epoch": 0.7422421974322703, + "grad_norm": 1.631131649017334, + "learning_rate": 3.286599605738624e-05, + "loss": 1.0831, + "step": 20726 + }, + { + "epoch": 0.7422780095618386, + "grad_norm": 1.7078344821929932, + "learning_rate": 3.285739990660246e-05, + "loss": 0.8297, + "step": 20727 + }, + { + "epoch": 0.7423138216914069, + "grad_norm": 1.7398093938827515, + "learning_rate": 3.284880465913571e-05, + "loss": 1.3, + "step": 20728 + }, + { + "epoch": 0.7423496338209752, + "grad_norm": 1.3963496685028076, + "learning_rate": 3.284021031510168e-05, + "loss": 0.9913, + "step": 20729 + }, + { + "epoch": 0.7423854459505435, + "grad_norm": 1.4664897918701172, + "learning_rate": 3.2831616874616036e-05, + "loss": 1.1474, + "step": 20730 + }, + { + "epoch": 0.7424212580801117, + "grad_norm": 1.3386633396148682, + "learning_rate": 3.282302433779426e-05, + "loss": 1.1562, + "step": 20731 + }, + { + "epoch": 0.74245707020968, + "grad_norm": 1.376879334449768, + "learning_rate": 3.281443270475212e-05, + "loss": 1.0669, + "step": 20732 + }, + { + "epoch": 0.7424928823392483, + "grad_norm": 1.4704005718231201, + "learning_rate": 3.280584197560508e-05, + "loss": 1.211, + "step": 20733 + }, + { + "epoch": 0.7425286944688166, + "grad_norm": 1.5552668571472168, + "learning_rate": 3.2797252150468804e-05, + "loss": 1.1675, + "step": 20734 + }, + { + "epoch": 0.7425645065983849, + "grad_norm": 1.6612982749938965, + "learning_rate": 3.278866322945874e-05, + "loss": 1.0609, + "step": 20735 + }, + { + "epoch": 0.7426003187279532, + "grad_norm": 1.3978503942489624, + "learning_rate": 3.278007521269059e-05, + "loss": 1.0935, + "step": 20736 + }, + { + "epoch": 0.7426361308575214, + "grad_norm": 1.9398672580718994, + "learning_rate": 3.2771488100279814e-05, + "loss": 1.1135, + "step": 20737 + }, + { + "epoch": 0.7426719429870897, + "grad_norm": 1.6217774152755737, + "learning_rate": 3.2762901892341926e-05, + "loss": 0.9739, + "step": 20738 + }, + { + "epoch": 0.742707755116658, + "grad_norm": 1.6857348680496216, + "learning_rate": 3.2754316588992454e-05, + "loss": 1.1169, + "step": 20739 + }, + { + "epoch": 0.7427435672462263, + "grad_norm": 1.5118199586868286, + "learning_rate": 3.274573219034691e-05, + "loss": 1.2667, + "step": 20740 + }, + { + "epoch": 0.7427793793757946, + "grad_norm": 1.5253480672836304, + "learning_rate": 3.2737148696520824e-05, + "loss": 1.0547, + "step": 20741 + }, + { + "epoch": 0.7428151915053629, + "grad_norm": 1.626900315284729, + "learning_rate": 3.272856610762961e-05, + "loss": 1.1574, + "step": 20742 + }, + { + "epoch": 0.7428510036349312, + "grad_norm": 1.501212239265442, + "learning_rate": 3.271998442378875e-05, + "loss": 1.1438, + "step": 20743 + }, + { + "epoch": 0.7428868157644994, + "grad_norm": 1.9944841861724854, + "learning_rate": 3.271140364511377e-05, + "loss": 1.3259, + "step": 20744 + }, + { + "epoch": 0.7429226278940677, + "grad_norm": 1.639847755432129, + "learning_rate": 3.270282377172001e-05, + "loss": 1.0793, + "step": 20745 + }, + { + "epoch": 0.742958440023636, + "grad_norm": 1.6901257038116455, + "learning_rate": 3.269424480372295e-05, + "loss": 1.2878, + "step": 20746 + }, + { + "epoch": 0.7429942521532042, + "grad_norm": 1.203273057937622, + "learning_rate": 3.268566674123802e-05, + "loss": 1.029, + "step": 20747 + }, + { + "epoch": 0.7430300642827726, + "grad_norm": 1.7732703685760498, + "learning_rate": 3.267708958438063e-05, + "loss": 0.9512, + "step": 20748 + }, + { + "epoch": 0.7430658764123409, + "grad_norm": 1.3815556764602661, + "learning_rate": 3.266851333326614e-05, + "loss": 1.1326, + "step": 20749 + }, + { + "epoch": 0.7431016885419092, + "grad_norm": 1.5708904266357422, + "learning_rate": 3.265993798800995e-05, + "loss": 0.8535, + "step": 20750 + }, + { + "epoch": 0.7431375006714774, + "grad_norm": 1.2972098588943481, + "learning_rate": 3.265136354872742e-05, + "loss": 1.0108, + "step": 20751 + }, + { + "epoch": 0.7431733128010457, + "grad_norm": 1.3387526273727417, + "learning_rate": 3.2642790015533965e-05, + "loss": 0.9192, + "step": 20752 + }, + { + "epoch": 0.743209124930614, + "grad_norm": 1.5854226350784302, + "learning_rate": 3.2634217388544855e-05, + "loss": 1.213, + "step": 20753 + }, + { + "epoch": 0.7432449370601822, + "grad_norm": 1.908005714416504, + "learning_rate": 3.2625645667875434e-05, + "loss": 1.1482, + "step": 20754 + }, + { + "epoch": 0.7432807491897506, + "grad_norm": 1.2595267295837402, + "learning_rate": 3.26170748536411e-05, + "loss": 1.0694, + "step": 20755 + }, + { + "epoch": 0.7433165613193189, + "grad_norm": 1.5637657642364502, + "learning_rate": 3.260850494595707e-05, + "loss": 0.9653, + "step": 20756 + }, + { + "epoch": 0.7433523734488872, + "grad_norm": 1.5285944938659668, + "learning_rate": 3.259993594493866e-05, + "loss": 1.3024, + "step": 20757 + }, + { + "epoch": 0.7433881855784554, + "grad_norm": 1.33130943775177, + "learning_rate": 3.2591367850701194e-05, + "loss": 1.077, + "step": 20758 + }, + { + "epoch": 0.7434239977080237, + "grad_norm": 1.439658761024475, + "learning_rate": 3.2582800663359933e-05, + "loss": 1.142, + "step": 20759 + }, + { + "epoch": 0.743459809837592, + "grad_norm": 1.5990334749221802, + "learning_rate": 3.257423438303011e-05, + "loss": 1.0039, + "step": 20760 + }, + { + "epoch": 0.7434956219671602, + "grad_norm": 1.4718246459960938, + "learning_rate": 3.256566900982699e-05, + "loss": 1.059, + "step": 20761 + }, + { + "epoch": 0.7435314340967286, + "grad_norm": 1.3119794130325317, + "learning_rate": 3.255710454386585e-05, + "loss": 1.0035, + "step": 20762 + }, + { + "epoch": 0.7435672462262969, + "grad_norm": 1.2477952241897583, + "learning_rate": 3.2548540985261824e-05, + "loss": 1.1357, + "step": 20763 + }, + { + "epoch": 0.7436030583558652, + "grad_norm": 1.4747991561889648, + "learning_rate": 3.2539978334130174e-05, + "loss": 0.8657, + "step": 20764 + }, + { + "epoch": 0.7436388704854334, + "grad_norm": 1.2829314470291138, + "learning_rate": 3.253141659058611e-05, + "loss": 1.0757, + "step": 20765 + }, + { + "epoch": 0.7436746826150017, + "grad_norm": 1.5325146913528442, + "learning_rate": 3.252285575474483e-05, + "loss": 1.2494, + "step": 20766 + }, + { + "epoch": 0.74371049474457, + "grad_norm": 1.7442177534103394, + "learning_rate": 3.251429582672145e-05, + "loss": 1.4342, + "step": 20767 + }, + { + "epoch": 0.7437463068741382, + "grad_norm": 1.7603864669799805, + "learning_rate": 3.2505736806631185e-05, + "loss": 1.3371, + "step": 20768 + }, + { + "epoch": 0.7437821190037066, + "grad_norm": 1.690983533859253, + "learning_rate": 3.249717869458916e-05, + "loss": 0.9598, + "step": 20769 + }, + { + "epoch": 0.7438179311332749, + "grad_norm": 1.7726293802261353, + "learning_rate": 3.248862149071056e-05, + "loss": 1.1717, + "step": 20770 + }, + { + "epoch": 0.7438537432628431, + "grad_norm": 1.754230260848999, + "learning_rate": 3.248006519511043e-05, + "loss": 1.2838, + "step": 20771 + }, + { + "epoch": 0.7438895553924114, + "grad_norm": 1.5942155122756958, + "learning_rate": 3.247150980790394e-05, + "loss": 1.2119, + "step": 20772 + }, + { + "epoch": 0.7439253675219797, + "grad_norm": 1.2114444971084595, + "learning_rate": 3.2462955329206213e-05, + "loss": 1.0404, + "step": 20773 + }, + { + "epoch": 0.743961179651548, + "grad_norm": 1.4054994583129883, + "learning_rate": 3.245440175913227e-05, + "loss": 1.1004, + "step": 20774 + }, + { + "epoch": 0.7439969917811162, + "grad_norm": 1.4804227352142334, + "learning_rate": 3.244584909779722e-05, + "loss": 1.2194, + "step": 20775 + }, + { + "epoch": 0.7440328039106846, + "grad_norm": 1.7536696195602417, + "learning_rate": 3.243729734531614e-05, + "loss": 1.0384, + "step": 20776 + }, + { + "epoch": 0.7440686160402529, + "grad_norm": 1.4700027704238892, + "learning_rate": 3.2428746501804106e-05, + "loss": 1.1167, + "step": 20777 + }, + { + "epoch": 0.7441044281698211, + "grad_norm": 1.394910454750061, + "learning_rate": 3.2420196567376096e-05, + "loss": 1.1802, + "step": 20778 + }, + { + "epoch": 0.7441402402993894, + "grad_norm": 1.6984046697616577, + "learning_rate": 3.241164754214716e-05, + "loss": 1.0224, + "step": 20779 + }, + { + "epoch": 0.7441760524289577, + "grad_norm": 1.7490588426589966, + "learning_rate": 3.2403099426232365e-05, + "loss": 1.0271, + "step": 20780 + }, + { + "epoch": 0.744211864558526, + "grad_norm": 1.3251819610595703, + "learning_rate": 3.239455221974663e-05, + "loss": 0.7252, + "step": 20781 + }, + { + "epoch": 0.7442476766880942, + "grad_norm": 1.553422451019287, + "learning_rate": 3.2386005922804996e-05, + "loss": 1.0579, + "step": 20782 + }, + { + "epoch": 0.7442834888176626, + "grad_norm": 1.393557071685791, + "learning_rate": 3.237746053552244e-05, + "loss": 1.0129, + "step": 20783 + }, + { + "epoch": 0.7443193009472309, + "grad_norm": 1.387122392654419, + "learning_rate": 3.2368916058013956e-05, + "loss": 0.9735, + "step": 20784 + }, + { + "epoch": 0.7443551130767991, + "grad_norm": 1.6818820238113403, + "learning_rate": 3.236037249039444e-05, + "loss": 1.1726, + "step": 20785 + }, + { + "epoch": 0.7443909252063674, + "grad_norm": 1.5826306343078613, + "learning_rate": 3.235182983277886e-05, + "loss": 1.0685, + "step": 20786 + }, + { + "epoch": 0.7444267373359357, + "grad_norm": 1.4830745458602905, + "learning_rate": 3.234328808528215e-05, + "loss": 1.0336, + "step": 20787 + }, + { + "epoch": 0.7444625494655039, + "grad_norm": 1.4694136381149292, + "learning_rate": 3.233474724801926e-05, + "loss": 1.173, + "step": 20788 + }, + { + "epoch": 0.7444983615950722, + "grad_norm": 1.3176512718200684, + "learning_rate": 3.232620732110503e-05, + "loss": 1.0322, + "step": 20789 + }, + { + "epoch": 0.7445341737246406, + "grad_norm": 1.5444759130477905, + "learning_rate": 3.231766830465439e-05, + "loss": 1.102, + "step": 20790 + }, + { + "epoch": 0.7445699858542089, + "grad_norm": 1.3362547159194946, + "learning_rate": 3.230913019878224e-05, + "loss": 1.0799, + "step": 20791 + }, + { + "epoch": 0.7446057979837771, + "grad_norm": 1.7294063568115234, + "learning_rate": 3.230059300360342e-05, + "loss": 1.1771, + "step": 20792 + }, + { + "epoch": 0.7446416101133454, + "grad_norm": 1.5994925498962402, + "learning_rate": 3.229205671923278e-05, + "loss": 1.1699, + "step": 20793 + }, + { + "epoch": 0.7446774222429137, + "grad_norm": 1.4207849502563477, + "learning_rate": 3.2283521345785176e-05, + "loss": 1.1341, + "step": 20794 + }, + { + "epoch": 0.7447132343724819, + "grad_norm": 1.4471931457519531, + "learning_rate": 3.227498688337548e-05, + "loss": 1.0923, + "step": 20795 + }, + { + "epoch": 0.7447490465020502, + "grad_norm": 1.6963876485824585, + "learning_rate": 3.226645333211845e-05, + "loss": 1.05, + "step": 20796 + }, + { + "epoch": 0.7447848586316185, + "grad_norm": 1.5567774772644043, + "learning_rate": 3.225792069212892e-05, + "loss": 1.1937, + "step": 20797 + }, + { + "epoch": 0.7448206707611869, + "grad_norm": 1.4765862226486206, + "learning_rate": 3.224938896352171e-05, + "loss": 0.9007, + "step": 20798 + }, + { + "epoch": 0.7448564828907551, + "grad_norm": 1.382800817489624, + "learning_rate": 3.2240858146411546e-05, + "loss": 0.8728, + "step": 20799 + }, + { + "epoch": 0.7448922950203234, + "grad_norm": 1.5931025743484497, + "learning_rate": 3.2232328240913277e-05, + "loss": 1.1271, + "step": 20800 + }, + { + "epoch": 0.7449281071498917, + "grad_norm": 1.6816331148147583, + "learning_rate": 3.222379924714155e-05, + "loss": 0.8065, + "step": 20801 + }, + { + "epoch": 0.7449639192794599, + "grad_norm": 1.5526303052902222, + "learning_rate": 3.221527116521124e-05, + "loss": 1.1298, + "step": 20802 + }, + { + "epoch": 0.7449997314090282, + "grad_norm": 2.0081684589385986, + "learning_rate": 3.220674399523699e-05, + "loss": 1.1873, + "step": 20803 + }, + { + "epoch": 0.7450355435385965, + "grad_norm": 1.320580244064331, + "learning_rate": 3.219821773733355e-05, + "loss": 1.2099, + "step": 20804 + }, + { + "epoch": 0.7450713556681648, + "grad_norm": 1.2165461778640747, + "learning_rate": 3.218969239161563e-05, + "loss": 1.0729, + "step": 20805 + }, + { + "epoch": 0.7451071677977331, + "grad_norm": 2.105086326599121, + "learning_rate": 3.2181167958197964e-05, + "loss": 0.9696, + "step": 20806 + }, + { + "epoch": 0.7451429799273014, + "grad_norm": 2.0803475379943848, + "learning_rate": 3.2172644437195207e-05, + "loss": 1.3001, + "step": 20807 + }, + { + "epoch": 0.7451787920568697, + "grad_norm": 1.6209428310394287, + "learning_rate": 3.216412182872196e-05, + "loss": 1.0566, + "step": 20808 + }, + { + "epoch": 0.7452146041864379, + "grad_norm": 1.4422683715820312, + "learning_rate": 3.215560013289301e-05, + "loss": 1.1201, + "step": 20809 + }, + { + "epoch": 0.7452504163160062, + "grad_norm": 2.8435590267181396, + "learning_rate": 3.2147079349822925e-05, + "loss": 1.0886, + "step": 20810 + }, + { + "epoch": 0.7452862284455745, + "grad_norm": 1.4438034296035767, + "learning_rate": 3.2138559479626395e-05, + "loss": 1.0883, + "step": 20811 + }, + { + "epoch": 0.7453220405751428, + "grad_norm": 1.4208426475524902, + "learning_rate": 3.2130040522417946e-05, + "loss": 1.1558, + "step": 20812 + }, + { + "epoch": 0.7453578527047111, + "grad_norm": 1.56270432472229, + "learning_rate": 3.212152247831233e-05, + "loss": 1.0034, + "step": 20813 + }, + { + "epoch": 0.7453936648342794, + "grad_norm": 1.3083611726760864, + "learning_rate": 3.211300534742402e-05, + "loss": 1.3148, + "step": 20814 + }, + { + "epoch": 0.7454294769638476, + "grad_norm": 1.4560692310333252, + "learning_rate": 3.210448912986767e-05, + "loss": 1.0838, + "step": 20815 + }, + { + "epoch": 0.7454652890934159, + "grad_norm": 1.263676404953003, + "learning_rate": 3.209597382575786e-05, + "loss": 1.0057, + "step": 20816 + }, + { + "epoch": 0.7455011012229842, + "grad_norm": 1.3923078775405884, + "learning_rate": 3.208745943520911e-05, + "loss": 0.7861, + "step": 20817 + }, + { + "epoch": 0.7455369133525525, + "grad_norm": 1.4085549116134644, + "learning_rate": 3.207894595833603e-05, + "loss": 1.0879, + "step": 20818 + }, + { + "epoch": 0.7455727254821208, + "grad_norm": 1.5606123208999634, + "learning_rate": 3.207043339525304e-05, + "loss": 1.1897, + "step": 20819 + }, + { + "epoch": 0.7456085376116891, + "grad_norm": 1.3570654392242432, + "learning_rate": 3.206192174607482e-05, + "loss": 0.9823, + "step": 20820 + }, + { + "epoch": 0.7456443497412574, + "grad_norm": 1.4994665384292603, + "learning_rate": 3.205341101091578e-05, + "loss": 0.9552, + "step": 20821 + }, + { + "epoch": 0.7456801618708256, + "grad_norm": 1.4954484701156616, + "learning_rate": 3.2044901189890473e-05, + "loss": 1.1673, + "step": 20822 + }, + { + "epoch": 0.7457159740003939, + "grad_norm": 1.4679653644561768, + "learning_rate": 3.2036392283113304e-05, + "loss": 1.042, + "step": 20823 + }, + { + "epoch": 0.7457517861299622, + "grad_norm": 1.5197848081588745, + "learning_rate": 3.202788429069887e-05, + "loss": 1.1517, + "step": 20824 + }, + { + "epoch": 0.7457875982595304, + "grad_norm": 1.6458570957183838, + "learning_rate": 3.201937721276159e-05, + "loss": 1.1135, + "step": 20825 + }, + { + "epoch": 0.7458234103890988, + "grad_norm": 1.6740614175796509, + "learning_rate": 3.201087104941586e-05, + "loss": 1.0757, + "step": 20826 + }, + { + "epoch": 0.7458592225186671, + "grad_norm": 1.2601100206375122, + "learning_rate": 3.2002365800776154e-05, + "loss": 0.9846, + "step": 20827 + }, + { + "epoch": 0.7458950346482354, + "grad_norm": 1.2941828966140747, + "learning_rate": 3.199386146695691e-05, + "loss": 1.2103, + "step": 20828 + }, + { + "epoch": 0.7459308467778036, + "grad_norm": 1.3667980432510376, + "learning_rate": 3.1985358048072574e-05, + "loss": 1.0665, + "step": 20829 + }, + { + "epoch": 0.7459666589073719, + "grad_norm": 1.2611185312271118, + "learning_rate": 3.197685554423745e-05, + "loss": 0.9849, + "step": 20830 + }, + { + "epoch": 0.7460024710369402, + "grad_norm": 1.5668774843215942, + "learning_rate": 3.1968353955566045e-05, + "loss": 0.9645, + "step": 20831 + }, + { + "epoch": 0.7460382831665084, + "grad_norm": 1.2356603145599365, + "learning_rate": 3.195985328217266e-05, + "loss": 0.8952, + "step": 20832 + }, + { + "epoch": 0.7460740952960768, + "grad_norm": 1.369359016418457, + "learning_rate": 3.1951353524171715e-05, + "loss": 1.1662, + "step": 20833 + }, + { + "epoch": 0.7461099074256451, + "grad_norm": 1.488516926765442, + "learning_rate": 3.194285468167749e-05, + "loss": 1.0994, + "step": 20834 + }, + { + "epoch": 0.7461457195552134, + "grad_norm": 1.440462350845337, + "learning_rate": 3.1934356754804385e-05, + "loss": 1.13, + "step": 20835 + }, + { + "epoch": 0.7461815316847816, + "grad_norm": 1.295182704925537, + "learning_rate": 3.192585974366673e-05, + "loss": 1.0423, + "step": 20836 + }, + { + "epoch": 0.7462173438143499, + "grad_norm": 1.7152410745620728, + "learning_rate": 3.19173636483788e-05, + "loss": 1.0837, + "step": 20837 + }, + { + "epoch": 0.7462531559439182, + "grad_norm": 1.845809817314148, + "learning_rate": 3.190886846905491e-05, + "loss": 1.1331, + "step": 20838 + }, + { + "epoch": 0.7462889680734864, + "grad_norm": 1.5387409925460815, + "learning_rate": 3.190037420580937e-05, + "loss": 1.0077, + "step": 20839 + }, + { + "epoch": 0.7463247802030548, + "grad_norm": 1.4686435461044312, + "learning_rate": 3.1891880858756484e-05, + "loss": 1.1931, + "step": 20840 + }, + { + "epoch": 0.7463605923326231, + "grad_norm": 1.7500072717666626, + "learning_rate": 3.1883388428010465e-05, + "loss": 1.2547, + "step": 20841 + }, + { + "epoch": 0.7463964044621914, + "grad_norm": 1.4618682861328125, + "learning_rate": 3.187489691368558e-05, + "loss": 1.2327, + "step": 20842 + }, + { + "epoch": 0.7464322165917596, + "grad_norm": 1.4949074983596802, + "learning_rate": 3.186640631589611e-05, + "loss": 0.9966, + "step": 20843 + }, + { + "epoch": 0.7464680287213279, + "grad_norm": 1.3706756830215454, + "learning_rate": 3.1857916634756234e-05, + "loss": 1.0175, + "step": 20844 + }, + { + "epoch": 0.7465038408508962, + "grad_norm": 1.7542420625686646, + "learning_rate": 3.184942787038019e-05, + "loss": 1.343, + "step": 20845 + }, + { + "epoch": 0.7465396529804644, + "grad_norm": 1.5104339122772217, + "learning_rate": 3.184094002288219e-05, + "loss": 1.0357, + "step": 20846 + }, + { + "epoch": 0.7465754651100328, + "grad_norm": 1.3253613710403442, + "learning_rate": 3.1832453092376446e-05, + "loss": 0.9907, + "step": 20847 + }, + { + "epoch": 0.7466112772396011, + "grad_norm": 1.7005281448364258, + "learning_rate": 3.182396707897709e-05, + "loss": 0.9961, + "step": 20848 + }, + { + "epoch": 0.7466470893691693, + "grad_norm": 1.3774032592773438, + "learning_rate": 3.1815481982798324e-05, + "loss": 1.0461, + "step": 20849 + }, + { + "epoch": 0.7466829014987376, + "grad_norm": 1.2603265047073364, + "learning_rate": 3.1806997803954316e-05, + "loss": 0.8582, + "step": 20850 + }, + { + "epoch": 0.7467187136283059, + "grad_norm": 1.5095577239990234, + "learning_rate": 3.1798514542559164e-05, + "loss": 0.9622, + "step": 20851 + }, + { + "epoch": 0.7467545257578742, + "grad_norm": 1.6598126888275146, + "learning_rate": 3.1790032198727014e-05, + "loss": 1.1675, + "step": 20852 + }, + { + "epoch": 0.7467903378874424, + "grad_norm": 1.3877713680267334, + "learning_rate": 3.178155077257201e-05, + "loss": 0.9437, + "step": 20853 + }, + { + "epoch": 0.7468261500170108, + "grad_norm": 1.5436302423477173, + "learning_rate": 3.177307026420827e-05, + "loss": 1.1263, + "step": 20854 + }, + { + "epoch": 0.7468619621465791, + "grad_norm": 1.2155293226242065, + "learning_rate": 3.176459067374984e-05, + "loss": 0.9724, + "step": 20855 + }, + { + "epoch": 0.7468977742761473, + "grad_norm": 1.8221547603607178, + "learning_rate": 3.175611200131081e-05, + "loss": 0.9989, + "step": 20856 + }, + { + "epoch": 0.7469335864057156, + "grad_norm": 1.6790003776550293, + "learning_rate": 3.174763424700528e-05, + "loss": 1.149, + "step": 20857 + }, + { + "epoch": 0.7469693985352839, + "grad_norm": 1.521775245666504, + "learning_rate": 3.1739157410947316e-05, + "loss": 1.1601, + "step": 20858 + }, + { + "epoch": 0.7470052106648521, + "grad_norm": 1.665844440460205, + "learning_rate": 3.173068149325091e-05, + "loss": 1.1875, + "step": 20859 + }, + { + "epoch": 0.7470410227944204, + "grad_norm": 2.1629867553710938, + "learning_rate": 3.172220649403011e-05, + "loss": 1.2133, + "step": 20860 + }, + { + "epoch": 0.7470768349239888, + "grad_norm": 1.6019505262374878, + "learning_rate": 3.1713732413399e-05, + "loss": 1.1952, + "step": 20861 + }, + { + "epoch": 0.7471126470535571, + "grad_norm": 1.6149917840957642, + "learning_rate": 3.1705259251471496e-05, + "loss": 1.1162, + "step": 20862 + }, + { + "epoch": 0.7471484591831253, + "grad_norm": 2.446049451828003, + "learning_rate": 3.169678700836164e-05, + "loss": 1.073, + "step": 20863 + }, + { + "epoch": 0.7471842713126936, + "grad_norm": 2.0208568572998047, + "learning_rate": 3.168831568418341e-05, + "loss": 1.2488, + "step": 20864 + }, + { + "epoch": 0.7472200834422619, + "grad_norm": 1.4592812061309814, + "learning_rate": 3.16798452790508e-05, + "loss": 1.0042, + "step": 20865 + }, + { + "epoch": 0.7472558955718301, + "grad_norm": 1.3336427211761475, + "learning_rate": 3.167137579307773e-05, + "loss": 1.147, + "step": 20866 + }, + { + "epoch": 0.7472917077013984, + "grad_norm": 1.4869303703308105, + "learning_rate": 3.1662907226378145e-05, + "loss": 1.1543, + "step": 20867 + }, + { + "epoch": 0.7473275198309668, + "grad_norm": 1.257930874824524, + "learning_rate": 3.165443957906603e-05, + "loss": 0.9177, + "step": 20868 + }, + { + "epoch": 0.7473633319605351, + "grad_norm": 1.323584794998169, + "learning_rate": 3.164597285125525e-05, + "loss": 1.1083, + "step": 20869 + }, + { + "epoch": 0.7473991440901033, + "grad_norm": 1.5214533805847168, + "learning_rate": 3.163750704305972e-05, + "loss": 1.1075, + "step": 20870 + }, + { + "epoch": 0.7474349562196716, + "grad_norm": 2.071115016937256, + "learning_rate": 3.162904215459336e-05, + "loss": 1.096, + "step": 20871 + }, + { + "epoch": 0.7474707683492399, + "grad_norm": 1.4004337787628174, + "learning_rate": 3.1620578185970075e-05, + "loss": 1.095, + "step": 20872 + }, + { + "epoch": 0.7475065804788081, + "grad_norm": 1.4025506973266602, + "learning_rate": 3.161211513730368e-05, + "loss": 1.0081, + "step": 20873 + }, + { + "epoch": 0.7475423926083764, + "grad_norm": 1.7409414052963257, + "learning_rate": 3.160365300870804e-05, + "loss": 0.9999, + "step": 20874 + }, + { + "epoch": 0.7475782047379448, + "grad_norm": 1.7843314409255981, + "learning_rate": 3.159519180029705e-05, + "loss": 1.2475, + "step": 20875 + }, + { + "epoch": 0.747614016867513, + "grad_norm": 1.385764479637146, + "learning_rate": 3.1586731512184545e-05, + "loss": 0.8812, + "step": 20876 + }, + { + "epoch": 0.7476498289970813, + "grad_norm": 1.4643163681030273, + "learning_rate": 3.157827214448428e-05, + "loss": 1.1418, + "step": 20877 + }, + { + "epoch": 0.7476856411266496, + "grad_norm": 1.4580092430114746, + "learning_rate": 3.1569813697310115e-05, + "loss": 1.1349, + "step": 20878 + }, + { + "epoch": 0.7477214532562179, + "grad_norm": 1.448617935180664, + "learning_rate": 3.156135617077587e-05, + "loss": 1.0721, + "step": 20879 + }, + { + "epoch": 0.7477572653857861, + "grad_norm": 1.3566559553146362, + "learning_rate": 3.155289956499525e-05, + "loss": 1.0389, + "step": 20880 + }, + { + "epoch": 0.7477930775153544, + "grad_norm": 1.3068296909332275, + "learning_rate": 3.15444438800821e-05, + "loss": 1.1844, + "step": 20881 + }, + { + "epoch": 0.7478288896449228, + "grad_norm": 1.7540391683578491, + "learning_rate": 3.1535989116150146e-05, + "loss": 1.2183, + "step": 20882 + }, + { + "epoch": 0.747864701774491, + "grad_norm": 1.3821645975112915, + "learning_rate": 3.1527535273313166e-05, + "loss": 1.0055, + "step": 20883 + }, + { + "epoch": 0.7479005139040593, + "grad_norm": 1.9945546388626099, + "learning_rate": 3.151908235168486e-05, + "loss": 1.2965, + "step": 20884 + }, + { + "epoch": 0.7479363260336276, + "grad_norm": 1.542799949645996, + "learning_rate": 3.151063035137896e-05, + "loss": 1.044, + "step": 20885 + }, + { + "epoch": 0.7479721381631959, + "grad_norm": 1.3149381875991821, + "learning_rate": 3.1502179272509216e-05, + "loss": 1.0118, + "step": 20886 + }, + { + "epoch": 0.7480079502927641, + "grad_norm": 1.8461610078811646, + "learning_rate": 3.149372911518926e-05, + "loss": 1.1919, + "step": 20887 + }, + { + "epoch": 0.7480437624223324, + "grad_norm": 1.6685384511947632, + "learning_rate": 3.1485279879532826e-05, + "loss": 1.0635, + "step": 20888 + }, + { + "epoch": 0.7480795745519008, + "grad_norm": 1.4586408138275146, + "learning_rate": 3.147683156565355e-05, + "loss": 1.1333, + "step": 20889 + }, + { + "epoch": 0.748115386681469, + "grad_norm": 1.3198336362838745, + "learning_rate": 3.146838417366517e-05, + "loss": 1.2275, + "step": 20890 + }, + { + "epoch": 0.7481511988110373, + "grad_norm": 1.480590581893921, + "learning_rate": 3.145993770368124e-05, + "loss": 1.0374, + "step": 20891 + }, + { + "epoch": 0.7481870109406056, + "grad_norm": 1.4779293537139893, + "learning_rate": 3.1451492155815444e-05, + "loss": 0.9125, + "step": 20892 + }, + { + "epoch": 0.7482228230701738, + "grad_norm": 1.3439967632293701, + "learning_rate": 3.1443047530181394e-05, + "loss": 1.1718, + "step": 20893 + }, + { + "epoch": 0.7482586351997421, + "grad_norm": 1.163085699081421, + "learning_rate": 3.143460382689274e-05, + "loss": 1.0083, + "step": 20894 + }, + { + "epoch": 0.7482944473293104, + "grad_norm": 1.2833162546157837, + "learning_rate": 3.142616104606304e-05, + "loss": 0.9064, + "step": 20895 + }, + { + "epoch": 0.7483302594588788, + "grad_norm": 1.3105812072753906, + "learning_rate": 3.141771918780584e-05, + "loss": 0.9777, + "step": 20896 + }, + { + "epoch": 0.748366071588447, + "grad_norm": 1.349693775177002, + "learning_rate": 3.140927825223482e-05, + "loss": 1.0166, + "step": 20897 + }, + { + "epoch": 0.7484018837180153, + "grad_norm": 1.5421884059906006, + "learning_rate": 3.140083823946346e-05, + "loss": 1.1317, + "step": 20898 + }, + { + "epoch": 0.7484376958475836, + "grad_norm": 1.2557986974716187, + "learning_rate": 3.139239914960532e-05, + "loss": 1.087, + "step": 20899 + }, + { + "epoch": 0.7484735079771518, + "grad_norm": 1.4242583513259888, + "learning_rate": 3.138396098277396e-05, + "loss": 1.0302, + "step": 20900 + }, + { + "epoch": 0.7485093201067201, + "grad_norm": 1.7467617988586426, + "learning_rate": 3.1375523739082936e-05, + "loss": 1.0524, + "step": 20901 + }, + { + "epoch": 0.7485451322362884, + "grad_norm": 1.4077563285827637, + "learning_rate": 3.136708741864568e-05, + "loss": 0.8562, + "step": 20902 + }, + { + "epoch": 0.7485809443658568, + "grad_norm": 1.3483394384384155, + "learning_rate": 3.135865202157574e-05, + "loss": 1.1495, + "step": 20903 + }, + { + "epoch": 0.748616756495425, + "grad_norm": 1.7903809547424316, + "learning_rate": 3.135021754798663e-05, + "loss": 1.2466, + "step": 20904 + }, + { + "epoch": 0.7486525686249933, + "grad_norm": 1.4711846113204956, + "learning_rate": 3.134178399799175e-05, + "loss": 1.0716, + "step": 20905 + }, + { + "epoch": 0.7486883807545616, + "grad_norm": 1.6307836771011353, + "learning_rate": 3.1333351371704634e-05, + "loss": 1.1431, + "step": 20906 + }, + { + "epoch": 0.7487241928841298, + "grad_norm": 2.0376131534576416, + "learning_rate": 3.132491966923864e-05, + "loss": 1.3571, + "step": 20907 + }, + { + "epoch": 0.7487600050136981, + "grad_norm": 1.2945294380187988, + "learning_rate": 3.131648889070734e-05, + "loss": 1.0714, + "step": 20908 + }, + { + "epoch": 0.7487958171432664, + "grad_norm": 1.2782175540924072, + "learning_rate": 3.130805903622405e-05, + "loss": 1.1527, + "step": 20909 + }, + { + "epoch": 0.7488316292728348, + "grad_norm": 1.5014851093292236, + "learning_rate": 3.129963010590224e-05, + "loss": 1.0808, + "step": 20910 + }, + { + "epoch": 0.748867441402403, + "grad_norm": 1.5921111106872559, + "learning_rate": 3.1291202099855245e-05, + "loss": 1.1654, + "step": 20911 + }, + { + "epoch": 0.7489032535319713, + "grad_norm": 1.3893144130706787, + "learning_rate": 3.1282775018196554e-05, + "loss": 1.1737, + "step": 20912 + }, + { + "epoch": 0.7489390656615396, + "grad_norm": 1.526964783668518, + "learning_rate": 3.127434886103948e-05, + "loss": 1.0706, + "step": 20913 + }, + { + "epoch": 0.7489748777911078, + "grad_norm": 1.2971441745758057, + "learning_rate": 3.1265923628497327e-05, + "loss": 1.007, + "step": 20914 + }, + { + "epoch": 0.7490106899206761, + "grad_norm": 1.6754242181777954, + "learning_rate": 3.125749932068359e-05, + "loss": 1.055, + "step": 20915 + }, + { + "epoch": 0.7490465020502444, + "grad_norm": 1.2948675155639648, + "learning_rate": 3.124907593771148e-05, + "loss": 1.0593, + "step": 20916 + }, + { + "epoch": 0.7490823141798127, + "grad_norm": 2.152395248413086, + "learning_rate": 3.1240653479694415e-05, + "loss": 0.9752, + "step": 20917 + }, + { + "epoch": 0.749118126309381, + "grad_norm": 1.4037370681762695, + "learning_rate": 3.123223194674559e-05, + "loss": 1.0397, + "step": 20918 + }, + { + "epoch": 0.7491539384389493, + "grad_norm": 1.500752568244934, + "learning_rate": 3.122381133897846e-05, + "loss": 1.2659, + "step": 20919 + }, + { + "epoch": 0.7491897505685176, + "grad_norm": 1.3688267469406128, + "learning_rate": 3.121539165650619e-05, + "loss": 0.8808, + "step": 20920 + }, + { + "epoch": 0.7492255626980858, + "grad_norm": 1.3679355382919312, + "learning_rate": 3.120697289944213e-05, + "loss": 1.0853, + "step": 20921 + }, + { + "epoch": 0.7492613748276541, + "grad_norm": 1.56710946559906, + "learning_rate": 3.119855506789948e-05, + "loss": 1.1414, + "step": 20922 + }, + { + "epoch": 0.7492971869572224, + "grad_norm": 1.8395928144454956, + "learning_rate": 3.1190138161991536e-05, + "loss": 1.1231, + "step": 20923 + }, + { + "epoch": 0.7493329990867907, + "grad_norm": 1.86371910572052, + "learning_rate": 3.118172218183154e-05, + "loss": 1.0894, + "step": 20924 + }, + { + "epoch": 0.749368811216359, + "grad_norm": 2.1009275913238525, + "learning_rate": 3.117330712753265e-05, + "loss": 1.1447, + "step": 20925 + }, + { + "epoch": 0.7494046233459273, + "grad_norm": 1.438523292541504, + "learning_rate": 3.11648929992082e-05, + "loss": 1.1179, + "step": 20926 + }, + { + "epoch": 0.7494404354754955, + "grad_norm": 1.4518535137176514, + "learning_rate": 3.115647979697128e-05, + "loss": 0.9671, + "step": 20927 + }, + { + "epoch": 0.7494762476050638, + "grad_norm": 1.1192028522491455, + "learning_rate": 3.114806752093517e-05, + "loss": 1.0496, + "step": 20928 + }, + { + "epoch": 0.7495120597346321, + "grad_norm": 1.6060240268707275, + "learning_rate": 3.113965617121291e-05, + "loss": 1.0852, + "step": 20929 + }, + { + "epoch": 0.7495478718642004, + "grad_norm": 1.4159486293792725, + "learning_rate": 3.1131245747917835e-05, + "loss": 1.1485, + "step": 20930 + }, + { + "epoch": 0.7495836839937687, + "grad_norm": 1.3804540634155273, + "learning_rate": 3.1122836251163014e-05, + "loss": 1.0561, + "step": 20931 + }, + { + "epoch": 0.749619496123337, + "grad_norm": 1.3224543333053589, + "learning_rate": 3.111442768106155e-05, + "loss": 1.1846, + "step": 20932 + }, + { + "epoch": 0.7496553082529053, + "grad_norm": 1.3585033416748047, + "learning_rate": 3.1106020037726615e-05, + "loss": 1.0127, + "step": 20933 + }, + { + "epoch": 0.7496911203824735, + "grad_norm": 1.426831603050232, + "learning_rate": 3.1097613321271304e-05, + "loss": 1.0038, + "step": 20934 + }, + { + "epoch": 0.7497269325120418, + "grad_norm": 1.5209718942642212, + "learning_rate": 3.108920753180875e-05, + "loss": 1.1237, + "step": 20935 + }, + { + "epoch": 0.7497627446416101, + "grad_norm": 1.5799363851547241, + "learning_rate": 3.1080802669452e-05, + "loss": 1.141, + "step": 20936 + }, + { + "epoch": 0.7497985567711783, + "grad_norm": 1.7943127155303955, + "learning_rate": 3.107239873431416e-05, + "loss": 0.9864, + "step": 20937 + }, + { + "epoch": 0.7498343689007467, + "grad_norm": 1.7909858226776123, + "learning_rate": 3.1063995726508296e-05, + "loss": 1.0059, + "step": 20938 + }, + { + "epoch": 0.749870181030315, + "grad_norm": 1.7079746723175049, + "learning_rate": 3.105559364614743e-05, + "loss": 1.0691, + "step": 20939 + }, + { + "epoch": 0.7499059931598833, + "grad_norm": 1.9717631340026855, + "learning_rate": 3.1047192493344624e-05, + "loss": 1.0404, + "step": 20940 + }, + { + "epoch": 0.7499418052894515, + "grad_norm": 1.4736171960830688, + "learning_rate": 3.103879226821289e-05, + "loss": 0.9882, + "step": 20941 + }, + { + "epoch": 0.7499776174190198, + "grad_norm": 2.136772632598877, + "learning_rate": 3.1030392970865286e-05, + "loss": 0.9715, + "step": 20942 + }, + { + "epoch": 0.7500134295485881, + "grad_norm": 1.29421067237854, + "learning_rate": 3.102199460141475e-05, + "loss": 1.0455, + "step": 20943 + }, + { + "epoch": 0.7500492416781563, + "grad_norm": 1.5606807470321655, + "learning_rate": 3.1013597159974304e-05, + "loss": 1.0714, + "step": 20944 + }, + { + "epoch": 0.7500850538077247, + "grad_norm": 1.2176628112792969, + "learning_rate": 3.1005200646656915e-05, + "loss": 0.97, + "step": 20945 + }, + { + "epoch": 0.750120865937293, + "grad_norm": 1.6160095930099487, + "learning_rate": 3.09968050615756e-05, + "loss": 0.962, + "step": 20946 + }, + { + "epoch": 0.7501566780668613, + "grad_norm": 1.6263364553451538, + "learning_rate": 3.0988410404843216e-05, + "loss": 1.3375, + "step": 20947 + }, + { + "epoch": 0.7501924901964295, + "grad_norm": 1.9483253955841064, + "learning_rate": 3.0980016676572766e-05, + "loss": 1.1717, + "step": 20948 + }, + { + "epoch": 0.7502283023259978, + "grad_norm": 1.3582777976989746, + "learning_rate": 3.097162387687719e-05, + "loss": 1.0093, + "step": 20949 + }, + { + "epoch": 0.7502641144555661, + "grad_norm": 1.8927230834960938, + "learning_rate": 3.096323200586934e-05, + "loss": 0.9686, + "step": 20950 + }, + { + "epoch": 0.7502999265851343, + "grad_norm": 1.6169874668121338, + "learning_rate": 3.0954841063662145e-05, + "loss": 1.1463, + "step": 20951 + }, + { + "epoch": 0.7503357387147027, + "grad_norm": 1.6870393753051758, + "learning_rate": 3.094645105036851e-05, + "loss": 0.9837, + "step": 20952 + }, + { + "epoch": 0.750371550844271, + "grad_norm": 1.612646460533142, + "learning_rate": 3.093806196610134e-05, + "loss": 1.2073, + "step": 20953 + }, + { + "epoch": 0.7504073629738393, + "grad_norm": 1.506817102432251, + "learning_rate": 3.092967381097342e-05, + "loss": 0.9585, + "step": 20954 + }, + { + "epoch": 0.7504431751034075, + "grad_norm": 1.439280390739441, + "learning_rate": 3.092128658509765e-05, + "loss": 1.0861, + "step": 20955 + }, + { + "epoch": 0.7504789872329758, + "grad_norm": 1.9788395166397095, + "learning_rate": 3.09129002885869e-05, + "loss": 1.193, + "step": 20956 + }, + { + "epoch": 0.7505147993625441, + "grad_norm": 1.5183392763137817, + "learning_rate": 3.090451492155392e-05, + "loss": 1.1964, + "step": 20957 + }, + { + "epoch": 0.7505506114921123, + "grad_norm": 1.2046760320663452, + "learning_rate": 3.089613048411158e-05, + "loss": 1.008, + "step": 20958 + }, + { + "epoch": 0.7505864236216807, + "grad_norm": 1.610807180404663, + "learning_rate": 3.088774697637265e-05, + "loss": 1.063, + "step": 20959 + }, + { + "epoch": 0.750622235751249, + "grad_norm": 1.6565982103347778, + "learning_rate": 3.087936439844997e-05, + "loss": 1.015, + "step": 20960 + }, + { + "epoch": 0.7506580478808172, + "grad_norm": 1.288490891456604, + "learning_rate": 3.087098275045626e-05, + "loss": 0.9697, + "step": 20961 + }, + { + "epoch": 0.7506938600103855, + "grad_norm": 1.7602897882461548, + "learning_rate": 3.08626020325043e-05, + "loss": 1.2216, + "step": 20962 + }, + { + "epoch": 0.7507296721399538, + "grad_norm": 1.4429841041564941, + "learning_rate": 3.0854222244706857e-05, + "loss": 1.1163, + "step": 20963 + }, + { + "epoch": 0.750765484269522, + "grad_norm": 1.4442590475082397, + "learning_rate": 3.0845843387176686e-05, + "loss": 1.1517, + "step": 20964 + }, + { + "epoch": 0.7508012963990903, + "grad_norm": 1.303200125694275, + "learning_rate": 3.083746546002646e-05, + "loss": 1.0145, + "step": 20965 + }, + { + "epoch": 0.7508371085286587, + "grad_norm": 1.3972114324569702, + "learning_rate": 3.082908846336891e-05, + "loss": 1.0579, + "step": 20966 + }, + { + "epoch": 0.750872920658227, + "grad_norm": 1.3351380825042725, + "learning_rate": 3.082071239731681e-05, + "loss": 1.0318, + "step": 20967 + }, + { + "epoch": 0.7509087327877952, + "grad_norm": 1.2624561786651611, + "learning_rate": 3.0812337261982735e-05, + "loss": 1.1028, + "step": 20968 + }, + { + "epoch": 0.7509445449173635, + "grad_norm": 1.675942301750183, + "learning_rate": 3.080396305747942e-05, + "loss": 1.1444, + "step": 20969 + }, + { + "epoch": 0.7509803570469318, + "grad_norm": 1.2963768243789673, + "learning_rate": 3.0795589783919543e-05, + "loss": 0.9633, + "step": 20970 + }, + { + "epoch": 0.7510161691765, + "grad_norm": 1.353088617324829, + "learning_rate": 3.078721744141575e-05, + "loss": 0.9817, + "step": 20971 + }, + { + "epoch": 0.7510519813060683, + "grad_norm": 1.5567466020584106, + "learning_rate": 3.0778846030080644e-05, + "loss": 1.1597, + "step": 20972 + }, + { + "epoch": 0.7510877934356367, + "grad_norm": 1.3224456310272217, + "learning_rate": 3.077047555002688e-05, + "loss": 1.0953, + "step": 20973 + }, + { + "epoch": 0.751123605565205, + "grad_norm": 1.7031227350234985, + "learning_rate": 3.0762106001367095e-05, + "loss": 1.0062, + "step": 20974 + }, + { + "epoch": 0.7511594176947732, + "grad_norm": 1.7156298160552979, + "learning_rate": 3.075373738421383e-05, + "loss": 0.9972, + "step": 20975 + }, + { + "epoch": 0.7511952298243415, + "grad_norm": 1.5652635097503662, + "learning_rate": 3.0745369698679715e-05, + "loss": 1.023, + "step": 20976 + }, + { + "epoch": 0.7512310419539098, + "grad_norm": 1.451871395111084, + "learning_rate": 3.0737002944877314e-05, + "loss": 0.9136, + "step": 20977 + }, + { + "epoch": 0.751266854083478, + "grad_norm": 1.677382469177246, + "learning_rate": 3.072863712291922e-05, + "loss": 1.0795, + "step": 20978 + }, + { + "epoch": 0.7513026662130463, + "grad_norm": 1.2265416383743286, + "learning_rate": 3.0720272232917934e-05, + "loss": 1.1733, + "step": 20979 + }, + { + "epoch": 0.7513384783426147, + "grad_norm": 1.213577389717102, + "learning_rate": 3.071190827498602e-05, + "loss": 1.0483, + "step": 20980 + }, + { + "epoch": 0.751374290472183, + "grad_norm": 1.545878291130066, + "learning_rate": 3.070354524923601e-05, + "loss": 1.0941, + "step": 20981 + }, + { + "epoch": 0.7514101026017512, + "grad_norm": 1.5199263095855713, + "learning_rate": 3.0695183155780435e-05, + "loss": 1.016, + "step": 20982 + }, + { + "epoch": 0.7514459147313195, + "grad_norm": 1.5286816358566284, + "learning_rate": 3.068682199473175e-05, + "loss": 1.039, + "step": 20983 + }, + { + "epoch": 0.7514817268608878, + "grad_norm": 1.3497086763381958, + "learning_rate": 3.067846176620247e-05, + "loss": 0.9017, + "step": 20984 + }, + { + "epoch": 0.751517538990456, + "grad_norm": 2.178708076477051, + "learning_rate": 3.06701024703051e-05, + "loss": 1.1987, + "step": 20985 + }, + { + "epoch": 0.7515533511200243, + "grad_norm": 2.0252346992492676, + "learning_rate": 3.0661744107152025e-05, + "loss": 1.1174, + "step": 20986 + }, + { + "epoch": 0.7515891632495927, + "grad_norm": 1.6417772769927979, + "learning_rate": 3.0653386676855756e-05, + "loss": 0.9974, + "step": 20987 + }, + { + "epoch": 0.751624975379161, + "grad_norm": 1.7295500040054321, + "learning_rate": 3.064503017952871e-05, + "loss": 1.0646, + "step": 20988 + }, + { + "epoch": 0.7516607875087292, + "grad_norm": 1.7699017524719238, + "learning_rate": 3.0636674615283364e-05, + "loss": 1.1304, + "step": 20989 + }, + { + "epoch": 0.7516965996382975, + "grad_norm": 1.5585122108459473, + "learning_rate": 3.0628319984232056e-05, + "loss": 1.1616, + "step": 20990 + }, + { + "epoch": 0.7517324117678658, + "grad_norm": 1.396611213684082, + "learning_rate": 3.061996628648721e-05, + "loss": 0.904, + "step": 20991 + }, + { + "epoch": 0.751768223897434, + "grad_norm": 1.329169511795044, + "learning_rate": 3.0611613522161266e-05, + "loss": 0.8975, + "step": 20992 + }, + { + "epoch": 0.7518040360270023, + "grad_norm": 1.6459530591964722, + "learning_rate": 3.0603261691366525e-05, + "loss": 1.0489, + "step": 20993 + }, + { + "epoch": 0.7518398481565707, + "grad_norm": 2.081455945968628, + "learning_rate": 3.05949107942154e-05, + "loss": 1.0902, + "step": 20994 + }, + { + "epoch": 0.751875660286139, + "grad_norm": 1.528835654258728, + "learning_rate": 3.0586560830820174e-05, + "loss": 1.1843, + "step": 20995 + }, + { + "epoch": 0.7519114724157072, + "grad_norm": 1.5455125570297241, + "learning_rate": 3.05782118012933e-05, + "loss": 1.0066, + "step": 20996 + }, + { + "epoch": 0.7519472845452755, + "grad_norm": 1.3105357885360718, + "learning_rate": 3.0569863705747004e-05, + "loss": 1.1823, + "step": 20997 + }, + { + "epoch": 0.7519830966748438, + "grad_norm": 2.499936103820801, + "learning_rate": 3.0561516544293634e-05, + "loss": 1.3452, + "step": 20998 + }, + { + "epoch": 0.752018908804412, + "grad_norm": 1.6225379705429077, + "learning_rate": 3.0553170317045485e-05, + "loss": 0.8993, + "step": 20999 + }, + { + "epoch": 0.7520547209339803, + "grad_norm": 1.9764049053192139, + "learning_rate": 3.054482502411489e-05, + "loss": 1.2129, + "step": 21000 + }, + { + "epoch": 0.7520905330635487, + "grad_norm": 2.051246166229248, + "learning_rate": 3.0536480665614075e-05, + "loss": 1.2735, + "step": 21001 + }, + { + "epoch": 0.7521263451931169, + "grad_norm": 1.653026819229126, + "learning_rate": 3.052813724165525e-05, + "loss": 0.9925, + "step": 21002 + }, + { + "epoch": 0.7521621573226852, + "grad_norm": 2.0560805797576904, + "learning_rate": 3.051979475235078e-05, + "loss": 0.8467, + "step": 21003 + }, + { + "epoch": 0.7521979694522535, + "grad_norm": 1.3173887729644775, + "learning_rate": 3.0511453197812834e-05, + "loss": 0.9855, + "step": 21004 + }, + { + "epoch": 0.7522337815818217, + "grad_norm": 1.2984422445297241, + "learning_rate": 3.050311257815368e-05, + "loss": 1.0368, + "step": 21005 + }, + { + "epoch": 0.75226959371139, + "grad_norm": 1.4678301811218262, + "learning_rate": 3.0494772893485435e-05, + "loss": 0.9454, + "step": 21006 + }, + { + "epoch": 0.7523054058409583, + "grad_norm": 1.8313902616500854, + "learning_rate": 3.0486434143920428e-05, + "loss": 0.974, + "step": 21007 + }, + { + "epoch": 0.7523412179705267, + "grad_norm": 1.6561554670333862, + "learning_rate": 3.047809632957075e-05, + "loss": 0.9832, + "step": 21008 + }, + { + "epoch": 0.7523770301000949, + "grad_norm": 1.689245343208313, + "learning_rate": 3.0469759450548607e-05, + "loss": 1.007, + "step": 21009 + }, + { + "epoch": 0.7524128422296632, + "grad_norm": 1.2818443775177002, + "learning_rate": 3.0461423506966203e-05, + "loss": 0.8958, + "step": 21010 + }, + { + "epoch": 0.7524486543592315, + "grad_norm": 1.2511242628097534, + "learning_rate": 3.0453088498935612e-05, + "loss": 1.1434, + "step": 21011 + }, + { + "epoch": 0.7524844664887997, + "grad_norm": 1.5283770561218262, + "learning_rate": 3.0444754426569032e-05, + "loss": 1.1249, + "step": 21012 + }, + { + "epoch": 0.752520278618368, + "grad_norm": 1.8794654607772827, + "learning_rate": 3.04364212899785e-05, + "loss": 1.0647, + "step": 21013 + }, + { + "epoch": 0.7525560907479363, + "grad_norm": 1.3857914209365845, + "learning_rate": 3.0428089089276257e-05, + "loss": 1.2904, + "step": 21014 + }, + { + "epoch": 0.7525919028775047, + "grad_norm": 1.4470744132995605, + "learning_rate": 3.04197578245743e-05, + "loss": 1.1864, + "step": 21015 + }, + { + "epoch": 0.7526277150070729, + "grad_norm": 2.0248587131500244, + "learning_rate": 3.041142749598479e-05, + "loss": 1.1898, + "step": 21016 + }, + { + "epoch": 0.7526635271366412, + "grad_norm": 2.1205966472625732, + "learning_rate": 3.0403098103619687e-05, + "loss": 1.1013, + "step": 21017 + }, + { + "epoch": 0.7526993392662095, + "grad_norm": 1.3696019649505615, + "learning_rate": 3.0394769647591194e-05, + "loss": 1.0895, + "step": 21018 + }, + { + "epoch": 0.7527351513957777, + "grad_norm": 1.576317548751831, + "learning_rate": 3.0386442128011282e-05, + "loss": 0.9676, + "step": 21019 + }, + { + "epoch": 0.752770963525346, + "grad_norm": 1.3360785245895386, + "learning_rate": 3.037811554499197e-05, + "loss": 0.8244, + "step": 21020 + }, + { + "epoch": 0.7528067756549143, + "grad_norm": 1.4227200746536255, + "learning_rate": 3.0369789898645306e-05, + "loss": 1.0935, + "step": 21021 + }, + { + "epoch": 0.7528425877844827, + "grad_norm": 1.4761983156204224, + "learning_rate": 3.0361465189083305e-05, + "loss": 1.0717, + "step": 21022 + }, + { + "epoch": 0.7528783999140509, + "grad_norm": 1.523912787437439, + "learning_rate": 3.0353141416417997e-05, + "loss": 1.0702, + "step": 21023 + }, + { + "epoch": 0.7529142120436192, + "grad_norm": 1.9064102172851562, + "learning_rate": 3.034481858076127e-05, + "loss": 1.0687, + "step": 21024 + }, + { + "epoch": 0.7529500241731875, + "grad_norm": 1.406378984451294, + "learning_rate": 3.0336496682225214e-05, + "loss": 1.0647, + "step": 21025 + }, + { + "epoch": 0.7529858363027557, + "grad_norm": 1.2823224067687988, + "learning_rate": 3.0328175720921715e-05, + "loss": 0.9595, + "step": 21026 + }, + { + "epoch": 0.753021648432324, + "grad_norm": 1.4559274911880493, + "learning_rate": 3.0319855696962762e-05, + "loss": 1.1159, + "step": 21027 + }, + { + "epoch": 0.7530574605618923, + "grad_norm": 1.4471408128738403, + "learning_rate": 3.0311536610460245e-05, + "loss": 1.234, + "step": 21028 + }, + { + "epoch": 0.7530932726914606, + "grad_norm": 1.5680036544799805, + "learning_rate": 3.0303218461526116e-05, + "loss": 1.1759, + "step": 21029 + }, + { + "epoch": 0.7531290848210289, + "grad_norm": 1.926969051361084, + "learning_rate": 3.02949012502723e-05, + "loss": 1.1671, + "step": 21030 + }, + { + "epoch": 0.7531648969505972, + "grad_norm": 1.5298537015914917, + "learning_rate": 3.028658497681065e-05, + "loss": 1.0697, + "step": 21031 + }, + { + "epoch": 0.7532007090801655, + "grad_norm": 1.6236733198165894, + "learning_rate": 3.0278269641253075e-05, + "loss": 1.0851, + "step": 21032 + }, + { + "epoch": 0.7532365212097337, + "grad_norm": 1.4976698160171509, + "learning_rate": 3.0269955243711457e-05, + "loss": 0.9856, + "step": 21033 + }, + { + "epoch": 0.753272333339302, + "grad_norm": 1.3764770030975342, + "learning_rate": 3.0261641784297666e-05, + "loss": 1.211, + "step": 21034 + }, + { + "epoch": 0.7533081454688703, + "grad_norm": 1.4766806364059448, + "learning_rate": 3.0253329263123497e-05, + "loss": 1.0279, + "step": 21035 + }, + { + "epoch": 0.7533439575984386, + "grad_norm": 1.5691957473754883, + "learning_rate": 3.0245017680300813e-05, + "loss": 0.8995, + "step": 21036 + }, + { + "epoch": 0.7533797697280069, + "grad_norm": 1.5414314270019531, + "learning_rate": 3.0236707035941482e-05, + "loss": 1.173, + "step": 21037 + }, + { + "epoch": 0.7534155818575752, + "grad_norm": 1.3493143320083618, + "learning_rate": 3.0228397330157233e-05, + "loss": 0.9206, + "step": 21038 + }, + { + "epoch": 0.7534513939871434, + "grad_norm": 1.3606843948364258, + "learning_rate": 3.022008856305989e-05, + "loss": 1.2182, + "step": 21039 + }, + { + "epoch": 0.7534872061167117, + "grad_norm": 1.3953591585159302, + "learning_rate": 3.0211780734761254e-05, + "loss": 0.8365, + "step": 21040 + }, + { + "epoch": 0.75352301824628, + "grad_norm": 1.3377355337142944, + "learning_rate": 3.020347384537312e-05, + "loss": 1.1269, + "step": 21041 + }, + { + "epoch": 0.7535588303758483, + "grad_norm": 1.343869686126709, + "learning_rate": 3.019516789500718e-05, + "loss": 1.0539, + "step": 21042 + }, + { + "epoch": 0.7535946425054166, + "grad_norm": 1.5031901597976685, + "learning_rate": 3.0186862883775214e-05, + "loss": 0.9696, + "step": 21043 + }, + { + "epoch": 0.7536304546349849, + "grad_norm": 1.6308766603469849, + "learning_rate": 3.017855881178899e-05, + "loss": 1.2924, + "step": 21044 + }, + { + "epoch": 0.7536662667645532, + "grad_norm": 1.6498433351516724, + "learning_rate": 3.0170255679160163e-05, + "loss": 0.9306, + "step": 21045 + }, + { + "epoch": 0.7537020788941214, + "grad_norm": 1.5712462663650513, + "learning_rate": 3.0161953486000473e-05, + "loss": 0.9677, + "step": 21046 + }, + { + "epoch": 0.7537378910236897, + "grad_norm": 1.2979371547698975, + "learning_rate": 3.0153652232421603e-05, + "loss": 1.106, + "step": 21047 + }, + { + "epoch": 0.753773703153258, + "grad_norm": 1.6508067846298218, + "learning_rate": 3.014535191853529e-05, + "loss": 0.9928, + "step": 21048 + }, + { + "epoch": 0.7538095152828262, + "grad_norm": 1.4094442129135132, + "learning_rate": 3.0137052544453126e-05, + "loss": 1.1101, + "step": 21049 + }, + { + "epoch": 0.7538453274123946, + "grad_norm": 1.4826252460479736, + "learning_rate": 3.0128754110286806e-05, + "loss": 1.2355, + "step": 21050 + }, + { + "epoch": 0.7538811395419629, + "grad_norm": 1.2137242555618286, + "learning_rate": 3.012045661614796e-05, + "loss": 1.0531, + "step": 21051 + }, + { + "epoch": 0.7539169516715312, + "grad_norm": 1.4358125925064087, + "learning_rate": 3.0112160062148274e-05, + "loss": 1.114, + "step": 21052 + }, + { + "epoch": 0.7539527638010994, + "grad_norm": 1.3388763666152954, + "learning_rate": 3.01038644483993e-05, + "loss": 1.0064, + "step": 21053 + }, + { + "epoch": 0.7539885759306677, + "grad_norm": 1.779293179512024, + "learning_rate": 3.0095569775012665e-05, + "loss": 0.9809, + "step": 21054 + }, + { + "epoch": 0.754024388060236, + "grad_norm": 1.4945107698440552, + "learning_rate": 3.0087276042099997e-05, + "loss": 1.1722, + "step": 21055 + }, + { + "epoch": 0.7540602001898042, + "grad_norm": 1.3965693712234497, + "learning_rate": 3.007898324977282e-05, + "loss": 0.8805, + "step": 21056 + }, + { + "epoch": 0.7540960123193726, + "grad_norm": 1.563388705253601, + "learning_rate": 3.0070691398142726e-05, + "loss": 1.1379, + "step": 21057 + }, + { + "epoch": 0.7541318244489409, + "grad_norm": 1.7571003437042236, + "learning_rate": 3.0062400487321286e-05, + "loss": 1.1924, + "step": 21058 + }, + { + "epoch": 0.7541676365785092, + "grad_norm": 1.57816743850708, + "learning_rate": 3.0054110517420052e-05, + "loss": 1.1683, + "step": 21059 + }, + { + "epoch": 0.7542034487080774, + "grad_norm": 1.452854037284851, + "learning_rate": 3.004582148855052e-05, + "loss": 1.191, + "step": 21060 + }, + { + "epoch": 0.7542392608376457, + "grad_norm": 1.3565232753753662, + "learning_rate": 3.0037533400824226e-05, + "loss": 0.9389, + "step": 21061 + }, + { + "epoch": 0.754275072967214, + "grad_norm": 1.479698657989502, + "learning_rate": 3.0029246254352694e-05, + "loss": 1.2258, + "step": 21062 + }, + { + "epoch": 0.7543108850967822, + "grad_norm": 1.713167428970337, + "learning_rate": 3.002096004924737e-05, + "loss": 1.1083, + "step": 21063 + }, + { + "epoch": 0.7543466972263506, + "grad_norm": 1.7019219398498535, + "learning_rate": 3.0012674785619766e-05, + "loss": 1.1233, + "step": 21064 + }, + { + "epoch": 0.7543825093559189, + "grad_norm": 1.673471212387085, + "learning_rate": 3.0004390463581345e-05, + "loss": 1.0147, + "step": 21065 + }, + { + "epoch": 0.7544183214854872, + "grad_norm": 1.5385264158248901, + "learning_rate": 2.9996107083243598e-05, + "loss": 1.064, + "step": 21066 + }, + { + "epoch": 0.7544541336150554, + "grad_norm": 1.279028058052063, + "learning_rate": 2.9987824644717898e-05, + "loss": 1.0462, + "step": 21067 + }, + { + "epoch": 0.7544899457446237, + "grad_norm": 1.7477223873138428, + "learning_rate": 2.997954314811571e-05, + "loss": 1.1306, + "step": 21068 + }, + { + "epoch": 0.754525757874192, + "grad_norm": 1.665944218635559, + "learning_rate": 2.9971262593548443e-05, + "loss": 1.0839, + "step": 21069 + }, + { + "epoch": 0.7545615700037602, + "grad_norm": 1.813806176185608, + "learning_rate": 2.996298298112754e-05, + "loss": 1.0947, + "step": 21070 + }, + { + "epoch": 0.7545973821333286, + "grad_norm": 1.3238348960876465, + "learning_rate": 2.9954704310964332e-05, + "loss": 1.0275, + "step": 21071 + }, + { + "epoch": 0.7546331942628969, + "grad_norm": 1.4034862518310547, + "learning_rate": 2.9946426583170217e-05, + "loss": 1.0263, + "step": 21072 + }, + { + "epoch": 0.7546690063924651, + "grad_norm": 1.2045865058898926, + "learning_rate": 2.9938149797856608e-05, + "loss": 0.9249, + "step": 21073 + }, + { + "epoch": 0.7547048185220334, + "grad_norm": 1.5049306154251099, + "learning_rate": 2.992987395513479e-05, + "loss": 1.2102, + "step": 21074 + }, + { + "epoch": 0.7547406306516017, + "grad_norm": 1.6276047229766846, + "learning_rate": 2.9921599055116135e-05, + "loss": 1.1796, + "step": 21075 + }, + { + "epoch": 0.75477644278117, + "grad_norm": 1.401517629623413, + "learning_rate": 2.991332509791196e-05, + "loss": 1.2765, + "step": 21076 + }, + { + "epoch": 0.7548122549107382, + "grad_norm": 1.914136528968811, + "learning_rate": 2.9905052083633632e-05, + "loss": 1.1022, + "step": 21077 + }, + { + "epoch": 0.7548480670403066, + "grad_norm": 1.9067397117614746, + "learning_rate": 2.9896780012392377e-05, + "loss": 1.2377, + "step": 21078 + }, + { + "epoch": 0.7548838791698749, + "grad_norm": 1.3274410963058472, + "learning_rate": 2.9888508884299516e-05, + "loss": 1.1954, + "step": 21079 + }, + { + "epoch": 0.7549196912994431, + "grad_norm": 1.6111183166503906, + "learning_rate": 2.9880238699466367e-05, + "loss": 1.0972, + "step": 21080 + }, + { + "epoch": 0.7549555034290114, + "grad_norm": 1.4980953931808472, + "learning_rate": 2.9871969458004135e-05, + "loss": 1.0112, + "step": 21081 + }, + { + "epoch": 0.7549913155585797, + "grad_norm": 2.1779537200927734, + "learning_rate": 2.9863701160024083e-05, + "loss": 1.1815, + "step": 21082 + }, + { + "epoch": 0.755027127688148, + "grad_norm": 1.7366480827331543, + "learning_rate": 2.9855433805637467e-05, + "loss": 1.0845, + "step": 21083 + }, + { + "epoch": 0.7550629398177162, + "grad_norm": 1.7220443487167358, + "learning_rate": 2.9847167394955543e-05, + "loss": 1.0041, + "step": 21084 + }, + { + "epoch": 0.7550987519472846, + "grad_norm": 1.4523979425430298, + "learning_rate": 2.9838901928089456e-05, + "loss": 0.7937, + "step": 21085 + }, + { + "epoch": 0.7551345640768529, + "grad_norm": 1.6157176494598389, + "learning_rate": 2.983063740515044e-05, + "loss": 1.1848, + "step": 21086 + }, + { + "epoch": 0.7551703762064211, + "grad_norm": 1.28978431224823, + "learning_rate": 2.9822373826249693e-05, + "loss": 0.9157, + "step": 21087 + }, + { + "epoch": 0.7552061883359894, + "grad_norm": 1.4064677953720093, + "learning_rate": 2.9814111191498405e-05, + "loss": 1.0695, + "step": 21088 + }, + { + "epoch": 0.7552420004655577, + "grad_norm": 1.279266357421875, + "learning_rate": 2.9805849501007733e-05, + "loss": 1.106, + "step": 21089 + }, + { + "epoch": 0.7552778125951259, + "grad_norm": 1.3277896642684937, + "learning_rate": 2.979758875488874e-05, + "loss": 0.8465, + "step": 21090 + }, + { + "epoch": 0.7553136247246942, + "grad_norm": 1.4514529705047607, + "learning_rate": 2.9789328953252694e-05, + "loss": 1.1289, + "step": 21091 + }, + { + "epoch": 0.7553494368542626, + "grad_norm": 1.5463382005691528, + "learning_rate": 2.9781070096210627e-05, + "loss": 1.1944, + "step": 21092 + }, + { + "epoch": 0.7553852489838309, + "grad_norm": 1.4630860090255737, + "learning_rate": 2.9772812183873733e-05, + "loss": 1.0375, + "step": 21093 + }, + { + "epoch": 0.7554210611133991, + "grad_norm": 1.1674442291259766, + "learning_rate": 2.9764555216352997e-05, + "loss": 0.8482, + "step": 21094 + }, + { + "epoch": 0.7554568732429674, + "grad_norm": 2.0124034881591797, + "learning_rate": 2.975629919375963e-05, + "loss": 1.0067, + "step": 21095 + }, + { + "epoch": 0.7554926853725357, + "grad_norm": 1.2498382329940796, + "learning_rate": 2.974804411620462e-05, + "loss": 1.0385, + "step": 21096 + }, + { + "epoch": 0.7555284975021039, + "grad_norm": 1.4112095832824707, + "learning_rate": 2.973978998379906e-05, + "loss": 1.1056, + "step": 21097 + }, + { + "epoch": 0.7555643096316722, + "grad_norm": 1.818400502204895, + "learning_rate": 2.9731536796654026e-05, + "loss": 1.127, + "step": 21098 + }, + { + "epoch": 0.7556001217612406, + "grad_norm": 1.895964503288269, + "learning_rate": 2.9723284554880493e-05, + "loss": 0.9193, + "step": 21099 + }, + { + "epoch": 0.7556359338908089, + "grad_norm": 1.497713327407837, + "learning_rate": 2.9715033258589543e-05, + "loss": 1.1484, + "step": 21100 + }, + { + "epoch": 0.7556717460203771, + "grad_norm": 1.3858157396316528, + "learning_rate": 2.9706782907892104e-05, + "loss": 0.921, + "step": 21101 + }, + { + "epoch": 0.7557075581499454, + "grad_norm": 1.3823062181472778, + "learning_rate": 2.9698533502899294e-05, + "loss": 1.155, + "step": 21102 + }, + { + "epoch": 0.7557433702795137, + "grad_norm": 1.3872060775756836, + "learning_rate": 2.9690285043722e-05, + "loss": 1.08, + "step": 21103 + }, + { + "epoch": 0.7557791824090819, + "grad_norm": 1.303404450416565, + "learning_rate": 2.9682037530471252e-05, + "loss": 1.0292, + "step": 21104 + }, + { + "epoch": 0.7558149945386502, + "grad_norm": 1.156402349472046, + "learning_rate": 2.967379096325793e-05, + "loss": 0.9752, + "step": 21105 + }, + { + "epoch": 0.7558508066682186, + "grad_norm": 1.4508271217346191, + "learning_rate": 2.966554534219309e-05, + "loss": 1.2276, + "step": 21106 + }, + { + "epoch": 0.7558866187977868, + "grad_norm": 1.3372502326965332, + "learning_rate": 2.965730066738762e-05, + "loss": 1.1628, + "step": 21107 + }, + { + "epoch": 0.7559224309273551, + "grad_norm": 1.4766756296157837, + "learning_rate": 2.964905693895237e-05, + "loss": 1.2611, + "step": 21108 + }, + { + "epoch": 0.7559582430569234, + "grad_norm": 1.8562288284301758, + "learning_rate": 2.9640814156998374e-05, + "loss": 1.2702, + "step": 21109 + }, + { + "epoch": 0.7559940551864917, + "grad_norm": 1.4979149103164673, + "learning_rate": 2.9632572321636443e-05, + "loss": 1.1118, + "step": 21110 + }, + { + "epoch": 0.7560298673160599, + "grad_norm": 1.2484874725341797, + "learning_rate": 2.9624331432977515e-05, + "loss": 1.1513, + "step": 21111 + }, + { + "epoch": 0.7560656794456282, + "grad_norm": 1.5102026462554932, + "learning_rate": 2.9616091491132357e-05, + "loss": 1.0528, + "step": 21112 + }, + { + "epoch": 0.7561014915751966, + "grad_norm": 1.7134380340576172, + "learning_rate": 2.9607852496211962e-05, + "loss": 1.1642, + "step": 21113 + }, + { + "epoch": 0.7561373037047648, + "grad_norm": 1.2909982204437256, + "learning_rate": 2.9599614448327084e-05, + "loss": 1.0897, + "step": 21114 + }, + { + "epoch": 0.7561731158343331, + "grad_norm": 1.5183743238449097, + "learning_rate": 2.9591377347588623e-05, + "loss": 1.1151, + "step": 21115 + }, + { + "epoch": 0.7562089279639014, + "grad_norm": 1.498671531677246, + "learning_rate": 2.958314119410732e-05, + "loss": 1.159, + "step": 21116 + }, + { + "epoch": 0.7562447400934696, + "grad_norm": 1.3464299440383911, + "learning_rate": 2.9574905987994016e-05, + "loss": 1.2487, + "step": 21117 + }, + { + "epoch": 0.7562805522230379, + "grad_norm": 1.544237732887268, + "learning_rate": 2.9566671729359552e-05, + "loss": 1.1948, + "step": 21118 + }, + { + "epoch": 0.7563163643526062, + "grad_norm": 1.5297269821166992, + "learning_rate": 2.9558438418314626e-05, + "loss": 0.8809, + "step": 21119 + }, + { + "epoch": 0.7563521764821746, + "grad_norm": 1.7658729553222656, + "learning_rate": 2.9550206054970063e-05, + "loss": 0.9962, + "step": 21120 + }, + { + "epoch": 0.7563879886117428, + "grad_norm": 1.6988106966018677, + "learning_rate": 2.9541974639436588e-05, + "loss": 1.1715, + "step": 21121 + }, + { + "epoch": 0.7564238007413111, + "grad_norm": 1.3040622472763062, + "learning_rate": 2.9533744171824996e-05, + "loss": 1.1024, + "step": 21122 + }, + { + "epoch": 0.7564596128708794, + "grad_norm": 1.4761663675308228, + "learning_rate": 2.9525514652245922e-05, + "loss": 0.9699, + "step": 21123 + }, + { + "epoch": 0.7564954250004476, + "grad_norm": 1.78902006149292, + "learning_rate": 2.9517286080810204e-05, + "loss": 1.2349, + "step": 21124 + }, + { + "epoch": 0.7565312371300159, + "grad_norm": 1.4649428129196167, + "learning_rate": 2.950905845762849e-05, + "loss": 1.2167, + "step": 21125 + }, + { + "epoch": 0.7565670492595842, + "grad_norm": 1.5226095914840698, + "learning_rate": 2.9500831782811433e-05, + "loss": 1.0763, + "step": 21126 + }, + { + "epoch": 0.7566028613891526, + "grad_norm": 1.5409948825836182, + "learning_rate": 2.949260605646974e-05, + "loss": 1.2308, + "step": 21127 + }, + { + "epoch": 0.7566386735187208, + "grad_norm": 1.7747899293899536, + "learning_rate": 2.948438127871409e-05, + "loss": 0.9023, + "step": 21128 + }, + { + "epoch": 0.7566744856482891, + "grad_norm": 1.9930051565170288, + "learning_rate": 2.947615744965516e-05, + "loss": 1.0346, + "step": 21129 + }, + { + "epoch": 0.7567102977778574, + "grad_norm": 1.5383509397506714, + "learning_rate": 2.9467934569403542e-05, + "loss": 1.047, + "step": 21130 + }, + { + "epoch": 0.7567461099074256, + "grad_norm": 1.2380441427230835, + "learning_rate": 2.945971263806987e-05, + "loss": 1.0077, + "step": 21131 + }, + { + "epoch": 0.7567819220369939, + "grad_norm": 1.3440970182418823, + "learning_rate": 2.9451491655764816e-05, + "loss": 1.0019, + "step": 21132 + }, + { + "epoch": 0.7568177341665622, + "grad_norm": 1.5152583122253418, + "learning_rate": 2.94432716225989e-05, + "loss": 1.1268, + "step": 21133 + }, + { + "epoch": 0.7568535462961306, + "grad_norm": 1.9189475774765015, + "learning_rate": 2.943505253868276e-05, + "loss": 1.2186, + "step": 21134 + }, + { + "epoch": 0.7568893584256988, + "grad_norm": 1.432651162147522, + "learning_rate": 2.942683440412697e-05, + "loss": 1.0588, + "step": 21135 + }, + { + "epoch": 0.7569251705552671, + "grad_norm": 1.6854007244110107, + "learning_rate": 2.941861721904211e-05, + "loss": 0.9893, + "step": 21136 + }, + { + "epoch": 0.7569609826848354, + "grad_norm": 1.381571888923645, + "learning_rate": 2.941040098353869e-05, + "loss": 1.1889, + "step": 21137 + }, + { + "epoch": 0.7569967948144036, + "grad_norm": 1.5635619163513184, + "learning_rate": 2.940218569772726e-05, + "loss": 1.0025, + "step": 21138 + }, + { + "epoch": 0.7570326069439719, + "grad_norm": 2.929819107055664, + "learning_rate": 2.9393971361718363e-05, + "loss": 1.0316, + "step": 21139 + }, + { + "epoch": 0.7570684190735402, + "grad_norm": 1.67664635181427, + "learning_rate": 2.9385757975622542e-05, + "loss": 1.2718, + "step": 21140 + }, + { + "epoch": 0.7571042312031085, + "grad_norm": 1.7910372018814087, + "learning_rate": 2.937754553955022e-05, + "loss": 1.2047, + "step": 21141 + }, + { + "epoch": 0.7571400433326768, + "grad_norm": 1.3207521438598633, + "learning_rate": 2.936933405361194e-05, + "loss": 0.9285, + "step": 21142 + }, + { + "epoch": 0.7571758554622451, + "grad_norm": 1.4232237339019775, + "learning_rate": 2.936112351791819e-05, + "loss": 1.0356, + "step": 21143 + }, + { + "epoch": 0.7572116675918134, + "grad_norm": 1.333018183708191, + "learning_rate": 2.9352913932579362e-05, + "loss": 1.0346, + "step": 21144 + }, + { + "epoch": 0.7572474797213816, + "grad_norm": 1.3952138423919678, + "learning_rate": 2.9344705297705966e-05, + "loss": 1.0091, + "step": 21145 + }, + { + "epoch": 0.7572832918509499, + "grad_norm": 1.5189385414123535, + "learning_rate": 2.933649761340841e-05, + "loss": 1.0487, + "step": 21146 + }, + { + "epoch": 0.7573191039805182, + "grad_norm": 1.3412874937057495, + "learning_rate": 2.932829087979716e-05, + "loss": 1.0457, + "step": 21147 + }, + { + "epoch": 0.7573549161100865, + "grad_norm": 1.568398118019104, + "learning_rate": 2.9320085096982575e-05, + "loss": 1.2461, + "step": 21148 + }, + { + "epoch": 0.7573907282396548, + "grad_norm": 1.4200955629348755, + "learning_rate": 2.9311880265075066e-05, + "loss": 1.085, + "step": 21149 + }, + { + "epoch": 0.7574265403692231, + "grad_norm": 1.3658301830291748, + "learning_rate": 2.9303676384185064e-05, + "loss": 0.9787, + "step": 21150 + }, + { + "epoch": 0.7574623524987913, + "grad_norm": 1.2478045225143433, + "learning_rate": 2.9295473454422863e-05, + "loss": 0.8687, + "step": 21151 + }, + { + "epoch": 0.7574981646283596, + "grad_norm": 1.3764512538909912, + "learning_rate": 2.928727147589887e-05, + "loss": 1.3208, + "step": 21152 + }, + { + "epoch": 0.7575339767579279, + "grad_norm": 1.4279701709747314, + "learning_rate": 2.9279070448723432e-05, + "loss": 0.9479, + "step": 21153 + }, + { + "epoch": 0.7575697888874962, + "grad_norm": 1.6118552684783936, + "learning_rate": 2.927087037300691e-05, + "loss": 1.0469, + "step": 21154 + }, + { + "epoch": 0.7576056010170645, + "grad_norm": 1.6060067415237427, + "learning_rate": 2.926267124885955e-05, + "loss": 1.0424, + "step": 21155 + }, + { + "epoch": 0.7576414131466328, + "grad_norm": 2.0353031158447266, + "learning_rate": 2.9254473076391708e-05, + "loss": 1.1674, + "step": 21156 + }, + { + "epoch": 0.7576772252762011, + "grad_norm": 1.3658441305160522, + "learning_rate": 2.924627585571368e-05, + "loss": 1.0137, + "step": 21157 + }, + { + "epoch": 0.7577130374057693, + "grad_norm": 1.3914954662322998, + "learning_rate": 2.9238079586935773e-05, + "loss": 1.066, + "step": 21158 + }, + { + "epoch": 0.7577488495353376, + "grad_norm": 1.608113408088684, + "learning_rate": 2.9229884270168195e-05, + "loss": 1.0405, + "step": 21159 + }, + { + "epoch": 0.7577846616649059, + "grad_norm": 1.358682632446289, + "learning_rate": 2.9221689905521245e-05, + "loss": 1.0512, + "step": 21160 + }, + { + "epoch": 0.7578204737944741, + "grad_norm": 1.1723395586013794, + "learning_rate": 2.9213496493105187e-05, + "loss": 0.8741, + "step": 21161 + }, + { + "epoch": 0.7578562859240425, + "grad_norm": 1.4408903121948242, + "learning_rate": 2.920530403303019e-05, + "loss": 1.1381, + "step": 21162 + }, + { + "epoch": 0.7578920980536108, + "grad_norm": 1.2543748617172241, + "learning_rate": 2.9197112525406522e-05, + "loss": 0.7793, + "step": 21163 + }, + { + "epoch": 0.7579279101831791, + "grad_norm": 1.5570170879364014, + "learning_rate": 2.918892197034436e-05, + "loss": 1.0745, + "step": 21164 + }, + { + "epoch": 0.7579637223127473, + "grad_norm": 1.922965168952942, + "learning_rate": 2.9180732367953956e-05, + "loss": 1.3093, + "step": 21165 + }, + { + "epoch": 0.7579995344423156, + "grad_norm": 1.354409098625183, + "learning_rate": 2.9172543718345413e-05, + "loss": 1.0885, + "step": 21166 + }, + { + "epoch": 0.7580353465718839, + "grad_norm": 1.4912155866622925, + "learning_rate": 2.9164356021628923e-05, + "loss": 0.9961, + "step": 21167 + }, + { + "epoch": 0.7580711587014521, + "grad_norm": 1.8118170499801636, + "learning_rate": 2.915616927791469e-05, + "loss": 0.9442, + "step": 21168 + }, + { + "epoch": 0.7581069708310205, + "grad_norm": 1.1913166046142578, + "learning_rate": 2.9147983487312793e-05, + "loss": 1.1826, + "step": 21169 + }, + { + "epoch": 0.7581427829605888, + "grad_norm": 1.6384389400482178, + "learning_rate": 2.913979864993338e-05, + "loss": 1.0158, + "step": 21170 + }, + { + "epoch": 0.7581785950901571, + "grad_norm": 1.2232272624969482, + "learning_rate": 2.9131614765886573e-05, + "loss": 1.0507, + "step": 21171 + }, + { + "epoch": 0.7582144072197253, + "grad_norm": 1.694628119468689, + "learning_rate": 2.912343183528251e-05, + "loss": 1.2002, + "step": 21172 + }, + { + "epoch": 0.7582502193492936, + "grad_norm": 1.768119215965271, + "learning_rate": 2.9115249858231207e-05, + "loss": 1.1634, + "step": 21173 + }, + { + "epoch": 0.7582860314788619, + "grad_norm": 1.472143292427063, + "learning_rate": 2.91070688348428e-05, + "loss": 1.16, + "step": 21174 + }, + { + "epoch": 0.7583218436084301, + "grad_norm": 1.2281668186187744, + "learning_rate": 2.9098888765227316e-05, + "loss": 1.0576, + "step": 21175 + }, + { + "epoch": 0.7583576557379985, + "grad_norm": 1.5836501121520996, + "learning_rate": 2.9090709649494873e-05, + "loss": 1.1274, + "step": 21176 + }, + { + "epoch": 0.7583934678675668, + "grad_norm": 3.8560004234313965, + "learning_rate": 2.908253148775546e-05, + "loss": 1.3478, + "step": 21177 + }, + { + "epoch": 0.758429279997135, + "grad_norm": 1.1618634462356567, + "learning_rate": 2.9074354280119042e-05, + "loss": 0.9541, + "step": 21178 + }, + { + "epoch": 0.7584650921267033, + "grad_norm": 1.325827956199646, + "learning_rate": 2.9066178026695767e-05, + "loss": 0.8902, + "step": 21179 + }, + { + "epoch": 0.7585009042562716, + "grad_norm": 1.1977475881576538, + "learning_rate": 2.9058002727595546e-05, + "loss": 1.1843, + "step": 21180 + }, + { + "epoch": 0.7585367163858399, + "grad_norm": 1.3216984272003174, + "learning_rate": 2.904982838292838e-05, + "loss": 0.9725, + "step": 21181 + }, + { + "epoch": 0.7585725285154081, + "grad_norm": 1.7919317483901978, + "learning_rate": 2.9041654992804256e-05, + "loss": 1.2413, + "step": 21182 + }, + { + "epoch": 0.7586083406449765, + "grad_norm": 1.5073895454406738, + "learning_rate": 2.9033482557333158e-05, + "loss": 1.014, + "step": 21183 + }, + { + "epoch": 0.7586441527745448, + "grad_norm": 1.6239768266677856, + "learning_rate": 2.9025311076624994e-05, + "loss": 1.2072, + "step": 21184 + }, + { + "epoch": 0.758679964904113, + "grad_norm": 1.8707146644592285, + "learning_rate": 2.9017140550789713e-05, + "loss": 0.9521, + "step": 21185 + }, + { + "epoch": 0.7587157770336813, + "grad_norm": 1.6986708641052246, + "learning_rate": 2.9008970979937276e-05, + "loss": 1.081, + "step": 21186 + }, + { + "epoch": 0.7587515891632496, + "grad_norm": 1.4547860622406006, + "learning_rate": 2.9000802364177527e-05, + "loss": 1.025, + "step": 21187 + }, + { + "epoch": 0.7587874012928179, + "grad_norm": 1.4514575004577637, + "learning_rate": 2.8992634703620437e-05, + "loss": 1.136, + "step": 21188 + }, + { + "epoch": 0.7588232134223861, + "grad_norm": 1.2651829719543457, + "learning_rate": 2.8984467998375786e-05, + "loss": 0.8539, + "step": 21189 + }, + { + "epoch": 0.7588590255519545, + "grad_norm": 1.4871156215667725, + "learning_rate": 2.8976302248553576e-05, + "loss": 0.975, + "step": 21190 + }, + { + "epoch": 0.7588948376815228, + "grad_norm": 1.6445177793502808, + "learning_rate": 2.896813745426359e-05, + "loss": 0.8847, + "step": 21191 + }, + { + "epoch": 0.758930649811091, + "grad_norm": 1.477939248085022, + "learning_rate": 2.8959973615615675e-05, + "loss": 0.9717, + "step": 21192 + }, + { + "epoch": 0.7589664619406593, + "grad_norm": 2.1736183166503906, + "learning_rate": 2.8951810732719685e-05, + "loss": 1.1496, + "step": 21193 + }, + { + "epoch": 0.7590022740702276, + "grad_norm": 1.8956438302993774, + "learning_rate": 2.8943648805685464e-05, + "loss": 1.0346, + "step": 21194 + }, + { + "epoch": 0.7590380861997958, + "grad_norm": 1.742112398147583, + "learning_rate": 2.893548783462279e-05, + "loss": 1.0294, + "step": 21195 + }, + { + "epoch": 0.7590738983293641, + "grad_norm": 1.2850372791290283, + "learning_rate": 2.8927327819641403e-05, + "loss": 1.0426, + "step": 21196 + }, + { + "epoch": 0.7591097104589324, + "grad_norm": 1.414541482925415, + "learning_rate": 2.8919168760851202e-05, + "loss": 0.9344, + "step": 21197 + }, + { + "epoch": 0.7591455225885008, + "grad_norm": 1.6241241693496704, + "learning_rate": 2.891101065836187e-05, + "loss": 1.1562, + "step": 21198 + }, + { + "epoch": 0.759181334718069, + "grad_norm": 1.324446439743042, + "learning_rate": 2.8902853512283225e-05, + "loss": 0.9622, + "step": 21199 + }, + { + "epoch": 0.7592171468476373, + "grad_norm": 2.192138671875, + "learning_rate": 2.8894697322724908e-05, + "loss": 1.0106, + "step": 21200 + }, + { + "epoch": 0.7592529589772056, + "grad_norm": 1.3102151155471802, + "learning_rate": 2.8886542089796785e-05, + "loss": 1.0344, + "step": 21201 + }, + { + "epoch": 0.7592887711067738, + "grad_norm": 1.4902608394622803, + "learning_rate": 2.8878387813608477e-05, + "loss": 1.118, + "step": 21202 + }, + { + "epoch": 0.7593245832363421, + "grad_norm": 1.3332180976867676, + "learning_rate": 2.8870234494269756e-05, + "loss": 1.0589, + "step": 21203 + }, + { + "epoch": 0.7593603953659104, + "grad_norm": 1.3660335540771484, + "learning_rate": 2.8862082131890243e-05, + "loss": 1.0477, + "step": 21204 + }, + { + "epoch": 0.7593962074954788, + "grad_norm": 1.7256563901901245, + "learning_rate": 2.885393072657966e-05, + "loss": 1.2894, + "step": 21205 + }, + { + "epoch": 0.759432019625047, + "grad_norm": 1.6127890348434448, + "learning_rate": 2.8845780278447688e-05, + "loss": 0.9431, + "step": 21206 + }, + { + "epoch": 0.7594678317546153, + "grad_norm": 1.3738956451416016, + "learning_rate": 2.8837630787603908e-05, + "loss": 1.0059, + "step": 21207 + }, + { + "epoch": 0.7595036438841836, + "grad_norm": 1.5532565116882324, + "learning_rate": 2.882948225415807e-05, + "loss": 1.2192, + "step": 21208 + }, + { + "epoch": 0.7595394560137518, + "grad_norm": 1.2071694135665894, + "learning_rate": 2.8821334678219712e-05, + "loss": 0.954, + "step": 21209 + }, + { + "epoch": 0.7595752681433201, + "grad_norm": 1.7019962072372437, + "learning_rate": 2.8813188059898512e-05, + "loss": 0.9918, + "step": 21210 + }, + { + "epoch": 0.7596110802728884, + "grad_norm": 1.2316081523895264, + "learning_rate": 2.8805042399303984e-05, + "loss": 1.1829, + "step": 21211 + }, + { + "epoch": 0.7596468924024568, + "grad_norm": 1.779550552368164, + "learning_rate": 2.8796897696545832e-05, + "loss": 1.2002, + "step": 21212 + }, + { + "epoch": 0.759682704532025, + "grad_norm": 1.3523484468460083, + "learning_rate": 2.878875395173358e-05, + "loss": 1.0936, + "step": 21213 + }, + { + "epoch": 0.7597185166615933, + "grad_norm": 1.3159679174423218, + "learning_rate": 2.8780611164976767e-05, + "loss": 0.9443, + "step": 21214 + }, + { + "epoch": 0.7597543287911616, + "grad_norm": 1.3055496215820312, + "learning_rate": 2.8772469336384954e-05, + "loss": 0.9683, + "step": 21215 + }, + { + "epoch": 0.7597901409207298, + "grad_norm": 1.3527570962905884, + "learning_rate": 2.876432846606769e-05, + "loss": 1.016, + "step": 21216 + }, + { + "epoch": 0.7598259530502981, + "grad_norm": 1.5398324728012085, + "learning_rate": 2.8756188554134522e-05, + "loss": 1.0263, + "step": 21217 + }, + { + "epoch": 0.7598617651798664, + "grad_norm": 1.5300322771072388, + "learning_rate": 2.8748049600694893e-05, + "loss": 1.0297, + "step": 21218 + }, + { + "epoch": 0.7598975773094347, + "grad_norm": 1.6606632471084595, + "learning_rate": 2.8739911605858394e-05, + "loss": 1.0002, + "step": 21219 + }, + { + "epoch": 0.759933389439003, + "grad_norm": 1.4201656579971313, + "learning_rate": 2.873177456973445e-05, + "loss": 0.8703, + "step": 21220 + }, + { + "epoch": 0.7599692015685713, + "grad_norm": 1.6469330787658691, + "learning_rate": 2.872363849243257e-05, + "loss": 1.0891, + "step": 21221 + }, + { + "epoch": 0.7600050136981396, + "grad_norm": 1.2143365144729614, + "learning_rate": 2.871550337406217e-05, + "loss": 0.8731, + "step": 21222 + }, + { + "epoch": 0.7600408258277078, + "grad_norm": 1.5332658290863037, + "learning_rate": 2.8707369214732716e-05, + "loss": 1.0704, + "step": 21223 + }, + { + "epoch": 0.7600766379572761, + "grad_norm": 1.4031805992126465, + "learning_rate": 2.8699236014553686e-05, + "loss": 1.141, + "step": 21224 + }, + { + "epoch": 0.7601124500868444, + "grad_norm": 1.525036334991455, + "learning_rate": 2.869110377363443e-05, + "loss": 1.1589, + "step": 21225 + }, + { + "epoch": 0.7601482622164127, + "grad_norm": 1.4416853189468384, + "learning_rate": 2.868297249208438e-05, + "loss": 1.0414, + "step": 21226 + }, + { + "epoch": 0.760184074345981, + "grad_norm": 1.6876585483551025, + "learning_rate": 2.867484217001296e-05, + "loss": 1.117, + "step": 21227 + }, + { + "epoch": 0.7602198864755493, + "grad_norm": 1.4698246717453003, + "learning_rate": 2.866671280752956e-05, + "loss": 1.061, + "step": 21228 + }, + { + "epoch": 0.7602556986051175, + "grad_norm": 1.4571338891983032, + "learning_rate": 2.8658584404743493e-05, + "loss": 1.0283, + "step": 21229 + }, + { + "epoch": 0.7602915107346858, + "grad_norm": 1.4172486066818237, + "learning_rate": 2.865045696176415e-05, + "loss": 1.0834, + "step": 21230 + }, + { + "epoch": 0.7603273228642541, + "grad_norm": 1.6303749084472656, + "learning_rate": 2.8642330478700908e-05, + "loss": 0.9823, + "step": 21231 + }, + { + "epoch": 0.7603631349938224, + "grad_norm": 1.6476227045059204, + "learning_rate": 2.8634204955663024e-05, + "loss": 1.1159, + "step": 21232 + }, + { + "epoch": 0.7603989471233907, + "grad_norm": 1.429233193397522, + "learning_rate": 2.862608039275987e-05, + "loss": 0.9731, + "step": 21233 + }, + { + "epoch": 0.760434759252959, + "grad_norm": 1.288525104522705, + "learning_rate": 2.861795679010073e-05, + "loss": 1.2213, + "step": 21234 + }, + { + "epoch": 0.7604705713825273, + "grad_norm": 1.3684089183807373, + "learning_rate": 2.8609834147794945e-05, + "loss": 0.918, + "step": 21235 + }, + { + "epoch": 0.7605063835120955, + "grad_norm": 1.9413937330245972, + "learning_rate": 2.8601712465951713e-05, + "loss": 1.0017, + "step": 21236 + }, + { + "epoch": 0.7605421956416638, + "grad_norm": 1.6714324951171875, + "learning_rate": 2.8593591744680348e-05, + "loss": 1.2289, + "step": 21237 + }, + { + "epoch": 0.7605780077712321, + "grad_norm": 1.7906322479248047, + "learning_rate": 2.858547198409013e-05, + "loss": 0.9928, + "step": 21238 + }, + { + "epoch": 0.7606138199008003, + "grad_norm": 1.75892972946167, + "learning_rate": 2.8577353184290236e-05, + "loss": 1.0939, + "step": 21239 + }, + { + "epoch": 0.7606496320303687, + "grad_norm": 1.664933443069458, + "learning_rate": 2.8569235345389922e-05, + "loss": 1.2049, + "step": 21240 + }, + { + "epoch": 0.760685444159937, + "grad_norm": 1.4386056661605835, + "learning_rate": 2.8561118467498415e-05, + "loss": 1.2315, + "step": 21241 + }, + { + "epoch": 0.7607212562895053, + "grad_norm": 1.8348758220672607, + "learning_rate": 2.855300255072494e-05, + "loss": 1.1795, + "step": 21242 + }, + { + "epoch": 0.7607570684190735, + "grad_norm": 1.4944126605987549, + "learning_rate": 2.8544887595178616e-05, + "loss": 1.0408, + "step": 21243 + }, + { + "epoch": 0.7607928805486418, + "grad_norm": 1.3438318967819214, + "learning_rate": 2.853677360096867e-05, + "loss": 1.219, + "step": 21244 + }, + { + "epoch": 0.7608286926782101, + "grad_norm": 1.618812084197998, + "learning_rate": 2.8528660568204247e-05, + "loss": 1.2267, + "step": 21245 + }, + { + "epoch": 0.7608645048077783, + "grad_norm": 1.3656777143478394, + "learning_rate": 2.8520548496994536e-05, + "loss": 0.9626, + "step": 21246 + }, + { + "epoch": 0.7609003169373467, + "grad_norm": 1.835229754447937, + "learning_rate": 2.851243738744862e-05, + "loss": 1.196, + "step": 21247 + }, + { + "epoch": 0.760936129066915, + "grad_norm": 1.652580976486206, + "learning_rate": 2.8504327239675645e-05, + "loss": 0.999, + "step": 21248 + }, + { + "epoch": 0.7609719411964833, + "grad_norm": 1.5928843021392822, + "learning_rate": 2.849621805378474e-05, + "loss": 1.0232, + "step": 21249 + }, + { + "epoch": 0.7610077533260515, + "grad_norm": 1.8072429895401, + "learning_rate": 2.848810982988497e-05, + "loss": 1.2113, + "step": 21250 + }, + { + "epoch": 0.7610435654556198, + "grad_norm": 2.526010274887085, + "learning_rate": 2.848000256808544e-05, + "loss": 1.1186, + "step": 21251 + }, + { + "epoch": 0.7610793775851881, + "grad_norm": 2.193835735321045, + "learning_rate": 2.8471896268495214e-05, + "loss": 1.1029, + "step": 21252 + }, + { + "epoch": 0.7611151897147563, + "grad_norm": 1.7007371187210083, + "learning_rate": 2.84637909312234e-05, + "loss": 1.0827, + "step": 21253 + }, + { + "epoch": 0.7611510018443247, + "grad_norm": 1.6849713325500488, + "learning_rate": 2.845568655637896e-05, + "loss": 1.2023, + "step": 21254 + }, + { + "epoch": 0.761186813973893, + "grad_norm": 1.8424930572509766, + "learning_rate": 2.844758314407098e-05, + "loss": 0.8702, + "step": 21255 + }, + { + "epoch": 0.7612226261034613, + "grad_norm": 1.4858267307281494, + "learning_rate": 2.8439480694408506e-05, + "loss": 1.2461, + "step": 21256 + }, + { + "epoch": 0.7612584382330295, + "grad_norm": 1.3629889488220215, + "learning_rate": 2.8431379207500476e-05, + "loss": 1.1376, + "step": 21257 + }, + { + "epoch": 0.7612942503625978, + "grad_norm": 1.7212114334106445, + "learning_rate": 2.8423278683455922e-05, + "loss": 1.3065, + "step": 21258 + }, + { + "epoch": 0.7613300624921661, + "grad_norm": 1.2330114841461182, + "learning_rate": 2.8415179122383828e-05, + "loss": 0.9178, + "step": 21259 + }, + { + "epoch": 0.7613658746217343, + "grad_norm": 1.8700991868972778, + "learning_rate": 2.840708052439319e-05, + "loss": 1.149, + "step": 21260 + }, + { + "epoch": 0.7614016867513027, + "grad_norm": 1.5760961771011353, + "learning_rate": 2.8398982889592908e-05, + "loss": 0.9112, + "step": 21261 + }, + { + "epoch": 0.761437498880871, + "grad_norm": 1.652930736541748, + "learning_rate": 2.839088621809195e-05, + "loss": 1.1238, + "step": 21262 + }, + { + "epoch": 0.7614733110104392, + "grad_norm": 1.5310616493225098, + "learning_rate": 2.8382790509999257e-05, + "loss": 1.0296, + "step": 21263 + }, + { + "epoch": 0.7615091231400075, + "grad_norm": 1.607019305229187, + "learning_rate": 2.8374695765423753e-05, + "loss": 1.3359, + "step": 21264 + }, + { + "epoch": 0.7615449352695758, + "grad_norm": 1.5067452192306519, + "learning_rate": 2.8366601984474305e-05, + "loss": 1.1394, + "step": 21265 + }, + { + "epoch": 0.761580747399144, + "grad_norm": 1.7603613138198853, + "learning_rate": 2.835850916725983e-05, + "loss": 1.0707, + "step": 21266 + }, + { + "epoch": 0.7616165595287123, + "grad_norm": 1.4985668659210205, + "learning_rate": 2.8350417313889233e-05, + "loss": 0.8829, + "step": 21267 + }, + { + "epoch": 0.7616523716582807, + "grad_norm": 1.5415891408920288, + "learning_rate": 2.8342326424471323e-05, + "loss": 1.0868, + "step": 21268 + }, + { + "epoch": 0.761688183787849, + "grad_norm": 1.8576085567474365, + "learning_rate": 2.8334236499114963e-05, + "loss": 1.1966, + "step": 21269 + }, + { + "epoch": 0.7617239959174172, + "grad_norm": 1.2348320484161377, + "learning_rate": 2.8326147537929027e-05, + "loss": 1.2216, + "step": 21270 + }, + { + "epoch": 0.7617598080469855, + "grad_norm": 1.3642925024032593, + "learning_rate": 2.8318059541022346e-05, + "loss": 1.1451, + "step": 21271 + }, + { + "epoch": 0.7617956201765538, + "grad_norm": 1.5998826026916504, + "learning_rate": 2.830997250850368e-05, + "loss": 1.0517, + "step": 21272 + }, + { + "epoch": 0.761831432306122, + "grad_norm": 1.5735386610031128, + "learning_rate": 2.8301886440481862e-05, + "loss": 1.0027, + "step": 21273 + }, + { + "epoch": 0.7618672444356903, + "grad_norm": 1.8182322978973389, + "learning_rate": 2.8293801337065705e-05, + "loss": 0.9697, + "step": 21274 + }, + { + "epoch": 0.7619030565652587, + "grad_norm": 1.6351864337921143, + "learning_rate": 2.8285717198363924e-05, + "loss": 1.2331, + "step": 21275 + }, + { + "epoch": 0.761938868694827, + "grad_norm": 1.6953072547912598, + "learning_rate": 2.8277634024485322e-05, + "loss": 0.9369, + "step": 21276 + }, + { + "epoch": 0.7619746808243952, + "grad_norm": 1.4293330907821655, + "learning_rate": 2.826955181553863e-05, + "loss": 1.0378, + "step": 21277 + }, + { + "epoch": 0.7620104929539635, + "grad_norm": 1.7211838960647583, + "learning_rate": 2.826147057163263e-05, + "loss": 0.9996, + "step": 21278 + }, + { + "epoch": 0.7620463050835318, + "grad_norm": 2.1869375705718994, + "learning_rate": 2.8253390292875982e-05, + "loss": 1.2491, + "step": 21279 + }, + { + "epoch": 0.7620821172131, + "grad_norm": 1.3830742835998535, + "learning_rate": 2.8245310979377416e-05, + "loss": 1.1284, + "step": 21280 + }, + { + "epoch": 0.7621179293426683, + "grad_norm": 1.4504225254058838, + "learning_rate": 2.8237232631245624e-05, + "loss": 1.1632, + "step": 21281 + }, + { + "epoch": 0.7621537414722367, + "grad_norm": 1.673783540725708, + "learning_rate": 2.8229155248589345e-05, + "loss": 1.1857, + "step": 21282 + }, + { + "epoch": 0.762189553601805, + "grad_norm": 1.3255506753921509, + "learning_rate": 2.822107883151719e-05, + "loss": 1.1739, + "step": 21283 + }, + { + "epoch": 0.7622253657313732, + "grad_norm": 1.7918857336044312, + "learning_rate": 2.8213003380137783e-05, + "loss": 1.1859, + "step": 21284 + }, + { + "epoch": 0.7622611778609415, + "grad_norm": 1.2142035961151123, + "learning_rate": 2.820492889455987e-05, + "loss": 1.0277, + "step": 21285 + }, + { + "epoch": 0.7622969899905098, + "grad_norm": 1.5340360403060913, + "learning_rate": 2.8196855374892006e-05, + "loss": 1.1496, + "step": 21286 + }, + { + "epoch": 0.762332802120078, + "grad_norm": 1.3564873933792114, + "learning_rate": 2.8188782821242855e-05, + "loss": 1.1682, + "step": 21287 + }, + { + "epoch": 0.7623686142496463, + "grad_norm": 1.7076020240783691, + "learning_rate": 2.8180711233720947e-05, + "loss": 1.0682, + "step": 21288 + }, + { + "epoch": 0.7624044263792147, + "grad_norm": 1.6705539226531982, + "learning_rate": 2.8172640612434987e-05, + "loss": 0.9109, + "step": 21289 + }, + { + "epoch": 0.762440238508783, + "grad_norm": 1.7194658517837524, + "learning_rate": 2.8164570957493473e-05, + "loss": 0.9752, + "step": 21290 + }, + { + "epoch": 0.7624760506383512, + "grad_norm": 1.3574531078338623, + "learning_rate": 2.8156502269004992e-05, + "loss": 1.0073, + "step": 21291 + }, + { + "epoch": 0.7625118627679195, + "grad_norm": 2.7255773544311523, + "learning_rate": 2.814843454707813e-05, + "loss": 1.1981, + "step": 21292 + }, + { + "epoch": 0.7625476748974878, + "grad_norm": 1.685479998588562, + "learning_rate": 2.8140367791821363e-05, + "loss": 0.9682, + "step": 21293 + }, + { + "epoch": 0.762583487027056, + "grad_norm": 1.2177131175994873, + "learning_rate": 2.813230200334329e-05, + "loss": 0.8655, + "step": 21294 + }, + { + "epoch": 0.7626192991566243, + "grad_norm": 2.0977611541748047, + "learning_rate": 2.8124237181752334e-05, + "loss": 1.1457, + "step": 21295 + }, + { + "epoch": 0.7626551112861927, + "grad_norm": 1.5319498777389526, + "learning_rate": 2.8116173327157114e-05, + "loss": 1.0105, + "step": 21296 + }, + { + "epoch": 0.762690923415761, + "grad_norm": 1.4874354600906372, + "learning_rate": 2.8108110439666024e-05, + "loss": 1.0582, + "step": 21297 + }, + { + "epoch": 0.7627267355453292, + "grad_norm": 1.341096043586731, + "learning_rate": 2.8100048519387613e-05, + "loss": 0.9813, + "step": 21298 + }, + { + "epoch": 0.7627625476748975, + "grad_norm": 1.7463529109954834, + "learning_rate": 2.8091987566430233e-05, + "loss": 1.068, + "step": 21299 + }, + { + "epoch": 0.7627983598044658, + "grad_norm": 1.3637819290161133, + "learning_rate": 2.808392758090247e-05, + "loss": 1.0489, + "step": 21300 + }, + { + "epoch": 0.762834171934034, + "grad_norm": 1.5524451732635498, + "learning_rate": 2.80758685629127e-05, + "loss": 1.0951, + "step": 21301 + }, + { + "epoch": 0.7628699840636023, + "grad_norm": 1.3992700576782227, + "learning_rate": 2.8067810512569282e-05, + "loss": 0.8649, + "step": 21302 + }, + { + "epoch": 0.7629057961931707, + "grad_norm": 1.8151298761367798, + "learning_rate": 2.805975342998075e-05, + "loss": 1.1042, + "step": 21303 + }, + { + "epoch": 0.7629416083227389, + "grad_norm": 1.3392704725265503, + "learning_rate": 2.80516973152554e-05, + "loss": 0.9133, + "step": 21304 + }, + { + "epoch": 0.7629774204523072, + "grad_norm": 1.294295072555542, + "learning_rate": 2.8043642168501692e-05, + "loss": 1.1041, + "step": 21305 + }, + { + "epoch": 0.7630132325818755, + "grad_norm": 1.492196798324585, + "learning_rate": 2.8035587989827904e-05, + "loss": 0.9014, + "step": 21306 + }, + { + "epoch": 0.7630490447114437, + "grad_norm": 1.4259754419326782, + "learning_rate": 2.802753477934251e-05, + "loss": 1.0751, + "step": 21307 + }, + { + "epoch": 0.763084856841012, + "grad_norm": 2.930122137069702, + "learning_rate": 2.8019482537153762e-05, + "loss": 1.2959, + "step": 21308 + }, + { + "epoch": 0.7631206689705803, + "grad_norm": 1.474818468093872, + "learning_rate": 2.801143126337007e-05, + "loss": 1.1716, + "step": 21309 + }, + { + "epoch": 0.7631564811001487, + "grad_norm": 1.4724619388580322, + "learning_rate": 2.8003380958099677e-05, + "loss": 1.2055, + "step": 21310 + }, + { + "epoch": 0.7631922932297169, + "grad_norm": 1.3220864534378052, + "learning_rate": 2.7995331621450917e-05, + "loss": 0.9154, + "step": 21311 + }, + { + "epoch": 0.7632281053592852, + "grad_norm": 1.3041197061538696, + "learning_rate": 2.7987283253532125e-05, + "loss": 0.9035, + "step": 21312 + }, + { + "epoch": 0.7632639174888535, + "grad_norm": 1.8476002216339111, + "learning_rate": 2.7979235854451523e-05, + "loss": 1.318, + "step": 21313 + }, + { + "epoch": 0.7632997296184217, + "grad_norm": 1.581063985824585, + "learning_rate": 2.79711894243174e-05, + "loss": 1.2497, + "step": 21314 + }, + { + "epoch": 0.76333554174799, + "grad_norm": 1.35936439037323, + "learning_rate": 2.7963143963238005e-05, + "loss": 1.0451, + "step": 21315 + }, + { + "epoch": 0.7633713538775583, + "grad_norm": 1.551076889038086, + "learning_rate": 2.795509947132162e-05, + "loss": 0.9981, + "step": 21316 + }, + { + "epoch": 0.7634071660071267, + "grad_norm": 1.6965570449829102, + "learning_rate": 2.7947055948676392e-05, + "loss": 1.2953, + "step": 21317 + }, + { + "epoch": 0.7634429781366949, + "grad_norm": 1.392828106880188, + "learning_rate": 2.793901339541063e-05, + "loss": 1.1086, + "step": 21318 + }, + { + "epoch": 0.7634787902662632, + "grad_norm": 1.3856264352798462, + "learning_rate": 2.79309718116325e-05, + "loss": 1.0548, + "step": 21319 + }, + { + "epoch": 0.7635146023958315, + "grad_norm": 1.9909013509750366, + "learning_rate": 2.792293119745014e-05, + "loss": 1.1698, + "step": 21320 + }, + { + "epoch": 0.7635504145253997, + "grad_norm": 1.4702792167663574, + "learning_rate": 2.7914891552971776e-05, + "loss": 1.1542, + "step": 21321 + }, + { + "epoch": 0.763586226654968, + "grad_norm": 1.5731933116912842, + "learning_rate": 2.7906852878305567e-05, + "loss": 1.0548, + "step": 21322 + }, + { + "epoch": 0.7636220387845363, + "grad_norm": 1.4008594751358032, + "learning_rate": 2.789881517355969e-05, + "loss": 1.1684, + "step": 21323 + }, + { + "epoch": 0.7636578509141047, + "grad_norm": 1.3864214420318604, + "learning_rate": 2.7890778438842214e-05, + "loss": 0.8897, + "step": 21324 + }, + { + "epoch": 0.7636936630436729, + "grad_norm": 1.8366189002990723, + "learning_rate": 2.7882742674261307e-05, + "loss": 1.4354, + "step": 21325 + }, + { + "epoch": 0.7637294751732412, + "grad_norm": 1.616688847541809, + "learning_rate": 2.78747078799251e-05, + "loss": 1.2006, + "step": 21326 + }, + { + "epoch": 0.7637652873028095, + "grad_norm": 1.644364833831787, + "learning_rate": 2.786667405594163e-05, + "loss": 1.0764, + "step": 21327 + }, + { + "epoch": 0.7638010994323777, + "grad_norm": 1.4081944227218628, + "learning_rate": 2.785864120241901e-05, + "loss": 1.1215, + "step": 21328 + }, + { + "epoch": 0.763836911561946, + "grad_norm": 1.5584408044815063, + "learning_rate": 2.7850609319465325e-05, + "loss": 1.2649, + "step": 21329 + }, + { + "epoch": 0.7638727236915143, + "grad_norm": 1.3658510446548462, + "learning_rate": 2.7842578407188656e-05, + "loss": 1.0511, + "step": 21330 + }, + { + "epoch": 0.7639085358210826, + "grad_norm": 1.3203275203704834, + "learning_rate": 2.7834548465696987e-05, + "loss": 1.0339, + "step": 21331 + }, + { + "epoch": 0.7639443479506509, + "grad_norm": 1.7872353792190552, + "learning_rate": 2.7826519495098378e-05, + "loss": 0.994, + "step": 21332 + }, + { + "epoch": 0.7639801600802192, + "grad_norm": 1.3227399587631226, + "learning_rate": 2.7818491495500864e-05, + "loss": 1.0445, + "step": 21333 + }, + { + "epoch": 0.7640159722097875, + "grad_norm": 1.9572538137435913, + "learning_rate": 2.7810464467012455e-05, + "loss": 1.3281, + "step": 21334 + }, + { + "epoch": 0.7640517843393557, + "grad_norm": 1.4026352167129517, + "learning_rate": 2.7802438409741106e-05, + "loss": 1.1321, + "step": 21335 + }, + { + "epoch": 0.764087596468924, + "grad_norm": 1.5236248970031738, + "learning_rate": 2.7794413323794822e-05, + "loss": 1.0453, + "step": 21336 + }, + { + "epoch": 0.7641234085984923, + "grad_norm": 1.3511875867843628, + "learning_rate": 2.7786389209281592e-05, + "loss": 1.0524, + "step": 21337 + }, + { + "epoch": 0.7641592207280606, + "grad_norm": 1.2410708665847778, + "learning_rate": 2.7778366066309326e-05, + "loss": 0.9521, + "step": 21338 + }, + { + "epoch": 0.7641950328576289, + "grad_norm": 1.4085334539413452, + "learning_rate": 2.7770343894985974e-05, + "loss": 1.0909, + "step": 21339 + }, + { + "epoch": 0.7642308449871972, + "grad_norm": 1.577879548072815, + "learning_rate": 2.7762322695419485e-05, + "loss": 1.364, + "step": 21340 + }, + { + "epoch": 0.7642666571167654, + "grad_norm": 1.9082294702529907, + "learning_rate": 2.7754302467717785e-05, + "loss": 1.2322, + "step": 21341 + }, + { + "epoch": 0.7643024692463337, + "grad_norm": 1.177156686782837, + "learning_rate": 2.7746283211988734e-05, + "loss": 1.1155, + "step": 21342 + }, + { + "epoch": 0.764338281375902, + "grad_norm": 1.3724784851074219, + "learning_rate": 2.773826492834023e-05, + "loss": 1.0101, + "step": 21343 + }, + { + "epoch": 0.7643740935054703, + "grad_norm": 1.6994755268096924, + "learning_rate": 2.77302476168802e-05, + "loss": 1.0027, + "step": 21344 + }, + { + "epoch": 0.7644099056350386, + "grad_norm": 1.748464584350586, + "learning_rate": 2.7722231277716437e-05, + "loss": 1.098, + "step": 21345 + }, + { + "epoch": 0.7644457177646069, + "grad_norm": 2.111222505569458, + "learning_rate": 2.771421591095682e-05, + "loss": 1.3145, + "step": 21346 + }, + { + "epoch": 0.7644815298941752, + "grad_norm": 1.716654896736145, + "learning_rate": 2.7706201516709175e-05, + "loss": 0.8315, + "step": 21347 + }, + { + "epoch": 0.7645173420237434, + "grad_norm": 1.5642993450164795, + "learning_rate": 2.769818809508138e-05, + "loss": 1.1475, + "step": 21348 + }, + { + "epoch": 0.7645531541533117, + "grad_norm": 1.4597012996673584, + "learning_rate": 2.769017564618117e-05, + "loss": 1.0369, + "step": 21349 + }, + { + "epoch": 0.76458896628288, + "grad_norm": 1.2993323802947998, + "learning_rate": 2.7682164170116365e-05, + "loss": 1.1956, + "step": 21350 + }, + { + "epoch": 0.7646247784124482, + "grad_norm": 1.5238927602767944, + "learning_rate": 2.767415366699476e-05, + "loss": 0.9397, + "step": 21351 + }, + { + "epoch": 0.7646605905420166, + "grad_norm": 1.2107008695602417, + "learning_rate": 2.7666144136924166e-05, + "loss": 1.0112, + "step": 21352 + }, + { + "epoch": 0.7646964026715849, + "grad_norm": 1.4473278522491455, + "learning_rate": 2.7658135580012256e-05, + "loss": 0.8813, + "step": 21353 + }, + { + "epoch": 0.7647322148011532, + "grad_norm": 1.548118233680725, + "learning_rate": 2.7650127996366826e-05, + "loss": 0.8867, + "step": 21354 + }, + { + "epoch": 0.7647680269307214, + "grad_norm": 1.5383000373840332, + "learning_rate": 2.764212138609562e-05, + "loss": 1.1112, + "step": 21355 + }, + { + "epoch": 0.7648038390602897, + "grad_norm": 1.7745636701583862, + "learning_rate": 2.7634115749306312e-05, + "loss": 1.0856, + "step": 21356 + }, + { + "epoch": 0.764839651189858, + "grad_norm": 1.521202802658081, + "learning_rate": 2.762611108610663e-05, + "loss": 0.9615, + "step": 21357 + }, + { + "epoch": 0.7648754633194262, + "grad_norm": 1.5458922386169434, + "learning_rate": 2.7618107396604263e-05, + "loss": 1.1685, + "step": 21358 + }, + { + "epoch": 0.7649112754489946, + "grad_norm": 1.3721524477005005, + "learning_rate": 2.7610104680906933e-05, + "loss": 1.1229, + "step": 21359 + }, + { + "epoch": 0.7649470875785629, + "grad_norm": 1.3564419746398926, + "learning_rate": 2.760210293912223e-05, + "loss": 1.3595, + "step": 21360 + }, + { + "epoch": 0.7649828997081312, + "grad_norm": 1.6869429349899292, + "learning_rate": 2.759410217135786e-05, + "loss": 1.0912, + "step": 21361 + }, + { + "epoch": 0.7650187118376994, + "grad_norm": 2.130244255065918, + "learning_rate": 2.7586102377721467e-05, + "loss": 0.969, + "step": 21362 + }, + { + "epoch": 0.7650545239672677, + "grad_norm": 1.32822585105896, + "learning_rate": 2.7578103558320623e-05, + "loss": 1.1189, + "step": 21363 + }, + { + "epoch": 0.765090336096836, + "grad_norm": 1.644812822341919, + "learning_rate": 2.7570105713262995e-05, + "loss": 1.176, + "step": 21364 + }, + { + "epoch": 0.7651261482264042, + "grad_norm": 1.9750404357910156, + "learning_rate": 2.7562108842656152e-05, + "loss": 0.9294, + "step": 21365 + }, + { + "epoch": 0.7651619603559726, + "grad_norm": 1.578244924545288, + "learning_rate": 2.7554112946607735e-05, + "loss": 1.0801, + "step": 21366 + }, + { + "epoch": 0.7651977724855409, + "grad_norm": 1.287043809890747, + "learning_rate": 2.7546118025225244e-05, + "loss": 1.0332, + "step": 21367 + }, + { + "epoch": 0.7652335846151092, + "grad_norm": 1.437324047088623, + "learning_rate": 2.7538124078616278e-05, + "loss": 1.0373, + "step": 21368 + }, + { + "epoch": 0.7652693967446774, + "grad_norm": 1.4171110391616821, + "learning_rate": 2.753013110688839e-05, + "loss": 0.9959, + "step": 21369 + }, + { + "epoch": 0.7653052088742457, + "grad_norm": 1.6505277156829834, + "learning_rate": 2.7522139110149125e-05, + "loss": 1.275, + "step": 21370 + }, + { + "epoch": 0.765341021003814, + "grad_norm": 1.5611670017242432, + "learning_rate": 2.7514148088505998e-05, + "loss": 1.1566, + "step": 21371 + }, + { + "epoch": 0.7653768331333822, + "grad_norm": 2.1370861530303955, + "learning_rate": 2.7506158042066454e-05, + "loss": 1.0963, + "step": 21372 + }, + { + "epoch": 0.7654126452629506, + "grad_norm": 1.5985513925552368, + "learning_rate": 2.74981689709381e-05, + "loss": 1.2454, + "step": 21373 + }, + { + "epoch": 0.7654484573925189, + "grad_norm": 1.1362299919128418, + "learning_rate": 2.749018087522832e-05, + "loss": 0.8502, + "step": 21374 + }, + { + "epoch": 0.7654842695220871, + "grad_norm": 1.5907633304595947, + "learning_rate": 2.7482193755044637e-05, + "loss": 1.1953, + "step": 21375 + }, + { + "epoch": 0.7655200816516554, + "grad_norm": 1.3492885828018188, + "learning_rate": 2.7474207610494495e-05, + "loss": 0.9832, + "step": 21376 + }, + { + "epoch": 0.7655558937812237, + "grad_norm": 1.2236543893814087, + "learning_rate": 2.7466222441685362e-05, + "loss": 0.8636, + "step": 21377 + }, + { + "epoch": 0.765591705910792, + "grad_norm": 1.5412685871124268, + "learning_rate": 2.7458238248724623e-05, + "loss": 1.0433, + "step": 21378 + }, + { + "epoch": 0.7656275180403602, + "grad_norm": 1.4557127952575684, + "learning_rate": 2.7450255031719707e-05, + "loss": 1.1494, + "step": 21379 + }, + { + "epoch": 0.7656633301699286, + "grad_norm": 1.3983172178268433, + "learning_rate": 2.7442272790778057e-05, + "loss": 0.931, + "step": 21380 + }, + { + "epoch": 0.7656991422994969, + "grad_norm": 1.5885791778564453, + "learning_rate": 2.7434291526007004e-05, + "loss": 1.1516, + "step": 21381 + }, + { + "epoch": 0.7657349544290651, + "grad_norm": 1.4838722944259644, + "learning_rate": 2.742631123751399e-05, + "loss": 1.0581, + "step": 21382 + }, + { + "epoch": 0.7657707665586334, + "grad_norm": 1.4048457145690918, + "learning_rate": 2.7418331925406293e-05, + "loss": 1.0834, + "step": 21383 + }, + { + "epoch": 0.7658065786882017, + "grad_norm": 1.4144399166107178, + "learning_rate": 2.741035358979136e-05, + "loss": 1.0883, + "step": 21384 + }, + { + "epoch": 0.76584239081777, + "grad_norm": 1.4799530506134033, + "learning_rate": 2.7402376230776473e-05, + "loss": 0.977, + "step": 21385 + }, + { + "epoch": 0.7658782029473382, + "grad_norm": 1.503700852394104, + "learning_rate": 2.7394399848468953e-05, + "loss": 1.0502, + "step": 21386 + }, + { + "epoch": 0.7659140150769066, + "grad_norm": 1.544055461883545, + "learning_rate": 2.7386424442976132e-05, + "loss": 1.0785, + "step": 21387 + }, + { + "epoch": 0.7659498272064749, + "grad_norm": 1.5007888078689575, + "learning_rate": 2.7378450014405342e-05, + "loss": 0.9716, + "step": 21388 + }, + { + "epoch": 0.7659856393360431, + "grad_norm": 1.565064549446106, + "learning_rate": 2.7370476562863835e-05, + "loss": 0.9416, + "step": 21389 + }, + { + "epoch": 0.7660214514656114, + "grad_norm": 1.5825109481811523, + "learning_rate": 2.7362504088458807e-05, + "loss": 1.0097, + "step": 21390 + }, + { + "epoch": 0.7660572635951797, + "grad_norm": 1.2767562866210938, + "learning_rate": 2.7354532591297666e-05, + "loss": 1.2536, + "step": 21391 + }, + { + "epoch": 0.7660930757247479, + "grad_norm": 1.5427364110946655, + "learning_rate": 2.7346562071487537e-05, + "loss": 1.0665, + "step": 21392 + }, + { + "epoch": 0.7661288878543162, + "grad_norm": 1.7125943899154663, + "learning_rate": 2.7338592529135744e-05, + "loss": 1.2394, + "step": 21393 + }, + { + "epoch": 0.7661646999838846, + "grad_norm": 1.3423001766204834, + "learning_rate": 2.7330623964349387e-05, + "loss": 1.191, + "step": 21394 + }, + { + "epoch": 0.7662005121134529, + "grad_norm": 1.8849170207977295, + "learning_rate": 2.732265637723582e-05, + "loss": 1.0787, + "step": 21395 + }, + { + "epoch": 0.7662363242430211, + "grad_norm": 1.3388006687164307, + "learning_rate": 2.7314689767902134e-05, + "loss": 1.0041, + "step": 21396 + }, + { + "epoch": 0.7662721363725894, + "grad_norm": 1.5320727825164795, + "learning_rate": 2.7306724136455564e-05, + "loss": 0.9803, + "step": 21397 + }, + { + "epoch": 0.7663079485021577, + "grad_norm": 1.4264581203460693, + "learning_rate": 2.7298759483003223e-05, + "loss": 0.8726, + "step": 21398 + }, + { + "epoch": 0.7663437606317259, + "grad_norm": 1.5143461227416992, + "learning_rate": 2.7290795807652305e-05, + "loss": 1.2763, + "step": 21399 + }, + { + "epoch": 0.7663795727612942, + "grad_norm": 1.4611690044403076, + "learning_rate": 2.7282833110509952e-05, + "loss": 1.1598, + "step": 21400 + }, + { + "epoch": 0.7664153848908626, + "grad_norm": 1.295344591140747, + "learning_rate": 2.7274871391683243e-05, + "loss": 0.8661, + "step": 21401 + }, + { + "epoch": 0.7664511970204309, + "grad_norm": 1.6046533584594727, + "learning_rate": 2.7266910651279376e-05, + "loss": 0.96, + "step": 21402 + }, + { + "epoch": 0.7664870091499991, + "grad_norm": 1.3037735223770142, + "learning_rate": 2.725895088940539e-05, + "loss": 1.0342, + "step": 21403 + }, + { + "epoch": 0.7665228212795674, + "grad_norm": 1.4953423738479614, + "learning_rate": 2.7250992106168406e-05, + "loss": 1.1722, + "step": 21404 + }, + { + "epoch": 0.7665586334091357, + "grad_norm": 1.2144083976745605, + "learning_rate": 2.724303430167543e-05, + "loss": 1.0344, + "step": 21405 + }, + { + "epoch": 0.7665944455387039, + "grad_norm": 1.6809308528900146, + "learning_rate": 2.7235077476033645e-05, + "loss": 1.0794, + "step": 21406 + }, + { + "epoch": 0.7666302576682722, + "grad_norm": 1.5638715028762817, + "learning_rate": 2.7227121629350016e-05, + "loss": 1.1235, + "step": 21407 + }, + { + "epoch": 0.7666660697978406, + "grad_norm": 1.7205517292022705, + "learning_rate": 2.7219166761731585e-05, + "loss": 1.4073, + "step": 21408 + }, + { + "epoch": 0.7667018819274088, + "grad_norm": 1.8752977848052979, + "learning_rate": 2.7211212873285376e-05, + "loss": 1.1928, + "step": 21409 + }, + { + "epoch": 0.7667376940569771, + "grad_norm": 1.757399559020996, + "learning_rate": 2.72032599641184e-05, + "loss": 0.9846, + "step": 21410 + }, + { + "epoch": 0.7667735061865454, + "grad_norm": 1.8824485540390015, + "learning_rate": 2.7195308034337698e-05, + "loss": 1.0681, + "step": 21411 + }, + { + "epoch": 0.7668093183161137, + "grad_norm": 1.5739073753356934, + "learning_rate": 2.7187357084050147e-05, + "loss": 0.9524, + "step": 21412 + }, + { + "epoch": 0.7668451304456819, + "grad_norm": 1.8968887329101562, + "learning_rate": 2.7179407113362853e-05, + "loss": 1.2133, + "step": 21413 + }, + { + "epoch": 0.7668809425752502, + "grad_norm": 1.8976247310638428, + "learning_rate": 2.7171458122382675e-05, + "loss": 1.009, + "step": 21414 + }, + { + "epoch": 0.7669167547048186, + "grad_norm": 1.6982570886611938, + "learning_rate": 2.7163510111216618e-05, + "loss": 1.0883, + "step": 21415 + }, + { + "epoch": 0.7669525668343868, + "grad_norm": 1.7526087760925293, + "learning_rate": 2.7155563079971535e-05, + "loss": 1.2038, + "step": 21416 + }, + { + "epoch": 0.7669883789639551, + "grad_norm": 1.537285566329956, + "learning_rate": 2.71476170287544e-05, + "loss": 1.0235, + "step": 21417 + }, + { + "epoch": 0.7670241910935234, + "grad_norm": 1.4665330648422241, + "learning_rate": 2.713967195767214e-05, + "loss": 1.0642, + "step": 21418 + }, + { + "epoch": 0.7670600032230916, + "grad_norm": 1.21971595287323, + "learning_rate": 2.713172786683157e-05, + "loss": 1.056, + "step": 21419 + }, + { + "epoch": 0.7670958153526599, + "grad_norm": 1.389272689819336, + "learning_rate": 2.712378475633961e-05, + "loss": 1.1486, + "step": 21420 + }, + { + "epoch": 0.7671316274822282, + "grad_norm": 1.390775442123413, + "learning_rate": 2.7115842626303134e-05, + "loss": 1.064, + "step": 21421 + }, + { + "epoch": 0.7671674396117966, + "grad_norm": 1.759186863899231, + "learning_rate": 2.7107901476829e-05, + "loss": 1.1522, + "step": 21422 + }, + { + "epoch": 0.7672032517413648, + "grad_norm": 3.3899989128112793, + "learning_rate": 2.7099961308024004e-05, + "loss": 0.9039, + "step": 21423 + }, + { + "epoch": 0.7672390638709331, + "grad_norm": 1.7573760747909546, + "learning_rate": 2.7092022119994988e-05, + "loss": 0.978, + "step": 21424 + }, + { + "epoch": 0.7672748760005014, + "grad_norm": 1.0743532180786133, + "learning_rate": 2.70840839128488e-05, + "loss": 0.9366, + "step": 21425 + }, + { + "epoch": 0.7673106881300696, + "grad_norm": 1.4183357954025269, + "learning_rate": 2.7076146686692184e-05, + "loss": 1.1338, + "step": 21426 + }, + { + "epoch": 0.7673465002596379, + "grad_norm": 1.6340817213058472, + "learning_rate": 2.7068210441631947e-05, + "loss": 1.0584, + "step": 21427 + }, + { + "epoch": 0.7673823123892062, + "grad_norm": 1.5489909648895264, + "learning_rate": 2.7060275177774862e-05, + "loss": 1.0654, + "step": 21428 + }, + { + "epoch": 0.7674181245187746, + "grad_norm": 1.4329986572265625, + "learning_rate": 2.7052340895227714e-05, + "loss": 1.2172, + "step": 21429 + }, + { + "epoch": 0.7674539366483428, + "grad_norm": 1.7014166116714478, + "learning_rate": 2.7044407594097197e-05, + "loss": 1.261, + "step": 21430 + }, + { + "epoch": 0.7674897487779111, + "grad_norm": 1.4894353151321411, + "learning_rate": 2.703647527449007e-05, + "loss": 0.9423, + "step": 21431 + }, + { + "epoch": 0.7675255609074794, + "grad_norm": 1.4111403226852417, + "learning_rate": 2.7028543936513086e-05, + "loss": 0.9305, + "step": 21432 + }, + { + "epoch": 0.7675613730370476, + "grad_norm": 1.4976102113723755, + "learning_rate": 2.7020613580272893e-05, + "loss": 1.088, + "step": 21433 + }, + { + "epoch": 0.7675971851666159, + "grad_norm": 1.6368590593338013, + "learning_rate": 2.7012684205876192e-05, + "loss": 0.971, + "step": 21434 + }, + { + "epoch": 0.7676329972961842, + "grad_norm": 1.4267395734786987, + "learning_rate": 2.7004755813429683e-05, + "loss": 1.1424, + "step": 21435 + }, + { + "epoch": 0.7676688094257526, + "grad_norm": 1.4905422925949097, + "learning_rate": 2.6996828403040064e-05, + "loss": 1.0983, + "step": 21436 + }, + { + "epoch": 0.7677046215553208, + "grad_norm": 1.3183722496032715, + "learning_rate": 2.698890197481392e-05, + "loss": 1.1078, + "step": 21437 + }, + { + "epoch": 0.7677404336848891, + "grad_norm": 1.9634287357330322, + "learning_rate": 2.6980976528857915e-05, + "loss": 1.118, + "step": 21438 + }, + { + "epoch": 0.7677762458144574, + "grad_norm": 1.353740930557251, + "learning_rate": 2.697305206527869e-05, + "loss": 1.0343, + "step": 21439 + }, + { + "epoch": 0.7678120579440256, + "grad_norm": 1.605263352394104, + "learning_rate": 2.6965128584182886e-05, + "loss": 0.9916, + "step": 21440 + }, + { + "epoch": 0.7678478700735939, + "grad_norm": 1.541161060333252, + "learning_rate": 2.6957206085677023e-05, + "loss": 1.233, + "step": 21441 + }, + { + "epoch": 0.7678836822031622, + "grad_norm": 1.6033269166946411, + "learning_rate": 2.694928456986775e-05, + "loss": 1.2368, + "step": 21442 + }, + { + "epoch": 0.7679194943327305, + "grad_norm": 1.3268800973892212, + "learning_rate": 2.6941364036861638e-05, + "loss": 0.8562, + "step": 21443 + }, + { + "epoch": 0.7679553064622988, + "grad_norm": 1.5460799932479858, + "learning_rate": 2.6933444486765212e-05, + "loss": 0.9676, + "step": 21444 + }, + { + "epoch": 0.7679911185918671, + "grad_norm": 1.6085577011108398, + "learning_rate": 2.6925525919685047e-05, + "loss": 1.2667, + "step": 21445 + }, + { + "epoch": 0.7680269307214354, + "grad_norm": 1.62407648563385, + "learning_rate": 2.6917608335727675e-05, + "loss": 1.1461, + "step": 21446 + }, + { + "epoch": 0.7680627428510036, + "grad_norm": 1.2980552911758423, + "learning_rate": 2.6909691734999633e-05, + "loss": 0.9976, + "step": 21447 + }, + { + "epoch": 0.7680985549805719, + "grad_norm": 1.2965922355651855, + "learning_rate": 2.690177611760738e-05, + "loss": 1.115, + "step": 21448 + }, + { + "epoch": 0.7681343671101402, + "grad_norm": 1.4966497421264648, + "learning_rate": 2.6893861483657436e-05, + "loss": 1.1233, + "step": 21449 + }, + { + "epoch": 0.7681701792397085, + "grad_norm": 1.132442831993103, + "learning_rate": 2.688594783325632e-05, + "loss": 1.0226, + "step": 21450 + }, + { + "epoch": 0.7682059913692768, + "grad_norm": 1.7470567226409912, + "learning_rate": 2.687803516651044e-05, + "loss": 1.0263, + "step": 21451 + }, + { + "epoch": 0.7682418034988451, + "grad_norm": 1.3807247877120972, + "learning_rate": 2.6870123483526276e-05, + "loss": 1.0288, + "step": 21452 + }, + { + "epoch": 0.7682776156284133, + "grad_norm": 1.6658927202224731, + "learning_rate": 2.6862212784410258e-05, + "loss": 1.0922, + "step": 21453 + }, + { + "epoch": 0.7683134277579816, + "grad_norm": 1.3044610023498535, + "learning_rate": 2.685430306926887e-05, + "loss": 1.0887, + "step": 21454 + }, + { + "epoch": 0.7683492398875499, + "grad_norm": 1.5388261079788208, + "learning_rate": 2.6846394338208446e-05, + "loss": 1.071, + "step": 21455 + }, + { + "epoch": 0.7683850520171182, + "grad_norm": 1.8941713571548462, + "learning_rate": 2.683848659133542e-05, + "loss": 1.1817, + "step": 21456 + }, + { + "epoch": 0.7684208641466865, + "grad_norm": 1.5702030658721924, + "learning_rate": 2.68305798287562e-05, + "loss": 1.0525, + "step": 21457 + }, + { + "epoch": 0.7684566762762548, + "grad_norm": 1.4922939538955688, + "learning_rate": 2.682267405057717e-05, + "loss": 1.0494, + "step": 21458 + }, + { + "epoch": 0.7684924884058231, + "grad_norm": 2.2517268657684326, + "learning_rate": 2.6814769256904627e-05, + "loss": 1.0481, + "step": 21459 + }, + { + "epoch": 0.7685283005353913, + "grad_norm": 1.3988516330718994, + "learning_rate": 2.6806865447844974e-05, + "loss": 1.0575, + "step": 21460 + }, + { + "epoch": 0.7685641126649596, + "grad_norm": 1.6797561645507812, + "learning_rate": 2.6798962623504566e-05, + "loss": 1.261, + "step": 21461 + }, + { + "epoch": 0.7685999247945279, + "grad_norm": 1.4934254884719849, + "learning_rate": 2.6791060783989653e-05, + "loss": 0.9383, + "step": 21462 + }, + { + "epoch": 0.7686357369240961, + "grad_norm": 1.2352962493896484, + "learning_rate": 2.678315992940659e-05, + "loss": 1.0222, + "step": 21463 + }, + { + "epoch": 0.7686715490536645, + "grad_norm": 1.350996732711792, + "learning_rate": 2.6775260059861673e-05, + "loss": 1.0843, + "step": 21464 + }, + { + "epoch": 0.7687073611832328, + "grad_norm": 1.3285233974456787, + "learning_rate": 2.6767361175461202e-05, + "loss": 0.9874, + "step": 21465 + }, + { + "epoch": 0.7687431733128011, + "grad_norm": 1.4105957746505737, + "learning_rate": 2.6759463276311393e-05, + "loss": 0.9003, + "step": 21466 + }, + { + "epoch": 0.7687789854423693, + "grad_norm": 1.5161303281784058, + "learning_rate": 2.675156636251853e-05, + "loss": 1.0232, + "step": 21467 + }, + { + "epoch": 0.7688147975719376, + "grad_norm": 1.7392666339874268, + "learning_rate": 2.6743670434188893e-05, + "loss": 1.1108, + "step": 21468 + }, + { + "epoch": 0.7688506097015059, + "grad_norm": 1.6781227588653564, + "learning_rate": 2.673577549142864e-05, + "loss": 1.0698, + "step": 21469 + }, + { + "epoch": 0.7688864218310741, + "grad_norm": 1.8814729452133179, + "learning_rate": 2.6727881534344057e-05, + "loss": 1.2499, + "step": 21470 + }, + { + "epoch": 0.7689222339606425, + "grad_norm": 1.3048813343048096, + "learning_rate": 2.6719988563041264e-05, + "loss": 1.1145, + "step": 21471 + }, + { + "epoch": 0.7689580460902108, + "grad_norm": 1.4139580726623535, + "learning_rate": 2.6712096577626543e-05, + "loss": 1.055, + "step": 21472 + }, + { + "epoch": 0.7689938582197791, + "grad_norm": 1.6252870559692383, + "learning_rate": 2.670420557820601e-05, + "loss": 1.349, + "step": 21473 + }, + { + "epoch": 0.7690296703493473, + "grad_norm": 1.3577765226364136, + "learning_rate": 2.6696315564885844e-05, + "loss": 1.2065, + "step": 21474 + }, + { + "epoch": 0.7690654824789156, + "grad_norm": 1.2843213081359863, + "learning_rate": 2.6688426537772194e-05, + "loss": 1.1416, + "step": 21475 + }, + { + "epoch": 0.7691012946084839, + "grad_norm": 1.6504063606262207, + "learning_rate": 2.668053849697123e-05, + "loss": 1.0757, + "step": 21476 + }, + { + "epoch": 0.7691371067380521, + "grad_norm": 1.473122477531433, + "learning_rate": 2.6672651442589046e-05, + "loss": 1.1369, + "step": 21477 + }, + { + "epoch": 0.7691729188676205, + "grad_norm": 1.2356356382369995, + "learning_rate": 2.6664765374731693e-05, + "loss": 0.9009, + "step": 21478 + }, + { + "epoch": 0.7692087309971888, + "grad_norm": 1.5479670763015747, + "learning_rate": 2.665688029350538e-05, + "loss": 1.1621, + "step": 21479 + }, + { + "epoch": 0.769244543126757, + "grad_norm": 1.4303512573242188, + "learning_rate": 2.6648996199016118e-05, + "loss": 1.1727, + "step": 21480 + }, + { + "epoch": 0.7692803552563253, + "grad_norm": 1.779380202293396, + "learning_rate": 2.6641113091370017e-05, + "loss": 1.1336, + "step": 21481 + }, + { + "epoch": 0.7693161673858936, + "grad_norm": 2.358880043029785, + "learning_rate": 2.6633230970673062e-05, + "loss": 1.333, + "step": 21482 + }, + { + "epoch": 0.7693519795154619, + "grad_norm": 1.5934162139892578, + "learning_rate": 2.66253498370314e-05, + "loss": 1.1008, + "step": 21483 + }, + { + "epoch": 0.7693877916450301, + "grad_norm": 1.3428068161010742, + "learning_rate": 2.661746969055098e-05, + "loss": 1.0388, + "step": 21484 + }, + { + "epoch": 0.7694236037745985, + "grad_norm": 1.3417927026748657, + "learning_rate": 2.660959053133786e-05, + "loss": 1.0682, + "step": 21485 + }, + { + "epoch": 0.7694594159041668, + "grad_norm": 1.3211129903793335, + "learning_rate": 2.6601712359498045e-05, + "loss": 1.1843, + "step": 21486 + }, + { + "epoch": 0.769495228033735, + "grad_norm": 1.2705423831939697, + "learning_rate": 2.6593835175137494e-05, + "loss": 0.9495, + "step": 21487 + }, + { + "epoch": 0.7695310401633033, + "grad_norm": 1.6650714874267578, + "learning_rate": 2.6585958978362235e-05, + "loss": 1.0823, + "step": 21488 + }, + { + "epoch": 0.7695668522928716, + "grad_norm": 1.5494948625564575, + "learning_rate": 2.6578083769278127e-05, + "loss": 1.0025, + "step": 21489 + }, + { + "epoch": 0.7696026644224399, + "grad_norm": 2.1176280975341797, + "learning_rate": 2.6570209547991265e-05, + "loss": 1.233, + "step": 21490 + }, + { + "epoch": 0.7696384765520081, + "grad_norm": 2.2156147956848145, + "learning_rate": 2.6562336314607484e-05, + "loss": 1.0474, + "step": 21491 + }, + { + "epoch": 0.7696742886815765, + "grad_norm": 1.575493574142456, + "learning_rate": 2.6554464069232776e-05, + "loss": 1.138, + "step": 21492 + }, + { + "epoch": 0.7697101008111448, + "grad_norm": 1.7375836372375488, + "learning_rate": 2.6546592811972948e-05, + "loss": 1.1384, + "step": 21493 + }, + { + "epoch": 0.769745912940713, + "grad_norm": 2.033195972442627, + "learning_rate": 2.6538722542934035e-05, + "loss": 1.2178, + "step": 21494 + }, + { + "epoch": 0.7697817250702813, + "grad_norm": 1.5180022716522217, + "learning_rate": 2.6530853262221843e-05, + "loss": 1.0366, + "step": 21495 + }, + { + "epoch": 0.7698175371998496, + "grad_norm": 1.3979580402374268, + "learning_rate": 2.652298496994222e-05, + "loss": 1.0487, + "step": 21496 + }, + { + "epoch": 0.7698533493294178, + "grad_norm": 1.8381977081298828, + "learning_rate": 2.6515117666201062e-05, + "loss": 1.2081, + "step": 21497 + }, + { + "epoch": 0.7698891614589861, + "grad_norm": 1.5180180072784424, + "learning_rate": 2.6507251351104212e-05, + "loss": 0.9344, + "step": 21498 + }, + { + "epoch": 0.7699249735885545, + "grad_norm": 1.3176244497299194, + "learning_rate": 2.649938602475751e-05, + "loss": 0.8336, + "step": 21499 + }, + { + "epoch": 0.7699607857181228, + "grad_norm": 1.5867568254470825, + "learning_rate": 2.6491521687266717e-05, + "loss": 1.0918, + "step": 21500 + }, + { + "epoch": 0.769996597847691, + "grad_norm": 1.8013360500335693, + "learning_rate": 2.6483658338737726e-05, + "loss": 1.3233, + "step": 21501 + }, + { + "epoch": 0.7700324099772593, + "grad_norm": 1.5587438344955444, + "learning_rate": 2.6475795979276262e-05, + "loss": 1.1653, + "step": 21502 + }, + { + "epoch": 0.7700682221068276, + "grad_norm": 1.4978599548339844, + "learning_rate": 2.6467934608988155e-05, + "loss": 1.1608, + "step": 21503 + }, + { + "epoch": 0.7701040342363958, + "grad_norm": 1.6971063613891602, + "learning_rate": 2.6460074227979104e-05, + "loss": 1.1452, + "step": 21504 + }, + { + "epoch": 0.7701398463659641, + "grad_norm": 1.4470142126083374, + "learning_rate": 2.6452214836354893e-05, + "loss": 1.3288, + "step": 21505 + }, + { + "epoch": 0.7701756584955325, + "grad_norm": 1.6873223781585693, + "learning_rate": 2.6444356434221296e-05, + "loss": 1.1426, + "step": 21506 + }, + { + "epoch": 0.7702114706251008, + "grad_norm": 1.8348851203918457, + "learning_rate": 2.643649902168397e-05, + "loss": 1.1053, + "step": 21507 + }, + { + "epoch": 0.770247282754669, + "grad_norm": 1.2165498733520508, + "learning_rate": 2.6428642598848663e-05, + "loss": 1.0717, + "step": 21508 + }, + { + "epoch": 0.7702830948842373, + "grad_norm": 1.5839539766311646, + "learning_rate": 2.642078716582107e-05, + "loss": 1.0863, + "step": 21509 + }, + { + "epoch": 0.7703189070138056, + "grad_norm": 1.8466277122497559, + "learning_rate": 2.6412932722706908e-05, + "loss": 1.0659, + "step": 21510 + }, + { + "epoch": 0.7703547191433738, + "grad_norm": 1.513825535774231, + "learning_rate": 2.6405079269611744e-05, + "loss": 1.0116, + "step": 21511 + }, + { + "epoch": 0.7703905312729421, + "grad_norm": 1.2806596755981445, + "learning_rate": 2.6397226806641375e-05, + "loss": 1.0474, + "step": 21512 + }, + { + "epoch": 0.7704263434025105, + "grad_norm": 1.368958830833435, + "learning_rate": 2.6389375333901377e-05, + "loss": 1.0229, + "step": 21513 + }, + { + "epoch": 0.7704621555320788, + "grad_norm": 1.3961658477783203, + "learning_rate": 2.6381524851497353e-05, + "loss": 1.0789, + "step": 21514 + }, + { + "epoch": 0.770497967661647, + "grad_norm": 1.622068166732788, + "learning_rate": 2.6373675359534955e-05, + "loss": 1.1437, + "step": 21515 + }, + { + "epoch": 0.7705337797912153, + "grad_norm": 1.2912416458129883, + "learning_rate": 2.636582685811978e-05, + "loss": 1.0357, + "step": 21516 + }, + { + "epoch": 0.7705695919207836, + "grad_norm": 1.2332346439361572, + "learning_rate": 2.6357979347357454e-05, + "loss": 0.9991, + "step": 21517 + }, + { + "epoch": 0.7706054040503518, + "grad_norm": 1.4168627262115479, + "learning_rate": 2.635013282735349e-05, + "loss": 1.2249, + "step": 21518 + }, + { + "epoch": 0.7706412161799201, + "grad_norm": 1.5714457035064697, + "learning_rate": 2.63422872982135e-05, + "loss": 0.9953, + "step": 21519 + }, + { + "epoch": 0.7706770283094885, + "grad_norm": 1.2504799365997314, + "learning_rate": 2.6334442760043044e-05, + "loss": 1.0448, + "step": 21520 + }, + { + "epoch": 0.7707128404390567, + "grad_norm": 1.5184288024902344, + "learning_rate": 2.632659921294761e-05, + "loss": 1.0963, + "step": 21521 + }, + { + "epoch": 0.770748652568625, + "grad_norm": 1.4867810010910034, + "learning_rate": 2.631875665703275e-05, + "loss": 1.0414, + "step": 21522 + }, + { + "epoch": 0.7707844646981933, + "grad_norm": 1.447638750076294, + "learning_rate": 2.6310915092403976e-05, + "loss": 0.9965, + "step": 21523 + }, + { + "epoch": 0.7708202768277616, + "grad_norm": 1.4234827756881714, + "learning_rate": 2.6303074519166827e-05, + "loss": 1.1399, + "step": 21524 + }, + { + "epoch": 0.7708560889573298, + "grad_norm": 1.5311522483825684, + "learning_rate": 2.6295234937426706e-05, + "loss": 1.015, + "step": 21525 + }, + { + "epoch": 0.7708919010868981, + "grad_norm": 1.2698071002960205, + "learning_rate": 2.628739634728914e-05, + "loss": 0.8704, + "step": 21526 + }, + { + "epoch": 0.7709277132164665, + "grad_norm": 1.7565339803695679, + "learning_rate": 2.6279558748859555e-05, + "loss": 1.2126, + "step": 21527 + }, + { + "epoch": 0.7709635253460347, + "grad_norm": 1.4084817171096802, + "learning_rate": 2.627172214224346e-05, + "loss": 1.1361, + "step": 21528 + }, + { + "epoch": 0.770999337475603, + "grad_norm": 1.5950405597686768, + "learning_rate": 2.626388652754621e-05, + "loss": 1.2271, + "step": 21529 + }, + { + "epoch": 0.7710351496051713, + "grad_norm": 1.6480212211608887, + "learning_rate": 2.6256051904873246e-05, + "loss": 0.8797, + "step": 21530 + }, + { + "epoch": 0.7710709617347395, + "grad_norm": 1.99957275390625, + "learning_rate": 2.6248218274330017e-05, + "loss": 1.1695, + "step": 21531 + }, + { + "epoch": 0.7711067738643078, + "grad_norm": 1.5938695669174194, + "learning_rate": 2.6240385636021847e-05, + "loss": 1.1403, + "step": 21532 + }, + { + "epoch": 0.7711425859938761, + "grad_norm": 1.9331296682357788, + "learning_rate": 2.6232553990054144e-05, + "loss": 1.0925, + "step": 21533 + }, + { + "epoch": 0.7711783981234445, + "grad_norm": 1.5317106246948242, + "learning_rate": 2.6224723336532274e-05, + "loss": 0.9861, + "step": 21534 + }, + { + "epoch": 0.7712142102530127, + "grad_norm": 1.5120759010314941, + "learning_rate": 2.6216893675561617e-05, + "loss": 1.1694, + "step": 21535 + }, + { + "epoch": 0.771250022382581, + "grad_norm": 1.2888000011444092, + "learning_rate": 2.6209065007247458e-05, + "loss": 1.0261, + "step": 21536 + }, + { + "epoch": 0.7712858345121493, + "grad_norm": 1.3054234981536865, + "learning_rate": 2.6201237331695138e-05, + "loss": 1.1131, + "step": 21537 + }, + { + "epoch": 0.7713216466417175, + "grad_norm": 1.5214698314666748, + "learning_rate": 2.619341064901001e-05, + "loss": 1.0006, + "step": 21538 + }, + { + "epoch": 0.7713574587712858, + "grad_norm": 1.6342947483062744, + "learning_rate": 2.6185584959297303e-05, + "loss": 1.1495, + "step": 21539 + }, + { + "epoch": 0.7713932709008541, + "grad_norm": 1.4813246726989746, + "learning_rate": 2.6177760262662345e-05, + "loss": 0.9855, + "step": 21540 + }, + { + "epoch": 0.7714290830304225, + "grad_norm": 2.0448174476623535, + "learning_rate": 2.6169936559210396e-05, + "loss": 1.1126, + "step": 21541 + }, + { + "epoch": 0.7714648951599907, + "grad_norm": 1.3422482013702393, + "learning_rate": 2.6162113849046745e-05, + "loss": 1.196, + "step": 21542 + }, + { + "epoch": 0.771500707289559, + "grad_norm": 1.4530779123306274, + "learning_rate": 2.615429213227658e-05, + "loss": 1.2693, + "step": 21543 + }, + { + "epoch": 0.7715365194191273, + "grad_norm": 1.424881100654602, + "learning_rate": 2.6146471409005158e-05, + "loss": 1.1765, + "step": 21544 + }, + { + "epoch": 0.7715723315486955, + "grad_norm": 1.6942204236984253, + "learning_rate": 2.61386516793377e-05, + "loss": 1.1089, + "step": 21545 + }, + { + "epoch": 0.7716081436782638, + "grad_norm": 1.5277531147003174, + "learning_rate": 2.6130832943379447e-05, + "loss": 1.2076, + "step": 21546 + }, + { + "epoch": 0.7716439558078321, + "grad_norm": 1.3819772005081177, + "learning_rate": 2.612301520123551e-05, + "loss": 0.7978, + "step": 21547 + }, + { + "epoch": 0.7716797679374005, + "grad_norm": 1.8736317157745361, + "learning_rate": 2.6115198453011114e-05, + "loss": 1.0868, + "step": 21548 + }, + { + "epoch": 0.7717155800669687, + "grad_norm": 1.3883066177368164, + "learning_rate": 2.6107382698811446e-05, + "loss": 1.0972, + "step": 21549 + }, + { + "epoch": 0.771751392196537, + "grad_norm": 1.7514652013778687, + "learning_rate": 2.609956793874161e-05, + "loss": 1.2045, + "step": 21550 + }, + { + "epoch": 0.7717872043261053, + "grad_norm": 1.8543927669525146, + "learning_rate": 2.6091754172906747e-05, + "loss": 1.0931, + "step": 21551 + }, + { + "epoch": 0.7718230164556735, + "grad_norm": 1.379164457321167, + "learning_rate": 2.6083941401412005e-05, + "loss": 1.103, + "step": 21552 + }, + { + "epoch": 0.7718588285852418, + "grad_norm": 1.2568310499191284, + "learning_rate": 2.6076129624362512e-05, + "loss": 1.0277, + "step": 21553 + }, + { + "epoch": 0.7718946407148101, + "grad_norm": 1.3528518676757812, + "learning_rate": 2.6068318841863314e-05, + "loss": 0.9029, + "step": 21554 + }, + { + "epoch": 0.7719304528443784, + "grad_norm": 1.272897720336914, + "learning_rate": 2.6060509054019523e-05, + "loss": 0.8634, + "step": 21555 + }, + { + "epoch": 0.7719662649739467, + "grad_norm": 1.3407440185546875, + "learning_rate": 2.6052700260936237e-05, + "loss": 0.9192, + "step": 21556 + }, + { + "epoch": 0.772002077103515, + "grad_norm": 1.6867990493774414, + "learning_rate": 2.604489246271845e-05, + "loss": 1.1837, + "step": 21557 + }, + { + "epoch": 0.7720378892330833, + "grad_norm": 1.4689334630966187, + "learning_rate": 2.6037085659471237e-05, + "loss": 1.1254, + "step": 21558 + }, + { + "epoch": 0.7720737013626515, + "grad_norm": 1.510690689086914, + "learning_rate": 2.6029279851299636e-05, + "loss": 0.9605, + "step": 21559 + }, + { + "epoch": 0.7721095134922198, + "grad_norm": 1.2511765956878662, + "learning_rate": 2.6021475038308694e-05, + "loss": 1.1742, + "step": 21560 + }, + { + "epoch": 0.7721453256217881, + "grad_norm": 1.2944161891937256, + "learning_rate": 2.6013671220603343e-05, + "loss": 1.2167, + "step": 21561 + }, + { + "epoch": 0.7721811377513564, + "grad_norm": 1.8135616779327393, + "learning_rate": 2.6005868398288614e-05, + "loss": 0.9248, + "step": 21562 + }, + { + "epoch": 0.7722169498809247, + "grad_norm": 1.8801772594451904, + "learning_rate": 2.5998066571469482e-05, + "loss": 1.2333, + "step": 21563 + }, + { + "epoch": 0.772252762010493, + "grad_norm": 1.7356616258621216, + "learning_rate": 2.599026574025093e-05, + "loss": 1.1019, + "step": 21564 + }, + { + "epoch": 0.7722885741400612, + "grad_norm": 1.4347113370895386, + "learning_rate": 2.5982465904737895e-05, + "loss": 1.0373, + "step": 21565 + }, + { + "epoch": 0.7723243862696295, + "grad_norm": 1.4274253845214844, + "learning_rate": 2.597466706503524e-05, + "loss": 1.2532, + "step": 21566 + }, + { + "epoch": 0.7723601983991978, + "grad_norm": 1.5646339654922485, + "learning_rate": 2.5966869221248013e-05, + "loss": 1.1263, + "step": 21567 + }, + { + "epoch": 0.772396010528766, + "grad_norm": 1.5191723108291626, + "learning_rate": 2.595907237348104e-05, + "loss": 1.0979, + "step": 21568 + }, + { + "epoch": 0.7724318226583344, + "grad_norm": 1.3890283107757568, + "learning_rate": 2.595127652183924e-05, + "loss": 0.9622, + "step": 21569 + }, + { + "epoch": 0.7724676347879027, + "grad_norm": 1.516605257987976, + "learning_rate": 2.5943481666427506e-05, + "loss": 0.9544, + "step": 21570 + }, + { + "epoch": 0.772503446917471, + "grad_norm": 1.4833060503005981, + "learning_rate": 2.5935687807350718e-05, + "loss": 1.2531, + "step": 21571 + }, + { + "epoch": 0.7725392590470392, + "grad_norm": 1.2167001962661743, + "learning_rate": 2.5927894944713695e-05, + "loss": 0.8816, + "step": 21572 + }, + { + "epoch": 0.7725750711766075, + "grad_norm": 1.911096215248108, + "learning_rate": 2.5920103078621294e-05, + "loss": 1.118, + "step": 21573 + }, + { + "epoch": 0.7726108833061758, + "grad_norm": 1.7941721677780151, + "learning_rate": 2.591231220917837e-05, + "loss": 1.1563, + "step": 21574 + }, + { + "epoch": 0.772646695435744, + "grad_norm": 1.5556517839431763, + "learning_rate": 2.59045223364897e-05, + "loss": 1.1325, + "step": 21575 + }, + { + "epoch": 0.7726825075653124, + "grad_norm": 1.3842062950134277, + "learning_rate": 2.5896733460660138e-05, + "loss": 1.161, + "step": 21576 + }, + { + "epoch": 0.7727183196948807, + "grad_norm": 1.3634830713272095, + "learning_rate": 2.5888945581794377e-05, + "loss": 1.101, + "step": 21577 + }, + { + "epoch": 0.772754131824449, + "grad_norm": 1.538244605064392, + "learning_rate": 2.5881158699997322e-05, + "loss": 1.0401, + "step": 21578 + }, + { + "epoch": 0.7727899439540172, + "grad_norm": 1.4121953248977661, + "learning_rate": 2.5873372815373633e-05, + "loss": 1.0959, + "step": 21579 + }, + { + "epoch": 0.7728257560835855, + "grad_norm": 1.174768328666687, + "learning_rate": 2.5865587928028124e-05, + "loss": 0.8058, + "step": 21580 + }, + { + "epoch": 0.7728615682131538, + "grad_norm": 1.2994368076324463, + "learning_rate": 2.5857804038065446e-05, + "loss": 0.9538, + "step": 21581 + }, + { + "epoch": 0.772897380342722, + "grad_norm": 1.427832007408142, + "learning_rate": 2.585002114559044e-05, + "loss": 1.1044, + "step": 21582 + }, + { + "epoch": 0.7729331924722904, + "grad_norm": 1.299059271812439, + "learning_rate": 2.5842239250707757e-05, + "loss": 0.9205, + "step": 21583 + }, + { + "epoch": 0.7729690046018587, + "grad_norm": 1.5102437734603882, + "learning_rate": 2.5834458353522018e-05, + "loss": 1.1498, + "step": 21584 + }, + { + "epoch": 0.773004816731427, + "grad_norm": 1.674599289894104, + "learning_rate": 2.5826678454138044e-05, + "loss": 1.1758, + "step": 21585 + }, + { + "epoch": 0.7730406288609952, + "grad_norm": 1.4337506294250488, + "learning_rate": 2.5818899552660404e-05, + "loss": 1.158, + "step": 21586 + }, + { + "epoch": 0.7730764409905635, + "grad_norm": 1.4913978576660156, + "learning_rate": 2.5811121649193805e-05, + "loss": 1.0412, + "step": 21587 + }, + { + "epoch": 0.7731122531201318, + "grad_norm": 1.7006813287734985, + "learning_rate": 2.5803344743842817e-05, + "loss": 0.9807, + "step": 21588 + }, + { + "epoch": 0.7731480652497, + "grad_norm": 1.284105658531189, + "learning_rate": 2.579556883671217e-05, + "loss": 1.174, + "step": 21589 + }, + { + "epoch": 0.7731838773792683, + "grad_norm": 1.7379138469696045, + "learning_rate": 2.578779392790641e-05, + "loss": 1.0974, + "step": 21590 + }, + { + "epoch": 0.7732196895088367, + "grad_norm": 1.4394527673721313, + "learning_rate": 2.5780020017530182e-05, + "loss": 1.0271, + "step": 21591 + }, + { + "epoch": 0.773255501638405, + "grad_norm": 2.3443379402160645, + "learning_rate": 2.5772247105688006e-05, + "loss": 0.9674, + "step": 21592 + }, + { + "epoch": 0.7732913137679732, + "grad_norm": 1.3584240674972534, + "learning_rate": 2.5764475192484506e-05, + "loss": 1.061, + "step": 21593 + }, + { + "epoch": 0.7733271258975415, + "grad_norm": 1.4509284496307373, + "learning_rate": 2.5756704278024268e-05, + "loss": 0.9339, + "step": 21594 + }, + { + "epoch": 0.7733629380271098, + "grad_norm": 1.4172332286834717, + "learning_rate": 2.5748934362411747e-05, + "loss": 1.0054, + "step": 21595 + }, + { + "epoch": 0.773398750156678, + "grad_norm": 1.6039371490478516, + "learning_rate": 2.574116544575159e-05, + "loss": 1.125, + "step": 21596 + }, + { + "epoch": 0.7734345622862463, + "grad_norm": 1.223988652229309, + "learning_rate": 2.573339752814825e-05, + "loss": 0.857, + "step": 21597 + }, + { + "epoch": 0.7734703744158147, + "grad_norm": 2.311772108078003, + "learning_rate": 2.5725630609706264e-05, + "loss": 1.2624, + "step": 21598 + }, + { + "epoch": 0.773506186545383, + "grad_norm": 1.3886902332305908, + "learning_rate": 2.571786469053006e-05, + "loss": 1.2902, + "step": 21599 + }, + { + "epoch": 0.7735419986749512, + "grad_norm": 1.235268235206604, + "learning_rate": 2.5710099770724227e-05, + "loss": 1.1685, + "step": 21600 + }, + { + "epoch": 0.7735778108045195, + "grad_norm": 1.353617787361145, + "learning_rate": 2.5702335850393166e-05, + "loss": 0.9153, + "step": 21601 + }, + { + "epoch": 0.7736136229340878, + "grad_norm": 1.5843764543533325, + "learning_rate": 2.5694572929641326e-05, + "loss": 1.0895, + "step": 21602 + }, + { + "epoch": 0.773649435063656, + "grad_norm": 1.4155582189559937, + "learning_rate": 2.5686811008573142e-05, + "loss": 0.9175, + "step": 21603 + }, + { + "epoch": 0.7736852471932243, + "grad_norm": 1.343421459197998, + "learning_rate": 2.5679050087293067e-05, + "loss": 1.014, + "step": 21604 + }, + { + "epoch": 0.7737210593227927, + "grad_norm": 1.6599680185317993, + "learning_rate": 2.5671290165905537e-05, + "loss": 1.2928, + "step": 21605 + }, + { + "epoch": 0.7737568714523609, + "grad_norm": 1.4179277420043945, + "learning_rate": 2.5663531244514892e-05, + "loss": 1.0904, + "step": 21606 + }, + { + "epoch": 0.7737926835819292, + "grad_norm": 1.509030818939209, + "learning_rate": 2.5655773323225552e-05, + "loss": 1.1744, + "step": 21607 + }, + { + "epoch": 0.7738284957114975, + "grad_norm": 2.384384870529175, + "learning_rate": 2.564801640214187e-05, + "loss": 1.2186, + "step": 21608 + }, + { + "epoch": 0.7738643078410657, + "grad_norm": 1.7558019161224365, + "learning_rate": 2.564026048136826e-05, + "loss": 1.3675, + "step": 21609 + }, + { + "epoch": 0.773900119970634, + "grad_norm": 1.618336796760559, + "learning_rate": 2.5632505561009002e-05, + "loss": 1.294, + "step": 21610 + }, + { + "epoch": 0.7739359321002023, + "grad_norm": 1.6793028116226196, + "learning_rate": 2.5624751641168442e-05, + "loss": 1.0429, + "step": 21611 + }, + { + "epoch": 0.7739717442297707, + "grad_norm": 1.356588363647461, + "learning_rate": 2.5616998721950948e-05, + "loss": 1.0158, + "step": 21612 + }, + { + "epoch": 0.7740075563593389, + "grad_norm": 1.3228561878204346, + "learning_rate": 2.5609246803460764e-05, + "loss": 1.2061, + "step": 21613 + }, + { + "epoch": 0.7740433684889072, + "grad_norm": 1.3261692523956299, + "learning_rate": 2.5601495885802196e-05, + "loss": 1.1976, + "step": 21614 + }, + { + "epoch": 0.7740791806184755, + "grad_norm": 2.184025287628174, + "learning_rate": 2.559374596907954e-05, + "loss": 1.0289, + "step": 21615 + }, + { + "epoch": 0.7741149927480437, + "grad_norm": 1.3623679876327515, + "learning_rate": 2.5585997053397083e-05, + "loss": 1.3409, + "step": 21616 + }, + { + "epoch": 0.774150804877612, + "grad_norm": 1.1871076822280884, + "learning_rate": 2.5578249138859023e-05, + "loss": 1.1023, + "step": 21617 + }, + { + "epoch": 0.7741866170071803, + "grad_norm": 1.4205058813095093, + "learning_rate": 2.5570502225569625e-05, + "loss": 1.0595, + "step": 21618 + }, + { + "epoch": 0.7742224291367487, + "grad_norm": 1.9543081521987915, + "learning_rate": 2.556275631363314e-05, + "loss": 1.2263, + "step": 21619 + }, + { + "epoch": 0.7742582412663169, + "grad_norm": 1.512132167816162, + "learning_rate": 2.5555011403153715e-05, + "loss": 1.0076, + "step": 21620 + }, + { + "epoch": 0.7742940533958852, + "grad_norm": 1.4076359272003174, + "learning_rate": 2.5547267494235595e-05, + "loss": 0.8137, + "step": 21621 + }, + { + "epoch": 0.7743298655254535, + "grad_norm": 1.1587696075439453, + "learning_rate": 2.5539524586982944e-05, + "loss": 0.8172, + "step": 21622 + }, + { + "epoch": 0.7743656776550217, + "grad_norm": 1.3377759456634521, + "learning_rate": 2.553178268149997e-05, + "loss": 1.1036, + "step": 21623 + }, + { + "epoch": 0.77440148978459, + "grad_norm": 1.7573686838150024, + "learning_rate": 2.5524041777890783e-05, + "loss": 0.9532, + "step": 21624 + }, + { + "epoch": 0.7744373019141583, + "grad_norm": 1.4741662740707397, + "learning_rate": 2.5516301876259542e-05, + "loss": 1.0279, + "step": 21625 + }, + { + "epoch": 0.7744731140437267, + "grad_norm": 1.4337612390518188, + "learning_rate": 2.5508562976710416e-05, + "loss": 0.9106, + "step": 21626 + }, + { + "epoch": 0.7745089261732949, + "grad_norm": 1.4074451923370361, + "learning_rate": 2.5500825079347458e-05, + "loss": 0.8725, + "step": 21627 + }, + { + "epoch": 0.7745447383028632, + "grad_norm": 1.881911277770996, + "learning_rate": 2.5493088184274795e-05, + "loss": 1.1455, + "step": 21628 + }, + { + "epoch": 0.7745805504324315, + "grad_norm": 1.6949135065078735, + "learning_rate": 2.548535229159653e-05, + "loss": 0.9768, + "step": 21629 + }, + { + "epoch": 0.7746163625619997, + "grad_norm": 1.5490957498550415, + "learning_rate": 2.5477617401416765e-05, + "loss": 1.1228, + "step": 21630 + }, + { + "epoch": 0.774652174691568, + "grad_norm": 1.6421111822128296, + "learning_rate": 2.5469883513839498e-05, + "loss": 1.003, + "step": 21631 + }, + { + "epoch": 0.7746879868211363, + "grad_norm": 1.549246072769165, + "learning_rate": 2.5462150628968806e-05, + "loss": 1.223, + "step": 21632 + }, + { + "epoch": 0.7747237989507046, + "grad_norm": 1.359028697013855, + "learning_rate": 2.5454418746908737e-05, + "loss": 1.0369, + "step": 21633 + }, + { + "epoch": 0.7747596110802729, + "grad_norm": 1.7318564653396606, + "learning_rate": 2.544668786776333e-05, + "loss": 1.0989, + "step": 21634 + }, + { + "epoch": 0.7747954232098412, + "grad_norm": 1.6294105052947998, + "learning_rate": 2.5438957991636546e-05, + "loss": 1.0334, + "step": 21635 + }, + { + "epoch": 0.7748312353394095, + "grad_norm": 1.2415817975997925, + "learning_rate": 2.5431229118632406e-05, + "loss": 1.0264, + "step": 21636 + }, + { + "epoch": 0.7748670474689777, + "grad_norm": 1.264236330986023, + "learning_rate": 2.542350124885492e-05, + "loss": 0.9485, + "step": 21637 + }, + { + "epoch": 0.774902859598546, + "grad_norm": 1.5095444917678833, + "learning_rate": 2.5415774382407997e-05, + "loss": 1.1151, + "step": 21638 + }, + { + "epoch": 0.7749386717281143, + "grad_norm": 1.5789297819137573, + "learning_rate": 2.5408048519395622e-05, + "loss": 1.1174, + "step": 21639 + }, + { + "epoch": 0.7749744838576826, + "grad_norm": 1.4950200319290161, + "learning_rate": 2.5400323659921744e-05, + "loss": 1.0737, + "step": 21640 + }, + { + "epoch": 0.7750102959872509, + "grad_norm": 1.624776840209961, + "learning_rate": 2.539259980409031e-05, + "loss": 1.2726, + "step": 21641 + }, + { + "epoch": 0.7750461081168192, + "grad_norm": 2.082396984100342, + "learning_rate": 2.5384876952005177e-05, + "loss": 1.1884, + "step": 21642 + }, + { + "epoch": 0.7750819202463874, + "grad_norm": 1.4470109939575195, + "learning_rate": 2.537715510377028e-05, + "loss": 1.2737, + "step": 21643 + }, + { + "epoch": 0.7751177323759557, + "grad_norm": 1.354802131652832, + "learning_rate": 2.5369434259489534e-05, + "loss": 1.0688, + "step": 21644 + }, + { + "epoch": 0.775153544505524, + "grad_norm": 1.3264784812927246, + "learning_rate": 2.5361714419266757e-05, + "loss": 1.1022, + "step": 21645 + }, + { + "epoch": 0.7751893566350923, + "grad_norm": 1.3573517799377441, + "learning_rate": 2.5353995583205824e-05, + "loss": 1.101, + "step": 21646 + }, + { + "epoch": 0.7752251687646606, + "grad_norm": 1.439370036125183, + "learning_rate": 2.5346277751410607e-05, + "loss": 0.9846, + "step": 21647 + }, + { + "epoch": 0.7752609808942289, + "grad_norm": 1.605819821357727, + "learning_rate": 2.5338560923984954e-05, + "loss": 1.3129, + "step": 21648 + }, + { + "epoch": 0.7752967930237972, + "grad_norm": 1.445857048034668, + "learning_rate": 2.533084510103263e-05, + "loss": 0.9896, + "step": 21649 + }, + { + "epoch": 0.7753326051533654, + "grad_norm": 1.318897008895874, + "learning_rate": 2.532313028265746e-05, + "loss": 1.142, + "step": 21650 + }, + { + "epoch": 0.7753684172829337, + "grad_norm": 1.5345039367675781, + "learning_rate": 2.531541646896325e-05, + "loss": 0.9771, + "step": 21651 + }, + { + "epoch": 0.775404229412502, + "grad_norm": 1.2179757356643677, + "learning_rate": 2.5307703660053805e-05, + "loss": 0.8172, + "step": 21652 + }, + { + "epoch": 0.7754400415420702, + "grad_norm": 1.5870368480682373, + "learning_rate": 2.5299991856032835e-05, + "loss": 0.8781, + "step": 21653 + }, + { + "epoch": 0.7754758536716386, + "grad_norm": 1.4032106399536133, + "learning_rate": 2.5292281057004108e-05, + "loss": 1.2054, + "step": 21654 + }, + { + "epoch": 0.7755116658012069, + "grad_norm": 1.5988435745239258, + "learning_rate": 2.528457126307141e-05, + "loss": 1.1617, + "step": 21655 + }, + { + "epoch": 0.7755474779307752, + "grad_norm": 1.6276743412017822, + "learning_rate": 2.5276862474338404e-05, + "loss": 1.1046, + "step": 21656 + }, + { + "epoch": 0.7755832900603434, + "grad_norm": 1.9638512134552002, + "learning_rate": 2.5269154690908827e-05, + "loss": 1.1093, + "step": 21657 + }, + { + "epoch": 0.7756191021899117, + "grad_norm": 1.344660997390747, + "learning_rate": 2.526144791288637e-05, + "loss": 0.9274, + "step": 21658 + }, + { + "epoch": 0.77565491431948, + "grad_norm": 1.4264498949050903, + "learning_rate": 2.525374214037476e-05, + "loss": 1.1053, + "step": 21659 + }, + { + "epoch": 0.7756907264490482, + "grad_norm": 1.7543433904647827, + "learning_rate": 2.5246037373477606e-05, + "loss": 1.1122, + "step": 21660 + }, + { + "epoch": 0.7757265385786166, + "grad_norm": 1.7767260074615479, + "learning_rate": 2.523833361229859e-05, + "loss": 1.2731, + "step": 21661 + }, + { + "epoch": 0.7757623507081849, + "grad_norm": 1.4601054191589355, + "learning_rate": 2.5230630856941394e-05, + "loss": 0.9945, + "step": 21662 + }, + { + "epoch": 0.7757981628377532, + "grad_norm": 1.3179447650909424, + "learning_rate": 2.5222929107509584e-05, + "loss": 1.1461, + "step": 21663 + }, + { + "epoch": 0.7758339749673214, + "grad_norm": 1.733141541481018, + "learning_rate": 2.5215228364106835e-05, + "loss": 1.075, + "step": 21664 + }, + { + "epoch": 0.7758697870968897, + "grad_norm": 1.5411032438278198, + "learning_rate": 2.5207528626836662e-05, + "loss": 1.2307, + "step": 21665 + }, + { + "epoch": 0.775905599226458, + "grad_norm": 1.6667360067367554, + "learning_rate": 2.5199829895802775e-05, + "loss": 1.1035, + "step": 21666 + }, + { + "epoch": 0.7759414113560262, + "grad_norm": 1.7424989938735962, + "learning_rate": 2.519213217110866e-05, + "loss": 1.1903, + "step": 21667 + }, + { + "epoch": 0.7759772234855946, + "grad_norm": 1.9032039642333984, + "learning_rate": 2.5184435452857913e-05, + "loss": 1.1981, + "step": 21668 + }, + { + "epoch": 0.7760130356151629, + "grad_norm": 1.153782844543457, + "learning_rate": 2.517673974115409e-05, + "loss": 0.8876, + "step": 21669 + }, + { + "epoch": 0.7760488477447312, + "grad_norm": 1.3342920541763306, + "learning_rate": 2.5169045036100736e-05, + "loss": 1.1425, + "step": 21670 + }, + { + "epoch": 0.7760846598742994, + "grad_norm": 1.3093945980072021, + "learning_rate": 2.5161351337801363e-05, + "loss": 1.1056, + "step": 21671 + }, + { + "epoch": 0.7761204720038677, + "grad_norm": 1.3992677927017212, + "learning_rate": 2.5153658646359412e-05, + "loss": 0.964, + "step": 21672 + }, + { + "epoch": 0.776156284133436, + "grad_norm": 1.064302682876587, + "learning_rate": 2.51459669618785e-05, + "loss": 0.9464, + "step": 21673 + }, + { + "epoch": 0.7761920962630042, + "grad_norm": 1.5623129606246948, + "learning_rate": 2.5138276284462016e-05, + "loss": 0.869, + "step": 21674 + }, + { + "epoch": 0.7762279083925726, + "grad_norm": 1.3711532354354858, + "learning_rate": 2.513058661421349e-05, + "loss": 1.1035, + "step": 21675 + }, + { + "epoch": 0.7762637205221409, + "grad_norm": 1.4062919616699219, + "learning_rate": 2.512289795123629e-05, + "loss": 1.0057, + "step": 21676 + }, + { + "epoch": 0.7762995326517091, + "grad_norm": 1.7994794845581055, + "learning_rate": 2.5115210295633974e-05, + "loss": 0.8497, + "step": 21677 + }, + { + "epoch": 0.7763353447812774, + "grad_norm": 2.006800651550293, + "learning_rate": 2.5107523647509877e-05, + "loss": 1.2086, + "step": 21678 + }, + { + "epoch": 0.7763711569108457, + "grad_norm": 1.492414951324463, + "learning_rate": 2.5099838006967446e-05, + "loss": 0.9931, + "step": 21679 + }, + { + "epoch": 0.776406969040414, + "grad_norm": 1.5997062921524048, + "learning_rate": 2.5092153374110107e-05, + "loss": 1.0323, + "step": 21680 + }, + { + "epoch": 0.7764427811699822, + "grad_norm": 1.4327188730239868, + "learning_rate": 2.5084469749041185e-05, + "loss": 1.1143, + "step": 21681 + }, + { + "epoch": 0.7764785932995506, + "grad_norm": 1.3770179748535156, + "learning_rate": 2.5076787131864132e-05, + "loss": 1.1125, + "step": 21682 + }, + { + "epoch": 0.7765144054291189, + "grad_norm": 1.8856011629104614, + "learning_rate": 2.506910552268219e-05, + "loss": 0.8667, + "step": 21683 + }, + { + "epoch": 0.7765502175586871, + "grad_norm": 1.4429712295532227, + "learning_rate": 2.5061424921598853e-05, + "loss": 1.0783, + "step": 21684 + }, + { + "epoch": 0.7765860296882554, + "grad_norm": 1.3047304153442383, + "learning_rate": 2.5053745328717336e-05, + "loss": 1.0318, + "step": 21685 + }, + { + "epoch": 0.7766218418178237, + "grad_norm": 1.4441391229629517, + "learning_rate": 2.504606674414104e-05, + "loss": 1.0506, + "step": 21686 + }, + { + "epoch": 0.776657653947392, + "grad_norm": 1.4430029392242432, + "learning_rate": 2.5038389167973177e-05, + "loss": 1.1369, + "step": 21687 + }, + { + "epoch": 0.7766934660769602, + "grad_norm": 1.8220621347427368, + "learning_rate": 2.5030712600317143e-05, + "loss": 1.2859, + "step": 21688 + }, + { + "epoch": 0.7767292782065286, + "grad_norm": 1.2611302137374878, + "learning_rate": 2.5023037041276175e-05, + "loss": 1.1162, + "step": 21689 + }, + { + "epoch": 0.7767650903360969, + "grad_norm": 1.4445635080337524, + "learning_rate": 2.5015362490953497e-05, + "loss": 1.0161, + "step": 21690 + }, + { + "epoch": 0.7768009024656651, + "grad_norm": 1.6241004467010498, + "learning_rate": 2.5007688949452402e-05, + "loss": 0.9498, + "step": 21691 + }, + { + "epoch": 0.7768367145952334, + "grad_norm": 1.5715562105178833, + "learning_rate": 2.5000016416876103e-05, + "loss": 1.0916, + "step": 21692 + }, + { + "epoch": 0.7768725267248017, + "grad_norm": 1.554871916770935, + "learning_rate": 2.499234489332788e-05, + "loss": 1.1663, + "step": 21693 + }, + { + "epoch": 0.7769083388543699, + "grad_norm": 1.6246631145477295, + "learning_rate": 2.4984674378910845e-05, + "loss": 0.9875, + "step": 21694 + }, + { + "epoch": 0.7769441509839382, + "grad_norm": 2.0861990451812744, + "learning_rate": 2.4977004873728315e-05, + "loss": 0.9665, + "step": 21695 + }, + { + "epoch": 0.7769799631135066, + "grad_norm": 1.8799843788146973, + "learning_rate": 2.496933637788338e-05, + "loss": 1.0863, + "step": 21696 + }, + { + "epoch": 0.7770157752430749, + "grad_norm": 1.761149525642395, + "learning_rate": 2.496166889147926e-05, + "loss": 1.0786, + "step": 21697 + }, + { + "epoch": 0.7770515873726431, + "grad_norm": 1.5216788053512573, + "learning_rate": 2.495400241461907e-05, + "loss": 1.2075, + "step": 21698 + }, + { + "epoch": 0.7770873995022114, + "grad_norm": 1.3009837865829468, + "learning_rate": 2.494633694740598e-05, + "loss": 1.0487, + "step": 21699 + }, + { + "epoch": 0.7771232116317797, + "grad_norm": 1.486910104751587, + "learning_rate": 2.4938672489943138e-05, + "loss": 1.0124, + "step": 21700 + }, + { + "epoch": 0.7771590237613479, + "grad_norm": 1.2356934547424316, + "learning_rate": 2.493100904233361e-05, + "loss": 1.0201, + "step": 21701 + }, + { + "epoch": 0.7771948358909162, + "grad_norm": 1.5201665163040161, + "learning_rate": 2.4923346604680532e-05, + "loss": 0.9143, + "step": 21702 + }, + { + "epoch": 0.7772306480204846, + "grad_norm": 1.4985262155532837, + "learning_rate": 2.4915685177086967e-05, + "loss": 1.0756, + "step": 21703 + }, + { + "epoch": 0.7772664601500529, + "grad_norm": 1.557102084159851, + "learning_rate": 2.4908024759656046e-05, + "loss": 1.1588, + "step": 21704 + }, + { + "epoch": 0.7773022722796211, + "grad_norm": 1.418373703956604, + "learning_rate": 2.490036535249073e-05, + "loss": 1.0274, + "step": 21705 + }, + { + "epoch": 0.7773380844091894, + "grad_norm": 1.4912570714950562, + "learning_rate": 2.489270695569418e-05, + "loss": 1.1028, + "step": 21706 + }, + { + "epoch": 0.7773738965387577, + "grad_norm": 1.484142541885376, + "learning_rate": 2.4885049569369378e-05, + "loss": 1.0662, + "step": 21707 + }, + { + "epoch": 0.7774097086683259, + "grad_norm": 1.3730111122131348, + "learning_rate": 2.4877393193619315e-05, + "loss": 0.9916, + "step": 21708 + }, + { + "epoch": 0.7774455207978942, + "grad_norm": 1.2239267826080322, + "learning_rate": 2.4869737828547024e-05, + "loss": 1.024, + "step": 21709 + }, + { + "epoch": 0.7774813329274626, + "grad_norm": 1.5557535886764526, + "learning_rate": 2.4862083474255503e-05, + "loss": 1.1227, + "step": 21710 + }, + { + "epoch": 0.7775171450570308, + "grad_norm": 2.1143288612365723, + "learning_rate": 2.485443013084775e-05, + "loss": 1.2887, + "step": 21711 + }, + { + "epoch": 0.7775529571865991, + "grad_norm": 1.5048853158950806, + "learning_rate": 2.484677779842669e-05, + "loss": 1.1126, + "step": 21712 + }, + { + "epoch": 0.7775887693161674, + "grad_norm": 1.5329804420471191, + "learning_rate": 2.4839126477095287e-05, + "loss": 1.0131, + "step": 21713 + }, + { + "epoch": 0.7776245814457357, + "grad_norm": 2.0468218326568604, + "learning_rate": 2.4831476166956515e-05, + "loss": 1.1924, + "step": 21714 + }, + { + "epoch": 0.7776603935753039, + "grad_norm": 1.4946954250335693, + "learning_rate": 2.482382686811324e-05, + "loss": 1.1226, + "step": 21715 + }, + { + "epoch": 0.7776962057048722, + "grad_norm": 1.5745104551315308, + "learning_rate": 2.4816178580668415e-05, + "loss": 1.0944, + "step": 21716 + }, + { + "epoch": 0.7777320178344406, + "grad_norm": 1.4914745092391968, + "learning_rate": 2.4808531304724913e-05, + "loss": 1.0952, + "step": 21717 + }, + { + "epoch": 0.7777678299640088, + "grad_norm": 1.1811878681182861, + "learning_rate": 2.4800885040385668e-05, + "loss": 1.1233, + "step": 21718 + }, + { + "epoch": 0.7778036420935771, + "grad_norm": 1.8498728275299072, + "learning_rate": 2.4793239787753487e-05, + "loss": 1.0632, + "step": 21719 + }, + { + "epoch": 0.7778394542231454, + "grad_norm": 1.5991641283035278, + "learning_rate": 2.478559554693125e-05, + "loss": 1.0836, + "step": 21720 + }, + { + "epoch": 0.7778752663527136, + "grad_norm": 1.3711175918579102, + "learning_rate": 2.4777952318021814e-05, + "loss": 1.189, + "step": 21721 + }, + { + "epoch": 0.7779110784822819, + "grad_norm": 1.7485582828521729, + "learning_rate": 2.4770310101128026e-05, + "loss": 1.0548, + "step": 21722 + }, + { + "epoch": 0.7779468906118502, + "grad_norm": 1.3140603303909302, + "learning_rate": 2.476266889635265e-05, + "loss": 1.2099, + "step": 21723 + }, + { + "epoch": 0.7779827027414186, + "grad_norm": 1.6377465724945068, + "learning_rate": 2.475502870379851e-05, + "loss": 0.8756, + "step": 21724 + }, + { + "epoch": 0.7780185148709868, + "grad_norm": 1.6380665302276611, + "learning_rate": 2.474738952356842e-05, + "loss": 0.9881, + "step": 21725 + }, + { + "epoch": 0.7780543270005551, + "grad_norm": 1.5266995429992676, + "learning_rate": 2.4739751355765116e-05, + "loss": 1.1983, + "step": 21726 + }, + { + "epoch": 0.7780901391301234, + "grad_norm": 1.2909646034240723, + "learning_rate": 2.4732114200491386e-05, + "loss": 1.0629, + "step": 21727 + }, + { + "epoch": 0.7781259512596916, + "grad_norm": 1.4563636779785156, + "learning_rate": 2.4724478057849965e-05, + "loss": 1.0554, + "step": 21728 + }, + { + "epoch": 0.7781617633892599, + "grad_norm": 1.6233845949172974, + "learning_rate": 2.4716842927943617e-05, + "loss": 1.1579, + "step": 21729 + }, + { + "epoch": 0.7781975755188282, + "grad_norm": 1.526767373085022, + "learning_rate": 2.4709208810875017e-05, + "loss": 1.1977, + "step": 21730 + }, + { + "epoch": 0.7782333876483966, + "grad_norm": 1.3619192838668823, + "learning_rate": 2.4701575706746882e-05, + "loss": 0.9255, + "step": 21731 + }, + { + "epoch": 0.7782691997779648, + "grad_norm": 1.2837955951690674, + "learning_rate": 2.4693943615661963e-05, + "loss": 1.015, + "step": 21732 + }, + { + "epoch": 0.7783050119075331, + "grad_norm": 1.5845977067947388, + "learning_rate": 2.4686312537722855e-05, + "loss": 1.1461, + "step": 21733 + }, + { + "epoch": 0.7783408240371014, + "grad_norm": 1.875542402267456, + "learning_rate": 2.4678682473032267e-05, + "loss": 1.1302, + "step": 21734 + }, + { + "epoch": 0.7783766361666696, + "grad_norm": 1.5604491233825684, + "learning_rate": 2.4671053421692845e-05, + "loss": 1.086, + "step": 21735 + }, + { + "epoch": 0.7784124482962379, + "grad_norm": 1.6037495136260986, + "learning_rate": 2.466342538380727e-05, + "loss": 1.2082, + "step": 21736 + }, + { + "epoch": 0.7784482604258062, + "grad_norm": 1.6886779069900513, + "learning_rate": 2.46557983594781e-05, + "loss": 1.0476, + "step": 21737 + }, + { + "epoch": 0.7784840725553746, + "grad_norm": 1.5646326541900635, + "learning_rate": 2.4648172348807963e-05, + "loss": 1.1194, + "step": 21738 + }, + { + "epoch": 0.7785198846849428, + "grad_norm": 1.7127691507339478, + "learning_rate": 2.464054735189948e-05, + "loss": 1.0049, + "step": 21739 + }, + { + "epoch": 0.7785556968145111, + "grad_norm": 1.5049152374267578, + "learning_rate": 2.4632923368855254e-05, + "loss": 0.9813, + "step": 21740 + }, + { + "epoch": 0.7785915089440794, + "grad_norm": 1.587254285812378, + "learning_rate": 2.4625300399777806e-05, + "loss": 1.1154, + "step": 21741 + }, + { + "epoch": 0.7786273210736476, + "grad_norm": 1.2642561197280884, + "learning_rate": 2.46176784447697e-05, + "loss": 0.9309, + "step": 21742 + }, + { + "epoch": 0.7786631332032159, + "grad_norm": 1.7001707553863525, + "learning_rate": 2.4610057503933537e-05, + "loss": 1.236, + "step": 21743 + }, + { + "epoch": 0.7786989453327842, + "grad_norm": 1.4724650382995605, + "learning_rate": 2.4602437577371763e-05, + "loss": 0.9264, + "step": 21744 + }, + { + "epoch": 0.7787347574623525, + "grad_norm": 1.3202159404754639, + "learning_rate": 2.4594818665186937e-05, + "loss": 1.0396, + "step": 21745 + }, + { + "epoch": 0.7787705695919208, + "grad_norm": 1.5246800184249878, + "learning_rate": 2.4587200767481565e-05, + "loss": 0.8781, + "step": 21746 + }, + { + "epoch": 0.7788063817214891, + "grad_norm": 1.825522541999817, + "learning_rate": 2.457958388435816e-05, + "loss": 1.1544, + "step": 21747 + }, + { + "epoch": 0.7788421938510574, + "grad_norm": 1.3541470766067505, + "learning_rate": 2.4571968015919144e-05, + "loss": 0.9697, + "step": 21748 + }, + { + "epoch": 0.7788780059806256, + "grad_norm": 1.3701637983322144, + "learning_rate": 2.4564353162266996e-05, + "loss": 0.8003, + "step": 21749 + }, + { + "epoch": 0.7789138181101939, + "grad_norm": 1.4585357904434204, + "learning_rate": 2.4556739323504195e-05, + "loss": 1.0598, + "step": 21750 + }, + { + "epoch": 0.7789496302397622, + "grad_norm": 2.0669915676116943, + "learning_rate": 2.454912649973313e-05, + "loss": 1.2407, + "step": 21751 + }, + { + "epoch": 0.7789854423693305, + "grad_norm": 1.507920265197754, + "learning_rate": 2.4541514691056245e-05, + "loss": 1.0138, + "step": 21752 + }, + { + "epoch": 0.7790212544988988, + "grad_norm": 1.7459025382995605, + "learning_rate": 2.453390389757595e-05, + "loss": 1.1624, + "step": 21753 + }, + { + "epoch": 0.7790570666284671, + "grad_norm": 1.4769073724746704, + "learning_rate": 2.4526294119394653e-05, + "loss": 1.1354, + "step": 21754 + }, + { + "epoch": 0.7790928787580353, + "grad_norm": 1.3403064012527466, + "learning_rate": 2.451868535661469e-05, + "loss": 1.0279, + "step": 21755 + }, + { + "epoch": 0.7791286908876036, + "grad_norm": 1.3771485090255737, + "learning_rate": 2.451107760933845e-05, + "loss": 1.1073, + "step": 21756 + }, + { + "epoch": 0.7791645030171719, + "grad_norm": 1.574975848197937, + "learning_rate": 2.4503470877668287e-05, + "loss": 1.1573, + "step": 21757 + }, + { + "epoch": 0.7792003151467402, + "grad_norm": 1.533064603805542, + "learning_rate": 2.4495865161706567e-05, + "loss": 1.0059, + "step": 21758 + }, + { + "epoch": 0.7792361272763085, + "grad_norm": 1.7061454057693481, + "learning_rate": 2.448826046155559e-05, + "loss": 1.0793, + "step": 21759 + }, + { + "epoch": 0.7792719394058768, + "grad_norm": 1.419399380683899, + "learning_rate": 2.4480656777317613e-05, + "loss": 1.0513, + "step": 21760 + }, + { + "epoch": 0.7793077515354451, + "grad_norm": 1.5041015148162842, + "learning_rate": 2.447305410909504e-05, + "loss": 0.994, + "step": 21761 + }, + { + "epoch": 0.7793435636650133, + "grad_norm": 1.6147632598876953, + "learning_rate": 2.4465452456990067e-05, + "loss": 1.0661, + "step": 21762 + }, + { + "epoch": 0.7793793757945816, + "grad_norm": 1.1595205068588257, + "learning_rate": 2.4457851821105006e-05, + "loss": 1.2017, + "step": 21763 + }, + { + "epoch": 0.7794151879241499, + "grad_norm": 1.5124701261520386, + "learning_rate": 2.4450252201542102e-05, + "loss": 0.9593, + "step": 21764 + }, + { + "epoch": 0.7794510000537181, + "grad_norm": 1.2960015535354614, + "learning_rate": 2.444265359840363e-05, + "loss": 0.965, + "step": 21765 + }, + { + "epoch": 0.7794868121832865, + "grad_norm": 1.5595800876617432, + "learning_rate": 2.4435056011791768e-05, + "loss": 1.0892, + "step": 21766 + }, + { + "epoch": 0.7795226243128548, + "grad_norm": 1.410023808479309, + "learning_rate": 2.4427459441808754e-05, + "loss": 1.2651, + "step": 21767 + }, + { + "epoch": 0.7795584364424231, + "grad_norm": 1.355248212814331, + "learning_rate": 2.4419863888556815e-05, + "loss": 1.0242, + "step": 21768 + }, + { + "epoch": 0.7795942485719913, + "grad_norm": 1.4233533143997192, + "learning_rate": 2.4412269352138097e-05, + "loss": 1.1391, + "step": 21769 + }, + { + "epoch": 0.7796300607015596, + "grad_norm": 1.6790399551391602, + "learning_rate": 2.4404675832654812e-05, + "loss": 1.1115, + "step": 21770 + }, + { + "epoch": 0.7796658728311279, + "grad_norm": 1.365712285041809, + "learning_rate": 2.4397083330209046e-05, + "loss": 0.8341, + "step": 21771 + }, + { + "epoch": 0.7797016849606961, + "grad_norm": 1.3997204303741455, + "learning_rate": 2.438949184490307e-05, + "loss": 1.0752, + "step": 21772 + }, + { + "epoch": 0.7797374970902645, + "grad_norm": 1.5732009410858154, + "learning_rate": 2.438190137683891e-05, + "loss": 1.0987, + "step": 21773 + }, + { + "epoch": 0.7797733092198328, + "grad_norm": 1.5958174467086792, + "learning_rate": 2.4374311926118765e-05, + "loss": 1.0206, + "step": 21774 + }, + { + "epoch": 0.7798091213494011, + "grad_norm": 1.346454381942749, + "learning_rate": 2.4366723492844644e-05, + "loss": 1.179, + "step": 21775 + }, + { + "epoch": 0.7798449334789693, + "grad_norm": 1.2653675079345703, + "learning_rate": 2.435913607711876e-05, + "loss": 1.0798, + "step": 21776 + }, + { + "epoch": 0.7798807456085376, + "grad_norm": 1.5291792154312134, + "learning_rate": 2.4351549679043118e-05, + "loss": 1.3257, + "step": 21777 + }, + { + "epoch": 0.7799165577381059, + "grad_norm": 1.6730120182037354, + "learning_rate": 2.4343964298719746e-05, + "loss": 1.0611, + "step": 21778 + }, + { + "epoch": 0.7799523698676741, + "grad_norm": 1.423319697380066, + "learning_rate": 2.4336379936250808e-05, + "loss": 1.0774, + "step": 21779 + }, + { + "epoch": 0.7799881819972425, + "grad_norm": 1.6614872217178345, + "learning_rate": 2.4328796591738236e-05, + "loss": 1.1414, + "step": 21780 + }, + { + "epoch": 0.7800239941268108, + "grad_norm": 1.7135745286941528, + "learning_rate": 2.432121426528414e-05, + "loss": 0.9809, + "step": 21781 + }, + { + "epoch": 0.780059806256379, + "grad_norm": 1.4470274448394775, + "learning_rate": 2.431363295699042e-05, + "loss": 1.1998, + "step": 21782 + }, + { + "epoch": 0.7800956183859473, + "grad_norm": 1.6627440452575684, + "learning_rate": 2.43060526669592e-05, + "loss": 0.9531, + "step": 21783 + }, + { + "epoch": 0.7801314305155156, + "grad_norm": 1.5098000764846802, + "learning_rate": 2.4298473395292378e-05, + "loss": 1.0595, + "step": 21784 + }, + { + "epoch": 0.7801672426450839, + "grad_norm": 1.7274601459503174, + "learning_rate": 2.4290895142091974e-05, + "loss": 1.0617, + "step": 21785 + }, + { + "epoch": 0.7802030547746521, + "grad_norm": 1.5713496208190918, + "learning_rate": 2.428331790745989e-05, + "loss": 1.2425, + "step": 21786 + }, + { + "epoch": 0.7802388669042205, + "grad_norm": 1.4461625814437866, + "learning_rate": 2.42757416914981e-05, + "loss": 1.2736, + "step": 21787 + }, + { + "epoch": 0.7802746790337888, + "grad_norm": 1.2242257595062256, + "learning_rate": 2.4268166494308553e-05, + "loss": 1.0607, + "step": 21788 + }, + { + "epoch": 0.780310491163357, + "grad_norm": 1.3360241651535034, + "learning_rate": 2.426059231599308e-05, + "loss": 0.9834, + "step": 21789 + }, + { + "epoch": 0.7803463032929253, + "grad_norm": 1.5604928731918335, + "learning_rate": 2.42530191566537e-05, + "loss": 0.9883, + "step": 21790 + }, + { + "epoch": 0.7803821154224936, + "grad_norm": 1.475325345993042, + "learning_rate": 2.4245447016392207e-05, + "loss": 1.073, + "step": 21791 + }, + { + "epoch": 0.7804179275520619, + "grad_norm": 1.5174161195755005, + "learning_rate": 2.4237875895310548e-05, + "loss": 1.0976, + "step": 21792 + }, + { + "epoch": 0.7804537396816301, + "grad_norm": 1.5508201122283936, + "learning_rate": 2.4230305793510478e-05, + "loss": 1.1581, + "step": 21793 + }, + { + "epoch": 0.7804895518111985, + "grad_norm": 1.1914924383163452, + "learning_rate": 2.4222736711093964e-05, + "loss": 1.1338, + "step": 21794 + }, + { + "epoch": 0.7805253639407668, + "grad_norm": 1.6498891115188599, + "learning_rate": 2.4215168648162778e-05, + "loss": 1.0485, + "step": 21795 + }, + { + "epoch": 0.780561176070335, + "grad_norm": 1.4517302513122559, + "learning_rate": 2.420760160481872e-05, + "loss": 1.1379, + "step": 21796 + }, + { + "epoch": 0.7805969881999033, + "grad_norm": 1.2505124807357788, + "learning_rate": 2.4200035581163614e-05, + "loss": 1.1182, + "step": 21797 + }, + { + "epoch": 0.7806328003294716, + "grad_norm": 1.6318962574005127, + "learning_rate": 2.4192470577299263e-05, + "loss": 0.9591, + "step": 21798 + }, + { + "epoch": 0.7806686124590398, + "grad_norm": 1.302316427230835, + "learning_rate": 2.418490659332746e-05, + "loss": 0.9582, + "step": 21799 + }, + { + "epoch": 0.7807044245886081, + "grad_norm": 1.3340909481048584, + "learning_rate": 2.4177343629349912e-05, + "loss": 1.0526, + "step": 21800 + }, + { + "epoch": 0.7807402367181765, + "grad_norm": 1.4409111738204956, + "learning_rate": 2.4169781685468407e-05, + "loss": 0.9732, + "step": 21801 + }, + { + "epoch": 0.7807760488477448, + "grad_norm": 1.679172158241272, + "learning_rate": 2.416222076178467e-05, + "loss": 0.9606, + "step": 21802 + }, + { + "epoch": 0.780811860977313, + "grad_norm": 1.5449628829956055, + "learning_rate": 2.4154660858400456e-05, + "loss": 1.1819, + "step": 21803 + }, + { + "epoch": 0.7808476731068813, + "grad_norm": 1.5824702978134155, + "learning_rate": 2.414710197541743e-05, + "loss": 1.211, + "step": 21804 + }, + { + "epoch": 0.7808834852364496, + "grad_norm": 1.7203105688095093, + "learning_rate": 2.4139544112937283e-05, + "loss": 1.1803, + "step": 21805 + }, + { + "epoch": 0.7809192973660178, + "grad_norm": 1.3823614120483398, + "learning_rate": 2.413198727106176e-05, + "loss": 1.2038, + "step": 21806 + }, + { + "epoch": 0.7809551094955861, + "grad_norm": 1.6776750087738037, + "learning_rate": 2.412443144989246e-05, + "loss": 0.9941, + "step": 21807 + }, + { + "epoch": 0.7809909216251545, + "grad_norm": 2.3716506958007812, + "learning_rate": 2.411687664953106e-05, + "loss": 1.2108, + "step": 21808 + }, + { + "epoch": 0.7810267337547228, + "grad_norm": 1.3145028352737427, + "learning_rate": 2.41093228700792e-05, + "loss": 1.1623, + "step": 21809 + }, + { + "epoch": 0.781062545884291, + "grad_norm": 1.4257822036743164, + "learning_rate": 2.4101770111638534e-05, + "loss": 0.9604, + "step": 21810 + }, + { + "epoch": 0.7810983580138593, + "grad_norm": 1.3440275192260742, + "learning_rate": 2.409421837431063e-05, + "loss": 1.0703, + "step": 21811 + }, + { + "epoch": 0.7811341701434276, + "grad_norm": 1.4030808210372925, + "learning_rate": 2.4086667658197093e-05, + "loss": 0.9895, + "step": 21812 + }, + { + "epoch": 0.7811699822729958, + "grad_norm": 1.2682896852493286, + "learning_rate": 2.4079117963399554e-05, + "loss": 1.129, + "step": 21813 + }, + { + "epoch": 0.7812057944025641, + "grad_norm": 1.6954350471496582, + "learning_rate": 2.4071569290019535e-05, + "loss": 1.0703, + "step": 21814 + }, + { + "epoch": 0.7812416065321325, + "grad_norm": 1.5488996505737305, + "learning_rate": 2.4064021638158596e-05, + "loss": 1.3609, + "step": 21815 + }, + { + "epoch": 0.7812774186617008, + "grad_norm": 1.3443392515182495, + "learning_rate": 2.40564750079183e-05, + "loss": 1.0479, + "step": 21816 + }, + { + "epoch": 0.781313230791269, + "grad_norm": 1.7966151237487793, + "learning_rate": 2.404892939940021e-05, + "loss": 1.2643, + "step": 21817 + }, + { + "epoch": 0.7813490429208373, + "grad_norm": 1.2975623607635498, + "learning_rate": 2.404138481270577e-05, + "loss": 1.111, + "step": 21818 + }, + { + "epoch": 0.7813848550504056, + "grad_norm": 1.752102255821228, + "learning_rate": 2.4033841247936517e-05, + "loss": 1.2065, + "step": 21819 + }, + { + "epoch": 0.7814206671799738, + "grad_norm": 1.6601862907409668, + "learning_rate": 2.4026298705193972e-05, + "loss": 1.1668, + "step": 21820 + }, + { + "epoch": 0.7814564793095421, + "grad_norm": 1.4020661115646362, + "learning_rate": 2.4018757184579545e-05, + "loss": 0.9436, + "step": 21821 + }, + { + "epoch": 0.7814922914391105, + "grad_norm": 1.5659871101379395, + "learning_rate": 2.401121668619474e-05, + "loss": 1.2333, + "step": 21822 + }, + { + "epoch": 0.7815281035686787, + "grad_norm": 1.4856044054031372, + "learning_rate": 2.4003677210140986e-05, + "loss": 0.9767, + "step": 21823 + }, + { + "epoch": 0.781563915698247, + "grad_norm": 1.3543016910552979, + "learning_rate": 2.3996138756519758e-05, + "loss": 1.0326, + "step": 21824 + }, + { + "epoch": 0.7815997278278153, + "grad_norm": 1.4586730003356934, + "learning_rate": 2.3988601325432415e-05, + "loss": 1.2254, + "step": 21825 + }, + { + "epoch": 0.7816355399573836, + "grad_norm": 1.6443791389465332, + "learning_rate": 2.39810649169804e-05, + "loss": 1.1175, + "step": 21826 + }, + { + "epoch": 0.7816713520869518, + "grad_norm": 1.2273857593536377, + "learning_rate": 2.3973529531265095e-05, + "loss": 0.8724, + "step": 21827 + }, + { + "epoch": 0.7817071642165201, + "grad_norm": 1.3047760725021362, + "learning_rate": 2.396599516838791e-05, + "loss": 0.9393, + "step": 21828 + }, + { + "epoch": 0.7817429763460885, + "grad_norm": 1.324086308479309, + "learning_rate": 2.3958461828450164e-05, + "loss": 1.0057, + "step": 21829 + }, + { + "epoch": 0.7817787884756567, + "grad_norm": 1.5149750709533691, + "learning_rate": 2.3950929511553223e-05, + "loss": 1.1269, + "step": 21830 + }, + { + "epoch": 0.781814600605225, + "grad_norm": 1.9913427829742432, + "learning_rate": 2.3943398217798452e-05, + "loss": 1.0628, + "step": 21831 + }, + { + "epoch": 0.7818504127347933, + "grad_norm": 1.3717163801193237, + "learning_rate": 2.393586794728713e-05, + "loss": 0.9391, + "step": 21832 + }, + { + "epoch": 0.7818862248643615, + "grad_norm": 1.6160105466842651, + "learning_rate": 2.3928338700120578e-05, + "loss": 1.0627, + "step": 21833 + }, + { + "epoch": 0.7819220369939298, + "grad_norm": 1.3037723302841187, + "learning_rate": 2.3920810476400112e-05, + "loss": 1.1575, + "step": 21834 + }, + { + "epoch": 0.7819578491234981, + "grad_norm": 1.3465242385864258, + "learning_rate": 2.391328327622704e-05, + "loss": 1.0945, + "step": 21835 + }, + { + "epoch": 0.7819936612530665, + "grad_norm": 1.6231786012649536, + "learning_rate": 2.3905757099702564e-05, + "loss": 1.2892, + "step": 21836 + }, + { + "epoch": 0.7820294733826347, + "grad_norm": 1.6316289901733398, + "learning_rate": 2.3898231946927963e-05, + "loss": 1.1894, + "step": 21837 + }, + { + "epoch": 0.782065285512203, + "grad_norm": 1.1121256351470947, + "learning_rate": 2.3890707818004522e-05, + "loss": 1.0821, + "step": 21838 + }, + { + "epoch": 0.7821010976417713, + "grad_norm": 1.4257922172546387, + "learning_rate": 2.3883184713033414e-05, + "loss": 0.955, + "step": 21839 + }, + { + "epoch": 0.7821369097713395, + "grad_norm": 1.2172980308532715, + "learning_rate": 2.387566263211586e-05, + "loss": 1.04, + "step": 21840 + }, + { + "epoch": 0.7821727219009078, + "grad_norm": 1.6653600931167603, + "learning_rate": 2.3868141575353077e-05, + "loss": 1.1184, + "step": 21841 + }, + { + "epoch": 0.7822085340304761, + "grad_norm": 2.0366933345794678, + "learning_rate": 2.3860621542846273e-05, + "loss": 1.1705, + "step": 21842 + }, + { + "epoch": 0.7822443461600445, + "grad_norm": 1.6012139320373535, + "learning_rate": 2.3853102534696557e-05, + "loss": 0.8001, + "step": 21843 + }, + { + "epoch": 0.7822801582896127, + "grad_norm": 1.415152668952942, + "learning_rate": 2.384558455100514e-05, + "loss": 1.0974, + "step": 21844 + }, + { + "epoch": 0.782315970419181, + "grad_norm": 1.4024993181228638, + "learning_rate": 2.3838067591873136e-05, + "loss": 0.9614, + "step": 21845 + }, + { + "epoch": 0.7823517825487493, + "grad_norm": 1.5571850538253784, + "learning_rate": 2.3830551657401723e-05, + "loss": 0.9457, + "step": 21846 + }, + { + "epoch": 0.7823875946783175, + "grad_norm": 1.4353337287902832, + "learning_rate": 2.3823036747691995e-05, + "loss": 0.8535, + "step": 21847 + }, + { + "epoch": 0.7824234068078858, + "grad_norm": 1.7650848627090454, + "learning_rate": 2.3815522862844985e-05, + "loss": 1.1572, + "step": 21848 + }, + { + "epoch": 0.7824592189374541, + "grad_norm": 2.0312564373016357, + "learning_rate": 2.3808010002961902e-05, + "loss": 1.0614, + "step": 21849 + }, + { + "epoch": 0.7824950310670225, + "grad_norm": 1.3023672103881836, + "learning_rate": 2.3800498168143726e-05, + "loss": 0.8399, + "step": 21850 + }, + { + "epoch": 0.7825308431965907, + "grad_norm": 1.501562476158142, + "learning_rate": 2.379298735849156e-05, + "loss": 1.0578, + "step": 21851 + }, + { + "epoch": 0.782566655326159, + "grad_norm": 1.62947678565979, + "learning_rate": 2.378547757410645e-05, + "loss": 0.9581, + "step": 21852 + }, + { + "epoch": 0.7826024674557273, + "grad_norm": 1.2570668458938599, + "learning_rate": 2.377796881508947e-05, + "loss": 0.9722, + "step": 21853 + }, + { + "epoch": 0.7826382795852955, + "grad_norm": 1.4180582761764526, + "learning_rate": 2.3770461081541563e-05, + "loss": 1.0551, + "step": 21854 + }, + { + "epoch": 0.7826740917148638, + "grad_norm": 2.39404559135437, + "learning_rate": 2.3762954373563763e-05, + "loss": 1.0529, + "step": 21855 + }, + { + "epoch": 0.7827099038444321, + "grad_norm": 1.566523551940918, + "learning_rate": 2.375544869125711e-05, + "loss": 1.1177, + "step": 21856 + }, + { + "epoch": 0.7827457159740004, + "grad_norm": 1.7578355073928833, + "learning_rate": 2.3747944034722524e-05, + "loss": 1.1485, + "step": 21857 + }, + { + "epoch": 0.7827815281035687, + "grad_norm": 1.6556320190429688, + "learning_rate": 2.3740440404061015e-05, + "loss": 0.9786, + "step": 21858 + }, + { + "epoch": 0.782817340233137, + "grad_norm": 1.704459309577942, + "learning_rate": 2.3732937799373455e-05, + "loss": 1.0199, + "step": 21859 + }, + { + "epoch": 0.7828531523627053, + "grad_norm": 1.8248927593231201, + "learning_rate": 2.37254362207609e-05, + "loss": 1.1044, + "step": 21860 + }, + { + "epoch": 0.7828889644922735, + "grad_norm": 1.4454888105392456, + "learning_rate": 2.3717935668324186e-05, + "loss": 1.0697, + "step": 21861 + }, + { + "epoch": 0.7829247766218418, + "grad_norm": 1.4102071523666382, + "learning_rate": 2.371043614216425e-05, + "loss": 0.9328, + "step": 21862 + }, + { + "epoch": 0.7829605887514101, + "grad_norm": 2.283243179321289, + "learning_rate": 2.3702937642381985e-05, + "loss": 1.0659, + "step": 21863 + }, + { + "epoch": 0.7829964008809784, + "grad_norm": 1.415614128112793, + "learning_rate": 2.369544016907831e-05, + "loss": 0.9555, + "step": 21864 + }, + { + "epoch": 0.7830322130105467, + "grad_norm": 1.6331781148910522, + "learning_rate": 2.3687943722354056e-05, + "loss": 1.3257, + "step": 21865 + }, + { + "epoch": 0.783068025140115, + "grad_norm": 1.4544334411621094, + "learning_rate": 2.3680448302310032e-05, + "loss": 1.0394, + "step": 21866 + }, + { + "epoch": 0.7831038372696832, + "grad_norm": 1.536407709121704, + "learning_rate": 2.367295390904719e-05, + "loss": 1.3471, + "step": 21867 + }, + { + "epoch": 0.7831396493992515, + "grad_norm": 1.900152325630188, + "learning_rate": 2.3665460542666263e-05, + "loss": 1.0546, + "step": 21868 + }, + { + "epoch": 0.7831754615288198, + "grad_norm": 1.8780157566070557, + "learning_rate": 2.3657968203268133e-05, + "loss": 1.1693, + "step": 21869 + }, + { + "epoch": 0.783211273658388, + "grad_norm": 1.3920987844467163, + "learning_rate": 2.365047689095351e-05, + "loss": 0.9214, + "step": 21870 + }, + { + "epoch": 0.7832470857879564, + "grad_norm": 2.2981674671173096, + "learning_rate": 2.3642986605823292e-05, + "loss": 1.2797, + "step": 21871 + }, + { + "epoch": 0.7832828979175247, + "grad_norm": 1.6486889123916626, + "learning_rate": 2.3635497347978176e-05, + "loss": 1.1218, + "step": 21872 + }, + { + "epoch": 0.783318710047093, + "grad_norm": 1.723544955253601, + "learning_rate": 2.3628009117518956e-05, + "loss": 1.0104, + "step": 21873 + }, + { + "epoch": 0.7833545221766612, + "grad_norm": 1.5684045553207397, + "learning_rate": 2.3620521914546334e-05, + "loss": 1.0474, + "step": 21874 + }, + { + "epoch": 0.7833903343062295, + "grad_norm": 1.6598535776138306, + "learning_rate": 2.361303573916107e-05, + "loss": 1.1368, + "step": 21875 + }, + { + "epoch": 0.7834261464357978, + "grad_norm": 1.392905831336975, + "learning_rate": 2.360555059146391e-05, + "loss": 1.0031, + "step": 21876 + }, + { + "epoch": 0.783461958565366, + "grad_norm": 1.419296145439148, + "learning_rate": 2.359806647155547e-05, + "loss": 0.9809, + "step": 21877 + }, + { + "epoch": 0.7834977706949344, + "grad_norm": 1.4265469312667847, + "learning_rate": 2.3590583379536535e-05, + "loss": 1.2017, + "step": 21878 + }, + { + "epoch": 0.7835335828245027, + "grad_norm": 1.6002613306045532, + "learning_rate": 2.358310131550773e-05, + "loss": 1.1175, + "step": 21879 + }, + { + "epoch": 0.783569394954071, + "grad_norm": 1.4704664945602417, + "learning_rate": 2.3575620279569743e-05, + "loss": 1.0058, + "step": 21880 + }, + { + "epoch": 0.7836052070836392, + "grad_norm": 1.2822237014770508, + "learning_rate": 2.3568140271823147e-05, + "loss": 1.1325, + "step": 21881 + }, + { + "epoch": 0.7836410192132075, + "grad_norm": 1.6081206798553467, + "learning_rate": 2.3560661292368702e-05, + "loss": 0.9958, + "step": 21882 + }, + { + "epoch": 0.7836768313427758, + "grad_norm": 1.6080671548843384, + "learning_rate": 2.355318334130695e-05, + "loss": 1.0364, + "step": 21883 + }, + { + "epoch": 0.783712643472344, + "grad_norm": 1.755091667175293, + "learning_rate": 2.3545706418738476e-05, + "loss": 1.1527, + "step": 21884 + }, + { + "epoch": 0.7837484556019124, + "grad_norm": 1.415678858757019, + "learning_rate": 2.3538230524763914e-05, + "loss": 1.1403, + "step": 21885 + }, + { + "epoch": 0.7837842677314807, + "grad_norm": 1.199790596961975, + "learning_rate": 2.353075565948383e-05, + "loss": 0.8324, + "step": 21886 + }, + { + "epoch": 0.783820079861049, + "grad_norm": 1.5532630681991577, + "learning_rate": 2.352328182299881e-05, + "loss": 0.931, + "step": 21887 + }, + { + "epoch": 0.7838558919906172, + "grad_norm": 1.8039908409118652, + "learning_rate": 2.351580901540933e-05, + "loss": 1.0251, + "step": 21888 + }, + { + "epoch": 0.7838917041201855, + "grad_norm": 1.3298498392105103, + "learning_rate": 2.3508337236816047e-05, + "loss": 1.1329, + "step": 21889 + }, + { + "epoch": 0.7839275162497538, + "grad_norm": 1.4615401029586792, + "learning_rate": 2.3500866487319384e-05, + "loss": 0.8724, + "step": 21890 + }, + { + "epoch": 0.783963328379322, + "grad_norm": 1.901906967163086, + "learning_rate": 2.3493396767019915e-05, + "loss": 1.3646, + "step": 21891 + }, + { + "epoch": 0.7839991405088904, + "grad_norm": 1.5949268341064453, + "learning_rate": 2.348592807601808e-05, + "loss": 1.1067, + "step": 21892 + }, + { + "epoch": 0.7840349526384587, + "grad_norm": 1.4763333797454834, + "learning_rate": 2.3478460414414382e-05, + "loss": 0.9347, + "step": 21893 + }, + { + "epoch": 0.784070764768027, + "grad_norm": 1.5174920558929443, + "learning_rate": 2.3470993782309324e-05, + "loss": 1.0226, + "step": 21894 + }, + { + "epoch": 0.7841065768975952, + "grad_norm": 1.3392835855484009, + "learning_rate": 2.3463528179803305e-05, + "loss": 0.9551, + "step": 21895 + }, + { + "epoch": 0.7841423890271635, + "grad_norm": 1.3507832288742065, + "learning_rate": 2.3456063606996783e-05, + "loss": 1.0753, + "step": 21896 + }, + { + "epoch": 0.7841782011567318, + "grad_norm": 1.4563134908676147, + "learning_rate": 2.34486000639902e-05, + "loss": 0.9251, + "step": 21897 + }, + { + "epoch": 0.7842140132863, + "grad_norm": 1.5724810361862183, + "learning_rate": 2.3441137550883974e-05, + "loss": 0.9729, + "step": 21898 + }, + { + "epoch": 0.7842498254158684, + "grad_norm": 1.392591118812561, + "learning_rate": 2.3433676067778465e-05, + "loss": 1.1963, + "step": 21899 + }, + { + "epoch": 0.7842856375454367, + "grad_norm": 1.5075501203536987, + "learning_rate": 2.3426215614774094e-05, + "loss": 1.1203, + "step": 21900 + }, + { + "epoch": 0.784321449675005, + "grad_norm": 1.2891100645065308, + "learning_rate": 2.3418756191971235e-05, + "loss": 1.2522, + "step": 21901 + }, + { + "epoch": 0.7843572618045732, + "grad_norm": 1.4839231967926025, + "learning_rate": 2.34112977994702e-05, + "loss": 1.2054, + "step": 21902 + }, + { + "epoch": 0.7843930739341415, + "grad_norm": 1.7070086002349854, + "learning_rate": 2.340384043737136e-05, + "loss": 0.9971, + "step": 21903 + }, + { + "epoch": 0.7844288860637098, + "grad_norm": 1.6335558891296387, + "learning_rate": 2.339638410577505e-05, + "loss": 1.1709, + "step": 21904 + }, + { + "epoch": 0.784464698193278, + "grad_norm": 1.4616601467132568, + "learning_rate": 2.3388928804781608e-05, + "loss": 0.9529, + "step": 21905 + }, + { + "epoch": 0.7845005103228464, + "grad_norm": 1.8844996690750122, + "learning_rate": 2.3381474534491276e-05, + "loss": 0.9967, + "step": 21906 + }, + { + "epoch": 0.7845363224524147, + "grad_norm": 1.4292160272598267, + "learning_rate": 2.337402129500438e-05, + "loss": 0.8812, + "step": 21907 + }, + { + "epoch": 0.7845721345819829, + "grad_norm": 2.0051212310791016, + "learning_rate": 2.3366569086421175e-05, + "loss": 1.1001, + "step": 21908 + }, + { + "epoch": 0.7846079467115512, + "grad_norm": 1.3315309286117554, + "learning_rate": 2.3359117908841966e-05, + "loss": 1.1158, + "step": 21909 + }, + { + "epoch": 0.7846437588411195, + "grad_norm": 1.218713402748108, + "learning_rate": 2.3351667762366948e-05, + "loss": 1.0233, + "step": 21910 + }, + { + "epoch": 0.7846795709706877, + "grad_norm": 1.6550421714782715, + "learning_rate": 2.334421864709636e-05, + "loss": 1.2571, + "step": 21911 + }, + { + "epoch": 0.784715383100256, + "grad_norm": 1.8413151502609253, + "learning_rate": 2.3336770563130463e-05, + "loss": 1.1149, + "step": 21912 + }, + { + "epoch": 0.7847511952298244, + "grad_norm": 1.8804212808609009, + "learning_rate": 2.33293235105694e-05, + "loss": 1.0872, + "step": 21913 + }, + { + "epoch": 0.7847870073593927, + "grad_norm": 1.3501869440078735, + "learning_rate": 2.332187748951339e-05, + "loss": 0.9592, + "step": 21914 + }, + { + "epoch": 0.7848228194889609, + "grad_norm": 1.488682508468628, + "learning_rate": 2.331443250006261e-05, + "loss": 1.107, + "step": 21915 + }, + { + "epoch": 0.7848586316185292, + "grad_norm": 1.6136651039123535, + "learning_rate": 2.3306988542317255e-05, + "loss": 0.9304, + "step": 21916 + }, + { + "epoch": 0.7848944437480975, + "grad_norm": 1.2459163665771484, + "learning_rate": 2.3299545616377415e-05, + "loss": 0.9683, + "step": 21917 + }, + { + "epoch": 0.7849302558776657, + "grad_norm": 1.1827479600906372, + "learning_rate": 2.329210372234325e-05, + "loss": 1.0687, + "step": 21918 + }, + { + "epoch": 0.784966068007234, + "grad_norm": 1.6253210306167603, + "learning_rate": 2.3284662860314922e-05, + "loss": 0.9253, + "step": 21919 + }, + { + "epoch": 0.7850018801368024, + "grad_norm": 1.4664719104766846, + "learning_rate": 2.3277223030392458e-05, + "loss": 0.9674, + "step": 21920 + }, + { + "epoch": 0.7850376922663707, + "grad_norm": 1.6888189315795898, + "learning_rate": 2.3269784232675995e-05, + "loss": 1.0811, + "step": 21921 + }, + { + "epoch": 0.7850735043959389, + "grad_norm": 1.6080164909362793, + "learning_rate": 2.3262346467265605e-05, + "loss": 1.0052, + "step": 21922 + }, + { + "epoch": 0.7851093165255072, + "grad_norm": 1.3008686304092407, + "learning_rate": 2.3254909734261398e-05, + "loss": 1.0901, + "step": 21923 + }, + { + "epoch": 0.7851451286550755, + "grad_norm": 1.6347086429595947, + "learning_rate": 2.324747403376336e-05, + "loss": 1.1727, + "step": 21924 + }, + { + "epoch": 0.7851809407846437, + "grad_norm": 2.049722194671631, + "learning_rate": 2.3240039365871546e-05, + "loss": 1.2716, + "step": 21925 + }, + { + "epoch": 0.785216752914212, + "grad_norm": 2.0162153244018555, + "learning_rate": 2.3232605730686018e-05, + "loss": 1.1383, + "step": 21926 + }, + { + "epoch": 0.7852525650437804, + "grad_norm": 1.7006267309188843, + "learning_rate": 2.3225173128306733e-05, + "loss": 1.0467, + "step": 21927 + }, + { + "epoch": 0.7852883771733487, + "grad_norm": 1.626059889793396, + "learning_rate": 2.3217741558833706e-05, + "loss": 1.1544, + "step": 21928 + }, + { + "epoch": 0.7853241893029169, + "grad_norm": 1.331045389175415, + "learning_rate": 2.321031102236694e-05, + "loss": 0.8946, + "step": 21929 + }, + { + "epoch": 0.7853600014324852, + "grad_norm": 1.5286102294921875, + "learning_rate": 2.3202881519006393e-05, + "loss": 1.0236, + "step": 21930 + }, + { + "epoch": 0.7853958135620535, + "grad_norm": 1.6348634958267212, + "learning_rate": 2.3195453048852e-05, + "loss": 1.2326, + "step": 21931 + }, + { + "epoch": 0.7854316256916217, + "grad_norm": 1.533511757850647, + "learning_rate": 2.3188025612003718e-05, + "loss": 1.2853, + "step": 21932 + }, + { + "epoch": 0.78546743782119, + "grad_norm": 1.292959213256836, + "learning_rate": 2.318059920856146e-05, + "loss": 1.0306, + "step": 21933 + }, + { + "epoch": 0.7855032499507584, + "grad_norm": 1.4405916929244995, + "learning_rate": 2.3173173838625183e-05, + "loss": 1.0387, + "step": 21934 + }, + { + "epoch": 0.7855390620803266, + "grad_norm": 1.499813437461853, + "learning_rate": 2.316574950229472e-05, + "loss": 1.2457, + "step": 21935 + }, + { + "epoch": 0.7855748742098949, + "grad_norm": 1.5283528566360474, + "learning_rate": 2.315832619967e-05, + "loss": 0.83, + "step": 21936 + }, + { + "epoch": 0.7856106863394632, + "grad_norm": 1.590445876121521, + "learning_rate": 2.3150903930850896e-05, + "loss": 1.1335, + "step": 21937 + }, + { + "epoch": 0.7856464984690315, + "grad_norm": 1.4689871072769165, + "learning_rate": 2.3143482695937235e-05, + "loss": 0.9381, + "step": 21938 + }, + { + "epoch": 0.7856823105985997, + "grad_norm": 1.3919997215270996, + "learning_rate": 2.3136062495028876e-05, + "loss": 1.0859, + "step": 21939 + }, + { + "epoch": 0.785718122728168, + "grad_norm": 1.3706276416778564, + "learning_rate": 2.312864332822564e-05, + "loss": 1.0972, + "step": 21940 + }, + { + "epoch": 0.7857539348577364, + "grad_norm": 2.020033359527588, + "learning_rate": 2.3121225195627382e-05, + "loss": 1.2623, + "step": 21941 + }, + { + "epoch": 0.7857897469873046, + "grad_norm": 1.5927590131759644, + "learning_rate": 2.3113808097333854e-05, + "loss": 1.1064, + "step": 21942 + }, + { + "epoch": 0.7858255591168729, + "grad_norm": 1.1250602006912231, + "learning_rate": 2.3106392033444856e-05, + "loss": 0.9093, + "step": 21943 + }, + { + "epoch": 0.7858613712464412, + "grad_norm": 1.3710001707077026, + "learning_rate": 2.3098977004060185e-05, + "loss": 1.0558, + "step": 21944 + }, + { + "epoch": 0.7858971833760094, + "grad_norm": 1.4555498361587524, + "learning_rate": 2.309156300927957e-05, + "loss": 1.1247, + "step": 21945 + }, + { + "epoch": 0.7859329955055777, + "grad_norm": 1.5381888151168823, + "learning_rate": 2.308415004920277e-05, + "loss": 1.1227, + "step": 21946 + }, + { + "epoch": 0.785968807635146, + "grad_norm": 1.6672282218933105, + "learning_rate": 2.307673812392951e-05, + "loss": 1.2153, + "step": 21947 + }, + { + "epoch": 0.7860046197647144, + "grad_norm": 1.3715492486953735, + "learning_rate": 2.3069327233559533e-05, + "loss": 0.9082, + "step": 21948 + }, + { + "epoch": 0.7860404318942826, + "grad_norm": 1.4816392660140991, + "learning_rate": 2.306191737819251e-05, + "loss": 0.956, + "step": 21949 + }, + { + "epoch": 0.7860762440238509, + "grad_norm": 1.5166058540344238, + "learning_rate": 2.3054508557928144e-05, + "loss": 1.1869, + "step": 21950 + }, + { + "epoch": 0.7861120561534192, + "grad_norm": 1.4297915697097778, + "learning_rate": 2.3047100772866114e-05, + "loss": 1.0246, + "step": 21951 + }, + { + "epoch": 0.7861478682829874, + "grad_norm": 1.752697467803955, + "learning_rate": 2.3039694023106106e-05, + "loss": 0.9983, + "step": 21952 + }, + { + "epoch": 0.7861836804125557, + "grad_norm": 1.4244431257247925, + "learning_rate": 2.303228830874775e-05, + "loss": 0.9887, + "step": 21953 + }, + { + "epoch": 0.786219492542124, + "grad_norm": 1.629014015197754, + "learning_rate": 2.3024883629890604e-05, + "loss": 1.1126, + "step": 21954 + }, + { + "epoch": 0.7862553046716924, + "grad_norm": 1.6044089794158936, + "learning_rate": 2.3017479986634426e-05, + "loss": 1.1279, + "step": 21955 + }, + { + "epoch": 0.7862911168012606, + "grad_norm": 1.5099190473556519, + "learning_rate": 2.3010077379078722e-05, + "loss": 1.0131, + "step": 21956 + }, + { + "epoch": 0.7863269289308289, + "grad_norm": 1.4597383737564087, + "learning_rate": 2.300267580732315e-05, + "loss": 1.0151, + "step": 21957 + }, + { + "epoch": 0.7863627410603972, + "grad_norm": 1.5485810041427612, + "learning_rate": 2.2995275271467187e-05, + "loss": 1.168, + "step": 21958 + }, + { + "epoch": 0.7863985531899654, + "grad_norm": 1.5265846252441406, + "learning_rate": 2.2987875771610534e-05, + "loss": 1.0362, + "step": 21959 + }, + { + "epoch": 0.7864343653195337, + "grad_norm": 1.350969910621643, + "learning_rate": 2.2980477307852642e-05, + "loss": 0.9105, + "step": 21960 + }, + { + "epoch": 0.786470177449102, + "grad_norm": 1.49599027633667, + "learning_rate": 2.297307988029308e-05, + "loss": 1.1532, + "step": 21961 + }, + { + "epoch": 0.7865059895786704, + "grad_norm": 1.3857998847961426, + "learning_rate": 2.29656834890314e-05, + "loss": 0.9965, + "step": 21962 + }, + { + "epoch": 0.7865418017082386, + "grad_norm": 1.3580697774887085, + "learning_rate": 2.2958288134167048e-05, + "loss": 0.9419, + "step": 21963 + }, + { + "epoch": 0.7865776138378069, + "grad_norm": 1.3603800535202026, + "learning_rate": 2.295089381579959e-05, + "loss": 1.0446, + "step": 21964 + }, + { + "epoch": 0.7866134259673752, + "grad_norm": 1.5096733570098877, + "learning_rate": 2.2943500534028406e-05, + "loss": 1.0581, + "step": 21965 + }, + { + "epoch": 0.7866492380969434, + "grad_norm": 1.702592372894287, + "learning_rate": 2.2936108288953083e-05, + "loss": 0.9993, + "step": 21966 + }, + { + "epoch": 0.7866850502265117, + "grad_norm": 1.6337474584579468, + "learning_rate": 2.292871708067299e-05, + "loss": 0.9851, + "step": 21967 + }, + { + "epoch": 0.78672086235608, + "grad_norm": 1.9383947849273682, + "learning_rate": 2.2921326909287634e-05, + "loss": 1.2315, + "step": 21968 + }, + { + "epoch": 0.7867566744856483, + "grad_norm": 1.7297282218933105, + "learning_rate": 2.291393777489632e-05, + "loss": 1.0376, + "step": 21969 + }, + { + "epoch": 0.7867924866152166, + "grad_norm": 1.1722561120986938, + "learning_rate": 2.290654967759862e-05, + "loss": 1.1503, + "step": 21970 + }, + { + "epoch": 0.7868282987447849, + "grad_norm": 1.8606183528900146, + "learning_rate": 2.289916261749383e-05, + "loss": 0.9781, + "step": 21971 + }, + { + "epoch": 0.7868641108743532, + "grad_norm": 1.1927242279052734, + "learning_rate": 2.2891776594681315e-05, + "loss": 1.0438, + "step": 21972 + }, + { + "epoch": 0.7868999230039214, + "grad_norm": 2.0364344120025635, + "learning_rate": 2.2884391609260525e-05, + "loss": 0.954, + "step": 21973 + }, + { + "epoch": 0.7869357351334897, + "grad_norm": 1.3245285749435425, + "learning_rate": 2.2877007661330762e-05, + "loss": 1.1383, + "step": 21974 + }, + { + "epoch": 0.786971547263058, + "grad_norm": 1.5655508041381836, + "learning_rate": 2.2869624750991393e-05, + "loss": 1.071, + "step": 21975 + }, + { + "epoch": 0.7870073593926263, + "grad_norm": 1.3992011547088623, + "learning_rate": 2.2862242878341678e-05, + "loss": 1.3116, + "step": 21976 + }, + { + "epoch": 0.7870431715221946, + "grad_norm": 1.4866069555282593, + "learning_rate": 2.285486204348105e-05, + "loss": 1.2158, + "step": 21977 + }, + { + "epoch": 0.7870789836517629, + "grad_norm": 1.6836544275283813, + "learning_rate": 2.284748224650871e-05, + "loss": 1.1444, + "step": 21978 + }, + { + "epoch": 0.7871147957813311, + "grad_norm": 1.8827142715454102, + "learning_rate": 2.2840103487524e-05, + "loss": 1.0821, + "step": 21979 + }, + { + "epoch": 0.7871506079108994, + "grad_norm": 1.4224773645401, + "learning_rate": 2.283272576662615e-05, + "loss": 1.0712, + "step": 21980 + }, + { + "epoch": 0.7871864200404677, + "grad_norm": 1.4673067331314087, + "learning_rate": 2.2825349083914426e-05, + "loss": 1.0805, + "step": 21981 + }, + { + "epoch": 0.787222232170036, + "grad_norm": 1.6439698934555054, + "learning_rate": 2.2817973439488117e-05, + "loss": 0.9744, + "step": 21982 + }, + { + "epoch": 0.7872580442996042, + "grad_norm": 1.706809401512146, + "learning_rate": 2.2810598833446382e-05, + "loss": 0.998, + "step": 21983 + }, + { + "epoch": 0.7872938564291726, + "grad_norm": 1.424228310585022, + "learning_rate": 2.2803225265888484e-05, + "loss": 1.1672, + "step": 21984 + }, + { + "epoch": 0.7873296685587409, + "grad_norm": 1.7280818223953247, + "learning_rate": 2.2795852736913604e-05, + "loss": 1.1265, + "step": 21985 + }, + { + "epoch": 0.7873654806883091, + "grad_norm": 1.6754783391952515, + "learning_rate": 2.2788481246620973e-05, + "loss": 0.9948, + "step": 21986 + }, + { + "epoch": 0.7874012928178774, + "grad_norm": 1.3277044296264648, + "learning_rate": 2.2781110795109674e-05, + "loss": 1.1395, + "step": 21987 + }, + { + "epoch": 0.7874371049474457, + "grad_norm": 1.32645583152771, + "learning_rate": 2.2773741382478975e-05, + "loss": 1.1476, + "step": 21988 + }, + { + "epoch": 0.787472917077014, + "grad_norm": 1.3886079788208008, + "learning_rate": 2.276637300882797e-05, + "loss": 1.1362, + "step": 21989 + }, + { + "epoch": 0.7875087292065822, + "grad_norm": 1.4849098920822144, + "learning_rate": 2.2759005674255774e-05, + "loss": 0.9985, + "step": 21990 + }, + { + "epoch": 0.7875445413361506, + "grad_norm": 1.4226573705673218, + "learning_rate": 2.275163937886151e-05, + "loss": 1.1055, + "step": 21991 + }, + { + "epoch": 0.7875803534657189, + "grad_norm": 1.6589772701263428, + "learning_rate": 2.2744274122744304e-05, + "loss": 0.9821, + "step": 21992 + }, + { + "epoch": 0.7876161655952871, + "grad_norm": 1.9574581384658813, + "learning_rate": 2.2736909906003266e-05, + "loss": 1.3348, + "step": 21993 + }, + { + "epoch": 0.7876519777248554, + "grad_norm": 1.2710574865341187, + "learning_rate": 2.2729546728737416e-05, + "loss": 1.1796, + "step": 21994 + }, + { + "epoch": 0.7876877898544237, + "grad_norm": 1.500236988067627, + "learning_rate": 2.2722184591045835e-05, + "loss": 1.1743, + "step": 21995 + }, + { + "epoch": 0.7877236019839919, + "grad_norm": 1.5090901851654053, + "learning_rate": 2.2714823493027583e-05, + "loss": 1.1147, + "step": 21996 + }, + { + "epoch": 0.7877594141135602, + "grad_norm": 1.3683171272277832, + "learning_rate": 2.2707463434781718e-05, + "loss": 1.0815, + "step": 21997 + }, + { + "epoch": 0.7877952262431286, + "grad_norm": 1.4178590774536133, + "learning_rate": 2.2700104416407208e-05, + "loss": 1.2704, + "step": 21998 + }, + { + "epoch": 0.7878310383726969, + "grad_norm": 1.326702356338501, + "learning_rate": 2.2692746438003078e-05, + "loss": 0.9591, + "step": 21999 + }, + { + "epoch": 0.7878668505022651, + "grad_norm": 1.8684910535812378, + "learning_rate": 2.2685389499668352e-05, + "loss": 1.0542, + "step": 22000 + }, + { + "epoch": 0.7879026626318334, + "grad_norm": 1.4931037425994873, + "learning_rate": 2.2678033601501957e-05, + "loss": 1.1896, + "step": 22001 + }, + { + "epoch": 0.7879384747614017, + "grad_norm": 1.1716651916503906, + "learning_rate": 2.2670678743602892e-05, + "loss": 0.9663, + "step": 22002 + }, + { + "epoch": 0.7879742868909699, + "grad_norm": 1.3023349046707153, + "learning_rate": 2.2663324926070086e-05, + "loss": 0.9608, + "step": 22003 + }, + { + "epoch": 0.7880100990205382, + "grad_norm": 1.5583657026290894, + "learning_rate": 2.2655972149002512e-05, + "loss": 0.9266, + "step": 22004 + }, + { + "epoch": 0.7880459111501066, + "grad_norm": 1.939138650894165, + "learning_rate": 2.2648620412499045e-05, + "loss": 1.2819, + "step": 22005 + }, + { + "epoch": 0.7880817232796749, + "grad_norm": 1.5104280710220337, + "learning_rate": 2.264126971665861e-05, + "loss": 0.9845, + "step": 22006 + }, + { + "epoch": 0.7881175354092431, + "grad_norm": 1.6967785358428955, + "learning_rate": 2.2633920061580127e-05, + "loss": 1.3342, + "step": 22007 + }, + { + "epoch": 0.7881533475388114, + "grad_norm": 1.2662626504898071, + "learning_rate": 2.262657144736243e-05, + "loss": 0.9072, + "step": 22008 + }, + { + "epoch": 0.7881891596683797, + "grad_norm": 1.581868052482605, + "learning_rate": 2.2619223874104423e-05, + "loss": 1.215, + "step": 22009 + }, + { + "epoch": 0.7882249717979479, + "grad_norm": 1.4931602478027344, + "learning_rate": 2.261187734190493e-05, + "loss": 1.0888, + "step": 22010 + }, + { + "epoch": 0.7882607839275162, + "grad_norm": 1.5180003643035889, + "learning_rate": 2.2604531850862832e-05, + "loss": 1.1686, + "step": 22011 + }, + { + "epoch": 0.7882965960570846, + "grad_norm": 1.9874860048294067, + "learning_rate": 2.2597187401076903e-05, + "loss": 1.1615, + "step": 22012 + }, + { + "epoch": 0.7883324081866528, + "grad_norm": 1.7115627527236938, + "learning_rate": 2.2589843992645977e-05, + "loss": 1.2172, + "step": 22013 + }, + { + "epoch": 0.7883682203162211, + "grad_norm": 1.3964126110076904, + "learning_rate": 2.258250162566887e-05, + "loss": 1.1261, + "step": 22014 + }, + { + "epoch": 0.7884040324457894, + "grad_norm": 1.5460100173950195, + "learning_rate": 2.2575160300244314e-05, + "loss": 0.985, + "step": 22015 + }, + { + "epoch": 0.7884398445753577, + "grad_norm": 1.5955613851547241, + "learning_rate": 2.2567820016471107e-05, + "loss": 0.9252, + "step": 22016 + }, + { + "epoch": 0.7884756567049259, + "grad_norm": 1.40509033203125, + "learning_rate": 2.256048077444801e-05, + "loss": 1.0683, + "step": 22017 + }, + { + "epoch": 0.7885114688344942, + "grad_norm": 1.67960524559021, + "learning_rate": 2.2553142574273777e-05, + "loss": 1.1115, + "step": 22018 + }, + { + "epoch": 0.7885472809640626, + "grad_norm": 1.6528449058532715, + "learning_rate": 2.2545805416047073e-05, + "loss": 1.1016, + "step": 22019 + }, + { + "epoch": 0.7885830930936308, + "grad_norm": 1.3431240320205688, + "learning_rate": 2.253846929986666e-05, + "loss": 0.9806, + "step": 22020 + }, + { + "epoch": 0.7886189052231991, + "grad_norm": 1.6507097482681274, + "learning_rate": 2.253113422583122e-05, + "loss": 0.9012, + "step": 22021 + }, + { + "epoch": 0.7886547173527674, + "grad_norm": 1.2954683303833008, + "learning_rate": 2.252380019403947e-05, + "loss": 0.9316, + "step": 22022 + }, + { + "epoch": 0.7886905294823356, + "grad_norm": 1.175822377204895, + "learning_rate": 2.251646720459003e-05, + "loss": 0.918, + "step": 22023 + }, + { + "epoch": 0.7887263416119039, + "grad_norm": 1.583213210105896, + "learning_rate": 2.250913525758157e-05, + "loss": 1.0933, + "step": 22024 + }, + { + "epoch": 0.7887621537414722, + "grad_norm": 1.4419997930526733, + "learning_rate": 2.2501804353112765e-05, + "loss": 1.282, + "step": 22025 + }, + { + "epoch": 0.7887979658710406, + "grad_norm": 1.298425316810608, + "learning_rate": 2.249447449128219e-05, + "loss": 1.1982, + "step": 22026 + }, + { + "epoch": 0.7888337780006088, + "grad_norm": 1.1414369344711304, + "learning_rate": 2.248714567218849e-05, + "loss": 0.8513, + "step": 22027 + }, + { + "epoch": 0.7888695901301771, + "grad_norm": 1.8769159317016602, + "learning_rate": 2.2479817895930256e-05, + "loss": 0.9762, + "step": 22028 + }, + { + "epoch": 0.7889054022597454, + "grad_norm": 1.842009425163269, + "learning_rate": 2.247249116260611e-05, + "loss": 1.1736, + "step": 22029 + }, + { + "epoch": 0.7889412143893136, + "grad_norm": 1.4401757717132568, + "learning_rate": 2.2465165472314564e-05, + "loss": 1.0488, + "step": 22030 + }, + { + "epoch": 0.7889770265188819, + "grad_norm": 1.2964870929718018, + "learning_rate": 2.2457840825154198e-05, + "loss": 0.9498, + "step": 22031 + }, + { + "epoch": 0.7890128386484502, + "grad_norm": 1.363722801208496, + "learning_rate": 2.24505172212236e-05, + "loss": 1.0263, + "step": 22032 + }, + { + "epoch": 0.7890486507780186, + "grad_norm": 2.243387460708618, + "learning_rate": 2.2443194660621225e-05, + "loss": 1.1656, + "step": 22033 + }, + { + "epoch": 0.7890844629075868, + "grad_norm": 1.3242021799087524, + "learning_rate": 2.243587314344563e-05, + "loss": 1.0189, + "step": 22034 + }, + { + "epoch": 0.7891202750371551, + "grad_norm": 1.4478368759155273, + "learning_rate": 2.242855266979531e-05, + "loss": 1.0214, + "step": 22035 + }, + { + "epoch": 0.7891560871667234, + "grad_norm": 1.6955026388168335, + "learning_rate": 2.242123323976878e-05, + "loss": 1.2531, + "step": 22036 + }, + { + "epoch": 0.7891918992962916, + "grad_norm": 1.8974802494049072, + "learning_rate": 2.2413914853464455e-05, + "loss": 1.1444, + "step": 22037 + }, + { + "epoch": 0.7892277114258599, + "grad_norm": 1.6036583185195923, + "learning_rate": 2.240659751098083e-05, + "loss": 1.123, + "step": 22038 + }, + { + "epoch": 0.7892635235554282, + "grad_norm": 1.4045767784118652, + "learning_rate": 2.2399281212416346e-05, + "loss": 1.0008, + "step": 22039 + }, + { + "epoch": 0.7892993356849966, + "grad_norm": 1.3689957857131958, + "learning_rate": 2.2391965957869464e-05, + "loss": 1.1835, + "step": 22040 + }, + { + "epoch": 0.7893351478145648, + "grad_norm": 1.631499171257019, + "learning_rate": 2.2384651747438578e-05, + "loss": 1.107, + "step": 22041 + }, + { + "epoch": 0.7893709599441331, + "grad_norm": 1.402976393699646, + "learning_rate": 2.237733858122203e-05, + "loss": 1.1379, + "step": 22042 + }, + { + "epoch": 0.7894067720737014, + "grad_norm": 1.606776475906372, + "learning_rate": 2.2370026459318315e-05, + "loss": 1.1683, + "step": 22043 + }, + { + "epoch": 0.7894425842032696, + "grad_norm": 1.6349503993988037, + "learning_rate": 2.236271538182574e-05, + "loss": 1.1187, + "step": 22044 + }, + { + "epoch": 0.7894783963328379, + "grad_norm": 1.4535279273986816, + "learning_rate": 2.2355405348842672e-05, + "loss": 1.0146, + "step": 22045 + }, + { + "epoch": 0.7895142084624062, + "grad_norm": 1.6503729820251465, + "learning_rate": 2.2348096360467484e-05, + "loss": 1.0529, + "step": 22046 + }, + { + "epoch": 0.7895500205919745, + "grad_norm": 1.453033685684204, + "learning_rate": 2.2340788416798518e-05, + "loss": 1.0093, + "step": 22047 + }, + { + "epoch": 0.7895858327215428, + "grad_norm": 1.3581149578094482, + "learning_rate": 2.233348151793404e-05, + "loss": 0.8646, + "step": 22048 + }, + { + "epoch": 0.7896216448511111, + "grad_norm": 1.290942907333374, + "learning_rate": 2.232617566397238e-05, + "loss": 0.9498, + "step": 22049 + }, + { + "epoch": 0.7896574569806794, + "grad_norm": 1.3281900882720947, + "learning_rate": 2.2318870855011874e-05, + "loss": 1.1578, + "step": 22050 + }, + { + "epoch": 0.7896932691102476, + "grad_norm": 1.3277130126953125, + "learning_rate": 2.231156709115073e-05, + "loss": 0.9814, + "step": 22051 + }, + { + "epoch": 0.7897290812398159, + "grad_norm": 1.8864343166351318, + "learning_rate": 2.230426437248726e-05, + "loss": 0.9505, + "step": 22052 + }, + { + "epoch": 0.7897648933693842, + "grad_norm": 1.2881742715835571, + "learning_rate": 2.229696269911965e-05, + "loss": 0.8749, + "step": 22053 + }, + { + "epoch": 0.7898007054989525, + "grad_norm": 1.2098979949951172, + "learning_rate": 2.228966207114622e-05, + "loss": 1.2307, + "step": 22054 + }, + { + "epoch": 0.7898365176285208, + "grad_norm": 1.355217695236206, + "learning_rate": 2.228236248866512e-05, + "loss": 1.0482, + "step": 22055 + }, + { + "epoch": 0.7898723297580891, + "grad_norm": 2.021214723587036, + "learning_rate": 2.2275063951774587e-05, + "loss": 1.1746, + "step": 22056 + }, + { + "epoch": 0.7899081418876573, + "grad_norm": 1.5123860836029053, + "learning_rate": 2.2267766460572814e-05, + "loss": 1.0249, + "step": 22057 + }, + { + "epoch": 0.7899439540172256, + "grad_norm": 1.3702620267868042, + "learning_rate": 2.226047001515801e-05, + "loss": 1.1832, + "step": 22058 + }, + { + "epoch": 0.7899797661467939, + "grad_norm": 1.4638245105743408, + "learning_rate": 2.225317461562829e-05, + "loss": 0.9809, + "step": 22059 + }, + { + "epoch": 0.7900155782763622, + "grad_norm": 1.687139868736267, + "learning_rate": 2.2245880262081774e-05, + "loss": 1.1822, + "step": 22060 + }, + { + "epoch": 0.7900513904059305, + "grad_norm": 1.829280138015747, + "learning_rate": 2.223858695461669e-05, + "loss": 1.12, + "step": 22061 + }, + { + "epoch": 0.7900872025354988, + "grad_norm": 1.1998451948165894, + "learning_rate": 2.2231294693331096e-05, + "loss": 1.0281, + "step": 22062 + }, + { + "epoch": 0.7901230146650671, + "grad_norm": 1.2730560302734375, + "learning_rate": 2.222400347832314e-05, + "loss": 1.1316, + "step": 22063 + }, + { + "epoch": 0.7901588267946353, + "grad_norm": 1.4394582509994507, + "learning_rate": 2.221671330969084e-05, + "loss": 1.1237, + "step": 22064 + }, + { + "epoch": 0.7901946389242036, + "grad_norm": 1.565589427947998, + "learning_rate": 2.220942418753238e-05, + "loss": 1.0893, + "step": 22065 + }, + { + "epoch": 0.7902304510537719, + "grad_norm": 1.4843153953552246, + "learning_rate": 2.220213611194576e-05, + "loss": 1.2373, + "step": 22066 + }, + { + "epoch": 0.7902662631833401, + "grad_norm": 1.6205463409423828, + "learning_rate": 2.2194849083029057e-05, + "loss": 0.9825, + "step": 22067 + }, + { + "epoch": 0.7903020753129085, + "grad_norm": 1.4530123472213745, + "learning_rate": 2.2187563100880282e-05, + "loss": 1.1681, + "step": 22068 + }, + { + "epoch": 0.7903378874424768, + "grad_norm": 1.4935742616653442, + "learning_rate": 2.2180278165597467e-05, + "loss": 1.15, + "step": 22069 + }, + { + "epoch": 0.7903736995720451, + "grad_norm": 1.4750173091888428, + "learning_rate": 2.2172994277278668e-05, + "loss": 1.0965, + "step": 22070 + }, + { + "epoch": 0.7904095117016133, + "grad_norm": 1.5973377227783203, + "learning_rate": 2.2165711436021774e-05, + "loss": 0.9669, + "step": 22071 + }, + { + "epoch": 0.7904453238311816, + "grad_norm": 1.5652098655700684, + "learning_rate": 2.2158429641924895e-05, + "loss": 0.9675, + "step": 22072 + }, + { + "epoch": 0.7904811359607499, + "grad_norm": 1.3553258180618286, + "learning_rate": 2.2151148895085906e-05, + "loss": 1.1853, + "step": 22073 + }, + { + "epoch": 0.7905169480903181, + "grad_norm": 1.766391634941101, + "learning_rate": 2.2143869195602816e-05, + "loss": 1.1102, + "step": 22074 + }, + { + "epoch": 0.7905527602198865, + "grad_norm": 1.4652959108352661, + "learning_rate": 2.2136590543573497e-05, + "loss": 1.1325, + "step": 22075 + }, + { + "epoch": 0.7905885723494548, + "grad_norm": 1.2332442998886108, + "learning_rate": 2.212931293909596e-05, + "loss": 0.959, + "step": 22076 + }, + { + "epoch": 0.7906243844790231, + "grad_norm": 1.5910663604736328, + "learning_rate": 2.2122036382268074e-05, + "loss": 0.7739, + "step": 22077 + }, + { + "epoch": 0.7906601966085913, + "grad_norm": 1.4421038627624512, + "learning_rate": 2.21147608731877e-05, + "loss": 1.2548, + "step": 22078 + }, + { + "epoch": 0.7906960087381596, + "grad_norm": 1.2055180072784424, + "learning_rate": 2.210748641195276e-05, + "loss": 1.1221, + "step": 22079 + }, + { + "epoch": 0.7907318208677279, + "grad_norm": 1.4019607305526733, + "learning_rate": 2.210021299866112e-05, + "loss": 1.0909, + "step": 22080 + }, + { + "epoch": 0.7907676329972961, + "grad_norm": 1.5090807676315308, + "learning_rate": 2.209294063341065e-05, + "loss": 1.0401, + "step": 22081 + }, + { + "epoch": 0.7908034451268645, + "grad_norm": 1.5741490125656128, + "learning_rate": 2.2085669316299117e-05, + "loss": 1.3295, + "step": 22082 + }, + { + "epoch": 0.7908392572564328, + "grad_norm": 1.686299204826355, + "learning_rate": 2.207839904742446e-05, + "loss": 1.0902, + "step": 22083 + }, + { + "epoch": 0.790875069386001, + "grad_norm": 1.6036038398742676, + "learning_rate": 2.2071129826884397e-05, + "loss": 0.9548, + "step": 22084 + }, + { + "epoch": 0.7909108815155693, + "grad_norm": 1.3684147596359253, + "learning_rate": 2.2063861654776798e-05, + "loss": 1.1014, + "step": 22085 + }, + { + "epoch": 0.7909466936451376, + "grad_norm": 1.2804194688796997, + "learning_rate": 2.205659453119938e-05, + "loss": 0.9033, + "step": 22086 + }, + { + "epoch": 0.7909825057747059, + "grad_norm": 1.3152872323989868, + "learning_rate": 2.204932845624994e-05, + "loss": 1.0694, + "step": 22087 + }, + { + "epoch": 0.7910183179042741, + "grad_norm": 1.3221904039382935, + "learning_rate": 2.204206343002626e-05, + "loss": 1.0651, + "step": 22088 + }, + { + "epoch": 0.7910541300338425, + "grad_norm": 1.627146601676941, + "learning_rate": 2.2034799452626043e-05, + "loss": 1.206, + "step": 22089 + }, + { + "epoch": 0.7910899421634108, + "grad_norm": 1.4994627237319946, + "learning_rate": 2.2027536524147017e-05, + "loss": 1.2815, + "step": 22090 + }, + { + "epoch": 0.791125754292979, + "grad_norm": 1.788844108581543, + "learning_rate": 2.2020274644686922e-05, + "loss": 0.9722, + "step": 22091 + }, + { + "epoch": 0.7911615664225473, + "grad_norm": 1.3841087818145752, + "learning_rate": 2.2013013814343465e-05, + "loss": 1.005, + "step": 22092 + }, + { + "epoch": 0.7911973785521156, + "grad_norm": 1.6487914323806763, + "learning_rate": 2.200575403321429e-05, + "loss": 1.2179, + "step": 22093 + }, + { + "epoch": 0.7912331906816839, + "grad_norm": 1.4862672090530396, + "learning_rate": 2.1998495301397083e-05, + "loss": 1.2563, + "step": 22094 + }, + { + "epoch": 0.7912690028112521, + "grad_norm": 1.415475606918335, + "learning_rate": 2.1991237618989535e-05, + "loss": 0.921, + "step": 22095 + }, + { + "epoch": 0.7913048149408205, + "grad_norm": 1.4112870693206787, + "learning_rate": 2.1983980986089235e-05, + "loss": 0.8504, + "step": 22096 + }, + { + "epoch": 0.7913406270703888, + "grad_norm": 1.3954567909240723, + "learning_rate": 2.197672540279384e-05, + "loss": 1.0314, + "step": 22097 + }, + { + "epoch": 0.791376439199957, + "grad_norm": 1.8912204504013062, + "learning_rate": 2.196947086920096e-05, + "loss": 1.1579, + "step": 22098 + }, + { + "epoch": 0.7914122513295253, + "grad_norm": 1.4866210222244263, + "learning_rate": 2.1962217385408225e-05, + "loss": 1.1293, + "step": 22099 + }, + { + "epoch": 0.7914480634590936, + "grad_norm": 1.5524345636367798, + "learning_rate": 2.1954964951513168e-05, + "loss": 1.1381, + "step": 22100 + }, + { + "epoch": 0.7914838755886618, + "grad_norm": 1.4740917682647705, + "learning_rate": 2.1947713567613382e-05, + "loss": 0.9375, + "step": 22101 + }, + { + "epoch": 0.7915196877182301, + "grad_norm": 1.2753593921661377, + "learning_rate": 2.194046323380643e-05, + "loss": 1.0407, + "step": 22102 + }, + { + "epoch": 0.7915554998477985, + "grad_norm": 1.4630767107009888, + "learning_rate": 2.193321395018989e-05, + "loss": 0.959, + "step": 22103 + }, + { + "epoch": 0.7915913119773668, + "grad_norm": 1.7853665351867676, + "learning_rate": 2.192596571686123e-05, + "loss": 1.1793, + "step": 22104 + }, + { + "epoch": 0.791627124106935, + "grad_norm": 2.367027759552002, + "learning_rate": 2.1918718533917982e-05, + "loss": 1.4257, + "step": 22105 + }, + { + "epoch": 0.7916629362365033, + "grad_norm": 1.400525689125061, + "learning_rate": 2.1911472401457688e-05, + "loss": 1.0895, + "step": 22106 + }, + { + "epoch": 0.7916987483660716, + "grad_norm": 1.7410521507263184, + "learning_rate": 2.1904227319577786e-05, + "loss": 1.0837, + "step": 22107 + }, + { + "epoch": 0.7917345604956398, + "grad_norm": 1.361877202987671, + "learning_rate": 2.1896983288375773e-05, + "loss": 0.8488, + "step": 22108 + }, + { + "epoch": 0.7917703726252081, + "grad_norm": 1.754581093788147, + "learning_rate": 2.188974030794909e-05, + "loss": 1.1769, + "step": 22109 + }, + { + "epoch": 0.7918061847547765, + "grad_norm": 1.2970622777938843, + "learning_rate": 2.1882498378395232e-05, + "loss": 1.1307, + "step": 22110 + }, + { + "epoch": 0.7918419968843448, + "grad_norm": 1.5596301555633545, + "learning_rate": 2.1875257499811563e-05, + "loss": 1.0225, + "step": 22111 + }, + { + "epoch": 0.791877809013913, + "grad_norm": 1.6504638195037842, + "learning_rate": 2.1868017672295537e-05, + "loss": 1.08, + "step": 22112 + }, + { + "epoch": 0.7919136211434813, + "grad_norm": 1.2661203145980835, + "learning_rate": 2.1860778895944566e-05, + "loss": 1.0861, + "step": 22113 + }, + { + "epoch": 0.7919494332730496, + "grad_norm": 1.3722786903381348, + "learning_rate": 2.1853541170856007e-05, + "loss": 1.0707, + "step": 22114 + }, + { + "epoch": 0.7919852454026178, + "grad_norm": 1.4590507745742798, + "learning_rate": 2.1846304497127247e-05, + "loss": 1.0207, + "step": 22115 + }, + { + "epoch": 0.7920210575321861, + "grad_norm": 1.5272530317306519, + "learning_rate": 2.183906887485565e-05, + "loss": 1.1569, + "step": 22116 + }, + { + "epoch": 0.7920568696617545, + "grad_norm": 1.42844820022583, + "learning_rate": 2.1831834304138587e-05, + "loss": 1.1181, + "step": 22117 + }, + { + "epoch": 0.7920926817913228, + "grad_norm": 1.8652526140213013, + "learning_rate": 2.1824600785073335e-05, + "loss": 1.146, + "step": 22118 + }, + { + "epoch": 0.792128493920891, + "grad_norm": 1.3163163661956787, + "learning_rate": 2.1817368317757235e-05, + "loss": 0.991, + "step": 22119 + }, + { + "epoch": 0.7921643060504593, + "grad_norm": 1.5952465534210205, + "learning_rate": 2.1810136902287625e-05, + "loss": 0.7814, + "step": 22120 + }, + { + "epoch": 0.7922001181800276, + "grad_norm": 2.316544771194458, + "learning_rate": 2.1802906538761748e-05, + "loss": 1.1174, + "step": 22121 + }, + { + "epoch": 0.7922359303095958, + "grad_norm": 2.0180907249450684, + "learning_rate": 2.179567722727689e-05, + "loss": 1.0629, + "step": 22122 + }, + { + "epoch": 0.7922717424391641, + "grad_norm": 1.2497810125350952, + "learning_rate": 2.178844896793032e-05, + "loss": 0.9186, + "step": 22123 + }, + { + "epoch": 0.7923075545687325, + "grad_norm": 1.303674340248108, + "learning_rate": 2.1781221760819303e-05, + "loss": 1.1846, + "step": 22124 + }, + { + "epoch": 0.7923433666983007, + "grad_norm": 1.721300721168518, + "learning_rate": 2.1773995606041044e-05, + "loss": 0.9623, + "step": 22125 + }, + { + "epoch": 0.792379178827869, + "grad_norm": 1.4415124654769897, + "learning_rate": 2.1766770503692748e-05, + "loss": 1.159, + "step": 22126 + }, + { + "epoch": 0.7924149909574373, + "grad_norm": 1.6798685789108276, + "learning_rate": 2.1759546453871647e-05, + "loss": 1.0723, + "step": 22127 + }, + { + "epoch": 0.7924508030870056, + "grad_norm": 1.6959363222122192, + "learning_rate": 2.1752323456674962e-05, + "loss": 0.9712, + "step": 22128 + }, + { + "epoch": 0.7924866152165738, + "grad_norm": 1.433667778968811, + "learning_rate": 2.1745101512199806e-05, + "loss": 1.0404, + "step": 22129 + }, + { + "epoch": 0.7925224273461421, + "grad_norm": 1.2924662828445435, + "learning_rate": 2.173788062054336e-05, + "loss": 0.9808, + "step": 22130 + }, + { + "epoch": 0.7925582394757105, + "grad_norm": 1.433458924293518, + "learning_rate": 2.1730660781802804e-05, + "loss": 1.175, + "step": 22131 + }, + { + "epoch": 0.7925940516052787, + "grad_norm": 1.2857410907745361, + "learning_rate": 2.1723441996075223e-05, + "loss": 0.9962, + "step": 22132 + }, + { + "epoch": 0.792629863734847, + "grad_norm": 1.1561936140060425, + "learning_rate": 2.1716224263457763e-05, + "loss": 1.0511, + "step": 22133 + }, + { + "epoch": 0.7926656758644153, + "grad_norm": 1.6072806119918823, + "learning_rate": 2.1709007584047524e-05, + "loss": 1.1594, + "step": 22134 + }, + { + "epoch": 0.7927014879939835, + "grad_norm": 1.535966396331787, + "learning_rate": 2.170179195794163e-05, + "loss": 0.9832, + "step": 22135 + }, + { + "epoch": 0.7927373001235518, + "grad_norm": 1.4678585529327393, + "learning_rate": 2.1694577385237104e-05, + "loss": 0.9892, + "step": 22136 + }, + { + "epoch": 0.7927731122531201, + "grad_norm": 1.342995285987854, + "learning_rate": 2.168736386603102e-05, + "loss": 0.9143, + "step": 22137 + }, + { + "epoch": 0.7928089243826885, + "grad_norm": 1.7360527515411377, + "learning_rate": 2.168015140042048e-05, + "loss": 1.165, + "step": 22138 + }, + { + "epoch": 0.7928447365122567, + "grad_norm": 1.477337121963501, + "learning_rate": 2.167293998850244e-05, + "loss": 0.8903, + "step": 22139 + }, + { + "epoch": 0.792880548641825, + "grad_norm": 1.4452416896820068, + "learning_rate": 2.1665729630373965e-05, + "loss": 1.212, + "step": 22140 + }, + { + "epoch": 0.7929163607713933, + "grad_norm": 1.3880399465560913, + "learning_rate": 2.165852032613205e-05, + "loss": 0.9158, + "step": 22141 + }, + { + "epoch": 0.7929521729009615, + "grad_norm": 1.5057239532470703, + "learning_rate": 2.1651312075873718e-05, + "loss": 0.9157, + "step": 22142 + }, + { + "epoch": 0.7929879850305298, + "grad_norm": 1.4534324407577515, + "learning_rate": 2.1644104879695892e-05, + "loss": 0.9575, + "step": 22143 + }, + { + "epoch": 0.7930237971600981, + "grad_norm": 2.0238165855407715, + "learning_rate": 2.1636898737695567e-05, + "loss": 1.0893, + "step": 22144 + }, + { + "epoch": 0.7930596092896665, + "grad_norm": 1.6030988693237305, + "learning_rate": 2.1629693649969683e-05, + "loss": 1.1273, + "step": 22145 + }, + { + "epoch": 0.7930954214192347, + "grad_norm": 1.3038270473480225, + "learning_rate": 2.1622489616615203e-05, + "loss": 0.9888, + "step": 22146 + }, + { + "epoch": 0.793131233548803, + "grad_norm": 1.5106019973754883, + "learning_rate": 2.1615286637729037e-05, + "loss": 1.1278, + "step": 22147 + }, + { + "epoch": 0.7931670456783713, + "grad_norm": 1.264251470565796, + "learning_rate": 2.1608084713408018e-05, + "loss": 0.9885, + "step": 22148 + }, + { + "epoch": 0.7932028578079395, + "grad_norm": 1.342583179473877, + "learning_rate": 2.1600883843749165e-05, + "loss": 1.1352, + "step": 22149 + }, + { + "epoch": 0.7932386699375078, + "grad_norm": 1.4730579853057861, + "learning_rate": 2.159368402884926e-05, + "loss": 1.0969, + "step": 22150 + }, + { + "epoch": 0.7932744820670761, + "grad_norm": 1.3960951566696167, + "learning_rate": 2.1586485268805225e-05, + "loss": 1.0765, + "step": 22151 + }, + { + "epoch": 0.7933102941966444, + "grad_norm": 1.5591028928756714, + "learning_rate": 2.1579287563713823e-05, + "loss": 0.838, + "step": 22152 + }, + { + "epoch": 0.7933461063262127, + "grad_norm": 1.7614145278930664, + "learning_rate": 2.1572090913672017e-05, + "loss": 1.1095, + "step": 22153 + }, + { + "epoch": 0.793381918455781, + "grad_norm": 1.8838894367218018, + "learning_rate": 2.1564895318776534e-05, + "loss": 1.0376, + "step": 22154 + }, + { + "epoch": 0.7934177305853493, + "grad_norm": 1.6105997562408447, + "learning_rate": 2.1557700779124214e-05, + "loss": 1.1654, + "step": 22155 + }, + { + "epoch": 0.7934535427149175, + "grad_norm": 1.7267271280288696, + "learning_rate": 2.1550507294811863e-05, + "loss": 1.1488, + "step": 22156 + }, + { + "epoch": 0.7934893548444858, + "grad_norm": 1.4286785125732422, + "learning_rate": 2.1543314865936225e-05, + "loss": 1.1316, + "step": 22157 + }, + { + "epoch": 0.7935251669740541, + "grad_norm": 1.2716584205627441, + "learning_rate": 2.1536123492594106e-05, + "loss": 1.015, + "step": 22158 + }, + { + "epoch": 0.7935609791036224, + "grad_norm": 1.6258188486099243, + "learning_rate": 2.1528933174882183e-05, + "loss": 1.304, + "step": 22159 + }, + { + "epoch": 0.7935967912331907, + "grad_norm": 1.2634921073913574, + "learning_rate": 2.1521743912897296e-05, + "loss": 0.9572, + "step": 22160 + }, + { + "epoch": 0.793632603362759, + "grad_norm": 1.4189269542694092, + "learning_rate": 2.1514555706736084e-05, + "loss": 1.0113, + "step": 22161 + }, + { + "epoch": 0.7936684154923273, + "grad_norm": 1.6140317916870117, + "learning_rate": 2.1507368556495323e-05, + "loss": 1.1328, + "step": 22162 + }, + { + "epoch": 0.7937042276218955, + "grad_norm": 1.7939345836639404, + "learning_rate": 2.150018246227161e-05, + "loss": 1.138, + "step": 22163 + }, + { + "epoch": 0.7937400397514638, + "grad_norm": 1.2277674674987793, + "learning_rate": 2.1492997424161744e-05, + "loss": 0.9817, + "step": 22164 + }, + { + "epoch": 0.7937758518810321, + "grad_norm": 1.456335425376892, + "learning_rate": 2.1485813442262325e-05, + "loss": 0.8933, + "step": 22165 + }, + { + "epoch": 0.7938116640106004, + "grad_norm": 1.8003679513931274, + "learning_rate": 2.1478630516669952e-05, + "loss": 1.1482, + "step": 22166 + }, + { + "epoch": 0.7938474761401687, + "grad_norm": 1.2694921493530273, + "learning_rate": 2.1471448647481384e-05, + "loss": 0.801, + "step": 22167 + }, + { + "epoch": 0.793883288269737, + "grad_norm": 1.6783415079116821, + "learning_rate": 2.1464267834793152e-05, + "loss": 0.9876, + "step": 22168 + }, + { + "epoch": 0.7939191003993052, + "grad_norm": 1.3122985363006592, + "learning_rate": 2.1457088078701916e-05, + "loss": 1.1694, + "step": 22169 + }, + { + "epoch": 0.7939549125288735, + "grad_norm": 1.3271043300628662, + "learning_rate": 2.144990937930419e-05, + "loss": 1.1217, + "step": 22170 + }, + { + "epoch": 0.7939907246584418, + "grad_norm": 1.5685217380523682, + "learning_rate": 2.1442731736696666e-05, + "loss": 1.2399, + "step": 22171 + }, + { + "epoch": 0.79402653678801, + "grad_norm": 1.652841567993164, + "learning_rate": 2.143555515097583e-05, + "loss": 1.0103, + "step": 22172 + }, + { + "epoch": 0.7940623489175784, + "grad_norm": 1.5052307844161987, + "learning_rate": 2.1428379622238283e-05, + "loss": 1.2071, + "step": 22173 + }, + { + "epoch": 0.7940981610471467, + "grad_norm": 1.7948634624481201, + "learning_rate": 2.1421205150580514e-05, + "loss": 1.364, + "step": 22174 + }, + { + "epoch": 0.794133973176715, + "grad_norm": 1.516770601272583, + "learning_rate": 2.1414031736099072e-05, + "loss": 0.9184, + "step": 22175 + }, + { + "epoch": 0.7941697853062832, + "grad_norm": 1.5595974922180176, + "learning_rate": 2.1406859378890486e-05, + "loss": 1.1755, + "step": 22176 + }, + { + "epoch": 0.7942055974358515, + "grad_norm": 1.3913074731826782, + "learning_rate": 2.1399688079051205e-05, + "loss": 0.8625, + "step": 22177 + }, + { + "epoch": 0.7942414095654198, + "grad_norm": 1.3454344272613525, + "learning_rate": 2.1392517836677738e-05, + "loss": 0.929, + "step": 22178 + }, + { + "epoch": 0.794277221694988, + "grad_norm": 1.8657114505767822, + "learning_rate": 2.1385348651866542e-05, + "loss": 1.0671, + "step": 22179 + }, + { + "epoch": 0.7943130338245564, + "grad_norm": 1.3645368814468384, + "learning_rate": 2.13781805247141e-05, + "loss": 1.1334, + "step": 22180 + }, + { + "epoch": 0.7943488459541247, + "grad_norm": 1.418553352355957, + "learning_rate": 2.137101345531677e-05, + "loss": 1.1428, + "step": 22181 + }, + { + "epoch": 0.794384658083693, + "grad_norm": 1.2418937683105469, + "learning_rate": 2.136384744377109e-05, + "loss": 1.1095, + "step": 22182 + }, + { + "epoch": 0.7944204702132612, + "grad_norm": 1.7440125942230225, + "learning_rate": 2.135668249017341e-05, + "loss": 1.2453, + "step": 22183 + }, + { + "epoch": 0.7944562823428295, + "grad_norm": 1.5438973903656006, + "learning_rate": 2.134951859462009e-05, + "loss": 1.0299, + "step": 22184 + }, + { + "epoch": 0.7944920944723978, + "grad_norm": 1.2055683135986328, + "learning_rate": 2.1342355757207544e-05, + "loss": 0.9271, + "step": 22185 + }, + { + "epoch": 0.794527906601966, + "grad_norm": 1.6700456142425537, + "learning_rate": 2.1335193978032152e-05, + "loss": 0.996, + "step": 22186 + }, + { + "epoch": 0.7945637187315344, + "grad_norm": 1.436408281326294, + "learning_rate": 2.1328033257190272e-05, + "loss": 1.0531, + "step": 22187 + }, + { + "epoch": 0.7945995308611027, + "grad_norm": 1.5656968355178833, + "learning_rate": 2.13208735947782e-05, + "loss": 1.0943, + "step": 22188 + }, + { + "epoch": 0.794635342990671, + "grad_norm": 1.2345370054244995, + "learning_rate": 2.1313714990892285e-05, + "loss": 1.113, + "step": 22189 + }, + { + "epoch": 0.7946711551202392, + "grad_norm": 1.5418668985366821, + "learning_rate": 2.1306557445628837e-05, + "loss": 1.1518, + "step": 22190 + }, + { + "epoch": 0.7947069672498075, + "grad_norm": 1.2456541061401367, + "learning_rate": 2.1299400959084183e-05, + "loss": 1.0946, + "step": 22191 + }, + { + "epoch": 0.7947427793793758, + "grad_norm": 1.391823410987854, + "learning_rate": 2.1292245531354538e-05, + "loss": 0.9105, + "step": 22192 + }, + { + "epoch": 0.794778591508944, + "grad_norm": 1.529728889465332, + "learning_rate": 2.128509116253621e-05, + "loss": 1.08, + "step": 22193 + }, + { + "epoch": 0.7948144036385124, + "grad_norm": 1.4052183628082275, + "learning_rate": 2.1277937852725472e-05, + "loss": 0.8678, + "step": 22194 + }, + { + "epoch": 0.7948502157680807, + "grad_norm": 1.6288925409317017, + "learning_rate": 2.1270785602018505e-05, + "loss": 1.0101, + "step": 22195 + }, + { + "epoch": 0.794886027897649, + "grad_norm": 1.8080511093139648, + "learning_rate": 2.1263634410511567e-05, + "loss": 1.1056, + "step": 22196 + }, + { + "epoch": 0.7949218400272172, + "grad_norm": 1.7290003299713135, + "learning_rate": 2.125648427830086e-05, + "loss": 1.2203, + "step": 22197 + }, + { + "epoch": 0.7949576521567855, + "grad_norm": 1.3792489767074585, + "learning_rate": 2.1249335205482613e-05, + "loss": 1.1168, + "step": 22198 + }, + { + "epoch": 0.7949934642863538, + "grad_norm": 1.3627959489822388, + "learning_rate": 2.1242187192152964e-05, + "loss": 1.0303, + "step": 22199 + }, + { + "epoch": 0.795029276415922, + "grad_norm": 1.451546311378479, + "learning_rate": 2.1235040238408087e-05, + "loss": 1.2258, + "step": 22200 + }, + { + "epoch": 0.7950650885454904, + "grad_norm": 1.3060287237167358, + "learning_rate": 2.122789434434417e-05, + "loss": 1.1812, + "step": 22201 + }, + { + "epoch": 0.7951009006750587, + "grad_norm": 1.6218384504318237, + "learning_rate": 2.1220749510057304e-05, + "loss": 1.1809, + "step": 22202 + }, + { + "epoch": 0.7951367128046269, + "grad_norm": 1.8242424726486206, + "learning_rate": 2.1213605735643625e-05, + "loss": 1.3812, + "step": 22203 + }, + { + "epoch": 0.7951725249341952, + "grad_norm": 1.483254075050354, + "learning_rate": 2.1206463021199263e-05, + "loss": 1.03, + "step": 22204 + }, + { + "epoch": 0.7952083370637635, + "grad_norm": 1.2594215869903564, + "learning_rate": 2.1199321366820336e-05, + "loss": 1.0705, + "step": 22205 + }, + { + "epoch": 0.7952441491933318, + "grad_norm": 1.7309235334396362, + "learning_rate": 2.1192180772602867e-05, + "loss": 1.1346, + "step": 22206 + }, + { + "epoch": 0.7952799613229, + "grad_norm": 1.4757007360458374, + "learning_rate": 2.1185041238642934e-05, + "loss": 0.9916, + "step": 22207 + }, + { + "epoch": 0.7953157734524684, + "grad_norm": 1.5655932426452637, + "learning_rate": 2.117790276503665e-05, + "loss": 0.9599, + "step": 22208 + }, + { + "epoch": 0.7953515855820367, + "grad_norm": 1.5348107814788818, + "learning_rate": 2.1170765351879985e-05, + "loss": 1.2684, + "step": 22209 + }, + { + "epoch": 0.7953873977116049, + "grad_norm": 1.297319769859314, + "learning_rate": 2.116362899926898e-05, + "loss": 0.9524, + "step": 22210 + }, + { + "epoch": 0.7954232098411732, + "grad_norm": 1.3123635053634644, + "learning_rate": 2.1156493707299664e-05, + "loss": 0.8505, + "step": 22211 + }, + { + "epoch": 0.7954590219707415, + "grad_norm": 1.8706363439559937, + "learning_rate": 2.1149359476068043e-05, + "loss": 1.2656, + "step": 22212 + }, + { + "epoch": 0.7954948341003097, + "grad_norm": 1.5094488859176636, + "learning_rate": 2.1142226305670054e-05, + "loss": 1.0591, + "step": 22213 + }, + { + "epoch": 0.795530646229878, + "grad_norm": 1.5911144018173218, + "learning_rate": 2.1135094196201698e-05, + "loss": 1.1378, + "step": 22214 + }, + { + "epoch": 0.7955664583594464, + "grad_norm": 1.6456762552261353, + "learning_rate": 2.112796314775892e-05, + "loss": 1.0929, + "step": 22215 + }, + { + "epoch": 0.7956022704890147, + "grad_norm": 1.423862099647522, + "learning_rate": 2.112083316043768e-05, + "loss": 1.1682, + "step": 22216 + }, + { + "epoch": 0.7956380826185829, + "grad_norm": 1.1723713874816895, + "learning_rate": 2.1113704234333866e-05, + "loss": 1.0619, + "step": 22217 + }, + { + "epoch": 0.7956738947481512, + "grad_norm": 1.726173758506775, + "learning_rate": 2.1106576369543395e-05, + "loss": 1.0721, + "step": 22218 + }, + { + "epoch": 0.7957097068777195, + "grad_norm": 1.3148776292800903, + "learning_rate": 2.109944956616221e-05, + "loss": 1.0541, + "step": 22219 + }, + { + "epoch": 0.7957455190072877, + "grad_norm": 1.4693526029586792, + "learning_rate": 2.109232382428612e-05, + "loss": 1.0159, + "step": 22220 + }, + { + "epoch": 0.795781331136856, + "grad_norm": 1.3454742431640625, + "learning_rate": 2.1085199144011037e-05, + "loss": 1.069, + "step": 22221 + }, + { + "epoch": 0.7958171432664244, + "grad_norm": 1.6621270179748535, + "learning_rate": 2.1078075525432805e-05, + "loss": 1.1807, + "step": 22222 + }, + { + "epoch": 0.7958529553959927, + "grad_norm": 1.7462302446365356, + "learning_rate": 2.1070952968647296e-05, + "loss": 1.0595, + "step": 22223 + }, + { + "epoch": 0.7958887675255609, + "grad_norm": 1.2603824138641357, + "learning_rate": 2.1063831473750272e-05, + "loss": 0.8659, + "step": 22224 + }, + { + "epoch": 0.7959245796551292, + "grad_norm": 1.4763379096984863, + "learning_rate": 2.1056711040837574e-05, + "loss": 1.2011, + "step": 22225 + }, + { + "epoch": 0.7959603917846975, + "grad_norm": 1.3318166732788086, + "learning_rate": 2.104959167000503e-05, + "loss": 1.0719, + "step": 22226 + }, + { + "epoch": 0.7959962039142657, + "grad_norm": 1.2040683031082153, + "learning_rate": 2.104247336134836e-05, + "loss": 0.9697, + "step": 22227 + }, + { + "epoch": 0.796032016043834, + "grad_norm": 1.472931981086731, + "learning_rate": 2.103535611496337e-05, + "loss": 0.93, + "step": 22228 + }, + { + "epoch": 0.7960678281734024, + "grad_norm": 1.9464235305786133, + "learning_rate": 2.1028239930945794e-05, + "loss": 0.9712, + "step": 22229 + }, + { + "epoch": 0.7961036403029706, + "grad_norm": 1.7412158250808716, + "learning_rate": 2.1021124809391423e-05, + "loss": 1.0185, + "step": 22230 + }, + { + "epoch": 0.7961394524325389, + "grad_norm": 1.795257806777954, + "learning_rate": 2.1014010750395907e-05, + "loss": 1.1299, + "step": 22231 + }, + { + "epoch": 0.7961752645621072, + "grad_norm": 2.5820393562316895, + "learning_rate": 2.1006897754055e-05, + "loss": 1.2865, + "step": 22232 + }, + { + "epoch": 0.7962110766916755, + "grad_norm": 1.6978965997695923, + "learning_rate": 2.099978582046438e-05, + "loss": 1.022, + "step": 22233 + }, + { + "epoch": 0.7962468888212437, + "grad_norm": 1.4386560916900635, + "learning_rate": 2.099267494971977e-05, + "loss": 1.2613, + "step": 22234 + }, + { + "epoch": 0.796282700950812, + "grad_norm": 1.6438517570495605, + "learning_rate": 2.0985565141916808e-05, + "loss": 0.9031, + "step": 22235 + }, + { + "epoch": 0.7963185130803804, + "grad_norm": 1.6131153106689453, + "learning_rate": 2.097845639715109e-05, + "loss": 1.1203, + "step": 22236 + }, + { + "epoch": 0.7963543252099486, + "grad_norm": 1.320329189300537, + "learning_rate": 2.0971348715518368e-05, + "loss": 0.8578, + "step": 22237 + }, + { + "epoch": 0.7963901373395169, + "grad_norm": 1.5270171165466309, + "learning_rate": 2.0964242097114184e-05, + "loss": 1.376, + "step": 22238 + }, + { + "epoch": 0.7964259494690852, + "grad_norm": 2.171999454498291, + "learning_rate": 2.0957136542034172e-05, + "loss": 1.0101, + "step": 22239 + }, + { + "epoch": 0.7964617615986535, + "grad_norm": 1.5629292726516724, + "learning_rate": 2.0950032050373925e-05, + "loss": 0.8649, + "step": 22240 + }, + { + "epoch": 0.7964975737282217, + "grad_norm": 1.7760865688323975, + "learning_rate": 2.0942928622229064e-05, + "loss": 1.1602, + "step": 22241 + }, + { + "epoch": 0.79653338585779, + "grad_norm": 1.4889901876449585, + "learning_rate": 2.093582625769509e-05, + "loss": 1.3079, + "step": 22242 + }, + { + "epoch": 0.7965691979873584, + "grad_norm": 1.4742162227630615, + "learning_rate": 2.0928724956867585e-05, + "loss": 0.9076, + "step": 22243 + }, + { + "epoch": 0.7966050101169266, + "grad_norm": 1.7531658411026, + "learning_rate": 2.0921624719842126e-05, + "loss": 1.1493, + "step": 22244 + }, + { + "epoch": 0.7966408222464949, + "grad_norm": 1.364315390586853, + "learning_rate": 2.091452554671417e-05, + "loss": 0.9994, + "step": 22245 + }, + { + "epoch": 0.7966766343760632, + "grad_norm": 1.8572630882263184, + "learning_rate": 2.0907427437579287e-05, + "loss": 1.0934, + "step": 22246 + }, + { + "epoch": 0.7967124465056314, + "grad_norm": 1.3070697784423828, + "learning_rate": 2.0900330392532895e-05, + "loss": 1.1472, + "step": 22247 + }, + { + "epoch": 0.7967482586351997, + "grad_norm": 1.562401294708252, + "learning_rate": 2.089323441167058e-05, + "loss": 1.2319, + "step": 22248 + }, + { + "epoch": 0.796784070764768, + "grad_norm": 1.43215811252594, + "learning_rate": 2.088613949508772e-05, + "loss": 0.9765, + "step": 22249 + }, + { + "epoch": 0.7968198828943364, + "grad_norm": 1.5285286903381348, + "learning_rate": 2.0879045642879814e-05, + "loss": 0.9793, + "step": 22250 + }, + { + "epoch": 0.7968556950239046, + "grad_norm": 1.3523180484771729, + "learning_rate": 2.0871952855142286e-05, + "loss": 0.9515, + "step": 22251 + }, + { + "epoch": 0.7968915071534729, + "grad_norm": 2.1246466636657715, + "learning_rate": 2.0864861131970594e-05, + "loss": 1.1971, + "step": 22252 + }, + { + "epoch": 0.7969273192830412, + "grad_norm": 1.2782093286514282, + "learning_rate": 2.0857770473460115e-05, + "loss": 0.9551, + "step": 22253 + }, + { + "epoch": 0.7969631314126094, + "grad_norm": 1.455438256263733, + "learning_rate": 2.08506808797062e-05, + "loss": 0.987, + "step": 22254 + }, + { + "epoch": 0.7969989435421777, + "grad_norm": 1.6816365718841553, + "learning_rate": 2.084359235080433e-05, + "loss": 1.1283, + "step": 22255 + }, + { + "epoch": 0.797034755671746, + "grad_norm": 1.4635910987854004, + "learning_rate": 2.0836504886849785e-05, + "loss": 1.3051, + "step": 22256 + }, + { + "epoch": 0.7970705678013144, + "grad_norm": 2.1462879180908203, + "learning_rate": 2.082941848793799e-05, + "loss": 1.0645, + "step": 22257 + }, + { + "epoch": 0.7971063799308826, + "grad_norm": 1.4505958557128906, + "learning_rate": 2.0822333154164187e-05, + "loss": 0.8962, + "step": 22258 + }, + { + "epoch": 0.7971421920604509, + "grad_norm": 1.4922230243682861, + "learning_rate": 2.0815248885623817e-05, + "loss": 0.8876, + "step": 22259 + }, + { + "epoch": 0.7971780041900192, + "grad_norm": 1.4404083490371704, + "learning_rate": 2.0808165682412107e-05, + "loss": 0.9828, + "step": 22260 + }, + { + "epoch": 0.7972138163195874, + "grad_norm": 1.9544514417648315, + "learning_rate": 2.08010835446244e-05, + "loss": 1.2148, + "step": 22261 + }, + { + "epoch": 0.7972496284491557, + "grad_norm": 1.5506007671356201, + "learning_rate": 2.0794002472355933e-05, + "loss": 1.1674, + "step": 22262 + }, + { + "epoch": 0.797285440578724, + "grad_norm": 1.3620469570159912, + "learning_rate": 2.0786922465701997e-05, + "loss": 1.1405, + "step": 22263 + }, + { + "epoch": 0.7973212527082923, + "grad_norm": 1.4245381355285645, + "learning_rate": 2.0779843524757858e-05, + "loss": 0.8799, + "step": 22264 + }, + { + "epoch": 0.7973570648378606, + "grad_norm": 1.6480921506881714, + "learning_rate": 2.0772765649618688e-05, + "loss": 1.0239, + "step": 22265 + }, + { + "epoch": 0.7973928769674289, + "grad_norm": 1.8395891189575195, + "learning_rate": 2.0765688840379816e-05, + "loss": 1.0356, + "step": 22266 + }, + { + "epoch": 0.7974286890969972, + "grad_norm": 1.427481770515442, + "learning_rate": 2.075861309713637e-05, + "loss": 0.8151, + "step": 22267 + }, + { + "epoch": 0.7974645012265654, + "grad_norm": 1.7916961908340454, + "learning_rate": 2.0751538419983598e-05, + "loss": 0.8946, + "step": 22268 + }, + { + "epoch": 0.7975003133561337, + "grad_norm": 1.574236512184143, + "learning_rate": 2.0744464809016593e-05, + "loss": 1.2394, + "step": 22269 + }, + { + "epoch": 0.797536125485702, + "grad_norm": 1.711976170539856, + "learning_rate": 2.0737392264330635e-05, + "loss": 0.9681, + "step": 22270 + }, + { + "epoch": 0.7975719376152703, + "grad_norm": 1.39157235622406, + "learning_rate": 2.073032078602083e-05, + "loss": 1.1085, + "step": 22271 + }, + { + "epoch": 0.7976077497448386, + "grad_norm": 1.4365308284759521, + "learning_rate": 2.0723250374182278e-05, + "loss": 1.1866, + "step": 22272 + }, + { + "epoch": 0.7976435618744069, + "grad_norm": 1.385881781578064, + "learning_rate": 2.071618102891013e-05, + "loss": 0.9767, + "step": 22273 + }, + { + "epoch": 0.7976793740039752, + "grad_norm": 1.488710641860962, + "learning_rate": 2.070911275029951e-05, + "loss": 1.1604, + "step": 22274 + }, + { + "epoch": 0.7977151861335434, + "grad_norm": 1.7577850818634033, + "learning_rate": 2.0702045538445515e-05, + "loss": 1.2276, + "step": 22275 + }, + { + "epoch": 0.7977509982631117, + "grad_norm": 1.5395488739013672, + "learning_rate": 2.069497939344316e-05, + "loss": 0.998, + "step": 22276 + }, + { + "epoch": 0.79778681039268, + "grad_norm": 1.2259291410446167, + "learning_rate": 2.0687914315387613e-05, + "loss": 1.1802, + "step": 22277 + }, + { + "epoch": 0.7978226225222483, + "grad_norm": 1.9035367965698242, + "learning_rate": 2.0680850304373843e-05, + "loss": 1.1513, + "step": 22278 + }, + { + "epoch": 0.7978584346518166, + "grad_norm": 1.7724868059158325, + "learning_rate": 2.0673787360496954e-05, + "loss": 1.1535, + "step": 22279 + }, + { + "epoch": 0.7978942467813849, + "grad_norm": 1.5209108591079712, + "learning_rate": 2.06667254838519e-05, + "loss": 1.1559, + "step": 22280 + }, + { + "epoch": 0.7979300589109531, + "grad_norm": 1.3625096082687378, + "learning_rate": 2.0659664674533728e-05, + "loss": 1.2032, + "step": 22281 + }, + { + "epoch": 0.7979658710405214, + "grad_norm": 1.6136257648468018, + "learning_rate": 2.065260493263744e-05, + "loss": 1.1244, + "step": 22282 + }, + { + "epoch": 0.7980016831700897, + "grad_norm": 1.435038685798645, + "learning_rate": 2.0645546258257987e-05, + "loss": 1.1589, + "step": 22283 + }, + { + "epoch": 0.798037495299658, + "grad_norm": 1.5442012548446655, + "learning_rate": 2.063848865149035e-05, + "loss": 1.1115, + "step": 22284 + }, + { + "epoch": 0.7980733074292263, + "grad_norm": 1.6549588441848755, + "learning_rate": 2.0631432112429473e-05, + "loss": 1.1843, + "step": 22285 + }, + { + "epoch": 0.7981091195587946, + "grad_norm": 1.588953971862793, + "learning_rate": 2.062437664117033e-05, + "loss": 1.2007, + "step": 22286 + }, + { + "epoch": 0.7981449316883629, + "grad_norm": 1.3854470252990723, + "learning_rate": 2.061732223780778e-05, + "loss": 1.0927, + "step": 22287 + }, + { + "epoch": 0.7981807438179311, + "grad_norm": 1.246159315109253, + "learning_rate": 2.061026890243677e-05, + "loss": 0.9897, + "step": 22288 + }, + { + "epoch": 0.7982165559474994, + "grad_norm": 1.1424192190170288, + "learning_rate": 2.06032166351522e-05, + "loss": 0.8352, + "step": 22289 + }, + { + "epoch": 0.7982523680770677, + "grad_norm": 1.511212706565857, + "learning_rate": 2.059616543604892e-05, + "loss": 1.1433, + "step": 22290 + }, + { + "epoch": 0.7982881802066359, + "grad_norm": 1.515836477279663, + "learning_rate": 2.0589115305221807e-05, + "loss": 1.1006, + "step": 22291 + }, + { + "epoch": 0.7983239923362043, + "grad_norm": 1.282468318939209, + "learning_rate": 2.058206624276572e-05, + "loss": 1.1582, + "step": 22292 + }, + { + "epoch": 0.7983598044657726, + "grad_norm": 1.359622836112976, + "learning_rate": 2.0575018248775513e-05, + "loss": 1.3086, + "step": 22293 + }, + { + "epoch": 0.7983956165953409, + "grad_norm": 1.2769416570663452, + "learning_rate": 2.056797132334596e-05, + "loss": 1.0433, + "step": 22294 + }, + { + "epoch": 0.7984314287249091, + "grad_norm": 1.4343364238739014, + "learning_rate": 2.056092546657189e-05, + "loss": 1.0081, + "step": 22295 + }, + { + "epoch": 0.7984672408544774, + "grad_norm": 1.4390385150909424, + "learning_rate": 2.0553880678548097e-05, + "loss": 1.0263, + "step": 22296 + }, + { + "epoch": 0.7985030529840457, + "grad_norm": 1.4879592657089233, + "learning_rate": 2.0546836959369387e-05, + "loss": 1.0405, + "step": 22297 + }, + { + "epoch": 0.7985388651136139, + "grad_norm": 1.8801459074020386, + "learning_rate": 2.0539794309130478e-05, + "loss": 0.903, + "step": 22298 + }, + { + "epoch": 0.7985746772431823, + "grad_norm": 1.6579437255859375, + "learning_rate": 2.0532752727926142e-05, + "loss": 1.2993, + "step": 22299 + }, + { + "epoch": 0.7986104893727506, + "grad_norm": 1.5549921989440918, + "learning_rate": 2.0525712215851132e-05, + "loss": 1.026, + "step": 22300 + }, + { + "epoch": 0.7986463015023189, + "grad_norm": 1.2805520296096802, + "learning_rate": 2.0518672773000124e-05, + "loss": 1.0539, + "step": 22301 + }, + { + "epoch": 0.7986821136318871, + "grad_norm": 2.2828855514526367, + "learning_rate": 2.051163439946786e-05, + "loss": 1.3055, + "step": 22302 + }, + { + "epoch": 0.7987179257614554, + "grad_norm": 1.7243845462799072, + "learning_rate": 2.050459709534901e-05, + "loss": 1.2353, + "step": 22303 + }, + { + "epoch": 0.7987537378910237, + "grad_norm": 2.335035562515259, + "learning_rate": 2.0497560860738295e-05, + "loss": 1.0131, + "step": 22304 + }, + { + "epoch": 0.7987895500205919, + "grad_norm": 1.7044262886047363, + "learning_rate": 2.0490525695730323e-05, + "loss": 1.1905, + "step": 22305 + }, + { + "epoch": 0.7988253621501603, + "grad_norm": 1.4775464534759521, + "learning_rate": 2.048349160041977e-05, + "loss": 0.9948, + "step": 22306 + }, + { + "epoch": 0.7988611742797286, + "grad_norm": 1.3571205139160156, + "learning_rate": 2.0476458574901293e-05, + "loss": 0.9593, + "step": 22307 + }, + { + "epoch": 0.7988969864092968, + "grad_norm": 1.7667338848114014, + "learning_rate": 2.046942661926946e-05, + "loss": 1.102, + "step": 22308 + }, + { + "epoch": 0.7989327985388651, + "grad_norm": 1.3661049604415894, + "learning_rate": 2.04623957336189e-05, + "loss": 0.9084, + "step": 22309 + }, + { + "epoch": 0.7989686106684334, + "grad_norm": 1.5919150114059448, + "learning_rate": 2.0455365918044224e-05, + "loss": 0.9425, + "step": 22310 + }, + { + "epoch": 0.7990044227980017, + "grad_norm": 1.544590950012207, + "learning_rate": 2.044833717264001e-05, + "loss": 0.9919, + "step": 22311 + }, + { + "epoch": 0.7990402349275699, + "grad_norm": 1.5462392568588257, + "learning_rate": 2.044130949750077e-05, + "loss": 1.0012, + "step": 22312 + }, + { + "epoch": 0.7990760470571383, + "grad_norm": 1.410276174545288, + "learning_rate": 2.04342828927211e-05, + "loss": 1.0195, + "step": 22313 + }, + { + "epoch": 0.7991118591867066, + "grad_norm": 1.719663143157959, + "learning_rate": 2.0427257358395546e-05, + "loss": 1.1223, + "step": 22314 + }, + { + "epoch": 0.7991476713162748, + "grad_norm": 1.3796237707138062, + "learning_rate": 2.0420232894618573e-05, + "loss": 1.0778, + "step": 22315 + }, + { + "epoch": 0.7991834834458431, + "grad_norm": 1.5289274454116821, + "learning_rate": 2.041320950148472e-05, + "loss": 1.1054, + "step": 22316 + }, + { + "epoch": 0.7992192955754114, + "grad_norm": 1.5800330638885498, + "learning_rate": 2.0406187179088477e-05, + "loss": 1.1363, + "step": 22317 + }, + { + "epoch": 0.7992551077049797, + "grad_norm": 1.6352906227111816, + "learning_rate": 2.0399165927524334e-05, + "loss": 1.1146, + "step": 22318 + }, + { + "epoch": 0.7992909198345479, + "grad_norm": 1.7796684503555298, + "learning_rate": 2.0392145746886714e-05, + "loss": 1.2735, + "step": 22319 + }, + { + "epoch": 0.7993267319641163, + "grad_norm": 1.521898627281189, + "learning_rate": 2.038512663727009e-05, + "loss": 1.0736, + "step": 22320 + }, + { + "epoch": 0.7993625440936846, + "grad_norm": 1.4429991245269775, + "learning_rate": 2.0378108598768887e-05, + "loss": 0.9728, + "step": 22321 + }, + { + "epoch": 0.7993983562232528, + "grad_norm": 1.5101159811019897, + "learning_rate": 2.0371091631477557e-05, + "loss": 1.0129, + "step": 22322 + }, + { + "epoch": 0.7994341683528211, + "grad_norm": 1.4537631273269653, + "learning_rate": 2.036407573549044e-05, + "loss": 1.1371, + "step": 22323 + }, + { + "epoch": 0.7994699804823894, + "grad_norm": 2.1055378913879395, + "learning_rate": 2.035706091090197e-05, + "loss": 1.0497, + "step": 22324 + }, + { + "epoch": 0.7995057926119576, + "grad_norm": 1.6377053260803223, + "learning_rate": 2.035004715780654e-05, + "loss": 0.9221, + "step": 22325 + }, + { + "epoch": 0.7995416047415259, + "grad_norm": 1.6352483034133911, + "learning_rate": 2.0343034476298452e-05, + "loss": 1.1657, + "step": 22326 + }, + { + "epoch": 0.7995774168710943, + "grad_norm": 1.5250892639160156, + "learning_rate": 2.0336022866472092e-05, + "loss": 1.2352, + "step": 22327 + }, + { + "epoch": 0.7996132290006626, + "grad_norm": 1.22910737991333, + "learning_rate": 2.0329012328421783e-05, + "loss": 0.9494, + "step": 22328 + }, + { + "epoch": 0.7996490411302308, + "grad_norm": 1.647802472114563, + "learning_rate": 2.0322002862241863e-05, + "loss": 1.1855, + "step": 22329 + }, + { + "epoch": 0.7996848532597991, + "grad_norm": 1.187914252281189, + "learning_rate": 2.0314994468026606e-05, + "loss": 1.0403, + "step": 22330 + }, + { + "epoch": 0.7997206653893674, + "grad_norm": 1.5531384944915771, + "learning_rate": 2.03079871458703e-05, + "loss": 1.2489, + "step": 22331 + }, + { + "epoch": 0.7997564775189356, + "grad_norm": 1.271217703819275, + "learning_rate": 2.0300980895867263e-05, + "loss": 0.9616, + "step": 22332 + }, + { + "epoch": 0.7997922896485039, + "grad_norm": 1.4564381837844849, + "learning_rate": 2.029397571811169e-05, + "loss": 1.1476, + "step": 22333 + }, + { + "epoch": 0.7998281017780723, + "grad_norm": 1.7977006435394287, + "learning_rate": 2.0286971612697902e-05, + "loss": 1.1206, + "step": 22334 + }, + { + "epoch": 0.7998639139076406, + "grad_norm": 1.5450222492218018, + "learning_rate": 2.027996857972002e-05, + "loss": 0.9374, + "step": 22335 + }, + { + "epoch": 0.7998997260372088, + "grad_norm": 1.68083655834198, + "learning_rate": 2.0272966619272392e-05, + "loss": 0.8586, + "step": 22336 + }, + { + "epoch": 0.7999355381667771, + "grad_norm": 1.9046814441680908, + "learning_rate": 2.026596573144913e-05, + "loss": 0.8564, + "step": 22337 + }, + { + "epoch": 0.7999713502963454, + "grad_norm": 1.434614658355713, + "learning_rate": 2.025896591634444e-05, + "loss": 0.9969, + "step": 22338 + }, + { + "epoch": 0.8000071624259136, + "grad_norm": 1.4373587369918823, + "learning_rate": 2.0251967174052523e-05, + "loss": 0.8556, + "step": 22339 + }, + { + "epoch": 0.8000429745554819, + "grad_norm": 1.393925666809082, + "learning_rate": 2.024496950466753e-05, + "loss": 1.3959, + "step": 22340 + }, + { + "epoch": 0.8000787866850503, + "grad_norm": 1.4610364437103271, + "learning_rate": 2.023797290828361e-05, + "loss": 0.9984, + "step": 22341 + }, + { + "epoch": 0.8001145988146185, + "grad_norm": 1.548851490020752, + "learning_rate": 2.0230977384994808e-05, + "loss": 0.8497, + "step": 22342 + }, + { + "epoch": 0.8001504109441868, + "grad_norm": 1.4102622270584106, + "learning_rate": 2.022398293489538e-05, + "loss": 1.2656, + "step": 22343 + }, + { + "epoch": 0.8001862230737551, + "grad_norm": 1.505203127861023, + "learning_rate": 2.0216989558079326e-05, + "loss": 1.0904, + "step": 22344 + }, + { + "epoch": 0.8002220352033234, + "grad_norm": 1.5403677225112915, + "learning_rate": 2.020999725464079e-05, + "loss": 1.0628, + "step": 22345 + }, + { + "epoch": 0.8002578473328916, + "grad_norm": 1.598924160003662, + "learning_rate": 2.0203006024673764e-05, + "loss": 1.0665, + "step": 22346 + }, + { + "epoch": 0.8002936594624599, + "grad_norm": 1.7144421339035034, + "learning_rate": 2.0196015868272412e-05, + "loss": 1.1553, + "step": 22347 + }, + { + "epoch": 0.8003294715920283, + "grad_norm": 1.6410282850265503, + "learning_rate": 2.0189026785530705e-05, + "loss": 0.9474, + "step": 22348 + }, + { + "epoch": 0.8003652837215965, + "grad_norm": 1.9362342357635498, + "learning_rate": 2.01820387765427e-05, + "loss": 1.1336, + "step": 22349 + }, + { + "epoch": 0.8004010958511648, + "grad_norm": 1.5981801748275757, + "learning_rate": 2.0175051841402426e-05, + "loss": 1.1868, + "step": 22350 + }, + { + "epoch": 0.8004369079807331, + "grad_norm": 1.3807706832885742, + "learning_rate": 2.016806598020383e-05, + "loss": 0.9389, + "step": 22351 + }, + { + "epoch": 0.8004727201103014, + "grad_norm": 1.4066314697265625, + "learning_rate": 2.0161081193040964e-05, + "loss": 0.9274, + "step": 22352 + }, + { + "epoch": 0.8005085322398696, + "grad_norm": 1.3328895568847656, + "learning_rate": 2.0154097480007716e-05, + "loss": 0.9115, + "step": 22353 + }, + { + "epoch": 0.8005443443694379, + "grad_norm": 1.3249492645263672, + "learning_rate": 2.0147114841198144e-05, + "loss": 1.1695, + "step": 22354 + }, + { + "epoch": 0.8005801564990063, + "grad_norm": 2.4629058837890625, + "learning_rate": 2.014013327670611e-05, + "loss": 0.9752, + "step": 22355 + }, + { + "epoch": 0.8006159686285745, + "grad_norm": 1.7547178268432617, + "learning_rate": 2.0133152786625598e-05, + "loss": 1.104, + "step": 22356 + }, + { + "epoch": 0.8006517807581428, + "grad_norm": 1.2318089008331299, + "learning_rate": 2.012617337105044e-05, + "loss": 1.1119, + "step": 22357 + }, + { + "epoch": 0.8006875928877111, + "grad_norm": 1.3192023038864136, + "learning_rate": 2.0119195030074645e-05, + "loss": 0.9798, + "step": 22358 + }, + { + "epoch": 0.8007234050172793, + "grad_norm": 1.4324955940246582, + "learning_rate": 2.011221776379204e-05, + "loss": 1.2088, + "step": 22359 + }, + { + "epoch": 0.8007592171468476, + "grad_norm": 1.6124248504638672, + "learning_rate": 2.0105241572296463e-05, + "loss": 1.0996, + "step": 22360 + }, + { + "epoch": 0.8007950292764159, + "grad_norm": 1.6006392240524292, + "learning_rate": 2.0098266455681812e-05, + "loss": 1.2333, + "step": 22361 + }, + { + "epoch": 0.8008308414059843, + "grad_norm": 1.7117236852645874, + "learning_rate": 2.009129241404192e-05, + "loss": 1.1498, + "step": 22362 + }, + { + "epoch": 0.8008666535355525, + "grad_norm": 1.451597809791565, + "learning_rate": 2.0084319447470645e-05, + "loss": 1.0964, + "step": 22363 + }, + { + "epoch": 0.8009024656651208, + "grad_norm": 1.6794896125793457, + "learning_rate": 2.007734755606171e-05, + "loss": 1.0726, + "step": 22364 + }, + { + "epoch": 0.8009382777946891, + "grad_norm": 1.4887877702713013, + "learning_rate": 2.0070376739909024e-05, + "loss": 1.0655, + "step": 22365 + }, + { + "epoch": 0.8009740899242573, + "grad_norm": 1.3533620834350586, + "learning_rate": 2.0063406999106293e-05, + "loss": 1.0107, + "step": 22366 + }, + { + "epoch": 0.8010099020538256, + "grad_norm": 1.3623052835464478, + "learning_rate": 2.005643833374733e-05, + "loss": 1.0148, + "step": 22367 + }, + { + "epoch": 0.8010457141833939, + "grad_norm": 1.3138278722763062, + "learning_rate": 2.0049470743925845e-05, + "loss": 0.9743, + "step": 22368 + }, + { + "epoch": 0.8010815263129623, + "grad_norm": 1.4916090965270996, + "learning_rate": 2.0042504229735604e-05, + "loss": 1.1056, + "step": 22369 + }, + { + "epoch": 0.8011173384425305, + "grad_norm": 1.546741247177124, + "learning_rate": 2.0035538791270358e-05, + "loss": 1.0517, + "step": 22370 + }, + { + "epoch": 0.8011531505720988, + "grad_norm": 1.9358422756195068, + "learning_rate": 2.002857442862377e-05, + "loss": 1.1115, + "step": 22371 + }, + { + "epoch": 0.8011889627016671, + "grad_norm": 1.3533835411071777, + "learning_rate": 2.002161114188955e-05, + "loss": 1.1664, + "step": 22372 + }, + { + "epoch": 0.8012247748312353, + "grad_norm": 1.438886284828186, + "learning_rate": 2.0014648931161386e-05, + "loss": 0.985, + "step": 22373 + }, + { + "epoch": 0.8012605869608036, + "grad_norm": 1.56078040599823, + "learning_rate": 2.000768779653298e-05, + "loss": 0.8792, + "step": 22374 + }, + { + "epoch": 0.8012963990903719, + "grad_norm": 1.3654102087020874, + "learning_rate": 2.000072773809789e-05, + "loss": 1.2114, + "step": 22375 + }, + { + "epoch": 0.8013322112199402, + "grad_norm": 1.2745403051376343, + "learning_rate": 1.9993768755949882e-05, + "loss": 1.0667, + "step": 22376 + }, + { + "epoch": 0.8013680233495085, + "grad_norm": 1.5120691061019897, + "learning_rate": 1.99868108501825e-05, + "loss": 0.9526, + "step": 22377 + }, + { + "epoch": 0.8014038354790768, + "grad_norm": 1.382308840751648, + "learning_rate": 1.9979854020889356e-05, + "loss": 1.0962, + "step": 22378 + }, + { + "epoch": 0.8014396476086451, + "grad_norm": 1.7460105419158936, + "learning_rate": 1.9972898268164052e-05, + "loss": 1.1679, + "step": 22379 + }, + { + "epoch": 0.8014754597382133, + "grad_norm": 1.4029946327209473, + "learning_rate": 1.9965943592100166e-05, + "loss": 1.0135, + "step": 22380 + }, + { + "epoch": 0.8015112718677816, + "grad_norm": 2.1347262859344482, + "learning_rate": 1.995898999279131e-05, + "loss": 1.0697, + "step": 22381 + }, + { + "epoch": 0.8015470839973499, + "grad_norm": 1.8172377347946167, + "learning_rate": 1.9952037470330964e-05, + "loss": 1.2183, + "step": 22382 + }, + { + "epoch": 0.8015828961269181, + "grad_norm": 1.3483465909957886, + "learning_rate": 1.994508602481271e-05, + "loss": 1.1417, + "step": 22383 + }, + { + "epoch": 0.8016187082564865, + "grad_norm": 1.360947847366333, + "learning_rate": 1.993813565633005e-05, + "loss": 1.2208, + "step": 22384 + }, + { + "epoch": 0.8016545203860548, + "grad_norm": 1.817399501800537, + "learning_rate": 1.993118636497654e-05, + "loss": 1.1628, + "step": 22385 + }, + { + "epoch": 0.801690332515623, + "grad_norm": 1.7366833686828613, + "learning_rate": 1.99242381508456e-05, + "loss": 0.8966, + "step": 22386 + }, + { + "epoch": 0.8017261446451913, + "grad_norm": 1.2984508275985718, + "learning_rate": 1.9917291014030747e-05, + "loss": 0.8646, + "step": 22387 + }, + { + "epoch": 0.8017619567747596, + "grad_norm": 1.4337117671966553, + "learning_rate": 1.991034495462547e-05, + "loss": 1.0086, + "step": 22388 + }, + { + "epoch": 0.8017977689043279, + "grad_norm": 1.3928537368774414, + "learning_rate": 1.990339997272317e-05, + "loss": 0.8897, + "step": 22389 + }, + { + "epoch": 0.8018335810338961, + "grad_norm": 1.668196201324463, + "learning_rate": 1.9896456068417302e-05, + "loss": 1.0858, + "step": 22390 + }, + { + "epoch": 0.8018693931634645, + "grad_norm": 1.7071954011917114, + "learning_rate": 1.9889513241801295e-05, + "loss": 1.0425, + "step": 22391 + }, + { + "epoch": 0.8019052052930328, + "grad_norm": 1.4956532716751099, + "learning_rate": 1.988257149296857e-05, + "loss": 1.2177, + "step": 22392 + }, + { + "epoch": 0.801941017422601, + "grad_norm": 1.4144043922424316, + "learning_rate": 1.987563082201249e-05, + "loss": 1.1263, + "step": 22393 + }, + { + "epoch": 0.8019768295521693, + "grad_norm": 1.230950117111206, + "learning_rate": 1.9868691229026437e-05, + "loss": 1.1149, + "step": 22394 + }, + { + "epoch": 0.8020126416817376, + "grad_norm": 1.362170696258545, + "learning_rate": 1.986175271410381e-05, + "loss": 0.9499, + "step": 22395 + }, + { + "epoch": 0.8020484538113059, + "grad_norm": 1.5504565238952637, + "learning_rate": 1.9854815277337902e-05, + "loss": 0.9957, + "step": 22396 + }, + { + "epoch": 0.8020842659408741, + "grad_norm": 1.7617771625518799, + "learning_rate": 1.9847878918822073e-05, + "loss": 0.9213, + "step": 22397 + }, + { + "epoch": 0.8021200780704425, + "grad_norm": 1.495188593864441, + "learning_rate": 1.9840943638649635e-05, + "loss": 0.9114, + "step": 22398 + }, + { + "epoch": 0.8021558902000108, + "grad_norm": 1.5222736597061157, + "learning_rate": 1.9834009436913948e-05, + "loss": 1.079, + "step": 22399 + }, + { + "epoch": 0.802191702329579, + "grad_norm": 1.6439086198806763, + "learning_rate": 1.9827076313708216e-05, + "loss": 1.2133, + "step": 22400 + }, + { + "epoch": 0.8022275144591473, + "grad_norm": 1.6264034509658813, + "learning_rate": 1.9820144269125763e-05, + "loss": 1.216, + "step": 22401 + }, + { + "epoch": 0.8022633265887156, + "grad_norm": 1.476920485496521, + "learning_rate": 1.981321330325987e-05, + "loss": 1.1487, + "step": 22402 + }, + { + "epoch": 0.8022991387182838, + "grad_norm": 1.29268217086792, + "learning_rate": 1.980628341620373e-05, + "loss": 1.0106, + "step": 22403 + }, + { + "epoch": 0.8023349508478521, + "grad_norm": 1.8041201829910278, + "learning_rate": 1.9799354608050614e-05, + "loss": 1.1424, + "step": 22404 + }, + { + "epoch": 0.8023707629774205, + "grad_norm": 1.527841567993164, + "learning_rate": 1.979242687889372e-05, + "loss": 1.0741, + "step": 22405 + }, + { + "epoch": 0.8024065751069888, + "grad_norm": 1.5794005393981934, + "learning_rate": 1.9785500228826292e-05, + "loss": 1.1258, + "step": 22406 + }, + { + "epoch": 0.802442387236557, + "grad_norm": 1.5244321823120117, + "learning_rate": 1.977857465794146e-05, + "loss": 1.2911, + "step": 22407 + }, + { + "epoch": 0.8024781993661253, + "grad_norm": 1.6380605697631836, + "learning_rate": 1.977165016633242e-05, + "loss": 1.0649, + "step": 22408 + }, + { + "epoch": 0.8025140114956936, + "grad_norm": 2.3610589504241943, + "learning_rate": 1.9764726754092354e-05, + "loss": 1.2035, + "step": 22409 + }, + { + "epoch": 0.8025498236252618, + "grad_norm": 1.5903571844100952, + "learning_rate": 1.975780442131442e-05, + "loss": 1.199, + "step": 22410 + }, + { + "epoch": 0.8025856357548301, + "grad_norm": 1.2078174352645874, + "learning_rate": 1.9750883168091684e-05, + "loss": 1.1082, + "step": 22411 + }, + { + "epoch": 0.8026214478843985, + "grad_norm": 1.2142618894577026, + "learning_rate": 1.9743962994517316e-05, + "loss": 1.0526, + "step": 22412 + }, + { + "epoch": 0.8026572600139668, + "grad_norm": 1.4562287330627441, + "learning_rate": 1.9737043900684416e-05, + "loss": 1.0359, + "step": 22413 + }, + { + "epoch": 0.802693072143535, + "grad_norm": 1.6155145168304443, + "learning_rate": 1.9730125886686033e-05, + "loss": 0.9779, + "step": 22414 + }, + { + "epoch": 0.8027288842731033, + "grad_norm": 1.5094319581985474, + "learning_rate": 1.972320895261528e-05, + "loss": 1.1255, + "step": 22415 + }, + { + "epoch": 0.8027646964026716, + "grad_norm": 1.468475103378296, + "learning_rate": 1.9716293098565186e-05, + "loss": 1.0252, + "step": 22416 + }, + { + "epoch": 0.8028005085322398, + "grad_norm": 1.6243335008621216, + "learning_rate": 1.9709378324628848e-05, + "loss": 1.1774, + "step": 22417 + }, + { + "epoch": 0.8028363206618081, + "grad_norm": 1.2520084381103516, + "learning_rate": 1.970246463089922e-05, + "loss": 0.9589, + "step": 22418 + }, + { + "epoch": 0.8028721327913765, + "grad_norm": 1.4006445407867432, + "learning_rate": 1.9695552017469364e-05, + "loss": 1.1645, + "step": 22419 + }, + { + "epoch": 0.8029079449209447, + "grad_norm": 1.488540768623352, + "learning_rate": 1.9688640484432287e-05, + "loss": 1.103, + "step": 22420 + }, + { + "epoch": 0.802943757050513, + "grad_norm": 1.3686920404434204, + "learning_rate": 1.968173003188094e-05, + "loss": 0.9604, + "step": 22421 + }, + { + "epoch": 0.8029795691800813, + "grad_norm": 1.713547945022583, + "learning_rate": 1.96748206599083e-05, + "loss": 1.0402, + "step": 22422 + }, + { + "epoch": 0.8030153813096496, + "grad_norm": 1.2712773084640503, + "learning_rate": 1.9667912368607344e-05, + "loss": 0.9803, + "step": 22423 + }, + { + "epoch": 0.8030511934392178, + "grad_norm": 1.8350944519042969, + "learning_rate": 1.9661005158071033e-05, + "loss": 0.9793, + "step": 22424 + }, + { + "epoch": 0.8030870055687861, + "grad_norm": 1.431777834892273, + "learning_rate": 1.965409902839225e-05, + "loss": 1.2433, + "step": 22425 + }, + { + "epoch": 0.8031228176983545, + "grad_norm": 1.644594430923462, + "learning_rate": 1.9647193979663915e-05, + "loss": 1.0799, + "step": 22426 + }, + { + "epoch": 0.8031586298279227, + "grad_norm": 1.5752640962600708, + "learning_rate": 1.9640290011978935e-05, + "loss": 1.0759, + "step": 22427 + }, + { + "epoch": 0.803194441957491, + "grad_norm": 1.4705084562301636, + "learning_rate": 1.9633387125430226e-05, + "loss": 1.1043, + "step": 22428 + }, + { + "epoch": 0.8032302540870593, + "grad_norm": 1.3940376043319702, + "learning_rate": 1.9626485320110632e-05, + "loss": 0.9695, + "step": 22429 + }, + { + "epoch": 0.8032660662166276, + "grad_norm": 1.6106939315795898, + "learning_rate": 1.961958459611295e-05, + "loss": 0.8729, + "step": 22430 + }, + { + "epoch": 0.8033018783461958, + "grad_norm": 1.2168983221054077, + "learning_rate": 1.9612684953530124e-05, + "loss": 0.9539, + "step": 22431 + }, + { + "epoch": 0.8033376904757641, + "grad_norm": 2.221379041671753, + "learning_rate": 1.9605786392454904e-05, + "loss": 1.2476, + "step": 22432 + }, + { + "epoch": 0.8033735026053325, + "grad_norm": 1.656269907951355, + "learning_rate": 1.9598888912980117e-05, + "loss": 0.9821, + "step": 22433 + }, + { + "epoch": 0.8034093147349007, + "grad_norm": 1.7926592826843262, + "learning_rate": 1.9591992515198588e-05, + "loss": 1.1602, + "step": 22434 + }, + { + "epoch": 0.803445126864469, + "grad_norm": 1.5865305662155151, + "learning_rate": 1.9585097199203096e-05, + "loss": 1.1712, + "step": 22435 + }, + { + "epoch": 0.8034809389940373, + "grad_norm": 1.4416965246200562, + "learning_rate": 1.957820296508637e-05, + "loss": 1.2663, + "step": 22436 + }, + { + "epoch": 0.8035167511236055, + "grad_norm": 1.9341681003570557, + "learning_rate": 1.9571309812941184e-05, + "loss": 0.9439, + "step": 22437 + }, + { + "epoch": 0.8035525632531738, + "grad_norm": 1.875167727470398, + "learning_rate": 1.9564417742860307e-05, + "loss": 1.2436, + "step": 22438 + }, + { + "epoch": 0.8035883753827421, + "grad_norm": 1.7967908382415771, + "learning_rate": 1.9557526754936405e-05, + "loss": 1.2499, + "step": 22439 + }, + { + "epoch": 0.8036241875123105, + "grad_norm": 1.304781436920166, + "learning_rate": 1.955063684926225e-05, + "loss": 1.0216, + "step": 22440 + }, + { + "epoch": 0.8036599996418787, + "grad_norm": 1.466518521308899, + "learning_rate": 1.9543748025930452e-05, + "loss": 1.011, + "step": 22441 + }, + { + "epoch": 0.803695811771447, + "grad_norm": 1.4014544486999512, + "learning_rate": 1.9536860285033797e-05, + "loss": 1.1667, + "step": 22442 + }, + { + "epoch": 0.8037316239010153, + "grad_norm": 1.4148921966552734, + "learning_rate": 1.9529973626664865e-05, + "loss": 1.0223, + "step": 22443 + }, + { + "epoch": 0.8037674360305835, + "grad_norm": 1.5722733736038208, + "learning_rate": 1.952308805091636e-05, + "loss": 1.0587, + "step": 22444 + }, + { + "epoch": 0.8038032481601518, + "grad_norm": 1.557368516921997, + "learning_rate": 1.9516203557880852e-05, + "loss": 1.1585, + "step": 22445 + }, + { + "epoch": 0.8038390602897201, + "grad_norm": 1.7671518325805664, + "learning_rate": 1.9509320147651068e-05, + "loss": 1.0303, + "step": 22446 + }, + { + "epoch": 0.8038748724192885, + "grad_norm": 1.7816752195358276, + "learning_rate": 1.950243782031954e-05, + "loss": 1.1247, + "step": 22447 + }, + { + "epoch": 0.8039106845488567, + "grad_norm": 1.4324171543121338, + "learning_rate": 1.9495556575978836e-05, + "loss": 1.0227, + "step": 22448 + }, + { + "epoch": 0.803946496678425, + "grad_norm": 1.4759712219238281, + "learning_rate": 1.948867641472163e-05, + "loss": 1.093, + "step": 22449 + }, + { + "epoch": 0.8039823088079933, + "grad_norm": 1.5200895071029663, + "learning_rate": 1.9481797336640396e-05, + "loss": 0.9408, + "step": 22450 + }, + { + "epoch": 0.8040181209375615, + "grad_norm": 1.454814076423645, + "learning_rate": 1.9474919341827746e-05, + "loss": 0.9378, + "step": 22451 + }, + { + "epoch": 0.8040539330671298, + "grad_norm": 1.738572120666504, + "learning_rate": 1.946804243037613e-05, + "loss": 1.0555, + "step": 22452 + }, + { + "epoch": 0.8040897451966981, + "grad_norm": 1.7394793033599854, + "learning_rate": 1.9461166602378176e-05, + "loss": 1.3074, + "step": 22453 + }, + { + "epoch": 0.8041255573262664, + "grad_norm": 1.2670872211456299, + "learning_rate": 1.9454291857926323e-05, + "loss": 1.177, + "step": 22454 + }, + { + "epoch": 0.8041613694558347, + "grad_norm": 1.2068854570388794, + "learning_rate": 1.94474181971131e-05, + "loss": 1.0794, + "step": 22455 + }, + { + "epoch": 0.804197181585403, + "grad_norm": 1.5729002952575684, + "learning_rate": 1.9440545620030924e-05, + "loss": 1.1268, + "step": 22456 + }, + { + "epoch": 0.8042329937149713, + "grad_norm": 1.4967222213745117, + "learning_rate": 1.9433674126772306e-05, + "loss": 1.0921, + "step": 22457 + }, + { + "epoch": 0.8042688058445395, + "grad_norm": 1.5171293020248413, + "learning_rate": 1.9426803717429696e-05, + "loss": 1.0324, + "step": 22458 + }, + { + "epoch": 0.8043046179741078, + "grad_norm": 1.3528980016708374, + "learning_rate": 1.941993439209546e-05, + "loss": 0.9363, + "step": 22459 + }, + { + "epoch": 0.8043404301036761, + "grad_norm": 1.3133435249328613, + "learning_rate": 1.9413066150862113e-05, + "loss": 0.8968, + "step": 22460 + }, + { + "epoch": 0.8043762422332444, + "grad_norm": 1.3950265645980835, + "learning_rate": 1.9406198993822e-05, + "loss": 1.1862, + "step": 22461 + }, + { + "epoch": 0.8044120543628127, + "grad_norm": 1.6596134901046753, + "learning_rate": 1.9399332921067537e-05, + "loss": 1.3641, + "step": 22462 + }, + { + "epoch": 0.804447866492381, + "grad_norm": 1.7229669094085693, + "learning_rate": 1.939246793269103e-05, + "loss": 1.292, + "step": 22463 + }, + { + "epoch": 0.8044836786219492, + "grad_norm": 2.2311713695526123, + "learning_rate": 1.938560402878494e-05, + "loss": 1.221, + "step": 22464 + }, + { + "epoch": 0.8045194907515175, + "grad_norm": 1.5980761051177979, + "learning_rate": 1.9378741209441565e-05, + "loss": 1.0102, + "step": 22465 + }, + { + "epoch": 0.8045553028810858, + "grad_norm": 1.3709638118743896, + "learning_rate": 1.9371879474753208e-05, + "loss": 1.1056, + "step": 22466 + }, + { + "epoch": 0.8045911150106541, + "grad_norm": 1.4165000915527344, + "learning_rate": 1.9365018824812208e-05, + "loss": 1.102, + "step": 22467 + }, + { + "epoch": 0.8046269271402224, + "grad_norm": 1.5516494512557983, + "learning_rate": 1.9358159259710874e-05, + "loss": 0.9677, + "step": 22468 + }, + { + "epoch": 0.8046627392697907, + "grad_norm": 1.8236867189407349, + "learning_rate": 1.9351300779541503e-05, + "loss": 1.1914, + "step": 22469 + }, + { + "epoch": 0.804698551399359, + "grad_norm": 1.3912725448608398, + "learning_rate": 1.9344443384396337e-05, + "loss": 1.2635, + "step": 22470 + }, + { + "epoch": 0.8047343635289272, + "grad_norm": 1.8307288885116577, + "learning_rate": 1.9337587074367637e-05, + "loss": 1.1285, + "step": 22471 + }, + { + "epoch": 0.8047701756584955, + "grad_norm": 1.336430549621582, + "learning_rate": 1.9330731849547655e-05, + "loss": 0.8574, + "step": 22472 + }, + { + "epoch": 0.8048059877880638, + "grad_norm": 1.4335532188415527, + "learning_rate": 1.9323877710028658e-05, + "loss": 1.1134, + "step": 22473 + }, + { + "epoch": 0.804841799917632, + "grad_norm": 1.3839606046676636, + "learning_rate": 1.9317024655902782e-05, + "loss": 1.0772, + "step": 22474 + }, + { + "epoch": 0.8048776120472004, + "grad_norm": 1.7554677724838257, + "learning_rate": 1.9310172687262273e-05, + "loss": 1.0313, + "step": 22475 + }, + { + "epoch": 0.8049134241767687, + "grad_norm": 2.384551763534546, + "learning_rate": 1.9303321804199338e-05, + "loss": 1.0536, + "step": 22476 + }, + { + "epoch": 0.804949236306337, + "grad_norm": 1.2872496843338013, + "learning_rate": 1.9296472006806087e-05, + "loss": 0.7649, + "step": 22477 + }, + { + "epoch": 0.8049850484359052, + "grad_norm": 1.5940912961959839, + "learning_rate": 1.9289623295174697e-05, + "loss": 1.2436, + "step": 22478 + }, + { + "epoch": 0.8050208605654735, + "grad_norm": 1.6509697437286377, + "learning_rate": 1.9282775669397324e-05, + "loss": 1.2547, + "step": 22479 + }, + { + "epoch": 0.8050566726950418, + "grad_norm": 1.170957326889038, + "learning_rate": 1.9275929129566116e-05, + "loss": 1.1573, + "step": 22480 + }, + { + "epoch": 0.80509248482461, + "grad_norm": 1.4824079275131226, + "learning_rate": 1.9269083675773126e-05, + "loss": 1.1202, + "step": 22481 + }, + { + "epoch": 0.8051282969541784, + "grad_norm": 1.526732325553894, + "learning_rate": 1.9262239308110474e-05, + "loss": 1.0758, + "step": 22482 + }, + { + "epoch": 0.8051641090837467, + "grad_norm": 1.1563479900360107, + "learning_rate": 1.925539602667028e-05, + "loss": 1.0266, + "step": 22483 + }, + { + "epoch": 0.805199921213315, + "grad_norm": 1.6110410690307617, + "learning_rate": 1.924855383154456e-05, + "loss": 1.2764, + "step": 22484 + }, + { + "epoch": 0.8052357333428832, + "grad_norm": 1.6279836893081665, + "learning_rate": 1.924171272282538e-05, + "loss": 1.2088, + "step": 22485 + }, + { + "epoch": 0.8052715454724515, + "grad_norm": 1.4049630165100098, + "learning_rate": 1.9234872700604777e-05, + "loss": 1.0383, + "step": 22486 + }, + { + "epoch": 0.8053073576020198, + "grad_norm": 1.4118520021438599, + "learning_rate": 1.9228033764974818e-05, + "loss": 0.919, + "step": 22487 + }, + { + "epoch": 0.805343169731588, + "grad_norm": 1.638472557067871, + "learning_rate": 1.9221195916027445e-05, + "loss": 1.2448, + "step": 22488 + }, + { + "epoch": 0.8053789818611564, + "grad_norm": 1.6878548860549927, + "learning_rate": 1.921435915385469e-05, + "loss": 1.3572, + "step": 22489 + }, + { + "epoch": 0.8054147939907247, + "grad_norm": 1.3589940071105957, + "learning_rate": 1.9207523478548518e-05, + "loss": 1.0409, + "step": 22490 + }, + { + "epoch": 0.805450606120293, + "grad_norm": 1.3018019199371338, + "learning_rate": 1.9200688890200936e-05, + "loss": 0.8294, + "step": 22491 + }, + { + "epoch": 0.8054864182498612, + "grad_norm": 1.4297525882720947, + "learning_rate": 1.9193855388903824e-05, + "loss": 0.9849, + "step": 22492 + }, + { + "epoch": 0.8055222303794295, + "grad_norm": 1.4127951860427856, + "learning_rate": 1.918702297474917e-05, + "loss": 1.1365, + "step": 22493 + }, + { + "epoch": 0.8055580425089978, + "grad_norm": 1.5020031929016113, + "learning_rate": 1.9180191647828906e-05, + "loss": 1.0408, + "step": 22494 + }, + { + "epoch": 0.805593854638566, + "grad_norm": 1.7893718481063843, + "learning_rate": 1.917336140823488e-05, + "loss": 1.0374, + "step": 22495 + }, + { + "epoch": 0.8056296667681344, + "grad_norm": 1.564475655555725, + "learning_rate": 1.916653225605901e-05, + "loss": 1.1343, + "step": 22496 + }, + { + "epoch": 0.8056654788977027, + "grad_norm": 1.4536641836166382, + "learning_rate": 1.915970419139319e-05, + "loss": 1.2965, + "step": 22497 + }, + { + "epoch": 0.805701291027271, + "grad_norm": 1.3825844526290894, + "learning_rate": 1.91528772143293e-05, + "loss": 0.8568, + "step": 22498 + }, + { + "epoch": 0.8057371031568392, + "grad_norm": 1.759499192237854, + "learning_rate": 1.9146051324959134e-05, + "loss": 1.1539, + "step": 22499 + }, + { + "epoch": 0.8057729152864075, + "grad_norm": 1.4304238557815552, + "learning_rate": 1.9139226523374566e-05, + "loss": 1.1147, + "step": 22500 + }, + { + "epoch": 0.8058087274159758, + "grad_norm": 1.4915742874145508, + "learning_rate": 1.9132402809667416e-05, + "loss": 1.0868, + "step": 22501 + }, + { + "epoch": 0.805844539545544, + "grad_norm": 1.8583987951278687, + "learning_rate": 1.9125580183929448e-05, + "loss": 1.2935, + "step": 22502 + }, + { + "epoch": 0.8058803516751124, + "grad_norm": 1.2942333221435547, + "learning_rate": 1.9118758646252477e-05, + "loss": 0.9198, + "step": 22503 + }, + { + "epoch": 0.8059161638046807, + "grad_norm": 1.579957127571106, + "learning_rate": 1.9111938196728284e-05, + "loss": 1.0802, + "step": 22504 + }, + { + "epoch": 0.8059519759342489, + "grad_norm": 1.5800986289978027, + "learning_rate": 1.9105118835448644e-05, + "loss": 1.2746, + "step": 22505 + }, + { + "epoch": 0.8059877880638172, + "grad_norm": 1.4296810626983643, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.87, + "step": 22506 + }, + { + "epoch": 0.8060236001933855, + "grad_norm": 1.3873487710952759, + "learning_rate": 1.9091483377989895e-05, + "loss": 1.0905, + "step": 22507 + }, + { + "epoch": 0.8060594123229537, + "grad_norm": 1.769485354423523, + "learning_rate": 1.9084667281994273e-05, + "loss": 1.1024, + "step": 22508 + }, + { + "epoch": 0.806095224452522, + "grad_norm": 1.3115997314453125, + "learning_rate": 1.9077852274610055e-05, + "loss": 1.2448, + "step": 22509 + }, + { + "epoch": 0.8061310365820904, + "grad_norm": 1.988982081413269, + "learning_rate": 1.9071038355928948e-05, + "loss": 1.1917, + "step": 22510 + }, + { + "epoch": 0.8061668487116587, + "grad_norm": 1.3664157390594482, + "learning_rate": 1.9064225526042644e-05, + "loss": 0.9348, + "step": 22511 + }, + { + "epoch": 0.8062026608412269, + "grad_norm": 1.2893821001052856, + "learning_rate": 1.90574137850428e-05, + "loss": 1.1092, + "step": 22512 + }, + { + "epoch": 0.8062384729707952, + "grad_norm": 1.6932108402252197, + "learning_rate": 1.9050603133021017e-05, + "loss": 1.1473, + "step": 22513 + }, + { + "epoch": 0.8062742851003635, + "grad_norm": 1.4819058179855347, + "learning_rate": 1.904379357006896e-05, + "loss": 0.9557, + "step": 22514 + }, + { + "epoch": 0.8063100972299317, + "grad_norm": 1.5278465747833252, + "learning_rate": 1.9036985096278227e-05, + "loss": 1.2543, + "step": 22515 + }, + { + "epoch": 0.8063459093595, + "grad_norm": 1.784099817276001, + "learning_rate": 1.903017771174046e-05, + "loss": 1.2022, + "step": 22516 + }, + { + "epoch": 0.8063817214890684, + "grad_norm": 1.4253132343292236, + "learning_rate": 1.9023371416547177e-05, + "loss": 1.157, + "step": 22517 + }, + { + "epoch": 0.8064175336186367, + "grad_norm": 1.3456758260726929, + "learning_rate": 1.9016566210789977e-05, + "loss": 1.0696, + "step": 22518 + }, + { + "epoch": 0.8064533457482049, + "grad_norm": 1.2703773975372314, + "learning_rate": 1.9009762094560446e-05, + "loss": 1.2061, + "step": 22519 + }, + { + "epoch": 0.8064891578777732, + "grad_norm": 1.4915353059768677, + "learning_rate": 1.9002959067950066e-05, + "loss": 1.1073, + "step": 22520 + }, + { + "epoch": 0.8065249700073415, + "grad_norm": 1.558637022972107, + "learning_rate": 1.8996157131050395e-05, + "loss": 1.0946, + "step": 22521 + }, + { + "epoch": 0.8065607821369097, + "grad_norm": 1.579343557357788, + "learning_rate": 1.8989356283952943e-05, + "loss": 1.1758, + "step": 22522 + }, + { + "epoch": 0.806596594266478, + "grad_norm": 1.3704968690872192, + "learning_rate": 1.898255652674924e-05, + "loss": 1.1193, + "step": 22523 + }, + { + "epoch": 0.8066324063960464, + "grad_norm": 1.2518121004104614, + "learning_rate": 1.8975757859530696e-05, + "loss": 1.1448, + "step": 22524 + }, + { + "epoch": 0.8066682185256147, + "grad_norm": 1.5560390949249268, + "learning_rate": 1.8968960282388826e-05, + "loss": 1.2469, + "step": 22525 + }, + { + "epoch": 0.8067040306551829, + "grad_norm": 1.6451301574707031, + "learning_rate": 1.896216379541509e-05, + "loss": 1.0864, + "step": 22526 + }, + { + "epoch": 0.8067398427847512, + "grad_norm": 1.5422307252883911, + "learning_rate": 1.895536839870089e-05, + "loss": 1.0011, + "step": 22527 + }, + { + "epoch": 0.8067756549143195, + "grad_norm": 1.2926552295684814, + "learning_rate": 1.894857409233769e-05, + "loss": 0.8261, + "step": 22528 + }, + { + "epoch": 0.8068114670438877, + "grad_norm": 1.764553189277649, + "learning_rate": 1.8941780876416826e-05, + "loss": 1.2165, + "step": 22529 + }, + { + "epoch": 0.806847279173456, + "grad_norm": 1.6553250551223755, + "learning_rate": 1.893498875102979e-05, + "loss": 1.2097, + "step": 22530 + }, + { + "epoch": 0.8068830913030244, + "grad_norm": 1.5823791027069092, + "learning_rate": 1.8928197716267894e-05, + "loss": 1.1253, + "step": 22531 + }, + { + "epoch": 0.8069189034325926, + "grad_norm": 1.3668209314346313, + "learning_rate": 1.892140777222252e-05, + "loss": 1.0545, + "step": 22532 + }, + { + "epoch": 0.8069547155621609, + "grad_norm": 1.4251964092254639, + "learning_rate": 1.8914618918985028e-05, + "loss": 0.9131, + "step": 22533 + }, + { + "epoch": 0.8069905276917292, + "grad_norm": 1.393776774406433, + "learning_rate": 1.890783115664676e-05, + "loss": 0.9146, + "step": 22534 + }, + { + "epoch": 0.8070263398212975, + "grad_norm": 1.3000966310501099, + "learning_rate": 1.8901044485299034e-05, + "loss": 0.9939, + "step": 22535 + }, + { + "epoch": 0.8070621519508657, + "grad_norm": 1.6849722862243652, + "learning_rate": 1.889425890503308e-05, + "loss": 1.226, + "step": 22536 + }, + { + "epoch": 0.807097964080434, + "grad_norm": 1.460184097290039, + "learning_rate": 1.888747441594031e-05, + "loss": 0.994, + "step": 22537 + }, + { + "epoch": 0.8071337762100024, + "grad_norm": 1.940331220626831, + "learning_rate": 1.888069101811193e-05, + "loss": 0.9572, + "step": 22538 + }, + { + "epoch": 0.8071695883395706, + "grad_norm": 1.2492071390151978, + "learning_rate": 1.887390871163922e-05, + "loss": 0.9734, + "step": 22539 + }, + { + "epoch": 0.8072054004691389, + "grad_norm": 1.2687822580337524, + "learning_rate": 1.886712749661339e-05, + "loss": 1.1645, + "step": 22540 + }, + { + "epoch": 0.8072412125987072, + "grad_norm": 2.0318796634674072, + "learning_rate": 1.8860347373125753e-05, + "loss": 1.0976, + "step": 22541 + }, + { + "epoch": 0.8072770247282754, + "grad_norm": 1.2919496297836304, + "learning_rate": 1.8853568341267448e-05, + "loss": 0.9912, + "step": 22542 + }, + { + "epoch": 0.8073128368578437, + "grad_norm": 1.4465135335922241, + "learning_rate": 1.884679040112971e-05, + "loss": 0.8991, + "step": 22543 + }, + { + "epoch": 0.807348648987412, + "grad_norm": 1.6968172788619995, + "learning_rate": 1.884001355280376e-05, + "loss": 1.2459, + "step": 22544 + }, + { + "epoch": 0.8073844611169804, + "grad_norm": 1.5907886028289795, + "learning_rate": 1.8833237796380708e-05, + "loss": 1.026, + "step": 22545 + }, + { + "epoch": 0.8074202732465486, + "grad_norm": 1.5388270616531372, + "learning_rate": 1.8826463131951767e-05, + "loss": 1.0935, + "step": 22546 + }, + { + "epoch": 0.8074560853761169, + "grad_norm": 1.4662445783615112, + "learning_rate": 1.8819689559608012e-05, + "loss": 1.0482, + "step": 22547 + }, + { + "epoch": 0.8074918975056852, + "grad_norm": 1.2720284461975098, + "learning_rate": 1.8812917079440673e-05, + "loss": 1.154, + "step": 22548 + }, + { + "epoch": 0.8075277096352534, + "grad_norm": 1.5665262937545776, + "learning_rate": 1.8806145691540777e-05, + "loss": 1.2563, + "step": 22549 + }, + { + "epoch": 0.8075635217648217, + "grad_norm": 1.4491814374923706, + "learning_rate": 1.8799375395999487e-05, + "loss": 1.1036, + "step": 22550 + }, + { + "epoch": 0.80759933389439, + "grad_norm": 1.3542101383209229, + "learning_rate": 1.8792606192907813e-05, + "loss": 1.041, + "step": 22551 + }, + { + "epoch": 0.8076351460239584, + "grad_norm": 1.4733880758285522, + "learning_rate": 1.878583808235692e-05, + "loss": 1.058, + "step": 22552 + }, + { + "epoch": 0.8076709581535266, + "grad_norm": 2.0986897945404053, + "learning_rate": 1.8779071064437813e-05, + "loss": 1.3705, + "step": 22553 + }, + { + "epoch": 0.8077067702830949, + "grad_norm": 1.8696414232254028, + "learning_rate": 1.877230513924152e-05, + "loss": 0.8704, + "step": 22554 + }, + { + "epoch": 0.8077425824126632, + "grad_norm": 1.6433600187301636, + "learning_rate": 1.8765540306859076e-05, + "loss": 1.1513, + "step": 22555 + }, + { + "epoch": 0.8077783945422314, + "grad_norm": 2.310802936553955, + "learning_rate": 1.8758776567381508e-05, + "loss": 1.0461, + "step": 22556 + }, + { + "epoch": 0.8078142066717997, + "grad_norm": 1.5797293186187744, + "learning_rate": 1.8752013920899836e-05, + "loss": 1.0077, + "step": 22557 + }, + { + "epoch": 0.807850018801368, + "grad_norm": 1.4912432432174683, + "learning_rate": 1.874525236750495e-05, + "loss": 1.218, + "step": 22558 + }, + { + "epoch": 0.8078858309309364, + "grad_norm": 1.6996943950653076, + "learning_rate": 1.8738491907287946e-05, + "loss": 1.2092, + "step": 22559 + }, + { + "epoch": 0.8079216430605046, + "grad_norm": 1.4322565793991089, + "learning_rate": 1.8731732540339684e-05, + "loss": 1.0827, + "step": 22560 + }, + { + "epoch": 0.8079574551900729, + "grad_norm": 1.3938252925872803, + "learning_rate": 1.872497426675116e-05, + "loss": 0.9717, + "step": 22561 + }, + { + "epoch": 0.8079932673196412, + "grad_norm": 1.3520210981369019, + "learning_rate": 1.8718217086613242e-05, + "loss": 0.9465, + "step": 22562 + }, + { + "epoch": 0.8080290794492094, + "grad_norm": 1.39162278175354, + "learning_rate": 1.871146100001687e-05, + "loss": 0.9611, + "step": 22563 + }, + { + "epoch": 0.8080648915787777, + "grad_norm": 1.4587384462356567, + "learning_rate": 1.8704706007052963e-05, + "loss": 1.0813, + "step": 22564 + }, + { + "epoch": 0.808100703708346, + "grad_norm": 1.5884284973144531, + "learning_rate": 1.8697952107812344e-05, + "loss": 1.0038, + "step": 22565 + }, + { + "epoch": 0.8081365158379143, + "grad_norm": 1.301527976989746, + "learning_rate": 1.86911993023859e-05, + "loss": 1.041, + "step": 22566 + }, + { + "epoch": 0.8081723279674826, + "grad_norm": 1.3875027894973755, + "learning_rate": 1.8684447590864494e-05, + "loss": 1.1383, + "step": 22567 + }, + { + "epoch": 0.8082081400970509, + "grad_norm": 1.520033597946167, + "learning_rate": 1.867769697333899e-05, + "loss": 1.0252, + "step": 22568 + }, + { + "epoch": 0.8082439522266192, + "grad_norm": 1.6937907934188843, + "learning_rate": 1.8670947449900113e-05, + "loss": 0.9783, + "step": 22569 + }, + { + "epoch": 0.8082797643561874, + "grad_norm": 1.3227415084838867, + "learning_rate": 1.8664199020638785e-05, + "loss": 1.2373, + "step": 22570 + }, + { + "epoch": 0.8083155764857557, + "grad_norm": 1.4330549240112305, + "learning_rate": 1.8657451685645756e-05, + "loss": 1.1608, + "step": 22571 + }, + { + "epoch": 0.808351388615324, + "grad_norm": 1.3558467626571655, + "learning_rate": 1.8650705445011752e-05, + "loss": 0.9203, + "step": 22572 + }, + { + "epoch": 0.8083872007448923, + "grad_norm": 1.5050486326217651, + "learning_rate": 1.8643960298827566e-05, + "loss": 0.9632, + "step": 22573 + }, + { + "epoch": 0.8084230128744606, + "grad_norm": 1.6557923555374146, + "learning_rate": 1.8637216247183966e-05, + "loss": 1.0323, + "step": 22574 + }, + { + "epoch": 0.8084588250040289, + "grad_norm": 1.6064622402191162, + "learning_rate": 1.8630473290171692e-05, + "loss": 1.167, + "step": 22575 + }, + { + "epoch": 0.8084946371335971, + "grad_norm": 1.6989831924438477, + "learning_rate": 1.8623731427881418e-05, + "loss": 1.1397, + "step": 22576 + }, + { + "epoch": 0.8085304492631654, + "grad_norm": 1.471929907798767, + "learning_rate": 1.8616990660403865e-05, + "loss": 1.1828, + "step": 22577 + }, + { + "epoch": 0.8085662613927337, + "grad_norm": 1.3740869760513306, + "learning_rate": 1.8610250987829725e-05, + "loss": 1.0268, + "step": 22578 + }, + { + "epoch": 0.808602073522302, + "grad_norm": 1.6006546020507812, + "learning_rate": 1.86035124102497e-05, + "loss": 1.0443, + "step": 22579 + }, + { + "epoch": 0.8086378856518703, + "grad_norm": 1.7797904014587402, + "learning_rate": 1.85967749277544e-05, + "loss": 0.849, + "step": 22580 + }, + { + "epoch": 0.8086736977814386, + "grad_norm": 1.5239225625991821, + "learning_rate": 1.8590038540434485e-05, + "loss": 1.1459, + "step": 22581 + }, + { + "epoch": 0.8087095099110069, + "grad_norm": 1.3690390586853027, + "learning_rate": 1.8583303248380625e-05, + "loss": 1.0082, + "step": 22582 + }, + { + "epoch": 0.8087453220405751, + "grad_norm": 1.4311312437057495, + "learning_rate": 1.8576569051683368e-05, + "loss": 1.1669, + "step": 22583 + }, + { + "epoch": 0.8087811341701434, + "grad_norm": 1.6710898876190186, + "learning_rate": 1.8569835950433344e-05, + "loss": 1.2945, + "step": 22584 + }, + { + "epoch": 0.8088169462997117, + "grad_norm": 1.6778171062469482, + "learning_rate": 1.856310394472114e-05, + "loss": 0.9886, + "step": 22585 + }, + { + "epoch": 0.80885275842928, + "grad_norm": 1.3882125616073608, + "learning_rate": 1.8556373034637353e-05, + "loss": 1.2161, + "step": 22586 + }, + { + "epoch": 0.8088885705588483, + "grad_norm": 1.3440749645233154, + "learning_rate": 1.8549643220272494e-05, + "loss": 1.0811, + "step": 22587 + }, + { + "epoch": 0.8089243826884166, + "grad_norm": 2.3135290145874023, + "learning_rate": 1.8542914501717113e-05, + "loss": 1.0581, + "step": 22588 + }, + { + "epoch": 0.8089601948179849, + "grad_norm": 1.4504952430725098, + "learning_rate": 1.853618687906177e-05, + "loss": 0.7482, + "step": 22589 + }, + { + "epoch": 0.8089960069475531, + "grad_norm": 1.2885416746139526, + "learning_rate": 1.852946035239693e-05, + "loss": 1.0229, + "step": 22590 + }, + { + "epoch": 0.8090318190771214, + "grad_norm": 1.50973379611969, + "learning_rate": 1.8522734921813113e-05, + "loss": 1.0794, + "step": 22591 + }, + { + "epoch": 0.8090676312066897, + "grad_norm": 1.479687213897705, + "learning_rate": 1.85160105874008e-05, + "loss": 0.8661, + "step": 22592 + }, + { + "epoch": 0.8091034433362579, + "grad_norm": 1.6496896743774414, + "learning_rate": 1.8509287349250482e-05, + "loss": 1.0853, + "step": 22593 + }, + { + "epoch": 0.8091392554658263, + "grad_norm": 1.4987016916275024, + "learning_rate": 1.850256520745256e-05, + "loss": 1.1516, + "step": 22594 + }, + { + "epoch": 0.8091750675953946, + "grad_norm": 1.7329161167144775, + "learning_rate": 1.84958441620975e-05, + "loss": 1.1623, + "step": 22595 + }, + { + "epoch": 0.8092108797249629, + "grad_norm": 1.4747581481933594, + "learning_rate": 1.8489124213275745e-05, + "loss": 0.9916, + "step": 22596 + }, + { + "epoch": 0.8092466918545311, + "grad_norm": 1.9019834995269775, + "learning_rate": 1.8482405361077658e-05, + "loss": 1.2312, + "step": 22597 + }, + { + "epoch": 0.8092825039840994, + "grad_norm": 1.3672349452972412, + "learning_rate": 1.847568760559366e-05, + "loss": 1.086, + "step": 22598 + }, + { + "epoch": 0.8093183161136677, + "grad_norm": 1.5594682693481445, + "learning_rate": 1.8468970946914134e-05, + "loss": 0.9388, + "step": 22599 + }, + { + "epoch": 0.8093541282432359, + "grad_norm": 1.4000495672225952, + "learning_rate": 1.8462255385129447e-05, + "loss": 1.0402, + "step": 22600 + }, + { + "epoch": 0.8093899403728043, + "grad_norm": 1.6279536485671997, + "learning_rate": 1.8455540920329916e-05, + "loss": 1.2304, + "step": 22601 + }, + { + "epoch": 0.8094257525023726, + "grad_norm": 1.7256470918655396, + "learning_rate": 1.8448827552605907e-05, + "loss": 1.0289, + "step": 22602 + }, + { + "epoch": 0.8094615646319409, + "grad_norm": 1.7997546195983887, + "learning_rate": 1.8442115282047723e-05, + "loss": 1.1837, + "step": 22603 + }, + { + "epoch": 0.8094973767615091, + "grad_norm": 1.428439736366272, + "learning_rate": 1.8435404108745702e-05, + "loss": 1.0616, + "step": 22604 + }, + { + "epoch": 0.8095331888910774, + "grad_norm": 1.6137771606445312, + "learning_rate": 1.8428694032790074e-05, + "loss": 0.9243, + "step": 22605 + }, + { + "epoch": 0.8095690010206457, + "grad_norm": 1.6227582693099976, + "learning_rate": 1.8421985054271163e-05, + "loss": 1.052, + "step": 22606 + }, + { + "epoch": 0.8096048131502139, + "grad_norm": 1.9287775754928589, + "learning_rate": 1.8415277173279234e-05, + "loss": 1.071, + "step": 22607 + }, + { + "epoch": 0.8096406252797823, + "grad_norm": 1.3434840440750122, + "learning_rate": 1.840857038990449e-05, + "loss": 1.0137, + "step": 22608 + }, + { + "epoch": 0.8096764374093506, + "grad_norm": 2.522773504257202, + "learning_rate": 1.840186470423718e-05, + "loss": 1.1512, + "step": 22609 + }, + { + "epoch": 0.8097122495389188, + "grad_norm": 1.3014861345291138, + "learning_rate": 1.8395160116367528e-05, + "loss": 0.9751, + "step": 22610 + }, + { + "epoch": 0.8097480616684871, + "grad_norm": 1.9921973943710327, + "learning_rate": 1.8388456626385765e-05, + "loss": 1.1355, + "step": 22611 + }, + { + "epoch": 0.8097838737980554, + "grad_norm": 1.634827733039856, + "learning_rate": 1.838175423438202e-05, + "loss": 1.0549, + "step": 22612 + }, + { + "epoch": 0.8098196859276237, + "grad_norm": 1.311232566833496, + "learning_rate": 1.837505294044649e-05, + "loss": 1.0417, + "step": 22613 + }, + { + "epoch": 0.8098554980571919, + "grad_norm": 1.4719098806381226, + "learning_rate": 1.836835274466936e-05, + "loss": 1.0343, + "step": 22614 + }, + { + "epoch": 0.8098913101867603, + "grad_norm": 2.7987234592437744, + "learning_rate": 1.836165364714072e-05, + "loss": 1.1101, + "step": 22615 + }, + { + "epoch": 0.8099271223163286, + "grad_norm": 1.2742695808410645, + "learning_rate": 1.835495564795072e-05, + "loss": 1.0598, + "step": 22616 + }, + { + "epoch": 0.8099629344458968, + "grad_norm": 1.7650259733200073, + "learning_rate": 1.8348258747189484e-05, + "loss": 1.047, + "step": 22617 + }, + { + "epoch": 0.8099987465754651, + "grad_norm": 1.6449958086013794, + "learning_rate": 1.8341562944947134e-05, + "loss": 1.1174, + "step": 22618 + }, + { + "epoch": 0.8100345587050334, + "grad_norm": 1.428259015083313, + "learning_rate": 1.8334868241313685e-05, + "loss": 1.161, + "step": 22619 + }, + { + "epoch": 0.8100703708346016, + "grad_norm": 1.453089714050293, + "learning_rate": 1.832817463637925e-05, + "loss": 1.0659, + "step": 22620 + }, + { + "epoch": 0.8101061829641699, + "grad_norm": 1.2135818004608154, + "learning_rate": 1.832148213023387e-05, + "loss": 0.9078, + "step": 22621 + }, + { + "epoch": 0.8101419950937383, + "grad_norm": 1.7943803071975708, + "learning_rate": 1.8314790722967624e-05, + "loss": 1.1328, + "step": 22622 + }, + { + "epoch": 0.8101778072233066, + "grad_norm": 1.3601096868515015, + "learning_rate": 1.8308100414670504e-05, + "loss": 1.0528, + "step": 22623 + }, + { + "epoch": 0.8102136193528748, + "grad_norm": 1.202375888824463, + "learning_rate": 1.830141120543246e-05, + "loss": 0.9321, + "step": 22624 + }, + { + "epoch": 0.8102494314824431, + "grad_norm": 1.52666175365448, + "learning_rate": 1.829472309534359e-05, + "loss": 1.0925, + "step": 22625 + }, + { + "epoch": 0.8102852436120114, + "grad_norm": 1.4486850500106812, + "learning_rate": 1.828803608449382e-05, + "loss": 1.1148, + "step": 22626 + }, + { + "epoch": 0.8103210557415796, + "grad_norm": 1.3982398509979248, + "learning_rate": 1.828135017297311e-05, + "loss": 1.0724, + "step": 22627 + }, + { + "epoch": 0.8103568678711479, + "grad_norm": 1.4901877641677856, + "learning_rate": 1.8274665360871425e-05, + "loss": 0.9641, + "step": 22628 + }, + { + "epoch": 0.8103926800007163, + "grad_norm": 1.762386679649353, + "learning_rate": 1.8267981648278733e-05, + "loss": 1.1433, + "step": 22629 + }, + { + "epoch": 0.8104284921302846, + "grad_norm": 1.4983775615692139, + "learning_rate": 1.8261299035284883e-05, + "loss": 1.0792, + "step": 22630 + }, + { + "epoch": 0.8104643042598528, + "grad_norm": 1.3038599491119385, + "learning_rate": 1.825461752197983e-05, + "loss": 1.0201, + "step": 22631 + }, + { + "epoch": 0.8105001163894211, + "grad_norm": 1.6321040391921997, + "learning_rate": 1.8247937108453482e-05, + "loss": 1.0328, + "step": 22632 + }, + { + "epoch": 0.8105359285189894, + "grad_norm": 2.0518085956573486, + "learning_rate": 1.8241257794795653e-05, + "loss": 1.0651, + "step": 22633 + }, + { + "epoch": 0.8105717406485576, + "grad_norm": 1.6846994161605835, + "learning_rate": 1.8234579581096266e-05, + "loss": 1.043, + "step": 22634 + }, + { + "epoch": 0.8106075527781259, + "grad_norm": 1.2893608808517456, + "learning_rate": 1.82279024674451e-05, + "loss": 0.9666, + "step": 22635 + }, + { + "epoch": 0.8106433649076943, + "grad_norm": 1.4544507265090942, + "learning_rate": 1.8221226453932074e-05, + "loss": 1.037, + "step": 22636 + }, + { + "epoch": 0.8106791770372626, + "grad_norm": 1.8472563028335571, + "learning_rate": 1.821455154064693e-05, + "loss": 0.9996, + "step": 22637 + }, + { + "epoch": 0.8107149891668308, + "grad_norm": 1.7544753551483154, + "learning_rate": 1.8207877727679523e-05, + "loss": 0.9947, + "step": 22638 + }, + { + "epoch": 0.8107508012963991, + "grad_norm": 1.5465035438537598, + "learning_rate": 1.820120501511957e-05, + "loss": 0.9708, + "step": 22639 + }, + { + "epoch": 0.8107866134259674, + "grad_norm": 1.4273747205734253, + "learning_rate": 1.8194533403056935e-05, + "loss": 1.1777, + "step": 22640 + }, + { + "epoch": 0.8108224255555356, + "grad_norm": 1.4429765939712524, + "learning_rate": 1.8187862891581343e-05, + "loss": 1.0672, + "step": 22641 + }, + { + "epoch": 0.8108582376851039, + "grad_norm": 1.247565746307373, + "learning_rate": 1.8181193480782466e-05, + "loss": 0.8956, + "step": 22642 + }, + { + "epoch": 0.8108940498146723, + "grad_norm": 1.3395072221755981, + "learning_rate": 1.8174525170750145e-05, + "loss": 0.841, + "step": 22643 + }, + { + "epoch": 0.8109298619442405, + "grad_norm": 1.9355961084365845, + "learning_rate": 1.816785796157402e-05, + "loss": 1.2253, + "step": 22644 + }, + { + "epoch": 0.8109656740738088, + "grad_norm": 1.6661593914031982, + "learning_rate": 1.8161191853343827e-05, + "loss": 0.9624, + "step": 22645 + }, + { + "epoch": 0.8110014862033771, + "grad_norm": 1.2683935165405273, + "learning_rate": 1.815452684614919e-05, + "loss": 1.1406, + "step": 22646 + }, + { + "epoch": 0.8110372983329454, + "grad_norm": 1.6810520887374878, + "learning_rate": 1.8147862940079875e-05, + "loss": 1.2592, + "step": 22647 + }, + { + "epoch": 0.8110731104625136, + "grad_norm": 1.4417800903320312, + "learning_rate": 1.8141200135225444e-05, + "loss": 1.1873, + "step": 22648 + }, + { + "epoch": 0.8111089225920819, + "grad_norm": 1.2699203491210938, + "learning_rate": 1.8134538431675608e-05, + "loss": 1.1364, + "step": 22649 + }, + { + "epoch": 0.8111447347216503, + "grad_norm": 1.4729132652282715, + "learning_rate": 1.8127877829519935e-05, + "loss": 1.2754, + "step": 22650 + }, + { + "epoch": 0.8111805468512185, + "grad_norm": 1.7304686307907104, + "learning_rate": 1.8121218328848054e-05, + "loss": 0.9905, + "step": 22651 + }, + { + "epoch": 0.8112163589807868, + "grad_norm": 1.2513446807861328, + "learning_rate": 1.8114559929749586e-05, + "loss": 0.8215, + "step": 22652 + }, + { + "epoch": 0.8112521711103551, + "grad_norm": 1.819739580154419, + "learning_rate": 1.8107902632314044e-05, + "loss": 0.943, + "step": 22653 + }, + { + "epoch": 0.8112879832399233, + "grad_norm": 1.6919232606887817, + "learning_rate": 1.8101246436631093e-05, + "loss": 0.9822, + "step": 22654 + }, + { + "epoch": 0.8113237953694916, + "grad_norm": 1.51112699508667, + "learning_rate": 1.8094591342790202e-05, + "loss": 1.1868, + "step": 22655 + }, + { + "epoch": 0.8113596074990599, + "grad_norm": 1.6799442768096924, + "learning_rate": 1.8087937350880957e-05, + "loss": 1.1222, + "step": 22656 + }, + { + "epoch": 0.8113954196286283, + "grad_norm": 1.6442546844482422, + "learning_rate": 1.8081284460992808e-05, + "loss": 1.0216, + "step": 22657 + }, + { + "epoch": 0.8114312317581965, + "grad_norm": 1.289688229560852, + "learning_rate": 1.8074632673215365e-05, + "loss": 1.1731, + "step": 22658 + }, + { + "epoch": 0.8114670438877648, + "grad_norm": 2.2207655906677246, + "learning_rate": 1.806798198763805e-05, + "loss": 1.2604, + "step": 22659 + }, + { + "epoch": 0.8115028560173331, + "grad_norm": 1.3006799221038818, + "learning_rate": 1.806133240435034e-05, + "loss": 0.9865, + "step": 22660 + }, + { + "epoch": 0.8115386681469013, + "grad_norm": 2.706003189086914, + "learning_rate": 1.8054683923441694e-05, + "loss": 1.1351, + "step": 22661 + }, + { + "epoch": 0.8115744802764696, + "grad_norm": 1.4971624612808228, + "learning_rate": 1.804803654500159e-05, + "loss": 0.9568, + "step": 22662 + }, + { + "epoch": 0.8116102924060379, + "grad_norm": 1.8662521839141846, + "learning_rate": 1.8041390269119463e-05, + "loss": 1.3012, + "step": 22663 + }, + { + "epoch": 0.8116461045356063, + "grad_norm": 1.4173970222473145, + "learning_rate": 1.8034745095884687e-05, + "loss": 1.0113, + "step": 22664 + }, + { + "epoch": 0.8116819166651745, + "grad_norm": 1.3667351007461548, + "learning_rate": 1.802810102538668e-05, + "loss": 1.1857, + "step": 22665 + }, + { + "epoch": 0.8117177287947428, + "grad_norm": 1.8435111045837402, + "learning_rate": 1.8021458057714845e-05, + "loss": 1.0945, + "step": 22666 + }, + { + "epoch": 0.8117535409243111, + "grad_norm": 2.1902575492858887, + "learning_rate": 1.8014816192958574e-05, + "loss": 1.0295, + "step": 22667 + }, + { + "epoch": 0.8117893530538793, + "grad_norm": 1.206457257270813, + "learning_rate": 1.8008175431207173e-05, + "loss": 1.0838, + "step": 22668 + }, + { + "epoch": 0.8118251651834476, + "grad_norm": 1.5035432577133179, + "learning_rate": 1.8001535772550006e-05, + "loss": 1.0366, + "step": 22669 + }, + { + "epoch": 0.8118609773130159, + "grad_norm": 1.4546101093292236, + "learning_rate": 1.7994897217076423e-05, + "loss": 0.905, + "step": 22670 + }, + { + "epoch": 0.8118967894425843, + "grad_norm": 1.340751051902771, + "learning_rate": 1.7988259764875705e-05, + "loss": 1.1356, + "step": 22671 + }, + { + "epoch": 0.8119326015721525, + "grad_norm": 1.497911810874939, + "learning_rate": 1.7981623416037163e-05, + "loss": 0.8726, + "step": 22672 + }, + { + "epoch": 0.8119684137017208, + "grad_norm": 1.386393427848816, + "learning_rate": 1.7974988170650075e-05, + "loss": 1.1681, + "step": 22673 + }, + { + "epoch": 0.8120042258312891, + "grad_norm": 1.4802701473236084, + "learning_rate": 1.7968354028803748e-05, + "loss": 1.0742, + "step": 22674 + }, + { + "epoch": 0.8120400379608573, + "grad_norm": 1.6420013904571533, + "learning_rate": 1.7961720990587382e-05, + "loss": 0.914, + "step": 22675 + }, + { + "epoch": 0.8120758500904256, + "grad_norm": 1.2899718284606934, + "learning_rate": 1.7955089056090234e-05, + "loss": 1.1591, + "step": 22676 + }, + { + "epoch": 0.8121116622199939, + "grad_norm": 1.587284803390503, + "learning_rate": 1.7948458225401553e-05, + "loss": 1.3755, + "step": 22677 + }, + { + "epoch": 0.8121474743495622, + "grad_norm": 1.3804577589035034, + "learning_rate": 1.7941828498610503e-05, + "loss": 1.1123, + "step": 22678 + }, + { + "epoch": 0.8121832864791305, + "grad_norm": 2.2273659706115723, + "learning_rate": 1.793519987580631e-05, + "loss": 0.9583, + "step": 22679 + }, + { + "epoch": 0.8122190986086988, + "grad_norm": 1.7586196660995483, + "learning_rate": 1.7928572357078143e-05, + "loss": 0.9497, + "step": 22680 + }, + { + "epoch": 0.812254910738267, + "grad_norm": 1.466135859489441, + "learning_rate": 1.7921945942515195e-05, + "loss": 1.1391, + "step": 22681 + }, + { + "epoch": 0.8122907228678353, + "grad_norm": 1.610788106918335, + "learning_rate": 1.7915320632206566e-05, + "loss": 0.8305, + "step": 22682 + }, + { + "epoch": 0.8123265349974036, + "grad_norm": 1.1338915824890137, + "learning_rate": 1.7908696426241422e-05, + "loss": 0.9441, + "step": 22683 + }, + { + "epoch": 0.8123623471269719, + "grad_norm": 1.5046073198318481, + "learning_rate": 1.790207332470887e-05, + "loss": 1.1357, + "step": 22684 + }, + { + "epoch": 0.8123981592565402, + "grad_norm": 1.8033980131149292, + "learning_rate": 1.7895451327698054e-05, + "loss": 1.1055, + "step": 22685 + }, + { + "epoch": 0.8124339713861085, + "grad_norm": 1.429363489151001, + "learning_rate": 1.788883043529801e-05, + "loss": 1.0495, + "step": 22686 + }, + { + "epoch": 0.8124697835156768, + "grad_norm": 1.3390116691589355, + "learning_rate": 1.788221064759783e-05, + "loss": 1.1251, + "step": 22687 + }, + { + "epoch": 0.812505595645245, + "grad_norm": 1.50331449508667, + "learning_rate": 1.78755919646866e-05, + "loss": 1.0412, + "step": 22688 + }, + { + "epoch": 0.8125414077748133, + "grad_norm": 1.6867235898971558, + "learning_rate": 1.7868974386653336e-05, + "loss": 1.153, + "step": 22689 + }, + { + "epoch": 0.8125772199043816, + "grad_norm": 2.0435214042663574, + "learning_rate": 1.786235791358707e-05, + "loss": 1.0343, + "step": 22690 + }, + { + "epoch": 0.8126130320339499, + "grad_norm": 1.6266900300979614, + "learning_rate": 1.7855742545576836e-05, + "loss": 1.4202, + "step": 22691 + }, + { + "epoch": 0.8126488441635182, + "grad_norm": 1.9760127067565918, + "learning_rate": 1.7849128282711647e-05, + "loss": 1.1009, + "step": 22692 + }, + { + "epoch": 0.8126846562930865, + "grad_norm": 1.2146326303482056, + "learning_rate": 1.784251512508045e-05, + "loss": 1.128, + "step": 22693 + }, + { + "epoch": 0.8127204684226548, + "grad_norm": 1.6186331510543823, + "learning_rate": 1.783590307277223e-05, + "loss": 1.3555, + "step": 22694 + }, + { + "epoch": 0.812756280552223, + "grad_norm": 1.74001944065094, + "learning_rate": 1.7829292125875984e-05, + "loss": 1.1506, + "step": 22695 + }, + { + "epoch": 0.8127920926817913, + "grad_norm": 1.4830679893493652, + "learning_rate": 1.7822682284480585e-05, + "loss": 1.2102, + "step": 22696 + }, + { + "epoch": 0.8128279048113596, + "grad_norm": 1.6250019073486328, + "learning_rate": 1.7816073548675004e-05, + "loss": 1.1297, + "step": 22697 + }, + { + "epoch": 0.8128637169409278, + "grad_norm": 1.510123610496521, + "learning_rate": 1.780946591854814e-05, + "loss": 0.9578, + "step": 22698 + }, + { + "epoch": 0.8128995290704962, + "grad_norm": 1.6674736738204956, + "learning_rate": 1.780285939418892e-05, + "loss": 0.9671, + "step": 22699 + }, + { + "epoch": 0.8129353412000645, + "grad_norm": 1.3557968139648438, + "learning_rate": 1.7796253975686172e-05, + "loss": 1.052, + "step": 22700 + }, + { + "epoch": 0.8129711533296328, + "grad_norm": 1.2798854112625122, + "learning_rate": 1.7789649663128793e-05, + "loss": 0.8352, + "step": 22701 + }, + { + "epoch": 0.813006965459201, + "grad_norm": 1.3093875646591187, + "learning_rate": 1.7783046456605658e-05, + "loss": 1.1791, + "step": 22702 + }, + { + "epoch": 0.8130427775887693, + "grad_norm": 1.6024264097213745, + "learning_rate": 1.7776444356205556e-05, + "loss": 1.0662, + "step": 22703 + }, + { + "epoch": 0.8130785897183376, + "grad_norm": 1.2995516061782837, + "learning_rate": 1.7769843362017336e-05, + "loss": 1.204, + "step": 22704 + }, + { + "epoch": 0.8131144018479058, + "grad_norm": 1.818915605545044, + "learning_rate": 1.77632434741298e-05, + "loss": 1.1785, + "step": 22705 + }, + { + "epoch": 0.8131502139774742, + "grad_norm": 1.3522330522537231, + "learning_rate": 1.7756644692631773e-05, + "loss": 1.0076, + "step": 22706 + }, + { + "epoch": 0.8131860261070425, + "grad_norm": 1.733638882637024, + "learning_rate": 1.7750047017611983e-05, + "loss": 1.063, + "step": 22707 + }, + { + "epoch": 0.8132218382366108, + "grad_norm": 1.340772032737732, + "learning_rate": 1.7743450449159217e-05, + "loss": 0.8085, + "step": 22708 + }, + { + "epoch": 0.813257650366179, + "grad_norm": 1.7630335092544556, + "learning_rate": 1.7736854987362217e-05, + "loss": 1.2439, + "step": 22709 + }, + { + "epoch": 0.8132934624957473, + "grad_norm": 1.4288150072097778, + "learning_rate": 1.773026063230975e-05, + "loss": 1.1717, + "step": 22710 + }, + { + "epoch": 0.8133292746253156, + "grad_norm": 1.53235924243927, + "learning_rate": 1.7723667384090503e-05, + "loss": 1.1842, + "step": 22711 + }, + { + "epoch": 0.8133650867548838, + "grad_norm": 1.3621439933776855, + "learning_rate": 1.7717075242793123e-05, + "loss": 1.0647, + "step": 22712 + }, + { + "epoch": 0.8134008988844522, + "grad_norm": 1.341142177581787, + "learning_rate": 1.771048420850643e-05, + "loss": 1.1388, + "step": 22713 + }, + { + "epoch": 0.8134367110140205, + "grad_norm": 1.4572252035140991, + "learning_rate": 1.770389428131899e-05, + "loss": 1.146, + "step": 22714 + }, + { + "epoch": 0.8134725231435888, + "grad_norm": 1.464153528213501, + "learning_rate": 1.769730546131949e-05, + "loss": 1.0848, + "step": 22715 + }, + { + "epoch": 0.813508335273157, + "grad_norm": 1.3249492645263672, + "learning_rate": 1.7690717748596585e-05, + "loss": 1.2194, + "step": 22716 + }, + { + "epoch": 0.8135441474027253, + "grad_norm": 1.7168033123016357, + "learning_rate": 1.7684131143238937e-05, + "loss": 1.0925, + "step": 22717 + }, + { + "epoch": 0.8135799595322936, + "grad_norm": 1.420678973197937, + "learning_rate": 1.767754564533509e-05, + "loss": 1.0907, + "step": 22718 + }, + { + "epoch": 0.8136157716618618, + "grad_norm": 1.5768144130706787, + "learning_rate": 1.7670961254973682e-05, + "loss": 1.2697, + "step": 22719 + }, + { + "epoch": 0.8136515837914302, + "grad_norm": 1.7436929941177368, + "learning_rate": 1.766437797224332e-05, + "loss": 0.9105, + "step": 22720 + }, + { + "epoch": 0.8136873959209985, + "grad_norm": 1.3963367938995361, + "learning_rate": 1.7657795797232525e-05, + "loss": 1.0887, + "step": 22721 + }, + { + "epoch": 0.8137232080505667, + "grad_norm": 1.3354744911193848, + "learning_rate": 1.7651214730029897e-05, + "loss": 1.1942, + "step": 22722 + }, + { + "epoch": 0.813759020180135, + "grad_norm": 1.4784629344940186, + "learning_rate": 1.7644634770723888e-05, + "loss": 0.9587, + "step": 22723 + }, + { + "epoch": 0.8137948323097033, + "grad_norm": 1.2645776271820068, + "learning_rate": 1.763805591940315e-05, + "loss": 0.901, + "step": 22724 + }, + { + "epoch": 0.8138306444392716, + "grad_norm": 1.6490416526794434, + "learning_rate": 1.7631478176156113e-05, + "loss": 1.0785, + "step": 22725 + }, + { + "epoch": 0.8138664565688398, + "grad_norm": 1.5624585151672363, + "learning_rate": 1.762490154107128e-05, + "loss": 1.1118, + "step": 22726 + }, + { + "epoch": 0.8139022686984082, + "grad_norm": 1.2904069423675537, + "learning_rate": 1.761832601423714e-05, + "loss": 0.9793, + "step": 22727 + }, + { + "epoch": 0.8139380808279765, + "grad_norm": 1.1727620363235474, + "learning_rate": 1.7611751595742188e-05, + "loss": 0.8619, + "step": 22728 + }, + { + "epoch": 0.8139738929575447, + "grad_norm": 1.396956205368042, + "learning_rate": 1.760517828567484e-05, + "loss": 1.1697, + "step": 22729 + }, + { + "epoch": 0.814009705087113, + "grad_norm": 1.2164386510849, + "learning_rate": 1.759860608412349e-05, + "loss": 0.8717, + "step": 22730 + }, + { + "epoch": 0.8140455172166813, + "grad_norm": 1.3462591171264648, + "learning_rate": 1.7592034991176654e-05, + "loss": 1.1136, + "step": 22731 + }, + { + "epoch": 0.8140813293462495, + "grad_norm": 1.8068459033966064, + "learning_rate": 1.7585465006922662e-05, + "loss": 0.9861, + "step": 22732 + }, + { + "epoch": 0.8141171414758178, + "grad_norm": 1.3133021593093872, + "learning_rate": 1.757889613144995e-05, + "loss": 1.0493, + "step": 22733 + }, + { + "epoch": 0.8141529536053862, + "grad_norm": 1.301302433013916, + "learning_rate": 1.7572328364846836e-05, + "loss": 1.075, + "step": 22734 + }, + { + "epoch": 0.8141887657349545, + "grad_norm": 1.3468581438064575, + "learning_rate": 1.7565761707201767e-05, + "loss": 0.9242, + "step": 22735 + }, + { + "epoch": 0.8142245778645227, + "grad_norm": 1.6484074592590332, + "learning_rate": 1.7559196158603018e-05, + "loss": 1.1165, + "step": 22736 + }, + { + "epoch": 0.814260389994091, + "grad_norm": 1.6316179037094116, + "learning_rate": 1.755263171913897e-05, + "loss": 0.9713, + "step": 22737 + }, + { + "epoch": 0.8142962021236593, + "grad_norm": 1.5355361700057983, + "learning_rate": 1.7546068388897885e-05, + "loss": 1.1441, + "step": 22738 + }, + { + "epoch": 0.8143320142532275, + "grad_norm": 1.5717028379440308, + "learning_rate": 1.753950616796809e-05, + "loss": 1.0111, + "step": 22739 + }, + { + "epoch": 0.8143678263827958, + "grad_norm": 1.5269906520843506, + "learning_rate": 1.75329450564379e-05, + "loss": 0.9743, + "step": 22740 + }, + { + "epoch": 0.8144036385123642, + "grad_norm": 1.2421146631240845, + "learning_rate": 1.75263850543955e-05, + "loss": 0.8899, + "step": 22741 + }, + { + "epoch": 0.8144394506419325, + "grad_norm": 1.6893965005874634, + "learning_rate": 1.7519826161929266e-05, + "loss": 1.1527, + "step": 22742 + }, + { + "epoch": 0.8144752627715007, + "grad_norm": 1.3165836334228516, + "learning_rate": 1.7513268379127356e-05, + "loss": 1.1282, + "step": 22743 + }, + { + "epoch": 0.814511074901069, + "grad_norm": 1.5593822002410889, + "learning_rate": 1.750671170607804e-05, + "loss": 0.8545, + "step": 22744 + }, + { + "epoch": 0.8145468870306373, + "grad_norm": 1.3136489391326904, + "learning_rate": 1.7500156142869462e-05, + "loss": 1.0026, + "step": 22745 + }, + { + "epoch": 0.8145826991602055, + "grad_norm": 1.5826530456542969, + "learning_rate": 1.7493601689589913e-05, + "loss": 0.9142, + "step": 22746 + }, + { + "epoch": 0.8146185112897738, + "grad_norm": 1.5778659582138062, + "learning_rate": 1.748704834632753e-05, + "loss": 1.1003, + "step": 22747 + }, + { + "epoch": 0.8146543234193422, + "grad_norm": 1.3859493732452393, + "learning_rate": 1.748049611317045e-05, + "loss": 1.1731, + "step": 22748 + }, + { + "epoch": 0.8146901355489105, + "grad_norm": 1.483838677406311, + "learning_rate": 1.7473944990206858e-05, + "loss": 0.8566, + "step": 22749 + }, + { + "epoch": 0.8147259476784787, + "grad_norm": 1.351955771446228, + "learning_rate": 1.7467394977524876e-05, + "loss": 1.0414, + "step": 22750 + }, + { + "epoch": 0.814761759808047, + "grad_norm": 1.610310673713684, + "learning_rate": 1.7460846075212666e-05, + "loss": 0.8908, + "step": 22751 + }, + { + "epoch": 0.8147975719376153, + "grad_norm": 1.2262712717056274, + "learning_rate": 1.745429828335826e-05, + "loss": 1.1151, + "step": 22752 + }, + { + "epoch": 0.8148333840671835, + "grad_norm": 1.6060004234313965, + "learning_rate": 1.7447751602049835e-05, + "loss": 0.8954, + "step": 22753 + }, + { + "epoch": 0.8148691961967518, + "grad_norm": 1.4006460905075073, + "learning_rate": 1.74412060313754e-05, + "loss": 1.0823, + "step": 22754 + }, + { + "epoch": 0.8149050083263202, + "grad_norm": 1.4644505977630615, + "learning_rate": 1.7434661571423084e-05, + "loss": 1.0372, + "step": 22755 + }, + { + "epoch": 0.8149408204558884, + "grad_norm": 1.4999418258666992, + "learning_rate": 1.7428118222280855e-05, + "loss": 1.0573, + "step": 22756 + }, + { + "epoch": 0.8149766325854567, + "grad_norm": 1.6726857423782349, + "learning_rate": 1.7421575984036797e-05, + "loss": 1.2006, + "step": 22757 + }, + { + "epoch": 0.815012444715025, + "grad_norm": 1.2728782892227173, + "learning_rate": 1.7415034856778934e-05, + "loss": 1.029, + "step": 22758 + }, + { + "epoch": 0.8150482568445933, + "grad_norm": 1.4438902139663696, + "learning_rate": 1.7408494840595224e-05, + "loss": 0.908, + "step": 22759 + }, + { + "epoch": 0.8150840689741615, + "grad_norm": 1.739009976387024, + "learning_rate": 1.7401955935573688e-05, + "loss": 1.2224, + "step": 22760 + }, + { + "epoch": 0.8151198811037298, + "grad_norm": 1.7995097637176514, + "learning_rate": 1.739541814180228e-05, + "loss": 1.0913, + "step": 22761 + }, + { + "epoch": 0.8151556932332982, + "grad_norm": 1.3259389400482178, + "learning_rate": 1.7388881459369e-05, + "loss": 1.0503, + "step": 22762 + }, + { + "epoch": 0.8151915053628664, + "grad_norm": 1.6532461643218994, + "learning_rate": 1.738234588836174e-05, + "loss": 0.9954, + "step": 22763 + }, + { + "epoch": 0.8152273174924347, + "grad_norm": 1.3310086727142334, + "learning_rate": 1.737581142886844e-05, + "loss": 0.935, + "step": 22764 + }, + { + "epoch": 0.815263129622003, + "grad_norm": 1.496411681175232, + "learning_rate": 1.7369278080977037e-05, + "loss": 1.0708, + "step": 22765 + }, + { + "epoch": 0.8152989417515712, + "grad_norm": 1.390197515487671, + "learning_rate": 1.7362745844775396e-05, + "loss": 1.0668, + "step": 22766 + }, + { + "epoch": 0.8153347538811395, + "grad_norm": 1.2856825590133667, + "learning_rate": 1.7356214720351414e-05, + "loss": 0.8461, + "step": 22767 + }, + { + "epoch": 0.8153705660107078, + "grad_norm": 1.7878551483154297, + "learning_rate": 1.7349684707792956e-05, + "loss": 1.1477, + "step": 22768 + }, + { + "epoch": 0.8154063781402762, + "grad_norm": 1.4338852167129517, + "learning_rate": 1.7343155807187915e-05, + "loss": 0.9872, + "step": 22769 + }, + { + "epoch": 0.8154421902698444, + "grad_norm": 1.7481951713562012, + "learning_rate": 1.7336628018624058e-05, + "loss": 1.0935, + "step": 22770 + }, + { + "epoch": 0.8154780023994127, + "grad_norm": 1.746605396270752, + "learning_rate": 1.7330101342189254e-05, + "loss": 1.1126, + "step": 22771 + }, + { + "epoch": 0.815513814528981, + "grad_norm": 1.5636495351791382, + "learning_rate": 1.732357577797129e-05, + "loss": 0.8408, + "step": 22772 + }, + { + "epoch": 0.8155496266585492, + "grad_norm": 1.6531397104263306, + "learning_rate": 1.7317051326057998e-05, + "loss": 1.1216, + "step": 22773 + }, + { + "epoch": 0.8155854387881175, + "grad_norm": 1.4002591371536255, + "learning_rate": 1.7310527986537095e-05, + "loss": 1.05, + "step": 22774 + }, + { + "epoch": 0.8156212509176858, + "grad_norm": 1.245290994644165, + "learning_rate": 1.7304005759496377e-05, + "loss": 1.0862, + "step": 22775 + }, + { + "epoch": 0.815657063047254, + "grad_norm": 1.4101513624191284, + "learning_rate": 1.729748464502362e-05, + "loss": 1.0412, + "step": 22776 + }, + { + "epoch": 0.8156928751768224, + "grad_norm": 1.933080792427063, + "learning_rate": 1.729096464320651e-05, + "loss": 1.1521, + "step": 22777 + }, + { + "epoch": 0.8157286873063907, + "grad_norm": 1.2568050622940063, + "learning_rate": 1.7284445754132772e-05, + "loss": 1.1976, + "step": 22778 + }, + { + "epoch": 0.815764499435959, + "grad_norm": 1.560887098312378, + "learning_rate": 1.727792797789013e-05, + "loss": 1.059, + "step": 22779 + }, + { + "epoch": 0.8158003115655272, + "grad_norm": 1.4733376502990723, + "learning_rate": 1.7271411314566287e-05, + "loss": 1.1022, + "step": 22780 + }, + { + "epoch": 0.8158361236950955, + "grad_norm": 1.3832733631134033, + "learning_rate": 1.7264895764248868e-05, + "loss": 1.0979, + "step": 22781 + }, + { + "epoch": 0.8158719358246638, + "grad_norm": 1.5493569374084473, + "learning_rate": 1.7258381327025552e-05, + "loss": 1.0664, + "step": 22782 + }, + { + "epoch": 0.815907747954232, + "grad_norm": 1.5502978563308716, + "learning_rate": 1.7251868002984005e-05, + "loss": 1.1385, + "step": 22783 + }, + { + "epoch": 0.8159435600838004, + "grad_norm": 1.582699179649353, + "learning_rate": 1.7245355792211826e-05, + "loss": 1.2312, + "step": 22784 + }, + { + "epoch": 0.8159793722133687, + "grad_norm": 1.474182367324829, + "learning_rate": 1.723884469479663e-05, + "loss": 1.0006, + "step": 22785 + }, + { + "epoch": 0.816015184342937, + "grad_norm": 1.4934173822402954, + "learning_rate": 1.7232334710826025e-05, + "loss": 1.0366, + "step": 22786 + }, + { + "epoch": 0.8160509964725052, + "grad_norm": 1.5212266445159912, + "learning_rate": 1.722582584038762e-05, + "loss": 1.1805, + "step": 22787 + }, + { + "epoch": 0.8160868086020735, + "grad_norm": 1.7919368743896484, + "learning_rate": 1.7219318083568937e-05, + "loss": 1.0041, + "step": 22788 + }, + { + "epoch": 0.8161226207316418, + "grad_norm": 1.2265517711639404, + "learning_rate": 1.7212811440457545e-05, + "loss": 0.8942, + "step": 22789 + }, + { + "epoch": 0.81615843286121, + "grad_norm": 1.5184763669967651, + "learning_rate": 1.7206305911141017e-05, + "loss": 1.1479, + "step": 22790 + }, + { + "epoch": 0.8161942449907784, + "grad_norm": 1.318463683128357, + "learning_rate": 1.7199801495706812e-05, + "loss": 0.7746, + "step": 22791 + }, + { + "epoch": 0.8162300571203467, + "grad_norm": 1.324381709098816, + "learning_rate": 1.719329819424248e-05, + "loss": 1.0815, + "step": 22792 + }, + { + "epoch": 0.816265869249915, + "grad_norm": 1.680427074432373, + "learning_rate": 1.7186796006835514e-05, + "loss": 1.1144, + "step": 22793 + }, + { + "epoch": 0.8163016813794832, + "grad_norm": 1.4832890033721924, + "learning_rate": 1.7180294933573405e-05, + "loss": 1.1355, + "step": 22794 + }, + { + "epoch": 0.8163374935090515, + "grad_norm": 2.039355516433716, + "learning_rate": 1.7173794974543568e-05, + "loss": 0.9851, + "step": 22795 + }, + { + "epoch": 0.8163733056386198, + "grad_norm": 1.7616140842437744, + "learning_rate": 1.7167296129833488e-05, + "loss": 1.1856, + "step": 22796 + }, + { + "epoch": 0.816409117768188, + "grad_norm": 1.3446581363677979, + "learning_rate": 1.7160798399530586e-05, + "loss": 0.9388, + "step": 22797 + }, + { + "epoch": 0.8164449298977564, + "grad_norm": 1.3960638046264648, + "learning_rate": 1.7154301783722315e-05, + "loss": 0.909, + "step": 22798 + }, + { + "epoch": 0.8164807420273247, + "grad_norm": 1.1831406354904175, + "learning_rate": 1.7147806282496027e-05, + "loss": 1.0912, + "step": 22799 + }, + { + "epoch": 0.816516554156893, + "grad_norm": 1.4159603118896484, + "learning_rate": 1.7141311895939137e-05, + "loss": 1.1818, + "step": 22800 + }, + { + "epoch": 0.8165523662864612, + "grad_norm": 1.453316569328308, + "learning_rate": 1.7134818624139036e-05, + "loss": 0.9762, + "step": 22801 + }, + { + "epoch": 0.8165881784160295, + "grad_norm": 1.588761329650879, + "learning_rate": 1.7128326467183032e-05, + "loss": 1.0005, + "step": 22802 + }, + { + "epoch": 0.8166239905455978, + "grad_norm": 1.6542184352874756, + "learning_rate": 1.7121835425158506e-05, + "loss": 1.1648, + "step": 22803 + }, + { + "epoch": 0.816659802675166, + "grad_norm": 1.7024099826812744, + "learning_rate": 1.711534549815278e-05, + "loss": 0.979, + "step": 22804 + }, + { + "epoch": 0.8166956148047344, + "grad_norm": 1.269293189048767, + "learning_rate": 1.7108856686253183e-05, + "loss": 0.9499, + "step": 22805 + }, + { + "epoch": 0.8167314269343027, + "grad_norm": 1.396317958831787, + "learning_rate": 1.710236898954698e-05, + "loss": 1.0129, + "step": 22806 + }, + { + "epoch": 0.8167672390638709, + "grad_norm": 1.5138719081878662, + "learning_rate": 1.7095882408121468e-05, + "loss": 0.9915, + "step": 22807 + }, + { + "epoch": 0.8168030511934392, + "grad_norm": 1.3281042575836182, + "learning_rate": 1.708939694206395e-05, + "loss": 1.2197, + "step": 22808 + }, + { + "epoch": 0.8168388633230075, + "grad_norm": 1.4016757011413574, + "learning_rate": 1.708291259146162e-05, + "loss": 0.9827, + "step": 22809 + }, + { + "epoch": 0.8168746754525757, + "grad_norm": 1.272007703781128, + "learning_rate": 1.7076429356401748e-05, + "loss": 1.1933, + "step": 22810 + }, + { + "epoch": 0.816910487582144, + "grad_norm": 1.442850947380066, + "learning_rate": 1.706994723697155e-05, + "loss": 1.0126, + "step": 22811 + }, + { + "epoch": 0.8169462997117124, + "grad_norm": 1.525905728340149, + "learning_rate": 1.7063466233258275e-05, + "loss": 0.976, + "step": 22812 + }, + { + "epoch": 0.8169821118412807, + "grad_norm": 1.415657877922058, + "learning_rate": 1.7056986345349046e-05, + "loss": 1.0687, + "step": 22813 + }, + { + "epoch": 0.8170179239708489, + "grad_norm": 1.6882637739181519, + "learning_rate": 1.7050507573331077e-05, + "loss": 0.9769, + "step": 22814 + }, + { + "epoch": 0.8170537361004172, + "grad_norm": 1.6173065900802612, + "learning_rate": 1.7044029917291536e-05, + "loss": 0.9261, + "step": 22815 + }, + { + "epoch": 0.8170895482299855, + "grad_norm": 1.5648812055587769, + "learning_rate": 1.7037553377317595e-05, + "loss": 0.9296, + "step": 22816 + }, + { + "epoch": 0.8171253603595537, + "grad_norm": 1.664471983909607, + "learning_rate": 1.7031077953496356e-05, + "loss": 1.0733, + "step": 22817 + }, + { + "epoch": 0.817161172489122, + "grad_norm": 1.5338563919067383, + "learning_rate": 1.7024603645914896e-05, + "loss": 0.9758, + "step": 22818 + }, + { + "epoch": 0.8171969846186904, + "grad_norm": 1.412377119064331, + "learning_rate": 1.7018130454660395e-05, + "loss": 1.1004, + "step": 22819 + }, + { + "epoch": 0.8172327967482587, + "grad_norm": 1.9355506896972656, + "learning_rate": 1.7011658379819904e-05, + "loss": 0.9305, + "step": 22820 + }, + { + "epoch": 0.8172686088778269, + "grad_norm": 1.4641398191452026, + "learning_rate": 1.7005187421480517e-05, + "loss": 0.9458, + "step": 22821 + }, + { + "epoch": 0.8173044210073952, + "grad_norm": 1.6429009437561035, + "learning_rate": 1.699871757972924e-05, + "loss": 1.1735, + "step": 22822 + }, + { + "epoch": 0.8173402331369635, + "grad_norm": 1.6914805173873901, + "learning_rate": 1.6992248854653192e-05, + "loss": 1.1289, + "step": 22823 + }, + { + "epoch": 0.8173760452665317, + "grad_norm": 1.735321283340454, + "learning_rate": 1.698578124633934e-05, + "loss": 1.2693, + "step": 22824 + }, + { + "epoch": 0.8174118573961, + "grad_norm": 1.5823079347610474, + "learning_rate": 1.6979314754874733e-05, + "loss": 1.1968, + "step": 22825 + }, + { + "epoch": 0.8174476695256684, + "grad_norm": 1.4436274766921997, + "learning_rate": 1.6972849380346367e-05, + "loss": 1.0786, + "step": 22826 + }, + { + "epoch": 0.8174834816552367, + "grad_norm": 1.4735386371612549, + "learning_rate": 1.696638512284119e-05, + "loss": 1.14, + "step": 22827 + }, + { + "epoch": 0.8175192937848049, + "grad_norm": 1.7771509885787964, + "learning_rate": 1.6959921982446225e-05, + "loss": 1.194, + "step": 22828 + }, + { + "epoch": 0.8175551059143732, + "grad_norm": 1.525323748588562, + "learning_rate": 1.6953459959248354e-05, + "loss": 0.9615, + "step": 22829 + }, + { + "epoch": 0.8175909180439415, + "grad_norm": 1.6394963264465332, + "learning_rate": 1.69469990533346e-05, + "loss": 1.1009, + "step": 22830 + }, + { + "epoch": 0.8176267301735097, + "grad_norm": 1.4990499019622803, + "learning_rate": 1.694053926479181e-05, + "loss": 0.9912, + "step": 22831 + }, + { + "epoch": 0.817662542303078, + "grad_norm": 1.6069467067718506, + "learning_rate": 1.6934080593706958e-05, + "loss": 0.9019, + "step": 22832 + }, + { + "epoch": 0.8176983544326464, + "grad_norm": 1.5294194221496582, + "learning_rate": 1.692762304016685e-05, + "loss": 1.0281, + "step": 22833 + }, + { + "epoch": 0.8177341665622146, + "grad_norm": 1.5621163845062256, + "learning_rate": 1.6921166604258475e-05, + "loss": 1.0275, + "step": 22834 + }, + { + "epoch": 0.8177699786917829, + "grad_norm": 1.8543647527694702, + "learning_rate": 1.691471128606864e-05, + "loss": 1.1996, + "step": 22835 + }, + { + "epoch": 0.8178057908213512, + "grad_norm": 1.7831863164901733, + "learning_rate": 1.6908257085684143e-05, + "loss": 1.2007, + "step": 22836 + }, + { + "epoch": 0.8178416029509195, + "grad_norm": 1.7533888816833496, + "learning_rate": 1.6901804003191914e-05, + "loss": 1.0711, + "step": 22837 + }, + { + "epoch": 0.8178774150804877, + "grad_norm": 1.6461987495422363, + "learning_rate": 1.6895352038678692e-05, + "loss": 1.2078, + "step": 22838 + }, + { + "epoch": 0.817913227210056, + "grad_norm": 1.2909778356552124, + "learning_rate": 1.6888901192231342e-05, + "loss": 1.0641, + "step": 22839 + }, + { + "epoch": 0.8179490393396244, + "grad_norm": 1.6436887979507446, + "learning_rate": 1.6882451463936566e-05, + "loss": 1.0304, + "step": 22840 + }, + { + "epoch": 0.8179848514691926, + "grad_norm": 1.443349003791809, + "learning_rate": 1.6876002853881244e-05, + "loss": 1.0977, + "step": 22841 + }, + { + "epoch": 0.8180206635987609, + "grad_norm": 1.4883086681365967, + "learning_rate": 1.6869555362152056e-05, + "loss": 0.8932, + "step": 22842 + }, + { + "epoch": 0.8180564757283292, + "grad_norm": 1.386790156364441, + "learning_rate": 1.6863108988835797e-05, + "loss": 0.9299, + "step": 22843 + }, + { + "epoch": 0.8180922878578974, + "grad_norm": 1.5254369974136353, + "learning_rate": 1.685666373401914e-05, + "loss": 1.0868, + "step": 22844 + }, + { + "epoch": 0.8181280999874657, + "grad_norm": 1.492121696472168, + "learning_rate": 1.685021959778883e-05, + "loss": 1.0131, + "step": 22845 + }, + { + "epoch": 0.818163912117034, + "grad_norm": 1.7744667530059814, + "learning_rate": 1.6843776580231586e-05, + "loss": 0.956, + "step": 22846 + }, + { + "epoch": 0.8181997242466024, + "grad_norm": 1.4741002321243286, + "learning_rate": 1.6837334681434037e-05, + "loss": 1.086, + "step": 22847 + }, + { + "epoch": 0.8182355363761706, + "grad_norm": 1.715418815612793, + "learning_rate": 1.683089390148287e-05, + "loss": 1.1802, + "step": 22848 + }, + { + "epoch": 0.8182713485057389, + "grad_norm": 1.893279790878296, + "learning_rate": 1.6824454240464748e-05, + "loss": 1.1784, + "step": 22849 + }, + { + "epoch": 0.8183071606353072, + "grad_norm": 2.084430694580078, + "learning_rate": 1.6818015698466338e-05, + "loss": 1.0324, + "step": 22850 + }, + { + "epoch": 0.8183429727648754, + "grad_norm": 1.5775790214538574, + "learning_rate": 1.681157827557418e-05, + "loss": 0.9226, + "step": 22851 + }, + { + "epoch": 0.8183787848944437, + "grad_norm": 1.3438011407852173, + "learning_rate": 1.680514197187497e-05, + "loss": 1.1293, + "step": 22852 + }, + { + "epoch": 0.818414597024012, + "grad_norm": 1.2804811000823975, + "learning_rate": 1.6798706787455264e-05, + "loss": 1.0901, + "step": 22853 + }, + { + "epoch": 0.8184504091535804, + "grad_norm": 1.726360559463501, + "learning_rate": 1.6792272722401626e-05, + "loss": 0.9259, + "step": 22854 + }, + { + "epoch": 0.8184862212831486, + "grad_norm": 1.5592217445373535, + "learning_rate": 1.6785839776800615e-05, + "loss": 1.0734, + "step": 22855 + }, + { + "epoch": 0.8185220334127169, + "grad_norm": 1.273164987564087, + "learning_rate": 1.677940795073879e-05, + "loss": 1.2023, + "step": 22856 + }, + { + "epoch": 0.8185578455422852, + "grad_norm": 1.648593783378601, + "learning_rate": 1.6772977244302714e-05, + "loss": 1.1588, + "step": 22857 + }, + { + "epoch": 0.8185936576718534, + "grad_norm": 1.6923962831497192, + "learning_rate": 1.6766547657578844e-05, + "loss": 1.0358, + "step": 22858 + }, + { + "epoch": 0.8186294698014217, + "grad_norm": 1.3655693531036377, + "learning_rate": 1.6760119190653724e-05, + "loss": 0.9256, + "step": 22859 + }, + { + "epoch": 0.81866528193099, + "grad_norm": 1.4479687213897705, + "learning_rate": 1.6753691843613818e-05, + "loss": 1.1016, + "step": 22860 + }, + { + "epoch": 0.8187010940605584, + "grad_norm": 1.498482584953308, + "learning_rate": 1.6747265616545625e-05, + "loss": 0.8773, + "step": 22861 + }, + { + "epoch": 0.8187369061901266, + "grad_norm": 1.489197015762329, + "learning_rate": 1.674084050953557e-05, + "loss": 0.9915, + "step": 22862 + }, + { + "epoch": 0.8187727183196949, + "grad_norm": 1.4025747776031494, + "learning_rate": 1.6734416522670114e-05, + "loss": 1.0605, + "step": 22863 + }, + { + "epoch": 0.8188085304492632, + "grad_norm": 1.1459791660308838, + "learning_rate": 1.6727993656035702e-05, + "loss": 0.8296, + "step": 22864 + }, + { + "epoch": 0.8188443425788314, + "grad_norm": 1.71796715259552, + "learning_rate": 1.672157190971869e-05, + "loss": 1.139, + "step": 22865 + }, + { + "epoch": 0.8188801547083997, + "grad_norm": 1.598968744277954, + "learning_rate": 1.671515128380551e-05, + "loss": 1.0831, + "step": 22866 + }, + { + "epoch": 0.818915966837968, + "grad_norm": 1.653905987739563, + "learning_rate": 1.6708731778382546e-05, + "loss": 1.086, + "step": 22867 + }, + { + "epoch": 0.8189517789675363, + "grad_norm": 1.5114387273788452, + "learning_rate": 1.6702313393536173e-05, + "loss": 1.1419, + "step": 22868 + }, + { + "epoch": 0.8189875910971046, + "grad_norm": 1.5536167621612549, + "learning_rate": 1.6695896129352705e-05, + "loss": 1.2146, + "step": 22869 + }, + { + "epoch": 0.8190234032266729, + "grad_norm": 1.3374302387237549, + "learning_rate": 1.66894799859185e-05, + "loss": 0.919, + "step": 22870 + }, + { + "epoch": 0.8190592153562412, + "grad_norm": 1.3071928024291992, + "learning_rate": 1.6683064963319906e-05, + "loss": 0.7756, + "step": 22871 + }, + { + "epoch": 0.8190950274858094, + "grad_norm": 1.313724398612976, + "learning_rate": 1.6676651061643177e-05, + "loss": 1.0092, + "step": 22872 + }, + { + "epoch": 0.8191308396153777, + "grad_norm": 1.4440817832946777, + "learning_rate": 1.6670238280974627e-05, + "loss": 1.0953, + "step": 22873 + }, + { + "epoch": 0.819166651744946, + "grad_norm": 1.4013071060180664, + "learning_rate": 1.6663826621400537e-05, + "loss": 1.021, + "step": 22874 + }, + { + "epoch": 0.8192024638745143, + "grad_norm": 2.107281446456909, + "learning_rate": 1.6657416083007184e-05, + "loss": 1.2415, + "step": 22875 + }, + { + "epoch": 0.8192382760040826, + "grad_norm": 1.4326523542404175, + "learning_rate": 1.6651006665880776e-05, + "loss": 1.041, + "step": 22876 + }, + { + "epoch": 0.8192740881336509, + "grad_norm": 1.8325031995773315, + "learning_rate": 1.6644598370107554e-05, + "loss": 1.3361, + "step": 22877 + }, + { + "epoch": 0.8193099002632191, + "grad_norm": 1.3629428148269653, + "learning_rate": 1.6638191195773744e-05, + "loss": 0.9577, + "step": 22878 + }, + { + "epoch": 0.8193457123927874, + "grad_norm": 1.623874306678772, + "learning_rate": 1.6631785142965563e-05, + "loss": 1.2224, + "step": 22879 + }, + { + "epoch": 0.8193815245223557, + "grad_norm": 1.3467007875442505, + "learning_rate": 1.6625380211769147e-05, + "loss": 1.0151, + "step": 22880 + }, + { + "epoch": 0.819417336651924, + "grad_norm": 1.5584266185760498, + "learning_rate": 1.6618976402270704e-05, + "loss": 1.0902, + "step": 22881 + }, + { + "epoch": 0.8194531487814923, + "grad_norm": 1.4276282787322998, + "learning_rate": 1.66125737145564e-05, + "loss": 1.1871, + "step": 22882 + }, + { + "epoch": 0.8194889609110606, + "grad_norm": 1.4620188474655151, + "learning_rate": 1.6606172148712328e-05, + "loss": 1.1195, + "step": 22883 + }, + { + "epoch": 0.8195247730406289, + "grad_norm": 1.672499179840088, + "learning_rate": 1.659977170482464e-05, + "loss": 0.9062, + "step": 22884 + }, + { + "epoch": 0.8195605851701971, + "grad_norm": 1.4617440700531006, + "learning_rate": 1.6593372382979455e-05, + "loss": 0.9861, + "step": 22885 + }, + { + "epoch": 0.8195963972997654, + "grad_norm": 1.3042618036270142, + "learning_rate": 1.658697418326287e-05, + "loss": 0.8706, + "step": 22886 + }, + { + "epoch": 0.8196322094293337, + "grad_norm": 1.5473926067352295, + "learning_rate": 1.658057710576093e-05, + "loss": 1.1166, + "step": 22887 + }, + { + "epoch": 0.819668021558902, + "grad_norm": 1.423761010169983, + "learning_rate": 1.657418115055973e-05, + "loss": 1.093, + "step": 22888 + }, + { + "epoch": 0.8197038336884703, + "grad_norm": 1.5824439525604248, + "learning_rate": 1.6567786317745327e-05, + "loss": 1.0459, + "step": 22889 + }, + { + "epoch": 0.8197396458180386, + "grad_norm": 1.6537811756134033, + "learning_rate": 1.6561392607403713e-05, + "loss": 1.0488, + "step": 22890 + }, + { + "epoch": 0.8197754579476069, + "grad_norm": 1.3371825218200684, + "learning_rate": 1.655500001962095e-05, + "loss": 1.1033, + "step": 22891 + }, + { + "epoch": 0.8198112700771751, + "grad_norm": 1.3476148843765259, + "learning_rate": 1.6548608554483e-05, + "loss": 1.0649, + "step": 22892 + }, + { + "epoch": 0.8198470822067434, + "grad_norm": 1.4175242185592651, + "learning_rate": 1.6542218212075923e-05, + "loss": 1.2222, + "step": 22893 + }, + { + "epoch": 0.8198828943363117, + "grad_norm": 1.4828464984893799, + "learning_rate": 1.6535828992485613e-05, + "loss": 1.0864, + "step": 22894 + }, + { + "epoch": 0.8199187064658799, + "grad_norm": 1.6772351264953613, + "learning_rate": 1.6529440895798065e-05, + "loss": 1.0226, + "step": 22895 + }, + { + "epoch": 0.8199545185954483, + "grad_norm": 1.3752586841583252, + "learning_rate": 1.6523053922099242e-05, + "loss": 1.0473, + "step": 22896 + }, + { + "epoch": 0.8199903307250166, + "grad_norm": 1.742909550666809, + "learning_rate": 1.651666807147503e-05, + "loss": 1.2113, + "step": 22897 + }, + { + "epoch": 0.8200261428545849, + "grad_norm": 1.5476945638656616, + "learning_rate": 1.651028334401137e-05, + "loss": 1.1078, + "step": 22898 + }, + { + "epoch": 0.8200619549841531, + "grad_norm": 1.5827072858810425, + "learning_rate": 1.6503899739794138e-05, + "loss": 1.3283, + "step": 22899 + }, + { + "epoch": 0.8200977671137214, + "grad_norm": 1.335956335067749, + "learning_rate": 1.6497517258909267e-05, + "loss": 1.0601, + "step": 22900 + }, + { + "epoch": 0.8201335792432897, + "grad_norm": 1.5811365842819214, + "learning_rate": 1.6491135901442567e-05, + "loss": 1.1244, + "step": 22901 + }, + { + "epoch": 0.8201693913728579, + "grad_norm": 1.4520035982131958, + "learning_rate": 1.648475566747991e-05, + "loss": 1.1729, + "step": 22902 + }, + { + "epoch": 0.8202052035024263, + "grad_norm": 1.6032671928405762, + "learning_rate": 1.6478376557107145e-05, + "loss": 1.0531, + "step": 22903 + }, + { + "epoch": 0.8202410156319946, + "grad_norm": 1.4751840829849243, + "learning_rate": 1.647199857041011e-05, + "loss": 1.248, + "step": 22904 + }, + { + "epoch": 0.8202768277615629, + "grad_norm": 1.3942327499389648, + "learning_rate": 1.6465621707474587e-05, + "loss": 1.294, + "step": 22905 + }, + { + "epoch": 0.8203126398911311, + "grad_norm": 1.7680703401565552, + "learning_rate": 1.6459245968386327e-05, + "loss": 1.2569, + "step": 22906 + }, + { + "epoch": 0.8203484520206994, + "grad_norm": 1.3926753997802734, + "learning_rate": 1.64528713532312e-05, + "loss": 1.0305, + "step": 22907 + }, + { + "epoch": 0.8203842641502677, + "grad_norm": 1.6523613929748535, + "learning_rate": 1.64464978620949e-05, + "loss": 1.0617, + "step": 22908 + }, + { + "epoch": 0.8204200762798359, + "grad_norm": 1.4642246961593628, + "learning_rate": 1.6440125495063185e-05, + "loss": 0.9999, + "step": 22909 + }, + { + "epoch": 0.8204558884094043, + "grad_norm": 1.4547501802444458, + "learning_rate": 1.643375425222181e-05, + "loss": 1.3243, + "step": 22910 + }, + { + "epoch": 0.8204917005389726, + "grad_norm": 1.4336566925048828, + "learning_rate": 1.6427384133656498e-05, + "loss": 0.8534, + "step": 22911 + }, + { + "epoch": 0.8205275126685408, + "grad_norm": 1.3943411111831665, + "learning_rate": 1.64210151394529e-05, + "loss": 1.1314, + "step": 22912 + }, + { + "epoch": 0.8205633247981091, + "grad_norm": 1.517595887184143, + "learning_rate": 1.641464726969675e-05, + "loss": 1.0479, + "step": 22913 + }, + { + "epoch": 0.8205991369276774, + "grad_norm": 1.5495448112487793, + "learning_rate": 1.6408280524473706e-05, + "loss": 1.2536, + "step": 22914 + }, + { + "epoch": 0.8206349490572457, + "grad_norm": 1.2191957235336304, + "learning_rate": 1.640191490386942e-05, + "loss": 1.0979, + "step": 22915 + }, + { + "epoch": 0.8206707611868139, + "grad_norm": 1.445582628250122, + "learning_rate": 1.6395550407969552e-05, + "loss": 0.8637, + "step": 22916 + }, + { + "epoch": 0.8207065733163823, + "grad_norm": 1.5318183898925781, + "learning_rate": 1.6389187036859655e-05, + "loss": 1.0324, + "step": 22917 + }, + { + "epoch": 0.8207423854459506, + "grad_norm": 1.7394850254058838, + "learning_rate": 1.638282479062545e-05, + "loss": 1.2451, + "step": 22918 + }, + { + "epoch": 0.8207781975755188, + "grad_norm": 1.3195335865020752, + "learning_rate": 1.637646366935246e-05, + "loss": 0.9613, + "step": 22919 + }, + { + "epoch": 0.8208140097050871, + "grad_norm": 1.570497989654541, + "learning_rate": 1.6370103673126267e-05, + "loss": 1.1982, + "step": 22920 + }, + { + "epoch": 0.8208498218346554, + "grad_norm": 1.332701563835144, + "learning_rate": 1.6363744802032476e-05, + "loss": 1.08, + "step": 22921 + }, + { + "epoch": 0.8208856339642236, + "grad_norm": 1.5513116121292114, + "learning_rate": 1.6357387056156626e-05, + "loss": 1.1341, + "step": 22922 + }, + { + "epoch": 0.8209214460937919, + "grad_norm": 1.3408743143081665, + "learning_rate": 1.6351030435584245e-05, + "loss": 1.1514, + "step": 22923 + }, + { + "epoch": 0.8209572582233603, + "grad_norm": 1.6177783012390137, + "learning_rate": 1.6344674940400805e-05, + "loss": 1.1723, + "step": 22924 + }, + { + "epoch": 0.8209930703529286, + "grad_norm": 1.54426908493042, + "learning_rate": 1.633832057069191e-05, + "loss": 0.998, + "step": 22925 + }, + { + "epoch": 0.8210288824824968, + "grad_norm": 1.6745110750198364, + "learning_rate": 1.6331967326542963e-05, + "loss": 1.2152, + "step": 22926 + }, + { + "epoch": 0.8210646946120651, + "grad_norm": 1.722639799118042, + "learning_rate": 1.63256152080395e-05, + "loss": 1.0273, + "step": 22927 + }, + { + "epoch": 0.8211005067416334, + "grad_norm": 1.3044267892837524, + "learning_rate": 1.6319264215266894e-05, + "loss": 0.9526, + "step": 22928 + }, + { + "epoch": 0.8211363188712016, + "grad_norm": 1.6430262327194214, + "learning_rate": 1.6312914348310704e-05, + "loss": 1.1022, + "step": 22929 + }, + { + "epoch": 0.8211721310007699, + "grad_norm": 1.282019853591919, + "learning_rate": 1.6306565607256285e-05, + "loss": 0.8529, + "step": 22930 + }, + { + "epoch": 0.8212079431303383, + "grad_norm": 1.675061821937561, + "learning_rate": 1.6300217992189082e-05, + "loss": 1.2803, + "step": 22931 + }, + { + "epoch": 0.8212437552599066, + "grad_norm": 1.427815318107605, + "learning_rate": 1.6293871503194458e-05, + "loss": 0.9485, + "step": 22932 + }, + { + "epoch": 0.8212795673894748, + "grad_norm": 1.7063566446304321, + "learning_rate": 1.6287526140357822e-05, + "loss": 1.0916, + "step": 22933 + }, + { + "epoch": 0.8213153795190431, + "grad_norm": 1.508997917175293, + "learning_rate": 1.6281181903764565e-05, + "loss": 1.2509, + "step": 22934 + }, + { + "epoch": 0.8213511916486114, + "grad_norm": 1.5300161838531494, + "learning_rate": 1.627483879349997e-05, + "loss": 1.1501, + "step": 22935 + }, + { + "epoch": 0.8213870037781796, + "grad_norm": 1.2668691873550415, + "learning_rate": 1.626849680964947e-05, + "loss": 1.0466, + "step": 22936 + }, + { + "epoch": 0.8214228159077479, + "grad_norm": 1.3555103540420532, + "learning_rate": 1.6262155952298307e-05, + "loss": 0.9748, + "step": 22937 + }, + { + "epoch": 0.8214586280373163, + "grad_norm": 1.4151418209075928, + "learning_rate": 1.625581622153186e-05, + "loss": 1.0841, + "step": 22938 + }, + { + "epoch": 0.8214944401668846, + "grad_norm": 1.3842980861663818, + "learning_rate": 1.6249477617435327e-05, + "loss": 1.051, + "step": 22939 + }, + { + "epoch": 0.8215302522964528, + "grad_norm": 1.6076468229293823, + "learning_rate": 1.6243140140094093e-05, + "loss": 1.1679, + "step": 22940 + }, + { + "epoch": 0.8215660644260211, + "grad_norm": 1.2304909229278564, + "learning_rate": 1.6236803789593368e-05, + "loss": 1.0678, + "step": 22941 + }, + { + "epoch": 0.8216018765555894, + "grad_norm": 1.4524240493774414, + "learning_rate": 1.6230468566018375e-05, + "loss": 1.122, + "step": 22942 + }, + { + "epoch": 0.8216376886851576, + "grad_norm": 1.5021746158599854, + "learning_rate": 1.6224134469454366e-05, + "loss": 1.1316, + "step": 22943 + }, + { + "epoch": 0.8216735008147259, + "grad_norm": 1.7472217082977295, + "learning_rate": 1.6217801499986573e-05, + "loss": 1.2047, + "step": 22944 + }, + { + "epoch": 0.8217093129442943, + "grad_norm": 1.2875230312347412, + "learning_rate": 1.6211469657700217e-05, + "loss": 1.0654, + "step": 22945 + }, + { + "epoch": 0.8217451250738625, + "grad_norm": 1.736508846282959, + "learning_rate": 1.6205138942680408e-05, + "loss": 1.2377, + "step": 22946 + }, + { + "epoch": 0.8217809372034308, + "grad_norm": 1.7225669622421265, + "learning_rate": 1.6198809355012412e-05, + "loss": 1.0999, + "step": 22947 + }, + { + "epoch": 0.8218167493329991, + "grad_norm": 1.8745073080062866, + "learning_rate": 1.6192480894781316e-05, + "loss": 1.0288, + "step": 22948 + }, + { + "epoch": 0.8218525614625674, + "grad_norm": 1.600203275680542, + "learning_rate": 1.6186153562072316e-05, + "loss": 1.2089, + "step": 22949 + }, + { + "epoch": 0.8218883735921356, + "grad_norm": 1.2825005054473877, + "learning_rate": 1.617982735697048e-05, + "loss": 0.8509, + "step": 22950 + }, + { + "epoch": 0.8219241857217039, + "grad_norm": 1.3348418474197388, + "learning_rate": 1.6173502279560936e-05, + "loss": 1.1355, + "step": 22951 + }, + { + "epoch": 0.8219599978512723, + "grad_norm": 1.1216192245483398, + "learning_rate": 1.6167178329928823e-05, + "loss": 0.937, + "step": 22952 + }, + { + "epoch": 0.8219958099808405, + "grad_norm": 1.2915986776351929, + "learning_rate": 1.6160855508159168e-05, + "loss": 0.9833, + "step": 22953 + }, + { + "epoch": 0.8220316221104088, + "grad_norm": 1.6826162338256836, + "learning_rate": 1.6154533814337058e-05, + "loss": 1.1602, + "step": 22954 + }, + { + "epoch": 0.8220674342399771, + "grad_norm": 1.7639458179473877, + "learning_rate": 1.614821324854754e-05, + "loss": 1.1573, + "step": 22955 + }, + { + "epoch": 0.8221032463695453, + "grad_norm": 1.4773212671279907, + "learning_rate": 1.6141893810875675e-05, + "loss": 1.1755, + "step": 22956 + }, + { + "epoch": 0.8221390584991136, + "grad_norm": 1.9124658107757568, + "learning_rate": 1.6135575501406432e-05, + "loss": 1.3356, + "step": 22957 + }, + { + "epoch": 0.8221748706286819, + "grad_norm": 1.7730743885040283, + "learning_rate": 1.6129258320224848e-05, + "loss": 0.9882, + "step": 22958 + }, + { + "epoch": 0.8222106827582503, + "grad_norm": 1.680497407913208, + "learning_rate": 1.612294226741593e-05, + "loss": 1.0416, + "step": 22959 + }, + { + "epoch": 0.8222464948878185, + "grad_norm": 1.9612597227096558, + "learning_rate": 1.6116627343064605e-05, + "loss": 1.3102, + "step": 22960 + }, + { + "epoch": 0.8222823070173868, + "grad_norm": 1.7163355350494385, + "learning_rate": 1.611031354725586e-05, + "loss": 1.0343, + "step": 22961 + }, + { + "epoch": 0.8223181191469551, + "grad_norm": 1.9539051055908203, + "learning_rate": 1.6104000880074642e-05, + "loss": 1.095, + "step": 22962 + }, + { + "epoch": 0.8223539312765233, + "grad_norm": 1.8203786611557007, + "learning_rate": 1.6097689341605894e-05, + "loss": 1.2023, + "step": 22963 + }, + { + "epoch": 0.8223897434060916, + "grad_norm": 1.295386791229248, + "learning_rate": 1.6091378931934474e-05, + "loss": 0.9632, + "step": 22964 + }, + { + "epoch": 0.8224255555356599, + "grad_norm": 1.3737529516220093, + "learning_rate": 1.6085069651145334e-05, + "loss": 1.0507, + "step": 22965 + }, + { + "epoch": 0.8224613676652283, + "grad_norm": 1.3038842678070068, + "learning_rate": 1.6078761499323326e-05, + "loss": 1.0699, + "step": 22966 + }, + { + "epoch": 0.8224971797947965, + "grad_norm": 1.3673635721206665, + "learning_rate": 1.6072454476553357e-05, + "loss": 1.1837, + "step": 22967 + }, + { + "epoch": 0.8225329919243648, + "grad_norm": 1.6218878030776978, + "learning_rate": 1.6066148582920237e-05, + "loss": 1.1446, + "step": 22968 + }, + { + "epoch": 0.8225688040539331, + "grad_norm": 1.3580570220947266, + "learning_rate": 1.6059843818508814e-05, + "loss": 0.9774, + "step": 22969 + }, + { + "epoch": 0.8226046161835013, + "grad_norm": 1.5041871070861816, + "learning_rate": 1.605354018340395e-05, + "loss": 0.8479, + "step": 22970 + }, + { + "epoch": 0.8226404283130696, + "grad_norm": 1.563703179359436, + "learning_rate": 1.6047237677690386e-05, + "loss": 1.0182, + "step": 22971 + }, + { + "epoch": 0.8226762404426379, + "grad_norm": 1.7126448154449463, + "learning_rate": 1.6040936301452957e-05, + "loss": 1.3219, + "step": 22972 + }, + { + "epoch": 0.8227120525722063, + "grad_norm": 1.654541254043579, + "learning_rate": 1.603463605477643e-05, + "loss": 1.0509, + "step": 22973 + }, + { + "epoch": 0.8227478647017745, + "grad_norm": 1.2374688386917114, + "learning_rate": 1.602833693774558e-05, + "loss": 0.8269, + "step": 22974 + }, + { + "epoch": 0.8227836768313428, + "grad_norm": 1.687105655670166, + "learning_rate": 1.6022038950445127e-05, + "loss": 1.1917, + "step": 22975 + }, + { + "epoch": 0.8228194889609111, + "grad_norm": 1.6507149934768677, + "learning_rate": 1.6015742092959818e-05, + "loss": 1.172, + "step": 22976 + }, + { + "epoch": 0.8228553010904793, + "grad_norm": 1.3825222253799438, + "learning_rate": 1.6009446365374383e-05, + "loss": 1.0535, + "step": 22977 + }, + { + "epoch": 0.8228911132200476, + "grad_norm": 1.4764926433563232, + "learning_rate": 1.6003151767773485e-05, + "loss": 0.9203, + "step": 22978 + }, + { + "epoch": 0.8229269253496159, + "grad_norm": 1.9314303398132324, + "learning_rate": 1.5996858300241834e-05, + "loss": 1.0289, + "step": 22979 + }, + { + "epoch": 0.8229627374791842, + "grad_norm": 1.2540850639343262, + "learning_rate": 1.5990565962864103e-05, + "loss": 1.0589, + "step": 22980 + }, + { + "epoch": 0.8229985496087525, + "grad_norm": 1.5870212316513062, + "learning_rate": 1.5984274755724958e-05, + "loss": 1.2057, + "step": 22981 + }, + { + "epoch": 0.8230343617383208, + "grad_norm": 1.5434093475341797, + "learning_rate": 1.5977984678909008e-05, + "loss": 1.0204, + "step": 22982 + }, + { + "epoch": 0.823070173867889, + "grad_norm": 1.9261142015457153, + "learning_rate": 1.597169573250089e-05, + "loss": 1.2042, + "step": 22983 + }, + { + "epoch": 0.8231059859974573, + "grad_norm": 1.8281389474868774, + "learning_rate": 1.5965407916585208e-05, + "loss": 1.0971, + "step": 22984 + }, + { + "epoch": 0.8231417981270256, + "grad_norm": 1.3931277990341187, + "learning_rate": 1.59591212312466e-05, + "loss": 0.8999, + "step": 22985 + }, + { + "epoch": 0.8231776102565939, + "grad_norm": 1.906306266784668, + "learning_rate": 1.595283567656959e-05, + "loss": 1.1569, + "step": 22986 + }, + { + "epoch": 0.8232134223861622, + "grad_norm": 1.414353370666504, + "learning_rate": 1.5946551252638754e-05, + "loss": 1.2745, + "step": 22987 + }, + { + "epoch": 0.8232492345157305, + "grad_norm": 1.4642033576965332, + "learning_rate": 1.594026795953868e-05, + "loss": 0.9078, + "step": 22988 + }, + { + "epoch": 0.8232850466452988, + "grad_norm": 1.6412100791931152, + "learning_rate": 1.5933985797353844e-05, + "loss": 1.2112, + "step": 22989 + }, + { + "epoch": 0.823320858774867, + "grad_norm": 1.8878200054168701, + "learning_rate": 1.5927704766168793e-05, + "loss": 1.088, + "step": 22990 + }, + { + "epoch": 0.8233566709044353, + "grad_norm": 1.590309500694275, + "learning_rate": 1.5921424866068026e-05, + "loss": 1.3042, + "step": 22991 + }, + { + "epoch": 0.8233924830340036, + "grad_norm": 1.6716679334640503, + "learning_rate": 1.5915146097136056e-05, + "loss": 1.1792, + "step": 22992 + }, + { + "epoch": 0.8234282951635719, + "grad_norm": 1.2873164415359497, + "learning_rate": 1.5908868459457317e-05, + "loss": 1.0539, + "step": 22993 + }, + { + "epoch": 0.8234641072931402, + "grad_norm": 1.2680121660232544, + "learning_rate": 1.5902591953116287e-05, + "loss": 0.8703, + "step": 22994 + }, + { + "epoch": 0.8234999194227085, + "grad_norm": 1.486959457397461, + "learning_rate": 1.589631657819741e-05, + "loss": 1.1844, + "step": 22995 + }, + { + "epoch": 0.8235357315522768, + "grad_norm": 1.4706698656082153, + "learning_rate": 1.5890042334785104e-05, + "loss": 0.9448, + "step": 22996 + }, + { + "epoch": 0.823571543681845, + "grad_norm": 1.5038663148880005, + "learning_rate": 1.5883769222963775e-05, + "loss": 1.012, + "step": 22997 + }, + { + "epoch": 0.8236073558114133, + "grad_norm": 1.614964485168457, + "learning_rate": 1.587749724281783e-05, + "loss": 0.8736, + "step": 22998 + }, + { + "epoch": 0.8236431679409816, + "grad_norm": 1.4957152605056763, + "learning_rate": 1.5871226394431672e-05, + "loss": 1.1886, + "step": 22999 + }, + { + "epoch": 0.8236789800705498, + "grad_norm": 1.5027892589569092, + "learning_rate": 1.586495667788962e-05, + "loss": 1.0356, + "step": 23000 + }, + { + "epoch": 0.8237147922001182, + "grad_norm": 1.3322186470031738, + "learning_rate": 1.5858688093276042e-05, + "loss": 0.8775, + "step": 23001 + }, + { + "epoch": 0.8237506043296865, + "grad_norm": 1.7346796989440918, + "learning_rate": 1.5852420640675313e-05, + "loss": 1.0728, + "step": 23002 + }, + { + "epoch": 0.8237864164592548, + "grad_norm": 1.4360426664352417, + "learning_rate": 1.5846154320171703e-05, + "loss": 1.1778, + "step": 23003 + }, + { + "epoch": 0.823822228588823, + "grad_norm": 1.4473042488098145, + "learning_rate": 1.583988913184953e-05, + "loss": 1.0541, + "step": 23004 + }, + { + "epoch": 0.8238580407183913, + "grad_norm": 1.6316338777542114, + "learning_rate": 1.583362507579309e-05, + "loss": 0.9545, + "step": 23005 + }, + { + "epoch": 0.8238938528479596, + "grad_norm": 1.393298864364624, + "learning_rate": 1.582736215208669e-05, + "loss": 0.9382, + "step": 23006 + }, + { + "epoch": 0.8239296649775278, + "grad_norm": 1.279061198234558, + "learning_rate": 1.582110036081452e-05, + "loss": 0.8341, + "step": 23007 + }, + { + "epoch": 0.8239654771070962, + "grad_norm": 1.3817648887634277, + "learning_rate": 1.581483970206087e-05, + "loss": 1.0799, + "step": 23008 + }, + { + "epoch": 0.8240012892366645, + "grad_norm": 1.2480920553207397, + "learning_rate": 1.580858017590996e-05, + "loss": 1.0086, + "step": 23009 + }, + { + "epoch": 0.8240371013662328, + "grad_norm": 1.423662781715393, + "learning_rate": 1.5802321782446028e-05, + "loss": 1.0858, + "step": 23010 + }, + { + "epoch": 0.824072913495801, + "grad_norm": 1.2519673109054565, + "learning_rate": 1.5796064521753252e-05, + "loss": 1.0164, + "step": 23011 + }, + { + "epoch": 0.8241087256253693, + "grad_norm": 1.4387884140014648, + "learning_rate": 1.5789808393915763e-05, + "loss": 1.2342, + "step": 23012 + }, + { + "epoch": 0.8241445377549376, + "grad_norm": 1.4363033771514893, + "learning_rate": 1.5783553399017825e-05, + "loss": 0.9422, + "step": 23013 + }, + { + "epoch": 0.8241803498845058, + "grad_norm": 2.2231438159942627, + "learning_rate": 1.577729953714352e-05, + "loss": 1.0299, + "step": 23014 + }, + { + "epoch": 0.8242161620140742, + "grad_norm": 1.3634767532348633, + "learning_rate": 1.577104680837703e-05, + "loss": 1.0821, + "step": 23015 + }, + { + "epoch": 0.8242519741436425, + "grad_norm": 1.543036699295044, + "learning_rate": 1.576479521280242e-05, + "loss": 1.2555, + "step": 23016 + }, + { + "epoch": 0.8242877862732108, + "grad_norm": 1.2777647972106934, + "learning_rate": 1.575854475050388e-05, + "loss": 0.7843, + "step": 23017 + }, + { + "epoch": 0.824323598402779, + "grad_norm": 1.4284775257110596, + "learning_rate": 1.5752295421565423e-05, + "loss": 1.0862, + "step": 23018 + }, + { + "epoch": 0.8243594105323473, + "grad_norm": 1.7855162620544434, + "learning_rate": 1.574604722607117e-05, + "loss": 1.1543, + "step": 23019 + }, + { + "epoch": 0.8243952226619156, + "grad_norm": 1.599955677986145, + "learning_rate": 1.573980016410519e-05, + "loss": 1.1069, + "step": 23020 + }, + { + "epoch": 0.8244310347914838, + "grad_norm": 1.4148799180984497, + "learning_rate": 1.573355423575149e-05, + "loss": 1.243, + "step": 23021 + }, + { + "epoch": 0.8244668469210522, + "grad_norm": 1.8980969190597534, + "learning_rate": 1.572730944109415e-05, + "loss": 1.0621, + "step": 23022 + }, + { + "epoch": 0.8245026590506205, + "grad_norm": 1.2150394916534424, + "learning_rate": 1.5721065780217103e-05, + "loss": 1.0599, + "step": 23023 + }, + { + "epoch": 0.8245384711801887, + "grad_norm": 1.6506555080413818, + "learning_rate": 1.5714823253204447e-05, + "loss": 1.2333, + "step": 23024 + }, + { + "epoch": 0.824574283309757, + "grad_norm": 1.3420859575271606, + "learning_rate": 1.5708581860140113e-05, + "loss": 0.9773, + "step": 23025 + }, + { + "epoch": 0.8246100954393253, + "grad_norm": 1.6255563497543335, + "learning_rate": 1.5702341601108094e-05, + "loss": 1.1313, + "step": 23026 + }, + { + "epoch": 0.8246459075688936, + "grad_norm": 1.482812523841858, + "learning_rate": 1.56961024761923e-05, + "loss": 1.0004, + "step": 23027 + }, + { + "epoch": 0.8246817196984618, + "grad_norm": 1.443434476852417, + "learning_rate": 1.5689864485476736e-05, + "loss": 1.0552, + "step": 23028 + }, + { + "epoch": 0.8247175318280302, + "grad_norm": 1.6303813457489014, + "learning_rate": 1.5683627629045295e-05, + "loss": 1.1001, + "step": 23029 + }, + { + "epoch": 0.8247533439575985, + "grad_norm": 1.6696876287460327, + "learning_rate": 1.5677391906981842e-05, + "loss": 0.9885, + "step": 23030 + }, + { + "epoch": 0.8247891560871667, + "grad_norm": 1.5496362447738647, + "learning_rate": 1.5671157319370357e-05, + "loss": 1.0579, + "step": 23031 + }, + { + "epoch": 0.824824968216735, + "grad_norm": 1.3249894380569458, + "learning_rate": 1.5664923866294655e-05, + "loss": 0.9122, + "step": 23032 + }, + { + "epoch": 0.8248607803463033, + "grad_norm": 1.6188451051712036, + "learning_rate": 1.565869154783863e-05, + "loss": 1.1976, + "step": 23033 + }, + { + "epoch": 0.8248965924758715, + "grad_norm": 2.0183627605438232, + "learning_rate": 1.5652460364086084e-05, + "loss": 1.0687, + "step": 23034 + }, + { + "epoch": 0.8249324046054398, + "grad_norm": 1.5435439348220825, + "learning_rate": 1.5646230315120923e-05, + "loss": 1.0065, + "step": 23035 + }, + { + "epoch": 0.8249682167350082, + "grad_norm": 1.713118553161621, + "learning_rate": 1.5640001401026904e-05, + "loss": 1.1603, + "step": 23036 + }, + { + "epoch": 0.8250040288645765, + "grad_norm": 1.5241254568099976, + "learning_rate": 1.5633773621887872e-05, + "loss": 0.9022, + "step": 23037 + }, + { + "epoch": 0.8250398409941447, + "grad_norm": 1.5869834423065186, + "learning_rate": 1.5627546977787565e-05, + "loss": 1.0956, + "step": 23038 + }, + { + "epoch": 0.825075653123713, + "grad_norm": 1.3848568201065063, + "learning_rate": 1.5621321468809778e-05, + "loss": 1.0649, + "step": 23039 + }, + { + "epoch": 0.8251114652532813, + "grad_norm": 1.9521888494491577, + "learning_rate": 1.56150970950383e-05, + "loss": 0.9797, + "step": 23040 + }, + { + "epoch": 0.8251472773828495, + "grad_norm": 1.4043917655944824, + "learning_rate": 1.5608873856556828e-05, + "loss": 1.0297, + "step": 23041 + }, + { + "epoch": 0.8251830895124178, + "grad_norm": 1.5390866994857788, + "learning_rate": 1.5602651753449083e-05, + "loss": 0.9895, + "step": 23042 + }, + { + "epoch": 0.8252189016419862, + "grad_norm": 1.5211275815963745, + "learning_rate": 1.5596430785798798e-05, + "loss": 1.168, + "step": 23043 + }, + { + "epoch": 0.8252547137715545, + "grad_norm": 1.3538782596588135, + "learning_rate": 1.55902109536897e-05, + "loss": 0.9746, + "step": 23044 + }, + { + "epoch": 0.8252905259011227, + "grad_norm": 1.6903997659683228, + "learning_rate": 1.558399225720537e-05, + "loss": 0.9174, + "step": 23045 + }, + { + "epoch": 0.825326338030691, + "grad_norm": 1.424484133720398, + "learning_rate": 1.5577774696429592e-05, + "loss": 1.1531, + "step": 23046 + }, + { + "epoch": 0.8253621501602593, + "grad_norm": 1.398321270942688, + "learning_rate": 1.5571558271445952e-05, + "loss": 1.1465, + "step": 23047 + }, + { + "epoch": 0.8253979622898275, + "grad_norm": 1.5263575315475464, + "learning_rate": 1.556534298233807e-05, + "loss": 0.9557, + "step": 23048 + }, + { + "epoch": 0.8254337744193958, + "grad_norm": 1.2499498128890991, + "learning_rate": 1.5559128829189597e-05, + "loss": 1.2342, + "step": 23049 + }, + { + "epoch": 0.8254695865489642, + "grad_norm": 1.4498741626739502, + "learning_rate": 1.5552915812084113e-05, + "loss": 1.0698, + "step": 23050 + }, + { + "epoch": 0.8255053986785325, + "grad_norm": 1.795820951461792, + "learning_rate": 1.5546703931105233e-05, + "loss": 1.1764, + "step": 23051 + }, + { + "epoch": 0.8255412108081007, + "grad_norm": 1.8113372325897217, + "learning_rate": 1.5540493186336503e-05, + "loss": 1.1416, + "step": 23052 + }, + { + "epoch": 0.825577022937669, + "grad_norm": 1.3539742231369019, + "learning_rate": 1.5534283577861497e-05, + "loss": 0.9972, + "step": 23053 + }, + { + "epoch": 0.8256128350672373, + "grad_norm": 1.5499136447906494, + "learning_rate": 1.552807510576374e-05, + "loss": 1.1144, + "step": 23054 + }, + { + "epoch": 0.8256486471968055, + "grad_norm": 1.7209891080856323, + "learning_rate": 1.5521867770126795e-05, + "loss": 1.0551, + "step": 23055 + }, + { + "epoch": 0.8256844593263738, + "grad_norm": 1.541487455368042, + "learning_rate": 1.5515661571034134e-05, + "loss": 0.9446, + "step": 23056 + }, + { + "epoch": 0.8257202714559422, + "grad_norm": 1.9997437000274658, + "learning_rate": 1.5509456508569275e-05, + "loss": 1.1161, + "step": 23057 + }, + { + "epoch": 0.8257560835855104, + "grad_norm": 2.3080055713653564, + "learning_rate": 1.5503252582815707e-05, + "loss": 1.2064, + "step": 23058 + }, + { + "epoch": 0.8257918957150787, + "grad_norm": 1.4460477828979492, + "learning_rate": 1.5497049793856868e-05, + "loss": 0.8202, + "step": 23059 + }, + { + "epoch": 0.825827707844647, + "grad_norm": 1.2712293863296509, + "learning_rate": 1.5490848141776214e-05, + "loss": 0.886, + "step": 23060 + }, + { + "epoch": 0.8258635199742153, + "grad_norm": 1.2770304679870605, + "learning_rate": 1.548464762665719e-05, + "loss": 1.077, + "step": 23061 + }, + { + "epoch": 0.8258993321037835, + "grad_norm": 1.3932746648788452, + "learning_rate": 1.5478448248583244e-05, + "loss": 0.9367, + "step": 23062 + }, + { + "epoch": 0.8259351442333518, + "grad_norm": 1.2092827558517456, + "learning_rate": 1.5472250007637724e-05, + "loss": 1.0503, + "step": 23063 + }, + { + "epoch": 0.8259709563629202, + "grad_norm": 1.571975588798523, + "learning_rate": 1.546605290390405e-05, + "loss": 1.16, + "step": 23064 + }, + { + "epoch": 0.8260067684924884, + "grad_norm": 1.5665432214736938, + "learning_rate": 1.545985693746561e-05, + "loss": 0.9671, + "step": 23065 + }, + { + "epoch": 0.8260425806220567, + "grad_norm": 1.3584834337234497, + "learning_rate": 1.545366210840573e-05, + "loss": 1.0277, + "step": 23066 + }, + { + "epoch": 0.826078392751625, + "grad_norm": 1.4556519985198975, + "learning_rate": 1.5447468416807766e-05, + "loss": 1.1129, + "step": 23067 + }, + { + "epoch": 0.8261142048811932, + "grad_norm": 1.7427115440368652, + "learning_rate": 1.5441275862755043e-05, + "loss": 0.9736, + "step": 23068 + }, + { + "epoch": 0.8261500170107615, + "grad_norm": 1.5091813802719116, + "learning_rate": 1.5435084446330917e-05, + "loss": 1.2533, + "step": 23069 + }, + { + "epoch": 0.8261858291403298, + "grad_norm": 1.4850059747695923, + "learning_rate": 1.5428894167618622e-05, + "loss": 1.2203, + "step": 23070 + }, + { + "epoch": 0.8262216412698982, + "grad_norm": 1.4208968877792358, + "learning_rate": 1.5422705026701468e-05, + "loss": 1.0171, + "step": 23071 + }, + { + "epoch": 0.8262574533994664, + "grad_norm": 1.4615142345428467, + "learning_rate": 1.5416517023662713e-05, + "loss": 1.1535, + "step": 23072 + }, + { + "epoch": 0.8262932655290347, + "grad_norm": 1.6013529300689697, + "learning_rate": 1.541033015858565e-05, + "loss": 1.0662, + "step": 23073 + }, + { + "epoch": 0.826329077658603, + "grad_norm": 1.924383521080017, + "learning_rate": 1.540414443155345e-05, + "loss": 1.1806, + "step": 23074 + }, + { + "epoch": 0.8263648897881712, + "grad_norm": 1.4285612106323242, + "learning_rate": 1.5397959842649367e-05, + "loss": 1.1674, + "step": 23075 + }, + { + "epoch": 0.8264007019177395, + "grad_norm": 1.4641640186309814, + "learning_rate": 1.5391776391956638e-05, + "loss": 1.3252, + "step": 23076 + }, + { + "epoch": 0.8264365140473078, + "grad_norm": 1.4600322246551514, + "learning_rate": 1.5385594079558387e-05, + "loss": 1.239, + "step": 23077 + }, + { + "epoch": 0.8264723261768762, + "grad_norm": 1.5041152238845825, + "learning_rate": 1.5379412905537828e-05, + "loss": 1.0516, + "step": 23078 + }, + { + "epoch": 0.8265081383064444, + "grad_norm": 1.6320581436157227, + "learning_rate": 1.5373232869978116e-05, + "loss": 1.0934, + "step": 23079 + }, + { + "epoch": 0.8265439504360127, + "grad_norm": 1.2817599773406982, + "learning_rate": 1.5367053972962408e-05, + "loss": 0.9215, + "step": 23080 + }, + { + "epoch": 0.826579762565581, + "grad_norm": 1.8138840198516846, + "learning_rate": 1.5360876214573806e-05, + "loss": 1.0698, + "step": 23081 + }, + { + "epoch": 0.8266155746951492, + "grad_norm": 1.9071027040481567, + "learning_rate": 1.5354699594895438e-05, + "loss": 1.0489, + "step": 23082 + }, + { + "epoch": 0.8266513868247175, + "grad_norm": 1.8046622276306152, + "learning_rate": 1.534852411401043e-05, + "loss": 0.9997, + "step": 23083 + }, + { + "epoch": 0.8266871989542858, + "grad_norm": 1.5920509099960327, + "learning_rate": 1.5342349772001808e-05, + "loss": 1.0498, + "step": 23084 + }, + { + "epoch": 0.8267230110838542, + "grad_norm": 1.941642165184021, + "learning_rate": 1.5336176568952666e-05, + "loss": 1.1275, + "step": 23085 + }, + { + "epoch": 0.8267588232134224, + "grad_norm": 1.3555195331573486, + "learning_rate": 1.5330004504946072e-05, + "loss": 1.2156, + "step": 23086 + }, + { + "epoch": 0.8267946353429907, + "grad_norm": 1.8431074619293213, + "learning_rate": 1.532383358006506e-05, + "loss": 1.1732, + "step": 23087 + }, + { + "epoch": 0.826830447472559, + "grad_norm": 1.5027928352355957, + "learning_rate": 1.5317663794392634e-05, + "loss": 1.0119, + "step": 23088 + }, + { + "epoch": 0.8268662596021272, + "grad_norm": 1.3142699003219604, + "learning_rate": 1.53114951480118e-05, + "loss": 1.056, + "step": 23089 + }, + { + "epoch": 0.8269020717316955, + "grad_norm": 1.8608567714691162, + "learning_rate": 1.5305327641005584e-05, + "loss": 1.1576, + "step": 23090 + }, + { + "epoch": 0.8269378838612638, + "grad_norm": 1.4917863607406616, + "learning_rate": 1.5299161273456907e-05, + "loss": 1.1132, + "step": 23091 + }, + { + "epoch": 0.8269736959908321, + "grad_norm": 1.349442958831787, + "learning_rate": 1.529299604544876e-05, + "loss": 0.8353, + "step": 23092 + }, + { + "epoch": 0.8270095081204004, + "grad_norm": 1.537500262260437, + "learning_rate": 1.5286831957064095e-05, + "loss": 0.9634, + "step": 23093 + }, + { + "epoch": 0.8270453202499687, + "grad_norm": 1.36647367477417, + "learning_rate": 1.528066900838585e-05, + "loss": 1.0492, + "step": 23094 + }, + { + "epoch": 0.827081132379537, + "grad_norm": 1.3791654109954834, + "learning_rate": 1.5274507199496913e-05, + "loss": 1.0245, + "step": 23095 + }, + { + "epoch": 0.8271169445091052, + "grad_norm": 1.6936454772949219, + "learning_rate": 1.526834653048018e-05, + "loss": 1.0266, + "step": 23096 + }, + { + "epoch": 0.8271527566386735, + "grad_norm": 1.4851821660995483, + "learning_rate": 1.526218700141855e-05, + "loss": 1.0885, + "step": 23097 + }, + { + "epoch": 0.8271885687682418, + "grad_norm": 1.2637348175048828, + "learning_rate": 1.5256028612394913e-05, + "loss": 1.0094, + "step": 23098 + }, + { + "epoch": 0.8272243808978101, + "grad_norm": 1.1826752424240112, + "learning_rate": 1.5249871363492107e-05, + "loss": 0.9635, + "step": 23099 + }, + { + "epoch": 0.8272601930273784, + "grad_norm": 1.663088083267212, + "learning_rate": 1.5243715254792912e-05, + "loss": 1.1885, + "step": 23100 + }, + { + "epoch": 0.8272960051569467, + "grad_norm": 1.3895341157913208, + "learning_rate": 1.5237560286380247e-05, + "loss": 0.9091, + "step": 23101 + }, + { + "epoch": 0.827331817286515, + "grad_norm": 1.2828350067138672, + "learning_rate": 1.5231406458336839e-05, + "loss": 1.0316, + "step": 23102 + }, + { + "epoch": 0.8273676294160832, + "grad_norm": 1.8128145933151245, + "learning_rate": 1.5225253770745529e-05, + "loss": 1.2157, + "step": 23103 + }, + { + "epoch": 0.8274034415456515, + "grad_norm": 1.6340250968933105, + "learning_rate": 1.5219102223689074e-05, + "loss": 1.183, + "step": 23104 + }, + { + "epoch": 0.8274392536752198, + "grad_norm": 1.483026146888733, + "learning_rate": 1.5212951817250253e-05, + "loss": 1.0246, + "step": 23105 + }, + { + "epoch": 0.8274750658047881, + "grad_norm": 1.4601125717163086, + "learning_rate": 1.5206802551511778e-05, + "loss": 1.2907, + "step": 23106 + }, + { + "epoch": 0.8275108779343564, + "grad_norm": 1.449472427368164, + "learning_rate": 1.5200654426556405e-05, + "loss": 1.1102, + "step": 23107 + }, + { + "epoch": 0.8275466900639247, + "grad_norm": 1.6595700979232788, + "learning_rate": 1.5194507442466865e-05, + "loss": 1.0247, + "step": 23108 + }, + { + "epoch": 0.8275825021934929, + "grad_norm": 1.4470816850662231, + "learning_rate": 1.5188361599325817e-05, + "loss": 0.9943, + "step": 23109 + }, + { + "epoch": 0.8276183143230612, + "grad_norm": 1.3133087158203125, + "learning_rate": 1.5182216897215984e-05, + "loss": 1.1791, + "step": 23110 + }, + { + "epoch": 0.8276541264526295, + "grad_norm": 1.600827932357788, + "learning_rate": 1.5176073336219965e-05, + "loss": 0.9085, + "step": 23111 + }, + { + "epoch": 0.8276899385821977, + "grad_norm": 1.7004855871200562, + "learning_rate": 1.5169930916420516e-05, + "loss": 1.1239, + "step": 23112 + }, + { + "epoch": 0.8277257507117661, + "grad_norm": 1.3639249801635742, + "learning_rate": 1.5163789637900194e-05, + "loss": 1.0708, + "step": 23113 + }, + { + "epoch": 0.8277615628413344, + "grad_norm": 1.4786300659179688, + "learning_rate": 1.5157649500741678e-05, + "loss": 1.1198, + "step": 23114 + }, + { + "epoch": 0.8277973749709027, + "grad_norm": 1.8182084560394287, + "learning_rate": 1.5151510505027499e-05, + "loss": 1.0035, + "step": 23115 + }, + { + "epoch": 0.8278331871004709, + "grad_norm": 1.8850412368774414, + "learning_rate": 1.5145372650840361e-05, + "loss": 1.1431, + "step": 23116 + }, + { + "epoch": 0.8278689992300392, + "grad_norm": 1.2961938381195068, + "learning_rate": 1.5139235938262763e-05, + "loss": 1.1285, + "step": 23117 + }, + { + "epoch": 0.8279048113596075, + "grad_norm": 1.987143874168396, + "learning_rate": 1.513310036737724e-05, + "loss": 1.2461, + "step": 23118 + }, + { + "epoch": 0.8279406234891757, + "grad_norm": 1.491686463356018, + "learning_rate": 1.5126965938266436e-05, + "loss": 1.1867, + "step": 23119 + }, + { + "epoch": 0.8279764356187441, + "grad_norm": 1.4275392293930054, + "learning_rate": 1.5120832651012795e-05, + "loss": 1.0119, + "step": 23120 + }, + { + "epoch": 0.8280122477483124, + "grad_norm": 1.8827284574508667, + "learning_rate": 1.5114700505698886e-05, + "loss": 1.0662, + "step": 23121 + }, + { + "epoch": 0.8280480598778807, + "grad_norm": 1.2521617412567139, + "learning_rate": 1.5108569502407155e-05, + "loss": 0.9443, + "step": 23122 + }, + { + "epoch": 0.8280838720074489, + "grad_norm": 1.3085484504699707, + "learning_rate": 1.5102439641220156e-05, + "loss": 0.9842, + "step": 23123 + }, + { + "epoch": 0.8281196841370172, + "grad_norm": 1.3016207218170166, + "learning_rate": 1.5096310922220291e-05, + "loss": 1.2677, + "step": 23124 + }, + { + "epoch": 0.8281554962665855, + "grad_norm": 1.3866263628005981, + "learning_rate": 1.5090183345490084e-05, + "loss": 0.9207, + "step": 23125 + }, + { + "epoch": 0.8281913083961537, + "grad_norm": 1.4179295301437378, + "learning_rate": 1.50840569111119e-05, + "loss": 1.145, + "step": 23126 + }, + { + "epoch": 0.8282271205257221, + "grad_norm": 1.6329660415649414, + "learning_rate": 1.5077931619168196e-05, + "loss": 0.9463, + "step": 23127 + }, + { + "epoch": 0.8282629326552904, + "grad_norm": 1.2689945697784424, + "learning_rate": 1.5071807469741406e-05, + "loss": 1.1113, + "step": 23128 + }, + { + "epoch": 0.8282987447848587, + "grad_norm": 1.653818130493164, + "learning_rate": 1.5065684462913853e-05, + "loss": 1.0291, + "step": 23129 + }, + { + "epoch": 0.8283345569144269, + "grad_norm": 1.4561556577682495, + "learning_rate": 1.5059562598768007e-05, + "loss": 1.2133, + "step": 23130 + }, + { + "epoch": 0.8283703690439952, + "grad_norm": 1.7970203161239624, + "learning_rate": 1.5053441877386154e-05, + "loss": 1.1286, + "step": 23131 + }, + { + "epoch": 0.8284061811735635, + "grad_norm": 2.026989698410034, + "learning_rate": 1.5047322298850685e-05, + "loss": 0.9611, + "step": 23132 + }, + { + "epoch": 0.8284419933031317, + "grad_norm": 1.5707993507385254, + "learning_rate": 1.504120386324387e-05, + "loss": 1.1781, + "step": 23133 + }, + { + "epoch": 0.8284778054327001, + "grad_norm": 1.6175540685653687, + "learning_rate": 1.5035086570648115e-05, + "loss": 0.968, + "step": 23134 + }, + { + "epoch": 0.8285136175622684, + "grad_norm": 1.7065403461456299, + "learning_rate": 1.5028970421145684e-05, + "loss": 0.9527, + "step": 23135 + }, + { + "epoch": 0.8285494296918366, + "grad_norm": 1.6515320539474487, + "learning_rate": 1.5022855414818816e-05, + "loss": 1.0568, + "step": 23136 + }, + { + "epoch": 0.8285852418214049, + "grad_norm": 1.7296597957611084, + "learning_rate": 1.5016741551749813e-05, + "loss": 1.1304, + "step": 23137 + }, + { + "epoch": 0.8286210539509732, + "grad_norm": 1.6973326206207275, + "learning_rate": 1.5010628832020945e-05, + "loss": 1.2157, + "step": 23138 + }, + { + "epoch": 0.8286568660805415, + "grad_norm": 1.4919511079788208, + "learning_rate": 1.5004517255714456e-05, + "loss": 1.249, + "step": 23139 + }, + { + "epoch": 0.8286926782101097, + "grad_norm": 1.3093622922897339, + "learning_rate": 1.4998406822912525e-05, + "loss": 1.2008, + "step": 23140 + }, + { + "epoch": 0.8287284903396781, + "grad_norm": 1.7635793685913086, + "learning_rate": 1.4992297533697387e-05, + "loss": 1.1392, + "step": 23141 + }, + { + "epoch": 0.8287643024692464, + "grad_norm": 1.5192314386367798, + "learning_rate": 1.4986189388151229e-05, + "loss": 0.9875, + "step": 23142 + }, + { + "epoch": 0.8288001145988146, + "grad_norm": 1.3610553741455078, + "learning_rate": 1.4980082386356264e-05, + "loss": 0.9822, + "step": 23143 + }, + { + "epoch": 0.8288359267283829, + "grad_norm": 1.4612298011779785, + "learning_rate": 1.4973976528394596e-05, + "loss": 0.9035, + "step": 23144 + }, + { + "epoch": 0.8288717388579512, + "grad_norm": 2.007904052734375, + "learning_rate": 1.4967871814348399e-05, + "loss": 1.171, + "step": 23145 + }, + { + "epoch": 0.8289075509875194, + "grad_norm": 1.3683533668518066, + "learning_rate": 1.4961768244299823e-05, + "loss": 1.021, + "step": 23146 + }, + { + "epoch": 0.8289433631170877, + "grad_norm": 1.564653992652893, + "learning_rate": 1.4955665818330944e-05, + "loss": 1.1803, + "step": 23147 + }, + { + "epoch": 0.8289791752466561, + "grad_norm": 1.1709938049316406, + "learning_rate": 1.4949564536523874e-05, + "loss": 0.8775, + "step": 23148 + }, + { + "epoch": 0.8290149873762244, + "grad_norm": 1.598215937614441, + "learning_rate": 1.4943464398960716e-05, + "loss": 1.3015, + "step": 23149 + }, + { + "epoch": 0.8290507995057926, + "grad_norm": 1.7489148378372192, + "learning_rate": 1.4937365405723547e-05, + "loss": 1.0229, + "step": 23150 + }, + { + "epoch": 0.8290866116353609, + "grad_norm": 1.6985102891921997, + "learning_rate": 1.493126755689439e-05, + "loss": 1.0978, + "step": 23151 + }, + { + "epoch": 0.8291224237649292, + "grad_norm": 1.48262619972229, + "learning_rate": 1.4925170852555282e-05, + "loss": 0.9978, + "step": 23152 + }, + { + "epoch": 0.8291582358944974, + "grad_norm": 1.4496170282363892, + "learning_rate": 1.4919075292788298e-05, + "loss": 1.0012, + "step": 23153 + }, + { + "epoch": 0.8291940480240657, + "grad_norm": 1.2212090492248535, + "learning_rate": 1.4912980877675387e-05, + "loss": 0.9968, + "step": 23154 + }, + { + "epoch": 0.8292298601536341, + "grad_norm": 1.3442072868347168, + "learning_rate": 1.4906887607298548e-05, + "loss": 1.4429, + "step": 23155 + }, + { + "epoch": 0.8292656722832024, + "grad_norm": 1.43435800075531, + "learning_rate": 1.4900795481739793e-05, + "loss": 1.1735, + "step": 23156 + }, + { + "epoch": 0.8293014844127706, + "grad_norm": 1.2909083366394043, + "learning_rate": 1.4894704501081069e-05, + "loss": 1.1472, + "step": 23157 + }, + { + "epoch": 0.8293372965423389, + "grad_norm": 1.6717572212219238, + "learning_rate": 1.488861466540431e-05, + "loss": 1.0881, + "step": 23158 + }, + { + "epoch": 0.8293731086719072, + "grad_norm": 1.2774207592010498, + "learning_rate": 1.488252597479145e-05, + "loss": 0.9832, + "step": 23159 + }, + { + "epoch": 0.8294089208014754, + "grad_norm": 1.5668665170669556, + "learning_rate": 1.4876438429324414e-05, + "loss": 0.9523, + "step": 23160 + }, + { + "epoch": 0.8294447329310437, + "grad_norm": 1.860343098640442, + "learning_rate": 1.487035202908511e-05, + "loss": 1.2089, + "step": 23161 + }, + { + "epoch": 0.8294805450606121, + "grad_norm": 1.558165431022644, + "learning_rate": 1.4864266774155389e-05, + "loss": 1.0101, + "step": 23162 + }, + { + "epoch": 0.8295163571901804, + "grad_norm": 1.5541412830352783, + "learning_rate": 1.4858182664617148e-05, + "loss": 1.0033, + "step": 23163 + }, + { + "epoch": 0.8295521693197486, + "grad_norm": 1.5222283601760864, + "learning_rate": 1.4852099700552259e-05, + "loss": 1.0659, + "step": 23164 + }, + { + "epoch": 0.8295879814493169, + "grad_norm": 1.6831233501434326, + "learning_rate": 1.4846017882042506e-05, + "loss": 1.0865, + "step": 23165 + }, + { + "epoch": 0.8296237935788852, + "grad_norm": 1.5420210361480713, + "learning_rate": 1.4839937209169741e-05, + "loss": 1.0785, + "step": 23166 + }, + { + "epoch": 0.8296596057084534, + "grad_norm": 1.109666109085083, + "learning_rate": 1.4833857682015773e-05, + "loss": 1.0316, + "step": 23167 + }, + { + "epoch": 0.8296954178380217, + "grad_norm": 1.5056426525115967, + "learning_rate": 1.4827779300662425e-05, + "loss": 1.109, + "step": 23168 + }, + { + "epoch": 0.8297312299675901, + "grad_norm": 1.4193320274353027, + "learning_rate": 1.4821702065191413e-05, + "loss": 1.0395, + "step": 23169 + }, + { + "epoch": 0.8297670420971583, + "grad_norm": 1.3689073324203491, + "learning_rate": 1.4815625975684522e-05, + "loss": 1.0385, + "step": 23170 + }, + { + "epoch": 0.8298028542267266, + "grad_norm": 1.5107989311218262, + "learning_rate": 1.4809551032223534e-05, + "loss": 1.0983, + "step": 23171 + }, + { + "epoch": 0.8298386663562949, + "grad_norm": 1.2692121267318726, + "learning_rate": 1.480347723489013e-05, + "loss": 1.0841, + "step": 23172 + }, + { + "epoch": 0.8298744784858632, + "grad_norm": 1.3976737260818481, + "learning_rate": 1.4797404583766028e-05, + "loss": 0.814, + "step": 23173 + }, + { + "epoch": 0.8299102906154314, + "grad_norm": 1.753376841545105, + "learning_rate": 1.4791333078932956e-05, + "loss": 1.1379, + "step": 23174 + }, + { + "epoch": 0.8299461027449997, + "grad_norm": 1.557287335395813, + "learning_rate": 1.4785262720472615e-05, + "loss": 1.1076, + "step": 23175 + }, + { + "epoch": 0.829981914874568, + "grad_norm": 1.2976405620574951, + "learning_rate": 1.4779193508466604e-05, + "loss": 1.0157, + "step": 23176 + }, + { + "epoch": 0.8300177270041363, + "grad_norm": 1.4979219436645508, + "learning_rate": 1.4773125442996626e-05, + "loss": 1.1472, + "step": 23177 + }, + { + "epoch": 0.8300535391337046, + "grad_norm": 1.660601258277893, + "learning_rate": 1.4767058524144318e-05, + "loss": 1.0311, + "step": 23178 + }, + { + "epoch": 0.8300893512632729, + "grad_norm": 1.56390380859375, + "learning_rate": 1.476099275199131e-05, + "loss": 1.1017, + "step": 23179 + }, + { + "epoch": 0.8301251633928411, + "grad_norm": 1.5838783979415894, + "learning_rate": 1.4754928126619172e-05, + "loss": 1.0764, + "step": 23180 + }, + { + "epoch": 0.8301609755224094, + "grad_norm": 1.6898435354232788, + "learning_rate": 1.4748864648109518e-05, + "loss": 1.0258, + "step": 23181 + }, + { + "epoch": 0.8301967876519777, + "grad_norm": 1.5153089761734009, + "learning_rate": 1.4742802316543947e-05, + "loss": 1.069, + "step": 23182 + }, + { + "epoch": 0.830232599781546, + "grad_norm": 1.9368178844451904, + "learning_rate": 1.4736741132003984e-05, + "loss": 1.0907, + "step": 23183 + }, + { + "epoch": 0.8302684119111143, + "grad_norm": 1.5517377853393555, + "learning_rate": 1.4730681094571175e-05, + "loss": 1.1719, + "step": 23184 + }, + { + "epoch": 0.8303042240406826, + "grad_norm": 1.5919908285140991, + "learning_rate": 1.4724622204327066e-05, + "loss": 1.2759, + "step": 23185 + }, + { + "epoch": 0.8303400361702509, + "grad_norm": 1.3735811710357666, + "learning_rate": 1.471856446135319e-05, + "loss": 1.2382, + "step": 23186 + }, + { + "epoch": 0.8303758482998191, + "grad_norm": 1.474161148071289, + "learning_rate": 1.4712507865730996e-05, + "loss": 1.1614, + "step": 23187 + }, + { + "epoch": 0.8304116604293874, + "grad_norm": 1.5639803409576416, + "learning_rate": 1.4706452417542006e-05, + "loss": 1.0624, + "step": 23188 + }, + { + "epoch": 0.8304474725589557, + "grad_norm": 1.534462332725525, + "learning_rate": 1.4700398116867697e-05, + "loss": 0.953, + "step": 23189 + }, + { + "epoch": 0.830483284688524, + "grad_norm": 1.3585454225540161, + "learning_rate": 1.4694344963789474e-05, + "loss": 1.1002, + "step": 23190 + }, + { + "epoch": 0.8305190968180923, + "grad_norm": 1.4376837015151978, + "learning_rate": 1.4688292958388816e-05, + "loss": 0.925, + "step": 23191 + }, + { + "epoch": 0.8305549089476606, + "grad_norm": 1.3176040649414062, + "learning_rate": 1.4682242100747123e-05, + "loss": 1.1486, + "step": 23192 + }, + { + "epoch": 0.8305907210772289, + "grad_norm": 1.5518244504928589, + "learning_rate": 1.467619239094583e-05, + "loss": 1.0789, + "step": 23193 + }, + { + "epoch": 0.8306265332067971, + "grad_norm": 1.7346107959747314, + "learning_rate": 1.4670143829066296e-05, + "loss": 1.0885, + "step": 23194 + }, + { + "epoch": 0.8306623453363654, + "grad_norm": 1.5995951890945435, + "learning_rate": 1.4664096415189899e-05, + "loss": 1.145, + "step": 23195 + }, + { + "epoch": 0.8306981574659337, + "grad_norm": 1.512908935546875, + "learning_rate": 1.465805014939804e-05, + "loss": 1.0446, + "step": 23196 + }, + { + "epoch": 0.8307339695955019, + "grad_norm": 1.276509404182434, + "learning_rate": 1.465200503177201e-05, + "loss": 1.0606, + "step": 23197 + }, + { + "epoch": 0.8307697817250703, + "grad_norm": 1.7979191541671753, + "learning_rate": 1.4645961062393177e-05, + "loss": 1.0632, + "step": 23198 + }, + { + "epoch": 0.8308055938546386, + "grad_norm": 1.5503865480422974, + "learning_rate": 1.4639918241342798e-05, + "loss": 1.1557, + "step": 23199 + }, + { + "epoch": 0.8308414059842069, + "grad_norm": 1.2819461822509766, + "learning_rate": 1.4633876568702254e-05, + "loss": 0.9409, + "step": 23200 + }, + { + "epoch": 0.8308772181137751, + "grad_norm": 1.4497045278549194, + "learning_rate": 1.4627836044552767e-05, + "loss": 1.0218, + "step": 23201 + }, + { + "epoch": 0.8309130302433434, + "grad_norm": 2.29728102684021, + "learning_rate": 1.462179666897563e-05, + "loss": 1.0749, + "step": 23202 + }, + { + "epoch": 0.8309488423729117, + "grad_norm": 1.25211501121521, + "learning_rate": 1.4615758442052085e-05, + "loss": 0.9013, + "step": 23203 + }, + { + "epoch": 0.8309846545024799, + "grad_norm": 1.5708140134811401, + "learning_rate": 1.4609721363863393e-05, + "loss": 0.9407, + "step": 23204 + }, + { + "epoch": 0.8310204666320483, + "grad_norm": 1.5606625080108643, + "learning_rate": 1.4603685434490756e-05, + "loss": 0.9329, + "step": 23205 + }, + { + "epoch": 0.8310562787616166, + "grad_norm": 1.4825267791748047, + "learning_rate": 1.4597650654015327e-05, + "loss": 0.8911, + "step": 23206 + }, + { + "epoch": 0.8310920908911849, + "grad_norm": 1.5180240869522095, + "learning_rate": 1.45916170225184e-05, + "loss": 1.1964, + "step": 23207 + }, + { + "epoch": 0.8311279030207531, + "grad_norm": 1.8661020994186401, + "learning_rate": 1.4585584540081066e-05, + "loss": 1.0597, + "step": 23208 + }, + { + "epoch": 0.8311637151503214, + "grad_norm": 1.2166306972503662, + "learning_rate": 1.4579553206784546e-05, + "loss": 1.0228, + "step": 23209 + }, + { + "epoch": 0.8311995272798897, + "grad_norm": 1.3919086456298828, + "learning_rate": 1.45735230227099e-05, + "loss": 1.0944, + "step": 23210 + }, + { + "epoch": 0.8312353394094579, + "grad_norm": 1.8115664720535278, + "learning_rate": 1.4567493987938364e-05, + "loss": 1.1321, + "step": 23211 + }, + { + "epoch": 0.8312711515390263, + "grad_norm": 1.3497765064239502, + "learning_rate": 1.456146610255097e-05, + "loss": 1.0495, + "step": 23212 + }, + { + "epoch": 0.8313069636685946, + "grad_norm": 1.9741321802139282, + "learning_rate": 1.4555439366628843e-05, + "loss": 1.0319, + "step": 23213 + }, + { + "epoch": 0.8313427757981628, + "grad_norm": 1.3797669410705566, + "learning_rate": 1.4549413780253085e-05, + "loss": 1.0267, + "step": 23214 + }, + { + "epoch": 0.8313785879277311, + "grad_norm": 1.6463035345077515, + "learning_rate": 1.454338934350472e-05, + "loss": 1.3027, + "step": 23215 + }, + { + "epoch": 0.8314144000572994, + "grad_norm": 1.2371196746826172, + "learning_rate": 1.453736605646484e-05, + "loss": 1.1146, + "step": 23216 + }, + { + "epoch": 0.8314502121868677, + "grad_norm": 1.6827144622802734, + "learning_rate": 1.4531343919214414e-05, + "loss": 1.0239, + "step": 23217 + }, + { + "epoch": 0.8314860243164359, + "grad_norm": 1.261766791343689, + "learning_rate": 1.4525322931834562e-05, + "loss": 0.8757, + "step": 23218 + }, + { + "epoch": 0.8315218364460043, + "grad_norm": 1.5700838565826416, + "learning_rate": 1.4519303094406211e-05, + "loss": 1.1841, + "step": 23219 + }, + { + "epoch": 0.8315576485755726, + "grad_norm": 1.588816523551941, + "learning_rate": 1.4513284407010385e-05, + "loss": 1.1671, + "step": 23220 + }, + { + "epoch": 0.8315934607051408, + "grad_norm": 1.7070367336273193, + "learning_rate": 1.450726686972802e-05, + "loss": 1.14, + "step": 23221 + }, + { + "epoch": 0.8316292728347091, + "grad_norm": 1.2877641916275024, + "learning_rate": 1.4501250482640139e-05, + "loss": 1.1271, + "step": 23222 + }, + { + "epoch": 0.8316650849642774, + "grad_norm": 1.821668267250061, + "learning_rate": 1.4495235245827642e-05, + "loss": 0.9746, + "step": 23223 + }, + { + "epoch": 0.8317008970938456, + "grad_norm": 2.9545435905456543, + "learning_rate": 1.4489221159371447e-05, + "loss": 1.0636, + "step": 23224 + }, + { + "epoch": 0.8317367092234139, + "grad_norm": 1.5329606533050537, + "learning_rate": 1.4483208223352474e-05, + "loss": 1.1565, + "step": 23225 + }, + { + "epoch": 0.8317725213529823, + "grad_norm": 1.5588968992233276, + "learning_rate": 1.4477196437851625e-05, + "loss": 1.1592, + "step": 23226 + }, + { + "epoch": 0.8318083334825506, + "grad_norm": 1.6067768335342407, + "learning_rate": 1.4471185802949816e-05, + "loss": 1.2566, + "step": 23227 + }, + { + "epoch": 0.8318441456121188, + "grad_norm": 1.3369001150131226, + "learning_rate": 1.4465176318727825e-05, + "loss": 1.1946, + "step": 23228 + }, + { + "epoch": 0.8318799577416871, + "grad_norm": 1.6263912916183472, + "learning_rate": 1.4459167985266597e-05, + "loss": 1.2178, + "step": 23229 + }, + { + "epoch": 0.8319157698712554, + "grad_norm": 1.32587730884552, + "learning_rate": 1.4453160802646903e-05, + "loss": 1.0532, + "step": 23230 + }, + { + "epoch": 0.8319515820008236, + "grad_norm": 1.5135700702667236, + "learning_rate": 1.444715477094961e-05, + "loss": 1.0527, + "step": 23231 + }, + { + "epoch": 0.8319873941303919, + "grad_norm": 1.3283040523529053, + "learning_rate": 1.4441149890255467e-05, + "loss": 0.9955, + "step": 23232 + }, + { + "epoch": 0.8320232062599603, + "grad_norm": 1.51017427444458, + "learning_rate": 1.4435146160645285e-05, + "loss": 1.0537, + "step": 23233 + }, + { + "epoch": 0.8320590183895286, + "grad_norm": 1.378475308418274, + "learning_rate": 1.4429143582199866e-05, + "loss": 1.1572, + "step": 23234 + }, + { + "epoch": 0.8320948305190968, + "grad_norm": 1.2268171310424805, + "learning_rate": 1.4423142154999925e-05, + "loss": 0.8876, + "step": 23235 + }, + { + "epoch": 0.8321306426486651, + "grad_norm": 1.3828442096710205, + "learning_rate": 1.4417141879126218e-05, + "loss": 1.2174, + "step": 23236 + }, + { + "epoch": 0.8321664547782334, + "grad_norm": 1.4938738346099854, + "learning_rate": 1.4411142754659468e-05, + "loss": 0.9959, + "step": 23237 + }, + { + "epoch": 0.8322022669078016, + "grad_norm": 1.5539429187774658, + "learning_rate": 1.4405144781680424e-05, + "loss": 0.931, + "step": 23238 + }, + { + "epoch": 0.8322380790373699, + "grad_norm": 1.190860629081726, + "learning_rate": 1.4399147960269688e-05, + "loss": 1.2005, + "step": 23239 + }, + { + "epoch": 0.8322738911669383, + "grad_norm": 1.4099146127700806, + "learning_rate": 1.439315229050805e-05, + "loss": 1.1316, + "step": 23240 + }, + { + "epoch": 0.8323097032965066, + "grad_norm": 1.608985185623169, + "learning_rate": 1.4387157772476134e-05, + "loss": 1.0992, + "step": 23241 + }, + { + "epoch": 0.8323455154260748, + "grad_norm": 1.4414236545562744, + "learning_rate": 1.4381164406254544e-05, + "loss": 1.0768, + "step": 23242 + }, + { + "epoch": 0.8323813275556431, + "grad_norm": 1.333376407623291, + "learning_rate": 1.4375172191923947e-05, + "loss": 1.049, + "step": 23243 + }, + { + "epoch": 0.8324171396852114, + "grad_norm": 1.3636070489883423, + "learning_rate": 1.4369181129564957e-05, + "loss": 1.1171, + "step": 23244 + }, + { + "epoch": 0.8324529518147796, + "grad_norm": 1.5453938245773315, + "learning_rate": 1.4363191219258209e-05, + "loss": 1.0323, + "step": 23245 + }, + { + "epoch": 0.8324887639443479, + "grad_norm": 1.3445589542388916, + "learning_rate": 1.4357202461084229e-05, + "loss": 0.9789, + "step": 23246 + }, + { + "epoch": 0.8325245760739163, + "grad_norm": 1.5757163763046265, + "learning_rate": 1.4351214855123629e-05, + "loss": 0.9644, + "step": 23247 + }, + { + "epoch": 0.8325603882034845, + "grad_norm": 1.316070556640625, + "learning_rate": 1.4345228401456945e-05, + "loss": 0.9706, + "step": 23248 + }, + { + "epoch": 0.8325962003330528, + "grad_norm": 1.4751466512680054, + "learning_rate": 1.4339243100164757e-05, + "loss": 1.2982, + "step": 23249 + }, + { + "epoch": 0.8326320124626211, + "grad_norm": 1.351641297340393, + "learning_rate": 1.4333258951327534e-05, + "loss": 0.9787, + "step": 23250 + }, + { + "epoch": 0.8326678245921894, + "grad_norm": 1.6444677114486694, + "learning_rate": 1.4327275955025798e-05, + "loss": 0.8947, + "step": 23251 + }, + { + "epoch": 0.8327036367217576, + "grad_norm": 1.4763014316558838, + "learning_rate": 1.4321294111340089e-05, + "loss": 1.0301, + "step": 23252 + }, + { + "epoch": 0.8327394488513259, + "grad_norm": 1.5425264835357666, + "learning_rate": 1.4315313420350829e-05, + "loss": 1.0015, + "step": 23253 + }, + { + "epoch": 0.8327752609808943, + "grad_norm": 1.5496920347213745, + "learning_rate": 1.4309333882138488e-05, + "loss": 1.1169, + "step": 23254 + }, + { + "epoch": 0.8328110731104625, + "grad_norm": 1.4692872762680054, + "learning_rate": 1.4303355496783544e-05, + "loss": 1.0102, + "step": 23255 + }, + { + "epoch": 0.8328468852400308, + "grad_norm": 1.37952721118927, + "learning_rate": 1.4297378264366423e-05, + "loss": 1.1659, + "step": 23256 + }, + { + "epoch": 0.8328826973695991, + "grad_norm": 1.3316317796707153, + "learning_rate": 1.4291402184967507e-05, + "loss": 1.0915, + "step": 23257 + }, + { + "epoch": 0.8329185094991673, + "grad_norm": 1.8107749223709106, + "learning_rate": 1.4285427258667217e-05, + "loss": 1.2329, + "step": 23258 + }, + { + "epoch": 0.8329543216287356, + "grad_norm": 1.6767070293426514, + "learning_rate": 1.4279453485545968e-05, + "loss": 1.3762, + "step": 23259 + }, + { + "epoch": 0.8329901337583039, + "grad_norm": 1.4005522727966309, + "learning_rate": 1.4273480865684074e-05, + "loss": 1.0152, + "step": 23260 + }, + { + "epoch": 0.8330259458878723, + "grad_norm": 1.592882513999939, + "learning_rate": 1.4267509399161916e-05, + "loss": 0.9906, + "step": 23261 + }, + { + "epoch": 0.8330617580174405, + "grad_norm": 1.4321086406707764, + "learning_rate": 1.4261539086059839e-05, + "loss": 1.1208, + "step": 23262 + }, + { + "epoch": 0.8330975701470088, + "grad_norm": 1.6055151224136353, + "learning_rate": 1.4255569926458168e-05, + "loss": 1.043, + "step": 23263 + }, + { + "epoch": 0.8331333822765771, + "grad_norm": 1.5664730072021484, + "learning_rate": 1.4249601920437194e-05, + "loss": 1.0392, + "step": 23264 + }, + { + "epoch": 0.8331691944061453, + "grad_norm": 1.3948012590408325, + "learning_rate": 1.42436350680772e-05, + "loss": 1.2882, + "step": 23265 + }, + { + "epoch": 0.8332050065357136, + "grad_norm": 1.5331815481185913, + "learning_rate": 1.4237669369458495e-05, + "loss": 1.0988, + "step": 23266 + }, + { + "epoch": 0.8332408186652819, + "grad_norm": 2.0924289226531982, + "learning_rate": 1.4231704824661329e-05, + "loss": 1.1403, + "step": 23267 + }, + { + "epoch": 0.8332766307948503, + "grad_norm": 1.4293618202209473, + "learning_rate": 1.4225741433765927e-05, + "loss": 1.0396, + "step": 23268 + }, + { + "epoch": 0.8333124429244185, + "grad_norm": 1.3997350931167603, + "learning_rate": 1.4219779196852534e-05, + "loss": 1.109, + "step": 23269 + }, + { + "epoch": 0.8333482550539868, + "grad_norm": 1.2367545366287231, + "learning_rate": 1.4213818114001387e-05, + "loss": 0.8524, + "step": 23270 + }, + { + "epoch": 0.8333840671835551, + "grad_norm": 1.6644724607467651, + "learning_rate": 1.4207858185292643e-05, + "loss": 1.0264, + "step": 23271 + }, + { + "epoch": 0.8334198793131233, + "grad_norm": 1.7070401906967163, + "learning_rate": 1.4201899410806496e-05, + "loss": 1.1486, + "step": 23272 + }, + { + "epoch": 0.8334556914426916, + "grad_norm": 1.4740922451019287, + "learning_rate": 1.4195941790623124e-05, + "loss": 1.1652, + "step": 23273 + }, + { + "epoch": 0.8334915035722599, + "grad_norm": 1.7005826234817505, + "learning_rate": 1.4189985324822697e-05, + "loss": 1.3406, + "step": 23274 + }, + { + "epoch": 0.8335273157018283, + "grad_norm": 1.3647109270095825, + "learning_rate": 1.4184030013485305e-05, + "loss": 1.1209, + "step": 23275 + }, + { + "epoch": 0.8335631278313965, + "grad_norm": 1.4063689708709717, + "learning_rate": 1.4178075856691097e-05, + "loss": 1.0055, + "step": 23276 + }, + { + "epoch": 0.8335989399609648, + "grad_norm": 1.5727990865707397, + "learning_rate": 1.4172122854520198e-05, + "loss": 1.3085, + "step": 23277 + }, + { + "epoch": 0.8336347520905331, + "grad_norm": 1.428337812423706, + "learning_rate": 1.4166171007052653e-05, + "loss": 0.8727, + "step": 23278 + }, + { + "epoch": 0.8336705642201013, + "grad_norm": 1.726276159286499, + "learning_rate": 1.4160220314368555e-05, + "loss": 1.2208, + "step": 23279 + }, + { + "epoch": 0.8337063763496696, + "grad_norm": 1.4317907094955444, + "learning_rate": 1.4154270776547974e-05, + "loss": 0.9385, + "step": 23280 + }, + { + "epoch": 0.8337421884792379, + "grad_norm": 2.6272833347320557, + "learning_rate": 1.4148322393670976e-05, + "loss": 1.226, + "step": 23281 + }, + { + "epoch": 0.8337780006088062, + "grad_norm": 1.981170892715454, + "learning_rate": 1.4142375165817523e-05, + "loss": 1.0637, + "step": 23282 + }, + { + "epoch": 0.8338138127383745, + "grad_norm": 1.655245304107666, + "learning_rate": 1.413642909306767e-05, + "loss": 1.0984, + "step": 23283 + }, + { + "epoch": 0.8338496248679428, + "grad_norm": 1.257720708847046, + "learning_rate": 1.4130484175501435e-05, + "loss": 0.9398, + "step": 23284 + }, + { + "epoch": 0.833885436997511, + "grad_norm": 2.9790337085723877, + "learning_rate": 1.412454041319874e-05, + "loss": 1.3893, + "step": 23285 + }, + { + "epoch": 0.8339212491270793, + "grad_norm": 2.237494468688965, + "learning_rate": 1.4118597806239585e-05, + "loss": 1.1509, + "step": 23286 + }, + { + "epoch": 0.8339570612566476, + "grad_norm": 1.5313278436660767, + "learning_rate": 1.4112656354703924e-05, + "loss": 0.9516, + "step": 23287 + }, + { + "epoch": 0.8339928733862159, + "grad_norm": 1.1697794198989868, + "learning_rate": 1.41067160586717e-05, + "loss": 1.0266, + "step": 23288 + }, + { + "epoch": 0.8340286855157842, + "grad_norm": 1.6510088443756104, + "learning_rate": 1.4100776918222802e-05, + "loss": 0.9994, + "step": 23289 + }, + { + "epoch": 0.8340644976453525, + "grad_norm": 1.2758656740188599, + "learning_rate": 1.4094838933437138e-05, + "loss": 1.0694, + "step": 23290 + }, + { + "epoch": 0.8341003097749208, + "grad_norm": 1.7262330055236816, + "learning_rate": 1.4088902104394607e-05, + "loss": 1.2504, + "step": 23291 + }, + { + "epoch": 0.834136121904489, + "grad_norm": 1.4066323041915894, + "learning_rate": 1.4082966431175116e-05, + "loss": 0.8955, + "step": 23292 + }, + { + "epoch": 0.8341719340340573, + "grad_norm": 1.3830604553222656, + "learning_rate": 1.4077031913858474e-05, + "loss": 1.1303, + "step": 23293 + }, + { + "epoch": 0.8342077461636256, + "grad_norm": 1.3598167896270752, + "learning_rate": 1.4071098552524497e-05, + "loss": 0.9679, + "step": 23294 + }, + { + "epoch": 0.8342435582931939, + "grad_norm": 1.7535433769226074, + "learning_rate": 1.4065166347253089e-05, + "loss": 0.998, + "step": 23295 + }, + { + "epoch": 0.8342793704227622, + "grad_norm": 1.834324598312378, + "learning_rate": 1.4059235298124006e-05, + "loss": 1.0612, + "step": 23296 + }, + { + "epoch": 0.8343151825523305, + "grad_norm": 1.5003010034561157, + "learning_rate": 1.4053305405217044e-05, + "loss": 1.1822, + "step": 23297 + }, + { + "epoch": 0.8343509946818988, + "grad_norm": 1.2753028869628906, + "learning_rate": 1.4047376668612e-05, + "loss": 1.0151, + "step": 23298 + }, + { + "epoch": 0.834386806811467, + "grad_norm": 1.4854707717895508, + "learning_rate": 1.4041449088388659e-05, + "loss": 0.8399, + "step": 23299 + }, + { + "epoch": 0.8344226189410353, + "grad_norm": 1.4539889097213745, + "learning_rate": 1.4035522664626721e-05, + "loss": 1.1323, + "step": 23300 + }, + { + "epoch": 0.8344584310706036, + "grad_norm": 4.2321367263793945, + "learning_rate": 1.4029597397405925e-05, + "loss": 1.3118, + "step": 23301 + }, + { + "epoch": 0.8344942432001718, + "grad_norm": 1.5025477409362793, + "learning_rate": 1.4023673286806039e-05, + "loss": 0.885, + "step": 23302 + }, + { + "epoch": 0.8345300553297402, + "grad_norm": 1.450788974761963, + "learning_rate": 1.4017750332906698e-05, + "loss": 1.2245, + "step": 23303 + }, + { + "epoch": 0.8345658674593085, + "grad_norm": 1.599503993988037, + "learning_rate": 1.4011828535787642e-05, + "loss": 1.1788, + "step": 23304 + }, + { + "epoch": 0.8346016795888768, + "grad_norm": 1.6280145645141602, + "learning_rate": 1.400590789552847e-05, + "loss": 1.0353, + "step": 23305 + }, + { + "epoch": 0.834637491718445, + "grad_norm": 1.797907829284668, + "learning_rate": 1.3999988412208931e-05, + "loss": 1.0687, + "step": 23306 + }, + { + "epoch": 0.8346733038480133, + "grad_norm": 1.331120252609253, + "learning_rate": 1.3994070085908596e-05, + "loss": 1.0972, + "step": 23307 + }, + { + "epoch": 0.8347091159775816, + "grad_norm": 1.722778558731079, + "learning_rate": 1.3988152916707121e-05, + "loss": 1.0251, + "step": 23308 + }, + { + "epoch": 0.8347449281071498, + "grad_norm": 1.5352424383163452, + "learning_rate": 1.3982236904684064e-05, + "loss": 0.9941, + "step": 23309 + }, + { + "epoch": 0.8347807402367182, + "grad_norm": 1.3119540214538574, + "learning_rate": 1.3976322049919088e-05, + "loss": 1.1223, + "step": 23310 + }, + { + "epoch": 0.8348165523662865, + "grad_norm": 1.3786312341690063, + "learning_rate": 1.3970408352491749e-05, + "loss": 1.1049, + "step": 23311 + }, + { + "epoch": 0.8348523644958548, + "grad_norm": 1.7015657424926758, + "learning_rate": 1.3964495812481548e-05, + "loss": 1.1452, + "step": 23312 + }, + { + "epoch": 0.834888176625423, + "grad_norm": 1.455676794052124, + "learning_rate": 1.3958584429968124e-05, + "loss": 1.0952, + "step": 23313 + }, + { + "epoch": 0.8349239887549913, + "grad_norm": 1.5539082288742065, + "learning_rate": 1.3952674205030935e-05, + "loss": 0.915, + "step": 23314 + }, + { + "epoch": 0.8349598008845596, + "grad_norm": 1.5700780153274536, + "learning_rate": 1.394676513774954e-05, + "loss": 1.2216, + "step": 23315 + }, + { + "epoch": 0.8349956130141278, + "grad_norm": 1.4141377210617065, + "learning_rate": 1.3940857228203386e-05, + "loss": 1.0281, + "step": 23316 + }, + { + "epoch": 0.8350314251436962, + "grad_norm": 1.4168528318405151, + "learning_rate": 1.393495047647202e-05, + "loss": 1.0894, + "step": 23317 + }, + { + "epoch": 0.8350672372732645, + "grad_norm": 1.5812331438064575, + "learning_rate": 1.3929044882634867e-05, + "loss": 1.1336, + "step": 23318 + }, + { + "epoch": 0.8351030494028328, + "grad_norm": 1.4389923810958862, + "learning_rate": 1.3923140446771409e-05, + "loss": 1.3579, + "step": 23319 + }, + { + "epoch": 0.835138861532401, + "grad_norm": 1.4549564123153687, + "learning_rate": 1.3917237168961051e-05, + "loss": 1.0168, + "step": 23320 + }, + { + "epoch": 0.8351746736619693, + "grad_norm": 1.5111870765686035, + "learning_rate": 1.3911335049283225e-05, + "loss": 1.1724, + "step": 23321 + }, + { + "epoch": 0.8352104857915376, + "grad_norm": 1.3311371803283691, + "learning_rate": 1.390543408781736e-05, + "loss": 1.1574, + "step": 23322 + }, + { + "epoch": 0.8352462979211058, + "grad_norm": 1.3755367994308472, + "learning_rate": 1.3899534284642779e-05, + "loss": 0.8568, + "step": 23323 + }, + { + "epoch": 0.8352821100506742, + "grad_norm": 1.519187331199646, + "learning_rate": 1.3893635639838942e-05, + "loss": 1.0101, + "step": 23324 + }, + { + "epoch": 0.8353179221802425, + "grad_norm": 1.519086480140686, + "learning_rate": 1.3887738153485153e-05, + "loss": 1.1787, + "step": 23325 + }, + { + "epoch": 0.8353537343098107, + "grad_norm": 1.400217890739441, + "learning_rate": 1.388184182566079e-05, + "loss": 1.2161, + "step": 23326 + }, + { + "epoch": 0.835389546439379, + "grad_norm": 1.6525676250457764, + "learning_rate": 1.3875946656445126e-05, + "loss": 1.0617, + "step": 23327 + }, + { + "epoch": 0.8354253585689473, + "grad_norm": 1.3070260286331177, + "learning_rate": 1.3870052645917542e-05, + "loss": 1.0705, + "step": 23328 + }, + { + "epoch": 0.8354611706985156, + "grad_norm": 1.4356175661087036, + "learning_rate": 1.3864159794157305e-05, + "loss": 0.71, + "step": 23329 + }, + { + "epoch": 0.8354969828280838, + "grad_norm": 1.9601632356643677, + "learning_rate": 1.3858268101243666e-05, + "loss": 1.1281, + "step": 23330 + }, + { + "epoch": 0.8355327949576522, + "grad_norm": 1.8830996751785278, + "learning_rate": 1.3852377567255913e-05, + "loss": 1.0839, + "step": 23331 + }, + { + "epoch": 0.8355686070872205, + "grad_norm": 1.5979187488555908, + "learning_rate": 1.3846488192273298e-05, + "loss": 1.2638, + "step": 23332 + }, + { + "epoch": 0.8356044192167887, + "grad_norm": 1.3006610870361328, + "learning_rate": 1.384059997637508e-05, + "loss": 1.1386, + "step": 23333 + }, + { + "epoch": 0.835640231346357, + "grad_norm": 1.35885751247406, + "learning_rate": 1.3834712919640424e-05, + "loss": 0.8733, + "step": 23334 + }, + { + "epoch": 0.8356760434759253, + "grad_norm": 1.4248305559158325, + "learning_rate": 1.382882702214856e-05, + "loss": 1.1296, + "step": 23335 + }, + { + "epoch": 0.8357118556054935, + "grad_norm": 1.9937503337860107, + "learning_rate": 1.382294228397868e-05, + "loss": 1.3766, + "step": 23336 + }, + { + "epoch": 0.8357476677350618, + "grad_norm": 1.2479572296142578, + "learning_rate": 1.3817058705209973e-05, + "loss": 1.1956, + "step": 23337 + }, + { + "epoch": 0.8357834798646302, + "grad_norm": 1.3860275745391846, + "learning_rate": 1.3811176285921557e-05, + "loss": 1.0323, + "step": 23338 + }, + { + "epoch": 0.8358192919941985, + "grad_norm": 1.6404602527618408, + "learning_rate": 1.3805295026192577e-05, + "loss": 1.0133, + "step": 23339 + }, + { + "epoch": 0.8358551041237667, + "grad_norm": 1.470441222190857, + "learning_rate": 1.3799414926102194e-05, + "loss": 0.9278, + "step": 23340 + }, + { + "epoch": 0.835890916253335, + "grad_norm": 1.3731437921524048, + "learning_rate": 1.3793535985729478e-05, + "loss": 1.2238, + "step": 23341 + }, + { + "epoch": 0.8359267283829033, + "grad_norm": 1.2877256870269775, + "learning_rate": 1.3787658205153532e-05, + "loss": 0.9795, + "step": 23342 + }, + { + "epoch": 0.8359625405124715, + "grad_norm": 1.4622790813446045, + "learning_rate": 1.3781781584453435e-05, + "loss": 1.0957, + "step": 23343 + }, + { + "epoch": 0.8359983526420398, + "grad_norm": 1.2930597066879272, + "learning_rate": 1.3775906123708282e-05, + "loss": 0.941, + "step": 23344 + }, + { + "epoch": 0.8360341647716082, + "grad_norm": 1.5638381242752075, + "learning_rate": 1.3770031822997064e-05, + "loss": 1.0158, + "step": 23345 + }, + { + "epoch": 0.8360699769011765, + "grad_norm": 1.332075834274292, + "learning_rate": 1.3764158682398843e-05, + "loss": 0.9801, + "step": 23346 + }, + { + "epoch": 0.8361057890307447, + "grad_norm": 1.2791297435760498, + "learning_rate": 1.3758286701992651e-05, + "loss": 1.0753, + "step": 23347 + }, + { + "epoch": 0.836141601160313, + "grad_norm": 1.1944435834884644, + "learning_rate": 1.375241588185744e-05, + "loss": 1.0338, + "step": 23348 + }, + { + "epoch": 0.8361774132898813, + "grad_norm": 1.316946029663086, + "learning_rate": 1.3746546222072232e-05, + "loss": 0.8486, + "step": 23349 + }, + { + "epoch": 0.8362132254194495, + "grad_norm": 1.5333380699157715, + "learning_rate": 1.3740677722715977e-05, + "loss": 1.2001, + "step": 23350 + }, + { + "epoch": 0.8362490375490178, + "grad_norm": 1.2865692377090454, + "learning_rate": 1.373481038386767e-05, + "loss": 1.1175, + "step": 23351 + }, + { + "epoch": 0.8362848496785862, + "grad_norm": 1.6588964462280273, + "learning_rate": 1.3728944205606186e-05, + "loss": 0.9948, + "step": 23352 + }, + { + "epoch": 0.8363206618081545, + "grad_norm": 1.6304956674575806, + "learning_rate": 1.3723079188010469e-05, + "loss": 1.0802, + "step": 23353 + }, + { + "epoch": 0.8363564739377227, + "grad_norm": 1.4208273887634277, + "learning_rate": 1.3717215331159439e-05, + "loss": 1.0354, + "step": 23354 + }, + { + "epoch": 0.836392286067291, + "grad_norm": 1.2237590551376343, + "learning_rate": 1.3711352635132002e-05, + "loss": 0.8485, + "step": 23355 + }, + { + "epoch": 0.8364280981968593, + "grad_norm": 1.2668442726135254, + "learning_rate": 1.3705491100006995e-05, + "loss": 1.0204, + "step": 23356 + }, + { + "epoch": 0.8364639103264275, + "grad_norm": 1.3714600801467896, + "learning_rate": 1.3699630725863289e-05, + "loss": 1.0218, + "step": 23357 + }, + { + "epoch": 0.8364997224559958, + "grad_norm": 1.5734294652938843, + "learning_rate": 1.3693771512779752e-05, + "loss": 1.2152, + "step": 23358 + }, + { + "epoch": 0.8365355345855642, + "grad_norm": 1.2551730871200562, + "learning_rate": 1.3687913460835167e-05, + "loss": 1.2572, + "step": 23359 + }, + { + "epoch": 0.8365713467151324, + "grad_norm": 1.416684627532959, + "learning_rate": 1.3682056570108382e-05, + "loss": 1.1255, + "step": 23360 + }, + { + "epoch": 0.8366071588447007, + "grad_norm": 1.3098716735839844, + "learning_rate": 1.3676200840678167e-05, + "loss": 0.9471, + "step": 23361 + }, + { + "epoch": 0.836642970974269, + "grad_norm": 1.5186715126037598, + "learning_rate": 1.3670346272623357e-05, + "loss": 0.9437, + "step": 23362 + }, + { + "epoch": 0.8366787831038373, + "grad_norm": 1.4308116436004639, + "learning_rate": 1.366449286602265e-05, + "loss": 1.0686, + "step": 23363 + }, + { + "epoch": 0.8367145952334055, + "grad_norm": 1.6845229864120483, + "learning_rate": 1.3658640620954832e-05, + "loss": 1.096, + "step": 23364 + }, + { + "epoch": 0.8367504073629738, + "grad_norm": 1.317466139793396, + "learning_rate": 1.3652789537498656e-05, + "loss": 1.0994, + "step": 23365 + }, + { + "epoch": 0.8367862194925422, + "grad_norm": 2.2176644802093506, + "learning_rate": 1.364693961573279e-05, + "loss": 1.0559, + "step": 23366 + }, + { + "epoch": 0.8368220316221104, + "grad_norm": 1.3696104288101196, + "learning_rate": 1.3641090855735972e-05, + "loss": 0.8666, + "step": 23367 + }, + { + "epoch": 0.8368578437516787, + "grad_norm": 1.4169760942459106, + "learning_rate": 1.3635243257586872e-05, + "loss": 1.1097, + "step": 23368 + }, + { + "epoch": 0.836893655881247, + "grad_norm": 1.5538034439086914, + "learning_rate": 1.3629396821364193e-05, + "loss": 1.0802, + "step": 23369 + }, + { + "epoch": 0.8369294680108152, + "grad_norm": 2.4003031253814697, + "learning_rate": 1.3623551547146552e-05, + "loss": 1.2149, + "step": 23370 + }, + { + "epoch": 0.8369652801403835, + "grad_norm": 1.406350016593933, + "learning_rate": 1.3617707435012606e-05, + "loss": 1.0214, + "step": 23371 + }, + { + "epoch": 0.8370010922699518, + "grad_norm": 1.8647444248199463, + "learning_rate": 1.3611864485040982e-05, + "loss": 1.2061, + "step": 23372 + }, + { + "epoch": 0.8370369043995202, + "grad_norm": 1.492708444595337, + "learning_rate": 1.3606022697310316e-05, + "loss": 1.1901, + "step": 23373 + }, + { + "epoch": 0.8370727165290884, + "grad_norm": 1.4116966724395752, + "learning_rate": 1.3600182071899148e-05, + "loss": 1.1725, + "step": 23374 + }, + { + "epoch": 0.8371085286586567, + "grad_norm": 1.5870013236999512, + "learning_rate": 1.359434260888608e-05, + "loss": 1.046, + "step": 23375 + }, + { + "epoch": 0.837144340788225, + "grad_norm": 1.5716376304626465, + "learning_rate": 1.3588504308349703e-05, + "loss": 0.9697, + "step": 23376 + }, + { + "epoch": 0.8371801529177932, + "grad_norm": 1.4855120182037354, + "learning_rate": 1.3582667170368513e-05, + "loss": 1.1437, + "step": 23377 + }, + { + "epoch": 0.8372159650473615, + "grad_norm": 1.3794357776641846, + "learning_rate": 1.3576831195021067e-05, + "loss": 1.1885, + "step": 23378 + }, + { + "epoch": 0.8372517771769298, + "grad_norm": 1.9156217575073242, + "learning_rate": 1.357099638238587e-05, + "loss": 1.1101, + "step": 23379 + }, + { + "epoch": 0.8372875893064982, + "grad_norm": 1.438919186592102, + "learning_rate": 1.356516273254147e-05, + "loss": 0.9367, + "step": 23380 + }, + { + "epoch": 0.8373234014360664, + "grad_norm": 1.618557095527649, + "learning_rate": 1.3559330245566282e-05, + "loss": 0.9957, + "step": 23381 + }, + { + "epoch": 0.8373592135656347, + "grad_norm": 1.2857495546340942, + "learning_rate": 1.3553498921538798e-05, + "loss": 1.0567, + "step": 23382 + }, + { + "epoch": 0.837395025695203, + "grad_norm": 1.4861575365066528, + "learning_rate": 1.3547668760537514e-05, + "loss": 1.1249, + "step": 23383 + }, + { + "epoch": 0.8374308378247712, + "grad_norm": 2.0529685020446777, + "learning_rate": 1.3541839762640796e-05, + "loss": 1.0595, + "step": 23384 + }, + { + "epoch": 0.8374666499543395, + "grad_norm": 1.7170103788375854, + "learning_rate": 1.3536011927927117e-05, + "loss": 1.1284, + "step": 23385 + }, + { + "epoch": 0.8375024620839078, + "grad_norm": 1.6626882553100586, + "learning_rate": 1.3530185256474848e-05, + "loss": 1.0501, + "step": 23386 + }, + { + "epoch": 0.8375382742134762, + "grad_norm": 1.4271864891052246, + "learning_rate": 1.3524359748362437e-05, + "loss": 1.0404, + "step": 23387 + }, + { + "epoch": 0.8375740863430444, + "grad_norm": 1.6812299489974976, + "learning_rate": 1.3518535403668186e-05, + "loss": 1.0222, + "step": 23388 + }, + { + "epoch": 0.8376098984726127, + "grad_norm": 1.3195264339447021, + "learning_rate": 1.3512712222470491e-05, + "loss": 0.8641, + "step": 23389 + }, + { + "epoch": 0.837645710602181, + "grad_norm": 1.915985107421875, + "learning_rate": 1.3506890204847722e-05, + "loss": 1.2499, + "step": 23390 + }, + { + "epoch": 0.8376815227317492, + "grad_norm": 1.469488263130188, + "learning_rate": 1.3501069350878149e-05, + "loss": 0.9096, + "step": 23391 + }, + { + "epoch": 0.8377173348613175, + "grad_norm": 1.4515100717544556, + "learning_rate": 1.3495249660640142e-05, + "loss": 0.9266, + "step": 23392 + }, + { + "epoch": 0.8377531469908858, + "grad_norm": 1.7856760025024414, + "learning_rate": 1.3489431134211916e-05, + "loss": 1.1408, + "step": 23393 + }, + { + "epoch": 0.8377889591204541, + "grad_norm": 1.5227254629135132, + "learning_rate": 1.3483613771671843e-05, + "loss": 1.1297, + "step": 23394 + }, + { + "epoch": 0.8378247712500224, + "grad_norm": 1.460973858833313, + "learning_rate": 1.3477797573098128e-05, + "loss": 1.0738, + "step": 23395 + }, + { + "epoch": 0.8378605833795907, + "grad_norm": 1.489893913269043, + "learning_rate": 1.347198253856905e-05, + "loss": 0.9677, + "step": 23396 + }, + { + "epoch": 0.837896395509159, + "grad_norm": 1.462762713432312, + "learning_rate": 1.3466168668162827e-05, + "loss": 0.9157, + "step": 23397 + }, + { + "epoch": 0.8379322076387272, + "grad_norm": 1.5152865648269653, + "learning_rate": 1.3460355961957704e-05, + "loss": 1.0932, + "step": 23398 + }, + { + "epoch": 0.8379680197682955, + "grad_norm": 1.3460497856140137, + "learning_rate": 1.3454544420031878e-05, + "loss": 1.2479, + "step": 23399 + }, + { + "epoch": 0.8380038318978638, + "grad_norm": 1.4833941459655762, + "learning_rate": 1.3448734042463463e-05, + "loss": 1.2552, + "step": 23400 + }, + { + "epoch": 0.8380396440274321, + "grad_norm": 1.4063395261764526, + "learning_rate": 1.3442924829330738e-05, + "loss": 1.1526, + "step": 23401 + }, + { + "epoch": 0.8380754561570004, + "grad_norm": 1.417467474937439, + "learning_rate": 1.3437116780711778e-05, + "loss": 0.9317, + "step": 23402 + }, + { + "epoch": 0.8381112682865687, + "grad_norm": 1.1424251794815063, + "learning_rate": 1.3431309896684785e-05, + "loss": 0.9665, + "step": 23403 + }, + { + "epoch": 0.838147080416137, + "grad_norm": 1.3296120166778564, + "learning_rate": 1.3425504177327808e-05, + "loss": 0.8887, + "step": 23404 + }, + { + "epoch": 0.8381828925457052, + "grad_norm": 2.0088484287261963, + "learning_rate": 1.341969962271904e-05, + "loss": 1.0502, + "step": 23405 + }, + { + "epoch": 0.8382187046752735, + "grad_norm": 1.2119032144546509, + "learning_rate": 1.3413896232936506e-05, + "loss": 1.0016, + "step": 23406 + }, + { + "epoch": 0.8382545168048418, + "grad_norm": 1.3123772144317627, + "learning_rate": 1.3408094008058314e-05, + "loss": 0.9638, + "step": 23407 + }, + { + "epoch": 0.8382903289344101, + "grad_norm": 1.3805365562438965, + "learning_rate": 1.3402292948162554e-05, + "loss": 0.9634, + "step": 23408 + }, + { + "epoch": 0.8383261410639784, + "grad_norm": 1.6499602794647217, + "learning_rate": 1.3396493053327208e-05, + "loss": 0.9697, + "step": 23409 + }, + { + "epoch": 0.8383619531935467, + "grad_norm": 1.5968478918075562, + "learning_rate": 1.339069432363036e-05, + "loss": 0.9847, + "step": 23410 + }, + { + "epoch": 0.8383977653231149, + "grad_norm": 1.466844916343689, + "learning_rate": 1.3384896759149957e-05, + "loss": 1.0157, + "step": 23411 + }, + { + "epoch": 0.8384335774526832, + "grad_norm": 1.445504069328308, + "learning_rate": 1.3379100359964082e-05, + "loss": 1.2929, + "step": 23412 + }, + { + "epoch": 0.8384693895822515, + "grad_norm": 1.6651266813278198, + "learning_rate": 1.337330512615066e-05, + "loss": 1.0435, + "step": 23413 + }, + { + "epoch": 0.8385052017118197, + "grad_norm": 1.2981144189834595, + "learning_rate": 1.3367511057787707e-05, + "loss": 1.0001, + "step": 23414 + }, + { + "epoch": 0.8385410138413881, + "grad_norm": 1.4922865629196167, + "learning_rate": 1.3361718154953096e-05, + "loss": 1.1463, + "step": 23415 + }, + { + "epoch": 0.8385768259709564, + "grad_norm": 1.1776667833328247, + "learning_rate": 1.3355926417724852e-05, + "loss": 0.8545, + "step": 23416 + }, + { + "epoch": 0.8386126381005247, + "grad_norm": 1.7035531997680664, + "learning_rate": 1.3350135846180856e-05, + "loss": 1.2161, + "step": 23417 + }, + { + "epoch": 0.8386484502300929, + "grad_norm": 1.565994381904602, + "learning_rate": 1.3344346440398992e-05, + "loss": 1.0157, + "step": 23418 + }, + { + "epoch": 0.8386842623596612, + "grad_norm": 1.3841685056686401, + "learning_rate": 1.3338558200457174e-05, + "loss": 0.9689, + "step": 23419 + }, + { + "epoch": 0.8387200744892295, + "grad_norm": 2.0601301193237305, + "learning_rate": 1.3332771126433263e-05, + "loss": 1.1147, + "step": 23420 + }, + { + "epoch": 0.8387558866187977, + "grad_norm": 1.4947365522384644, + "learning_rate": 1.3326985218405152e-05, + "loss": 0.8967, + "step": 23421 + }, + { + "epoch": 0.8387916987483661, + "grad_norm": 1.411848783493042, + "learning_rate": 1.3321200476450602e-05, + "loss": 1.2811, + "step": 23422 + }, + { + "epoch": 0.8388275108779344, + "grad_norm": 1.4864332675933838, + "learning_rate": 1.3315416900647548e-05, + "loss": 1.0376, + "step": 23423 + }, + { + "epoch": 0.8388633230075027, + "grad_norm": 1.4076772928237915, + "learning_rate": 1.3309634491073707e-05, + "loss": 1.1922, + "step": 23424 + }, + { + "epoch": 0.8388991351370709, + "grad_norm": 1.6690475940704346, + "learning_rate": 1.330385324780694e-05, + "loss": 1.1292, + "step": 23425 + }, + { + "epoch": 0.8389349472666392, + "grad_norm": 1.7033064365386963, + "learning_rate": 1.3298073170924986e-05, + "loss": 1.2249, + "step": 23426 + }, + { + "epoch": 0.8389707593962075, + "grad_norm": 1.4393171072006226, + "learning_rate": 1.3292294260505611e-05, + "loss": 1.057, + "step": 23427 + }, + { + "epoch": 0.8390065715257757, + "grad_norm": 1.3987168073654175, + "learning_rate": 1.328651651662659e-05, + "loss": 0.939, + "step": 23428 + }, + { + "epoch": 0.8390423836553441, + "grad_norm": 1.349797248840332, + "learning_rate": 1.3280739939365617e-05, + "loss": 1.1533, + "step": 23429 + }, + { + "epoch": 0.8390781957849124, + "grad_norm": 1.9890074729919434, + "learning_rate": 1.3274964528800437e-05, + "loss": 1.3706, + "step": 23430 + }, + { + "epoch": 0.8391140079144807, + "grad_norm": 1.408143162727356, + "learning_rate": 1.3269190285008737e-05, + "loss": 1.0531, + "step": 23431 + }, + { + "epoch": 0.8391498200440489, + "grad_norm": 1.452472448348999, + "learning_rate": 1.3263417208068218e-05, + "loss": 0.9858, + "step": 23432 + }, + { + "epoch": 0.8391856321736172, + "grad_norm": 1.4507114887237549, + "learning_rate": 1.325764529805651e-05, + "loss": 1.0629, + "step": 23433 + }, + { + "epoch": 0.8392214443031855, + "grad_norm": 1.3377374410629272, + "learning_rate": 1.3251874555051336e-05, + "loss": 1.1497, + "step": 23434 + }, + { + "epoch": 0.8392572564327537, + "grad_norm": 1.8413918018341064, + "learning_rate": 1.3246104979130281e-05, + "loss": 1.035, + "step": 23435 + }, + { + "epoch": 0.8392930685623221, + "grad_norm": 1.6004472970962524, + "learning_rate": 1.324033657037097e-05, + "loss": 1.2003, + "step": 23436 + }, + { + "epoch": 0.8393288806918904, + "grad_norm": 1.6026957035064697, + "learning_rate": 1.323456932885101e-05, + "loss": 1.104, + "step": 23437 + }, + { + "epoch": 0.8393646928214586, + "grad_norm": 1.6365636587142944, + "learning_rate": 1.3228803254648004e-05, + "loss": 1.2463, + "step": 23438 + }, + { + "epoch": 0.8394005049510269, + "grad_norm": 2.3744332790374756, + "learning_rate": 1.3223038347839544e-05, + "loss": 1.1038, + "step": 23439 + }, + { + "epoch": 0.8394363170805952, + "grad_norm": 1.5409992933273315, + "learning_rate": 1.321727460850315e-05, + "loss": 1.0931, + "step": 23440 + }, + { + "epoch": 0.8394721292101635, + "grad_norm": 1.5196031332015991, + "learning_rate": 1.321151203671639e-05, + "loss": 0.9054, + "step": 23441 + }, + { + "epoch": 0.8395079413397317, + "grad_norm": 1.8644016981124878, + "learning_rate": 1.320575063255678e-05, + "loss": 1.0094, + "step": 23442 + }, + { + "epoch": 0.8395437534693001, + "grad_norm": 1.5117216110229492, + "learning_rate": 1.3199990396101858e-05, + "loss": 1.0275, + "step": 23443 + }, + { + "epoch": 0.8395795655988684, + "grad_norm": 2.103581190109253, + "learning_rate": 1.3194231327429085e-05, + "loss": 1.3092, + "step": 23444 + }, + { + "epoch": 0.8396153777284366, + "grad_norm": 1.585429310798645, + "learning_rate": 1.3188473426615956e-05, + "loss": 1.1764, + "step": 23445 + }, + { + "epoch": 0.8396511898580049, + "grad_norm": 1.6623655557632446, + "learning_rate": 1.3182716693739949e-05, + "loss": 0.973, + "step": 23446 + }, + { + "epoch": 0.8396870019875732, + "grad_norm": 1.444573163986206, + "learning_rate": 1.3176961128878495e-05, + "loss": 0.9414, + "step": 23447 + }, + { + "epoch": 0.8397228141171414, + "grad_norm": 1.660712480545044, + "learning_rate": 1.3171206732109031e-05, + "loss": 1.052, + "step": 23448 + }, + { + "epoch": 0.8397586262467097, + "grad_norm": 1.356169581413269, + "learning_rate": 1.3165453503508984e-05, + "loss": 1.1377, + "step": 23449 + }, + { + "epoch": 0.8397944383762781, + "grad_norm": 1.761664628982544, + "learning_rate": 1.3159701443155759e-05, + "loss": 1.3221, + "step": 23450 + }, + { + "epoch": 0.8398302505058464, + "grad_norm": 1.2301307916641235, + "learning_rate": 1.3153950551126725e-05, + "loss": 1.1176, + "step": 23451 + }, + { + "epoch": 0.8398660626354146, + "grad_norm": 1.5365365743637085, + "learning_rate": 1.3148200827499269e-05, + "loss": 1.141, + "step": 23452 + }, + { + "epoch": 0.8399018747649829, + "grad_norm": 1.8177993297576904, + "learning_rate": 1.3142452272350747e-05, + "loss": 1.0649, + "step": 23453 + }, + { + "epoch": 0.8399376868945512, + "grad_norm": 1.5863847732543945, + "learning_rate": 1.3136704885758477e-05, + "loss": 1.2287, + "step": 23454 + }, + { + "epoch": 0.8399734990241194, + "grad_norm": 1.364754557609558, + "learning_rate": 1.3130958667799798e-05, + "loss": 1.2369, + "step": 23455 + }, + { + "epoch": 0.8400093111536877, + "grad_norm": 1.4997608661651611, + "learning_rate": 1.3125213618552013e-05, + "loss": 1.0829, + "step": 23456 + }, + { + "epoch": 0.8400451232832561, + "grad_norm": 1.6370041370391846, + "learning_rate": 1.3119469738092449e-05, + "loss": 0.9962, + "step": 23457 + }, + { + "epoch": 0.8400809354128244, + "grad_norm": 1.350154995918274, + "learning_rate": 1.3113727026498323e-05, + "loss": 0.9183, + "step": 23458 + }, + { + "epoch": 0.8401167475423926, + "grad_norm": 1.4773653745651245, + "learning_rate": 1.310798548384693e-05, + "loss": 1.1303, + "step": 23459 + }, + { + "epoch": 0.8401525596719609, + "grad_norm": 1.3310595750808716, + "learning_rate": 1.3102245110215495e-05, + "loss": 1.0383, + "step": 23460 + }, + { + "epoch": 0.8401883718015292, + "grad_norm": 1.3793103694915771, + "learning_rate": 1.30965059056813e-05, + "loss": 0.846, + "step": 23461 + }, + { + "epoch": 0.8402241839310974, + "grad_norm": 2.370246648788452, + "learning_rate": 1.3090767870321496e-05, + "loss": 1.1698, + "step": 23462 + }, + { + "epoch": 0.8402599960606657, + "grad_norm": 1.6110522747039795, + "learning_rate": 1.30850310042133e-05, + "loss": 1.0575, + "step": 23463 + }, + { + "epoch": 0.8402958081902341, + "grad_norm": 1.4660829305648804, + "learning_rate": 1.3079295307433925e-05, + "loss": 1.0881, + "step": 23464 + }, + { + "epoch": 0.8403316203198024, + "grad_norm": 1.2922508716583252, + "learning_rate": 1.307356078006049e-05, + "loss": 0.9459, + "step": 23465 + }, + { + "epoch": 0.8403674324493706, + "grad_norm": 1.5008152723312378, + "learning_rate": 1.3067827422170165e-05, + "loss": 1.1104, + "step": 23466 + }, + { + "epoch": 0.8404032445789389, + "grad_norm": 1.3063634634017944, + "learning_rate": 1.3062095233840089e-05, + "loss": 0.9328, + "step": 23467 + }, + { + "epoch": 0.8404390567085072, + "grad_norm": 1.2276856899261475, + "learning_rate": 1.30563642151474e-05, + "loss": 1.0994, + "step": 23468 + }, + { + "epoch": 0.8404748688380754, + "grad_norm": 1.398151159286499, + "learning_rate": 1.3050634366169156e-05, + "loss": 0.8513, + "step": 23469 + }, + { + "epoch": 0.8405106809676437, + "grad_norm": 1.344638705253601, + "learning_rate": 1.3044905686982479e-05, + "loss": 0.9709, + "step": 23470 + }, + { + "epoch": 0.8405464930972121, + "grad_norm": 1.4775062799453735, + "learning_rate": 1.3039178177664458e-05, + "loss": 0.9199, + "step": 23471 + }, + { + "epoch": 0.8405823052267803, + "grad_norm": 1.3342763185501099, + "learning_rate": 1.3033451838292088e-05, + "loss": 1.0234, + "step": 23472 + }, + { + "epoch": 0.8406181173563486, + "grad_norm": 1.190123438835144, + "learning_rate": 1.3027726668942452e-05, + "loss": 1.0497, + "step": 23473 + }, + { + "epoch": 0.8406539294859169, + "grad_norm": 1.4709384441375732, + "learning_rate": 1.3022002669692568e-05, + "loss": 0.898, + "step": 23474 + }, + { + "epoch": 0.8406897416154852, + "grad_norm": 1.3552534580230713, + "learning_rate": 1.3016279840619461e-05, + "loss": 1.2883, + "step": 23475 + }, + { + "epoch": 0.8407255537450534, + "grad_norm": 1.4925752878189087, + "learning_rate": 1.3010558181800091e-05, + "loss": 1.1277, + "step": 23476 + }, + { + "epoch": 0.8407613658746217, + "grad_norm": 1.6494710445404053, + "learning_rate": 1.3004837693311445e-05, + "loss": 1.3368, + "step": 23477 + }, + { + "epoch": 0.8407971780041901, + "grad_norm": 1.3868529796600342, + "learning_rate": 1.2999118375230523e-05, + "loss": 1.0478, + "step": 23478 + }, + { + "epoch": 0.8408329901337583, + "grad_norm": 1.5646119117736816, + "learning_rate": 1.2993400227634211e-05, + "loss": 0.9028, + "step": 23479 + }, + { + "epoch": 0.8408688022633266, + "grad_norm": 1.3280003070831299, + "learning_rate": 1.2987683250599481e-05, + "loss": 0.9508, + "step": 23480 + }, + { + "epoch": 0.8409046143928949, + "grad_norm": 1.5879446268081665, + "learning_rate": 1.2981967444203224e-05, + "loss": 1.0304, + "step": 23481 + }, + { + "epoch": 0.8409404265224631, + "grad_norm": 1.3448460102081299, + "learning_rate": 1.297625280852237e-05, + "loss": 0.9662, + "step": 23482 + }, + { + "epoch": 0.8409762386520314, + "grad_norm": 1.46843421459198, + "learning_rate": 1.297053934363377e-05, + "loss": 1.2454, + "step": 23483 + }, + { + "epoch": 0.8410120507815997, + "grad_norm": 1.4319119453430176, + "learning_rate": 1.2964827049614291e-05, + "loss": 1.0698, + "step": 23484 + }, + { + "epoch": 0.8410478629111681, + "grad_norm": 1.7022656202316284, + "learning_rate": 1.295911592654081e-05, + "loss": 1.4214, + "step": 23485 + }, + { + "epoch": 0.8410836750407363, + "grad_norm": 1.202928066253662, + "learning_rate": 1.2953405974490163e-05, + "loss": 1.0318, + "step": 23486 + }, + { + "epoch": 0.8411194871703046, + "grad_norm": 1.5265854597091675, + "learning_rate": 1.2947697193539154e-05, + "loss": 1.0999, + "step": 23487 + }, + { + "epoch": 0.8411552992998729, + "grad_norm": 1.258968472480774, + "learning_rate": 1.2941989583764547e-05, + "loss": 0.8367, + "step": 23488 + }, + { + "epoch": 0.8411911114294411, + "grad_norm": 1.1607328653335571, + "learning_rate": 1.2936283145243222e-05, + "loss": 0.948, + "step": 23489 + }, + { + "epoch": 0.8412269235590094, + "grad_norm": 1.6781302690505981, + "learning_rate": 1.2930577878051887e-05, + "loss": 1.2346, + "step": 23490 + }, + { + "epoch": 0.8412627356885777, + "grad_norm": 1.3882277011871338, + "learning_rate": 1.2924873782267322e-05, + "loss": 1.1467, + "step": 23491 + }, + { + "epoch": 0.8412985478181461, + "grad_norm": 1.3301723003387451, + "learning_rate": 1.2919170857966223e-05, + "loss": 0.9557, + "step": 23492 + }, + { + "epoch": 0.8413343599477143, + "grad_norm": 1.4914370775222778, + "learning_rate": 1.2913469105225407e-05, + "loss": 1.1587, + "step": 23493 + }, + { + "epoch": 0.8413701720772826, + "grad_norm": 1.5815550088882446, + "learning_rate": 1.29077685241215e-05, + "loss": 0.9898, + "step": 23494 + }, + { + "epoch": 0.8414059842068509, + "grad_norm": 1.3562984466552734, + "learning_rate": 1.290206911473123e-05, + "loss": 1.1183, + "step": 23495 + }, + { + "epoch": 0.8414417963364191, + "grad_norm": 1.4300376176834106, + "learning_rate": 1.2896370877131293e-05, + "loss": 1.1648, + "step": 23496 + }, + { + "epoch": 0.8414776084659874, + "grad_norm": 1.3532207012176514, + "learning_rate": 1.2890673811398301e-05, + "loss": 1.0953, + "step": 23497 + }, + { + "epoch": 0.8415134205955557, + "grad_norm": 1.5705968141555786, + "learning_rate": 1.2884977917608964e-05, + "loss": 1.263, + "step": 23498 + }, + { + "epoch": 0.841549232725124, + "grad_norm": 1.8398749828338623, + "learning_rate": 1.287928319583983e-05, + "loss": 1.0262, + "step": 23499 + }, + { + "epoch": 0.8415850448546923, + "grad_norm": 1.3229036331176758, + "learning_rate": 1.2873589646167605e-05, + "loss": 0.9506, + "step": 23500 + }, + { + "epoch": 0.8416208569842606, + "grad_norm": 1.5911157131195068, + "learning_rate": 1.2867897268668826e-05, + "loss": 0.9586, + "step": 23501 + }, + { + "epoch": 0.8416566691138289, + "grad_norm": 1.540263295173645, + "learning_rate": 1.2862206063420113e-05, + "loss": 1.0401, + "step": 23502 + }, + { + "epoch": 0.8416924812433971, + "grad_norm": 1.5171393156051636, + "learning_rate": 1.2856516030497979e-05, + "loss": 1.0352, + "step": 23503 + }, + { + "epoch": 0.8417282933729654, + "grad_norm": 1.7090883255004883, + "learning_rate": 1.2850827169979063e-05, + "loss": 0.9481, + "step": 23504 + }, + { + "epoch": 0.8417641055025337, + "grad_norm": 1.2371577024459839, + "learning_rate": 1.284513948193985e-05, + "loss": 0.9779, + "step": 23505 + }, + { + "epoch": 0.841799917632102, + "grad_norm": 1.2861382961273193, + "learning_rate": 1.2839452966456822e-05, + "loss": 0.8615, + "step": 23506 + }, + { + "epoch": 0.8418357297616703, + "grad_norm": 1.4343855381011963, + "learning_rate": 1.2833767623606563e-05, + "loss": 1.1209, + "step": 23507 + }, + { + "epoch": 0.8418715418912386, + "grad_norm": 1.2917757034301758, + "learning_rate": 1.28280834534655e-05, + "loss": 0.8451, + "step": 23508 + }, + { + "epoch": 0.8419073540208069, + "grad_norm": 1.7370755672454834, + "learning_rate": 1.2822400456110162e-05, + "loss": 1.0627, + "step": 23509 + }, + { + "epoch": 0.8419431661503751, + "grad_norm": 1.831226110458374, + "learning_rate": 1.281671863161693e-05, + "loss": 1.0689, + "step": 23510 + }, + { + "epoch": 0.8419789782799434, + "grad_norm": 1.2809185981750488, + "learning_rate": 1.2811037980062324e-05, + "loss": 1.0808, + "step": 23511 + }, + { + "epoch": 0.8420147904095117, + "grad_norm": 1.361853837966919, + "learning_rate": 1.2805358501522724e-05, + "loss": 1.0029, + "step": 23512 + }, + { + "epoch": 0.84205060253908, + "grad_norm": 1.6175503730773926, + "learning_rate": 1.279968019607457e-05, + "loss": 1.0635, + "step": 23513 + }, + { + "epoch": 0.8420864146686483, + "grad_norm": 1.304930329322815, + "learning_rate": 1.2794003063794225e-05, + "loss": 1.2609, + "step": 23514 + }, + { + "epoch": 0.8421222267982166, + "grad_norm": 1.4113093614578247, + "learning_rate": 1.2788327104758068e-05, + "loss": 1.171, + "step": 23515 + }, + { + "epoch": 0.8421580389277848, + "grad_norm": 1.4470847845077515, + "learning_rate": 1.278265231904251e-05, + "loss": 1.196, + "step": 23516 + }, + { + "epoch": 0.8421938510573531, + "grad_norm": 1.2305935621261597, + "learning_rate": 1.277697870672383e-05, + "loss": 0.9454, + "step": 23517 + }, + { + "epoch": 0.8422296631869214, + "grad_norm": 1.3484704494476318, + "learning_rate": 1.2771306267878392e-05, + "loss": 0.9709, + "step": 23518 + }, + { + "epoch": 0.8422654753164897, + "grad_norm": 2.044004440307617, + "learning_rate": 1.2765635002582521e-05, + "loss": 1.0497, + "step": 23519 + }, + { + "epoch": 0.842301287446058, + "grad_norm": 1.240472674369812, + "learning_rate": 1.2759964910912524e-05, + "loss": 0.9486, + "step": 23520 + }, + { + "epoch": 0.8423370995756263, + "grad_norm": 1.969125509262085, + "learning_rate": 1.275429599294462e-05, + "loss": 1.2329, + "step": 23521 + }, + { + "epoch": 0.8423729117051946, + "grad_norm": 1.6564459800720215, + "learning_rate": 1.2748628248755167e-05, + "loss": 1.1259, + "step": 23522 + }, + { + "epoch": 0.8424087238347628, + "grad_norm": 1.4711843729019165, + "learning_rate": 1.2742961678420385e-05, + "loss": 1.0924, + "step": 23523 + }, + { + "epoch": 0.8424445359643311, + "grad_norm": 1.2575010061264038, + "learning_rate": 1.2737296282016464e-05, + "loss": 1.087, + "step": 23524 + }, + { + "epoch": 0.8424803480938994, + "grad_norm": 1.892659306526184, + "learning_rate": 1.2731632059619669e-05, + "loss": 1.0717, + "step": 23525 + }, + { + "epoch": 0.8425161602234676, + "grad_norm": 1.9585291147232056, + "learning_rate": 1.2725969011306204e-05, + "loss": 1.122, + "step": 23526 + }, + { + "epoch": 0.842551972353036, + "grad_norm": 1.4829531908035278, + "learning_rate": 1.2720307137152266e-05, + "loss": 1.1349, + "step": 23527 + }, + { + "epoch": 0.8425877844826043, + "grad_norm": 1.4722665548324585, + "learning_rate": 1.271464643723399e-05, + "loss": 0.9868, + "step": 23528 + }, + { + "epoch": 0.8426235966121726, + "grad_norm": 1.6179709434509277, + "learning_rate": 1.2708986911627551e-05, + "loss": 1.1432, + "step": 23529 + }, + { + "epoch": 0.8426594087417408, + "grad_norm": 1.7184054851531982, + "learning_rate": 1.27033285604091e-05, + "loss": 1.0358, + "step": 23530 + }, + { + "epoch": 0.8426952208713091, + "grad_norm": 1.6476553678512573, + "learning_rate": 1.2697671383654786e-05, + "loss": 1.1664, + "step": 23531 + }, + { + "epoch": 0.8427310330008774, + "grad_norm": 1.3236929178237915, + "learning_rate": 1.2692015381440658e-05, + "loss": 0.8971, + "step": 23532 + }, + { + "epoch": 0.8427668451304456, + "grad_norm": 1.3074086904525757, + "learning_rate": 1.2686360553842857e-05, + "loss": 1.0789, + "step": 23533 + }, + { + "epoch": 0.842802657260014, + "grad_norm": 1.247637152671814, + "learning_rate": 1.2680706900937455e-05, + "loss": 0.9031, + "step": 23534 + }, + { + "epoch": 0.8428384693895823, + "grad_norm": 1.545599341392517, + "learning_rate": 1.2675054422800503e-05, + "loss": 1.2378, + "step": 23535 + }, + { + "epoch": 0.8428742815191506, + "grad_norm": 1.6455833911895752, + "learning_rate": 1.2669403119508039e-05, + "loss": 1.1813, + "step": 23536 + }, + { + "epoch": 0.8429100936487188, + "grad_norm": 1.3719524145126343, + "learning_rate": 1.2663752991136112e-05, + "loss": 1.1843, + "step": 23537 + }, + { + "epoch": 0.8429459057782871, + "grad_norm": 1.4945931434631348, + "learning_rate": 1.2658104037760753e-05, + "loss": 1.0607, + "step": 23538 + }, + { + "epoch": 0.8429817179078554, + "grad_norm": 1.748793125152588, + "learning_rate": 1.2652456259457924e-05, + "loss": 1.1604, + "step": 23539 + }, + { + "epoch": 0.8430175300374236, + "grad_norm": 1.9122028350830078, + "learning_rate": 1.2646809656303627e-05, + "loss": 1.1297, + "step": 23540 + }, + { + "epoch": 0.843053342166992, + "grad_norm": 1.8823301792144775, + "learning_rate": 1.2641164228373847e-05, + "loss": 1.0393, + "step": 23541 + }, + { + "epoch": 0.8430891542965603, + "grad_norm": 2.204671621322632, + "learning_rate": 1.2635519975744503e-05, + "loss": 1.3105, + "step": 23542 + }, + { + "epoch": 0.8431249664261286, + "grad_norm": 1.4669923782348633, + "learning_rate": 1.2629876898491532e-05, + "loss": 1.0649, + "step": 23543 + }, + { + "epoch": 0.8431607785556968, + "grad_norm": 2.039494514465332, + "learning_rate": 1.2624234996690875e-05, + "loss": 0.927, + "step": 23544 + }, + { + "epoch": 0.8431965906852651, + "grad_norm": 1.53396475315094, + "learning_rate": 1.2618594270418448e-05, + "loss": 0.825, + "step": 23545 + }, + { + "epoch": 0.8432324028148334, + "grad_norm": 1.3280766010284424, + "learning_rate": 1.2612954719750103e-05, + "loss": 1.0472, + "step": 23546 + }, + { + "epoch": 0.8432682149444016, + "grad_norm": 1.6059638261795044, + "learning_rate": 1.2607316344761733e-05, + "loss": 1.0125, + "step": 23547 + }, + { + "epoch": 0.84330402707397, + "grad_norm": 1.7215889692306519, + "learning_rate": 1.2601679145529189e-05, + "loss": 1.078, + "step": 23548 + }, + { + "epoch": 0.8433398392035383, + "grad_norm": 1.612866759300232, + "learning_rate": 1.2596043122128343e-05, + "loss": 0.9228, + "step": 23549 + }, + { + "epoch": 0.8433756513331065, + "grad_norm": 1.8388032913208008, + "learning_rate": 1.2590408274634969e-05, + "loss": 0.8522, + "step": 23550 + }, + { + "epoch": 0.8434114634626748, + "grad_norm": 1.4553309679031372, + "learning_rate": 1.2584774603124905e-05, + "loss": 1.0446, + "step": 23551 + }, + { + "epoch": 0.8434472755922431, + "grad_norm": 1.6450904607772827, + "learning_rate": 1.2579142107673959e-05, + "loss": 0.9509, + "step": 23552 + }, + { + "epoch": 0.8434830877218114, + "grad_norm": 1.3134915828704834, + "learning_rate": 1.2573510788357867e-05, + "loss": 1.0299, + "step": 23553 + }, + { + "epoch": 0.8435188998513796, + "grad_norm": 2.1425862312316895, + "learning_rate": 1.2567880645252417e-05, + "loss": 1.0247, + "step": 23554 + }, + { + "epoch": 0.843554711980948, + "grad_norm": 1.8830010890960693, + "learning_rate": 1.2562251678433356e-05, + "loss": 1.2223, + "step": 23555 + }, + { + "epoch": 0.8435905241105163, + "grad_norm": 1.4504786729812622, + "learning_rate": 1.2556623887976427e-05, + "loss": 1.1608, + "step": 23556 + }, + { + "epoch": 0.8436263362400845, + "grad_norm": 1.370876431465149, + "learning_rate": 1.255099727395732e-05, + "loss": 0.9589, + "step": 23557 + }, + { + "epoch": 0.8436621483696528, + "grad_norm": 1.424474835395813, + "learning_rate": 1.2545371836451736e-05, + "loss": 0.9503, + "step": 23558 + }, + { + "epoch": 0.8436979604992211, + "grad_norm": 1.2229441404342651, + "learning_rate": 1.2539747575535387e-05, + "loss": 1.0406, + "step": 23559 + }, + { + "epoch": 0.8437337726287893, + "grad_norm": 1.8459101915359497, + "learning_rate": 1.2534124491283893e-05, + "loss": 1.141, + "step": 23560 + }, + { + "epoch": 0.8437695847583576, + "grad_norm": 1.596775770187378, + "learning_rate": 1.2528502583772938e-05, + "loss": 1.2099, + "step": 23561 + }, + { + "epoch": 0.843805396887926, + "grad_norm": 1.6419891119003296, + "learning_rate": 1.252288185307815e-05, + "loss": 1.1812, + "step": 23562 + }, + { + "epoch": 0.8438412090174943, + "grad_norm": 1.776708960533142, + "learning_rate": 1.2517262299275167e-05, + "loss": 1.1327, + "step": 23563 + }, + { + "epoch": 0.8438770211470625, + "grad_norm": 1.4125584363937378, + "learning_rate": 1.2511643922439564e-05, + "loss": 1.0961, + "step": 23564 + }, + { + "epoch": 0.8439128332766308, + "grad_norm": 1.5517492294311523, + "learning_rate": 1.2506026722646924e-05, + "loss": 0.8183, + "step": 23565 + }, + { + "epoch": 0.8439486454061991, + "grad_norm": 1.3764983415603638, + "learning_rate": 1.2500410699972853e-05, + "loss": 0.7714, + "step": 23566 + }, + { + "epoch": 0.8439844575357673, + "grad_norm": 2.0595321655273438, + "learning_rate": 1.2494795854492903e-05, + "loss": 1.2534, + "step": 23567 + }, + { + "epoch": 0.8440202696653356, + "grad_norm": 1.4664182662963867, + "learning_rate": 1.2489182186282577e-05, + "loss": 0.9625, + "step": 23568 + }, + { + "epoch": 0.8440560817949039, + "grad_norm": 1.4276480674743652, + "learning_rate": 1.2483569695417418e-05, + "loss": 1.1117, + "step": 23569 + }, + { + "epoch": 0.8440918939244723, + "grad_norm": 1.3961466550827026, + "learning_rate": 1.2477958381972977e-05, + "loss": 0.8505, + "step": 23570 + }, + { + "epoch": 0.8441277060540405, + "grad_norm": 1.6615346670150757, + "learning_rate": 1.2472348246024679e-05, + "loss": 1.0133, + "step": 23571 + }, + { + "epoch": 0.8441635181836088, + "grad_norm": 1.5861414670944214, + "learning_rate": 1.2466739287648032e-05, + "loss": 1.157, + "step": 23572 + }, + { + "epoch": 0.8441993303131771, + "grad_norm": 1.3938814401626587, + "learning_rate": 1.24611315069185e-05, + "loss": 0.9878, + "step": 23573 + }, + { + "epoch": 0.8442351424427453, + "grad_norm": 1.3864009380340576, + "learning_rate": 1.2455524903911552e-05, + "loss": 1.0389, + "step": 23574 + }, + { + "epoch": 0.8442709545723136, + "grad_norm": 2.1067395210266113, + "learning_rate": 1.2449919478702587e-05, + "loss": 1.2824, + "step": 23575 + }, + { + "epoch": 0.8443067667018819, + "grad_norm": 2.565563678741455, + "learning_rate": 1.2444315231366988e-05, + "loss": 1.2587, + "step": 23576 + }, + { + "epoch": 0.8443425788314503, + "grad_norm": 1.404055118560791, + "learning_rate": 1.2438712161980226e-05, + "loss": 1.1607, + "step": 23577 + }, + { + "epoch": 0.8443783909610185, + "grad_norm": 1.4316885471343994, + "learning_rate": 1.2433110270617632e-05, + "loss": 1.183, + "step": 23578 + }, + { + "epoch": 0.8444142030905868, + "grad_norm": 1.3852492570877075, + "learning_rate": 1.2427509557354578e-05, + "loss": 0.8809, + "step": 23579 + }, + { + "epoch": 0.8444500152201551, + "grad_norm": 1.610059142112732, + "learning_rate": 1.2421910022266425e-05, + "loss": 1.0497, + "step": 23580 + }, + { + "epoch": 0.8444858273497233, + "grad_norm": 1.276381015777588, + "learning_rate": 1.2416311665428526e-05, + "loss": 1.0486, + "step": 23581 + }, + { + "epoch": 0.8445216394792916, + "grad_norm": 1.233426570892334, + "learning_rate": 1.2410714486916164e-05, + "loss": 1.1517, + "step": 23582 + }, + { + "epoch": 0.8445574516088599, + "grad_norm": 1.4596554040908813, + "learning_rate": 1.2405118486804646e-05, + "loss": 1.0486, + "step": 23583 + }, + { + "epoch": 0.8445932637384282, + "grad_norm": 1.4386239051818848, + "learning_rate": 1.2399523665169298e-05, + "loss": 0.9899, + "step": 23584 + }, + { + "epoch": 0.8446290758679965, + "grad_norm": 1.6907118558883667, + "learning_rate": 1.239393002208533e-05, + "loss": 0.9622, + "step": 23585 + }, + { + "epoch": 0.8446648879975648, + "grad_norm": 1.3728703260421753, + "learning_rate": 1.238833755762806e-05, + "loss": 1.0341, + "step": 23586 + }, + { + "epoch": 0.844700700127133, + "grad_norm": 1.8546901941299438, + "learning_rate": 1.2382746271872658e-05, + "loss": 1.2462, + "step": 23587 + }, + { + "epoch": 0.8447365122567013, + "grad_norm": 1.336356282234192, + "learning_rate": 1.2377156164894422e-05, + "loss": 0.8005, + "step": 23588 + }, + { + "epoch": 0.8447723243862696, + "grad_norm": 1.5620248317718506, + "learning_rate": 1.2371567236768511e-05, + "loss": 0.9122, + "step": 23589 + }, + { + "epoch": 0.8448081365158379, + "grad_norm": 1.6024088859558105, + "learning_rate": 1.2365979487570122e-05, + "loss": 1.0046, + "step": 23590 + }, + { + "epoch": 0.8448439486454062, + "grad_norm": 2.081648349761963, + "learning_rate": 1.2360392917374442e-05, + "loss": 0.9891, + "step": 23591 + }, + { + "epoch": 0.8448797607749745, + "grad_norm": 1.2587425708770752, + "learning_rate": 1.235480752625665e-05, + "loss": 1.1713, + "step": 23592 + }, + { + "epoch": 0.8449155729045428, + "grad_norm": 1.877810001373291, + "learning_rate": 1.234922331429188e-05, + "loss": 0.9809, + "step": 23593 + }, + { + "epoch": 0.844951385034111, + "grad_norm": 1.394511342048645, + "learning_rate": 1.2343640281555191e-05, + "loss": 0.933, + "step": 23594 + }, + { + "epoch": 0.8449871971636793, + "grad_norm": 2.098541736602783, + "learning_rate": 1.2338058428121802e-05, + "loss": 1.1439, + "step": 23595 + }, + { + "epoch": 0.8450230092932476, + "grad_norm": 1.6131494045257568, + "learning_rate": 1.233247775406674e-05, + "loss": 1.0825, + "step": 23596 + }, + { + "epoch": 0.8450588214228159, + "grad_norm": 1.603793978691101, + "learning_rate": 1.2326898259465125e-05, + "loss": 1.1702, + "step": 23597 + }, + { + "epoch": 0.8450946335523842, + "grad_norm": 1.6203737258911133, + "learning_rate": 1.2321319944391963e-05, + "loss": 1.0522, + "step": 23598 + }, + { + "epoch": 0.8451304456819525, + "grad_norm": 1.2484962940216064, + "learning_rate": 1.2315742808922382e-05, + "loss": 0.8806, + "step": 23599 + }, + { + "epoch": 0.8451662578115208, + "grad_norm": 1.3433847427368164, + "learning_rate": 1.2310166853131366e-05, + "loss": 1.0221, + "step": 23600 + }, + { + "epoch": 0.845202069941089, + "grad_norm": 1.2145334482192993, + "learning_rate": 1.2304592077093958e-05, + "loss": 0.9637, + "step": 23601 + }, + { + "epoch": 0.8452378820706573, + "grad_norm": 1.2949951887130737, + "learning_rate": 1.2299018480885117e-05, + "loss": 0.7894, + "step": 23602 + }, + { + "epoch": 0.8452736942002256, + "grad_norm": 1.4667657613754272, + "learning_rate": 1.2293446064579873e-05, + "loss": 0.9724, + "step": 23603 + }, + { + "epoch": 0.8453095063297938, + "grad_norm": 1.5422786474227905, + "learning_rate": 1.2287874828253187e-05, + "loss": 1.1731, + "step": 23604 + }, + { + "epoch": 0.8453453184593622, + "grad_norm": 1.9985435009002686, + "learning_rate": 1.2282304771979958e-05, + "loss": 1.239, + "step": 23605 + }, + { + "epoch": 0.8453811305889305, + "grad_norm": 1.8458914756774902, + "learning_rate": 1.2276735895835223e-05, + "loss": 1.0661, + "step": 23606 + }, + { + "epoch": 0.8454169427184988, + "grad_norm": 1.4303359985351562, + "learning_rate": 1.2271168199893834e-05, + "loss": 1.2284, + "step": 23607 + }, + { + "epoch": 0.845452754848067, + "grad_norm": 1.554088830947876, + "learning_rate": 1.2265601684230732e-05, + "loss": 1.0046, + "step": 23608 + }, + { + "epoch": 0.8454885669776353, + "grad_norm": 1.3662105798721313, + "learning_rate": 1.2260036348920745e-05, + "loss": 1.2437, + "step": 23609 + }, + { + "epoch": 0.8455243791072036, + "grad_norm": 1.4695919752120972, + "learning_rate": 1.2254472194038835e-05, + "loss": 1.1061, + "step": 23610 + }, + { + "epoch": 0.8455601912367718, + "grad_norm": 1.5635807514190674, + "learning_rate": 1.224890921965981e-05, + "loss": 1.1765, + "step": 23611 + }, + { + "epoch": 0.8455960033663402, + "grad_norm": 1.4832613468170166, + "learning_rate": 1.2243347425858508e-05, + "loss": 1.0434, + "step": 23612 + }, + { + "epoch": 0.8456318154959085, + "grad_norm": 1.6287473440170288, + "learning_rate": 1.2237786812709773e-05, + "loss": 1.1379, + "step": 23613 + }, + { + "epoch": 0.8456676276254768, + "grad_norm": 1.3725882768630981, + "learning_rate": 1.2232227380288408e-05, + "loss": 1.1085, + "step": 23614 + }, + { + "epoch": 0.845703439755045, + "grad_norm": 1.303849458694458, + "learning_rate": 1.2226669128669232e-05, + "loss": 1.0079, + "step": 23615 + }, + { + "epoch": 0.8457392518846133, + "grad_norm": 1.6112674474716187, + "learning_rate": 1.2221112057926954e-05, + "loss": 1.116, + "step": 23616 + }, + { + "epoch": 0.8457750640141816, + "grad_norm": 1.4160038232803345, + "learning_rate": 1.2215556168136443e-05, + "loss": 1.2578, + "step": 23617 + }, + { + "epoch": 0.8458108761437498, + "grad_norm": 1.4555764198303223, + "learning_rate": 1.2210001459372355e-05, + "loss": 1.145, + "step": 23618 + }, + { + "epoch": 0.8458466882733182, + "grad_norm": 1.3037998676300049, + "learning_rate": 1.2204447931709484e-05, + "loss": 1.2007, + "step": 23619 + }, + { + "epoch": 0.8458825004028865, + "grad_norm": 1.470997929573059, + "learning_rate": 1.2198895585222503e-05, + "loss": 1.122, + "step": 23620 + }, + { + "epoch": 0.8459183125324548, + "grad_norm": 1.8185335397720337, + "learning_rate": 1.219334441998612e-05, + "loss": 1.0874, + "step": 23621 + }, + { + "epoch": 0.845954124662023, + "grad_norm": 1.385180115699768, + "learning_rate": 1.2187794436075039e-05, + "loss": 0.8341, + "step": 23622 + }, + { + "epoch": 0.8459899367915913, + "grad_norm": 1.4343875646591187, + "learning_rate": 1.2182245633563905e-05, + "loss": 0.9439, + "step": 23623 + }, + { + "epoch": 0.8460257489211596, + "grad_norm": 1.396295428276062, + "learning_rate": 1.2176698012527376e-05, + "loss": 1.0272, + "step": 23624 + }, + { + "epoch": 0.8460615610507278, + "grad_norm": 1.654306411743164, + "learning_rate": 1.2171151573040085e-05, + "loss": 1.1433, + "step": 23625 + }, + { + "epoch": 0.8460973731802962, + "grad_norm": 2.0333526134490967, + "learning_rate": 1.2165606315176691e-05, + "loss": 1.3511, + "step": 23626 + }, + { + "epoch": 0.8461331853098645, + "grad_norm": 1.4473031759262085, + "learning_rate": 1.2160062239011739e-05, + "loss": 1.03, + "step": 23627 + }, + { + "epoch": 0.8461689974394327, + "grad_norm": 1.244805097579956, + "learning_rate": 1.2154519344619841e-05, + "loss": 1.1384, + "step": 23628 + }, + { + "epoch": 0.846204809569001, + "grad_norm": 1.3952445983886719, + "learning_rate": 1.2148977632075598e-05, + "loss": 1.1227, + "step": 23629 + }, + { + "epoch": 0.8462406216985693, + "grad_norm": 1.6553385257720947, + "learning_rate": 1.2143437101453514e-05, + "loss": 1.0781, + "step": 23630 + }, + { + "epoch": 0.8462764338281376, + "grad_norm": 1.337793231010437, + "learning_rate": 1.2137897752828165e-05, + "loss": 0.9553, + "step": 23631 + }, + { + "epoch": 0.8463122459577058, + "grad_norm": 1.3044105768203735, + "learning_rate": 1.2132359586274067e-05, + "loss": 1.0654, + "step": 23632 + }, + { + "epoch": 0.8463480580872742, + "grad_norm": 1.790814757347107, + "learning_rate": 1.212682260186575e-05, + "loss": 1.2879, + "step": 23633 + }, + { + "epoch": 0.8463838702168425, + "grad_norm": 1.3796228170394897, + "learning_rate": 1.2121286799677667e-05, + "loss": 0.7216, + "step": 23634 + }, + { + "epoch": 0.8464196823464107, + "grad_norm": 1.6554665565490723, + "learning_rate": 1.2115752179784312e-05, + "loss": 1.0526, + "step": 23635 + }, + { + "epoch": 0.846455494475979, + "grad_norm": 1.4619966745376587, + "learning_rate": 1.211021874226015e-05, + "loss": 1.2591, + "step": 23636 + }, + { + "epoch": 0.8464913066055473, + "grad_norm": 1.126943826675415, + "learning_rate": 1.2104686487179639e-05, + "loss": 0.9848, + "step": 23637 + }, + { + "epoch": 0.8465271187351155, + "grad_norm": 1.432204008102417, + "learning_rate": 1.209915541461718e-05, + "loss": 0.9805, + "step": 23638 + }, + { + "epoch": 0.8465629308646838, + "grad_norm": 1.4874271154403687, + "learning_rate": 1.20936255246472e-05, + "loss": 0.8871, + "step": 23639 + }, + { + "epoch": 0.8465987429942522, + "grad_norm": 1.6288385391235352, + "learning_rate": 1.2088096817344118e-05, + "loss": 1.3639, + "step": 23640 + }, + { + "epoch": 0.8466345551238205, + "grad_norm": 1.8568919897079468, + "learning_rate": 1.2082569292782275e-05, + "loss": 1.1125, + "step": 23641 + }, + { + "epoch": 0.8466703672533887, + "grad_norm": 1.2960214614868164, + "learning_rate": 1.2077042951036055e-05, + "loss": 0.9229, + "step": 23642 + }, + { + "epoch": 0.846706179382957, + "grad_norm": 1.9167439937591553, + "learning_rate": 1.207151779217981e-05, + "loss": 0.9789, + "step": 23643 + }, + { + "epoch": 0.8467419915125253, + "grad_norm": 1.5508882999420166, + "learning_rate": 1.2065993816287901e-05, + "loss": 0.925, + "step": 23644 + }, + { + "epoch": 0.8467778036420935, + "grad_norm": 1.5241104364395142, + "learning_rate": 1.2060471023434594e-05, + "loss": 1.2621, + "step": 23645 + }, + { + "epoch": 0.8468136157716618, + "grad_norm": 1.4295437335968018, + "learning_rate": 1.2054949413694216e-05, + "loss": 0.9622, + "step": 23646 + }, + { + "epoch": 0.8468494279012302, + "grad_norm": 1.316565990447998, + "learning_rate": 1.2049428987141065e-05, + "loss": 1.0869, + "step": 23647 + }, + { + "epoch": 0.8468852400307985, + "grad_norm": 1.5576125383377075, + "learning_rate": 1.204390974384939e-05, + "loss": 1.0336, + "step": 23648 + }, + { + "epoch": 0.8469210521603667, + "grad_norm": 1.700176477432251, + "learning_rate": 1.2038391683893446e-05, + "loss": 1.1674, + "step": 23649 + }, + { + "epoch": 0.846956864289935, + "grad_norm": 1.634017825126648, + "learning_rate": 1.2032874807347484e-05, + "loss": 1.2104, + "step": 23650 + }, + { + "epoch": 0.8469926764195033, + "grad_norm": 1.2724889516830444, + "learning_rate": 1.2027359114285741e-05, + "loss": 1.0858, + "step": 23651 + }, + { + "epoch": 0.8470284885490715, + "grad_norm": 1.534701943397522, + "learning_rate": 1.2021844604782384e-05, + "loss": 1.1445, + "step": 23652 + }, + { + "epoch": 0.8470643006786398, + "grad_norm": 1.6062169075012207, + "learning_rate": 1.2016331278911619e-05, + "loss": 0.9634, + "step": 23653 + }, + { + "epoch": 0.8471001128082082, + "grad_norm": 1.6156179904937744, + "learning_rate": 1.201081913674763e-05, + "loss": 1.2909, + "step": 23654 + }, + { + "epoch": 0.8471359249377765, + "grad_norm": 1.2232613563537598, + "learning_rate": 1.2005308178364593e-05, + "loss": 1.1335, + "step": 23655 + }, + { + "epoch": 0.8471717370673447, + "grad_norm": 1.770695686340332, + "learning_rate": 1.1999798403836615e-05, + "loss": 0.8935, + "step": 23656 + }, + { + "epoch": 0.847207549196913, + "grad_norm": 1.4942231178283691, + "learning_rate": 1.1994289813237835e-05, + "loss": 1.0572, + "step": 23657 + }, + { + "epoch": 0.8472433613264813, + "grad_norm": 1.5439671277999878, + "learning_rate": 1.1988782406642385e-05, + "loss": 0.8866, + "step": 23658 + }, + { + "epoch": 0.8472791734560495, + "grad_norm": 1.4481513500213623, + "learning_rate": 1.1983276184124314e-05, + "loss": 1.0265, + "step": 23659 + }, + { + "epoch": 0.8473149855856178, + "grad_norm": 1.376541018486023, + "learning_rate": 1.1977771145757733e-05, + "loss": 1.1888, + "step": 23660 + }, + { + "epoch": 0.8473507977151862, + "grad_norm": 1.6305387020111084, + "learning_rate": 1.1972267291616702e-05, + "loss": 1.1126, + "step": 23661 + }, + { + "epoch": 0.8473866098447544, + "grad_norm": 1.3512924909591675, + "learning_rate": 1.1966764621775284e-05, + "loss": 0.9572, + "step": 23662 + }, + { + "epoch": 0.8474224219743227, + "grad_norm": 1.5943101644515991, + "learning_rate": 1.1961263136307477e-05, + "loss": 0.9751, + "step": 23663 + }, + { + "epoch": 0.847458234103891, + "grad_norm": 1.636391282081604, + "learning_rate": 1.195576283528731e-05, + "loss": 1.1427, + "step": 23664 + }, + { + "epoch": 0.8474940462334593, + "grad_norm": 1.3810477256774902, + "learning_rate": 1.1950263718788812e-05, + "loss": 1.1947, + "step": 23665 + }, + { + "epoch": 0.8475298583630275, + "grad_norm": 1.5579888820648193, + "learning_rate": 1.1944765786885914e-05, + "loss": 1.1426, + "step": 23666 + }, + { + "epoch": 0.8475656704925958, + "grad_norm": 1.9295657873153687, + "learning_rate": 1.1939269039652612e-05, + "loss": 1.1736, + "step": 23667 + }, + { + "epoch": 0.8476014826221642, + "grad_norm": 1.6411559581756592, + "learning_rate": 1.1933773477162847e-05, + "loss": 1.1645, + "step": 23668 + }, + { + "epoch": 0.8476372947517324, + "grad_norm": 1.4277621507644653, + "learning_rate": 1.192827909949059e-05, + "loss": 0.9545, + "step": 23669 + }, + { + "epoch": 0.8476731068813007, + "grad_norm": 1.4953548908233643, + "learning_rate": 1.1922785906709711e-05, + "loss": 1.0313, + "step": 23670 + }, + { + "epoch": 0.847708919010869, + "grad_norm": 1.696679949760437, + "learning_rate": 1.1917293898894145e-05, + "loss": 1.0556, + "step": 23671 + }, + { + "epoch": 0.8477447311404372, + "grad_norm": 1.3711097240447998, + "learning_rate": 1.1911803076117777e-05, + "loss": 0.9861, + "step": 23672 + }, + { + "epoch": 0.8477805432700055, + "grad_norm": 1.293489933013916, + "learning_rate": 1.1906313438454464e-05, + "loss": 1.1028, + "step": 23673 + }, + { + "epoch": 0.8478163553995738, + "grad_norm": 1.2589609622955322, + "learning_rate": 1.1900824985978066e-05, + "loss": 0.8505, + "step": 23674 + }, + { + "epoch": 0.8478521675291422, + "grad_norm": 1.2296922206878662, + "learning_rate": 1.1895337718762422e-05, + "loss": 1.1306, + "step": 23675 + }, + { + "epoch": 0.8478879796587104, + "grad_norm": 1.5397077798843384, + "learning_rate": 1.1889851636881388e-05, + "loss": 1.026, + "step": 23676 + }, + { + "epoch": 0.8479237917882787, + "grad_norm": 1.6962757110595703, + "learning_rate": 1.1884366740408726e-05, + "loss": 1.1207, + "step": 23677 + }, + { + "epoch": 0.847959603917847, + "grad_norm": 1.2948553562164307, + "learning_rate": 1.1878883029418253e-05, + "loss": 1.2031, + "step": 23678 + }, + { + "epoch": 0.8479954160474152, + "grad_norm": 1.31011962890625, + "learning_rate": 1.1873400503983733e-05, + "loss": 1.0676, + "step": 23679 + }, + { + "epoch": 0.8480312281769835, + "grad_norm": 1.6406269073486328, + "learning_rate": 1.1867919164178964e-05, + "loss": 1.1387, + "step": 23680 + }, + { + "epoch": 0.8480670403065518, + "grad_norm": 1.500786542892456, + "learning_rate": 1.1862439010077653e-05, + "loss": 1.1816, + "step": 23681 + }, + { + "epoch": 0.8481028524361202, + "grad_norm": 1.7255390882492065, + "learning_rate": 1.1856960041753495e-05, + "loss": 1.0578, + "step": 23682 + }, + { + "epoch": 0.8481386645656884, + "grad_norm": 1.3953183889389038, + "learning_rate": 1.185148225928029e-05, + "loss": 1.322, + "step": 23683 + }, + { + "epoch": 0.8481744766952567, + "grad_norm": 1.6438995599746704, + "learning_rate": 1.1846005662731663e-05, + "loss": 1.1412, + "step": 23684 + }, + { + "epoch": 0.848210288824825, + "grad_norm": 1.5832542181015015, + "learning_rate": 1.1840530252181336e-05, + "loss": 1.248, + "step": 23685 + }, + { + "epoch": 0.8482461009543932, + "grad_norm": 1.3821011781692505, + "learning_rate": 1.1835056027702918e-05, + "loss": 1.0027, + "step": 23686 + }, + { + "epoch": 0.8482819130839615, + "grad_norm": 1.7612532377243042, + "learning_rate": 1.1829582989370148e-05, + "loss": 1.2129, + "step": 23687 + }, + { + "epoch": 0.8483177252135298, + "grad_norm": 1.447311282157898, + "learning_rate": 1.1824111137256577e-05, + "loss": 1.0128, + "step": 23688 + }, + { + "epoch": 0.8483535373430982, + "grad_norm": 1.7684494256973267, + "learning_rate": 1.1818640471435848e-05, + "loss": 1.1743, + "step": 23689 + }, + { + "epoch": 0.8483893494726664, + "grad_norm": 1.7766261100769043, + "learning_rate": 1.1813170991981593e-05, + "loss": 1.2071, + "step": 23690 + }, + { + "epoch": 0.8484251616022347, + "grad_norm": 1.4640761613845825, + "learning_rate": 1.1807702698967349e-05, + "loss": 1.0291, + "step": 23691 + }, + { + "epoch": 0.848460973731803, + "grad_norm": 1.1867083311080933, + "learning_rate": 1.1802235592466727e-05, + "loss": 0.8223, + "step": 23692 + }, + { + "epoch": 0.8484967858613712, + "grad_norm": 1.5917479991912842, + "learning_rate": 1.179676967255321e-05, + "loss": 1.0845, + "step": 23693 + }, + { + "epoch": 0.8485325979909395, + "grad_norm": 1.0796024799346924, + "learning_rate": 1.1791304939300429e-05, + "loss": 0.7408, + "step": 23694 + }, + { + "epoch": 0.8485684101205078, + "grad_norm": 1.1245273351669312, + "learning_rate": 1.1785841392781838e-05, + "loss": 0.8758, + "step": 23695 + }, + { + "epoch": 0.8486042222500761, + "grad_norm": 1.322117567062378, + "learning_rate": 1.1780379033070988e-05, + "loss": 1.1087, + "step": 23696 + }, + { + "epoch": 0.8486400343796444, + "grad_norm": 1.3573720455169678, + "learning_rate": 1.1774917860241297e-05, + "loss": 1.0965, + "step": 23697 + }, + { + "epoch": 0.8486758465092127, + "grad_norm": 1.4361602067947388, + "learning_rate": 1.1769457874366318e-05, + "loss": 0.982, + "step": 23698 + }, + { + "epoch": 0.848711658638781, + "grad_norm": 1.1886954307556152, + "learning_rate": 1.1763999075519482e-05, + "loss": 0.8757, + "step": 23699 + }, + { + "epoch": 0.8487474707683492, + "grad_norm": 1.68196702003479, + "learning_rate": 1.1758541463774186e-05, + "loss": 1.1447, + "step": 23700 + }, + { + "epoch": 0.8487832828979175, + "grad_norm": 1.6656630039215088, + "learning_rate": 1.1753085039203926e-05, + "loss": 0.8917, + "step": 23701 + }, + { + "epoch": 0.8488190950274858, + "grad_norm": 1.2669179439544678, + "learning_rate": 1.1747629801882054e-05, + "loss": 1.1936, + "step": 23702 + }, + { + "epoch": 0.8488549071570541, + "grad_norm": 1.626273274421692, + "learning_rate": 1.1742175751882012e-05, + "loss": 1.1064, + "step": 23703 + }, + { + "epoch": 0.8488907192866224, + "grad_norm": 1.3793301582336426, + "learning_rate": 1.1736722889277107e-05, + "loss": 1.1067, + "step": 23704 + }, + { + "epoch": 0.8489265314161907, + "grad_norm": 1.692594051361084, + "learning_rate": 1.1731271214140783e-05, + "loss": 1.126, + "step": 23705 + }, + { + "epoch": 0.848962343545759, + "grad_norm": 2.2444822788238525, + "learning_rate": 1.1725820726546322e-05, + "loss": 1.117, + "step": 23706 + }, + { + "epoch": 0.8489981556753272, + "grad_norm": 1.6293201446533203, + "learning_rate": 1.1720371426567111e-05, + "loss": 1.1122, + "step": 23707 + }, + { + "epoch": 0.8490339678048955, + "grad_norm": 1.4570550918579102, + "learning_rate": 1.1714923314276405e-05, + "loss": 0.987, + "step": 23708 + }, + { + "epoch": 0.8490697799344638, + "grad_norm": 1.5109633207321167, + "learning_rate": 1.170947638974752e-05, + "loss": 1.0881, + "step": 23709 + }, + { + "epoch": 0.8491055920640321, + "grad_norm": 2.3121495246887207, + "learning_rate": 1.1704030653053766e-05, + "loss": 1.2496, + "step": 23710 + }, + { + "epoch": 0.8491414041936004, + "grad_norm": 1.3241130113601685, + "learning_rate": 1.1698586104268372e-05, + "loss": 1.033, + "step": 23711 + }, + { + "epoch": 0.8491772163231687, + "grad_norm": 1.7928715944290161, + "learning_rate": 1.169314274346459e-05, + "loss": 1.1203, + "step": 23712 + }, + { + "epoch": 0.8492130284527369, + "grad_norm": 1.6005682945251465, + "learning_rate": 1.1687700570715677e-05, + "loss": 1.0468, + "step": 23713 + }, + { + "epoch": 0.8492488405823052, + "grad_norm": 1.645617961883545, + "learning_rate": 1.1682259586094845e-05, + "loss": 1.122, + "step": 23714 + }, + { + "epoch": 0.8492846527118735, + "grad_norm": 1.3382099866867065, + "learning_rate": 1.1676819789675264e-05, + "loss": 1.007, + "step": 23715 + }, + { + "epoch": 0.8493204648414417, + "grad_norm": 1.418846845626831, + "learning_rate": 1.1671381181530171e-05, + "loss": 1.099, + "step": 23716 + }, + { + "epoch": 0.8493562769710101, + "grad_norm": 1.4019593000411987, + "learning_rate": 1.1665943761732712e-05, + "loss": 1.1485, + "step": 23717 + }, + { + "epoch": 0.8493920891005784, + "grad_norm": 1.6272125244140625, + "learning_rate": 1.1660507530356024e-05, + "loss": 1.0086, + "step": 23718 + }, + { + "epoch": 0.8494279012301467, + "grad_norm": 1.4573678970336914, + "learning_rate": 1.1655072487473251e-05, + "loss": 1.0786, + "step": 23719 + }, + { + "epoch": 0.8494637133597149, + "grad_norm": 1.7563984394073486, + "learning_rate": 1.1649638633157523e-05, + "loss": 1.2137, + "step": 23720 + }, + { + "epoch": 0.8494995254892832, + "grad_norm": 1.4297592639923096, + "learning_rate": 1.1644205967481959e-05, + "loss": 1.0264, + "step": 23721 + }, + { + "epoch": 0.8495353376188515, + "grad_norm": 1.4539604187011719, + "learning_rate": 1.1638774490519622e-05, + "loss": 1.1176, + "step": 23722 + }, + { + "epoch": 0.8495711497484197, + "grad_norm": 1.63752019405365, + "learning_rate": 1.1633344202343587e-05, + "loss": 0.9635, + "step": 23723 + }, + { + "epoch": 0.8496069618779881, + "grad_norm": 1.3280065059661865, + "learning_rate": 1.162791510302692e-05, + "loss": 0.8752, + "step": 23724 + }, + { + "epoch": 0.8496427740075564, + "grad_norm": 1.5961500406265259, + "learning_rate": 1.1622487192642694e-05, + "loss": 0.9957, + "step": 23725 + }, + { + "epoch": 0.8496785861371247, + "grad_norm": 1.5567978620529175, + "learning_rate": 1.1617060471263875e-05, + "loss": 0.922, + "step": 23726 + }, + { + "epoch": 0.8497143982666929, + "grad_norm": 2.8388285636901855, + "learning_rate": 1.1611634938963512e-05, + "loss": 0.9795, + "step": 23727 + }, + { + "epoch": 0.8497502103962612, + "grad_norm": 1.5758860111236572, + "learning_rate": 1.1606210595814593e-05, + "loss": 1.0277, + "step": 23728 + }, + { + "epoch": 0.8497860225258295, + "grad_norm": 1.3304656744003296, + "learning_rate": 1.1600787441890082e-05, + "loss": 1.1217, + "step": 23729 + }, + { + "epoch": 0.8498218346553977, + "grad_norm": 1.392754077911377, + "learning_rate": 1.1595365477262944e-05, + "loss": 1.0423, + "step": 23730 + }, + { + "epoch": 0.8498576467849661, + "grad_norm": 1.5829293727874756, + "learning_rate": 1.1589944702006129e-05, + "loss": 1.0344, + "step": 23731 + }, + { + "epoch": 0.8498934589145344, + "grad_norm": 1.9659571647644043, + "learning_rate": 1.158452511619259e-05, + "loss": 1.1676, + "step": 23732 + }, + { + "epoch": 0.8499292710441027, + "grad_norm": 1.2599552869796753, + "learning_rate": 1.1579106719895205e-05, + "loss": 1.0602, + "step": 23733 + }, + { + "epoch": 0.8499650831736709, + "grad_norm": 2.150752544403076, + "learning_rate": 1.157368951318687e-05, + "loss": 1.2809, + "step": 23734 + }, + { + "epoch": 0.8500008953032392, + "grad_norm": 1.5923912525177002, + "learning_rate": 1.1568273496140513e-05, + "loss": 1.114, + "step": 23735 + }, + { + "epoch": 0.8500367074328075, + "grad_norm": 1.2763508558273315, + "learning_rate": 1.1562858668828936e-05, + "loss": 1.1426, + "step": 23736 + }, + { + "epoch": 0.8500725195623757, + "grad_norm": 1.5323151350021362, + "learning_rate": 1.1557445031325032e-05, + "loss": 1.035, + "step": 23737 + }, + { + "epoch": 0.8501083316919441, + "grad_norm": 1.9351075887680054, + "learning_rate": 1.1552032583701612e-05, + "loss": 1.0969, + "step": 23738 + }, + { + "epoch": 0.8501441438215124, + "grad_norm": 1.3734554052352905, + "learning_rate": 1.1546621326031526e-05, + "loss": 1.1188, + "step": 23739 + }, + { + "epoch": 0.8501799559510806, + "grad_norm": 1.427292823791504, + "learning_rate": 1.154121125838754e-05, + "loss": 1.091, + "step": 23740 + }, + { + "epoch": 0.8502157680806489, + "grad_norm": 1.5608251094818115, + "learning_rate": 1.1535802380842453e-05, + "loss": 0.9415, + "step": 23741 + }, + { + "epoch": 0.8502515802102172, + "grad_norm": 1.685019612312317, + "learning_rate": 1.1530394693469026e-05, + "loss": 1.1479, + "step": 23742 + }, + { + "epoch": 0.8502873923397855, + "grad_norm": 1.344948649406433, + "learning_rate": 1.1524988196340048e-05, + "loss": 1.043, + "step": 23743 + }, + { + "epoch": 0.8503232044693537, + "grad_norm": 1.1292660236358643, + "learning_rate": 1.1519582889528202e-05, + "loss": 1.0058, + "step": 23744 + }, + { + "epoch": 0.8503590165989221, + "grad_norm": 1.2847598791122437, + "learning_rate": 1.1514178773106243e-05, + "loss": 0.9771, + "step": 23745 + }, + { + "epoch": 0.8503948287284904, + "grad_norm": 2.030421018600464, + "learning_rate": 1.150877584714689e-05, + "loss": 1.0975, + "step": 23746 + }, + { + "epoch": 0.8504306408580586, + "grad_norm": 1.2717784643173218, + "learning_rate": 1.1503374111722786e-05, + "loss": 1.1423, + "step": 23747 + }, + { + "epoch": 0.8504664529876269, + "grad_norm": 1.3433538675308228, + "learning_rate": 1.149797356690664e-05, + "loss": 0.9501, + "step": 23748 + }, + { + "epoch": 0.8505022651171952, + "grad_norm": 1.6480648517608643, + "learning_rate": 1.149257421277109e-05, + "loss": 1.068, + "step": 23749 + }, + { + "epoch": 0.8505380772467634, + "grad_norm": 1.6339894533157349, + "learning_rate": 1.1487176049388814e-05, + "loss": 1.052, + "step": 23750 + }, + { + "epoch": 0.8505738893763317, + "grad_norm": 1.7075979709625244, + "learning_rate": 1.1481779076832388e-05, + "loss": 1.07, + "step": 23751 + }, + { + "epoch": 0.8506097015059001, + "grad_norm": 1.5826267004013062, + "learning_rate": 1.1476383295174452e-05, + "loss": 1.1908, + "step": 23752 + }, + { + "epoch": 0.8506455136354684, + "grad_norm": 1.5961979627609253, + "learning_rate": 1.1470988704487607e-05, + "loss": 0.9438, + "step": 23753 + }, + { + "epoch": 0.8506813257650366, + "grad_norm": 1.595068097114563, + "learning_rate": 1.146559530484439e-05, + "loss": 0.9939, + "step": 23754 + }, + { + "epoch": 0.8507171378946049, + "grad_norm": 1.4952694177627563, + "learning_rate": 1.146020309631739e-05, + "loss": 1.2049, + "step": 23755 + }, + { + "epoch": 0.8507529500241732, + "grad_norm": 1.36769437789917, + "learning_rate": 1.145481207897915e-05, + "loss": 1.0225, + "step": 23756 + }, + { + "epoch": 0.8507887621537414, + "grad_norm": 1.402025580406189, + "learning_rate": 1.144942225290222e-05, + "loss": 1.1165, + "step": 23757 + }, + { + "epoch": 0.8508245742833097, + "grad_norm": 1.9867427349090576, + "learning_rate": 1.1444033618159068e-05, + "loss": 1.3836, + "step": 23758 + }, + { + "epoch": 0.8508603864128781, + "grad_norm": 1.3962528705596924, + "learning_rate": 1.143864617482222e-05, + "loss": 1.2457, + "step": 23759 + }, + { + "epoch": 0.8508961985424464, + "grad_norm": 1.483086109161377, + "learning_rate": 1.1433259922964146e-05, + "loss": 1.0005, + "step": 23760 + }, + { + "epoch": 0.8509320106720146, + "grad_norm": 2.37707781791687, + "learning_rate": 1.1427874862657339e-05, + "loss": 1.264, + "step": 23761 + }, + { + "epoch": 0.8509678228015829, + "grad_norm": 1.5797008275985718, + "learning_rate": 1.1422490993974199e-05, + "loss": 0.8506, + "step": 23762 + }, + { + "epoch": 0.8510036349311512, + "grad_norm": 1.6933318376541138, + "learning_rate": 1.1417108316987201e-05, + "loss": 1.0011, + "step": 23763 + }, + { + "epoch": 0.8510394470607194, + "grad_norm": 1.2575687170028687, + "learning_rate": 1.1411726831768754e-05, + "loss": 1.2149, + "step": 23764 + }, + { + "epoch": 0.8510752591902877, + "grad_norm": 1.5423023700714111, + "learning_rate": 1.1406346538391243e-05, + "loss": 1.0724, + "step": 23765 + }, + { + "epoch": 0.8511110713198561, + "grad_norm": 1.4307013750076294, + "learning_rate": 1.1400967436927056e-05, + "loss": 1.0009, + "step": 23766 + }, + { + "epoch": 0.8511468834494244, + "grad_norm": 1.4299445152282715, + "learning_rate": 1.1395589527448558e-05, + "loss": 1.0276, + "step": 23767 + }, + { + "epoch": 0.8511826955789926, + "grad_norm": 1.6788330078125, + "learning_rate": 1.1390212810028144e-05, + "loss": 1.4061, + "step": 23768 + }, + { + "epoch": 0.8512185077085609, + "grad_norm": 1.3924150466918945, + "learning_rate": 1.1384837284738114e-05, + "loss": 0.8876, + "step": 23769 + }, + { + "epoch": 0.8512543198381292, + "grad_norm": 1.3329744338989258, + "learning_rate": 1.1379462951650755e-05, + "loss": 1.083, + "step": 23770 + }, + { + "epoch": 0.8512901319676974, + "grad_norm": 1.4462777376174927, + "learning_rate": 1.137408981083845e-05, + "loss": 0.8225, + "step": 23771 + }, + { + "epoch": 0.8513259440972657, + "grad_norm": 1.978481650352478, + "learning_rate": 1.1368717862373424e-05, + "loss": 1.2206, + "step": 23772 + }, + { + "epoch": 0.8513617562268341, + "grad_norm": 1.459340214729309, + "learning_rate": 1.136334710632797e-05, + "loss": 0.9953, + "step": 23773 + }, + { + "epoch": 0.8513975683564023, + "grad_norm": 1.6709283590316772, + "learning_rate": 1.1357977542774356e-05, + "loss": 0.9507, + "step": 23774 + }, + { + "epoch": 0.8514333804859706, + "grad_norm": 1.6936888694763184, + "learning_rate": 1.1352609171784834e-05, + "loss": 1.0376, + "step": 23775 + }, + { + "epoch": 0.8514691926155389, + "grad_norm": 1.3116203546524048, + "learning_rate": 1.1347241993431578e-05, + "loss": 1.0716, + "step": 23776 + }, + { + "epoch": 0.8515050047451072, + "grad_norm": 1.6948368549346924, + "learning_rate": 1.1341876007786845e-05, + "loss": 1.1985, + "step": 23777 + }, + { + "epoch": 0.8515408168746754, + "grad_norm": 1.8966248035430908, + "learning_rate": 1.1336511214922819e-05, + "loss": 0.932, + "step": 23778 + }, + { + "epoch": 0.8515766290042437, + "grad_norm": 1.282055139541626, + "learning_rate": 1.1331147614911641e-05, + "loss": 1.1986, + "step": 23779 + }, + { + "epoch": 0.8516124411338121, + "grad_norm": 1.5720069408416748, + "learning_rate": 1.1325785207825524e-05, + "loss": 1.0705, + "step": 23780 + }, + { + "epoch": 0.8516482532633803, + "grad_norm": 1.4293731451034546, + "learning_rate": 1.132042399373654e-05, + "loss": 0.9945, + "step": 23781 + }, + { + "epoch": 0.8516840653929486, + "grad_norm": 1.5532335042953491, + "learning_rate": 1.131506397271691e-05, + "loss": 1.2981, + "step": 23782 + }, + { + "epoch": 0.8517198775225169, + "grad_norm": 1.5937530994415283, + "learning_rate": 1.1309705144838678e-05, + "loss": 1.1381, + "step": 23783 + }, + { + "epoch": 0.8517556896520851, + "grad_norm": 1.7247657775878906, + "learning_rate": 1.1304347510173963e-05, + "loss": 1.2004, + "step": 23784 + }, + { + "epoch": 0.8517915017816534, + "grad_norm": 1.3662402629852295, + "learning_rate": 1.129899106879484e-05, + "loss": 1.1217, + "step": 23785 + }, + { + "epoch": 0.8518273139112217, + "grad_norm": 1.2546581029891968, + "learning_rate": 1.1293635820773397e-05, + "loss": 0.9502, + "step": 23786 + }, + { + "epoch": 0.8518631260407901, + "grad_norm": 1.4231401681900024, + "learning_rate": 1.1288281766181651e-05, + "loss": 1.1831, + "step": 23787 + }, + { + "epoch": 0.8518989381703583, + "grad_norm": 1.5492258071899414, + "learning_rate": 1.1282928905091616e-05, + "loss": 0.964, + "step": 23788 + }, + { + "epoch": 0.8519347502999266, + "grad_norm": 1.275322437286377, + "learning_rate": 1.1277577237575377e-05, + "loss": 1.253, + "step": 23789 + }, + { + "epoch": 0.8519705624294949, + "grad_norm": 1.4524966478347778, + "learning_rate": 1.1272226763704863e-05, + "loss": 1.301, + "step": 23790 + }, + { + "epoch": 0.8520063745590631, + "grad_norm": 1.931268334388733, + "learning_rate": 1.1266877483552118e-05, + "loss": 1.0763, + "step": 23791 + }, + { + "epoch": 0.8520421866886314, + "grad_norm": 1.2864470481872559, + "learning_rate": 1.126152939718903e-05, + "loss": 1.1755, + "step": 23792 + }, + { + "epoch": 0.8520779988181997, + "grad_norm": 1.3266222476959229, + "learning_rate": 1.125618250468764e-05, + "loss": 1.1355, + "step": 23793 + }, + { + "epoch": 0.8521138109477681, + "grad_norm": 1.3761944770812988, + "learning_rate": 1.1250836806119824e-05, + "loss": 1.0574, + "step": 23794 + }, + { + "epoch": 0.8521496230773363, + "grad_norm": 1.7493482828140259, + "learning_rate": 1.1245492301557547e-05, + "loss": 1.0108, + "step": 23795 + }, + { + "epoch": 0.8521854352069046, + "grad_norm": 1.7362407445907593, + "learning_rate": 1.1240148991072662e-05, + "loss": 1.0354, + "step": 23796 + }, + { + "epoch": 0.8522212473364729, + "grad_norm": 1.5362495183944702, + "learning_rate": 1.123480687473708e-05, + "loss": 1.0905, + "step": 23797 + }, + { + "epoch": 0.8522570594660411, + "grad_norm": 1.525272250175476, + "learning_rate": 1.1229465952622686e-05, + "loss": 1.0718, + "step": 23798 + }, + { + "epoch": 0.8522928715956094, + "grad_norm": 1.5643184185028076, + "learning_rate": 1.122412622480129e-05, + "loss": 1.1034, + "step": 23799 + }, + { + "epoch": 0.8523286837251777, + "grad_norm": 1.5947922468185425, + "learning_rate": 1.1218787691344801e-05, + "loss": 1.1685, + "step": 23800 + }, + { + "epoch": 0.852364495854746, + "grad_norm": 2.0287859439849854, + "learning_rate": 1.1213450352324983e-05, + "loss": 1.3895, + "step": 23801 + }, + { + "epoch": 0.8524003079843143, + "grad_norm": 1.5014691352844238, + "learning_rate": 1.1208114207813691e-05, + "loss": 1.168, + "step": 23802 + }, + { + "epoch": 0.8524361201138826, + "grad_norm": 1.7517701387405396, + "learning_rate": 1.1202779257882645e-05, + "loss": 1.0301, + "step": 23803 + }, + { + "epoch": 0.8524719322434509, + "grad_norm": 1.5076462030410767, + "learning_rate": 1.1197445502603698e-05, + "loss": 1.1371, + "step": 23804 + }, + { + "epoch": 0.8525077443730191, + "grad_norm": 1.315124273300171, + "learning_rate": 1.1192112942048582e-05, + "loss": 0.9894, + "step": 23805 + }, + { + "epoch": 0.8525435565025874, + "grad_norm": 1.4413150548934937, + "learning_rate": 1.1186781576289007e-05, + "loss": 1.0558, + "step": 23806 + }, + { + "epoch": 0.8525793686321557, + "grad_norm": 1.6367636919021606, + "learning_rate": 1.1181451405396725e-05, + "loss": 1.1416, + "step": 23807 + }, + { + "epoch": 0.852615180761724, + "grad_norm": 1.3245633840560913, + "learning_rate": 1.1176122429443458e-05, + "loss": 0.9908, + "step": 23808 + }, + { + "epoch": 0.8526509928912923, + "grad_norm": 1.6310862302780151, + "learning_rate": 1.1170794648500893e-05, + "loss": 1.0224, + "step": 23809 + }, + { + "epoch": 0.8526868050208606, + "grad_norm": 1.4582798480987549, + "learning_rate": 1.116546806264067e-05, + "loss": 1.0943, + "step": 23810 + }, + { + "epoch": 0.8527226171504289, + "grad_norm": 1.5703939199447632, + "learning_rate": 1.1160142671934537e-05, + "loss": 1.0743, + "step": 23811 + }, + { + "epoch": 0.8527584292799971, + "grad_norm": 1.5752884149551392, + "learning_rate": 1.1154818476454054e-05, + "loss": 1.1111, + "step": 23812 + }, + { + "epoch": 0.8527942414095654, + "grad_norm": 1.3454384803771973, + "learning_rate": 1.114949547627091e-05, + "loss": 1.0894, + "step": 23813 + }, + { + "epoch": 0.8528300535391337, + "grad_norm": 1.7697035074234009, + "learning_rate": 1.1144173671456682e-05, + "loss": 1.0931, + "step": 23814 + }, + { + "epoch": 0.852865865668702, + "grad_norm": 2.2397286891937256, + "learning_rate": 1.1138853062082977e-05, + "loss": 1.1397, + "step": 23815 + }, + { + "epoch": 0.8529016777982703, + "grad_norm": 1.5481020212173462, + "learning_rate": 1.1133533648221405e-05, + "loss": 0.9235, + "step": 23816 + }, + { + "epoch": 0.8529374899278386, + "grad_norm": 1.5973870754241943, + "learning_rate": 1.1128215429943477e-05, + "loss": 1.054, + "step": 23817 + }, + { + "epoch": 0.8529733020574068, + "grad_norm": 1.658456802368164, + "learning_rate": 1.1122898407320791e-05, + "loss": 1.0782, + "step": 23818 + }, + { + "epoch": 0.8530091141869751, + "grad_norm": 1.6056474447250366, + "learning_rate": 1.1117582580424857e-05, + "loss": 0.9589, + "step": 23819 + }, + { + "epoch": 0.8530449263165434, + "grad_norm": 1.9316496849060059, + "learning_rate": 1.1112267949327216e-05, + "loss": 1.0851, + "step": 23820 + }, + { + "epoch": 0.8530807384461117, + "grad_norm": 1.8587819337844849, + "learning_rate": 1.1106954514099332e-05, + "loss": 1.0892, + "step": 23821 + }, + { + "epoch": 0.85311655057568, + "grad_norm": 1.603713035583496, + "learning_rate": 1.1101642274812706e-05, + "loss": 1.0857, + "step": 23822 + }, + { + "epoch": 0.8531523627052483, + "grad_norm": 1.6571252346038818, + "learning_rate": 1.1096331231538847e-05, + "loss": 1.0079, + "step": 23823 + }, + { + "epoch": 0.8531881748348166, + "grad_norm": 1.5214303731918335, + "learning_rate": 1.1091021384349143e-05, + "loss": 0.9595, + "step": 23824 + }, + { + "epoch": 0.8532239869643848, + "grad_norm": 1.7039790153503418, + "learning_rate": 1.1085712733315068e-05, + "loss": 1.0981, + "step": 23825 + }, + { + "epoch": 0.8532597990939531, + "grad_norm": 1.4505997896194458, + "learning_rate": 1.1080405278508033e-05, + "loss": 1.1479, + "step": 23826 + }, + { + "epoch": 0.8532956112235214, + "grad_norm": 1.7869317531585693, + "learning_rate": 1.1075099019999468e-05, + "loss": 1.2047, + "step": 23827 + }, + { + "epoch": 0.8533314233530896, + "grad_norm": 1.8141839504241943, + "learning_rate": 1.106979395786072e-05, + "loss": 1.321, + "step": 23828 + }, + { + "epoch": 0.853367235482658, + "grad_norm": 1.4668186902999878, + "learning_rate": 1.1064490092163181e-05, + "loss": 1.0887, + "step": 23829 + }, + { + "epoch": 0.8534030476122263, + "grad_norm": 1.3552296161651611, + "learning_rate": 1.1059187422978211e-05, + "loss": 1.1499, + "step": 23830 + }, + { + "epoch": 0.8534388597417946, + "grad_norm": 1.4626717567443848, + "learning_rate": 1.1053885950377174e-05, + "loss": 1.0465, + "step": 23831 + }, + { + "epoch": 0.8534746718713628, + "grad_norm": 1.733862042427063, + "learning_rate": 1.1048585674431345e-05, + "loss": 1.2249, + "step": 23832 + }, + { + "epoch": 0.8535104840009311, + "grad_norm": 1.6961774826049805, + "learning_rate": 1.1043286595212054e-05, + "loss": 1.1147, + "step": 23833 + }, + { + "epoch": 0.8535462961304994, + "grad_norm": 1.5135798454284668, + "learning_rate": 1.1037988712790626e-05, + "loss": 1.0657, + "step": 23834 + }, + { + "epoch": 0.8535821082600676, + "grad_norm": 1.3254097700119019, + "learning_rate": 1.1032692027238279e-05, + "loss": 1.0976, + "step": 23835 + }, + { + "epoch": 0.853617920389636, + "grad_norm": 1.3523741960525513, + "learning_rate": 1.10273965386263e-05, + "loss": 1.1028, + "step": 23836 + }, + { + "epoch": 0.8536537325192043, + "grad_norm": 1.350998878479004, + "learning_rate": 1.1022102247025934e-05, + "loss": 1.1707, + "step": 23837 + }, + { + "epoch": 0.8536895446487726, + "grad_norm": 1.1938176155090332, + "learning_rate": 1.1016809152508434e-05, + "loss": 1.1222, + "step": 23838 + }, + { + "epoch": 0.8537253567783408, + "grad_norm": 1.1166260242462158, + "learning_rate": 1.1011517255144965e-05, + "loss": 0.9997, + "step": 23839 + }, + { + "epoch": 0.8537611689079091, + "grad_norm": 1.8444955348968506, + "learning_rate": 1.1006226555006749e-05, + "loss": 0.9314, + "step": 23840 + }, + { + "epoch": 0.8537969810374774, + "grad_norm": 1.2607604265213013, + "learning_rate": 1.1000937052164973e-05, + "loss": 0.883, + "step": 23841 + }, + { + "epoch": 0.8538327931670456, + "grad_norm": 1.7090837955474854, + "learning_rate": 1.0995648746690768e-05, + "loss": 0.9005, + "step": 23842 + }, + { + "epoch": 0.853868605296614, + "grad_norm": 1.3513872623443604, + "learning_rate": 1.0990361638655311e-05, + "loss": 1.0691, + "step": 23843 + }, + { + "epoch": 0.8539044174261823, + "grad_norm": 1.4559837579727173, + "learning_rate": 1.0985075728129712e-05, + "loss": 1.0268, + "step": 23844 + }, + { + "epoch": 0.8539402295557506, + "grad_norm": 1.8153527975082397, + "learning_rate": 1.0979791015185125e-05, + "loss": 1.1198, + "step": 23845 + }, + { + "epoch": 0.8539760416853188, + "grad_norm": 1.4445569515228271, + "learning_rate": 1.0974507499892605e-05, + "loss": 1.0649, + "step": 23846 + }, + { + "epoch": 0.8540118538148871, + "grad_norm": 1.4971840381622314, + "learning_rate": 1.0969225182323239e-05, + "loss": 1.084, + "step": 23847 + }, + { + "epoch": 0.8540476659444554, + "grad_norm": 1.4288641214370728, + "learning_rate": 1.0963944062548125e-05, + "loss": 1.1269, + "step": 23848 + }, + { + "epoch": 0.8540834780740236, + "grad_norm": 1.453650712966919, + "learning_rate": 1.0958664140638297e-05, + "loss": 1.0833, + "step": 23849 + }, + { + "epoch": 0.854119290203592, + "grad_norm": 1.1979142427444458, + "learning_rate": 1.0953385416664785e-05, + "loss": 0.7947, + "step": 23850 + }, + { + "epoch": 0.8541551023331603, + "grad_norm": 1.723349690437317, + "learning_rate": 1.09481078906986e-05, + "loss": 1.1166, + "step": 23851 + }, + { + "epoch": 0.8541909144627285, + "grad_norm": 1.4790898561477661, + "learning_rate": 1.0942831562810774e-05, + "loss": 1.0666, + "step": 23852 + }, + { + "epoch": 0.8542267265922968, + "grad_norm": 1.4797966480255127, + "learning_rate": 1.093755643307226e-05, + "loss": 1.0654, + "step": 23853 + }, + { + "epoch": 0.8542625387218651, + "grad_norm": 2.4151535034179688, + "learning_rate": 1.0932282501554037e-05, + "loss": 1.0254, + "step": 23854 + }, + { + "epoch": 0.8542983508514334, + "grad_norm": 1.3190481662750244, + "learning_rate": 1.0927009768327068e-05, + "loss": 1.0055, + "step": 23855 + }, + { + "epoch": 0.8543341629810016, + "grad_norm": 1.5212846994400024, + "learning_rate": 1.0921738233462297e-05, + "loss": 1.0881, + "step": 23856 + }, + { + "epoch": 0.85436997511057, + "grad_norm": 1.4252572059631348, + "learning_rate": 1.0916467897030625e-05, + "loss": 0.8917, + "step": 23857 + }, + { + "epoch": 0.8544057872401383, + "grad_norm": 1.3717480897903442, + "learning_rate": 1.091119875910297e-05, + "loss": 0.9742, + "step": 23858 + }, + { + "epoch": 0.8544415993697065, + "grad_norm": 1.34640634059906, + "learning_rate": 1.0905930819750232e-05, + "loss": 0.8308, + "step": 23859 + }, + { + "epoch": 0.8544774114992748, + "grad_norm": 1.653994083404541, + "learning_rate": 1.0900664079043255e-05, + "loss": 0.9559, + "step": 23860 + }, + { + "epoch": 0.8545132236288431, + "grad_norm": 1.657479166984558, + "learning_rate": 1.0895398537052914e-05, + "loss": 0.8938, + "step": 23861 + }, + { + "epoch": 0.8545490357584113, + "grad_norm": 1.9081867933273315, + "learning_rate": 1.0890134193850043e-05, + "loss": 1.0903, + "step": 23862 + }, + { + "epoch": 0.8545848478879796, + "grad_norm": 1.4257724285125732, + "learning_rate": 1.0884871049505507e-05, + "loss": 1.1243, + "step": 23863 + }, + { + "epoch": 0.854620660017548, + "grad_norm": 1.5050956010818481, + "learning_rate": 1.0879609104090049e-05, + "loss": 1.2062, + "step": 23864 + }, + { + "epoch": 0.8546564721471163, + "grad_norm": 1.1812200546264648, + "learning_rate": 1.0874348357674492e-05, + "loss": 0.9217, + "step": 23865 + }, + { + "epoch": 0.8546922842766845, + "grad_norm": 1.6901674270629883, + "learning_rate": 1.0869088810329642e-05, + "loss": 1.0355, + "step": 23866 + }, + { + "epoch": 0.8547280964062528, + "grad_norm": 1.1104077100753784, + "learning_rate": 1.0863830462126202e-05, + "loss": 1.042, + "step": 23867 + }, + { + "epoch": 0.8547639085358211, + "grad_norm": 1.1653623580932617, + "learning_rate": 1.085857331313498e-05, + "loss": 1.1015, + "step": 23868 + }, + { + "epoch": 0.8547997206653893, + "grad_norm": 1.334567666053772, + "learning_rate": 1.0853317363426618e-05, + "loss": 1.0382, + "step": 23869 + }, + { + "epoch": 0.8548355327949576, + "grad_norm": 1.6283513307571411, + "learning_rate": 1.0848062613071918e-05, + "loss": 1.2649, + "step": 23870 + }, + { + "epoch": 0.854871344924526, + "grad_norm": 1.2849074602127075, + "learning_rate": 1.0842809062141524e-05, + "loss": 1.1533, + "step": 23871 + }, + { + "epoch": 0.8549071570540943, + "grad_norm": 1.532746434211731, + "learning_rate": 1.083755671070613e-05, + "loss": 1.1919, + "step": 23872 + }, + { + "epoch": 0.8549429691836625, + "grad_norm": 1.2592730522155762, + "learning_rate": 1.0832305558836397e-05, + "loss": 0.9947, + "step": 23873 + }, + { + "epoch": 0.8549787813132308, + "grad_norm": 2.5089187622070312, + "learning_rate": 1.0827055606602998e-05, + "loss": 1.2763, + "step": 23874 + }, + { + "epoch": 0.8550145934427991, + "grad_norm": 1.4822430610656738, + "learning_rate": 1.0821806854076533e-05, + "loss": 1.184, + "step": 23875 + }, + { + "epoch": 0.8550504055723673, + "grad_norm": 1.725411295890808, + "learning_rate": 1.0816559301327589e-05, + "loss": 1.0529, + "step": 23876 + }, + { + "epoch": 0.8550862177019356, + "grad_norm": 1.4963206052780151, + "learning_rate": 1.0811312948426844e-05, + "loss": 1.1262, + "step": 23877 + }, + { + "epoch": 0.855122029831504, + "grad_norm": 1.4341950416564941, + "learning_rate": 1.0806067795444818e-05, + "loss": 1.2285, + "step": 23878 + }, + { + "epoch": 0.8551578419610723, + "grad_norm": 1.4204448461532593, + "learning_rate": 1.0800823842452113e-05, + "loss": 1.0777, + "step": 23879 + }, + { + "epoch": 0.8551936540906405, + "grad_norm": 2.1638712882995605, + "learning_rate": 1.0795581089519236e-05, + "loss": 1.3132, + "step": 23880 + }, + { + "epoch": 0.8552294662202088, + "grad_norm": 1.8180506229400635, + "learning_rate": 1.0790339536716776e-05, + "loss": 1.005, + "step": 23881 + }, + { + "epoch": 0.8552652783497771, + "grad_norm": 1.8687461614608765, + "learning_rate": 1.078509918411521e-05, + "loss": 0.9256, + "step": 23882 + }, + { + "epoch": 0.8553010904793453, + "grad_norm": 1.5172468423843384, + "learning_rate": 1.0779860031785061e-05, + "loss": 1.1305, + "step": 23883 + }, + { + "epoch": 0.8553369026089136, + "grad_norm": 1.3916826248168945, + "learning_rate": 1.0774622079796826e-05, + "loss": 1.2051, + "step": 23884 + }, + { + "epoch": 0.855372714738482, + "grad_norm": 1.7819968461990356, + "learning_rate": 1.0769385328220938e-05, + "loss": 0.9261, + "step": 23885 + }, + { + "epoch": 0.8554085268680502, + "grad_norm": 1.5630970001220703, + "learning_rate": 1.0764149777127897e-05, + "loss": 1.0999, + "step": 23886 + }, + { + "epoch": 0.8554443389976185, + "grad_norm": 1.6602089405059814, + "learning_rate": 1.0758915426588068e-05, + "loss": 1.1098, + "step": 23887 + }, + { + "epoch": 0.8554801511271868, + "grad_norm": 1.4314181804656982, + "learning_rate": 1.0753682276671961e-05, + "loss": 0.9588, + "step": 23888 + }, + { + "epoch": 0.855515963256755, + "grad_norm": 1.6326342821121216, + "learning_rate": 1.074845032744991e-05, + "loss": 0.9753, + "step": 23889 + }, + { + "epoch": 0.8555517753863233, + "grad_norm": 1.6445133686065674, + "learning_rate": 1.0743219578992369e-05, + "loss": 1.0698, + "step": 23890 + }, + { + "epoch": 0.8555875875158916, + "grad_norm": 1.5069329738616943, + "learning_rate": 1.0737990031369627e-05, + "loss": 1.0919, + "step": 23891 + }, + { + "epoch": 0.85562339964546, + "grad_norm": 1.805224895477295, + "learning_rate": 1.0732761684652127e-05, + "loss": 1.0097, + "step": 23892 + }, + { + "epoch": 0.8556592117750282, + "grad_norm": 1.4136221408843994, + "learning_rate": 1.0727534538910177e-05, + "loss": 1.2698, + "step": 23893 + }, + { + "epoch": 0.8556950239045965, + "grad_norm": 1.2691210508346558, + "learning_rate": 1.0722308594214081e-05, + "loss": 1.0424, + "step": 23894 + }, + { + "epoch": 0.8557308360341648, + "grad_norm": 1.3880813121795654, + "learning_rate": 1.0717083850634158e-05, + "loss": 1.0286, + "step": 23895 + }, + { + "epoch": 0.855766648163733, + "grad_norm": 1.5678709745407104, + "learning_rate": 1.0711860308240706e-05, + "loss": 1.1558, + "step": 23896 + }, + { + "epoch": 0.8558024602933013, + "grad_norm": 1.6730502843856812, + "learning_rate": 1.0706637967104016e-05, + "loss": 1.0661, + "step": 23897 + }, + { + "epoch": 0.8558382724228696, + "grad_norm": 1.4107666015625, + "learning_rate": 1.0701416827294297e-05, + "loss": 1.0509, + "step": 23898 + }, + { + "epoch": 0.855874084552438, + "grad_norm": 1.3851633071899414, + "learning_rate": 1.069619688888187e-05, + "loss": 1.0558, + "step": 23899 + }, + { + "epoch": 0.8559098966820062, + "grad_norm": 1.6844488382339478, + "learning_rate": 1.0690978151936892e-05, + "loss": 1.1677, + "step": 23900 + }, + { + "epoch": 0.8559457088115745, + "grad_norm": 1.7232751846313477, + "learning_rate": 1.0685760616529628e-05, + "loss": 1.1425, + "step": 23901 + }, + { + "epoch": 0.8559815209411428, + "grad_norm": 1.5414741039276123, + "learning_rate": 1.068054428273022e-05, + "loss": 1.0787, + "step": 23902 + }, + { + "epoch": 0.856017333070711, + "grad_norm": 1.4109872579574585, + "learning_rate": 1.0675329150608892e-05, + "loss": 1.0761, + "step": 23903 + }, + { + "epoch": 0.8560531452002793, + "grad_norm": 1.385194182395935, + "learning_rate": 1.0670115220235799e-05, + "loss": 0.945, + "step": 23904 + }, + { + "epoch": 0.8560889573298476, + "grad_norm": 1.405292272567749, + "learning_rate": 1.0664902491681051e-05, + "loss": 1.1945, + "step": 23905 + }, + { + "epoch": 0.856124769459416, + "grad_norm": 2.252455949783325, + "learning_rate": 1.0659690965014813e-05, + "loss": 1.0633, + "step": 23906 + }, + { + "epoch": 0.8561605815889842, + "grad_norm": 1.3769285678863525, + "learning_rate": 1.0654480640307195e-05, + "loss": 1.1441, + "step": 23907 + }, + { + "epoch": 0.8561963937185525, + "grad_norm": 1.635262131690979, + "learning_rate": 1.0649271517628313e-05, + "loss": 1.1539, + "step": 23908 + }, + { + "epoch": 0.8562322058481208, + "grad_norm": 1.440936803817749, + "learning_rate": 1.0644063597048182e-05, + "loss": 1.0452, + "step": 23909 + }, + { + "epoch": 0.856268017977689, + "grad_norm": 1.628374457359314, + "learning_rate": 1.063885687863696e-05, + "loss": 0.9002, + "step": 23910 + }, + { + "epoch": 0.8563038301072573, + "grad_norm": 1.3852195739746094, + "learning_rate": 1.0633651362464647e-05, + "loss": 1.0397, + "step": 23911 + }, + { + "epoch": 0.8563396422368256, + "grad_norm": 1.3576924800872803, + "learning_rate": 1.0628447048601265e-05, + "loss": 1.2956, + "step": 23912 + }, + { + "epoch": 0.856375454366394, + "grad_norm": 1.4979124069213867, + "learning_rate": 1.0623243937116845e-05, + "loss": 0.9624, + "step": 23913 + }, + { + "epoch": 0.8564112664959622, + "grad_norm": 1.217299222946167, + "learning_rate": 1.06180420280814e-05, + "loss": 1.1504, + "step": 23914 + }, + { + "epoch": 0.8564470786255305, + "grad_norm": 1.48603093624115, + "learning_rate": 1.0612841321564915e-05, + "loss": 0.9606, + "step": 23915 + }, + { + "epoch": 0.8564828907550988, + "grad_norm": 1.6100174188613892, + "learning_rate": 1.0607641817637326e-05, + "loss": 1.0791, + "step": 23916 + }, + { + "epoch": 0.856518702884667, + "grad_norm": 1.7028714418411255, + "learning_rate": 1.060244351636861e-05, + "loss": 0.9316, + "step": 23917 + }, + { + "epoch": 0.8565545150142353, + "grad_norm": 1.339950442314148, + "learning_rate": 1.0597246417828698e-05, + "loss": 0.9688, + "step": 23918 + }, + { + "epoch": 0.8565903271438036, + "grad_norm": 2.0653796195983887, + "learning_rate": 1.0592050522087549e-05, + "loss": 1.0636, + "step": 23919 + }, + { + "epoch": 0.856626139273372, + "grad_norm": 1.3524945974349976, + "learning_rate": 1.0586855829215003e-05, + "loss": 0.9603, + "step": 23920 + }, + { + "epoch": 0.8566619514029402, + "grad_norm": 1.484632968902588, + "learning_rate": 1.0581662339280973e-05, + "loss": 1.15, + "step": 23921 + }, + { + "epoch": 0.8566977635325085, + "grad_norm": 1.4769814014434814, + "learning_rate": 1.0576470052355358e-05, + "loss": 1.0055, + "step": 23922 + }, + { + "epoch": 0.8567335756620768, + "grad_norm": 1.1651744842529297, + "learning_rate": 1.057127896850797e-05, + "loss": 0.815, + "step": 23923 + }, + { + "epoch": 0.856769387791645, + "grad_norm": 1.5803759098052979, + "learning_rate": 1.0566089087808672e-05, + "loss": 1.0045, + "step": 23924 + }, + { + "epoch": 0.8568051999212133, + "grad_norm": 1.401100516319275, + "learning_rate": 1.056090041032729e-05, + "loss": 1.0958, + "step": 23925 + }, + { + "epoch": 0.8568410120507816, + "grad_norm": 1.9684147834777832, + "learning_rate": 1.0555712936133633e-05, + "loss": 1.0406, + "step": 23926 + }, + { + "epoch": 0.8568768241803499, + "grad_norm": 1.723109245300293, + "learning_rate": 1.0550526665297466e-05, + "loss": 1.0441, + "step": 23927 + }, + { + "epoch": 0.8569126363099182, + "grad_norm": 2.0506656169891357, + "learning_rate": 1.0545341597888581e-05, + "loss": 0.9835, + "step": 23928 + }, + { + "epoch": 0.8569484484394865, + "grad_norm": 1.7659637928009033, + "learning_rate": 1.0540157733976763e-05, + "loss": 1.1783, + "step": 23929 + }, + { + "epoch": 0.8569842605690547, + "grad_norm": 1.4943320751190186, + "learning_rate": 1.0534975073631703e-05, + "loss": 1.0243, + "step": 23930 + }, + { + "epoch": 0.857020072698623, + "grad_norm": 1.7109819650650024, + "learning_rate": 1.0529793616923157e-05, + "loss": 1.1623, + "step": 23931 + }, + { + "epoch": 0.8570558848281913, + "grad_norm": 1.3377329111099243, + "learning_rate": 1.052461336392082e-05, + "loss": 0.7388, + "step": 23932 + }, + { + "epoch": 0.8570916969577596, + "grad_norm": 1.194085717201233, + "learning_rate": 1.0519434314694422e-05, + "loss": 1.0721, + "step": 23933 + }, + { + "epoch": 0.8571275090873279, + "grad_norm": 1.2308834791183472, + "learning_rate": 1.0514256469313588e-05, + "loss": 1.1526, + "step": 23934 + }, + { + "epoch": 0.8571633212168962, + "grad_norm": 1.643069863319397, + "learning_rate": 1.0509079827848012e-05, + "loss": 1.1665, + "step": 23935 + }, + { + "epoch": 0.8571991333464645, + "grad_norm": 1.3138447999954224, + "learning_rate": 1.0503904390367325e-05, + "loss": 1.0281, + "step": 23936 + }, + { + "epoch": 0.8572349454760327, + "grad_norm": 1.7924041748046875, + "learning_rate": 1.0498730156941184e-05, + "loss": 1.2622, + "step": 23937 + }, + { + "epoch": 0.857270757605601, + "grad_norm": 1.4470888376235962, + "learning_rate": 1.0493557127639164e-05, + "loss": 1.0119, + "step": 23938 + }, + { + "epoch": 0.8573065697351693, + "grad_norm": 1.4742751121520996, + "learning_rate": 1.0488385302530878e-05, + "loss": 0.9409, + "step": 23939 + }, + { + "epoch": 0.8573423818647375, + "grad_norm": 1.4617366790771484, + "learning_rate": 1.0483214681685927e-05, + "loss": 1.2713, + "step": 23940 + }, + { + "epoch": 0.8573781939943059, + "grad_norm": 2.1833274364471436, + "learning_rate": 1.047804526517383e-05, + "loss": 1.0334, + "step": 23941 + }, + { + "epoch": 0.8574140061238742, + "grad_norm": 1.4928514957427979, + "learning_rate": 1.0472877053064156e-05, + "loss": 1.0968, + "step": 23942 + }, + { + "epoch": 0.8574498182534425, + "grad_norm": 1.4585973024368286, + "learning_rate": 1.0467710045426449e-05, + "loss": 1.2407, + "step": 23943 + }, + { + "epoch": 0.8574856303830107, + "grad_norm": 1.3833211660385132, + "learning_rate": 1.046254424233023e-05, + "loss": 0.9184, + "step": 23944 + }, + { + "epoch": 0.857521442512579, + "grad_norm": 1.5349804162979126, + "learning_rate": 1.0457379643844966e-05, + "loss": 1.1096, + "step": 23945 + }, + { + "epoch": 0.8575572546421473, + "grad_norm": 1.6119716167449951, + "learning_rate": 1.0452216250040148e-05, + "loss": 1.1726, + "step": 23946 + }, + { + "epoch": 0.8575930667717155, + "grad_norm": 1.551888108253479, + "learning_rate": 1.0447054060985284e-05, + "loss": 1.1002, + "step": 23947 + }, + { + "epoch": 0.8576288789012839, + "grad_norm": 1.3758444786071777, + "learning_rate": 1.0441893076749765e-05, + "loss": 1.184, + "step": 23948 + }, + { + "epoch": 0.8576646910308522, + "grad_norm": 1.3826457262039185, + "learning_rate": 1.0436733297403056e-05, + "loss": 0.8509, + "step": 23949 + }, + { + "epoch": 0.8577005031604205, + "grad_norm": 1.298732876777649, + "learning_rate": 1.043157472301457e-05, + "loss": 1.0274, + "step": 23950 + }, + { + "epoch": 0.8577363152899887, + "grad_norm": 1.4425290822982788, + "learning_rate": 1.0426417353653739e-05, + "loss": 0.8962, + "step": 23951 + }, + { + "epoch": 0.857772127419557, + "grad_norm": 1.90996515750885, + "learning_rate": 1.0421261189389885e-05, + "loss": 1.0272, + "step": 23952 + }, + { + "epoch": 0.8578079395491253, + "grad_norm": 1.8785032033920288, + "learning_rate": 1.0416106230292432e-05, + "loss": 1.1506, + "step": 23953 + }, + { + "epoch": 0.8578437516786935, + "grad_norm": 1.3899084329605103, + "learning_rate": 1.0410952476430703e-05, + "loss": 1.1235, + "step": 23954 + }, + { + "epoch": 0.8578795638082619, + "grad_norm": 1.7392776012420654, + "learning_rate": 1.0405799927874072e-05, + "loss": 1.1035, + "step": 23955 + }, + { + "epoch": 0.8579153759378302, + "grad_norm": 1.32732355594635, + "learning_rate": 1.0400648584691808e-05, + "loss": 1.111, + "step": 23956 + }, + { + "epoch": 0.8579511880673985, + "grad_norm": 1.5333579778671265, + "learning_rate": 1.0395498446953245e-05, + "loss": 1.0782, + "step": 23957 + }, + { + "epoch": 0.8579870001969667, + "grad_norm": 1.4810575246810913, + "learning_rate": 1.0390349514727694e-05, + "loss": 0.9581, + "step": 23958 + }, + { + "epoch": 0.858022812326535, + "grad_norm": 1.427073359489441, + "learning_rate": 1.0385201788084375e-05, + "loss": 1.1983, + "step": 23959 + }, + { + "epoch": 0.8580586244561033, + "grad_norm": 1.3249515295028687, + "learning_rate": 1.0380055267092581e-05, + "loss": 1.3461, + "step": 23960 + }, + { + "epoch": 0.8580944365856715, + "grad_norm": 1.5047345161437988, + "learning_rate": 1.0374909951821532e-05, + "loss": 1.0188, + "step": 23961 + }, + { + "epoch": 0.8581302487152399, + "grad_norm": 1.493790626525879, + "learning_rate": 1.0369765842340484e-05, + "loss": 1.0351, + "step": 23962 + }, + { + "epoch": 0.8581660608448082, + "grad_norm": 1.3870294094085693, + "learning_rate": 1.0364622938718627e-05, + "loss": 0.9792, + "step": 23963 + }, + { + "epoch": 0.8582018729743764, + "grad_norm": 1.2261254787445068, + "learning_rate": 1.0359481241025105e-05, + "loss": 1.0102, + "step": 23964 + }, + { + "epoch": 0.8582376851039447, + "grad_norm": 1.5579508543014526, + "learning_rate": 1.0354340749329172e-05, + "loss": 0.8696, + "step": 23965 + }, + { + "epoch": 0.858273497233513, + "grad_norm": 1.5142583847045898, + "learning_rate": 1.0349201463699932e-05, + "loss": 1.0899, + "step": 23966 + }, + { + "epoch": 0.8583093093630813, + "grad_norm": 1.2209070920944214, + "learning_rate": 1.0344063384206537e-05, + "loss": 0.9997, + "step": 23967 + }, + { + "epoch": 0.8583451214926495, + "grad_norm": 1.7188359498977661, + "learning_rate": 1.0338926510918134e-05, + "loss": 1.4418, + "step": 23968 + }, + { + "epoch": 0.8583809336222178, + "grad_norm": 1.4072871208190918, + "learning_rate": 1.0333790843903835e-05, + "loss": 0.965, + "step": 23969 + }, + { + "epoch": 0.8584167457517862, + "grad_norm": 1.473140835762024, + "learning_rate": 1.0328656383232692e-05, + "loss": 0.9782, + "step": 23970 + }, + { + "epoch": 0.8584525578813544, + "grad_norm": 1.3162028789520264, + "learning_rate": 1.0323523128973822e-05, + "loss": 1.0037, + "step": 23971 + }, + { + "epoch": 0.8584883700109227, + "grad_norm": 1.5313256978988647, + "learning_rate": 1.0318391081196288e-05, + "loss": 1.0913, + "step": 23972 + }, + { + "epoch": 0.858524182140491, + "grad_norm": 1.376757264137268, + "learning_rate": 1.0313260239969102e-05, + "loss": 1.0611, + "step": 23973 + }, + { + "epoch": 0.8585599942700592, + "grad_norm": 1.678606629371643, + "learning_rate": 1.0308130605361333e-05, + "loss": 1.0946, + "step": 23974 + }, + { + "epoch": 0.8585958063996275, + "grad_norm": 1.4306206703186035, + "learning_rate": 1.0303002177441934e-05, + "loss": 0.9921, + "step": 23975 + }, + { + "epoch": 0.8586316185291958, + "grad_norm": 1.5072098970413208, + "learning_rate": 1.0297874956279974e-05, + "loss": 1.0293, + "step": 23976 + }, + { + "epoch": 0.8586674306587642, + "grad_norm": 1.1880143880844116, + "learning_rate": 1.0292748941944385e-05, + "loss": 0.9047, + "step": 23977 + }, + { + "epoch": 0.8587032427883324, + "grad_norm": 1.475826382637024, + "learning_rate": 1.0287624134504158e-05, + "loss": 0.9388, + "step": 23978 + }, + { + "epoch": 0.8587390549179007, + "grad_norm": 1.528542160987854, + "learning_rate": 1.0282500534028195e-05, + "loss": 1.1828, + "step": 23979 + }, + { + "epoch": 0.858774867047469, + "grad_norm": 1.6472015380859375, + "learning_rate": 1.0277378140585491e-05, + "loss": 1.0966, + "step": 23980 + }, + { + "epoch": 0.8588106791770372, + "grad_norm": 1.2708940505981445, + "learning_rate": 1.0272256954244941e-05, + "loss": 1.1637, + "step": 23981 + }, + { + "epoch": 0.8588464913066055, + "grad_norm": 1.5250375270843506, + "learning_rate": 1.0267136975075386e-05, + "loss": 1.0302, + "step": 23982 + }, + { + "epoch": 0.8588823034361738, + "grad_norm": 1.5313262939453125, + "learning_rate": 1.0262018203145796e-05, + "loss": 1.0045, + "step": 23983 + }, + { + "epoch": 0.8589181155657422, + "grad_norm": 1.412479281425476, + "learning_rate": 1.0256900638524979e-05, + "loss": 1.2436, + "step": 23984 + }, + { + "epoch": 0.8589539276953104, + "grad_norm": 1.269407868385315, + "learning_rate": 1.0251784281281829e-05, + "loss": 1.0285, + "step": 23985 + }, + { + "epoch": 0.8589897398248787, + "grad_norm": 1.9349199533462524, + "learning_rate": 1.0246669131485109e-05, + "loss": 1.1444, + "step": 23986 + }, + { + "epoch": 0.859025551954447, + "grad_norm": 1.576823353767395, + "learning_rate": 1.0241555189203722e-05, + "loss": 0.9719, + "step": 23987 + }, + { + "epoch": 0.8590613640840152, + "grad_norm": 1.2709423303604126, + "learning_rate": 1.0236442454506411e-05, + "loss": 1.008, + "step": 23988 + }, + { + "epoch": 0.8590971762135835, + "grad_norm": 1.7730530500411987, + "learning_rate": 1.0231330927462002e-05, + "loss": 1.0229, + "step": 23989 + }, + { + "epoch": 0.8591329883431518, + "grad_norm": 1.578791618347168, + "learning_rate": 1.0226220608139214e-05, + "loss": 1.2064, + "step": 23990 + }, + { + "epoch": 0.8591688004727202, + "grad_norm": 1.6139039993286133, + "learning_rate": 1.022111149660684e-05, + "loss": 1.3196, + "step": 23991 + }, + { + "epoch": 0.8592046126022884, + "grad_norm": 1.3554794788360596, + "learning_rate": 1.021600359293361e-05, + "loss": 0.9989, + "step": 23992 + }, + { + "epoch": 0.8592404247318567, + "grad_norm": 1.376258373260498, + "learning_rate": 1.0210896897188216e-05, + "loss": 1.0921, + "step": 23993 + }, + { + "epoch": 0.859276236861425, + "grad_norm": 2.9546010494232178, + "learning_rate": 1.0205791409439413e-05, + "loss": 1.0725, + "step": 23994 + }, + { + "epoch": 0.8593120489909932, + "grad_norm": 1.2787591218948364, + "learning_rate": 1.0200687129755837e-05, + "loss": 0.8609, + "step": 23995 + }, + { + "epoch": 0.8593478611205615, + "grad_norm": 1.3151228427886963, + "learning_rate": 1.0195584058206209e-05, + "loss": 0.887, + "step": 23996 + }, + { + "epoch": 0.8593836732501298, + "grad_norm": 1.4460337162017822, + "learning_rate": 1.0190482194859119e-05, + "loss": 0.9771, + "step": 23997 + }, + { + "epoch": 0.8594194853796981, + "grad_norm": 1.5018352270126343, + "learning_rate": 1.018538153978329e-05, + "loss": 1.1173, + "step": 23998 + }, + { + "epoch": 0.8594552975092664, + "grad_norm": 1.3104267120361328, + "learning_rate": 1.0180282093047288e-05, + "loss": 0.8002, + "step": 23999 + }, + { + "epoch": 0.8594911096388347, + "grad_norm": 1.3778376579284668, + "learning_rate": 1.0175183854719716e-05, + "loss": 1.2012, + "step": 24000 + }, + { + "epoch": 0.859526921768403, + "grad_norm": 1.4013688564300537, + "learning_rate": 1.0170086824869184e-05, + "loss": 1.0595, + "step": 24001 + }, + { + "epoch": 0.8595627338979712, + "grad_norm": 1.3465466499328613, + "learning_rate": 1.0164991003564261e-05, + "loss": 0.9512, + "step": 24002 + }, + { + "epoch": 0.8595985460275395, + "grad_norm": 1.8205479383468628, + "learning_rate": 1.0159896390873524e-05, + "loss": 1.1265, + "step": 24003 + }, + { + "epoch": 0.8596343581571078, + "grad_norm": 1.4426510334014893, + "learning_rate": 1.0154802986865475e-05, + "loss": 1.1714, + "step": 24004 + }, + { + "epoch": 0.8596701702866761, + "grad_norm": 1.35122811794281, + "learning_rate": 1.0149710791608657e-05, + "loss": 1.0388, + "step": 24005 + }, + { + "epoch": 0.8597059824162444, + "grad_norm": 1.3115196228027344, + "learning_rate": 1.0144619805171584e-05, + "loss": 1.0015, + "step": 24006 + }, + { + "epoch": 0.8597417945458127, + "grad_norm": 1.3358911275863647, + "learning_rate": 1.0139530027622768e-05, + "loss": 0.9851, + "step": 24007 + }, + { + "epoch": 0.859777606675381, + "grad_norm": 1.844347357749939, + "learning_rate": 1.0134441459030642e-05, + "loss": 1.0649, + "step": 24008 + }, + { + "epoch": 0.8598134188049492, + "grad_norm": 1.3047956228256226, + "learning_rate": 1.0129354099463683e-05, + "loss": 1.1378, + "step": 24009 + }, + { + "epoch": 0.8598492309345175, + "grad_norm": 1.8410142660140991, + "learning_rate": 1.0124267948990363e-05, + "loss": 1.251, + "step": 24010 + }, + { + "epoch": 0.8598850430640858, + "grad_norm": 1.2340433597564697, + "learning_rate": 1.0119183007679067e-05, + "loss": 1.0236, + "step": 24011 + }, + { + "epoch": 0.8599208551936541, + "grad_norm": 1.476151704788208, + "learning_rate": 1.0114099275598232e-05, + "loss": 1.1398, + "step": 24012 + }, + { + "epoch": 0.8599566673232224, + "grad_norm": 1.7415802478790283, + "learning_rate": 1.0109016752816247e-05, + "loss": 1.0613, + "step": 24013 + }, + { + "epoch": 0.8599924794527907, + "grad_norm": 1.8740978240966797, + "learning_rate": 1.0103935439401502e-05, + "loss": 0.9983, + "step": 24014 + }, + { + "epoch": 0.8600282915823589, + "grad_norm": 1.723813533782959, + "learning_rate": 1.0098855335422331e-05, + "loss": 1.1074, + "step": 24015 + }, + { + "epoch": 0.8600641037119272, + "grad_norm": 1.4705240726470947, + "learning_rate": 1.00937764409471e-05, + "loss": 1.234, + "step": 24016 + }, + { + "epoch": 0.8600999158414955, + "grad_norm": 1.3574469089508057, + "learning_rate": 1.0088698756044146e-05, + "loss": 0.9304, + "step": 24017 + }, + { + "epoch": 0.8601357279710637, + "grad_norm": 1.4086827039718628, + "learning_rate": 1.0083622280781769e-05, + "loss": 1.0539, + "step": 24018 + }, + { + "epoch": 0.8601715401006321, + "grad_norm": 1.3308061361312866, + "learning_rate": 1.0078547015228257e-05, + "loss": 1.1106, + "step": 24019 + }, + { + "epoch": 0.8602073522302004, + "grad_norm": 1.3906949758529663, + "learning_rate": 1.0073472959451913e-05, + "loss": 0.8793, + "step": 24020 + }, + { + "epoch": 0.8602431643597687, + "grad_norm": 1.766332745552063, + "learning_rate": 1.0068400113521014e-05, + "loss": 1.1295, + "step": 24021 + }, + { + "epoch": 0.8602789764893369, + "grad_norm": 1.6007283926010132, + "learning_rate": 1.0063328477503764e-05, + "loss": 1.1374, + "step": 24022 + }, + { + "epoch": 0.8603147886189052, + "grad_norm": 2.423781394958496, + "learning_rate": 1.0058258051468417e-05, + "loss": 1.0131, + "step": 24023 + }, + { + "epoch": 0.8603506007484735, + "grad_norm": 2.090325117111206, + "learning_rate": 1.0053188835483197e-05, + "loss": 1.16, + "step": 24024 + }, + { + "epoch": 0.8603864128780417, + "grad_norm": 2.0720772743225098, + "learning_rate": 1.0048120829616314e-05, + "loss": 1.0247, + "step": 24025 + }, + { + "epoch": 0.8604222250076101, + "grad_norm": 1.4193164110183716, + "learning_rate": 1.0043054033935917e-05, + "loss": 1.0938, + "step": 24026 + }, + { + "epoch": 0.8604580371371784, + "grad_norm": 1.226370930671692, + "learning_rate": 1.0037988448510193e-05, + "loss": 0.9088, + "step": 24027 + }, + { + "epoch": 0.8604938492667467, + "grad_norm": 1.293366551399231, + "learning_rate": 1.0032924073407313e-05, + "loss": 1.1867, + "step": 24028 + }, + { + "epoch": 0.8605296613963149, + "grad_norm": 1.598150610923767, + "learning_rate": 1.0027860908695363e-05, + "loss": 1.0773, + "step": 24029 + }, + { + "epoch": 0.8605654735258832, + "grad_norm": 1.4808709621429443, + "learning_rate": 1.0022798954442491e-05, + "loss": 0.9529, + "step": 24030 + }, + { + "epoch": 0.8606012856554515, + "grad_norm": 1.7413355112075806, + "learning_rate": 1.001773821071681e-05, + "loss": 1.0674, + "step": 24031 + }, + { + "epoch": 0.8606370977850197, + "grad_norm": 1.8177051544189453, + "learning_rate": 1.0012678677586396e-05, + "loss": 1.1313, + "step": 24032 + }, + { + "epoch": 0.8606729099145881, + "grad_norm": 2.059858798980713, + "learning_rate": 1.0007620355119307e-05, + "loss": 1.0472, + "step": 24033 + }, + { + "epoch": 0.8607087220441564, + "grad_norm": 1.310294508934021, + "learning_rate": 1.000256324338359e-05, + "loss": 0.9783, + "step": 24034 + }, + { + "epoch": 0.8607445341737247, + "grad_norm": 1.527279019355774, + "learning_rate": 9.997507342447333e-06, + "loss": 1.1354, + "step": 24035 + }, + { + "epoch": 0.8607803463032929, + "grad_norm": 1.5383356809616089, + "learning_rate": 9.992452652378493e-06, + "loss": 1.1542, + "step": 24036 + }, + { + "epoch": 0.8608161584328612, + "grad_norm": 1.635591745376587, + "learning_rate": 9.987399173245093e-06, + "loss": 0.8776, + "step": 24037 + }, + { + "epoch": 0.8608519705624295, + "grad_norm": 1.4875606298446655, + "learning_rate": 9.982346905115137e-06, + "loss": 0.9763, + "step": 24038 + }, + { + "epoch": 0.8608877826919977, + "grad_norm": 1.4542144536972046, + "learning_rate": 9.977295848056612e-06, + "loss": 1.1065, + "step": 24039 + }, + { + "epoch": 0.8609235948215661, + "grad_norm": 1.54017972946167, + "learning_rate": 9.97224600213742e-06, + "loss": 1.047, + "step": 24040 + }, + { + "epoch": 0.8609594069511344, + "grad_norm": 1.4494688510894775, + "learning_rate": 9.96719736742554e-06, + "loss": 1.215, + "step": 24041 + }, + { + "epoch": 0.8609952190807026, + "grad_norm": 1.4270137548446655, + "learning_rate": 9.962149943988885e-06, + "loss": 1.074, + "step": 24042 + }, + { + "epoch": 0.8610310312102709, + "grad_norm": 1.4953371286392212, + "learning_rate": 9.957103731895379e-06, + "loss": 1.1591, + "step": 24043 + }, + { + "epoch": 0.8610668433398392, + "grad_norm": 1.7613499164581299, + "learning_rate": 9.952058731212877e-06, + "loss": 1.3257, + "step": 24044 + }, + { + "epoch": 0.8611026554694075, + "grad_norm": 1.4308803081512451, + "learning_rate": 9.947014942009269e-06, + "loss": 1.1289, + "step": 24045 + }, + { + "epoch": 0.8611384675989757, + "grad_norm": 1.5092114210128784, + "learning_rate": 9.941972364352436e-06, + "loss": 1.2612, + "step": 24046 + }, + { + "epoch": 0.8611742797285441, + "grad_norm": 1.4389317035675049, + "learning_rate": 9.936930998310179e-06, + "loss": 1.029, + "step": 24047 + }, + { + "epoch": 0.8612100918581124, + "grad_norm": 1.399460792541504, + "learning_rate": 9.931890843950342e-06, + "loss": 1.118, + "step": 24048 + }, + { + "epoch": 0.8612459039876806, + "grad_norm": 1.511217474937439, + "learning_rate": 9.92685190134074e-06, + "loss": 1.187, + "step": 24049 + }, + { + "epoch": 0.8612817161172489, + "grad_norm": 1.5620448589324951, + "learning_rate": 9.921814170549171e-06, + "loss": 0.9809, + "step": 24050 + }, + { + "epoch": 0.8613175282468172, + "grad_norm": 1.6434775590896606, + "learning_rate": 9.916777651643383e-06, + "loss": 1.0438, + "step": 24051 + }, + { + "epoch": 0.8613533403763854, + "grad_norm": 1.4916133880615234, + "learning_rate": 9.911742344691156e-06, + "loss": 1.1149, + "step": 24052 + }, + { + "epoch": 0.8613891525059537, + "grad_norm": 1.7207579612731934, + "learning_rate": 9.906708249760244e-06, + "loss": 0.9828, + "step": 24053 + }, + { + "epoch": 0.8614249646355221, + "grad_norm": 1.3962734937667847, + "learning_rate": 9.901675366918339e-06, + "loss": 0.9691, + "step": 24054 + }, + { + "epoch": 0.8614607767650904, + "grad_norm": 1.2775473594665527, + "learning_rate": 9.896643696233177e-06, + "loss": 0.8225, + "step": 24055 + }, + { + "epoch": 0.8614965888946586, + "grad_norm": 1.3807134628295898, + "learning_rate": 9.891613237772458e-06, + "loss": 1.1879, + "step": 24056 + }, + { + "epoch": 0.8615324010242269, + "grad_norm": 1.209101676940918, + "learning_rate": 9.88658399160386e-06, + "loss": 0.9976, + "step": 24057 + }, + { + "epoch": 0.8615682131537952, + "grad_norm": 1.282573938369751, + "learning_rate": 9.88155595779502e-06, + "loss": 1.1858, + "step": 24058 + }, + { + "epoch": 0.8616040252833634, + "grad_norm": 1.4741857051849365, + "learning_rate": 9.876529136413593e-06, + "loss": 0.9702, + "step": 24059 + }, + { + "epoch": 0.8616398374129317, + "grad_norm": 1.3898407220840454, + "learning_rate": 9.871503527527226e-06, + "loss": 1.1187, + "step": 24060 + }, + { + "epoch": 0.8616756495425001, + "grad_norm": 1.4377944469451904, + "learning_rate": 9.866479131203544e-06, + "loss": 1.0914, + "step": 24061 + }, + { + "epoch": 0.8617114616720684, + "grad_norm": 1.3382467031478882, + "learning_rate": 9.861455947510112e-06, + "loss": 1.0523, + "step": 24062 + }, + { + "epoch": 0.8617472738016366, + "grad_norm": 1.3917299509048462, + "learning_rate": 9.856433976514479e-06, + "loss": 1.0213, + "step": 24063 + }, + { + "epoch": 0.8617830859312049, + "grad_norm": 1.8732017278671265, + "learning_rate": 9.8514132182843e-06, + "loss": 1.0491, + "step": 24064 + }, + { + "epoch": 0.8618188980607732, + "grad_norm": 1.975398063659668, + "learning_rate": 9.846393672887044e-06, + "loss": 1.1515, + "step": 24065 + }, + { + "epoch": 0.8618547101903414, + "grad_norm": 1.2833383083343506, + "learning_rate": 9.841375340390268e-06, + "loss": 1.1004, + "step": 24066 + }, + { + "epoch": 0.8618905223199097, + "grad_norm": 1.4248541593551636, + "learning_rate": 9.836358220861508e-06, + "loss": 1.0753, + "step": 24067 + }, + { + "epoch": 0.8619263344494781, + "grad_norm": 1.3483192920684814, + "learning_rate": 9.831342314368252e-06, + "loss": 0.8933, + "step": 24068 + }, + { + "epoch": 0.8619621465790464, + "grad_norm": 1.3611443042755127, + "learning_rate": 9.826327620977972e-06, + "loss": 1.0431, + "step": 24069 + }, + { + "epoch": 0.8619979587086146, + "grad_norm": 1.3560229539871216, + "learning_rate": 9.82131414075811e-06, + "loss": 0.8589, + "step": 24070 + }, + { + "epoch": 0.8620337708381829, + "grad_norm": 1.3722048997879028, + "learning_rate": 9.816301873776178e-06, + "loss": 0.9497, + "step": 24071 + }, + { + "epoch": 0.8620695829677512, + "grad_norm": 1.5585566759109497, + "learning_rate": 9.81129082009955e-06, + "loss": 1.0424, + "step": 24072 + }, + { + "epoch": 0.8621053950973194, + "grad_norm": 1.4044203758239746, + "learning_rate": 9.8062809797957e-06, + "loss": 0.9708, + "step": 24073 + }, + { + "epoch": 0.8621412072268877, + "grad_norm": 1.503035306930542, + "learning_rate": 9.801272352931957e-06, + "loss": 1.1191, + "step": 24074 + }, + { + "epoch": 0.8621770193564561, + "grad_norm": 1.3447368144989014, + "learning_rate": 9.796264939575784e-06, + "loss": 1.1982, + "step": 24075 + }, + { + "epoch": 0.8622128314860243, + "grad_norm": 1.255627989768982, + "learning_rate": 9.791258739794484e-06, + "loss": 1.2185, + "step": 24076 + }, + { + "epoch": 0.8622486436155926, + "grad_norm": 1.531503438949585, + "learning_rate": 9.78625375365545e-06, + "loss": 1.0406, + "step": 24077 + }, + { + "epoch": 0.8622844557451609, + "grad_norm": 1.6112452745437622, + "learning_rate": 9.781249981226015e-06, + "loss": 1.1324, + "step": 24078 + }, + { + "epoch": 0.8623202678747292, + "grad_norm": 1.7658108472824097, + "learning_rate": 9.77624742257347e-06, + "loss": 1.2099, + "step": 24079 + }, + { + "epoch": 0.8623560800042974, + "grad_norm": 1.6166305541992188, + "learning_rate": 9.771246077765151e-06, + "loss": 1.1443, + "step": 24080 + }, + { + "epoch": 0.8623918921338657, + "grad_norm": 1.6830174922943115, + "learning_rate": 9.766245946868302e-06, + "loss": 1.2318, + "step": 24081 + }, + { + "epoch": 0.8624277042634341, + "grad_norm": 1.2797483205795288, + "learning_rate": 9.761247029950249e-06, + "loss": 0.9272, + "step": 24082 + }, + { + "epoch": 0.8624635163930023, + "grad_norm": 1.3734352588653564, + "learning_rate": 9.756249327078204e-06, + "loss": 1.0341, + "step": 24083 + }, + { + "epoch": 0.8624993285225706, + "grad_norm": 1.87959623336792, + "learning_rate": 9.751252838319436e-06, + "loss": 1.2151, + "step": 24084 + }, + { + "epoch": 0.8625351406521389, + "grad_norm": 1.4236282110214233, + "learning_rate": 9.746257563741102e-06, + "loss": 0.9091, + "step": 24085 + }, + { + "epoch": 0.8625709527817071, + "grad_norm": 1.28976571559906, + "learning_rate": 9.741263503410503e-06, + "loss": 1.0532, + "step": 24086 + }, + { + "epoch": 0.8626067649112754, + "grad_norm": 1.2679507732391357, + "learning_rate": 9.736270657394774e-06, + "loss": 0.8947, + "step": 24087 + }, + { + "epoch": 0.8626425770408437, + "grad_norm": 1.8667470216751099, + "learning_rate": 9.731279025761076e-06, + "loss": 1.0244, + "step": 24088 + }, + { + "epoch": 0.8626783891704121, + "grad_norm": 1.6430346965789795, + "learning_rate": 9.726288608576573e-06, + "loss": 1.0416, + "step": 24089 + }, + { + "epoch": 0.8627142012999803, + "grad_norm": 1.3995131254196167, + "learning_rate": 9.721299405908412e-06, + "loss": 1.1539, + "step": 24090 + }, + { + "epoch": 0.8627500134295486, + "grad_norm": 1.529228925704956, + "learning_rate": 9.716311417823742e-06, + "loss": 1.2707, + "step": 24091 + }, + { + "epoch": 0.8627858255591169, + "grad_norm": 1.802844762802124, + "learning_rate": 9.711324644389609e-06, + "loss": 1.1252, + "step": 24092 + }, + { + "epoch": 0.8628216376886851, + "grad_norm": 1.4119062423706055, + "learning_rate": 9.706339085673167e-06, + "loss": 1.0744, + "step": 24093 + }, + { + "epoch": 0.8628574498182534, + "grad_norm": 1.7962005138397217, + "learning_rate": 9.701354741741454e-06, + "loss": 1.169, + "step": 24094 + }, + { + "epoch": 0.8628932619478217, + "grad_norm": 1.4859310388565063, + "learning_rate": 9.696371612661548e-06, + "loss": 1.0388, + "step": 24095 + }, + { + "epoch": 0.8629290740773901, + "grad_norm": 1.5832688808441162, + "learning_rate": 9.691389698500463e-06, + "loss": 0.9249, + "step": 24096 + }, + { + "epoch": 0.8629648862069583, + "grad_norm": 1.4981980323791504, + "learning_rate": 9.686408999325236e-06, + "loss": 1.0127, + "step": 24097 + }, + { + "epoch": 0.8630006983365266, + "grad_norm": 1.7798678874969482, + "learning_rate": 9.6814295152029e-06, + "loss": 1.0283, + "step": 24098 + }, + { + "epoch": 0.8630365104660949, + "grad_norm": 1.300840139389038, + "learning_rate": 9.676451246200401e-06, + "loss": 1.0431, + "step": 24099 + }, + { + "epoch": 0.8630723225956631, + "grad_norm": 1.527562141418457, + "learning_rate": 9.671474192384755e-06, + "loss": 0.9552, + "step": 24100 + }, + { + "epoch": 0.8631081347252314, + "grad_norm": 1.611753225326538, + "learning_rate": 9.666498353822905e-06, + "loss": 1.1976, + "step": 24101 + }, + { + "epoch": 0.8631439468547997, + "grad_norm": 1.870469331741333, + "learning_rate": 9.661523730581813e-06, + "loss": 1.0085, + "step": 24102 + }, + { + "epoch": 0.863179758984368, + "grad_norm": 1.9369761943817139, + "learning_rate": 9.656550322728353e-06, + "loss": 0.9707, + "step": 24103 + }, + { + "epoch": 0.8632155711139363, + "grad_norm": 1.5551509857177734, + "learning_rate": 9.651578130329508e-06, + "loss": 1.2566, + "step": 24104 + }, + { + "epoch": 0.8632513832435046, + "grad_norm": 1.1973707675933838, + "learning_rate": 9.646607153452147e-06, + "loss": 0.8772, + "step": 24105 + }, + { + "epoch": 0.8632871953730729, + "grad_norm": 1.3694589138031006, + "learning_rate": 9.641637392163116e-06, + "loss": 0.8225, + "step": 24106 + }, + { + "epoch": 0.8633230075026411, + "grad_norm": 1.3717272281646729, + "learning_rate": 9.636668846529296e-06, + "loss": 1.0955, + "step": 24107 + }, + { + "epoch": 0.8633588196322094, + "grad_norm": 1.3173264265060425, + "learning_rate": 9.631701516617542e-06, + "loss": 0.9048, + "step": 24108 + }, + { + "epoch": 0.8633946317617777, + "grad_norm": 1.6054198741912842, + "learning_rate": 9.626735402494703e-06, + "loss": 0.9139, + "step": 24109 + }, + { + "epoch": 0.863430443891346, + "grad_norm": 1.5173234939575195, + "learning_rate": 9.621770504227534e-06, + "loss": 1.0078, + "step": 24110 + }, + { + "epoch": 0.8634662560209143, + "grad_norm": 1.9143099784851074, + "learning_rate": 9.616806821882873e-06, + "loss": 0.9981, + "step": 24111 + }, + { + "epoch": 0.8635020681504826, + "grad_norm": 1.499354362487793, + "learning_rate": 9.611844355527477e-06, + "loss": 1.1447, + "step": 24112 + }, + { + "epoch": 0.8635378802800509, + "grad_norm": 1.563800573348999, + "learning_rate": 9.60688310522816e-06, + "loss": 1.1386, + "step": 24113 + }, + { + "epoch": 0.8635736924096191, + "grad_norm": 2.0134198665618896, + "learning_rate": 9.6019230710516e-06, + "loss": 1.2684, + "step": 24114 + }, + { + "epoch": 0.8636095045391874, + "grad_norm": 1.6875691413879395, + "learning_rate": 9.596964253064567e-06, + "loss": 1.0726, + "step": 24115 + }, + { + "epoch": 0.8636453166687557, + "grad_norm": 1.628737449645996, + "learning_rate": 9.592006651333785e-06, + "loss": 1.1297, + "step": 24116 + }, + { + "epoch": 0.863681128798324, + "grad_norm": 1.8251491785049438, + "learning_rate": 9.587050265925912e-06, + "loss": 1.3236, + "step": 24117 + }, + { + "epoch": 0.8637169409278923, + "grad_norm": 1.575192928314209, + "learning_rate": 9.582095096907651e-06, + "loss": 1.1295, + "step": 24118 + }, + { + "epoch": 0.8637527530574606, + "grad_norm": 1.2604568004608154, + "learning_rate": 9.57714114434568e-06, + "loss": 1.0393, + "step": 24119 + }, + { + "epoch": 0.8637885651870288, + "grad_norm": 1.3777557611465454, + "learning_rate": 9.572188408306649e-06, + "loss": 1.1162, + "step": 24120 + }, + { + "epoch": 0.8638243773165971, + "grad_norm": 1.4674372673034668, + "learning_rate": 9.567236888857166e-06, + "loss": 1.0687, + "step": 24121 + }, + { + "epoch": 0.8638601894461654, + "grad_norm": 1.4230064153671265, + "learning_rate": 9.562286586063861e-06, + "loss": 1.4317, + "step": 24122 + }, + { + "epoch": 0.8638960015757337, + "grad_norm": 1.5594807863235474, + "learning_rate": 9.557337499993346e-06, + "loss": 1.0423, + "step": 24123 + }, + { + "epoch": 0.863931813705302, + "grad_norm": 1.307666540145874, + "learning_rate": 9.552389630712178e-06, + "loss": 1.1744, + "step": 24124 + }, + { + "epoch": 0.8639676258348703, + "grad_norm": 1.580182433128357, + "learning_rate": 9.547442978286946e-06, + "loss": 1.1358, + "step": 24125 + }, + { + "epoch": 0.8640034379644386, + "grad_norm": 1.5878103971481323, + "learning_rate": 9.542497542784178e-06, + "loss": 0.9419, + "step": 24126 + }, + { + "epoch": 0.8640392500940068, + "grad_norm": 1.4378604888916016, + "learning_rate": 9.537553324270455e-06, + "loss": 1.2539, + "step": 24127 + }, + { + "epoch": 0.8640750622235751, + "grad_norm": 1.9763388633728027, + "learning_rate": 9.53261032281224e-06, + "loss": 1.2029, + "step": 24128 + }, + { + "epoch": 0.8641108743531434, + "grad_norm": 1.3347139358520508, + "learning_rate": 9.527668538476054e-06, + "loss": 1.0156, + "step": 24129 + }, + { + "epoch": 0.8641466864827116, + "grad_norm": 1.2514828443527222, + "learning_rate": 9.522727971328393e-06, + "loss": 0.7612, + "step": 24130 + }, + { + "epoch": 0.86418249861228, + "grad_norm": 1.7547361850738525, + "learning_rate": 9.517788621435742e-06, + "loss": 1.2116, + "step": 24131 + }, + { + "epoch": 0.8642183107418483, + "grad_norm": 2.0091681480407715, + "learning_rate": 9.512850488864511e-06, + "loss": 1.2005, + "step": 24132 + }, + { + "epoch": 0.8642541228714166, + "grad_norm": 1.5879230499267578, + "learning_rate": 9.50791357368115e-06, + "loss": 0.9306, + "step": 24133 + }, + { + "epoch": 0.8642899350009848, + "grad_norm": 1.4372951984405518, + "learning_rate": 9.502977875952113e-06, + "loss": 1.1393, + "step": 24134 + }, + { + "epoch": 0.8643257471305531, + "grad_norm": 1.5949276685714722, + "learning_rate": 9.49804339574375e-06, + "loss": 1.04, + "step": 24135 + }, + { + "epoch": 0.8643615592601214, + "grad_norm": 1.49424409866333, + "learning_rate": 9.493110133122474e-06, + "loss": 1.1913, + "step": 24136 + }, + { + "epoch": 0.8643973713896896, + "grad_norm": 1.475303292274475, + "learning_rate": 9.488178088154654e-06, + "loss": 0.8572, + "step": 24137 + }, + { + "epoch": 0.864433183519258, + "grad_norm": 1.3997997045516968, + "learning_rate": 9.48324726090667e-06, + "loss": 1.0465, + "step": 24138 + }, + { + "epoch": 0.8644689956488263, + "grad_norm": 1.3392165899276733, + "learning_rate": 9.478317651444812e-06, + "loss": 0.9809, + "step": 24139 + }, + { + "epoch": 0.8645048077783946, + "grad_norm": 1.3907954692840576, + "learning_rate": 9.47338925983543e-06, + "loss": 0.9368, + "step": 24140 + }, + { + "epoch": 0.8645406199079628, + "grad_norm": 1.3972262144088745, + "learning_rate": 9.468462086144847e-06, + "loss": 1.0147, + "step": 24141 + }, + { + "epoch": 0.8645764320375311, + "grad_norm": 1.5459076166152954, + "learning_rate": 9.46353613043931e-06, + "loss": 0.9853, + "step": 24142 + }, + { + "epoch": 0.8646122441670994, + "grad_norm": 1.1302086114883423, + "learning_rate": 9.4586113927851e-06, + "loss": 0.973, + "step": 24143 + }, + { + "epoch": 0.8646480562966676, + "grad_norm": 1.5858474969863892, + "learning_rate": 9.453687873248495e-06, + "loss": 1.1371, + "step": 24144 + }, + { + "epoch": 0.864683868426236, + "grad_norm": 1.1160085201263428, + "learning_rate": 9.448765571895735e-06, + "loss": 1.0127, + "step": 24145 + }, + { + "epoch": 0.8647196805558043, + "grad_norm": 1.5628788471221924, + "learning_rate": 9.443844488793018e-06, + "loss": 1.3098, + "step": 24146 + }, + { + "epoch": 0.8647554926853726, + "grad_norm": 1.2904016971588135, + "learning_rate": 9.438924624006563e-06, + "loss": 1.1627, + "step": 24147 + }, + { + "epoch": 0.8647913048149408, + "grad_norm": 1.25664484500885, + "learning_rate": 9.434005977602556e-06, + "loss": 0.9271, + "step": 24148 + }, + { + "epoch": 0.8648271169445091, + "grad_norm": 1.4972708225250244, + "learning_rate": 9.429088549647203e-06, + "loss": 1.008, + "step": 24149 + }, + { + "epoch": 0.8648629290740774, + "grad_norm": 1.6375652551651, + "learning_rate": 9.424172340206616e-06, + "loss": 1.0593, + "step": 24150 + }, + { + "epoch": 0.8648987412036456, + "grad_norm": 1.2339732646942139, + "learning_rate": 9.419257349346956e-06, + "loss": 0.9991, + "step": 24151 + }, + { + "epoch": 0.864934553333214, + "grad_norm": 1.6111329793930054, + "learning_rate": 9.414343577134355e-06, + "loss": 1.1677, + "step": 24152 + }, + { + "epoch": 0.8649703654627823, + "grad_norm": 1.4279186725616455, + "learning_rate": 9.409431023634908e-06, + "loss": 1.0399, + "step": 24153 + }, + { + "epoch": 0.8650061775923505, + "grad_norm": 1.5040103197097778, + "learning_rate": 9.404519688914703e-06, + "loss": 1.0054, + "step": 24154 + }, + { + "epoch": 0.8650419897219188, + "grad_norm": 1.8035119771957397, + "learning_rate": 9.399609573039836e-06, + "loss": 1.0915, + "step": 24155 + }, + { + "epoch": 0.8650778018514871, + "grad_norm": 1.5912643671035767, + "learning_rate": 9.394700676076374e-06, + "loss": 0.9471, + "step": 24156 + }, + { + "epoch": 0.8651136139810554, + "grad_norm": 1.4430549144744873, + "learning_rate": 9.389792998090319e-06, + "loss": 1.1276, + "step": 24157 + }, + { + "epoch": 0.8651494261106236, + "grad_norm": 1.648330569267273, + "learning_rate": 9.384886539147718e-06, + "loss": 1.1018, + "step": 24158 + }, + { + "epoch": 0.865185238240192, + "grad_norm": 1.655334234237671, + "learning_rate": 9.379981299314611e-06, + "loss": 1.2965, + "step": 24159 + }, + { + "epoch": 0.8652210503697603, + "grad_norm": 1.6934747695922852, + "learning_rate": 9.375077278656941e-06, + "loss": 1.0544, + "step": 24160 + }, + { + "epoch": 0.8652568624993285, + "grad_norm": 1.3550459146499634, + "learning_rate": 9.370174477240712e-06, + "loss": 0.9992, + "step": 24161 + }, + { + "epoch": 0.8652926746288968, + "grad_norm": 1.3145712614059448, + "learning_rate": 9.36527289513187e-06, + "loss": 1.0618, + "step": 24162 + }, + { + "epoch": 0.8653284867584651, + "grad_norm": 1.312612533569336, + "learning_rate": 9.3603725323964e-06, + "loss": 0.9512, + "step": 24163 + }, + { + "epoch": 0.8653642988880333, + "grad_norm": 2.202249526977539, + "learning_rate": 9.355473389100178e-06, + "loss": 1.0449, + "step": 24164 + }, + { + "epoch": 0.8654001110176016, + "grad_norm": 1.3822219371795654, + "learning_rate": 9.350575465309142e-06, + "loss": 0.9597, + "step": 24165 + }, + { + "epoch": 0.86543592314717, + "grad_norm": 1.6946583986282349, + "learning_rate": 9.345678761089194e-06, + "loss": 1.2513, + "step": 24166 + }, + { + "epoch": 0.8654717352767383, + "grad_norm": 1.5607333183288574, + "learning_rate": 9.340783276506193e-06, + "loss": 0.9976, + "step": 24167 + }, + { + "epoch": 0.8655075474063065, + "grad_norm": 1.320783257484436, + "learning_rate": 9.335889011626032e-06, + "loss": 0.8961, + "step": 24168 + }, + { + "epoch": 0.8655433595358748, + "grad_norm": 1.782055139541626, + "learning_rate": 9.330995966514489e-06, + "loss": 1.0739, + "step": 24169 + }, + { + "epoch": 0.8655791716654431, + "grad_norm": 1.7472342252731323, + "learning_rate": 9.32610414123748e-06, + "loss": 0.9726, + "step": 24170 + }, + { + "epoch": 0.8656149837950113, + "grad_norm": 1.3447545766830444, + "learning_rate": 9.321213535860763e-06, + "loss": 1.0238, + "step": 24171 + }, + { + "epoch": 0.8656507959245796, + "grad_norm": 1.508644700050354, + "learning_rate": 9.316324150450173e-06, + "loss": 1.0253, + "step": 24172 + }, + { + "epoch": 0.865686608054148, + "grad_norm": 1.643084168434143, + "learning_rate": 9.311435985071426e-06, + "loss": 1.0335, + "step": 24173 + }, + { + "epoch": 0.8657224201837163, + "grad_norm": 1.6239533424377441, + "learning_rate": 9.30654903979037e-06, + "loss": 1.1057, + "step": 24174 + }, + { + "epoch": 0.8657582323132845, + "grad_norm": 1.7131716012954712, + "learning_rate": 9.301663314672704e-06, + "loss": 1.1348, + "step": 24175 + }, + { + "epoch": 0.8657940444428528, + "grad_norm": 1.4993839263916016, + "learning_rate": 9.296778809784123e-06, + "loss": 0.8872, + "step": 24176 + }, + { + "epoch": 0.8658298565724211, + "grad_norm": 1.3950443267822266, + "learning_rate": 9.29189552519043e-06, + "loss": 1.0568, + "step": 24177 + }, + { + "epoch": 0.8658656687019893, + "grad_norm": 1.3243074417114258, + "learning_rate": 9.287013460957261e-06, + "loss": 0.9387, + "step": 24178 + }, + { + "epoch": 0.8659014808315576, + "grad_norm": 1.6107187271118164, + "learning_rate": 9.28213261715033e-06, + "loss": 1.1444, + "step": 24179 + }, + { + "epoch": 0.865937292961126, + "grad_norm": 1.5634346008300781, + "learning_rate": 9.27725299383525e-06, + "loss": 1.1347, + "step": 24180 + }, + { + "epoch": 0.8659731050906943, + "grad_norm": 1.6356228590011597, + "learning_rate": 9.272374591077748e-06, + "loss": 1.0522, + "step": 24181 + }, + { + "epoch": 0.8660089172202625, + "grad_norm": 1.2912518978118896, + "learning_rate": 9.267497408943393e-06, + "loss": 1.1466, + "step": 24182 + }, + { + "epoch": 0.8660447293498308, + "grad_norm": 1.7318812608718872, + "learning_rate": 9.262621447497844e-06, + "loss": 1.0928, + "step": 24183 + }, + { + "epoch": 0.8660805414793991, + "grad_norm": 1.5112310647964478, + "learning_rate": 9.257746706806658e-06, + "loss": 0.9608, + "step": 24184 + }, + { + "epoch": 0.8661163536089673, + "grad_norm": 1.4560588598251343, + "learning_rate": 9.252873186935452e-06, + "loss": 0.9657, + "step": 24185 + }, + { + "epoch": 0.8661521657385356, + "grad_norm": 1.8350273370742798, + "learning_rate": 9.248000887949782e-06, + "loss": 1.1966, + "step": 24186 + }, + { + "epoch": 0.866187977868104, + "grad_norm": 1.4818376302719116, + "learning_rate": 9.243129809915175e-06, + "loss": 1.1078, + "step": 24187 + }, + { + "epoch": 0.8662237899976722, + "grad_norm": 1.3297653198242188, + "learning_rate": 9.238259952897221e-06, + "loss": 1.1911, + "step": 24188 + }, + { + "epoch": 0.8662596021272405, + "grad_norm": 1.5664371252059937, + "learning_rate": 9.233391316961393e-06, + "loss": 1.1035, + "step": 24189 + }, + { + "epoch": 0.8662954142568088, + "grad_norm": 1.3652453422546387, + "learning_rate": 9.228523902173214e-06, + "loss": 0.9282, + "step": 24190 + }, + { + "epoch": 0.866331226386377, + "grad_norm": 1.5577207803726196, + "learning_rate": 9.223657708598133e-06, + "loss": 1.0871, + "step": 24191 + }, + { + "epoch": 0.8663670385159453, + "grad_norm": 1.798730731010437, + "learning_rate": 9.218792736301674e-06, + "loss": 1.195, + "step": 24192 + }, + { + "epoch": 0.8664028506455136, + "grad_norm": 1.3950754404067993, + "learning_rate": 9.213928985349252e-06, + "loss": 1.0935, + "step": 24193 + }, + { + "epoch": 0.866438662775082, + "grad_norm": 1.602658987045288, + "learning_rate": 9.209066455806303e-06, + "loss": 1.1302, + "step": 24194 + }, + { + "epoch": 0.8664744749046502, + "grad_norm": 1.451781988143921, + "learning_rate": 9.204205147738254e-06, + "loss": 0.9585, + "step": 24195 + }, + { + "epoch": 0.8665102870342185, + "grad_norm": 1.750510334968567, + "learning_rate": 9.199345061210495e-06, + "loss": 1.1326, + "step": 24196 + }, + { + "epoch": 0.8665460991637868, + "grad_norm": 1.3916693925857544, + "learning_rate": 9.194486196288454e-06, + "loss": 1.0864, + "step": 24197 + }, + { + "epoch": 0.866581911293355, + "grad_norm": 1.177852988243103, + "learning_rate": 9.189628553037445e-06, + "loss": 1.0698, + "step": 24198 + }, + { + "epoch": 0.8666177234229233, + "grad_norm": 1.2645407915115356, + "learning_rate": 9.184772131522845e-06, + "loss": 1.0202, + "step": 24199 + }, + { + "epoch": 0.8666535355524916, + "grad_norm": 1.2488728761672974, + "learning_rate": 9.179916931809995e-06, + "loss": 0.996, + "step": 24200 + }, + { + "epoch": 0.86668934768206, + "grad_norm": 1.2851570844650269, + "learning_rate": 9.175062953964242e-06, + "loss": 1.1906, + "step": 24201 + }, + { + "epoch": 0.8667251598116282, + "grad_norm": 1.3967161178588867, + "learning_rate": 9.170210198050833e-06, + "loss": 0.8803, + "step": 24202 + }, + { + "epoch": 0.8667609719411965, + "grad_norm": 1.4777445793151855, + "learning_rate": 9.165358664135082e-06, + "loss": 1.0302, + "step": 24203 + }, + { + "epoch": 0.8667967840707648, + "grad_norm": 1.5072968006134033, + "learning_rate": 9.160508352282282e-06, + "loss": 1.0872, + "step": 24204 + }, + { + "epoch": 0.866832596200333, + "grad_norm": 1.8810014724731445, + "learning_rate": 9.155659262557648e-06, + "loss": 1.1035, + "step": 24205 + }, + { + "epoch": 0.8668684083299013, + "grad_norm": 1.325049877166748, + "learning_rate": 9.150811395026448e-06, + "loss": 1.0257, + "step": 24206 + }, + { + "epoch": 0.8669042204594696, + "grad_norm": 1.627274751663208, + "learning_rate": 9.145964749753888e-06, + "loss": 1.2151, + "step": 24207 + }, + { + "epoch": 0.866940032589038, + "grad_norm": 1.7226461172103882, + "learning_rate": 9.141119326805193e-06, + "loss": 1.1258, + "step": 24208 + }, + { + "epoch": 0.8669758447186062, + "grad_norm": 1.2359157800674438, + "learning_rate": 9.13627512624552e-06, + "loss": 0.9765, + "step": 24209 + }, + { + "epoch": 0.8670116568481745, + "grad_norm": 1.4150649309158325, + "learning_rate": 9.131432148140062e-06, + "loss": 1.1222, + "step": 24210 + }, + { + "epoch": 0.8670474689777428, + "grad_norm": 1.6735397577285767, + "learning_rate": 9.126590392553992e-06, + "loss": 1.0968, + "step": 24211 + }, + { + "epoch": 0.867083281107311, + "grad_norm": 1.4356701374053955, + "learning_rate": 9.12174985955241e-06, + "loss": 0.9147, + "step": 24212 + }, + { + "epoch": 0.8671190932368793, + "grad_norm": 1.4291185140609741, + "learning_rate": 9.116910549200452e-06, + "loss": 0.9425, + "step": 24213 + }, + { + "epoch": 0.8671549053664476, + "grad_norm": 1.628212809562683, + "learning_rate": 9.112072461563248e-06, + "loss": 1.1126, + "step": 24214 + }, + { + "epoch": 0.867190717496016, + "grad_norm": 1.5203138589859009, + "learning_rate": 9.107235596705877e-06, + "loss": 0.9055, + "step": 24215 + }, + { + "epoch": 0.8672265296255842, + "grad_norm": 1.6613764762878418, + "learning_rate": 9.102399954693396e-06, + "loss": 1.1811, + "step": 24216 + }, + { + "epoch": 0.8672623417551525, + "grad_norm": 1.880300760269165, + "learning_rate": 9.097565535590869e-06, + "loss": 1.1291, + "step": 24217 + }, + { + "epoch": 0.8672981538847208, + "grad_norm": 1.535727858543396, + "learning_rate": 9.092732339463339e-06, + "loss": 1.0188, + "step": 24218 + }, + { + "epoch": 0.867333966014289, + "grad_norm": 1.6059051752090454, + "learning_rate": 9.087900366375868e-06, + "loss": 1.2272, + "step": 24219 + }, + { + "epoch": 0.8673697781438573, + "grad_norm": 1.3853646516799927, + "learning_rate": 9.083069616393392e-06, + "loss": 1.0379, + "step": 24220 + }, + { + "epoch": 0.8674055902734256, + "grad_norm": 1.7468070983886719, + "learning_rate": 9.078240089580948e-06, + "loss": 1.2839, + "step": 24221 + }, + { + "epoch": 0.8674414024029939, + "grad_norm": 1.3066248893737793, + "learning_rate": 9.073411786003527e-06, + "loss": 1.1834, + "step": 24222 + }, + { + "epoch": 0.8674772145325622, + "grad_norm": 1.4617928266525269, + "learning_rate": 9.068584705726035e-06, + "loss": 0.9509, + "step": 24223 + }, + { + "epoch": 0.8675130266621305, + "grad_norm": 1.584323525428772, + "learning_rate": 9.063758848813452e-06, + "loss": 1.0584, + "step": 24224 + }, + { + "epoch": 0.8675488387916988, + "grad_norm": 1.7700616121292114, + "learning_rate": 9.058934215330695e-06, + "loss": 1.0726, + "step": 24225 + }, + { + "epoch": 0.867584650921267, + "grad_norm": 1.5928304195404053, + "learning_rate": 9.054110805342686e-06, + "loss": 1.1376, + "step": 24226 + }, + { + "epoch": 0.8676204630508353, + "grad_norm": 1.7762691974639893, + "learning_rate": 9.049288618914276e-06, + "loss": 1.0238, + "step": 24227 + }, + { + "epoch": 0.8676562751804036, + "grad_norm": 1.4906984567642212, + "learning_rate": 9.044467656110389e-06, + "loss": 1.0735, + "step": 24228 + }, + { + "epoch": 0.8676920873099719, + "grad_norm": 1.405441164970398, + "learning_rate": 9.039647916995874e-06, + "loss": 0.9058, + "step": 24229 + }, + { + "epoch": 0.8677278994395402, + "grad_norm": 1.947869896888733, + "learning_rate": 9.034829401635547e-06, + "loss": 1.0793, + "step": 24230 + }, + { + "epoch": 0.8677637115691085, + "grad_norm": 1.653285026550293, + "learning_rate": 9.030012110094255e-06, + "loss": 1.1448, + "step": 24231 + }, + { + "epoch": 0.8677995236986767, + "grad_norm": 1.3658347129821777, + "learning_rate": 9.025196042436802e-06, + "loss": 1.1657, + "step": 24232 + }, + { + "epoch": 0.867835335828245, + "grad_norm": 1.2727092504501343, + "learning_rate": 9.020381198728011e-06, + "loss": 1.2565, + "step": 24233 + }, + { + "epoch": 0.8678711479578133, + "grad_norm": 1.3991007804870605, + "learning_rate": 9.015567579032614e-06, + "loss": 1.0797, + "step": 24234 + }, + { + "epoch": 0.8679069600873816, + "grad_norm": 1.4853777885437012, + "learning_rate": 9.010755183415398e-06, + "loss": 1.1166, + "step": 24235 + }, + { + "epoch": 0.8679427722169499, + "grad_norm": 1.4228073358535767, + "learning_rate": 9.005944011941103e-06, + "loss": 1.06, + "step": 24236 + }, + { + "epoch": 0.8679785843465182, + "grad_norm": 1.5635815858840942, + "learning_rate": 9.001134064674476e-06, + "loss": 1.1828, + "step": 24237 + }, + { + "epoch": 0.8680143964760865, + "grad_norm": 1.306955337524414, + "learning_rate": 8.99632534168019e-06, + "loss": 1.2406, + "step": 24238 + }, + { + "epoch": 0.8680502086056547, + "grad_norm": 1.5851659774780273, + "learning_rate": 8.991517843022968e-06, + "loss": 1.1054, + "step": 24239 + }, + { + "epoch": 0.868086020735223, + "grad_norm": 1.476270318031311, + "learning_rate": 8.986711568767493e-06, + "loss": 0.9437, + "step": 24240 + }, + { + "epoch": 0.8681218328647913, + "grad_norm": 1.6687445640563965, + "learning_rate": 8.981906518978389e-06, + "loss": 1.1642, + "step": 24241 + }, + { + "epoch": 0.8681576449943595, + "grad_norm": 1.2324484586715698, + "learning_rate": 8.977102693720341e-06, + "loss": 1.1192, + "step": 24242 + }, + { + "epoch": 0.8681934571239279, + "grad_norm": 1.7334327697753906, + "learning_rate": 8.97230009305795e-06, + "loss": 1.1596, + "step": 24243 + }, + { + "epoch": 0.8682292692534962, + "grad_norm": 2.3910577297210693, + "learning_rate": 8.967498717055878e-06, + "loss": 1.212, + "step": 24244 + }, + { + "epoch": 0.8682650813830645, + "grad_norm": 1.2883297204971313, + "learning_rate": 8.96269856577866e-06, + "loss": 1.1145, + "step": 24245 + }, + { + "epoch": 0.8683008935126327, + "grad_norm": 1.23215913772583, + "learning_rate": 8.9578996392909e-06, + "loss": 0.9266, + "step": 24246 + }, + { + "epoch": 0.868336705642201, + "grad_norm": 1.5834769010543823, + "learning_rate": 8.953101937657194e-06, + "loss": 1.1074, + "step": 24247 + }, + { + "epoch": 0.8683725177717693, + "grad_norm": 1.560104250907898, + "learning_rate": 8.94830546094203e-06, + "loss": 1.1892, + "step": 24248 + }, + { + "epoch": 0.8684083299013375, + "grad_norm": 1.4391050338745117, + "learning_rate": 8.943510209209971e-06, + "loss": 1.1787, + "step": 24249 + }, + { + "epoch": 0.8684441420309059, + "grad_norm": 1.3117331266403198, + "learning_rate": 8.93871618252553e-06, + "loss": 0.9436, + "step": 24250 + }, + { + "epoch": 0.8684799541604742, + "grad_norm": 1.3004153966903687, + "learning_rate": 8.933923380953224e-06, + "loss": 0.8177, + "step": 24251 + }, + { + "epoch": 0.8685157662900425, + "grad_norm": 1.305384874343872, + "learning_rate": 8.92913180455749e-06, + "loss": 1.0891, + "step": 24252 + }, + { + "epoch": 0.8685515784196107, + "grad_norm": 1.4461077451705933, + "learning_rate": 8.924341453402817e-06, + "loss": 0.9667, + "step": 24253 + }, + { + "epoch": 0.868587390549179, + "grad_norm": 1.3915585279464722, + "learning_rate": 8.919552327553648e-06, + "loss": 1.1374, + "step": 24254 + }, + { + "epoch": 0.8686232026787473, + "grad_norm": 1.498006820678711, + "learning_rate": 8.914764427074428e-06, + "loss": 1.0879, + "step": 24255 + }, + { + "epoch": 0.8686590148083155, + "grad_norm": 1.570896029472351, + "learning_rate": 8.909977752029574e-06, + "loss": 1.0735, + "step": 24256 + }, + { + "epoch": 0.8686948269378839, + "grad_norm": 1.4566504955291748, + "learning_rate": 8.905192302483433e-06, + "loss": 1.0739, + "step": 24257 + }, + { + "epoch": 0.8687306390674522, + "grad_norm": 1.4173166751861572, + "learning_rate": 8.900408078500454e-06, + "loss": 1.0246, + "step": 24258 + }, + { + "epoch": 0.8687664511970205, + "grad_norm": 1.9019230604171753, + "learning_rate": 8.895625080144965e-06, + "loss": 1.0616, + "step": 24259 + }, + { + "epoch": 0.8688022633265887, + "grad_norm": 1.47428297996521, + "learning_rate": 8.890843307481322e-06, + "loss": 1.0468, + "step": 24260 + }, + { + "epoch": 0.868838075456157, + "grad_norm": 1.5129743814468384, + "learning_rate": 8.886062760573854e-06, + "loss": 1.1183, + "step": 24261 + }, + { + "epoch": 0.8688738875857253, + "grad_norm": 1.59774649143219, + "learning_rate": 8.88128343948691e-06, + "loss": 1.1112, + "step": 24262 + }, + { + "epoch": 0.8689096997152935, + "grad_norm": 1.396848201751709, + "learning_rate": 8.876505344284758e-06, + "loss": 1.0375, + "step": 24263 + }, + { + "epoch": 0.8689455118448619, + "grad_norm": 1.8135420083999634, + "learning_rate": 8.871728475031649e-06, + "loss": 1.2491, + "step": 24264 + }, + { + "epoch": 0.8689813239744302, + "grad_norm": 1.7681511640548706, + "learning_rate": 8.86695283179192e-06, + "loss": 0.9929, + "step": 24265 + }, + { + "epoch": 0.8690171361039984, + "grad_norm": 1.5872552394866943, + "learning_rate": 8.862178414629774e-06, + "loss": 1.1798, + "step": 24266 + }, + { + "epoch": 0.8690529482335667, + "grad_norm": 1.4600520133972168, + "learning_rate": 8.857405223609472e-06, + "loss": 1.1243, + "step": 24267 + }, + { + "epoch": 0.869088760363135, + "grad_norm": 1.8139159679412842, + "learning_rate": 8.852633258795185e-06, + "loss": 1.0956, + "step": 24268 + }, + { + "epoch": 0.8691245724927033, + "grad_norm": 1.3000383377075195, + "learning_rate": 8.847862520251182e-06, + "loss": 0.8442, + "step": 24269 + }, + { + "epoch": 0.8691603846222715, + "grad_norm": 1.3719491958618164, + "learning_rate": 8.843093008041591e-06, + "loss": 1.2099, + "step": 24270 + }, + { + "epoch": 0.8691961967518399, + "grad_norm": 1.4133095741271973, + "learning_rate": 8.838324722230595e-06, + "loss": 0.8786, + "step": 24271 + }, + { + "epoch": 0.8692320088814082, + "grad_norm": 1.237138032913208, + "learning_rate": 8.833557662882374e-06, + "loss": 1.0963, + "step": 24272 + }, + { + "epoch": 0.8692678210109764, + "grad_norm": 1.683642029762268, + "learning_rate": 8.828791830061022e-06, + "loss": 1.0244, + "step": 24273 + }, + { + "epoch": 0.8693036331405447, + "grad_norm": 1.3026255369186401, + "learning_rate": 8.824027223830688e-06, + "loss": 1.0835, + "step": 24274 + }, + { + "epoch": 0.869339445270113, + "grad_norm": 1.3820215463638306, + "learning_rate": 8.819263844255432e-06, + "loss": 1.1749, + "step": 24275 + }, + { + "epoch": 0.8693752573996812, + "grad_norm": 1.387532114982605, + "learning_rate": 8.81450169139939e-06, + "loss": 1.0183, + "step": 24276 + }, + { + "epoch": 0.8694110695292495, + "grad_norm": 1.3365511894226074, + "learning_rate": 8.809740765326591e-06, + "loss": 0.9397, + "step": 24277 + }, + { + "epoch": 0.8694468816588179, + "grad_norm": 1.5967826843261719, + "learning_rate": 8.804981066101126e-06, + "loss": 1.1643, + "step": 24278 + }, + { + "epoch": 0.8694826937883862, + "grad_norm": 1.5326523780822754, + "learning_rate": 8.800222593786967e-06, + "loss": 1.2133, + "step": 24279 + }, + { + "epoch": 0.8695185059179544, + "grad_norm": 1.6516554355621338, + "learning_rate": 8.795465348448218e-06, + "loss": 1.0247, + "step": 24280 + }, + { + "epoch": 0.8695543180475227, + "grad_norm": 1.6294634342193604, + "learning_rate": 8.790709330148828e-06, + "loss": 1.0197, + "step": 24281 + }, + { + "epoch": 0.869590130177091, + "grad_norm": 1.6893091201782227, + "learning_rate": 8.78595453895278e-06, + "loss": 1.0199, + "step": 24282 + }, + { + "epoch": 0.8696259423066592, + "grad_norm": 1.4356422424316406, + "learning_rate": 8.781200974924053e-06, + "loss": 1.1697, + "step": 24283 + }, + { + "epoch": 0.8696617544362275, + "grad_norm": 1.4432289600372314, + "learning_rate": 8.7764486381266e-06, + "loss": 0.9958, + "step": 24284 + }, + { + "epoch": 0.8696975665657959, + "grad_norm": 1.849922776222229, + "learning_rate": 8.77169752862439e-06, + "loss": 1.1464, + "step": 24285 + }, + { + "epoch": 0.8697333786953642, + "grad_norm": 1.2692606449127197, + "learning_rate": 8.76694764648126e-06, + "loss": 1.1733, + "step": 24286 + }, + { + "epoch": 0.8697691908249324, + "grad_norm": 1.4632240533828735, + "learning_rate": 8.762198991761217e-06, + "loss": 0.9679, + "step": 24287 + }, + { + "epoch": 0.8698050029545007, + "grad_norm": 1.2211294174194336, + "learning_rate": 8.757451564528074e-06, + "loss": 0.9306, + "step": 24288 + }, + { + "epoch": 0.869840815084069, + "grad_norm": 1.2846477031707764, + "learning_rate": 8.752705364845748e-06, + "loss": 0.9749, + "step": 24289 + }, + { + "epoch": 0.8698766272136372, + "grad_norm": 1.616460919380188, + "learning_rate": 8.747960392778053e-06, + "loss": 1.0412, + "step": 24290 + }, + { + "epoch": 0.8699124393432055, + "grad_norm": 1.5182876586914062, + "learning_rate": 8.74321664838884e-06, + "loss": 0.9946, + "step": 24291 + }, + { + "epoch": 0.8699482514727739, + "grad_norm": 1.2435868978500366, + "learning_rate": 8.738474131741958e-06, + "loss": 0.8536, + "step": 24292 + }, + { + "epoch": 0.8699840636023422, + "grad_norm": 1.770867109298706, + "learning_rate": 8.733732842901166e-06, + "loss": 0.9766, + "step": 24293 + }, + { + "epoch": 0.8700198757319104, + "grad_norm": 1.6666150093078613, + "learning_rate": 8.728992781930278e-06, + "loss": 1.0557, + "step": 24294 + }, + { + "epoch": 0.8700556878614787, + "grad_norm": 1.409013032913208, + "learning_rate": 8.724253948893057e-06, + "loss": 1.0752, + "step": 24295 + }, + { + "epoch": 0.870091499991047, + "grad_norm": 1.4450223445892334, + "learning_rate": 8.719516343853273e-06, + "loss": 1.1846, + "step": 24296 + }, + { + "epoch": 0.8701273121206152, + "grad_norm": 1.1806278228759766, + "learning_rate": 8.71477996687463e-06, + "loss": 1.192, + "step": 24297 + }, + { + "epoch": 0.8701631242501835, + "grad_norm": 1.3820278644561768, + "learning_rate": 8.710044818020902e-06, + "loss": 0.9475, + "step": 24298 + }, + { + "epoch": 0.8701989363797519, + "grad_norm": 1.787523865699768, + "learning_rate": 8.705310897355768e-06, + "loss": 1.1049, + "step": 24299 + }, + { + "epoch": 0.8702347485093201, + "grad_norm": 1.340371012687683, + "learning_rate": 8.700578204942889e-06, + "loss": 1.1113, + "step": 24300 + }, + { + "epoch": 0.8702705606388884, + "grad_norm": 1.3365761041641235, + "learning_rate": 8.69584674084597e-06, + "loss": 0.8059, + "step": 24301 + }, + { + "epoch": 0.8703063727684567, + "grad_norm": 1.3297380208969116, + "learning_rate": 8.69111650512866e-06, + "loss": 1.078, + "step": 24302 + }, + { + "epoch": 0.870342184898025, + "grad_norm": 1.6844584941864014, + "learning_rate": 8.686387497854609e-06, + "loss": 1.0142, + "step": 24303 + }, + { + "epoch": 0.8703779970275932, + "grad_norm": 1.3586498498916626, + "learning_rate": 8.681659719087421e-06, + "loss": 1.2273, + "step": 24304 + }, + { + "epoch": 0.8704138091571615, + "grad_norm": 1.1649988889694214, + "learning_rate": 8.676933168890699e-06, + "loss": 0.9215, + "step": 24305 + }, + { + "epoch": 0.8704496212867299, + "grad_norm": 1.6057461500167847, + "learning_rate": 8.67220784732804e-06, + "loss": 0.8417, + "step": 24306 + }, + { + "epoch": 0.8704854334162981, + "grad_norm": 1.4883291721343994, + "learning_rate": 8.667483754463046e-06, + "loss": 1.0713, + "step": 24307 + }, + { + "epoch": 0.8705212455458664, + "grad_norm": 1.522841215133667, + "learning_rate": 8.662760890359233e-06, + "loss": 0.9558, + "step": 24308 + }, + { + "epoch": 0.8705570576754347, + "grad_norm": 1.424201250076294, + "learning_rate": 8.658039255080153e-06, + "loss": 1.0675, + "step": 24309 + }, + { + "epoch": 0.870592869805003, + "grad_norm": 1.4195873737335205, + "learning_rate": 8.65331884868934e-06, + "loss": 1.2241, + "step": 24310 + }, + { + "epoch": 0.8706286819345712, + "grad_norm": 1.272011399269104, + "learning_rate": 8.64859967125029e-06, + "loss": 1.004, + "step": 24311 + }, + { + "epoch": 0.8706644940641395, + "grad_norm": 1.4377782344818115, + "learning_rate": 8.643881722826486e-06, + "loss": 0.9434, + "step": 24312 + }, + { + "epoch": 0.8707003061937079, + "grad_norm": 1.8713890314102173, + "learning_rate": 8.639165003481408e-06, + "loss": 1.0669, + "step": 24313 + }, + { + "epoch": 0.8707361183232761, + "grad_norm": 1.3729506731033325, + "learning_rate": 8.634449513278553e-06, + "loss": 1.0409, + "step": 24314 + }, + { + "epoch": 0.8707719304528444, + "grad_norm": 1.6694284677505493, + "learning_rate": 8.629735252281301e-06, + "loss": 1.0212, + "step": 24315 + }, + { + "epoch": 0.8708077425824127, + "grad_norm": 1.4996761083602905, + "learning_rate": 8.625022220553091e-06, + "loss": 0.9882, + "step": 24316 + }, + { + "epoch": 0.8708435547119809, + "grad_norm": 1.3587785959243774, + "learning_rate": 8.620310418157374e-06, + "loss": 0.9698, + "step": 24317 + }, + { + "epoch": 0.8708793668415492, + "grad_norm": 1.478166103363037, + "learning_rate": 8.615599845157484e-06, + "loss": 1.2785, + "step": 24318 + }, + { + "epoch": 0.8709151789711175, + "grad_norm": 1.5259968042373657, + "learning_rate": 8.61089050161683e-06, + "loss": 1.3802, + "step": 24319 + }, + { + "epoch": 0.8709509911006859, + "grad_norm": 1.9852122068405151, + "learning_rate": 8.60618238759875e-06, + "loss": 1.0784, + "step": 24320 + }, + { + "epoch": 0.8709868032302541, + "grad_norm": 1.1740316152572632, + "learning_rate": 8.601475503166623e-06, + "loss": 0.9576, + "step": 24321 + }, + { + "epoch": 0.8710226153598224, + "grad_norm": 1.5303704738616943, + "learning_rate": 8.596769848383723e-06, + "loss": 1.1262, + "step": 24322 + }, + { + "epoch": 0.8710584274893907, + "grad_norm": 1.3245340585708618, + "learning_rate": 8.592065423313378e-06, + "loss": 0.8915, + "step": 24323 + }, + { + "epoch": 0.8710942396189589, + "grad_norm": 1.413736343383789, + "learning_rate": 8.587362228018892e-06, + "loss": 1.1007, + "step": 24324 + }, + { + "epoch": 0.8711300517485272, + "grad_norm": 1.6062325239181519, + "learning_rate": 8.582660262563558e-06, + "loss": 1.2053, + "step": 24325 + }, + { + "epoch": 0.8711658638780955, + "grad_norm": 1.638725757598877, + "learning_rate": 8.577959527010582e-06, + "loss": 0.8841, + "step": 24326 + }, + { + "epoch": 0.8712016760076638, + "grad_norm": 1.4728747606277466, + "learning_rate": 8.573260021423236e-06, + "loss": 0.9725, + "step": 24327 + }, + { + "epoch": 0.8712374881372321, + "grad_norm": 1.661342740058899, + "learning_rate": 8.568561745864766e-06, + "loss": 1.1506, + "step": 24328 + }, + { + "epoch": 0.8712733002668004, + "grad_norm": 1.4991447925567627, + "learning_rate": 8.563864700398338e-06, + "loss": 1.1413, + "step": 24329 + }, + { + "epoch": 0.8713091123963687, + "grad_norm": 1.504106879234314, + "learning_rate": 8.559168885087165e-06, + "loss": 1.1578, + "step": 24330 + }, + { + "epoch": 0.8713449245259369, + "grad_norm": 1.376562476158142, + "learning_rate": 8.554474299994431e-06, + "loss": 1.1211, + "step": 24331 + }, + { + "epoch": 0.8713807366555052, + "grad_norm": 1.9077814817428589, + "learning_rate": 8.549780945183306e-06, + "loss": 1.0951, + "step": 24332 + }, + { + "epoch": 0.8714165487850735, + "grad_norm": 1.8631563186645508, + "learning_rate": 8.545088820716895e-06, + "loss": 1.208, + "step": 24333 + }, + { + "epoch": 0.8714523609146418, + "grad_norm": 1.3134406805038452, + "learning_rate": 8.54039792665835e-06, + "loss": 0.9539, + "step": 24334 + }, + { + "epoch": 0.8714881730442101, + "grad_norm": 1.4219090938568115, + "learning_rate": 8.535708263070785e-06, + "loss": 1.0766, + "step": 24335 + }, + { + "epoch": 0.8715239851737784, + "grad_norm": 1.318912386894226, + "learning_rate": 8.531019830017272e-06, + "loss": 1.1042, + "step": 24336 + }, + { + "epoch": 0.8715597973033467, + "grad_norm": 1.3473271131515503, + "learning_rate": 8.526332627560906e-06, + "loss": 0.904, + "step": 24337 + }, + { + "epoch": 0.8715956094329149, + "grad_norm": 1.38038969039917, + "learning_rate": 8.521646655764736e-06, + "loss": 0.9543, + "step": 24338 + }, + { + "epoch": 0.8716314215624832, + "grad_norm": 1.4674136638641357, + "learning_rate": 8.516961914691835e-06, + "loss": 0.913, + "step": 24339 + }, + { + "epoch": 0.8716672336920515, + "grad_norm": 1.737212061882019, + "learning_rate": 8.512278404405182e-06, + "loss": 1.0746, + "step": 24340 + }, + { + "epoch": 0.8717030458216198, + "grad_norm": 1.2311174869537354, + "learning_rate": 8.507596124967821e-06, + "loss": 1.1308, + "step": 24341 + }, + { + "epoch": 0.8717388579511881, + "grad_norm": 1.7027322053909302, + "learning_rate": 8.50291507644273e-06, + "loss": 1.0825, + "step": 24342 + }, + { + "epoch": 0.8717746700807564, + "grad_norm": 1.4798485040664673, + "learning_rate": 8.498235258892907e-06, + "loss": 1.2543, + "step": 24343 + }, + { + "epoch": 0.8718104822103246, + "grad_norm": 1.2819360494613647, + "learning_rate": 8.493556672381297e-06, + "loss": 1.0614, + "step": 24344 + }, + { + "epoch": 0.8718462943398929, + "grad_norm": 1.4792096614837646, + "learning_rate": 8.488879316970832e-06, + "loss": 1.0001, + "step": 24345 + }, + { + "epoch": 0.8718821064694612, + "grad_norm": 2.0212669372558594, + "learning_rate": 8.484203192724482e-06, + "loss": 1.0883, + "step": 24346 + }, + { + "epoch": 0.8719179185990295, + "grad_norm": 1.4054896831512451, + "learning_rate": 8.479528299705108e-06, + "loss": 0.9478, + "step": 24347 + }, + { + "epoch": 0.8719537307285978, + "grad_norm": 1.468978762626648, + "learning_rate": 8.474854637975638e-06, + "loss": 0.9657, + "step": 24348 + }, + { + "epoch": 0.8719895428581661, + "grad_norm": 1.6097010374069214, + "learning_rate": 8.47018220759893e-06, + "loss": 1.382, + "step": 24349 + }, + { + "epoch": 0.8720253549877344, + "grad_norm": 1.536367654800415, + "learning_rate": 8.465511008637872e-06, + "loss": 1.1716, + "step": 24350 + }, + { + "epoch": 0.8720611671173026, + "grad_norm": 1.765263557434082, + "learning_rate": 8.460841041155277e-06, + "loss": 1.325, + "step": 24351 + }, + { + "epoch": 0.8720969792468709, + "grad_norm": 1.6076191663742065, + "learning_rate": 8.456172305213995e-06, + "loss": 0.9841, + "step": 24352 + }, + { + "epoch": 0.8721327913764392, + "grad_norm": 1.7217609882354736, + "learning_rate": 8.45150480087684e-06, + "loss": 1.1131, + "step": 24353 + }, + { + "epoch": 0.8721686035060074, + "grad_norm": 1.4891685247421265, + "learning_rate": 8.44683852820659e-06, + "loss": 1.1545, + "step": 24354 + }, + { + "epoch": 0.8722044156355758, + "grad_norm": 1.3529690504074097, + "learning_rate": 8.442173487266047e-06, + "loss": 0.9049, + "step": 24355 + }, + { + "epoch": 0.8722402277651441, + "grad_norm": 1.3290202617645264, + "learning_rate": 8.437509678117916e-06, + "loss": 0.9632, + "step": 24356 + }, + { + "epoch": 0.8722760398947124, + "grad_norm": 1.2095602750778198, + "learning_rate": 8.432847100825025e-06, + "loss": 0.8923, + "step": 24357 + }, + { + "epoch": 0.8723118520242806, + "grad_norm": 1.8624718189239502, + "learning_rate": 8.428185755450047e-06, + "loss": 1.0275, + "step": 24358 + }, + { + "epoch": 0.8723476641538489, + "grad_norm": 1.4701584577560425, + "learning_rate": 8.423525642055719e-06, + "loss": 1.1887, + "step": 24359 + }, + { + "epoch": 0.8723834762834172, + "grad_norm": 1.6968181133270264, + "learning_rate": 8.418866760704735e-06, + "loss": 1.1384, + "step": 24360 + }, + { + "epoch": 0.8724192884129854, + "grad_norm": 1.8770604133605957, + "learning_rate": 8.414209111459747e-06, + "loss": 1.1774, + "step": 24361 + }, + { + "epoch": 0.8724551005425537, + "grad_norm": 1.4763234853744507, + "learning_rate": 8.409552694383472e-06, + "loss": 0.9074, + "step": 24362 + }, + { + "epoch": 0.8724909126721221, + "grad_norm": 1.4898579120635986, + "learning_rate": 8.404897509538468e-06, + "loss": 1.0989, + "step": 24363 + }, + { + "epoch": 0.8725267248016904, + "grad_norm": 1.4071334600448608, + "learning_rate": 8.400243556987464e-06, + "loss": 0.8315, + "step": 24364 + }, + { + "epoch": 0.8725625369312586, + "grad_norm": 1.4749923944473267, + "learning_rate": 8.39559083679301e-06, + "loss": 1.122, + "step": 24365 + }, + { + "epoch": 0.8725983490608269, + "grad_norm": 1.301979422569275, + "learning_rate": 8.390939349017735e-06, + "loss": 0.9262, + "step": 24366 + }, + { + "epoch": 0.8726341611903952, + "grad_norm": 1.8377888202667236, + "learning_rate": 8.386289093724175e-06, + "loss": 1.065, + "step": 24367 + }, + { + "epoch": 0.8726699733199634, + "grad_norm": 1.5611344575881958, + "learning_rate": 8.38164007097495e-06, + "loss": 0.9735, + "step": 24368 + }, + { + "epoch": 0.8727057854495317, + "grad_norm": 2.152172803878784, + "learning_rate": 8.376992280832574e-06, + "loss": 1.1291, + "step": 24369 + }, + { + "epoch": 0.8727415975791001, + "grad_norm": 2.024851083755493, + "learning_rate": 8.372345723359553e-06, + "loss": 1.1519, + "step": 24370 + }, + { + "epoch": 0.8727774097086684, + "grad_norm": 1.5080907344818115, + "learning_rate": 8.367700398618472e-06, + "loss": 0.9742, + "step": 24371 + }, + { + "epoch": 0.8728132218382366, + "grad_norm": 1.2190998792648315, + "learning_rate": 8.363056306671757e-06, + "loss": 0.9765, + "step": 24372 + }, + { + "epoch": 0.8728490339678049, + "grad_norm": 1.4127062559127808, + "learning_rate": 8.358413447581937e-06, + "loss": 0.9649, + "step": 24373 + }, + { + "epoch": 0.8728848460973732, + "grad_norm": 1.9169955253601074, + "learning_rate": 8.353771821411415e-06, + "loss": 1.3681, + "step": 24374 + }, + { + "epoch": 0.8729206582269414, + "grad_norm": 1.362157940864563, + "learning_rate": 8.349131428222723e-06, + "loss": 1.0306, + "step": 24375 + }, + { + "epoch": 0.8729564703565097, + "grad_norm": 1.8077712059020996, + "learning_rate": 8.344492268078219e-06, + "loss": 0.9758, + "step": 24376 + }, + { + "epoch": 0.8729922824860781, + "grad_norm": 1.2307840585708618, + "learning_rate": 8.339854341040376e-06, + "loss": 1.0126, + "step": 24377 + }, + { + "epoch": 0.8730280946156463, + "grad_norm": 1.7792130708694458, + "learning_rate": 8.335217647171533e-06, + "loss": 1.0799, + "step": 24378 + }, + { + "epoch": 0.8730639067452146, + "grad_norm": 1.4851385354995728, + "learning_rate": 8.330582186534097e-06, + "loss": 1.002, + "step": 24379 + }, + { + "epoch": 0.8730997188747829, + "grad_norm": 1.6996582746505737, + "learning_rate": 8.32594795919045e-06, + "loss": 1.2923, + "step": 24380 + }, + { + "epoch": 0.8731355310043512, + "grad_norm": 1.6674515008926392, + "learning_rate": 8.321314965202898e-06, + "loss": 1.0685, + "step": 24381 + }, + { + "epoch": 0.8731713431339194, + "grad_norm": 1.5443168878555298, + "learning_rate": 8.316683204633814e-06, + "loss": 1.1076, + "step": 24382 + }, + { + "epoch": 0.8732071552634877, + "grad_norm": 2.0909674167633057, + "learning_rate": 8.312052677545478e-06, + "loss": 1.0943, + "step": 24383 + }, + { + "epoch": 0.8732429673930561, + "grad_norm": 1.4658212661743164, + "learning_rate": 8.307423384000224e-06, + "loss": 1.19, + "step": 24384 + }, + { + "epoch": 0.8732787795226243, + "grad_norm": 1.355003833770752, + "learning_rate": 8.302795324060287e-06, + "loss": 0.9891, + "step": 24385 + }, + { + "epoch": 0.8733145916521926, + "grad_norm": 1.7185864448547363, + "learning_rate": 8.298168497787984e-06, + "loss": 0.9764, + "step": 24386 + }, + { + "epoch": 0.8733504037817609, + "grad_norm": 1.3107906579971313, + "learning_rate": 8.293542905245543e-06, + "loss": 0.8896, + "step": 24387 + }, + { + "epoch": 0.8733862159113291, + "grad_norm": 1.2993016242980957, + "learning_rate": 8.288918546495172e-06, + "loss": 1.0741, + "step": 24388 + }, + { + "epoch": 0.8734220280408974, + "grad_norm": 1.9206403493881226, + "learning_rate": 8.284295421599097e-06, + "loss": 1.1745, + "step": 24389 + }, + { + "epoch": 0.8734578401704657, + "grad_norm": 1.4712661504745483, + "learning_rate": 8.279673530619525e-06, + "loss": 1.1089, + "step": 24390 + }, + { + "epoch": 0.8734936523000341, + "grad_norm": 1.384613037109375, + "learning_rate": 8.27505287361866e-06, + "loss": 1.05, + "step": 24391 + }, + { + "epoch": 0.8735294644296023, + "grad_norm": 1.7405502796173096, + "learning_rate": 8.270433450658621e-06, + "loss": 1.0962, + "step": 24392 + }, + { + "epoch": 0.8735652765591706, + "grad_norm": 1.2284884452819824, + "learning_rate": 8.265815261801568e-06, + "loss": 1.0816, + "step": 24393 + }, + { + "epoch": 0.8736010886887389, + "grad_norm": 1.7968741655349731, + "learning_rate": 8.261198307109651e-06, + "loss": 1.1491, + "step": 24394 + }, + { + "epoch": 0.8736369008183071, + "grad_norm": 1.7924907207489014, + "learning_rate": 8.25658258664499e-06, + "loss": 1.234, + "step": 24395 + }, + { + "epoch": 0.8736727129478754, + "grad_norm": 1.2710144519805908, + "learning_rate": 8.251968100469653e-06, + "loss": 0.9605, + "step": 24396 + }, + { + "epoch": 0.8737085250774437, + "grad_norm": 1.5111193656921387, + "learning_rate": 8.247354848645738e-06, + "loss": 1.1329, + "step": 24397 + }, + { + "epoch": 0.8737443372070121, + "grad_norm": 1.297457218170166, + "learning_rate": 8.242742831235339e-06, + "loss": 1.1112, + "step": 24398 + }, + { + "epoch": 0.8737801493365803, + "grad_norm": 1.5080335140228271, + "learning_rate": 8.23813204830045e-06, + "loss": 0.9035, + "step": 24399 + }, + { + "epoch": 0.8738159614661486, + "grad_norm": 1.4796111583709717, + "learning_rate": 8.233522499903123e-06, + "loss": 0.9367, + "step": 24400 + }, + { + "epoch": 0.8738517735957169, + "grad_norm": 1.278388500213623, + "learning_rate": 8.228914186105397e-06, + "loss": 0.9658, + "step": 24401 + }, + { + "epoch": 0.8738875857252851, + "grad_norm": 1.3660701513290405, + "learning_rate": 8.224307106969264e-06, + "loss": 1.0455, + "step": 24402 + }, + { + "epoch": 0.8739233978548534, + "grad_norm": 2.1175053119659424, + "learning_rate": 8.219701262556678e-06, + "loss": 0.9531, + "step": 24403 + }, + { + "epoch": 0.8739592099844217, + "grad_norm": 1.567977786064148, + "learning_rate": 8.21509665292962e-06, + "loss": 1.048, + "step": 24404 + }, + { + "epoch": 0.87399502211399, + "grad_norm": 1.4836935997009277, + "learning_rate": 8.210493278150066e-06, + "loss": 1.0155, + "step": 24405 + }, + { + "epoch": 0.8740308342435583, + "grad_norm": 1.5064046382904053, + "learning_rate": 8.205891138279898e-06, + "loss": 1.1498, + "step": 24406 + }, + { + "epoch": 0.8740666463731266, + "grad_norm": 1.5920662879943848, + "learning_rate": 8.201290233381075e-06, + "loss": 1.232, + "step": 24407 + }, + { + "epoch": 0.8741024585026949, + "grad_norm": 1.2765023708343506, + "learning_rate": 8.196690563515463e-06, + "loss": 0.7571, + "step": 24408 + }, + { + "epoch": 0.8741382706322631, + "grad_norm": 1.6526920795440674, + "learning_rate": 8.192092128744988e-06, + "loss": 0.9996, + "step": 24409 + }, + { + "epoch": 0.8741740827618314, + "grad_norm": 1.4342584609985352, + "learning_rate": 8.187494929131478e-06, + "loss": 1.0923, + "step": 24410 + }, + { + "epoch": 0.8742098948913997, + "grad_norm": 1.4707638025283813, + "learning_rate": 8.182898964736785e-06, + "loss": 1.084, + "step": 24411 + }, + { + "epoch": 0.874245707020968, + "grad_norm": 1.6014411449432373, + "learning_rate": 8.178304235622758e-06, + "loss": 1.3945, + "step": 24412 + }, + { + "epoch": 0.8742815191505363, + "grad_norm": 1.4478259086608887, + "learning_rate": 8.173710741851215e-06, + "loss": 1.0985, + "step": 24413 + }, + { + "epoch": 0.8743173312801046, + "grad_norm": 1.7437992095947266, + "learning_rate": 8.169118483483928e-06, + "loss": 1.1414, + "step": 24414 + }, + { + "epoch": 0.8743531434096729, + "grad_norm": 1.4639829397201538, + "learning_rate": 8.164527460582705e-06, + "loss": 1.0534, + "step": 24415 + }, + { + "epoch": 0.8743889555392411, + "grad_norm": 1.8823624849319458, + "learning_rate": 8.159937673209327e-06, + "loss": 1.2218, + "step": 24416 + }, + { + "epoch": 0.8744247676688094, + "grad_norm": 1.7782608270645142, + "learning_rate": 8.155349121425504e-06, + "loss": 1.1195, + "step": 24417 + }, + { + "epoch": 0.8744605797983777, + "grad_norm": 1.3767136335372925, + "learning_rate": 8.150761805292983e-06, + "loss": 0.9913, + "step": 24418 + }, + { + "epoch": 0.874496391927946, + "grad_norm": 1.5506069660186768, + "learning_rate": 8.146175724873485e-06, + "loss": 1.0346, + "step": 24419 + }, + { + "epoch": 0.8745322040575143, + "grad_norm": 1.2660441398620605, + "learning_rate": 8.141590880228722e-06, + "loss": 1.0214, + "step": 24420 + }, + { + "epoch": 0.8745680161870826, + "grad_norm": 1.4673702716827393, + "learning_rate": 8.137007271420349e-06, + "loss": 0.9985, + "step": 24421 + }, + { + "epoch": 0.8746038283166508, + "grad_norm": 1.56795072555542, + "learning_rate": 8.132424898510061e-06, + "loss": 1.045, + "step": 24422 + }, + { + "epoch": 0.8746396404462191, + "grad_norm": 1.4275093078613281, + "learning_rate": 8.127843761559506e-06, + "loss": 1.1271, + "step": 24423 + }, + { + "epoch": 0.8746754525757874, + "grad_norm": 1.7439837455749512, + "learning_rate": 8.123263860630282e-06, + "loss": 1.1206, + "step": 24424 + }, + { + "epoch": 0.8747112647053557, + "grad_norm": 1.504072904586792, + "learning_rate": 8.118685195784037e-06, + "loss": 0.9879, + "step": 24425 + }, + { + "epoch": 0.874747076834924, + "grad_norm": 1.4777636528015137, + "learning_rate": 8.114107767082358e-06, + "loss": 1.1472, + "step": 24426 + }, + { + "epoch": 0.8747828889644923, + "grad_norm": 1.6076477766036987, + "learning_rate": 8.109531574586859e-06, + "loss": 0.8845, + "step": 24427 + }, + { + "epoch": 0.8748187010940606, + "grad_norm": 2.6162705421447754, + "learning_rate": 8.10495661835906e-06, + "loss": 1.1165, + "step": 24428 + }, + { + "epoch": 0.8748545132236288, + "grad_norm": 1.984944224357605, + "learning_rate": 8.100382898460546e-06, + "loss": 1.2893, + "step": 24429 + }, + { + "epoch": 0.8748903253531971, + "grad_norm": 1.4153541326522827, + "learning_rate": 8.095810414952832e-06, + "loss": 0.9406, + "step": 24430 + }, + { + "epoch": 0.8749261374827654, + "grad_norm": 1.6982377767562866, + "learning_rate": 8.091239167897446e-06, + "loss": 1.1504, + "step": 24431 + }, + { + "epoch": 0.8749619496123336, + "grad_norm": 1.7575805187225342, + "learning_rate": 8.086669157355876e-06, + "loss": 1.2602, + "step": 24432 + }, + { + "epoch": 0.874997761741902, + "grad_norm": 1.5233957767486572, + "learning_rate": 8.082100383389613e-06, + "loss": 0.9817, + "step": 24433 + }, + { + "epoch": 0.8750335738714703, + "grad_norm": 1.542233943939209, + "learning_rate": 8.077532846060143e-06, + "loss": 1.0045, + "step": 24434 + }, + { + "epoch": 0.8750693860010386, + "grad_norm": 1.3083949089050293, + "learning_rate": 8.072966545428873e-06, + "loss": 1.0842, + "step": 24435 + }, + { + "epoch": 0.8751051981306068, + "grad_norm": 1.817145586013794, + "learning_rate": 8.068401481557263e-06, + "loss": 1.0018, + "step": 24436 + }, + { + "epoch": 0.8751410102601751, + "grad_norm": 1.5116678476333618, + "learning_rate": 8.063837654506734e-06, + "loss": 1.0883, + "step": 24437 + }, + { + "epoch": 0.8751768223897434, + "grad_norm": 1.3512272834777832, + "learning_rate": 8.059275064338689e-06, + "loss": 0.9695, + "step": 24438 + }, + { + "epoch": 0.8752126345193116, + "grad_norm": 1.6562557220458984, + "learning_rate": 8.054713711114491e-06, + "loss": 1.103, + "step": 24439 + }, + { + "epoch": 0.87524844664888, + "grad_norm": 2.078819513320923, + "learning_rate": 8.050153594895526e-06, + "loss": 1.1851, + "step": 24440 + }, + { + "epoch": 0.8752842587784483, + "grad_norm": 1.2073190212249756, + "learning_rate": 8.045594715743144e-06, + "loss": 0.7592, + "step": 24441 + }, + { + "epoch": 0.8753200709080166, + "grad_norm": 1.4624277353286743, + "learning_rate": 8.04103707371866e-06, + "loss": 1.0111, + "step": 24442 + }, + { + "epoch": 0.8753558830375848, + "grad_norm": 1.7580130100250244, + "learning_rate": 8.036480668883394e-06, + "loss": 1.0641, + "step": 24443 + }, + { + "epoch": 0.8753916951671531, + "grad_norm": 1.3616727590560913, + "learning_rate": 8.031925501298666e-06, + "loss": 0.9776, + "step": 24444 + }, + { + "epoch": 0.8754275072967214, + "grad_norm": 1.50996732711792, + "learning_rate": 8.027371571025765e-06, + "loss": 1.2173, + "step": 24445 + }, + { + "epoch": 0.8754633194262896, + "grad_norm": 1.4114432334899902, + "learning_rate": 8.022818878125926e-06, + "loss": 1.1168, + "step": 24446 + }, + { + "epoch": 0.875499131555858, + "grad_norm": 1.4301565885543823, + "learning_rate": 8.018267422660419e-06, + "loss": 1.246, + "step": 24447 + }, + { + "epoch": 0.8755349436854263, + "grad_norm": 1.267006278038025, + "learning_rate": 8.013717204690474e-06, + "loss": 0.9704, + "step": 24448 + }, + { + "epoch": 0.8755707558149946, + "grad_norm": 1.6164950132369995, + "learning_rate": 8.00916822427733e-06, + "loss": 1.0248, + "step": 24449 + }, + { + "epoch": 0.8756065679445628, + "grad_norm": 1.4524670839309692, + "learning_rate": 8.004620481482161e-06, + "loss": 0.9807, + "step": 24450 + }, + { + "epoch": 0.8756423800741311, + "grad_norm": 1.789200782775879, + "learning_rate": 8.00007397636613e-06, + "loss": 1.1284, + "step": 24451 + }, + { + "epoch": 0.8756781922036994, + "grad_norm": 1.2817091941833496, + "learning_rate": 7.995528708990463e-06, + "loss": 1.1575, + "step": 24452 + }, + { + "epoch": 0.8757140043332676, + "grad_norm": 1.7161985635757446, + "learning_rate": 7.990984679416269e-06, + "loss": 1.0627, + "step": 24453 + }, + { + "epoch": 0.875749816462836, + "grad_norm": 1.5022205114364624, + "learning_rate": 7.986441887704687e-06, + "loss": 0.977, + "step": 24454 + }, + { + "epoch": 0.8757856285924043, + "grad_norm": 1.5512934923171997, + "learning_rate": 7.981900333916848e-06, + "loss": 0.9499, + "step": 24455 + }, + { + "epoch": 0.8758214407219725, + "grad_norm": 1.6206021308898926, + "learning_rate": 7.977360018113855e-06, + "loss": 1.2038, + "step": 24456 + }, + { + "epoch": 0.8758572528515408, + "grad_norm": 1.5107909440994263, + "learning_rate": 7.972820940356785e-06, + "loss": 1.0707, + "step": 24457 + }, + { + "epoch": 0.8758930649811091, + "grad_norm": 1.775083065032959, + "learning_rate": 7.968283100706664e-06, + "loss": 1.0679, + "step": 24458 + }, + { + "epoch": 0.8759288771106774, + "grad_norm": 2.316127300262451, + "learning_rate": 7.963746499224611e-06, + "loss": 0.9719, + "step": 24459 + }, + { + "epoch": 0.8759646892402456, + "grad_norm": 1.5119149684906006, + "learning_rate": 7.959211135971622e-06, + "loss": 1.0772, + "step": 24460 + }, + { + "epoch": 0.876000501369814, + "grad_norm": 1.2340327501296997, + "learning_rate": 7.954677011008749e-06, + "loss": 0.8702, + "step": 24461 + }, + { + "epoch": 0.8760363134993823, + "grad_norm": 1.3779242038726807, + "learning_rate": 7.95014412439692e-06, + "loss": 1.0378, + "step": 24462 + }, + { + "epoch": 0.8760721256289505, + "grad_norm": 1.6914639472961426, + "learning_rate": 7.945612476197207e-06, + "loss": 1.2688, + "step": 24463 + }, + { + "epoch": 0.8761079377585188, + "grad_norm": 1.6267659664154053, + "learning_rate": 7.941082066470507e-06, + "loss": 0.9839, + "step": 24464 + }, + { + "epoch": 0.8761437498880871, + "grad_norm": 1.9874517917633057, + "learning_rate": 7.936552895277826e-06, + "loss": 1.3008, + "step": 24465 + }, + { + "epoch": 0.8761795620176553, + "grad_norm": 2.5448296070098877, + "learning_rate": 7.932024962680062e-06, + "loss": 1.1052, + "step": 24466 + }, + { + "epoch": 0.8762153741472236, + "grad_norm": 1.6308037042617798, + "learning_rate": 7.927498268738132e-06, + "loss": 0.9015, + "step": 24467 + }, + { + "epoch": 0.876251186276792, + "grad_norm": 1.7647360563278198, + "learning_rate": 7.922972813512974e-06, + "loss": 1.1415, + "step": 24468 + }, + { + "epoch": 0.8762869984063603, + "grad_norm": 1.323601245880127, + "learning_rate": 7.918448597065408e-06, + "loss": 1.1235, + "step": 24469 + }, + { + "epoch": 0.8763228105359285, + "grad_norm": 1.481039047241211, + "learning_rate": 7.913925619456374e-06, + "loss": 1.1951, + "step": 24470 + }, + { + "epoch": 0.8763586226654968, + "grad_norm": 1.339404582977295, + "learning_rate": 7.909403880746669e-06, + "loss": 1.0031, + "step": 24471 + }, + { + "epoch": 0.8763944347950651, + "grad_norm": 1.9528977870941162, + "learning_rate": 7.904883380997164e-06, + "loss": 1.0132, + "step": 24472 + }, + { + "epoch": 0.8764302469246333, + "grad_norm": 1.3657585382461548, + "learning_rate": 7.900364120268622e-06, + "loss": 1.1377, + "step": 24473 + }, + { + "epoch": 0.8764660590542016, + "grad_norm": 1.5277880430221558, + "learning_rate": 7.895846098621917e-06, + "loss": 0.8993, + "step": 24474 + }, + { + "epoch": 0.87650187118377, + "grad_norm": 1.703515887260437, + "learning_rate": 7.891329316117801e-06, + "loss": 1.1258, + "step": 24475 + }, + { + "epoch": 0.8765376833133383, + "grad_norm": 1.4354066848754883, + "learning_rate": 7.886813772817026e-06, + "loss": 0.8979, + "step": 24476 + }, + { + "epoch": 0.8765734954429065, + "grad_norm": 1.716810703277588, + "learning_rate": 7.88229946878034e-06, + "loss": 0.9497, + "step": 24477 + }, + { + "epoch": 0.8766093075724748, + "grad_norm": 1.700275182723999, + "learning_rate": 7.877786404068498e-06, + "loss": 1.2803, + "step": 24478 + }, + { + "epoch": 0.8766451197020431, + "grad_norm": 1.3127021789550781, + "learning_rate": 7.873274578742229e-06, + "loss": 1.0314, + "step": 24479 + }, + { + "epoch": 0.8766809318316113, + "grad_norm": 1.3631312847137451, + "learning_rate": 7.868763992862182e-06, + "loss": 0.9712, + "step": 24480 + }, + { + "epoch": 0.8767167439611796, + "grad_norm": 1.6239479780197144, + "learning_rate": 7.864254646489099e-06, + "loss": 0.9544, + "step": 24481 + }, + { + "epoch": 0.876752556090748, + "grad_norm": 1.293117642402649, + "learning_rate": 7.859746539683621e-06, + "loss": 1.1827, + "step": 24482 + }, + { + "epoch": 0.8767883682203162, + "grad_norm": 1.2259410619735718, + "learning_rate": 7.855239672506408e-06, + "loss": 1.1194, + "step": 24483 + }, + { + "epoch": 0.8768241803498845, + "grad_norm": 1.7329050302505493, + "learning_rate": 7.85073404501807e-06, + "loss": 1.0964, + "step": 24484 + }, + { + "epoch": 0.8768599924794528, + "grad_norm": 1.5379774570465088, + "learning_rate": 7.846229657279246e-06, + "loss": 1.1006, + "step": 24485 + }, + { + "epoch": 0.8768958046090211, + "grad_norm": 1.5629796981811523, + "learning_rate": 7.841726509350545e-06, + "loss": 1.1151, + "step": 24486 + }, + { + "epoch": 0.8769316167385893, + "grad_norm": 1.5264286994934082, + "learning_rate": 7.837224601292525e-06, + "loss": 0.9927, + "step": 24487 + }, + { + "epoch": 0.8769674288681576, + "grad_norm": 2.0146098136901855, + "learning_rate": 7.832723933165764e-06, + "loss": 1.2065, + "step": 24488 + }, + { + "epoch": 0.877003240997726, + "grad_norm": 1.463188886642456, + "learning_rate": 7.828224505030823e-06, + "loss": 1.0884, + "step": 24489 + }, + { + "epoch": 0.8770390531272942, + "grad_norm": 1.6750009059906006, + "learning_rate": 7.823726316948232e-06, + "loss": 1.1591, + "step": 24490 + }, + { + "epoch": 0.8770748652568625, + "grad_norm": 1.6696317195892334, + "learning_rate": 7.819229368978498e-06, + "loss": 1.1462, + "step": 24491 + }, + { + "epoch": 0.8771106773864308, + "grad_norm": 1.728314995765686, + "learning_rate": 7.814733661182116e-06, + "loss": 1.1927, + "step": 24492 + }, + { + "epoch": 0.877146489515999, + "grad_norm": 1.2629802227020264, + "learning_rate": 7.810239193619618e-06, + "loss": 0.9365, + "step": 24493 + }, + { + "epoch": 0.8771823016455673, + "grad_norm": 1.7370816469192505, + "learning_rate": 7.805745966351407e-06, + "loss": 0.8263, + "step": 24494 + }, + { + "epoch": 0.8772181137751356, + "grad_norm": 1.4036527872085571, + "learning_rate": 7.801253979437962e-06, + "loss": 1.0743, + "step": 24495 + }, + { + "epoch": 0.877253925904704, + "grad_norm": 1.3907568454742432, + "learning_rate": 7.796763232939719e-06, + "loss": 1.0609, + "step": 24496 + }, + { + "epoch": 0.8772897380342722, + "grad_norm": 1.803852915763855, + "learning_rate": 7.79227372691711e-06, + "loss": 1.0547, + "step": 24497 + }, + { + "epoch": 0.8773255501638405, + "grad_norm": 1.4528533220291138, + "learning_rate": 7.787785461430498e-06, + "loss": 1.1142, + "step": 24498 + }, + { + "epoch": 0.8773613622934088, + "grad_norm": 1.4028581380844116, + "learning_rate": 7.783298436540288e-06, + "loss": 1.1077, + "step": 24499 + }, + { + "epoch": 0.877397174422977, + "grad_norm": 1.52777099609375, + "learning_rate": 7.778812652306844e-06, + "loss": 1.1086, + "step": 24500 + }, + { + "epoch": 0.8774329865525453, + "grad_norm": 1.5394344329833984, + "learning_rate": 7.774328108790541e-06, + "loss": 1.0511, + "step": 24501 + }, + { + "epoch": 0.8774687986821136, + "grad_norm": 2.050502061843872, + "learning_rate": 7.769844806051674e-06, + "loss": 1.341, + "step": 24502 + }, + { + "epoch": 0.877504610811682, + "grad_norm": 1.266723871231079, + "learning_rate": 7.765362744150573e-06, + "loss": 1.0138, + "step": 24503 + }, + { + "epoch": 0.8775404229412502, + "grad_norm": 1.5434088706970215, + "learning_rate": 7.760881923147567e-06, + "loss": 1.22, + "step": 24504 + }, + { + "epoch": 0.8775762350708185, + "grad_norm": 1.3933305740356445, + "learning_rate": 7.756402343102897e-06, + "loss": 1.1208, + "step": 24505 + }, + { + "epoch": 0.8776120472003868, + "grad_norm": 1.7517962455749512, + "learning_rate": 7.751924004076837e-06, + "loss": 0.9552, + "step": 24506 + }, + { + "epoch": 0.877647859329955, + "grad_norm": 1.4886245727539062, + "learning_rate": 7.747446906129662e-06, + "loss": 1.1811, + "step": 24507 + }, + { + "epoch": 0.8776836714595233, + "grad_norm": 1.6065410375595093, + "learning_rate": 7.742971049321601e-06, + "loss": 1.0268, + "step": 24508 + }, + { + "epoch": 0.8777194835890916, + "grad_norm": 1.1146306991577148, + "learning_rate": 7.738496433712839e-06, + "loss": 0.9956, + "step": 24509 + }, + { + "epoch": 0.87775529571866, + "grad_norm": 1.3476835489273071, + "learning_rate": 7.734023059363605e-06, + "loss": 1.0708, + "step": 24510 + }, + { + "epoch": 0.8777911078482282, + "grad_norm": 1.3569194078445435, + "learning_rate": 7.729550926334094e-06, + "loss": 0.9952, + "step": 24511 + }, + { + "epoch": 0.8778269199777965, + "grad_norm": 1.4196091890335083, + "learning_rate": 7.72508003468444e-06, + "loss": 1.2893, + "step": 24512 + }, + { + "epoch": 0.8778627321073648, + "grad_norm": 1.4198973178863525, + "learning_rate": 7.720610384474802e-06, + "loss": 1.1152, + "step": 24513 + }, + { + "epoch": 0.877898544236933, + "grad_norm": 1.2766331434249878, + "learning_rate": 7.716141975765322e-06, + "loss": 1.1435, + "step": 24514 + }, + { + "epoch": 0.8779343563665013, + "grad_norm": 1.5546884536743164, + "learning_rate": 7.711674808616132e-06, + "loss": 1.1255, + "step": 24515 + }, + { + "epoch": 0.8779701684960696, + "grad_norm": 1.5469216108322144, + "learning_rate": 7.70720888308729e-06, + "loss": 1.1371, + "step": 24516 + }, + { + "epoch": 0.878005980625638, + "grad_norm": 2.0058372020721436, + "learning_rate": 7.70274419923892e-06, + "loss": 1.0359, + "step": 24517 + }, + { + "epoch": 0.8780417927552062, + "grad_norm": 1.5408062934875488, + "learning_rate": 7.69828075713106e-06, + "loss": 1.1013, + "step": 24518 + }, + { + "epoch": 0.8780776048847745, + "grad_norm": 1.7419089078903198, + "learning_rate": 7.693818556823784e-06, + "loss": 1.1881, + "step": 24519 + }, + { + "epoch": 0.8781134170143428, + "grad_norm": 1.3581538200378418, + "learning_rate": 7.6893575983771e-06, + "loss": 0.9907, + "step": 24520 + }, + { + "epoch": 0.878149229143911, + "grad_norm": 1.5594695806503296, + "learning_rate": 7.68489788185105e-06, + "loss": 1.195, + "step": 24521 + }, + { + "epoch": 0.8781850412734793, + "grad_norm": 1.3552120923995972, + "learning_rate": 7.680439407305629e-06, + "loss": 1.0982, + "step": 24522 + }, + { + "epoch": 0.8782208534030476, + "grad_norm": 1.4977115392684937, + "learning_rate": 7.675982174800788e-06, + "loss": 0.9822, + "step": 24523 + }, + { + "epoch": 0.8782566655326159, + "grad_norm": 1.4990519285202026, + "learning_rate": 7.671526184396527e-06, + "loss": 1.0964, + "step": 24524 + }, + { + "epoch": 0.8782924776621842, + "grad_norm": 1.8484232425689697, + "learning_rate": 7.667071436152784e-06, + "loss": 1.0676, + "step": 24525 + }, + { + "epoch": 0.8783282897917525, + "grad_norm": 1.3728986978530884, + "learning_rate": 7.662617930129502e-06, + "loss": 1.0775, + "step": 24526 + }, + { + "epoch": 0.8783641019213208, + "grad_norm": 1.5818982124328613, + "learning_rate": 7.658165666386585e-06, + "loss": 1.121, + "step": 24527 + }, + { + "epoch": 0.878399914050889, + "grad_norm": 1.414091944694519, + "learning_rate": 7.653714644983923e-06, + "loss": 0.9299, + "step": 24528 + }, + { + "epoch": 0.8784357261804573, + "grad_norm": 1.6123783588409424, + "learning_rate": 7.649264865981443e-06, + "loss": 1.2076, + "step": 24529 + }, + { + "epoch": 0.8784715383100256, + "grad_norm": 1.35234797000885, + "learning_rate": 7.644816329438952e-06, + "loss": 1.1089, + "step": 24530 + }, + { + "epoch": 0.8785073504395939, + "grad_norm": 1.3999133110046387, + "learning_rate": 7.640369035416339e-06, + "loss": 1.2118, + "step": 24531 + }, + { + "epoch": 0.8785431625691622, + "grad_norm": 1.3393474817276, + "learning_rate": 7.63592298397342e-06, + "loss": 0.9494, + "step": 24532 + }, + { + "epoch": 0.8785789746987305, + "grad_norm": 1.5074058771133423, + "learning_rate": 7.631478175170026e-06, + "loss": 1.027, + "step": 24533 + }, + { + "epoch": 0.8786147868282987, + "grad_norm": 1.5602079629898071, + "learning_rate": 7.627034609065942e-06, + "loss": 0.9879, + "step": 24534 + }, + { + "epoch": 0.878650598957867, + "grad_norm": 1.3569490909576416, + "learning_rate": 7.622592285720942e-06, + "loss": 0.9618, + "step": 24535 + }, + { + "epoch": 0.8786864110874353, + "grad_norm": 2.00715970993042, + "learning_rate": 7.618151205194813e-06, + "loss": 1.0324, + "step": 24536 + }, + { + "epoch": 0.8787222232170036, + "grad_norm": 2.2096691131591797, + "learning_rate": 7.613711367547316e-06, + "loss": 1.1612, + "step": 24537 + }, + { + "epoch": 0.8787580353465719, + "grad_norm": 1.8913681507110596, + "learning_rate": 7.609272772838138e-06, + "loss": 1.3843, + "step": 24538 + }, + { + "epoch": 0.8787938474761402, + "grad_norm": 1.2400161027908325, + "learning_rate": 7.604835421127021e-06, + "loss": 0.9829, + "step": 24539 + }, + { + "epoch": 0.8788296596057085, + "grad_norm": 1.589781641960144, + "learning_rate": 7.600399312473683e-06, + "loss": 1.2576, + "step": 24540 + }, + { + "epoch": 0.8788654717352767, + "grad_norm": 1.5826210975646973, + "learning_rate": 7.595964446937764e-06, + "loss": 1.2485, + "step": 24541 + }, + { + "epoch": 0.878901283864845, + "grad_norm": 1.5725592374801636, + "learning_rate": 7.591530824578952e-06, + "loss": 1.2435, + "step": 24542 + }, + { + "epoch": 0.8789370959944133, + "grad_norm": 1.257857322692871, + "learning_rate": 7.587098445456897e-06, + "loss": 1.0376, + "step": 24543 + }, + { + "epoch": 0.8789729081239815, + "grad_norm": 1.4514216184616089, + "learning_rate": 7.582667309631242e-06, + "loss": 1.0266, + "step": 24544 + }, + { + "epoch": 0.8790087202535499, + "grad_norm": 1.5605127811431885, + "learning_rate": 7.578237417161571e-06, + "loss": 1.1149, + "step": 24545 + }, + { + "epoch": 0.8790445323831182, + "grad_norm": 1.831618309020996, + "learning_rate": 7.573808768107504e-06, + "loss": 1.2089, + "step": 24546 + }, + { + "epoch": 0.8790803445126865, + "grad_norm": 1.5352708101272583, + "learning_rate": 7.569381362528638e-06, + "loss": 1.0124, + "step": 24547 + }, + { + "epoch": 0.8791161566422547, + "grad_norm": 1.2538220882415771, + "learning_rate": 7.5649552004844915e-06, + "loss": 1.0244, + "step": 24548 + }, + { + "epoch": 0.879151968771823, + "grad_norm": 1.8444302082061768, + "learning_rate": 7.560530282034662e-06, + "loss": 1.0379, + "step": 24549 + }, + { + "epoch": 0.8791877809013913, + "grad_norm": 1.380650281906128, + "learning_rate": 7.556106607238633e-06, + "loss": 1.0687, + "step": 24550 + }, + { + "epoch": 0.8792235930309595, + "grad_norm": 1.4955201148986816, + "learning_rate": 7.551684176155971e-06, + "loss": 1.0316, + "step": 24551 + }, + { + "epoch": 0.8792594051605279, + "grad_norm": 1.6359986066818237, + "learning_rate": 7.547262988846126e-06, + "loss": 1.0361, + "step": 24552 + }, + { + "epoch": 0.8792952172900962, + "grad_norm": 1.469880223274231, + "learning_rate": 7.542843045368609e-06, + "loss": 1.2095, + "step": 24553 + }, + { + "epoch": 0.8793310294196645, + "grad_norm": 1.4026966094970703, + "learning_rate": 7.538424345782902e-06, + "loss": 1.0721, + "step": 24554 + }, + { + "epoch": 0.8793668415492327, + "grad_norm": 1.4470685720443726, + "learning_rate": 7.534006890148404e-06, + "loss": 0.9777, + "step": 24555 + }, + { + "epoch": 0.879402653678801, + "grad_norm": 1.5515356063842773, + "learning_rate": 7.52959067852459e-06, + "loss": 1.0718, + "step": 24556 + }, + { + "epoch": 0.8794384658083693, + "grad_norm": 1.7721689939498901, + "learning_rate": 7.525175710970811e-06, + "loss": 1.0883, + "step": 24557 + }, + { + "epoch": 0.8794742779379375, + "grad_norm": 1.448683261871338, + "learning_rate": 7.520761987546554e-06, + "loss": 0.9852, + "step": 24558 + }, + { + "epoch": 0.8795100900675059, + "grad_norm": 1.7358622550964355, + "learning_rate": 7.516349508311138e-06, + "loss": 1.0301, + "step": 24559 + }, + { + "epoch": 0.8795459021970742, + "grad_norm": 1.7126250267028809, + "learning_rate": 7.51193827332396e-06, + "loss": 1.1934, + "step": 24560 + }, + { + "epoch": 0.8795817143266424, + "grad_norm": 1.5009711980819702, + "learning_rate": 7.507528282644316e-06, + "loss": 1.2037, + "step": 24561 + }, + { + "epoch": 0.8796175264562107, + "grad_norm": 1.4200831651687622, + "learning_rate": 7.503119536331604e-06, + "loss": 1.076, + "step": 24562 + }, + { + "epoch": 0.879653338585779, + "grad_norm": 1.612572431564331, + "learning_rate": 7.49871203444511e-06, + "loss": 0.9721, + "step": 24563 + }, + { + "epoch": 0.8796891507153473, + "grad_norm": 1.3891633749008179, + "learning_rate": 7.494305777044086e-06, + "loss": 1.0036, + "step": 24564 + }, + { + "epoch": 0.8797249628449155, + "grad_norm": 1.4219412803649902, + "learning_rate": 7.489900764187896e-06, + "loss": 1.2353, + "step": 24565 + }, + { + "epoch": 0.8797607749744839, + "grad_norm": 1.6109182834625244, + "learning_rate": 7.485496995935748e-06, + "loss": 1.087, + "step": 24566 + }, + { + "epoch": 0.8797965871040522, + "grad_norm": 1.168009877204895, + "learning_rate": 7.481094472346905e-06, + "loss": 1.0587, + "step": 24567 + }, + { + "epoch": 0.8798323992336204, + "grad_norm": 1.4668188095092773, + "learning_rate": 7.476693193480577e-06, + "loss": 1.1723, + "step": 24568 + }, + { + "epoch": 0.8798682113631887, + "grad_norm": 1.45083487033844, + "learning_rate": 7.472293159396027e-06, + "loss": 1.1284, + "step": 24569 + }, + { + "epoch": 0.879904023492757, + "grad_norm": 1.5043715238571167, + "learning_rate": 7.4678943701523954e-06, + "loss": 1.1107, + "step": 24570 + }, + { + "epoch": 0.8799398356223253, + "grad_norm": 2.0034306049346924, + "learning_rate": 7.4634968258089135e-06, + "loss": 1.1725, + "step": 24571 + }, + { + "epoch": 0.8799756477518935, + "grad_norm": 1.5498775243759155, + "learning_rate": 7.4591005264246895e-06, + "loss": 1.0113, + "step": 24572 + }, + { + "epoch": 0.8800114598814619, + "grad_norm": 2.139763593673706, + "learning_rate": 7.454705472058909e-06, + "loss": 0.9825, + "step": 24573 + }, + { + "epoch": 0.8800472720110302, + "grad_norm": 1.624570369720459, + "learning_rate": 7.450311662770704e-06, + "loss": 0.9533, + "step": 24574 + }, + { + "epoch": 0.8800830841405984, + "grad_norm": 1.4396629333496094, + "learning_rate": 7.445919098619159e-06, + "loss": 0.9767, + "step": 24575 + }, + { + "epoch": 0.8801188962701667, + "grad_norm": 1.3811004161834717, + "learning_rate": 7.441527779663382e-06, + "loss": 1.1236, + "step": 24576 + }, + { + "epoch": 0.880154708399735, + "grad_norm": 1.5447341203689575, + "learning_rate": 7.43713770596246e-06, + "loss": 1.0784, + "step": 24577 + }, + { + "epoch": 0.8801905205293032, + "grad_norm": 1.5772367715835571, + "learning_rate": 7.4327488775754794e-06, + "loss": 1.0541, + "step": 24578 + }, + { + "epoch": 0.8802263326588715, + "grad_norm": 1.4601738452911377, + "learning_rate": 7.428361294561415e-06, + "loss": 1.0431, + "step": 24579 + }, + { + "epoch": 0.8802621447884399, + "grad_norm": 1.8559173345565796, + "learning_rate": 7.423974956979374e-06, + "loss": 1.1307, + "step": 24580 + }, + { + "epoch": 0.8802979569180082, + "grad_norm": 1.3938319683074951, + "learning_rate": 7.419589864888332e-06, + "loss": 1.0529, + "step": 24581 + }, + { + "epoch": 0.8803337690475764, + "grad_norm": 1.5606220960617065, + "learning_rate": 7.415206018347287e-06, + "loss": 0.8591, + "step": 24582 + }, + { + "epoch": 0.8803695811771447, + "grad_norm": 1.3142238855361938, + "learning_rate": 7.410823417415203e-06, + "loss": 0.9328, + "step": 24583 + }, + { + "epoch": 0.880405393306713, + "grad_norm": 1.8055987358093262, + "learning_rate": 7.406442062151064e-06, + "loss": 1.0373, + "step": 24584 + }, + { + "epoch": 0.8804412054362812, + "grad_norm": 1.3382705450057983, + "learning_rate": 7.402061952613826e-06, + "loss": 0.8577, + "step": 24585 + }, + { + "epoch": 0.8804770175658495, + "grad_norm": 1.3975032567977905, + "learning_rate": 7.397683088862395e-06, + "loss": 1.091, + "step": 24586 + }, + { + "epoch": 0.8805128296954179, + "grad_norm": 1.352641224861145, + "learning_rate": 7.393305470955681e-06, + "loss": 0.8661, + "step": 24587 + }, + { + "epoch": 0.8805486418249862, + "grad_norm": 1.8922079801559448, + "learning_rate": 7.388929098952579e-06, + "loss": 0.9391, + "step": 24588 + }, + { + "epoch": 0.8805844539545544, + "grad_norm": 1.2099894285202026, + "learning_rate": 7.384553972912011e-06, + "loss": 1.0873, + "step": 24589 + }, + { + "epoch": 0.8806202660841227, + "grad_norm": 1.3405019044876099, + "learning_rate": 7.380180092892775e-06, + "loss": 1.098, + "step": 24590 + }, + { + "epoch": 0.880656078213691, + "grad_norm": 1.9687790870666504, + "learning_rate": 7.375807458953743e-06, + "loss": 1.2202, + "step": 24591 + }, + { + "epoch": 0.8806918903432592, + "grad_norm": 1.416220784187317, + "learning_rate": 7.37143607115377e-06, + "loss": 1.0131, + "step": 24592 + }, + { + "epoch": 0.8807277024728275, + "grad_norm": 1.529239296913147, + "learning_rate": 7.36706592955162e-06, + "loss": 0.9843, + "step": 24593 + }, + { + "epoch": 0.8807635146023959, + "grad_norm": 1.571297526359558, + "learning_rate": 7.362697034206112e-06, + "loss": 0.8778, + "step": 24594 + }, + { + "epoch": 0.8807993267319641, + "grad_norm": 1.5638182163238525, + "learning_rate": 7.358329385176033e-06, + "loss": 1.1097, + "step": 24595 + }, + { + "epoch": 0.8808351388615324, + "grad_norm": 1.6581871509552002, + "learning_rate": 7.353962982520135e-06, + "loss": 1.1525, + "step": 24596 + }, + { + "epoch": 0.8808709509911007, + "grad_norm": 1.6675150394439697, + "learning_rate": 7.34959782629715e-06, + "loss": 1.0023, + "step": 24597 + }, + { + "epoch": 0.880906763120669, + "grad_norm": 1.5263152122497559, + "learning_rate": 7.345233916565808e-06, + "loss": 1.0274, + "step": 24598 + }, + { + "epoch": 0.8809425752502372, + "grad_norm": 1.2160375118255615, + "learning_rate": 7.340871253384851e-06, + "loss": 1.0474, + "step": 24599 + }, + { + "epoch": 0.8809783873798055, + "grad_norm": 1.7761367559432983, + "learning_rate": 7.336509836812933e-06, + "loss": 0.9391, + "step": 24600 + }, + { + "epoch": 0.8810141995093739, + "grad_norm": 1.6131409406661987, + "learning_rate": 7.3321496669087495e-06, + "loss": 1.0852, + "step": 24601 + }, + { + "epoch": 0.8810500116389421, + "grad_norm": 1.2989872694015503, + "learning_rate": 7.327790743730956e-06, + "loss": 0.9926, + "step": 24602 + }, + { + "epoch": 0.8810858237685104, + "grad_norm": 1.3796120882034302, + "learning_rate": 7.323433067338214e-06, + "loss": 1.0104, + "step": 24603 + }, + { + "epoch": 0.8811216358980787, + "grad_norm": 1.8204296827316284, + "learning_rate": 7.319076637789124e-06, + "loss": 1.0034, + "step": 24604 + }, + { + "epoch": 0.881157448027647, + "grad_norm": 1.377787470817566, + "learning_rate": 7.314721455142304e-06, + "loss": 1.0621, + "step": 24605 + }, + { + "epoch": 0.8811932601572152, + "grad_norm": 1.3725217580795288, + "learning_rate": 7.310367519456352e-06, + "loss": 1.1938, + "step": 24606 + }, + { + "epoch": 0.8812290722867835, + "grad_norm": 1.6467009782791138, + "learning_rate": 7.306014830789865e-06, + "loss": 1.0993, + "step": 24607 + }, + { + "epoch": 0.8812648844163519, + "grad_norm": 1.5030295848846436, + "learning_rate": 7.3016633892013634e-06, + "loss": 1.1539, + "step": 24608 + }, + { + "epoch": 0.8813006965459201, + "grad_norm": 1.4564110040664673, + "learning_rate": 7.2973131947494e-06, + "loss": 0.9431, + "step": 24609 + }, + { + "epoch": 0.8813365086754884, + "grad_norm": 1.486098289489746, + "learning_rate": 7.292964247492539e-06, + "loss": 1.0385, + "step": 24610 + }, + { + "epoch": 0.8813723208050567, + "grad_norm": 1.2940870523452759, + "learning_rate": 7.288616547489235e-06, + "loss": 1.0358, + "step": 24611 + }, + { + "epoch": 0.8814081329346249, + "grad_norm": 1.3486065864562988, + "learning_rate": 7.284270094798018e-06, + "loss": 0.9584, + "step": 24612 + }, + { + "epoch": 0.8814439450641932, + "grad_norm": 1.297621250152588, + "learning_rate": 7.279924889477341e-06, + "loss": 1.1409, + "step": 24613 + }, + { + "epoch": 0.8814797571937615, + "grad_norm": 1.9812226295471191, + "learning_rate": 7.27558093158569e-06, + "loss": 1.2499, + "step": 24614 + }, + { + "epoch": 0.8815155693233299, + "grad_norm": 1.3047242164611816, + "learning_rate": 7.2712382211814865e-06, + "loss": 1.0895, + "step": 24615 + }, + { + "epoch": 0.8815513814528981, + "grad_norm": 1.1261281967163086, + "learning_rate": 7.266896758323149e-06, + "loss": 0.8477, + "step": 24616 + }, + { + "epoch": 0.8815871935824664, + "grad_norm": 1.7848286628723145, + "learning_rate": 7.2625565430691214e-06, + "loss": 0.8239, + "step": 24617 + }, + { + "epoch": 0.8816230057120347, + "grad_norm": 1.8151119947433472, + "learning_rate": 7.258217575477755e-06, + "loss": 1.1449, + "step": 24618 + }, + { + "epoch": 0.8816588178416029, + "grad_norm": 1.4841574430465698, + "learning_rate": 7.253879855607437e-06, + "loss": 1.1097, + "step": 24619 + }, + { + "epoch": 0.8816946299711712, + "grad_norm": 1.3886325359344482, + "learning_rate": 7.249543383516544e-06, + "loss": 0.9885, + "step": 24620 + }, + { + "epoch": 0.8817304421007395, + "grad_norm": 1.4462940692901611, + "learning_rate": 7.245208159263417e-06, + "loss": 1.1059, + "step": 24621 + }, + { + "epoch": 0.8817662542303079, + "grad_norm": 1.4268527030944824, + "learning_rate": 7.240874182906343e-06, + "loss": 1.0192, + "step": 24622 + }, + { + "epoch": 0.8818020663598761, + "grad_norm": 1.3595143556594849, + "learning_rate": 7.236541454503664e-06, + "loss": 1.1538, + "step": 24623 + }, + { + "epoch": 0.8818378784894444, + "grad_norm": 1.4948512315750122, + "learning_rate": 7.232209974113668e-06, + "loss": 1.0517, + "step": 24624 + }, + { + "epoch": 0.8818736906190127, + "grad_norm": 2.0243887901306152, + "learning_rate": 7.2278797417946405e-06, + "loss": 1.1894, + "step": 24625 + }, + { + "epoch": 0.8819095027485809, + "grad_norm": 1.481156349182129, + "learning_rate": 7.2235507576048024e-06, + "loss": 0.786, + "step": 24626 + }, + { + "epoch": 0.8819453148781492, + "grad_norm": 1.323594331741333, + "learning_rate": 7.219223021602417e-06, + "loss": 0.8961, + "step": 24627 + }, + { + "epoch": 0.8819811270077175, + "grad_norm": 1.5115394592285156, + "learning_rate": 7.214896533845716e-06, + "loss": 1.1202, + "step": 24628 + }, + { + "epoch": 0.8820169391372858, + "grad_norm": 1.6494203805923462, + "learning_rate": 7.210571294392898e-06, + "loss": 1.1486, + "step": 24629 + }, + { + "epoch": 0.8820527512668541, + "grad_norm": 1.3309684991836548, + "learning_rate": 7.206247303302138e-06, + "loss": 1.0464, + "step": 24630 + }, + { + "epoch": 0.8820885633964224, + "grad_norm": 1.9870409965515137, + "learning_rate": 7.201924560631634e-06, + "loss": 1.1737, + "step": 24631 + }, + { + "epoch": 0.8821243755259907, + "grad_norm": 1.515248417854309, + "learning_rate": 7.197603066439551e-06, + "loss": 0.9298, + "step": 24632 + }, + { + "epoch": 0.8821601876555589, + "grad_norm": 1.4083250761032104, + "learning_rate": 7.193282820783987e-06, + "loss": 0.9232, + "step": 24633 + }, + { + "epoch": 0.8821959997851272, + "grad_norm": 1.7557426691055298, + "learning_rate": 7.188963823723105e-06, + "loss": 1.1321, + "step": 24634 + }, + { + "epoch": 0.8822318119146955, + "grad_norm": 1.366318941116333, + "learning_rate": 7.184646075315005e-06, + "loss": 0.8513, + "step": 24635 + }, + { + "epoch": 0.8822676240442638, + "grad_norm": 1.464727520942688, + "learning_rate": 7.18032957561775e-06, + "loss": 1.1121, + "step": 24636 + }, + { + "epoch": 0.8823034361738321, + "grad_norm": 1.5384045839309692, + "learning_rate": 7.176014324689428e-06, + "loss": 1.1032, + "step": 24637 + }, + { + "epoch": 0.8823392483034004, + "grad_norm": 1.2650842666625977, + "learning_rate": 7.171700322588115e-06, + "loss": 0.9662, + "step": 24638 + }, + { + "epoch": 0.8823750604329686, + "grad_norm": 1.3544721603393555, + "learning_rate": 7.167387569371842e-06, + "loss": 0.9027, + "step": 24639 + }, + { + "epoch": 0.8824108725625369, + "grad_norm": 1.4673441648483276, + "learning_rate": 7.1630760650986065e-06, + "loss": 1.0886, + "step": 24640 + }, + { + "epoch": 0.8824466846921052, + "grad_norm": 1.5494791269302368, + "learning_rate": 7.158765809826429e-06, + "loss": 1.1674, + "step": 24641 + }, + { + "epoch": 0.8824824968216735, + "grad_norm": 1.4999048709869385, + "learning_rate": 7.154456803613297e-06, + "loss": 0.9813, + "step": 24642 + }, + { + "epoch": 0.8825183089512418, + "grad_norm": 1.6651114225387573, + "learning_rate": 7.150149046517218e-06, + "loss": 1.2461, + "step": 24643 + }, + { + "epoch": 0.8825541210808101, + "grad_norm": 1.3230962753295898, + "learning_rate": 7.145842538596104e-06, + "loss": 1.0264, + "step": 24644 + }, + { + "epoch": 0.8825899332103784, + "grad_norm": 1.5649346113204956, + "learning_rate": 7.141537279907873e-06, + "loss": 1.0838, + "step": 24645 + }, + { + "epoch": 0.8826257453399466, + "grad_norm": 1.3220256567001343, + "learning_rate": 7.1372332705105125e-06, + "loss": 0.9065, + "step": 24646 + }, + { + "epoch": 0.8826615574695149, + "grad_norm": 1.5275050401687622, + "learning_rate": 7.132930510461889e-06, + "loss": 1.2058, + "step": 24647 + }, + { + "epoch": 0.8826973695990832, + "grad_norm": 1.6474803686141968, + "learning_rate": 7.128628999819886e-06, + "loss": 1.1147, + "step": 24648 + }, + { + "epoch": 0.8827331817286515, + "grad_norm": 1.53829824924469, + "learning_rate": 7.1243287386423826e-06, + "loss": 1.0468, + "step": 24649 + }, + { + "epoch": 0.8827689938582198, + "grad_norm": 1.3875771760940552, + "learning_rate": 7.120029726987254e-06, + "loss": 1.0745, + "step": 24650 + }, + { + "epoch": 0.8828048059877881, + "grad_norm": 1.34926176071167, + "learning_rate": 7.11573196491232e-06, + "loss": 1.1789, + "step": 24651 + }, + { + "epoch": 0.8828406181173564, + "grad_norm": 1.5242843627929688, + "learning_rate": 7.111435452475368e-06, + "loss": 1.1454, + "step": 24652 + }, + { + "epoch": 0.8828764302469246, + "grad_norm": 1.4089300632476807, + "learning_rate": 7.1071401897342625e-06, + "loss": 0.9826, + "step": 24653 + }, + { + "epoch": 0.8829122423764929, + "grad_norm": 1.600061058998108, + "learning_rate": 7.1028461767467466e-06, + "loss": 1.2259, + "step": 24654 + }, + { + "epoch": 0.8829480545060612, + "grad_norm": 1.7001497745513916, + "learning_rate": 7.0985534135706296e-06, + "loss": 1.2116, + "step": 24655 + }, + { + "epoch": 0.8829838666356294, + "grad_norm": 1.6519767045974731, + "learning_rate": 7.0942619002635995e-06, + "loss": 1.1951, + "step": 24656 + }, + { + "epoch": 0.8830196787651978, + "grad_norm": 1.3067625761032104, + "learning_rate": 7.089971636883475e-06, + "loss": 1.0326, + "step": 24657 + }, + { + "epoch": 0.8830554908947661, + "grad_norm": 1.7507367134094238, + "learning_rate": 7.085682623487921e-06, + "loss": 1.0058, + "step": 24658 + }, + { + "epoch": 0.8830913030243344, + "grad_norm": 1.5127851963043213, + "learning_rate": 7.0813948601346715e-06, + "loss": 1.1801, + "step": 24659 + }, + { + "epoch": 0.8831271151539026, + "grad_norm": 1.7371385097503662, + "learning_rate": 7.077108346881378e-06, + "loss": 1.0124, + "step": 24660 + }, + { + "epoch": 0.8831629272834709, + "grad_norm": 1.2070778608322144, + "learning_rate": 7.07282308378574e-06, + "loss": 1.0325, + "step": 24661 + }, + { + "epoch": 0.8831987394130392, + "grad_norm": 2.036513090133667, + "learning_rate": 7.068539070905411e-06, + "loss": 1.3891, + "step": 24662 + }, + { + "epoch": 0.8832345515426074, + "grad_norm": 1.7395224571228027, + "learning_rate": 7.064256308297978e-06, + "loss": 1.3174, + "step": 24663 + }, + { + "epoch": 0.8832703636721758, + "grad_norm": 1.455288290977478, + "learning_rate": 7.05997479602114e-06, + "loss": 0.9242, + "step": 24664 + }, + { + "epoch": 0.8833061758017441, + "grad_norm": 1.3791251182556152, + "learning_rate": 7.0556945341324284e-06, + "loss": 1.1003, + "step": 24665 + }, + { + "epoch": 0.8833419879313124, + "grad_norm": 1.6263728141784668, + "learning_rate": 7.051415522689487e-06, + "loss": 1.2362, + "step": 24666 + }, + { + "epoch": 0.8833778000608806, + "grad_norm": 1.4323121309280396, + "learning_rate": 7.047137761749811e-06, + "loss": 1.0288, + "step": 24667 + }, + { + "epoch": 0.8834136121904489, + "grad_norm": 1.5993322134017944, + "learning_rate": 7.042861251371036e-06, + "loss": 0.9838, + "step": 24668 + }, + { + "epoch": 0.8834494243200172, + "grad_norm": 1.5984628200531006, + "learning_rate": 7.038585991610647e-06, + "loss": 1.0158, + "step": 24669 + }, + { + "epoch": 0.8834852364495854, + "grad_norm": 1.99858820438385, + "learning_rate": 7.034311982526165e-06, + "loss": 1.1499, + "step": 24670 + }, + { + "epoch": 0.8835210485791538, + "grad_norm": 1.9528908729553223, + "learning_rate": 7.0300392241751e-06, + "loss": 1.2284, + "step": 24671 + }, + { + "epoch": 0.8835568607087221, + "grad_norm": 1.2349481582641602, + "learning_rate": 7.025767716614928e-06, + "loss": 0.9453, + "step": 24672 + }, + { + "epoch": 0.8835926728382903, + "grad_norm": 1.5175682306289673, + "learning_rate": 7.021497459903137e-06, + "loss": 1.1901, + "step": 24673 + }, + { + "epoch": 0.8836284849678586, + "grad_norm": 1.8561638593673706, + "learning_rate": 7.017228454097136e-06, + "loss": 1.0791, + "step": 24674 + }, + { + "epoch": 0.8836642970974269, + "grad_norm": 1.6244242191314697, + "learning_rate": 7.012960699254423e-06, + "loss": 0.9771, + "step": 24675 + }, + { + "epoch": 0.8837001092269952, + "grad_norm": 1.4900058507919312, + "learning_rate": 7.0086941954323634e-06, + "loss": 0.9358, + "step": 24676 + }, + { + "epoch": 0.8837359213565634, + "grad_norm": 1.5437417030334473, + "learning_rate": 7.004428942688379e-06, + "loss": 1.0091, + "step": 24677 + }, + { + "epoch": 0.8837717334861318, + "grad_norm": 1.6090790033340454, + "learning_rate": 7.000164941079846e-06, + "loss": 1.0155, + "step": 24678 + }, + { + "epoch": 0.8838075456157001, + "grad_norm": 1.2392936944961548, + "learning_rate": 6.995902190664116e-06, + "loss": 1.0459, + "step": 24679 + }, + { + "epoch": 0.8838433577452683, + "grad_norm": 1.522114634513855, + "learning_rate": 6.99164069149858e-06, + "loss": 1.0661, + "step": 24680 + }, + { + "epoch": 0.8838791698748366, + "grad_norm": 1.4603097438812256, + "learning_rate": 6.9873804436405345e-06, + "loss": 1.0731, + "step": 24681 + }, + { + "epoch": 0.8839149820044049, + "grad_norm": 1.4340695142745972, + "learning_rate": 6.9831214471473e-06, + "loss": 0.9317, + "step": 24682 + }, + { + "epoch": 0.8839507941339732, + "grad_norm": 1.5659598112106323, + "learning_rate": 6.978863702076188e-06, + "loss": 1.08, + "step": 24683 + }, + { + "epoch": 0.8839866062635414, + "grad_norm": 1.5148402452468872, + "learning_rate": 6.974607208484496e-06, + "loss": 1.0665, + "step": 24684 + }, + { + "epoch": 0.8840224183931098, + "grad_norm": 1.2792232036590576, + "learning_rate": 6.970351966429445e-06, + "loss": 0.9716, + "step": 24685 + }, + { + "epoch": 0.8840582305226781, + "grad_norm": 1.5278716087341309, + "learning_rate": 6.966097975968311e-06, + "loss": 0.9605, + "step": 24686 + }, + { + "epoch": 0.8840940426522463, + "grad_norm": 1.6166942119598389, + "learning_rate": 6.961845237158337e-06, + "loss": 1.1139, + "step": 24687 + }, + { + "epoch": 0.8841298547818146, + "grad_norm": 1.315977692604065, + "learning_rate": 6.957593750056712e-06, + "loss": 0.9301, + "step": 24688 + }, + { + "epoch": 0.8841656669113829, + "grad_norm": 1.436269760131836, + "learning_rate": 6.953343514720656e-06, + "loss": 1.1876, + "step": 24689 + }, + { + "epoch": 0.8842014790409511, + "grad_norm": 1.557397484779358, + "learning_rate": 6.949094531207334e-06, + "loss": 0.994, + "step": 24690 + }, + { + "epoch": 0.8842372911705194, + "grad_norm": 1.7056962251663208, + "learning_rate": 6.944846799573934e-06, + "loss": 1.0043, + "step": 24691 + }, + { + "epoch": 0.8842731033000878, + "grad_norm": 2.077890157699585, + "learning_rate": 6.940600319877566e-06, + "loss": 1.0625, + "step": 24692 + }, + { + "epoch": 0.8843089154296561, + "grad_norm": 1.3635879755020142, + "learning_rate": 6.936355092175384e-06, + "loss": 1.0564, + "step": 24693 + }, + { + "epoch": 0.8843447275592243, + "grad_norm": 1.5278645753860474, + "learning_rate": 6.932111116524509e-06, + "loss": 1.0636, + "step": 24694 + }, + { + "epoch": 0.8843805396887926, + "grad_norm": 1.4421147108078003, + "learning_rate": 6.92786839298204e-06, + "loss": 1.2457, + "step": 24695 + }, + { + "epoch": 0.8844163518183609, + "grad_norm": 1.5062587261199951, + "learning_rate": 6.923626921605031e-06, + "loss": 1.001, + "step": 24696 + }, + { + "epoch": 0.8844521639479291, + "grad_norm": 1.367352843284607, + "learning_rate": 6.9193867024505695e-06, + "loss": 1.1202, + "step": 24697 + }, + { + "epoch": 0.8844879760774974, + "grad_norm": 1.198480248451233, + "learning_rate": 6.9151477355757e-06, + "loss": 1.0409, + "step": 24698 + }, + { + "epoch": 0.8845237882070658, + "grad_norm": 1.4974024295806885, + "learning_rate": 6.910910021037431e-06, + "loss": 1.005, + "step": 24699 + }, + { + "epoch": 0.884559600336634, + "grad_norm": 1.2842371463775635, + "learning_rate": 6.906673558892807e-06, + "loss": 1.1127, + "step": 24700 + }, + { + "epoch": 0.8845954124662023, + "grad_norm": 1.2943683862686157, + "learning_rate": 6.902438349198792e-06, + "loss": 1.2486, + "step": 24701 + }, + { + "epoch": 0.8846312245957706, + "grad_norm": 1.8238178491592407, + "learning_rate": 6.898204392012408e-06, + "loss": 1.2456, + "step": 24702 + }, + { + "epoch": 0.8846670367253389, + "grad_norm": 1.5489445924758911, + "learning_rate": 6.893971687390566e-06, + "loss": 1.145, + "step": 24703 + }, + { + "epoch": 0.8847028488549071, + "grad_norm": 1.6776583194732666, + "learning_rate": 6.889740235390241e-06, + "loss": 1.0645, + "step": 24704 + }, + { + "epoch": 0.8847386609844754, + "grad_norm": 1.6833537817001343, + "learning_rate": 6.885510036068377e-06, + "loss": 1.1826, + "step": 24705 + }, + { + "epoch": 0.8847744731140438, + "grad_norm": 1.6026887893676758, + "learning_rate": 6.881281089481839e-06, + "loss": 1.0388, + "step": 24706 + }, + { + "epoch": 0.884810285243612, + "grad_norm": 1.4883896112442017, + "learning_rate": 6.877053395687561e-06, + "loss": 1.2153, + "step": 24707 + }, + { + "epoch": 0.8848460973731803, + "grad_norm": 1.7314536571502686, + "learning_rate": 6.872826954742406e-06, + "loss": 1.2653, + "step": 24708 + }, + { + "epoch": 0.8848819095027486, + "grad_norm": 1.564477801322937, + "learning_rate": 6.868601766703253e-06, + "loss": 0.9235, + "step": 24709 + }, + { + "epoch": 0.8849177216323169, + "grad_norm": 1.5381165742874146, + "learning_rate": 6.8643778316269226e-06, + "loss": 1.0084, + "step": 24710 + }, + { + "epoch": 0.8849535337618851, + "grad_norm": 1.7533910274505615, + "learning_rate": 6.860155149570246e-06, + "loss": 1.2038, + "step": 24711 + }, + { + "epoch": 0.8849893458914534, + "grad_norm": 1.3483706712722778, + "learning_rate": 6.855933720590047e-06, + "loss": 1.0756, + "step": 24712 + }, + { + "epoch": 0.8850251580210218, + "grad_norm": 1.9112491607666016, + "learning_rate": 6.8517135447431215e-06, + "loss": 1.1708, + "step": 24713 + }, + { + "epoch": 0.88506097015059, + "grad_norm": 1.2876783609390259, + "learning_rate": 6.847494622086226e-06, + "loss": 0.9596, + "step": 24714 + }, + { + "epoch": 0.8850967822801583, + "grad_norm": 1.6386834383010864, + "learning_rate": 6.843276952676125e-06, + "loss": 1.0816, + "step": 24715 + }, + { + "epoch": 0.8851325944097266, + "grad_norm": 1.5228509902954102, + "learning_rate": 6.839060536569597e-06, + "loss": 1.0748, + "step": 24716 + }, + { + "epoch": 0.8851684065392948, + "grad_norm": 1.345090389251709, + "learning_rate": 6.834845373823317e-06, + "loss": 0.903, + "step": 24717 + }, + { + "epoch": 0.8852042186688631, + "grad_norm": 1.4915560483932495, + "learning_rate": 6.830631464494019e-06, + "loss": 1.1492, + "step": 24718 + }, + { + "epoch": 0.8852400307984314, + "grad_norm": 1.4256958961486816, + "learning_rate": 6.826418808638391e-06, + "loss": 1.2089, + "step": 24719 + }, + { + "epoch": 0.8852758429279998, + "grad_norm": 1.4362207651138306, + "learning_rate": 6.82220740631313e-06, + "loss": 1.0295, + "step": 24720 + }, + { + "epoch": 0.885311655057568, + "grad_norm": 1.6674654483795166, + "learning_rate": 6.8179972575748706e-06, + "loss": 1.0798, + "step": 24721 + }, + { + "epoch": 0.8853474671871363, + "grad_norm": 1.418357491493225, + "learning_rate": 6.813788362480256e-06, + "loss": 0.9395, + "step": 24722 + }, + { + "epoch": 0.8853832793167046, + "grad_norm": 1.3574403524398804, + "learning_rate": 6.809580721085929e-06, + "loss": 1.1424, + "step": 24723 + }, + { + "epoch": 0.8854190914462728, + "grad_norm": 1.5260215997695923, + "learning_rate": 6.805374333448478e-06, + "loss": 0.8781, + "step": 24724 + }, + { + "epoch": 0.8854549035758411, + "grad_norm": 1.3360873460769653, + "learning_rate": 6.801169199624502e-06, + "loss": 1.0368, + "step": 24725 + }, + { + "epoch": 0.8854907157054094, + "grad_norm": 1.3493996858596802, + "learning_rate": 6.796965319670568e-06, + "loss": 1.1865, + "step": 24726 + }, + { + "epoch": 0.8855265278349778, + "grad_norm": 1.7585631608963013, + "learning_rate": 6.792762693643262e-06, + "loss": 1.0442, + "step": 24727 + }, + { + "epoch": 0.885562339964546, + "grad_norm": 1.3570653200149536, + "learning_rate": 6.7885613215990965e-06, + "loss": 1.064, + "step": 24728 + }, + { + "epoch": 0.8855981520941143, + "grad_norm": 1.914278507232666, + "learning_rate": 6.7843612035945915e-06, + "loss": 1.2844, + "step": 24729 + }, + { + "epoch": 0.8856339642236826, + "grad_norm": 1.9665110111236572, + "learning_rate": 6.78016233968628e-06, + "loss": 1.0711, + "step": 24730 + }, + { + "epoch": 0.8856697763532508, + "grad_norm": 1.5753049850463867, + "learning_rate": 6.775964729930651e-06, + "loss": 1.0127, + "step": 24731 + }, + { + "epoch": 0.8857055884828191, + "grad_norm": 1.2621217966079712, + "learning_rate": 6.771768374384168e-06, + "loss": 0.8154, + "step": 24732 + }, + { + "epoch": 0.8857414006123874, + "grad_norm": 1.4563289880752563, + "learning_rate": 6.767573273103245e-06, + "loss": 1.0761, + "step": 24733 + }, + { + "epoch": 0.8857772127419558, + "grad_norm": 1.3421992063522339, + "learning_rate": 6.7633794261444005e-06, + "loss": 1.1388, + "step": 24734 + }, + { + "epoch": 0.885813024871524, + "grad_norm": 1.7052502632141113, + "learning_rate": 6.7591868335640016e-06, + "loss": 1.1118, + "step": 24735 + }, + { + "epoch": 0.8858488370010923, + "grad_norm": 1.4717538356781006, + "learning_rate": 6.754995495418482e-06, + "loss": 1.1192, + "step": 24736 + }, + { + "epoch": 0.8858846491306606, + "grad_norm": 1.6601499319076538, + "learning_rate": 6.750805411764205e-06, + "loss": 1.0289, + "step": 24737 + }, + { + "epoch": 0.8859204612602288, + "grad_norm": 1.4687801599502563, + "learning_rate": 6.746616582657583e-06, + "loss": 1.0926, + "step": 24738 + }, + { + "epoch": 0.8859562733897971, + "grad_norm": 1.646043062210083, + "learning_rate": 6.742429008154927e-06, + "loss": 1.1205, + "step": 24739 + }, + { + "epoch": 0.8859920855193654, + "grad_norm": 1.437503457069397, + "learning_rate": 6.738242688312602e-06, + "loss": 1.0156, + "step": 24740 + }, + { + "epoch": 0.8860278976489337, + "grad_norm": 1.6723984479904175, + "learning_rate": 6.734057623186929e-06, + "loss": 1.0379, + "step": 24741 + }, + { + "epoch": 0.886063709778502, + "grad_norm": 1.605608582496643, + "learning_rate": 6.729873812834198e-06, + "loss": 0.9364, + "step": 24742 + }, + { + "epoch": 0.8860995219080703, + "grad_norm": 1.6099116802215576, + "learning_rate": 6.725691257310718e-06, + "loss": 1.2013, + "step": 24743 + }, + { + "epoch": 0.8861353340376386, + "grad_norm": 1.5440309047698975, + "learning_rate": 6.721509956672711e-06, + "loss": 1.0796, + "step": 24744 + }, + { + "epoch": 0.8861711461672068, + "grad_norm": 1.7223788499832153, + "learning_rate": 6.7173299109765e-06, + "loss": 1.1149, + "step": 24745 + }, + { + "epoch": 0.8862069582967751, + "grad_norm": 2.0385303497314453, + "learning_rate": 6.713151120278283e-06, + "loss": 1.1772, + "step": 24746 + }, + { + "epoch": 0.8862427704263434, + "grad_norm": 1.825337290763855, + "learning_rate": 6.7089735846342815e-06, + "loss": 1.0327, + "step": 24747 + }, + { + "epoch": 0.8862785825559117, + "grad_norm": 1.4460850954055786, + "learning_rate": 6.704797304100707e-06, + "loss": 0.9058, + "step": 24748 + }, + { + "epoch": 0.88631439468548, + "grad_norm": 1.868308424949646, + "learning_rate": 6.700622278733748e-06, + "loss": 1.089, + "step": 24749 + }, + { + "epoch": 0.8863502068150483, + "grad_norm": 1.5673421621322632, + "learning_rate": 6.69644850858957e-06, + "loss": 1.071, + "step": 24750 + }, + { + "epoch": 0.8863860189446165, + "grad_norm": 1.6754724979400635, + "learning_rate": 6.692275993724295e-06, + "loss": 1.0228, + "step": 24751 + }, + { + "epoch": 0.8864218310741848, + "grad_norm": 1.7604631185531616, + "learning_rate": 6.688104734194123e-06, + "loss": 1.0589, + "step": 24752 + }, + { + "epoch": 0.8864576432037531, + "grad_norm": 1.332777738571167, + "learning_rate": 6.683934730055119e-06, + "loss": 0.9946, + "step": 24753 + }, + { + "epoch": 0.8864934553333214, + "grad_norm": 1.608269214630127, + "learning_rate": 6.679765981363417e-06, + "loss": 1.2673, + "step": 24754 + }, + { + "epoch": 0.8865292674628896, + "grad_norm": 1.1981017589569092, + "learning_rate": 6.675598488175061e-06, + "loss": 1.0242, + "step": 24755 + }, + { + "epoch": 0.886565079592458, + "grad_norm": 1.2775262594223022, + "learning_rate": 6.671432250546184e-06, + "loss": 1.06, + "step": 24756 + }, + { + "epoch": 0.8866008917220263, + "grad_norm": 1.7825862169265747, + "learning_rate": 6.6672672685327955e-06, + "loss": 1.1236, + "step": 24757 + }, + { + "epoch": 0.8866367038515945, + "grad_norm": 1.7374963760375977, + "learning_rate": 6.663103542190918e-06, + "loss": 1.2737, + "step": 24758 + }, + { + "epoch": 0.8866725159811628, + "grad_norm": 1.4065005779266357, + "learning_rate": 6.658941071576597e-06, + "loss": 0.9759, + "step": 24759 + }, + { + "epoch": 0.8867083281107311, + "grad_norm": 1.6148642301559448, + "learning_rate": 6.654779856745807e-06, + "loss": 1.2663, + "step": 24760 + }, + { + "epoch": 0.8867441402402993, + "grad_norm": 1.5960971117019653, + "learning_rate": 6.650619897754573e-06, + "loss": 1.0159, + "step": 24761 + }, + { + "epoch": 0.8867799523698676, + "grad_norm": 1.2692118883132935, + "learning_rate": 6.646461194658804e-06, + "loss": 1.1656, + "step": 24762 + }, + { + "epoch": 0.886815764499436, + "grad_norm": 1.5209263563156128, + "learning_rate": 6.642303747514511e-06, + "loss": 1.0163, + "step": 24763 + }, + { + "epoch": 0.8868515766290043, + "grad_norm": 1.3790181875228882, + "learning_rate": 6.638147556377583e-06, + "loss": 0.9164, + "step": 24764 + }, + { + "epoch": 0.8868873887585725, + "grad_norm": 1.3841795921325684, + "learning_rate": 6.633992621303975e-06, + "loss": 1.2191, + "step": 24765 + }, + { + "epoch": 0.8869232008881408, + "grad_norm": 1.1842050552368164, + "learning_rate": 6.629838942349542e-06, + "loss": 1.0091, + "step": 24766 + }, + { + "epoch": 0.8869590130177091, + "grad_norm": 1.2403095960617065, + "learning_rate": 6.625686519570184e-06, + "loss": 1.1555, + "step": 24767 + }, + { + "epoch": 0.8869948251472773, + "grad_norm": 1.5377278327941895, + "learning_rate": 6.621535353021791e-06, + "loss": 1.0702, + "step": 24768 + }, + { + "epoch": 0.8870306372768456, + "grad_norm": 1.9620176553726196, + "learning_rate": 6.617385442760171e-06, + "loss": 1.1674, + "step": 24769 + }, + { + "epoch": 0.887066449406414, + "grad_norm": 1.8240423202514648, + "learning_rate": 6.61323678884117e-06, + "loss": 1.1631, + "step": 24770 + }, + { + "epoch": 0.8871022615359823, + "grad_norm": 1.5382505655288696, + "learning_rate": 6.6090893913206106e-06, + "loss": 1.1788, + "step": 24771 + }, + { + "epoch": 0.8871380736655505, + "grad_norm": 1.2263050079345703, + "learning_rate": 6.604943250254303e-06, + "loss": 0.7592, + "step": 24772 + }, + { + "epoch": 0.8871738857951188, + "grad_norm": 1.5225648880004883, + "learning_rate": 6.600798365697991e-06, + "loss": 0.9988, + "step": 24773 + }, + { + "epoch": 0.8872096979246871, + "grad_norm": 1.7057523727416992, + "learning_rate": 6.596654737707486e-06, + "loss": 1.1312, + "step": 24774 + }, + { + "epoch": 0.8872455100542553, + "grad_norm": 1.4237873554229736, + "learning_rate": 6.592512366338499e-06, + "loss": 1.0413, + "step": 24775 + }, + { + "epoch": 0.8872813221838236, + "grad_norm": 1.5121934413909912, + "learning_rate": 6.588371251646774e-06, + "loss": 1.1663, + "step": 24776 + }, + { + "epoch": 0.887317134313392, + "grad_norm": 1.3967664241790771, + "learning_rate": 6.584231393688012e-06, + "loss": 1.1406, + "step": 24777 + }, + { + "epoch": 0.8873529464429603, + "grad_norm": 1.4884734153747559, + "learning_rate": 6.5800927925179115e-06, + "loss": 1.043, + "step": 24778 + }, + { + "epoch": 0.8873887585725285, + "grad_norm": 1.476714849472046, + "learning_rate": 6.575955448192184e-06, + "loss": 1.0275, + "step": 24779 + }, + { + "epoch": 0.8874245707020968, + "grad_norm": 1.3075414896011353, + "learning_rate": 6.5718193607664516e-06, + "loss": 0.8098, + "step": 24780 + }, + { + "epoch": 0.8874603828316651, + "grad_norm": 2.3002066612243652, + "learning_rate": 6.5676845302963805e-06, + "loss": 1.1229, + "step": 24781 + }, + { + "epoch": 0.8874961949612333, + "grad_norm": 1.5903412103652954, + "learning_rate": 6.563550956837594e-06, + "loss": 0.9696, + "step": 24782 + }, + { + "epoch": 0.8875320070908016, + "grad_norm": 1.867339015007019, + "learning_rate": 6.559418640445714e-06, + "loss": 1.1245, + "step": 24783 + }, + { + "epoch": 0.88756781922037, + "grad_norm": 1.271859049797058, + "learning_rate": 6.555287581176317e-06, + "loss": 1.1395, + "step": 24784 + }, + { + "epoch": 0.8876036313499382, + "grad_norm": 1.4108529090881348, + "learning_rate": 6.551157779084982e-06, + "loss": 0.982, + "step": 24785 + }, + { + "epoch": 0.8876394434795065, + "grad_norm": 1.2754545211791992, + "learning_rate": 6.547029234227298e-06, + "loss": 1.1148, + "step": 24786 + }, + { + "epoch": 0.8876752556090748, + "grad_norm": 1.3372535705566406, + "learning_rate": 6.5429019466587745e-06, + "loss": 1.2296, + "step": 24787 + }, + { + "epoch": 0.8877110677386431, + "grad_norm": 1.451960563659668, + "learning_rate": 6.5387759164349585e-06, + "loss": 1.1186, + "step": 24788 + }, + { + "epoch": 0.8877468798682113, + "grad_norm": 1.2318989038467407, + "learning_rate": 6.5346511436113585e-06, + "loss": 0.905, + "step": 24789 + }, + { + "epoch": 0.8877826919977796, + "grad_norm": 1.7072198390960693, + "learning_rate": 6.5305276282434765e-06, + "loss": 1.0162, + "step": 24790 + }, + { + "epoch": 0.887818504127348, + "grad_norm": 1.5960217714309692, + "learning_rate": 6.526405370386757e-06, + "loss": 1.1127, + "step": 24791 + }, + { + "epoch": 0.8878543162569162, + "grad_norm": 1.4923123121261597, + "learning_rate": 6.522284370096687e-06, + "loss": 1.1319, + "step": 24792 + }, + { + "epoch": 0.8878901283864845, + "grad_norm": 1.8055779933929443, + "learning_rate": 6.518164627428724e-06, + "loss": 1.2445, + "step": 24793 + }, + { + "epoch": 0.8879259405160528, + "grad_norm": 1.5496807098388672, + "learning_rate": 6.514046142438246e-06, + "loss": 1.0293, + "step": 24794 + }, + { + "epoch": 0.887961752645621, + "grad_norm": 1.7351555824279785, + "learning_rate": 6.509928915180697e-06, + "loss": 1.1432, + "step": 24795 + }, + { + "epoch": 0.8879975647751893, + "grad_norm": 1.3093360662460327, + "learning_rate": 6.505812945711454e-06, + "loss": 0.9949, + "step": 24796 + }, + { + "epoch": 0.8880333769047576, + "grad_norm": 1.5363845825195312, + "learning_rate": 6.501698234085929e-06, + "loss": 1.1274, + "step": 24797 + }, + { + "epoch": 0.888069189034326, + "grad_norm": 1.4915670156478882, + "learning_rate": 6.497584780359423e-06, + "loss": 1.0158, + "step": 24798 + }, + { + "epoch": 0.8881050011638942, + "grad_norm": 1.7239021062850952, + "learning_rate": 6.4934725845873016e-06, + "loss": 1.2085, + "step": 24799 + }, + { + "epoch": 0.8881408132934625, + "grad_norm": 1.6954907178878784, + "learning_rate": 6.489361646824898e-06, + "loss": 0.9874, + "step": 24800 + }, + { + "epoch": 0.8881766254230308, + "grad_norm": 1.5137981176376343, + "learning_rate": 6.485251967127526e-06, + "loss": 1.2008, + "step": 24801 + }, + { + "epoch": 0.888212437552599, + "grad_norm": 1.5367071628570557, + "learning_rate": 6.48114354555045e-06, + "loss": 1.0433, + "step": 24802 + }, + { + "epoch": 0.8882482496821673, + "grad_norm": 1.218315839767456, + "learning_rate": 6.47703638214896e-06, + "loss": 1.0348, + "step": 24803 + }, + { + "epoch": 0.8882840618117356, + "grad_norm": 1.2479069232940674, + "learning_rate": 6.4729304769783225e-06, + "loss": 1.0182, + "step": 24804 + }, + { + "epoch": 0.888319873941304, + "grad_norm": 1.3354337215423584, + "learning_rate": 6.468825830093739e-06, + "loss": 1.1814, + "step": 24805 + }, + { + "epoch": 0.8883556860708722, + "grad_norm": 1.589475393295288, + "learning_rate": 6.4647224415504745e-06, + "loss": 1.153, + "step": 24806 + }, + { + "epoch": 0.8883914982004405, + "grad_norm": 1.4259674549102783, + "learning_rate": 6.460620311403709e-06, + "loss": 1.1398, + "step": 24807 + }, + { + "epoch": 0.8884273103300088, + "grad_norm": 1.8924700021743774, + "learning_rate": 6.456519439708653e-06, + "loss": 1.138, + "step": 24808 + }, + { + "epoch": 0.888463122459577, + "grad_norm": 1.6465340852737427, + "learning_rate": 6.452419826520451e-06, + "loss": 1.0617, + "step": 24809 + }, + { + "epoch": 0.8884989345891453, + "grad_norm": 1.5402542352676392, + "learning_rate": 6.44832147189427e-06, + "loss": 1.0986, + "step": 24810 + }, + { + "epoch": 0.8885347467187136, + "grad_norm": 1.5021657943725586, + "learning_rate": 6.444224375885277e-06, + "loss": 1.1162, + "step": 24811 + }, + { + "epoch": 0.888570558848282, + "grad_norm": 1.4267688989639282, + "learning_rate": 6.44012853854854e-06, + "loss": 0.9925, + "step": 24812 + }, + { + "epoch": 0.8886063709778502, + "grad_norm": 1.315917730331421, + "learning_rate": 6.436033959939192e-06, + "loss": 0.8985, + "step": 24813 + }, + { + "epoch": 0.8886421831074185, + "grad_norm": 1.202033281326294, + "learning_rate": 6.431940640112322e-06, + "loss": 1.1771, + "step": 24814 + }, + { + "epoch": 0.8886779952369868, + "grad_norm": 1.5704433917999268, + "learning_rate": 6.4278485791230195e-06, + "loss": 1.1117, + "step": 24815 + }, + { + "epoch": 0.888713807366555, + "grad_norm": 1.9383519887924194, + "learning_rate": 6.423757777026285e-06, + "loss": 1.0309, + "step": 24816 + }, + { + "epoch": 0.8887496194961233, + "grad_norm": 1.5599337816238403, + "learning_rate": 6.419668233877197e-06, + "loss": 1.1015, + "step": 24817 + }, + { + "epoch": 0.8887854316256916, + "grad_norm": 1.2166929244995117, + "learning_rate": 6.415579949730755e-06, + "loss": 1.0902, + "step": 24818 + }, + { + "epoch": 0.88882124375526, + "grad_norm": 1.4745283126831055, + "learning_rate": 6.411492924641982e-06, + "loss": 1.1294, + "step": 24819 + }, + { + "epoch": 0.8888570558848282, + "grad_norm": 1.3388543128967285, + "learning_rate": 6.407407158665846e-06, + "loss": 1.0358, + "step": 24820 + }, + { + "epoch": 0.8888928680143965, + "grad_norm": 1.5835744142532349, + "learning_rate": 6.403322651857313e-06, + "loss": 1.0725, + "step": 24821 + }, + { + "epoch": 0.8889286801439648, + "grad_norm": 1.3964245319366455, + "learning_rate": 6.399239404271362e-06, + "loss": 1.1159, + "step": 24822 + }, + { + "epoch": 0.888964492273533, + "grad_norm": 1.3870857954025269, + "learning_rate": 6.395157415962894e-06, + "loss": 1.1163, + "step": 24823 + }, + { + "epoch": 0.8890003044031013, + "grad_norm": 1.3632196187973022, + "learning_rate": 6.39107668698683e-06, + "loss": 0.9505, + "step": 24824 + }, + { + "epoch": 0.8890361165326696, + "grad_norm": 1.4850200414657593, + "learning_rate": 6.386997217398094e-06, + "loss": 1.2507, + "step": 24825 + }, + { + "epoch": 0.8890719286622379, + "grad_norm": 1.6235260963439941, + "learning_rate": 6.382919007251575e-06, + "loss": 1.1116, + "step": 24826 + }, + { + "epoch": 0.8891077407918062, + "grad_norm": 1.3720307350158691, + "learning_rate": 6.378842056602097e-06, + "loss": 1.0053, + "step": 24827 + }, + { + "epoch": 0.8891435529213745, + "grad_norm": 1.856320858001709, + "learning_rate": 6.374766365504547e-06, + "loss": 1.1534, + "step": 24828 + }, + { + "epoch": 0.8891793650509427, + "grad_norm": 1.6968427896499634, + "learning_rate": 6.370691934013761e-06, + "loss": 1.0176, + "step": 24829 + }, + { + "epoch": 0.889215177180511, + "grad_norm": 1.4099003076553345, + "learning_rate": 6.366618762184529e-06, + "loss": 1.1194, + "step": 24830 + }, + { + "epoch": 0.8892509893100793, + "grad_norm": 1.4611746072769165, + "learning_rate": 6.36254685007166e-06, + "loss": 1.2384, + "step": 24831 + }, + { + "epoch": 0.8892868014396476, + "grad_norm": 1.2749677896499634, + "learning_rate": 6.358476197729934e-06, + "loss": 0.8198, + "step": 24832 + }, + { + "epoch": 0.8893226135692159, + "grad_norm": 1.5134326219558716, + "learning_rate": 6.3544068052141415e-06, + "loss": 1.0568, + "step": 24833 + }, + { + "epoch": 0.8893584256987842, + "grad_norm": 1.4933850765228271, + "learning_rate": 6.3503386725790034e-06, + "loss": 0.9739, + "step": 24834 + }, + { + "epoch": 0.8893942378283525, + "grad_norm": 1.3020867109298706, + "learning_rate": 6.346271799879244e-06, + "loss": 0.8359, + "step": 24835 + }, + { + "epoch": 0.8894300499579207, + "grad_norm": 1.473387598991394, + "learning_rate": 6.342206187169608e-06, + "loss": 0.8763, + "step": 24836 + }, + { + "epoch": 0.889465862087489, + "grad_norm": 1.7025257349014282, + "learning_rate": 6.338141834504785e-06, + "loss": 1.0615, + "step": 24837 + }, + { + "epoch": 0.8895016742170573, + "grad_norm": 1.8295365571975708, + "learning_rate": 6.3340787419394535e-06, + "loss": 0.9535, + "step": 24838 + }, + { + "epoch": 0.8895374863466255, + "grad_norm": 1.4299675226211548, + "learning_rate": 6.330016909528236e-06, + "loss": 0.9131, + "step": 24839 + }, + { + "epoch": 0.8895732984761939, + "grad_norm": 1.7790395021438599, + "learning_rate": 6.325956337325845e-06, + "loss": 1.0641, + "step": 24840 + }, + { + "epoch": 0.8896091106057622, + "grad_norm": 1.2091163396835327, + "learning_rate": 6.321897025386869e-06, + "loss": 0.821, + "step": 24841 + }, + { + "epoch": 0.8896449227353305, + "grad_norm": 1.4736942052841187, + "learning_rate": 6.317838973765944e-06, + "loss": 1.2557, + "step": 24842 + }, + { + "epoch": 0.8896807348648987, + "grad_norm": 1.6015197038650513, + "learning_rate": 6.313782182517636e-06, + "loss": 1.1871, + "step": 24843 + }, + { + "epoch": 0.889716546994467, + "grad_norm": 1.3166110515594482, + "learning_rate": 6.309726651696557e-06, + "loss": 1.0217, + "step": 24844 + }, + { + "epoch": 0.8897523591240353, + "grad_norm": 1.4003584384918213, + "learning_rate": 6.305672381357264e-06, + "loss": 1.0008, + "step": 24845 + }, + { + "epoch": 0.8897881712536035, + "grad_norm": 1.7589325904846191, + "learning_rate": 6.301619371554257e-06, + "loss": 1.2757, + "step": 24846 + }, + { + "epoch": 0.8898239833831719, + "grad_norm": 2.2412569522857666, + "learning_rate": 6.297567622342127e-06, + "loss": 1.1246, + "step": 24847 + }, + { + "epoch": 0.8898597955127402, + "grad_norm": 1.4506174325942993, + "learning_rate": 6.29351713377535e-06, + "loss": 0.9764, + "step": 24848 + }, + { + "epoch": 0.8898956076423085, + "grad_norm": 1.4348970651626587, + "learning_rate": 6.289467905908442e-06, + "loss": 1.1254, + "step": 24849 + }, + { + "epoch": 0.8899314197718767, + "grad_norm": 1.6456583738327026, + "learning_rate": 6.285419938795833e-06, + "loss": 1.177, + "step": 24850 + }, + { + "epoch": 0.889967231901445, + "grad_norm": 1.3862488269805908, + "learning_rate": 6.281373232492038e-06, + "loss": 1.0059, + "step": 24851 + }, + { + "epoch": 0.8900030440310133, + "grad_norm": 1.3690153360366821, + "learning_rate": 6.2773277870514675e-06, + "loss": 0.9909, + "step": 24852 + }, + { + "epoch": 0.8900388561605815, + "grad_norm": 1.514828085899353, + "learning_rate": 6.273283602528579e-06, + "loss": 0.8752, + "step": 24853 + }, + { + "epoch": 0.8900746682901499, + "grad_norm": 1.2619776725769043, + "learning_rate": 6.269240678977739e-06, + "loss": 1.0057, + "step": 24854 + }, + { + "epoch": 0.8901104804197182, + "grad_norm": 1.6477406024932861, + "learning_rate": 6.265199016453371e-06, + "loss": 1.0891, + "step": 24855 + }, + { + "epoch": 0.8901462925492865, + "grad_norm": 1.303377628326416, + "learning_rate": 6.261158615009843e-06, + "loss": 1.003, + "step": 24856 + }, + { + "epoch": 0.8901821046788547, + "grad_norm": 1.4385443925857544, + "learning_rate": 6.25711947470149e-06, + "loss": 1.2594, + "step": 24857 + }, + { + "epoch": 0.890217916808423, + "grad_norm": 1.3726774454116821, + "learning_rate": 6.253081595582699e-06, + "loss": 1.0028, + "step": 24858 + }, + { + "epoch": 0.8902537289379913, + "grad_norm": 1.3230767250061035, + "learning_rate": 6.249044977707763e-06, + "loss": 0.966, + "step": 24859 + }, + { + "epoch": 0.8902895410675595, + "grad_norm": 1.51340913772583, + "learning_rate": 6.245009621131004e-06, + "loss": 1.0348, + "step": 24860 + }, + { + "epoch": 0.8903253531971279, + "grad_norm": 1.4140151739120483, + "learning_rate": 6.2409755259066786e-06, + "loss": 1.0203, + "step": 24861 + }, + { + "epoch": 0.8903611653266962, + "grad_norm": 1.5968427658081055, + "learning_rate": 6.23694269208912e-06, + "loss": 0.9676, + "step": 24862 + }, + { + "epoch": 0.8903969774562644, + "grad_norm": 1.54584801197052, + "learning_rate": 6.232911119732554e-06, + "loss": 0.997, + "step": 24863 + }, + { + "epoch": 0.8904327895858327, + "grad_norm": 1.5952681303024292, + "learning_rate": 6.228880808891202e-06, + "loss": 0.8983, + "step": 24864 + }, + { + "epoch": 0.890468601715401, + "grad_norm": 1.3669273853302002, + "learning_rate": 6.224851759619299e-06, + "loss": 1.0619, + "step": 24865 + }, + { + "epoch": 0.8905044138449693, + "grad_norm": 1.568811058998108, + "learning_rate": 6.220823971971046e-06, + "loss": 1.1435, + "step": 24866 + }, + { + "epoch": 0.8905402259745375, + "grad_norm": 1.6442549228668213, + "learning_rate": 6.216797446000666e-06, + "loss": 1.0434, + "step": 24867 + }, + { + "epoch": 0.8905760381041059, + "grad_norm": 1.2705873250961304, + "learning_rate": 6.212772181762283e-06, + "loss": 0.9841, + "step": 24868 + }, + { + "epoch": 0.8906118502336742, + "grad_norm": 1.7279201745986938, + "learning_rate": 6.208748179310087e-06, + "loss": 0.892, + "step": 24869 + }, + { + "epoch": 0.8906476623632424, + "grad_norm": 1.4061847925186157, + "learning_rate": 6.204725438698189e-06, + "loss": 1.1336, + "step": 24870 + }, + { + "epoch": 0.8906834744928107, + "grad_norm": 1.379198431968689, + "learning_rate": 6.200703959980747e-06, + "loss": 1.0349, + "step": 24871 + }, + { + "epoch": 0.890719286622379, + "grad_norm": 1.4944252967834473, + "learning_rate": 6.196683743211818e-06, + "loss": 1.109, + "step": 24872 + }, + { + "epoch": 0.8907550987519472, + "grad_norm": 1.5745643377304077, + "learning_rate": 6.192664788445513e-06, + "loss": 1.0848, + "step": 24873 + }, + { + "epoch": 0.8907909108815155, + "grad_norm": 1.3716423511505127, + "learning_rate": 6.188647095735911e-06, + "loss": 1.2148, + "step": 24874 + }, + { + "epoch": 0.8908267230110839, + "grad_norm": 1.5856928825378418, + "learning_rate": 6.184630665137048e-06, + "loss": 1.0143, + "step": 24875 + }, + { + "epoch": 0.8908625351406522, + "grad_norm": 1.7206182479858398, + "learning_rate": 6.180615496702968e-06, + "loss": 1.0955, + "step": 24876 + }, + { + "epoch": 0.8908983472702204, + "grad_norm": 1.38218355178833, + "learning_rate": 6.176601590487685e-06, + "loss": 0.9442, + "step": 24877 + }, + { + "epoch": 0.8909341593997887, + "grad_norm": 1.443109154701233, + "learning_rate": 6.17258894654521e-06, + "loss": 1.1125, + "step": 24878 + }, + { + "epoch": 0.890969971529357, + "grad_norm": 1.3886141777038574, + "learning_rate": 6.168577564929523e-06, + "loss": 1.1628, + "step": 24879 + }, + { + "epoch": 0.8910057836589252, + "grad_norm": 1.471952199935913, + "learning_rate": 6.16456744569458e-06, + "loss": 0.9322, + "step": 24880 + }, + { + "epoch": 0.8910415957884935, + "grad_norm": 1.4998769760131836, + "learning_rate": 6.160558588894361e-06, + "loss": 1.0646, + "step": 24881 + }, + { + "epoch": 0.8910774079180619, + "grad_norm": 1.4136635065078735, + "learning_rate": 6.156550994582766e-06, + "loss": 0.818, + "step": 24882 + }, + { + "epoch": 0.8911132200476302, + "grad_norm": 1.2558395862579346, + "learning_rate": 6.1525446628137306e-06, + "loss": 0.9448, + "step": 24883 + }, + { + "epoch": 0.8911490321771984, + "grad_norm": 1.7791212797164917, + "learning_rate": 6.148539593641156e-06, + "loss": 1.2048, + "step": 24884 + }, + { + "epoch": 0.8911848443067667, + "grad_norm": 1.4442126750946045, + "learning_rate": 6.144535787118921e-06, + "loss": 1.1605, + "step": 24885 + }, + { + "epoch": 0.891220656436335, + "grad_norm": 1.6233261823654175, + "learning_rate": 6.140533243300894e-06, + "loss": 1.1431, + "step": 24886 + }, + { + "epoch": 0.8912564685659032, + "grad_norm": 1.3042151927947998, + "learning_rate": 6.13653196224091e-06, + "loss": 0.8567, + "step": 24887 + }, + { + "epoch": 0.8912922806954715, + "grad_norm": 1.4830137491226196, + "learning_rate": 6.132531943992826e-06, + "loss": 0.8894, + "step": 24888 + }, + { + "epoch": 0.8913280928250399, + "grad_norm": 1.7755190134048462, + "learning_rate": 6.128533188610453e-06, + "loss": 1.1069, + "step": 24889 + }, + { + "epoch": 0.8913639049546082, + "grad_norm": 1.7786345481872559, + "learning_rate": 6.124535696147559e-06, + "loss": 1.2499, + "step": 24890 + }, + { + "epoch": 0.8913997170841764, + "grad_norm": 1.6826417446136475, + "learning_rate": 6.12053946665796e-06, + "loss": 1.0598, + "step": 24891 + }, + { + "epoch": 0.8914355292137447, + "grad_norm": 1.4305272102355957, + "learning_rate": 6.1165445001954095e-06, + "loss": 0.9244, + "step": 24892 + }, + { + "epoch": 0.891471341343313, + "grad_norm": 1.702847957611084, + "learning_rate": 6.112550796813643e-06, + "loss": 1.1745, + "step": 24893 + }, + { + "epoch": 0.8915071534728812, + "grad_norm": 1.4258077144622803, + "learning_rate": 6.108558356566396e-06, + "loss": 1.1532, + "step": 24894 + }, + { + "epoch": 0.8915429656024495, + "grad_norm": 1.3065907955169678, + "learning_rate": 6.104567179507381e-06, + "loss": 0.9803, + "step": 24895 + }, + { + "epoch": 0.8915787777320179, + "grad_norm": 1.4972318410873413, + "learning_rate": 6.100577265690321e-06, + "loss": 1.2115, + "step": 24896 + }, + { + "epoch": 0.8916145898615861, + "grad_norm": 1.2994424104690552, + "learning_rate": 6.096588615168864e-06, + "loss": 0.9426, + "step": 24897 + }, + { + "epoch": 0.8916504019911544, + "grad_norm": 1.3572893142700195, + "learning_rate": 6.092601227996664e-06, + "loss": 1.0798, + "step": 24898 + }, + { + "epoch": 0.8916862141207227, + "grad_norm": 1.437619686126709, + "learning_rate": 6.088615104227413e-06, + "loss": 1.0385, + "step": 24899 + }, + { + "epoch": 0.891722026250291, + "grad_norm": 1.4448283910751343, + "learning_rate": 6.084630243914679e-06, + "loss": 0.997, + "step": 24900 + }, + { + "epoch": 0.8917578383798592, + "grad_norm": 1.4407602548599243, + "learning_rate": 6.080646647112109e-06, + "loss": 1.0305, + "step": 24901 + }, + { + "epoch": 0.8917936505094275, + "grad_norm": 1.2892463207244873, + "learning_rate": 6.076664313873293e-06, + "loss": 0.9712, + "step": 24902 + }, + { + "epoch": 0.8918294626389959, + "grad_norm": 1.5724990367889404, + "learning_rate": 6.07268324425182e-06, + "loss": 1.1452, + "step": 24903 + }, + { + "epoch": 0.8918652747685641, + "grad_norm": 1.7584294080734253, + "learning_rate": 6.068703438301226e-06, + "loss": 1.1367, + "step": 24904 + }, + { + "epoch": 0.8919010868981324, + "grad_norm": 1.3405423164367676, + "learning_rate": 6.064724896075058e-06, + "loss": 1.1751, + "step": 24905 + }, + { + "epoch": 0.8919368990277007, + "grad_norm": 1.4410243034362793, + "learning_rate": 6.06074761762685e-06, + "loss": 0.9167, + "step": 24906 + }, + { + "epoch": 0.891972711157269, + "grad_norm": 1.2001609802246094, + "learning_rate": 6.056771603010125e-06, + "loss": 0.9929, + "step": 24907 + }, + { + "epoch": 0.8920085232868372, + "grad_norm": 1.5562241077423096, + "learning_rate": 6.052796852278353e-06, + "loss": 1.1394, + "step": 24908 + }, + { + "epoch": 0.8920443354164055, + "grad_norm": 1.5408713817596436, + "learning_rate": 6.048823365485012e-06, + "loss": 0.9824, + "step": 24909 + }, + { + "epoch": 0.8920801475459739, + "grad_norm": 1.392735242843628, + "learning_rate": 6.044851142683572e-06, + "loss": 1.0795, + "step": 24910 + }, + { + "epoch": 0.8921159596755421, + "grad_norm": 1.823682188987732, + "learning_rate": 6.040880183927455e-06, + "loss": 1.0726, + "step": 24911 + }, + { + "epoch": 0.8921517718051104, + "grad_norm": 1.5231233835220337, + "learning_rate": 6.036910489270098e-06, + "loss": 0.9942, + "step": 24912 + }, + { + "epoch": 0.8921875839346787, + "grad_norm": 1.7374874353408813, + "learning_rate": 6.0329420587649124e-06, + "loss": 1.0563, + "step": 24913 + }, + { + "epoch": 0.8922233960642469, + "grad_norm": 1.735414981842041, + "learning_rate": 6.028974892465289e-06, + "loss": 0.9633, + "step": 24914 + }, + { + "epoch": 0.8922592081938152, + "grad_norm": 1.3128879070281982, + "learning_rate": 6.025008990424585e-06, + "loss": 0.9447, + "step": 24915 + }, + { + "epoch": 0.8922950203233835, + "grad_norm": 1.80473792552948, + "learning_rate": 6.021044352696159e-06, + "loss": 1.3548, + "step": 24916 + }, + { + "epoch": 0.8923308324529519, + "grad_norm": 1.6320546865463257, + "learning_rate": 6.017080979333378e-06, + "loss": 1.2115, + "step": 24917 + }, + { + "epoch": 0.8923666445825201, + "grad_norm": 2.3577208518981934, + "learning_rate": 6.013118870389523e-06, + "loss": 1.2623, + "step": 24918 + }, + { + "epoch": 0.8924024567120884, + "grad_norm": 1.5269620418548584, + "learning_rate": 6.009158025917927e-06, + "loss": 1.0094, + "step": 24919 + }, + { + "epoch": 0.8924382688416567, + "grad_norm": 1.2436882257461548, + "learning_rate": 6.00519844597186e-06, + "loss": 0.9489, + "step": 24920 + }, + { + "epoch": 0.8924740809712249, + "grad_norm": 1.4370815753936768, + "learning_rate": 6.001240130604624e-06, + "loss": 1.1427, + "step": 24921 + }, + { + "epoch": 0.8925098931007932, + "grad_norm": 1.466346263885498, + "learning_rate": 5.997283079869442e-06, + "loss": 0.9831, + "step": 24922 + }, + { + "epoch": 0.8925457052303615, + "grad_norm": 1.7407362461090088, + "learning_rate": 5.993327293819562e-06, + "loss": 0.9464, + "step": 24923 + }, + { + "epoch": 0.8925815173599299, + "grad_norm": 1.9645428657531738, + "learning_rate": 5.989372772508195e-06, + "loss": 1.1117, + "step": 24924 + }, + { + "epoch": 0.8926173294894981, + "grad_norm": 1.8031059503555298, + "learning_rate": 5.985419515988566e-06, + "loss": 1.025, + "step": 24925 + }, + { + "epoch": 0.8926531416190664, + "grad_norm": 1.4304261207580566, + "learning_rate": 5.981467524313855e-06, + "loss": 0.908, + "step": 24926 + }, + { + "epoch": 0.8926889537486347, + "grad_norm": 1.6558878421783447, + "learning_rate": 5.977516797537186e-06, + "loss": 1.2328, + "step": 24927 + }, + { + "epoch": 0.8927247658782029, + "grad_norm": 1.4991973638534546, + "learning_rate": 5.973567335711783e-06, + "loss": 0.9258, + "step": 24928 + }, + { + "epoch": 0.8927605780077712, + "grad_norm": 1.8169881105422974, + "learning_rate": 5.969619138890737e-06, + "loss": 0.9793, + "step": 24929 + }, + { + "epoch": 0.8927963901373395, + "grad_norm": 1.3265979290008545, + "learning_rate": 5.965672207127171e-06, + "loss": 0.9513, + "step": 24930 + }, + { + "epoch": 0.8928322022669078, + "grad_norm": 1.1930031776428223, + "learning_rate": 5.961726540474189e-06, + "loss": 1.0025, + "step": 24931 + }, + { + "epoch": 0.8928680143964761, + "grad_norm": 1.5359383821487427, + "learning_rate": 5.95778213898488e-06, + "loss": 1.0554, + "step": 24932 + }, + { + "epoch": 0.8929038265260444, + "grad_norm": 1.6725319623947144, + "learning_rate": 5.9538390027123025e-06, + "loss": 1.0731, + "step": 24933 + }, + { + "epoch": 0.8929396386556127, + "grad_norm": 1.56717050075531, + "learning_rate": 5.949897131709514e-06, + "loss": 1.1962, + "step": 24934 + }, + { + "epoch": 0.8929754507851809, + "grad_norm": 1.3893448114395142, + "learning_rate": 5.94595652602955e-06, + "loss": 1.0592, + "step": 24935 + }, + { + "epoch": 0.8930112629147492, + "grad_norm": 1.6010504961013794, + "learning_rate": 5.9420171857254126e-06, + "loss": 1.1927, + "step": 24936 + }, + { + "epoch": 0.8930470750443175, + "grad_norm": 1.4081162214279175, + "learning_rate": 5.938079110850114e-06, + "loss": 1.0956, + "step": 24937 + }, + { + "epoch": 0.8930828871738858, + "grad_norm": 1.6908098459243774, + "learning_rate": 5.934142301456613e-06, + "loss": 1.1308, + "step": 24938 + }, + { + "epoch": 0.8931186993034541, + "grad_norm": 1.470892071723938, + "learning_rate": 5.9302067575979115e-06, + "loss": 1.0168, + "step": 24939 + }, + { + "epoch": 0.8931545114330224, + "grad_norm": 1.047878384590149, + "learning_rate": 5.926272479326922e-06, + "loss": 1.0792, + "step": 24940 + }, + { + "epoch": 0.8931903235625906, + "grad_norm": 1.312562108039856, + "learning_rate": 5.922339466696591e-06, + "loss": 0.9111, + "step": 24941 + }, + { + "epoch": 0.8932261356921589, + "grad_norm": 1.4885278940200806, + "learning_rate": 5.918407719759844e-06, + "loss": 1.1838, + "step": 24942 + }, + { + "epoch": 0.8932619478217272, + "grad_norm": 1.4052575826644897, + "learning_rate": 5.914477238569549e-06, + "loss": 0.9602, + "step": 24943 + }, + { + "epoch": 0.8932977599512955, + "grad_norm": 1.8396544456481934, + "learning_rate": 5.91054802317862e-06, + "loss": 1.3297, + "step": 24944 + }, + { + "epoch": 0.8933335720808638, + "grad_norm": 1.758575439453125, + "learning_rate": 5.906620073639868e-06, + "loss": 1.1148, + "step": 24945 + }, + { + "epoch": 0.8933693842104321, + "grad_norm": 1.544176459312439, + "learning_rate": 5.902693390006209e-06, + "loss": 1.1919, + "step": 24946 + }, + { + "epoch": 0.8934051963400004, + "grad_norm": 1.8621158599853516, + "learning_rate": 5.89876797233041e-06, + "loss": 1.191, + "step": 24947 + }, + { + "epoch": 0.8934410084695686, + "grad_norm": 1.9253125190734863, + "learning_rate": 5.894843820665319e-06, + "loss": 1.2421, + "step": 24948 + }, + { + "epoch": 0.8934768205991369, + "grad_norm": 1.4661258459091187, + "learning_rate": 5.890920935063693e-06, + "loss": 1.1307, + "step": 24949 + }, + { + "epoch": 0.8935126327287052, + "grad_norm": 2.570033550262451, + "learning_rate": 5.8869993155783675e-06, + "loss": 0.9511, + "step": 24950 + }, + { + "epoch": 0.8935484448582734, + "grad_norm": 1.1693652868270874, + "learning_rate": 5.883078962262056e-06, + "loss": 1.284, + "step": 24951 + }, + { + "epoch": 0.8935842569878418, + "grad_norm": 1.4022626876831055, + "learning_rate": 5.879159875167517e-06, + "loss": 1.0974, + "step": 24952 + }, + { + "epoch": 0.8936200691174101, + "grad_norm": 1.3443681001663208, + "learning_rate": 5.875242054347463e-06, + "loss": 0.9781, + "step": 24953 + }, + { + "epoch": 0.8936558812469784, + "grad_norm": 1.464443564414978, + "learning_rate": 5.871325499854618e-06, + "loss": 0.9502, + "step": 24954 + }, + { + "epoch": 0.8936916933765466, + "grad_norm": 1.3601624965667725, + "learning_rate": 5.867410211741686e-06, + "loss": 1.254, + "step": 24955 + }, + { + "epoch": 0.8937275055061149, + "grad_norm": 1.4517717361450195, + "learning_rate": 5.863496190061302e-06, + "loss": 1.1887, + "step": 24956 + }, + { + "epoch": 0.8937633176356832, + "grad_norm": 1.5851165056228638, + "learning_rate": 5.859583434866167e-06, + "loss": 0.9506, + "step": 24957 + }, + { + "epoch": 0.8937991297652514, + "grad_norm": 1.3049615621566772, + "learning_rate": 5.855671946208896e-06, + "loss": 0.9936, + "step": 24958 + }, + { + "epoch": 0.8938349418948198, + "grad_norm": 1.5838874578475952, + "learning_rate": 5.851761724142147e-06, + "loss": 0.954, + "step": 24959 + }, + { + "epoch": 0.8938707540243881, + "grad_norm": 1.6251094341278076, + "learning_rate": 5.8478527687184755e-06, + "loss": 0.9602, + "step": 24960 + }, + { + "epoch": 0.8939065661539564, + "grad_norm": 1.3291015625, + "learning_rate": 5.843945079990498e-06, + "loss": 1.0862, + "step": 24961 + }, + { + "epoch": 0.8939423782835246, + "grad_norm": 1.6550065279006958, + "learning_rate": 5.840038658010805e-06, + "loss": 1.0105, + "step": 24962 + }, + { + "epoch": 0.8939781904130929, + "grad_norm": 1.526771903038025, + "learning_rate": 5.83613350283192e-06, + "loss": 1.059, + "step": 24963 + }, + { + "epoch": 0.8940140025426612, + "grad_norm": 2.0472826957702637, + "learning_rate": 5.83222961450639e-06, + "loss": 1.1862, + "step": 24964 + }, + { + "epoch": 0.8940498146722294, + "grad_norm": 1.7179306745529175, + "learning_rate": 5.828326993086741e-06, + "loss": 0.9795, + "step": 24965 + }, + { + "epoch": 0.8940856268017978, + "grad_norm": 1.4377727508544922, + "learning_rate": 5.824425638625508e-06, + "loss": 0.94, + "step": 24966 + }, + { + "epoch": 0.8941214389313661, + "grad_norm": 1.409003734588623, + "learning_rate": 5.820525551175104e-06, + "loss": 1.0263, + "step": 24967 + }, + { + "epoch": 0.8941572510609344, + "grad_norm": 1.6309328079223633, + "learning_rate": 5.8166267307880885e-06, + "loss": 1.1839, + "step": 24968 + }, + { + "epoch": 0.8941930631905026, + "grad_norm": 1.2422016859054565, + "learning_rate": 5.812729177516874e-06, + "loss": 1.0328, + "step": 24969 + }, + { + "epoch": 0.8942288753200709, + "grad_norm": 1.3344396352767944, + "learning_rate": 5.808832891413873e-06, + "loss": 0.9092, + "step": 24970 + }, + { + "epoch": 0.8942646874496392, + "grad_norm": 1.2319986820220947, + "learning_rate": 5.804937872531524e-06, + "loss": 0.8495, + "step": 24971 + }, + { + "epoch": 0.8943004995792074, + "grad_norm": 1.567103385925293, + "learning_rate": 5.8010441209222384e-06, + "loss": 1.0839, + "step": 24972 + }, + { + "epoch": 0.8943363117087758, + "grad_norm": 1.4120687246322632, + "learning_rate": 5.797151636638409e-06, + "loss": 0.9167, + "step": 24973 + }, + { + "epoch": 0.8943721238383441, + "grad_norm": 1.3094757795333862, + "learning_rate": 5.7932604197323826e-06, + "loss": 0.8048, + "step": 24974 + }, + { + "epoch": 0.8944079359679123, + "grad_norm": 1.7647985219955444, + "learning_rate": 5.789370470256517e-06, + "loss": 0.995, + "step": 24975 + }, + { + "epoch": 0.8944437480974806, + "grad_norm": 1.6747490167617798, + "learning_rate": 5.785481788263147e-06, + "loss": 1.137, + "step": 24976 + }, + { + "epoch": 0.8944795602270489, + "grad_norm": 1.4676587581634521, + "learning_rate": 5.7815943738046e-06, + "loss": 0.8987, + "step": 24977 + }, + { + "epoch": 0.8945153723566172, + "grad_norm": 1.3061234951019287, + "learning_rate": 5.777708226933165e-06, + "loss": 1.02, + "step": 24978 + }, + { + "epoch": 0.8945511844861854, + "grad_norm": 1.5622239112854004, + "learning_rate": 5.773823347701124e-06, + "loss": 1.1739, + "step": 24979 + }, + { + "epoch": 0.8945869966157538, + "grad_norm": 1.7466638088226318, + "learning_rate": 5.7699397361607564e-06, + "loss": 0.9743, + "step": 24980 + }, + { + "epoch": 0.8946228087453221, + "grad_norm": 1.848927617073059, + "learning_rate": 5.766057392364288e-06, + "loss": 1.1474, + "step": 24981 + }, + { + "epoch": 0.8946586208748903, + "grad_norm": 1.2158159017562866, + "learning_rate": 5.7621763163639655e-06, + "loss": 1.1167, + "step": 24982 + }, + { + "epoch": 0.8946944330044586, + "grad_norm": 1.4776469469070435, + "learning_rate": 5.758296508212013e-06, + "loss": 1.0326, + "step": 24983 + }, + { + "epoch": 0.8947302451340269, + "grad_norm": 1.513715147972107, + "learning_rate": 5.7544179679606234e-06, + "loss": 1.3595, + "step": 24984 + }, + { + "epoch": 0.8947660572635951, + "grad_norm": 1.8631101846694946, + "learning_rate": 5.750540695661955e-06, + "loss": 1.1051, + "step": 24985 + }, + { + "epoch": 0.8948018693931634, + "grad_norm": 1.6185874938964844, + "learning_rate": 5.746664691368187e-06, + "loss": 0.9883, + "step": 24986 + }, + { + "epoch": 0.8948376815227318, + "grad_norm": 1.6612251996994019, + "learning_rate": 5.742789955131489e-06, + "loss": 0.9894, + "step": 24987 + }, + { + "epoch": 0.8948734936523001, + "grad_norm": 1.824102759361267, + "learning_rate": 5.7389164870039535e-06, + "loss": 1.0932, + "step": 24988 + }, + { + "epoch": 0.8949093057818683, + "grad_norm": 1.3705213069915771, + "learning_rate": 5.735044287037705e-06, + "loss": 0.9966, + "step": 24989 + }, + { + "epoch": 0.8949451179114366, + "grad_norm": 1.429732084274292, + "learning_rate": 5.7311733552848355e-06, + "loss": 1.1306, + "step": 24990 + }, + { + "epoch": 0.8949809300410049, + "grad_norm": 1.4096314907073975, + "learning_rate": 5.727303691797459e-06, + "loss": 1.045, + "step": 24991 + }, + { + "epoch": 0.8950167421705731, + "grad_norm": 2.5301878452301025, + "learning_rate": 5.723435296627588e-06, + "loss": 1.1512, + "step": 24992 + }, + { + "epoch": 0.8950525543001414, + "grad_norm": 1.6462771892547607, + "learning_rate": 5.719568169827283e-06, + "loss": 0.8897, + "step": 24993 + }, + { + "epoch": 0.8950883664297098, + "grad_norm": 1.343767523765564, + "learning_rate": 5.71570231144859e-06, + "loss": 1.0283, + "step": 24994 + }, + { + "epoch": 0.8951241785592781, + "grad_norm": 1.958445429801941, + "learning_rate": 5.7118377215435e-06, + "loss": 1.0384, + "step": 24995 + }, + { + "epoch": 0.8951599906888463, + "grad_norm": 1.4258307218551636, + "learning_rate": 5.7079744001640065e-06, + "loss": 1.173, + "step": 24996 + }, + { + "epoch": 0.8951958028184146, + "grad_norm": 1.5223599672317505, + "learning_rate": 5.70411234736209e-06, + "loss": 0.9956, + "step": 24997 + }, + { + "epoch": 0.8952316149479829, + "grad_norm": 1.5462515354156494, + "learning_rate": 5.700251563189718e-06, + "loss": 1.2748, + "step": 24998 + }, + { + "epoch": 0.8952674270775511, + "grad_norm": 1.8011884689331055, + "learning_rate": 5.696392047698817e-06, + "loss": 1.2772, + "step": 24999 + }, + { + "epoch": 0.8953032392071194, + "grad_norm": 1.4765892028808594, + "learning_rate": 5.6925338009413136e-06, + "loss": 1.0756, + "step": 25000 + }, + { + "epoch": 0.8953390513366878, + "grad_norm": 1.2378438711166382, + "learning_rate": 5.688676822969119e-06, + "loss": 0.9762, + "step": 25001 + }, + { + "epoch": 0.895374863466256, + "grad_norm": 1.5083011388778687, + "learning_rate": 5.684821113834138e-06, + "loss": 1.1196, + "step": 25002 + }, + { + "epoch": 0.8954106755958243, + "grad_norm": 1.252503514289856, + "learning_rate": 5.680966673588217e-06, + "loss": 1.0337, + "step": 25003 + }, + { + "epoch": 0.8954464877253926, + "grad_norm": 1.4890880584716797, + "learning_rate": 5.677113502283227e-06, + "loss": 0.9549, + "step": 25004 + }, + { + "epoch": 0.8954822998549609, + "grad_norm": 1.5846513509750366, + "learning_rate": 5.673261599971025e-06, + "loss": 1.2373, + "step": 25005 + }, + { + "epoch": 0.8955181119845291, + "grad_norm": 1.5496150255203247, + "learning_rate": 5.669410966703393e-06, + "loss": 0.9584, + "step": 25006 + }, + { + "epoch": 0.8955539241140974, + "grad_norm": 1.484841227531433, + "learning_rate": 5.665561602532165e-06, + "loss": 1.2563, + "step": 25007 + }, + { + "epoch": 0.8955897362436658, + "grad_norm": 1.4767229557037354, + "learning_rate": 5.661713507509126e-06, + "loss": 0.9123, + "step": 25008 + }, + { + "epoch": 0.895625548373234, + "grad_norm": 1.658414363861084, + "learning_rate": 5.657866681686053e-06, + "loss": 1.2687, + "step": 25009 + }, + { + "epoch": 0.8956613605028023, + "grad_norm": 1.6487300395965576, + "learning_rate": 5.654021125114672e-06, + "loss": 1.3915, + "step": 25010 + }, + { + "epoch": 0.8956971726323706, + "grad_norm": 1.6598315238952637, + "learning_rate": 5.6501768378467546e-06, + "loss": 1.0217, + "step": 25011 + }, + { + "epoch": 0.8957329847619389, + "grad_norm": 1.7844319343566895, + "learning_rate": 5.646333819933991e-06, + "loss": 0.9878, + "step": 25012 + }, + { + "epoch": 0.8957687968915071, + "grad_norm": 1.6493375301361084, + "learning_rate": 5.642492071428118e-06, + "loss": 1.1789, + "step": 25013 + }, + { + "epoch": 0.8958046090210754, + "grad_norm": 2.027172327041626, + "learning_rate": 5.638651592380795e-06, + "loss": 1.3398, + "step": 25014 + }, + { + "epoch": 0.8958404211506438, + "grad_norm": 1.7791595458984375, + "learning_rate": 5.63481238284368e-06, + "loss": 0.8758, + "step": 25015 + }, + { + "epoch": 0.895876233280212, + "grad_norm": 1.4278733730316162, + "learning_rate": 5.630974442868475e-06, + "loss": 1.1059, + "step": 25016 + }, + { + "epoch": 0.8959120454097803, + "grad_norm": 1.2702009677886963, + "learning_rate": 5.627137772506752e-06, + "loss": 0.9804, + "step": 25017 + }, + { + "epoch": 0.8959478575393486, + "grad_norm": 1.421750545501709, + "learning_rate": 5.623302371810169e-06, + "loss": 0.9534, + "step": 25018 + }, + { + "epoch": 0.8959836696689168, + "grad_norm": 1.6928601264953613, + "learning_rate": 5.619468240830306e-06, + "loss": 1.0751, + "step": 25019 + }, + { + "epoch": 0.8960194817984851, + "grad_norm": 1.9351344108581543, + "learning_rate": 5.615635379618778e-06, + "loss": 1.0023, + "step": 25020 + }, + { + "epoch": 0.8960552939280534, + "grad_norm": 1.5017145872116089, + "learning_rate": 5.61180378822711e-06, + "loss": 1.138, + "step": 25021 + }, + { + "epoch": 0.8960911060576218, + "grad_norm": 1.5998493432998657, + "learning_rate": 5.607973466706873e-06, + "loss": 0.9328, + "step": 25022 + }, + { + "epoch": 0.89612691818719, + "grad_norm": 1.2801355123519897, + "learning_rate": 5.604144415109614e-06, + "loss": 0.9351, + "step": 25023 + }, + { + "epoch": 0.8961627303167583, + "grad_norm": 1.4217807054519653, + "learning_rate": 5.600316633486802e-06, + "loss": 1.0635, + "step": 25024 + }, + { + "epoch": 0.8961985424463266, + "grad_norm": 1.8471620082855225, + "learning_rate": 5.596490121889975e-06, + "loss": 1.1658, + "step": 25025 + }, + { + "epoch": 0.8962343545758948, + "grad_norm": 1.4842886924743652, + "learning_rate": 5.592664880370602e-06, + "loss": 1.0361, + "step": 25026 + }, + { + "epoch": 0.8962701667054631, + "grad_norm": 1.9636459350585938, + "learning_rate": 5.588840908980153e-06, + "loss": 1.0693, + "step": 25027 + }, + { + "epoch": 0.8963059788350314, + "grad_norm": 1.4469542503356934, + "learning_rate": 5.585018207770054e-06, + "loss": 1.2756, + "step": 25028 + }, + { + "epoch": 0.8963417909645998, + "grad_norm": 1.3716518878936768, + "learning_rate": 5.581196776791752e-06, + "loss": 1.0997, + "step": 25029 + }, + { + "epoch": 0.896377603094168, + "grad_norm": 1.6094554662704468, + "learning_rate": 5.5773766160966634e-06, + "loss": 1.1935, + "step": 25030 + }, + { + "epoch": 0.8964134152237363, + "grad_norm": 1.6802676916122437, + "learning_rate": 5.5735577257361785e-06, + "loss": 0.9229, + "step": 25031 + }, + { + "epoch": 0.8964492273533046, + "grad_norm": 1.6625680923461914, + "learning_rate": 5.569740105761679e-06, + "loss": 1.1297, + "step": 25032 + }, + { + "epoch": 0.8964850394828728, + "grad_norm": 1.6288869380950928, + "learning_rate": 5.565923756224489e-06, + "loss": 1.0744, + "step": 25033 + }, + { + "epoch": 0.8965208516124411, + "grad_norm": 1.7969555854797363, + "learning_rate": 5.562108677176015e-06, + "loss": 1.0199, + "step": 25034 + }, + { + "epoch": 0.8965566637420094, + "grad_norm": 1.654267430305481, + "learning_rate": 5.558294868667535e-06, + "loss": 1.1423, + "step": 25035 + }, + { + "epoch": 0.8965924758715778, + "grad_norm": 1.6083534955978394, + "learning_rate": 5.554482330750388e-06, + "loss": 1.1096, + "step": 25036 + }, + { + "epoch": 0.896628288001146, + "grad_norm": 1.6665427684783936, + "learning_rate": 5.550671063475832e-06, + "loss": 1.0349, + "step": 25037 + }, + { + "epoch": 0.8966641001307143, + "grad_norm": 1.556931734085083, + "learning_rate": 5.546861066895193e-06, + "loss": 1.2734, + "step": 25038 + }, + { + "epoch": 0.8966999122602826, + "grad_norm": 1.255969524383545, + "learning_rate": 5.543052341059707e-06, + "loss": 0.9267, + "step": 25039 + }, + { + "epoch": 0.8967357243898508, + "grad_norm": 1.4811853170394897, + "learning_rate": 5.5392448860205785e-06, + "loss": 1.0359, + "step": 25040 + }, + { + "epoch": 0.8967715365194191, + "grad_norm": 1.9558639526367188, + "learning_rate": 5.535438701829088e-06, + "loss": 1.2226, + "step": 25041 + }, + { + "epoch": 0.8968073486489874, + "grad_norm": 1.6572480201721191, + "learning_rate": 5.5316337885364165e-06, + "loss": 1.2407, + "step": 25042 + }, + { + "epoch": 0.8968431607785557, + "grad_norm": 1.9519718885421753, + "learning_rate": 5.527830146193758e-06, + "loss": 1.2818, + "step": 25043 + }, + { + "epoch": 0.896878972908124, + "grad_norm": 1.6020989418029785, + "learning_rate": 5.5240277748522694e-06, + "loss": 0.8062, + "step": 25044 + }, + { + "epoch": 0.8969147850376923, + "grad_norm": 1.6889318227767944, + "learning_rate": 5.520226674563145e-06, + "loss": 0.9795, + "step": 25045 + }, + { + "epoch": 0.8969505971672606, + "grad_norm": 1.192337989807129, + "learning_rate": 5.516426845377476e-06, + "loss": 1.0062, + "step": 25046 + }, + { + "epoch": 0.8969864092968288, + "grad_norm": 1.299309253692627, + "learning_rate": 5.512628287346433e-06, + "loss": 0.9864, + "step": 25047 + }, + { + "epoch": 0.8970222214263971, + "grad_norm": 1.3693221807479858, + "learning_rate": 5.5088310005210865e-06, + "loss": 0.9483, + "step": 25048 + }, + { + "epoch": 0.8970580335559654, + "grad_norm": 1.327775239944458, + "learning_rate": 5.505034984952529e-06, + "loss": 1.0673, + "step": 25049 + }, + { + "epoch": 0.8970938456855337, + "grad_norm": 1.344411849975586, + "learning_rate": 5.501240240691852e-06, + "loss": 1.243, + "step": 25050 + }, + { + "epoch": 0.897129657815102, + "grad_norm": 1.5174885988235474, + "learning_rate": 5.49744676779006e-06, + "loss": 1.1242, + "step": 25051 + }, + { + "epoch": 0.8971654699446703, + "grad_norm": 1.509929895401001, + "learning_rate": 5.4936545662982455e-06, + "loss": 1.1395, + "step": 25052 + }, + { + "epoch": 0.8972012820742385, + "grad_norm": 1.4959744215011597, + "learning_rate": 5.4898636362674e-06, + "loss": 0.9365, + "step": 25053 + }, + { + "epoch": 0.8972370942038068, + "grad_norm": 2.059774875640869, + "learning_rate": 5.486073977748541e-06, + "loss": 0.9163, + "step": 25054 + }, + { + "epoch": 0.8972729063333751, + "grad_norm": 1.661645770072937, + "learning_rate": 5.482285590792613e-06, + "loss": 1.0164, + "step": 25055 + }, + { + "epoch": 0.8973087184629434, + "grad_norm": 1.5371626615524292, + "learning_rate": 5.478498475450644e-06, + "loss": 1.1519, + "step": 25056 + }, + { + "epoch": 0.8973445305925117, + "grad_norm": 1.738862156867981, + "learning_rate": 5.47471263177356e-06, + "loss": 1.0574, + "step": 25057 + }, + { + "epoch": 0.89738034272208, + "grad_norm": 1.6199225187301636, + "learning_rate": 5.470928059812264e-06, + "loss": 0.9636, + "step": 25058 + }, + { + "epoch": 0.8974161548516483, + "grad_norm": 1.375292420387268, + "learning_rate": 5.467144759617704e-06, + "loss": 1.1434, + "step": 25059 + }, + { + "epoch": 0.8974519669812165, + "grad_norm": 1.5238896608352661, + "learning_rate": 5.463362731240773e-06, + "loss": 1.107, + "step": 25060 + }, + { + "epoch": 0.8974877791107848, + "grad_norm": 1.4591729640960693, + "learning_rate": 5.4595819747323636e-06, + "loss": 1.0615, + "step": 25061 + }, + { + "epoch": 0.8975235912403531, + "grad_norm": 1.3816856145858765, + "learning_rate": 5.455802490143314e-06, + "loss": 1.0482, + "step": 25062 + }, + { + "epoch": 0.8975594033699213, + "grad_norm": 1.697014570236206, + "learning_rate": 5.4520242775244925e-06, + "loss": 0.9136, + "step": 25063 + }, + { + "epoch": 0.8975952154994897, + "grad_norm": 1.4200491905212402, + "learning_rate": 5.4482473369267264e-06, + "loss": 0.8561, + "step": 25064 + }, + { + "epoch": 0.897631027629058, + "grad_norm": 1.1766777038574219, + "learning_rate": 5.444471668400841e-06, + "loss": 1.0984, + "step": 25065 + }, + { + "epoch": 0.8976668397586263, + "grad_norm": 1.8750513792037964, + "learning_rate": 5.440697271997608e-06, + "loss": 0.8325, + "step": 25066 + }, + { + "epoch": 0.8977026518881945, + "grad_norm": 1.3941752910614014, + "learning_rate": 5.436924147767819e-06, + "loss": 1.1512, + "step": 25067 + }, + { + "epoch": 0.8977384640177628, + "grad_norm": 1.6508065462112427, + "learning_rate": 5.433152295762256e-06, + "loss": 0.9539, + "step": 25068 + }, + { + "epoch": 0.8977742761473311, + "grad_norm": 1.4108980894088745, + "learning_rate": 5.429381716031634e-06, + "loss": 0.9373, + "step": 25069 + }, + { + "epoch": 0.8978100882768993, + "grad_norm": 1.5109246969223022, + "learning_rate": 5.42561240862669e-06, + "loss": 1.2143, + "step": 25070 + }, + { + "epoch": 0.8978459004064677, + "grad_norm": 1.244258999824524, + "learning_rate": 5.421844373598139e-06, + "loss": 0.951, + "step": 25071 + }, + { + "epoch": 0.897881712536036, + "grad_norm": 1.25485098361969, + "learning_rate": 5.418077610996686e-06, + "loss": 1.09, + "step": 25072 + }, + { + "epoch": 0.8979175246656043, + "grad_norm": 1.7515753507614136, + "learning_rate": 5.4143121208729885e-06, + "loss": 1.1729, + "step": 25073 + }, + { + "epoch": 0.8979533367951725, + "grad_norm": 1.44644033908844, + "learning_rate": 5.410547903277707e-06, + "loss": 1.01, + "step": 25074 + }, + { + "epoch": 0.8979891489247408, + "grad_norm": 1.426082730293274, + "learning_rate": 5.4067849582615124e-06, + "loss": 1.1527, + "step": 25075 + }, + { + "epoch": 0.8980249610543091, + "grad_norm": 1.5911647081375122, + "learning_rate": 5.403023285874997e-06, + "loss": 1.1105, + "step": 25076 + }, + { + "epoch": 0.8980607731838773, + "grad_norm": 1.5881726741790771, + "learning_rate": 5.399262886168777e-06, + "loss": 1.0816, + "step": 25077 + }, + { + "epoch": 0.8980965853134457, + "grad_norm": 1.673531174659729, + "learning_rate": 5.395503759193454e-06, + "loss": 1.2805, + "step": 25078 + }, + { + "epoch": 0.898132397443014, + "grad_norm": 1.5406444072723389, + "learning_rate": 5.391745904999601e-06, + "loss": 1.0986, + "step": 25079 + }, + { + "epoch": 0.8981682095725823, + "grad_norm": 1.6463903188705444, + "learning_rate": 5.387989323637765e-06, + "loss": 1.2155, + "step": 25080 + }, + { + "epoch": 0.8982040217021505, + "grad_norm": 1.2159247398376465, + "learning_rate": 5.384234015158495e-06, + "loss": 1.0518, + "step": 25081 + }, + { + "epoch": 0.8982398338317188, + "grad_norm": 1.7013002634048462, + "learning_rate": 5.380479979612307e-06, + "loss": 1.0502, + "step": 25082 + }, + { + "epoch": 0.8982756459612871, + "grad_norm": 1.4624392986297607, + "learning_rate": 5.376727217049726e-06, + "loss": 0.9909, + "step": 25083 + }, + { + "epoch": 0.8983114580908553, + "grad_norm": 1.317621111869812, + "learning_rate": 5.372975727521201e-06, + "loss": 0.9743, + "step": 25084 + }, + { + "epoch": 0.8983472702204237, + "grad_norm": 1.671460509300232, + "learning_rate": 5.369225511077236e-06, + "loss": 1.0963, + "step": 25085 + }, + { + "epoch": 0.898383082349992, + "grad_norm": 1.6899490356445312, + "learning_rate": 5.36547656776829e-06, + "loss": 1.149, + "step": 25086 + }, + { + "epoch": 0.8984188944795602, + "grad_norm": 1.3059396743774414, + "learning_rate": 5.36172889764478e-06, + "loss": 1.2412, + "step": 25087 + }, + { + "epoch": 0.8984547066091285, + "grad_norm": 1.596358060836792, + "learning_rate": 5.357982500757119e-06, + "loss": 1.0594, + "step": 25088 + }, + { + "epoch": 0.8984905187386968, + "grad_norm": 2.1707680225372314, + "learning_rate": 5.354237377155735e-06, + "loss": 1.075, + "step": 25089 + }, + { + "epoch": 0.898526330868265, + "grad_norm": 1.5002708435058594, + "learning_rate": 5.3504935268910095e-06, + "loss": 0.9761, + "step": 25090 + }, + { + "epoch": 0.8985621429978333, + "grad_norm": 1.607490062713623, + "learning_rate": 5.346750950013301e-06, + "loss": 1.224, + "step": 25091 + }, + { + "epoch": 0.8985979551274017, + "grad_norm": 1.4443484544754028, + "learning_rate": 5.343009646572949e-06, + "loss": 1.1133, + "step": 25092 + }, + { + "epoch": 0.89863376725697, + "grad_norm": 1.7366721630096436, + "learning_rate": 5.3392696166203345e-06, + "loss": 1.2614, + "step": 25093 + }, + { + "epoch": 0.8986695793865382, + "grad_norm": 1.8047124147415161, + "learning_rate": 5.335530860205718e-06, + "loss": 0.9067, + "step": 25094 + }, + { + "epoch": 0.8987053915161065, + "grad_norm": 1.4232357740402222, + "learning_rate": 5.331793377379435e-06, + "loss": 1.1177, + "step": 25095 + }, + { + "epoch": 0.8987412036456748, + "grad_norm": 1.483058214187622, + "learning_rate": 5.328057168191747e-06, + "loss": 0.9522, + "step": 25096 + }, + { + "epoch": 0.898777015775243, + "grad_norm": 1.7423183917999268, + "learning_rate": 5.324322232692947e-06, + "loss": 1.0592, + "step": 25097 + }, + { + "epoch": 0.8988128279048113, + "grad_norm": 1.4527190923690796, + "learning_rate": 5.32058857093326e-06, + "loss": 1.1002, + "step": 25098 + }, + { + "epoch": 0.8988486400343797, + "grad_norm": 1.8015271425247192, + "learning_rate": 5.316856182962926e-06, + "loss": 1.2344, + "step": 25099 + }, + { + "epoch": 0.898884452163948, + "grad_norm": 1.4429757595062256, + "learning_rate": 5.313125068832159e-06, + "loss": 1.0994, + "step": 25100 + }, + { + "epoch": 0.8989202642935162, + "grad_norm": 1.7698390483856201, + "learning_rate": 5.309395228591174e-06, + "loss": 0.9866, + "step": 25101 + }, + { + "epoch": 0.8989560764230845, + "grad_norm": 1.4424588680267334, + "learning_rate": 5.305666662290121e-06, + "loss": 1.0205, + "step": 25102 + }, + { + "epoch": 0.8989918885526528, + "grad_norm": 1.3118681907653809, + "learning_rate": 5.30193936997917e-06, + "loss": 1.2602, + "step": 25103 + }, + { + "epoch": 0.899027700682221, + "grad_norm": 1.609412670135498, + "learning_rate": 5.298213351708492e-06, + "loss": 0.9885, + "step": 25104 + }, + { + "epoch": 0.8990635128117893, + "grad_norm": 1.5119876861572266, + "learning_rate": 5.29448860752817e-06, + "loss": 1.0243, + "step": 25105 + }, + { + "epoch": 0.8990993249413577, + "grad_norm": 1.3542206287384033, + "learning_rate": 5.290765137488351e-06, + "loss": 1.0523, + "step": 25106 + }, + { + "epoch": 0.899135137070926, + "grad_norm": 1.3358237743377686, + "learning_rate": 5.287042941639131e-06, + "loss": 1.1854, + "step": 25107 + }, + { + "epoch": 0.8991709492004942, + "grad_norm": 1.9660706520080566, + "learning_rate": 5.2833220200305785e-06, + "loss": 1.3884, + "step": 25108 + }, + { + "epoch": 0.8992067613300625, + "grad_norm": 1.507032871246338, + "learning_rate": 5.279602372712744e-06, + "loss": 1.2292, + "step": 25109 + }, + { + "epoch": 0.8992425734596308, + "grad_norm": 1.4630478620529175, + "learning_rate": 5.275883999735676e-06, + "loss": 0.9889, + "step": 25110 + }, + { + "epoch": 0.899278385589199, + "grad_norm": 1.5930074453353882, + "learning_rate": 5.272166901149423e-06, + "loss": 0.8756, + "step": 25111 + }, + { + "epoch": 0.8993141977187673, + "grad_norm": 1.71818208694458, + "learning_rate": 5.2684510770039556e-06, + "loss": 1.2166, + "step": 25112 + }, + { + "epoch": 0.8993500098483357, + "grad_norm": 1.580816388130188, + "learning_rate": 5.264736527349279e-06, + "loss": 0.9748, + "step": 25113 + }, + { + "epoch": 0.899385821977904, + "grad_norm": 1.5996522903442383, + "learning_rate": 5.261023252235386e-06, + "loss": 1.1297, + "step": 25114 + }, + { + "epoch": 0.8994216341074722, + "grad_norm": 1.5970044136047363, + "learning_rate": 5.257311251712227e-06, + "loss": 1.1576, + "step": 25115 + }, + { + "epoch": 0.8994574462370405, + "grad_norm": 1.497700810432434, + "learning_rate": 5.253600525829716e-06, + "loss": 1.3028, + "step": 25116 + }, + { + "epoch": 0.8994932583666088, + "grad_norm": 1.3732579946517944, + "learning_rate": 5.249891074637803e-06, + "loss": 1.0493, + "step": 25117 + }, + { + "epoch": 0.899529070496177, + "grad_norm": 1.647403597831726, + "learning_rate": 5.2461828981863916e-06, + "loss": 1.0974, + "step": 25118 + }, + { + "epoch": 0.8995648826257453, + "grad_norm": 1.6125097274780273, + "learning_rate": 5.2424759965253645e-06, + "loss": 1.2283, + "step": 25119 + }, + { + "epoch": 0.8996006947553137, + "grad_norm": 1.669986605644226, + "learning_rate": 5.2387703697046045e-06, + "loss": 0.923, + "step": 25120 + }, + { + "epoch": 0.899636506884882, + "grad_norm": 1.5210487842559814, + "learning_rate": 5.235066017773926e-06, + "loss": 0.897, + "step": 25121 + }, + { + "epoch": 0.8996723190144502, + "grad_norm": 1.2373945713043213, + "learning_rate": 5.2313629407832355e-06, + "loss": 1.0472, + "step": 25122 + }, + { + "epoch": 0.8997081311440185, + "grad_norm": 1.7042757272720337, + "learning_rate": 5.227661138782281e-06, + "loss": 1.2325, + "step": 25123 + }, + { + "epoch": 0.8997439432735868, + "grad_norm": 1.4382928609848022, + "learning_rate": 5.22396061182091e-06, + "loss": 1.0559, + "step": 25124 + }, + { + "epoch": 0.899779755403155, + "grad_norm": 1.405168056488037, + "learning_rate": 5.220261359948897e-06, + "loss": 1.2221, + "step": 25125 + }, + { + "epoch": 0.8998155675327233, + "grad_norm": 1.4798781871795654, + "learning_rate": 5.216563383216022e-06, + "loss": 1.2064, + "step": 25126 + }, + { + "epoch": 0.8998513796622917, + "grad_norm": 1.643302083015442, + "learning_rate": 5.2128666816720015e-06, + "loss": 1.2385, + "step": 25127 + }, + { + "epoch": 0.8998871917918599, + "grad_norm": 1.996828317642212, + "learning_rate": 5.209171255366607e-06, + "loss": 1.0509, + "step": 25128 + }, + { + "epoch": 0.8999230039214282, + "grad_norm": 1.4288095235824585, + "learning_rate": 5.205477104349554e-06, + "loss": 1.14, + "step": 25129 + }, + { + "epoch": 0.8999588160509965, + "grad_norm": 1.2895365953445435, + "learning_rate": 5.2017842286705145e-06, + "loss": 1.0851, + "step": 25130 + }, + { + "epoch": 0.8999946281805647, + "grad_norm": 1.360884428024292, + "learning_rate": 5.198092628379192e-06, + "loss": 1.1927, + "step": 25131 + }, + { + "epoch": 0.900030440310133, + "grad_norm": 1.9437810182571411, + "learning_rate": 5.194402303525225e-06, + "loss": 1.221, + "step": 25132 + }, + { + "epoch": 0.9000662524397013, + "grad_norm": 1.6426666975021362, + "learning_rate": 5.190713254158319e-06, + "loss": 0.9803, + "step": 25133 + }, + { + "epoch": 0.9001020645692697, + "grad_norm": 1.7669148445129395, + "learning_rate": 5.187025480328056e-06, + "loss": 1.0591, + "step": 25134 + }, + { + "epoch": 0.9001378766988379, + "grad_norm": 1.512288212776184, + "learning_rate": 5.183338982084074e-06, + "loss": 1.2387, + "step": 25135 + }, + { + "epoch": 0.9001736888284062, + "grad_norm": 1.394404411315918, + "learning_rate": 5.179653759475933e-06, + "loss": 1.0096, + "step": 25136 + }, + { + "epoch": 0.9002095009579745, + "grad_norm": 1.6685980558395386, + "learning_rate": 5.175969812553272e-06, + "loss": 0.9303, + "step": 25137 + }, + { + "epoch": 0.9002453130875427, + "grad_norm": 1.5835840702056885, + "learning_rate": 5.172287141365628e-06, + "loss": 1.019, + "step": 25138 + }, + { + "epoch": 0.900281125217111, + "grad_norm": 1.7069064378738403, + "learning_rate": 5.168605745962507e-06, + "loss": 1.1617, + "step": 25139 + }, + { + "epoch": 0.9003169373466793, + "grad_norm": 1.4307801723480225, + "learning_rate": 5.164925626393502e-06, + "loss": 1.1331, + "step": 25140 + }, + { + "epoch": 0.9003527494762477, + "grad_norm": 1.230745792388916, + "learning_rate": 5.161246782708073e-06, + "loss": 1.118, + "step": 25141 + }, + { + "epoch": 0.9003885616058159, + "grad_norm": 1.4042036533355713, + "learning_rate": 5.15756921495576e-06, + "loss": 1.1165, + "step": 25142 + }, + { + "epoch": 0.9004243737353842, + "grad_norm": 4.054474353790283, + "learning_rate": 5.153892923185977e-06, + "loss": 1.1512, + "step": 25143 + }, + { + "epoch": 0.9004601858649525, + "grad_norm": 1.559018611907959, + "learning_rate": 5.150217907448263e-06, + "loss": 1.1947, + "step": 25144 + }, + { + "epoch": 0.9004959979945207, + "grad_norm": 1.5350686311721802, + "learning_rate": 5.146544167792011e-06, + "loss": 1.038, + "step": 25145 + }, + { + "epoch": 0.900531810124089, + "grad_norm": 1.3709766864776611, + "learning_rate": 5.1428717042666385e-06, + "loss": 0.9844, + "step": 25146 + }, + { + "epoch": 0.9005676222536573, + "grad_norm": 1.2435709238052368, + "learning_rate": 5.1392005169215825e-06, + "loss": 1.017, + "step": 25147 + }, + { + "epoch": 0.9006034343832257, + "grad_norm": 1.3397388458251953, + "learning_rate": 5.1355306058062044e-06, + "loss": 1.2598, + "step": 25148 + }, + { + "epoch": 0.9006392465127939, + "grad_norm": 1.8724238872528076, + "learning_rate": 5.13186197096992e-06, + "loss": 0.9609, + "step": 25149 + }, + { + "epoch": 0.9006750586423622, + "grad_norm": 1.414466142654419, + "learning_rate": 5.128194612462034e-06, + "loss": 1.0871, + "step": 25150 + }, + { + "epoch": 0.9007108707719305, + "grad_norm": 1.860758662223816, + "learning_rate": 5.12452853033194e-06, + "loss": 1.3788, + "step": 25151 + }, + { + "epoch": 0.9007466829014987, + "grad_norm": 1.4211349487304688, + "learning_rate": 5.120863724628922e-06, + "loss": 1.2047, + "step": 25152 + }, + { + "epoch": 0.900782495031067, + "grad_norm": 1.4316285848617554, + "learning_rate": 5.117200195402316e-06, + "loss": 0.8623, + "step": 25153 + }, + { + "epoch": 0.9008183071606353, + "grad_norm": 1.3357579708099365, + "learning_rate": 5.113537942701363e-06, + "loss": 1.3095, + "step": 25154 + }, + { + "epoch": 0.9008541192902035, + "grad_norm": 1.4481406211853027, + "learning_rate": 5.109876966575377e-06, + "loss": 0.9747, + "step": 25155 + }, + { + "epoch": 0.9008899314197719, + "grad_norm": 1.414793848991394, + "learning_rate": 5.106217267073598e-06, + "loss": 1.1951, + "step": 25156 + }, + { + "epoch": 0.9009257435493402, + "grad_norm": 1.483848214149475, + "learning_rate": 5.102558844245265e-06, + "loss": 0.8261, + "step": 25157 + }, + { + "epoch": 0.9009615556789085, + "grad_norm": 1.3343138694763184, + "learning_rate": 5.09890169813958e-06, + "loss": 1.1439, + "step": 25158 + }, + { + "epoch": 0.9009973678084767, + "grad_norm": 1.5320677757263184, + "learning_rate": 5.095245828805761e-06, + "loss": 0.9724, + "step": 25159 + }, + { + "epoch": 0.901033179938045, + "grad_norm": 1.891489863395691, + "learning_rate": 5.091591236293003e-06, + "loss": 1.1584, + "step": 25160 + }, + { + "epoch": 0.9010689920676133, + "grad_norm": 1.565850019454956, + "learning_rate": 5.087937920650454e-06, + "loss": 1.0378, + "step": 25161 + }, + { + "epoch": 0.9011048041971815, + "grad_norm": 1.3004299402236938, + "learning_rate": 5.0842858819272644e-06, + "loss": 1.2075, + "step": 25162 + }, + { + "epoch": 0.9011406163267499, + "grad_norm": 1.4198520183563232, + "learning_rate": 5.0806351201725944e-06, + "loss": 0.9997, + "step": 25163 + }, + { + "epoch": 0.9011764284563182, + "grad_norm": 1.463794231414795, + "learning_rate": 5.076985635435527e-06, + "loss": 1.0328, + "step": 25164 + }, + { + "epoch": 0.9012122405858864, + "grad_norm": 1.3922861814498901, + "learning_rate": 5.073337427765179e-06, + "loss": 1.1839, + "step": 25165 + }, + { + "epoch": 0.9012480527154547, + "grad_norm": 1.5188206434249878, + "learning_rate": 5.069690497210633e-06, + "loss": 0.9528, + "step": 25166 + }, + { + "epoch": 0.901283864845023, + "grad_norm": 1.5223037004470825, + "learning_rate": 5.06604484382095e-06, + "loss": 1.1021, + "step": 25167 + }, + { + "epoch": 0.9013196769745913, + "grad_norm": 1.2418071031570435, + "learning_rate": 5.062400467645178e-06, + "loss": 0.9361, + "step": 25168 + }, + { + "epoch": 0.9013554891041595, + "grad_norm": 1.709132432937622, + "learning_rate": 5.058757368732336e-06, + "loss": 1.0949, + "step": 25169 + }, + { + "epoch": 0.9013913012337279, + "grad_norm": 1.472337007522583, + "learning_rate": 5.055115547131462e-06, + "loss": 1.2484, + "step": 25170 + }, + { + "epoch": 0.9014271133632962, + "grad_norm": 1.2734373807907104, + "learning_rate": 5.051475002891537e-06, + "loss": 0.9531, + "step": 25171 + }, + { + "epoch": 0.9014629254928644, + "grad_norm": 1.3903199434280396, + "learning_rate": 5.047835736061535e-06, + "loss": 0.8893, + "step": 25172 + }, + { + "epoch": 0.9014987376224327, + "grad_norm": 1.5587162971496582, + "learning_rate": 5.044197746690427e-06, + "loss": 0.9684, + "step": 25173 + }, + { + "epoch": 0.901534549752001, + "grad_norm": 1.293180227279663, + "learning_rate": 5.040561034827163e-06, + "loss": 1.0661, + "step": 25174 + }, + { + "epoch": 0.9015703618815692, + "grad_norm": 1.6028484106063843, + "learning_rate": 5.036925600520648e-06, + "loss": 1.0007, + "step": 25175 + }, + { + "epoch": 0.9016061740111375, + "grad_norm": 1.5129059553146362, + "learning_rate": 5.0332914438197984e-06, + "loss": 0.8537, + "step": 25176 + }, + { + "epoch": 0.9016419861407059, + "grad_norm": 1.6496394872665405, + "learning_rate": 5.029658564773521e-06, + "loss": 1.1069, + "step": 25177 + }, + { + "epoch": 0.9016777982702742, + "grad_norm": 1.512819528579712, + "learning_rate": 5.026026963430697e-06, + "loss": 0.9468, + "step": 25178 + }, + { + "epoch": 0.9017136103998424, + "grad_norm": 1.597456693649292, + "learning_rate": 5.022396639840166e-06, + "loss": 1.1068, + "step": 25179 + }, + { + "epoch": 0.9017494225294107, + "grad_norm": 1.3523072004318237, + "learning_rate": 5.018767594050766e-06, + "loss": 1.0439, + "step": 25180 + }, + { + "epoch": 0.901785234658979, + "grad_norm": 1.7529717683792114, + "learning_rate": 5.015139826111348e-06, + "loss": 1.4129, + "step": 25181 + }, + { + "epoch": 0.9018210467885472, + "grad_norm": 1.4709445238113403, + "learning_rate": 5.0115133360706945e-06, + "loss": 1.2702, + "step": 25182 + }, + { + "epoch": 0.9018568589181155, + "grad_norm": 1.373668909072876, + "learning_rate": 5.0078881239776e-06, + "loss": 0.8412, + "step": 25183 + }, + { + "epoch": 0.9018926710476839, + "grad_norm": 1.6141659021377563, + "learning_rate": 5.0042641898808364e-06, + "loss": 1.2225, + "step": 25184 + }, + { + "epoch": 0.9019284831772522, + "grad_norm": 1.2940536737442017, + "learning_rate": 5.000641533829176e-06, + "loss": 1.048, + "step": 25185 + }, + { + "epoch": 0.9019642953068204, + "grad_norm": 1.52390718460083, + "learning_rate": 4.9970201558713345e-06, + "loss": 1.1406, + "step": 25186 + }, + { + "epoch": 0.9020001074363887, + "grad_norm": 1.4940249919891357, + "learning_rate": 4.99340005605603e-06, + "loss": 1.1111, + "step": 25187 + }, + { + "epoch": 0.902035919565957, + "grad_norm": 1.357790470123291, + "learning_rate": 4.98978123443199e-06, + "loss": 1.1815, + "step": 25188 + }, + { + "epoch": 0.9020717316955252, + "grad_norm": 1.3630380630493164, + "learning_rate": 4.986163691047896e-06, + "loss": 0.9415, + "step": 25189 + }, + { + "epoch": 0.9021075438250935, + "grad_norm": 1.4228829145431519, + "learning_rate": 4.982547425952399e-06, + "loss": 1.2143, + "step": 25190 + }, + { + "epoch": 0.9021433559546619, + "grad_norm": 1.2189555168151855, + "learning_rate": 4.9789324391941615e-06, + "loss": 1.0258, + "step": 25191 + }, + { + "epoch": 0.9021791680842302, + "grad_norm": 1.5088108777999878, + "learning_rate": 4.975318730821843e-06, + "loss": 0.9939, + "step": 25192 + }, + { + "epoch": 0.9022149802137984, + "grad_norm": 1.226902723312378, + "learning_rate": 4.971706300884016e-06, + "loss": 1.0261, + "step": 25193 + }, + { + "epoch": 0.9022507923433667, + "grad_norm": 1.4472460746765137, + "learning_rate": 4.9680951494292975e-06, + "loss": 1.0505, + "step": 25194 + }, + { + "epoch": 0.902286604472935, + "grad_norm": 1.412825107574463, + "learning_rate": 4.964485276506281e-06, + "loss": 1.0407, + "step": 25195 + }, + { + "epoch": 0.9023224166025032, + "grad_norm": 1.8033246994018555, + "learning_rate": 4.960876682163551e-06, + "loss": 1.185, + "step": 25196 + }, + { + "epoch": 0.9023582287320715, + "grad_norm": 1.2850613594055176, + "learning_rate": 4.957269366449613e-06, + "loss": 0.9295, + "step": 25197 + }, + { + "epoch": 0.9023940408616399, + "grad_norm": 1.688653826713562, + "learning_rate": 4.953663329413017e-06, + "loss": 1.0347, + "step": 25198 + }, + { + "epoch": 0.9024298529912081, + "grad_norm": 1.8774237632751465, + "learning_rate": 4.950058571102289e-06, + "loss": 1.164, + "step": 25199 + }, + { + "epoch": 0.9024656651207764, + "grad_norm": 1.9145463705062866, + "learning_rate": 4.946455091565916e-06, + "loss": 0.9981, + "step": 25200 + }, + { + "epoch": 0.9025014772503447, + "grad_norm": 1.5132052898406982, + "learning_rate": 4.942852890852367e-06, + "loss": 1.0992, + "step": 25201 + }, + { + "epoch": 0.902537289379913, + "grad_norm": 1.7063994407653809, + "learning_rate": 4.939251969010128e-06, + "loss": 1.1288, + "step": 25202 + }, + { + "epoch": 0.9025731015094812, + "grad_norm": 1.7827693223953247, + "learning_rate": 4.935652326087648e-06, + "loss": 1.1543, + "step": 25203 + }, + { + "epoch": 0.9026089136390495, + "grad_norm": 1.4098665714263916, + "learning_rate": 4.932053962133321e-06, + "loss": 1.0563, + "step": 25204 + }, + { + "epoch": 0.9026447257686179, + "grad_norm": 1.4788559675216675, + "learning_rate": 4.928456877195586e-06, + "loss": 1.009, + "step": 25205 + }, + { + "epoch": 0.9026805378981861, + "grad_norm": 1.9285120964050293, + "learning_rate": 4.924861071322817e-06, + "loss": 1.1928, + "step": 25206 + }, + { + "epoch": 0.9027163500277544, + "grad_norm": 1.6698360443115234, + "learning_rate": 4.92126654456343e-06, + "loss": 1.1349, + "step": 25207 + }, + { + "epoch": 0.9027521621573227, + "grad_norm": 1.7064985036849976, + "learning_rate": 4.917673296965741e-06, + "loss": 1.1933, + "step": 25208 + }, + { + "epoch": 0.902787974286891, + "grad_norm": 1.4710054397583008, + "learning_rate": 4.914081328578113e-06, + "loss": 0.9502, + "step": 25209 + }, + { + "epoch": 0.9028237864164592, + "grad_norm": 1.6863220930099487, + "learning_rate": 4.910490639448884e-06, + "loss": 1.0335, + "step": 25210 + }, + { + "epoch": 0.9028595985460275, + "grad_norm": 1.7681583166122437, + "learning_rate": 4.906901229626326e-06, + "loss": 1.2485, + "step": 25211 + }, + { + "epoch": 0.9028954106755959, + "grad_norm": 1.4550405740737915, + "learning_rate": 4.903313099158757e-06, + "loss": 1.0905, + "step": 25212 + }, + { + "epoch": 0.9029312228051641, + "grad_norm": 1.4680205583572388, + "learning_rate": 4.8997262480944385e-06, + "loss": 1.0086, + "step": 25213 + }, + { + "epoch": 0.9029670349347324, + "grad_norm": 1.8353512287139893, + "learning_rate": 4.896140676481653e-06, + "loss": 0.8853, + "step": 25214 + }, + { + "epoch": 0.9030028470643007, + "grad_norm": 1.3239622116088867, + "learning_rate": 4.892556384368607e-06, + "loss": 1.0702, + "step": 25215 + }, + { + "epoch": 0.9030386591938689, + "grad_norm": 1.5888864994049072, + "learning_rate": 4.8889733718035295e-06, + "loss": 1.13, + "step": 25216 + }, + { + "epoch": 0.9030744713234372, + "grad_norm": 1.3579849004745483, + "learning_rate": 4.885391638834646e-06, + "loss": 0.9457, + "step": 25217 + }, + { + "epoch": 0.9031102834530055, + "grad_norm": 1.4782062768936157, + "learning_rate": 4.88181118551011e-06, + "loss": 0.9644, + "step": 25218 + }, + { + "epoch": 0.9031460955825739, + "grad_norm": 1.3220725059509277, + "learning_rate": 4.878232011878136e-06, + "loss": 1.1413, + "step": 25219 + }, + { + "epoch": 0.9031819077121421, + "grad_norm": 1.6850717067718506, + "learning_rate": 4.874654117986821e-06, + "loss": 1.1544, + "step": 25220 + }, + { + "epoch": 0.9032177198417104, + "grad_norm": 1.4065580368041992, + "learning_rate": 4.871077503884358e-06, + "loss": 1.1816, + "step": 25221 + }, + { + "epoch": 0.9032535319712787, + "grad_norm": 1.7265877723693848, + "learning_rate": 4.86750216961882e-06, + "loss": 0.9612, + "step": 25222 + }, + { + "epoch": 0.9032893441008469, + "grad_norm": 1.6098344326019287, + "learning_rate": 4.863928115238336e-06, + "loss": 1.0659, + "step": 25223 + }, + { + "epoch": 0.9033251562304152, + "grad_norm": 1.465248942375183, + "learning_rate": 4.860355340790978e-06, + "loss": 0.8875, + "step": 25224 + }, + { + "epoch": 0.9033609683599835, + "grad_norm": 1.4667702913284302, + "learning_rate": 4.85678384632483e-06, + "loss": 1.072, + "step": 25225 + }, + { + "epoch": 0.9033967804895519, + "grad_norm": 2.166152238845825, + "learning_rate": 4.8532136318879315e-06, + "loss": 1.1463, + "step": 25226 + }, + { + "epoch": 0.9034325926191201, + "grad_norm": 1.3692994117736816, + "learning_rate": 4.8496446975282885e-06, + "loss": 1.1387, + "step": 25227 + }, + { + "epoch": 0.9034684047486884, + "grad_norm": 1.3866021633148193, + "learning_rate": 4.846077043293973e-06, + "loss": 0.7977, + "step": 25228 + }, + { + "epoch": 0.9035042168782567, + "grad_norm": 1.5491609573364258, + "learning_rate": 4.842510669232925e-06, + "loss": 1.057, + "step": 25229 + }, + { + "epoch": 0.9035400290078249, + "grad_norm": 1.7343080043792725, + "learning_rate": 4.8389455753931726e-06, + "loss": 0.9333, + "step": 25230 + }, + { + "epoch": 0.9035758411373932, + "grad_norm": 1.6032334566116333, + "learning_rate": 4.835381761822633e-06, + "loss": 1.1314, + "step": 25231 + }, + { + "epoch": 0.9036116532669615, + "grad_norm": 1.2370150089263916, + "learning_rate": 4.831819228569301e-06, + "loss": 0.8934, + "step": 25232 + }, + { + "epoch": 0.9036474653965298, + "grad_norm": 1.5356322526931763, + "learning_rate": 4.828257975681072e-06, + "loss": 1.2832, + "step": 25233 + }, + { + "epoch": 0.9036832775260981, + "grad_norm": 1.9711109399795532, + "learning_rate": 4.824698003205863e-06, + "loss": 1.1475, + "step": 25234 + }, + { + "epoch": 0.9037190896556664, + "grad_norm": 1.2294281721115112, + "learning_rate": 4.8211393111915915e-06, + "loss": 0.9306, + "step": 25235 + }, + { + "epoch": 0.9037549017852347, + "grad_norm": 1.512943148612976, + "learning_rate": 4.817581899686108e-06, + "loss": 1.2171, + "step": 25236 + }, + { + "epoch": 0.9037907139148029, + "grad_norm": 1.3377301692962646, + "learning_rate": 4.814025768737296e-06, + "loss": 0.9324, + "step": 25237 + }, + { + "epoch": 0.9038265260443712, + "grad_norm": 1.4085290431976318, + "learning_rate": 4.810470918392962e-06, + "loss": 1.0712, + "step": 25238 + }, + { + "epoch": 0.9038623381739395, + "grad_norm": 1.3508843183517456, + "learning_rate": 4.8069173487009785e-06, + "loss": 1.0289, + "step": 25239 + }, + { + "epoch": 0.9038981503035078, + "grad_norm": 1.3690351247787476, + "learning_rate": 4.803365059709131e-06, + "loss": 1.0447, + "step": 25240 + }, + { + "epoch": 0.9039339624330761, + "grad_norm": 1.437583088874817, + "learning_rate": 4.799814051465212e-06, + "loss": 1.172, + "step": 25241 + }, + { + "epoch": 0.9039697745626444, + "grad_norm": 1.6174709796905518, + "learning_rate": 4.7962643240169854e-06, + "loss": 1.0075, + "step": 25242 + }, + { + "epoch": 0.9040055866922126, + "grad_norm": 1.9804210662841797, + "learning_rate": 4.792715877412213e-06, + "loss": 1.1073, + "step": 25243 + }, + { + "epoch": 0.9040413988217809, + "grad_norm": 1.5096871852874756, + "learning_rate": 4.789168711698655e-06, + "loss": 1.1626, + "step": 25244 + }, + { + "epoch": 0.9040772109513492, + "grad_norm": 1.5954958200454712, + "learning_rate": 4.785622826924019e-06, + "loss": 1.2266, + "step": 25245 + }, + { + "epoch": 0.9041130230809175, + "grad_norm": 1.6165035963058472, + "learning_rate": 4.782078223135999e-06, + "loss": 1.2138, + "step": 25246 + }, + { + "epoch": 0.9041488352104858, + "grad_norm": 1.3005340099334717, + "learning_rate": 4.778534900382292e-06, + "loss": 1.0328, + "step": 25247 + }, + { + "epoch": 0.9041846473400541, + "grad_norm": 1.6496227979660034, + "learning_rate": 4.774992858710581e-06, + "loss": 1.0049, + "step": 25248 + }, + { + "epoch": 0.9042204594696224, + "grad_norm": 1.7310267686843872, + "learning_rate": 4.771452098168494e-06, + "loss": 1.1446, + "step": 25249 + }, + { + "epoch": 0.9042562715991906, + "grad_norm": 1.3346511125564575, + "learning_rate": 4.767912618803705e-06, + "loss": 0.9594, + "step": 25250 + }, + { + "epoch": 0.9042920837287589, + "grad_norm": 1.2978147268295288, + "learning_rate": 4.764374420663808e-06, + "loss": 1.036, + "step": 25251 + }, + { + "epoch": 0.9043278958583272, + "grad_norm": 1.7235331535339355, + "learning_rate": 4.7608375037964e-06, + "loss": 0.975, + "step": 25252 + }, + { + "epoch": 0.9043637079878954, + "grad_norm": 1.7615541219711304, + "learning_rate": 4.757301868249076e-06, + "loss": 1.0495, + "step": 25253 + }, + { + "epoch": 0.9043995201174638, + "grad_norm": 1.5977517366409302, + "learning_rate": 4.753767514069396e-06, + "loss": 1.2082, + "step": 25254 + }, + { + "epoch": 0.9044353322470321, + "grad_norm": 1.4277595281600952, + "learning_rate": 4.750234441304924e-06, + "loss": 1.1637, + "step": 25255 + }, + { + "epoch": 0.9044711443766004, + "grad_norm": 1.5261164903640747, + "learning_rate": 4.746702650003176e-06, + "loss": 0.9972, + "step": 25256 + }, + { + "epoch": 0.9045069565061686, + "grad_norm": 1.7333606481552124, + "learning_rate": 4.743172140211683e-06, + "loss": 1.0074, + "step": 25257 + }, + { + "epoch": 0.9045427686357369, + "grad_norm": 1.4190348386764526, + "learning_rate": 4.7396429119779265e-06, + "loss": 1.0595, + "step": 25258 + }, + { + "epoch": 0.9045785807653052, + "grad_norm": 1.4858161211013794, + "learning_rate": 4.736114965349414e-06, + "loss": 1.1673, + "step": 25259 + }, + { + "epoch": 0.9046143928948734, + "grad_norm": 1.5526536703109741, + "learning_rate": 4.732588300373586e-06, + "loss": 0.9664, + "step": 25260 + }, + { + "epoch": 0.9046502050244418, + "grad_norm": 1.5645939111709595, + "learning_rate": 4.729062917097882e-06, + "loss": 0.8083, + "step": 25261 + }, + { + "epoch": 0.9046860171540101, + "grad_norm": 1.4902554750442505, + "learning_rate": 4.725538815569774e-06, + "loss": 1.1283, + "step": 25262 + }, + { + "epoch": 0.9047218292835784, + "grad_norm": 1.5643415451049805, + "learning_rate": 4.722015995836626e-06, + "loss": 1.2361, + "step": 25263 + }, + { + "epoch": 0.9047576414131466, + "grad_norm": 1.347324013710022, + "learning_rate": 4.718494457945855e-06, + "loss": 1.0905, + "step": 25264 + }, + { + "epoch": 0.9047934535427149, + "grad_norm": 1.3066800832748413, + "learning_rate": 4.714974201944833e-06, + "loss": 1.0732, + "step": 25265 + }, + { + "epoch": 0.9048292656722832, + "grad_norm": 1.2166640758514404, + "learning_rate": 4.711455227880935e-06, + "loss": 1.008, + "step": 25266 + }, + { + "epoch": 0.9048650778018514, + "grad_norm": 1.3558272123336792, + "learning_rate": 4.707937535801488e-06, + "loss": 0.8511, + "step": 25267 + }, + { + "epoch": 0.9049008899314198, + "grad_norm": 1.3516496419906616, + "learning_rate": 4.704421125753822e-06, + "loss": 1.0614, + "step": 25268 + }, + { + "epoch": 0.9049367020609881, + "grad_norm": 1.7548933029174805, + "learning_rate": 4.700905997785254e-06, + "loss": 0.9418, + "step": 25269 + }, + { + "epoch": 0.9049725141905564, + "grad_norm": 1.5536893606185913, + "learning_rate": 4.697392151943059e-06, + "loss": 0.9082, + "step": 25270 + }, + { + "epoch": 0.9050083263201246, + "grad_norm": 1.4664020538330078, + "learning_rate": 4.693879588274519e-06, + "loss": 1.04, + "step": 25271 + }, + { + "epoch": 0.9050441384496929, + "grad_norm": 1.4332658052444458, + "learning_rate": 4.690368306826898e-06, + "loss": 1.167, + "step": 25272 + }, + { + "epoch": 0.9050799505792612, + "grad_norm": 1.3293887376785278, + "learning_rate": 4.686858307647446e-06, + "loss": 1.0784, + "step": 25273 + }, + { + "epoch": 0.9051157627088294, + "grad_norm": 1.7050540447235107, + "learning_rate": 4.683349590783348e-06, + "loss": 1.21, + "step": 25274 + }, + { + "epoch": 0.9051515748383978, + "grad_norm": 1.3227521181106567, + "learning_rate": 4.679842156281844e-06, + "loss": 0.9604, + "step": 25275 + }, + { + "epoch": 0.9051873869679661, + "grad_norm": 1.2505130767822266, + "learning_rate": 4.676336004190096e-06, + "loss": 0.9647, + "step": 25276 + }, + { + "epoch": 0.9052231990975343, + "grad_norm": 1.543453574180603, + "learning_rate": 4.6728311345553115e-06, + "loss": 1.0088, + "step": 25277 + }, + { + "epoch": 0.9052590112271026, + "grad_norm": 1.5966641902923584, + "learning_rate": 4.669327547424607e-06, + "loss": 1.1625, + "step": 25278 + }, + { + "epoch": 0.9052948233566709, + "grad_norm": 1.5258686542510986, + "learning_rate": 4.665825242845134e-06, + "loss": 1.1627, + "step": 25279 + }, + { + "epoch": 0.9053306354862392, + "grad_norm": 1.5665541887283325, + "learning_rate": 4.662324220864011e-06, + "loss": 0.9684, + "step": 25280 + }, + { + "epoch": 0.9053664476158074, + "grad_norm": 1.7468327283859253, + "learning_rate": 4.658824481528335e-06, + "loss": 1.1635, + "step": 25281 + }, + { + "epoch": 0.9054022597453758, + "grad_norm": 1.6041520833969116, + "learning_rate": 4.655326024885198e-06, + "loss": 1.096, + "step": 25282 + }, + { + "epoch": 0.9054380718749441, + "grad_norm": 1.5941188335418701, + "learning_rate": 4.651828850981654e-06, + "loss": 1.2497, + "step": 25283 + }, + { + "epoch": 0.9054738840045123, + "grad_norm": 1.637558937072754, + "learning_rate": 4.6483329598647874e-06, + "loss": 1.0725, + "step": 25284 + }, + { + "epoch": 0.9055096961340806, + "grad_norm": 1.5756808519363403, + "learning_rate": 4.644838351581582e-06, + "loss": 1.277, + "step": 25285 + }, + { + "epoch": 0.9055455082636489, + "grad_norm": 1.3642218112945557, + "learning_rate": 4.6413450261790894e-06, + "loss": 1.019, + "step": 25286 + }, + { + "epoch": 0.9055813203932171, + "grad_norm": 1.2805874347686768, + "learning_rate": 4.637852983704294e-06, + "loss": 1.0875, + "step": 25287 + }, + { + "epoch": 0.9056171325227854, + "grad_norm": 1.7573665380477905, + "learning_rate": 4.63436222420417e-06, + "loss": 1.1351, + "step": 25288 + }, + { + "epoch": 0.9056529446523538, + "grad_norm": 1.76301109790802, + "learning_rate": 4.630872747725701e-06, + "loss": 1.1603, + "step": 25289 + }, + { + "epoch": 0.9056887567819221, + "grad_norm": 1.274665117263794, + "learning_rate": 4.627384554315806e-06, + "loss": 0.9939, + "step": 25290 + }, + { + "epoch": 0.9057245689114903, + "grad_norm": 1.5148310661315918, + "learning_rate": 4.623897644021446e-06, + "loss": 0.9347, + "step": 25291 + }, + { + "epoch": 0.9057603810410586, + "grad_norm": 1.6050224304199219, + "learning_rate": 4.6204120168895085e-06, + "loss": 1.3655, + "step": 25292 + }, + { + "epoch": 0.9057961931706269, + "grad_norm": 1.3891161680221558, + "learning_rate": 4.616927672966898e-06, + "loss": 0.964, + "step": 25293 + }, + { + "epoch": 0.9058320053001951, + "grad_norm": 1.325506567955017, + "learning_rate": 4.6134446123004885e-06, + "loss": 0.9666, + "step": 25294 + }, + { + "epoch": 0.9058678174297634, + "grad_norm": 1.467252254486084, + "learning_rate": 4.609962834937153e-06, + "loss": 0.8872, + "step": 25295 + }, + { + "epoch": 0.9059036295593318, + "grad_norm": 1.50468909740448, + "learning_rate": 4.606482340923712e-06, + "loss": 0.9956, + "step": 25296 + }, + { + "epoch": 0.9059394416889001, + "grad_norm": 1.385699987411499, + "learning_rate": 4.6030031303070045e-06, + "loss": 1.1191, + "step": 25297 + }, + { + "epoch": 0.9059752538184683, + "grad_norm": 1.405278205871582, + "learning_rate": 4.599525203133848e-06, + "loss": 1.0132, + "step": 25298 + }, + { + "epoch": 0.9060110659480366, + "grad_norm": 1.4224363565444946, + "learning_rate": 4.596048559451005e-06, + "loss": 1.0924, + "step": 25299 + }, + { + "epoch": 0.9060468780776049, + "grad_norm": 1.5397889614105225, + "learning_rate": 4.592573199305272e-06, + "loss": 1.2596, + "step": 25300 + }, + { + "epoch": 0.9060826902071731, + "grad_norm": 1.7643660306930542, + "learning_rate": 4.58909912274339e-06, + "loss": 1.0512, + "step": 25301 + }, + { + "epoch": 0.9061185023367414, + "grad_norm": 1.5524957180023193, + "learning_rate": 4.585626329812132e-06, + "loss": 1.2311, + "step": 25302 + }, + { + "epoch": 0.9061543144663098, + "grad_norm": 1.215001106262207, + "learning_rate": 4.582154820558182e-06, + "loss": 0.9591, + "step": 25303 + }, + { + "epoch": 0.906190126595878, + "grad_norm": 1.5513074398040771, + "learning_rate": 4.5786845950282486e-06, + "loss": 0.9665, + "step": 25304 + }, + { + "epoch": 0.9062259387254463, + "grad_norm": 1.526606798171997, + "learning_rate": 4.575215653269061e-06, + "loss": 1.1097, + "step": 25305 + }, + { + "epoch": 0.9062617508550146, + "grad_norm": 1.2510517835617065, + "learning_rate": 4.571747995327224e-06, + "loss": 1.1073, + "step": 25306 + }, + { + "epoch": 0.9062975629845829, + "grad_norm": 1.415581226348877, + "learning_rate": 4.568281621249437e-06, + "loss": 0.9145, + "step": 25307 + }, + { + "epoch": 0.9063333751141511, + "grad_norm": 1.7197070121765137, + "learning_rate": 4.564816531082316e-06, + "loss": 1.2253, + "step": 25308 + }, + { + "epoch": 0.9063691872437194, + "grad_norm": 1.4698214530944824, + "learning_rate": 4.561352724872503e-06, + "loss": 1.0489, + "step": 25309 + }, + { + "epoch": 0.9064049993732878, + "grad_norm": 1.4160759449005127, + "learning_rate": 4.557890202666571e-06, + "loss": 0.9885, + "step": 25310 + }, + { + "epoch": 0.906440811502856, + "grad_norm": 1.3709849119186401, + "learning_rate": 4.5544289645111145e-06, + "loss": 1.0224, + "step": 25311 + }, + { + "epoch": 0.9064766236324243, + "grad_norm": 1.4036836624145508, + "learning_rate": 4.5509690104526995e-06, + "loss": 0.9215, + "step": 25312 + }, + { + "epoch": 0.9065124357619926, + "grad_norm": 1.5237014293670654, + "learning_rate": 4.547510340537886e-06, + "loss": 1.1158, + "step": 25313 + }, + { + "epoch": 0.9065482478915609, + "grad_norm": 1.675723671913147, + "learning_rate": 4.544052954813194e-06, + "loss": 1.0478, + "step": 25314 + }, + { + "epoch": 0.9065840600211291, + "grad_norm": 1.3990846872329712, + "learning_rate": 4.540596853325119e-06, + "loss": 0.9141, + "step": 25315 + }, + { + "epoch": 0.9066198721506974, + "grad_norm": 1.7798442840576172, + "learning_rate": 4.537142036120212e-06, + "loss": 1.0923, + "step": 25316 + }, + { + "epoch": 0.9066556842802658, + "grad_norm": 1.2736486196517944, + "learning_rate": 4.533688503244893e-06, + "loss": 0.9386, + "step": 25317 + }, + { + "epoch": 0.906691496409834, + "grad_norm": 1.478209376335144, + "learning_rate": 4.5302362547456565e-06, + "loss": 1.0527, + "step": 25318 + }, + { + "epoch": 0.9067273085394023, + "grad_norm": 1.39156973361969, + "learning_rate": 4.5267852906689555e-06, + "loss": 1.068, + "step": 25319 + }, + { + "epoch": 0.9067631206689706, + "grad_norm": 1.3388375043869019, + "learning_rate": 4.523335611061208e-06, + "loss": 1.209, + "step": 25320 + }, + { + "epoch": 0.9067989327985388, + "grad_norm": 1.420863151550293, + "learning_rate": 4.51988721596881e-06, + "loss": 1.0942, + "step": 25321 + }, + { + "epoch": 0.9068347449281071, + "grad_norm": 1.5613690614700317, + "learning_rate": 4.51644010543818e-06, + "loss": 0.9713, + "step": 25322 + }, + { + "epoch": 0.9068705570576754, + "grad_norm": 1.4124563932418823, + "learning_rate": 4.512994279515692e-06, + "loss": 1.16, + "step": 25323 + }, + { + "epoch": 0.9069063691872438, + "grad_norm": 1.4205946922302246, + "learning_rate": 4.509549738247676e-06, + "loss": 1.05, + "step": 25324 + }, + { + "epoch": 0.906942181316812, + "grad_norm": 1.716301679611206, + "learning_rate": 4.5061064816805165e-06, + "loss": 1.0642, + "step": 25325 + }, + { + "epoch": 0.9069779934463803, + "grad_norm": 1.6391196250915527, + "learning_rate": 4.502664509860488e-06, + "loss": 1.0593, + "step": 25326 + }, + { + "epoch": 0.9070138055759486, + "grad_norm": 1.20779287815094, + "learning_rate": 4.499223822833942e-06, + "loss": 0.997, + "step": 25327 + }, + { + "epoch": 0.9070496177055168, + "grad_norm": 1.58573317527771, + "learning_rate": 4.4957844206471535e-06, + "loss": 1.0489, + "step": 25328 + }, + { + "epoch": 0.9070854298350851, + "grad_norm": 1.461305022239685, + "learning_rate": 4.492346303346395e-06, + "loss": 0.9239, + "step": 25329 + }, + { + "epoch": 0.9071212419646534, + "grad_norm": 1.7771230936050415, + "learning_rate": 4.488909470977909e-06, + "loss": 1.0783, + "step": 25330 + }, + { + "epoch": 0.9071570540942218, + "grad_norm": 1.3899712562561035, + "learning_rate": 4.485473923587957e-06, + "loss": 1.2199, + "step": 25331 + }, + { + "epoch": 0.90719286622379, + "grad_norm": 1.3727552890777588, + "learning_rate": 4.482039661222759e-06, + "loss": 1.0583, + "step": 25332 + }, + { + "epoch": 0.9072286783533583, + "grad_norm": 1.3333368301391602, + "learning_rate": 4.478606683928476e-06, + "loss": 1.1821, + "step": 25333 + }, + { + "epoch": 0.9072644904829266, + "grad_norm": 1.4796756505966187, + "learning_rate": 4.475174991751352e-06, + "loss": 0.88, + "step": 25334 + }, + { + "epoch": 0.9073003026124948, + "grad_norm": 1.447597861289978, + "learning_rate": 4.471744584737525e-06, + "loss": 1.2434, + "step": 25335 + }, + { + "epoch": 0.9073361147420631, + "grad_norm": 2.2202444076538086, + "learning_rate": 4.468315462933159e-06, + "loss": 1.1957, + "step": 25336 + }, + { + "epoch": 0.9073719268716314, + "grad_norm": 1.2340319156646729, + "learning_rate": 4.464887626384362e-06, + "loss": 1.0056, + "step": 25337 + }, + { + "epoch": 0.9074077390011998, + "grad_norm": 1.4401907920837402, + "learning_rate": 4.461461075137285e-06, + "loss": 1.192, + "step": 25338 + }, + { + "epoch": 0.907443551130768, + "grad_norm": 1.4682482481002808, + "learning_rate": 4.458035809238026e-06, + "loss": 1.107, + "step": 25339 + }, + { + "epoch": 0.9074793632603363, + "grad_norm": 1.396623134613037, + "learning_rate": 4.454611828732636e-06, + "loss": 1.0865, + "step": 25340 + }, + { + "epoch": 0.9075151753899046, + "grad_norm": 1.7007741928100586, + "learning_rate": 4.4511891336671885e-06, + "loss": 1.2149, + "step": 25341 + }, + { + "epoch": 0.9075509875194728, + "grad_norm": 1.5810527801513672, + "learning_rate": 4.447767724087759e-06, + "loss": 0.886, + "step": 25342 + }, + { + "epoch": 0.9075867996490411, + "grad_norm": 1.2544432878494263, + "learning_rate": 4.444347600040366e-06, + "loss": 1.0779, + "step": 25343 + }, + { + "epoch": 0.9076226117786094, + "grad_norm": 1.364711880683899, + "learning_rate": 4.440928761570995e-06, + "loss": 0.8665, + "step": 25344 + }, + { + "epoch": 0.9076584239081777, + "grad_norm": 1.2739062309265137, + "learning_rate": 4.4375112087256864e-06, + "loss": 0.8891, + "step": 25345 + }, + { + "epoch": 0.907694236037746, + "grad_norm": 2.7035539150238037, + "learning_rate": 4.434094941550393e-06, + "loss": 1.1667, + "step": 25346 + }, + { + "epoch": 0.9077300481673143, + "grad_norm": 1.551529884338379, + "learning_rate": 4.430679960091089e-06, + "loss": 1.2478, + "step": 25347 + }, + { + "epoch": 0.9077658602968826, + "grad_norm": 1.6099162101745605, + "learning_rate": 4.427266264393693e-06, + "loss": 0.8961, + "step": 25348 + }, + { + "epoch": 0.9078016724264508, + "grad_norm": 1.3514726161956787, + "learning_rate": 4.423853854504156e-06, + "loss": 1.005, + "step": 25349 + }, + { + "epoch": 0.9078374845560191, + "grad_norm": 1.774664044380188, + "learning_rate": 4.420442730468388e-06, + "loss": 1.0213, + "step": 25350 + }, + { + "epoch": 0.9078732966855874, + "grad_norm": 1.4135017395019531, + "learning_rate": 4.417032892332263e-06, + "loss": 1.1655, + "step": 25351 + }, + { + "epoch": 0.9079091088151557, + "grad_norm": 1.4458038806915283, + "learning_rate": 4.413624340141676e-06, + "loss": 0.9385, + "step": 25352 + }, + { + "epoch": 0.907944920944724, + "grad_norm": 1.5969337224960327, + "learning_rate": 4.410217073942468e-06, + "loss": 1.0518, + "step": 25353 + }, + { + "epoch": 0.9079807330742923, + "grad_norm": 1.408882975578308, + "learning_rate": 4.4068110937805055e-06, + "loss": 0.9449, + "step": 25354 + }, + { + "epoch": 0.9080165452038605, + "grad_norm": 1.31583571434021, + "learning_rate": 4.40340639970157e-06, + "loss": 0.971, + "step": 25355 + }, + { + "epoch": 0.9080523573334288, + "grad_norm": 1.4624916315078735, + "learning_rate": 4.400002991751495e-06, + "loss": 1.1718, + "step": 25356 + }, + { + "epoch": 0.9080881694629971, + "grad_norm": 1.8235015869140625, + "learning_rate": 4.396600869976086e-06, + "loss": 1.0542, + "step": 25357 + }, + { + "epoch": 0.9081239815925654, + "grad_norm": 1.5028711557388306, + "learning_rate": 4.393200034421074e-06, + "loss": 1.0641, + "step": 25358 + }, + { + "epoch": 0.9081597937221337, + "grad_norm": 1.7093019485473633, + "learning_rate": 4.3898004851322335e-06, + "loss": 1.1087, + "step": 25359 + }, + { + "epoch": 0.908195605851702, + "grad_norm": 1.4240970611572266, + "learning_rate": 4.386402222155295e-06, + "loss": 1.1395, + "step": 25360 + }, + { + "epoch": 0.9082314179812703, + "grad_norm": 1.3858956098556519, + "learning_rate": 4.383005245535998e-06, + "loss": 1.0099, + "step": 25361 + }, + { + "epoch": 0.9082672301108385, + "grad_norm": 1.3183611631393433, + "learning_rate": 4.379609555320008e-06, + "loss": 0.7892, + "step": 25362 + }, + { + "epoch": 0.9083030422404068, + "grad_norm": 1.455965518951416, + "learning_rate": 4.376215151553042e-06, + "loss": 1.1078, + "step": 25363 + }, + { + "epoch": 0.9083388543699751, + "grad_norm": 1.797743558883667, + "learning_rate": 4.372822034280744e-06, + "loss": 0.7881, + "step": 25364 + }, + { + "epoch": 0.9083746664995433, + "grad_norm": 1.8327265977859497, + "learning_rate": 4.3694302035487965e-06, + "loss": 1.016, + "step": 25365 + }, + { + "epoch": 0.9084104786291117, + "grad_norm": 1.5331146717071533, + "learning_rate": 4.366039659402798e-06, + "loss": 1.1496, + "step": 25366 + }, + { + "epoch": 0.90844629075868, + "grad_norm": 1.8676340579986572, + "learning_rate": 4.362650401888369e-06, + "loss": 1.0161, + "step": 25367 + }, + { + "epoch": 0.9084821028882483, + "grad_norm": 1.359358787536621, + "learning_rate": 4.359262431051137e-06, + "loss": 1.1077, + "step": 25368 + }, + { + "epoch": 0.9085179150178165, + "grad_norm": 1.8273699283599854, + "learning_rate": 4.355875746936644e-06, + "loss": 1.1341, + "step": 25369 + }, + { + "epoch": 0.9085537271473848, + "grad_norm": 1.2964016199111938, + "learning_rate": 4.352490349590477e-06, + "loss": 1.0836, + "step": 25370 + }, + { + "epoch": 0.9085895392769531, + "grad_norm": 1.5239564180374146, + "learning_rate": 4.349106239058165e-06, + "loss": 1.2424, + "step": 25371 + }, + { + "epoch": 0.9086253514065213, + "grad_norm": 1.4296832084655762, + "learning_rate": 4.345723415385272e-06, + "loss": 1.3361, + "step": 25372 + }, + { + "epoch": 0.9086611635360897, + "grad_norm": 1.6789889335632324, + "learning_rate": 4.342341878617262e-06, + "loss": 1.3348, + "step": 25373 + }, + { + "epoch": 0.908696975665658, + "grad_norm": 1.648179054260254, + "learning_rate": 4.338961628799665e-06, + "loss": 0.9553, + "step": 25374 + }, + { + "epoch": 0.9087327877952263, + "grad_norm": 1.5039525032043457, + "learning_rate": 4.335582665977944e-06, + "loss": 1.1184, + "step": 25375 + }, + { + "epoch": 0.9087685999247945, + "grad_norm": 1.4250508546829224, + "learning_rate": 4.332204990197564e-06, + "loss": 1.0392, + "step": 25376 + }, + { + "epoch": 0.9088044120543628, + "grad_norm": 1.1675752401351929, + "learning_rate": 4.328828601503943e-06, + "loss": 0.984, + "step": 25377 + }, + { + "epoch": 0.9088402241839311, + "grad_norm": 1.3466298580169678, + "learning_rate": 4.325453499942545e-06, + "loss": 1.0561, + "step": 25378 + }, + { + "epoch": 0.9088760363134993, + "grad_norm": 1.267128348350525, + "learning_rate": 4.322079685558755e-06, + "loss": 0.88, + "step": 25379 + }, + { + "epoch": 0.9089118484430677, + "grad_norm": 1.8946343660354614, + "learning_rate": 4.318707158397972e-06, + "loss": 1.1666, + "step": 25380 + }, + { + "epoch": 0.908947660572636, + "grad_norm": 1.7416551113128662, + "learning_rate": 4.3153359185055474e-06, + "loss": 1.0783, + "step": 25381 + }, + { + "epoch": 0.9089834727022043, + "grad_norm": 2.259047269821167, + "learning_rate": 4.311965965926867e-06, + "loss": 0.9798, + "step": 25382 + }, + { + "epoch": 0.9090192848317725, + "grad_norm": 1.4517070055007935, + "learning_rate": 4.308597300707262e-06, + "loss": 1.1024, + "step": 25383 + }, + { + "epoch": 0.9090550969613408, + "grad_norm": 1.2911895513534546, + "learning_rate": 4.305229922892029e-06, + "loss": 1.0367, + "step": 25384 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.3818943500518799, + "learning_rate": 4.301863832526498e-06, + "loss": 0.8887, + "step": 25385 + }, + { + "epoch": 0.9091267212204773, + "grad_norm": 1.6518012285232544, + "learning_rate": 4.298499029655967e-06, + "loss": 1.0535, + "step": 25386 + }, + { + "epoch": 0.9091625333500457, + "grad_norm": 1.4850168228149414, + "learning_rate": 4.295135514325654e-06, + "loss": 1.1991, + "step": 25387 + }, + { + "epoch": 0.909198345479614, + "grad_norm": 1.4385830163955688, + "learning_rate": 4.291773286580858e-06, + "loss": 1.1821, + "step": 25388 + }, + { + "epoch": 0.9092341576091822, + "grad_norm": 1.776437759399414, + "learning_rate": 4.288412346466797e-06, + "loss": 1.0269, + "step": 25389 + }, + { + "epoch": 0.9092699697387505, + "grad_norm": 1.4452451467514038, + "learning_rate": 4.28505269402869e-06, + "loss": 1.0639, + "step": 25390 + }, + { + "epoch": 0.9093057818683188, + "grad_norm": 1.6436465978622437, + "learning_rate": 4.281694329311736e-06, + "loss": 1.0992, + "step": 25391 + }, + { + "epoch": 0.909341593997887, + "grad_norm": 1.7477365732192993, + "learning_rate": 4.278337252361109e-06, + "loss": 1.1721, + "step": 25392 + }, + { + "epoch": 0.9093774061274553, + "grad_norm": 2.3283698558807373, + "learning_rate": 4.2749814632219946e-06, + "loss": 1.162, + "step": 25393 + }, + { + "epoch": 0.9094132182570237, + "grad_norm": 1.7970514297485352, + "learning_rate": 4.271626961939524e-06, + "loss": 1.0297, + "step": 25394 + }, + { + "epoch": 0.909449030386592, + "grad_norm": 1.5899772644042969, + "learning_rate": 4.268273748558815e-06, + "loss": 1.0192, + "step": 25395 + }, + { + "epoch": 0.9094848425161602, + "grad_norm": 1.5315794944763184, + "learning_rate": 4.264921823125012e-06, + "loss": 1.0879, + "step": 25396 + }, + { + "epoch": 0.9095206546457285, + "grad_norm": 1.1453450918197632, + "learning_rate": 4.261571185683211e-06, + "loss": 1.1064, + "step": 25397 + }, + { + "epoch": 0.9095564667752968, + "grad_norm": 1.6627031564712524, + "learning_rate": 4.258221836278453e-06, + "loss": 1.2165, + "step": 25398 + }, + { + "epoch": 0.909592278904865, + "grad_norm": 1.6225529909133911, + "learning_rate": 4.2548737749558255e-06, + "loss": 1.058, + "step": 25399 + }, + { + "epoch": 0.9096280910344333, + "grad_norm": 1.3623892068862915, + "learning_rate": 4.2515270017603695e-06, + "loss": 1.1734, + "step": 25400 + }, + { + "epoch": 0.9096639031640017, + "grad_norm": 1.3919799327850342, + "learning_rate": 4.248181516737127e-06, + "loss": 1.0244, + "step": 25401 + }, + { + "epoch": 0.90969971529357, + "grad_norm": 1.4207582473754883, + "learning_rate": 4.244837319931072e-06, + "loss": 1.1754, + "step": 25402 + }, + { + "epoch": 0.9097355274231382, + "grad_norm": 1.3671189546585083, + "learning_rate": 4.2414944113872255e-06, + "loss": 1.0028, + "step": 25403 + }, + { + "epoch": 0.9097713395527065, + "grad_norm": 1.5681349039077759, + "learning_rate": 4.2381527911505625e-06, + "loss": 1.3271, + "step": 25404 + }, + { + "epoch": 0.9098071516822748, + "grad_norm": 1.515984058380127, + "learning_rate": 4.234812459266013e-06, + "loss": 1.1421, + "step": 25405 + }, + { + "epoch": 0.909842963811843, + "grad_norm": 1.8090684413909912, + "learning_rate": 4.231473415778531e-06, + "loss": 1.0832, + "step": 25406 + }, + { + "epoch": 0.9098787759414113, + "grad_norm": 1.7291333675384521, + "learning_rate": 4.228135660733046e-06, + "loss": 1.1712, + "step": 25407 + }, + { + "epoch": 0.9099145880709797, + "grad_norm": 2.1058173179626465, + "learning_rate": 4.224799194174467e-06, + "loss": 1.0566, + "step": 25408 + }, + { + "epoch": 0.909950400200548, + "grad_norm": 1.3183025121688843, + "learning_rate": 4.221464016147669e-06, + "loss": 1.1306, + "step": 25409 + }, + { + "epoch": 0.9099862123301162, + "grad_norm": 1.5941240787506104, + "learning_rate": 4.218130126697517e-06, + "loss": 1.172, + "step": 25410 + }, + { + "epoch": 0.9100220244596845, + "grad_norm": 1.279618740081787, + "learning_rate": 4.214797525868897e-06, + "loss": 0.9593, + "step": 25411 + }, + { + "epoch": 0.9100578365892528, + "grad_norm": 1.6823867559432983, + "learning_rate": 4.2114662137066055e-06, + "loss": 1.3091, + "step": 25412 + }, + { + "epoch": 0.910093648718821, + "grad_norm": 1.3036786317825317, + "learning_rate": 4.208136190255485e-06, + "loss": 1.1201, + "step": 25413 + }, + { + "epoch": 0.9101294608483893, + "grad_norm": 1.7584314346313477, + "learning_rate": 4.204807455560311e-06, + "loss": 1.1507, + "step": 25414 + }, + { + "epoch": 0.9101652729779577, + "grad_norm": 1.2202352285385132, + "learning_rate": 4.201480009665915e-06, + "loss": 1.0363, + "step": 25415 + }, + { + "epoch": 0.910201085107526, + "grad_norm": 1.2408530712127686, + "learning_rate": 4.198153852617015e-06, + "loss": 0.9778, + "step": 25416 + }, + { + "epoch": 0.9102368972370942, + "grad_norm": 1.8920570611953735, + "learning_rate": 4.194828984458376e-06, + "loss": 1.2182, + "step": 25417 + }, + { + "epoch": 0.9102727093666625, + "grad_norm": 1.5726213455200195, + "learning_rate": 4.191505405234741e-06, + "loss": 1.0742, + "step": 25418 + }, + { + "epoch": 0.9103085214962308, + "grad_norm": 1.4560984373092651, + "learning_rate": 4.188183114990829e-06, + "loss": 1.0445, + "step": 25419 + }, + { + "epoch": 0.910344333625799, + "grad_norm": 2.6255292892456055, + "learning_rate": 4.1848621137713154e-06, + "loss": 1.2283, + "step": 25420 + }, + { + "epoch": 0.9103801457553673, + "grad_norm": 1.4155864715576172, + "learning_rate": 4.181542401620875e-06, + "loss": 0.8715, + "step": 25421 + }, + { + "epoch": 0.9104159578849357, + "grad_norm": 1.3763138055801392, + "learning_rate": 4.178223978584206e-06, + "loss": 1.2409, + "step": 25422 + }, + { + "epoch": 0.910451770014504, + "grad_norm": 1.4441205263137817, + "learning_rate": 4.174906844705917e-06, + "loss": 0.9113, + "step": 25423 + }, + { + "epoch": 0.9104875821440722, + "grad_norm": 1.321756362915039, + "learning_rate": 4.171591000030672e-06, + "loss": 1.1542, + "step": 25424 + }, + { + "epoch": 0.9105233942736405, + "grad_norm": 1.5029317140579224, + "learning_rate": 4.168276444603026e-06, + "loss": 1.0767, + "step": 25425 + }, + { + "epoch": 0.9105592064032088, + "grad_norm": 1.520948886871338, + "learning_rate": 4.164963178467629e-06, + "loss": 1.1996, + "step": 25426 + }, + { + "epoch": 0.910595018532777, + "grad_norm": 1.6605522632598877, + "learning_rate": 4.161651201669036e-06, + "loss": 1.1818, + "step": 25427 + }, + { + "epoch": 0.9106308306623453, + "grad_norm": 1.952248454093933, + "learning_rate": 4.1583405142517906e-06, + "loss": 1.2035, + "step": 25428 + }, + { + "epoch": 0.9106666427919137, + "grad_norm": 1.6354625225067139, + "learning_rate": 4.155031116260466e-06, + "loss": 0.9725, + "step": 25429 + }, + { + "epoch": 0.9107024549214819, + "grad_norm": 1.1657794713974, + "learning_rate": 4.15172300773955e-06, + "loss": 1.0892, + "step": 25430 + }, + { + "epoch": 0.9107382670510502, + "grad_norm": 1.4532471895217896, + "learning_rate": 4.148416188733584e-06, + "loss": 0.9919, + "step": 25431 + }, + { + "epoch": 0.9107740791806185, + "grad_norm": 1.3655073642730713, + "learning_rate": 4.1451106592869995e-06, + "loss": 1.067, + "step": 25432 + }, + { + "epoch": 0.9108098913101867, + "grad_norm": 1.4483349323272705, + "learning_rate": 4.14180641944435e-06, + "loss": 1.09, + "step": 25433 + }, + { + "epoch": 0.910845703439755, + "grad_norm": 1.3295961618423462, + "learning_rate": 4.138503469250021e-06, + "loss": 1.0069, + "step": 25434 + }, + { + "epoch": 0.9108815155693233, + "grad_norm": 1.218824863433838, + "learning_rate": 4.13520180874849e-06, + "loss": 1.0758, + "step": 25435 + }, + { + "epoch": 0.9109173276988917, + "grad_norm": 1.7813254594802856, + "learning_rate": 4.131901437984153e-06, + "loss": 1.2319, + "step": 25436 + }, + { + "epoch": 0.9109531398284599, + "grad_norm": 1.3922407627105713, + "learning_rate": 4.128602357001421e-06, + "loss": 1.0143, + "step": 25437 + }, + { + "epoch": 0.9109889519580282, + "grad_norm": 1.3257286548614502, + "learning_rate": 4.12530456584469e-06, + "loss": 1.2149, + "step": 25438 + }, + { + "epoch": 0.9110247640875965, + "grad_norm": 1.2512052059173584, + "learning_rate": 4.122008064558313e-06, + "loss": 0.9387, + "step": 25439 + }, + { + "epoch": 0.9110605762171647, + "grad_norm": 1.6054075956344604, + "learning_rate": 4.118712853186634e-06, + "loss": 1.1129, + "step": 25440 + }, + { + "epoch": 0.911096388346733, + "grad_norm": 1.4072935581207275, + "learning_rate": 4.115418931773996e-06, + "loss": 1.2822, + "step": 25441 + }, + { + "epoch": 0.9111322004763013, + "grad_norm": 1.2914773225784302, + "learning_rate": 4.112126300364727e-06, + "loss": 1.0806, + "step": 25442 + }, + { + "epoch": 0.9111680126058697, + "grad_norm": 1.4116407632827759, + "learning_rate": 4.108834959003094e-06, + "loss": 1.1009, + "step": 25443 + }, + { + "epoch": 0.9112038247354379, + "grad_norm": 1.4697974920272827, + "learning_rate": 4.1055449077334165e-06, + "loss": 1.0349, + "step": 25444 + }, + { + "epoch": 0.9112396368650062, + "grad_norm": 1.395391821861267, + "learning_rate": 4.102256146599936e-06, + "loss": 0.8748, + "step": 25445 + }, + { + "epoch": 0.9112754489945745, + "grad_norm": 1.807930588722229, + "learning_rate": 4.098968675646886e-06, + "loss": 1.0284, + "step": 25446 + }, + { + "epoch": 0.9113112611241427, + "grad_norm": 1.5966230630874634, + "learning_rate": 4.095682494918507e-06, + "loss": 0.8764, + "step": 25447 + }, + { + "epoch": 0.911347073253711, + "grad_norm": 1.9930779933929443, + "learning_rate": 4.092397604459019e-06, + "loss": 1.0849, + "step": 25448 + }, + { + "epoch": 0.9113828853832793, + "grad_norm": 1.360953450202942, + "learning_rate": 4.089114004312622e-06, + "loss": 1.0587, + "step": 25449 + }, + { + "epoch": 0.9114186975128477, + "grad_norm": 1.3996615409851074, + "learning_rate": 4.085831694523456e-06, + "loss": 1.1288, + "step": 25450 + }, + { + "epoch": 0.9114545096424159, + "grad_norm": 1.4827982187271118, + "learning_rate": 4.082550675135721e-06, + "loss": 0.9739, + "step": 25451 + }, + { + "epoch": 0.9114903217719842, + "grad_norm": 1.9141234159469604, + "learning_rate": 4.079270946193525e-06, + "loss": 0.9524, + "step": 25452 + }, + { + "epoch": 0.9115261339015525, + "grad_norm": 1.4970744848251343, + "learning_rate": 4.075992507741033e-06, + "loss": 0.9837, + "step": 25453 + }, + { + "epoch": 0.9115619460311207, + "grad_norm": 1.5132771730422974, + "learning_rate": 4.07271535982231e-06, + "loss": 1.2191, + "step": 25454 + }, + { + "epoch": 0.911597758160689, + "grad_norm": 1.5180280208587646, + "learning_rate": 4.0694395024814754e-06, + "loss": 1.0713, + "step": 25455 + }, + { + "epoch": 0.9116335702902573, + "grad_norm": 1.242854356765747, + "learning_rate": 4.066164935762595e-06, + "loss": 1.0695, + "step": 25456 + }, + { + "epoch": 0.9116693824198256, + "grad_norm": 1.6774258613586426, + "learning_rate": 4.062891659709711e-06, + "loss": 1.0024, + "step": 25457 + }, + { + "epoch": 0.9117051945493939, + "grad_norm": 1.541745901107788, + "learning_rate": 4.059619674366866e-06, + "loss": 1.1684, + "step": 25458 + }, + { + "epoch": 0.9117410066789622, + "grad_norm": 1.929626703262329, + "learning_rate": 4.05634897977808e-06, + "loss": 0.9938, + "step": 25459 + }, + { + "epoch": 0.9117768188085305, + "grad_norm": 1.571602463722229, + "learning_rate": 4.053079575987384e-06, + "loss": 1.1088, + "step": 25460 + }, + { + "epoch": 0.9118126309380987, + "grad_norm": 1.426228404045105, + "learning_rate": 4.049811463038722e-06, + "loss": 1.1174, + "step": 25461 + }, + { + "epoch": 0.911848443067667, + "grad_norm": 1.3335392475128174, + "learning_rate": 4.0465446409760795e-06, + "loss": 1.1238, + "step": 25462 + }, + { + "epoch": 0.9118842551972353, + "grad_norm": 1.2982478141784668, + "learning_rate": 4.043279109843412e-06, + "loss": 1.0804, + "step": 25463 + }, + { + "epoch": 0.9119200673268036, + "grad_norm": 1.4165589809417725, + "learning_rate": 4.04001486968465e-06, + "loss": 1.0424, + "step": 25464 + }, + { + "epoch": 0.9119558794563719, + "grad_norm": 1.5791090726852417, + "learning_rate": 4.036751920543702e-06, + "loss": 1.2203, + "step": 25465 + }, + { + "epoch": 0.9119916915859402, + "grad_norm": 1.8521689176559448, + "learning_rate": 4.033490262464468e-06, + "loss": 0.9446, + "step": 25466 + }, + { + "epoch": 0.9120275037155084, + "grad_norm": 1.3354109525680542, + "learning_rate": 4.030229895490856e-06, + "loss": 1.113, + "step": 25467 + }, + { + "epoch": 0.9120633158450767, + "grad_norm": 1.4246903657913208, + "learning_rate": 4.026970819666698e-06, + "loss": 0.8558, + "step": 25468 + }, + { + "epoch": 0.912099127974645, + "grad_norm": 1.4236053228378296, + "learning_rate": 4.023713035035836e-06, + "loss": 1.1572, + "step": 25469 + }, + { + "epoch": 0.9121349401042133, + "grad_norm": 1.7486886978149414, + "learning_rate": 4.020456541642126e-06, + "loss": 1.1293, + "step": 25470 + }, + { + "epoch": 0.9121707522337816, + "grad_norm": 1.4791443347930908, + "learning_rate": 4.017201339529386e-06, + "loss": 1.082, + "step": 25471 + }, + { + "epoch": 0.9122065643633499, + "grad_norm": 1.7682747840881348, + "learning_rate": 4.013947428741372e-06, + "loss": 1.1746, + "step": 25472 + }, + { + "epoch": 0.9122423764929182, + "grad_norm": 1.4603755474090576, + "learning_rate": 4.01069480932188e-06, + "loss": 1.1286, + "step": 25473 + }, + { + "epoch": 0.9122781886224864, + "grad_norm": 1.2695395946502686, + "learning_rate": 4.007443481314699e-06, + "loss": 1.0238, + "step": 25474 + }, + { + "epoch": 0.9123140007520547, + "grad_norm": 1.3891639709472656, + "learning_rate": 4.0041934447635156e-06, + "loss": 1.0402, + "step": 25475 + }, + { + "epoch": 0.912349812881623, + "grad_norm": 1.363601803779602, + "learning_rate": 4.000944699712094e-06, + "loss": 0.9611, + "step": 25476 + }, + { + "epoch": 0.9123856250111912, + "grad_norm": 1.4713178873062134, + "learning_rate": 3.997697246204124e-06, + "loss": 1.0732, + "step": 25477 + }, + { + "epoch": 0.9124214371407596, + "grad_norm": 1.4540044069290161, + "learning_rate": 3.994451084283324e-06, + "loss": 1.1705, + "step": 25478 + }, + { + "epoch": 0.9124572492703279, + "grad_norm": 1.3097130060195923, + "learning_rate": 3.991206213993326e-06, + "loss": 0.9595, + "step": 25479 + }, + { + "epoch": 0.9124930613998962, + "grad_norm": 1.539259672164917, + "learning_rate": 3.987962635377806e-06, + "loss": 0.9904, + "step": 25480 + }, + { + "epoch": 0.9125288735294644, + "grad_norm": 1.3890870809555054, + "learning_rate": 3.98472034848042e-06, + "loss": 0.9914, + "step": 25481 + }, + { + "epoch": 0.9125646856590327, + "grad_norm": 1.1647735834121704, + "learning_rate": 3.9814793533447635e-06, + "loss": 0.949, + "step": 25482 + }, + { + "epoch": 0.912600497788601, + "grad_norm": 1.696670413017273, + "learning_rate": 3.978239650014437e-06, + "loss": 1.144, + "step": 25483 + }, + { + "epoch": 0.9126363099181692, + "grad_norm": 1.401947259902954, + "learning_rate": 3.975001238533038e-06, + "loss": 1.0934, + "step": 25484 + }, + { + "epoch": 0.9126721220477376, + "grad_norm": 1.5261224508285522, + "learning_rate": 3.971764118944155e-06, + "loss": 1.0894, + "step": 25485 + }, + { + "epoch": 0.9127079341773059, + "grad_norm": 1.475524663925171, + "learning_rate": 3.968528291291296e-06, + "loss": 0.9063, + "step": 25486 + }, + { + "epoch": 0.9127437463068742, + "grad_norm": 1.5003348588943481, + "learning_rate": 3.965293755618027e-06, + "loss": 1.017, + "step": 25487 + }, + { + "epoch": 0.9127795584364424, + "grad_norm": 1.2934728860855103, + "learning_rate": 3.962060511967846e-06, + "loss": 0.8978, + "step": 25488 + }, + { + "epoch": 0.9128153705660107, + "grad_norm": 1.4978411197662354, + "learning_rate": 3.9588285603842755e-06, + "loss": 1.0331, + "step": 25489 + }, + { + "epoch": 0.912851182695579, + "grad_norm": 1.3500288724899292, + "learning_rate": 3.955597900910768e-06, + "loss": 1.1382, + "step": 25490 + }, + { + "epoch": 0.9128869948251472, + "grad_norm": 1.9954440593719482, + "learning_rate": 3.9523685335908e-06, + "loss": 1.2485, + "step": 25491 + }, + { + "epoch": 0.9129228069547156, + "grad_norm": 1.7925312519073486, + "learning_rate": 3.9491404584678485e-06, + "loss": 1.1075, + "step": 25492 + }, + { + "epoch": 0.9129586190842839, + "grad_norm": 1.4814157485961914, + "learning_rate": 3.945913675585289e-06, + "loss": 1.0511, + "step": 25493 + }, + { + "epoch": 0.9129944312138522, + "grad_norm": 1.8059728145599365, + "learning_rate": 3.9426881849865646e-06, + "loss": 1.1421, + "step": 25494 + }, + { + "epoch": 0.9130302433434204, + "grad_norm": 1.4293365478515625, + "learning_rate": 3.939463986715064e-06, + "loss": 1.0953, + "step": 25495 + }, + { + "epoch": 0.9130660554729887, + "grad_norm": 1.2710380554199219, + "learning_rate": 3.936241080814174e-06, + "loss": 0.9441, + "step": 25496 + }, + { + "epoch": 0.913101867602557, + "grad_norm": 1.4242887496948242, + "learning_rate": 3.933019467327248e-06, + "loss": 1.1003, + "step": 25497 + }, + { + "epoch": 0.9131376797321252, + "grad_norm": 1.5776995420455933, + "learning_rate": 3.9297991462976196e-06, + "loss": 1.1609, + "step": 25498 + }, + { + "epoch": 0.9131734918616936, + "grad_norm": 1.574384093284607, + "learning_rate": 3.92658011776863e-06, + "loss": 1.0257, + "step": 25499 + }, + { + "epoch": 0.9132093039912619, + "grad_norm": 1.6472982168197632, + "learning_rate": 3.923362381783568e-06, + "loss": 0.952, + "step": 25500 + }, + { + "epoch": 0.9132451161208301, + "grad_norm": 1.6512914896011353, + "learning_rate": 3.920145938385744e-06, + "loss": 0.9197, + "step": 25501 + }, + { + "epoch": 0.9132809282503984, + "grad_norm": 1.2960830926895142, + "learning_rate": 3.916930787618412e-06, + "loss": 1.0113, + "step": 25502 + }, + { + "epoch": 0.9133167403799667, + "grad_norm": 1.6087725162506104, + "learning_rate": 3.913716929524857e-06, + "loss": 0.9848, + "step": 25503 + }, + { + "epoch": 0.913352552509535, + "grad_norm": 1.4147789478302002, + "learning_rate": 3.910504364148282e-06, + "loss": 1.0785, + "step": 25504 + }, + { + "epoch": 0.9133883646391032, + "grad_norm": 1.656005859375, + "learning_rate": 3.907293091531927e-06, + "loss": 0.9222, + "step": 25505 + }, + { + "epoch": 0.9134241767686716, + "grad_norm": 1.5328013896942139, + "learning_rate": 3.904083111718993e-06, + "loss": 0.9007, + "step": 25506 + }, + { + "epoch": 0.9134599888982399, + "grad_norm": 1.3975582122802734, + "learning_rate": 3.900874424752677e-06, + "loss": 1.0452, + "step": 25507 + }, + { + "epoch": 0.9134958010278081, + "grad_norm": 1.2757340669631958, + "learning_rate": 3.897667030676133e-06, + "loss": 1.0359, + "step": 25508 + }, + { + "epoch": 0.9135316131573764, + "grad_norm": 1.8337018489837646, + "learning_rate": 3.8944609295324955e-06, + "loss": 1.1559, + "step": 25509 + }, + { + "epoch": 0.9135674252869447, + "grad_norm": 1.2142993211746216, + "learning_rate": 3.89125612136495e-06, + "loss": 0.8887, + "step": 25510 + }, + { + "epoch": 0.913603237416513, + "grad_norm": 1.348148226737976, + "learning_rate": 3.888052606216564e-06, + "loss": 1.0859, + "step": 25511 + }, + { + "epoch": 0.9136390495460812, + "grad_norm": 1.3500622510910034, + "learning_rate": 3.884850384130456e-06, + "loss": 0.9177, + "step": 25512 + }, + { + "epoch": 0.9136748616756496, + "grad_norm": 1.78519606590271, + "learning_rate": 3.881649455149694e-06, + "loss": 1.0643, + "step": 25513 + }, + { + "epoch": 0.9137106738052179, + "grad_norm": 1.2712053060531616, + "learning_rate": 3.878449819317376e-06, + "loss": 0.8886, + "step": 25514 + }, + { + "epoch": 0.9137464859347861, + "grad_norm": 1.5025314092636108, + "learning_rate": 3.875251476676522e-06, + "loss": 0.8654, + "step": 25515 + }, + { + "epoch": 0.9137822980643544, + "grad_norm": 1.2756859064102173, + "learning_rate": 3.872054427270167e-06, + "loss": 0.8405, + "step": 25516 + }, + { + "epoch": 0.9138181101939227, + "grad_norm": 1.866407871246338, + "learning_rate": 3.868858671141329e-06, + "loss": 1.1333, + "step": 25517 + }, + { + "epoch": 0.9138539223234909, + "grad_norm": 1.721867561340332, + "learning_rate": 3.865664208332986e-06, + "loss": 1.1051, + "step": 25518 + }, + { + "epoch": 0.9138897344530592, + "grad_norm": 1.6597174406051636, + "learning_rate": 3.862471038888138e-06, + "loss": 0.9418, + "step": 25519 + }, + { + "epoch": 0.9139255465826276, + "grad_norm": 1.508412480354309, + "learning_rate": 3.859279162849716e-06, + "loss": 1.0854, + "step": 25520 + }, + { + "epoch": 0.9139613587121959, + "grad_norm": 2.021669864654541, + "learning_rate": 3.856088580260697e-06, + "loss": 1.3237, + "step": 25521 + }, + { + "epoch": 0.9139971708417641, + "grad_norm": 1.3572282791137695, + "learning_rate": 3.8528992911639806e-06, + "loss": 0.9998, + "step": 25522 + }, + { + "epoch": 0.9140329829713324, + "grad_norm": 1.756705403327942, + "learning_rate": 3.8497112956024875e-06, + "loss": 0.9814, + "step": 25523 + }, + { + "epoch": 0.9140687951009007, + "grad_norm": 1.4218969345092773, + "learning_rate": 3.846524593619094e-06, + "loss": 1.1638, + "step": 25524 + }, + { + "epoch": 0.9141046072304689, + "grad_norm": 1.6461281776428223, + "learning_rate": 3.8433391852567e-06, + "loss": 1.3516, + "step": 25525 + }, + { + "epoch": 0.9141404193600372, + "grad_norm": 1.7552348375320435, + "learning_rate": 3.840155070558149e-06, + "loss": 1.0133, + "step": 25526 + }, + { + "epoch": 0.9141762314896056, + "grad_norm": 1.3230024576187134, + "learning_rate": 3.836972249566239e-06, + "loss": 0.933, + "step": 25527 + }, + { + "epoch": 0.9142120436191739, + "grad_norm": 1.4254008531570435, + "learning_rate": 3.83379072232386e-06, + "loss": 0.984, + "step": 25528 + }, + { + "epoch": 0.9142478557487421, + "grad_norm": 1.5067174434661865, + "learning_rate": 3.830610488873765e-06, + "loss": 0.8421, + "step": 25529 + }, + { + "epoch": 0.9142836678783104, + "grad_norm": 1.355363368988037, + "learning_rate": 3.827431549258764e-06, + "loss": 1.1839, + "step": 25530 + }, + { + "epoch": 0.9143194800078787, + "grad_norm": 1.4981523752212524, + "learning_rate": 3.824253903521602e-06, + "loss": 1.0938, + "step": 25531 + }, + { + "epoch": 0.9143552921374469, + "grad_norm": 1.6109570264816284, + "learning_rate": 3.821077551705065e-06, + "loss": 1.1652, + "step": 25532 + }, + { + "epoch": 0.9143911042670152, + "grad_norm": 1.4108879566192627, + "learning_rate": 3.817902493851877e-06, + "loss": 0.9114, + "step": 25533 + }, + { + "epoch": 0.9144269163965836, + "grad_norm": 1.5342044830322266, + "learning_rate": 3.814728730004724e-06, + "loss": 1.074, + "step": 25534 + }, + { + "epoch": 0.9144627285261518, + "grad_norm": 1.337587833404541, + "learning_rate": 3.811556260206328e-06, + "loss": 0.9924, + "step": 25535 + }, + { + "epoch": 0.9144985406557201, + "grad_norm": 1.5673117637634277, + "learning_rate": 3.808385084499366e-06, + "loss": 1.0228, + "step": 25536 + }, + { + "epoch": 0.9145343527852884, + "grad_norm": 1.5852550268173218, + "learning_rate": 3.8052152029265154e-06, + "loss": 1.0438, + "step": 25537 + }, + { + "epoch": 0.9145701649148567, + "grad_norm": 1.6909873485565186, + "learning_rate": 3.8020466155304078e-06, + "loss": 1.0068, + "step": 25538 + }, + { + "epoch": 0.9146059770444249, + "grad_norm": 1.5762073993682861, + "learning_rate": 3.798879322353666e-06, + "loss": 1.1223, + "step": 25539 + }, + { + "epoch": 0.9146417891739932, + "grad_norm": 1.5965516567230225, + "learning_rate": 3.7957133234389207e-06, + "loss": 1.181, + "step": 25540 + }, + { + "epoch": 0.9146776013035616, + "grad_norm": 1.372104287147522, + "learning_rate": 3.7925486188287727e-06, + "loss": 1.0792, + "step": 25541 + }, + { + "epoch": 0.9147134134331298, + "grad_norm": 1.542988657951355, + "learning_rate": 3.7893852085657657e-06, + "loss": 1.082, + "step": 25542 + }, + { + "epoch": 0.9147492255626981, + "grad_norm": 1.751253366470337, + "learning_rate": 3.786223092692476e-06, + "loss": 1.2669, + "step": 25543 + }, + { + "epoch": 0.9147850376922664, + "grad_norm": 1.2909069061279297, + "learning_rate": 3.7830622712514696e-06, + "loss": 1.0443, + "step": 25544 + }, + { + "epoch": 0.9148208498218346, + "grad_norm": 1.5423812866210938, + "learning_rate": 3.779902744285224e-06, + "loss": 0.9175, + "step": 25545 + }, + { + "epoch": 0.9148566619514029, + "grad_norm": 2.522254228591919, + "learning_rate": 3.7767445118362832e-06, + "loss": 0.9573, + "step": 25546 + }, + { + "epoch": 0.9148924740809712, + "grad_norm": 1.50576913356781, + "learning_rate": 3.7735875739471237e-06, + "loss": 1.0671, + "step": 25547 + }, + { + "epoch": 0.9149282862105395, + "grad_norm": 1.4419198036193848, + "learning_rate": 3.770431930660223e-06, + "loss": 1.1039, + "step": 25548 + }, + { + "epoch": 0.9149640983401078, + "grad_norm": 1.4082304239273071, + "learning_rate": 3.767277582018036e-06, + "loss": 0.9322, + "step": 25549 + }, + { + "epoch": 0.9149999104696761, + "grad_norm": 1.354452133178711, + "learning_rate": 3.7641245280629842e-06, + "loss": 1.0227, + "step": 25550 + }, + { + "epoch": 0.9150357225992444, + "grad_norm": 1.7315537929534912, + "learning_rate": 3.760972768837523e-06, + "loss": 1.0947, + "step": 25551 + }, + { + "epoch": 0.9150715347288126, + "grad_norm": 1.583865761756897, + "learning_rate": 3.757822304384018e-06, + "loss": 1.0312, + "step": 25552 + }, + { + "epoch": 0.9151073468583809, + "grad_norm": 1.5598018169403076, + "learning_rate": 3.7546731347448685e-06, + "loss": 1.1523, + "step": 25553 + }, + { + "epoch": 0.9151431589879492, + "grad_norm": 1.5561121702194214, + "learning_rate": 3.7515252599624516e-06, + "loss": 0.9632, + "step": 25554 + }, + { + "epoch": 0.9151789711175174, + "grad_norm": 1.4675041437149048, + "learning_rate": 3.748378680079112e-06, + "loss": 1.0567, + "step": 25555 + }, + { + "epoch": 0.9152147832470858, + "grad_norm": 1.2963206768035889, + "learning_rate": 3.745233395137182e-06, + "loss": 1.0414, + "step": 25556 + }, + { + "epoch": 0.9152505953766541, + "grad_norm": 1.4076783657073975, + "learning_rate": 3.7420894051789723e-06, + "loss": 0.9741, + "step": 25557 + }, + { + "epoch": 0.9152864075062224, + "grad_norm": 1.5264168977737427, + "learning_rate": 3.7389467102467823e-06, + "loss": 1.0462, + "step": 25558 + }, + { + "epoch": 0.9153222196357906, + "grad_norm": 1.7980393171310425, + "learning_rate": 3.7358053103829117e-06, + "loss": 1.0834, + "step": 25559 + }, + { + "epoch": 0.9153580317653589, + "grad_norm": 1.2250115871429443, + "learning_rate": 3.732665205629593e-06, + "loss": 0.9811, + "step": 25560 + }, + { + "epoch": 0.9153938438949272, + "grad_norm": 1.4016079902648926, + "learning_rate": 3.7295263960290927e-06, + "loss": 0.9711, + "step": 25561 + }, + { + "epoch": 0.9154296560244954, + "grad_norm": 1.272762656211853, + "learning_rate": 3.7263888816236435e-06, + "loss": 0.7463, + "step": 25562 + }, + { + "epoch": 0.9154654681540638, + "grad_norm": 1.3006627559661865, + "learning_rate": 3.7232526624554344e-06, + "loss": 1.1972, + "step": 25563 + }, + { + "epoch": 0.9155012802836321, + "grad_norm": 1.4391728639602661, + "learning_rate": 3.720117738566675e-06, + "loss": 0.9862, + "step": 25564 + }, + { + "epoch": 0.9155370924132004, + "grad_norm": 1.9002714157104492, + "learning_rate": 3.7169841099995438e-06, + "loss": 1.1758, + "step": 25565 + }, + { + "epoch": 0.9155729045427686, + "grad_norm": 1.6175462007522583, + "learning_rate": 3.7138517767961954e-06, + "loss": 0.9615, + "step": 25566 + }, + { + "epoch": 0.9156087166723369, + "grad_norm": 1.5549765825271606, + "learning_rate": 3.710720738998774e-06, + "loss": 1.1264, + "step": 25567 + }, + { + "epoch": 0.9156445288019052, + "grad_norm": 1.8901638984680176, + "learning_rate": 3.7075909966493903e-06, + "loss": 1.2131, + "step": 25568 + }, + { + "epoch": 0.9156803409314734, + "grad_norm": 1.4283682107925415, + "learning_rate": 3.7044625497901774e-06, + "loss": 0.9296, + "step": 25569 + }, + { + "epoch": 0.9157161530610418, + "grad_norm": 1.2582600116729736, + "learning_rate": 3.7013353984631906e-06, + "loss": 1.2141, + "step": 25570 + }, + { + "epoch": 0.9157519651906101, + "grad_norm": 1.643221139907837, + "learning_rate": 3.698209542710529e-06, + "loss": 1.1603, + "step": 25571 + }, + { + "epoch": 0.9157877773201784, + "grad_norm": 1.7443416118621826, + "learning_rate": 3.6950849825742375e-06, + "loss": 1.0901, + "step": 25572 + }, + { + "epoch": 0.9158235894497466, + "grad_norm": 1.9926950931549072, + "learning_rate": 3.6919617180963595e-06, + "loss": 1.2338, + "step": 25573 + }, + { + "epoch": 0.9158594015793149, + "grad_norm": 1.4687936305999756, + "learning_rate": 3.6888397493188954e-06, + "loss": 1.0991, + "step": 25574 + }, + { + "epoch": 0.9158952137088832, + "grad_norm": 1.4777400493621826, + "learning_rate": 3.685719076283867e-06, + "loss": 1.0535, + "step": 25575 + }, + { + "epoch": 0.9159310258384514, + "grad_norm": 1.3940317630767822, + "learning_rate": 3.68259969903324e-06, + "loss": 1.029, + "step": 25576 + }, + { + "epoch": 0.9159668379680198, + "grad_norm": 1.1971668004989624, + "learning_rate": 3.6794816176090152e-06, + "loss": 0.9869, + "step": 25577 + }, + { + "epoch": 0.9160026500975881, + "grad_norm": 1.802567481994629, + "learning_rate": 3.676364832053103e-06, + "loss": 1.0013, + "step": 25578 + }, + { + "epoch": 0.9160384622271563, + "grad_norm": 1.3311636447906494, + "learning_rate": 3.6732493424074587e-06, + "loss": 0.9242, + "step": 25579 + }, + { + "epoch": 0.9160742743567246, + "grad_norm": 1.4174261093139648, + "learning_rate": 3.6701351487140046e-06, + "loss": 1.2285, + "step": 25580 + }, + { + "epoch": 0.9161100864862929, + "grad_norm": 1.8558553457260132, + "learning_rate": 3.667022251014607e-06, + "loss": 0.9679, + "step": 25581 + }, + { + "epoch": 0.9161458986158612, + "grad_norm": 1.444717288017273, + "learning_rate": 3.6639106493511766e-06, + "loss": 0.9149, + "step": 25582 + }, + { + "epoch": 0.9161817107454294, + "grad_norm": 1.601986289024353, + "learning_rate": 3.660800343765547e-06, + "loss": 1.1061, + "step": 25583 + }, + { + "epoch": 0.9162175228749978, + "grad_norm": 1.3253763914108276, + "learning_rate": 3.657691334299607e-06, + "loss": 1.2034, + "step": 25584 + }, + { + "epoch": 0.9162533350045661, + "grad_norm": 1.4478404521942139, + "learning_rate": 3.6545836209951333e-06, + "loss": 1.0816, + "step": 25585 + }, + { + "epoch": 0.9162891471341343, + "grad_norm": 1.8103843927383423, + "learning_rate": 3.6514772038939714e-06, + "loss": 1.1152, + "step": 25586 + }, + { + "epoch": 0.9163249592637026, + "grad_norm": 1.868084192276001, + "learning_rate": 3.64837208303791e-06, + "loss": 1.2363, + "step": 25587 + }, + { + "epoch": 0.9163607713932709, + "grad_norm": 1.2139477729797363, + "learning_rate": 3.6452682584687035e-06, + "loss": 0.8587, + "step": 25588 + }, + { + "epoch": 0.9163965835228391, + "grad_norm": 1.489137887954712, + "learning_rate": 3.642165730228131e-06, + "loss": 1.0783, + "step": 25589 + }, + { + "epoch": 0.9164323956524074, + "grad_norm": 1.420179843902588, + "learning_rate": 3.6390644983579135e-06, + "loss": 1.0889, + "step": 25590 + }, + { + "epoch": 0.9164682077819758, + "grad_norm": 1.6557331085205078, + "learning_rate": 3.6359645628998073e-06, + "loss": 1.1733, + "step": 25591 + }, + { + "epoch": 0.9165040199115441, + "grad_norm": 1.3771649599075317, + "learning_rate": 3.6328659238954897e-06, + "loss": 0.9716, + "step": 25592 + }, + { + "epoch": 0.9165398320411123, + "grad_norm": 1.3851182460784912, + "learning_rate": 3.62976858138665e-06, + "loss": 1.0354, + "step": 25593 + }, + { + "epoch": 0.9165756441706806, + "grad_norm": 1.6184114217758179, + "learning_rate": 3.6266725354149656e-06, + "loss": 1.2914, + "step": 25594 + }, + { + "epoch": 0.9166114563002489, + "grad_norm": 1.5229618549346924, + "learning_rate": 3.6235777860221033e-06, + "loss": 1.1055, + "step": 25595 + }, + { + "epoch": 0.9166472684298171, + "grad_norm": 1.3835316896438599, + "learning_rate": 3.620484333249674e-06, + "loss": 1.0533, + "step": 25596 + }, + { + "epoch": 0.9166830805593854, + "grad_norm": 1.8822154998779297, + "learning_rate": 3.6173921771393003e-06, + "loss": 1.1704, + "step": 25597 + }, + { + "epoch": 0.9167188926889538, + "grad_norm": 1.501899242401123, + "learning_rate": 3.6143013177326046e-06, + "loss": 1.1633, + "step": 25598 + }, + { + "epoch": 0.9167547048185221, + "grad_norm": 1.3372986316680908, + "learning_rate": 3.611211755071142e-06, + "loss": 1.0662, + "step": 25599 + }, + { + "epoch": 0.9167905169480903, + "grad_norm": 2.309089183807373, + "learning_rate": 3.608123489196502e-06, + "loss": 0.9385, + "step": 25600 + }, + { + "epoch": 0.9168263290776586, + "grad_norm": 1.5997042655944824, + "learning_rate": 3.605036520150218e-06, + "loss": 1.0526, + "step": 25601 + }, + { + "epoch": 0.9168621412072269, + "grad_norm": 1.3907017707824707, + "learning_rate": 3.601950847973845e-06, + "loss": 1.1306, + "step": 25602 + }, + { + "epoch": 0.9168979533367951, + "grad_norm": 1.3324650526046753, + "learning_rate": 3.598866472708862e-06, + "loss": 1.0298, + "step": 25603 + }, + { + "epoch": 0.9169337654663634, + "grad_norm": 1.325565218925476, + "learning_rate": 3.595783394396779e-06, + "loss": 1.1232, + "step": 25604 + }, + { + "epoch": 0.9169695775959318, + "grad_norm": 1.5357362031936646, + "learning_rate": 3.592701613079097e-06, + "loss": 0.8646, + "step": 25605 + }, + { + "epoch": 0.9170053897255, + "grad_norm": 1.4979199171066284, + "learning_rate": 3.5896211287972383e-06, + "loss": 1.0239, + "step": 25606 + }, + { + "epoch": 0.9170412018550683, + "grad_norm": 1.6146589517593384, + "learning_rate": 3.5865419415926803e-06, + "loss": 1.2003, + "step": 25607 + }, + { + "epoch": 0.9170770139846366, + "grad_norm": 1.5172877311706543, + "learning_rate": 3.583464051506813e-06, + "loss": 1.0543, + "step": 25608 + }, + { + "epoch": 0.9171128261142049, + "grad_norm": 1.650613784790039, + "learning_rate": 3.5803874585811024e-06, + "loss": 1.0667, + "step": 25609 + }, + { + "epoch": 0.9171486382437731, + "grad_norm": 1.244045376777649, + "learning_rate": 3.577312162856883e-06, + "loss": 1.0224, + "step": 25610 + }, + { + "epoch": 0.9171844503733414, + "grad_norm": 1.2890578508377075, + "learning_rate": 3.574238164375554e-06, + "loss": 1.088, + "step": 25611 + }, + { + "epoch": 0.9172202625029098, + "grad_norm": 1.3574227094650269, + "learning_rate": 3.571165463178472e-06, + "loss": 1.1001, + "step": 25612 + }, + { + "epoch": 0.917256074632478, + "grad_norm": 1.257443904876709, + "learning_rate": 3.568094059306981e-06, + "loss": 1.0967, + "step": 25613 + }, + { + "epoch": 0.9172918867620463, + "grad_norm": 2.019460678100586, + "learning_rate": 3.5650239528024043e-06, + "loss": 1.0276, + "step": 25614 + }, + { + "epoch": 0.9173276988916146, + "grad_norm": 1.2325319051742554, + "learning_rate": 3.5619551437060083e-06, + "loss": 1.1887, + "step": 25615 + }, + { + "epoch": 0.9173635110211829, + "grad_norm": 1.6986786127090454, + "learning_rate": 3.558887632059138e-06, + "loss": 0.7877, + "step": 25616 + }, + { + "epoch": 0.9173993231507511, + "grad_norm": 1.2036159038543701, + "learning_rate": 3.555821417903027e-06, + "loss": 0.8712, + "step": 25617 + }, + { + "epoch": 0.9174351352803194, + "grad_norm": 1.6408679485321045, + "learning_rate": 3.552756501278931e-06, + "loss": 1.008, + "step": 25618 + }, + { + "epoch": 0.9174709474098878, + "grad_norm": 1.7167881727218628, + "learning_rate": 3.549692882228084e-06, + "loss": 1.1881, + "step": 25619 + }, + { + "epoch": 0.917506759539456, + "grad_norm": 1.4505524635314941, + "learning_rate": 3.5466305607917195e-06, + "loss": 0.9, + "step": 25620 + }, + { + "epoch": 0.9175425716690243, + "grad_norm": 1.3343507051467896, + "learning_rate": 3.5435695370110154e-06, + "loss": 1.1201, + "step": 25621 + }, + { + "epoch": 0.9175783837985926, + "grad_norm": 1.583277702331543, + "learning_rate": 3.540509810927173e-06, + "loss": 1.1212, + "step": 25622 + }, + { + "epoch": 0.9176141959281608, + "grad_norm": 1.5701621770858765, + "learning_rate": 3.537451382581336e-06, + "loss": 0.9842, + "step": 25623 + }, + { + "epoch": 0.9176500080577291, + "grad_norm": 1.5388827323913574, + "learning_rate": 3.534394252014661e-06, + "loss": 1.1458, + "step": 25624 + }, + { + "epoch": 0.9176858201872974, + "grad_norm": 1.548045039176941, + "learning_rate": 3.531338419268293e-06, + "loss": 1.261, + "step": 25625 + }, + { + "epoch": 0.9177216323168658, + "grad_norm": 1.7750874757766724, + "learning_rate": 3.52828388438331e-06, + "loss": 0.925, + "step": 25626 + }, + { + "epoch": 0.917757444446434, + "grad_norm": 1.3595370054244995, + "learning_rate": 3.5252306474008457e-06, + "loss": 1.0291, + "step": 25627 + }, + { + "epoch": 0.9177932565760023, + "grad_norm": 1.4877870082855225, + "learning_rate": 3.522178708361956e-06, + "loss": 1.0744, + "step": 25628 + }, + { + "epoch": 0.9178290687055706, + "grad_norm": 1.266007423400879, + "learning_rate": 3.5191280673077086e-06, + "loss": 0.9857, + "step": 25629 + }, + { + "epoch": 0.9178648808351388, + "grad_norm": 1.4129140377044678, + "learning_rate": 3.516078724279137e-06, + "loss": 0.837, + "step": 25630 + }, + { + "epoch": 0.9179006929647071, + "grad_norm": 1.661457896232605, + "learning_rate": 3.513030679317264e-06, + "loss": 0.9974, + "step": 25631 + }, + { + "epoch": 0.9179365050942754, + "grad_norm": 1.5656306743621826, + "learning_rate": 3.5099839324631233e-06, + "loss": 1.0947, + "step": 25632 + }, + { + "epoch": 0.9179723172238438, + "grad_norm": 1.9817748069763184, + "learning_rate": 3.506938483757671e-06, + "loss": 1.0126, + "step": 25633 + }, + { + "epoch": 0.918008129353412, + "grad_norm": 1.8226535320281982, + "learning_rate": 3.503894333241886e-06, + "loss": 0.9837, + "step": 25634 + }, + { + "epoch": 0.9180439414829803, + "grad_norm": 1.5166817903518677, + "learning_rate": 3.500851480956746e-06, + "loss": 0.8566, + "step": 25635 + }, + { + "epoch": 0.9180797536125486, + "grad_norm": 2.0881829261779785, + "learning_rate": 3.497809926943174e-06, + "loss": 1.1638, + "step": 25636 + }, + { + "epoch": 0.9181155657421168, + "grad_norm": 1.6350363492965698, + "learning_rate": 3.4947696712420708e-06, + "loss": 1.1287, + "step": 25637 + }, + { + "epoch": 0.9181513778716851, + "grad_norm": 1.3709779977798462, + "learning_rate": 3.491730713894381e-06, + "loss": 1.0173, + "step": 25638 + }, + { + "epoch": 0.9181871900012534, + "grad_norm": 1.401679515838623, + "learning_rate": 3.4886930549409724e-06, + "loss": 1.0589, + "step": 25639 + }, + { + "epoch": 0.9182230021308218, + "grad_norm": 1.291754126548767, + "learning_rate": 3.485656694422701e-06, + "loss": 0.9004, + "step": 25640 + }, + { + "epoch": 0.91825881426039, + "grad_norm": 1.4792320728302002, + "learning_rate": 3.482621632380412e-06, + "loss": 1.0747, + "step": 25641 + }, + { + "epoch": 0.9182946263899583, + "grad_norm": 1.2384454011917114, + "learning_rate": 3.479587868854961e-06, + "loss": 1.1438, + "step": 25642 + }, + { + "epoch": 0.9183304385195266, + "grad_norm": 1.634453535079956, + "learning_rate": 3.4765554038871607e-06, + "loss": 0.9243, + "step": 25643 + }, + { + "epoch": 0.9183662506490948, + "grad_norm": 1.3630239963531494, + "learning_rate": 3.4735242375177777e-06, + "loss": 1.1885, + "step": 25644 + }, + { + "epoch": 0.9184020627786631, + "grad_norm": 1.385196328163147, + "learning_rate": 3.470494369787636e-06, + "loss": 1.2044, + "step": 25645 + }, + { + "epoch": 0.9184378749082314, + "grad_norm": 1.8806205987930298, + "learning_rate": 3.4674658007374683e-06, + "loss": 1.0712, + "step": 25646 + }, + { + "epoch": 0.9184736870377997, + "grad_norm": 1.438737392425537, + "learning_rate": 3.464438530408043e-06, + "loss": 1.2095, + "step": 25647 + }, + { + "epoch": 0.918509499167368, + "grad_norm": 1.1689541339874268, + "learning_rate": 3.46141255884006e-06, + "loss": 0.8068, + "step": 25648 + }, + { + "epoch": 0.9185453112969363, + "grad_norm": 1.485572338104248, + "learning_rate": 3.4583878860742434e-06, + "loss": 1.1877, + "step": 25649 + }, + { + "epoch": 0.9185811234265046, + "grad_norm": 1.831251859664917, + "learning_rate": 3.4553645121513046e-06, + "loss": 1.2817, + "step": 25650 + }, + { + "epoch": 0.9186169355560728, + "grad_norm": 1.3412220478057861, + "learning_rate": 3.4523424371118885e-06, + "loss": 1.0648, + "step": 25651 + }, + { + "epoch": 0.9186527476856411, + "grad_norm": 2.05161452293396, + "learning_rate": 3.449321660996674e-06, + "loss": 1.1326, + "step": 25652 + }, + { + "epoch": 0.9186885598152094, + "grad_norm": 1.4340107440948486, + "learning_rate": 3.446302183846295e-06, + "loss": 0.8986, + "step": 25653 + }, + { + "epoch": 0.9187243719447777, + "grad_norm": 1.6641572713851929, + "learning_rate": 3.443284005701375e-06, + "loss": 1.1712, + "step": 25654 + }, + { + "epoch": 0.918760184074346, + "grad_norm": 1.1070475578308105, + "learning_rate": 3.4402671266025253e-06, + "loss": 0.9794, + "step": 25655 + }, + { + "epoch": 0.9187959962039143, + "grad_norm": 1.377832055091858, + "learning_rate": 3.4372515465903145e-06, + "loss": 1.1069, + "step": 25656 + }, + { + "epoch": 0.9188318083334825, + "grad_norm": 1.2413325309753418, + "learning_rate": 3.434237265705342e-06, + "loss": 0.9479, + "step": 25657 + }, + { + "epoch": 0.9188676204630508, + "grad_norm": 1.4297599792480469, + "learning_rate": 3.4312242839881325e-06, + "loss": 0.8472, + "step": 25658 + }, + { + "epoch": 0.9189034325926191, + "grad_norm": 1.6050158739089966, + "learning_rate": 3.4282126014792414e-06, + "loss": 1.224, + "step": 25659 + }, + { + "epoch": 0.9189392447221874, + "grad_norm": 1.463304042816162, + "learning_rate": 3.4252022182191813e-06, + "loss": 1.053, + "step": 25660 + }, + { + "epoch": 0.9189750568517557, + "grad_norm": 1.8231607675552368, + "learning_rate": 3.4221931342484525e-06, + "loss": 1.2897, + "step": 25661 + }, + { + "epoch": 0.919010868981324, + "grad_norm": 1.3967816829681396, + "learning_rate": 3.4191853496075343e-06, + "loss": 1.0952, + "step": 25662 + }, + { + "epoch": 0.9190466811108923, + "grad_norm": 1.4565656185150146, + "learning_rate": 3.4161788643369052e-06, + "loss": 1.0291, + "step": 25663 + }, + { + "epoch": 0.9190824932404605, + "grad_norm": 1.9334125518798828, + "learning_rate": 3.4131736784769996e-06, + "loss": 1.1565, + "step": 25664 + }, + { + "epoch": 0.9191183053700288, + "grad_norm": 1.3396896123886108, + "learning_rate": 3.410169792068263e-06, + "loss": 0.9511, + "step": 25665 + }, + { + "epoch": 0.9191541174995971, + "grad_norm": 1.2672548294067383, + "learning_rate": 3.407167205151085e-06, + "loss": 0.8687, + "step": 25666 + }, + { + "epoch": 0.9191899296291653, + "grad_norm": 1.6281830072402954, + "learning_rate": 3.404165917765889e-06, + "loss": 1.166, + "step": 25667 + }, + { + "epoch": 0.9192257417587337, + "grad_norm": 1.43857741355896, + "learning_rate": 3.401165929953043e-06, + "loss": 1.0441, + "step": 25668 + }, + { + "epoch": 0.919261553888302, + "grad_norm": 1.5712798833847046, + "learning_rate": 3.398167241752892e-06, + "loss": 1.1368, + "step": 25669 + }, + { + "epoch": 0.9192973660178703, + "grad_norm": 1.3871856927871704, + "learning_rate": 3.395169853205793e-06, + "loss": 1.2044, + "step": 25670 + }, + { + "epoch": 0.9193331781474385, + "grad_norm": 1.6265074014663696, + "learning_rate": 3.3921737643520803e-06, + "loss": 1.1298, + "step": 25671 + }, + { + "epoch": 0.9193689902770068, + "grad_norm": 1.3713231086730957, + "learning_rate": 3.3891789752320656e-06, + "loss": 1.0458, + "step": 25672 + }, + { + "epoch": 0.9194048024065751, + "grad_norm": 1.6605684757232666, + "learning_rate": 3.3861854858860177e-06, + "loss": 1.16, + "step": 25673 + }, + { + "epoch": 0.9194406145361433, + "grad_norm": 1.4704418182373047, + "learning_rate": 3.3831932963542147e-06, + "loss": 1.0658, + "step": 25674 + }, + { + "epoch": 0.9194764266657117, + "grad_norm": 1.452515959739685, + "learning_rate": 3.3802024066769355e-06, + "loss": 1.0913, + "step": 25675 + }, + { + "epoch": 0.91951223879528, + "grad_norm": 1.8990784883499146, + "learning_rate": 3.3772128168943816e-06, + "loss": 1.2682, + "step": 25676 + }, + { + "epoch": 0.9195480509248483, + "grad_norm": 1.9830585718154907, + "learning_rate": 3.37422452704681e-06, + "loss": 1.1822, + "step": 25677 + }, + { + "epoch": 0.9195838630544165, + "grad_norm": 1.5351520776748657, + "learning_rate": 3.3712375371743987e-06, + "loss": 1.1516, + "step": 25678 + }, + { + "epoch": 0.9196196751839848, + "grad_norm": 2.0260140895843506, + "learning_rate": 3.3682518473173607e-06, + "loss": 0.9916, + "step": 25679 + }, + { + "epoch": 0.9196554873135531, + "grad_norm": 1.8554943799972534, + "learning_rate": 3.3652674575158306e-06, + "loss": 0.9568, + "step": 25680 + }, + { + "epoch": 0.9196912994431213, + "grad_norm": 1.6142916679382324, + "learning_rate": 3.362284367809976e-06, + "loss": 1.0126, + "step": 25681 + }, + { + "epoch": 0.9197271115726897, + "grad_norm": 2.043877124786377, + "learning_rate": 3.3593025782399424e-06, + "loss": 0.987, + "step": 25682 + }, + { + "epoch": 0.919762923702258, + "grad_norm": 1.7472487688064575, + "learning_rate": 3.3563220888458425e-06, + "loss": 0.9974, + "step": 25683 + }, + { + "epoch": 0.9197987358318263, + "grad_norm": 1.624517798423767, + "learning_rate": 3.353342899667755e-06, + "loss": 0.9046, + "step": 25684 + }, + { + "epoch": 0.9198345479613945, + "grad_norm": 1.7205263376235962, + "learning_rate": 3.3503650107457706e-06, + "loss": 1.3659, + "step": 25685 + }, + { + "epoch": 0.9198703600909628, + "grad_norm": 1.687418818473816, + "learning_rate": 3.347388422119968e-06, + "loss": 1.0688, + "step": 25686 + }, + { + "epoch": 0.9199061722205311, + "grad_norm": 1.5063282251358032, + "learning_rate": 3.3444131338303708e-06, + "loss": 0.8948, + "step": 25687 + }, + { + "epoch": 0.9199419843500993, + "grad_norm": 1.536651611328125, + "learning_rate": 3.3414391459170134e-06, + "loss": 1.2141, + "step": 25688 + }, + { + "epoch": 0.9199777964796677, + "grad_norm": 1.4916484355926514, + "learning_rate": 3.33846645841992e-06, + "loss": 1.0357, + "step": 25689 + }, + { + "epoch": 0.920013608609236, + "grad_norm": 1.1869373321533203, + "learning_rate": 3.33549507137908e-06, + "loss": 1.0651, + "step": 25690 + }, + { + "epoch": 0.9200494207388042, + "grad_norm": 1.3034563064575195, + "learning_rate": 3.332524984834462e-06, + "loss": 1.1362, + "step": 25691 + }, + { + "epoch": 0.9200852328683725, + "grad_norm": 1.4523378610610962, + "learning_rate": 3.3295561988260227e-06, + "loss": 1.248, + "step": 25692 + }, + { + "epoch": 0.9201210449979408, + "grad_norm": 1.8363702297210693, + "learning_rate": 3.326588713393719e-06, + "loss": 1.0703, + "step": 25693 + }, + { + "epoch": 0.920156857127509, + "grad_norm": 1.3970692157745361, + "learning_rate": 3.3236225285774637e-06, + "loss": 1.0317, + "step": 25694 + }, + { + "epoch": 0.9201926692570773, + "grad_norm": 1.491361141204834, + "learning_rate": 3.3206576444171577e-06, + "loss": 0.9113, + "step": 25695 + }, + { + "epoch": 0.9202284813866457, + "grad_norm": 1.803730845451355, + "learning_rate": 3.317694060952692e-06, + "loss": 0.9095, + "step": 25696 + }, + { + "epoch": 0.920264293516214, + "grad_norm": 1.282680630683899, + "learning_rate": 3.314731778223956e-06, + "loss": 1.0944, + "step": 25697 + }, + { + "epoch": 0.9203001056457822, + "grad_norm": 1.5170722007751465, + "learning_rate": 3.3117707962707746e-06, + "loss": 1.2148, + "step": 25698 + }, + { + "epoch": 0.9203359177753505, + "grad_norm": 1.227919340133667, + "learning_rate": 3.308811115133004e-06, + "loss": 1.1383, + "step": 25699 + }, + { + "epoch": 0.9203717299049188, + "grad_norm": 1.5444494485855103, + "learning_rate": 3.3058527348504455e-06, + "loss": 0.9898, + "step": 25700 + }, + { + "epoch": 0.920407542034487, + "grad_norm": 1.5867087841033936, + "learning_rate": 3.302895655462934e-06, + "loss": 1.2112, + "step": 25701 + }, + { + "epoch": 0.9204433541640553, + "grad_norm": 1.6487762928009033, + "learning_rate": 3.2999398770102276e-06, + "loss": 0.9431, + "step": 25702 + }, + { + "epoch": 0.9204791662936237, + "grad_norm": 1.4543073177337646, + "learning_rate": 3.296985399532071e-06, + "loss": 1.0762, + "step": 25703 + }, + { + "epoch": 0.920514978423192, + "grad_norm": 1.3759856224060059, + "learning_rate": 3.2940322230682664e-06, + "loss": 0.9817, + "step": 25704 + }, + { + "epoch": 0.9205507905527602, + "grad_norm": 1.3340916633605957, + "learning_rate": 3.291080347658504e-06, + "loss": 0.9865, + "step": 25705 + }, + { + "epoch": 0.9205866026823285, + "grad_norm": 1.4685392379760742, + "learning_rate": 3.2881297733425188e-06, + "loss": 1.0387, + "step": 25706 + }, + { + "epoch": 0.9206224148118968, + "grad_norm": 1.3716503381729126, + "learning_rate": 3.285180500159979e-06, + "loss": 1.1219, + "step": 25707 + }, + { + "epoch": 0.920658226941465, + "grad_norm": 1.474511981010437, + "learning_rate": 3.2822325281505973e-06, + "loss": 1.0699, + "step": 25708 + }, + { + "epoch": 0.9206940390710333, + "grad_norm": 1.6127901077270508, + "learning_rate": 3.27928585735402e-06, + "loss": 1.2768, + "step": 25709 + }, + { + "epoch": 0.9207298512006017, + "grad_norm": 1.5921454429626465, + "learning_rate": 3.2763404878098815e-06, + "loss": 1.1209, + "step": 25710 + }, + { + "epoch": 0.92076566333017, + "grad_norm": 1.5217379331588745, + "learning_rate": 3.273396419557839e-06, + "loss": 0.9337, + "step": 25711 + }, + { + "epoch": 0.9208014754597382, + "grad_norm": 1.2577543258666992, + "learning_rate": 3.2704536526374506e-06, + "loss": 0.9795, + "step": 25712 + }, + { + "epoch": 0.9208372875893065, + "grad_norm": 2.0576651096343994, + "learning_rate": 3.267512187088362e-06, + "loss": 1.2057, + "step": 25713 + }, + { + "epoch": 0.9208730997188748, + "grad_norm": 1.5903769731521606, + "learning_rate": 3.2645720229500965e-06, + "loss": 1.2668, + "step": 25714 + }, + { + "epoch": 0.920908911848443, + "grad_norm": 1.3086804151535034, + "learning_rate": 3.2616331602622565e-06, + "loss": 1.1213, + "step": 25715 + }, + { + "epoch": 0.9209447239780113, + "grad_norm": 1.536529302597046, + "learning_rate": 3.2586955990643432e-06, + "loss": 1.1651, + "step": 25716 + }, + { + "epoch": 0.9209805361075797, + "grad_norm": 1.6697816848754883, + "learning_rate": 3.255759339395903e-06, + "loss": 1.1276, + "step": 25717 + }, + { + "epoch": 0.921016348237148, + "grad_norm": 1.3674176931381226, + "learning_rate": 3.2528243812964156e-06, + "loss": 1.1532, + "step": 25718 + }, + { + "epoch": 0.9210521603667162, + "grad_norm": 1.2274316549301147, + "learning_rate": 3.2498907248054045e-06, + "loss": 1.2206, + "step": 25719 + }, + { + "epoch": 0.9210879724962845, + "grad_norm": 1.395978331565857, + "learning_rate": 3.2469583699623053e-06, + "loss": 1.1344, + "step": 25720 + }, + { + "epoch": 0.9211237846258528, + "grad_norm": 1.5270698070526123, + "learning_rate": 3.2440273168065636e-06, + "loss": 0.9081, + "step": 25721 + }, + { + "epoch": 0.921159596755421, + "grad_norm": 1.6146184206008911, + "learning_rate": 3.241097565377649e-06, + "loss": 1.0802, + "step": 25722 + }, + { + "epoch": 0.9211954088849893, + "grad_norm": 1.5193604230880737, + "learning_rate": 3.2381691157149395e-06, + "loss": 1.101, + "step": 25723 + }, + { + "epoch": 0.9212312210145577, + "grad_norm": 1.8421059846878052, + "learning_rate": 3.2352419678578714e-06, + "loss": 1.0567, + "step": 25724 + }, + { + "epoch": 0.921267033144126, + "grad_norm": 1.4230427742004395, + "learning_rate": 3.2323161218457796e-06, + "loss": 1.1986, + "step": 25725 + }, + { + "epoch": 0.9213028452736942, + "grad_norm": 1.3642396926879883, + "learning_rate": 3.229391577718066e-06, + "loss": 0.8957, + "step": 25726 + }, + { + "epoch": 0.9213386574032625, + "grad_norm": 1.6792056560516357, + "learning_rate": 3.226468335514077e-06, + "loss": 1.1302, + "step": 25727 + }, + { + "epoch": 0.9213744695328308, + "grad_norm": 1.8063493967056274, + "learning_rate": 3.223546395273114e-06, + "loss": 1.0536, + "step": 25728 + }, + { + "epoch": 0.921410281662399, + "grad_norm": 1.5402579307556152, + "learning_rate": 3.220625757034501e-06, + "loss": 1.145, + "step": 25729 + }, + { + "epoch": 0.9214460937919673, + "grad_norm": 1.5312745571136475, + "learning_rate": 3.2177064208375298e-06, + "loss": 1.1931, + "step": 25730 + }, + { + "epoch": 0.9214819059215357, + "grad_norm": 1.7606489658355713, + "learning_rate": 3.21478838672149e-06, + "loss": 1.3295, + "step": 25731 + }, + { + "epoch": 0.9215177180511039, + "grad_norm": 2.237588405609131, + "learning_rate": 3.211871654725618e-06, + "loss": 1.2498, + "step": 25732 + }, + { + "epoch": 0.9215535301806722, + "grad_norm": 1.2661391496658325, + "learning_rate": 3.208956224889159e-06, + "loss": 0.9647, + "step": 25733 + }, + { + "epoch": 0.9215893423102405, + "grad_norm": 1.4739757776260376, + "learning_rate": 3.2060420972513494e-06, + "loss": 1.0094, + "step": 25734 + }, + { + "epoch": 0.9216251544398087, + "grad_norm": 1.4326437711715698, + "learning_rate": 3.203129271851402e-06, + "loss": 1.0771, + "step": 25735 + }, + { + "epoch": 0.921660966569377, + "grad_norm": 1.3930391073226929, + "learning_rate": 3.2002177487284736e-06, + "loss": 1.0443, + "step": 25736 + }, + { + "epoch": 0.9216967786989453, + "grad_norm": 1.473949909210205, + "learning_rate": 3.197307527921756e-06, + "loss": 1.1453, + "step": 25737 + }, + { + "epoch": 0.9217325908285137, + "grad_norm": 1.3870798349380493, + "learning_rate": 3.194398609470406e-06, + "loss": 1.135, + "step": 25738 + }, + { + "epoch": 0.9217684029580819, + "grad_norm": 1.34653902053833, + "learning_rate": 3.1914909934135483e-06, + "loss": 0.8971, + "step": 25739 + }, + { + "epoch": 0.9218042150876502, + "grad_norm": 1.7730484008789062, + "learning_rate": 3.1885846797902964e-06, + "loss": 1.0361, + "step": 25740 + }, + { + "epoch": 0.9218400272172185, + "grad_norm": 1.5039136409759521, + "learning_rate": 3.185679668639763e-06, + "loss": 1.0556, + "step": 25741 + }, + { + "epoch": 0.9218758393467867, + "grad_norm": 1.4422600269317627, + "learning_rate": 3.1827759600010498e-06, + "loss": 1.2077, + "step": 25742 + }, + { + "epoch": 0.921911651476355, + "grad_norm": 1.3889329433441162, + "learning_rate": 3.179873553913171e-06, + "loss": 1.043, + "step": 25743 + }, + { + "epoch": 0.9219474636059233, + "grad_norm": 1.515858769416809, + "learning_rate": 3.1769724504152164e-06, + "loss": 0.911, + "step": 25744 + }, + { + "epoch": 0.9219832757354917, + "grad_norm": 1.3082762956619263, + "learning_rate": 3.1740726495462223e-06, + "loss": 1.0428, + "step": 25745 + }, + { + "epoch": 0.9220190878650599, + "grad_norm": 1.419449806213379, + "learning_rate": 3.1711741513451576e-06, + "loss": 1.0282, + "step": 25746 + }, + { + "epoch": 0.9220548999946282, + "grad_norm": 1.393980860710144, + "learning_rate": 3.1682769558510574e-06, + "loss": 1.2163, + "step": 25747 + }, + { + "epoch": 0.9220907121241965, + "grad_norm": 1.553442358970642, + "learning_rate": 3.165381063102879e-06, + "loss": 1.0035, + "step": 25748 + }, + { + "epoch": 0.9221265242537647, + "grad_norm": 1.3608770370483398, + "learning_rate": 3.162486473139603e-06, + "loss": 0.9771, + "step": 25749 + }, + { + "epoch": 0.922162336383333, + "grad_norm": 1.3694243431091309, + "learning_rate": 3.1595931860001536e-06, + "loss": 0.9409, + "step": 25750 + }, + { + "epoch": 0.9221981485129013, + "grad_norm": 1.3019042015075684, + "learning_rate": 3.1567012017234553e-06, + "loss": 1.0458, + "step": 25751 + }, + { + "epoch": 0.9222339606424697, + "grad_norm": 1.7426961660385132, + "learning_rate": 3.1538105203484323e-06, + "loss": 1.0727, + "step": 25752 + }, + { + "epoch": 0.9222697727720379, + "grad_norm": 1.3767911195755005, + "learning_rate": 3.150921141913965e-06, + "loss": 1.1865, + "step": 25753 + }, + { + "epoch": 0.9223055849016062, + "grad_norm": 1.5722861289978027, + "learning_rate": 3.148033066458933e-06, + "loss": 1.4636, + "step": 25754 + }, + { + "epoch": 0.9223413970311745, + "grad_norm": 1.3141342401504517, + "learning_rate": 3.145146294022172e-06, + "loss": 0.9592, + "step": 25755 + }, + { + "epoch": 0.9223772091607427, + "grad_norm": 1.4248207807540894, + "learning_rate": 3.1422608246425513e-06, + "loss": 1.0104, + "step": 25756 + }, + { + "epoch": 0.922413021290311, + "grad_norm": 1.3366711139678955, + "learning_rate": 3.1393766583588614e-06, + "loss": 1.2068, + "step": 25757 + }, + { + "epoch": 0.9224488334198793, + "grad_norm": 1.7053807973861694, + "learning_rate": 3.136493795209916e-06, + "loss": 1.1848, + "step": 25758 + }, + { + "epoch": 0.9224846455494476, + "grad_norm": 1.636690378189087, + "learning_rate": 3.1336122352345065e-06, + "loss": 1.0935, + "step": 25759 + }, + { + "epoch": 0.9225204576790159, + "grad_norm": 1.4296176433563232, + "learning_rate": 3.130731978471402e-06, + "loss": 0.9439, + "step": 25760 + }, + { + "epoch": 0.9225562698085842, + "grad_norm": 1.4933507442474365, + "learning_rate": 3.1278530249593372e-06, + "loss": 1.0805, + "step": 25761 + }, + { + "epoch": 0.9225920819381525, + "grad_norm": 1.3514254093170166, + "learning_rate": 3.124975374737049e-06, + "loss": 0.8855, + "step": 25762 + }, + { + "epoch": 0.9226278940677207, + "grad_norm": 1.6246765851974487, + "learning_rate": 3.1220990278432727e-06, + "loss": 0.9851, + "step": 25763 + }, + { + "epoch": 0.922663706197289, + "grad_norm": 1.2411177158355713, + "learning_rate": 3.119223984316677e-06, + "loss": 1.1389, + "step": 25764 + }, + { + "epoch": 0.9226995183268573, + "grad_norm": 1.4288568496704102, + "learning_rate": 3.1163502441959647e-06, + "loss": 0.8559, + "step": 25765 + }, + { + "epoch": 0.9227353304564256, + "grad_norm": 1.6317857503890991, + "learning_rate": 3.113477807519782e-06, + "loss": 1.1539, + "step": 25766 + }, + { + "epoch": 0.9227711425859939, + "grad_norm": 1.5563701391220093, + "learning_rate": 3.110606674326788e-06, + "loss": 1.2459, + "step": 25767 + }, + { + "epoch": 0.9228069547155622, + "grad_norm": 1.790894865989685, + "learning_rate": 3.1077368446555956e-06, + "loss": 1.0753, + "step": 25768 + }, + { + "epoch": 0.9228427668451304, + "grad_norm": 1.5234198570251465, + "learning_rate": 3.104868318544818e-06, + "loss": 1.0355, + "step": 25769 + }, + { + "epoch": 0.9228785789746987, + "grad_norm": 1.2434275150299072, + "learning_rate": 3.1020010960330583e-06, + "loss": 1.2345, + "step": 25770 + }, + { + "epoch": 0.922914391104267, + "grad_norm": 1.6628819704055786, + "learning_rate": 3.0991351771588963e-06, + "loss": 1.0039, + "step": 25771 + }, + { + "epoch": 0.9229502032338353, + "grad_norm": 1.5242918729782104, + "learning_rate": 3.0962705619608565e-06, + "loss": 1.027, + "step": 25772 + }, + { + "epoch": 0.9229860153634036, + "grad_norm": 1.5039187669754028, + "learning_rate": 3.093407250477509e-06, + "loss": 1.1024, + "step": 25773 + }, + { + "epoch": 0.9230218274929719, + "grad_norm": 1.556837558746338, + "learning_rate": 3.0905452427473667e-06, + "loss": 1.0163, + "step": 25774 + }, + { + "epoch": 0.9230576396225402, + "grad_norm": 1.735368013381958, + "learning_rate": 3.0876845388089327e-06, + "loss": 1.0649, + "step": 25775 + }, + { + "epoch": 0.9230934517521084, + "grad_norm": 1.2284772396087646, + "learning_rate": 3.084825138700698e-06, + "loss": 1.046, + "step": 25776 + }, + { + "epoch": 0.9231292638816767, + "grad_norm": 1.5324772596359253, + "learning_rate": 3.08196704246112e-06, + "loss": 1.1385, + "step": 25777 + }, + { + "epoch": 0.923165076011245, + "grad_norm": 2.4559662342071533, + "learning_rate": 3.0791102501286804e-06, + "loss": 0.9341, + "step": 25778 + }, + { + "epoch": 0.9232008881408132, + "grad_norm": 1.5710633993148804, + "learning_rate": 3.0762547617417703e-06, + "loss": 0.9756, + "step": 25779 + }, + { + "epoch": 0.9232367002703816, + "grad_norm": 1.253204107284546, + "learning_rate": 3.0734005773388364e-06, + "loss": 1.0093, + "step": 25780 + }, + { + "epoch": 0.9232725123999499, + "grad_norm": 1.2886825799942017, + "learning_rate": 3.0705476969582813e-06, + "loss": 0.8932, + "step": 25781 + }, + { + "epoch": 0.9233083245295182, + "grad_norm": 1.5123380422592163, + "learning_rate": 3.0676961206384746e-06, + "loss": 1.0336, + "step": 25782 + }, + { + "epoch": 0.9233441366590864, + "grad_norm": 1.6127911806106567, + "learning_rate": 3.0648458484177746e-06, + "loss": 1.1794, + "step": 25783 + }, + { + "epoch": 0.9233799487886547, + "grad_norm": 1.4575220346450806, + "learning_rate": 3.061996880334539e-06, + "loss": 0.9878, + "step": 25784 + }, + { + "epoch": 0.923415760918223, + "grad_norm": 1.4611788988113403, + "learning_rate": 3.059149216427104e-06, + "loss": 1.1623, + "step": 25785 + }, + { + "epoch": 0.9234515730477912, + "grad_norm": 1.370015025138855, + "learning_rate": 3.0563028567337614e-06, + "loss": 1.0503, + "step": 25786 + }, + { + "epoch": 0.9234873851773596, + "grad_norm": 1.118876338005066, + "learning_rate": 3.053457801292814e-06, + "loss": 0.9809, + "step": 25787 + }, + { + "epoch": 0.9235231973069279, + "grad_norm": 1.3538721799850464, + "learning_rate": 3.0506140501425417e-06, + "loss": 0.989, + "step": 25788 + }, + { + "epoch": 0.9235590094364962, + "grad_norm": 1.2217841148376465, + "learning_rate": 3.0477716033212032e-06, + "loss": 1.1209, + "step": 25789 + }, + { + "epoch": 0.9235948215660644, + "grad_norm": 1.8437589406967163, + "learning_rate": 3.044930460867046e-06, + "loss": 1.0563, + "step": 25790 + }, + { + "epoch": 0.9236306336956327, + "grad_norm": 1.4525270462036133, + "learning_rate": 3.042090622818272e-06, + "loss": 0.951, + "step": 25791 + }, + { + "epoch": 0.923666445825201, + "grad_norm": 1.5582115650177002, + "learning_rate": 3.039252089213118e-06, + "loss": 0.9716, + "step": 25792 + }, + { + "epoch": 0.9237022579547692, + "grad_norm": 2.089536428451538, + "learning_rate": 3.0364148600897423e-06, + "loss": 1.1696, + "step": 25793 + }, + { + "epoch": 0.9237380700843376, + "grad_norm": 2.079489231109619, + "learning_rate": 3.0335789354863362e-06, + "loss": 1.2482, + "step": 25794 + }, + { + "epoch": 0.9237738822139059, + "grad_norm": 1.4157919883728027, + "learning_rate": 3.0307443154410365e-06, + "loss": 1.1126, + "step": 25795 + }, + { + "epoch": 0.9238096943434742, + "grad_norm": 1.261847972869873, + "learning_rate": 3.027910999992012e-06, + "loss": 1.0173, + "step": 25796 + }, + { + "epoch": 0.9238455064730424, + "grad_norm": 1.488444209098816, + "learning_rate": 3.0250789891773433e-06, + "loss": 1.1979, + "step": 25797 + }, + { + "epoch": 0.9238813186026107, + "grad_norm": 1.6790114641189575, + "learning_rate": 3.022248283035156e-06, + "loss": 1.0407, + "step": 25798 + }, + { + "epoch": 0.923917130732179, + "grad_norm": 1.1393150091171265, + "learning_rate": 3.0194188816035305e-06, + "loss": 0.9939, + "step": 25799 + }, + { + "epoch": 0.9239529428617472, + "grad_norm": 1.9107890129089355, + "learning_rate": 3.0165907849205254e-06, + "loss": 0.9895, + "step": 25800 + }, + { + "epoch": 0.9239887549913156, + "grad_norm": 1.4063054323196411, + "learning_rate": 3.013763993024188e-06, + "loss": 0.9294, + "step": 25801 + }, + { + "epoch": 0.9240245671208839, + "grad_norm": 1.600045084953308, + "learning_rate": 3.010938505952543e-06, + "loss": 1.1212, + "step": 25802 + }, + { + "epoch": 0.9240603792504521, + "grad_norm": 1.2938940525054932, + "learning_rate": 3.008114323743627e-06, + "loss": 0.9908, + "step": 25803 + }, + { + "epoch": 0.9240961913800204, + "grad_norm": 1.5153332948684692, + "learning_rate": 3.005291446435421e-06, + "loss": 1.1286, + "step": 25804 + }, + { + "epoch": 0.9241320035095887, + "grad_norm": 1.3792206048965454, + "learning_rate": 3.002469874065894e-06, + "loss": 0.921, + "step": 25805 + }, + { + "epoch": 0.924167815639157, + "grad_norm": 2.0046234130859375, + "learning_rate": 2.999649606673027e-06, + "loss": 1.146, + "step": 25806 + }, + { + "epoch": 0.9242036277687252, + "grad_norm": 1.3164594173431396, + "learning_rate": 2.996830644294757e-06, + "loss": 1.0126, + "step": 25807 + }, + { + "epoch": 0.9242394398982936, + "grad_norm": 1.5673266649246216, + "learning_rate": 2.994012986969008e-06, + "loss": 1.0412, + "step": 25808 + }, + { + "epoch": 0.9242752520278619, + "grad_norm": 1.5077741146087646, + "learning_rate": 2.991196634733662e-06, + "loss": 0.896, + "step": 25809 + }, + { + "epoch": 0.9243110641574301, + "grad_norm": 1.1486366987228394, + "learning_rate": 2.9883815876266653e-06, + "loss": 1.0215, + "step": 25810 + }, + { + "epoch": 0.9243468762869984, + "grad_norm": 1.4905781745910645, + "learning_rate": 2.985567845685833e-06, + "loss": 1.1012, + "step": 25811 + }, + { + "epoch": 0.9243826884165667, + "grad_norm": 1.4865154027938843, + "learning_rate": 2.982755408949067e-06, + "loss": 1.1011, + "step": 25812 + }, + { + "epoch": 0.924418500546135, + "grad_norm": 1.5157954692840576, + "learning_rate": 2.97994427745415e-06, + "loss": 1.0709, + "step": 25813 + }, + { + "epoch": 0.9244543126757032, + "grad_norm": 1.4461350440979004, + "learning_rate": 2.977134451238972e-06, + "loss": 1.0495, + "step": 25814 + }, + { + "epoch": 0.9244901248052716, + "grad_norm": 1.2827978134155273, + "learning_rate": 2.9743259303412707e-06, + "loss": 1.0327, + "step": 25815 + }, + { + "epoch": 0.9245259369348399, + "grad_norm": 1.244158148765564, + "learning_rate": 2.9715187147988823e-06, + "loss": 0.8965, + "step": 25816 + }, + { + "epoch": 0.9245617490644081, + "grad_norm": 1.5447725057601929, + "learning_rate": 2.968712804649543e-06, + "loss": 1.0855, + "step": 25817 + }, + { + "epoch": 0.9245975611939764, + "grad_norm": 1.3836166858673096, + "learning_rate": 2.9659081999310112e-06, + "loss": 0.9746, + "step": 25818 + }, + { + "epoch": 0.9246333733235447, + "grad_norm": 1.321714997291565, + "learning_rate": 2.9631049006810243e-06, + "loss": 0.9554, + "step": 25819 + }, + { + "epoch": 0.9246691854531129, + "grad_norm": 1.4016211032867432, + "learning_rate": 2.9603029069372733e-06, + "loss": 1.1878, + "step": 25820 + }, + { + "epoch": 0.9247049975826812, + "grad_norm": 1.4288073778152466, + "learning_rate": 2.9575022187374958e-06, + "loss": 1.1026, + "step": 25821 + }, + { + "epoch": 0.9247408097122496, + "grad_norm": 1.5721813440322876, + "learning_rate": 2.9547028361193495e-06, + "loss": 1.1449, + "step": 25822 + }, + { + "epoch": 0.9247766218418179, + "grad_norm": 1.5060534477233887, + "learning_rate": 2.951904759120494e-06, + "loss": 1.0758, + "step": 25823 + }, + { + "epoch": 0.9248124339713861, + "grad_norm": 1.537282109260559, + "learning_rate": 2.9491079877785767e-06, + "loss": 1.0102, + "step": 25824 + }, + { + "epoch": 0.9248482461009544, + "grad_norm": 1.6171859502792358, + "learning_rate": 2.9463125221312117e-06, + "loss": 0.9816, + "step": 25825 + }, + { + "epoch": 0.9248840582305227, + "grad_norm": 1.4442631006240845, + "learning_rate": 2.9435183622160465e-06, + "loss": 1.1508, + "step": 25826 + }, + { + "epoch": 0.9249198703600909, + "grad_norm": 1.6164058446884155, + "learning_rate": 2.9407255080706297e-06, + "loss": 1.0773, + "step": 25827 + }, + { + "epoch": 0.9249556824896592, + "grad_norm": 1.5563671588897705, + "learning_rate": 2.937933959732553e-06, + "loss": 1.2327, + "step": 25828 + }, + { + "epoch": 0.9249914946192276, + "grad_norm": 1.3477320671081543, + "learning_rate": 2.9351437172393746e-06, + "loss": 1.1526, + "step": 25829 + }, + { + "epoch": 0.9250273067487959, + "grad_norm": 1.3934462070465088, + "learning_rate": 2.9323547806286432e-06, + "loss": 1.064, + "step": 25830 + }, + { + "epoch": 0.9250631188783641, + "grad_norm": 2.1154773235321045, + "learning_rate": 2.9295671499378506e-06, + "loss": 1.1906, + "step": 25831 + }, + { + "epoch": 0.9250989310079324, + "grad_norm": 1.472161054611206, + "learning_rate": 2.9267808252045338e-06, + "loss": 1.0977, + "step": 25832 + }, + { + "epoch": 0.9251347431375007, + "grad_norm": 1.4864897727966309, + "learning_rate": 2.923995806466173e-06, + "loss": 0.9876, + "step": 25833 + }, + { + "epoch": 0.9251705552670689, + "grad_norm": 1.4454030990600586, + "learning_rate": 2.9212120937602174e-06, + "loss": 1.2272, + "step": 25834 + }, + { + "epoch": 0.9252063673966372, + "grad_norm": 1.5009654760360718, + "learning_rate": 2.9184296871241357e-06, + "loss": 0.9532, + "step": 25835 + }, + { + "epoch": 0.9252421795262056, + "grad_norm": 1.8378866910934448, + "learning_rate": 2.9156485865953544e-06, + "loss": 1.145, + "step": 25836 + }, + { + "epoch": 0.9252779916557738, + "grad_norm": 1.3385032415390015, + "learning_rate": 2.9128687922112987e-06, + "loss": 1.0048, + "step": 25837 + }, + { + "epoch": 0.9253138037853421, + "grad_norm": 1.4966652393341064, + "learning_rate": 2.91009030400935e-06, + "loss": 1.0311, + "step": 25838 + }, + { + "epoch": 0.9253496159149104, + "grad_norm": 1.687980055809021, + "learning_rate": 2.9073131220269e-06, + "loss": 0.9941, + "step": 25839 + }, + { + "epoch": 0.9253854280444787, + "grad_norm": 1.4621717929840088, + "learning_rate": 2.9045372463013088e-06, + "loss": 1.0407, + "step": 25840 + }, + { + "epoch": 0.9254212401740469, + "grad_norm": 2.5186383724212646, + "learning_rate": 2.9017626768699346e-06, + "loss": 1.0998, + "step": 25841 + }, + { + "epoch": 0.9254570523036152, + "grad_norm": 1.5018730163574219, + "learning_rate": 2.8989894137700924e-06, + "loss": 1.2048, + "step": 25842 + }, + { + "epoch": 0.9254928644331836, + "grad_norm": 1.4448953866958618, + "learning_rate": 2.8962174570390965e-06, + "loss": 1.1821, + "step": 25843 + }, + { + "epoch": 0.9255286765627518, + "grad_norm": 1.4411474466323853, + "learning_rate": 2.8934468067142396e-06, + "loss": 0.9046, + "step": 25844 + }, + { + "epoch": 0.9255644886923201, + "grad_norm": 1.9798556566238403, + "learning_rate": 2.8906774628327917e-06, + "loss": 1.2676, + "step": 25845 + }, + { + "epoch": 0.9256003008218884, + "grad_norm": 1.62058424949646, + "learning_rate": 2.8879094254320225e-06, + "loss": 1.0066, + "step": 25846 + }, + { + "epoch": 0.9256361129514566, + "grad_norm": 1.6796448230743408, + "learning_rate": 2.8851426945491588e-06, + "loss": 1.0881, + "step": 25847 + }, + { + "epoch": 0.9256719250810249, + "grad_norm": 1.7010434865951538, + "learning_rate": 2.882377270221448e-06, + "loss": 1.0945, + "step": 25848 + }, + { + "epoch": 0.9257077372105932, + "grad_norm": 1.6001994609832764, + "learning_rate": 2.8796131524860603e-06, + "loss": 1.1105, + "step": 25849 + }, + { + "epoch": 0.9257435493401616, + "grad_norm": 1.6483557224273682, + "learning_rate": 2.8768503413802108e-06, + "loss": 1.0874, + "step": 25850 + }, + { + "epoch": 0.9257793614697298, + "grad_norm": 1.720130205154419, + "learning_rate": 2.8740888369410577e-06, + "loss": 1.2796, + "step": 25851 + }, + { + "epoch": 0.9258151735992981, + "grad_norm": 1.3123528957366943, + "learning_rate": 2.8713286392057614e-06, + "loss": 1.0798, + "step": 25852 + }, + { + "epoch": 0.9258509857288664, + "grad_norm": 1.3030540943145752, + "learning_rate": 2.868569748211436e-06, + "loss": 1.0252, + "step": 25853 + }, + { + "epoch": 0.9258867978584346, + "grad_norm": 1.3136616945266724, + "learning_rate": 2.8658121639952297e-06, + "loss": 1.0911, + "step": 25854 + }, + { + "epoch": 0.9259226099880029, + "grad_norm": 1.1655993461608887, + "learning_rate": 2.8630558865942237e-06, + "loss": 0.9928, + "step": 25855 + }, + { + "epoch": 0.9259584221175712, + "grad_norm": 1.5330402851104736, + "learning_rate": 2.8603009160454995e-06, + "loss": 1.3241, + "step": 25856 + }, + { + "epoch": 0.9259942342471396, + "grad_norm": 1.3937197923660278, + "learning_rate": 2.857547252386117e-06, + "loss": 1.0457, + "step": 25857 + }, + { + "epoch": 0.9260300463767078, + "grad_norm": 1.7189704179763794, + "learning_rate": 2.854794895653146e-06, + "loss": 0.9647, + "step": 25858 + }, + { + "epoch": 0.9260658585062761, + "grad_norm": 1.302258014678955, + "learning_rate": 2.8520438458836007e-06, + "loss": 1.1152, + "step": 25859 + }, + { + "epoch": 0.9261016706358444, + "grad_norm": 1.5732512474060059, + "learning_rate": 2.849294103114486e-06, + "loss": 1.0459, + "step": 25860 + }, + { + "epoch": 0.9261374827654126, + "grad_norm": 1.3595256805419922, + "learning_rate": 2.846545667382805e-06, + "loss": 1.0379, + "step": 25861 + }, + { + "epoch": 0.9261732948949809, + "grad_norm": 1.5716854333877563, + "learning_rate": 2.8437985387255394e-06, + "loss": 1.2992, + "step": 25862 + }, + { + "epoch": 0.9262091070245492, + "grad_norm": 1.289440631866455, + "learning_rate": 2.8410527171796376e-06, + "loss": 0.8638, + "step": 25863 + }, + { + "epoch": 0.9262449191541176, + "grad_norm": 2.189453125, + "learning_rate": 2.838308202782036e-06, + "loss": 1.3443, + "step": 25864 + }, + { + "epoch": 0.9262807312836858, + "grad_norm": 1.4927301406860352, + "learning_rate": 2.835564995569684e-06, + "loss": 1.2467, + "step": 25865 + }, + { + "epoch": 0.9263165434132541, + "grad_norm": 1.3250941038131714, + "learning_rate": 2.8328230955794733e-06, + "loss": 1.019, + "step": 25866 + }, + { + "epoch": 0.9263523555428224, + "grad_norm": 1.5699806213378906, + "learning_rate": 2.8300825028482748e-06, + "loss": 1.0317, + "step": 25867 + }, + { + "epoch": 0.9263881676723906, + "grad_norm": 1.456436276435852, + "learning_rate": 2.827343217412981e-06, + "loss": 0.9249, + "step": 25868 + }, + { + "epoch": 0.9264239798019589, + "grad_norm": 1.6727805137634277, + "learning_rate": 2.8246052393104516e-06, + "loss": 1.145, + "step": 25869 + }, + { + "epoch": 0.9264597919315272, + "grad_norm": 1.4487719535827637, + "learning_rate": 2.8218685685775015e-06, + "loss": 1.123, + "step": 25870 + }, + { + "epoch": 0.9264956040610955, + "grad_norm": 1.5571420192718506, + "learning_rate": 2.8191332052509567e-06, + "loss": 1.2966, + "step": 25871 + }, + { + "epoch": 0.9265314161906638, + "grad_norm": 1.4836006164550781, + "learning_rate": 2.8163991493676212e-06, + "loss": 0.9517, + "step": 25872 + }, + { + "epoch": 0.9265672283202321, + "grad_norm": 1.571184754371643, + "learning_rate": 2.8136664009642877e-06, + "loss": 1.2797, + "step": 25873 + }, + { + "epoch": 0.9266030404498004, + "grad_norm": 1.7595654726028442, + "learning_rate": 2.8109349600777045e-06, + "loss": 1.226, + "step": 25874 + }, + { + "epoch": 0.9266388525793686, + "grad_norm": 1.3020166158676147, + "learning_rate": 2.8082048267446203e-06, + "loss": 0.8423, + "step": 25875 + }, + { + "epoch": 0.9266746647089369, + "grad_norm": 1.3622941970825195, + "learning_rate": 2.805476001001772e-06, + "loss": 1.0843, + "step": 25876 + }, + { + "epoch": 0.9267104768385052, + "grad_norm": 1.5620665550231934, + "learning_rate": 2.802748482885886e-06, + "loss": 1.066, + "step": 25877 + }, + { + "epoch": 0.9267462889680735, + "grad_norm": 1.2492636442184448, + "learning_rate": 2.800022272433633e-06, + "loss": 0.9174, + "step": 25878 + }, + { + "epoch": 0.9267821010976418, + "grad_norm": 1.6490256786346436, + "learning_rate": 2.797297369681706e-06, + "loss": 1.0296, + "step": 25879 + }, + { + "epoch": 0.9268179132272101, + "grad_norm": 1.5292540788650513, + "learning_rate": 2.7945737746667643e-06, + "loss": 1.1446, + "step": 25880 + }, + { + "epoch": 0.9268537253567783, + "grad_norm": 1.438757061958313, + "learning_rate": 2.7918514874254454e-06, + "loss": 1.0244, + "step": 25881 + }, + { + "epoch": 0.9268895374863466, + "grad_norm": 1.4410510063171387, + "learning_rate": 2.789130507994364e-06, + "loss": 0.894, + "step": 25882 + }, + { + "epoch": 0.9269253496159149, + "grad_norm": 1.239343523979187, + "learning_rate": 2.786410836410147e-06, + "loss": 1.0738, + "step": 25883 + }, + { + "epoch": 0.9269611617454832, + "grad_norm": 1.4076215028762817, + "learning_rate": 2.783692472709376e-06, + "loss": 1.0951, + "step": 25884 + }, + { + "epoch": 0.9269969738750515, + "grad_norm": 1.2577718496322632, + "learning_rate": 2.7809754169286216e-06, + "loss": 0.9803, + "step": 25885 + }, + { + "epoch": 0.9270327860046198, + "grad_norm": 1.6330926418304443, + "learning_rate": 2.7782596691044327e-06, + "loss": 1.0076, + "step": 25886 + }, + { + "epoch": 0.9270685981341881, + "grad_norm": 1.7203764915466309, + "learning_rate": 2.7755452292733684e-06, + "loss": 1.2374, + "step": 25887 + }, + { + "epoch": 0.9271044102637563, + "grad_norm": 1.1951903104782104, + "learning_rate": 2.7728320974719225e-06, + "loss": 0.8843, + "step": 25888 + }, + { + "epoch": 0.9271402223933246, + "grad_norm": 1.2496964931488037, + "learning_rate": 2.7701202737366096e-06, + "loss": 1.0388, + "step": 25889 + }, + { + "epoch": 0.9271760345228929, + "grad_norm": 1.2848927974700928, + "learning_rate": 2.7674097581039004e-06, + "loss": 1.1191, + "step": 25890 + }, + { + "epoch": 0.9272118466524611, + "grad_norm": 1.5213786363601685, + "learning_rate": 2.7647005506102886e-06, + "loss": 1.0037, + "step": 25891 + }, + { + "epoch": 0.9272476587820295, + "grad_norm": 1.7199420928955078, + "learning_rate": 2.7619926512921888e-06, + "loss": 0.9989, + "step": 25892 + }, + { + "epoch": 0.9272834709115978, + "grad_norm": 1.6430906057357788, + "learning_rate": 2.7592860601860616e-06, + "loss": 1.3041, + "step": 25893 + }, + { + "epoch": 0.9273192830411661, + "grad_norm": 1.6380757093429565, + "learning_rate": 2.7565807773282994e-06, + "loss": 1.1135, + "step": 25894 + }, + { + "epoch": 0.9273550951707343, + "grad_norm": 1.6148163080215454, + "learning_rate": 2.7538768027553174e-06, + "loss": 1.2748, + "step": 25895 + }, + { + "epoch": 0.9273909073003026, + "grad_norm": 1.460263967514038, + "learning_rate": 2.751174136503498e-06, + "loss": 1.0648, + "step": 25896 + }, + { + "epoch": 0.9274267194298709, + "grad_norm": 1.4303005933761597, + "learning_rate": 2.748472778609157e-06, + "loss": 1.102, + "step": 25897 + }, + { + "epoch": 0.9274625315594391, + "grad_norm": 1.2952096462249756, + "learning_rate": 2.7457727291086867e-06, + "loss": 1.1181, + "step": 25898 + }, + { + "epoch": 0.9274983436890075, + "grad_norm": 1.3399535417556763, + "learning_rate": 2.7430739880383915e-06, + "loss": 0.9698, + "step": 25899 + }, + { + "epoch": 0.9275341558185758, + "grad_norm": 1.321287751197815, + "learning_rate": 2.7403765554345984e-06, + "loss": 0.9556, + "step": 25900 + }, + { + "epoch": 0.9275699679481441, + "grad_norm": 1.333688735961914, + "learning_rate": 2.737680431333556e-06, + "loss": 1.0873, + "step": 25901 + }, + { + "epoch": 0.9276057800777123, + "grad_norm": 1.498147964477539, + "learning_rate": 2.7349856157715793e-06, + "loss": 1.052, + "step": 25902 + }, + { + "epoch": 0.9276415922072806, + "grad_norm": 1.45205557346344, + "learning_rate": 2.7322921087849063e-06, + "loss": 1.1422, + "step": 25903 + }, + { + "epoch": 0.9276774043368489, + "grad_norm": 1.9621810913085938, + "learning_rate": 2.7295999104097746e-06, + "loss": 1.0694, + "step": 25904 + }, + { + "epoch": 0.9277132164664171, + "grad_norm": 1.456897258758545, + "learning_rate": 2.726909020682422e-06, + "loss": 0.9393, + "step": 25905 + }, + { + "epoch": 0.9277490285959855, + "grad_norm": 1.4337280988693237, + "learning_rate": 2.72421943963902e-06, + "loss": 0.9848, + "step": 25906 + }, + { + "epoch": 0.9277848407255538, + "grad_norm": 1.4742238521575928, + "learning_rate": 2.7215311673157715e-06, + "loss": 1.121, + "step": 25907 + }, + { + "epoch": 0.927820652855122, + "grad_norm": 1.2519242763519287, + "learning_rate": 2.718844203748827e-06, + "loss": 0.9921, + "step": 25908 + }, + { + "epoch": 0.9278564649846903, + "grad_norm": 1.5879439115524292, + "learning_rate": 2.716158548974379e-06, + "loss": 1.2494, + "step": 25909 + }, + { + "epoch": 0.9278922771142586, + "grad_norm": 1.4815547466278076, + "learning_rate": 2.71347420302851e-06, + "loss": 1.1415, + "step": 25910 + }, + { + "epoch": 0.9279280892438269, + "grad_norm": 1.2947430610656738, + "learning_rate": 2.7107911659473682e-06, + "loss": 0.8407, + "step": 25911 + }, + { + "epoch": 0.9279639013733951, + "grad_norm": 1.4165652990341187, + "learning_rate": 2.708109437767015e-06, + "loss": 0.8911, + "step": 25912 + }, + { + "epoch": 0.9279997135029635, + "grad_norm": 1.3419643640518188, + "learning_rate": 2.705429018523575e-06, + "loss": 0.8715, + "step": 25913 + }, + { + "epoch": 0.9280355256325318, + "grad_norm": 2.6924943923950195, + "learning_rate": 2.702749908253077e-06, + "loss": 0.8979, + "step": 25914 + }, + { + "epoch": 0.9280713377621, + "grad_norm": 1.5548433065414429, + "learning_rate": 2.70007210699158e-06, + "loss": 1.0346, + "step": 25915 + }, + { + "epoch": 0.9281071498916683, + "grad_norm": 1.841313362121582, + "learning_rate": 2.697395614775089e-06, + "loss": 0.9719, + "step": 25916 + }, + { + "epoch": 0.9281429620212366, + "grad_norm": 1.4496102333068848, + "learning_rate": 2.694720431639641e-06, + "loss": 0.8854, + "step": 25917 + }, + { + "epoch": 0.9281787741508049, + "grad_norm": 1.6983617544174194, + "learning_rate": 2.6920465576212195e-06, + "loss": 1.3158, + "step": 25918 + }, + { + "epoch": 0.9282145862803731, + "grad_norm": 1.9769147634506226, + "learning_rate": 2.6893739927557725e-06, + "loss": 1.1179, + "step": 25919 + }, + { + "epoch": 0.9282503984099415, + "grad_norm": 1.4260483980178833, + "learning_rate": 2.6867027370793053e-06, + "loss": 0.8722, + "step": 25920 + }, + { + "epoch": 0.9282862105395098, + "grad_norm": 1.8001469373703003, + "learning_rate": 2.684032790627722e-06, + "loss": 0.9824, + "step": 25921 + }, + { + "epoch": 0.928322022669078, + "grad_norm": 1.6057357788085938, + "learning_rate": 2.6813641534369383e-06, + "loss": 1.0581, + "step": 25922 + }, + { + "epoch": 0.9283578347986463, + "grad_norm": 1.2528283596038818, + "learning_rate": 2.678696825542859e-06, + "loss": 1.1582, + "step": 25923 + }, + { + "epoch": 0.9283936469282146, + "grad_norm": 1.3301730155944824, + "learning_rate": 2.676030806981389e-06, + "loss": 1.1363, + "step": 25924 + }, + { + "epoch": 0.9284294590577828, + "grad_norm": 1.6375716924667358, + "learning_rate": 2.673366097788399e-06, + "loss": 1.1256, + "step": 25925 + }, + { + "epoch": 0.9284652711873511, + "grad_norm": 1.3067939281463623, + "learning_rate": 2.670702697999705e-06, + "loss": 1.0981, + "step": 25926 + }, + { + "epoch": 0.9285010833169195, + "grad_norm": 1.3331493139266968, + "learning_rate": 2.6680406076511677e-06, + "loss": 1.0077, + "step": 25927 + }, + { + "epoch": 0.9285368954464878, + "grad_norm": 1.687256932258606, + "learning_rate": 2.6653798267785912e-06, + "loss": 1.0044, + "step": 25928 + }, + { + "epoch": 0.928572707576056, + "grad_norm": 1.4295858144760132, + "learning_rate": 2.6627203554177916e-06, + "loss": 0.8901, + "step": 25929 + }, + { + "epoch": 0.9286085197056243, + "grad_norm": 1.2419923543930054, + "learning_rate": 2.660062193604518e-06, + "loss": 1.0837, + "step": 25930 + }, + { + "epoch": 0.9286443318351926, + "grad_norm": 1.1758090257644653, + "learning_rate": 2.6574053413745524e-06, + "loss": 0.9742, + "step": 25931 + }, + { + "epoch": 0.9286801439647608, + "grad_norm": 1.5202000141143799, + "learning_rate": 2.654749798763645e-06, + "loss": 1.269, + "step": 25932 + }, + { + "epoch": 0.9287159560943291, + "grad_norm": 1.456734538078308, + "learning_rate": 2.6520955658074997e-06, + "loss": 1.1954, + "step": 25933 + }, + { + "epoch": 0.9287517682238975, + "grad_norm": 1.4261382818222046, + "learning_rate": 2.649442642541833e-06, + "loss": 1.1106, + "step": 25934 + }, + { + "epoch": 0.9287875803534658, + "grad_norm": 1.272194504737854, + "learning_rate": 2.646791029002349e-06, + "loss": 1.1455, + "step": 25935 + }, + { + "epoch": 0.928823392483034, + "grad_norm": 1.9032764434814453, + "learning_rate": 2.6441407252247306e-06, + "loss": 1.1063, + "step": 25936 + }, + { + "epoch": 0.9288592046126023, + "grad_norm": 1.1961841583251953, + "learning_rate": 2.641491731244605e-06, + "loss": 1.0705, + "step": 25937 + }, + { + "epoch": 0.9288950167421706, + "grad_norm": 1.5596177577972412, + "learning_rate": 2.6388440470976217e-06, + "loss": 1.0111, + "step": 25938 + }, + { + "epoch": 0.9289308288717388, + "grad_norm": 1.809895634651184, + "learning_rate": 2.6361976728194183e-06, + "loss": 1.0556, + "step": 25939 + }, + { + "epoch": 0.9289666410013071, + "grad_norm": 1.5065861940383911, + "learning_rate": 2.6335526084455665e-06, + "loss": 1.0509, + "step": 25940 + }, + { + "epoch": 0.9290024531308755, + "grad_norm": 1.4352562427520752, + "learning_rate": 2.630908854011682e-06, + "loss": 1.0477, + "step": 25941 + }, + { + "epoch": 0.9290382652604438, + "grad_norm": 1.5435256958007812, + "learning_rate": 2.628266409553315e-06, + "loss": 1.2058, + "step": 25942 + }, + { + "epoch": 0.929074077390012, + "grad_norm": 1.048111915588379, + "learning_rate": 2.625625275106036e-06, + "loss": 0.9391, + "step": 25943 + }, + { + "epoch": 0.9291098895195803, + "grad_norm": 1.5103334188461304, + "learning_rate": 2.6229854507053507e-06, + "loss": 1.224, + "step": 25944 + }, + { + "epoch": 0.9291457016491486, + "grad_norm": 1.4234275817871094, + "learning_rate": 2.6203469363867973e-06, + "loss": 1.0731, + "step": 25945 + }, + { + "epoch": 0.9291815137787168, + "grad_norm": 1.7111823558807373, + "learning_rate": 2.6177097321858578e-06, + "loss": 1.1759, + "step": 25946 + }, + { + "epoch": 0.9292173259082851, + "grad_norm": 1.740193247795105, + "learning_rate": 2.615073838138027e-06, + "loss": 1.1619, + "step": 25947 + }, + { + "epoch": 0.9292531380378534, + "grad_norm": 1.3418649435043335, + "learning_rate": 2.6124392542787645e-06, + "loss": 1.0311, + "step": 25948 + }, + { + "epoch": 0.9292889501674217, + "grad_norm": 1.9282108545303345, + "learning_rate": 2.609805980643498e-06, + "loss": 1.1219, + "step": 25949 + }, + { + "epoch": 0.92932476229699, + "grad_norm": 1.8020046949386597, + "learning_rate": 2.607174017267677e-06, + "loss": 0.956, + "step": 25950 + }, + { + "epoch": 0.9293605744265583, + "grad_norm": 1.240954875946045, + "learning_rate": 2.6045433641866958e-06, + "loss": 1.0595, + "step": 25951 + }, + { + "epoch": 0.9293963865561266, + "grad_norm": 1.5101667642593384, + "learning_rate": 2.6019140214359585e-06, + "loss": 1.034, + "step": 25952 + }, + { + "epoch": 0.9294321986856948, + "grad_norm": 1.6658822298049927, + "learning_rate": 2.599285989050826e-06, + "loss": 1.2124, + "step": 25953 + }, + { + "epoch": 0.9294680108152631, + "grad_norm": 1.8613024950027466, + "learning_rate": 2.59665926706667e-06, + "loss": 1.165, + "step": 25954 + }, + { + "epoch": 0.9295038229448314, + "grad_norm": 1.271822214126587, + "learning_rate": 2.594033855518818e-06, + "loss": 1.1047, + "step": 25955 + }, + { + "epoch": 0.9295396350743997, + "grad_norm": 1.3578547239303589, + "learning_rate": 2.5914097544425975e-06, + "loss": 1.1714, + "step": 25956 + }, + { + "epoch": 0.929575447203968, + "grad_norm": 1.4298086166381836, + "learning_rate": 2.588786963873313e-06, + "loss": 1.1958, + "step": 25957 + }, + { + "epoch": 0.9296112593335363, + "grad_norm": 1.4106194972991943, + "learning_rate": 2.586165483846248e-06, + "loss": 1.1536, + "step": 25958 + }, + { + "epoch": 0.9296470714631045, + "grad_norm": 1.4317125082015991, + "learning_rate": 2.5835453143966627e-06, + "loss": 1.1595, + "step": 25959 + }, + { + "epoch": 0.9296828835926728, + "grad_norm": 1.3756965398788452, + "learning_rate": 2.580926455559829e-06, + "loss": 1.0068, + "step": 25960 + }, + { + "epoch": 0.9297186957222411, + "grad_norm": 1.3563079833984375, + "learning_rate": 2.5783089073709633e-06, + "loss": 0.9536, + "step": 25961 + }, + { + "epoch": 0.9297545078518094, + "grad_norm": 1.3903731107711792, + "learning_rate": 2.5756926698652816e-06, + "loss": 1.0379, + "step": 25962 + }, + { + "epoch": 0.9297903199813777, + "grad_norm": 1.3808807134628296, + "learning_rate": 2.5730777430779895e-06, + "loss": 1.0208, + "step": 25963 + }, + { + "epoch": 0.929826132110946, + "grad_norm": 1.0475305318832397, + "learning_rate": 2.57046412704427e-06, + "loss": 1.0556, + "step": 25964 + }, + { + "epoch": 0.9298619442405143, + "grad_norm": 1.8234310150146484, + "learning_rate": 2.567851821799283e-06, + "loss": 1.2158, + "step": 25965 + }, + { + "epoch": 0.9298977563700825, + "grad_norm": 1.343187928199768, + "learning_rate": 2.565240827378157e-06, + "loss": 0.9544, + "step": 25966 + }, + { + "epoch": 0.9299335684996508, + "grad_norm": 1.3715063333511353, + "learning_rate": 2.562631143816041e-06, + "loss": 0.9511, + "step": 25967 + }, + { + "epoch": 0.9299693806292191, + "grad_norm": 1.272782564163208, + "learning_rate": 2.560022771148052e-06, + "loss": 1.0804, + "step": 25968 + }, + { + "epoch": 0.9300051927587873, + "grad_norm": 2.3224940299987793, + "learning_rate": 2.55741570940925e-06, + "loss": 1.0085, + "step": 25969 + }, + { + "epoch": 0.9300410048883557, + "grad_norm": 1.473073124885559, + "learning_rate": 2.5548099586347296e-06, + "loss": 1.0548, + "step": 25970 + }, + { + "epoch": 0.930076817017924, + "grad_norm": 1.4356317520141602, + "learning_rate": 2.552205518859552e-06, + "loss": 1.102, + "step": 25971 + }, + { + "epoch": 0.9301126291474923, + "grad_norm": 1.5817317962646484, + "learning_rate": 2.549602390118755e-06, + "loss": 0.9381, + "step": 25972 + }, + { + "epoch": 0.9301484412770605, + "grad_norm": 1.5550472736358643, + "learning_rate": 2.5470005724473447e-06, + "loss": 1.0646, + "step": 25973 + }, + { + "epoch": 0.9301842534066288, + "grad_norm": 1.75837242603302, + "learning_rate": 2.544400065880337e-06, + "loss": 1.2053, + "step": 25974 + }, + { + "epoch": 0.9302200655361971, + "grad_norm": 1.461410641670227, + "learning_rate": 2.5418008704527263e-06, + "loss": 1.0014, + "step": 25975 + }, + { + "epoch": 0.9302558776657653, + "grad_norm": 1.427054762840271, + "learning_rate": 2.5392029861994625e-06, + "loss": 1.0685, + "step": 25976 + }, + { + "epoch": 0.9302916897953337, + "grad_norm": 1.455978274345398, + "learning_rate": 2.5366064131555066e-06, + "loss": 1.0762, + "step": 25977 + }, + { + "epoch": 0.930327501924902, + "grad_norm": 2.190239429473877, + "learning_rate": 2.534011151355797e-06, + "loss": 1.2854, + "step": 25978 + }, + { + "epoch": 0.9303633140544703, + "grad_norm": 1.7983956336975098, + "learning_rate": 2.531417200835251e-06, + "loss": 0.9687, + "step": 25979 + }, + { + "epoch": 0.9303991261840385, + "grad_norm": 2.2431800365448, + "learning_rate": 2.528824561628762e-06, + "loss": 1.0782, + "step": 25980 + }, + { + "epoch": 0.9304349383136068, + "grad_norm": 1.8707574605941772, + "learning_rate": 2.5262332337712025e-06, + "loss": 1.2554, + "step": 25981 + }, + { + "epoch": 0.9304707504431751, + "grad_norm": 1.7826048135757446, + "learning_rate": 2.5236432172974333e-06, + "loss": 1.1119, + "step": 25982 + }, + { + "epoch": 0.9305065625727433, + "grad_norm": 1.4704067707061768, + "learning_rate": 2.521054512242338e-06, + "loss": 1.076, + "step": 25983 + }, + { + "epoch": 0.9305423747023117, + "grad_norm": 1.9263992309570312, + "learning_rate": 2.5184671186406996e-06, + "loss": 1.1138, + "step": 25984 + }, + { + "epoch": 0.93057818683188, + "grad_norm": 1.4065877199172974, + "learning_rate": 2.5158810365273345e-06, + "loss": 0.9821, + "step": 25985 + }, + { + "epoch": 0.9306139989614483, + "grad_norm": 1.4055410623550415, + "learning_rate": 2.5132962659370595e-06, + "loss": 1.1428, + "step": 25986 + }, + { + "epoch": 0.9306498110910165, + "grad_norm": 1.5185866355895996, + "learning_rate": 2.510712806904625e-06, + "loss": 0.8733, + "step": 25987 + }, + { + "epoch": 0.9306856232205848, + "grad_norm": 1.2356839179992676, + "learning_rate": 2.5081306594647912e-06, + "loss": 0.9781, + "step": 25988 + }, + { + "epoch": 0.9307214353501531, + "grad_norm": 1.3047841787338257, + "learning_rate": 2.505549823652309e-06, + "loss": 0.9405, + "step": 25989 + }, + { + "epoch": 0.9307572474797213, + "grad_norm": 1.2994657754898071, + "learning_rate": 2.5029702995019055e-06, + "loss": 1.111, + "step": 25990 + }, + { + "epoch": 0.9307930596092897, + "grad_norm": 1.4447169303894043, + "learning_rate": 2.5003920870482644e-06, + "loss": 1.0722, + "step": 25991 + }, + { + "epoch": 0.930828871738858, + "grad_norm": 1.6224119663238525, + "learning_rate": 2.4978151863260914e-06, + "loss": 1.0804, + "step": 25992 + }, + { + "epoch": 0.9308646838684262, + "grad_norm": 1.3827903270721436, + "learning_rate": 2.495239597370047e-06, + "loss": 1.1913, + "step": 25993 + }, + { + "epoch": 0.9309004959979945, + "grad_norm": 1.0624040365219116, + "learning_rate": 2.492665320214771e-06, + "loss": 1.0069, + "step": 25994 + }, + { + "epoch": 0.9309363081275628, + "grad_norm": 1.2557493448257446, + "learning_rate": 2.490092354894913e-06, + "loss": 0.8845, + "step": 25995 + }, + { + "epoch": 0.930972120257131, + "grad_norm": 1.5264033079147339, + "learning_rate": 2.4875207014450785e-06, + "loss": 1.1738, + "step": 25996 + }, + { + "epoch": 0.9310079323866993, + "grad_norm": 1.2010082006454468, + "learning_rate": 2.4849503598998738e-06, + "loss": 1.0208, + "step": 25997 + }, + { + "epoch": 0.9310437445162677, + "grad_norm": 1.497589111328125, + "learning_rate": 2.4823813302938814e-06, + "loss": 0.9969, + "step": 25998 + }, + { + "epoch": 0.931079556645836, + "grad_norm": 1.3122751712799072, + "learning_rate": 2.4798136126616634e-06, + "loss": 1.1061, + "step": 25999 + }, + { + "epoch": 0.9311153687754042, + "grad_norm": 1.4831844568252563, + "learning_rate": 2.477247207037736e-06, + "loss": 1.2595, + "step": 26000 + }, + { + "epoch": 0.9311511809049725, + "grad_norm": 1.5484821796417236, + "learning_rate": 2.4746821134566833e-06, + "loss": 0.9798, + "step": 26001 + }, + { + "epoch": 0.9311869930345408, + "grad_norm": 1.2022736072540283, + "learning_rate": 2.4721183319529774e-06, + "loss": 1.0005, + "step": 26002 + }, + { + "epoch": 0.931222805164109, + "grad_norm": 1.6763653755187988, + "learning_rate": 2.4695558625611015e-06, + "loss": 1.0221, + "step": 26003 + }, + { + "epoch": 0.9312586172936773, + "grad_norm": 1.6571693420410156, + "learning_rate": 2.4669947053155617e-06, + "loss": 0.9327, + "step": 26004 + }, + { + "epoch": 0.9312944294232457, + "grad_norm": 1.2815390825271606, + "learning_rate": 2.464434860250786e-06, + "loss": 1.1898, + "step": 26005 + }, + { + "epoch": 0.931330241552814, + "grad_norm": 1.712526798248291, + "learning_rate": 2.461876327401247e-06, + "loss": 0.9699, + "step": 26006 + }, + { + "epoch": 0.9313660536823822, + "grad_norm": 1.295213222503662, + "learning_rate": 2.4593191068013164e-06, + "loss": 0.9467, + "step": 26007 + }, + { + "epoch": 0.9314018658119505, + "grad_norm": 1.6060712337493896, + "learning_rate": 2.4567631984854566e-06, + "loss": 1.1941, + "step": 26008 + }, + { + "epoch": 0.9314376779415188, + "grad_norm": 1.3598395586013794, + "learning_rate": 2.4542086024880174e-06, + "loss": 1.1303, + "step": 26009 + }, + { + "epoch": 0.931473490071087, + "grad_norm": 1.3222577571868896, + "learning_rate": 2.4516553188433823e-06, + "loss": 0.7861, + "step": 26010 + }, + { + "epoch": 0.9315093022006553, + "grad_norm": 1.6377830505371094, + "learning_rate": 2.4491033475858795e-06, + "loss": 1.1589, + "step": 26011 + }, + { + "epoch": 0.9315451143302237, + "grad_norm": 1.8950196504592896, + "learning_rate": 2.44655268874987e-06, + "loss": 0.9628, + "step": 26012 + }, + { + "epoch": 0.931580926459792, + "grad_norm": 1.7212588787078857, + "learning_rate": 2.4440033423696717e-06, + "loss": 1.1366, + "step": 26013 + }, + { + "epoch": 0.9316167385893602, + "grad_norm": 2.199038028717041, + "learning_rate": 2.4414553084795455e-06, + "loss": 1.1679, + "step": 26014 + }, + { + "epoch": 0.9316525507189285, + "grad_norm": 1.4866466522216797, + "learning_rate": 2.4389085871138086e-06, + "loss": 1.0389, + "step": 26015 + }, + { + "epoch": 0.9316883628484968, + "grad_norm": 1.7605750560760498, + "learning_rate": 2.4363631783067108e-06, + "loss": 1.0943, + "step": 26016 + }, + { + "epoch": 0.931724174978065, + "grad_norm": 1.4708458185195923, + "learning_rate": 2.4338190820925145e-06, + "loss": 1.084, + "step": 26017 + }, + { + "epoch": 0.9317599871076333, + "grad_norm": 1.5160866975784302, + "learning_rate": 2.4312762985054137e-06, + "loss": 1.0004, + "step": 26018 + }, + { + "epoch": 0.9317957992372017, + "grad_norm": 1.8990002870559692, + "learning_rate": 2.4287348275796373e-06, + "loss": 0.8335, + "step": 26019 + }, + { + "epoch": 0.93183161136677, + "grad_norm": 1.2717022895812988, + "learning_rate": 2.4261946693493797e-06, + "loss": 0.8001, + "step": 26020 + }, + { + "epoch": 0.9318674234963382, + "grad_norm": 1.724609613418579, + "learning_rate": 2.4236558238488025e-06, + "loss": 1.1249, + "step": 26021 + }, + { + "epoch": 0.9319032356259065, + "grad_norm": 1.5388332605361938, + "learning_rate": 2.421118291112079e-06, + "loss": 1.0655, + "step": 26022 + }, + { + "epoch": 0.9319390477554748, + "grad_norm": 1.5468705892562866, + "learning_rate": 2.4185820711733363e-06, + "loss": 1.1796, + "step": 26023 + }, + { + "epoch": 0.931974859885043, + "grad_norm": 1.5519908666610718, + "learning_rate": 2.4160471640667147e-06, + "loss": 1.1063, + "step": 26024 + }, + { + "epoch": 0.9320106720146113, + "grad_norm": 1.733167052268982, + "learning_rate": 2.413513569826298e-06, + "loss": 1.0031, + "step": 26025 + }, + { + "epoch": 0.9320464841441797, + "grad_norm": 1.4306119680404663, + "learning_rate": 2.410981288486169e-06, + "loss": 1.0924, + "step": 26026 + }, + { + "epoch": 0.932082296273748, + "grad_norm": 1.5579506158828735, + "learning_rate": 2.408450320080413e-06, + "loss": 1.1241, + "step": 26027 + }, + { + "epoch": 0.9321181084033162, + "grad_norm": 1.688536524772644, + "learning_rate": 2.40592066464308e-06, + "loss": 1.1978, + "step": 26028 + }, + { + "epoch": 0.9321539205328845, + "grad_norm": 1.5433673858642578, + "learning_rate": 2.4033923222081868e-06, + "loss": 0.9184, + "step": 26029 + }, + { + "epoch": 0.9321897326624528, + "grad_norm": 1.5327990055084229, + "learning_rate": 2.400865292809762e-06, + "loss": 1.0677, + "step": 26030 + }, + { + "epoch": 0.932225544792021, + "grad_norm": 1.4565924406051636, + "learning_rate": 2.3983395764818008e-06, + "loss": 1.0183, + "step": 26031 + }, + { + "epoch": 0.9322613569215893, + "grad_norm": 1.4039591550827026, + "learning_rate": 2.395815173258287e-06, + "loss": 1.0238, + "step": 26032 + }, + { + "epoch": 0.9322971690511577, + "grad_norm": 1.3117866516113281, + "learning_rate": 2.393292083173171e-06, + "loss": 1.1489, + "step": 26033 + }, + { + "epoch": 0.9323329811807259, + "grad_norm": 1.7585186958312988, + "learning_rate": 2.390770306260415e-06, + "loss": 1.0131, + "step": 26034 + }, + { + "epoch": 0.9323687933102942, + "grad_norm": 1.5867897272109985, + "learning_rate": 2.388249842553936e-06, + "loss": 1.1992, + "step": 26035 + }, + { + "epoch": 0.9324046054398625, + "grad_norm": 1.26301109790802, + "learning_rate": 2.385730692087651e-06, + "loss": 1.0969, + "step": 26036 + }, + { + "epoch": 0.9324404175694307, + "grad_norm": 1.6882609128952026, + "learning_rate": 2.3832128548954334e-06, + "loss": 1.0765, + "step": 26037 + }, + { + "epoch": 0.932476229698999, + "grad_norm": 1.570857286453247, + "learning_rate": 2.3806963310111786e-06, + "loss": 1.1775, + "step": 26038 + }, + { + "epoch": 0.9325120418285673, + "grad_norm": 1.488288164138794, + "learning_rate": 2.3781811204687367e-06, + "loss": 1.0526, + "step": 26039 + }, + { + "epoch": 0.9325478539581357, + "grad_norm": 1.2724720239639282, + "learning_rate": 2.375667223301936e-06, + "loss": 1.1613, + "step": 26040 + }, + { + "epoch": 0.9325836660877039, + "grad_norm": 1.452569842338562, + "learning_rate": 2.3731546395446056e-06, + "loss": 1.0344, + "step": 26041 + }, + { + "epoch": 0.9326194782172722, + "grad_norm": 1.4145281314849854, + "learning_rate": 2.370643369230563e-06, + "loss": 1.2976, + "step": 26042 + }, + { + "epoch": 0.9326552903468405, + "grad_norm": 1.6391127109527588, + "learning_rate": 2.3681334123935805e-06, + "loss": 1.1984, + "step": 26043 + }, + { + "epoch": 0.9326911024764087, + "grad_norm": 1.3186240196228027, + "learning_rate": 2.3656247690674092e-06, + "loss": 1.2324, + "step": 26044 + }, + { + "epoch": 0.932726914605977, + "grad_norm": 1.2987780570983887, + "learning_rate": 2.3631174392858335e-06, + "loss": 1.1414, + "step": 26045 + }, + { + "epoch": 0.9327627267355453, + "grad_norm": 1.93575119972229, + "learning_rate": 2.3606114230825704e-06, + "loss": 1.1603, + "step": 26046 + }, + { + "epoch": 0.9327985388651137, + "grad_norm": 1.4762803316116333, + "learning_rate": 2.3581067204913267e-06, + "loss": 1.0113, + "step": 26047 + }, + { + "epoch": 0.9328343509946819, + "grad_norm": 1.7981798648834229, + "learning_rate": 2.355603331545808e-06, + "loss": 1.1734, + "step": 26048 + }, + { + "epoch": 0.9328701631242502, + "grad_norm": 1.3938543796539307, + "learning_rate": 2.3531012562796995e-06, + "loss": 1.0051, + "step": 26049 + }, + { + "epoch": 0.9329059752538185, + "grad_norm": 1.3945692777633667, + "learning_rate": 2.3506004947266512e-06, + "loss": 1.1271, + "step": 26050 + }, + { + "epoch": 0.9329417873833867, + "grad_norm": 1.2731661796569824, + "learning_rate": 2.3481010469203256e-06, + "loss": 0.8746, + "step": 26051 + }, + { + "epoch": 0.932977599512955, + "grad_norm": 1.5063334703445435, + "learning_rate": 2.345602912894329e-06, + "loss": 1.0646, + "step": 26052 + }, + { + "epoch": 0.9330134116425233, + "grad_norm": 2.095184326171875, + "learning_rate": 2.3431060926822903e-06, + "loss": 1.0429, + "step": 26053 + }, + { + "epoch": 0.9330492237720917, + "grad_norm": 1.5540746450424194, + "learning_rate": 2.340610586317782e-06, + "loss": 1.1885, + "step": 26054 + }, + { + "epoch": 0.9330850359016599, + "grad_norm": 1.447023868560791, + "learning_rate": 2.3381163938343776e-06, + "loss": 1.1164, + "step": 26055 + }, + { + "epoch": 0.9331208480312282, + "grad_norm": 1.7131422758102417, + "learning_rate": 2.3356235152656613e-06, + "loss": 1.1485, + "step": 26056 + }, + { + "epoch": 0.9331566601607965, + "grad_norm": 1.784659743309021, + "learning_rate": 2.33313195064514e-06, + "loss": 1.1128, + "step": 26057 + }, + { + "epoch": 0.9331924722903647, + "grad_norm": 1.7415353059768677, + "learning_rate": 2.330641700006353e-06, + "loss": 1.1059, + "step": 26058 + }, + { + "epoch": 0.933228284419933, + "grad_norm": 1.895141839981079, + "learning_rate": 2.328152763382796e-06, + "loss": 1.1102, + "step": 26059 + }, + { + "epoch": 0.9332640965495013, + "grad_norm": 1.7111207246780396, + "learning_rate": 2.325665140807964e-06, + "loss": 1.2129, + "step": 26060 + }, + { + "epoch": 0.9332999086790696, + "grad_norm": 1.2817976474761963, + "learning_rate": 2.323178832315298e-06, + "loss": 1.0506, + "step": 26061 + }, + { + "epoch": 0.9333357208086379, + "grad_norm": 1.8200385570526123, + "learning_rate": 2.3206938379382813e-06, + "loss": 1.2166, + "step": 26062 + }, + { + "epoch": 0.9333715329382062, + "grad_norm": 1.4616608619689941, + "learning_rate": 2.318210157710332e-06, + "loss": 1.0097, + "step": 26063 + }, + { + "epoch": 0.9334073450677745, + "grad_norm": 1.5543333292007446, + "learning_rate": 2.3157277916648567e-06, + "loss": 0.8799, + "step": 26064 + }, + { + "epoch": 0.9334431571973427, + "grad_norm": 1.3548154830932617, + "learning_rate": 2.313246739835262e-06, + "loss": 1.067, + "step": 26065 + }, + { + "epoch": 0.933478969326911, + "grad_norm": 1.5379157066345215, + "learning_rate": 2.3107670022549323e-06, + "loss": 1.2791, + "step": 26066 + }, + { + "epoch": 0.9335147814564793, + "grad_norm": 1.6529771089553833, + "learning_rate": 2.3082885789572182e-06, + "loss": 1.0491, + "step": 26067 + }, + { + "epoch": 0.9335505935860476, + "grad_norm": 1.3808037042617798, + "learning_rate": 2.305811469975472e-06, + "loss": 1.0241, + "step": 26068 + }, + { + "epoch": 0.9335864057156159, + "grad_norm": 1.6234169006347656, + "learning_rate": 2.30333567534301e-06, + "loss": 1.1919, + "step": 26069 + }, + { + "epoch": 0.9336222178451842, + "grad_norm": 1.267075777053833, + "learning_rate": 2.3008611950931404e-06, + "loss": 1.1755, + "step": 26070 + }, + { + "epoch": 0.9336580299747524, + "grad_norm": 1.71636164188385, + "learning_rate": 2.2983880292591798e-06, + "loss": 1.215, + "step": 26071 + }, + { + "epoch": 0.9336938421043207, + "grad_norm": 1.7410836219787598, + "learning_rate": 2.295916177874369e-06, + "loss": 0.9976, + "step": 26072 + }, + { + "epoch": 0.933729654233889, + "grad_norm": 1.8081636428833008, + "learning_rate": 2.2934456409719698e-06, + "loss": 0.9731, + "step": 26073 + }, + { + "epoch": 0.9337654663634573, + "grad_norm": 1.3149116039276123, + "learning_rate": 2.2909764185852447e-06, + "loss": 0.8784, + "step": 26074 + }, + { + "epoch": 0.9338012784930256, + "grad_norm": 1.4690651893615723, + "learning_rate": 2.288508510747389e-06, + "loss": 1.0841, + "step": 26075 + }, + { + "epoch": 0.9338370906225939, + "grad_norm": 1.6939319372177124, + "learning_rate": 2.2860419174916104e-06, + "loss": 1.1586, + "step": 26076 + }, + { + "epoch": 0.9338729027521622, + "grad_norm": 1.3718489408493042, + "learning_rate": 2.2835766388510926e-06, + "loss": 0.9866, + "step": 26077 + }, + { + "epoch": 0.9339087148817304, + "grad_norm": 1.3544167280197144, + "learning_rate": 2.2811126748590207e-06, + "loss": 1.1808, + "step": 26078 + }, + { + "epoch": 0.9339445270112987, + "grad_norm": 1.7179206609725952, + "learning_rate": 2.278650025548512e-06, + "loss": 1.1257, + "step": 26079 + }, + { + "epoch": 0.933980339140867, + "grad_norm": 1.8791018724441528, + "learning_rate": 2.2761886909527187e-06, + "loss": 1.1308, + "step": 26080 + }, + { + "epoch": 0.9340161512704352, + "grad_norm": 1.3653050661087036, + "learning_rate": 2.273728671104769e-06, + "loss": 1.0688, + "step": 26081 + }, + { + "epoch": 0.9340519634000036, + "grad_norm": 1.2656198740005493, + "learning_rate": 2.271269966037726e-06, + "loss": 1.0402, + "step": 26082 + }, + { + "epoch": 0.9340877755295719, + "grad_norm": 1.6022812128067017, + "learning_rate": 2.2688125757846957e-06, + "loss": 1.1004, + "step": 26083 + }, + { + "epoch": 0.9341235876591402, + "grad_norm": 1.675599217414856, + "learning_rate": 2.2663565003787078e-06, + "loss": 1.1946, + "step": 26084 + }, + { + "epoch": 0.9341593997887084, + "grad_norm": 1.213317632675171, + "learning_rate": 2.263901739852847e-06, + "loss": 1.0954, + "step": 26085 + }, + { + "epoch": 0.9341952119182767, + "grad_norm": 1.3256789445877075, + "learning_rate": 2.2614482942400984e-06, + "loss": 0.9017, + "step": 26086 + }, + { + "epoch": 0.934231024047845, + "grad_norm": 1.452691674232483, + "learning_rate": 2.2589961635735015e-06, + "loss": 1.0765, + "step": 26087 + }, + { + "epoch": 0.9342668361774132, + "grad_norm": 1.283446192741394, + "learning_rate": 2.2565453478860297e-06, + "loss": 0.8944, + "step": 26088 + }, + { + "epoch": 0.9343026483069816, + "grad_norm": 1.3076717853546143, + "learning_rate": 2.254095847210669e-06, + "loss": 1.1191, + "step": 26089 + }, + { + "epoch": 0.9343384604365499, + "grad_norm": 1.3181123733520508, + "learning_rate": 2.2516476615803694e-06, + "loss": 0.9816, + "step": 26090 + }, + { + "epoch": 0.9343742725661182, + "grad_norm": 1.482881784439087, + "learning_rate": 2.249200791028039e-06, + "loss": 1.0807, + "step": 26091 + }, + { + "epoch": 0.9344100846956864, + "grad_norm": 1.4572854042053223, + "learning_rate": 2.2467552355866505e-06, + "loss": 1.1288, + "step": 26092 + }, + { + "epoch": 0.9344458968252547, + "grad_norm": 1.3558319807052612, + "learning_rate": 2.2443109952890674e-06, + "loss": 1.0152, + "step": 26093 + }, + { + "epoch": 0.934481708954823, + "grad_norm": 1.3475960493087769, + "learning_rate": 2.241868070168185e-06, + "loss": 0.9645, + "step": 26094 + }, + { + "epoch": 0.9345175210843912, + "grad_norm": 1.9547892808914185, + "learning_rate": 2.239426460256855e-06, + "loss": 0.8966, + "step": 26095 + }, + { + "epoch": 0.9345533332139596, + "grad_norm": 1.794445276260376, + "learning_rate": 2.236986165587951e-06, + "loss": 1.2952, + "step": 26096 + }, + { + "epoch": 0.9345891453435279, + "grad_norm": 1.1457055807113647, + "learning_rate": 2.2345471861942914e-06, + "loss": 0.933, + "step": 26097 + }, + { + "epoch": 0.9346249574730962, + "grad_norm": 1.7232935428619385, + "learning_rate": 2.232109522108694e-06, + "loss": 1.0474, + "step": 26098 + }, + { + "epoch": 0.9346607696026644, + "grad_norm": 1.4633409976959229, + "learning_rate": 2.2296731733639552e-06, + "loss": 0.9488, + "step": 26099 + }, + { + "epoch": 0.9346965817322327, + "grad_norm": 1.6185460090637207, + "learning_rate": 2.227238139992849e-06, + "loss": 0.9903, + "step": 26100 + }, + { + "epoch": 0.934732393861801, + "grad_norm": 1.5781183242797852, + "learning_rate": 2.224804422028137e-06, + "loss": 1.0724, + "step": 26101 + }, + { + "epoch": 0.9347682059913692, + "grad_norm": 1.4467964172363281, + "learning_rate": 2.2223720195025386e-06, + "loss": 1.3912, + "step": 26102 + }, + { + "epoch": 0.9348040181209376, + "grad_norm": 1.3825221061706543, + "learning_rate": 2.2199409324488275e-06, + "loss": 0.9624, + "step": 26103 + }, + { + "epoch": 0.9348398302505059, + "grad_norm": 1.6721261739730835, + "learning_rate": 2.2175111608996657e-06, + "loss": 1.0919, + "step": 26104 + }, + { + "epoch": 0.9348756423800741, + "grad_norm": 1.4776352643966675, + "learning_rate": 2.215082704887772e-06, + "loss": 1.2281, + "step": 26105 + }, + { + "epoch": 0.9349114545096424, + "grad_norm": 1.421209692955017, + "learning_rate": 2.212655564445798e-06, + "loss": 0.9728, + "step": 26106 + }, + { + "epoch": 0.9349472666392107, + "grad_norm": 1.4628523588180542, + "learning_rate": 2.2102297396064176e-06, + "loss": 1.058, + "step": 26107 + }, + { + "epoch": 0.934983078768779, + "grad_norm": 1.3293620347976685, + "learning_rate": 2.20780523040226e-06, + "loss": 0.8656, + "step": 26108 + }, + { + "epoch": 0.9350188908983472, + "grad_norm": 1.5419703722000122, + "learning_rate": 2.2053820368659215e-06, + "loss": 1.0648, + "step": 26109 + }, + { + "epoch": 0.9350547030279156, + "grad_norm": 1.2848211526870728, + "learning_rate": 2.202960159030032e-06, + "loss": 1.0047, + "step": 26110 + }, + { + "epoch": 0.9350905151574839, + "grad_norm": 1.5018150806427002, + "learning_rate": 2.200539596927165e-06, + "loss": 1.0263, + "step": 26111 + }, + { + "epoch": 0.9351263272870521, + "grad_norm": 1.3323642015457153, + "learning_rate": 2.1981203505898827e-06, + "loss": 1.1133, + "step": 26112 + }, + { + "epoch": 0.9351621394166204, + "grad_norm": 1.4464865922927856, + "learning_rate": 2.195702420050727e-06, + "loss": 1.317, + "step": 26113 + }, + { + "epoch": 0.9351979515461887, + "grad_norm": 1.6609617471694946, + "learning_rate": 2.19328580534226e-06, + "loss": 0.9478, + "step": 26114 + }, + { + "epoch": 0.935233763675757, + "grad_norm": 1.4674732685089111, + "learning_rate": 2.190870506496956e-06, + "loss": 1.0702, + "step": 26115 + }, + { + "epoch": 0.9352695758053252, + "grad_norm": 1.3589822053909302, + "learning_rate": 2.188456523547322e-06, + "loss": 1.0773, + "step": 26116 + }, + { + "epoch": 0.9353053879348936, + "grad_norm": 1.4080945253372192, + "learning_rate": 2.1860438565258433e-06, + "loss": 0.8364, + "step": 26117 + }, + { + "epoch": 0.9353412000644619, + "grad_norm": 1.5475438833236694, + "learning_rate": 2.183632505464972e-06, + "loss": 1.0568, + "step": 26118 + }, + { + "epoch": 0.9353770121940301, + "grad_norm": 1.4777534008026123, + "learning_rate": 2.1812224703971597e-06, + "loss": 1.0803, + "step": 26119 + }, + { + "epoch": 0.9354128243235984, + "grad_norm": 1.48246169090271, + "learning_rate": 2.1788137513548134e-06, + "loss": 0.9199, + "step": 26120 + }, + { + "epoch": 0.9354486364531667, + "grad_norm": 1.404507040977478, + "learning_rate": 2.176406348370341e-06, + "loss": 1.0411, + "step": 26121 + }, + { + "epoch": 0.9354844485827349, + "grad_norm": 1.578223705291748, + "learning_rate": 2.174000261476139e-06, + "loss": 1.0576, + "step": 26122 + }, + { + "epoch": 0.9355202607123032, + "grad_norm": 1.4669468402862549, + "learning_rate": 2.171595490704592e-06, + "loss": 1.0293, + "step": 26123 + }, + { + "epoch": 0.9355560728418716, + "grad_norm": 1.5578361749649048, + "learning_rate": 2.1691920360880303e-06, + "loss": 0.8898, + "step": 26124 + }, + { + "epoch": 0.9355918849714399, + "grad_norm": 1.492895245552063, + "learning_rate": 2.166789897658794e-06, + "loss": 1.1627, + "step": 26125 + }, + { + "epoch": 0.9356276971010081, + "grad_norm": 1.2892550230026245, + "learning_rate": 2.1643890754492136e-06, + "loss": 1.0106, + "step": 26126 + }, + { + "epoch": 0.9356635092305764, + "grad_norm": 1.3054184913635254, + "learning_rate": 2.1619895694915624e-06, + "loss": 1.3424, + "step": 26127 + }, + { + "epoch": 0.9356993213601447, + "grad_norm": 1.4210716485977173, + "learning_rate": 2.159591379818149e-06, + "loss": 1.1014, + "step": 26128 + }, + { + "epoch": 0.9357351334897129, + "grad_norm": 1.4805470705032349, + "learning_rate": 2.1571945064612243e-06, + "loss": 0.927, + "step": 26129 + }, + { + "epoch": 0.9357709456192812, + "grad_norm": 1.4112111330032349, + "learning_rate": 2.1547989494530517e-06, + "loss": 1.0247, + "step": 26130 + }, + { + "epoch": 0.9358067577488496, + "grad_norm": 1.3743585348129272, + "learning_rate": 2.1524047088258394e-06, + "loss": 1.1844, + "step": 26131 + }, + { + "epoch": 0.9358425698784179, + "grad_norm": 1.389716386795044, + "learning_rate": 2.1500117846118053e-06, + "loss": 0.9512, + "step": 26132 + }, + { + "epoch": 0.9358783820079861, + "grad_norm": 1.434070110321045, + "learning_rate": 2.147620176843157e-06, + "loss": 0.8262, + "step": 26133 + }, + { + "epoch": 0.9359141941375544, + "grad_norm": 1.484706997871399, + "learning_rate": 2.145229885552047e-06, + "loss": 1.1612, + "step": 26134 + }, + { + "epoch": 0.9359500062671227, + "grad_norm": 1.8005589246749878, + "learning_rate": 2.142840910770638e-06, + "loss": 0.9696, + "step": 26135 + }, + { + "epoch": 0.9359858183966909, + "grad_norm": 3.8804192543029785, + "learning_rate": 2.140453252531083e-06, + "loss": 1.1277, + "step": 26136 + }, + { + "epoch": 0.9360216305262592, + "grad_norm": 1.2971798181533813, + "learning_rate": 2.1380669108655105e-06, + "loss": 0.9269, + "step": 26137 + }, + { + "epoch": 0.9360574426558276, + "grad_norm": 1.5656871795654297, + "learning_rate": 2.135681885806007e-06, + "loss": 1.1267, + "step": 26138 + }, + { + "epoch": 0.9360932547853958, + "grad_norm": 1.326291799545288, + "learning_rate": 2.133298177384668e-06, + "loss": 1.0237, + "step": 26139 + }, + { + "epoch": 0.9361290669149641, + "grad_norm": 1.2478082180023193, + "learning_rate": 2.1309157856335694e-06, + "loss": 1.2274, + "step": 26140 + }, + { + "epoch": 0.9361648790445324, + "grad_norm": 1.8521485328674316, + "learning_rate": 2.128534710584751e-06, + "loss": 0.9167, + "step": 26141 + }, + { + "epoch": 0.9362006911741007, + "grad_norm": 1.3024741411209106, + "learning_rate": 2.126154952270254e-06, + "loss": 1.1644, + "step": 26142 + }, + { + "epoch": 0.9362365033036689, + "grad_norm": 1.3753342628479004, + "learning_rate": 2.1237765107220973e-06, + "loss": 0.882, + "step": 26143 + }, + { + "epoch": 0.9362723154332372, + "grad_norm": 1.4893054962158203, + "learning_rate": 2.121399385972278e-06, + "loss": 0.8231, + "step": 26144 + }, + { + "epoch": 0.9363081275628056, + "grad_norm": 1.193023443222046, + "learning_rate": 2.11902357805277e-06, + "loss": 0.7717, + "step": 26145 + }, + { + "epoch": 0.9363439396923738, + "grad_norm": 1.4092717170715332, + "learning_rate": 2.116649086995537e-06, + "loss": 1.1752, + "step": 26146 + }, + { + "epoch": 0.9363797518219421, + "grad_norm": 1.509537935256958, + "learning_rate": 2.1142759128325306e-06, + "loss": 1.0281, + "step": 26147 + }, + { + "epoch": 0.9364155639515104, + "grad_norm": 2.0235960483551025, + "learning_rate": 2.1119040555956925e-06, + "loss": 1.008, + "step": 26148 + }, + { + "epoch": 0.9364513760810786, + "grad_norm": 1.4420888423919678, + "learning_rate": 2.109533515316908e-06, + "loss": 0.9166, + "step": 26149 + }, + { + "epoch": 0.9364871882106469, + "grad_norm": 1.5565520524978638, + "learning_rate": 2.1071642920280855e-06, + "loss": 1.0244, + "step": 26150 + }, + { + "epoch": 0.9365230003402152, + "grad_norm": 1.4260152578353882, + "learning_rate": 2.1047963857610986e-06, + "loss": 1.0233, + "step": 26151 + }, + { + "epoch": 0.9365588124697836, + "grad_norm": 1.6853601932525635, + "learning_rate": 2.102429796547789e-06, + "loss": 1.1843, + "step": 26152 + }, + { + "epoch": 0.9365946245993518, + "grad_norm": 1.3154640197753906, + "learning_rate": 2.100064524420009e-06, + "loss": 0.9043, + "step": 26153 + }, + { + "epoch": 0.9366304367289201, + "grad_norm": 1.461164951324463, + "learning_rate": 2.0977005694095774e-06, + "loss": 0.9671, + "step": 26154 + }, + { + "epoch": 0.9366662488584884, + "grad_norm": 1.7110333442687988, + "learning_rate": 2.0953379315483134e-06, + "loss": 0.9349, + "step": 26155 + }, + { + "epoch": 0.9367020609880566, + "grad_norm": 1.3615639209747314, + "learning_rate": 2.0929766108679803e-06, + "loss": 1.0532, + "step": 26156 + }, + { + "epoch": 0.9367378731176249, + "grad_norm": 1.5717586278915405, + "learning_rate": 2.0906166074003532e-06, + "loss": 0.9422, + "step": 26157 + }, + { + "epoch": 0.9367736852471932, + "grad_norm": 1.4468175172805786, + "learning_rate": 2.0882579211771837e-06, + "loss": 1.102, + "step": 26158 + }, + { + "epoch": 0.9368094973767616, + "grad_norm": 1.7994989156723022, + "learning_rate": 2.0859005522302245e-06, + "loss": 1.1277, + "step": 26159 + }, + { + "epoch": 0.9368453095063298, + "grad_norm": 1.3909821510314941, + "learning_rate": 2.0835445005911503e-06, + "loss": 1.0921, + "step": 26160 + }, + { + "epoch": 0.9368811216358981, + "grad_norm": 1.3321486711502075, + "learning_rate": 2.081189766291691e-06, + "loss": 0.9599, + "step": 26161 + }, + { + "epoch": 0.9369169337654664, + "grad_norm": 1.6747996807098389, + "learning_rate": 2.0788363493635333e-06, + "loss": 1.1493, + "step": 26162 + }, + { + "epoch": 0.9369527458950346, + "grad_norm": 1.814993143081665, + "learning_rate": 2.0764842498383063e-06, + "loss": 1.0231, + "step": 26163 + }, + { + "epoch": 0.9369885580246029, + "grad_norm": 1.236899733543396, + "learning_rate": 2.074133467747663e-06, + "loss": 1.1583, + "step": 26164 + }, + { + "epoch": 0.9370243701541712, + "grad_norm": 1.1932798624038696, + "learning_rate": 2.071784003123256e-06, + "loss": 1.0226, + "step": 26165 + }, + { + "epoch": 0.9370601822837396, + "grad_norm": 1.5959324836730957, + "learning_rate": 2.069435855996671e-06, + "loss": 1.086, + "step": 26166 + }, + { + "epoch": 0.9370959944133078, + "grad_norm": 1.304997205734253, + "learning_rate": 2.0670890263995047e-06, + "loss": 0.9092, + "step": 26167 + }, + { + "epoch": 0.9371318065428761, + "grad_norm": 1.373839259147644, + "learning_rate": 2.0647435143633322e-06, + "loss": 1.0318, + "step": 26168 + }, + { + "epoch": 0.9371676186724444, + "grad_norm": 1.377760410308838, + "learning_rate": 2.0623993199197055e-06, + "loss": 1.0462, + "step": 26169 + }, + { + "epoch": 0.9372034308020126, + "grad_norm": 1.6163997650146484, + "learning_rate": 2.0600564431001668e-06, + "loss": 1.0697, + "step": 26170 + }, + { + "epoch": 0.9372392429315809, + "grad_norm": 1.605126142501831, + "learning_rate": 2.057714883936235e-06, + "loss": 1.0138, + "step": 26171 + }, + { + "epoch": 0.9372750550611492, + "grad_norm": 1.5633628368377686, + "learning_rate": 2.0553746424594065e-06, + "loss": 0.9392, + "step": 26172 + }, + { + "epoch": 0.9373108671907175, + "grad_norm": 1.5611668825149536, + "learning_rate": 2.0530357187011907e-06, + "loss": 1.0743, + "step": 26173 + }, + { + "epoch": 0.9373466793202858, + "grad_norm": 1.31123685836792, + "learning_rate": 2.050698112693028e-06, + "loss": 0.8456, + "step": 26174 + }, + { + "epoch": 0.9373824914498541, + "grad_norm": 1.8766082525253296, + "learning_rate": 2.0483618244663714e-06, + "loss": 1.1008, + "step": 26175 + }, + { + "epoch": 0.9374183035794224, + "grad_norm": 1.5821839570999146, + "learning_rate": 2.0460268540526518e-06, + "loss": 1.0513, + "step": 26176 + }, + { + "epoch": 0.9374541157089906, + "grad_norm": 1.793157696723938, + "learning_rate": 2.04369320148331e-06, + "loss": 1.1698, + "step": 26177 + }, + { + "epoch": 0.9374899278385589, + "grad_norm": 1.4554599523544312, + "learning_rate": 2.041360866789721e-06, + "loss": 0.9012, + "step": 26178 + }, + { + "epoch": 0.9375257399681272, + "grad_norm": 1.9890332221984863, + "learning_rate": 2.0390298500032377e-06, + "loss": 0.9583, + "step": 26179 + }, + { + "epoch": 0.9375615520976955, + "grad_norm": 1.5786820650100708, + "learning_rate": 2.0367001511552685e-06, + "loss": 1.1528, + "step": 26180 + }, + { + "epoch": 0.9375973642272638, + "grad_norm": 1.466352105140686, + "learning_rate": 2.0343717702771325e-06, + "loss": 0.9926, + "step": 26181 + }, + { + "epoch": 0.9376331763568321, + "grad_norm": 1.3010789155960083, + "learning_rate": 2.0320447074001492e-06, + "loss": 1.2401, + "step": 26182 + }, + { + "epoch": 0.9376689884864003, + "grad_norm": 1.833201289176941, + "learning_rate": 2.0297189625556377e-06, + "loss": 0.9591, + "step": 26183 + }, + { + "epoch": 0.9377048006159686, + "grad_norm": 1.6087697744369507, + "learning_rate": 2.027394535774896e-06, + "loss": 1.0942, + "step": 26184 + }, + { + "epoch": 0.9377406127455369, + "grad_norm": 1.6266722679138184, + "learning_rate": 2.0250714270891757e-06, + "loss": 1.1808, + "step": 26185 + }, + { + "epoch": 0.9377764248751052, + "grad_norm": 1.3108078241348267, + "learning_rate": 2.0227496365297304e-06, + "loss": 0.9372, + "step": 26186 + }, + { + "epoch": 0.9378122370046735, + "grad_norm": 1.1305129528045654, + "learning_rate": 2.020429164127835e-06, + "loss": 0.7992, + "step": 26187 + }, + { + "epoch": 0.9378480491342418, + "grad_norm": 1.4537200927734375, + "learning_rate": 2.0181100099146533e-06, + "loss": 0.8948, + "step": 26188 + }, + { + "epoch": 0.9378838612638101, + "grad_norm": 1.4269438982009888, + "learning_rate": 2.015792173921438e-06, + "loss": 0.9869, + "step": 26189 + }, + { + "epoch": 0.9379196733933783, + "grad_norm": 1.4576621055603027, + "learning_rate": 2.01347565617932e-06, + "loss": 1.1066, + "step": 26190 + }, + { + "epoch": 0.9379554855229466, + "grad_norm": 1.4011871814727783, + "learning_rate": 2.0111604567195185e-06, + "loss": 0.9575, + "step": 26191 + }, + { + "epoch": 0.9379912976525149, + "grad_norm": 1.3189853429794312, + "learning_rate": 2.008846575573142e-06, + "loss": 0.8454, + "step": 26192 + }, + { + "epoch": 0.9380271097820831, + "grad_norm": 1.3509697914123535, + "learning_rate": 2.006534012771344e-06, + "loss": 1.1161, + "step": 26193 + }, + { + "epoch": 0.9380629219116515, + "grad_norm": 1.2195417881011963, + "learning_rate": 2.004222768345221e-06, + "loss": 1.0751, + "step": 26194 + }, + { + "epoch": 0.9380987340412198, + "grad_norm": 1.2766445875167847, + "learning_rate": 2.0019128423258816e-06, + "loss": 1.0295, + "step": 26195 + }, + { + "epoch": 0.9381345461707881, + "grad_norm": 1.43094003200531, + "learning_rate": 1.999604234744401e-06, + "loss": 1.1074, + "step": 26196 + }, + { + "epoch": 0.9381703583003563, + "grad_norm": 2.247840166091919, + "learning_rate": 1.99729694563181e-06, + "loss": 1.2326, + "step": 26197 + }, + { + "epoch": 0.9382061704299246, + "grad_norm": 1.5764765739440918, + "learning_rate": 1.9949909750192064e-06, + "loss": 1.0537, + "step": 26198 + }, + { + "epoch": 0.9382419825594929, + "grad_norm": 1.3089076280593872, + "learning_rate": 1.992686322937565e-06, + "loss": 0.9692, + "step": 26199 + }, + { + "epoch": 0.9382777946890611, + "grad_norm": 1.277284026145935, + "learning_rate": 1.990382989417916e-06, + "loss": 1.113, + "step": 26200 + }, + { + "epoch": 0.9383136068186295, + "grad_norm": 1.3217264413833618, + "learning_rate": 1.9880809744912244e-06, + "loss": 1.0321, + "step": 26201 + }, + { + "epoch": 0.9383494189481978, + "grad_norm": 1.236489176750183, + "learning_rate": 1.985780278188487e-06, + "loss": 1.1273, + "step": 26202 + }, + { + "epoch": 0.9383852310777661, + "grad_norm": 1.5481083393096924, + "learning_rate": 1.983480900540646e-06, + "loss": 1.2091, + "step": 26203 + }, + { + "epoch": 0.9384210432073343, + "grad_norm": 1.740814447402954, + "learning_rate": 1.981182841578644e-06, + "loss": 1.1742, + "step": 26204 + }, + { + "epoch": 0.9384568553369026, + "grad_norm": 1.6152865886688232, + "learning_rate": 1.978886101333388e-06, + "loss": 0.801, + "step": 26205 + }, + { + "epoch": 0.9384926674664709, + "grad_norm": 1.5377569198608398, + "learning_rate": 1.9765906798357767e-06, + "loss": 1.0837, + "step": 26206 + }, + { + "epoch": 0.9385284795960391, + "grad_norm": 1.3767787218093872, + "learning_rate": 1.9742965771167077e-06, + "loss": 1.1098, + "step": 26207 + }, + { + "epoch": 0.9385642917256075, + "grad_norm": 1.3609750270843506, + "learning_rate": 1.972003793207011e-06, + "loss": 1.0977, + "step": 26208 + }, + { + "epoch": 0.9386001038551758, + "grad_norm": 1.3003911972045898, + "learning_rate": 1.969712328137574e-06, + "loss": 0.9573, + "step": 26209 + }, + { + "epoch": 0.938635915984744, + "grad_norm": 1.5599595308303833, + "learning_rate": 1.967422181939205e-06, + "loss": 1.0703, + "step": 26210 + }, + { + "epoch": 0.9386717281143123, + "grad_norm": 1.272394061088562, + "learning_rate": 1.9651333546427232e-06, + "loss": 1.0306, + "step": 26211 + }, + { + "epoch": 0.9387075402438806, + "grad_norm": 1.2800360918045044, + "learning_rate": 1.9628458462789044e-06, + "loss": 0.8966, + "step": 26212 + }, + { + "epoch": 0.9387433523734489, + "grad_norm": 1.2292267084121704, + "learning_rate": 1.960559656878547e-06, + "loss": 1.0454, + "step": 26213 + }, + { + "epoch": 0.9387791645030171, + "grad_norm": 1.2152087688446045, + "learning_rate": 1.9582747864723917e-06, + "loss": 0.94, + "step": 26214 + }, + { + "epoch": 0.9388149766325855, + "grad_norm": 1.2165197134017944, + "learning_rate": 1.9559912350911925e-06, + "loss": 1.0175, + "step": 26215 + }, + { + "epoch": 0.9388507887621538, + "grad_norm": 1.3997658491134644, + "learning_rate": 1.953709002765647e-06, + "loss": 1.0122, + "step": 26216 + }, + { + "epoch": 0.938886600891722, + "grad_norm": 1.6473102569580078, + "learning_rate": 1.951428089526486e-06, + "loss": 1.2579, + "step": 26217 + }, + { + "epoch": 0.9389224130212903, + "grad_norm": 1.3513193130493164, + "learning_rate": 1.949148495404396e-06, + "loss": 1.0051, + "step": 26218 + }, + { + "epoch": 0.9389582251508586, + "grad_norm": 1.3806369304656982, + "learning_rate": 1.9468702204300195e-06, + "loss": 1.1598, + "step": 26219 + }, + { + "epoch": 0.9389940372804269, + "grad_norm": 2.641688585281372, + "learning_rate": 1.9445932646340314e-06, + "loss": 1.1341, + "step": 26220 + }, + { + "epoch": 0.9390298494099951, + "grad_norm": 1.5127032995224, + "learning_rate": 1.9423176280470633e-06, + "loss": 1.1998, + "step": 26221 + }, + { + "epoch": 0.9390656615395635, + "grad_norm": 1.528965950012207, + "learning_rate": 1.940043310699724e-06, + "loss": 0.9863, + "step": 26222 + }, + { + "epoch": 0.9391014736691318, + "grad_norm": 1.564592957496643, + "learning_rate": 1.937770312622611e-06, + "loss": 1.161, + "step": 26223 + }, + { + "epoch": 0.9391372857987, + "grad_norm": 1.5303412675857544, + "learning_rate": 1.9354986338463e-06, + "loss": 0.9307, + "step": 26224 + }, + { + "epoch": 0.9391730979282683, + "grad_norm": 1.3648637533187866, + "learning_rate": 1.9332282744013774e-06, + "loss": 1.2092, + "step": 26225 + }, + { + "epoch": 0.9392089100578366, + "grad_norm": 1.6459343433380127, + "learning_rate": 1.9309592343183636e-06, + "loss": 1.1968, + "step": 26226 + }, + { + "epoch": 0.9392447221874048, + "grad_norm": 1.1953078508377075, + "learning_rate": 1.9286915136277894e-06, + "loss": 1.0946, + "step": 26227 + }, + { + "epoch": 0.9392805343169731, + "grad_norm": 1.6554241180419922, + "learning_rate": 1.926425112360164e-06, + "loss": 0.89, + "step": 26228 + }, + { + "epoch": 0.9393163464465415, + "grad_norm": 2.2220816612243652, + "learning_rate": 1.924160030545996e-06, + "loss": 1.1018, + "step": 26229 + }, + { + "epoch": 0.9393521585761098, + "grad_norm": 1.2327500581741333, + "learning_rate": 1.9218962682157395e-06, + "loss": 1.1815, + "step": 26230 + }, + { + "epoch": 0.939387970705678, + "grad_norm": 1.224300503730774, + "learning_rate": 1.919633825399858e-06, + "loss": 0.9903, + "step": 26231 + }, + { + "epoch": 0.9394237828352463, + "grad_norm": 1.5020177364349365, + "learning_rate": 1.9173727021287947e-06, + "loss": 1.0539, + "step": 26232 + }, + { + "epoch": 0.9394595949648146, + "grad_norm": 1.611060380935669, + "learning_rate": 1.915112898432947e-06, + "loss": 1.1695, + "step": 26233 + }, + { + "epoch": 0.9394954070943828, + "grad_norm": 1.3516463041305542, + "learning_rate": 1.9128544143427463e-06, + "loss": 1.1309, + "step": 26234 + }, + { + "epoch": 0.9395312192239511, + "grad_norm": 1.5320965051651, + "learning_rate": 1.910597249888568e-06, + "loss": 1.1001, + "step": 26235 + }, + { + "epoch": 0.9395670313535195, + "grad_norm": 1.4921956062316895, + "learning_rate": 1.9083414051007776e-06, + "loss": 1.2484, + "step": 26236 + }, + { + "epoch": 0.9396028434830878, + "grad_norm": 1.4493125677108765, + "learning_rate": 1.9060868800097164e-06, + "loss": 0.9368, + "step": 26237 + }, + { + "epoch": 0.939638655612656, + "grad_norm": 1.3463622331619263, + "learning_rate": 1.9038336746457276e-06, + "loss": 0.9277, + "step": 26238 + }, + { + "epoch": 0.9396744677422243, + "grad_norm": 1.5903865098953247, + "learning_rate": 1.9015817890391308e-06, + "loss": 0.9475, + "step": 26239 + }, + { + "epoch": 0.9397102798717926, + "grad_norm": 1.5806812047958374, + "learning_rate": 1.8993312232202021e-06, + "loss": 1.1791, + "step": 26240 + }, + { + "epoch": 0.9397460920013608, + "grad_norm": 1.2804021835327148, + "learning_rate": 1.8970819772192394e-06, + "loss": 1.111, + "step": 26241 + }, + { + "epoch": 0.9397819041309291, + "grad_norm": 1.3086806535720825, + "learning_rate": 1.8948340510664853e-06, + "loss": 0.8764, + "step": 26242 + }, + { + "epoch": 0.9398177162604975, + "grad_norm": 1.3680119514465332, + "learning_rate": 1.8925874447922044e-06, + "loss": 0.9876, + "step": 26243 + }, + { + "epoch": 0.9398535283900658, + "grad_norm": 1.6577202081680298, + "learning_rate": 1.8903421584266056e-06, + "loss": 1.0681, + "step": 26244 + }, + { + "epoch": 0.939889340519634, + "grad_norm": 1.2989683151245117, + "learning_rate": 1.8880981919998875e-06, + "loss": 0.9711, + "step": 26245 + }, + { + "epoch": 0.9399251526492023, + "grad_norm": 1.4168033599853516, + "learning_rate": 1.8858555455422699e-06, + "loss": 1.1471, + "step": 26246 + }, + { + "epoch": 0.9399609647787706, + "grad_norm": 1.5787464380264282, + "learning_rate": 1.8836142190839067e-06, + "loss": 1.1077, + "step": 26247 + }, + { + "epoch": 0.9399967769083388, + "grad_norm": 1.5083383321762085, + "learning_rate": 1.8813742126549404e-06, + "loss": 1.1299, + "step": 26248 + }, + { + "epoch": 0.9400325890379071, + "grad_norm": 1.4773192405700684, + "learning_rate": 1.879135526285525e-06, + "loss": 0.96, + "step": 26249 + }, + { + "epoch": 0.9400684011674755, + "grad_norm": 1.2967054843902588, + "learning_rate": 1.876898160005791e-06, + "loss": 0.9568, + "step": 26250 + }, + { + "epoch": 0.9401042132970437, + "grad_norm": 1.7102746963500977, + "learning_rate": 1.8746621138458042e-06, + "loss": 1.0395, + "step": 26251 + }, + { + "epoch": 0.940140025426612, + "grad_norm": 1.8429911136627197, + "learning_rate": 1.8724273878356624e-06, + "loss": 0.9553, + "step": 26252 + }, + { + "epoch": 0.9401758375561803, + "grad_norm": 1.5397028923034668, + "learning_rate": 1.8701939820054414e-06, + "loss": 1.1043, + "step": 26253 + }, + { + "epoch": 0.9402116496857486, + "grad_norm": 1.3354508876800537, + "learning_rate": 1.8679618963851952e-06, + "loss": 1.0986, + "step": 26254 + }, + { + "epoch": 0.9402474618153168, + "grad_norm": 1.269497275352478, + "learning_rate": 1.8657311310049218e-06, + "loss": 1.146, + "step": 26255 + }, + { + "epoch": 0.9402832739448851, + "grad_norm": 1.2328861951828003, + "learning_rate": 1.863501685894664e-06, + "loss": 1.015, + "step": 26256 + }, + { + "epoch": 0.9403190860744535, + "grad_norm": 1.8507310152053833, + "learning_rate": 1.861273561084398e-06, + "loss": 1.0885, + "step": 26257 + }, + { + "epoch": 0.9403548982040217, + "grad_norm": 1.6185568571090698, + "learning_rate": 1.8590467566041104e-06, + "loss": 1.2249, + "step": 26258 + }, + { + "epoch": 0.94039071033359, + "grad_norm": 1.4368503093719482, + "learning_rate": 1.8568212724837442e-06, + "loss": 0.9922, + "step": 26259 + }, + { + "epoch": 0.9404265224631583, + "grad_norm": 1.4440635442733765, + "learning_rate": 1.8545971087532644e-06, + "loss": 1.2849, + "step": 26260 + }, + { + "epoch": 0.9404623345927265, + "grad_norm": 1.5046321153640747, + "learning_rate": 1.8523742654425802e-06, + "loss": 1.0192, + "step": 26261 + }, + { + "epoch": 0.9404981467222948, + "grad_norm": 1.5194706916809082, + "learning_rate": 1.8501527425816012e-06, + "loss": 1.1843, + "step": 26262 + }, + { + "epoch": 0.9405339588518631, + "grad_norm": 1.5926951169967651, + "learning_rate": 1.8479325402002034e-06, + "loss": 1.1785, + "step": 26263 + }, + { + "epoch": 0.9405697709814315, + "grad_norm": 1.9775844812393188, + "learning_rate": 1.8457136583282741e-06, + "loss": 1.1168, + "step": 26264 + }, + { + "epoch": 0.9406055831109997, + "grad_norm": 1.5321089029312134, + "learning_rate": 1.8434960969956561e-06, + "loss": 1.1417, + "step": 26265 + }, + { + "epoch": 0.940641395240568, + "grad_norm": 2.5407378673553467, + "learning_rate": 1.8412798562321809e-06, + "loss": 0.913, + "step": 26266 + }, + { + "epoch": 0.9406772073701363, + "grad_norm": 1.2168337106704712, + "learning_rate": 1.8390649360676692e-06, + "loss": 1.1215, + "step": 26267 + }, + { + "epoch": 0.9407130194997045, + "grad_norm": 1.2902650833129883, + "learning_rate": 1.8368513365319306e-06, + "loss": 0.8271, + "step": 26268 + }, + { + "epoch": 0.9407488316292728, + "grad_norm": 1.5769433975219727, + "learning_rate": 1.83463905765473e-06, + "loss": 1.1686, + "step": 26269 + }, + { + "epoch": 0.9407846437588411, + "grad_norm": 1.2923320531845093, + "learning_rate": 1.8324280994658327e-06, + "loss": 1.0265, + "step": 26270 + }, + { + "epoch": 0.9408204558884095, + "grad_norm": 1.341404676437378, + "learning_rate": 1.8302184619949925e-06, + "loss": 0.989, + "step": 26271 + }, + { + "epoch": 0.9408562680179777, + "grad_norm": 1.7614212036132812, + "learning_rate": 1.8280101452719412e-06, + "loss": 0.9528, + "step": 26272 + }, + { + "epoch": 0.940892080147546, + "grad_norm": 1.3284153938293457, + "learning_rate": 1.825803149326366e-06, + "loss": 1.1134, + "step": 26273 + }, + { + "epoch": 0.9409278922771143, + "grad_norm": 1.1988111734390259, + "learning_rate": 1.8235974741879769e-06, + "loss": 0.9643, + "step": 26274 + }, + { + "epoch": 0.9409637044066825, + "grad_norm": 1.3646955490112305, + "learning_rate": 1.8213931198864608e-06, + "loss": 1.2106, + "step": 26275 + }, + { + "epoch": 0.9409995165362508, + "grad_norm": 1.6587092876434326, + "learning_rate": 1.8191900864514388e-06, + "loss": 0.9538, + "step": 26276 + }, + { + "epoch": 0.9410353286658191, + "grad_norm": 1.4203377962112427, + "learning_rate": 1.816988373912587e-06, + "loss": 1.0624, + "step": 26277 + }, + { + "epoch": 0.9410711407953875, + "grad_norm": 1.5972601175308228, + "learning_rate": 1.8147879822994928e-06, + "loss": 1.1148, + "step": 26278 + }, + { + "epoch": 0.9411069529249557, + "grad_norm": 1.3725007772445679, + "learning_rate": 1.8125889116417883e-06, + "loss": 0.9636, + "step": 26279 + }, + { + "epoch": 0.941142765054524, + "grad_norm": 1.6476761102676392, + "learning_rate": 1.8103911619690384e-06, + "loss": 1.1623, + "step": 26280 + }, + { + "epoch": 0.9411785771840923, + "grad_norm": 1.6385835409164429, + "learning_rate": 1.8081947333108195e-06, + "loss": 1.117, + "step": 26281 + }, + { + "epoch": 0.9412143893136605, + "grad_norm": 1.6111719608306885, + "learning_rate": 1.805999625696686e-06, + "loss": 1.1573, + "step": 26282 + }, + { + "epoch": 0.9412502014432288, + "grad_norm": 1.522313117980957, + "learning_rate": 1.8038058391561697e-06, + "loss": 0.9123, + "step": 26283 + }, + { + "epoch": 0.9412860135727971, + "grad_norm": 1.701704978942871, + "learning_rate": 1.8016133737187913e-06, + "loss": 1.0028, + "step": 26284 + }, + { + "epoch": 0.9413218257023654, + "grad_norm": 1.9187231063842773, + "learning_rate": 1.799422229414016e-06, + "loss": 0.8464, + "step": 26285 + }, + { + "epoch": 0.9413576378319337, + "grad_norm": 1.3187730312347412, + "learning_rate": 1.7972324062713652e-06, + "loss": 1.1402, + "step": 26286 + }, + { + "epoch": 0.941393449961502, + "grad_norm": 1.3791860342025757, + "learning_rate": 1.7950439043202593e-06, + "loss": 1.0482, + "step": 26287 + }, + { + "epoch": 0.9414292620910703, + "grad_norm": 1.5583369731903076, + "learning_rate": 1.7928567235901861e-06, + "loss": 1.1784, + "step": 26288 + }, + { + "epoch": 0.9414650742206385, + "grad_norm": 1.322001338005066, + "learning_rate": 1.790670864110522e-06, + "loss": 0.9797, + "step": 26289 + }, + { + "epoch": 0.9415008863502068, + "grad_norm": 1.4694774150848389, + "learning_rate": 1.7884863259107209e-06, + "loss": 1.2083, + "step": 26290 + }, + { + "epoch": 0.9415366984797751, + "grad_norm": 1.4322359561920166, + "learning_rate": 1.7863031090201377e-06, + "loss": 1.1469, + "step": 26291 + }, + { + "epoch": 0.9415725106093434, + "grad_norm": 1.4167797565460205, + "learning_rate": 1.7841212134681705e-06, + "loss": 1.0525, + "step": 26292 + }, + { + "epoch": 0.9416083227389117, + "grad_norm": 1.6510577201843262, + "learning_rate": 1.781940639284163e-06, + "loss": 1.1492, + "step": 26293 + }, + { + "epoch": 0.94164413486848, + "grad_norm": 1.74041748046875, + "learning_rate": 1.7797613864974472e-06, + "loss": 1.1798, + "step": 26294 + }, + { + "epoch": 0.9416799469980482, + "grad_norm": 1.882664680480957, + "learning_rate": 1.7775834551373548e-06, + "loss": 0.7583, + "step": 26295 + }, + { + "epoch": 0.9417157591276165, + "grad_norm": 2.1336829662323, + "learning_rate": 1.775406845233163e-06, + "loss": 0.9302, + "step": 26296 + }, + { + "epoch": 0.9417515712571848, + "grad_norm": 1.4518365859985352, + "learning_rate": 1.7732315568141811e-06, + "loss": 0.9795, + "step": 26297 + }, + { + "epoch": 0.941787383386753, + "grad_norm": 1.754638671875, + "learning_rate": 1.7710575899096637e-06, + "loss": 1.048, + "step": 26298 + }, + { + "epoch": 0.9418231955163214, + "grad_norm": 1.2293026447296143, + "learning_rate": 1.7688849445488654e-06, + "loss": 0.8964, + "step": 26299 + }, + { + "epoch": 0.9418590076458897, + "grad_norm": 1.3315625190734863, + "learning_rate": 1.7667136207609958e-06, + "loss": 1.0902, + "step": 26300 + }, + { + "epoch": 0.941894819775458, + "grad_norm": 1.6143437623977661, + "learning_rate": 1.7645436185753095e-06, + "loss": 1.2032, + "step": 26301 + }, + { + "epoch": 0.9419306319050262, + "grad_norm": 1.5626906156539917, + "learning_rate": 1.7623749380209609e-06, + "loss": 1.036, + "step": 26302 + }, + { + "epoch": 0.9419664440345945, + "grad_norm": 1.5410710573196411, + "learning_rate": 1.7602075791271377e-06, + "loss": 1.0617, + "step": 26303 + }, + { + "epoch": 0.9420022561641628, + "grad_norm": 1.6744173765182495, + "learning_rate": 1.7580415419229946e-06, + "loss": 0.9053, + "step": 26304 + }, + { + "epoch": 0.942038068293731, + "grad_norm": 1.540026307106018, + "learning_rate": 1.7558768264376856e-06, + "loss": 1.1582, + "step": 26305 + }, + { + "epoch": 0.9420738804232994, + "grad_norm": 1.3936914205551147, + "learning_rate": 1.7537134327003324e-06, + "loss": 1.1132, + "step": 26306 + }, + { + "epoch": 0.9421096925528677, + "grad_norm": 1.731165885925293, + "learning_rate": 1.7515513607400225e-06, + "loss": 1.2029, + "step": 26307 + }, + { + "epoch": 0.942145504682436, + "grad_norm": 1.5405598878860474, + "learning_rate": 1.749390610585877e-06, + "loss": 0.9877, + "step": 26308 + }, + { + "epoch": 0.9421813168120042, + "grad_norm": 1.155150294303894, + "learning_rate": 1.7472311822669397e-06, + "loss": 0.9654, + "step": 26309 + }, + { + "epoch": 0.9422171289415725, + "grad_norm": 1.568843126296997, + "learning_rate": 1.7450730758122757e-06, + "loss": 1.1226, + "step": 26310 + }, + { + "epoch": 0.9422529410711408, + "grad_norm": 1.4366073608398438, + "learning_rate": 1.7429162912508956e-06, + "loss": 1.0569, + "step": 26311 + }, + { + "epoch": 0.942288753200709, + "grad_norm": 1.2804750204086304, + "learning_rate": 1.7407608286118427e-06, + "loss": 1.1291, + "step": 26312 + }, + { + "epoch": 0.9423245653302774, + "grad_norm": 2.387186288833618, + "learning_rate": 1.7386066879241159e-06, + "loss": 1.2562, + "step": 26313 + }, + { + "epoch": 0.9423603774598457, + "grad_norm": 1.3586676120758057, + "learning_rate": 1.736453869216681e-06, + "loss": 1.0376, + "step": 26314 + }, + { + "epoch": 0.942396189589414, + "grad_norm": 1.4057921171188354, + "learning_rate": 1.7343023725185038e-06, + "loss": 1.0846, + "step": 26315 + }, + { + "epoch": 0.9424320017189822, + "grad_norm": 1.5728394985198975, + "learning_rate": 1.7321521978585387e-06, + "loss": 1.1483, + "step": 26316 + }, + { + "epoch": 0.9424678138485505, + "grad_norm": 1.284388780593872, + "learning_rate": 1.7300033452657184e-06, + "loss": 0.988, + "step": 26317 + }, + { + "epoch": 0.9425036259781188, + "grad_norm": 1.524937629699707, + "learning_rate": 1.7278558147689306e-06, + "loss": 1.0293, + "step": 26318 + }, + { + "epoch": 0.942539438107687, + "grad_norm": 1.6257439851760864, + "learning_rate": 1.7257096063970856e-06, + "loss": 1.0611, + "step": 26319 + }, + { + "epoch": 0.9425752502372554, + "grad_norm": 1.8410621881484985, + "learning_rate": 1.7235647201790605e-06, + "loss": 1.4095, + "step": 26320 + }, + { + "epoch": 0.9426110623668237, + "grad_norm": 2.0618462562561035, + "learning_rate": 1.7214211561436987e-06, + "loss": 1.1672, + "step": 26321 + }, + { + "epoch": 0.942646874496392, + "grad_norm": 1.577744722366333, + "learning_rate": 1.719278914319844e-06, + "loss": 1.1605, + "step": 26322 + }, + { + "epoch": 0.9426826866259602, + "grad_norm": 1.4596561193466187, + "learning_rate": 1.7171379947363175e-06, + "loss": 0.9896, + "step": 26323 + }, + { + "epoch": 0.9427184987555285, + "grad_norm": 1.467403769493103, + "learning_rate": 1.7149983974219297e-06, + "loss": 1.0265, + "step": 26324 + }, + { + "epoch": 0.9427543108850968, + "grad_norm": 1.246441125869751, + "learning_rate": 1.7128601224054464e-06, + "loss": 0.9497, + "step": 26325 + }, + { + "epoch": 0.942790123014665, + "grad_norm": 1.4965102672576904, + "learning_rate": 1.7107231697156557e-06, + "loss": 1.0063, + "step": 26326 + }, + { + "epoch": 0.9428259351442334, + "grad_norm": 1.4914772510528564, + "learning_rate": 1.7085875393813123e-06, + "loss": 0.9961, + "step": 26327 + }, + { + "epoch": 0.9428617472738017, + "grad_norm": 1.4358612298965454, + "learning_rate": 1.7064532314311266e-06, + "loss": 1.0218, + "step": 26328 + }, + { + "epoch": 0.94289755940337, + "grad_norm": 1.5658057928085327, + "learning_rate": 1.70432024589382e-06, + "loss": 1.1351, + "step": 26329 + }, + { + "epoch": 0.9429333715329382, + "grad_norm": 2.0144824981689453, + "learning_rate": 1.702188582798092e-06, + "loss": 1.1358, + "step": 26330 + }, + { + "epoch": 0.9429691836625065, + "grad_norm": 1.6752289533615112, + "learning_rate": 1.7000582421726308e-06, + "loss": 1.1205, + "step": 26331 + }, + { + "epoch": 0.9430049957920748, + "grad_norm": 1.431716799736023, + "learning_rate": 1.6979292240460799e-06, + "loss": 1.0768, + "step": 26332 + }, + { + "epoch": 0.943040807921643, + "grad_norm": 1.5112491846084595, + "learning_rate": 1.695801528447094e-06, + "loss": 1.0885, + "step": 26333 + }, + { + "epoch": 0.9430766200512114, + "grad_norm": 1.3637754917144775, + "learning_rate": 1.6936751554042951e-06, + "loss": 1.1157, + "step": 26334 + }, + { + "epoch": 0.9431124321807797, + "grad_norm": 1.4900264739990234, + "learning_rate": 1.6915501049462934e-06, + "loss": 1.0015, + "step": 26335 + }, + { + "epoch": 0.9431482443103479, + "grad_norm": 2.0668692588806152, + "learning_rate": 1.6894263771016661e-06, + "loss": 1.3211, + "step": 26336 + }, + { + "epoch": 0.9431840564399162, + "grad_norm": 1.5005549192428589, + "learning_rate": 1.6873039718990014e-06, + "loss": 1.0176, + "step": 26337 + }, + { + "epoch": 0.9432198685694845, + "grad_norm": 1.405948519706726, + "learning_rate": 1.6851828893668543e-06, + "loss": 0.974, + "step": 26338 + }, + { + "epoch": 0.9432556806990527, + "grad_norm": 1.6291483640670776, + "learning_rate": 1.6830631295337462e-06, + "loss": 1.0456, + "step": 26339 + }, + { + "epoch": 0.943291492828621, + "grad_norm": 1.6723027229309082, + "learning_rate": 1.68094469242821e-06, + "loss": 1.0601, + "step": 26340 + }, + { + "epoch": 0.9433273049581893, + "grad_norm": 1.4770135879516602, + "learning_rate": 1.6788275780787343e-06, + "loss": 1.0916, + "step": 26341 + }, + { + "epoch": 0.9433631170877577, + "grad_norm": 1.3457216024398804, + "learning_rate": 1.6767117865138182e-06, + "loss": 1.0573, + "step": 26342 + }, + { + "epoch": 0.9433989292173259, + "grad_norm": 1.6002284288406372, + "learning_rate": 1.6745973177619056e-06, + "loss": 1.0941, + "step": 26343 + }, + { + "epoch": 0.9434347413468942, + "grad_norm": 1.273008108139038, + "learning_rate": 1.6724841718514629e-06, + "loss": 0.9526, + "step": 26344 + }, + { + "epoch": 0.9434705534764625, + "grad_norm": 1.5704420804977417, + "learning_rate": 1.6703723488109112e-06, + "loss": 1.2027, + "step": 26345 + }, + { + "epoch": 0.9435063656060307, + "grad_norm": 1.3085181713104248, + "learning_rate": 1.6682618486686619e-06, + "loss": 0.964, + "step": 26346 + }, + { + "epoch": 0.943542177735599, + "grad_norm": 2.0376577377319336, + "learning_rate": 1.6661526714531029e-06, + "loss": 0.9321, + "step": 26347 + }, + { + "epoch": 0.9435779898651673, + "grad_norm": 1.8068121671676636, + "learning_rate": 1.6640448171926226e-06, + "loss": 1.0313, + "step": 26348 + }, + { + "epoch": 0.9436138019947357, + "grad_norm": 2.023555278778076, + "learning_rate": 1.6619382859155873e-06, + "loss": 1.251, + "step": 26349 + }, + { + "epoch": 0.9436496141243039, + "grad_norm": 1.5229073762893677, + "learning_rate": 1.659833077650319e-06, + "loss": 0.9502, + "step": 26350 + }, + { + "epoch": 0.9436854262538722, + "grad_norm": 1.6926687955856323, + "learning_rate": 1.6577291924251392e-06, + "loss": 1.1031, + "step": 26351 + }, + { + "epoch": 0.9437212383834405, + "grad_norm": 1.4749068021774292, + "learning_rate": 1.6556266302683588e-06, + "loss": 0.9148, + "step": 26352 + }, + { + "epoch": 0.9437570505130087, + "grad_norm": 1.3398040533065796, + "learning_rate": 1.6535253912082772e-06, + "loss": 0.8841, + "step": 26353 + }, + { + "epoch": 0.943792862642577, + "grad_norm": 1.5666849613189697, + "learning_rate": 1.6514254752731494e-06, + "loss": 1.0428, + "step": 26354 + }, + { + "epoch": 0.9438286747721453, + "grad_norm": 1.4266122579574585, + "learning_rate": 1.6493268824912312e-06, + "loss": 1.0475, + "step": 26355 + }, + { + "epoch": 0.9438644869017137, + "grad_norm": 1.6721335649490356, + "learning_rate": 1.647229612890766e-06, + "loss": 1.1416, + "step": 26356 + }, + { + "epoch": 0.9439002990312819, + "grad_norm": 1.3221402168273926, + "learning_rate": 1.6451336664999539e-06, + "loss": 0.9422, + "step": 26357 + }, + { + "epoch": 0.9439361111608502, + "grad_norm": 1.4999197721481323, + "learning_rate": 1.6430390433469945e-06, + "loss": 1.0543, + "step": 26358 + }, + { + "epoch": 0.9439719232904185, + "grad_norm": 1.911062479019165, + "learning_rate": 1.640945743460065e-06, + "loss": 1.1251, + "step": 26359 + }, + { + "epoch": 0.9440077354199867, + "grad_norm": 1.6351789236068726, + "learning_rate": 1.6388537668673542e-06, + "loss": 0.8843, + "step": 26360 + }, + { + "epoch": 0.944043547549555, + "grad_norm": 1.4641119241714478, + "learning_rate": 1.636763113596984e-06, + "loss": 1.0614, + "step": 26361 + }, + { + "epoch": 0.9440793596791233, + "grad_norm": 2.183424711227417, + "learning_rate": 1.6346737836770875e-06, + "loss": 1.1149, + "step": 26362 + }, + { + "epoch": 0.9441151718086916, + "grad_norm": 1.3873960971832275, + "learning_rate": 1.6325857771357756e-06, + "loss": 1.1966, + "step": 26363 + }, + { + "epoch": 0.9441509839382599, + "grad_norm": 1.5049116611480713, + "learning_rate": 1.6304990940011255e-06, + "loss": 0.9938, + "step": 26364 + }, + { + "epoch": 0.9441867960678282, + "grad_norm": 1.2856175899505615, + "learning_rate": 1.6284137343012263e-06, + "loss": 1.0041, + "step": 26365 + }, + { + "epoch": 0.9442226081973965, + "grad_norm": 1.6411129236221313, + "learning_rate": 1.6263296980641328e-06, + "loss": 0.9759, + "step": 26366 + }, + { + "epoch": 0.9442584203269647, + "grad_norm": 1.3902636766433716, + "learning_rate": 1.62424698531789e-06, + "loss": 0.874, + "step": 26367 + }, + { + "epoch": 0.944294232456533, + "grad_norm": 1.5378843545913696, + "learning_rate": 1.6221655960904968e-06, + "loss": 0.8288, + "step": 26368 + }, + { + "epoch": 0.9443300445861013, + "grad_norm": 1.3479136228561401, + "learning_rate": 1.620085530409965e-06, + "loss": 0.9398, + "step": 26369 + }, + { + "epoch": 0.9443658567156696, + "grad_norm": 1.40557062625885, + "learning_rate": 1.6180067883042937e-06, + "loss": 1.0429, + "step": 26370 + }, + { + "epoch": 0.9444016688452379, + "grad_norm": 1.6865183115005493, + "learning_rate": 1.6159293698014278e-06, + "loss": 1.1401, + "step": 26371 + }, + { + "epoch": 0.9444374809748062, + "grad_norm": 1.6659733057022095, + "learning_rate": 1.6138532749293335e-06, + "loss": 1.1736, + "step": 26372 + }, + { + "epoch": 0.9444732931043744, + "grad_norm": 1.7073330879211426, + "learning_rate": 1.6117785037159216e-06, + "loss": 0.9228, + "step": 26373 + }, + { + "epoch": 0.9445091052339427, + "grad_norm": 1.629603385925293, + "learning_rate": 1.6097050561891369e-06, + "loss": 1.1453, + "step": 26374 + }, + { + "epoch": 0.944544917363511, + "grad_norm": 1.371408462524414, + "learning_rate": 1.6076329323768347e-06, + "loss": 1.0651, + "step": 26375 + }, + { + "epoch": 0.9445807294930793, + "grad_norm": 1.4450427293777466, + "learning_rate": 1.605562132306937e-06, + "loss": 1.0409, + "step": 26376 + }, + { + "epoch": 0.9446165416226476, + "grad_norm": 1.4028090238571167, + "learning_rate": 1.6034926560072549e-06, + "loss": 0.8651, + "step": 26377 + }, + { + "epoch": 0.9446523537522159, + "grad_norm": 1.3630746603012085, + "learning_rate": 1.6014245035056775e-06, + "loss": 0.9418, + "step": 26378 + }, + { + "epoch": 0.9446881658817842, + "grad_norm": 1.322632908821106, + "learning_rate": 1.5993576748300043e-06, + "loss": 1.1005, + "step": 26379 + }, + { + "epoch": 0.9447239780113524, + "grad_norm": 1.482201337814331, + "learning_rate": 1.5972921700080357e-06, + "loss": 0.9814, + "step": 26380 + }, + { + "epoch": 0.9447597901409207, + "grad_norm": 1.4591587781906128, + "learning_rate": 1.5952279890675826e-06, + "loss": 0.9576, + "step": 26381 + }, + { + "epoch": 0.944795602270489, + "grad_norm": 1.3842788934707642, + "learning_rate": 1.5931651320364006e-06, + "loss": 1.1535, + "step": 26382 + }, + { + "epoch": 0.9448314144000572, + "grad_norm": 1.269080638885498, + "learning_rate": 1.5911035989422562e-06, + "loss": 0.9895, + "step": 26383 + }, + { + "epoch": 0.9448672265296256, + "grad_norm": 1.305545449256897, + "learning_rate": 1.5890433898128498e-06, + "loss": 1.1492, + "step": 26384 + }, + { + "epoch": 0.9449030386591939, + "grad_norm": 1.351886510848999, + "learning_rate": 1.5869845046759369e-06, + "loss": 1.104, + "step": 26385 + }, + { + "epoch": 0.9449388507887622, + "grad_norm": 1.6891160011291504, + "learning_rate": 1.5849269435592061e-06, + "loss": 1.2724, + "step": 26386 + }, + { + "epoch": 0.9449746629183304, + "grad_norm": 1.482047438621521, + "learning_rate": 1.5828707064903359e-06, + "loss": 0.966, + "step": 26387 + }, + { + "epoch": 0.9450104750478987, + "grad_norm": 1.7213473320007324, + "learning_rate": 1.5808157934969813e-06, + "loss": 0.913, + "step": 26388 + }, + { + "epoch": 0.945046287177467, + "grad_norm": 1.3203778266906738, + "learning_rate": 1.5787622046068207e-06, + "loss": 1.0036, + "step": 26389 + }, + { + "epoch": 0.9450820993070352, + "grad_norm": 1.992384910583496, + "learning_rate": 1.576709939847454e-06, + "loss": 0.9709, + "step": 26390 + }, + { + "epoch": 0.9451179114366036, + "grad_norm": 1.4780941009521484, + "learning_rate": 1.574658999246481e-06, + "loss": 0.9432, + "step": 26391 + }, + { + "epoch": 0.9451537235661719, + "grad_norm": 1.7598916292190552, + "learning_rate": 1.5726093828315248e-06, + "loss": 0.9727, + "step": 26392 + }, + { + "epoch": 0.9451895356957402, + "grad_norm": 1.4612771272659302, + "learning_rate": 1.5705610906301404e-06, + "loss": 0.9971, + "step": 26393 + }, + { + "epoch": 0.9452253478253084, + "grad_norm": 1.238208532333374, + "learning_rate": 1.5685141226699064e-06, + "loss": 1.1036, + "step": 26394 + }, + { + "epoch": 0.9452611599548767, + "grad_norm": 1.5609194040298462, + "learning_rate": 1.5664684789783224e-06, + "loss": 1.0039, + "step": 26395 + }, + { + "epoch": 0.945296972084445, + "grad_norm": 1.3017852306365967, + "learning_rate": 1.5644241595829557e-06, + "loss": 1.0004, + "step": 26396 + }, + { + "epoch": 0.9453327842140132, + "grad_norm": 1.389532446861267, + "learning_rate": 1.562381164511284e-06, + "loss": 0.9306, + "step": 26397 + }, + { + "epoch": 0.9453685963435816, + "grad_norm": 1.8901256322860718, + "learning_rate": 1.5603394937907967e-06, + "loss": 1.1191, + "step": 26398 + }, + { + "epoch": 0.9454044084731499, + "grad_norm": 1.5986121892929077, + "learning_rate": 1.5582991474489607e-06, + "loss": 1.2052, + "step": 26399 + }, + { + "epoch": 0.9454402206027182, + "grad_norm": 2.567664623260498, + "learning_rate": 1.5562601255132314e-06, + "loss": 0.9801, + "step": 26400 + }, + { + "epoch": 0.9454760327322864, + "grad_norm": 1.9327116012573242, + "learning_rate": 1.554222428011043e-06, + "loss": 1.0521, + "step": 26401 + }, + { + "epoch": 0.9455118448618547, + "grad_norm": 1.3793315887451172, + "learning_rate": 1.5521860549698063e-06, + "loss": 1.267, + "step": 26402 + }, + { + "epoch": 0.945547656991423, + "grad_norm": 1.6573375463485718, + "learning_rate": 1.550151006416911e-06, + "loss": 1.1971, + "step": 26403 + }, + { + "epoch": 0.9455834691209912, + "grad_norm": 1.2714910507202148, + "learning_rate": 1.5481172823797463e-06, + "loss": 1.0667, + "step": 26404 + }, + { + "epoch": 0.9456192812505596, + "grad_norm": 1.5013010501861572, + "learning_rate": 1.5460848828856677e-06, + "loss": 1.2563, + "step": 26405 + }, + { + "epoch": 0.9456550933801279, + "grad_norm": 1.5691938400268555, + "learning_rate": 1.5440538079620204e-06, + "loss": 0.997, + "step": 26406 + }, + { + "epoch": 0.9456909055096961, + "grad_norm": 1.645537257194519, + "learning_rate": 1.5420240576361378e-06, + "loss": 1.1281, + "step": 26407 + }, + { + "epoch": 0.9457267176392644, + "grad_norm": 1.4067820310592651, + "learning_rate": 1.5399956319353092e-06, + "loss": 0.8816, + "step": 26408 + }, + { + "epoch": 0.9457625297688327, + "grad_norm": 1.4838942289352417, + "learning_rate": 1.5379685308868464e-06, + "loss": 1.0217, + "step": 26409 + }, + { + "epoch": 0.945798341898401, + "grad_norm": 1.6547609567642212, + "learning_rate": 1.535942754517994e-06, + "loss": 0.9052, + "step": 26410 + }, + { + "epoch": 0.9458341540279692, + "grad_norm": 1.6436951160430908, + "learning_rate": 1.53391830285603e-06, + "loss": 1.0741, + "step": 26411 + }, + { + "epoch": 0.9458699661575376, + "grad_norm": 1.739171028137207, + "learning_rate": 1.5318951759281885e-06, + "loss": 1.2012, + "step": 26412 + }, + { + "epoch": 0.9459057782871059, + "grad_norm": 1.4058420658111572, + "learning_rate": 1.529873373761681e-06, + "loss": 1.2086, + "step": 26413 + }, + { + "epoch": 0.9459415904166741, + "grad_norm": 1.4369714260101318, + "learning_rate": 1.527852896383708e-06, + "loss": 1.0518, + "step": 26414 + }, + { + "epoch": 0.9459774025462424, + "grad_norm": 1.4823750257492065, + "learning_rate": 1.5258337438214587e-06, + "loss": 1.0981, + "step": 26415 + }, + { + "epoch": 0.9460132146758107, + "grad_norm": 1.3527976274490356, + "learning_rate": 1.5238159161020893e-06, + "loss": 1.1197, + "step": 26416 + }, + { + "epoch": 0.946049026805379, + "grad_norm": 1.5282211303710938, + "learning_rate": 1.5217994132527448e-06, + "loss": 1.1697, + "step": 26417 + }, + { + "epoch": 0.9460848389349472, + "grad_norm": 1.697516918182373, + "learning_rate": 1.5197842353005698e-06, + "loss": 1.1252, + "step": 26418 + }, + { + "epoch": 0.9461206510645156, + "grad_norm": 1.5434670448303223, + "learning_rate": 1.5177703822726652e-06, + "loss": 1.0913, + "step": 26419 + }, + { + "epoch": 0.9461564631940839, + "grad_norm": 1.5943973064422607, + "learning_rate": 1.5157578541961315e-06, + "loss": 0.9111, + "step": 26420 + }, + { + "epoch": 0.9461922753236521, + "grad_norm": 1.518902063369751, + "learning_rate": 1.5137466510980357e-06, + "loss": 1.1307, + "step": 26421 + }, + { + "epoch": 0.9462280874532204, + "grad_norm": 1.3189659118652344, + "learning_rate": 1.5117367730054343e-06, + "loss": 0.9514, + "step": 26422 + }, + { + "epoch": 0.9462638995827887, + "grad_norm": 1.3957775831222534, + "learning_rate": 1.5097282199453943e-06, + "loss": 1.1026, + "step": 26423 + }, + { + "epoch": 0.9462997117123569, + "grad_norm": 1.3013378381729126, + "learning_rate": 1.5077209919449053e-06, + "loss": 0.9493, + "step": 26424 + }, + { + "epoch": 0.9463355238419252, + "grad_norm": 1.5237414836883545, + "learning_rate": 1.505715089030979e-06, + "loss": 0.992, + "step": 26425 + }, + { + "epoch": 0.9463713359714936, + "grad_norm": 1.7879875898361206, + "learning_rate": 1.503710511230616e-06, + "loss": 1.0236, + "step": 26426 + }, + { + "epoch": 0.9464071481010619, + "grad_norm": 1.4492530822753906, + "learning_rate": 1.5017072585707725e-06, + "loss": 0.9688, + "step": 26427 + }, + { + "epoch": 0.9464429602306301, + "grad_norm": 1.4564937353134155, + "learning_rate": 1.4997053310784047e-06, + "loss": 0.9792, + "step": 26428 + }, + { + "epoch": 0.9464787723601984, + "grad_norm": 1.682623028755188, + "learning_rate": 1.497704728780447e-06, + "loss": 1.106, + "step": 26429 + }, + { + "epoch": 0.9465145844897667, + "grad_norm": 1.5194687843322754, + "learning_rate": 1.4957054517038106e-06, + "loss": 1.028, + "step": 26430 + }, + { + "epoch": 0.9465503966193349, + "grad_norm": 2.8008038997650146, + "learning_rate": 1.4937074998753965e-06, + "loss": 1.1148, + "step": 26431 + }, + { + "epoch": 0.9465862087489032, + "grad_norm": 1.6830731630325317, + "learning_rate": 1.491710873322083e-06, + "loss": 1.1255, + "step": 26432 + }, + { + "epoch": 0.9466220208784716, + "grad_norm": 1.4774858951568604, + "learning_rate": 1.489715572070738e-06, + "loss": 1.0025, + "step": 26433 + }, + { + "epoch": 0.9466578330080399, + "grad_norm": 1.7258027791976929, + "learning_rate": 1.4877215961482062e-06, + "loss": 0.9869, + "step": 26434 + }, + { + "epoch": 0.9466936451376081, + "grad_norm": 1.5681307315826416, + "learning_rate": 1.4857289455812883e-06, + "loss": 1.1394, + "step": 26435 + }, + { + "epoch": 0.9467294572671764, + "grad_norm": 1.6858881711959839, + "learning_rate": 1.48373762039683e-06, + "loss": 1.0387, + "step": 26436 + }, + { + "epoch": 0.9467652693967447, + "grad_norm": 1.5289353132247925, + "learning_rate": 1.4817476206216096e-06, + "loss": 0.9284, + "step": 26437 + }, + { + "epoch": 0.9468010815263129, + "grad_norm": 1.3923990726470947, + "learning_rate": 1.4797589462823836e-06, + "loss": 1.1992, + "step": 26438 + }, + { + "epoch": 0.9468368936558812, + "grad_norm": 1.2190837860107422, + "learning_rate": 1.4777715974059192e-06, + "loss": 1.0501, + "step": 26439 + }, + { + "epoch": 0.9468727057854496, + "grad_norm": 1.5063802003860474, + "learning_rate": 1.4757855740189508e-06, + "loss": 1.0727, + "step": 26440 + }, + { + "epoch": 0.9469085179150178, + "grad_norm": 1.7111912965774536, + "learning_rate": 1.4738008761482125e-06, + "loss": 1.253, + "step": 26441 + }, + { + "epoch": 0.9469443300445861, + "grad_norm": 1.5871608257293701, + "learning_rate": 1.471817503820383e-06, + "loss": 1.0206, + "step": 26442 + }, + { + "epoch": 0.9469801421741544, + "grad_norm": 2.0513176918029785, + "learning_rate": 1.469835457062163e-06, + "loss": 1.0053, + "step": 26443 + }, + { + "epoch": 0.9470159543037227, + "grad_norm": 1.6061056852340698, + "learning_rate": 1.4678547359002092e-06, + "loss": 1.1934, + "step": 26444 + }, + { + "epoch": 0.9470517664332909, + "grad_norm": 1.400889277458191, + "learning_rate": 1.465875340361178e-06, + "loss": 1.0552, + "step": 26445 + }, + { + "epoch": 0.9470875785628592, + "grad_norm": 1.5496535301208496, + "learning_rate": 1.4638972704716814e-06, + "loss": 1.3191, + "step": 26446 + }, + { + "epoch": 0.9471233906924276, + "grad_norm": 1.4914087057113647, + "learning_rate": 1.4619205262583536e-06, + "loss": 1.0666, + "step": 26447 + }, + { + "epoch": 0.9471592028219958, + "grad_norm": 1.1733955144882202, + "learning_rate": 1.4599451077477844e-06, + "loss": 1.0858, + "step": 26448 + }, + { + "epoch": 0.9471950149515641, + "grad_norm": 1.2846417427062988, + "learning_rate": 1.4579710149665416e-06, + "loss": 0.9709, + "step": 26449 + }, + { + "epoch": 0.9472308270811324, + "grad_norm": 1.473512887954712, + "learning_rate": 1.4559982479411927e-06, + "loss": 1.1506, + "step": 26450 + }, + { + "epoch": 0.9472666392107006, + "grad_norm": 1.4177966117858887, + "learning_rate": 1.4540268066982722e-06, + "loss": 1.0021, + "step": 26451 + }, + { + "epoch": 0.9473024513402689, + "grad_norm": 1.5565029382705688, + "learning_rate": 1.452056691264303e-06, + "loss": 0.9416, + "step": 26452 + }, + { + "epoch": 0.9473382634698372, + "grad_norm": 1.611181616783142, + "learning_rate": 1.4500879016657865e-06, + "loss": 1.1313, + "step": 26453 + }, + { + "epoch": 0.9473740755994056, + "grad_norm": 1.370858907699585, + "learning_rate": 1.4481204379292234e-06, + "loss": 1.0464, + "step": 26454 + }, + { + "epoch": 0.9474098877289738, + "grad_norm": 1.5303733348846436, + "learning_rate": 1.4461543000810929e-06, + "loss": 1.3006, + "step": 26455 + }, + { + "epoch": 0.9474456998585421, + "grad_norm": 1.7727164030075073, + "learning_rate": 1.4441894881478069e-06, + "loss": 1.1081, + "step": 26456 + }, + { + "epoch": 0.9474815119881104, + "grad_norm": 1.5058941841125488, + "learning_rate": 1.4422260021558331e-06, + "loss": 1.212, + "step": 26457 + }, + { + "epoch": 0.9475173241176786, + "grad_norm": 1.5865395069122314, + "learning_rate": 1.440263842131573e-06, + "loss": 1.1069, + "step": 26458 + }, + { + "epoch": 0.9475531362472469, + "grad_norm": 1.3118879795074463, + "learning_rate": 1.4383030081014493e-06, + "loss": 0.9695, + "step": 26459 + }, + { + "epoch": 0.9475889483768152, + "grad_norm": 1.6963038444519043, + "learning_rate": 1.436343500091808e-06, + "loss": 1.166, + "step": 26460 + }, + { + "epoch": 0.9476247605063836, + "grad_norm": 1.6970152854919434, + "learning_rate": 1.4343853181290168e-06, + "loss": 1.1567, + "step": 26461 + }, + { + "epoch": 0.9476605726359518, + "grad_norm": 1.3813728094100952, + "learning_rate": 1.4324284622394547e-06, + "loss": 1.0716, + "step": 26462 + }, + { + "epoch": 0.9476963847655201, + "grad_norm": 1.352621078491211, + "learning_rate": 1.4304729324494115e-06, + "loss": 1.0881, + "step": 26463 + }, + { + "epoch": 0.9477321968950884, + "grad_norm": 1.370926856994629, + "learning_rate": 1.4285187287851997e-06, + "loss": 1.0369, + "step": 26464 + }, + { + "epoch": 0.9477680090246566, + "grad_norm": 1.501929521560669, + "learning_rate": 1.4265658512731316e-06, + "loss": 1.2055, + "step": 26465 + }, + { + "epoch": 0.9478038211542249, + "grad_norm": 1.7921911478042603, + "learning_rate": 1.4246142999394751e-06, + "loss": 1.0868, + "step": 26466 + }, + { + "epoch": 0.9478396332837932, + "grad_norm": 1.881972074508667, + "learning_rate": 1.4226640748104757e-06, + "loss": 1.1379, + "step": 26467 + }, + { + "epoch": 0.9478754454133616, + "grad_norm": 1.4773551225662231, + "learning_rate": 1.4207151759123683e-06, + "loss": 1.105, + "step": 26468 + }, + { + "epoch": 0.9479112575429298, + "grad_norm": 1.2621561288833618, + "learning_rate": 1.418767603271387e-06, + "loss": 1.045, + "step": 26469 + }, + { + "epoch": 0.9479470696724981, + "grad_norm": 1.2866241931915283, + "learning_rate": 1.4168213569137223e-06, + "loss": 0.9578, + "step": 26470 + }, + { + "epoch": 0.9479828818020664, + "grad_norm": 1.3343671560287476, + "learning_rate": 1.4148764368655754e-06, + "loss": 1.1109, + "step": 26471 + }, + { + "epoch": 0.9480186939316346, + "grad_norm": 1.4469630718231201, + "learning_rate": 1.4129328431530807e-06, + "loss": 0.9237, + "step": 26472 + }, + { + "epoch": 0.9480545060612029, + "grad_norm": 1.9015551805496216, + "learning_rate": 1.4109905758024177e-06, + "loss": 1.4509, + "step": 26473 + }, + { + "epoch": 0.9480903181907712, + "grad_norm": 1.5914846658706665, + "learning_rate": 1.4090496348397097e-06, + "loss": 1.0161, + "step": 26474 + }, + { + "epoch": 0.9481261303203395, + "grad_norm": 1.4940907955169678, + "learning_rate": 1.407110020291058e-06, + "loss": 1.0673, + "step": 26475 + }, + { + "epoch": 0.9481619424499078, + "grad_norm": 1.4334040880203247, + "learning_rate": 1.4051717321825643e-06, + "loss": 1.0408, + "step": 26476 + }, + { + "epoch": 0.9481977545794761, + "grad_norm": 1.453248381614685, + "learning_rate": 1.403234770540307e-06, + "loss": 0.9507, + "step": 26477 + }, + { + "epoch": 0.9482335667090444, + "grad_norm": 1.3411741256713867, + "learning_rate": 1.4012991353903549e-06, + "loss": 1.147, + "step": 26478 + }, + { + "epoch": 0.9482693788386126, + "grad_norm": 1.3510832786560059, + "learning_rate": 1.3993648267587312e-06, + "loss": 1.0126, + "step": 26479 + }, + { + "epoch": 0.9483051909681809, + "grad_norm": 1.4420709609985352, + "learning_rate": 1.3974318446714706e-06, + "loss": 1.1809, + "step": 26480 + }, + { + "epoch": 0.9483410030977492, + "grad_norm": 1.715095043182373, + "learning_rate": 1.395500189154575e-06, + "loss": 1.0987, + "step": 26481 + }, + { + "epoch": 0.9483768152273175, + "grad_norm": 1.5647788047790527, + "learning_rate": 1.3935698602340452e-06, + "loss": 1.0019, + "step": 26482 + }, + { + "epoch": 0.9484126273568858, + "grad_norm": 1.249845266342163, + "learning_rate": 1.3916408579358164e-06, + "loss": 1.1347, + "step": 26483 + }, + { + "epoch": 0.9484484394864541, + "grad_norm": 1.295897364616394, + "learning_rate": 1.3897131822858789e-06, + "loss": 1.1109, + "step": 26484 + }, + { + "epoch": 0.9484842516160223, + "grad_norm": 1.6286026239395142, + "learning_rate": 1.3877868333101562e-06, + "loss": 1.1063, + "step": 26485 + }, + { + "epoch": 0.9485200637455906, + "grad_norm": 1.384636640548706, + "learning_rate": 1.385861811034561e-06, + "loss": 1.0254, + "step": 26486 + }, + { + "epoch": 0.9485558758751589, + "grad_norm": 1.7704384326934814, + "learning_rate": 1.383938115484984e-06, + "loss": 1.0535, + "step": 26487 + }, + { + "epoch": 0.9485916880047272, + "grad_norm": 1.8361846208572388, + "learning_rate": 1.3820157466873152e-06, + "loss": 1.1295, + "step": 26488 + }, + { + "epoch": 0.9486275001342955, + "grad_norm": 1.3956646919250488, + "learning_rate": 1.3800947046674228e-06, + "loss": 0.8734, + "step": 26489 + }, + { + "epoch": 0.9486633122638638, + "grad_norm": 1.597114086151123, + "learning_rate": 1.3781749894511308e-06, + "loss": 1.0215, + "step": 26490 + }, + { + "epoch": 0.9486991243934321, + "grad_norm": 1.642683744430542, + "learning_rate": 1.3762566010642962e-06, + "loss": 1.0364, + "step": 26491 + }, + { + "epoch": 0.9487349365230003, + "grad_norm": 1.3731948137283325, + "learning_rate": 1.3743395395326985e-06, + "loss": 0.9691, + "step": 26492 + }, + { + "epoch": 0.9487707486525686, + "grad_norm": 1.3475059270858765, + "learning_rate": 1.3724238048821615e-06, + "loss": 0.8492, + "step": 26493 + }, + { + "epoch": 0.9488065607821369, + "grad_norm": 1.5442804098129272, + "learning_rate": 1.370509397138431e-06, + "loss": 1.1947, + "step": 26494 + }, + { + "epoch": 0.9488423729117051, + "grad_norm": 1.276982069015503, + "learning_rate": 1.3685963163272752e-06, + "loss": 1.0527, + "step": 26495 + }, + { + "epoch": 0.9488781850412735, + "grad_norm": 1.4148088693618774, + "learning_rate": 1.3666845624744406e-06, + "loss": 1.0804, + "step": 26496 + }, + { + "epoch": 0.9489139971708418, + "grad_norm": 1.1890226602554321, + "learning_rate": 1.3647741356056287e-06, + "loss": 0.9034, + "step": 26497 + }, + { + "epoch": 0.9489498093004101, + "grad_norm": 1.6208477020263672, + "learning_rate": 1.3628650357465522e-06, + "loss": 1.052, + "step": 26498 + }, + { + "epoch": 0.9489856214299783, + "grad_norm": 1.353311538696289, + "learning_rate": 1.3609572629228906e-06, + "loss": 1.0035, + "step": 26499 + }, + { + "epoch": 0.9490214335595466, + "grad_norm": 1.6393396854400635, + "learning_rate": 1.3590508171603233e-06, + "loss": 1.1976, + "step": 26500 + }, + { + "epoch": 0.9490572456891149, + "grad_norm": 1.3721139430999756, + "learning_rate": 1.3571456984844743e-06, + "loss": 0.9406, + "step": 26501 + }, + { + "epoch": 0.9490930578186831, + "grad_norm": 1.274718165397644, + "learning_rate": 1.3552419069210009e-06, + "loss": 0.9471, + "step": 26502 + }, + { + "epoch": 0.9491288699482515, + "grad_norm": 1.4620698690414429, + "learning_rate": 1.3533394424954937e-06, + "loss": 1.0159, + "step": 26503 + }, + { + "epoch": 0.9491646820778198, + "grad_norm": 1.8026596307754517, + "learning_rate": 1.3514383052335766e-06, + "loss": 1.0263, + "step": 26504 + }, + { + "epoch": 0.9492004942073881, + "grad_norm": 1.3068201541900635, + "learning_rate": 1.3495384951607958e-06, + "loss": 0.9368, + "step": 26505 + }, + { + "epoch": 0.9492363063369563, + "grad_norm": 2.6801114082336426, + "learning_rate": 1.3476400123027312e-06, + "loss": 1.1899, + "step": 26506 + }, + { + "epoch": 0.9492721184665246, + "grad_norm": 1.5879571437835693, + "learning_rate": 1.3457428566849173e-06, + "loss": 1.2109, + "step": 26507 + }, + { + "epoch": 0.9493079305960929, + "grad_norm": 1.4035322666168213, + "learning_rate": 1.3438470283328785e-06, + "loss": 1.0904, + "step": 26508 + }, + { + "epoch": 0.9493437427256611, + "grad_norm": 1.3138092756271362, + "learning_rate": 1.3419525272721168e-06, + "loss": 1.0883, + "step": 26509 + }, + { + "epoch": 0.9493795548552295, + "grad_norm": 2.0351319313049316, + "learning_rate": 1.3400593535281224e-06, + "loss": 1.0879, + "step": 26510 + }, + { + "epoch": 0.9494153669847978, + "grad_norm": 1.5501022338867188, + "learning_rate": 1.3381675071263755e-06, + "loss": 1.165, + "step": 26511 + }, + { + "epoch": 0.949451179114366, + "grad_norm": 1.4190436601638794, + "learning_rate": 1.3362769880923221e-06, + "loss": 0.9245, + "step": 26512 + }, + { + "epoch": 0.9494869912439343, + "grad_norm": 1.3567345142364502, + "learning_rate": 1.3343877964513863e-06, + "loss": 1.0504, + "step": 26513 + }, + { + "epoch": 0.9495228033735026, + "grad_norm": 1.4150230884552002, + "learning_rate": 1.3324999322290033e-06, + "loss": 0.9053, + "step": 26514 + }, + { + "epoch": 0.9495586155030709, + "grad_norm": 1.651609182357788, + "learning_rate": 1.330613395450553e-06, + "loss": 1.2126, + "step": 26515 + }, + { + "epoch": 0.9495944276326391, + "grad_norm": 1.383056402206421, + "learning_rate": 1.3287281861414258e-06, + "loss": 1.0253, + "step": 26516 + }, + { + "epoch": 0.9496302397622075, + "grad_norm": 1.3729000091552734, + "learning_rate": 1.3268443043269796e-06, + "loss": 1.182, + "step": 26517 + }, + { + "epoch": 0.9496660518917758, + "grad_norm": 1.3417251110076904, + "learning_rate": 1.3249617500325718e-06, + "loss": 1.255, + "step": 26518 + }, + { + "epoch": 0.949701864021344, + "grad_norm": 1.597096562385559, + "learning_rate": 1.3230805232835153e-06, + "loss": 1.0606, + "step": 26519 + }, + { + "epoch": 0.9497376761509123, + "grad_norm": 1.311333417892456, + "learning_rate": 1.3212006241051345e-06, + "loss": 1.0564, + "step": 26520 + }, + { + "epoch": 0.9497734882804806, + "grad_norm": 1.4820420742034912, + "learning_rate": 1.319322052522709e-06, + "loss": 1.0465, + "step": 26521 + }, + { + "epoch": 0.9498093004100489, + "grad_norm": 1.2865722179412842, + "learning_rate": 1.3174448085615187e-06, + "loss": 1.1013, + "step": 26522 + }, + { + "epoch": 0.9498451125396171, + "grad_norm": 1.3192180395126343, + "learning_rate": 1.3155688922468101e-06, + "loss": 1.0075, + "step": 26523 + }, + { + "epoch": 0.9498809246691855, + "grad_norm": 1.3734990358352661, + "learning_rate": 1.3136943036038297e-06, + "loss": 1.0524, + "step": 26524 + }, + { + "epoch": 0.9499167367987538, + "grad_norm": 1.4901416301727295, + "learning_rate": 1.3118210426578015e-06, + "loss": 1.0471, + "step": 26525 + }, + { + "epoch": 0.949952548928322, + "grad_norm": 1.7273647785186768, + "learning_rate": 1.3099491094339279e-06, + "loss": 1.003, + "step": 26526 + }, + { + "epoch": 0.9499883610578903, + "grad_norm": 1.3771237134933472, + "learning_rate": 1.3080785039573773e-06, + "loss": 1.0452, + "step": 26527 + }, + { + "epoch": 0.9500241731874586, + "grad_norm": 1.4466556310653687, + "learning_rate": 1.3062092262533189e-06, + "loss": 0.9751, + "step": 26528 + }, + { + "epoch": 0.9500599853170268, + "grad_norm": 1.2480919361114502, + "learning_rate": 1.304341276346932e-06, + "loss": 0.9191, + "step": 26529 + }, + { + "epoch": 0.9500957974465951, + "grad_norm": 1.2707120180130005, + "learning_rate": 1.3024746542633082e-06, + "loss": 1.0103, + "step": 26530 + }, + { + "epoch": 0.9501316095761635, + "grad_norm": 1.455418586730957, + "learning_rate": 1.3006093600275825e-06, + "loss": 1.1959, + "step": 26531 + }, + { + "epoch": 0.9501674217057318, + "grad_norm": 1.6904407739639282, + "learning_rate": 1.2987453936648575e-06, + "loss": 1.0656, + "step": 26532 + }, + { + "epoch": 0.9502032338353, + "grad_norm": 1.344893217086792, + "learning_rate": 1.2968827552001793e-06, + "loss": 1.1139, + "step": 26533 + }, + { + "epoch": 0.9502390459648683, + "grad_norm": 1.2872295379638672, + "learning_rate": 1.2950214446586284e-06, + "loss": 1.0165, + "step": 26534 + }, + { + "epoch": 0.9502748580944366, + "grad_norm": 1.3918746709823608, + "learning_rate": 1.2931614620652511e-06, + "loss": 1.0974, + "step": 26535 + }, + { + "epoch": 0.9503106702240048, + "grad_norm": 1.4020136594772339, + "learning_rate": 1.2913028074450607e-06, + "loss": 1.0478, + "step": 26536 + }, + { + "epoch": 0.9503464823535731, + "grad_norm": 1.6400216817855835, + "learning_rate": 1.2894454808230593e-06, + "loss": 1.0809, + "step": 26537 + }, + { + "epoch": 0.9503822944831415, + "grad_norm": 1.39753258228302, + "learning_rate": 1.2875894822242496e-06, + "loss": 1.0245, + "step": 26538 + }, + { + "epoch": 0.9504181066127098, + "grad_norm": 1.351657748222351, + "learning_rate": 1.2857348116736002e-06, + "loss": 1.0228, + "step": 26539 + }, + { + "epoch": 0.950453918742278, + "grad_norm": 1.7489598989486694, + "learning_rate": 1.2838814691960355e-06, + "loss": 1.1483, + "step": 26540 + }, + { + "epoch": 0.9504897308718463, + "grad_norm": 1.4470922946929932, + "learning_rate": 1.2820294548165246e-06, + "loss": 1.0267, + "step": 26541 + }, + { + "epoch": 0.9505255430014146, + "grad_norm": 1.2260204553604126, + "learning_rate": 1.2801787685599698e-06, + "loss": 1.0341, + "step": 26542 + }, + { + "epoch": 0.9505613551309828, + "grad_norm": 1.244554877281189, + "learning_rate": 1.2783294104512734e-06, + "loss": 1.1061, + "step": 26543 + }, + { + "epoch": 0.9505971672605511, + "grad_norm": 1.6360708475112915, + "learning_rate": 1.2764813805153041e-06, + "loss": 1.1418, + "step": 26544 + }, + { + "epoch": 0.9506329793901195, + "grad_norm": 1.3930857181549072, + "learning_rate": 1.2746346787769425e-06, + "loss": 0.8786, + "step": 26545 + }, + { + "epoch": 0.9506687915196878, + "grad_norm": 1.4600046873092651, + "learning_rate": 1.272789305261013e-06, + "loss": 1.1293, + "step": 26546 + }, + { + "epoch": 0.950704603649256, + "grad_norm": 1.3507370948791504, + "learning_rate": 1.2709452599923731e-06, + "loss": 1.0941, + "step": 26547 + }, + { + "epoch": 0.9507404157788243, + "grad_norm": 3.235856056213379, + "learning_rate": 1.2691025429958037e-06, + "loss": 0.9867, + "step": 26548 + }, + { + "epoch": 0.9507762279083926, + "grad_norm": 1.5053939819335938, + "learning_rate": 1.2672611542960954e-06, + "loss": 1.0277, + "step": 26549 + }, + { + "epoch": 0.9508120400379608, + "grad_norm": 1.3195761442184448, + "learning_rate": 1.2654210939180511e-06, + "loss": 1.0856, + "step": 26550 + }, + { + "epoch": 0.9508478521675291, + "grad_norm": 1.889328956604004, + "learning_rate": 1.2635823618863951e-06, + "loss": 1.1596, + "step": 26551 + }, + { + "epoch": 0.9508836642970975, + "grad_norm": 1.7026673555374146, + "learning_rate": 1.2617449582258744e-06, + "loss": 1.1405, + "step": 26552 + }, + { + "epoch": 0.9509194764266657, + "grad_norm": 1.418174147605896, + "learning_rate": 1.2599088829612249e-06, + "loss": 1.1645, + "step": 26553 + }, + { + "epoch": 0.950955288556234, + "grad_norm": 1.3085784912109375, + "learning_rate": 1.2580741361171267e-06, + "loss": 1.1874, + "step": 26554 + }, + { + "epoch": 0.9509911006858023, + "grad_norm": 1.4926855564117432, + "learning_rate": 1.2562407177182712e-06, + "loss": 0.9937, + "step": 26555 + }, + { + "epoch": 0.9510269128153706, + "grad_norm": 1.563780426979065, + "learning_rate": 1.2544086277893386e-06, + "loss": 1.2165, + "step": 26556 + }, + { + "epoch": 0.9510627249449388, + "grad_norm": 1.7109061479568481, + "learning_rate": 1.2525778663549537e-06, + "loss": 1.0284, + "step": 26557 + }, + { + "epoch": 0.9510985370745071, + "grad_norm": 1.5412384271621704, + "learning_rate": 1.2507484334397634e-06, + "loss": 1.1188, + "step": 26558 + }, + { + "epoch": 0.9511343492040755, + "grad_norm": 1.275651454925537, + "learning_rate": 1.2489203290683703e-06, + "loss": 0.9695, + "step": 26559 + }, + { + "epoch": 0.9511701613336437, + "grad_norm": 1.2859424352645874, + "learning_rate": 1.2470935532653772e-06, + "loss": 1.1054, + "step": 26560 + }, + { + "epoch": 0.951205973463212, + "grad_norm": 1.3775041103363037, + "learning_rate": 1.2452681060553639e-06, + "loss": 1.1343, + "step": 26561 + }, + { + "epoch": 0.9512417855927803, + "grad_norm": 1.2928905487060547, + "learning_rate": 1.243443987462878e-06, + "loss": 1.0798, + "step": 26562 + }, + { + "epoch": 0.9512775977223485, + "grad_norm": 2.052504062652588, + "learning_rate": 1.2416211975124658e-06, + "loss": 1.2224, + "step": 26563 + }, + { + "epoch": 0.9513134098519168, + "grad_norm": 1.3983973264694214, + "learning_rate": 1.2397997362286528e-06, + "loss": 0.8954, + "step": 26564 + }, + { + "epoch": 0.9513492219814851, + "grad_norm": 1.3080005645751953, + "learning_rate": 1.2379796036359526e-06, + "loss": 1.0761, + "step": 26565 + }, + { + "epoch": 0.9513850341110535, + "grad_norm": 1.161229133605957, + "learning_rate": 1.2361607997588343e-06, + "loss": 1.1758, + "step": 26566 + }, + { + "epoch": 0.9514208462406217, + "grad_norm": 1.6100637912750244, + "learning_rate": 1.2343433246217673e-06, + "loss": 1.0901, + "step": 26567 + }, + { + "epoch": 0.95145665837019, + "grad_norm": 1.6007354259490967, + "learning_rate": 1.232527178249232e-06, + "loss": 1.1819, + "step": 26568 + }, + { + "epoch": 0.9514924704997583, + "grad_norm": 1.5455288887023926, + "learning_rate": 1.2307123606656312e-06, + "loss": 0.9807, + "step": 26569 + }, + { + "epoch": 0.9515282826293265, + "grad_norm": 2.1019139289855957, + "learning_rate": 1.2288988718953897e-06, + "loss": 1.1226, + "step": 26570 + }, + { + "epoch": 0.9515640947588948, + "grad_norm": 1.7150936126708984, + "learning_rate": 1.2270867119629103e-06, + "loss": 1.057, + "step": 26571 + }, + { + "epoch": 0.9515999068884631, + "grad_norm": 1.3463877439498901, + "learning_rate": 1.2252758808925736e-06, + "loss": 0.9808, + "step": 26572 + }, + { + "epoch": 0.9516357190180315, + "grad_norm": 1.4435484409332275, + "learning_rate": 1.2234663787087375e-06, + "loss": 1.0428, + "step": 26573 + }, + { + "epoch": 0.9516715311475997, + "grad_norm": 1.2994035482406616, + "learning_rate": 1.2216582054357495e-06, + "loss": 1.0565, + "step": 26574 + }, + { + "epoch": 0.951707343277168, + "grad_norm": 1.6295610666275024, + "learning_rate": 1.2198513610979346e-06, + "loss": 1.0965, + "step": 26575 + }, + { + "epoch": 0.9517431554067363, + "grad_norm": 1.621154546737671, + "learning_rate": 1.2180458457196064e-06, + "loss": 0.9319, + "step": 26576 + }, + { + "epoch": 0.9517789675363045, + "grad_norm": 1.5034149885177612, + "learning_rate": 1.2162416593250569e-06, + "loss": 1.084, + "step": 26577 + }, + { + "epoch": 0.9518147796658728, + "grad_norm": 1.608107328414917, + "learning_rate": 1.2144388019385333e-06, + "loss": 0.9142, + "step": 26578 + }, + { + "epoch": 0.9518505917954411, + "grad_norm": 1.2119991779327393, + "learning_rate": 1.2126372735843272e-06, + "loss": 1.0294, + "step": 26579 + }, + { + "epoch": 0.9518864039250094, + "grad_norm": 1.459324836730957, + "learning_rate": 1.2108370742866526e-06, + "loss": 1.2748, + "step": 26580 + }, + { + "epoch": 0.9519222160545777, + "grad_norm": 1.3668735027313232, + "learning_rate": 1.2090382040697456e-06, + "loss": 1.001, + "step": 26581 + }, + { + "epoch": 0.951958028184146, + "grad_norm": 1.743242859840393, + "learning_rate": 1.2072406629577871e-06, + "loss": 1.0938, + "step": 26582 + }, + { + "epoch": 0.9519938403137143, + "grad_norm": 1.572488784790039, + "learning_rate": 1.2054444509749906e-06, + "loss": 1.1146, + "step": 26583 + }, + { + "epoch": 0.9520296524432825, + "grad_norm": 1.3873049020767212, + "learning_rate": 1.203649568145493e-06, + "loss": 1.0736, + "step": 26584 + }, + { + "epoch": 0.9520654645728508, + "grad_norm": 1.5205507278442383, + "learning_rate": 1.201856014493441e-06, + "loss": 1.1014, + "step": 26585 + }, + { + "epoch": 0.9521012767024191, + "grad_norm": 1.4735630750656128, + "learning_rate": 1.2000637900429934e-06, + "loss": 1.0725, + "step": 26586 + }, + { + "epoch": 0.9521370888319874, + "grad_norm": 1.559936761856079, + "learning_rate": 1.1982728948182308e-06, + "loss": 1.1065, + "step": 26587 + }, + { + "epoch": 0.9521729009615557, + "grad_norm": 1.5121790170669556, + "learning_rate": 1.1964833288432674e-06, + "loss": 1.0277, + "step": 26588 + }, + { + "epoch": 0.952208713091124, + "grad_norm": 1.8528242111206055, + "learning_rate": 1.194695092142173e-06, + "loss": 1.2183, + "step": 26589 + }, + { + "epoch": 0.9522445252206923, + "grad_norm": 1.3369793891906738, + "learning_rate": 1.1929081847390056e-06, + "loss": 0.9516, + "step": 26590 + }, + { + "epoch": 0.9522803373502605, + "grad_norm": 1.4215985536575317, + "learning_rate": 1.191122606657813e-06, + "loss": 0.9455, + "step": 26591 + }, + { + "epoch": 0.9523161494798288, + "grad_norm": 1.6099169254302979, + "learning_rate": 1.1893383579226091e-06, + "loss": 1.1672, + "step": 26592 + }, + { + "epoch": 0.9523519616093971, + "grad_norm": 1.9199492931365967, + "learning_rate": 1.1875554385573972e-06, + "loss": 1.181, + "step": 26593 + }, + { + "epoch": 0.9523877737389654, + "grad_norm": 1.2859488725662231, + "learning_rate": 1.185773848586158e-06, + "loss": 1.0976, + "step": 26594 + }, + { + "epoch": 0.9524235858685337, + "grad_norm": 1.5943154096603394, + "learning_rate": 1.1839935880328946e-06, + "loss": 1.1833, + "step": 26595 + }, + { + "epoch": 0.952459397998102, + "grad_norm": 1.5618759393692017, + "learning_rate": 1.1822146569215097e-06, + "loss": 1.145, + "step": 26596 + }, + { + "epoch": 0.9524952101276702, + "grad_norm": 1.1825463771820068, + "learning_rate": 1.1804370552759735e-06, + "loss": 1.0258, + "step": 26597 + }, + { + "epoch": 0.9525310222572385, + "grad_norm": 1.4622396230697632, + "learning_rate": 1.178660783120189e-06, + "loss": 1.168, + "step": 26598 + }, + { + "epoch": 0.9525668343868068, + "grad_norm": 1.6850746870040894, + "learning_rate": 1.176885840478048e-06, + "loss": 1.3053, + "step": 26599 + }, + { + "epoch": 0.952602646516375, + "grad_norm": 1.424410343170166, + "learning_rate": 1.1751122273734316e-06, + "loss": 0.8854, + "step": 26600 + }, + { + "epoch": 0.9526384586459434, + "grad_norm": 1.8735482692718506, + "learning_rate": 1.1733399438302206e-06, + "loss": 0.9756, + "step": 26601 + }, + { + "epoch": 0.9526742707755117, + "grad_norm": 1.2951726913452148, + "learning_rate": 1.1715689898722404e-06, + "loss": 1.2385, + "step": 26602 + }, + { + "epoch": 0.95271008290508, + "grad_norm": 1.512149453163147, + "learning_rate": 1.1697993655233164e-06, + "loss": 1.0992, + "step": 26603 + }, + { + "epoch": 0.9527458950346482, + "grad_norm": 1.498213529586792, + "learning_rate": 1.1680310708072518e-06, + "loss": 1.124, + "step": 26604 + }, + { + "epoch": 0.9527817071642165, + "grad_norm": 1.3113716840744019, + "learning_rate": 1.1662641057478497e-06, + "loss": 1.0673, + "step": 26605 + }, + { + "epoch": 0.9528175192937848, + "grad_norm": 1.5035881996154785, + "learning_rate": 1.1644984703688799e-06, + "loss": 1.1513, + "step": 26606 + }, + { + "epoch": 0.952853331423353, + "grad_norm": 1.674509882926941, + "learning_rate": 1.1627341646941015e-06, + "loss": 1.3138, + "step": 26607 + }, + { + "epoch": 0.9528891435529214, + "grad_norm": 1.3108892440795898, + "learning_rate": 1.1609711887472286e-06, + "loss": 1.0993, + "step": 26608 + }, + { + "epoch": 0.9529249556824897, + "grad_norm": 1.3720232248306274, + "learning_rate": 1.1592095425520088e-06, + "loss": 0.9989, + "step": 26609 + }, + { + "epoch": 0.952960767812058, + "grad_norm": 1.8876662254333496, + "learning_rate": 1.1574492261321236e-06, + "loss": 1.0156, + "step": 26610 + }, + { + "epoch": 0.9529965799416262, + "grad_norm": 1.5744696855545044, + "learning_rate": 1.1556902395112645e-06, + "loss": 0.9993, + "step": 26611 + }, + { + "epoch": 0.9530323920711945, + "grad_norm": 1.491597294807434, + "learning_rate": 1.1539325827130799e-06, + "loss": 0.9065, + "step": 26612 + }, + { + "epoch": 0.9530682042007628, + "grad_norm": 1.393064260482788, + "learning_rate": 1.1521762557612502e-06, + "loss": 0.9072, + "step": 26613 + }, + { + "epoch": 0.953104016330331, + "grad_norm": 1.6025760173797607, + "learning_rate": 1.1504212586793683e-06, + "loss": 1.1346, + "step": 26614 + }, + { + "epoch": 0.9531398284598994, + "grad_norm": 1.826545000076294, + "learning_rate": 1.1486675914910705e-06, + "loss": 1.2325, + "step": 26615 + }, + { + "epoch": 0.9531756405894677, + "grad_norm": 1.2874442338943481, + "learning_rate": 1.1469152542199379e-06, + "loss": 1.032, + "step": 26616 + }, + { + "epoch": 0.953211452719036, + "grad_norm": 1.1868566274642944, + "learning_rate": 1.1451642468895518e-06, + "loss": 0.8956, + "step": 26617 + }, + { + "epoch": 0.9532472648486042, + "grad_norm": 1.5482405424118042, + "learning_rate": 1.14341456952346e-06, + "loss": 1.059, + "step": 26618 + }, + { + "epoch": 0.9532830769781725, + "grad_norm": 1.7129324674606323, + "learning_rate": 1.1416662221452211e-06, + "loss": 1.0162, + "step": 26619 + }, + { + "epoch": 0.9533188891077408, + "grad_norm": 1.9129208326339722, + "learning_rate": 1.139919204778339e-06, + "loss": 1.0509, + "step": 26620 + }, + { + "epoch": 0.953354701237309, + "grad_norm": 1.166998028755188, + "learning_rate": 1.1381735174463283e-06, + "loss": 0.7474, + "step": 26621 + }, + { + "epoch": 0.9533905133668774, + "grad_norm": 1.3569470643997192, + "learning_rate": 1.1364291601726585e-06, + "loss": 1.133, + "step": 26622 + }, + { + "epoch": 0.9534263254964457, + "grad_norm": 1.364592432975769, + "learning_rate": 1.1346861329808112e-06, + "loss": 1.2071, + "step": 26623 + }, + { + "epoch": 0.953462137626014, + "grad_norm": 1.4866126775741577, + "learning_rate": 1.1329444358942454e-06, + "loss": 0.9795, + "step": 26624 + }, + { + "epoch": 0.9534979497555822, + "grad_norm": 1.33805251121521, + "learning_rate": 1.1312040689363757e-06, + "loss": 1.1009, + "step": 26625 + }, + { + "epoch": 0.9535337618851505, + "grad_norm": 1.3073288202285767, + "learning_rate": 1.1294650321306277e-06, + "loss": 0.9949, + "step": 26626 + }, + { + "epoch": 0.9535695740147188, + "grad_norm": 1.4095855951309204, + "learning_rate": 1.127727325500394e-06, + "loss": 1.1585, + "step": 26627 + }, + { + "epoch": 0.953605386144287, + "grad_norm": 1.4851316213607788, + "learning_rate": 1.1259909490690556e-06, + "loss": 0.9814, + "step": 26628 + }, + { + "epoch": 0.9536411982738554, + "grad_norm": 1.527788519859314, + "learning_rate": 1.1242559028599609e-06, + "loss": 1.1318, + "step": 26629 + }, + { + "epoch": 0.9536770104034237, + "grad_norm": 1.5585026741027832, + "learning_rate": 1.1225221868964686e-06, + "loss": 0.9534, + "step": 26630 + }, + { + "epoch": 0.9537128225329919, + "grad_norm": 1.3876525163650513, + "learning_rate": 1.1207898012018936e-06, + "loss": 1.0512, + "step": 26631 + }, + { + "epoch": 0.9537486346625602, + "grad_norm": 1.2757030725479126, + "learning_rate": 1.1190587457995506e-06, + "loss": 1.3686, + "step": 26632 + }, + { + "epoch": 0.9537844467921285, + "grad_norm": 1.3453384637832642, + "learning_rate": 1.1173290207127207e-06, + "loss": 1.1543, + "step": 26633 + }, + { + "epoch": 0.9538202589216968, + "grad_norm": 1.5206745862960815, + "learning_rate": 1.1156006259646856e-06, + "loss": 1.1416, + "step": 26634 + }, + { + "epoch": 0.953856071051265, + "grad_norm": 1.844997525215149, + "learning_rate": 1.1138735615786933e-06, + "loss": 1.0292, + "step": 26635 + }, + { + "epoch": 0.9538918831808334, + "grad_norm": 1.562644124031067, + "learning_rate": 1.1121478275779696e-06, + "loss": 0.8917, + "step": 26636 + }, + { + "epoch": 0.9539276953104017, + "grad_norm": 1.5063927173614502, + "learning_rate": 1.1104234239857402e-06, + "loss": 1.2323, + "step": 26637 + }, + { + "epoch": 0.9539635074399699, + "grad_norm": 1.485219955444336, + "learning_rate": 1.1087003508252202e-06, + "loss": 1.0, + "step": 26638 + }, + { + "epoch": 0.9539993195695382, + "grad_norm": 1.4128490686416626, + "learning_rate": 1.1069786081195687e-06, + "loss": 1.0918, + "step": 26639 + }, + { + "epoch": 0.9540351316991065, + "grad_norm": 1.2607709169387817, + "learning_rate": 1.105258195891945e-06, + "loss": 1.044, + "step": 26640 + }, + { + "epoch": 0.9540709438286747, + "grad_norm": 1.1871284246444702, + "learning_rate": 1.1035391141655195e-06, + "loss": 1.1718, + "step": 26641 + }, + { + "epoch": 0.954106755958243, + "grad_norm": 1.7555326223373413, + "learning_rate": 1.1018213629634178e-06, + "loss": 1.2244, + "step": 26642 + }, + { + "epoch": 0.9541425680878114, + "grad_norm": 1.4730116128921509, + "learning_rate": 1.1001049423087217e-06, + "loss": 0.7785, + "step": 26643 + }, + { + "epoch": 0.9541783802173797, + "grad_norm": 1.3901748657226562, + "learning_rate": 1.098389852224546e-06, + "loss": 0.8732, + "step": 26644 + }, + { + "epoch": 0.9542141923469479, + "grad_norm": 1.4590297937393188, + "learning_rate": 1.0966760927339726e-06, + "loss": 1.1119, + "step": 26645 + }, + { + "epoch": 0.9542500044765162, + "grad_norm": 1.561053991317749, + "learning_rate": 1.094963663860027e-06, + "loss": 1.1488, + "step": 26646 + }, + { + "epoch": 0.9542858166060845, + "grad_norm": 1.2871469259262085, + "learning_rate": 1.0932525656257796e-06, + "loss": 1.1408, + "step": 26647 + }, + { + "epoch": 0.9543216287356527, + "grad_norm": 1.407989263534546, + "learning_rate": 1.0915427980542348e-06, + "loss": 1.2362, + "step": 26648 + }, + { + "epoch": 0.954357440865221, + "grad_norm": 1.3541524410247803, + "learning_rate": 1.089834361168407e-06, + "loss": 0.8952, + "step": 26649 + }, + { + "epoch": 0.9543932529947894, + "grad_norm": 1.7972549200057983, + "learning_rate": 1.088127254991267e-06, + "loss": 1.1278, + "step": 26650 + }, + { + "epoch": 0.9544290651243577, + "grad_norm": 1.4819035530090332, + "learning_rate": 1.086421479545785e-06, + "loss": 1.1657, + "step": 26651 + }, + { + "epoch": 0.9544648772539259, + "grad_norm": 1.363261342048645, + "learning_rate": 1.0847170348549096e-06, + "loss": 1.0217, + "step": 26652 + }, + { + "epoch": 0.9545006893834942, + "grad_norm": 1.834104061126709, + "learning_rate": 1.0830139209415779e-06, + "loss": 1.2108, + "step": 26653 + }, + { + "epoch": 0.9545365015130625, + "grad_norm": 1.4898593425750732, + "learning_rate": 1.081312137828716e-06, + "loss": 1.3516, + "step": 26654 + }, + { + "epoch": 0.9545723136426307, + "grad_norm": 1.5716116428375244, + "learning_rate": 1.0796116855391724e-06, + "loss": 0.8638, + "step": 26655 + }, + { + "epoch": 0.954608125772199, + "grad_norm": 1.767982840538025, + "learning_rate": 1.0779125640958843e-06, + "loss": 1.1661, + "step": 26656 + }, + { + "epoch": 0.9546439379017674, + "grad_norm": 1.4093677997589111, + "learning_rate": 1.0762147735216665e-06, + "loss": 1.1272, + "step": 26657 + }, + { + "epoch": 0.9546797500313356, + "grad_norm": 1.682011365890503, + "learning_rate": 1.0745183138393788e-06, + "loss": 1.1543, + "step": 26658 + }, + { + "epoch": 0.9547155621609039, + "grad_norm": 1.363896131515503, + "learning_rate": 1.0728231850718363e-06, + "loss": 1.2203, + "step": 26659 + }, + { + "epoch": 0.9547513742904722, + "grad_norm": 1.4769057035446167, + "learning_rate": 1.071129387241865e-06, + "loss": 1.1391, + "step": 26660 + }, + { + "epoch": 0.9547871864200405, + "grad_norm": 1.4058878421783447, + "learning_rate": 1.0694369203722354e-06, + "loss": 1.1988, + "step": 26661 + }, + { + "epoch": 0.9548229985496087, + "grad_norm": 2.173013210296631, + "learning_rate": 1.0677457844857186e-06, + "loss": 1.1461, + "step": 26662 + }, + { + "epoch": 0.954858810679177, + "grad_norm": 1.4888545274734497, + "learning_rate": 1.0660559796050739e-06, + "loss": 1.1045, + "step": 26663 + }, + { + "epoch": 0.9548946228087454, + "grad_norm": 1.9700307846069336, + "learning_rate": 1.0643675057530166e-06, + "loss": 0.9718, + "step": 26664 + }, + { + "epoch": 0.9549304349383136, + "grad_norm": 1.336044430732727, + "learning_rate": 1.0626803629522951e-06, + "loss": 0.7431, + "step": 26665 + }, + { + "epoch": 0.9549662470678819, + "grad_norm": 1.3311737775802612, + "learning_rate": 1.0609945512255692e-06, + "loss": 0.9634, + "step": 26666 + }, + { + "epoch": 0.9550020591974502, + "grad_norm": 1.2732720375061035, + "learning_rate": 1.0593100705955538e-06, + "loss": 1.0689, + "step": 26667 + }, + { + "epoch": 0.9550378713270185, + "grad_norm": 1.378029227256775, + "learning_rate": 1.0576269210848867e-06, + "loss": 1.167, + "step": 26668 + }, + { + "epoch": 0.9550736834565867, + "grad_norm": 1.6427106857299805, + "learning_rate": 1.055945102716227e-06, + "loss": 0.9749, + "step": 26669 + }, + { + "epoch": 0.955109495586155, + "grad_norm": 1.5090166330337524, + "learning_rate": 1.0542646155122015e-06, + "loss": 1.3306, + "step": 26670 + }, + { + "epoch": 0.9551453077157234, + "grad_norm": 1.4205865859985352, + "learning_rate": 1.0525854594954143e-06, + "loss": 1.017, + "step": 26671 + }, + { + "epoch": 0.9551811198452916, + "grad_norm": 1.67210853099823, + "learning_rate": 1.0509076346884583e-06, + "loss": 1.407, + "step": 26672 + }, + { + "epoch": 0.9552169319748599, + "grad_norm": 1.7037724256515503, + "learning_rate": 1.0492311411138934e-06, + "loss": 1.204, + "step": 26673 + }, + { + "epoch": 0.9552527441044282, + "grad_norm": 1.906957745552063, + "learning_rate": 1.0475559787943012e-06, + "loss": 1.0319, + "step": 26674 + }, + { + "epoch": 0.9552885562339964, + "grad_norm": 1.5439321994781494, + "learning_rate": 1.0458821477521974e-06, + "loss": 1.1177, + "step": 26675 + }, + { + "epoch": 0.9553243683635647, + "grad_norm": 1.6974098682403564, + "learning_rate": 1.0442096480101082e-06, + "loss": 0.9455, + "step": 26676 + }, + { + "epoch": 0.955360180493133, + "grad_norm": 1.6572614908218384, + "learning_rate": 1.042538479590527e-06, + "loss": 1.2902, + "step": 26677 + }, + { + "epoch": 0.9553959926227014, + "grad_norm": 1.3313251733779907, + "learning_rate": 1.0408686425159574e-06, + "loss": 0.9554, + "step": 26678 + }, + { + "epoch": 0.9554318047522696, + "grad_norm": 1.5967316627502441, + "learning_rate": 1.0392001368088377e-06, + "loss": 1.1003, + "step": 26679 + }, + { + "epoch": 0.9554676168818379, + "grad_norm": 1.5380511283874512, + "learning_rate": 1.0375329624916386e-06, + "loss": 1.039, + "step": 26680 + }, + { + "epoch": 0.9555034290114062, + "grad_norm": 1.557809591293335, + "learning_rate": 1.0358671195867865e-06, + "loss": 1.0636, + "step": 26681 + }, + { + "epoch": 0.9555392411409744, + "grad_norm": 1.724575161933899, + "learning_rate": 1.0342026081166745e-06, + "loss": 1.0085, + "step": 26682 + }, + { + "epoch": 0.9555750532705427, + "grad_norm": 1.2539680004119873, + "learning_rate": 1.0325394281037293e-06, + "loss": 1.0675, + "step": 26683 + }, + { + "epoch": 0.955610865400111, + "grad_norm": 1.3381943702697754, + "learning_rate": 1.0308775795702775e-06, + "loss": 0.9205, + "step": 26684 + }, + { + "epoch": 0.9556466775296794, + "grad_norm": 1.2941486835479736, + "learning_rate": 1.0292170625387342e-06, + "loss": 1.0982, + "step": 26685 + }, + { + "epoch": 0.9556824896592476, + "grad_norm": 1.36019766330719, + "learning_rate": 1.0275578770313933e-06, + "loss": 1.1183, + "step": 26686 + }, + { + "epoch": 0.9557183017888159, + "grad_norm": 1.3647552728652954, + "learning_rate": 1.025900023070614e-06, + "loss": 1.1477, + "step": 26687 + }, + { + "epoch": 0.9557541139183842, + "grad_norm": 1.3864600658416748, + "learning_rate": 1.0242435006786677e-06, + "loss": 1.2, + "step": 26688 + }, + { + "epoch": 0.9557899260479524, + "grad_norm": 1.782486915588379, + "learning_rate": 1.0225883098778588e-06, + "loss": 1.0158, + "step": 26689 + }, + { + "epoch": 0.9558257381775207, + "grad_norm": 1.4336808919906616, + "learning_rate": 1.0209344506904694e-06, + "loss": 1.0762, + "step": 26690 + }, + { + "epoch": 0.955861550307089, + "grad_norm": 1.6423041820526123, + "learning_rate": 1.019281923138715e-06, + "loss": 1.0394, + "step": 26691 + }, + { + "epoch": 0.9558973624366573, + "grad_norm": 2.068969964981079, + "learning_rate": 1.0176307272448448e-06, + "loss": 0.9988, + "step": 26692 + }, + { + "epoch": 0.9559331745662256, + "grad_norm": 1.5134238004684448, + "learning_rate": 1.015980863031074e-06, + "loss": 1.1861, + "step": 26693 + }, + { + "epoch": 0.9559689866957939, + "grad_norm": 1.7603572607040405, + "learning_rate": 1.0143323305196184e-06, + "loss": 1.0884, + "step": 26694 + }, + { + "epoch": 0.9560047988253622, + "grad_norm": 1.3267236948013306, + "learning_rate": 1.0126851297326157e-06, + "loss": 0.9523, + "step": 26695 + }, + { + "epoch": 0.9560406109549304, + "grad_norm": 1.38856840133667, + "learning_rate": 1.0110392606922703e-06, + "loss": 1.0331, + "step": 26696 + }, + { + "epoch": 0.9560764230844987, + "grad_norm": 1.6466617584228516, + "learning_rate": 1.0093947234206868e-06, + "loss": 1.1585, + "step": 26697 + }, + { + "epoch": 0.956112235214067, + "grad_norm": 1.8390421867370605, + "learning_rate": 1.0077515179400254e-06, + "loss": 1.1376, + "step": 26698 + }, + { + "epoch": 0.9561480473436353, + "grad_norm": 1.4711246490478516, + "learning_rate": 1.0061096442723683e-06, + "loss": 0.9392, + "step": 26699 + }, + { + "epoch": 0.9561838594732036, + "grad_norm": 1.4126781225204468, + "learning_rate": 1.004469102439809e-06, + "loss": 0.9256, + "step": 26700 + }, + { + "epoch": 0.9562196716027719, + "grad_norm": 1.3850103616714478, + "learning_rate": 1.0028298924644408e-06, + "loss": 0.9364, + "step": 26701 + }, + { + "epoch": 0.9562554837323402, + "grad_norm": 1.6716570854187012, + "learning_rate": 1.0011920143682796e-06, + "loss": 0.9992, + "step": 26702 + }, + { + "epoch": 0.9562912958619084, + "grad_norm": 1.4395291805267334, + "learning_rate": 9.995554681733855e-07, + "loss": 0.8354, + "step": 26703 + }, + { + "epoch": 0.9563271079914767, + "grad_norm": 1.6661293506622314, + "learning_rate": 9.97920253901774e-07, + "loss": 1.2286, + "step": 26704 + }, + { + "epoch": 0.956362920121045, + "grad_norm": 1.397871732711792, + "learning_rate": 9.96286371575439e-07, + "loss": 1.2003, + "step": 26705 + }, + { + "epoch": 0.9563987322506133, + "grad_norm": 2.380371332168579, + "learning_rate": 9.946538212163736e-07, + "loss": 1.2807, + "step": 26706 + }, + { + "epoch": 0.9564345443801816, + "grad_norm": 1.4239422082901, + "learning_rate": 9.930226028465272e-07, + "loss": 1.1204, + "step": 26707 + }, + { + "epoch": 0.9564703565097499, + "grad_norm": 1.9449031352996826, + "learning_rate": 9.913927164878488e-07, + "loss": 1.2162, + "step": 26708 + }, + { + "epoch": 0.9565061686393181, + "grad_norm": 1.3686479330062866, + "learning_rate": 9.897641621622765e-07, + "loss": 0.9049, + "step": 26709 + }, + { + "epoch": 0.9565419807688864, + "grad_norm": 1.8841912746429443, + "learning_rate": 9.88136939891704e-07, + "loss": 1.1075, + "step": 26710 + }, + { + "epoch": 0.9565777928984547, + "grad_norm": 1.7666676044464111, + "learning_rate": 9.865110496980356e-07, + "loss": 1.1812, + "step": 26711 + }, + { + "epoch": 0.956613605028023, + "grad_norm": 1.4618432521820068, + "learning_rate": 9.84886491603154e-07, + "loss": 1.0577, + "step": 26712 + }, + { + "epoch": 0.9566494171575913, + "grad_norm": 1.6882456541061401, + "learning_rate": 9.832632656288864e-07, + "loss": 1.1604, + "step": 26713 + }, + { + "epoch": 0.9566852292871596, + "grad_norm": 1.560860276222229, + "learning_rate": 9.81641371797104e-07, + "loss": 1.1984, + "step": 26714 + }, + { + "epoch": 0.9567210414167279, + "grad_norm": 1.0977997779846191, + "learning_rate": 9.800208101296115e-07, + "loss": 1.1683, + "step": 26715 + }, + { + "epoch": 0.9567568535462961, + "grad_norm": 1.5384551286697388, + "learning_rate": 9.784015806482028e-07, + "loss": 0.9612, + "step": 26716 + }, + { + "epoch": 0.9567926656758644, + "grad_norm": 1.7710806131362915, + "learning_rate": 9.767836833746714e-07, + "loss": 1.1275, + "step": 26717 + }, + { + "epoch": 0.9568284778054327, + "grad_norm": 1.43655264377594, + "learning_rate": 9.751671183307888e-07, + "loss": 1.0719, + "step": 26718 + }, + { + "epoch": 0.956864289935001, + "grad_norm": 1.6348739862442017, + "learning_rate": 9.735518855383152e-07, + "loss": 1.0985, + "step": 26719 + }, + { + "epoch": 0.9569001020645693, + "grad_norm": 1.561818242073059, + "learning_rate": 9.719379850189447e-07, + "loss": 1.0511, + "step": 26720 + }, + { + "epoch": 0.9569359141941376, + "grad_norm": 1.8695276975631714, + "learning_rate": 9.703254167944154e-07, + "loss": 1.057, + "step": 26721 + }, + { + "epoch": 0.9569717263237059, + "grad_norm": 2.0262091159820557, + "learning_rate": 9.68714180886421e-07, + "loss": 1.0673, + "step": 26722 + }, + { + "epoch": 0.9570075384532741, + "grad_norm": 1.4990991353988647, + "learning_rate": 9.67104277316644e-07, + "loss": 1.1833, + "step": 26723 + }, + { + "epoch": 0.9570433505828424, + "grad_norm": 1.7588615417480469, + "learning_rate": 9.654957061067228e-07, + "loss": 1.0184, + "step": 26724 + }, + { + "epoch": 0.9570791627124107, + "grad_norm": 1.2971839904785156, + "learning_rate": 9.638884672783176e-07, + "loss": 0.9879, + "step": 26725 + }, + { + "epoch": 0.9571149748419789, + "grad_norm": 1.449842095375061, + "learning_rate": 9.622825608530561e-07, + "loss": 0.8649, + "step": 26726 + }, + { + "epoch": 0.9571507869715473, + "grad_norm": 1.2540650367736816, + "learning_rate": 9.606779868525206e-07, + "loss": 0.844, + "step": 26727 + }, + { + "epoch": 0.9571865991011156, + "grad_norm": 1.3955947160720825, + "learning_rate": 9.590747452983161e-07, + "loss": 1.2102, + "step": 26728 + }, + { + "epoch": 0.9572224112306839, + "grad_norm": 1.5455760955810547, + "learning_rate": 9.574728362120033e-07, + "loss": 0.7834, + "step": 26729 + }, + { + "epoch": 0.9572582233602521, + "grad_norm": 1.3471908569335938, + "learning_rate": 9.558722596151425e-07, + "loss": 1.0196, + "step": 26730 + }, + { + "epoch": 0.9572940354898204, + "grad_norm": 1.5231918096542358, + "learning_rate": 9.5427301552925e-07, + "loss": 1.0286, + "step": 26731 + }, + { + "epoch": 0.9573298476193887, + "grad_norm": 1.7280899286270142, + "learning_rate": 9.526751039758641e-07, + "loss": 1.2357, + "step": 26732 + }, + { + "epoch": 0.9573656597489569, + "grad_norm": 1.5723446607589722, + "learning_rate": 9.510785249764786e-07, + "loss": 1.0278, + "step": 26733 + }, + { + "epoch": 0.9574014718785253, + "grad_norm": 1.1390327215194702, + "learning_rate": 9.494832785525653e-07, + "loss": 1.0402, + "step": 26734 + }, + { + "epoch": 0.9574372840080936, + "grad_norm": 1.7468616962432861, + "learning_rate": 9.478893647255849e-07, + "loss": 1.0221, + "step": 26735 + }, + { + "epoch": 0.9574730961376618, + "grad_norm": 1.4803739786148071, + "learning_rate": 9.462967835169756e-07, + "loss": 1.1079, + "step": 26736 + }, + { + "epoch": 0.9575089082672301, + "grad_norm": 1.506550908088684, + "learning_rate": 9.44705534948187e-07, + "loss": 1.1651, + "step": 26737 + }, + { + "epoch": 0.9575447203967984, + "grad_norm": 1.6281744241714478, + "learning_rate": 9.431156190406131e-07, + "loss": 1.0284, + "step": 26738 + }, + { + "epoch": 0.9575805325263667, + "grad_norm": 1.3355422019958496, + "learning_rate": 9.41527035815637e-07, + "loss": 0.9415, + "step": 26739 + }, + { + "epoch": 0.9576163446559349, + "grad_norm": 1.6030081510543823, + "learning_rate": 9.399397852946413e-07, + "loss": 1.1587, + "step": 26740 + }, + { + "epoch": 0.9576521567855032, + "grad_norm": 1.5359044075012207, + "learning_rate": 9.383538674989756e-07, + "loss": 1.2566, + "step": 26741 + }, + { + "epoch": 0.9576879689150716, + "grad_norm": 1.26615309715271, + "learning_rate": 9.367692824499786e-07, + "loss": 1.0622, + "step": 26742 + }, + { + "epoch": 0.9577237810446398, + "grad_norm": 1.5840556621551514, + "learning_rate": 9.351860301689775e-07, + "loss": 0.9335, + "step": 26743 + }, + { + "epoch": 0.9577595931742081, + "grad_norm": 1.5295665264129639, + "learning_rate": 9.336041106772553e-07, + "loss": 1.1569, + "step": 26744 + }, + { + "epoch": 0.9577954053037764, + "grad_norm": 1.2071008682250977, + "learning_rate": 9.320235239961061e-07, + "loss": 0.8957, + "step": 26745 + }, + { + "epoch": 0.9578312174333447, + "grad_norm": 2.7056052684783936, + "learning_rate": 9.304442701467908e-07, + "loss": 1.1945, + "step": 26746 + }, + { + "epoch": 0.9578670295629129, + "grad_norm": 1.6888742446899414, + "learning_rate": 9.288663491505478e-07, + "loss": 1.0301, + "step": 26747 + }, + { + "epoch": 0.9579028416924812, + "grad_norm": 1.4039424657821655, + "learning_rate": 9.27289761028638e-07, + "loss": 0.943, + "step": 26748 + }, + { + "epoch": 0.9579386538220496, + "grad_norm": 1.5235915184020996, + "learning_rate": 9.257145058022331e-07, + "loss": 1.0314, + "step": 26749 + }, + { + "epoch": 0.9579744659516178, + "grad_norm": 1.4115251302719116, + "learning_rate": 9.241405834925388e-07, + "loss": 1.0086, + "step": 26750 + }, + { + "epoch": 0.9580102780811861, + "grad_norm": 1.3803364038467407, + "learning_rate": 9.225679941207488e-07, + "loss": 1.1057, + "step": 26751 + }, + { + "epoch": 0.9580460902107544, + "grad_norm": 1.7344974279403687, + "learning_rate": 9.20996737708002e-07, + "loss": 1.1324, + "step": 26752 + }, + { + "epoch": 0.9580819023403226, + "grad_norm": 1.3309141397476196, + "learning_rate": 9.19426814275437e-07, + "loss": 1.1787, + "step": 26753 + }, + { + "epoch": 0.9581177144698909, + "grad_norm": 1.3737813234329224, + "learning_rate": 9.178582238441702e-07, + "loss": 0.9068, + "step": 26754 + }, + { + "epoch": 0.9581535265994592, + "grad_norm": 1.3917659521102905, + "learning_rate": 9.162909664353292e-07, + "loss": 1.2245, + "step": 26755 + }, + { + "epoch": 0.9581893387290276, + "grad_norm": 1.4107813835144043, + "learning_rate": 9.14725042069986e-07, + "loss": 1.12, + "step": 26756 + }, + { + "epoch": 0.9582251508585958, + "grad_norm": 1.5736701488494873, + "learning_rate": 9.131604507691904e-07, + "loss": 1.1573, + "step": 26757 + }, + { + "epoch": 0.9582609629881641, + "grad_norm": 1.4013170003890991, + "learning_rate": 9.115971925540257e-07, + "loss": 1.0687, + "step": 26758 + }, + { + "epoch": 0.9582967751177324, + "grad_norm": 1.3062113523483276, + "learning_rate": 9.100352674454971e-07, + "loss": 1.0732, + "step": 26759 + }, + { + "epoch": 0.9583325872473006, + "grad_norm": 1.6367971897125244, + "learning_rate": 9.084746754646323e-07, + "loss": 1.0465, + "step": 26760 + }, + { + "epoch": 0.9583683993768689, + "grad_norm": 1.4531440734863281, + "learning_rate": 9.069154166324146e-07, + "loss": 1.0451, + "step": 26761 + }, + { + "epoch": 0.9584042115064372, + "grad_norm": 2.096036911010742, + "learning_rate": 9.053574909698381e-07, + "loss": 1.1632, + "step": 26762 + }, + { + "epoch": 0.9584400236360056, + "grad_norm": 1.2934012413024902, + "learning_rate": 9.038008984978419e-07, + "loss": 1.1229, + "step": 26763 + }, + { + "epoch": 0.9584758357655738, + "grad_norm": 2.5307557582855225, + "learning_rate": 9.022456392373868e-07, + "loss": 1.0242, + "step": 26764 + }, + { + "epoch": 0.9585116478951421, + "grad_norm": 1.3093810081481934, + "learning_rate": 9.006917132093895e-07, + "loss": 0.889, + "step": 26765 + }, + { + "epoch": 0.9585474600247104, + "grad_norm": 1.4682573080062866, + "learning_rate": 8.991391204347555e-07, + "loss": 1.1424, + "step": 26766 + }, + { + "epoch": 0.9585832721542786, + "grad_norm": 1.4666227102279663, + "learning_rate": 8.97587860934368e-07, + "loss": 0.9422, + "step": 26767 + }, + { + "epoch": 0.9586190842838469, + "grad_norm": 1.7877293825149536, + "learning_rate": 8.960379347291103e-07, + "loss": 1.2723, + "step": 26768 + }, + { + "epoch": 0.9586548964134152, + "grad_norm": 1.6681550741195679, + "learning_rate": 8.944893418398326e-07, + "loss": 1.0987, + "step": 26769 + }, + { + "epoch": 0.9586907085429835, + "grad_norm": 1.5579547882080078, + "learning_rate": 8.929420822873513e-07, + "loss": 1.1609, + "step": 26770 + }, + { + "epoch": 0.9587265206725518, + "grad_norm": 1.3640798330307007, + "learning_rate": 8.913961560925055e-07, + "loss": 0.9877, + "step": 26771 + }, + { + "epoch": 0.9587623328021201, + "grad_norm": 1.2399240732192993, + "learning_rate": 8.898515632760784e-07, + "loss": 0.7456, + "step": 26772 + }, + { + "epoch": 0.9587981449316884, + "grad_norm": 1.4734822511672974, + "learning_rate": 8.883083038588536e-07, + "loss": 1.2896, + "step": 26773 + }, + { + "epoch": 0.9588339570612566, + "grad_norm": 1.7363454103469849, + "learning_rate": 8.867663778616031e-07, + "loss": 1.1522, + "step": 26774 + }, + { + "epoch": 0.9588697691908249, + "grad_norm": 1.3897523880004883, + "learning_rate": 8.852257853050661e-07, + "loss": 1.2727, + "step": 26775 + }, + { + "epoch": 0.9589055813203932, + "grad_norm": 1.8103677034378052, + "learning_rate": 8.836865262099481e-07, + "loss": 1.1892, + "step": 26776 + }, + { + "epoch": 0.9589413934499615, + "grad_norm": 1.3900471925735474, + "learning_rate": 8.821486005969992e-07, + "loss": 0.927, + "step": 26777 + }, + { + "epoch": 0.9589772055795298, + "grad_norm": 1.1905708312988281, + "learning_rate": 8.806120084868807e-07, + "loss": 0.9871, + "step": 26778 + }, + { + "epoch": 0.9590130177090981, + "grad_norm": 1.5056824684143066, + "learning_rate": 8.79076749900265e-07, + "loss": 1.1169, + "step": 26779 + }, + { + "epoch": 0.9590488298386664, + "grad_norm": 1.5316660404205322, + "learning_rate": 8.775428248578243e-07, + "loss": 1.426, + "step": 26780 + }, + { + "epoch": 0.9590846419682346, + "grad_norm": 1.401631474494934, + "learning_rate": 8.760102333801756e-07, + "loss": 1.0135, + "step": 26781 + }, + { + "epoch": 0.9591204540978029, + "grad_norm": 1.5082757472991943, + "learning_rate": 8.744789754879579e-07, + "loss": 1.1296, + "step": 26782 + }, + { + "epoch": 0.9591562662273712, + "grad_norm": 1.3102411031723022, + "learning_rate": 8.729490512017547e-07, + "loss": 1.0, + "step": 26783 + }, + { + "epoch": 0.9591920783569395, + "grad_norm": 1.5534985065460205, + "learning_rate": 8.714204605421716e-07, + "loss": 1.1067, + "step": 26784 + }, + { + "epoch": 0.9592278904865078, + "grad_norm": 1.461678147315979, + "learning_rate": 8.69893203529748e-07, + "loss": 0.9522, + "step": 26785 + }, + { + "epoch": 0.9592637026160761, + "grad_norm": 1.6650264263153076, + "learning_rate": 8.683672801850451e-07, + "loss": 1.1534, + "step": 26786 + }, + { + "epoch": 0.9592995147456443, + "grad_norm": 1.2357559204101562, + "learning_rate": 8.668426905285909e-07, + "loss": 0.6914, + "step": 26787 + }, + { + "epoch": 0.9593353268752126, + "grad_norm": 1.5974262952804565, + "learning_rate": 8.653194345808913e-07, + "loss": 1.1534, + "step": 26788 + }, + { + "epoch": 0.9593711390047809, + "grad_norm": 1.3964574337005615, + "learning_rate": 8.63797512362452e-07, + "loss": 0.9145, + "step": 26789 + }, + { + "epoch": 0.9594069511343492, + "grad_norm": 1.4922733306884766, + "learning_rate": 8.622769238937345e-07, + "loss": 0.9677, + "step": 26790 + }, + { + "epoch": 0.9594427632639175, + "grad_norm": 1.7145811319351196, + "learning_rate": 8.607576691952002e-07, + "loss": 1.1691, + "step": 26791 + }, + { + "epoch": 0.9594785753934858, + "grad_norm": 1.6086589097976685, + "learning_rate": 8.592397482872993e-07, + "loss": 1.0935, + "step": 26792 + }, + { + "epoch": 0.9595143875230541, + "grad_norm": 1.5316123962402344, + "learning_rate": 8.577231611904379e-07, + "loss": 1.1987, + "step": 26793 + }, + { + "epoch": 0.9595501996526223, + "grad_norm": 1.4183751344680786, + "learning_rate": 8.562079079250219e-07, + "loss": 0.9787, + "step": 26794 + }, + { + "epoch": 0.9595860117821906, + "grad_norm": 1.7139301300048828, + "learning_rate": 8.546939885114569e-07, + "loss": 1.0105, + "step": 26795 + }, + { + "epoch": 0.9596218239117589, + "grad_norm": 1.474698543548584, + "learning_rate": 8.531814029700935e-07, + "loss": 1.0651, + "step": 26796 + }, + { + "epoch": 0.9596576360413271, + "grad_norm": 1.4403373003005981, + "learning_rate": 8.516701513212821e-07, + "loss": 1.1294, + "step": 26797 + }, + { + "epoch": 0.9596934481708955, + "grad_norm": 1.4990721940994263, + "learning_rate": 8.501602335853509e-07, + "loss": 0.9463, + "step": 26798 + }, + { + "epoch": 0.9597292603004638, + "grad_norm": 1.7553907632827759, + "learning_rate": 8.48651649782628e-07, + "loss": 1.2334, + "step": 26799 + }, + { + "epoch": 0.9597650724300321, + "grad_norm": 1.517164707183838, + "learning_rate": 8.471443999333972e-07, + "loss": 1.0469, + "step": 26800 + }, + { + "epoch": 0.9598008845596003, + "grad_norm": 1.6351141929626465, + "learning_rate": 8.456384840579423e-07, + "loss": 1.1174, + "step": 26801 + }, + { + "epoch": 0.9598366966891686, + "grad_norm": 1.5011882781982422, + "learning_rate": 8.441339021765138e-07, + "loss": 1.0401, + "step": 26802 + }, + { + "epoch": 0.9598725088187369, + "grad_norm": 1.836678385734558, + "learning_rate": 8.426306543093732e-07, + "loss": 1.2453, + "step": 26803 + }, + { + "epoch": 0.9599083209483051, + "grad_norm": 1.5514353513717651, + "learning_rate": 8.411287404767265e-07, + "loss": 1.1449, + "step": 26804 + }, + { + "epoch": 0.9599441330778735, + "grad_norm": 1.5375672578811646, + "learning_rate": 8.396281606987799e-07, + "loss": 1.1205, + "step": 26805 + }, + { + "epoch": 0.9599799452074418, + "grad_norm": 1.5559113025665283, + "learning_rate": 8.381289149957395e-07, + "loss": 1.1605, + "step": 26806 + }, + { + "epoch": 0.9600157573370101, + "grad_norm": 1.3614110946655273, + "learning_rate": 8.366310033877667e-07, + "loss": 1.2005, + "step": 26807 + }, + { + "epoch": 0.9600515694665783, + "grad_norm": 1.752038836479187, + "learning_rate": 8.351344258950123e-07, + "loss": 1.3862, + "step": 26808 + }, + { + "epoch": 0.9600873815961466, + "grad_norm": 1.406797170639038, + "learning_rate": 8.336391825376044e-07, + "loss": 0.9191, + "step": 26809 + }, + { + "epoch": 0.9601231937257149, + "grad_norm": 1.4507997035980225, + "learning_rate": 8.321452733356605e-07, + "loss": 1.0686, + "step": 26810 + }, + { + "epoch": 0.9601590058552831, + "grad_norm": 1.8443338871002197, + "learning_rate": 8.306526983092977e-07, + "loss": 1.069, + "step": 26811 + }, + { + "epoch": 0.9601948179848515, + "grad_norm": 1.6526250839233398, + "learning_rate": 8.291614574785777e-07, + "loss": 1.0519, + "step": 26812 + }, + { + "epoch": 0.9602306301144198, + "grad_norm": 1.4573476314544678, + "learning_rate": 8.276715508635624e-07, + "loss": 1.029, + "step": 26813 + }, + { + "epoch": 0.960266442243988, + "grad_norm": 1.4699375629425049, + "learning_rate": 8.261829784843133e-07, + "loss": 0.9199, + "step": 26814 + }, + { + "epoch": 0.9603022543735563, + "grad_norm": 1.6759146451950073, + "learning_rate": 8.246957403608479e-07, + "loss": 1.0761, + "step": 26815 + }, + { + "epoch": 0.9603380665031246, + "grad_norm": 1.559136986732483, + "learning_rate": 8.232098365131613e-07, + "loss": 0.9823, + "step": 26816 + }, + { + "epoch": 0.9603738786326929, + "grad_norm": 1.2263898849487305, + "learning_rate": 8.217252669612708e-07, + "loss": 1.0086, + "step": 26817 + }, + { + "epoch": 0.9604096907622611, + "grad_norm": 1.6411705017089844, + "learning_rate": 8.20242031725138e-07, + "loss": 1.0916, + "step": 26818 + }, + { + "epoch": 0.9604455028918295, + "grad_norm": 1.8490045070648193, + "learning_rate": 8.187601308247028e-07, + "loss": 1.1495, + "step": 26819 + }, + { + "epoch": 0.9604813150213978, + "grad_norm": 1.386022925376892, + "learning_rate": 8.172795642799269e-07, + "loss": 0.9433, + "step": 26820 + }, + { + "epoch": 0.960517127150966, + "grad_norm": 1.3506804704666138, + "learning_rate": 8.158003321107167e-07, + "loss": 0.9397, + "step": 26821 + }, + { + "epoch": 0.9605529392805343, + "grad_norm": 1.3213465213775635, + "learning_rate": 8.143224343369671e-07, + "loss": 1.1132, + "step": 26822 + }, + { + "epoch": 0.9605887514101026, + "grad_norm": 1.4989062547683716, + "learning_rate": 8.128458709785736e-07, + "loss": 1.1456, + "step": 26823 + }, + { + "epoch": 0.9606245635396709, + "grad_norm": 1.7679239511489868, + "learning_rate": 8.113706420553868e-07, + "loss": 1.2817, + "step": 26824 + }, + { + "epoch": 0.9606603756692391, + "grad_norm": 1.4525471925735474, + "learning_rate": 8.098967475872798e-07, + "loss": 1.0782, + "step": 26825 + }, + { + "epoch": 0.9606961877988075, + "grad_norm": 1.3691160678863525, + "learning_rate": 8.084241875940591e-07, + "loss": 1.0554, + "step": 26826 + }, + { + "epoch": 0.9607319999283758, + "grad_norm": 1.4263197183609009, + "learning_rate": 8.069529620955418e-07, + "loss": 0.9601, + "step": 26827 + }, + { + "epoch": 0.960767812057944, + "grad_norm": 1.456986904144287, + "learning_rate": 8.054830711115236e-07, + "loss": 1.1472, + "step": 26828 + }, + { + "epoch": 0.9608036241875123, + "grad_norm": 1.3779270648956299, + "learning_rate": 8.040145146617883e-07, + "loss": 1.2842, + "step": 26829 + }, + { + "epoch": 0.9608394363170806, + "grad_norm": 1.0799059867858887, + "learning_rate": 8.025472927660649e-07, + "loss": 0.8062, + "step": 26830 + }, + { + "epoch": 0.9608752484466488, + "grad_norm": 1.2918953895568848, + "learning_rate": 8.010814054441262e-07, + "loss": 1.1255, + "step": 26831 + }, + { + "epoch": 0.9609110605762171, + "grad_norm": 1.43514084815979, + "learning_rate": 7.996168527156789e-07, + "loss": 0.7921, + "step": 26832 + }, + { + "epoch": 0.9609468727057855, + "grad_norm": 1.4137749671936035, + "learning_rate": 7.981536346004292e-07, + "loss": 1.0856, + "step": 26833 + }, + { + "epoch": 0.9609826848353538, + "grad_norm": 1.5553086996078491, + "learning_rate": 7.966917511180505e-07, + "loss": 0.9358, + "step": 26834 + }, + { + "epoch": 0.961018496964922, + "grad_norm": 1.6279337406158447, + "learning_rate": 7.952312022882269e-07, + "loss": 1.1939, + "step": 26835 + }, + { + "epoch": 0.9610543090944903, + "grad_norm": 1.3050895929336548, + "learning_rate": 7.937719881306094e-07, + "loss": 1.2006, + "step": 26836 + }, + { + "epoch": 0.9610901212240586, + "grad_norm": 1.8665058612823486, + "learning_rate": 7.923141086648156e-07, + "loss": 0.9801, + "step": 26837 + }, + { + "epoch": 0.9611259333536268, + "grad_norm": 1.6223617792129517, + "learning_rate": 7.908575639104631e-07, + "loss": 1.181, + "step": 26838 + }, + { + "epoch": 0.9611617454831951, + "grad_norm": 1.724398136138916, + "learning_rate": 7.894023538871587e-07, + "loss": 1.1549, + "step": 26839 + }, + { + "epoch": 0.9611975576127635, + "grad_norm": 1.243301272392273, + "learning_rate": 7.879484786144753e-07, + "loss": 0.9245, + "step": 26840 + }, + { + "epoch": 0.9612333697423318, + "grad_norm": 1.3738664388656616, + "learning_rate": 7.864959381119641e-07, + "loss": 1.1876, + "step": 26841 + }, + { + "epoch": 0.9612691818719, + "grad_norm": 1.8624850511550903, + "learning_rate": 7.85044732399165e-07, + "loss": 1.097, + "step": 26842 + }, + { + "epoch": 0.9613049940014683, + "grad_norm": 1.5106127262115479, + "learning_rate": 7.83594861495629e-07, + "loss": 1.0273, + "step": 26843 + }, + { + "epoch": 0.9613408061310366, + "grad_norm": 1.550447940826416, + "learning_rate": 7.821463254208405e-07, + "loss": 1.0643, + "step": 26844 + }, + { + "epoch": 0.9613766182606048, + "grad_norm": 1.3933122158050537, + "learning_rate": 7.80699124194284e-07, + "loss": 1.1576, + "step": 26845 + }, + { + "epoch": 0.9614124303901731, + "grad_norm": 1.490370512008667, + "learning_rate": 7.792532578354439e-07, + "loss": 1.0927, + "step": 26846 + }, + { + "epoch": 0.9614482425197415, + "grad_norm": 1.381386399269104, + "learning_rate": 7.778087263637601e-07, + "loss": 1.15, + "step": 26847 + }, + { + "epoch": 0.9614840546493097, + "grad_norm": 1.8899563550949097, + "learning_rate": 7.763655297986839e-07, + "loss": 1.2972, + "step": 26848 + }, + { + "epoch": 0.961519866778878, + "grad_norm": 1.5378239154815674, + "learning_rate": 7.749236681595995e-07, + "loss": 1.1799, + "step": 26849 + }, + { + "epoch": 0.9615556789084463, + "grad_norm": 1.5129643678665161, + "learning_rate": 7.734831414659471e-07, + "loss": 1.136, + "step": 26850 + }, + { + "epoch": 0.9615914910380146, + "grad_norm": 1.4503074884414673, + "learning_rate": 7.720439497370668e-07, + "loss": 1.0052, + "step": 26851 + }, + { + "epoch": 0.9616273031675828, + "grad_norm": 1.5586260557174683, + "learning_rate": 7.706060929923542e-07, + "loss": 0.9325, + "step": 26852 + }, + { + "epoch": 0.9616631152971511, + "grad_norm": 1.3943166732788086, + "learning_rate": 7.691695712511382e-07, + "loss": 1.1238, + "step": 26853 + }, + { + "epoch": 0.9616989274267195, + "grad_norm": 1.2970911264419556, + "learning_rate": 7.677343845327478e-07, + "loss": 0.9109, + "step": 26854 + }, + { + "epoch": 0.9617347395562877, + "grad_norm": 2.2009124755859375, + "learning_rate": 7.663005328564787e-07, + "loss": 1.2028, + "step": 26855 + }, + { + "epoch": 0.961770551685856, + "grad_norm": 2.294315814971924, + "learning_rate": 7.648680162416489e-07, + "loss": 1.117, + "step": 26856 + }, + { + "epoch": 0.9618063638154243, + "grad_norm": 1.2720566987991333, + "learning_rate": 7.634368347075093e-07, + "loss": 0.9563, + "step": 26857 + }, + { + "epoch": 0.9618421759449926, + "grad_norm": 1.599491834640503, + "learning_rate": 7.620069882733227e-07, + "loss": 1.2177, + "step": 26858 + }, + { + "epoch": 0.9618779880745608, + "grad_norm": 1.970084309577942, + "learning_rate": 7.605784769583291e-07, + "loss": 1.1605, + "step": 26859 + }, + { + "epoch": 0.9619138002041291, + "grad_norm": 1.3638759851455688, + "learning_rate": 7.591513007817242e-07, + "loss": 1.0381, + "step": 26860 + }, + { + "epoch": 0.9619496123336975, + "grad_norm": 1.4832364320755005, + "learning_rate": 7.577254597627481e-07, + "loss": 1.0557, + "step": 26861 + }, + { + "epoch": 0.9619854244632657, + "grad_norm": 1.3579462766647339, + "learning_rate": 7.563009539205524e-07, + "loss": 1.1648, + "step": 26862 + }, + { + "epoch": 0.962021236592834, + "grad_norm": 1.3774406909942627, + "learning_rate": 7.548777832743214e-07, + "loss": 0.9822, + "step": 26863 + }, + { + "epoch": 0.9620570487224023, + "grad_norm": 2.973109245300293, + "learning_rate": 7.534559478431735e-07, + "loss": 1.0343, + "step": 26864 + }, + { + "epoch": 0.9620928608519705, + "grad_norm": 1.3461202383041382, + "learning_rate": 7.52035447646271e-07, + "loss": 1.0103, + "step": 26865 + }, + { + "epoch": 0.9621286729815388, + "grad_norm": 1.3075448274612427, + "learning_rate": 7.506162827027097e-07, + "loss": 1.0486, + "step": 26866 + }, + { + "epoch": 0.9621644851111071, + "grad_norm": 1.4421497583389282, + "learning_rate": 7.491984530315854e-07, + "loss": 1.1859, + "step": 26867 + }, + { + "epoch": 0.9622002972406755, + "grad_norm": 1.09521484375, + "learning_rate": 7.477819586519719e-07, + "loss": 0.8599, + "step": 26868 + }, + { + "epoch": 0.9622361093702437, + "grad_norm": 1.5200340747833252, + "learning_rate": 7.463667995829205e-07, + "loss": 1.1556, + "step": 26869 + }, + { + "epoch": 0.962271921499812, + "grad_norm": 1.75318443775177, + "learning_rate": 7.449529758434826e-07, + "loss": 1.5535, + "step": 26870 + }, + { + "epoch": 0.9623077336293803, + "grad_norm": 1.534009575843811, + "learning_rate": 7.435404874526542e-07, + "loss": 0.9684, + "step": 26871 + }, + { + "epoch": 0.9623435457589485, + "grad_norm": 1.7141308784484863, + "learning_rate": 7.421293344294755e-07, + "loss": 1.164, + "step": 26872 + }, + { + "epoch": 0.9623793578885168, + "grad_norm": 1.4668419361114502, + "learning_rate": 7.407195167929093e-07, + "loss": 0.9928, + "step": 26873 + }, + { + "epoch": 0.9624151700180851, + "grad_norm": 1.4177483320236206, + "learning_rate": 7.393110345619291e-07, + "loss": 0.9401, + "step": 26874 + }, + { + "epoch": 0.9624509821476535, + "grad_norm": 1.8523203134536743, + "learning_rate": 7.379038877554755e-07, + "loss": 0.9544, + "step": 26875 + }, + { + "epoch": 0.9624867942772217, + "grad_norm": 1.7236131429672241, + "learning_rate": 7.364980763924889e-07, + "loss": 0.9677, + "step": 26876 + }, + { + "epoch": 0.96252260640679, + "grad_norm": 1.7606335878372192, + "learning_rate": 7.350936004918873e-07, + "loss": 1.136, + "step": 26877 + }, + { + "epoch": 0.9625584185363583, + "grad_norm": 1.7233150005340576, + "learning_rate": 7.336904600725447e-07, + "loss": 0.9998, + "step": 26878 + }, + { + "epoch": 0.9625942306659265, + "grad_norm": 1.4777135848999023, + "learning_rate": 7.322886551533681e-07, + "loss": 1.2446, + "step": 26879 + }, + { + "epoch": 0.9626300427954948, + "grad_norm": 2.141113042831421, + "learning_rate": 7.308881857531869e-07, + "loss": 1.1475, + "step": 26880 + }, + { + "epoch": 0.9626658549250631, + "grad_norm": 1.3704904317855835, + "learning_rate": 7.294890518908748e-07, + "loss": 1.0182, + "step": 26881 + }, + { + "epoch": 0.9627016670546314, + "grad_norm": 1.23662269115448, + "learning_rate": 7.280912535852169e-07, + "loss": 1.1035, + "step": 26882 + }, + { + "epoch": 0.9627374791841997, + "grad_norm": 1.5006449222564697, + "learning_rate": 7.266947908550536e-07, + "loss": 0.9272, + "step": 26883 + }, + { + "epoch": 0.962773291313768, + "grad_norm": 1.4078325033187866, + "learning_rate": 7.252996637191589e-07, + "loss": 0.807, + "step": 26884 + }, + { + "epoch": 0.9628091034433363, + "grad_norm": 1.6649850606918335, + "learning_rate": 7.239058721962954e-07, + "loss": 0.9772, + "step": 26885 + }, + { + "epoch": 0.9628449155729045, + "grad_norm": 1.8177744150161743, + "learning_rate": 7.22513416305226e-07, + "loss": 1.0055, + "step": 26886 + }, + { + "epoch": 0.9628807277024728, + "grad_norm": 1.485758900642395, + "learning_rate": 7.211222960646691e-07, + "loss": 1.0522, + "step": 26887 + }, + { + "epoch": 0.9629165398320411, + "grad_norm": 1.2279231548309326, + "learning_rate": 7.197325114933651e-07, + "loss": 0.8903, + "step": 26888 + }, + { + "epoch": 0.9629523519616094, + "grad_norm": 1.7699717283248901, + "learning_rate": 7.18344062609988e-07, + "loss": 1.2448, + "step": 26889 + }, + { + "epoch": 0.9629881640911777, + "grad_norm": 1.7987920045852661, + "learning_rate": 7.16956949433234e-07, + "loss": 1.0194, + "step": 26890 + }, + { + "epoch": 0.963023976220746, + "grad_norm": 1.5906528234481812, + "learning_rate": 7.155711719817548e-07, + "loss": 1.0392, + "step": 26891 + }, + { + "epoch": 0.9630597883503142, + "grad_norm": 1.2262908220291138, + "learning_rate": 7.141867302742023e-07, + "loss": 0.9106, + "step": 26892 + }, + { + "epoch": 0.9630956004798825, + "grad_norm": 1.4214078187942505, + "learning_rate": 7.128036243291947e-07, + "loss": 0.9248, + "step": 26893 + }, + { + "epoch": 0.9631314126094508, + "grad_norm": 1.1743367910385132, + "learning_rate": 7.114218541653395e-07, + "loss": 0.8779, + "step": 26894 + }, + { + "epoch": 0.9631672247390191, + "grad_norm": 1.4132013320922852, + "learning_rate": 7.100414198012439e-07, + "loss": 0.9394, + "step": 26895 + }, + { + "epoch": 0.9632030368685874, + "grad_norm": 1.5794578790664673, + "learning_rate": 7.086623212554488e-07, + "loss": 1.1834, + "step": 26896 + }, + { + "epoch": 0.9632388489981557, + "grad_norm": 1.4678866863250732, + "learning_rate": 7.072845585465282e-07, + "loss": 1.0699, + "step": 26897 + }, + { + "epoch": 0.963274661127724, + "grad_norm": 1.8799153566360474, + "learning_rate": 7.059081316930227e-07, + "loss": 1.0322, + "step": 26898 + }, + { + "epoch": 0.9633104732572922, + "grad_norm": 1.6400315761566162, + "learning_rate": 7.045330407134398e-07, + "loss": 1.2754, + "step": 26899 + }, + { + "epoch": 0.9633462853868605, + "grad_norm": 1.4605367183685303, + "learning_rate": 7.03159285626287e-07, + "loss": 1.1611, + "step": 26900 + }, + { + "epoch": 0.9633820975164288, + "grad_norm": 1.8157163858413696, + "learning_rate": 7.017868664500382e-07, + "loss": 1.0876, + "step": 26901 + }, + { + "epoch": 0.963417909645997, + "grad_norm": 1.7187429666519165, + "learning_rate": 7.004157832031677e-07, + "loss": 1.1994, + "step": 26902 + }, + { + "epoch": 0.9634537217755654, + "grad_norm": 1.6527042388916016, + "learning_rate": 6.990460359041051e-07, + "loss": 0.8878, + "step": 26903 + }, + { + "epoch": 0.9634895339051337, + "grad_norm": 1.6513948440551758, + "learning_rate": 6.976776245712913e-07, + "loss": 1.214, + "step": 26904 + }, + { + "epoch": 0.963525346034702, + "grad_norm": 1.427115797996521, + "learning_rate": 6.963105492231336e-07, + "loss": 1.098, + "step": 26905 + }, + { + "epoch": 0.9635611581642702, + "grad_norm": 1.4124571084976196, + "learning_rate": 6.949448098780398e-07, + "loss": 0.8715, + "step": 26906 + }, + { + "epoch": 0.9635969702938385, + "grad_norm": 1.3943605422973633, + "learning_rate": 6.935804065543505e-07, + "loss": 0.8784, + "step": 26907 + }, + { + "epoch": 0.9636327824234068, + "grad_norm": 1.6010758876800537, + "learning_rate": 6.922173392704512e-07, + "loss": 1.2438, + "step": 26908 + }, + { + "epoch": 0.963668594552975, + "grad_norm": 1.3344833850860596, + "learning_rate": 6.908556080446715e-07, + "loss": 0.9608, + "step": 26909 + }, + { + "epoch": 0.9637044066825434, + "grad_norm": 1.603774905204773, + "learning_rate": 6.894952128953191e-07, + "loss": 1.09, + "step": 26910 + }, + { + "epoch": 0.9637402188121117, + "grad_norm": 1.872596025466919, + "learning_rate": 6.881361538407127e-07, + "loss": 1.0863, + "step": 26911 + }, + { + "epoch": 0.96377603094168, + "grad_norm": 1.1491371393203735, + "learning_rate": 6.867784308991266e-07, + "loss": 1.029, + "step": 26912 + }, + { + "epoch": 0.9638118430712482, + "grad_norm": 1.2597166299819946, + "learning_rate": 6.854220440888459e-07, + "loss": 1.101, + "step": 26913 + }, + { + "epoch": 0.9638476552008165, + "grad_norm": 1.1855268478393555, + "learning_rate": 6.840669934280897e-07, + "loss": 0.9952, + "step": 26914 + }, + { + "epoch": 0.9638834673303848, + "grad_norm": 1.463213562965393, + "learning_rate": 6.827132789351098e-07, + "loss": 1.2273, + "step": 26915 + }, + { + "epoch": 0.963919279459953, + "grad_norm": 1.1652815341949463, + "learning_rate": 6.813609006281141e-07, + "loss": 1.0837, + "step": 26916 + }, + { + "epoch": 0.9639550915895214, + "grad_norm": 1.4236286878585815, + "learning_rate": 6.800098585252989e-07, + "loss": 1.055, + "step": 26917 + }, + { + "epoch": 0.9639909037190897, + "grad_norm": 1.4759740829467773, + "learning_rate": 6.786601526448277e-07, + "loss": 0.9841, + "step": 26918 + }, + { + "epoch": 0.964026715848658, + "grad_norm": 1.6189039945602417, + "learning_rate": 6.773117830048747e-07, + "loss": 1.2125, + "step": 26919 + }, + { + "epoch": 0.9640625279782262, + "grad_norm": 1.7251020669937134, + "learning_rate": 6.75964749623581e-07, + "loss": 1.3423, + "step": 26920 + }, + { + "epoch": 0.9640983401077945, + "grad_norm": 1.527726411819458, + "learning_rate": 6.746190525190543e-07, + "loss": 1.0997, + "step": 26921 + }, + { + "epoch": 0.9641341522373628, + "grad_norm": 1.7631648778915405, + "learning_rate": 6.732746917094135e-07, + "loss": 1.2413, + "step": 26922 + }, + { + "epoch": 0.964169964366931, + "grad_norm": 2.1385927200317383, + "learning_rate": 6.719316672127329e-07, + "loss": 1.1357, + "step": 26923 + }, + { + "epoch": 0.9642057764964994, + "grad_norm": 1.3255785703659058, + "learning_rate": 6.70589979047087e-07, + "loss": 0.9836, + "step": 26924 + }, + { + "epoch": 0.9642415886260677, + "grad_norm": 1.258064866065979, + "learning_rate": 6.692496272305282e-07, + "loss": 1.1013, + "step": 26925 + }, + { + "epoch": 0.964277400755636, + "grad_norm": 1.3221861124038696, + "learning_rate": 6.679106117810974e-07, + "loss": 1.0546, + "step": 26926 + }, + { + "epoch": 0.9643132128852042, + "grad_norm": 1.8316539525985718, + "learning_rate": 6.665729327167913e-07, + "loss": 1.2649, + "step": 26927 + }, + { + "epoch": 0.9643490250147725, + "grad_norm": 1.2973544597625732, + "learning_rate": 6.652365900556179e-07, + "loss": 1.1681, + "step": 26928 + }, + { + "epoch": 0.9643848371443408, + "grad_norm": 1.939846396446228, + "learning_rate": 6.639015838155515e-07, + "loss": 1.2246, + "step": 26929 + }, + { + "epoch": 0.964420649273909, + "grad_norm": 1.1443012952804565, + "learning_rate": 6.625679140145557e-07, + "loss": 0.7143, + "step": 26930 + }, + { + "epoch": 0.9644564614034774, + "grad_norm": 1.501771092414856, + "learning_rate": 6.612355806705828e-07, + "loss": 0.9956, + "step": 26931 + }, + { + "epoch": 0.9644922735330457, + "grad_norm": 1.5194475650787354, + "learning_rate": 6.599045838015294e-07, + "loss": 0.8759, + "step": 26932 + }, + { + "epoch": 0.9645280856626139, + "grad_norm": 1.4154462814331055, + "learning_rate": 6.585749234253258e-07, + "loss": 0.9957, + "step": 26933 + }, + { + "epoch": 0.9645638977921822, + "grad_norm": 1.9124956130981445, + "learning_rate": 6.572465995598575e-07, + "loss": 1.1877, + "step": 26934 + }, + { + "epoch": 0.9645997099217505, + "grad_norm": 1.3354699611663818, + "learning_rate": 6.559196122229994e-07, + "loss": 1.0981, + "step": 26935 + }, + { + "epoch": 0.9646355220513188, + "grad_norm": 1.9454841613769531, + "learning_rate": 6.545939614325924e-07, + "loss": 1.1816, + "step": 26936 + }, + { + "epoch": 0.964671334180887, + "grad_norm": 1.6337084770202637, + "learning_rate": 6.532696472064781e-07, + "loss": 1.0352, + "step": 26937 + }, + { + "epoch": 0.9647071463104554, + "grad_norm": 1.5912429094314575, + "learning_rate": 6.519466695624755e-07, + "loss": 1.2327, + "step": 26938 + }, + { + "epoch": 0.9647429584400237, + "grad_norm": 1.2124180793762207, + "learning_rate": 6.506250285183812e-07, + "loss": 1.037, + "step": 26939 + }, + { + "epoch": 0.9647787705695919, + "grad_norm": 1.4246946573257446, + "learning_rate": 6.493047240919703e-07, + "loss": 0.9991, + "step": 26940 + }, + { + "epoch": 0.9648145826991602, + "grad_norm": 1.3395642042160034, + "learning_rate": 6.479857563010062e-07, + "loss": 0.9913, + "step": 26941 + }, + { + "epoch": 0.9648503948287285, + "grad_norm": 1.2496254444122314, + "learning_rate": 6.466681251632522e-07, + "loss": 0.994, + "step": 26942 + }, + { + "epoch": 0.9648862069582967, + "grad_norm": 1.181898832321167, + "learning_rate": 6.453518306964168e-07, + "loss": 1.0123, + "step": 26943 + }, + { + "epoch": 0.964922019087865, + "grad_norm": 1.3013675212860107, + "learning_rate": 6.440368729182078e-07, + "loss": 1.0855, + "step": 26944 + }, + { + "epoch": 0.9649578312174334, + "grad_norm": 1.4567899703979492, + "learning_rate": 6.427232518463333e-07, + "loss": 1.0127, + "step": 26945 + }, + { + "epoch": 0.9649936433470017, + "grad_norm": 1.3451498746871948, + "learning_rate": 6.414109674984458e-07, + "loss": 1.0731, + "step": 26946 + }, + { + "epoch": 0.9650294554765699, + "grad_norm": 1.4410374164581299, + "learning_rate": 6.401000198922202e-07, + "loss": 1.082, + "step": 26947 + }, + { + "epoch": 0.9650652676061382, + "grad_norm": 1.4147355556488037, + "learning_rate": 6.387904090452757e-07, + "loss": 1.1275, + "step": 26948 + }, + { + "epoch": 0.9651010797357065, + "grad_norm": 1.2196754217147827, + "learning_rate": 6.374821349752424e-07, + "loss": 0.8942, + "step": 26949 + }, + { + "epoch": 0.9651368918652747, + "grad_norm": 1.5313465595245361, + "learning_rate": 6.361751976997177e-07, + "loss": 1.1275, + "step": 26950 + }, + { + "epoch": 0.965172703994843, + "grad_norm": 1.2318220138549805, + "learning_rate": 6.348695972362872e-07, + "loss": 1.0563, + "step": 26951 + }, + { + "epoch": 0.9652085161244114, + "grad_norm": 1.6433117389678955, + "learning_rate": 6.33565333602515e-07, + "loss": 1.2049, + "step": 26952 + }, + { + "epoch": 0.9652443282539797, + "grad_norm": 1.4973952770233154, + "learning_rate": 6.322624068159421e-07, + "loss": 0.9344, + "step": 26953 + }, + { + "epoch": 0.9652801403835479, + "grad_norm": 2.0089845657348633, + "learning_rate": 6.309608168941217e-07, + "loss": 1.2444, + "step": 26954 + }, + { + "epoch": 0.9653159525131162, + "grad_norm": 1.7046270370483398, + "learning_rate": 6.296605638545172e-07, + "loss": 1.0375, + "step": 26955 + }, + { + "epoch": 0.9653517646426845, + "grad_norm": 1.5577069520950317, + "learning_rate": 6.283616477146703e-07, + "loss": 0.8213, + "step": 26956 + }, + { + "epoch": 0.9653875767722527, + "grad_norm": 1.436668038368225, + "learning_rate": 6.270640684920337e-07, + "loss": 1.2341, + "step": 26957 + }, + { + "epoch": 0.965423388901821, + "grad_norm": 1.2597142457962036, + "learning_rate": 6.257678262040712e-07, + "loss": 1.0645, + "step": 26958 + }, + { + "epoch": 0.9654592010313894, + "grad_norm": 1.5484895706176758, + "learning_rate": 6.244729208682131e-07, + "loss": 0.8665, + "step": 26959 + }, + { + "epoch": 0.9654950131609576, + "grad_norm": 1.3780243396759033, + "learning_rate": 6.231793525018903e-07, + "loss": 0.8222, + "step": 26960 + }, + { + "epoch": 0.9655308252905259, + "grad_norm": 1.7096726894378662, + "learning_rate": 6.218871211224997e-07, + "loss": 1.2232, + "step": 26961 + }, + { + "epoch": 0.9655666374200942, + "grad_norm": 1.3075844049453735, + "learning_rate": 6.205962267474386e-07, + "loss": 0.8889, + "step": 26962 + }, + { + "epoch": 0.9656024495496625, + "grad_norm": 1.9762803316116333, + "learning_rate": 6.193066693940597e-07, + "loss": 0.9669, + "step": 26963 + }, + { + "epoch": 0.9656382616792307, + "grad_norm": 1.31186044216156, + "learning_rate": 6.180184490797158e-07, + "loss": 1.0216, + "step": 26964 + }, + { + "epoch": 0.965674073808799, + "grad_norm": 1.721439242362976, + "learning_rate": 6.167315658217376e-07, + "loss": 1.1607, + "step": 26965 + }, + { + "epoch": 0.9657098859383674, + "grad_norm": 1.866424322128296, + "learning_rate": 6.154460196374445e-07, + "loss": 1.1419, + "step": 26966 + }, + { + "epoch": 0.9657456980679356, + "grad_norm": 1.4556607007980347, + "learning_rate": 6.141618105441227e-07, + "loss": 1.0376, + "step": 26967 + }, + { + "epoch": 0.9657815101975039, + "grad_norm": 1.4632354974746704, + "learning_rate": 6.128789385590583e-07, + "loss": 0.8699, + "step": 26968 + }, + { + "epoch": 0.9658173223270722, + "grad_norm": 1.6172459125518799, + "learning_rate": 6.115974036995154e-07, + "loss": 0.9642, + "step": 26969 + }, + { + "epoch": 0.9658531344566404, + "grad_norm": 1.391341209411621, + "learning_rate": 6.103172059827134e-07, + "loss": 0.8272, + "step": 26970 + }, + { + "epoch": 0.9658889465862087, + "grad_norm": 1.432655930519104, + "learning_rate": 6.090383454259052e-07, + "loss": 1.2916, + "step": 26971 + }, + { + "epoch": 0.965924758715777, + "grad_norm": 1.6774425506591797, + "learning_rate": 6.077608220462771e-07, + "loss": 0.7754, + "step": 26972 + }, + { + "epoch": 0.9659605708453454, + "grad_norm": 1.5810383558273315, + "learning_rate": 6.064846358610154e-07, + "loss": 1.1131, + "step": 26973 + }, + { + "epoch": 0.9659963829749136, + "grad_norm": 1.8118594884872437, + "learning_rate": 6.052097868872953e-07, + "loss": 1.1041, + "step": 26974 + }, + { + "epoch": 0.9660321951044819, + "grad_norm": 1.8551409244537354, + "learning_rate": 6.039362751422695e-07, + "loss": 1.0657, + "step": 26975 + }, + { + "epoch": 0.9660680072340502, + "grad_norm": 1.61293625831604, + "learning_rate": 6.026641006430689e-07, + "loss": 1.0531, + "step": 26976 + }, + { + "epoch": 0.9661038193636184, + "grad_norm": 1.4724522829055786, + "learning_rate": 6.013932634068021e-07, + "loss": 1.1298, + "step": 26977 + }, + { + "epoch": 0.9661396314931867, + "grad_norm": 1.6188709735870361, + "learning_rate": 6.001237634505885e-07, + "loss": 1.2748, + "step": 26978 + }, + { + "epoch": 0.966175443622755, + "grad_norm": 1.621334195137024, + "learning_rate": 5.988556007914814e-07, + "loss": 1.421, + "step": 26979 + }, + { + "epoch": 0.9662112557523234, + "grad_norm": 1.581833004951477, + "learning_rate": 5.975887754465559e-07, + "loss": 1.1933, + "step": 26980 + }, + { + "epoch": 0.9662470678818916, + "grad_norm": 1.5100555419921875, + "learning_rate": 5.96323287432854e-07, + "loss": 1.0405, + "step": 26981 + }, + { + "epoch": 0.9662828800114599, + "grad_norm": 1.52765691280365, + "learning_rate": 5.950591367674064e-07, + "loss": 0.9406, + "step": 26982 + }, + { + "epoch": 0.9663186921410282, + "grad_norm": 1.8054308891296387, + "learning_rate": 5.937963234672106e-07, + "loss": 1.0815, + "step": 26983 + }, + { + "epoch": 0.9663545042705964, + "grad_norm": 1.807751178741455, + "learning_rate": 5.925348475492643e-07, + "loss": 0.8026, + "step": 26984 + }, + { + "epoch": 0.9663903164001647, + "grad_norm": 1.8302586078643799, + "learning_rate": 5.912747090305315e-07, + "loss": 1.2974, + "step": 26985 + }, + { + "epoch": 0.966426128529733, + "grad_norm": 1.309175729751587, + "learning_rate": 5.900159079279654e-07, + "loss": 1.0386, + "step": 26986 + }, + { + "epoch": 0.9664619406593014, + "grad_norm": 1.7404344081878662, + "learning_rate": 5.887584442585081e-07, + "loss": 0.988, + "step": 26987 + }, + { + "epoch": 0.9664977527888696, + "grad_norm": 1.2734992504119873, + "learning_rate": 5.875023180390793e-07, + "loss": 1.2182, + "step": 26988 + }, + { + "epoch": 0.9665335649184379, + "grad_norm": 1.5696595907211304, + "learning_rate": 5.862475292865655e-07, + "loss": 1.0493, + "step": 26989 + }, + { + "epoch": 0.9665693770480062, + "grad_norm": 1.3618968725204468, + "learning_rate": 5.849940780178642e-07, + "loss": 0.9067, + "step": 26990 + }, + { + "epoch": 0.9666051891775744, + "grad_norm": 1.5051708221435547, + "learning_rate": 5.837419642498288e-07, + "loss": 1.2445, + "step": 26991 + }, + { + "epoch": 0.9666410013071427, + "grad_norm": 1.3541929721832275, + "learning_rate": 5.824911879993123e-07, + "loss": 0.977, + "step": 26992 + }, + { + "epoch": 0.966676813436711, + "grad_norm": 1.7372548580169678, + "learning_rate": 5.812417492831346e-07, + "loss": 1.0653, + "step": 26993 + }, + { + "epoch": 0.9667126255662793, + "grad_norm": 1.538630723953247, + "learning_rate": 5.799936481181045e-07, + "loss": 1.1053, + "step": 26994 + }, + { + "epoch": 0.9667484376958476, + "grad_norm": 1.7572031021118164, + "learning_rate": 5.787468845210198e-07, + "loss": 0.9477, + "step": 26995 + }, + { + "epoch": 0.9667842498254159, + "grad_norm": 1.689582109451294, + "learning_rate": 5.775014585086446e-07, + "loss": 1.2734, + "step": 26996 + }, + { + "epoch": 0.9668200619549842, + "grad_norm": 2.4899396896362305, + "learning_rate": 5.762573700977547e-07, + "loss": 1.2318, + "step": 26997 + }, + { + "epoch": 0.9668558740845524, + "grad_norm": 1.4942419528961182, + "learning_rate": 5.750146193050698e-07, + "loss": 1.0316, + "step": 26998 + }, + { + "epoch": 0.9668916862141207, + "grad_norm": 1.6712594032287598, + "learning_rate": 5.7377320614731e-07, + "loss": 1.1863, + "step": 26999 + }, + { + "epoch": 0.966927498343689, + "grad_norm": 1.5679501295089722, + "learning_rate": 5.725331306411841e-07, + "loss": 1.1678, + "step": 27000 + }, + { + "epoch": 0.9669633104732573, + "grad_norm": 1.8476024866104126, + "learning_rate": 5.712943928033787e-07, + "loss": 0.9293, + "step": 27001 + }, + { + "epoch": 0.9669991226028256, + "grad_norm": 1.3678505420684814, + "learning_rate": 5.700569926505361e-07, + "loss": 0.8868, + "step": 27002 + }, + { + "epoch": 0.9670349347323939, + "grad_norm": 1.7370485067367554, + "learning_rate": 5.688209301993319e-07, + "loss": 1.1663, + "step": 27003 + }, + { + "epoch": 0.9670707468619621, + "grad_norm": 1.5574992895126343, + "learning_rate": 5.675862054663861e-07, + "loss": 1.0865, + "step": 27004 + }, + { + "epoch": 0.9671065589915304, + "grad_norm": 1.8294181823730469, + "learning_rate": 5.663528184683186e-07, + "loss": 1.137, + "step": 27005 + }, + { + "epoch": 0.9671423711210987, + "grad_norm": 1.5500684976577759, + "learning_rate": 5.651207692216942e-07, + "loss": 1.1251, + "step": 27006 + }, + { + "epoch": 0.967178183250667, + "grad_norm": 1.312416434288025, + "learning_rate": 5.638900577431216e-07, + "loss": 1.0184, + "step": 27007 + }, + { + "epoch": 0.9672139953802353, + "grad_norm": 1.2441705465316772, + "learning_rate": 5.626606840491433e-07, + "loss": 1.1504, + "step": 27008 + }, + { + "epoch": 0.9672498075098036, + "grad_norm": 1.3549286127090454, + "learning_rate": 5.614326481562904e-07, + "loss": 1.1361, + "step": 27009 + }, + { + "epoch": 0.9672856196393719, + "grad_norm": 1.6790318489074707, + "learning_rate": 5.602059500811052e-07, + "loss": 1.3744, + "step": 27010 + }, + { + "epoch": 0.9673214317689401, + "grad_norm": 1.3435639142990112, + "learning_rate": 5.589805898400746e-07, + "loss": 1.0134, + "step": 27011 + }, + { + "epoch": 0.9673572438985084, + "grad_norm": 1.5155367851257324, + "learning_rate": 5.577565674496965e-07, + "loss": 1.0602, + "step": 27012 + }, + { + "epoch": 0.9673930560280767, + "grad_norm": 1.5793002843856812, + "learning_rate": 5.565338829264355e-07, + "loss": 1.1365, + "step": 27013 + }, + { + "epoch": 0.967428868157645, + "grad_norm": 1.191259503364563, + "learning_rate": 5.553125362867228e-07, + "loss": 0.9267, + "step": 27014 + }, + { + "epoch": 0.9674646802872133, + "grad_norm": 1.523332953453064, + "learning_rate": 5.540925275470232e-07, + "loss": 1.2539, + "step": 27015 + }, + { + "epoch": 0.9675004924167816, + "grad_norm": 1.4837464094161987, + "learning_rate": 5.528738567237235e-07, + "loss": 1.2355, + "step": 27016 + }, + { + "epoch": 0.9675363045463499, + "grad_norm": 1.3079153299331665, + "learning_rate": 5.516565238332328e-07, + "loss": 0.917, + "step": 27017 + }, + { + "epoch": 0.9675721166759181, + "grad_norm": 1.7098989486694336, + "learning_rate": 5.504405288919156e-07, + "loss": 1.0087, + "step": 27018 + }, + { + "epoch": 0.9676079288054864, + "grad_norm": 1.4408129453659058, + "learning_rate": 5.492258719161481e-07, + "loss": 1.0653, + "step": 27019 + }, + { + "epoch": 0.9676437409350547, + "grad_norm": 1.0701584815979004, + "learning_rate": 5.480125529222613e-07, + "loss": 0.7958, + "step": 27020 + }, + { + "epoch": 0.9676795530646229, + "grad_norm": 1.4751747846603394, + "learning_rate": 5.468005719265868e-07, + "loss": 1.2822, + "step": 27021 + }, + { + "epoch": 0.9677153651941913, + "grad_norm": 1.4060100317001343, + "learning_rate": 5.455899289454225e-07, + "loss": 1.2021, + "step": 27022 + }, + { + "epoch": 0.9677511773237596, + "grad_norm": 1.5967483520507812, + "learning_rate": 5.443806239950555e-07, + "loss": 0.9587, + "step": 27023 + }, + { + "epoch": 0.9677869894533279, + "grad_norm": 1.5564252138137817, + "learning_rate": 5.431726570917617e-07, + "loss": 1.0175, + "step": 27024 + }, + { + "epoch": 0.9678228015828961, + "grad_norm": 1.3113187551498413, + "learning_rate": 5.419660282517836e-07, + "loss": 0.8839, + "step": 27025 + }, + { + "epoch": 0.9678586137124644, + "grad_norm": 1.502988576889038, + "learning_rate": 5.407607374913748e-07, + "loss": 1.0908, + "step": 27026 + }, + { + "epoch": 0.9678944258420327, + "grad_norm": 1.3476581573486328, + "learning_rate": 5.395567848267225e-07, + "loss": 1.144, + "step": 27027 + }, + { + "epoch": 0.9679302379716009, + "grad_norm": 1.5100494623184204, + "learning_rate": 5.383541702740469e-07, + "loss": 1.1252, + "step": 27028 + }, + { + "epoch": 0.9679660501011693, + "grad_norm": 1.361602783203125, + "learning_rate": 5.37152893849513e-07, + "loss": 1.011, + "step": 27029 + }, + { + "epoch": 0.9680018622307376, + "grad_norm": 1.7571381330490112, + "learning_rate": 5.359529555692966e-07, + "loss": 1.1764, + "step": 27030 + }, + { + "epoch": 0.9680376743603059, + "grad_norm": 1.548661470413208, + "learning_rate": 5.347543554495293e-07, + "loss": 1.0218, + "step": 27031 + }, + { + "epoch": 0.9680734864898741, + "grad_norm": 1.937326431274414, + "learning_rate": 5.335570935063427e-07, + "loss": 1.1992, + "step": 27032 + }, + { + "epoch": 0.9681092986194424, + "grad_norm": 1.552627444267273, + "learning_rate": 5.323611697558462e-07, + "loss": 0.9023, + "step": 27033 + }, + { + "epoch": 0.9681451107490107, + "grad_norm": 1.9240103960037231, + "learning_rate": 5.311665842141155e-07, + "loss": 1.0818, + "step": 27034 + }, + { + "epoch": 0.9681809228785789, + "grad_norm": 1.4324544668197632, + "learning_rate": 5.299733368972492e-07, + "loss": 1.113, + "step": 27035 + }, + { + "epoch": 0.9682167350081473, + "grad_norm": 1.6677169799804688, + "learning_rate": 5.287814278212677e-07, + "loss": 1.1373, + "step": 27036 + }, + { + "epoch": 0.9682525471377156, + "grad_norm": 1.4891108274459839, + "learning_rate": 5.275908570022359e-07, + "loss": 1.1493, + "step": 27037 + }, + { + "epoch": 0.9682883592672838, + "grad_norm": 1.283323884010315, + "learning_rate": 5.26401624456152e-07, + "loss": 0.9809, + "step": 27038 + }, + { + "epoch": 0.9683241713968521, + "grad_norm": 1.4460633993148804, + "learning_rate": 5.252137301990256e-07, + "loss": 1.0322, + "step": 27039 + }, + { + "epoch": 0.9683599835264204, + "grad_norm": 1.8169996738433838, + "learning_rate": 5.240271742468328e-07, + "loss": 0.9529, + "step": 27040 + }, + { + "epoch": 0.9683957956559887, + "grad_norm": 1.9847575426101685, + "learning_rate": 5.228419566155385e-07, + "loss": 0.9805, + "step": 27041 + }, + { + "epoch": 0.9684316077855569, + "grad_norm": 1.3907225131988525, + "learning_rate": 5.216580773210966e-07, + "loss": 1.0338, + "step": 27042 + }, + { + "epoch": 0.9684674199151253, + "grad_norm": 1.3776180744171143, + "learning_rate": 5.204755363794167e-07, + "loss": 1.0485, + "step": 27043 + }, + { + "epoch": 0.9685032320446936, + "grad_norm": 1.4344476461410522, + "learning_rate": 5.192943338064305e-07, + "loss": 1.1693, + "step": 27044 + }, + { + "epoch": 0.9685390441742618, + "grad_norm": 1.2939698696136475, + "learning_rate": 5.18114469618014e-07, + "loss": 1.2784, + "step": 27045 + }, + { + "epoch": 0.9685748563038301, + "grad_norm": 1.3718217611312866, + "learning_rate": 5.169359438300436e-07, + "loss": 0.8305, + "step": 27046 + }, + { + "epoch": 0.9686106684333984, + "grad_norm": 1.281718373298645, + "learning_rate": 5.157587564583733e-07, + "loss": 1.02, + "step": 27047 + }, + { + "epoch": 0.9686464805629666, + "grad_norm": 1.2213125228881836, + "learning_rate": 5.145829075188457e-07, + "loss": 0.9108, + "step": 27048 + }, + { + "epoch": 0.9686822926925349, + "grad_norm": 1.4335795640945435, + "learning_rate": 5.134083970272819e-07, + "loss": 1.1817, + "step": 27049 + }, + { + "epoch": 0.9687181048221033, + "grad_norm": 1.580122470855713, + "learning_rate": 5.1223522499948e-07, + "loss": 1.014, + "step": 27050 + }, + { + "epoch": 0.9687539169516716, + "grad_norm": 1.6961798667907715, + "learning_rate": 5.110633914512164e-07, + "loss": 1.1469, + "step": 27051 + }, + { + "epoch": 0.9687897290812398, + "grad_norm": 1.4195812940597534, + "learning_rate": 5.098928963982674e-07, + "loss": 1.1696, + "step": 27052 + }, + { + "epoch": 0.9688255412108081, + "grad_norm": 1.1416420936584473, + "learning_rate": 5.08723739856376e-07, + "loss": 0.9333, + "step": 27053 + }, + { + "epoch": 0.9688613533403764, + "grad_norm": 1.4213025569915771, + "learning_rate": 5.07555921841274e-07, + "loss": 1.0426, + "step": 27054 + }, + { + "epoch": 0.9688971654699446, + "grad_norm": 1.1720945835113525, + "learning_rate": 5.06389442368671e-07, + "loss": 1.0259, + "step": 27055 + }, + { + "epoch": 0.9689329775995129, + "grad_norm": 1.2849435806274414, + "learning_rate": 5.052243014542546e-07, + "loss": 1.1686, + "step": 27056 + }, + { + "epoch": 0.9689687897290813, + "grad_norm": 1.4722814559936523, + "learning_rate": 5.040604991137121e-07, + "loss": 0.8696, + "step": 27057 + }, + { + "epoch": 0.9690046018586496, + "grad_norm": 1.5094801187515259, + "learning_rate": 5.028980353626866e-07, + "loss": 1.0788, + "step": 27058 + }, + { + "epoch": 0.9690404139882178, + "grad_norm": 1.270745873451233, + "learning_rate": 5.017369102168435e-07, + "loss": 0.9633, + "step": 27059 + }, + { + "epoch": 0.9690762261177861, + "grad_norm": 1.3541780710220337, + "learning_rate": 5.005771236917811e-07, + "loss": 1.1079, + "step": 27060 + }, + { + "epoch": 0.9691120382473544, + "grad_norm": 1.3848737478256226, + "learning_rate": 4.994186758030983e-07, + "loss": 1.1652, + "step": 27061 + }, + { + "epoch": 0.9691478503769226, + "grad_norm": 1.2952615022659302, + "learning_rate": 4.982615665663937e-07, + "loss": 1.0659, + "step": 27062 + }, + { + "epoch": 0.9691836625064909, + "grad_norm": 1.7034446001052856, + "learning_rate": 4.971057959972325e-07, + "loss": 0.9372, + "step": 27063 + }, + { + "epoch": 0.9692194746360593, + "grad_norm": 1.3463894128799438, + "learning_rate": 4.95951364111169e-07, + "loss": 0.9621, + "step": 27064 + }, + { + "epoch": 0.9692552867656276, + "grad_norm": 1.643913745880127, + "learning_rate": 4.947982709237131e-07, + "loss": 1.0472, + "step": 27065 + }, + { + "epoch": 0.9692910988951958, + "grad_norm": 1.4343503713607788, + "learning_rate": 4.936465164504079e-07, + "loss": 1.302, + "step": 27066 + }, + { + "epoch": 0.9693269110247641, + "grad_norm": 1.5380115509033203, + "learning_rate": 4.924961007067408e-07, + "loss": 1.2564, + "step": 27067 + }, + { + "epoch": 0.9693627231543324, + "grad_norm": 1.2518161535263062, + "learning_rate": 4.913470237081774e-07, + "loss": 1.1872, + "step": 27068 + }, + { + "epoch": 0.9693985352839006, + "grad_norm": 1.724918246269226, + "learning_rate": 4.90199285470172e-07, + "loss": 1.3036, + "step": 27069 + }, + { + "epoch": 0.9694343474134689, + "grad_norm": 1.312538504600525, + "learning_rate": 4.8905288600819e-07, + "loss": 1.0143, + "step": 27070 + }, + { + "epoch": 0.9694701595430373, + "grad_norm": 1.5106489658355713, + "learning_rate": 4.879078253376412e-07, + "loss": 1.1162, + "step": 27071 + }, + { + "epoch": 0.9695059716726055, + "grad_norm": 1.4137383699417114, + "learning_rate": 4.867641034739134e-07, + "loss": 1.0505, + "step": 27072 + }, + { + "epoch": 0.9695417838021738, + "grad_norm": 1.7810893058776855, + "learning_rate": 4.856217204324275e-07, + "loss": 1.2248, + "step": 27073 + }, + { + "epoch": 0.9695775959317421, + "grad_norm": 1.2999401092529297, + "learning_rate": 4.844806762285381e-07, + "loss": 0.9577, + "step": 27074 + }, + { + "epoch": 0.9696134080613104, + "grad_norm": 1.4628145694732666, + "learning_rate": 4.833409708775882e-07, + "loss": 1.0643, + "step": 27075 + }, + { + "epoch": 0.9696492201908786, + "grad_norm": 1.2602418661117554, + "learning_rate": 4.822026043949213e-07, + "loss": 1.0933, + "step": 27076 + }, + { + "epoch": 0.9696850323204469, + "grad_norm": 1.2083569765090942, + "learning_rate": 4.810655767958583e-07, + "loss": 0.849, + "step": 27077 + }, + { + "epoch": 0.9697208444500153, + "grad_norm": 1.291933536529541, + "learning_rate": 4.799298880956759e-07, + "loss": 1.0702, + "step": 27078 + }, + { + "epoch": 0.9697566565795835, + "grad_norm": 1.481921911239624, + "learning_rate": 4.787955383096731e-07, + "loss": 0.9087, + "step": 27079 + }, + { + "epoch": 0.9697924687091518, + "grad_norm": 1.3039758205413818, + "learning_rate": 4.776625274530933e-07, + "loss": 0.8711, + "step": 27080 + }, + { + "epoch": 0.9698282808387201, + "grad_norm": 1.332092046737671, + "learning_rate": 4.765308555411907e-07, + "loss": 0.9285, + "step": 27081 + }, + { + "epoch": 0.9698640929682883, + "grad_norm": 1.4937000274658203, + "learning_rate": 4.754005225891978e-07, + "loss": 1.0452, + "step": 27082 + }, + { + "epoch": 0.9698999050978566, + "grad_norm": 1.4822261333465576, + "learning_rate": 4.742715286123134e-07, + "loss": 1.0376, + "step": 27083 + }, + { + "epoch": 0.9699357172274249, + "grad_norm": 1.5632785558700562, + "learning_rate": 4.7314387362572545e-07, + "loss": 1.0997, + "step": 27084 + }, + { + "epoch": 0.9699715293569933, + "grad_norm": 1.8467410802841187, + "learning_rate": 4.7201755764459953e-07, + "loss": 1.1439, + "step": 27085 + }, + { + "epoch": 0.9700073414865615, + "grad_norm": 1.3571193218231201, + "learning_rate": 4.7089258068410133e-07, + "loss": 1.0512, + "step": 27086 + }, + { + "epoch": 0.9700431536161298, + "grad_norm": 1.6349046230316162, + "learning_rate": 4.6976894275935215e-07, + "loss": 1.3149, + "step": 27087 + }, + { + "epoch": 0.9700789657456981, + "grad_norm": 1.6779677867889404, + "learning_rate": 4.6864664388548417e-07, + "loss": 1.1684, + "step": 27088 + }, + { + "epoch": 0.9701147778752663, + "grad_norm": 1.4739879369735718, + "learning_rate": 4.6752568407759655e-07, + "loss": 0.9755, + "step": 27089 + }, + { + "epoch": 0.9701505900048346, + "grad_norm": 1.2876430749893188, + "learning_rate": 4.66406063350755e-07, + "loss": 1.0345, + "step": 27090 + }, + { + "epoch": 0.9701864021344029, + "grad_norm": 1.425763726234436, + "learning_rate": 4.652877817200252e-07, + "loss": 1.2099, + "step": 27091 + }, + { + "epoch": 0.9702222142639713, + "grad_norm": 1.2247804403305054, + "learning_rate": 4.6417083920046176e-07, + "loss": 1.0852, + "step": 27092 + }, + { + "epoch": 0.9702580263935395, + "grad_norm": 1.2180415391921997, + "learning_rate": 4.630552358070972e-07, + "loss": 0.9997, + "step": 27093 + }, + { + "epoch": 0.9702938385231078, + "grad_norm": 1.3880187273025513, + "learning_rate": 4.6194097155491944e-07, + "loss": 1.26, + "step": 27094 + }, + { + "epoch": 0.9703296506526761, + "grad_norm": 1.3937877416610718, + "learning_rate": 4.6082804645893874e-07, + "loss": 1.0476, + "step": 27095 + }, + { + "epoch": 0.9703654627822443, + "grad_norm": 1.3630179166793823, + "learning_rate": 4.597164605341209e-07, + "loss": 0.9607, + "step": 27096 + }, + { + "epoch": 0.9704012749118126, + "grad_norm": 1.6742825508117676, + "learning_rate": 4.5860621379540944e-07, + "loss": 1.0682, + "step": 27097 + }, + { + "epoch": 0.9704370870413809, + "grad_norm": 1.3571871519088745, + "learning_rate": 4.574973062577592e-07, + "loss": 0.9161, + "step": 27098 + }, + { + "epoch": 0.9704728991709493, + "grad_norm": 1.6382492780685425, + "learning_rate": 4.5638973793608043e-07, + "loss": 1.079, + "step": 27099 + }, + { + "epoch": 0.9705087113005175, + "grad_norm": 1.3496469259262085, + "learning_rate": 4.5528350884528335e-07, + "loss": 1.1697, + "step": 27100 + }, + { + "epoch": 0.9705445234300858, + "grad_norm": 1.3088037967681885, + "learning_rate": 4.5417861900023397e-07, + "loss": 1.0253, + "step": 27101 + }, + { + "epoch": 0.9705803355596541, + "grad_norm": 1.29262113571167, + "learning_rate": 4.5307506841580914e-07, + "loss": 0.9205, + "step": 27102 + }, + { + "epoch": 0.9706161476892223, + "grad_norm": 1.569222092628479, + "learning_rate": 4.5197285710685265e-07, + "loss": 1.324, + "step": 27103 + }, + { + "epoch": 0.9706519598187906, + "grad_norm": 1.381661295890808, + "learning_rate": 4.5087198508819705e-07, + "loss": 1.0554, + "step": 27104 + }, + { + "epoch": 0.9706877719483589, + "grad_norm": 1.4133498668670654, + "learning_rate": 4.497724523746416e-07, + "loss": 1.0348, + "step": 27105 + }, + { + "epoch": 0.9707235840779272, + "grad_norm": 1.5207966566085815, + "learning_rate": 4.486742589809967e-07, + "loss": 1.0964, + "step": 27106 + }, + { + "epoch": 0.9707593962074955, + "grad_norm": 1.3889762163162231, + "learning_rate": 4.4757740492201717e-07, + "loss": 1.0728, + "step": 27107 + }, + { + "epoch": 0.9707952083370638, + "grad_norm": 1.4595712423324585, + "learning_rate": 4.464818902124801e-07, + "loss": 0.8907, + "step": 27108 + }, + { + "epoch": 0.970831020466632, + "grad_norm": 1.4048527479171753, + "learning_rate": 4.4538771486710706e-07, + "loss": 1.2568, + "step": 27109 + }, + { + "epoch": 0.9708668325962003, + "grad_norm": 2.308934211730957, + "learning_rate": 4.442948789006307e-07, + "loss": 1.1209, + "step": 27110 + }, + { + "epoch": 0.9709026447257686, + "grad_norm": 1.4483436346054077, + "learning_rate": 4.432033823277504e-07, + "loss": 0.8965, + "step": 27111 + }, + { + "epoch": 0.9709384568553369, + "grad_norm": 2.0900776386260986, + "learning_rate": 4.4211322516314324e-07, + "loss": 1.1864, + "step": 27112 + }, + { + "epoch": 0.9709742689849052, + "grad_norm": 1.4121962785720825, + "learning_rate": 4.410244074214864e-07, + "loss": 1.0775, + "step": 27113 + }, + { + "epoch": 0.9710100811144735, + "grad_norm": 1.467872977256775, + "learning_rate": 4.399369291174349e-07, + "loss": 0.9481, + "step": 27114 + }, + { + "epoch": 0.9710458932440418, + "grad_norm": 1.3326990604400635, + "learning_rate": 4.388507902655881e-07, + "loss": 1.0758, + "step": 27115 + }, + { + "epoch": 0.97108170537361, + "grad_norm": 1.5368428230285645, + "learning_rate": 4.377659908805898e-07, + "loss": 1.1208, + "step": 27116 + }, + { + "epoch": 0.9711175175031783, + "grad_norm": 1.2966172695159912, + "learning_rate": 4.366825309770284e-07, + "loss": 0.7856, + "step": 27117 + }, + { + "epoch": 0.9711533296327466, + "grad_norm": 2.0334994792938232, + "learning_rate": 4.3560041056947e-07, + "loss": 1.0998, + "step": 27118 + }, + { + "epoch": 0.9711891417623149, + "grad_norm": 2.541654348373413, + "learning_rate": 4.345196296724807e-07, + "loss": 1.1428, + "step": 27119 + }, + { + "epoch": 0.9712249538918832, + "grad_norm": 1.623500108718872, + "learning_rate": 4.334401883005934e-07, + "loss": 1.1316, + "step": 27120 + }, + { + "epoch": 0.9712607660214515, + "grad_norm": 1.6659623384475708, + "learning_rate": 4.32362086468352e-07, + "loss": 1.1444, + "step": 27121 + }, + { + "epoch": 0.9712965781510198, + "grad_norm": 1.2612414360046387, + "learning_rate": 4.312853241902337e-07, + "loss": 1.0422, + "step": 27122 + }, + { + "epoch": 0.971332390280588, + "grad_norm": 1.4345208406448364, + "learning_rate": 4.3020990148073815e-07, + "loss": 1.1661, + "step": 27123 + }, + { + "epoch": 0.9713682024101563, + "grad_norm": 1.5216487646102905, + "learning_rate": 4.2913581835433147e-07, + "loss": 1.2403, + "step": 27124 + }, + { + "epoch": 0.9714040145397246, + "grad_norm": 1.3651626110076904, + "learning_rate": 4.2806307482546883e-07, + "loss": 1.1875, + "step": 27125 + }, + { + "epoch": 0.9714398266692928, + "grad_norm": 1.408658504486084, + "learning_rate": 4.2699167090858303e-07, + "loss": 0.9768, + "step": 27126 + }, + { + "epoch": 0.9714756387988612, + "grad_norm": 1.731264352798462, + "learning_rate": 4.259216066180738e-07, + "loss": 1.1372, + "step": 27127 + }, + { + "epoch": 0.9715114509284295, + "grad_norm": 1.5188406705856323, + "learning_rate": 4.248528819683517e-07, + "loss": 1.0665, + "step": 27128 + }, + { + "epoch": 0.9715472630579978, + "grad_norm": 1.4015437364578247, + "learning_rate": 4.2378549697380533e-07, + "loss": 1.134, + "step": 27129 + }, + { + "epoch": 0.971583075187566, + "grad_norm": 1.6299535036087036, + "learning_rate": 4.2271945164876756e-07, + "loss": 1.1618, + "step": 27130 + }, + { + "epoch": 0.9716188873171343, + "grad_norm": 1.5027655363082886, + "learning_rate": 4.216547460075937e-07, + "loss": 1.1193, + "step": 27131 + }, + { + "epoch": 0.9716546994467026, + "grad_norm": 1.7369358539581299, + "learning_rate": 4.2059138006460554e-07, + "loss": 1.1333, + "step": 27132 + }, + { + "epoch": 0.9716905115762708, + "grad_norm": 1.3198820352554321, + "learning_rate": 4.1952935383412494e-07, + "loss": 0.9437, + "step": 27133 + }, + { + "epoch": 0.9717263237058391, + "grad_norm": 1.1280663013458252, + "learning_rate": 4.1846866733041834e-07, + "loss": 1.0106, + "step": 27134 + }, + { + "epoch": 0.9717621358354075, + "grad_norm": 1.6076565980911255, + "learning_rate": 4.1740932056776317e-07, + "loss": 1.1672, + "step": 27135 + }, + { + "epoch": 0.9717979479649758, + "grad_norm": 1.3855561017990112, + "learning_rate": 4.1635131356041467e-07, + "loss": 1.1212, + "step": 27136 + }, + { + "epoch": 0.971833760094544, + "grad_norm": 1.6017178297042847, + "learning_rate": 4.1529464632260597e-07, + "loss": 0.9886, + "step": 27137 + }, + { + "epoch": 0.9718695722241123, + "grad_norm": 1.2870696783065796, + "learning_rate": 4.1423931886854785e-07, + "loss": 1.1708, + "step": 27138 + }, + { + "epoch": 0.9719053843536806, + "grad_norm": 1.310845971107483, + "learning_rate": 4.131853312124512e-07, + "loss": 1.1879, + "step": 27139 + }, + { + "epoch": 0.9719411964832488, + "grad_norm": 1.4391274452209473, + "learning_rate": 4.1213268336849355e-07, + "loss": 1.0632, + "step": 27140 + }, + { + "epoch": 0.9719770086128171, + "grad_norm": 1.6123485565185547, + "learning_rate": 4.1108137535081914e-07, + "loss": 1.0118, + "step": 27141 + }, + { + "epoch": 0.9720128207423855, + "grad_norm": 1.2882853746414185, + "learning_rate": 4.1003140717358336e-07, + "loss": 1.0285, + "step": 27142 + }, + { + "epoch": 0.9720486328719538, + "grad_norm": 1.3609386682510376, + "learning_rate": 4.089827788509304e-07, + "loss": 0.9531, + "step": 27143 + }, + { + "epoch": 0.972084445001522, + "grad_norm": 1.9820375442504883, + "learning_rate": 4.079354903969379e-07, + "loss": 1.0705, + "step": 27144 + }, + { + "epoch": 0.9721202571310903, + "grad_norm": 2.166651487350464, + "learning_rate": 4.068895418257057e-07, + "loss": 0.9689, + "step": 27145 + }, + { + "epoch": 0.9721560692606586, + "grad_norm": 1.417643666267395, + "learning_rate": 4.0584493315131146e-07, + "loss": 1.0733, + "step": 27146 + }, + { + "epoch": 0.9721918813902268, + "grad_norm": 1.3564348220825195, + "learning_rate": 4.048016643878105e-07, + "loss": 1.2228, + "step": 27147 + }, + { + "epoch": 0.9722276935197951, + "grad_norm": 1.5418262481689453, + "learning_rate": 4.037597355492362e-07, + "loss": 1.1413, + "step": 27148 + }, + { + "epoch": 0.9722635056493635, + "grad_norm": 1.6149554252624512, + "learning_rate": 4.0271914664959944e-07, + "loss": 1.0988, + "step": 27149 + }, + { + "epoch": 0.9722993177789317, + "grad_norm": 1.455822229385376, + "learning_rate": 4.016798977029113e-07, + "loss": 0.9783, + "step": 27150 + }, + { + "epoch": 0.9723351299085, + "grad_norm": 1.2971248626708984, + "learning_rate": 4.006419887231383e-07, + "loss": 1.1522, + "step": 27151 + }, + { + "epoch": 0.9723709420380683, + "grad_norm": 1.5526193380355835, + "learning_rate": 3.9960541972426936e-07, + "loss": 1.2333, + "step": 27152 + }, + { + "epoch": 0.9724067541676366, + "grad_norm": 1.8460127115249634, + "learning_rate": 3.985701907202155e-07, + "loss": 1.0654, + "step": 27153 + }, + { + "epoch": 0.9724425662972048, + "grad_norm": 1.3634017705917358, + "learning_rate": 3.975363017249323e-07, + "loss": 0.9242, + "step": 27154 + }, + { + "epoch": 0.9724783784267731, + "grad_norm": 1.4082773923873901, + "learning_rate": 3.9650375275231967e-07, + "loss": 1.1142, + "step": 27155 + }, + { + "epoch": 0.9725141905563415, + "grad_norm": 1.410144329071045, + "learning_rate": 3.9547254381626653e-07, + "loss": 1.0061, + "step": 27156 + }, + { + "epoch": 0.9725500026859097, + "grad_norm": 1.5650086402893066, + "learning_rate": 3.944426749306507e-07, + "loss": 1.1315, + "step": 27157 + }, + { + "epoch": 0.972585814815478, + "grad_norm": 1.2590032815933228, + "learning_rate": 3.934141461093277e-07, + "loss": 0.855, + "step": 27158 + }, + { + "epoch": 0.9726216269450463, + "grad_norm": 1.359981894493103, + "learning_rate": 3.9238695736614207e-07, + "loss": 1.0087, + "step": 27159 + }, + { + "epoch": 0.9726574390746145, + "grad_norm": 1.553186297416687, + "learning_rate": 3.913611087148938e-07, + "loss": 1.1263, + "step": 27160 + }, + { + "epoch": 0.9726932512041828, + "grad_norm": 1.579599380493164, + "learning_rate": 3.903366001694053e-07, + "loss": 1.0858, + "step": 27161 + }, + { + "epoch": 0.9727290633337511, + "grad_norm": 1.3469268083572388, + "learning_rate": 3.8931343174344324e-07, + "loss": 0.9248, + "step": 27162 + }, + { + "epoch": 0.9727648754633195, + "grad_norm": 1.510964035987854, + "learning_rate": 3.882916034507855e-07, + "loss": 0.9019, + "step": 27163 + }, + { + "epoch": 0.9728006875928877, + "grad_norm": 1.4522327184677124, + "learning_rate": 3.8727111530516556e-07, + "loss": 1.1241, + "step": 27164 + }, + { + "epoch": 0.972836499722456, + "grad_norm": 1.3645726442337036, + "learning_rate": 3.86251967320328e-07, + "loss": 1.1222, + "step": 27165 + }, + { + "epoch": 0.9728723118520243, + "grad_norm": 1.4031420946121216, + "learning_rate": 3.85234159509984e-07, + "loss": 1.0485, + "step": 27166 + }, + { + "epoch": 0.9729081239815925, + "grad_norm": 1.4976528882980347, + "learning_rate": 3.842176918878115e-07, + "loss": 1.0857, + "step": 27167 + }, + { + "epoch": 0.9729439361111608, + "grad_norm": 1.2720582485198975, + "learning_rate": 3.832025644674886e-07, + "loss": 0.9757, + "step": 27168 + }, + { + "epoch": 0.9729797482407291, + "grad_norm": 1.8680821657180786, + "learning_rate": 3.821887772626931e-07, + "loss": 0.9931, + "step": 27169 + }, + { + "epoch": 0.9730155603702975, + "grad_norm": 1.5966187715530396, + "learning_rate": 3.8117633028704745e-07, + "loss": 0.7831, + "step": 27170 + }, + { + "epoch": 0.9730513724998657, + "grad_norm": 1.2539606094360352, + "learning_rate": 3.801652235541631e-07, + "loss": 1.0933, + "step": 27171 + }, + { + "epoch": 0.973087184629434, + "grad_norm": 1.571058988571167, + "learning_rate": 3.791554570776734e-07, + "loss": 1.0856, + "step": 27172 + }, + { + "epoch": 0.9731229967590023, + "grad_norm": 1.4926470518112183, + "learning_rate": 3.781470308711343e-07, + "loss": 0.9855, + "step": 27173 + }, + { + "epoch": 0.9731588088885705, + "grad_norm": 1.507407784461975, + "learning_rate": 3.771399449481239e-07, + "loss": 0.8393, + "step": 27174 + }, + { + "epoch": 0.9731946210181388, + "grad_norm": 1.5051476955413818, + "learning_rate": 3.761341993221867e-07, + "loss": 0.9872, + "step": 27175 + }, + { + "epoch": 0.9732304331477071, + "grad_norm": 2.093729019165039, + "learning_rate": 3.7512979400686763e-07, + "loss": 1.1347, + "step": 27176 + }, + { + "epoch": 0.9732662452772755, + "grad_norm": 1.4988871812820435, + "learning_rate": 3.741267290156669e-07, + "loss": 1.218, + "step": 27177 + }, + { + "epoch": 0.9733020574068437, + "grad_norm": 1.3911726474761963, + "learning_rate": 3.7312500436208487e-07, + "loss": 0.885, + "step": 27178 + }, + { + "epoch": 0.973337869536412, + "grad_norm": 1.339436650276184, + "learning_rate": 3.7212462005959957e-07, + "loss": 0.9936, + "step": 27179 + }, + { + "epoch": 0.9733736816659803, + "grad_norm": 1.340036153793335, + "learning_rate": 3.7112557612165586e-07, + "loss": 1.0226, + "step": 27180 + }, + { + "epoch": 0.9734094937955485, + "grad_norm": 1.5858625173568726, + "learning_rate": 3.7012787256172075e-07, + "loss": 0.8025, + "step": 27181 + }, + { + "epoch": 0.9734453059251168, + "grad_norm": 1.5768764019012451, + "learning_rate": 3.6913150939318353e-07, + "loss": 1.1518, + "step": 27182 + }, + { + "epoch": 0.9734811180546851, + "grad_norm": 1.4497897624969482, + "learning_rate": 3.6813648662947785e-07, + "loss": 1.1876, + "step": 27183 + }, + { + "epoch": 0.9735169301842534, + "grad_norm": 1.204982876777649, + "learning_rate": 3.67142804283982e-07, + "loss": 1.0156, + "step": 27184 + }, + { + "epoch": 0.9735527423138217, + "grad_norm": 1.3974580764770508, + "learning_rate": 3.661504623700629e-07, + "loss": 0.9434, + "step": 27185 + }, + { + "epoch": 0.97358855444339, + "grad_norm": 1.432465672492981, + "learning_rate": 3.6515946090106557e-07, + "loss": 1.3258, + "step": 27186 + }, + { + "epoch": 0.9736243665729583, + "grad_norm": 1.2793617248535156, + "learning_rate": 3.641697998903237e-07, + "loss": 1.0501, + "step": 27187 + }, + { + "epoch": 0.9736601787025265, + "grad_norm": 1.3163424730300903, + "learning_rate": 3.631814793511712e-07, + "loss": 0.8914, + "step": 27188 + }, + { + "epoch": 0.9736959908320948, + "grad_norm": 1.7273465394973755, + "learning_rate": 3.621944992968751e-07, + "loss": 1.0364, + "step": 27189 + }, + { + "epoch": 0.9737318029616631, + "grad_norm": 1.5042040348052979, + "learning_rate": 3.612088597407359e-07, + "loss": 1.0853, + "step": 27190 + }, + { + "epoch": 0.9737676150912314, + "grad_norm": 1.856017827987671, + "learning_rate": 3.6022456069600973e-07, + "loss": 1.0601, + "step": 27191 + }, + { + "epoch": 0.9738034272207997, + "grad_norm": 1.1821531057357788, + "learning_rate": 3.592416021759304e-07, + "loss": 0.8117, + "step": 27192 + }, + { + "epoch": 0.973839239350368, + "grad_norm": 1.3573625087738037, + "learning_rate": 3.5825998419372065e-07, + "loss": 1.0023, + "step": 27193 + }, + { + "epoch": 0.9738750514799362, + "grad_norm": 1.3040152788162231, + "learning_rate": 3.5727970676260327e-07, + "loss": 0.9309, + "step": 27194 + }, + { + "epoch": 0.9739108636095045, + "grad_norm": 1.4242392778396606, + "learning_rate": 3.563007698957566e-07, + "loss": 0.9745, + "step": 27195 + }, + { + "epoch": 0.9739466757390728, + "grad_norm": 1.540827989578247, + "learning_rate": 3.553231736063589e-07, + "loss": 1.2693, + "step": 27196 + }, + { + "epoch": 0.9739824878686411, + "grad_norm": 1.3965108394622803, + "learning_rate": 3.5434691790754427e-07, + "loss": 1.2208, + "step": 27197 + }, + { + "epoch": 0.9740182999982094, + "grad_norm": 1.4107381105422974, + "learning_rate": 3.5337200281245765e-07, + "loss": 1.0353, + "step": 27198 + }, + { + "epoch": 0.9740541121277777, + "grad_norm": 1.498155951499939, + "learning_rate": 3.52398428334233e-07, + "loss": 1.2243, + "step": 27199 + }, + { + "epoch": 0.974089924257346, + "grad_norm": 1.442104458808899, + "learning_rate": 3.514261944859376e-07, + "loss": 1.0634, + "step": 27200 + }, + { + "epoch": 0.9741257363869142, + "grad_norm": 1.4177089929580688, + "learning_rate": 3.5045530128066106e-07, + "loss": 1.1909, + "step": 27201 + }, + { + "epoch": 0.9741615485164825, + "grad_norm": 1.8053933382034302, + "learning_rate": 3.4948574873148174e-07, + "loss": 1.2315, + "step": 27202 + }, + { + "epoch": 0.9741973606460508, + "grad_norm": 1.6156288385391235, + "learning_rate": 3.4851753685142265e-07, + "loss": 1.1412, + "step": 27203 + }, + { + "epoch": 0.974233172775619, + "grad_norm": 1.4706828594207764, + "learning_rate": 3.4755066565351767e-07, + "loss": 1.0299, + "step": 27204 + }, + { + "epoch": 0.9742689849051874, + "grad_norm": 1.335688829421997, + "learning_rate": 3.465851351507787e-07, + "loss": 0.9117, + "step": 27205 + }, + { + "epoch": 0.9743047970347557, + "grad_norm": 1.505664348602295, + "learning_rate": 3.456209453561954e-07, + "loss": 1.1801, + "step": 27206 + }, + { + "epoch": 0.974340609164324, + "grad_norm": 1.210634708404541, + "learning_rate": 3.4465809628273504e-07, + "loss": 0.753, + "step": 27207 + }, + { + "epoch": 0.9743764212938922, + "grad_norm": 1.3296185731887817, + "learning_rate": 3.4369658794335403e-07, + "loss": 1.0444, + "step": 27208 + }, + { + "epoch": 0.9744122334234605, + "grad_norm": 1.4116909503936768, + "learning_rate": 3.4273642035099753e-07, + "loss": 0.8925, + "step": 27209 + }, + { + "epoch": 0.9744480455530288, + "grad_norm": 1.4374897480010986, + "learning_rate": 3.417775935185663e-07, + "loss": 1.1358, + "step": 27210 + }, + { + "epoch": 0.974483857682597, + "grad_norm": 1.4011424779891968, + "learning_rate": 3.408201074589612e-07, + "loss": 1.3561, + "step": 27211 + }, + { + "epoch": 0.9745196698121654, + "grad_norm": 2.2361655235290527, + "learning_rate": 3.39863962185083e-07, + "loss": 1.1591, + "step": 27212 + }, + { + "epoch": 0.9745554819417337, + "grad_norm": 1.4253689050674438, + "learning_rate": 3.3890915770977694e-07, + "loss": 0.9448, + "step": 27213 + }, + { + "epoch": 0.974591294071302, + "grad_norm": 1.5153323411941528, + "learning_rate": 3.379556940458883e-07, + "loss": 0.9946, + "step": 27214 + }, + { + "epoch": 0.9746271062008702, + "grad_norm": 1.3719990253448486, + "learning_rate": 3.3700357120626247e-07, + "loss": 1.126, + "step": 27215 + }, + { + "epoch": 0.9746629183304385, + "grad_norm": 1.1755179166793823, + "learning_rate": 3.360527892036891e-07, + "loss": 1.2027, + "step": 27216 + }, + { + "epoch": 0.9746987304600068, + "grad_norm": 1.4613347053527832, + "learning_rate": 3.351033480509691e-07, + "loss": 1.0922, + "step": 27217 + }, + { + "epoch": 0.974734542589575, + "grad_norm": 1.7790571451187134, + "learning_rate": 3.3415524776088116e-07, + "loss": 1.0908, + "step": 27218 + }, + { + "epoch": 0.9747703547191434, + "grad_norm": 1.4375656843185425, + "learning_rate": 3.332084883461706e-07, + "loss": 1.012, + "step": 27219 + }, + { + "epoch": 0.9748061668487117, + "grad_norm": 1.2899980545043945, + "learning_rate": 3.3226306981957166e-07, + "loss": 1.2285, + "step": 27220 + }, + { + "epoch": 0.97484197897828, + "grad_norm": 1.4402316808700562, + "learning_rate": 3.313189921938187e-07, + "loss": 0.9544, + "step": 27221 + }, + { + "epoch": 0.9748777911078482, + "grad_norm": 1.7261117696762085, + "learning_rate": 3.3037625548160143e-07, + "loss": 1.2288, + "step": 27222 + }, + { + "epoch": 0.9749136032374165, + "grad_norm": 1.8006500005722046, + "learning_rate": 3.294348596956098e-07, + "loss": 1.1935, + "step": 27223 + }, + { + "epoch": 0.9749494153669848, + "grad_norm": 1.4781209230422974, + "learning_rate": 3.2849480484851145e-07, + "loss": 0.9168, + "step": 27224 + }, + { + "epoch": 0.974985227496553, + "grad_norm": 1.3270771503448486, + "learning_rate": 3.275560909529407e-07, + "loss": 1.1129, + "step": 27225 + }, + { + "epoch": 0.9750210396261214, + "grad_norm": 1.3130995035171509, + "learning_rate": 3.2661871802154296e-07, + "loss": 0.9678, + "step": 27226 + }, + { + "epoch": 0.9750568517556897, + "grad_norm": 1.4667332172393799, + "learning_rate": 3.256826860669193e-07, + "loss": 1.0381, + "step": 27227 + }, + { + "epoch": 0.975092663885258, + "grad_norm": 1.5323305130004883, + "learning_rate": 3.2474799510165965e-07, + "loss": 1.1449, + "step": 27228 + }, + { + "epoch": 0.9751284760148262, + "grad_norm": 1.6587454080581665, + "learning_rate": 3.238146451383428e-07, + "loss": 1.0727, + "step": 27229 + }, + { + "epoch": 0.9751642881443945, + "grad_norm": 1.6397150754928589, + "learning_rate": 3.228826361895254e-07, + "loss": 1.287, + "step": 27230 + }, + { + "epoch": 0.9752001002739628, + "grad_norm": 1.293197512626648, + "learning_rate": 3.21951968267753e-07, + "loss": 1.0388, + "step": 27231 + }, + { + "epoch": 0.975235912403531, + "grad_norm": 1.4407572746276855, + "learning_rate": 3.2102264138553774e-07, + "loss": 0.9787, + "step": 27232 + }, + { + "epoch": 0.9752717245330994, + "grad_norm": 1.360804557800293, + "learning_rate": 3.200946555553919e-07, + "loss": 0.9645, + "step": 27233 + }, + { + "epoch": 0.9753075366626677, + "grad_norm": 1.8388248682022095, + "learning_rate": 3.191680107897943e-07, + "loss": 0.9334, + "step": 27234 + }, + { + "epoch": 0.9753433487922359, + "grad_norm": 1.5056577920913696, + "learning_rate": 3.1824270710121286e-07, + "loss": 1.0022, + "step": 27235 + }, + { + "epoch": 0.9753791609218042, + "grad_norm": 1.1763571500778198, + "learning_rate": 3.173187445020931e-07, + "loss": 0.9244, + "step": 27236 + }, + { + "epoch": 0.9754149730513725, + "grad_norm": 1.4004141092300415, + "learning_rate": 3.1639612300485844e-07, + "loss": 1.0891, + "step": 27237 + }, + { + "epoch": 0.9754507851809407, + "grad_norm": 1.550203561782837, + "learning_rate": 3.1547484262194336e-07, + "loss": 1.1344, + "step": 27238 + }, + { + "epoch": 0.975486597310509, + "grad_norm": 1.6233251094818115, + "learning_rate": 3.1455490336572693e-07, + "loss": 0.901, + "step": 27239 + }, + { + "epoch": 0.9755224094400774, + "grad_norm": 1.4197850227355957, + "learning_rate": 3.13636305248588e-07, + "loss": 1.0669, + "step": 27240 + }, + { + "epoch": 0.9755582215696457, + "grad_norm": 1.4441827535629272, + "learning_rate": 3.1271904828288343e-07, + "loss": 1.0328, + "step": 27241 + }, + { + "epoch": 0.9755940336992139, + "grad_norm": 1.4356050491333008, + "learning_rate": 3.1180313248097004e-07, + "loss": 0.9271, + "step": 27242 + }, + { + "epoch": 0.9756298458287822, + "grad_norm": 1.8452478647232056, + "learning_rate": 3.10888557855149e-07, + "loss": 1.1828, + "step": 27243 + }, + { + "epoch": 0.9756656579583505, + "grad_norm": 2.649013042449951, + "learning_rate": 3.099753244177217e-07, + "loss": 1.0655, + "step": 27244 + }, + { + "epoch": 0.9757014700879187, + "grad_norm": 1.5819588899612427, + "learning_rate": 3.090634321810004e-07, + "loss": 1.0193, + "step": 27245 + }, + { + "epoch": 0.975737282217487, + "grad_norm": 1.8797720670700073, + "learning_rate": 3.0815288115723095e-07, + "loss": 1.1943, + "step": 27246 + }, + { + "epoch": 0.9757730943470554, + "grad_norm": 1.6902892589569092, + "learning_rate": 3.0724367135868126e-07, + "loss": 1.0566, + "step": 27247 + }, + { + "epoch": 0.9758089064766237, + "grad_norm": 1.6941710710525513, + "learning_rate": 3.063358027975638e-07, + "loss": 1.0003, + "step": 27248 + }, + { + "epoch": 0.9758447186061919, + "grad_norm": 1.6595300436019897, + "learning_rate": 3.054292754861021e-07, + "loss": 1.1517, + "step": 27249 + }, + { + "epoch": 0.9758805307357602, + "grad_norm": 1.6749014854431152, + "learning_rate": 3.0452408943649756e-07, + "loss": 1.1841, + "step": 27250 + }, + { + "epoch": 0.9759163428653285, + "grad_norm": 1.3965219259262085, + "learning_rate": 3.0362024466092933e-07, + "loss": 1.1041, + "step": 27251 + }, + { + "epoch": 0.9759521549948967, + "grad_norm": 1.4587063789367676, + "learning_rate": 3.0271774117153207e-07, + "loss": 0.9884, + "step": 27252 + }, + { + "epoch": 0.975987967124465, + "grad_norm": 1.7255668640136719, + "learning_rate": 3.01816578980485e-07, + "loss": 1.119, + "step": 27253 + }, + { + "epoch": 0.9760237792540334, + "grad_norm": 1.6233974695205688, + "learning_rate": 3.009167580998895e-07, + "loss": 1.0109, + "step": 27254 + }, + { + "epoch": 0.9760595913836017, + "grad_norm": 1.9466338157653809, + "learning_rate": 3.0001827854184704e-07, + "loss": 1.127, + "step": 27255 + }, + { + "epoch": 0.9760954035131699, + "grad_norm": 1.6843986511230469, + "learning_rate": 2.9912114031847015e-07, + "loss": 1.0123, + "step": 27256 + }, + { + "epoch": 0.9761312156427382, + "grad_norm": 1.344222903251648, + "learning_rate": 2.9822534344180475e-07, + "loss": 1.0772, + "step": 27257 + }, + { + "epoch": 0.9761670277723065, + "grad_norm": 1.7233060598373413, + "learning_rate": 2.9733088792391894e-07, + "loss": 0.8534, + "step": 27258 + }, + { + "epoch": 0.9762028399018747, + "grad_norm": 1.5814831256866455, + "learning_rate": 2.9643777377682535e-07, + "loss": 1.1867, + "step": 27259 + }, + { + "epoch": 0.976238652031443, + "grad_norm": 1.4409327507019043, + "learning_rate": 2.955460010125699e-07, + "loss": 0.944, + "step": 27260 + }, + { + "epoch": 0.9762744641610114, + "grad_norm": 1.3562291860580444, + "learning_rate": 2.946555696431208e-07, + "loss": 1.1304, + "step": 27261 + }, + { + "epoch": 0.9763102762905796, + "grad_norm": 1.4531238079071045, + "learning_rate": 2.9376647968047954e-07, + "loss": 1.1772, + "step": 27262 + }, + { + "epoch": 0.9763460884201479, + "grad_norm": 1.4267553091049194, + "learning_rate": 2.928787311365921e-07, + "loss": 1.0973, + "step": 27263 + }, + { + "epoch": 0.9763819005497162, + "grad_norm": 1.6201096773147583, + "learning_rate": 2.919923240234046e-07, + "loss": 1.0868, + "step": 27264 + }, + { + "epoch": 0.9764177126792845, + "grad_norm": 1.4919281005859375, + "learning_rate": 2.911072583528518e-07, + "loss": 1.0331, + "step": 27265 + }, + { + "epoch": 0.9764535248088527, + "grad_norm": 1.365738868713379, + "learning_rate": 2.9022353413683534e-07, + "loss": 1.0634, + "step": 27266 + }, + { + "epoch": 0.976489336938421, + "grad_norm": 1.4715139865875244, + "learning_rate": 2.893411513872457e-07, + "loss": 1.1092, + "step": 27267 + }, + { + "epoch": 0.9765251490679894, + "grad_norm": 1.4908430576324463, + "learning_rate": 2.884601101159512e-07, + "loss": 1.023, + "step": 27268 + }, + { + "epoch": 0.9765609611975576, + "grad_norm": 1.524165391921997, + "learning_rate": 2.87580410334809e-07, + "loss": 1.1048, + "step": 27269 + }, + { + "epoch": 0.9765967733271259, + "grad_norm": 1.517472505569458, + "learning_rate": 2.8670205205565406e-07, + "loss": 1.1384, + "step": 27270 + }, + { + "epoch": 0.9766325854566942, + "grad_norm": 1.7061400413513184, + "learning_rate": 2.8582503529029916e-07, + "loss": 0.8946, + "step": 27271 + }, + { + "epoch": 0.9766683975862624, + "grad_norm": 1.3598567247390747, + "learning_rate": 2.84949360050546e-07, + "loss": 0.979, + "step": 27272 + }, + { + "epoch": 0.9767042097158307, + "grad_norm": 1.3308360576629639, + "learning_rate": 2.8407502634817395e-07, + "loss": 0.9667, + "step": 27273 + }, + { + "epoch": 0.976740021845399, + "grad_norm": 1.1805461645126343, + "learning_rate": 2.8320203419495153e-07, + "loss": 0.8547, + "step": 27274 + }, + { + "epoch": 0.9767758339749674, + "grad_norm": 1.4263263940811157, + "learning_rate": 2.8233038360262474e-07, + "loss": 1.0106, + "step": 27275 + }, + { + "epoch": 0.9768116461045356, + "grad_norm": 1.2896580696105957, + "learning_rate": 2.814600745829177e-07, + "loss": 0.9715, + "step": 27276 + }, + { + "epoch": 0.9768474582341039, + "grad_norm": 1.2764060497283936, + "learning_rate": 2.805911071475209e-07, + "loss": 1.0303, + "step": 27277 + }, + { + "epoch": 0.9768832703636722, + "grad_norm": 1.436397910118103, + "learning_rate": 2.797234813081584e-07, + "loss": 0.9237, + "step": 27278 + }, + { + "epoch": 0.9769190824932404, + "grad_norm": 1.3227636814117432, + "learning_rate": 2.788571970764764e-07, + "loss": 1.0038, + "step": 27279 + }, + { + "epoch": 0.9769548946228087, + "grad_norm": 1.5533167123794556, + "learning_rate": 2.7799225446414334e-07, + "loss": 1.1029, + "step": 27280 + }, + { + "epoch": 0.976990706752377, + "grad_norm": 1.4213978052139282, + "learning_rate": 2.7712865348279436e-07, + "loss": 1.0301, + "step": 27281 + }, + { + "epoch": 0.9770265188819454, + "grad_norm": 1.7065672874450684, + "learning_rate": 2.762663941440424e-07, + "loss": 1.2325, + "step": 27282 + }, + { + "epoch": 0.9770623310115136, + "grad_norm": 1.4243404865264893, + "learning_rate": 2.7540547645950045e-07, + "loss": 1.0461, + "step": 27283 + }, + { + "epoch": 0.9770981431410819, + "grad_norm": 1.4768282175064087, + "learning_rate": 2.745459004407369e-07, + "loss": 0.953, + "step": 27284 + }, + { + "epoch": 0.9771339552706502, + "grad_norm": 1.4659985303878784, + "learning_rate": 2.736876660993204e-07, + "loss": 1.1698, + "step": 27285 + }, + { + "epoch": 0.9771697674002184, + "grad_norm": 1.6656156778335571, + "learning_rate": 2.728307734467972e-07, + "loss": 0.9435, + "step": 27286 + }, + { + "epoch": 0.9772055795297867, + "grad_norm": 1.6014392375946045, + "learning_rate": 2.719752224947025e-07, + "loss": 1.0057, + "step": 27287 + }, + { + "epoch": 0.977241391659355, + "grad_norm": 1.517807126045227, + "learning_rate": 2.711210132545383e-07, + "loss": 1.0561, + "step": 27288 + }, + { + "epoch": 0.9772772037889234, + "grad_norm": 1.2716962099075317, + "learning_rate": 2.7026814573779534e-07, + "loss": 1.0166, + "step": 27289 + }, + { + "epoch": 0.9773130159184916, + "grad_norm": 1.5882794857025146, + "learning_rate": 2.6941661995596446e-07, + "loss": 1.0791, + "step": 27290 + }, + { + "epoch": 0.9773488280480599, + "grad_norm": 1.4860143661499023, + "learning_rate": 2.6856643592048093e-07, + "loss": 1.1337, + "step": 27291 + }, + { + "epoch": 0.9773846401776282, + "grad_norm": 1.2265440225601196, + "learning_rate": 2.6771759364279114e-07, + "loss": 1.1169, + "step": 27292 + }, + { + "epoch": 0.9774204523071964, + "grad_norm": 1.6427011489868164, + "learning_rate": 2.668700931343082e-07, + "loss": 1.1262, + "step": 27293 + }, + { + "epoch": 0.9774562644367647, + "grad_norm": 1.8752362728118896, + "learning_rate": 2.6602393440645636e-07, + "loss": 1.1031, + "step": 27294 + }, + { + "epoch": 0.977492076566333, + "grad_norm": 1.5504733324050903, + "learning_rate": 2.651791174706042e-07, + "loss": 1.1452, + "step": 27295 + }, + { + "epoch": 0.9775278886959013, + "grad_norm": 1.7070441246032715, + "learning_rate": 2.6433564233810936e-07, + "loss": 1.3071, + "step": 27296 + }, + { + "epoch": 0.9775637008254696, + "grad_norm": 1.579372763633728, + "learning_rate": 2.6349350902032944e-07, + "loss": 1.2081, + "step": 27297 + }, + { + "epoch": 0.9775995129550379, + "grad_norm": 1.1971018314361572, + "learning_rate": 2.6265271752859975e-07, + "loss": 1.052, + "step": 27298 + }, + { + "epoch": 0.9776353250846062, + "grad_norm": 1.3668886423110962, + "learning_rate": 2.618132678742224e-07, + "loss": 0.9522, + "step": 27299 + }, + { + "epoch": 0.9776711372141744, + "grad_norm": 1.3415186405181885, + "learning_rate": 2.609751600684995e-07, + "loss": 0.9796, + "step": 27300 + }, + { + "epoch": 0.9777069493437427, + "grad_norm": 1.4189804792404175, + "learning_rate": 2.601383941226998e-07, + "loss": 1.0775, + "step": 27301 + }, + { + "epoch": 0.977742761473311, + "grad_norm": 1.2987242937088013, + "learning_rate": 2.593029700480698e-07, + "loss": 1.1564, + "step": 27302 + }, + { + "epoch": 0.9777785736028793, + "grad_norm": 1.6652156114578247, + "learning_rate": 2.584688878558783e-07, + "loss": 1.1027, + "step": 27303 + }, + { + "epoch": 0.9778143857324476, + "grad_norm": 1.192886233329773, + "learning_rate": 2.576361475573275e-07, + "loss": 1.1527, + "step": 27304 + }, + { + "epoch": 0.9778501978620159, + "grad_norm": 1.4351260662078857, + "learning_rate": 2.568047491636194e-07, + "loss": 0.9377, + "step": 27305 + }, + { + "epoch": 0.9778860099915841, + "grad_norm": 1.6147966384887695, + "learning_rate": 2.559746926859452e-07, + "loss": 1.0619, + "step": 27306 + }, + { + "epoch": 0.9779218221211524, + "grad_norm": 1.4616245031356812, + "learning_rate": 2.551459781354737e-07, + "loss": 1.2354, + "step": 27307 + }, + { + "epoch": 0.9779576342507207, + "grad_norm": 1.7796844244003296, + "learning_rate": 2.543186055233515e-07, + "loss": 0.9832, + "step": 27308 + }, + { + "epoch": 0.977993446380289, + "grad_norm": 1.3953660726547241, + "learning_rate": 2.534925748607031e-07, + "loss": 0.9534, + "step": 27309 + }, + { + "epoch": 0.9780292585098573, + "grad_norm": 1.5926570892333984, + "learning_rate": 2.526678861586529e-07, + "loss": 1.177, + "step": 27310 + }, + { + "epoch": 0.9780650706394256, + "grad_norm": 1.1992731094360352, + "learning_rate": 2.51844539428292e-07, + "loss": 1.1846, + "step": 27311 + }, + { + "epoch": 0.9781008827689939, + "grad_norm": 1.4026738405227661, + "learning_rate": 2.5102253468070047e-07, + "loss": 0.9447, + "step": 27312 + }, + { + "epoch": 0.9781366948985621, + "grad_norm": 1.7118586301803589, + "learning_rate": 2.50201871926925e-07, + "loss": 0.9446, + "step": 27313 + }, + { + "epoch": 0.9781725070281304, + "grad_norm": 1.3922704458236694, + "learning_rate": 2.4938255117802347e-07, + "loss": 1.0292, + "step": 27314 + }, + { + "epoch": 0.9782083191576987, + "grad_norm": 1.6390447616577148, + "learning_rate": 2.485645724450092e-07, + "loss": 0.991, + "step": 27315 + }, + { + "epoch": 0.978244131287267, + "grad_norm": 1.8180023431777954, + "learning_rate": 2.4774793573888453e-07, + "loss": 1.1445, + "step": 27316 + }, + { + "epoch": 0.9782799434168353, + "grad_norm": 1.3109190464019775, + "learning_rate": 2.4693264107064075e-07, + "loss": 1.0416, + "step": 27317 + }, + { + "epoch": 0.9783157555464036, + "grad_norm": 1.3557889461517334, + "learning_rate": 2.4611868845124673e-07, + "loss": 0.8181, + "step": 27318 + }, + { + "epoch": 0.9783515676759719, + "grad_norm": 1.2762757539749146, + "learning_rate": 2.453060778916605e-07, + "loss": 1.1744, + "step": 27319 + }, + { + "epoch": 0.9783873798055401, + "grad_norm": 1.489479660987854, + "learning_rate": 2.4449480940279547e-07, + "loss": 1.1331, + "step": 27320 + }, + { + "epoch": 0.9784231919351084, + "grad_norm": 1.589239239692688, + "learning_rate": 2.436848829955762e-07, + "loss": 0.9222, + "step": 27321 + }, + { + "epoch": 0.9784590040646767, + "grad_norm": 1.6977788209915161, + "learning_rate": 2.428762986809052e-07, + "loss": 1.0927, + "step": 27322 + }, + { + "epoch": 0.9784948161942449, + "grad_norm": 1.483420491218567, + "learning_rate": 2.420690564696626e-07, + "loss": 1.147, + "step": 27323 + }, + { + "epoch": 0.9785306283238133, + "grad_norm": 1.384878396987915, + "learning_rate": 2.4126315637269523e-07, + "loss": 0.7677, + "step": 27324 + }, + { + "epoch": 0.9785664404533816, + "grad_norm": 1.5203402042388916, + "learning_rate": 2.4045859840085005e-07, + "loss": 1.0302, + "step": 27325 + }, + { + "epoch": 0.9786022525829499, + "grad_norm": 1.3007676601409912, + "learning_rate": 2.3965538256496276e-07, + "loss": 0.8531, + "step": 27326 + }, + { + "epoch": 0.9786380647125181, + "grad_norm": 1.2212116718292236, + "learning_rate": 2.388535088758248e-07, + "loss": 1.0369, + "step": 27327 + }, + { + "epoch": 0.9786738768420864, + "grad_norm": 1.1743528842926025, + "learning_rate": 2.3805297734422748e-07, + "loss": 0.9318, + "step": 27328 + }, + { + "epoch": 0.9787096889716547, + "grad_norm": 1.3551725149154663, + "learning_rate": 2.3725378798095112e-07, + "loss": 1.2654, + "step": 27329 + }, + { + "epoch": 0.9787455011012229, + "grad_norm": 1.3368549346923828, + "learning_rate": 2.364559407967426e-07, + "loss": 1.0545, + "step": 27330 + }, + { + "epoch": 0.9787813132307913, + "grad_norm": 1.651985764503479, + "learning_rate": 2.3565943580232676e-07, + "loss": 1.2071, + "step": 27331 + }, + { + "epoch": 0.9788171253603596, + "grad_norm": 1.7286601066589355, + "learning_rate": 2.3486427300841717e-07, + "loss": 1.3394, + "step": 27332 + }, + { + "epoch": 0.9788529374899279, + "grad_norm": 1.4156347513198853, + "learning_rate": 2.3407045242573867e-07, + "loss": 0.8771, + "step": 27333 + }, + { + "epoch": 0.9788887496194961, + "grad_norm": 1.5601121187210083, + "learning_rate": 2.3327797406494934e-07, + "loss": 1.0377, + "step": 27334 + }, + { + "epoch": 0.9789245617490644, + "grad_norm": 1.5366054773330688, + "learning_rate": 2.3248683793670735e-07, + "loss": 1.2178, + "step": 27335 + }, + { + "epoch": 0.9789603738786327, + "grad_norm": 1.424904465675354, + "learning_rate": 2.316970440516708e-07, + "loss": 1.0324, + "step": 27336 + }, + { + "epoch": 0.9789961860082009, + "grad_norm": 1.40438973903656, + "learning_rate": 2.309085924204535e-07, + "loss": 1.0274, + "step": 27337 + }, + { + "epoch": 0.9790319981377693, + "grad_norm": 1.8855745792388916, + "learning_rate": 2.301214830536691e-07, + "loss": 1.3182, + "step": 27338 + }, + { + "epoch": 0.9790678102673376, + "grad_norm": 2.2054779529571533, + "learning_rate": 2.2933571596190918e-07, + "loss": 1.4356, + "step": 27339 + }, + { + "epoch": 0.9791036223969058, + "grad_norm": 1.5082473754882812, + "learning_rate": 2.2855129115574304e-07, + "loss": 0.9127, + "step": 27340 + }, + { + "epoch": 0.9791394345264741, + "grad_norm": 1.4720020294189453, + "learning_rate": 2.277682086457289e-07, + "loss": 1.2395, + "step": 27341 + }, + { + "epoch": 0.9791752466560424, + "grad_norm": 1.5456211566925049, + "learning_rate": 2.269864684423917e-07, + "loss": 1.0014, + "step": 27342 + }, + { + "epoch": 0.9792110587856107, + "grad_norm": 1.4209115505218506, + "learning_rate": 2.262060705562452e-07, + "loss": 1.1261, + "step": 27343 + }, + { + "epoch": 0.9792468709151789, + "grad_norm": 1.2376071214675903, + "learning_rate": 2.2542701499780327e-07, + "loss": 1.0919, + "step": 27344 + }, + { + "epoch": 0.9792826830447473, + "grad_norm": 2.3066775798797607, + "learning_rate": 2.246493017775353e-07, + "loss": 0.9223, + "step": 27345 + }, + { + "epoch": 0.9793184951743156, + "grad_norm": 1.6675759553909302, + "learning_rate": 2.2387293090592177e-07, + "loss": 1.2009, + "step": 27346 + }, + { + "epoch": 0.9793543073038838, + "grad_norm": 1.8033798933029175, + "learning_rate": 2.230979023933877e-07, + "loss": 1.0511, + "step": 27347 + }, + { + "epoch": 0.9793901194334521, + "grad_norm": 1.6606991291046143, + "learning_rate": 2.2232421625036914e-07, + "loss": 1.317, + "step": 27348 + }, + { + "epoch": 0.9794259315630204, + "grad_norm": 1.2873632907867432, + "learning_rate": 2.2155187248728004e-07, + "loss": 1.0761, + "step": 27349 + }, + { + "epoch": 0.9794617436925886, + "grad_norm": 1.3995893001556396, + "learning_rate": 2.2078087111450097e-07, + "loss": 1.1561, + "step": 27350 + }, + { + "epoch": 0.9794975558221569, + "grad_norm": 1.4097408056259155, + "learning_rate": 2.200112121424125e-07, + "loss": 1.0714, + "step": 27351 + }, + { + "epoch": 0.9795333679517253, + "grad_norm": 1.3321614265441895, + "learning_rate": 2.192428955813619e-07, + "loss": 0.9713, + "step": 27352 + }, + { + "epoch": 0.9795691800812936, + "grad_norm": 1.476481556892395, + "learning_rate": 2.184759214416854e-07, + "loss": 1.0198, + "step": 27353 + }, + { + "epoch": 0.9796049922108618, + "grad_norm": 1.8012267351150513, + "learning_rate": 2.177102897337191e-07, + "loss": 1.2115, + "step": 27354 + }, + { + "epoch": 0.9796408043404301, + "grad_norm": 1.564288854598999, + "learning_rate": 2.169460004677437e-07, + "loss": 1.0985, + "step": 27355 + }, + { + "epoch": 0.9796766164699984, + "grad_norm": 1.3103866577148438, + "learning_rate": 2.16183053654051e-07, + "loss": 0.9892, + "step": 27356 + }, + { + "epoch": 0.9797124285995666, + "grad_norm": 1.5918470621109009, + "learning_rate": 2.1542144930289943e-07, + "loss": 0.9567, + "step": 27357 + }, + { + "epoch": 0.9797482407291349, + "grad_norm": 1.390386939048767, + "learning_rate": 2.1466118742453634e-07, + "loss": 1.0768, + "step": 27358 + }, + { + "epoch": 0.9797840528587033, + "grad_norm": 1.488286018371582, + "learning_rate": 2.139022680292091e-07, + "loss": 1.009, + "step": 27359 + }, + { + "epoch": 0.9798198649882716, + "grad_norm": 1.3448433876037598, + "learning_rate": 2.1314469112709844e-07, + "loss": 0.876, + "step": 27360 + }, + { + "epoch": 0.9798556771178398, + "grad_norm": 1.6321555376052856, + "learning_rate": 2.1238845672841845e-07, + "loss": 1.0538, + "step": 27361 + }, + { + "epoch": 0.9798914892474081, + "grad_norm": 1.6489938497543335, + "learning_rate": 2.1163356484332764e-07, + "loss": 1.0136, + "step": 27362 + }, + { + "epoch": 0.9799273013769764, + "grad_norm": 1.2941317558288574, + "learning_rate": 2.1088001548199565e-07, + "loss": 1.2509, + "step": 27363 + }, + { + "epoch": 0.9799631135065446, + "grad_norm": 1.2761520147323608, + "learning_rate": 2.1012780865454773e-07, + "loss": 0.8375, + "step": 27364 + }, + { + "epoch": 0.9799989256361129, + "grad_norm": 1.5855650901794434, + "learning_rate": 2.093769443711091e-07, + "loss": 1.0957, + "step": 27365 + }, + { + "epoch": 0.9800347377656813, + "grad_norm": 1.5724478960037231, + "learning_rate": 2.086274226417939e-07, + "loss": 1.1139, + "step": 27366 + }, + { + "epoch": 0.9800705498952496, + "grad_norm": 1.3637945652008057, + "learning_rate": 2.0787924347666076e-07, + "loss": 1.1153, + "step": 27367 + }, + { + "epoch": 0.9801063620248178, + "grad_norm": 1.2828723192214966, + "learning_rate": 2.071324068858016e-07, + "loss": 0.9198, + "step": 27368 + }, + { + "epoch": 0.9801421741543861, + "grad_norm": 1.6383010149002075, + "learning_rate": 2.0638691287925282e-07, + "loss": 1.2982, + "step": 27369 + }, + { + "epoch": 0.9801779862839544, + "grad_norm": 1.3503916263580322, + "learning_rate": 2.0564276146703977e-07, + "loss": 0.9667, + "step": 27370 + }, + { + "epoch": 0.9802137984135226, + "grad_norm": 1.7444782257080078, + "learning_rate": 2.048999526591766e-07, + "loss": 1.0422, + "step": 27371 + }, + { + "epoch": 0.9802496105430909, + "grad_norm": 1.2949217557907104, + "learning_rate": 2.041584864656554e-07, + "loss": 0.9952, + "step": 27372 + }, + { + "epoch": 0.9802854226726593, + "grad_norm": 1.5461249351501465, + "learning_rate": 2.03418362896457e-07, + "loss": 1.0606, + "step": 27373 + }, + { + "epoch": 0.9803212348022275, + "grad_norm": 1.5476833581924438, + "learning_rate": 2.0267958196154013e-07, + "loss": 1.0506, + "step": 27374 + }, + { + "epoch": 0.9803570469317958, + "grad_norm": 1.589086651802063, + "learning_rate": 2.019421436708413e-07, + "loss": 1.2712, + "step": 27375 + }, + { + "epoch": 0.9803928590613641, + "grad_norm": 1.4325257539749146, + "learning_rate": 2.012060480342748e-07, + "loss": 1.2004, + "step": 27376 + }, + { + "epoch": 0.9804286711909324, + "grad_norm": 1.5624536275863647, + "learning_rate": 2.0047129506175488e-07, + "loss": 0.9658, + "step": 27377 + }, + { + "epoch": 0.9804644833205006, + "grad_norm": 1.2275723218917847, + "learning_rate": 1.9973788476315147e-07, + "loss": 1.0249, + "step": 27378 + }, + { + "epoch": 0.9805002954500689, + "grad_norm": 1.400193214416504, + "learning_rate": 1.9900581714835666e-07, + "loss": 1.1702, + "step": 27379 + }, + { + "epoch": 0.9805361075796373, + "grad_norm": 1.6964702606201172, + "learning_rate": 1.9827509222719587e-07, + "loss": 1.0236, + "step": 27380 + }, + { + "epoch": 0.9805719197092055, + "grad_norm": 1.4599519968032837, + "learning_rate": 1.9754571000950572e-07, + "loss": 1.1097, + "step": 27381 + }, + { + "epoch": 0.9806077318387738, + "grad_norm": 1.5847829580307007, + "learning_rate": 1.968176705051117e-07, + "loss": 1.2223, + "step": 27382 + }, + { + "epoch": 0.9806435439683421, + "grad_norm": 1.2985279560089111, + "learning_rate": 1.960909737237837e-07, + "loss": 0.7564, + "step": 27383 + }, + { + "epoch": 0.9806793560979103, + "grad_norm": 1.36558997631073, + "learning_rate": 1.9536561967532507e-07, + "loss": 1.233, + "step": 27384 + }, + { + "epoch": 0.9807151682274786, + "grad_norm": 1.5477665662765503, + "learning_rate": 1.9464160836948354e-07, + "loss": 1.0079, + "step": 27385 + }, + { + "epoch": 0.9807509803570469, + "grad_norm": 1.6789579391479492, + "learning_rate": 1.9391893981599575e-07, + "loss": 1.1886, + "step": 27386 + }, + { + "epoch": 0.9807867924866153, + "grad_norm": 1.668320655822754, + "learning_rate": 1.9319761402458726e-07, + "loss": 1.1666, + "step": 27387 + }, + { + "epoch": 0.9808226046161835, + "grad_norm": 1.6975700855255127, + "learning_rate": 1.924776310049725e-07, + "loss": 1.0876, + "step": 27388 + }, + { + "epoch": 0.9808584167457518, + "grad_norm": 2.3024966716766357, + "learning_rate": 1.9175899076682158e-07, + "loss": 1.1727, + "step": 27389 + }, + { + "epoch": 0.9808942288753201, + "grad_norm": 1.466162085533142, + "learning_rate": 1.9104169331981558e-07, + "loss": 1.009, + "step": 27390 + }, + { + "epoch": 0.9809300410048883, + "grad_norm": 1.4334148168563843, + "learning_rate": 1.9032573867359126e-07, + "loss": 1.0813, + "step": 27391 + }, + { + "epoch": 0.9809658531344566, + "grad_norm": 1.5842221975326538, + "learning_rate": 1.8961112683778536e-07, + "loss": 1.144, + "step": 27392 + }, + { + "epoch": 0.9810016652640249, + "grad_norm": 1.3801255226135254, + "learning_rate": 1.8889785782202352e-07, + "loss": 1.1085, + "step": 27393 + }, + { + "epoch": 0.9810374773935933, + "grad_norm": 1.2877917289733887, + "learning_rate": 1.8818593163589805e-07, + "loss": 0.9924, + "step": 27394 + }, + { + "epoch": 0.9810732895231615, + "grad_norm": 1.8873291015625, + "learning_rate": 1.8747534828897905e-07, + "loss": 1.1019, + "step": 27395 + }, + { + "epoch": 0.9811091016527298, + "grad_norm": 1.573063850402832, + "learning_rate": 1.867661077908256e-07, + "loss": 0.9889, + "step": 27396 + }, + { + "epoch": 0.9811449137822981, + "grad_norm": 1.5547833442687988, + "learning_rate": 1.8605821015098556e-07, + "loss": 1.024, + "step": 27397 + }, + { + "epoch": 0.9811807259118663, + "grad_norm": 1.896390676498413, + "learning_rate": 1.853516553789847e-07, + "loss": 1.1144, + "step": 27398 + }, + { + "epoch": 0.9812165380414346, + "grad_norm": 1.2594420909881592, + "learning_rate": 1.8464644348432647e-07, + "loss": 0.9962, + "step": 27399 + }, + { + "epoch": 0.9812523501710029, + "grad_norm": 1.399061918258667, + "learning_rate": 1.8394257447650332e-07, + "loss": 1.0466, + "step": 27400 + }, + { + "epoch": 0.9812881623005713, + "grad_norm": 1.780714511871338, + "learning_rate": 1.832400483649632e-07, + "loss": 1.1464, + "step": 27401 + }, + { + "epoch": 0.9813239744301395, + "grad_norm": 1.5194590091705322, + "learning_rate": 1.825388651591875e-07, + "loss": 1.1984, + "step": 27402 + }, + { + "epoch": 0.9813597865597078, + "grad_norm": 1.2115378379821777, + "learning_rate": 1.8183902486859083e-07, + "loss": 1.1431, + "step": 27403 + }, + { + "epoch": 0.9813955986892761, + "grad_norm": 1.5301103591918945, + "learning_rate": 1.8114052750259902e-07, + "loss": 1.0406, + "step": 27404 + }, + { + "epoch": 0.9814314108188443, + "grad_norm": 1.3671207427978516, + "learning_rate": 1.8044337307059345e-07, + "loss": 0.9925, + "step": 27405 + }, + { + "epoch": 0.9814672229484126, + "grad_norm": 1.4043034315109253, + "learning_rate": 1.7974756158196658e-07, + "loss": 1.0948, + "step": 27406 + }, + { + "epoch": 0.9815030350779809, + "grad_norm": 1.5765440464019775, + "learning_rate": 1.7905309304608876e-07, + "loss": 1.0235, + "step": 27407 + }, + { + "epoch": 0.9815388472075492, + "grad_norm": 1.2844887971878052, + "learning_rate": 1.7835996747228578e-07, + "loss": 0.9977, + "step": 27408 + }, + { + "epoch": 0.9815746593371175, + "grad_norm": 1.565724492073059, + "learning_rate": 1.7766818486988357e-07, + "loss": 1.1247, + "step": 27409 + }, + { + "epoch": 0.9816104714666858, + "grad_norm": 1.4905245304107666, + "learning_rate": 1.769777452481969e-07, + "loss": 0.9484, + "step": 27410 + }, + { + "epoch": 0.981646283596254, + "grad_norm": 1.4326133728027344, + "learning_rate": 1.7628864861651827e-07, + "loss": 1.0201, + "step": 27411 + }, + { + "epoch": 0.9816820957258223, + "grad_norm": 1.8125042915344238, + "learning_rate": 1.7560089498410704e-07, + "loss": 1.0451, + "step": 27412 + }, + { + "epoch": 0.9817179078553906, + "grad_norm": 1.4488462209701538, + "learning_rate": 1.749144843602224e-07, + "loss": 1.0771, + "step": 27413 + }, + { + "epoch": 0.9817537199849589, + "grad_norm": 1.4335688352584839, + "learning_rate": 1.7422941675410143e-07, + "loss": 1.0288, + "step": 27414 + }, + { + "epoch": 0.9817895321145272, + "grad_norm": 2.0387279987335205, + "learning_rate": 1.7354569217494788e-07, + "loss": 1.09, + "step": 27415 + }, + { + "epoch": 0.9818253442440955, + "grad_norm": 1.3863636255264282, + "learning_rate": 1.728633106319766e-07, + "loss": 1.111, + "step": 27416 + }, + { + "epoch": 0.9818611563736638, + "grad_norm": 1.6775059700012207, + "learning_rate": 1.721822721343691e-07, + "loss": 1.1549, + "step": 27417 + }, + { + "epoch": 0.981896968503232, + "grad_norm": 1.4848744869232178, + "learning_rate": 1.7150257669127367e-07, + "loss": 0.9331, + "step": 27418 + }, + { + "epoch": 0.9819327806328003, + "grad_norm": 1.5890883207321167, + "learning_rate": 1.7082422431183853e-07, + "loss": 1.0838, + "step": 27419 + }, + { + "epoch": 0.9819685927623686, + "grad_norm": 1.3436697721481323, + "learning_rate": 1.701472150051897e-07, + "loss": 1.198, + "step": 27420 + }, + { + "epoch": 0.9820044048919369, + "grad_norm": 1.5096139907836914, + "learning_rate": 1.6947154878045324e-07, + "loss": 1.1775, + "step": 27421 + }, + { + "epoch": 0.9820402170215052, + "grad_norm": 1.6966298818588257, + "learning_rate": 1.6879722564669964e-07, + "loss": 1.1664, + "step": 27422 + }, + { + "epoch": 0.9820760291510735, + "grad_norm": 1.759365200996399, + "learning_rate": 1.6812424561299943e-07, + "loss": 1.1173, + "step": 27423 + }, + { + "epoch": 0.9821118412806418, + "grad_norm": 1.3796679973602295, + "learning_rate": 1.6745260868841207e-07, + "loss": 1.1804, + "step": 27424 + }, + { + "epoch": 0.98214765341021, + "grad_norm": 1.449776291847229, + "learning_rate": 1.667823148819858e-07, + "loss": 0.9861, + "step": 27425 + }, + { + "epoch": 0.9821834655397783, + "grad_norm": 1.580169677734375, + "learning_rate": 1.661133642027246e-07, + "loss": 0.9452, + "step": 27426 + }, + { + "epoch": 0.9822192776693466, + "grad_norm": 1.5727810859680176, + "learning_rate": 1.6544575665963236e-07, + "loss": 0.9117, + "step": 27427 + }, + { + "epoch": 0.9822550897989148, + "grad_norm": 1.5714576244354248, + "learning_rate": 1.6477949226167967e-07, + "loss": 1.1242, + "step": 27428 + }, + { + "epoch": 0.9822909019284832, + "grad_norm": 1.6771881580352783, + "learning_rate": 1.6411457101784822e-07, + "loss": 0.9846, + "step": 27429 + }, + { + "epoch": 0.9823267140580515, + "grad_norm": 1.3405991792678833, + "learning_rate": 1.6345099293708644e-07, + "loss": 0.9363, + "step": 27430 + }, + { + "epoch": 0.9823625261876198, + "grad_norm": 2.9292101860046387, + "learning_rate": 1.627887580282983e-07, + "loss": 1.1096, + "step": 27431 + }, + { + "epoch": 0.982398338317188, + "grad_norm": 1.4112071990966797, + "learning_rate": 1.6212786630041e-07, + "loss": 0.8764, + "step": 27432 + }, + { + "epoch": 0.9824341504467563, + "grad_norm": 1.6708052158355713, + "learning_rate": 1.6146831776231442e-07, + "loss": 1.0, + "step": 27433 + }, + { + "epoch": 0.9824699625763246, + "grad_norm": 1.4504135847091675, + "learning_rate": 1.6081011242287115e-07, + "loss": 1.248, + "step": 27434 + }, + { + "epoch": 0.9825057747058928, + "grad_norm": 1.268436312675476, + "learning_rate": 1.6015325029095084e-07, + "loss": 1.0002, + "step": 27435 + }, + { + "epoch": 0.9825415868354612, + "grad_norm": 1.4553301334381104, + "learning_rate": 1.5949773137537982e-07, + "loss": 0.8809, + "step": 27436 + }, + { + "epoch": 0.9825773989650295, + "grad_norm": 1.3556536436080933, + "learning_rate": 1.588435556849843e-07, + "loss": 0.9967, + "step": 27437 + }, + { + "epoch": 0.9826132110945978, + "grad_norm": 1.5591381788253784, + "learning_rate": 1.5819072322856842e-07, + "loss": 1.0267, + "step": 27438 + }, + { + "epoch": 0.982649023224166, + "grad_norm": 1.4564419984817505, + "learning_rate": 1.575392340149029e-07, + "loss": 0.9989, + "step": 27439 + }, + { + "epoch": 0.9826848353537343, + "grad_norm": 1.4244483709335327, + "learning_rate": 1.5688908805275848e-07, + "loss": 1.216, + "step": 27440 + }, + { + "epoch": 0.9827206474833026, + "grad_norm": 1.3859997987747192, + "learning_rate": 1.5624028535088375e-07, + "loss": 1.0751, + "step": 27441 + }, + { + "epoch": 0.9827564596128708, + "grad_norm": 1.6954413652420044, + "learning_rate": 1.5559282591801617e-07, + "loss": 1.2824, + "step": 27442 + }, + { + "epoch": 0.9827922717424392, + "grad_norm": 1.4283099174499512, + "learning_rate": 1.5494670976284875e-07, + "loss": 1.1156, + "step": 27443 + }, + { + "epoch": 0.9828280838720075, + "grad_norm": 1.3555166721343994, + "learning_rate": 1.543019368940857e-07, + "loss": 1.0026, + "step": 27444 + }, + { + "epoch": 0.9828638960015758, + "grad_norm": 1.6453227996826172, + "learning_rate": 1.5365850732039778e-07, + "loss": 1.3252, + "step": 27445 + }, + { + "epoch": 0.982899708131144, + "grad_norm": 1.5087833404541016, + "learning_rate": 1.5301642105043368e-07, + "loss": 1.0026, + "step": 27446 + }, + { + "epoch": 0.9829355202607123, + "grad_norm": 1.4073686599731445, + "learning_rate": 1.5237567809285314e-07, + "loss": 0.9336, + "step": 27447 + }, + { + "epoch": 0.9829713323902806, + "grad_norm": 1.4182008504867554, + "learning_rate": 1.5173627845624927e-07, + "loss": 0.9684, + "step": 27448 + }, + { + "epoch": 0.9830071445198488, + "grad_norm": 1.4744418859481812, + "learning_rate": 1.510982221492485e-07, + "loss": 1.1193, + "step": 27449 + }, + { + "epoch": 0.9830429566494172, + "grad_norm": 1.3344407081604004, + "learning_rate": 1.5046150918042178e-07, + "loss": 1.1116, + "step": 27450 + }, + { + "epoch": 0.9830787687789855, + "grad_norm": 1.2558485269546509, + "learning_rate": 1.4982613955834001e-07, + "loss": 0.9455, + "step": 27451 + }, + { + "epoch": 0.9831145809085537, + "grad_norm": 1.3454442024230957, + "learning_rate": 1.4919211329156302e-07, + "loss": 1.0219, + "step": 27452 + }, + { + "epoch": 0.983150393038122, + "grad_norm": 1.6427891254425049, + "learning_rate": 1.4855943038858399e-07, + "loss": 1.1118, + "step": 27453 + }, + { + "epoch": 0.9831862051676903, + "grad_norm": 1.3920280933380127, + "learning_rate": 1.4792809085795166e-07, + "loss": 1.0367, + "step": 27454 + }, + { + "epoch": 0.9832220172972586, + "grad_norm": 1.4591903686523438, + "learning_rate": 1.4729809470814815e-07, + "loss": 1.0664, + "step": 27455 + }, + { + "epoch": 0.9832578294268268, + "grad_norm": 1.8386902809143066, + "learning_rate": 1.4666944194764443e-07, + "loss": 1.0446, + "step": 27456 + }, + { + "epoch": 0.9832936415563952, + "grad_norm": 1.4509979486465454, + "learning_rate": 1.4604213258491152e-07, + "loss": 1.2125, + "step": 27457 + }, + { + "epoch": 0.9833294536859635, + "grad_norm": 1.559127688407898, + "learning_rate": 1.4541616662836488e-07, + "loss": 1.0782, + "step": 27458 + }, + { + "epoch": 0.9833652658155317, + "grad_norm": 1.2496130466461182, + "learning_rate": 1.4479154408645335e-07, + "loss": 0.9063, + "step": 27459 + }, + { + "epoch": 0.9834010779451, + "grad_norm": 1.3917549848556519, + "learning_rate": 1.441682649675591e-07, + "loss": 1.0595, + "step": 27460 + }, + { + "epoch": 0.9834368900746683, + "grad_norm": 1.6512739658355713, + "learning_rate": 1.435463292800754e-07, + "loss": 1.1088, + "step": 27461 + }, + { + "epoch": 0.9834727022042365, + "grad_norm": 1.6292868852615356, + "learning_rate": 1.4292573703237333e-07, + "loss": 1.0129, + "step": 27462 + }, + { + "epoch": 0.9835085143338048, + "grad_norm": 2.42274808883667, + "learning_rate": 1.423064882328018e-07, + "loss": 1.2736, + "step": 27463 + }, + { + "epoch": 0.9835443264633732, + "grad_norm": 1.603611946105957, + "learning_rate": 1.4168858288968745e-07, + "loss": 1.0623, + "step": 27464 + }, + { + "epoch": 0.9835801385929415, + "grad_norm": 1.9419498443603516, + "learning_rate": 1.4107202101134588e-07, + "loss": 1.3527, + "step": 27465 + }, + { + "epoch": 0.9836159507225097, + "grad_norm": 1.4344878196716309, + "learning_rate": 1.404568026060704e-07, + "loss": 0.932, + "step": 27466 + }, + { + "epoch": 0.983651762852078, + "grad_norm": 1.201553225517273, + "learning_rate": 1.3984292768213225e-07, + "loss": 1.3346, + "step": 27467 + }, + { + "epoch": 0.9836875749816463, + "grad_norm": 1.4384363889694214, + "learning_rate": 1.3923039624780255e-07, + "loss": 1.1479, + "step": 27468 + }, + { + "epoch": 0.9837233871112145, + "grad_norm": 1.5935827493667603, + "learning_rate": 1.3861920831131914e-07, + "loss": 1.0439, + "step": 27469 + }, + { + "epoch": 0.9837591992407828, + "grad_norm": 1.344168782234192, + "learning_rate": 1.380093638808977e-07, + "loss": 0.9466, + "step": 27470 + }, + { + "epoch": 0.9837950113703512, + "grad_norm": 1.6885138750076294, + "learning_rate": 1.3740086296475385e-07, + "loss": 1.2059, + "step": 27471 + }, + { + "epoch": 0.9838308234999195, + "grad_norm": 1.3646174669265747, + "learning_rate": 1.3679370557106997e-07, + "loss": 0.897, + "step": 27472 + }, + { + "epoch": 0.9838666356294877, + "grad_norm": 1.2996418476104736, + "learning_rate": 1.3618789170800618e-07, + "loss": 0.9856, + "step": 27473 + }, + { + "epoch": 0.983902447759056, + "grad_norm": 1.3247684240341187, + "learning_rate": 1.355834213837226e-07, + "loss": 1.0596, + "step": 27474 + }, + { + "epoch": 0.9839382598886243, + "grad_norm": 1.4652982950210571, + "learning_rate": 1.349802946063461e-07, + "loss": 1.0323, + "step": 27475 + }, + { + "epoch": 0.9839740720181925, + "grad_norm": 1.4288873672485352, + "learning_rate": 1.3437851138399237e-07, + "loss": 1.0729, + "step": 27476 + }, + { + "epoch": 0.9840098841477608, + "grad_norm": 1.4623678922653198, + "learning_rate": 1.33778071724755e-07, + "loss": 1.0606, + "step": 27477 + }, + { + "epoch": 0.9840456962773292, + "grad_norm": 1.6068331003189087, + "learning_rate": 1.3317897563671633e-07, + "loss": 1.0889, + "step": 27478 + }, + { + "epoch": 0.9840815084068975, + "grad_norm": 1.4139803647994995, + "learning_rate": 1.3258122312793663e-07, + "loss": 1.0827, + "step": 27479 + }, + { + "epoch": 0.9841173205364657, + "grad_norm": 1.497002363204956, + "learning_rate": 1.3198481420646504e-07, + "loss": 0.7797, + "step": 27480 + }, + { + "epoch": 0.984153132666034, + "grad_norm": 1.5712330341339111, + "learning_rate": 1.313897488803062e-07, + "loss": 0.9999, + "step": 27481 + }, + { + "epoch": 0.9841889447956023, + "grad_norm": 1.4279823303222656, + "learning_rate": 1.3079602715748706e-07, + "loss": 0.9907, + "step": 27482 + }, + { + "epoch": 0.9842247569251705, + "grad_norm": 1.4181772470474243, + "learning_rate": 1.3020364904597903e-07, + "loss": 1.067, + "step": 27483 + }, + { + "epoch": 0.9842605690547388, + "grad_norm": 1.6890864372253418, + "learning_rate": 1.296126145537646e-07, + "loss": 1.1361, + "step": 27484 + }, + { + "epoch": 0.9842963811843072, + "grad_norm": 1.5378600358963013, + "learning_rate": 1.2902292368878188e-07, + "loss": 1.0025, + "step": 27485 + }, + { + "epoch": 0.9843321933138754, + "grad_norm": 1.458329677581787, + "learning_rate": 1.2843457645896895e-07, + "loss": 0.9546, + "step": 27486 + }, + { + "epoch": 0.9843680054434437, + "grad_norm": 1.2035341262817383, + "learning_rate": 1.278475728722528e-07, + "loss": 1.1118, + "step": 27487 + }, + { + "epoch": 0.984403817573012, + "grad_norm": 1.6584144830703735, + "learning_rate": 1.27261912936516e-07, + "loss": 1.234, + "step": 27488 + }, + { + "epoch": 0.9844396297025803, + "grad_norm": 1.9451148509979248, + "learning_rate": 1.2667759665964118e-07, + "loss": 0.9463, + "step": 27489 + }, + { + "epoch": 0.9844754418321485, + "grad_norm": 1.2957611083984375, + "learning_rate": 1.260946240494998e-07, + "loss": 1.1596, + "step": 27490 + }, + { + "epoch": 0.9845112539617168, + "grad_norm": 1.340820074081421, + "learning_rate": 1.255129951139189e-07, + "loss": 0.8694, + "step": 27491 + }, + { + "epoch": 0.9845470660912852, + "grad_norm": 1.3692522048950195, + "learning_rate": 1.249327098607367e-07, + "loss": 1.0774, + "step": 27492 + }, + { + "epoch": 0.9845828782208534, + "grad_norm": 1.5369693040847778, + "learning_rate": 1.2435376829775803e-07, + "loss": 1.2175, + "step": 27493 + }, + { + "epoch": 0.9846186903504217, + "grad_norm": 1.2599921226501465, + "learning_rate": 1.2377617043276556e-07, + "loss": 0.9707, + "step": 27494 + }, + { + "epoch": 0.98465450247999, + "grad_norm": 1.2252246141433716, + "learning_rate": 1.231999162735309e-07, + "loss": 1.0129, + "step": 27495 + }, + { + "epoch": 0.9846903146095582, + "grad_norm": 1.207600474357605, + "learning_rate": 1.2262500582781445e-07, + "loss": 1.0074, + "step": 27496 + }, + { + "epoch": 0.9847261267391265, + "grad_norm": 1.542514443397522, + "learning_rate": 1.2205143910334338e-07, + "loss": 1.065, + "step": 27497 + }, + { + "epoch": 0.9847619388686948, + "grad_norm": 1.791689395904541, + "learning_rate": 1.2147921610783374e-07, + "loss": 1.3447, + "step": 27498 + }, + { + "epoch": 0.9847977509982632, + "grad_norm": 1.153727650642395, + "learning_rate": 1.209083368490016e-07, + "loss": 0.7541, + "step": 27499 + }, + { + "epoch": 0.9848335631278314, + "grad_norm": 1.7729942798614502, + "learning_rate": 1.2033880133449638e-07, + "loss": 1.1027, + "step": 27500 + }, + { + "epoch": 0.9848693752573997, + "grad_norm": 1.815110445022583, + "learning_rate": 1.1977060957200083e-07, + "loss": 1.1941, + "step": 27501 + }, + { + "epoch": 0.984905187386968, + "grad_norm": 1.4334110021591187, + "learning_rate": 1.1920376156916435e-07, + "loss": 1.3128, + "step": 27502 + }, + { + "epoch": 0.9849409995165362, + "grad_norm": 1.4980491399765015, + "learning_rate": 1.1863825733359201e-07, + "loss": 1.2945, + "step": 27503 + }, + { + "epoch": 0.9849768116461045, + "grad_norm": 1.887199878692627, + "learning_rate": 1.1807409687291104e-07, + "loss": 1.2336, + "step": 27504 + }, + { + "epoch": 0.9850126237756728, + "grad_norm": 1.3003499507904053, + "learning_rate": 1.1751128019470426e-07, + "loss": 1.0837, + "step": 27505 + }, + { + "epoch": 0.9850484359052412, + "grad_norm": 1.5227686166763306, + "learning_rate": 1.1694980730654337e-07, + "loss": 1.0569, + "step": 27506 + }, + { + "epoch": 0.9850842480348094, + "grad_norm": 2.1501359939575195, + "learning_rate": 1.163896782159779e-07, + "loss": 1.1453, + "step": 27507 + }, + { + "epoch": 0.9851200601643777, + "grad_norm": 1.5971529483795166, + "learning_rate": 1.1583089293055738e-07, + "loss": 1.0645, + "step": 27508 + }, + { + "epoch": 0.985155872293946, + "grad_norm": 1.4598960876464844, + "learning_rate": 1.152734514577869e-07, + "loss": 1.0086, + "step": 27509 + }, + { + "epoch": 0.9851916844235142, + "grad_norm": 1.5571163892745972, + "learning_rate": 1.1471735380517156e-07, + "loss": 1.1508, + "step": 27510 + }, + { + "epoch": 0.9852274965530825, + "grad_norm": 1.3777198791503906, + "learning_rate": 1.141625999801943e-07, + "loss": 1.2324, + "step": 27511 + }, + { + "epoch": 0.9852633086826508, + "grad_norm": 1.3279154300689697, + "learning_rate": 1.1360918999030467e-07, + "loss": 1.0478, + "step": 27512 + }, + { + "epoch": 0.9852991208122192, + "grad_norm": 1.6429117918014526, + "learning_rate": 1.1305712384297451e-07, + "loss": 1.0726, + "step": 27513 + }, + { + "epoch": 0.9853349329417874, + "grad_norm": 1.5766957998275757, + "learning_rate": 1.1250640154560898e-07, + "loss": 1.0574, + "step": 27514 + }, + { + "epoch": 0.9853707450713557, + "grad_norm": 1.4396504163742065, + "learning_rate": 1.1195702310561329e-07, + "loss": 1.1762, + "step": 27515 + }, + { + "epoch": 0.985406557200924, + "grad_norm": 1.3574169874191284, + "learning_rate": 1.1140898853040372e-07, + "loss": 0.8459, + "step": 27516 + }, + { + "epoch": 0.9854423693304922, + "grad_norm": 1.3641563653945923, + "learning_rate": 1.1086229782734103e-07, + "loss": 0.9064, + "step": 27517 + }, + { + "epoch": 0.9854781814600605, + "grad_norm": 1.4852182865142822, + "learning_rate": 1.1031695100376382e-07, + "loss": 1.2322, + "step": 27518 + }, + { + "epoch": 0.9855139935896288, + "grad_norm": 1.4974976778030396, + "learning_rate": 1.0977294806703286e-07, + "loss": 1.1839, + "step": 27519 + }, + { + "epoch": 0.9855498057191971, + "grad_norm": 1.611586332321167, + "learning_rate": 1.0923028902446453e-07, + "loss": 1.1328, + "step": 27520 + }, + { + "epoch": 0.9855856178487654, + "grad_norm": 1.2955644130706787, + "learning_rate": 1.0868897388334187e-07, + "loss": 1.341, + "step": 27521 + }, + { + "epoch": 0.9856214299783337, + "grad_norm": 1.2791533470153809, + "learning_rate": 1.0814900265095907e-07, + "loss": 1.18, + "step": 27522 + }, + { + "epoch": 0.985657242107902, + "grad_norm": 1.903195858001709, + "learning_rate": 1.0761037533457696e-07, + "loss": 1.1666, + "step": 27523 + }, + { + "epoch": 0.9856930542374702, + "grad_norm": 2.0780889987945557, + "learning_rate": 1.0707309194145643e-07, + "loss": 1.0169, + "step": 27524 + }, + { + "epoch": 0.9857288663670385, + "grad_norm": 1.7806499004364014, + "learning_rate": 1.0653715247881391e-07, + "loss": 1.128, + "step": 27525 + }, + { + "epoch": 0.9857646784966068, + "grad_norm": 1.4528485536575317, + "learning_rate": 1.0600255695385475e-07, + "loss": 0.9474, + "step": 27526 + }, + { + "epoch": 0.9858004906261751, + "grad_norm": 1.396384835243225, + "learning_rate": 1.054693053737843e-07, + "loss": 1.1661, + "step": 27527 + }, + { + "epoch": 0.9858363027557434, + "grad_norm": 1.3812812566757202, + "learning_rate": 1.049373977457635e-07, + "loss": 1.0042, + "step": 27528 + }, + { + "epoch": 0.9858721148853117, + "grad_norm": 1.7733498811721802, + "learning_rate": 1.0440683407695328e-07, + "loss": 1.1952, + "step": 27529 + }, + { + "epoch": 0.98590792701488, + "grad_norm": 1.215429425239563, + "learning_rate": 1.0387761437449239e-07, + "loss": 1.0741, + "step": 27530 + }, + { + "epoch": 0.9859437391444482, + "grad_norm": 1.4274464845657349, + "learning_rate": 1.0334973864550845e-07, + "loss": 0.8299, + "step": 27531 + }, + { + "epoch": 0.9859795512740165, + "grad_norm": 1.302443027496338, + "learning_rate": 1.0282320689708469e-07, + "loss": 1.1363, + "step": 27532 + }, + { + "epoch": 0.9860153634035848, + "grad_norm": 1.6204419136047363, + "learning_rate": 1.0229801913632653e-07, + "loss": 1.1651, + "step": 27533 + }, + { + "epoch": 0.986051175533153, + "grad_norm": 1.5585885047912598, + "learning_rate": 1.0177417537028389e-07, + "loss": 1.208, + "step": 27534 + }, + { + "epoch": 0.9860869876627214, + "grad_norm": 1.3006436824798584, + "learning_rate": 1.0125167560601778e-07, + "loss": 1.0895, + "step": 27535 + }, + { + "epoch": 0.9861227997922897, + "grad_norm": 1.2601022720336914, + "learning_rate": 1.0073051985054482e-07, + "loss": 0.9105, + "step": 27536 + }, + { + "epoch": 0.9861586119218579, + "grad_norm": 1.413908839225769, + "learning_rate": 1.0021070811088162e-07, + "loss": 0.9857, + "step": 27537 + }, + { + "epoch": 0.9861944240514262, + "grad_norm": 1.4052464962005615, + "learning_rate": 9.969224039403369e-08, + "loss": 1.132, + "step": 27538 + }, + { + "epoch": 0.9862302361809945, + "grad_norm": 1.450364351272583, + "learning_rate": 9.9175116706951e-08, + "loss": 0.7979, + "step": 27539 + }, + { + "epoch": 0.9862660483105627, + "grad_norm": 1.6038504838943481, + "learning_rate": 9.86593370566058e-08, + "loss": 1.0176, + "step": 27540 + }, + { + "epoch": 0.986301860440131, + "grad_norm": 1.4807735681533813, + "learning_rate": 9.814490144993693e-08, + "loss": 1.1589, + "step": 27541 + }, + { + "epoch": 0.9863376725696994, + "grad_norm": 1.5551173686981201, + "learning_rate": 9.763180989386112e-08, + "loss": 0.9285, + "step": 27542 + }, + { + "epoch": 0.9863734846992677, + "grad_norm": 1.5219990015029907, + "learning_rate": 9.712006239529503e-08, + "loss": 0.976, + "step": 27543 + }, + { + "epoch": 0.9864092968288359, + "grad_norm": 1.8108843564987183, + "learning_rate": 9.660965896111095e-08, + "loss": 1.1867, + "step": 27544 + }, + { + "epoch": 0.9864451089584042, + "grad_norm": 1.8448891639709473, + "learning_rate": 9.610059959817008e-08, + "loss": 0.8712, + "step": 27545 + }, + { + "epoch": 0.9864809210879725, + "grad_norm": 1.2971832752227783, + "learning_rate": 9.559288431333357e-08, + "loss": 1.158, + "step": 27546 + }, + { + "epoch": 0.9865167332175407, + "grad_norm": 1.8384499549865723, + "learning_rate": 9.50865131134182e-08, + "loss": 1.0317, + "step": 27547 + }, + { + "epoch": 0.986552545347109, + "grad_norm": 1.350804328918457, + "learning_rate": 9.458148600525185e-08, + "loss": 0.9664, + "step": 27548 + }, + { + "epoch": 0.9865883574766774, + "grad_norm": 1.5469212532043457, + "learning_rate": 9.407780299562908e-08, + "loss": 1.3113, + "step": 27549 + }, + { + "epoch": 0.9866241696062457, + "grad_norm": 1.4802963733673096, + "learning_rate": 9.357546409132223e-08, + "loss": 1.1165, + "step": 27550 + }, + { + "epoch": 0.9866599817358139, + "grad_norm": 1.3435235023498535, + "learning_rate": 9.307446929908148e-08, + "loss": 1.2653, + "step": 27551 + }, + { + "epoch": 0.9866957938653822, + "grad_norm": 1.5781961679458618, + "learning_rate": 9.257481862564587e-08, + "loss": 1.019, + "step": 27552 + }, + { + "epoch": 0.9867316059949505, + "grad_norm": 1.2941007614135742, + "learning_rate": 9.207651207775448e-08, + "loss": 1.0992, + "step": 27553 + }, + { + "epoch": 0.9867674181245187, + "grad_norm": 1.7129271030426025, + "learning_rate": 9.157954966210192e-08, + "loss": 1.1758, + "step": 27554 + }, + { + "epoch": 0.986803230254087, + "grad_norm": 1.7313461303710938, + "learning_rate": 9.108393138536064e-08, + "loss": 0.9677, + "step": 27555 + }, + { + "epoch": 0.9868390423836554, + "grad_norm": 1.2051182985305786, + "learning_rate": 9.05896572542253e-08, + "loss": 0.8876, + "step": 27556 + }, + { + "epoch": 0.9868748545132237, + "grad_norm": 1.9332503080368042, + "learning_rate": 9.009672727533503e-08, + "loss": 1.2662, + "step": 27557 + }, + { + "epoch": 0.9869106666427919, + "grad_norm": 1.5950955152511597, + "learning_rate": 8.960514145530674e-08, + "loss": 0.9568, + "step": 27558 + }, + { + "epoch": 0.9869464787723602, + "grad_norm": 1.357122778892517, + "learning_rate": 8.911489980076848e-08, + "loss": 1.1457, + "step": 27559 + }, + { + "epoch": 0.9869822909019285, + "grad_norm": 1.6153345108032227, + "learning_rate": 8.862600231832607e-08, + "loss": 1.1285, + "step": 27560 + }, + { + "epoch": 0.9870181030314967, + "grad_norm": 1.1974953413009644, + "learning_rate": 8.813844901452984e-08, + "loss": 1.1206, + "step": 27561 + }, + { + "epoch": 0.987053915161065, + "grad_norm": 1.47188138961792, + "learning_rate": 8.765223989596338e-08, + "loss": 1.1169, + "step": 27562 + }, + { + "epoch": 0.9870897272906334, + "grad_norm": 1.8456473350524902, + "learning_rate": 8.716737496915483e-08, + "loss": 1.0571, + "step": 27563 + }, + { + "epoch": 0.9871255394202016, + "grad_norm": 1.2746301889419556, + "learning_rate": 8.66838542406212e-08, + "loss": 0.9606, + "step": 27564 + }, + { + "epoch": 0.9871613515497699, + "grad_norm": 1.4234386682510376, + "learning_rate": 8.620167771689058e-08, + "loss": 0.9674, + "step": 27565 + }, + { + "epoch": 0.9871971636793382, + "grad_norm": 1.4579520225524902, + "learning_rate": 8.572084540443558e-08, + "loss": 1.0795, + "step": 27566 + }, + { + "epoch": 0.9872329758089065, + "grad_norm": 1.4646751880645752, + "learning_rate": 8.524135730971772e-08, + "loss": 1.0157, + "step": 27567 + }, + { + "epoch": 0.9872687879384747, + "grad_norm": 1.5137327909469604, + "learning_rate": 8.476321343920957e-08, + "loss": 1.0009, + "step": 27568 + }, + { + "epoch": 0.987304600068043, + "grad_norm": 1.7731704711914062, + "learning_rate": 8.428641379931713e-08, + "loss": 1.0848, + "step": 27569 + }, + { + "epoch": 0.9873404121976114, + "grad_norm": 1.2175911664962769, + "learning_rate": 8.381095839647967e-08, + "loss": 0.9935, + "step": 27570 + }, + { + "epoch": 0.9873762243271796, + "grad_norm": 1.512169599533081, + "learning_rate": 8.333684723708102e-08, + "loss": 0.9719, + "step": 27571 + }, + { + "epoch": 0.9874120364567479, + "grad_norm": 1.3633639812469482, + "learning_rate": 8.286408032749382e-08, + "loss": 1.1914, + "step": 27572 + }, + { + "epoch": 0.9874478485863162, + "grad_norm": 1.2438606023788452, + "learning_rate": 8.239265767410187e-08, + "loss": 0.944, + "step": 27573 + }, + { + "epoch": 0.9874836607158844, + "grad_norm": 1.38381028175354, + "learning_rate": 8.192257928322233e-08, + "loss": 1.082, + "step": 27574 + }, + { + "epoch": 0.9875194728454527, + "grad_norm": 1.6180058717727661, + "learning_rate": 8.145384516118349e-08, + "loss": 1.0784, + "step": 27575 + }, + { + "epoch": 0.987555284975021, + "grad_norm": 1.3836427927017212, + "learning_rate": 8.098645531431359e-08, + "loss": 1.0147, + "step": 27576 + }, + { + "epoch": 0.9875910971045894, + "grad_norm": 1.5267466306686401, + "learning_rate": 8.052040974887432e-08, + "loss": 1.1621, + "step": 27577 + }, + { + "epoch": 0.9876269092341576, + "grad_norm": 1.2027134895324707, + "learning_rate": 8.005570847113841e-08, + "loss": 0.9104, + "step": 27578 + }, + { + "epoch": 0.9876627213637259, + "grad_norm": 1.9326460361480713, + "learning_rate": 7.959235148737865e-08, + "loss": 1.0782, + "step": 27579 + }, + { + "epoch": 0.9876985334932942, + "grad_norm": 1.417685627937317, + "learning_rate": 7.913033880381226e-08, + "loss": 1.1404, + "step": 27580 + }, + { + "epoch": 0.9877343456228624, + "grad_norm": 1.4262018203735352, + "learning_rate": 7.866967042665651e-08, + "loss": 1.0199, + "step": 27581 + }, + { + "epoch": 0.9877701577524307, + "grad_norm": 1.9669054746627808, + "learning_rate": 7.821034636211755e-08, + "loss": 1.0427, + "step": 27582 + }, + { + "epoch": 0.987805969881999, + "grad_norm": 1.572965383529663, + "learning_rate": 7.77523666163571e-08, + "loss": 1.0748, + "step": 27583 + }, + { + "epoch": 0.9878417820115674, + "grad_norm": 1.56106436252594, + "learning_rate": 7.729573119555911e-08, + "loss": 1.1541, + "step": 27584 + }, + { + "epoch": 0.9878775941411356, + "grad_norm": 1.6818976402282715, + "learning_rate": 7.684044010585201e-08, + "loss": 1.072, + "step": 27585 + }, + { + "epoch": 0.9879134062707039, + "grad_norm": 1.4383383989334106, + "learning_rate": 7.638649335336423e-08, + "loss": 1.1884, + "step": 27586 + }, + { + "epoch": 0.9879492184002722, + "grad_norm": 1.4299453496932983, + "learning_rate": 7.593389094420201e-08, + "loss": 1.0098, + "step": 27587 + }, + { + "epoch": 0.9879850305298404, + "grad_norm": 1.3673791885375977, + "learning_rate": 7.548263288446045e-08, + "loss": 1.1221, + "step": 27588 + }, + { + "epoch": 0.9880208426594087, + "grad_norm": 1.5209060907363892, + "learning_rate": 7.503271918020138e-08, + "loss": 0.9479, + "step": 27589 + }, + { + "epoch": 0.988056654788977, + "grad_norm": 1.8136937618255615, + "learning_rate": 7.458414983748663e-08, + "loss": 1.2307, + "step": 27590 + }, + { + "epoch": 0.9880924669185454, + "grad_norm": 1.4080394506454468, + "learning_rate": 7.41369248623447e-08, + "loss": 0.8708, + "step": 27591 + }, + { + "epoch": 0.9881282790481136, + "grad_norm": 2.2414894104003906, + "learning_rate": 7.369104426080409e-08, + "loss": 1.0713, + "step": 27592 + }, + { + "epoch": 0.9881640911776819, + "grad_norm": 1.4520658254623413, + "learning_rate": 7.324650803884891e-08, + "loss": 1.2243, + "step": 27593 + }, + { + "epoch": 0.9881999033072502, + "grad_norm": 1.285894513130188, + "learning_rate": 7.280331620246328e-08, + "loss": 1.0392, + "step": 27594 + }, + { + "epoch": 0.9882357154368184, + "grad_norm": 1.6417796611785889, + "learning_rate": 7.236146875762017e-08, + "loss": 1.2921, + "step": 27595 + }, + { + "epoch": 0.9882715275663867, + "grad_norm": 1.2904173135757446, + "learning_rate": 7.19209657102482e-08, + "loss": 0.946, + "step": 27596 + }, + { + "epoch": 0.988307339695955, + "grad_norm": 1.4370306730270386, + "learning_rate": 7.148180706628704e-08, + "loss": 0.8823, + "step": 27597 + }, + { + "epoch": 0.9883431518255233, + "grad_norm": 1.3577415943145752, + "learning_rate": 7.104399283163199e-08, + "loss": 1.0159, + "step": 27598 + }, + { + "epoch": 0.9883789639550916, + "grad_norm": 1.4002429246902466, + "learning_rate": 7.060752301218942e-08, + "loss": 0.9528, + "step": 27599 + }, + { + "epoch": 0.9884147760846599, + "grad_norm": 1.4039676189422607, + "learning_rate": 7.017239761381022e-08, + "loss": 1.039, + "step": 27600 + }, + { + "epoch": 0.9884505882142282, + "grad_norm": 1.374047875404358, + "learning_rate": 6.973861664237857e-08, + "loss": 1.0804, + "step": 27601 + }, + { + "epoch": 0.9884864003437964, + "grad_norm": 1.471638560295105, + "learning_rate": 6.930618010370094e-08, + "loss": 1.0422, + "step": 27602 + }, + { + "epoch": 0.9885222124733647, + "grad_norm": 1.3463810682296753, + "learning_rate": 6.887508800361708e-08, + "loss": 0.9695, + "step": 27603 + }, + { + "epoch": 0.988558024602933, + "grad_norm": 1.427869200706482, + "learning_rate": 6.844534034791128e-08, + "loss": 0.9322, + "step": 27604 + }, + { + "epoch": 0.9885938367325013, + "grad_norm": 1.272656798362732, + "learning_rate": 6.801693714236779e-08, + "loss": 1.1037, + "step": 27605 + }, + { + "epoch": 0.9886296488620696, + "grad_norm": 1.2473417520523071, + "learning_rate": 6.758987839275976e-08, + "loss": 1.0818, + "step": 27606 + }, + { + "epoch": 0.9886654609916379, + "grad_norm": 1.5140964984893799, + "learning_rate": 6.716416410481596e-08, + "loss": 1.1484, + "step": 27607 + }, + { + "epoch": 0.9887012731212061, + "grad_norm": 1.4533019065856934, + "learning_rate": 6.673979428428733e-08, + "loss": 1.0616, + "step": 27608 + }, + { + "epoch": 0.9887370852507744, + "grad_norm": 1.5534924268722534, + "learning_rate": 6.631676893685823e-08, + "loss": 1.239, + "step": 27609 + }, + { + "epoch": 0.9887728973803427, + "grad_norm": 1.4410197734832764, + "learning_rate": 6.589508806823518e-08, + "loss": 0.9173, + "step": 27610 + }, + { + "epoch": 0.988808709509911, + "grad_norm": 1.5059462785720825, + "learning_rate": 6.547475168409145e-08, + "loss": 0.9203, + "step": 27611 + }, + { + "epoch": 0.9888445216394793, + "grad_norm": 1.3585379123687744, + "learning_rate": 6.505575979007805e-08, + "loss": 1.0123, + "step": 27612 + }, + { + "epoch": 0.9888803337690476, + "grad_norm": 1.448473572731018, + "learning_rate": 6.463811239183492e-08, + "loss": 0.9905, + "step": 27613 + }, + { + "epoch": 0.9889161458986159, + "grad_norm": 1.2843048572540283, + "learning_rate": 6.42218094949798e-08, + "loss": 0.9736, + "step": 27614 + }, + { + "epoch": 0.9889519580281841, + "grad_norm": 1.7601364850997925, + "learning_rate": 6.38068511051082e-08, + "loss": 1.1176, + "step": 27615 + }, + { + "epoch": 0.9889877701577524, + "grad_norm": 1.4099684953689575, + "learning_rate": 6.339323722780455e-08, + "loss": 1.1932, + "step": 27616 + }, + { + "epoch": 0.9890235822873207, + "grad_norm": 1.393427848815918, + "learning_rate": 6.298096786864216e-08, + "loss": 1.0198, + "step": 27617 + }, + { + "epoch": 0.989059394416889, + "grad_norm": 1.5659425258636475, + "learning_rate": 6.257004303316106e-08, + "loss": 1.2165, + "step": 27618 + }, + { + "epoch": 0.9890952065464573, + "grad_norm": 1.8837553262710571, + "learning_rate": 6.216046272687904e-08, + "loss": 1.1369, + "step": 27619 + }, + { + "epoch": 0.9891310186760256, + "grad_norm": 1.7967802286148071, + "learning_rate": 6.1752226955325e-08, + "loss": 1.1722, + "step": 27620 + }, + { + "epoch": 0.9891668308055939, + "grad_norm": 1.3427070379257202, + "learning_rate": 6.134533572398349e-08, + "loss": 1.1381, + "step": 27621 + }, + { + "epoch": 0.9892026429351621, + "grad_norm": 1.4787260293960571, + "learning_rate": 6.093978903833897e-08, + "loss": 1.1057, + "step": 27622 + }, + { + "epoch": 0.9892384550647304, + "grad_norm": 1.5546761751174927, + "learning_rate": 6.053558690382045e-08, + "loss": 1.0361, + "step": 27623 + }, + { + "epoch": 0.9892742671942987, + "grad_norm": 1.9652549028396606, + "learning_rate": 6.013272932590131e-08, + "loss": 1.1051, + "step": 27624 + }, + { + "epoch": 0.9893100793238669, + "grad_norm": 1.4945075511932373, + "learning_rate": 5.973121630996615e-08, + "loss": 0.7986, + "step": 27625 + }, + { + "epoch": 0.9893458914534353, + "grad_norm": 1.311902642250061, + "learning_rate": 5.9331047861443944e-08, + "loss": 1.0254, + "step": 27626 + }, + { + "epoch": 0.9893817035830036, + "grad_norm": 1.9457169771194458, + "learning_rate": 5.893222398569709e-08, + "loss": 1.0702, + "step": 27627 + }, + { + "epoch": 0.9894175157125719, + "grad_norm": 1.2771391868591309, + "learning_rate": 5.8534744688110156e-08, + "loss": 0.7672, + "step": 27628 + }, + { + "epoch": 0.9894533278421401, + "grad_norm": 1.6251219511032104, + "learning_rate": 5.8138609974023316e-08, + "loss": 0.9699, + "step": 27629 + }, + { + "epoch": 0.9894891399717084, + "grad_norm": 1.422921895980835, + "learning_rate": 5.774381984876565e-08, + "loss": 1.1711, + "step": 27630 + }, + { + "epoch": 0.9895249521012767, + "grad_norm": 1.2591806650161743, + "learning_rate": 5.735037431765511e-08, + "loss": 0.8875, + "step": 27631 + }, + { + "epoch": 0.9895607642308449, + "grad_norm": 1.5583395957946777, + "learning_rate": 5.6958273385965264e-08, + "loss": 0.932, + "step": 27632 + }, + { + "epoch": 0.9895965763604133, + "grad_norm": 1.6513609886169434, + "learning_rate": 5.656751705899188e-08, + "loss": 1.1593, + "step": 27633 + }, + { + "epoch": 0.9896323884899816, + "grad_norm": 1.5150227546691895, + "learning_rate": 5.617810534198631e-08, + "loss": 0.8553, + "step": 27634 + }, + { + "epoch": 0.9896682006195499, + "grad_norm": 1.417524814605713, + "learning_rate": 5.57900382401777e-08, + "loss": 0.9015, + "step": 27635 + }, + { + "epoch": 0.9897040127491181, + "grad_norm": 2.0789215564727783, + "learning_rate": 5.540331575880631e-08, + "loss": 1.4303, + "step": 27636 + }, + { + "epoch": 0.9897398248786864, + "grad_norm": 1.4218438863754272, + "learning_rate": 5.501793790305687e-08, + "loss": 0.9743, + "step": 27637 + }, + { + "epoch": 0.9897756370082547, + "grad_norm": 1.3632233142852783, + "learning_rate": 5.4633904678125234e-08, + "loss": 1.0977, + "step": 27638 + }, + { + "epoch": 0.9898114491378229, + "grad_norm": 1.5222090482711792, + "learning_rate": 5.425121608917394e-08, + "loss": 1.1833, + "step": 27639 + }, + { + "epoch": 0.9898472612673913, + "grad_norm": 1.2600082159042358, + "learning_rate": 5.3869872141343313e-08, + "loss": 0.912, + "step": 27640 + }, + { + "epoch": 0.9898830733969596, + "grad_norm": 1.4898957014083862, + "learning_rate": 5.348987283978479e-08, + "loss": 1.1992, + "step": 27641 + }, + { + "epoch": 0.9899188855265278, + "grad_norm": 1.4353675842285156, + "learning_rate": 5.3111218189594304e-08, + "loss": 0.9226, + "step": 27642 + }, + { + "epoch": 0.9899546976560961, + "grad_norm": 1.3440052270889282, + "learning_rate": 5.2733908195867764e-08, + "loss": 1.1286, + "step": 27643 + }, + { + "epoch": 0.9899905097856644, + "grad_norm": 1.349199891090393, + "learning_rate": 5.23579428636789e-08, + "loss": 1.0687, + "step": 27644 + }, + { + "epoch": 0.9900263219152327, + "grad_norm": 1.264988660812378, + "learning_rate": 5.1983322198101425e-08, + "loss": 1.1293, + "step": 27645 + }, + { + "epoch": 0.9900621340448009, + "grad_norm": 1.780363917350769, + "learning_rate": 5.161004620416465e-08, + "loss": 0.8623, + "step": 27646 + }, + { + "epoch": 0.9900979461743693, + "grad_norm": 1.4824028015136719, + "learning_rate": 5.1238114886875685e-08, + "loss": 1.0674, + "step": 27647 + }, + { + "epoch": 0.9901337583039376, + "grad_norm": 1.5219635963439941, + "learning_rate": 5.086752825126384e-08, + "loss": 1.1246, + "step": 27648 + }, + { + "epoch": 0.9901695704335058, + "grad_norm": 1.5889519453048706, + "learning_rate": 5.049828630230291e-08, + "loss": 1.0205, + "step": 27649 + }, + { + "epoch": 0.9902053825630741, + "grad_norm": 2.3107101917266846, + "learning_rate": 5.01303890449667e-08, + "loss": 1.2728, + "step": 27650 + }, + { + "epoch": 0.9902411946926424, + "grad_norm": 1.7651376724243164, + "learning_rate": 4.976383648419569e-08, + "loss": 0.9981, + "step": 27651 + }, + { + "epoch": 0.9902770068222106, + "grad_norm": 1.565342664718628, + "learning_rate": 4.9398628624930385e-08, + "loss": 1.0713, + "step": 27652 + }, + { + "epoch": 0.9903128189517789, + "grad_norm": 1.3365628719329834, + "learning_rate": 4.903476547206687e-08, + "loss": 0.9458, + "step": 27653 + }, + { + "epoch": 0.9903486310813473, + "grad_norm": 1.3963887691497803, + "learning_rate": 4.8672247030523425e-08, + "loss": 0.9757, + "step": 27654 + }, + { + "epoch": 0.9903844432109156, + "grad_norm": 1.6301342248916626, + "learning_rate": 4.8311073305162825e-08, + "loss": 0.9916, + "step": 27655 + }, + { + "epoch": 0.9904202553404838, + "grad_norm": 1.953139066696167, + "learning_rate": 4.795124430085896e-08, + "loss": 0.9033, + "step": 27656 + }, + { + "epoch": 0.9904560674700521, + "grad_norm": 1.6313577890396118, + "learning_rate": 4.7592760022430185e-08, + "loss": 0.99, + "step": 27657 + }, + { + "epoch": 0.9904918795996204, + "grad_norm": 1.234663963317871, + "learning_rate": 4.723562047471708e-08, + "loss": 1.2201, + "step": 27658 + }, + { + "epoch": 0.9905276917291886, + "grad_norm": 1.3851481676101685, + "learning_rate": 4.687982566251581e-08, + "loss": 1.0405, + "step": 27659 + }, + { + "epoch": 0.9905635038587569, + "grad_norm": 1.423596739768982, + "learning_rate": 4.652537559062253e-08, + "loss": 0.9984, + "step": 27660 + }, + { + "epoch": 0.9905993159883253, + "grad_norm": 1.3022617101669312, + "learning_rate": 4.617227026378901e-08, + "loss": 0.9273, + "step": 27661 + }, + { + "epoch": 0.9906351281178936, + "grad_norm": 1.396947979927063, + "learning_rate": 4.582050968677809e-08, + "loss": 0.9932, + "step": 27662 + }, + { + "epoch": 0.9906709402474618, + "grad_norm": 1.3669391870498657, + "learning_rate": 4.5470093864330435e-08, + "loss": 1.1764, + "step": 27663 + }, + { + "epoch": 0.9907067523770301, + "grad_norm": 1.6718096733093262, + "learning_rate": 4.5121022801142275e-08, + "loss": 1.0127, + "step": 27664 + }, + { + "epoch": 0.9907425645065984, + "grad_norm": 1.4012477397918701, + "learning_rate": 4.477329650192097e-08, + "loss": 0.9716, + "step": 27665 + }, + { + "epoch": 0.9907783766361666, + "grad_norm": 1.3886672258377075, + "learning_rate": 4.442691497134055e-08, + "loss": 0.9556, + "step": 27666 + }, + { + "epoch": 0.9908141887657349, + "grad_norm": 1.3919391632080078, + "learning_rate": 4.408187821406395e-08, + "loss": 1.253, + "step": 27667 + }, + { + "epoch": 0.9908500008953033, + "grad_norm": 1.3217023611068726, + "learning_rate": 4.373818623473191e-08, + "loss": 1.069, + "step": 27668 + }, + { + "epoch": 0.9908858130248716, + "grad_norm": 1.4588134288787842, + "learning_rate": 4.3395839037962956e-08, + "loss": 1.078, + "step": 27669 + }, + { + "epoch": 0.9909216251544398, + "grad_norm": 1.4926037788391113, + "learning_rate": 4.305483662837562e-08, + "loss": 1.1599, + "step": 27670 + }, + { + "epoch": 0.9909574372840081, + "grad_norm": 1.8088563680648804, + "learning_rate": 4.2715179010555106e-08, + "loss": 1.0789, + "step": 27671 + }, + { + "epoch": 0.9909932494135764, + "grad_norm": 1.3889888525009155, + "learning_rate": 4.2376866189053346e-08, + "loss": 1.0663, + "step": 27672 + }, + { + "epoch": 0.9910290615431446, + "grad_norm": 1.6659222841262817, + "learning_rate": 4.2039898168444445e-08, + "loss": 1.0419, + "step": 27673 + }, + { + "epoch": 0.9910648736727129, + "grad_norm": 1.6931540966033936, + "learning_rate": 4.170427495324702e-08, + "loss": 1.1301, + "step": 27674 + }, + { + "epoch": 0.9911006858022813, + "grad_norm": 1.2651867866516113, + "learning_rate": 4.1369996547979685e-08, + "loss": 0.856, + "step": 27675 + }, + { + "epoch": 0.9911364979318495, + "grad_norm": 1.7764265537261963, + "learning_rate": 4.1037062957138825e-08, + "loss": 1.1788, + "step": 27676 + }, + { + "epoch": 0.9911723100614178, + "grad_norm": 1.449328064918518, + "learning_rate": 4.070547418522086e-08, + "loss": 0.9871, + "step": 27677 + }, + { + "epoch": 0.9912081221909861, + "grad_norm": 1.658101201057434, + "learning_rate": 4.037523023666667e-08, + "loss": 1.1636, + "step": 27678 + }, + { + "epoch": 0.9912439343205544, + "grad_norm": 1.4364286661148071, + "learning_rate": 4.0046331115917157e-08, + "loss": 1.0716, + "step": 27679 + }, + { + "epoch": 0.9912797464501226, + "grad_norm": 1.271278977394104, + "learning_rate": 3.9718776827413204e-08, + "loss": 1.086, + "step": 27680 + }, + { + "epoch": 0.9913155585796909, + "grad_norm": 1.5581222772598267, + "learning_rate": 3.9392567375551306e-08, + "loss": 0.7917, + "step": 27681 + }, + { + "epoch": 0.9913513707092593, + "grad_norm": 1.5096189975738525, + "learning_rate": 3.906770276471683e-08, + "loss": 0.9941, + "step": 27682 + }, + { + "epoch": 0.9913871828388275, + "grad_norm": 1.4725536108016968, + "learning_rate": 3.8744182999295164e-08, + "loss": 1.1383, + "step": 27683 + }, + { + "epoch": 0.9914229949683958, + "grad_norm": 1.7604690790176392, + "learning_rate": 3.842200808362728e-08, + "loss": 1.1978, + "step": 27684 + }, + { + "epoch": 0.9914588070979641, + "grad_norm": 1.6444908380508423, + "learning_rate": 3.810117802204305e-08, + "loss": 1.0273, + "step": 27685 + }, + { + "epoch": 0.9914946192275323, + "grad_norm": 1.5885741710662842, + "learning_rate": 3.778169281887234e-08, + "loss": 1.0741, + "step": 27686 + }, + { + "epoch": 0.9915304313571006, + "grad_norm": 1.170138955116272, + "learning_rate": 3.746355247841171e-08, + "loss": 0.8926, + "step": 27687 + }, + { + "epoch": 0.9915662434866689, + "grad_norm": 1.6561795473098755, + "learning_rate": 3.7146757004924425e-08, + "loss": 0.8845, + "step": 27688 + }, + { + "epoch": 0.9916020556162373, + "grad_norm": 1.457722783088684, + "learning_rate": 3.683130640269594e-08, + "loss": 1.0678, + "step": 27689 + }, + { + "epoch": 0.9916378677458055, + "grad_norm": 1.590030550956726, + "learning_rate": 3.651720067595621e-08, + "loss": 1.078, + "step": 27690 + }, + { + "epoch": 0.9916736798753738, + "grad_norm": 1.4346303939819336, + "learning_rate": 3.620443982892407e-08, + "loss": 1.1259, + "step": 27691 + }, + { + "epoch": 0.9917094920049421, + "grad_norm": 1.712297797203064, + "learning_rate": 3.589302386582949e-08, + "loss": 1.1121, + "step": 27692 + }, + { + "epoch": 0.9917453041345103, + "grad_norm": 1.4394522905349731, + "learning_rate": 3.558295279084689e-08, + "loss": 1.0898, + "step": 27693 + }, + { + "epoch": 0.9917811162640786, + "grad_norm": 1.4605268239974976, + "learning_rate": 3.527422660815072e-08, + "loss": 1.0025, + "step": 27694 + }, + { + "epoch": 0.9918169283936469, + "grad_norm": 1.8551429510116577, + "learning_rate": 3.4966845321893204e-08, + "loss": 1.0817, + "step": 27695 + }, + { + "epoch": 0.9918527405232153, + "grad_norm": 1.9540011882781982, + "learning_rate": 3.4660808936215485e-08, + "loss": 1.1225, + "step": 27696 + }, + { + "epoch": 0.9918885526527835, + "grad_norm": 1.3130738735198975, + "learning_rate": 3.435611745522538e-08, + "loss": 1.1197, + "step": 27697 + }, + { + "epoch": 0.9919243647823518, + "grad_norm": 1.5972137451171875, + "learning_rate": 3.405277088301961e-08, + "loss": 0.9843, + "step": 27698 + }, + { + "epoch": 0.9919601769119201, + "grad_norm": 1.326250672340393, + "learning_rate": 3.375076922370601e-08, + "loss": 0.9378, + "step": 27699 + }, + { + "epoch": 0.9919959890414883, + "grad_norm": 1.3209428787231445, + "learning_rate": 3.345011248131469e-08, + "loss": 1.0935, + "step": 27700 + }, + { + "epoch": 0.9920318011710566, + "grad_norm": 1.5094331502914429, + "learning_rate": 3.3150800659909055e-08, + "loss": 1.0918, + "step": 27701 + }, + { + "epoch": 0.9920676133006249, + "grad_norm": 1.6733571290969849, + "learning_rate": 3.285283376350812e-08, + "loss": 0.9949, + "step": 27702 + }, + { + "epoch": 0.9921034254301933, + "grad_norm": 1.2259957790374756, + "learning_rate": 3.255621179613089e-08, + "loss": 1.018, + "step": 27703 + }, + { + "epoch": 0.9921392375597615, + "grad_norm": 1.5949376821517944, + "learning_rate": 3.226093476175196e-08, + "loss": 1.0258, + "step": 27704 + }, + { + "epoch": 0.9921750496893298, + "grad_norm": 1.2323143482208252, + "learning_rate": 3.1967002664357036e-08, + "loss": 0.881, + "step": 27705 + }, + { + "epoch": 0.9922108618188981, + "grad_norm": 1.5417269468307495, + "learning_rate": 3.167441550789851e-08, + "loss": 1.1883, + "step": 27706 + }, + { + "epoch": 0.9922466739484663, + "grad_norm": 1.640535593032837, + "learning_rate": 3.138317329630658e-08, + "loss": 1.0783, + "step": 27707 + }, + { + "epoch": 0.9922824860780346, + "grad_norm": 1.4381440877914429, + "learning_rate": 3.109327603351142e-08, + "loss": 0.9967, + "step": 27708 + }, + { + "epoch": 0.9923182982076029, + "grad_norm": 1.3508025407791138, + "learning_rate": 3.080472372339882e-08, + "loss": 1.2188, + "step": 27709 + }, + { + "epoch": 0.9923541103371712, + "grad_norm": 1.3767656087875366, + "learning_rate": 3.0517516369865665e-08, + "loss": 1.0029, + "step": 27710 + }, + { + "epoch": 0.9923899224667395, + "grad_norm": 3.0189602375030518, + "learning_rate": 3.0231653976764415e-08, + "loss": 0.9658, + "step": 27711 + }, + { + "epoch": 0.9924257345963078, + "grad_norm": 1.4129972457885742, + "learning_rate": 2.994713654793646e-08, + "loss": 1.1781, + "step": 27712 + }, + { + "epoch": 0.992461546725876, + "grad_norm": 1.5087727308273315, + "learning_rate": 2.966396408722316e-08, + "loss": 0.9938, + "step": 27713 + }, + { + "epoch": 0.9924973588554443, + "grad_norm": 2.057483196258545, + "learning_rate": 2.9382136598432587e-08, + "loss": 0.8253, + "step": 27714 + }, + { + "epoch": 0.9925331709850126, + "grad_norm": 1.5585182905197144, + "learning_rate": 2.9101654085350594e-08, + "loss": 1.0237, + "step": 27715 + }, + { + "epoch": 0.9925689831145809, + "grad_norm": 1.3579484224319458, + "learning_rate": 2.8822516551751942e-08, + "loss": 0.9194, + "step": 27716 + }, + { + "epoch": 0.9926047952441492, + "grad_norm": 1.4384316205978394, + "learning_rate": 2.854472400138919e-08, + "loss": 0.9399, + "step": 27717 + }, + { + "epoch": 0.9926406073737175, + "grad_norm": 1.5773801803588867, + "learning_rate": 2.8268276438003782e-08, + "loss": 1.1145, + "step": 27718 + }, + { + "epoch": 0.9926764195032858, + "grad_norm": 1.5596765279769897, + "learning_rate": 2.799317386531497e-08, + "loss": 1.208, + "step": 27719 + }, + { + "epoch": 0.992712231632854, + "grad_norm": 1.7204127311706543, + "learning_rate": 2.7719416287030897e-08, + "loss": 1.0633, + "step": 27720 + }, + { + "epoch": 0.9927480437624223, + "grad_norm": 1.3638678789138794, + "learning_rate": 2.74470037068264e-08, + "loss": 1.0797, + "step": 27721 + }, + { + "epoch": 0.9927838558919906, + "grad_norm": 1.2090855836868286, + "learning_rate": 2.717593612835412e-08, + "loss": 1.0361, + "step": 27722 + }, + { + "epoch": 0.9928196680215589, + "grad_norm": 1.5854138135910034, + "learning_rate": 2.6906213555288884e-08, + "loss": 1.1002, + "step": 27723 + }, + { + "epoch": 0.9928554801511272, + "grad_norm": 1.3594037294387817, + "learning_rate": 2.6637835991238924e-08, + "loss": 0.9172, + "step": 27724 + }, + { + "epoch": 0.9928912922806955, + "grad_norm": 1.4629255533218384, + "learning_rate": 2.6370803439812463e-08, + "loss": 0.8646, + "step": 27725 + }, + { + "epoch": 0.9929271044102638, + "grad_norm": 1.2687807083129883, + "learning_rate": 2.6105115904617726e-08, + "loss": 1.1735, + "step": 27726 + }, + { + "epoch": 0.992962916539832, + "grad_norm": 1.3806689977645874, + "learning_rate": 2.584077338921853e-08, + "loss": 0.9529, + "step": 27727 + }, + { + "epoch": 0.9929987286694003, + "grad_norm": 1.660772681236267, + "learning_rate": 2.557777589717869e-08, + "loss": 1.0805, + "step": 27728 + }, + { + "epoch": 0.9930345407989686, + "grad_norm": 1.6044869422912598, + "learning_rate": 2.5316123432028714e-08, + "loss": 1.2216, + "step": 27729 + }, + { + "epoch": 0.9930703529285368, + "grad_norm": 1.6122312545776367, + "learning_rate": 2.5055815997299113e-08, + "loss": 1.27, + "step": 27730 + }, + { + "epoch": 0.9931061650581052, + "grad_norm": 1.4263060092926025, + "learning_rate": 2.479685359647599e-08, + "loss": 1.0369, + "step": 27731 + }, + { + "epoch": 0.9931419771876735, + "grad_norm": 1.6894383430480957, + "learning_rate": 2.453923623305654e-08, + "loss": 1.1065, + "step": 27732 + }, + { + "epoch": 0.9931777893172418, + "grad_norm": 1.7193357944488525, + "learning_rate": 2.4282963910504664e-08, + "loss": 1.198, + "step": 27733 + }, + { + "epoch": 0.99321360144681, + "grad_norm": 1.4143182039260864, + "learning_rate": 2.4028036632262053e-08, + "loss": 1.0612, + "step": 27734 + }, + { + "epoch": 0.9932494135763783, + "grad_norm": 1.7577720880508423, + "learning_rate": 2.3774454401770396e-08, + "loss": 1.0912, + "step": 27735 + }, + { + "epoch": 0.9932852257059466, + "grad_norm": 1.4437111616134644, + "learning_rate": 2.3522217222426978e-08, + "loss": 1.1021, + "step": 27736 + }, + { + "epoch": 0.9933210378355148, + "grad_norm": 1.5835072994232178, + "learning_rate": 2.3271325097629082e-08, + "loss": 1.08, + "step": 27737 + }, + { + "epoch": 0.9933568499650832, + "grad_norm": 1.3294556140899658, + "learning_rate": 2.3021778030751784e-08, + "loss": 1.0615, + "step": 27738 + }, + { + "epoch": 0.9933926620946515, + "grad_norm": 1.33971107006073, + "learning_rate": 2.2773576025170163e-08, + "loss": 0.9892, + "step": 27739 + }, + { + "epoch": 0.9934284742242198, + "grad_norm": 1.4302659034729004, + "learning_rate": 2.2526719084192683e-08, + "loss": 1.0463, + "step": 27740 + }, + { + "epoch": 0.993464286353788, + "grad_norm": 1.230095624923706, + "learning_rate": 2.2281207211172218e-08, + "loss": 1.0568, + "step": 27741 + }, + { + "epoch": 0.9935000984833563, + "grad_norm": 1.5333905220031738, + "learning_rate": 2.2037040409383924e-08, + "loss": 1.0414, + "step": 27742 + }, + { + "epoch": 0.9935359106129246, + "grad_norm": 1.6751846075057983, + "learning_rate": 2.1794218682125168e-08, + "loss": 1.1409, + "step": 27743 + }, + { + "epoch": 0.9935717227424928, + "grad_norm": 1.3760696649551392, + "learning_rate": 2.15527420326711e-08, + "loss": 1.2413, + "step": 27744 + }, + { + "epoch": 0.9936075348720612, + "grad_norm": 1.3360587358474731, + "learning_rate": 2.131261046425248e-08, + "loss": 0.9714, + "step": 27745 + }, + { + "epoch": 0.9936433470016295, + "grad_norm": 1.5851085186004639, + "learning_rate": 2.107382398011115e-08, + "loss": 1.0709, + "step": 27746 + }, + { + "epoch": 0.9936791591311978, + "grad_norm": 1.494686245918274, + "learning_rate": 2.0836382583466762e-08, + "loss": 1.1745, + "step": 27747 + }, + { + "epoch": 0.993714971260766, + "grad_norm": 1.4142203330993652, + "learning_rate": 2.0600286277494552e-08, + "loss": 0.9196, + "step": 27748 + }, + { + "epoch": 0.9937507833903343, + "grad_norm": 1.278878092765808, + "learning_rate": 2.0365535065391962e-08, + "loss": 0.9796, + "step": 27749 + }, + { + "epoch": 0.9937865955199026, + "grad_norm": 1.402613878250122, + "learning_rate": 2.013212895030092e-08, + "loss": 1.0339, + "step": 27750 + }, + { + "epoch": 0.9938224076494708, + "grad_norm": 1.842592477798462, + "learning_rate": 1.9900067935363364e-08, + "loss": 1.1176, + "step": 27751 + }, + { + "epoch": 0.9938582197790392, + "grad_norm": 1.434018611907959, + "learning_rate": 1.966935202371012e-08, + "loss": 1.1554, + "step": 27752 + }, + { + "epoch": 0.9938940319086075, + "grad_norm": 1.9200348854064941, + "learning_rate": 1.9439981218438708e-08, + "loss": 1.129, + "step": 27753 + }, + { + "epoch": 0.9939298440381757, + "grad_norm": 1.5200790166854858, + "learning_rate": 1.921195552263555e-08, + "loss": 1.1012, + "step": 27754 + }, + { + "epoch": 0.993965656167744, + "grad_norm": 1.4663357734680176, + "learning_rate": 1.8985274939375962e-08, + "loss": 1.149, + "step": 27755 + }, + { + "epoch": 0.9940014682973123, + "grad_norm": 1.520316243171692, + "learning_rate": 1.8759939471690858e-08, + "loss": 1.0721, + "step": 27756 + }, + { + "epoch": 0.9940372804268806, + "grad_norm": 1.6311445236206055, + "learning_rate": 1.8535949122633346e-08, + "loss": 1.0712, + "step": 27757 + }, + { + "epoch": 0.9940730925564488, + "grad_norm": 1.5463629961013794, + "learning_rate": 1.831330389521213e-08, + "loss": 1.1029, + "step": 27758 + }, + { + "epoch": 0.9941089046860172, + "grad_norm": 1.367632269859314, + "learning_rate": 1.8092003792413714e-08, + "loss": 0.9936, + "step": 27759 + }, + { + "epoch": 0.9941447168155855, + "grad_norm": 1.2297667264938354, + "learning_rate": 1.7872048817213495e-08, + "loss": 1.2141, + "step": 27760 + }, + { + "epoch": 0.9941805289451537, + "grad_norm": 1.7365025281906128, + "learning_rate": 1.7653438972586868e-08, + "loss": 1.0579, + "step": 27761 + }, + { + "epoch": 0.994216341074722, + "grad_norm": 1.7314541339874268, + "learning_rate": 1.743617426145372e-08, + "loss": 1.1318, + "step": 27762 + }, + { + "epoch": 0.9942521532042903, + "grad_norm": 1.507061243057251, + "learning_rate": 1.7220254686756142e-08, + "loss": 0.7989, + "step": 27763 + }, + { + "epoch": 0.9942879653338585, + "grad_norm": 1.3382261991500854, + "learning_rate": 1.700568025139182e-08, + "loss": 1.0895, + "step": 27764 + }, + { + "epoch": 0.9943237774634268, + "grad_norm": 1.9924143552780151, + "learning_rate": 1.679245095824733e-08, + "loss": 1.1166, + "step": 27765 + }, + { + "epoch": 0.9943595895929952, + "grad_norm": 1.815843105316162, + "learning_rate": 1.658056681019815e-08, + "loss": 1.1265, + "step": 27766 + }, + { + "epoch": 0.9943954017225635, + "grad_norm": 1.5924941301345825, + "learning_rate": 1.637002781007535e-08, + "loss": 1.0824, + "step": 27767 + }, + { + "epoch": 0.9944312138521317, + "grad_norm": 1.2463346719741821, + "learning_rate": 1.6160833960732203e-08, + "loss": 1.0799, + "step": 27768 + }, + { + "epoch": 0.9944670259817, + "grad_norm": 1.4165626764297485, + "learning_rate": 1.595298526496647e-08, + "loss": 1.0403, + "step": 27769 + }, + { + "epoch": 0.9945028381112683, + "grad_norm": 1.376821756362915, + "learning_rate": 1.5746481725598117e-08, + "loss": 1.1664, + "step": 27770 + }, + { + "epoch": 0.9945386502408365, + "grad_norm": 1.8158386945724487, + "learning_rate": 1.5541323345380497e-08, + "loss": 1.0434, + "step": 27771 + }, + { + "epoch": 0.9945744623704048, + "grad_norm": 1.5947171449661255, + "learning_rate": 1.533751012707807e-08, + "loss": 1.0563, + "step": 27772 + }, + { + "epoch": 0.9946102744999732, + "grad_norm": 1.3927321434020996, + "learning_rate": 1.5135042073444182e-08, + "loss": 1.2476, + "step": 27773 + }, + { + "epoch": 0.9946460866295415, + "grad_norm": 1.555647850036621, + "learning_rate": 1.4933919187198884e-08, + "loss": 0.8654, + "step": 27774 + }, + { + "epoch": 0.9946818987591097, + "grad_norm": 1.2554421424865723, + "learning_rate": 1.4734141471051122e-08, + "loss": 1.0469, + "step": 27775 + }, + { + "epoch": 0.994717710888678, + "grad_norm": 1.5782759189605713, + "learning_rate": 1.4535708927676529e-08, + "loss": 1.1654, + "step": 27776 + }, + { + "epoch": 0.9947535230182463, + "grad_norm": 1.2934378385543823, + "learning_rate": 1.4338621559750742e-08, + "loss": 1.1624, + "step": 27777 + }, + { + "epoch": 0.9947893351478145, + "grad_norm": 1.591485619544983, + "learning_rate": 1.4142879369927198e-08, + "loss": 1.0744, + "step": 27778 + }, + { + "epoch": 0.9948251472773828, + "grad_norm": 1.7381560802459717, + "learning_rate": 1.3948482360848225e-08, + "loss": 1.1367, + "step": 27779 + }, + { + "epoch": 0.9948609594069512, + "grad_norm": 1.3632047176361084, + "learning_rate": 1.3755430535111747e-08, + "loss": 1.3008, + "step": 27780 + }, + { + "epoch": 0.9948967715365195, + "grad_norm": 1.4664477109909058, + "learning_rate": 1.3563723895326785e-08, + "loss": 1.0883, + "step": 27781 + }, + { + "epoch": 0.9949325836660877, + "grad_norm": 1.4631173610687256, + "learning_rate": 1.3373362444057957e-08, + "loss": 1.0892, + "step": 27782 + }, + { + "epoch": 0.994968395795656, + "grad_norm": 1.4439215660095215, + "learning_rate": 1.3184346183892082e-08, + "loss": 0.9706, + "step": 27783 + }, + { + "epoch": 0.9950042079252243, + "grad_norm": 1.3895304203033447, + "learning_rate": 1.2996675117349367e-08, + "loss": 1.0515, + "step": 27784 + }, + { + "epoch": 0.9950400200547925, + "grad_norm": 1.9795929193496704, + "learning_rate": 1.2810349246961117e-08, + "loss": 1.1346, + "step": 27785 + }, + { + "epoch": 0.9950758321843608, + "grad_norm": 2.1503653526306152, + "learning_rate": 1.262536857523644e-08, + "loss": 1.1701, + "step": 27786 + }, + { + "epoch": 0.9951116443139292, + "grad_norm": 1.4021936655044556, + "learning_rate": 1.2441733104662234e-08, + "loss": 0.9684, + "step": 27787 + }, + { + "epoch": 0.9951474564434974, + "grad_norm": 1.2082736492156982, + "learning_rate": 1.2259442837714297e-08, + "loss": 1.0345, + "step": 27788 + }, + { + "epoch": 0.9951832685730657, + "grad_norm": 1.639564871788025, + "learning_rate": 1.2078497776835119e-08, + "loss": 1.0412, + "step": 27789 + }, + { + "epoch": 0.995219080702634, + "grad_norm": 1.4363194704055786, + "learning_rate": 1.1898897924467189e-08, + "loss": 1.0962, + "step": 27790 + }, + { + "epoch": 0.9952548928322023, + "grad_norm": 1.5681307315826416, + "learning_rate": 1.1720643283019694e-08, + "loss": 1.2167, + "step": 27791 + }, + { + "epoch": 0.9952907049617705, + "grad_norm": 1.3361971378326416, + "learning_rate": 1.1543733854901817e-08, + "loss": 0.9799, + "step": 27792 + }, + { + "epoch": 0.9953265170913388, + "grad_norm": 1.239851951599121, + "learning_rate": 1.1368169642489435e-08, + "loss": 0.9615, + "step": 27793 + }, + { + "epoch": 0.9953623292209072, + "grad_norm": 1.28730046749115, + "learning_rate": 1.119395064813622e-08, + "loss": 0.9932, + "step": 27794 + }, + { + "epoch": 0.9953981413504754, + "grad_norm": 1.460843563079834, + "learning_rate": 1.1021076874195846e-08, + "loss": 1.0147, + "step": 27795 + }, + { + "epoch": 0.9954339534800437, + "grad_norm": 1.5582510232925415, + "learning_rate": 1.0849548322988679e-08, + "loss": 1.011, + "step": 27796 + }, + { + "epoch": 0.995469765609612, + "grad_norm": 1.5871485471725464, + "learning_rate": 1.0679364996823982e-08, + "loss": 1.1118, + "step": 27797 + }, + { + "epoch": 0.9955055777391802, + "grad_norm": 1.4242435693740845, + "learning_rate": 1.0510526897988815e-08, + "loss": 0.9629, + "step": 27798 + }, + { + "epoch": 0.9955413898687485, + "grad_norm": 2.0936505794525146, + "learning_rate": 1.0343034028759136e-08, + "loss": 1.2012, + "step": 27799 + }, + { + "epoch": 0.9955772019983168, + "grad_norm": 1.731306791305542, + "learning_rate": 1.0176886391388695e-08, + "loss": 1.1956, + "step": 27800 + }, + { + "epoch": 0.9956130141278852, + "grad_norm": 1.5613293647766113, + "learning_rate": 1.0012083988109045e-08, + "loss": 0.9402, + "step": 27801 + }, + { + "epoch": 0.9956488262574534, + "grad_norm": 1.5999258756637573, + "learning_rate": 9.848626821140627e-09, + "loss": 1.1658, + "step": 27802 + }, + { + "epoch": 0.9956846383870217, + "grad_norm": 1.2451385259628296, + "learning_rate": 9.686514892681687e-09, + "loss": 0.9187, + "step": 27803 + }, + { + "epoch": 0.99572045051659, + "grad_norm": 1.3997373580932617, + "learning_rate": 9.525748204908258e-09, + "loss": 0.9989, + "step": 27804 + }, + { + "epoch": 0.9957562626461582, + "grad_norm": 1.4760199785232544, + "learning_rate": 9.366326759985278e-09, + "loss": 1.171, + "step": 27805 + }, + { + "epoch": 0.9957920747757265, + "grad_norm": 1.5538642406463623, + "learning_rate": 9.208250560066578e-09, + "loss": 0.862, + "step": 27806 + }, + { + "epoch": 0.9958278869052948, + "grad_norm": 1.5711824893951416, + "learning_rate": 9.051519607272684e-09, + "loss": 1.2452, + "step": 27807 + }, + { + "epoch": 0.9958636990348632, + "grad_norm": 1.4650105237960815, + "learning_rate": 8.89613390370192e-09, + "loss": 1.1438, + "step": 27808 + }, + { + "epoch": 0.9958995111644314, + "grad_norm": 1.3830136060714722, + "learning_rate": 8.742093451463707e-09, + "loss": 1.0371, + "step": 27809 + }, + { + "epoch": 0.9959353232939997, + "grad_norm": 1.2307100296020508, + "learning_rate": 8.589398252611957e-09, + "loss": 1.0224, + "step": 27810 + }, + { + "epoch": 0.995971135423568, + "grad_norm": 1.4562439918518066, + "learning_rate": 8.43804830922279e-09, + "loss": 0.9376, + "step": 27811 + }, + { + "epoch": 0.9960069475531362, + "grad_norm": 1.6497365236282349, + "learning_rate": 8.28804362331681e-09, + "loss": 1.2798, + "step": 27812 + }, + { + "epoch": 0.9960427596827045, + "grad_norm": 1.517102599143982, + "learning_rate": 8.139384196903522e-09, + "loss": 0.9185, + "step": 27813 + }, + { + "epoch": 0.9960785718122728, + "grad_norm": 1.6130027770996094, + "learning_rate": 7.992070032003529e-09, + "loss": 0.9162, + "step": 27814 + }, + { + "epoch": 0.9961143839418412, + "grad_norm": 1.814577341079712, + "learning_rate": 7.84610113059303e-09, + "loss": 1.0098, + "step": 27815 + }, + { + "epoch": 0.9961501960714094, + "grad_norm": 1.284324288368225, + "learning_rate": 7.70147749462602e-09, + "loss": 1.0035, + "step": 27816 + }, + { + "epoch": 0.9961860082009777, + "grad_norm": 1.3869340419769287, + "learning_rate": 7.558199126056487e-09, + "loss": 1.1209, + "step": 27817 + }, + { + "epoch": 0.996221820330546, + "grad_norm": 1.603816270828247, + "learning_rate": 7.416266026816221e-09, + "loss": 1.1834, + "step": 27818 + }, + { + "epoch": 0.9962576324601142, + "grad_norm": 1.5957945585250854, + "learning_rate": 7.275678198803703e-09, + "loss": 0.9895, + "step": 27819 + }, + { + "epoch": 0.9962934445896825, + "grad_norm": 1.2742962837219238, + "learning_rate": 7.136435643917416e-09, + "loss": 1.0132, + "step": 27820 + }, + { + "epoch": 0.9963292567192508, + "grad_norm": 1.4074482917785645, + "learning_rate": 6.998538364022533e-09, + "loss": 1.2053, + "step": 27821 + }, + { + "epoch": 0.9963650688488191, + "grad_norm": 1.4684451818466187, + "learning_rate": 6.861986360995332e-09, + "loss": 1.0188, + "step": 27822 + }, + { + "epoch": 0.9964008809783874, + "grad_norm": 1.8784462213516235, + "learning_rate": 6.726779636645475e-09, + "loss": 1.2482, + "step": 27823 + }, + { + "epoch": 0.9964366931079557, + "grad_norm": 1.0880767107009888, + "learning_rate": 6.592918192804831e-09, + "loss": 0.8632, + "step": 27824 + }, + { + "epoch": 0.996472505237524, + "grad_norm": 1.2403895854949951, + "learning_rate": 6.460402031283064e-09, + "loss": 0.965, + "step": 27825 + }, + { + "epoch": 0.9965083173670922, + "grad_norm": 1.2023898363113403, + "learning_rate": 6.329231153845427e-09, + "loss": 0.8359, + "step": 27826 + }, + { + "epoch": 0.9965441294966605, + "grad_norm": 2.1758337020874023, + "learning_rate": 6.199405562268279e-09, + "loss": 1.1298, + "step": 27827 + }, + { + "epoch": 0.9965799416262288, + "grad_norm": 1.4052248001098633, + "learning_rate": 6.070925258294668e-09, + "loss": 1.1272, + "step": 27828 + }, + { + "epoch": 0.9966157537557971, + "grad_norm": 1.3379669189453125, + "learning_rate": 5.943790243656544e-09, + "loss": 1.0789, + "step": 27829 + }, + { + "epoch": 0.9966515658853654, + "grad_norm": 1.3187897205352783, + "learning_rate": 5.818000520052547e-09, + "loss": 1.0222, + "step": 27830 + }, + { + "epoch": 0.9966873780149337, + "grad_norm": 1.3668831586837769, + "learning_rate": 5.69355608919242e-09, + "loss": 1.043, + "step": 27831 + }, + { + "epoch": 0.996723190144502, + "grad_norm": 1.577150583267212, + "learning_rate": 5.570456952741499e-09, + "loss": 1.0102, + "step": 27832 + }, + { + "epoch": 0.9967590022740702, + "grad_norm": 1.7567318677902222, + "learning_rate": 5.448703112365117e-09, + "loss": 1.2057, + "step": 27833 + }, + { + "epoch": 0.9967948144036385, + "grad_norm": 1.7387431859970093, + "learning_rate": 5.328294569673098e-09, + "loss": 1.1775, + "step": 27834 + }, + { + "epoch": 0.9968306265332068, + "grad_norm": 1.5358966588974, + "learning_rate": 5.209231326319674e-09, + "loss": 0.9692, + "step": 27835 + }, + { + "epoch": 0.9968664386627751, + "grad_norm": 1.1984633207321167, + "learning_rate": 5.0915133838924656e-09, + "loss": 0.952, + "step": 27836 + }, + { + "epoch": 0.9969022507923434, + "grad_norm": 1.528084635734558, + "learning_rate": 4.975140743967987e-09, + "loss": 1.0222, + "step": 27837 + }, + { + "epoch": 0.9969380629219117, + "grad_norm": 1.365281343460083, + "learning_rate": 4.860113408122757e-09, + "loss": 1.3053, + "step": 27838 + }, + { + "epoch": 0.9969738750514799, + "grad_norm": 1.438063621520996, + "learning_rate": 4.746431377899985e-09, + "loss": 1.1499, + "step": 27839 + }, + { + "epoch": 0.9970096871810482, + "grad_norm": 1.3555641174316406, + "learning_rate": 4.634094654820675e-09, + "loss": 0.9504, + "step": 27840 + }, + { + "epoch": 0.9970454993106165, + "grad_norm": 1.598346471786499, + "learning_rate": 4.523103240416937e-09, + "loss": 1.1186, + "step": 27841 + }, + { + "epoch": 0.9970813114401847, + "grad_norm": 2.0042169094085693, + "learning_rate": 4.413457136165367e-09, + "loss": 1.2104, + "step": 27842 + }, + { + "epoch": 0.9971171235697531, + "grad_norm": 1.4060866832733154, + "learning_rate": 4.3051563435425605e-09, + "loss": 1.0737, + "step": 27843 + }, + { + "epoch": 0.9971529356993214, + "grad_norm": 1.5354278087615967, + "learning_rate": 4.198200864014012e-09, + "loss": 1.2415, + "step": 27844 + }, + { + "epoch": 0.9971887478288897, + "grad_norm": 1.4683129787445068, + "learning_rate": 4.092590699011911e-09, + "loss": 1.1221, + "step": 27845 + }, + { + "epoch": 0.9972245599584579, + "grad_norm": 1.8526583909988403, + "learning_rate": 3.988325849957342e-09, + "loss": 1.0542, + "step": 27846 + }, + { + "epoch": 0.9972603720880262, + "grad_norm": 1.4293293952941895, + "learning_rate": 3.885406318260288e-09, + "loss": 1.0868, + "step": 27847 + }, + { + "epoch": 0.9972961842175945, + "grad_norm": 1.4769964218139648, + "learning_rate": 3.783832105286322e-09, + "loss": 0.9492, + "step": 27848 + }, + { + "epoch": 0.9973319963471627, + "grad_norm": 1.364399790763855, + "learning_rate": 3.6836032124232256e-09, + "loss": 1.0655, + "step": 27849 + }, + { + "epoch": 0.9973678084767311, + "grad_norm": 1.672868251800537, + "learning_rate": 3.5847196410143667e-09, + "loss": 1.2762, + "step": 27850 + }, + { + "epoch": 0.9974036206062994, + "grad_norm": 1.5232881307601929, + "learning_rate": 3.4871813923809117e-09, + "loss": 1.0073, + "step": 27851 + }, + { + "epoch": 0.9974394327358677, + "grad_norm": 1.447730302810669, + "learning_rate": 3.390988467844025e-09, + "loss": 1.2262, + "step": 27852 + }, + { + "epoch": 0.9974752448654359, + "grad_norm": 1.6256093978881836, + "learning_rate": 3.2961408686915662e-09, + "loss": 1.0551, + "step": 27853 + }, + { + "epoch": 0.9975110569950042, + "grad_norm": 1.522232174873352, + "learning_rate": 3.2026385962113937e-09, + "loss": 1.0832, + "step": 27854 + }, + { + "epoch": 0.9975468691245725, + "grad_norm": 1.3570501804351807, + "learning_rate": 3.110481651646957e-09, + "loss": 1.0789, + "step": 27855 + }, + { + "epoch": 0.9975826812541407, + "grad_norm": 1.66331946849823, + "learning_rate": 3.0196700362417065e-09, + "loss": 1.1621, + "step": 27856 + }, + { + "epoch": 0.9976184933837091, + "grad_norm": 1.4287519454956055, + "learning_rate": 2.930203751227989e-09, + "loss": 1.0121, + "step": 27857 + }, + { + "epoch": 0.9976543055132774, + "grad_norm": 1.369523048400879, + "learning_rate": 2.8420827977937437e-09, + "loss": 1.186, + "step": 27858 + }, + { + "epoch": 0.9976901176428457, + "grad_norm": 1.316745638847351, + "learning_rate": 2.7553071771380112e-09, + "loss": 1.0365, + "step": 27859 + }, + { + "epoch": 0.9977259297724139, + "grad_norm": 1.8748708963394165, + "learning_rate": 2.6698768904154236e-09, + "loss": 1.1636, + "step": 27860 + }, + { + "epoch": 0.9977617419019822, + "grad_norm": 1.5010159015655518, + "learning_rate": 2.585791938791715e-09, + "loss": 0.871, + "step": 27861 + }, + { + "epoch": 0.9977975540315505, + "grad_norm": 1.201737880706787, + "learning_rate": 2.5030523233771087e-09, + "loss": 1.1339, + "step": 27862 + }, + { + "epoch": 0.9978333661611187, + "grad_norm": 1.3944334983825684, + "learning_rate": 2.4216580453040314e-09, + "loss": 0.8209, + "step": 27863 + }, + { + "epoch": 0.9978691782906871, + "grad_norm": 1.7062697410583496, + "learning_rate": 2.3416091056605027e-09, + "loss": 1.1504, + "step": 27864 + }, + { + "epoch": 0.9979049904202554, + "grad_norm": 1.231886863708496, + "learning_rate": 2.2629055055234384e-09, + "loss": 1.0065, + "step": 27865 + }, + { + "epoch": 0.9979408025498236, + "grad_norm": 1.619002103805542, + "learning_rate": 2.18554724594755e-09, + "loss": 1.1874, + "step": 27866 + }, + { + "epoch": 0.9979766146793919, + "grad_norm": 1.5251200199127197, + "learning_rate": 2.1095343279764477e-09, + "loss": 1.1204, + "step": 27867 + }, + { + "epoch": 0.9980124268089602, + "grad_norm": 1.2616132497787476, + "learning_rate": 2.0348667526426392e-09, + "loss": 0.9694, + "step": 27868 + }, + { + "epoch": 0.9980482389385285, + "grad_norm": 1.2822203636169434, + "learning_rate": 1.961544520934222e-09, + "loss": 0.9169, + "step": 27869 + }, + { + "epoch": 0.9980840510680967, + "grad_norm": 1.6473608016967773, + "learning_rate": 1.8895676338392952e-09, + "loss": 1.2827, + "step": 27870 + }, + { + "epoch": 0.9981198631976651, + "grad_norm": 1.4036833047866821, + "learning_rate": 1.8189360923459575e-09, + "loss": 1.0205, + "step": 27871 + }, + { + "epoch": 0.9981556753272334, + "grad_norm": 1.513411045074463, + "learning_rate": 1.7496498973756936e-09, + "loss": 1.1349, + "step": 27872 + }, + { + "epoch": 0.9981914874568016, + "grad_norm": 1.4444591999053955, + "learning_rate": 1.6817090498832954e-09, + "loss": 1.0382, + "step": 27873 + }, + { + "epoch": 0.9982272995863699, + "grad_norm": 1.3700215816497803, + "learning_rate": 1.615113550779146e-09, + "loss": 1.1371, + "step": 27874 + }, + { + "epoch": 0.9982631117159382, + "grad_norm": 1.815014123916626, + "learning_rate": 1.5498634009514235e-09, + "loss": 1.3445, + "step": 27875 + }, + { + "epoch": 0.9982989238455064, + "grad_norm": 1.5937775373458862, + "learning_rate": 1.4859586012772042e-09, + "loss": 1.2497, + "step": 27876 + }, + { + "epoch": 0.9983347359750747, + "grad_norm": 1.2967684268951416, + "learning_rate": 1.4233991526224622e-09, + "loss": 1.0054, + "step": 27877 + }, + { + "epoch": 0.9983705481046431, + "grad_norm": 1.3631274700164795, + "learning_rate": 1.3621850558309668e-09, + "loss": 1.0459, + "step": 27878 + }, + { + "epoch": 0.9984063602342114, + "grad_norm": 1.6442089080810547, + "learning_rate": 1.3023163117242832e-09, + "loss": 1.0374, + "step": 27879 + }, + { + "epoch": 0.9984421723637796, + "grad_norm": 1.7280105352401733, + "learning_rate": 1.2437929211017718e-09, + "loss": 1.1356, + "step": 27880 + }, + { + "epoch": 0.9984779844933479, + "grad_norm": 1.1873406171798706, + "learning_rate": 1.1866148847516912e-09, + "loss": 1.0047, + "step": 27881 + }, + { + "epoch": 0.9985137966229162, + "grad_norm": 1.5574191808700562, + "learning_rate": 1.1307822034511973e-09, + "loss": 1.1124, + "step": 27882 + }, + { + "epoch": 0.9985496087524844, + "grad_norm": 1.7200137376785278, + "learning_rate": 1.0762948779441396e-09, + "loss": 1.0307, + "step": 27883 + }, + { + "epoch": 0.9985854208820527, + "grad_norm": 1.4180501699447632, + "learning_rate": 1.0231529089632652e-09, + "loss": 1.0889, + "step": 27884 + }, + { + "epoch": 0.9986212330116211, + "grad_norm": 1.9907797574996948, + "learning_rate": 9.713562972302193e-10, + "loss": 1.0474, + "step": 27885 + }, + { + "epoch": 0.9986570451411894, + "grad_norm": 1.5097689628601074, + "learning_rate": 9.209050434333399e-10, + "loss": 0.9966, + "step": 27886 + }, + { + "epoch": 0.9986928572707576, + "grad_norm": 1.8480175733566284, + "learning_rate": 8.717991482609655e-10, + "loss": 1.2269, + "step": 27887 + }, + { + "epoch": 0.9987286694003259, + "grad_norm": 1.6238235235214233, + "learning_rate": 8.240386123681276e-10, + "loss": 1.1004, + "step": 27888 + }, + { + "epoch": 0.9987644815298942, + "grad_norm": 1.5926755666732788, + "learning_rate": 7.776234363987555e-10, + "loss": 1.2706, + "step": 27889 + }, + { + "epoch": 0.9988002936594624, + "grad_norm": 1.3814506530761719, + "learning_rate": 7.325536209856765e-10, + "loss": 1.1267, + "step": 27890 + }, + { + "epoch": 0.9988361057890307, + "grad_norm": 1.190544843673706, + "learning_rate": 6.888291667173085e-10, + "loss": 0.9847, + "step": 27891 + }, + { + "epoch": 0.9988719179185991, + "grad_norm": 1.5012321472167969, + "learning_rate": 6.464500741820701e-10, + "loss": 1.15, + "step": 27892 + }, + { + "epoch": 0.9989077300481674, + "grad_norm": 1.9766901731491089, + "learning_rate": 6.054163439683791e-10, + "loss": 1.2239, + "step": 27893 + }, + { + "epoch": 0.9989435421777356, + "grad_norm": 1.3555752038955688, + "learning_rate": 5.65727976620245e-10, + "loss": 1.0396, + "step": 27894 + }, + { + "epoch": 0.9989793543073039, + "grad_norm": 1.2680057287216187, + "learning_rate": 5.273849726705748e-10, + "loss": 1.1185, + "step": 27895 + }, + { + "epoch": 0.9990151664368722, + "grad_norm": 1.323133111000061, + "learning_rate": 4.903873326300712e-10, + "loss": 0.9281, + "step": 27896 + }, + { + "epoch": 0.9990509785664404, + "grad_norm": 1.1601941585540771, + "learning_rate": 4.5473505700943664e-10, + "loss": 1.0288, + "step": 27897 + }, + { + "epoch": 0.9990867906960087, + "grad_norm": 1.5856188535690308, + "learning_rate": 4.2042814627496483e-10, + "loss": 1.0689, + "step": 27898 + }, + { + "epoch": 0.9991226028255771, + "grad_norm": 1.3923325538635254, + "learning_rate": 3.874666008929495e-10, + "loss": 1.1124, + "step": 27899 + }, + { + "epoch": 0.9991584149551453, + "grad_norm": 1.6114858388900757, + "learning_rate": 3.558504213074798e-10, + "loss": 0.9934, + "step": 27900 + }, + { + "epoch": 0.9991942270847136, + "grad_norm": 1.6986243724822998, + "learning_rate": 3.255796079404405e-10, + "loss": 1.1681, + "step": 27901 + }, + { + "epoch": 0.9992300392142819, + "grad_norm": 1.377776861190796, + "learning_rate": 2.9665416120261414e-10, + "loss": 1.16, + "step": 27902 + }, + { + "epoch": 0.9992658513438502, + "grad_norm": 1.5803245306015015, + "learning_rate": 2.69074081493681e-10, + "loss": 0.895, + "step": 27903 + }, + { + "epoch": 0.9993016634734184, + "grad_norm": 1.7301831245422363, + "learning_rate": 2.4283936915781014e-10, + "loss": 1.0796, + "step": 27904 + }, + { + "epoch": 0.9993374756029867, + "grad_norm": 1.4234933853149414, + "learning_rate": 2.1795002457247748e-10, + "loss": 0.9734, + "step": 27905 + }, + { + "epoch": 0.9993732877325551, + "grad_norm": 1.423836350440979, + "learning_rate": 1.9440604807074992e-10, + "loss": 1.0387, + "step": 27906 + }, + { + "epoch": 0.9994090998621233, + "grad_norm": 1.3166389465332031, + "learning_rate": 1.7220743995238763e-10, + "loss": 1.1515, + "step": 27907 + }, + { + "epoch": 0.9994449119916916, + "grad_norm": 1.6133517026901245, + "learning_rate": 1.5135420052825312e-10, + "loss": 1.1736, + "step": 27908 + }, + { + "epoch": 0.9994807241212599, + "grad_norm": 1.4251246452331543, + "learning_rate": 1.318463300870043e-10, + "loss": 0.8741, + "step": 27909 + }, + { + "epoch": 0.9995165362508281, + "grad_norm": 1.7716898918151855, + "learning_rate": 1.136838288728903e-10, + "loss": 1.0281, + "step": 27910 + }, + { + "epoch": 0.9995523483803964, + "grad_norm": 1.604004144668579, + "learning_rate": 9.68666971412624e-11, + "loss": 1.2152, + "step": 27911 + }, + { + "epoch": 0.9995881605099647, + "grad_norm": 1.4071147441864014, + "learning_rate": 8.139493511416518e-11, + "loss": 1.1426, + "step": 27912 + }, + { + "epoch": 0.9996239726395331, + "grad_norm": 1.5305982828140259, + "learning_rate": 6.726854300254104e-11, + "loss": 1.2715, + "step": 27913 + }, + { + "epoch": 0.9996597847691013, + "grad_norm": 1.2256683111190796, + "learning_rate": 5.448752098402565e-11, + "loss": 0.7965, + "step": 27914 + }, + { + "epoch": 0.9996955968986696, + "grad_norm": 1.660884141921997, + "learning_rate": 4.3051869258459163e-11, + "loss": 1.119, + "step": 27915 + }, + { + "epoch": 0.9997314090282379, + "grad_norm": 1.585005760192871, + "learning_rate": 3.296158795906834e-11, + "loss": 1.0855, + "step": 27916 + }, + { + "epoch": 0.9997672211578061, + "grad_norm": 1.5637381076812744, + "learning_rate": 2.421667721907994e-11, + "loss": 1.0697, + "step": 27917 + }, + { + "epoch": 0.9998030332873744, + "grad_norm": 1.3112397193908691, + "learning_rate": 1.68171371606185e-11, + "loss": 1.1399, + "step": 27918 + }, + { + "epoch": 0.9998388454169427, + "grad_norm": 1.2498399019241333, + "learning_rate": 1.0762967894706321e-11, + "loss": 1.0184, + "step": 27919 + }, + { + "epoch": 0.9998746575465111, + "grad_norm": 1.6833581924438477, + "learning_rate": 6.054169487956784e-12, + "loss": 0.9795, + "step": 27920 + }, + { + "epoch": 0.9999104696760793, + "grad_norm": 1.295207142829895, + "learning_rate": 2.690742006983271e-12, + "loss": 0.9221, + "step": 27921 + }, + { + "epoch": 0.9999462818056476, + "grad_norm": 1.2137815952301025, + "learning_rate": 6.726855072969329e-13, + "loss": 0.9983, + "step": 27922 + }, + { + "epoch": 0.9999820939352159, + "grad_norm": 1.2301528453826904, + "learning_rate": 0.0, + "loss": 1.0304, + "step": 27923 + } + ], + "logging_steps": 1.0, + "max_steps": 27923, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.089197914290913e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}