diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,128060 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999589956672088, + "eval_steps": 500, + "global_step": 18290, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.467244372155324e-05, + "grad_norm": Infinity, + "learning_rate": 1.8214936247723134e-08, + "loss": 3.1263, + "step": 1 + }, + { + "epoch": 0.00010934488744310648, + "grad_norm": Infinity, + "learning_rate": 3.642987249544627e-08, + "loss": 3.6404, + "step": 2 + }, + { + "epoch": 0.00016401733116465974, + "grad_norm": Infinity, + "learning_rate": 5.4644808743169406e-08, + "loss": 4.3901, + "step": 3 + }, + { + "epoch": 0.00021868977488621297, + "grad_norm": Infinity, + "learning_rate": 7.285974499089254e-08, + "loss": 3.9721, + "step": 4 + }, + { + "epoch": 0.0002733622186077662, + "grad_norm": Infinity, + "learning_rate": 9.107468123861569e-08, + "loss": 4.0652, + "step": 5 + }, + { + "epoch": 0.0003280346623293195, + "grad_norm": Infinity, + "learning_rate": 1.0928961748633881e-07, + "loss": 4.2907, + "step": 6 + }, + { + "epoch": 0.0003827071060508727, + "grad_norm": Infinity, + "learning_rate": 1.2750455373406196e-07, + "loss": 3.8615, + "step": 7 + }, + { + "epoch": 0.00043737954977242594, + "grad_norm": Infinity, + "learning_rate": 1.4571948998178507e-07, + "loss": 3.4933, + "step": 8 + }, + { + "epoch": 0.0004920519934939792, + "grad_norm": Infinity, + "learning_rate": 1.639344262295082e-07, + "loss": 3.6942, + "step": 9 + }, + { + "epoch": 0.0005467244372155324, + "grad_norm": Infinity, + "learning_rate": 1.8214936247723137e-07, + "loss": 4.5631, + "step": 10 + }, + { + "epoch": 0.0006013968809370857, + "grad_norm": Infinity, + "learning_rate": 2.0036429872495446e-07, + "loss": 5.0567, + "step": 11 + }, + { + "epoch": 0.000656069324658639, + "grad_norm": Infinity, + "learning_rate": 2.1857923497267762e-07, + "loss": 4.1645, + "step": 12 + }, + { + "epoch": 0.0007107417683801921, + "grad_norm": Infinity, + "learning_rate": 2.3679417122040076e-07, + "loss": 4.2849, + "step": 13 + }, + { + "epoch": 0.0007654142121017454, + "grad_norm": Infinity, + "learning_rate": 2.550091074681239e-07, + "loss": 4.0041, + "step": 14 + }, + { + "epoch": 0.0008200866558232987, + "grad_norm": Infinity, + "learning_rate": 2.73224043715847e-07, + "loss": 3.4858, + "step": 15 + }, + { + "epoch": 0.0008747590995448519, + "grad_norm": Infinity, + "learning_rate": 2.9143897996357015e-07, + "loss": 4.8429, + "step": 16 + }, + { + "epoch": 0.0009294315432664052, + "grad_norm": Infinity, + "learning_rate": 3.096539162112933e-07, + "loss": 3.9163, + "step": 17 + }, + { + "epoch": 0.0009841039869879584, + "grad_norm": Infinity, + "learning_rate": 3.278688524590164e-07, + "loss": 4.336, + "step": 18 + }, + { + "epoch": 0.0010387764307095116, + "grad_norm": Infinity, + "learning_rate": 3.4608378870673953e-07, + "loss": 4.2209, + "step": 19 + }, + { + "epoch": 0.0010934488744310648, + "grad_norm": Infinity, + "learning_rate": 3.6429872495446275e-07, + "loss": 4.2192, + "step": 20 + }, + { + "epoch": 0.0011481213181526182, + "grad_norm": Infinity, + "learning_rate": 3.825136612021858e-07, + "loss": 4.0766, + "step": 21 + }, + { + "epoch": 0.0012027937618741714, + "grad_norm": Infinity, + "learning_rate": 4.007285974499089e-07, + "loss": 4.0405, + "step": 22 + }, + { + "epoch": 0.0012574662055957245, + "grad_norm": Infinity, + "learning_rate": 4.1894353369763213e-07, + "loss": 4.2251, + "step": 23 + }, + { + "epoch": 0.001312138649317278, + "grad_norm": Infinity, + "learning_rate": 4.3715846994535524e-07, + "loss": 4.5712, + "step": 24 + }, + { + "epoch": 0.001366811093038831, + "grad_norm": Infinity, + "learning_rate": 4.5537340619307836e-07, + "loss": 3.9375, + "step": 25 + }, + { + "epoch": 0.0014214835367603843, + "grad_norm": Infinity, + "learning_rate": 4.735883424408015e-07, + "loss": 4.499, + "step": 26 + }, + { + "epoch": 0.0014761559804819377, + "grad_norm": Infinity, + "learning_rate": 4.918032786885246e-07, + "loss": 3.8182, + "step": 27 + }, + { + "epoch": 0.0015308284242034908, + "grad_norm": Infinity, + "learning_rate": 5.100182149362478e-07, + "loss": 4.0515, + "step": 28 + }, + { + "epoch": 0.001585500867925044, + "grad_norm": Infinity, + "learning_rate": 5.28233151183971e-07, + "loss": 4.2507, + "step": 29 + }, + { + "epoch": 0.0016401733116465974, + "grad_norm": Infinity, + "learning_rate": 5.46448087431694e-07, + "loss": 4.3395, + "step": 30 + }, + { + "epoch": 0.0016948457553681506, + "grad_norm": Infinity, + "learning_rate": 5.646630236794172e-07, + "loss": 3.9989, + "step": 31 + }, + { + "epoch": 0.0017495181990897037, + "grad_norm": Infinity, + "learning_rate": 5.828779599271403e-07, + "loss": 4.0733, + "step": 32 + }, + { + "epoch": 0.0018041906428112571, + "grad_norm": Infinity, + "learning_rate": 6.010928961748634e-07, + "loss": 4.2347, + "step": 33 + }, + { + "epoch": 0.0018588630865328103, + "grad_norm": Infinity, + "learning_rate": 6.193078324225866e-07, + "loss": 4.2314, + "step": 34 + }, + { + "epoch": 0.0019135355302543635, + "grad_norm": Infinity, + "learning_rate": 6.375227686703097e-07, + "loss": 4.1942, + "step": 35 + }, + { + "epoch": 0.001968207973975917, + "grad_norm": Infinity, + "learning_rate": 6.557377049180328e-07, + "loss": 4.2445, + "step": 36 + }, + { + "epoch": 0.00202288041769747, + "grad_norm": Infinity, + "learning_rate": 6.73952641165756e-07, + "loss": 4.304, + "step": 37 + }, + { + "epoch": 0.0020775528614190232, + "grad_norm": Infinity, + "learning_rate": 6.921675774134791e-07, + "loss": 2.9882, + "step": 38 + }, + { + "epoch": 0.0021322253051405766, + "grad_norm": Infinity, + "learning_rate": 7.103825136612022e-07, + "loss": 3.5619, + "step": 39 + }, + { + "epoch": 0.0021868977488621296, + "grad_norm": Infinity, + "learning_rate": 7.285974499089255e-07, + "loss": 3.6702, + "step": 40 + }, + { + "epoch": 0.002241570192583683, + "grad_norm": Infinity, + "learning_rate": 7.468123861566486e-07, + "loss": 3.9372, + "step": 41 + }, + { + "epoch": 0.0022962426363052364, + "grad_norm": Infinity, + "learning_rate": 7.650273224043716e-07, + "loss": 3.9838, + "step": 42 + }, + { + "epoch": 0.0023509150800267893, + "grad_norm": Infinity, + "learning_rate": 7.832422586520947e-07, + "loss": 4.7506, + "step": 43 + }, + { + "epoch": 0.0024055875237483427, + "grad_norm": Infinity, + "learning_rate": 8.014571948998178e-07, + "loss": 4.0156, + "step": 44 + }, + { + "epoch": 0.002460259967469896, + "grad_norm": Infinity, + "learning_rate": 8.196721311475409e-07, + "loss": 5.4481, + "step": 45 + }, + { + "epoch": 0.002514932411191449, + "grad_norm": Infinity, + "learning_rate": 8.378870673952643e-07, + "loss": 4.5559, + "step": 46 + }, + { + "epoch": 0.0025696048549130024, + "grad_norm": Infinity, + "learning_rate": 8.561020036429874e-07, + "loss": 4.1283, + "step": 47 + }, + { + "epoch": 0.002624277298634556, + "grad_norm": Infinity, + "learning_rate": 8.743169398907105e-07, + "loss": 3.8645, + "step": 48 + }, + { + "epoch": 0.002678949742356109, + "grad_norm": Infinity, + "learning_rate": 8.925318761384336e-07, + "loss": 3.6105, + "step": 49 + }, + { + "epoch": 0.002733622186077662, + "grad_norm": Infinity, + "learning_rate": 9.107468123861567e-07, + "loss": 4.4374, + "step": 50 + }, + { + "epoch": 0.0027882946297992156, + "grad_norm": Infinity, + "learning_rate": 9.289617486338799e-07, + "loss": 3.8843, + "step": 51 + }, + { + "epoch": 0.0028429670735207685, + "grad_norm": Infinity, + "learning_rate": 9.47176684881603e-07, + "loss": 3.8511, + "step": 52 + }, + { + "epoch": 0.002897639517242322, + "grad_norm": Infinity, + "learning_rate": 9.653916211293261e-07, + "loss": 4.1068, + "step": 53 + }, + { + "epoch": 0.0029523119609638753, + "grad_norm": Infinity, + "learning_rate": 9.836065573770493e-07, + "loss": 4.0291, + "step": 54 + }, + { + "epoch": 0.0030069844046854283, + "grad_norm": Infinity, + "learning_rate": 1.0018214936247724e-06, + "loss": 3.8586, + "step": 55 + }, + { + "epoch": 0.0030616568484069817, + "grad_norm": Infinity, + "learning_rate": 1.0200364298724957e-06, + "loss": 3.8951, + "step": 56 + }, + { + "epoch": 0.003116329292128535, + "grad_norm": Infinity, + "learning_rate": 1.0382513661202188e-06, + "loss": 4.1041, + "step": 57 + }, + { + "epoch": 0.003171001735850088, + "grad_norm": Infinity, + "learning_rate": 1.056466302367942e-06, + "loss": 4.2556, + "step": 58 + }, + { + "epoch": 0.0032256741795716414, + "grad_norm": Infinity, + "learning_rate": 1.074681238615665e-06, + "loss": 3.9358, + "step": 59 + }, + { + "epoch": 0.003280346623293195, + "grad_norm": Infinity, + "learning_rate": 1.092896174863388e-06, + "loss": 3.4931, + "step": 60 + }, + { + "epoch": 0.0033350190670147477, + "grad_norm": Infinity, + "learning_rate": 1.111111111111111e-06, + "loss": 4.265, + "step": 61 + }, + { + "epoch": 0.003389691510736301, + "grad_norm": Infinity, + "learning_rate": 1.1293260473588344e-06, + "loss": 3.7913, + "step": 62 + }, + { + "epoch": 0.0034443639544578545, + "grad_norm": Infinity, + "learning_rate": 1.1475409836065575e-06, + "loss": 3.5198, + "step": 63 + }, + { + "epoch": 0.0034990363981794075, + "grad_norm": Infinity, + "learning_rate": 1.1657559198542806e-06, + "loss": 4.0253, + "step": 64 + }, + { + "epoch": 0.003553708841900961, + "grad_norm": Infinity, + "learning_rate": 1.1839708561020037e-06, + "loss": 4.2022, + "step": 65 + }, + { + "epoch": 0.0036083812856225143, + "grad_norm": Infinity, + "learning_rate": 1.2021857923497268e-06, + "loss": 3.7704, + "step": 66 + }, + { + "epoch": 0.0036630537293440672, + "grad_norm": Infinity, + "learning_rate": 1.2204007285974501e-06, + "loss": 3.93, + "step": 67 + }, + { + "epoch": 0.0037177261730656206, + "grad_norm": Infinity, + "learning_rate": 1.2386156648451732e-06, + "loss": 4.1953, + "step": 68 + }, + { + "epoch": 0.003772398616787174, + "grad_norm": Infinity, + "learning_rate": 1.2568306010928963e-06, + "loss": 4.1865, + "step": 69 + }, + { + "epoch": 0.003827071060508727, + "grad_norm": Infinity, + "learning_rate": 1.2750455373406195e-06, + "loss": 3.9562, + "step": 70 + }, + { + "epoch": 0.0038817435042302804, + "grad_norm": Infinity, + "learning_rate": 1.2932604735883426e-06, + "loss": 3.8219, + "step": 71 + }, + { + "epoch": 0.003936415947951834, + "grad_norm": Infinity, + "learning_rate": 1.3114754098360657e-06, + "loss": 3.7517, + "step": 72 + }, + { + "epoch": 0.003991088391673387, + "grad_norm": Infinity, + "learning_rate": 1.3296903460837888e-06, + "loss": 3.8139, + "step": 73 + }, + { + "epoch": 0.00404576083539494, + "grad_norm": Infinity, + "learning_rate": 1.347905282331512e-06, + "loss": 3.8211, + "step": 74 + }, + { + "epoch": 0.0041004332791164935, + "grad_norm": Infinity, + "learning_rate": 1.3661202185792352e-06, + "loss": 3.8894, + "step": 75 + }, + { + "epoch": 0.0041551057228380464, + "grad_norm": Infinity, + "learning_rate": 1.3843351548269581e-06, + "loss": 3.5135, + "step": 76 + }, + { + "epoch": 0.004209778166559599, + "grad_norm": Infinity, + "learning_rate": 1.4025500910746814e-06, + "loss": 3.9351, + "step": 77 + }, + { + "epoch": 0.004264450610281153, + "grad_norm": Infinity, + "learning_rate": 1.4207650273224043e-06, + "loss": 4.0671, + "step": 78 + }, + { + "epoch": 0.004319123054002706, + "grad_norm": Infinity, + "learning_rate": 1.4389799635701277e-06, + "loss": 4.2252, + "step": 79 + }, + { + "epoch": 0.004373795497724259, + "grad_norm": Infinity, + "learning_rate": 1.457194899817851e-06, + "loss": 4.3557, + "step": 80 + }, + { + "epoch": 0.004428467941445813, + "grad_norm": Infinity, + "learning_rate": 1.4754098360655739e-06, + "loss": 3.8604, + "step": 81 + }, + { + "epoch": 0.004483140385167366, + "grad_norm": Infinity, + "learning_rate": 1.4936247723132972e-06, + "loss": 3.1649, + "step": 82 + }, + { + "epoch": 0.004537812828888919, + "grad_norm": Infinity, + "learning_rate": 1.5118397085610201e-06, + "loss": 4.0934, + "step": 83 + }, + { + "epoch": 0.004592485272610473, + "grad_norm": Infinity, + "learning_rate": 1.5300546448087432e-06, + "loss": 3.9295, + "step": 84 + }, + { + "epoch": 0.004647157716332026, + "grad_norm": Infinity, + "learning_rate": 1.5482695810564663e-06, + "loss": 3.9796, + "step": 85 + }, + { + "epoch": 0.004701830160053579, + "grad_norm": Infinity, + "learning_rate": 1.5664845173041894e-06, + "loss": 3.4379, + "step": 86 + }, + { + "epoch": 0.0047565026037751324, + "grad_norm": Infinity, + "learning_rate": 1.5846994535519128e-06, + "loss": 3.9772, + "step": 87 + }, + { + "epoch": 0.004811175047496685, + "grad_norm": Infinity, + "learning_rate": 1.6029143897996357e-06, + "loss": 4.4253, + "step": 88 + }, + { + "epoch": 0.004865847491218238, + "grad_norm": Infinity, + "learning_rate": 1.621129326047359e-06, + "loss": 3.3117, + "step": 89 + }, + { + "epoch": 0.004920519934939792, + "grad_norm": Infinity, + "learning_rate": 1.6393442622950819e-06, + "loss": 4.3969, + "step": 90 + }, + { + "epoch": 0.004975192378661345, + "grad_norm": Infinity, + "learning_rate": 1.6575591985428052e-06, + "loss": 4.199, + "step": 91 + }, + { + "epoch": 0.005029864822382898, + "grad_norm": Infinity, + "learning_rate": 1.6757741347905285e-06, + "loss": 3.9133, + "step": 92 + }, + { + "epoch": 0.005084537266104452, + "grad_norm": Infinity, + "learning_rate": 1.6939890710382514e-06, + "loss": 4.3934, + "step": 93 + }, + { + "epoch": 0.005139209709826005, + "grad_norm": Infinity, + "learning_rate": 1.7122040072859748e-06, + "loss": 4.0144, + "step": 94 + }, + { + "epoch": 0.005193882153547558, + "grad_norm": Infinity, + "learning_rate": 1.7304189435336977e-06, + "loss": 3.9991, + "step": 95 + }, + { + "epoch": 0.005248554597269112, + "grad_norm": Infinity, + "learning_rate": 1.748633879781421e-06, + "loss": 4.4044, + "step": 96 + }, + { + "epoch": 0.005303227040990665, + "grad_norm": Infinity, + "learning_rate": 1.766848816029144e-06, + "loss": 4.7206, + "step": 97 + }, + { + "epoch": 0.005357899484712218, + "grad_norm": Infinity, + "learning_rate": 1.7850637522768672e-06, + "loss": 4.5471, + "step": 98 + }, + { + "epoch": 0.005412571928433771, + "grad_norm": Infinity, + "learning_rate": 1.8032786885245903e-06, + "loss": 4.6932, + "step": 99 + }, + { + "epoch": 0.005467244372155324, + "grad_norm": Infinity, + "learning_rate": 1.8214936247723134e-06, + "loss": 3.8996, + "step": 100 + }, + { + "epoch": 0.005521916815876877, + "grad_norm": Infinity, + "learning_rate": 1.8397085610200365e-06, + "loss": 4.1216, + "step": 101 + }, + { + "epoch": 0.005576589259598431, + "grad_norm": Infinity, + "learning_rate": 1.8579234972677599e-06, + "loss": 4.4744, + "step": 102 + }, + { + "epoch": 0.005631261703319984, + "grad_norm": Infinity, + "learning_rate": 1.8761384335154828e-06, + "loss": 4.1869, + "step": 103 + }, + { + "epoch": 0.005685934147041537, + "grad_norm": Infinity, + "learning_rate": 1.894353369763206e-06, + "loss": 4.8284, + "step": 104 + }, + { + "epoch": 0.005740606590763091, + "grad_norm": Infinity, + "learning_rate": 1.912568306010929e-06, + "loss": 4.2481, + "step": 105 + }, + { + "epoch": 0.005795279034484644, + "grad_norm": Infinity, + "learning_rate": 1.9307832422586523e-06, + "loss": 3.8294, + "step": 106 + }, + { + "epoch": 0.005849951478206197, + "grad_norm": Infinity, + "learning_rate": 1.9489981785063756e-06, + "loss": 4.1622, + "step": 107 + }, + { + "epoch": 0.005904623921927751, + "grad_norm": Infinity, + "learning_rate": 1.9672131147540985e-06, + "loss": 3.6574, + "step": 108 + }, + { + "epoch": 0.005959296365649304, + "grad_norm": Infinity, + "learning_rate": 1.985428051001822e-06, + "loss": 4.0265, + "step": 109 + }, + { + "epoch": 0.0060139688093708565, + "grad_norm": Infinity, + "learning_rate": 2.0036429872495447e-06, + "loss": 4.3686, + "step": 110 + }, + { + "epoch": 0.00606864125309241, + "grad_norm": Infinity, + "learning_rate": 2.021857923497268e-06, + "loss": 3.6875, + "step": 111 + }, + { + "epoch": 0.006123313696813963, + "grad_norm": Infinity, + "learning_rate": 2.0400728597449914e-06, + "loss": 4.2288, + "step": 112 + }, + { + "epoch": 0.006177986140535516, + "grad_norm": Infinity, + "learning_rate": 2.0582877959927143e-06, + "loss": 3.4549, + "step": 113 + }, + { + "epoch": 0.00623265858425707, + "grad_norm": Infinity, + "learning_rate": 2.0765027322404376e-06, + "loss": 3.4299, + "step": 114 + }, + { + "epoch": 0.006287331027978623, + "grad_norm": Infinity, + "learning_rate": 2.0947176684881605e-06, + "loss": 3.9051, + "step": 115 + }, + { + "epoch": 0.006342003471700176, + "grad_norm": Infinity, + "learning_rate": 2.112932604735884e-06, + "loss": 3.3102, + "step": 116 + }, + { + "epoch": 0.00639667591542173, + "grad_norm": Infinity, + "learning_rate": 2.1311475409836067e-06, + "loss": 3.8207, + "step": 117 + }, + { + "epoch": 0.006451348359143283, + "grad_norm": Infinity, + "learning_rate": 2.14936247723133e-06, + "loss": 4.6, + "step": 118 + }, + { + "epoch": 0.006506020802864836, + "grad_norm": Infinity, + "learning_rate": 2.167577413479053e-06, + "loss": 4.0638, + "step": 119 + }, + { + "epoch": 0.00656069324658639, + "grad_norm": Infinity, + "learning_rate": 2.185792349726776e-06, + "loss": 3.5223, + "step": 120 + }, + { + "epoch": 0.0066153656903079425, + "grad_norm": Infinity, + "learning_rate": 2.204007285974499e-06, + "loss": 3.8178, + "step": 121 + }, + { + "epoch": 0.0066700381340294955, + "grad_norm": Infinity, + "learning_rate": 2.222222222222222e-06, + "loss": 4.2338, + "step": 122 + }, + { + "epoch": 0.006724710577751049, + "grad_norm": Infinity, + "learning_rate": 2.2404371584699454e-06, + "loss": 3.9519, + "step": 123 + }, + { + "epoch": 0.006779383021472602, + "grad_norm": Infinity, + "learning_rate": 2.2586520947176687e-06, + "loss": 4.4393, + "step": 124 + }, + { + "epoch": 0.006834055465194155, + "grad_norm": Infinity, + "learning_rate": 2.2768670309653916e-06, + "loss": 4.3155, + "step": 125 + }, + { + "epoch": 0.006888727908915709, + "grad_norm": Infinity, + "learning_rate": 2.295081967213115e-06, + "loss": 3.7167, + "step": 126 + }, + { + "epoch": 0.006943400352637262, + "grad_norm": Infinity, + "learning_rate": 2.313296903460838e-06, + "loss": 3.4659, + "step": 127 + }, + { + "epoch": 0.006998072796358815, + "grad_norm": Infinity, + "learning_rate": 2.331511839708561e-06, + "loss": 4.5944, + "step": 128 + }, + { + "epoch": 0.007052745240080369, + "grad_norm": Infinity, + "learning_rate": 2.3497267759562845e-06, + "loss": 3.7455, + "step": 129 + }, + { + "epoch": 0.007107417683801922, + "grad_norm": Infinity, + "learning_rate": 2.3679417122040074e-06, + "loss": 4.214, + "step": 130 + }, + { + "epoch": 0.007162090127523475, + "grad_norm": Infinity, + "learning_rate": 2.3861566484517307e-06, + "loss": 4.0683, + "step": 131 + }, + { + "epoch": 0.0072167625712450285, + "grad_norm": Infinity, + "learning_rate": 2.4043715846994536e-06, + "loss": 3.5328, + "step": 132 + }, + { + "epoch": 0.0072714350149665815, + "grad_norm": Infinity, + "learning_rate": 2.422586520947177e-06, + "loss": 4.2665, + "step": 133 + }, + { + "epoch": 0.0073261074586881345, + "grad_norm": Infinity, + "learning_rate": 2.4408014571949003e-06, + "loss": 5.0274, + "step": 134 + }, + { + "epoch": 0.007380779902409688, + "grad_norm": Infinity, + "learning_rate": 2.459016393442623e-06, + "loss": 3.8135, + "step": 135 + }, + { + "epoch": 0.007435452346131241, + "grad_norm": Infinity, + "learning_rate": 2.4772313296903465e-06, + "loss": 4.7053, + "step": 136 + }, + { + "epoch": 0.007490124789852794, + "grad_norm": Infinity, + "learning_rate": 2.4954462659380694e-06, + "loss": 3.4074, + "step": 137 + }, + { + "epoch": 0.007544797233574348, + "grad_norm": Infinity, + "learning_rate": 2.5136612021857927e-06, + "loss": 3.7243, + "step": 138 + }, + { + "epoch": 0.007599469677295901, + "grad_norm": Infinity, + "learning_rate": 2.5318761384335156e-06, + "loss": 3.9154, + "step": 139 + }, + { + "epoch": 0.007654142121017454, + "grad_norm": Infinity, + "learning_rate": 2.550091074681239e-06, + "loss": 3.8528, + "step": 140 + }, + { + "epoch": 0.007708814564739008, + "grad_norm": Infinity, + "learning_rate": 2.5683060109289622e-06, + "loss": 4.0295, + "step": 141 + }, + { + "epoch": 0.007763487008460561, + "grad_norm": Infinity, + "learning_rate": 2.586520947176685e-06, + "loss": 3.9202, + "step": 142 + }, + { + "epoch": 0.007818159452182115, + "grad_norm": Infinity, + "learning_rate": 2.604735883424408e-06, + "loss": 3.6542, + "step": 143 + }, + { + "epoch": 0.007872831895903667, + "grad_norm": Infinity, + "learning_rate": 2.6229508196721314e-06, + "loss": 4.3527, + "step": 144 + }, + { + "epoch": 0.00792750433962522, + "grad_norm": Infinity, + "learning_rate": 2.6411657559198543e-06, + "loss": 3.8664, + "step": 145 + }, + { + "epoch": 0.007982176783346773, + "grad_norm": Infinity, + "learning_rate": 2.6593806921675776e-06, + "loss": 3.8448, + "step": 146 + }, + { + "epoch": 0.008036849227068326, + "grad_norm": Infinity, + "learning_rate": 2.677595628415301e-06, + "loss": 4.1704, + "step": 147 + }, + { + "epoch": 0.00809152167078988, + "grad_norm": Infinity, + "learning_rate": 2.695810564663024e-06, + "loss": 4.0547, + "step": 148 + }, + { + "epoch": 0.008146194114511434, + "grad_norm": Infinity, + "learning_rate": 2.7140255009107467e-06, + "loss": 4.7917, + "step": 149 + }, + { + "epoch": 0.008200866558232987, + "grad_norm": Infinity, + "learning_rate": 2.7322404371584705e-06, + "loss": 3.5802, + "step": 150 + }, + { + "epoch": 0.00825553900195454, + "grad_norm": Infinity, + "learning_rate": 2.7504553734061934e-06, + "loss": 4.3456, + "step": 151 + }, + { + "epoch": 0.008310211445676093, + "grad_norm": Infinity, + "learning_rate": 2.7686703096539162e-06, + "loss": 4.2385, + "step": 152 + }, + { + "epoch": 0.008364883889397646, + "grad_norm": Infinity, + "learning_rate": 2.786885245901639e-06, + "loss": 4.3261, + "step": 153 + }, + { + "epoch": 0.008419556333119199, + "grad_norm": Infinity, + "learning_rate": 2.805100182149363e-06, + "loss": 4.0915, + "step": 154 + }, + { + "epoch": 0.008474228776840753, + "grad_norm": 1.7221764975064777e+19, + "learning_rate": 2.823315118397086e-06, + "loss": 3.4824, + "step": 155 + }, + { + "epoch": 0.008528901220562306, + "grad_norm": 110080.1171875, + "learning_rate": 2.8415300546448087e-06, + "loss": 4.2896, + "step": 156 + }, + { + "epoch": 0.00858357366428386, + "grad_norm": 920.2175903320312, + "learning_rate": 2.8597449908925324e-06, + "loss": 3.5814, + "step": 157 + }, + { + "epoch": 0.008638246108005412, + "grad_norm": 103.09797668457031, + "learning_rate": 2.8779599271402553e-06, + "loss": 2.9116, + "step": 158 + }, + { + "epoch": 0.008692918551726965, + "grad_norm": 79.56197357177734, + "learning_rate": 2.8961748633879782e-06, + "loss": 3.1517, + "step": 159 + }, + { + "epoch": 0.008747590995448518, + "grad_norm": 77.5087661743164, + "learning_rate": 2.914389799635702e-06, + "loss": 3.4689, + "step": 160 + }, + { + "epoch": 0.008802263439170073, + "grad_norm": 66.31782531738281, + "learning_rate": 2.932604735883425e-06, + "loss": 3.0315, + "step": 161 + }, + { + "epoch": 0.008856935882891626, + "grad_norm": 62.11912155151367, + "learning_rate": 2.9508196721311478e-06, + "loss": 3.2401, + "step": 162 + }, + { + "epoch": 0.008911608326613179, + "grad_norm": 58.38945007324219, + "learning_rate": 2.9690346083788707e-06, + "loss": 3.2249, + "step": 163 + }, + { + "epoch": 0.008966280770334732, + "grad_norm": 82.93097686767578, + "learning_rate": 2.9872495446265944e-06, + "loss": 3.0136, + "step": 164 + }, + { + "epoch": 0.009020953214056285, + "grad_norm": 64.07482147216797, + "learning_rate": 3.0054644808743173e-06, + "loss": 2.8243, + "step": 165 + }, + { + "epoch": 0.009075625657777838, + "grad_norm": 50.677696228027344, + "learning_rate": 3.0236794171220402e-06, + "loss": 2.9264, + "step": 166 + }, + { + "epoch": 0.009130298101499392, + "grad_norm": 74.07775115966797, + "learning_rate": 3.0418943533697635e-06, + "loss": 3.0209, + "step": 167 + }, + { + "epoch": 0.009184970545220945, + "grad_norm": 35.553462982177734, + "learning_rate": 3.0601092896174864e-06, + "loss": 3.0923, + "step": 168 + }, + { + "epoch": 0.009239642988942498, + "grad_norm": 58.810829162597656, + "learning_rate": 3.0783242258652098e-06, + "loss": 3.1906, + "step": 169 + }, + { + "epoch": 0.009294315432664051, + "grad_norm": 93.57482147216797, + "learning_rate": 3.0965391621129327e-06, + "loss": 2.8943, + "step": 170 + }, + { + "epoch": 0.009348987876385604, + "grad_norm": 32.62672805786133, + "learning_rate": 3.114754098360656e-06, + "loss": 3.1283, + "step": 171 + }, + { + "epoch": 0.009403660320107157, + "grad_norm": 39.19892883300781, + "learning_rate": 3.132969034608379e-06, + "loss": 3.008, + "step": 172 + }, + { + "epoch": 0.009458332763828712, + "grad_norm": 26.24127197265625, + "learning_rate": 3.1511839708561022e-06, + "loss": 2.7172, + "step": 173 + }, + { + "epoch": 0.009513005207550265, + "grad_norm": 65.6766128540039, + "learning_rate": 3.1693989071038255e-06, + "loss": 2.9175, + "step": 174 + }, + { + "epoch": 0.009567677651271818, + "grad_norm": 27.74483871459961, + "learning_rate": 3.1876138433515484e-06, + "loss": 2.6104, + "step": 175 + }, + { + "epoch": 0.00962235009499337, + "grad_norm": 37.03934860229492, + "learning_rate": 3.2058287795992713e-06, + "loss": 2.9681, + "step": 176 + }, + { + "epoch": 0.009677022538714924, + "grad_norm": 31.6248836517334, + "learning_rate": 3.224043715846995e-06, + "loss": 2.4844, + "step": 177 + }, + { + "epoch": 0.009731694982436477, + "grad_norm": 35.30421829223633, + "learning_rate": 3.242258652094718e-06, + "loss": 2.5279, + "step": 178 + }, + { + "epoch": 0.009786367426158031, + "grad_norm": 29.215970993041992, + "learning_rate": 3.260473588342441e-06, + "loss": 2.6121, + "step": 179 + }, + { + "epoch": 0.009841039869879584, + "grad_norm": 66.76720428466797, + "learning_rate": 3.2786885245901638e-06, + "loss": 3.1396, + "step": 180 + }, + { + "epoch": 0.009895712313601137, + "grad_norm": 139.1005401611328, + "learning_rate": 3.2969034608378875e-06, + "loss": 2.5522, + "step": 181 + }, + { + "epoch": 0.00995038475732269, + "grad_norm": 37.083343505859375, + "learning_rate": 3.3151183970856104e-06, + "loss": 2.5421, + "step": 182 + }, + { + "epoch": 0.010005057201044243, + "grad_norm": 100.15026092529297, + "learning_rate": 3.3333333333333333e-06, + "loss": 2.4301, + "step": 183 + }, + { + "epoch": 0.010059729644765796, + "grad_norm": 47.618003845214844, + "learning_rate": 3.351548269581057e-06, + "loss": 2.8237, + "step": 184 + }, + { + "epoch": 0.010114402088487351, + "grad_norm": 69.16098022460938, + "learning_rate": 3.36976320582878e-06, + "loss": 2.8366, + "step": 185 + }, + { + "epoch": 0.010169074532208904, + "grad_norm": 64.8238754272461, + "learning_rate": 3.387978142076503e-06, + "loss": 2.7801, + "step": 186 + }, + { + "epoch": 0.010223746975930457, + "grad_norm": 59.27120590209961, + "learning_rate": 3.4061930783242266e-06, + "loss": 2.4837, + "step": 187 + }, + { + "epoch": 0.01027841941965201, + "grad_norm": 25.283275604248047, + "learning_rate": 3.4244080145719495e-06, + "loss": 2.7218, + "step": 188 + }, + { + "epoch": 0.010333091863373563, + "grad_norm": 63.152584075927734, + "learning_rate": 3.4426229508196724e-06, + "loss": 2.5888, + "step": 189 + }, + { + "epoch": 0.010387764307095116, + "grad_norm": 24.277202606201172, + "learning_rate": 3.4608378870673953e-06, + "loss": 2.5722, + "step": 190 + }, + { + "epoch": 0.01044243675081667, + "grad_norm": 111.66283416748047, + "learning_rate": 3.4790528233151186e-06, + "loss": 2.4767, + "step": 191 + }, + { + "epoch": 0.010497109194538223, + "grad_norm": 136.1307830810547, + "learning_rate": 3.497267759562842e-06, + "loss": 2.4771, + "step": 192 + }, + { + "epoch": 0.010551781638259776, + "grad_norm": 48.55716323852539, + "learning_rate": 3.515482695810565e-06, + "loss": 2.4357, + "step": 193 + }, + { + "epoch": 0.01060645408198133, + "grad_norm": 40.00905990600586, + "learning_rate": 3.533697632058288e-06, + "loss": 2.6353, + "step": 194 + }, + { + "epoch": 0.010661126525702882, + "grad_norm": 62.75746154785156, + "learning_rate": 3.551912568306011e-06, + "loss": 2.5401, + "step": 195 + }, + { + "epoch": 0.010715798969424435, + "grad_norm": 46.84180450439453, + "learning_rate": 3.5701275045537344e-06, + "loss": 2.3471, + "step": 196 + }, + { + "epoch": 0.01077047141314599, + "grad_norm": 76.8486328125, + "learning_rate": 3.5883424408014573e-06, + "loss": 2.7244, + "step": 197 + }, + { + "epoch": 0.010825143856867543, + "grad_norm": 34.52238464355469, + "learning_rate": 3.6065573770491806e-06, + "loss": 2.3896, + "step": 198 + }, + { + "epoch": 0.010879816300589096, + "grad_norm": 82.75934600830078, + "learning_rate": 3.6247723132969035e-06, + "loss": 2.4274, + "step": 199 + }, + { + "epoch": 0.010934488744310649, + "grad_norm": 79.4642333984375, + "learning_rate": 3.642987249544627e-06, + "loss": 2.6657, + "step": 200 + }, + { + "epoch": 0.010989161188032202, + "grad_norm": 35.99546432495117, + "learning_rate": 3.66120218579235e-06, + "loss": 2.2929, + "step": 201 + }, + { + "epoch": 0.011043833631753755, + "grad_norm": 91.60140228271484, + "learning_rate": 3.679417122040073e-06, + "loss": 2.2779, + "step": 202 + }, + { + "epoch": 0.01109850607547531, + "grad_norm": 68.09001922607422, + "learning_rate": 3.697632058287796e-06, + "loss": 2.4914, + "step": 203 + }, + { + "epoch": 0.011153178519196862, + "grad_norm": 25.333038330078125, + "learning_rate": 3.7158469945355197e-06, + "loss": 2.4129, + "step": 204 + }, + { + "epoch": 0.011207850962918415, + "grad_norm": 61.558349609375, + "learning_rate": 3.7340619307832426e-06, + "loss": 2.3672, + "step": 205 + }, + { + "epoch": 0.011262523406639968, + "grad_norm": 164.27842712402344, + "learning_rate": 3.7522768670309655e-06, + "loss": 2.6676, + "step": 206 + }, + { + "epoch": 0.011317195850361521, + "grad_norm": 2112.04296875, + "learning_rate": 3.7704918032786884e-06, + "loss": 2.3204, + "step": 207 + }, + { + "epoch": 0.011371868294083074, + "grad_norm": 130.11947631835938, + "learning_rate": 3.788706739526412e-06, + "loss": 2.3863, + "step": 208 + }, + { + "epoch": 0.011426540737804629, + "grad_norm": 114.12309265136719, + "learning_rate": 3.806921675774135e-06, + "loss": 2.3434, + "step": 209 + }, + { + "epoch": 0.011481213181526182, + "grad_norm": 97.15536499023438, + "learning_rate": 3.825136612021858e-06, + "loss": 2.4489, + "step": 210 + }, + { + "epoch": 0.011535885625247735, + "grad_norm": 60.718109130859375, + "learning_rate": 3.843351548269581e-06, + "loss": 2.2693, + "step": 211 + }, + { + "epoch": 0.011590558068969288, + "grad_norm": 1544.042236328125, + "learning_rate": 3.861566484517305e-06, + "loss": 2.277, + "step": 212 + }, + { + "epoch": 0.01164523051269084, + "grad_norm": 916.0305786132812, + "learning_rate": 3.879781420765028e-06, + "loss": 2.4567, + "step": 213 + }, + { + "epoch": 0.011699902956412394, + "grad_norm": 117.54584503173828, + "learning_rate": 3.897996357012751e-06, + "loss": 2.1572, + "step": 214 + }, + { + "epoch": 0.011754575400133948, + "grad_norm": 62.696075439453125, + "learning_rate": 3.916211293260474e-06, + "loss": 2.2348, + "step": 215 + }, + { + "epoch": 0.011809247843855501, + "grad_norm": 58.908512115478516, + "learning_rate": 3.934426229508197e-06, + "loss": 2.3924, + "step": 216 + }, + { + "epoch": 0.011863920287577054, + "grad_norm": 64.70187377929688, + "learning_rate": 3.9526411657559195e-06, + "loss": 2.4505, + "step": 217 + }, + { + "epoch": 0.011918592731298607, + "grad_norm": 68.65525817871094, + "learning_rate": 3.970856102003644e-06, + "loss": 2.322, + "step": 218 + }, + { + "epoch": 0.01197326517502016, + "grad_norm": 49.234039306640625, + "learning_rate": 3.989071038251366e-06, + "loss": 2.2917, + "step": 219 + }, + { + "epoch": 0.012027937618741713, + "grad_norm": 59.741981506347656, + "learning_rate": 4.0072859744990895e-06, + "loss": 2.4114, + "step": 220 + }, + { + "epoch": 0.012082610062463268, + "grad_norm": 2432.040283203125, + "learning_rate": 4.025500910746813e-06, + "loss": 2.3625, + "step": 221 + }, + { + "epoch": 0.01213728250618482, + "grad_norm": 117.08010864257812, + "learning_rate": 4.043715846994536e-06, + "loss": 2.3671, + "step": 222 + }, + { + "epoch": 0.012191954949906374, + "grad_norm": 123.0572280883789, + "learning_rate": 4.061930783242259e-06, + "loss": 2.2344, + "step": 223 + }, + { + "epoch": 0.012246627393627927, + "grad_norm": 155.05645751953125, + "learning_rate": 4.080145719489983e-06, + "loss": 2.2487, + "step": 224 + }, + { + "epoch": 0.01230129983734948, + "grad_norm": 86.58550262451172, + "learning_rate": 4.098360655737705e-06, + "loss": 2.2237, + "step": 225 + }, + { + "epoch": 0.012355972281071033, + "grad_norm": 68.1767578125, + "learning_rate": 4.1165755919854286e-06, + "loss": 2.0832, + "step": 226 + }, + { + "epoch": 0.012410644724792587, + "grad_norm": 83.1635971069336, + "learning_rate": 4.134790528233151e-06, + "loss": 2.5196, + "step": 227 + }, + { + "epoch": 0.01246531716851414, + "grad_norm": 56.18354415893555, + "learning_rate": 4.153005464480875e-06, + "loss": 2.4457, + "step": 228 + }, + { + "epoch": 0.012519989612235693, + "grad_norm": 1560.03369140625, + "learning_rate": 4.171220400728598e-06, + "loss": 2.3274, + "step": 229 + }, + { + "epoch": 0.012574662055957246, + "grad_norm": 1160.0335693359375, + "learning_rate": 4.189435336976321e-06, + "loss": 2.4168, + "step": 230 + }, + { + "epoch": 0.012629334499678799, + "grad_norm": 692.0425415039062, + "learning_rate": 4.207650273224044e-06, + "loss": 2.2578, + "step": 231 + }, + { + "epoch": 0.012684006943400352, + "grad_norm": 173.1077117919922, + "learning_rate": 4.225865209471768e-06, + "loss": 2.1735, + "step": 232 + }, + { + "epoch": 0.012738679387121907, + "grad_norm": 90.77042388916016, + "learning_rate": 4.24408014571949e-06, + "loss": 2.402, + "step": 233 + }, + { + "epoch": 0.01279335183084346, + "grad_norm": 52.62535095214844, + "learning_rate": 4.2622950819672135e-06, + "loss": 2.2919, + "step": 234 + }, + { + "epoch": 0.012848024274565013, + "grad_norm": 92.61150360107422, + "learning_rate": 4.280510018214937e-06, + "loss": 2.3392, + "step": 235 + }, + { + "epoch": 0.012902696718286566, + "grad_norm": 54.456111907958984, + "learning_rate": 4.29872495446266e-06, + "loss": 2.0708, + "step": 236 + }, + { + "epoch": 0.012957369162008119, + "grad_norm": 38.25407791137695, + "learning_rate": 4.316939890710383e-06, + "loss": 2.1812, + "step": 237 + }, + { + "epoch": 0.013012041605729672, + "grad_norm": 115.5916976928711, + "learning_rate": 4.335154826958106e-06, + "loss": 2.3965, + "step": 238 + }, + { + "epoch": 0.013066714049451226, + "grad_norm": 59.61564254760742, + "learning_rate": 4.353369763205829e-06, + "loss": 2.255, + "step": 239 + }, + { + "epoch": 0.01312138649317278, + "grad_norm": 44.61579513549805, + "learning_rate": 4.371584699453552e-06, + "loss": 2.4387, + "step": 240 + }, + { + "epoch": 0.013176058936894332, + "grad_norm": 237.02845764160156, + "learning_rate": 4.389799635701276e-06, + "loss": 2.0441, + "step": 241 + }, + { + "epoch": 0.013230731380615885, + "grad_norm": 215.1884307861328, + "learning_rate": 4.408014571948998e-06, + "loss": 2.5254, + "step": 242 + }, + { + "epoch": 0.013285403824337438, + "grad_norm": 482.0247802734375, + "learning_rate": 4.426229508196722e-06, + "loss": 2.2618, + "step": 243 + }, + { + "epoch": 0.013340076268058991, + "grad_norm": 144.0541534423828, + "learning_rate": 4.444444444444444e-06, + "loss": 2.175, + "step": 244 + }, + { + "epoch": 0.013394748711780544, + "grad_norm": 105.08518981933594, + "learning_rate": 4.462659380692168e-06, + "loss": 2.0775, + "step": 245 + }, + { + "epoch": 0.013449421155502099, + "grad_norm": 234.2315673828125, + "learning_rate": 4.480874316939891e-06, + "loss": 2.7041, + "step": 246 + }, + { + "epoch": 0.013504093599223652, + "grad_norm": 106.18502044677734, + "learning_rate": 4.499089253187614e-06, + "loss": 2.4622, + "step": 247 + }, + { + "epoch": 0.013558766042945205, + "grad_norm": 79.70233154296875, + "learning_rate": 4.5173041894353374e-06, + "loss": 2.2111, + "step": 248 + }, + { + "epoch": 0.013613438486666758, + "grad_norm": 422.02093505859375, + "learning_rate": 4.535519125683061e-06, + "loss": 2.1746, + "step": 249 + }, + { + "epoch": 0.01366811093038831, + "grad_norm": 294.0169677734375, + "learning_rate": 4.553734061930783e-06, + "loss": 2.3253, + "step": 250 + }, + { + "epoch": 0.013722783374109863, + "grad_norm": 56.21427536010742, + "learning_rate": 4.571948998178507e-06, + "loss": 2.1363, + "step": 251 + }, + { + "epoch": 0.013777455817831418, + "grad_norm": 53.64483642578125, + "learning_rate": 4.59016393442623e-06, + "loss": 2.206, + "step": 252 + }, + { + "epoch": 0.013832128261552971, + "grad_norm": 41.081058502197266, + "learning_rate": 4.608378870673953e-06, + "loss": 2.206, + "step": 253 + }, + { + "epoch": 0.013886800705274524, + "grad_norm": 53.386470794677734, + "learning_rate": 4.626593806921676e-06, + "loss": 2.1648, + "step": 254 + }, + { + "epoch": 0.013941473148996077, + "grad_norm": 71.7169189453125, + "learning_rate": 4.6448087431694e-06, + "loss": 2.2839, + "step": 255 + }, + { + "epoch": 0.01399614559271763, + "grad_norm": 63.19901657104492, + "learning_rate": 4.663023679417122e-06, + "loss": 2.4377, + "step": 256 + }, + { + "epoch": 0.014050818036439183, + "grad_norm": 736.0277709960938, + "learning_rate": 4.681238615664846e-06, + "loss": 2.3003, + "step": 257 + }, + { + "epoch": 0.014105490480160738, + "grad_norm": 1504.0269775390625, + "learning_rate": 4.699453551912569e-06, + "loss": 2.3203, + "step": 258 + }, + { + "epoch": 0.01416016292388229, + "grad_norm": 506.0145263671875, + "learning_rate": 4.717668488160292e-06, + "loss": 2.1587, + "step": 259 + }, + { + "epoch": 0.014214835367603844, + "grad_norm": 210.07937622070312, + "learning_rate": 4.735883424408015e-06, + "loss": 2.2543, + "step": 260 + }, + { + "epoch": 0.014269507811325396, + "grad_norm": 200.09584045410156, + "learning_rate": 4.754098360655738e-06, + "loss": 2.3717, + "step": 261 + }, + { + "epoch": 0.01432418025504695, + "grad_norm": 89.09048461914062, + "learning_rate": 4.772313296903461e-06, + "loss": 2.2397, + "step": 262 + }, + { + "epoch": 0.014378852698768502, + "grad_norm": 82.1024398803711, + "learning_rate": 4.790528233151184e-06, + "loss": 2.332, + "step": 263 + }, + { + "epoch": 0.014433525142490057, + "grad_norm": 73.08612060546875, + "learning_rate": 4.808743169398907e-06, + "loss": 2.2522, + "step": 264 + }, + { + "epoch": 0.01448819758621161, + "grad_norm": 156.12680053710938, + "learning_rate": 4.8269581056466305e-06, + "loss": 2.3213, + "step": 265 + }, + { + "epoch": 0.014542870029933163, + "grad_norm": 116.62425231933594, + "learning_rate": 4.845173041894354e-06, + "loss": 2.1231, + "step": 266 + }, + { + "epoch": 0.014597542473654716, + "grad_norm": 164.10169982910156, + "learning_rate": 4.863387978142076e-06, + "loss": 2.2355, + "step": 267 + }, + { + "epoch": 0.014652214917376269, + "grad_norm": 199.03456115722656, + "learning_rate": 4.8816029143898005e-06, + "loss": 2.257, + "step": 268 + }, + { + "epoch": 0.014706887361097822, + "grad_norm": 253.03396606445312, + "learning_rate": 4.899817850637523e-06, + "loss": 2.1137, + "step": 269 + }, + { + "epoch": 0.014761559804819377, + "grad_norm": 149.04562377929688, + "learning_rate": 4.918032786885246e-06, + "loss": 2.2583, + "step": 270 + }, + { + "epoch": 0.01481623224854093, + "grad_norm": 151.10055541992188, + "learning_rate": 4.936247723132969e-06, + "loss": 2.1479, + "step": 271 + }, + { + "epoch": 0.014870904692262482, + "grad_norm": 90.08025360107422, + "learning_rate": 4.954462659380693e-06, + "loss": 2.3516, + "step": 272 + }, + { + "epoch": 0.014925577135984035, + "grad_norm": 1020.0331420898438, + "learning_rate": 4.9726775956284154e-06, + "loss": 2.4327, + "step": 273 + }, + { + "epoch": 0.014980249579705588, + "grad_norm": 516.024658203125, + "learning_rate": 4.990892531876139e-06, + "loss": 2.1892, + "step": 274 + }, + { + "epoch": 0.015034922023427141, + "grad_norm": 69.09666442871094, + "learning_rate": 5.009107468123861e-06, + "loss": 2.4538, + "step": 275 + }, + { + "epoch": 0.015089594467148696, + "grad_norm": 74.62389373779297, + "learning_rate": 5.027322404371585e-06, + "loss": 2.2597, + "step": 276 + }, + { + "epoch": 0.015144266910870249, + "grad_norm": 39.60612869262695, + "learning_rate": 5.045537340619309e-06, + "loss": 2.3213, + "step": 277 + }, + { + "epoch": 0.015198939354591802, + "grad_norm": 116.1985855102539, + "learning_rate": 5.063752276867031e-06, + "loss": 2.576, + "step": 278 + }, + { + "epoch": 0.015253611798313355, + "grad_norm": 87.60281372070312, + "learning_rate": 5.0819672131147545e-06, + "loss": 2.2706, + "step": 279 + }, + { + "epoch": 0.015308284242034908, + "grad_norm": 720.0213012695312, + "learning_rate": 5.100182149362478e-06, + "loss": 2.1979, + "step": 280 + }, + { + "epoch": 0.01536295668575646, + "grad_norm": 1456.033447265625, + "learning_rate": 5.1183970856102e-06, + "loss": 2.2262, + "step": 281 + }, + { + "epoch": 0.015417629129478016, + "grad_norm": 229.03140258789062, + "learning_rate": 5.1366120218579245e-06, + "loss": 2.3418, + "step": 282 + }, + { + "epoch": 0.015472301573199568, + "grad_norm": 184.0536651611328, + "learning_rate": 5.154826958105648e-06, + "loss": 2.3116, + "step": 283 + }, + { + "epoch": 0.015526974016921121, + "grad_norm": 173.05467224121094, + "learning_rate": 5.17304189435337e-06, + "loss": 2.0792, + "step": 284 + }, + { + "epoch": 0.015581646460642674, + "grad_norm": 216.1102294921875, + "learning_rate": 5.191256830601094e-06, + "loss": 2.5086, + "step": 285 + }, + { + "epoch": 0.01563631890436423, + "grad_norm": 62.141780853271484, + "learning_rate": 5.209471766848816e-06, + "loss": 2.2765, + "step": 286 + }, + { + "epoch": 0.01569099134808578, + "grad_norm": 86.146728515625, + "learning_rate": 5.227686703096539e-06, + "loss": 2.0431, + "step": 287 + }, + { + "epoch": 0.015745663791807335, + "grad_norm": 262.00994873046875, + "learning_rate": 5.245901639344263e-06, + "loss": 2.22, + "step": 288 + }, + { + "epoch": 0.015800336235528886, + "grad_norm": 202.08021545410156, + "learning_rate": 5.264116575591985e-06, + "loss": 2.3803, + "step": 289 + }, + { + "epoch": 0.01585500867925044, + "grad_norm": 77.10234832763672, + "learning_rate": 5.2823315118397085e-06, + "loss": 2.0133, + "step": 290 + }, + { + "epoch": 0.015909681122971996, + "grad_norm": 42.21597671508789, + "learning_rate": 5.300546448087433e-06, + "loss": 2.3603, + "step": 291 + }, + { + "epoch": 0.015964353566693547, + "grad_norm": 149.15965270996094, + "learning_rate": 5.318761384335155e-06, + "loss": 2.0995, + "step": 292 + }, + { + "epoch": 0.0160190260104151, + "grad_norm": 190.03305053710938, + "learning_rate": 5.3369763205828785e-06, + "loss": 2.1008, + "step": 293 + }, + { + "epoch": 0.016073698454136653, + "grad_norm": 83.13412475585938, + "learning_rate": 5.355191256830602e-06, + "loss": 2.1026, + "step": 294 + }, + { + "epoch": 0.016128370897858207, + "grad_norm": 106.12203216552734, + "learning_rate": 5.373406193078324e-06, + "loss": 2.0958, + "step": 295 + }, + { + "epoch": 0.01618304334157976, + "grad_norm": 69.06713104248047, + "learning_rate": 5.391621129326048e-06, + "loss": 2.0874, + "step": 296 + }, + { + "epoch": 0.016237715785301313, + "grad_norm": 1112.033203125, + "learning_rate": 5.409836065573772e-06, + "loss": 2.4326, + "step": 297 + }, + { + "epoch": 0.016292388229022868, + "grad_norm": 350.02410888671875, + "learning_rate": 5.428051001821493e-06, + "loss": 2.3903, + "step": 298 + }, + { + "epoch": 0.01634706067274442, + "grad_norm": 400.0954895019531, + "learning_rate": 5.446265938069218e-06, + "loss": 2.3526, + "step": 299 + }, + { + "epoch": 0.016401733116465974, + "grad_norm": 181.06124877929688, + "learning_rate": 5.464480874316941e-06, + "loss": 1.9988, + "step": 300 + }, + { + "epoch": 0.016456405560187525, + "grad_norm": 142.1159210205078, + "learning_rate": 5.482695810564663e-06, + "loss": 2.3344, + "step": 301 + }, + { + "epoch": 0.01651107800390908, + "grad_norm": 139.15847778320312, + "learning_rate": 5.500910746812387e-06, + "loss": 2.2094, + "step": 302 + }, + { + "epoch": 0.016565750447630635, + "grad_norm": 157.122314453125, + "learning_rate": 5.519125683060109e-06, + "loss": 2.2887, + "step": 303 + }, + { + "epoch": 0.016620422891352186, + "grad_norm": 232.04971313476562, + "learning_rate": 5.5373406193078325e-06, + "loss": 1.9521, + "step": 304 + }, + { + "epoch": 0.01667509533507374, + "grad_norm": 164.04852294921875, + "learning_rate": 5.555555555555557e-06, + "loss": 2.1428, + "step": 305 + }, + { + "epoch": 0.01672976777879529, + "grad_norm": 312.1055908203125, + "learning_rate": 5.573770491803278e-06, + "loss": 2.5366, + "step": 306 + }, + { + "epoch": 0.016784440222516846, + "grad_norm": 54.34020233154297, + "learning_rate": 5.5919854280510025e-06, + "loss": 2.0249, + "step": 307 + }, + { + "epoch": 0.016839112666238398, + "grad_norm": 448.06951904296875, + "learning_rate": 5.610200364298726e-06, + "loss": 2.341, + "step": 308 + }, + { + "epoch": 0.016893785109959952, + "grad_norm": 728.0336303710938, + "learning_rate": 5.628415300546448e-06, + "loss": 2.2647, + "step": 309 + }, + { + "epoch": 0.016948457553681507, + "grad_norm": 1216.0428466796875, + "learning_rate": 5.646630236794172e-06, + "loss": 1.9016, + "step": 310 + }, + { + "epoch": 0.017003129997403058, + "grad_norm": 164.05328369140625, + "learning_rate": 5.664845173041895e-06, + "loss": 2.346, + "step": 311 + }, + { + "epoch": 0.017057802441124613, + "grad_norm": 127.56770324707031, + "learning_rate": 5.683060109289617e-06, + "loss": 2.1768, + "step": 312 + }, + { + "epoch": 0.017112474884846164, + "grad_norm": 104.07244110107422, + "learning_rate": 5.701275045537341e-06, + "loss": 2.2042, + "step": 313 + }, + { + "epoch": 0.01716714732856772, + "grad_norm": 161.12982177734375, + "learning_rate": 5.719489981785065e-06, + "loss": 2.2461, + "step": 314 + }, + { + "epoch": 0.017221819772289274, + "grad_norm": 57.1138916015625, + "learning_rate": 5.737704918032787e-06, + "loss": 2.1486, + "step": 315 + }, + { + "epoch": 0.017276492216010825, + "grad_norm": 290.0227966308594, + "learning_rate": 5.755919854280511e-06, + "loss": 2.1997, + "step": 316 + }, + { + "epoch": 0.01733116465973238, + "grad_norm": 167.0419464111328, + "learning_rate": 5.774134790528234e-06, + "loss": 1.9737, + "step": 317 + }, + { + "epoch": 0.01738583710345393, + "grad_norm": 210.04949951171875, + "learning_rate": 5.7923497267759565e-06, + "loss": 2.1381, + "step": 318 + }, + { + "epoch": 0.017440509547175485, + "grad_norm": 42.12990188598633, + "learning_rate": 5.81056466302368e-06, + "loss": 2.0099, + "step": 319 + }, + { + "epoch": 0.017495181990897037, + "grad_norm": 61.846195220947266, + "learning_rate": 5.828779599271404e-06, + "loss": 2.1127, + "step": 320 + }, + { + "epoch": 0.01754985443461859, + "grad_norm": 286.0775146484375, + "learning_rate": 5.846994535519126e-06, + "loss": 2.3666, + "step": 321 + }, + { + "epoch": 0.017604526878340146, + "grad_norm": 660.0348510742188, + "learning_rate": 5.86520947176685e-06, + "loss": 2.1506, + "step": 322 + }, + { + "epoch": 0.017659199322061697, + "grad_norm": 135.05392456054688, + "learning_rate": 5.883424408014572e-06, + "loss": 2.0342, + "step": 323 + }, + { + "epoch": 0.017713871765783252, + "grad_norm": 80.08454895019531, + "learning_rate": 5.9016393442622956e-06, + "loss": 2.1864, + "step": 324 + }, + { + "epoch": 0.017768544209504803, + "grad_norm": 61.33713150024414, + "learning_rate": 5.919854280510019e-06, + "loss": 1.9773, + "step": 325 + }, + { + "epoch": 0.017823216653226358, + "grad_norm": 115.6012954711914, + "learning_rate": 5.938069216757741e-06, + "loss": 2.0014, + "step": 326 + }, + { + "epoch": 0.017877889096947912, + "grad_norm": 168.07086181640625, + "learning_rate": 5.956284153005465e-06, + "loss": 2.3268, + "step": 327 + }, + { + "epoch": 0.017932561540669464, + "grad_norm": 536.0667114257812, + "learning_rate": 5.974499089253189e-06, + "loss": 2.2812, + "step": 328 + }, + { + "epoch": 0.01798723398439102, + "grad_norm": 3056.0673828125, + "learning_rate": 5.9927140255009105e-06, + "loss": 2.351, + "step": 329 + }, + { + "epoch": 0.01804190642811257, + "grad_norm": 1184.031005859375, + "learning_rate": 6.010928961748635e-06, + "loss": 2.0082, + "step": 330 + }, + { + "epoch": 0.018096578871834124, + "grad_norm": 81.06494140625, + "learning_rate": 6.029143897996358e-06, + "loss": 2.3003, + "step": 331 + }, + { + "epoch": 0.018151251315555676, + "grad_norm": 106.09380340576172, + "learning_rate": 6.0473588342440805e-06, + "loss": 2.4319, + "step": 332 + }, + { + "epoch": 0.01820592375927723, + "grad_norm": 89.59012603759766, + "learning_rate": 6.065573770491804e-06, + "loss": 2.1698, + "step": 333 + }, + { + "epoch": 0.018260596202998785, + "grad_norm": 129.07919311523438, + "learning_rate": 6.083788706739527e-06, + "loss": 2.1391, + "step": 334 + }, + { + "epoch": 0.018315268646720336, + "grad_norm": 3008.0888671875, + "learning_rate": 6.1020036429872496e-06, + "loss": 2.0973, + "step": 335 + }, + { + "epoch": 0.01836994109044189, + "grad_norm": 2880.08251953125, + "learning_rate": 6.120218579234973e-06, + "loss": 2.2624, + "step": 336 + }, + { + "epoch": 0.018424613534163442, + "grad_norm": 380.0845642089844, + "learning_rate": 6.138433515482697e-06, + "loss": 2.3464, + "step": 337 + }, + { + "epoch": 0.018479285977884997, + "grad_norm": 344.04168701171875, + "learning_rate": 6.1566484517304195e-06, + "loss": 2.0508, + "step": 338 + }, + { + "epoch": 0.01853395842160655, + "grad_norm": 96.06484985351562, + "learning_rate": 6.174863387978143e-06, + "loss": 2.0167, + "step": 339 + }, + { + "epoch": 0.018588630865328103, + "grad_norm": 38.842464447021484, + "learning_rate": 6.193078324225865e-06, + "loss": 2.0625, + "step": 340 + }, + { + "epoch": 0.018643303309049657, + "grad_norm": 54.36257553100586, + "learning_rate": 6.211293260473589e-06, + "loss": 2.3223, + "step": 341 + }, + { + "epoch": 0.01869797575277121, + "grad_norm": 94.08599090576172, + "learning_rate": 6.229508196721312e-06, + "loss": 2.094, + "step": 342 + }, + { + "epoch": 0.018752648196492763, + "grad_norm": 35.74746322631836, + "learning_rate": 6.2477231329690345e-06, + "loss": 2.1348, + "step": 343 + }, + { + "epoch": 0.018807320640214314, + "grad_norm": 380.0302734375, + "learning_rate": 6.265938069216758e-06, + "loss": 2.1062, + "step": 344 + }, + { + "epoch": 0.01886199308393587, + "grad_norm": 488.0375671386719, + "learning_rate": 6.284153005464482e-06, + "loss": 2.0765, + "step": 345 + }, + { + "epoch": 0.018916665527657424, + "grad_norm": 290.03961181640625, + "learning_rate": 6.3023679417122044e-06, + "loss": 2.1583, + "step": 346 + }, + { + "epoch": 0.018971337971378975, + "grad_norm": 81.55799865722656, + "learning_rate": 6.320582877959928e-06, + "loss": 2.1092, + "step": 347 + }, + { + "epoch": 0.01902601041510053, + "grad_norm": 398.1293029785156, + "learning_rate": 6.338797814207651e-06, + "loss": 2.3873, + "step": 348 + }, + { + "epoch": 0.01908068285882208, + "grad_norm": 968.0424194335938, + "learning_rate": 6.3570127504553735e-06, + "loss": 2.1844, + "step": 349 + }, + { + "epoch": 0.019135355302543636, + "grad_norm": 740.0535278320312, + "learning_rate": 6.375227686703097e-06, + "loss": 2.1353, + "step": 350 + }, + { + "epoch": 0.01919002774626519, + "grad_norm": 188.09739685058594, + "learning_rate": 6.393442622950821e-06, + "loss": 2.3837, + "step": 351 + }, + { + "epoch": 0.01924470018998674, + "grad_norm": 66.59907531738281, + "learning_rate": 6.411657559198543e-06, + "loss": 2.2442, + "step": 352 + }, + { + "epoch": 0.019299372633708296, + "grad_norm": 36.62739562988281, + "learning_rate": 6.429872495446267e-06, + "loss": 2.1417, + "step": 353 + }, + { + "epoch": 0.019354045077429848, + "grad_norm": 64.5729751586914, + "learning_rate": 6.44808743169399e-06, + "loss": 2.317, + "step": 354 + }, + { + "epoch": 0.019408717521151402, + "grad_norm": 109.56656646728516, + "learning_rate": 6.466302367941713e-06, + "loss": 2.2312, + "step": 355 + }, + { + "epoch": 0.019463389964872953, + "grad_norm": 1424.073486328125, + "learning_rate": 6.484517304189436e-06, + "loss": 2.02, + "step": 356 + }, + { + "epoch": 0.019518062408594508, + "grad_norm": 1512.0718994140625, + "learning_rate": 6.5027322404371584e-06, + "loss": 2.372, + "step": 357 + }, + { + "epoch": 0.019572734852316063, + "grad_norm": 225.0542449951172, + "learning_rate": 6.520947176684882e-06, + "loss": 2.2185, + "step": 358 + }, + { + "epoch": 0.019627407296037614, + "grad_norm": 134.03750610351562, + "learning_rate": 6.539162112932605e-06, + "loss": 2.088, + "step": 359 + }, + { + "epoch": 0.01968207973975917, + "grad_norm": 54.03818893432617, + "learning_rate": 6.5573770491803276e-06, + "loss": 2.4044, + "step": 360 + }, + { + "epoch": 0.01973675218348072, + "grad_norm": 108.06438446044922, + "learning_rate": 6.575591985428052e-06, + "loss": 2.2113, + "step": 361 + }, + { + "epoch": 0.019791424627202275, + "grad_norm": 105.58277893066406, + "learning_rate": 6.593806921675775e-06, + "loss": 2.3233, + "step": 362 + }, + { + "epoch": 0.019846097070923826, + "grad_norm": 112.11173248291016, + "learning_rate": 6.6120218579234975e-06, + "loss": 2.4842, + "step": 363 + }, + { + "epoch": 0.01990076951464538, + "grad_norm": 154.0957794189453, + "learning_rate": 6.630236794171221e-06, + "loss": 2.0805, + "step": 364 + }, + { + "epoch": 0.019955441958366935, + "grad_norm": 217.04029846191406, + "learning_rate": 6.648451730418944e-06, + "loss": 1.9903, + "step": 365 + }, + { + "epoch": 0.020010114402088486, + "grad_norm": 127.07976531982422, + "learning_rate": 6.666666666666667e-06, + "loss": 2.0758, + "step": 366 + }, + { + "epoch": 0.02006478684581004, + "grad_norm": 128.07859802246094, + "learning_rate": 6.68488160291439e-06, + "loss": 1.7818, + "step": 367 + }, + { + "epoch": 0.020119459289531592, + "grad_norm": 47.70826721191406, + "learning_rate": 6.703096539162114e-06, + "loss": 2.025, + "step": 368 + }, + { + "epoch": 0.020174131733253147, + "grad_norm": 476.0486755371094, + "learning_rate": 6.721311475409837e-06, + "loss": 2.2529, + "step": 369 + }, + { + "epoch": 0.020228804176974702, + "grad_norm": 980.0654296875, + "learning_rate": 6.73952641165756e-06, + "loss": 2.1218, + "step": 370 + }, + { + "epoch": 0.020283476620696253, + "grad_norm": 143.0380401611328, + "learning_rate": 6.757741347905283e-06, + "loss": 2.0816, + "step": 371 + }, + { + "epoch": 0.020338149064417808, + "grad_norm": 156.0684051513672, + "learning_rate": 6.775956284153006e-06, + "loss": 2.1422, + "step": 372 + }, + { + "epoch": 0.02039282150813936, + "grad_norm": 60.340919494628906, + "learning_rate": 6.794171220400729e-06, + "loss": 1.9418, + "step": 373 + }, + { + "epoch": 0.020447493951860914, + "grad_norm": 164.0865020751953, + "learning_rate": 6.812386156648453e-06, + "loss": 2.3902, + "step": 374 + }, + { + "epoch": 0.020502166395582465, + "grad_norm": 2416.14208984375, + "learning_rate": 6.830601092896175e-06, + "loss": 2.2094, + "step": 375 + }, + { + "epoch": 0.02055683883930402, + "grad_norm": 2096.120849609375, + "learning_rate": 6.848816029143899e-06, + "loss": 2.1259, + "step": 376 + }, + { + "epoch": 0.020611511283025574, + "grad_norm": 93.07075500488281, + "learning_rate": 6.8670309653916215e-06, + "loss": 2.2048, + "step": 377 + }, + { + "epoch": 0.020666183726747125, + "grad_norm": 80.58942413330078, + "learning_rate": 6.885245901639345e-06, + "loss": 1.9881, + "step": 378 + }, + { + "epoch": 0.02072085617046868, + "grad_norm": 165.09934997558594, + "learning_rate": 6.903460837887068e-06, + "loss": 1.9152, + "step": 379 + }, + { + "epoch": 0.02077552861419023, + "grad_norm": 233.04808044433594, + "learning_rate": 6.921675774134791e-06, + "loss": 2.1593, + "step": 380 + }, + { + "epoch": 0.020830201057911786, + "grad_norm": 140.05987548828125, + "learning_rate": 6.939890710382514e-06, + "loss": 2.1326, + "step": 381 + }, + { + "epoch": 0.02088487350163334, + "grad_norm": 81.6737060546875, + "learning_rate": 6.958105646630237e-06, + "loss": 2.0379, + "step": 382 + }, + { + "epoch": 0.020939545945354892, + "grad_norm": 144.0922088623047, + "learning_rate": 6.97632058287796e-06, + "loss": 1.9488, + "step": 383 + }, + { + "epoch": 0.020994218389076447, + "grad_norm": 197.0513458251953, + "learning_rate": 6.994535519125684e-06, + "loss": 2.1174, + "step": 384 + }, + { + "epoch": 0.021048890832797998, + "grad_norm": 139.0819091796875, + "learning_rate": 7.012750455373407e-06, + "loss": 2.2122, + "step": 385 + }, + { + "epoch": 0.021103563276519553, + "grad_norm": 2624.111083984375, + "learning_rate": 7.03096539162113e-06, + "loss": 2.2074, + "step": 386 + }, + { + "epoch": 0.021158235720241104, + "grad_norm": 3792.152099609375, + "learning_rate": 7.049180327868853e-06, + "loss": 2.132, + "step": 387 + }, + { + "epoch": 0.02121290816396266, + "grad_norm": 95.10283660888672, + "learning_rate": 7.067395264116576e-06, + "loss": 2.218, + "step": 388 + }, + { + "epoch": 0.021267580607684213, + "grad_norm": 53.56262969970703, + "learning_rate": 7.085610200364299e-06, + "loss": 2.0635, + "step": 389 + }, + { + "epoch": 0.021322253051405764, + "grad_norm": 223.06924438476562, + "learning_rate": 7.103825136612022e-06, + "loss": 1.7789, + "step": 390 + }, + { + "epoch": 0.02137692549512732, + "grad_norm": 86.0604248046875, + "learning_rate": 7.122040072859746e-06, + "loss": 2.291, + "step": 391 + }, + { + "epoch": 0.02143159793884887, + "grad_norm": 80.61493682861328, + "learning_rate": 7.140255009107469e-06, + "loss": 2.3153, + "step": 392 + }, + { + "epoch": 0.021486270382570425, + "grad_norm": 1704.1121826171875, + "learning_rate": 7.158469945355192e-06, + "loss": 1.9821, + "step": 393 + }, + { + "epoch": 0.02154094282629198, + "grad_norm": 1368.0867919921875, + "learning_rate": 7.176684881602915e-06, + "loss": 2.1049, + "step": 394 + }, + { + "epoch": 0.02159561527001353, + "grad_norm": 262.0339660644531, + "learning_rate": 7.194899817850638e-06, + "loss": 2.0476, + "step": 395 + }, + { + "epoch": 0.021650287713735086, + "grad_norm": 116.04308319091797, + "learning_rate": 7.213114754098361e-06, + "loss": 2.139, + "step": 396 + }, + { + "epoch": 0.021704960157456637, + "grad_norm": 73.04888153076172, + "learning_rate": 7.231329690346084e-06, + "loss": 2.0081, + "step": 397 + }, + { + "epoch": 0.02175963260117819, + "grad_norm": 103.05530548095703, + "learning_rate": 7.249544626593807e-06, + "loss": 2.2764, + "step": 398 + }, + { + "epoch": 0.021814305044899743, + "grad_norm": 175.06399536132812, + "learning_rate": 7.267759562841531e-06, + "loss": 2.0149, + "step": 399 + }, + { + "epoch": 0.021868977488621297, + "grad_norm": 132.07765197753906, + "learning_rate": 7.285974499089254e-06, + "loss": 1.9597, + "step": 400 + }, + { + "epoch": 0.021923649932342852, + "grad_norm": 195.09815979003906, + "learning_rate": 7.304189435336977e-06, + "loss": 1.9733, + "step": 401 + }, + { + "epoch": 0.021978322376064403, + "grad_norm": 1344.090576171875, + "learning_rate": 7.3224043715847e-06, + "loss": 2.2872, + "step": 402 + }, + { + "epoch": 0.022032994819785958, + "grad_norm": 520.0455932617188, + "learning_rate": 7.340619307832423e-06, + "loss": 2.1507, + "step": 403 + }, + { + "epoch": 0.02208766726350751, + "grad_norm": 176.04286193847656, + "learning_rate": 7.358834244080146e-06, + "loss": 2.132, + "step": 404 + }, + { + "epoch": 0.022142339707229064, + "grad_norm": 183.08872985839844, + "learning_rate": 7.3770491803278695e-06, + "loss": 2.0971, + "step": 405 + }, + { + "epoch": 0.02219701215095062, + "grad_norm": 149.10377502441406, + "learning_rate": 7.395264116575592e-06, + "loss": 2.1447, + "step": 406 + }, + { + "epoch": 0.02225168459467217, + "grad_norm": 68.07654571533203, + "learning_rate": 7.413479052823316e-06, + "loss": 2.0588, + "step": 407 + }, + { + "epoch": 0.022306357038393725, + "grad_norm": 5472.322265625, + "learning_rate": 7.4316939890710394e-06, + "loss": 2.1728, + "step": 408 + }, + { + "epoch": 0.022361029482115276, + "grad_norm": 16064.892578125, + "learning_rate": 7.449908925318762e-06, + "loss": 2.3048, + "step": 409 + }, + { + "epoch": 0.02241570192583683, + "grad_norm": 3184.17529296875, + "learning_rate": 7.468123861566485e-06, + "loss": 2.2692, + "step": 410 + }, + { + "epoch": 0.02247037436955838, + "grad_norm": 98.55543518066406, + "learning_rate": 7.4863387978142085e-06, + "loss": 2.0833, + "step": 411 + }, + { + "epoch": 0.022525046813279936, + "grad_norm": 55.537940979003906, + "learning_rate": 7.504553734061931e-06, + "loss": 2.2682, + "step": 412 + }, + { + "epoch": 0.02257971925700149, + "grad_norm": 73.56779479980469, + "learning_rate": 7.522768670309654e-06, + "loss": 2.0511, + "step": 413 + }, + { + "epoch": 0.022634391700723042, + "grad_norm": 158.1184844970703, + "learning_rate": 7.540983606557377e-06, + "loss": 2.1175, + "step": 414 + }, + { + "epoch": 0.022689064144444597, + "grad_norm": 129.1199951171875, + "learning_rate": 7.559198542805101e-06, + "loss": 2.0103, + "step": 415 + }, + { + "epoch": 0.022743736588166148, + "grad_norm": 1984.0433349609375, + "learning_rate": 7.577413479052824e-06, + "loss": 1.9272, + "step": 416 + }, + { + "epoch": 0.022798409031887703, + "grad_norm": 1760.045654296875, + "learning_rate": 7.595628415300547e-06, + "loss": 2.0084, + "step": 417 + }, + { + "epoch": 0.022853081475609258, + "grad_norm": 128.05406188964844, + "learning_rate": 7.61384335154827e-06, + "loss": 1.9854, + "step": 418 + }, + { + "epoch": 0.02290775391933081, + "grad_norm": 55.37064743041992, + "learning_rate": 7.632058287795994e-06, + "loss": 2.0041, + "step": 419 + }, + { + "epoch": 0.022962426363052364, + "grad_norm": 59.84200668334961, + "learning_rate": 7.650273224043716e-06, + "loss": 2.0842, + "step": 420 + }, + { + "epoch": 0.023017098806773915, + "grad_norm": 165.13638305664062, + "learning_rate": 7.66848816029144e-06, + "loss": 2.061, + "step": 421 + }, + { + "epoch": 0.02307177125049547, + "grad_norm": 87.5450210571289, + "learning_rate": 7.686703096539163e-06, + "loss": 2.1474, + "step": 422 + }, + { + "epoch": 0.02312644369421702, + "grad_norm": 316.06817626953125, + "learning_rate": 7.704918032786886e-06, + "loss": 1.9407, + "step": 423 + }, + { + "epoch": 0.023181116137938575, + "grad_norm": 191.01979064941406, + "learning_rate": 7.72313296903461e-06, + "loss": 2.0702, + "step": 424 + }, + { + "epoch": 0.02323578858166013, + "grad_norm": 188.01817321777344, + "learning_rate": 7.741347905282333e-06, + "loss": 2.1167, + "step": 425 + }, + { + "epoch": 0.02329046102538168, + "grad_norm": 84.10626983642578, + "learning_rate": 7.759562841530056e-06, + "loss": 1.8993, + "step": 426 + }, + { + "epoch": 0.023345133469103236, + "grad_norm": 101.52397155761719, + "learning_rate": 7.77777777777778e-06, + "loss": 2.1099, + "step": 427 + }, + { + "epoch": 0.023399805912824787, + "grad_norm": 92.57107543945312, + "learning_rate": 7.795992714025502e-06, + "loss": 1.9287, + "step": 428 + }, + { + "epoch": 0.023454478356546342, + "grad_norm": 33.86884689331055, + "learning_rate": 7.814207650273224e-06, + "loss": 2.0166, + "step": 429 + }, + { + "epoch": 0.023509150800267897, + "grad_norm": 184.0557403564453, + "learning_rate": 7.832422586520947e-06, + "loss": 2.2469, + "step": 430 + }, + { + "epoch": 0.023563823243989448, + "grad_norm": 632.0530395507812, + "learning_rate": 7.85063752276867e-06, + "loss": 2.1585, + "step": 431 + }, + { + "epoch": 0.023618495687711002, + "grad_norm": 2096.14404296875, + "learning_rate": 7.868852459016394e-06, + "loss": 2.0698, + "step": 432 + }, + { + "epoch": 0.023673168131432554, + "grad_norm": 498.0515441894531, + "learning_rate": 7.887067395264117e-06, + "loss": 2.2168, + "step": 433 + }, + { + "epoch": 0.02372784057515411, + "grad_norm": 105.0694351196289, + "learning_rate": 7.905282331511839e-06, + "loss": 2.1088, + "step": 434 + }, + { + "epoch": 0.02378251301887566, + "grad_norm": 31.979082107543945, + "learning_rate": 7.923497267759564e-06, + "loss": 2.2001, + "step": 435 + }, + { + "epoch": 0.023837185462597214, + "grad_norm": 37.623863220214844, + "learning_rate": 7.941712204007287e-06, + "loss": 1.8756, + "step": 436 + }, + { + "epoch": 0.02389185790631877, + "grad_norm": 52.540550231933594, + "learning_rate": 7.959927140255009e-06, + "loss": 1.8631, + "step": 437 + }, + { + "epoch": 0.02394653035004032, + "grad_norm": 140.04531860351562, + "learning_rate": 7.978142076502732e-06, + "loss": 2.0724, + "step": 438 + }, + { + "epoch": 0.024001202793761875, + "grad_norm": 194.0487060546875, + "learning_rate": 7.996357012750456e-06, + "loss": 2.1002, + "step": 439 + }, + { + "epoch": 0.024055875237483426, + "grad_norm": 768.0440673828125, + "learning_rate": 8.014571948998179e-06, + "loss": 1.6523, + "step": 440 + }, + { + "epoch": 0.02411054768120498, + "grad_norm": 780.0394897460938, + "learning_rate": 8.032786885245902e-06, + "loss": 1.9694, + "step": 441 + }, + { + "epoch": 0.024165220124926536, + "grad_norm": 202.04151916503906, + "learning_rate": 8.051001821493626e-06, + "loss": 2.0817, + "step": 442 + }, + { + "epoch": 0.024219892568648087, + "grad_norm": 135.05947875976562, + "learning_rate": 8.069216757741349e-06, + "loss": 2.1564, + "step": 443 + }, + { + "epoch": 0.02427456501236964, + "grad_norm": 166.02694702148438, + "learning_rate": 8.087431693989072e-06, + "loss": 2.086, + "step": 444 + }, + { + "epoch": 0.024329237456091193, + "grad_norm": 200.0518341064453, + "learning_rate": 8.105646630236796e-06, + "loss": 1.8671, + "step": 445 + }, + { + "epoch": 0.024383909899812747, + "grad_norm": 157.04017639160156, + "learning_rate": 8.123861566484517e-06, + "loss": 2.1758, + "step": 446 + }, + { + "epoch": 0.0244385823435343, + "grad_norm": 200.06370544433594, + "learning_rate": 8.14207650273224e-06, + "loss": 1.9497, + "step": 447 + }, + { + "epoch": 0.024493254787255853, + "grad_norm": 524.0610961914062, + "learning_rate": 8.160291438979966e-06, + "loss": 1.9198, + "step": 448 + }, + { + "epoch": 0.024547927230977408, + "grad_norm": 708.0828247070312, + "learning_rate": 8.178506375227687e-06, + "loss": 2.0053, + "step": 449 + }, + { + "epoch": 0.02460259967469896, + "grad_norm": 130.0304718017578, + "learning_rate": 8.19672131147541e-06, + "loss": 2.0104, + "step": 450 + }, + { + "epoch": 0.024657272118420514, + "grad_norm": 47.07136917114258, + "learning_rate": 8.214936247723134e-06, + "loss": 2.0507, + "step": 451 + }, + { + "epoch": 0.024711944562142065, + "grad_norm": 60.055110931396484, + "learning_rate": 8.233151183970857e-06, + "loss": 1.8861, + "step": 452 + }, + { + "epoch": 0.02476661700586362, + "grad_norm": 48.10615921020508, + "learning_rate": 8.25136612021858e-06, + "loss": 2.1485, + "step": 453 + }, + { + "epoch": 0.024821289449585174, + "grad_norm": 189.08460998535156, + "learning_rate": 8.269581056466302e-06, + "loss": 2.0385, + "step": 454 + }, + { + "epoch": 0.024875961893306726, + "grad_norm": 5888.46923828125, + "learning_rate": 8.287795992714025e-06, + "loss": 1.9439, + "step": 455 + }, + { + "epoch": 0.02493063433702828, + "grad_norm": 19841.453125, + "learning_rate": 8.30601092896175e-06, + "loss": 2.3199, + "step": 456 + }, + { + "epoch": 0.02498530678074983, + "grad_norm": 7104.5224609375, + "learning_rate": 8.324225865209472e-06, + "loss": 2.0143, + "step": 457 + }, + { + "epoch": 0.025039979224471386, + "grad_norm": 600.05859375, + "learning_rate": 8.342440801457195e-06, + "loss": 2.0787, + "step": 458 + }, + { + "epoch": 0.025094651668192938, + "grad_norm": 145.0356903076172, + "learning_rate": 8.360655737704919e-06, + "loss": 1.731, + "step": 459 + }, + { + "epoch": 0.025149324111914492, + "grad_norm": 83.54547882080078, + "learning_rate": 8.378870673952642e-06, + "loss": 2.044, + "step": 460 + }, + { + "epoch": 0.025203996555636047, + "grad_norm": 60.57720947265625, + "learning_rate": 8.397085610200365e-06, + "loss": 2.0443, + "step": 461 + }, + { + "epoch": 0.025258668999357598, + "grad_norm": 62.81083679199219, + "learning_rate": 8.415300546448089e-06, + "loss": 2.1162, + "step": 462 + }, + { + "epoch": 0.025313341443079153, + "grad_norm": 1200.0567626953125, + "learning_rate": 8.43351548269581e-06, + "loss": 2.1174, + "step": 463 + }, + { + "epoch": 0.025368013886800704, + "grad_norm": 1008.0558471679688, + "learning_rate": 8.451730418943535e-06, + "loss": 1.8258, + "step": 464 + }, + { + "epoch": 0.02542268633052226, + "grad_norm": 296.0312805175781, + "learning_rate": 8.469945355191259e-06, + "loss": 2.0367, + "step": 465 + }, + { + "epoch": 0.025477358774243813, + "grad_norm": 118.54348754882812, + "learning_rate": 8.48816029143898e-06, + "loss": 1.8833, + "step": 466 + }, + { + "epoch": 0.025532031217965365, + "grad_norm": 24.93951416015625, + "learning_rate": 8.506375227686704e-06, + "loss": 1.9877, + "step": 467 + }, + { + "epoch": 0.02558670366168692, + "grad_norm": 34.08345031738281, + "learning_rate": 8.524590163934427e-06, + "loss": 1.6417, + "step": 468 + }, + { + "epoch": 0.02564137610540847, + "grad_norm": 37.821075439453125, + "learning_rate": 8.54280510018215e-06, + "loss": 1.7469, + "step": 469 + }, + { + "epoch": 0.025696048549130025, + "grad_norm": 50.85566329956055, + "learning_rate": 8.561020036429874e-06, + "loss": 2.1521, + "step": 470 + }, + { + "epoch": 0.025750720992851577, + "grad_norm": 232.0168914794922, + "learning_rate": 8.579234972677595e-06, + "loss": 1.958, + "step": 471 + }, + { + "epoch": 0.02580539343657313, + "grad_norm": 2288.042236328125, + "learning_rate": 8.59744990892532e-06, + "loss": 2.1283, + "step": 472 + }, + { + "epoch": 0.025860065880294686, + "grad_norm": 358.01995849609375, + "learning_rate": 8.615664845173044e-06, + "loss": 1.7652, + "step": 473 + }, + { + "epoch": 0.025914738324016237, + "grad_norm": 83.56590270996094, + "learning_rate": 8.633879781420765e-06, + "loss": 1.918, + "step": 474 + }, + { + "epoch": 0.025969410767737792, + "grad_norm": 62.33113479614258, + "learning_rate": 8.652094717668488e-06, + "loss": 1.8928, + "step": 475 + }, + { + "epoch": 0.026024083211459343, + "grad_norm": 138.0233917236328, + "learning_rate": 8.670309653916212e-06, + "loss": 1.8303, + "step": 476 + }, + { + "epoch": 0.026078755655180898, + "grad_norm": 209.02651977539062, + "learning_rate": 8.688524590163935e-06, + "loss": 2.2035, + "step": 477 + }, + { + "epoch": 0.026133428098902452, + "grad_norm": 135.0298614501953, + "learning_rate": 8.706739526411658e-06, + "loss": 1.7703, + "step": 478 + }, + { + "epoch": 0.026188100542624004, + "grad_norm": 50.857093811035156, + "learning_rate": 8.724954462659382e-06, + "loss": 2.0026, + "step": 479 + }, + { + "epoch": 0.02624277298634556, + "grad_norm": 72.04938507080078, + "learning_rate": 8.743169398907103e-06, + "loss": 1.8144, + "step": 480 + }, + { + "epoch": 0.02629744543006711, + "grad_norm": 68.07408142089844, + "learning_rate": 8.761384335154828e-06, + "loss": 1.8212, + "step": 481 + }, + { + "epoch": 0.026352117873788664, + "grad_norm": 320.0442199707031, + "learning_rate": 8.779599271402552e-06, + "loss": 1.9238, + "step": 482 + }, + { + "epoch": 0.026406790317510215, + "grad_norm": 364.0622253417969, + "learning_rate": 8.797814207650273e-06, + "loss": 1.8918, + "step": 483 + }, + { + "epoch": 0.02646146276123177, + "grad_norm": 206.04893493652344, + "learning_rate": 8.816029143897997e-06, + "loss": 2.3211, + "step": 484 + }, + { + "epoch": 0.026516135204953325, + "grad_norm": 34.340911865234375, + "learning_rate": 8.83424408014572e-06, + "loss": 2.1976, + "step": 485 + }, + { + "epoch": 0.026570807648674876, + "grad_norm": 47.81856918334961, + "learning_rate": 8.852459016393443e-06, + "loss": 2.0064, + "step": 486 + }, + { + "epoch": 0.02662548009239643, + "grad_norm": 50.80766296386719, + "learning_rate": 8.870673952641167e-06, + "loss": 1.9891, + "step": 487 + }, + { + "epoch": 0.026680152536117982, + "grad_norm": 32.57099151611328, + "learning_rate": 8.888888888888888e-06, + "loss": 1.9218, + "step": 488 + }, + { + "epoch": 0.026734824979839537, + "grad_norm": 266.02239990234375, + "learning_rate": 8.907103825136613e-06, + "loss": 2.1281, + "step": 489 + }, + { + "epoch": 0.026789497423561088, + "grad_norm": 108.06240844726562, + "learning_rate": 8.925318761384337e-06, + "loss": 1.7934, + "step": 490 + }, + { + "epoch": 0.026844169867282643, + "grad_norm": 1120.06005859375, + "learning_rate": 8.943533697632058e-06, + "loss": 2.0106, + "step": 491 + }, + { + "epoch": 0.026898842311004197, + "grad_norm": 736.0390625, + "learning_rate": 8.961748633879782e-06, + "loss": 1.9261, + "step": 492 + }, + { + "epoch": 0.02695351475472575, + "grad_norm": 88.0309829711914, + "learning_rate": 8.979963570127505e-06, + "loss": 1.7376, + "step": 493 + }, + { + "epoch": 0.027008187198447303, + "grad_norm": 53.81012725830078, + "learning_rate": 8.998178506375228e-06, + "loss": 1.9115, + "step": 494 + }, + { + "epoch": 0.027062859642168854, + "grad_norm": 74.66096496582031, + "learning_rate": 9.016393442622952e-06, + "loss": 2.0631, + "step": 495 + }, + { + "epoch": 0.02711753208589041, + "grad_norm": 38.302921295166016, + "learning_rate": 9.034608378870675e-06, + "loss": 1.6359, + "step": 496 + }, + { + "epoch": 0.027172204529611964, + "grad_norm": 189.01902770996094, + "learning_rate": 9.052823315118398e-06, + "loss": 2.0035, + "step": 497 + }, + { + "epoch": 0.027226876973333515, + "grad_norm": 568.049072265625, + "learning_rate": 9.071038251366122e-06, + "loss": 2.1533, + "step": 498 + }, + { + "epoch": 0.02728154941705507, + "grad_norm": 458.05010986328125, + "learning_rate": 9.089253187613845e-06, + "loss": 2.0849, + "step": 499 + }, + { + "epoch": 0.02733622186077662, + "grad_norm": 502.0683288574219, + "learning_rate": 9.107468123861566e-06, + "loss": 1.8719, + "step": 500 + }, + { + "epoch": 0.027390894304498176, + "grad_norm": 156.02952575683594, + "learning_rate": 9.12568306010929e-06, + "loss": 1.9501, + "step": 501 + }, + { + "epoch": 0.027445566748219727, + "grad_norm": 105.03060150146484, + "learning_rate": 9.143897996357015e-06, + "loss": 2.2536, + "step": 502 + }, + { + "epoch": 0.02750023919194128, + "grad_norm": 36.80440139770508, + "learning_rate": 9.162112932604736e-06, + "loss": 2.1027, + "step": 503 + }, + { + "epoch": 0.027554911635662836, + "grad_norm": 101.0273666381836, + "learning_rate": 9.18032786885246e-06, + "loss": 1.9266, + "step": 504 + }, + { + "epoch": 0.027609584079384387, + "grad_norm": 37.55940246582031, + "learning_rate": 9.198542805100183e-06, + "loss": 2.017, + "step": 505 + }, + { + "epoch": 0.027664256523105942, + "grad_norm": 56.035186767578125, + "learning_rate": 9.216757741347906e-06, + "loss": 2.0428, + "step": 506 + }, + { + "epoch": 0.027718928966827493, + "grad_norm": 23.21670913696289, + "learning_rate": 9.23497267759563e-06, + "loss": 2.205, + "step": 507 + }, + { + "epoch": 0.027773601410549048, + "grad_norm": 66.58242797851562, + "learning_rate": 9.253187613843351e-06, + "loss": 2.0866, + "step": 508 + }, + { + "epoch": 0.027828273854270603, + "grad_norm": 418.0439147949219, + "learning_rate": 9.271402550091075e-06, + "loss": 1.8809, + "step": 509 + }, + { + "epoch": 0.027882946297992154, + "grad_norm": 176.02565002441406, + "learning_rate": 9.2896174863388e-06, + "loss": 1.8985, + "step": 510 + }, + { + "epoch": 0.02793761874171371, + "grad_norm": 84.0439453125, + "learning_rate": 9.307832422586521e-06, + "loss": 1.7829, + "step": 511 + }, + { + "epoch": 0.02799229118543526, + "grad_norm": 55.04361343383789, + "learning_rate": 9.326047358834245e-06, + "loss": 1.944, + "step": 512 + }, + { + "epoch": 0.028046963629156815, + "grad_norm": 32.8006477355957, + "learning_rate": 9.344262295081968e-06, + "loss": 1.9488, + "step": 513 + }, + { + "epoch": 0.028101636072878366, + "grad_norm": 51.80752182006836, + "learning_rate": 9.362477231329691e-06, + "loss": 1.8195, + "step": 514 + }, + { + "epoch": 0.02815630851659992, + "grad_norm": 117.03728485107422, + "learning_rate": 9.380692167577415e-06, + "loss": 2.1746, + "step": 515 + }, + { + "epoch": 0.028210980960321475, + "grad_norm": 38.82332229614258, + "learning_rate": 9.398907103825138e-06, + "loss": 1.6879, + "step": 516 + }, + { + "epoch": 0.028265653404043026, + "grad_norm": 55.288578033447266, + "learning_rate": 9.41712204007286e-06, + "loss": 1.918, + "step": 517 + }, + { + "epoch": 0.02832032584776458, + "grad_norm": 95.53724670410156, + "learning_rate": 9.435336976320585e-06, + "loss": 1.8159, + "step": 518 + }, + { + "epoch": 0.028374998291486132, + "grad_norm": 66.55142974853516, + "learning_rate": 9.453551912568308e-06, + "loss": 1.886, + "step": 519 + }, + { + "epoch": 0.028429670735207687, + "grad_norm": 23.587080001831055, + "learning_rate": 9.47176684881603e-06, + "loss": 1.9319, + "step": 520 + }, + { + "epoch": 0.02848434317892924, + "grad_norm": 70.53990936279297, + "learning_rate": 9.489981785063753e-06, + "loss": 1.9464, + "step": 521 + }, + { + "epoch": 0.028539015622650793, + "grad_norm": 142.0288543701172, + "learning_rate": 9.508196721311476e-06, + "loss": 1.7395, + "step": 522 + }, + { + "epoch": 0.028593688066372348, + "grad_norm": 398.0528869628906, + "learning_rate": 9.5264116575592e-06, + "loss": 1.8973, + "step": 523 + }, + { + "epoch": 0.0286483605100939, + "grad_norm": 98.02732849121094, + "learning_rate": 9.544626593806923e-06, + "loss": 1.801, + "step": 524 + }, + { + "epoch": 0.028703032953815454, + "grad_norm": 146.0543670654297, + "learning_rate": 9.562841530054644e-06, + "loss": 1.8577, + "step": 525 + }, + { + "epoch": 0.028757705397537005, + "grad_norm": 93.03865814208984, + "learning_rate": 9.581056466302368e-06, + "loss": 1.8477, + "step": 526 + }, + { + "epoch": 0.02881237784125856, + "grad_norm": 27.21319580078125, + "learning_rate": 9.599271402550093e-06, + "loss": 1.8168, + "step": 527 + }, + { + "epoch": 0.028867050284980114, + "grad_norm": 54.56467819213867, + "learning_rate": 9.617486338797814e-06, + "loss": 1.6896, + "step": 528 + }, + { + "epoch": 0.028921722728701665, + "grad_norm": 84.05551147460938, + "learning_rate": 9.635701275045538e-06, + "loss": 1.6824, + "step": 529 + }, + { + "epoch": 0.02897639517242322, + "grad_norm": 1184.1884765625, + "learning_rate": 9.653916211293261e-06, + "loss": 1.9277, + "step": 530 + }, + { + "epoch": 0.02903106761614477, + "grad_norm": 1448.201171875, + "learning_rate": 9.672131147540984e-06, + "loss": 1.7857, + "step": 531 + }, + { + "epoch": 0.029085740059866326, + "grad_norm": 504.09197998046875, + "learning_rate": 9.690346083788708e-06, + "loss": 1.8741, + "step": 532 + }, + { + "epoch": 0.02914041250358788, + "grad_norm": 33.55057907104492, + "learning_rate": 9.708561020036431e-06, + "loss": 1.7587, + "step": 533 + }, + { + "epoch": 0.029195084947309432, + "grad_norm": 44.283668518066406, + "learning_rate": 9.726775956284153e-06, + "loss": 1.9074, + "step": 534 + }, + { + "epoch": 0.029249757391030987, + "grad_norm": 27.58649253845215, + "learning_rate": 9.744990892531878e-06, + "loss": 1.95, + "step": 535 + }, + { + "epoch": 0.029304429834752538, + "grad_norm": 37.12625503540039, + "learning_rate": 9.763205828779601e-06, + "loss": 1.9932, + "step": 536 + }, + { + "epoch": 0.029359102278474093, + "grad_norm": 35.37157440185547, + "learning_rate": 9.781420765027323e-06, + "loss": 1.8175, + "step": 537 + }, + { + "epoch": 0.029413774722195644, + "grad_norm": 70.04698944091797, + "learning_rate": 9.799635701275046e-06, + "loss": 1.9425, + "step": 538 + }, + { + "epoch": 0.0294684471659172, + "grad_norm": 35.085933685302734, + "learning_rate": 9.81785063752277e-06, + "loss": 1.6702, + "step": 539 + }, + { + "epoch": 0.029523119609638753, + "grad_norm": 1184.0902099609375, + "learning_rate": 9.836065573770493e-06, + "loss": 1.6389, + "step": 540 + }, + { + "epoch": 0.029577792053360304, + "grad_norm": 1608.0947265625, + "learning_rate": 9.854280510018216e-06, + "loss": 1.8493, + "step": 541 + }, + { + "epoch": 0.02963246449708186, + "grad_norm": 326.02728271484375, + "learning_rate": 9.872495446265938e-06, + "loss": 1.8435, + "step": 542 + }, + { + "epoch": 0.02968713694080341, + "grad_norm": 39.07868576049805, + "learning_rate": 9.890710382513663e-06, + "loss": 1.984, + "step": 543 + }, + { + "epoch": 0.029741809384524965, + "grad_norm": 33.31973648071289, + "learning_rate": 9.908925318761386e-06, + "loss": 1.843, + "step": 544 + }, + { + "epoch": 0.02979648182824652, + "grad_norm": 22.32172966003418, + "learning_rate": 9.927140255009108e-06, + "loss": 1.8457, + "step": 545 + }, + { + "epoch": 0.02985115427196807, + "grad_norm": 16.757490158081055, + "learning_rate": 9.945355191256831e-06, + "loss": 1.9519, + "step": 546 + }, + { + "epoch": 0.029905826715689626, + "grad_norm": 24.877002716064453, + "learning_rate": 9.963570127504554e-06, + "loss": 1.9471, + "step": 547 + }, + { + "epoch": 0.029960499159411177, + "grad_norm": 16.99418067932129, + "learning_rate": 9.981785063752277e-06, + "loss": 1.6664, + "step": 548 + }, + { + "epoch": 0.03001517160313273, + "grad_norm": 32.54814529418945, + "learning_rate": 1e-05, + "loss": 1.8319, + "step": 549 + }, + { + "epoch": 0.030069844046854283, + "grad_norm": 93.52127075195312, + "learning_rate": 1.0018214936247722e-05, + "loss": 1.7347, + "step": 550 + }, + { + "epoch": 0.030124516490575837, + "grad_norm": 2112.07421875, + "learning_rate": 1.0036429872495447e-05, + "loss": 1.9635, + "step": 551 + }, + { + "epoch": 0.030179188934297392, + "grad_norm": 1248.0489501953125, + "learning_rate": 1.005464480874317e-05, + "loss": 1.7856, + "step": 552 + }, + { + "epoch": 0.030233861378018943, + "grad_norm": 50.78277587890625, + "learning_rate": 1.0072859744990892e-05, + "loss": 1.7724, + "step": 553 + }, + { + "epoch": 0.030288533821740498, + "grad_norm": 21.091209411621094, + "learning_rate": 1.0091074681238617e-05, + "loss": 1.5408, + "step": 554 + }, + { + "epoch": 0.03034320626546205, + "grad_norm": 44.30475997924805, + "learning_rate": 1.0109289617486339e-05, + "loss": 1.7884, + "step": 555 + }, + { + "epoch": 0.030397878709183604, + "grad_norm": 16.7192440032959, + "learning_rate": 1.0127504553734062e-05, + "loss": 1.7107, + "step": 556 + }, + { + "epoch": 0.03045255115290516, + "grad_norm": 12.495885848999023, + "learning_rate": 1.0145719489981787e-05, + "loss": 1.8349, + "step": 557 + }, + { + "epoch": 0.03050722359662671, + "grad_norm": 12.55789566040039, + "learning_rate": 1.0163934426229509e-05, + "loss": 1.8916, + "step": 558 + }, + { + "epoch": 0.030561896040348265, + "grad_norm": 7.338320255279541, + "learning_rate": 1.0182149362477232e-05, + "loss": 1.8867, + "step": 559 + }, + { + "epoch": 0.030616568484069816, + "grad_norm": 12.3360013961792, + "learning_rate": 1.0200364298724956e-05, + "loss": 1.9359, + "step": 560 + }, + { + "epoch": 0.03067124092779137, + "grad_norm": 7.530646324157715, + "learning_rate": 1.0218579234972679e-05, + "loss": 1.951, + "step": 561 + }, + { + "epoch": 0.03072591337151292, + "grad_norm": 17.204212188720703, + "learning_rate": 1.02367941712204e-05, + "loss": 1.9486, + "step": 562 + }, + { + "epoch": 0.030780585815234476, + "grad_norm": 19.721540451049805, + "learning_rate": 1.0255009107468126e-05, + "loss": 1.8807, + "step": 563 + }, + { + "epoch": 0.03083525825895603, + "grad_norm": 88.0316162109375, + "learning_rate": 1.0273224043715849e-05, + "loss": 1.8694, + "step": 564 + }, + { + "epoch": 0.030889930702677582, + "grad_norm": 183.02114868164062, + "learning_rate": 1.029143897996357e-05, + "loss": 1.8786, + "step": 565 + }, + { + "epoch": 0.030944603146399137, + "grad_norm": 164.01731872558594, + "learning_rate": 1.0309653916211296e-05, + "loss": 1.6899, + "step": 566 + }, + { + "epoch": 0.030999275590120688, + "grad_norm": 49.53018569946289, + "learning_rate": 1.0327868852459017e-05, + "loss": 1.7949, + "step": 567 + }, + { + "epoch": 0.031053948033842243, + "grad_norm": 12.931891441345215, + "learning_rate": 1.034608378870674e-05, + "loss": 1.5712, + "step": 568 + }, + { + "epoch": 0.031108620477563798, + "grad_norm": 8.202744483947754, + "learning_rate": 1.0364298724954462e-05, + "loss": 1.7965, + "step": 569 + }, + { + "epoch": 0.03116329292128535, + "grad_norm": 13.16232967376709, + "learning_rate": 1.0382513661202187e-05, + "loss": 1.7085, + "step": 570 + }, + { + "epoch": 0.031217965365006903, + "grad_norm": 29.418018341064453, + "learning_rate": 1.0400728597449909e-05, + "loss": 1.8511, + "step": 571 + }, + { + "epoch": 0.03127263780872846, + "grad_norm": 13.46008586883545, + "learning_rate": 1.0418943533697632e-05, + "loss": 1.8956, + "step": 572 + }, + { + "epoch": 0.03132731025245001, + "grad_norm": 11.677648544311523, + "learning_rate": 1.0437158469945357e-05, + "loss": 1.83, + "step": 573 + }, + { + "epoch": 0.03138198269617156, + "grad_norm": 13.705500602722168, + "learning_rate": 1.0455373406193079e-05, + "loss": 2.1726, + "step": 574 + }, + { + "epoch": 0.03143665513989312, + "grad_norm": 21.028234481811523, + "learning_rate": 1.0473588342440802e-05, + "loss": 1.7031, + "step": 575 + }, + { + "epoch": 0.03149132758361467, + "grad_norm": 132.01516723632812, + "learning_rate": 1.0491803278688525e-05, + "loss": 2.0628, + "step": 576 + }, + { + "epoch": 0.03154600002733622, + "grad_norm": 524.0234375, + "learning_rate": 1.0510018214936249e-05, + "loss": 2.0083, + "step": 577 + }, + { + "epoch": 0.03160067247105777, + "grad_norm": 288.0181579589844, + "learning_rate": 1.052823315118397e-05, + "loss": 1.8064, + "step": 578 + }, + { + "epoch": 0.03165534491477933, + "grad_norm": 69.53440856933594, + "learning_rate": 1.0546448087431695e-05, + "loss": 1.6945, + "step": 579 + }, + { + "epoch": 0.03171001735850088, + "grad_norm": 20.822866439819336, + "learning_rate": 1.0564663023679417e-05, + "loss": 1.946, + "step": 580 + }, + { + "epoch": 0.03176468980222243, + "grad_norm": 22.083980560302734, + "learning_rate": 1.058287795992714e-05, + "loss": 1.8491, + "step": 581 + }, + { + "epoch": 0.03181936224594399, + "grad_norm": 15.215189933776855, + "learning_rate": 1.0601092896174865e-05, + "loss": 1.7133, + "step": 582 + }, + { + "epoch": 0.03187403468966554, + "grad_norm": 18.729602813720703, + "learning_rate": 1.0619307832422587e-05, + "loss": 1.4803, + "step": 583 + }, + { + "epoch": 0.031928707133387094, + "grad_norm": 17.22536849975586, + "learning_rate": 1.063752276867031e-05, + "loss": 1.7733, + "step": 584 + }, + { + "epoch": 0.031983379577108645, + "grad_norm": 10.9695405960083, + "learning_rate": 1.0655737704918034e-05, + "loss": 1.7044, + "step": 585 + }, + { + "epoch": 0.0320380520208302, + "grad_norm": 39.03275680541992, + "learning_rate": 1.0673952641165757e-05, + "loss": 1.9761, + "step": 586 + }, + { + "epoch": 0.032092724464551754, + "grad_norm": 92.56143188476562, + "learning_rate": 1.0692167577413479e-05, + "loss": 1.8717, + "step": 587 + }, + { + "epoch": 0.032147396908273305, + "grad_norm": 46.027278900146484, + "learning_rate": 1.0710382513661204e-05, + "loss": 2.0165, + "step": 588 + }, + { + "epoch": 0.032202069351994864, + "grad_norm": 36.79457092285156, + "learning_rate": 1.0728597449908927e-05, + "loss": 1.3755, + "step": 589 + }, + { + "epoch": 0.032256741795716415, + "grad_norm": 17.218006134033203, + "learning_rate": 1.0746812386156649e-05, + "loss": 1.7613, + "step": 590 + }, + { + "epoch": 0.032311414239437966, + "grad_norm": 23.17936134338379, + "learning_rate": 1.0765027322404374e-05, + "loss": 1.7546, + "step": 591 + }, + { + "epoch": 0.03236608668315952, + "grad_norm": 24.034536361694336, + "learning_rate": 1.0783242258652095e-05, + "loss": 1.8651, + "step": 592 + }, + { + "epoch": 0.032420759126881075, + "grad_norm": 27.667858123779297, + "learning_rate": 1.0801457194899819e-05, + "loss": 1.8418, + "step": 593 + }, + { + "epoch": 0.03247543157060263, + "grad_norm": 19.952865600585938, + "learning_rate": 1.0819672131147544e-05, + "loss": 2.0352, + "step": 594 + }, + { + "epoch": 0.03253010401432418, + "grad_norm": 20.993377685546875, + "learning_rate": 1.0837887067395265e-05, + "loss": 1.9965, + "step": 595 + }, + { + "epoch": 0.032584776458045736, + "grad_norm": 7.068521499633789, + "learning_rate": 1.0856102003642987e-05, + "loss": 1.6131, + "step": 596 + }, + { + "epoch": 0.03263944890176729, + "grad_norm": 8.31657886505127, + "learning_rate": 1.0874316939890712e-05, + "loss": 1.7162, + "step": 597 + }, + { + "epoch": 0.03269412134548884, + "grad_norm": 7.927767753601074, + "learning_rate": 1.0892531876138435e-05, + "loss": 1.79, + "step": 598 + }, + { + "epoch": 0.0327487937892104, + "grad_norm": 13.863680839538574, + "learning_rate": 1.0910746812386157e-05, + "loss": 1.5875, + "step": 599 + }, + { + "epoch": 0.03280346623293195, + "grad_norm": 676.1051635742188, + "learning_rate": 1.0928961748633882e-05, + "loss": 1.8077, + "step": 600 + }, + { + "epoch": 0.0328581386766535, + "grad_norm": 5248.7783203125, + "learning_rate": 1.0947176684881603e-05, + "loss": 1.8635, + "step": 601 + }, + { + "epoch": 0.03291281112037505, + "grad_norm": 3664.40478515625, + "learning_rate": 1.0965391621129327e-05, + "loss": 1.8743, + "step": 602 + }, + { + "epoch": 0.03296748356409661, + "grad_norm": 274.0265808105469, + "learning_rate": 1.0983606557377052e-05, + "loss": 1.9368, + "step": 603 + }, + { + "epoch": 0.03302215600781816, + "grad_norm": 34.29569625854492, + "learning_rate": 1.1001821493624773e-05, + "loss": 1.7543, + "step": 604 + }, + { + "epoch": 0.03307682845153971, + "grad_norm": 19.485361099243164, + "learning_rate": 1.1020036429872497e-05, + "loss": 1.7341, + "step": 605 + }, + { + "epoch": 0.03313150089526127, + "grad_norm": 13.396047592163086, + "learning_rate": 1.1038251366120218e-05, + "loss": 1.6473, + "step": 606 + }, + { + "epoch": 0.03318617333898282, + "grad_norm": 7.0207390785217285, + "learning_rate": 1.1056466302367943e-05, + "loss": 1.6209, + "step": 607 + }, + { + "epoch": 0.03324084578270437, + "grad_norm": 8.798051834106445, + "learning_rate": 1.1074681238615665e-05, + "loss": 1.8282, + "step": 608 + }, + { + "epoch": 0.03329551822642592, + "grad_norm": 18.066835403442383, + "learning_rate": 1.1092896174863388e-05, + "loss": 1.8254, + "step": 609 + }, + { + "epoch": 0.03335019067014748, + "grad_norm": 13.874225616455078, + "learning_rate": 1.1111111111111113e-05, + "loss": 1.7056, + "step": 610 + }, + { + "epoch": 0.03340486311386903, + "grad_norm": 117.53350830078125, + "learning_rate": 1.1129326047358835e-05, + "loss": 1.8687, + "step": 611 + }, + { + "epoch": 0.03345953555759058, + "grad_norm": 90.54015350341797, + "learning_rate": 1.1147540983606557e-05, + "loss": 1.7232, + "step": 612 + }, + { + "epoch": 0.03351420800131214, + "grad_norm": 70.52759552001953, + "learning_rate": 1.1165755919854282e-05, + "loss": 1.8801, + "step": 613 + }, + { + "epoch": 0.03356888044503369, + "grad_norm": 11.042872428894043, + "learning_rate": 1.1183970856102005e-05, + "loss": 1.7644, + "step": 614 + }, + { + "epoch": 0.033623552888755244, + "grad_norm": 13.786432266235352, + "learning_rate": 1.1202185792349727e-05, + "loss": 1.8156, + "step": 615 + }, + { + "epoch": 0.033678225332476795, + "grad_norm": 14.614713668823242, + "learning_rate": 1.1220400728597452e-05, + "loss": 1.8262, + "step": 616 + }, + { + "epoch": 0.03373289777619835, + "grad_norm": 11.149925231933594, + "learning_rate": 1.1238615664845173e-05, + "loss": 1.573, + "step": 617 + }, + { + "epoch": 0.033787570219919905, + "grad_norm": 16.103248596191406, + "learning_rate": 1.1256830601092897e-05, + "loss": 1.7203, + "step": 618 + }, + { + "epoch": 0.033842242663641456, + "grad_norm": 106.52347564697266, + "learning_rate": 1.1275045537340622e-05, + "loss": 1.6911, + "step": 619 + }, + { + "epoch": 0.033896915107363014, + "grad_norm": 24.681682586669922, + "learning_rate": 1.1293260473588343e-05, + "loss": 1.5224, + "step": 620 + }, + { + "epoch": 0.033951587551084565, + "grad_norm": 9.65957260131836, + "learning_rate": 1.1311475409836066e-05, + "loss": 1.9153, + "step": 621 + }, + { + "epoch": 0.034006259994806116, + "grad_norm": 13.652491569519043, + "learning_rate": 1.132969034608379e-05, + "loss": 1.5952, + "step": 622 + }, + { + "epoch": 0.03406093243852767, + "grad_norm": 10.877132415771484, + "learning_rate": 1.1347905282331513e-05, + "loss": 1.2974, + "step": 623 + }, + { + "epoch": 0.034115604882249226, + "grad_norm": 11.581110000610352, + "learning_rate": 1.1366120218579235e-05, + "loss": 1.7488, + "step": 624 + }, + { + "epoch": 0.03417027732597078, + "grad_norm": 10.634995460510254, + "learning_rate": 1.138433515482696e-05, + "loss": 1.6954, + "step": 625 + }, + { + "epoch": 0.03422494976969233, + "grad_norm": 13.65375804901123, + "learning_rate": 1.1402550091074681e-05, + "loss": 1.9428, + "step": 626 + }, + { + "epoch": 0.034279622213413886, + "grad_norm": 11.914112091064453, + "learning_rate": 1.1420765027322405e-05, + "loss": 1.6492, + "step": 627 + }, + { + "epoch": 0.03433429465713544, + "grad_norm": 10.57358169555664, + "learning_rate": 1.143897996357013e-05, + "loss": 1.5165, + "step": 628 + }, + { + "epoch": 0.03438896710085699, + "grad_norm": 6.70465087890625, + "learning_rate": 1.1457194899817851e-05, + "loss": 1.6004, + "step": 629 + }, + { + "epoch": 0.03444363954457855, + "grad_norm": 9.917174339294434, + "learning_rate": 1.1475409836065575e-05, + "loss": 1.8597, + "step": 630 + }, + { + "epoch": 0.0344983119883001, + "grad_norm": 39.04571533203125, + "learning_rate": 1.1493624772313298e-05, + "loss": 1.6368, + "step": 631 + }, + { + "epoch": 0.03455298443202165, + "grad_norm": 14.358606338500977, + "learning_rate": 1.1511839708561021e-05, + "loss": 1.554, + "step": 632 + }, + { + "epoch": 0.0346076568757432, + "grad_norm": 23.13021469116211, + "learning_rate": 1.1530054644808743e-05, + "loss": 1.5256, + "step": 633 + }, + { + "epoch": 0.03466232931946476, + "grad_norm": 13.445809364318848, + "learning_rate": 1.1548269581056468e-05, + "loss": 1.5399, + "step": 634 + }, + { + "epoch": 0.03471700176318631, + "grad_norm": 6.005585193634033, + "learning_rate": 1.1566484517304191e-05, + "loss": 1.9514, + "step": 635 + }, + { + "epoch": 0.03477167420690786, + "grad_norm": 9.953651428222656, + "learning_rate": 1.1584699453551913e-05, + "loss": 1.7613, + "step": 636 + }, + { + "epoch": 0.03482634665062942, + "grad_norm": 6.642368793487549, + "learning_rate": 1.1602914389799638e-05, + "loss": 1.6906, + "step": 637 + }, + { + "epoch": 0.03488101909435097, + "grad_norm": 2.8872010707855225, + "learning_rate": 1.162112932604736e-05, + "loss": 1.5059, + "step": 638 + }, + { + "epoch": 0.03493569153807252, + "grad_norm": 8.833909034729004, + "learning_rate": 1.1639344262295083e-05, + "loss": 1.6207, + "step": 639 + }, + { + "epoch": 0.03499036398179407, + "grad_norm": 16.708539962768555, + "learning_rate": 1.1657559198542808e-05, + "loss": 1.6156, + "step": 640 + }, + { + "epoch": 0.03504503642551563, + "grad_norm": 12.755306243896484, + "learning_rate": 1.167577413479053e-05, + "loss": 1.4539, + "step": 641 + }, + { + "epoch": 0.03509970886923718, + "grad_norm": 11.48142147064209, + "learning_rate": 1.1693989071038251e-05, + "loss": 1.6822, + "step": 642 + }, + { + "epoch": 0.035154381312958734, + "grad_norm": 12.490166664123535, + "learning_rate": 1.1712204007285975e-05, + "loss": 1.8842, + "step": 643 + }, + { + "epoch": 0.03520905375668029, + "grad_norm": 25.85009002685547, + "learning_rate": 1.17304189435337e-05, + "loss": 2.1781, + "step": 644 + }, + { + "epoch": 0.03526372620040184, + "grad_norm": 11.819995880126953, + "learning_rate": 1.1748633879781421e-05, + "loss": 1.5323, + "step": 645 + }, + { + "epoch": 0.035318398644123394, + "grad_norm": 8.944014549255371, + "learning_rate": 1.1766848816029144e-05, + "loss": 1.5232, + "step": 646 + }, + { + "epoch": 0.035373071087844946, + "grad_norm": 10.891643524169922, + "learning_rate": 1.1785063752276868e-05, + "loss": 1.6801, + "step": 647 + }, + { + "epoch": 0.035427743531566504, + "grad_norm": 10.763596534729004, + "learning_rate": 1.1803278688524591e-05, + "loss": 1.5737, + "step": 648 + }, + { + "epoch": 0.035482415975288055, + "grad_norm": 8.370046615600586, + "learning_rate": 1.1821493624772313e-05, + "loss": 1.9409, + "step": 649 + }, + { + "epoch": 0.035537088419009606, + "grad_norm": 6.004466533660889, + "learning_rate": 1.1839708561020038e-05, + "loss": 1.5306, + "step": 650 + }, + { + "epoch": 0.035591760862731164, + "grad_norm": 3.5885138511657715, + "learning_rate": 1.1857923497267761e-05, + "loss": 1.4286, + "step": 651 + }, + { + "epoch": 0.035646433306452716, + "grad_norm": 3.7985479831695557, + "learning_rate": 1.1876138433515483e-05, + "loss": 1.6876, + "step": 652 + }, + { + "epoch": 0.03570110575017427, + "grad_norm": 4.900628566741943, + "learning_rate": 1.1894353369763208e-05, + "loss": 1.9043, + "step": 653 + }, + { + "epoch": 0.035755778193895825, + "grad_norm": 5.9568867683410645, + "learning_rate": 1.191256830601093e-05, + "loss": 1.6401, + "step": 654 + }, + { + "epoch": 0.035810450637617376, + "grad_norm": 5.181082725524902, + "learning_rate": 1.1930783242258653e-05, + "loss": 1.6336, + "step": 655 + }, + { + "epoch": 0.03586512308133893, + "grad_norm": 8.357316970825195, + "learning_rate": 1.1948998178506378e-05, + "loss": 1.6637, + "step": 656 + }, + { + "epoch": 0.03591979552506048, + "grad_norm": 7.888040542602539, + "learning_rate": 1.19672131147541e-05, + "loss": 1.8617, + "step": 657 + }, + { + "epoch": 0.03597446796878204, + "grad_norm": 9.822503089904785, + "learning_rate": 1.1985428051001821e-05, + "loss": 1.7256, + "step": 658 + }, + { + "epoch": 0.03602914041250359, + "grad_norm": 15.9174165725708, + "learning_rate": 1.2003642987249546e-05, + "loss": 1.5389, + "step": 659 + }, + { + "epoch": 0.03608381285622514, + "grad_norm": 18.234058380126953, + "learning_rate": 1.202185792349727e-05, + "loss": 1.5336, + "step": 660 + }, + { + "epoch": 0.0361384852999467, + "grad_norm": 16.579933166503906, + "learning_rate": 1.2040072859744991e-05, + "loss": 1.574, + "step": 661 + }, + { + "epoch": 0.03619315774366825, + "grad_norm": 25.298982620239258, + "learning_rate": 1.2058287795992716e-05, + "loss": 1.9648, + "step": 662 + }, + { + "epoch": 0.0362478301873898, + "grad_norm": 7.92656135559082, + "learning_rate": 1.2076502732240438e-05, + "loss": 1.8567, + "step": 663 + }, + { + "epoch": 0.03630250263111135, + "grad_norm": 4.291557788848877, + "learning_rate": 1.2094717668488161e-05, + "loss": 1.7103, + "step": 664 + }, + { + "epoch": 0.03635717507483291, + "grad_norm": 7.58614444732666, + "learning_rate": 1.2112932604735886e-05, + "loss": 1.3305, + "step": 665 + }, + { + "epoch": 0.03641184751855446, + "grad_norm": 10.507026672363281, + "learning_rate": 1.2131147540983608e-05, + "loss": 1.6487, + "step": 666 + }, + { + "epoch": 0.03646651996227601, + "grad_norm": 8.809333801269531, + "learning_rate": 1.2149362477231331e-05, + "loss": 1.6547, + "step": 667 + }, + { + "epoch": 0.03652119240599757, + "grad_norm": 14.086284637451172, + "learning_rate": 1.2167577413479054e-05, + "loss": 1.7063, + "step": 668 + }, + { + "epoch": 0.03657586484971912, + "grad_norm": 12.698277473449707, + "learning_rate": 1.2185792349726778e-05, + "loss": 1.5332, + "step": 669 + }, + { + "epoch": 0.03663053729344067, + "grad_norm": 6.43668270111084, + "learning_rate": 1.2204007285974499e-05, + "loss": 1.6784, + "step": 670 + }, + { + "epoch": 0.036685209737162223, + "grad_norm": 12.270201683044434, + "learning_rate": 1.2222222222222224e-05, + "loss": 1.8055, + "step": 671 + }, + { + "epoch": 0.03673988218088378, + "grad_norm": 10.761246681213379, + "learning_rate": 1.2240437158469946e-05, + "loss": 1.6939, + "step": 672 + }, + { + "epoch": 0.03679455462460533, + "grad_norm": 11.223723411560059, + "learning_rate": 1.2258652094717669e-05, + "loss": 1.6462, + "step": 673 + }, + { + "epoch": 0.036849227068326884, + "grad_norm": 4.737725257873535, + "learning_rate": 1.2276867030965394e-05, + "loss": 1.6201, + "step": 674 + }, + { + "epoch": 0.03690389951204844, + "grad_norm": 4.538906097412109, + "learning_rate": 1.2295081967213116e-05, + "loss": 1.6743, + "step": 675 + }, + { + "epoch": 0.036958571955769993, + "grad_norm": 6.664514064788818, + "learning_rate": 1.2313296903460839e-05, + "loss": 1.7346, + "step": 676 + }, + { + "epoch": 0.037013244399491545, + "grad_norm": 5.120151042938232, + "learning_rate": 1.2331511839708562e-05, + "loss": 1.7071, + "step": 677 + }, + { + "epoch": 0.0370679168432131, + "grad_norm": 4.232073783874512, + "learning_rate": 1.2349726775956286e-05, + "loss": 1.4788, + "step": 678 + }, + { + "epoch": 0.037122589286934654, + "grad_norm": 2.9285714626312256, + "learning_rate": 1.2367941712204007e-05, + "loss": 1.6437, + "step": 679 + }, + { + "epoch": 0.037177261730656205, + "grad_norm": 3.141932249069214, + "learning_rate": 1.238615664845173e-05, + "loss": 1.5734, + "step": 680 + }, + { + "epoch": 0.03723193417437776, + "grad_norm": 4.262214660644531, + "learning_rate": 1.2404371584699456e-05, + "loss": 1.627, + "step": 681 + }, + { + "epoch": 0.037286606618099315, + "grad_norm": 3.932655096054077, + "learning_rate": 1.2422586520947177e-05, + "loss": 1.5565, + "step": 682 + }, + { + "epoch": 0.037341279061820866, + "grad_norm": 11.678011894226074, + "learning_rate": 1.24408014571949e-05, + "loss": 1.5662, + "step": 683 + }, + { + "epoch": 0.03739595150554242, + "grad_norm": 9.801738739013672, + "learning_rate": 1.2459016393442624e-05, + "loss": 1.5538, + "step": 684 + }, + { + "epoch": 0.037450623949263975, + "grad_norm": 1004.0526733398438, + "learning_rate": 1.2477231329690347e-05, + "loss": 1.5911, + "step": 685 + }, + { + "epoch": 0.03750529639298553, + "grad_norm": 7584.44580078125, + "learning_rate": 1.2495446265938069e-05, + "loss": 2.0207, + "step": 686 + }, + { + "epoch": 0.03755996883670708, + "grad_norm": 6208.4208984375, + "learning_rate": 1.2513661202185794e-05, + "loss": 1.6339, + "step": 687 + }, + { + "epoch": 0.03761464128042863, + "grad_norm": 1020.0870361328125, + "learning_rate": 1.2531876138433516e-05, + "loss": 1.8644, + "step": 688 + }, + { + "epoch": 0.03766931372415019, + "grad_norm": 100.52093505859375, + "learning_rate": 1.2550091074681239e-05, + "loss": 1.8457, + "step": 689 + }, + { + "epoch": 0.03772398616787174, + "grad_norm": 26.177534103393555, + "learning_rate": 1.2568306010928964e-05, + "loss": 1.5214, + "step": 690 + }, + { + "epoch": 0.03777865861159329, + "grad_norm": 5.648628234863281, + "learning_rate": 1.2586520947176686e-05, + "loss": 1.5918, + "step": 691 + }, + { + "epoch": 0.03783333105531485, + "grad_norm": 8.46560287475586, + "learning_rate": 1.2604735883424409e-05, + "loss": 1.9479, + "step": 692 + }, + { + "epoch": 0.0378880034990364, + "grad_norm": 11.708614349365234, + "learning_rate": 1.2622950819672132e-05, + "loss": 1.6639, + "step": 693 + }, + { + "epoch": 0.03794267594275795, + "grad_norm": 6.186878681182861, + "learning_rate": 1.2641165755919856e-05, + "loss": 1.4583, + "step": 694 + }, + { + "epoch": 0.0379973483864795, + "grad_norm": 3.3482556343078613, + "learning_rate": 1.2659380692167577e-05, + "loss": 1.7504, + "step": 695 + }, + { + "epoch": 0.03805202083020106, + "grad_norm": 9.62446117401123, + "learning_rate": 1.2677595628415302e-05, + "loss": 1.6985, + "step": 696 + }, + { + "epoch": 0.03810669327392261, + "grad_norm": 5.542342662811279, + "learning_rate": 1.2695810564663025e-05, + "loss": 1.9048, + "step": 697 + }, + { + "epoch": 0.03816136571764416, + "grad_norm": 4.40621280670166, + "learning_rate": 1.2714025500910747e-05, + "loss": 1.6959, + "step": 698 + }, + { + "epoch": 0.03821603816136572, + "grad_norm": 5.957722187042236, + "learning_rate": 1.2732240437158472e-05, + "loss": 1.6857, + "step": 699 + }, + { + "epoch": 0.03827071060508727, + "grad_norm": 4.384960651397705, + "learning_rate": 1.2750455373406194e-05, + "loss": 1.6575, + "step": 700 + }, + { + "epoch": 0.03832538304880882, + "grad_norm": 3.0639445781707764, + "learning_rate": 1.2768670309653917e-05, + "loss": 1.6388, + "step": 701 + }, + { + "epoch": 0.03838005549253038, + "grad_norm": 2.1395390033721924, + "learning_rate": 1.2786885245901642e-05, + "loss": 1.7303, + "step": 702 + }, + { + "epoch": 0.03843472793625193, + "grad_norm": 5.562722206115723, + "learning_rate": 1.2805100182149364e-05, + "loss": 1.6507, + "step": 703 + }, + { + "epoch": 0.03848940037997348, + "grad_norm": 3.7774806022644043, + "learning_rate": 1.2823315118397085e-05, + "loss": 1.652, + "step": 704 + }, + { + "epoch": 0.038544072823695034, + "grad_norm": 4.078719139099121, + "learning_rate": 1.284153005464481e-05, + "loss": 1.6753, + "step": 705 + }, + { + "epoch": 0.03859874526741659, + "grad_norm": 4.444253921508789, + "learning_rate": 1.2859744990892534e-05, + "loss": 1.6236, + "step": 706 + }, + { + "epoch": 0.038653417711138144, + "grad_norm": 13.297170639038086, + "learning_rate": 1.2877959927140255e-05, + "loss": 1.4553, + "step": 707 + }, + { + "epoch": 0.038708090154859695, + "grad_norm": 7.918057918548584, + "learning_rate": 1.289617486338798e-05, + "loss": 1.6613, + "step": 708 + }, + { + "epoch": 0.03876276259858125, + "grad_norm": 5.757493019104004, + "learning_rate": 1.2914389799635702e-05, + "loss": 1.8627, + "step": 709 + }, + { + "epoch": 0.038817435042302804, + "grad_norm": 5.366604328155518, + "learning_rate": 1.2932604735883425e-05, + "loss": 1.5041, + "step": 710 + }, + { + "epoch": 0.038872107486024356, + "grad_norm": 5.436422348022461, + "learning_rate": 1.295081967213115e-05, + "loss": 1.3581, + "step": 711 + }, + { + "epoch": 0.03892677992974591, + "grad_norm": 4.333948612213135, + "learning_rate": 1.2969034608378872e-05, + "loss": 1.6401, + "step": 712 + }, + { + "epoch": 0.038981452373467465, + "grad_norm": 5.492818355560303, + "learning_rate": 1.2987249544626595e-05, + "loss": 1.5681, + "step": 713 + }, + { + "epoch": 0.039036124817189016, + "grad_norm": 4.559354305267334, + "learning_rate": 1.3005464480874317e-05, + "loss": 1.9598, + "step": 714 + }, + { + "epoch": 0.03909079726091057, + "grad_norm": 3.674311876296997, + "learning_rate": 1.3023679417122042e-05, + "loss": 1.507, + "step": 715 + }, + { + "epoch": 0.039145469704632126, + "grad_norm": 3.9374499320983887, + "learning_rate": 1.3041894353369764e-05, + "loss": 1.6584, + "step": 716 + }, + { + "epoch": 0.03920014214835368, + "grad_norm": 3.1975691318511963, + "learning_rate": 1.3060109289617487e-05, + "loss": 1.2895, + "step": 717 + }, + { + "epoch": 0.03925481459207523, + "grad_norm": 4.201927185058594, + "learning_rate": 1.307832422586521e-05, + "loss": 1.6847, + "step": 718 + }, + { + "epoch": 0.03930948703579678, + "grad_norm": 3.447638988494873, + "learning_rate": 1.3096539162112933e-05, + "loss": 1.7574, + "step": 719 + }, + { + "epoch": 0.03936415947951834, + "grad_norm": 8.837514877319336, + "learning_rate": 1.3114754098360655e-05, + "loss": 1.6856, + "step": 720 + }, + { + "epoch": 0.03941883192323989, + "grad_norm": 3.8064751625061035, + "learning_rate": 1.313296903460838e-05, + "loss": 1.5401, + "step": 721 + }, + { + "epoch": 0.03947350436696144, + "grad_norm": 4.084284782409668, + "learning_rate": 1.3151183970856103e-05, + "loss": 1.5107, + "step": 722 + }, + { + "epoch": 0.039528176810683, + "grad_norm": 6.181986331939697, + "learning_rate": 1.3169398907103825e-05, + "loss": 1.5962, + "step": 723 + }, + { + "epoch": 0.03958284925440455, + "grad_norm": 4.772217273712158, + "learning_rate": 1.318761384335155e-05, + "loss": 1.5862, + "step": 724 + }, + { + "epoch": 0.0396375216981261, + "grad_norm": 7.960171222686768, + "learning_rate": 1.3205828779599272e-05, + "loss": 1.7534, + "step": 725 + }, + { + "epoch": 0.03969219414184765, + "grad_norm": 15.094135284423828, + "learning_rate": 1.3224043715846995e-05, + "loss": 1.8199, + "step": 726 + }, + { + "epoch": 0.03974686658556921, + "grad_norm": 8.070466995239258, + "learning_rate": 1.324225865209472e-05, + "loss": 1.6409, + "step": 727 + }, + { + "epoch": 0.03980153902929076, + "grad_norm": 9.309974670410156, + "learning_rate": 1.3260473588342442e-05, + "loss": 1.2196, + "step": 728 + }, + { + "epoch": 0.03985621147301231, + "grad_norm": 6.065708160400391, + "learning_rate": 1.3278688524590165e-05, + "loss": 1.5767, + "step": 729 + }, + { + "epoch": 0.03991088391673387, + "grad_norm": 9.191472053527832, + "learning_rate": 1.3296903460837888e-05, + "loss": 1.4075, + "step": 730 + }, + { + "epoch": 0.03996555636045542, + "grad_norm": 3.3165712356567383, + "learning_rate": 1.3315118397085612e-05, + "loss": 1.7131, + "step": 731 + }, + { + "epoch": 0.04002022880417697, + "grad_norm": 7.4427924156188965, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.7206, + "step": 732 + }, + { + "epoch": 0.04007490124789853, + "grad_norm": 3.81032657623291, + "learning_rate": 1.3351548269581058e-05, + "loss": 1.5482, + "step": 733 + }, + { + "epoch": 0.04012957369162008, + "grad_norm": 3.489546298980713, + "learning_rate": 1.336976320582878e-05, + "loss": 1.5497, + "step": 734 + }, + { + "epoch": 0.040184246135341634, + "grad_norm": 7.5223870277404785, + "learning_rate": 1.3387978142076503e-05, + "loss": 1.7374, + "step": 735 + }, + { + "epoch": 0.040238918579063185, + "grad_norm": 7.0435380935668945, + "learning_rate": 1.3406193078324228e-05, + "loss": 1.8398, + "step": 736 + }, + { + "epoch": 0.04029359102278474, + "grad_norm": 26.91312599182129, + "learning_rate": 1.342440801457195e-05, + "loss": 2.0835, + "step": 737 + }, + { + "epoch": 0.040348263466506294, + "grad_norm": 5.377737522125244, + "learning_rate": 1.3442622950819673e-05, + "loss": 1.659, + "step": 738 + }, + { + "epoch": 0.040402935910227845, + "grad_norm": 23.165529251098633, + "learning_rate": 1.3460837887067397e-05, + "loss": 1.6294, + "step": 739 + }, + { + "epoch": 0.040457608353949404, + "grad_norm": 16.316987991333008, + "learning_rate": 1.347905282331512e-05, + "loss": 1.5186, + "step": 740 + }, + { + "epoch": 0.040512280797670955, + "grad_norm": 11.493973731994629, + "learning_rate": 1.3497267759562842e-05, + "loss": 1.6129, + "step": 741 + }, + { + "epoch": 0.040566953241392506, + "grad_norm": 4.56845235824585, + "learning_rate": 1.3515482695810567e-05, + "loss": 1.6557, + "step": 742 + }, + { + "epoch": 0.04062162568511406, + "grad_norm": 3.6758882999420166, + "learning_rate": 1.353369763205829e-05, + "loss": 1.6312, + "step": 743 + }, + { + "epoch": 0.040676298128835615, + "grad_norm": 5.3206915855407715, + "learning_rate": 1.3551912568306011e-05, + "loss": 1.4385, + "step": 744 + }, + { + "epoch": 0.04073097057255717, + "grad_norm": 6.979442596435547, + "learning_rate": 1.3570127504553736e-05, + "loss": 1.4506, + "step": 745 + }, + { + "epoch": 0.04078564301627872, + "grad_norm": 8.429824829101562, + "learning_rate": 1.3588342440801458e-05, + "loss": 1.7672, + "step": 746 + }, + { + "epoch": 0.040840315460000276, + "grad_norm": 2.2062482833862305, + "learning_rate": 1.3606557377049181e-05, + "loss": 1.4729, + "step": 747 + }, + { + "epoch": 0.04089498790372183, + "grad_norm": 3.669731616973877, + "learning_rate": 1.3624772313296906e-05, + "loss": 1.6853, + "step": 748 + }, + { + "epoch": 0.04094966034744338, + "grad_norm": 6.070624351501465, + "learning_rate": 1.3642987249544628e-05, + "loss": 1.4685, + "step": 749 + }, + { + "epoch": 0.04100433279116493, + "grad_norm": 8.908364295959473, + "learning_rate": 1.366120218579235e-05, + "loss": 1.5133, + "step": 750 + }, + { + "epoch": 0.04105900523488649, + "grad_norm": 16.33085823059082, + "learning_rate": 1.3679417122040073e-05, + "loss": 1.3626, + "step": 751 + }, + { + "epoch": 0.04111367767860804, + "grad_norm": 3.8552441596984863, + "learning_rate": 1.3697632058287798e-05, + "loss": 1.5863, + "step": 752 + }, + { + "epoch": 0.04116835012232959, + "grad_norm": 6.05226993560791, + "learning_rate": 1.371584699453552e-05, + "loss": 1.8569, + "step": 753 + }, + { + "epoch": 0.04122302256605115, + "grad_norm": 2.2400500774383545, + "learning_rate": 1.3734061930783243e-05, + "loss": 1.7029, + "step": 754 + }, + { + "epoch": 0.0412776950097727, + "grad_norm": 3.6601219177246094, + "learning_rate": 1.3752276867030966e-05, + "loss": 1.9266, + "step": 755 + }, + { + "epoch": 0.04133236745349425, + "grad_norm": 2.621263265609741, + "learning_rate": 1.377049180327869e-05, + "loss": 1.648, + "step": 756 + }, + { + "epoch": 0.04138703989721581, + "grad_norm": 1.5985500812530518, + "learning_rate": 1.3788706739526411e-05, + "loss": 1.5011, + "step": 757 + }, + { + "epoch": 0.04144171234093736, + "grad_norm": 4.055543899536133, + "learning_rate": 1.3806921675774136e-05, + "loss": 1.7869, + "step": 758 + }, + { + "epoch": 0.04149638478465891, + "grad_norm": 4.233205318450928, + "learning_rate": 1.382513661202186e-05, + "loss": 1.6338, + "step": 759 + }, + { + "epoch": 0.04155105722838046, + "grad_norm": 4.599460124969482, + "learning_rate": 1.3843351548269581e-05, + "loss": 1.4755, + "step": 760 + }, + { + "epoch": 0.04160572967210202, + "grad_norm": 1.931977391242981, + "learning_rate": 1.3861566484517306e-05, + "loss": 1.4926, + "step": 761 + }, + { + "epoch": 0.04166040211582357, + "grad_norm": 3.5780067443847656, + "learning_rate": 1.3879781420765028e-05, + "loss": 1.235, + "step": 762 + }, + { + "epoch": 0.04171507455954512, + "grad_norm": 3.4355664253234863, + "learning_rate": 1.3897996357012751e-05, + "loss": 1.4965, + "step": 763 + }, + { + "epoch": 0.04176974700326668, + "grad_norm": 4.869495391845703, + "learning_rate": 1.3916211293260475e-05, + "loss": 1.5903, + "step": 764 + }, + { + "epoch": 0.04182441944698823, + "grad_norm": 6.154510974884033, + "learning_rate": 1.3934426229508198e-05, + "loss": 1.3057, + "step": 765 + }, + { + "epoch": 0.041879091890709784, + "grad_norm": 4.640274524688721, + "learning_rate": 1.395264116575592e-05, + "loss": 1.3923, + "step": 766 + }, + { + "epoch": 0.041933764334431335, + "grad_norm": 7.889922142028809, + "learning_rate": 1.3970856102003645e-05, + "loss": 1.6654, + "step": 767 + }, + { + "epoch": 0.04198843677815289, + "grad_norm": 4.304290771484375, + "learning_rate": 1.3989071038251368e-05, + "loss": 1.4957, + "step": 768 + }, + { + "epoch": 0.042043109221874445, + "grad_norm": 4.088436603546143, + "learning_rate": 1.400728597449909e-05, + "loss": 1.6171, + "step": 769 + }, + { + "epoch": 0.042097781665595996, + "grad_norm": 4.8527913093566895, + "learning_rate": 1.4025500910746814e-05, + "loss": 1.6114, + "step": 770 + }, + { + "epoch": 0.042152454109317554, + "grad_norm": 2.623225450515747, + "learning_rate": 1.4043715846994536e-05, + "loss": 1.7311, + "step": 771 + }, + { + "epoch": 0.042207126553039105, + "grad_norm": 1.403700351715088, + "learning_rate": 1.406193078324226e-05, + "loss": 1.6156, + "step": 772 + }, + { + "epoch": 0.042261798996760656, + "grad_norm": 2.2497358322143555, + "learning_rate": 1.4080145719489984e-05, + "loss": 1.5449, + "step": 773 + }, + { + "epoch": 0.04231647144048221, + "grad_norm": 2.3332767486572266, + "learning_rate": 1.4098360655737706e-05, + "loss": 1.5796, + "step": 774 + }, + { + "epoch": 0.042371143884203766, + "grad_norm": 3.9911632537841797, + "learning_rate": 1.411657559198543e-05, + "loss": 1.5169, + "step": 775 + }, + { + "epoch": 0.04242581632792532, + "grad_norm": 8.613127708435059, + "learning_rate": 1.4134790528233153e-05, + "loss": 1.4702, + "step": 776 + }, + { + "epoch": 0.04248048877164687, + "grad_norm": 4.590834140777588, + "learning_rate": 1.4153005464480876e-05, + "loss": 1.68, + "step": 777 + }, + { + "epoch": 0.042535161215368426, + "grad_norm": 2.8305201530456543, + "learning_rate": 1.4171220400728598e-05, + "loss": 1.7465, + "step": 778 + }, + { + "epoch": 0.04258983365908998, + "grad_norm": 3.9445338249206543, + "learning_rate": 1.4189435336976323e-05, + "loss": 1.468, + "step": 779 + }, + { + "epoch": 0.04264450610281153, + "grad_norm": 4.176814556121826, + "learning_rate": 1.4207650273224044e-05, + "loss": 1.7033, + "step": 780 + }, + { + "epoch": 0.04269917854653309, + "grad_norm": 2.7584712505340576, + "learning_rate": 1.4225865209471768e-05, + "loss": 1.6323, + "step": 781 + }, + { + "epoch": 0.04275385099025464, + "grad_norm": 2.9113473892211914, + "learning_rate": 1.4244080145719493e-05, + "loss": 1.3134, + "step": 782 + }, + { + "epoch": 0.04280852343397619, + "grad_norm": 1.8561736345291138, + "learning_rate": 1.4262295081967214e-05, + "loss": 1.3871, + "step": 783 + }, + { + "epoch": 0.04286319587769774, + "grad_norm": 2.4876017570495605, + "learning_rate": 1.4280510018214938e-05, + "loss": 1.4337, + "step": 784 + }, + { + "epoch": 0.0429178683214193, + "grad_norm": 1.7011550664901733, + "learning_rate": 1.4298724954462661e-05, + "loss": 1.3994, + "step": 785 + }, + { + "epoch": 0.04297254076514085, + "grad_norm": 1.6324783563613892, + "learning_rate": 1.4316939890710384e-05, + "loss": 1.8009, + "step": 786 + }, + { + "epoch": 0.0430272132088624, + "grad_norm": 2.866372585296631, + "learning_rate": 1.4335154826958106e-05, + "loss": 1.7095, + "step": 787 + }, + { + "epoch": 0.04308188565258396, + "grad_norm": 2.815638780593872, + "learning_rate": 1.435336976320583e-05, + "loss": 1.7191, + "step": 788 + }, + { + "epoch": 0.04313655809630551, + "grad_norm": 2.441161632537842, + "learning_rate": 1.4371584699453554e-05, + "loss": 1.2543, + "step": 789 + }, + { + "epoch": 0.04319123054002706, + "grad_norm": 2.0225961208343506, + "learning_rate": 1.4389799635701276e-05, + "loss": 1.5309, + "step": 790 + }, + { + "epoch": 0.04324590298374861, + "grad_norm": 3.5986547470092773, + "learning_rate": 1.4408014571949e-05, + "loss": 1.804, + "step": 791 + }, + { + "epoch": 0.04330057542747017, + "grad_norm": 1.4151558876037598, + "learning_rate": 1.4426229508196722e-05, + "loss": 1.5317, + "step": 792 + }, + { + "epoch": 0.04335524787119172, + "grad_norm": 2.052358865737915, + "learning_rate": 1.4444444444444446e-05, + "loss": 1.3323, + "step": 793 + }, + { + "epoch": 0.043409920314913274, + "grad_norm": 2.361806631088257, + "learning_rate": 1.4462659380692167e-05, + "loss": 1.4486, + "step": 794 + }, + { + "epoch": 0.04346459275863483, + "grad_norm": 1.5041462182998657, + "learning_rate": 1.4480874316939892e-05, + "loss": 1.332, + "step": 795 + }, + { + "epoch": 0.04351926520235638, + "grad_norm": 1.6971710920333862, + "learning_rate": 1.4499089253187614e-05, + "loss": 1.6633, + "step": 796 + }, + { + "epoch": 0.043573937646077934, + "grad_norm": 2.8522026538848877, + "learning_rate": 1.4517304189435337e-05, + "loss": 1.605, + "step": 797 + }, + { + "epoch": 0.043628610089799486, + "grad_norm": 2.7455945014953613, + "learning_rate": 1.4535519125683062e-05, + "loss": 1.7904, + "step": 798 + }, + { + "epoch": 0.043683282533521044, + "grad_norm": 2.3787457942962646, + "learning_rate": 1.4553734061930784e-05, + "loss": 1.5623, + "step": 799 + }, + { + "epoch": 0.043737954977242595, + "grad_norm": 1.7269166707992554, + "learning_rate": 1.4571948998178507e-05, + "loss": 1.5057, + "step": 800 + }, + { + "epoch": 0.043792627420964146, + "grad_norm": 2.5024352073669434, + "learning_rate": 1.459016393442623e-05, + "loss": 1.6155, + "step": 801 + }, + { + "epoch": 0.043847299864685704, + "grad_norm": 2.6392643451690674, + "learning_rate": 1.4608378870673954e-05, + "loss": 1.3168, + "step": 802 + }, + { + "epoch": 0.043901972308407256, + "grad_norm": 1.8627870082855225, + "learning_rate": 1.4626593806921676e-05, + "loss": 1.6129, + "step": 803 + }, + { + "epoch": 0.04395664475212881, + "grad_norm": 1.7215907573699951, + "learning_rate": 1.46448087431694e-05, + "loss": 1.8398, + "step": 804 + }, + { + "epoch": 0.044011317195850365, + "grad_norm": 1.863532304763794, + "learning_rate": 1.4663023679417124e-05, + "loss": 1.5942, + "step": 805 + }, + { + "epoch": 0.044065989639571916, + "grad_norm": 1.8992888927459717, + "learning_rate": 1.4681238615664846e-05, + "loss": 1.853, + "step": 806 + }, + { + "epoch": 0.04412066208329347, + "grad_norm": 2.938838005065918, + "learning_rate": 1.469945355191257e-05, + "loss": 1.7013, + "step": 807 + }, + { + "epoch": 0.04417533452701502, + "grad_norm": 2.177415132522583, + "learning_rate": 1.4717668488160292e-05, + "loss": 1.9429, + "step": 808 + }, + { + "epoch": 0.04423000697073658, + "grad_norm": 2.764028310775757, + "learning_rate": 1.4735883424408016e-05, + "loss": 1.6478, + "step": 809 + }, + { + "epoch": 0.04428467941445813, + "grad_norm": 9.729994773864746, + "learning_rate": 1.4754098360655739e-05, + "loss": 1.2768, + "step": 810 + }, + { + "epoch": 0.04433935185817968, + "grad_norm": 4.008709907531738, + "learning_rate": 1.4772313296903462e-05, + "loss": 1.3561, + "step": 811 + }, + { + "epoch": 0.04439402430190124, + "grad_norm": 10.925102233886719, + "learning_rate": 1.4790528233151184e-05, + "loss": 1.5523, + "step": 812 + }, + { + "epoch": 0.04444869674562279, + "grad_norm": 3.32797908782959, + "learning_rate": 1.4808743169398909e-05, + "loss": 1.8055, + "step": 813 + }, + { + "epoch": 0.04450336918934434, + "grad_norm": 15.12276554107666, + "learning_rate": 1.4826958105646632e-05, + "loss": 1.4402, + "step": 814 + }, + { + "epoch": 0.04455804163306589, + "grad_norm": 31.908405303955078, + "learning_rate": 1.4845173041894354e-05, + "loss": 1.7011, + "step": 815 + }, + { + "epoch": 0.04461271407678745, + "grad_norm": 16.702938079833984, + "learning_rate": 1.4863387978142079e-05, + "loss": 1.6679, + "step": 816 + }, + { + "epoch": 0.044667386520509, + "grad_norm": 2.4564831256866455, + "learning_rate": 1.48816029143898e-05, + "loss": 1.6122, + "step": 817 + }, + { + "epoch": 0.04472205896423055, + "grad_norm": 26.65679931640625, + "learning_rate": 1.4899817850637524e-05, + "loss": 1.7926, + "step": 818 + }, + { + "epoch": 0.04477673140795211, + "grad_norm": 5.842408180236816, + "learning_rate": 1.4918032786885249e-05, + "loss": 1.5873, + "step": 819 + }, + { + "epoch": 0.04483140385167366, + "grad_norm": 4.12101936340332, + "learning_rate": 1.493624772313297e-05, + "loss": 1.6405, + "step": 820 + }, + { + "epoch": 0.04488607629539521, + "grad_norm": 2.6663596630096436, + "learning_rate": 1.4954462659380694e-05, + "loss": 1.6161, + "step": 821 + }, + { + "epoch": 0.04494074873911676, + "grad_norm": 3.7673521041870117, + "learning_rate": 1.4972677595628417e-05, + "loss": 1.8488, + "step": 822 + }, + { + "epoch": 0.04499542118283832, + "grad_norm": 4.714693546295166, + "learning_rate": 1.499089253187614e-05, + "loss": 1.7681, + "step": 823 + }, + { + "epoch": 0.04505009362655987, + "grad_norm": 6.4491729736328125, + "learning_rate": 1.5009107468123862e-05, + "loss": 1.3745, + "step": 824 + }, + { + "epoch": 0.045104766070281424, + "grad_norm": 7.360263824462891, + "learning_rate": 1.5027322404371585e-05, + "loss": 1.8032, + "step": 825 + }, + { + "epoch": 0.04515943851400298, + "grad_norm": 5.310680866241455, + "learning_rate": 1.5045537340619309e-05, + "loss": 1.5622, + "step": 826 + }, + { + "epoch": 0.04521411095772453, + "grad_norm": 14.619023323059082, + "learning_rate": 1.5063752276867032e-05, + "loss": 1.8291, + "step": 827 + }, + { + "epoch": 0.045268783401446085, + "grad_norm": 10.877686500549316, + "learning_rate": 1.5081967213114754e-05, + "loss": 1.4486, + "step": 828 + }, + { + "epoch": 0.04532345584516764, + "grad_norm": 5.116811752319336, + "learning_rate": 1.5100182149362479e-05, + "loss": 1.5034, + "step": 829 + }, + { + "epoch": 0.045378128288889194, + "grad_norm": 5.1252007484436035, + "learning_rate": 1.5118397085610202e-05, + "loss": 1.7112, + "step": 830 + }, + { + "epoch": 0.045432800732610745, + "grad_norm": 2.890305995941162, + "learning_rate": 1.5136612021857924e-05, + "loss": 1.5055, + "step": 831 + }, + { + "epoch": 0.045487473176332296, + "grad_norm": 3.1877706050872803, + "learning_rate": 1.5154826958105649e-05, + "loss": 1.7636, + "step": 832 + }, + { + "epoch": 0.045542145620053855, + "grad_norm": 4.178686618804932, + "learning_rate": 1.517304189435337e-05, + "loss": 1.6676, + "step": 833 + }, + { + "epoch": 0.045596818063775406, + "grad_norm": 4.838871002197266, + "learning_rate": 1.5191256830601094e-05, + "loss": 1.5549, + "step": 834 + }, + { + "epoch": 0.04565149050749696, + "grad_norm": 3.2195870876312256, + "learning_rate": 1.5209471766848819e-05, + "loss": 1.5499, + "step": 835 + }, + { + "epoch": 0.045706162951218515, + "grad_norm": 2.431100606918335, + "learning_rate": 1.522768670309654e-05, + "loss": 1.6525, + "step": 836 + }, + { + "epoch": 0.045760835394940066, + "grad_norm": 2.3097457885742188, + "learning_rate": 1.5245901639344264e-05, + "loss": 1.8803, + "step": 837 + }, + { + "epoch": 0.04581550783866162, + "grad_norm": 1.7990683317184448, + "learning_rate": 1.526411657559199e-05, + "loss": 1.754, + "step": 838 + }, + { + "epoch": 0.04587018028238317, + "grad_norm": 1.9642953872680664, + "learning_rate": 1.528233151183971e-05, + "loss": 1.5911, + "step": 839 + }, + { + "epoch": 0.04592485272610473, + "grad_norm": 3.675048828125, + "learning_rate": 1.5300546448087432e-05, + "loss": 1.6634, + "step": 840 + }, + { + "epoch": 0.04597952516982628, + "grad_norm": 1.6151901483535767, + "learning_rate": 1.5318761384335155e-05, + "loss": 1.385, + "step": 841 + }, + { + "epoch": 0.04603419761354783, + "grad_norm": 3.0301644802093506, + "learning_rate": 1.533697632058288e-05, + "loss": 1.6138, + "step": 842 + }, + { + "epoch": 0.04608887005726939, + "grad_norm": 2.950812578201294, + "learning_rate": 1.5355191256830602e-05, + "loss": 1.8304, + "step": 843 + }, + { + "epoch": 0.04614354250099094, + "grad_norm": 1.9085733890533447, + "learning_rate": 1.5373406193078325e-05, + "loss": 1.5243, + "step": 844 + }, + { + "epoch": 0.04619821494471249, + "grad_norm": 2.7243518829345703, + "learning_rate": 1.539162112932605e-05, + "loss": 1.7211, + "step": 845 + }, + { + "epoch": 0.04625288738843404, + "grad_norm": 1.7228420972824097, + "learning_rate": 1.5409836065573772e-05, + "loss": 1.2421, + "step": 846 + }, + { + "epoch": 0.0463075598321556, + "grad_norm": 2.2128946781158447, + "learning_rate": 1.5428051001821495e-05, + "loss": 1.7334, + "step": 847 + }, + { + "epoch": 0.04636223227587715, + "grad_norm": 1.8893773555755615, + "learning_rate": 1.544626593806922e-05, + "loss": 1.7279, + "step": 848 + }, + { + "epoch": 0.0464169047195987, + "grad_norm": 3.061072587966919, + "learning_rate": 1.5464480874316942e-05, + "loss": 1.4947, + "step": 849 + }, + { + "epoch": 0.04647157716332026, + "grad_norm": 2.2656469345092773, + "learning_rate": 1.5482695810564665e-05, + "loss": 1.4287, + "step": 850 + }, + { + "epoch": 0.04652624960704181, + "grad_norm": 1.6797529458999634, + "learning_rate": 1.550091074681239e-05, + "loss": 1.7136, + "step": 851 + }, + { + "epoch": 0.04658092205076336, + "grad_norm": 3.749856472015381, + "learning_rate": 1.551912568306011e-05, + "loss": 1.6531, + "step": 852 + }, + { + "epoch": 0.046635594494484914, + "grad_norm": 2.621014356613159, + "learning_rate": 1.5537340619307835e-05, + "loss": 1.7478, + "step": 853 + }, + { + "epoch": 0.04669026693820647, + "grad_norm": 2.863607168197632, + "learning_rate": 1.555555555555556e-05, + "loss": 1.6376, + "step": 854 + }, + { + "epoch": 0.04674493938192802, + "grad_norm": 2.899007558822632, + "learning_rate": 1.5573770491803278e-05, + "loss": 1.3966, + "step": 855 + }, + { + "epoch": 0.046799611825649574, + "grad_norm": 1.8467987775802612, + "learning_rate": 1.5591985428051005e-05, + "loss": 1.7071, + "step": 856 + }, + { + "epoch": 0.04685428426937113, + "grad_norm": 2.7616074085235596, + "learning_rate": 1.5610200364298725e-05, + "loss": 1.6278, + "step": 857 + }, + { + "epoch": 0.046908956713092684, + "grad_norm": 2.4087319374084473, + "learning_rate": 1.5628415300546448e-05, + "loss": 1.4592, + "step": 858 + }, + { + "epoch": 0.046963629156814235, + "grad_norm": 2.3129115104675293, + "learning_rate": 1.5646630236794175e-05, + "loss": 1.535, + "step": 859 + }, + { + "epoch": 0.04701830160053579, + "grad_norm": 2.688664197921753, + "learning_rate": 1.5664845173041895e-05, + "loss": 1.305, + "step": 860 + }, + { + "epoch": 0.047072974044257344, + "grad_norm": 2.0663363933563232, + "learning_rate": 1.5683060109289618e-05, + "loss": 1.6664, + "step": 861 + }, + { + "epoch": 0.047127646487978896, + "grad_norm": 2.102513551712036, + "learning_rate": 1.570127504553734e-05, + "loss": 1.6047, + "step": 862 + }, + { + "epoch": 0.04718231893170045, + "grad_norm": 2.729097366333008, + "learning_rate": 1.5719489981785065e-05, + "loss": 1.7964, + "step": 863 + }, + { + "epoch": 0.047236991375422005, + "grad_norm": 2.8713314533233643, + "learning_rate": 1.5737704918032788e-05, + "loss": 1.5967, + "step": 864 + }, + { + "epoch": 0.047291663819143556, + "grad_norm": 2.2929441928863525, + "learning_rate": 1.575591985428051e-05, + "loss": 1.5383, + "step": 865 + }, + { + "epoch": 0.04734633626286511, + "grad_norm": 1.666416049003601, + "learning_rate": 1.5774134790528235e-05, + "loss": 1.4506, + "step": 866 + }, + { + "epoch": 0.047401008706586666, + "grad_norm": 1.6965878009796143, + "learning_rate": 1.5792349726775958e-05, + "loss": 1.3768, + "step": 867 + }, + { + "epoch": 0.04745568115030822, + "grad_norm": 1.5463093519210815, + "learning_rate": 1.5810564663023678e-05, + "loss": 1.486, + "step": 868 + }, + { + "epoch": 0.04751035359402977, + "grad_norm": 2.248769521713257, + "learning_rate": 1.5828779599271405e-05, + "loss": 1.4229, + "step": 869 + }, + { + "epoch": 0.04756502603775132, + "grad_norm": 3.0947179794311523, + "learning_rate": 1.5846994535519128e-05, + "loss": 1.5107, + "step": 870 + }, + { + "epoch": 0.04761969848147288, + "grad_norm": 2.0832679271698, + "learning_rate": 1.5865209471766848e-05, + "loss": 1.5296, + "step": 871 + }, + { + "epoch": 0.04767437092519443, + "grad_norm": 1.7443065643310547, + "learning_rate": 1.5883424408014575e-05, + "loss": 1.6744, + "step": 872 + }, + { + "epoch": 0.04772904336891598, + "grad_norm": 2.591224193572998, + "learning_rate": 1.5901639344262295e-05, + "loss": 1.6144, + "step": 873 + }, + { + "epoch": 0.04778371581263754, + "grad_norm": 2.373527765274048, + "learning_rate": 1.5919854280510018e-05, + "loss": 1.9148, + "step": 874 + }, + { + "epoch": 0.04783838825635909, + "grad_norm": 2.1174418926239014, + "learning_rate": 1.5938069216757745e-05, + "loss": 1.6319, + "step": 875 + }, + { + "epoch": 0.04789306070008064, + "grad_norm": 3.7443857192993164, + "learning_rate": 1.5956284153005465e-05, + "loss": 1.1655, + "step": 876 + }, + { + "epoch": 0.04794773314380219, + "grad_norm": 3.386439800262451, + "learning_rate": 1.5974499089253188e-05, + "loss": 1.8926, + "step": 877 + }, + { + "epoch": 0.04800240558752375, + "grad_norm": 3.6396267414093018, + "learning_rate": 1.599271402550091e-05, + "loss": 1.6671, + "step": 878 + }, + { + "epoch": 0.0480570780312453, + "grad_norm": 9.750361442565918, + "learning_rate": 1.6010928961748635e-05, + "loss": 1.2522, + "step": 879 + }, + { + "epoch": 0.04811175047496685, + "grad_norm": 4.4216485023498535, + "learning_rate": 1.6029143897996358e-05, + "loss": 1.965, + "step": 880 + }, + { + "epoch": 0.04816642291868841, + "grad_norm": 11.075642585754395, + "learning_rate": 1.604735883424408e-05, + "loss": 1.4806, + "step": 881 + }, + { + "epoch": 0.04822109536240996, + "grad_norm": 4.035082817077637, + "learning_rate": 1.6065573770491805e-05, + "loss": 1.6505, + "step": 882 + }, + { + "epoch": 0.04827576780613151, + "grad_norm": 5.7525835037231445, + "learning_rate": 1.6083788706739528e-05, + "loss": 1.6207, + "step": 883 + }, + { + "epoch": 0.04833044024985307, + "grad_norm": 7.136516094207764, + "learning_rate": 1.610200364298725e-05, + "loss": 1.5062, + "step": 884 + }, + { + "epoch": 0.04838511269357462, + "grad_norm": 2.670062780380249, + "learning_rate": 1.6120218579234975e-05, + "loss": 1.6042, + "step": 885 + }, + { + "epoch": 0.048439785137296174, + "grad_norm": 2.509127140045166, + "learning_rate": 1.6138433515482698e-05, + "loss": 1.6908, + "step": 886 + }, + { + "epoch": 0.048494457581017725, + "grad_norm": 3.4124467372894287, + "learning_rate": 1.615664845173042e-05, + "loss": 1.6097, + "step": 887 + }, + { + "epoch": 0.04854913002473928, + "grad_norm": 4.130401611328125, + "learning_rate": 1.6174863387978145e-05, + "loss": 1.5518, + "step": 888 + }, + { + "epoch": 0.048603802468460834, + "grad_norm": 4.454240322113037, + "learning_rate": 1.6193078324225864e-05, + "loss": 1.7198, + "step": 889 + }, + { + "epoch": 0.048658474912182385, + "grad_norm": 2.7059013843536377, + "learning_rate": 1.621129326047359e-05, + "loss": 1.4956, + "step": 890 + }, + { + "epoch": 0.048713147355903944, + "grad_norm": 3.9797720909118652, + "learning_rate": 1.6229508196721314e-05, + "loss": 1.4383, + "step": 891 + }, + { + "epoch": 0.048767819799625495, + "grad_norm": 2.8550500869750977, + "learning_rate": 1.6247723132969034e-05, + "loss": 1.6116, + "step": 892 + }, + { + "epoch": 0.048822492243347046, + "grad_norm": 5.759282112121582, + "learning_rate": 1.626593806921676e-05, + "loss": 1.6532, + "step": 893 + }, + { + "epoch": 0.0488771646870686, + "grad_norm": 8.198695182800293, + "learning_rate": 1.628415300546448e-05, + "loss": 1.6537, + "step": 894 + }, + { + "epoch": 0.048931837130790155, + "grad_norm": 2.3857944011688232, + "learning_rate": 1.6302367941712204e-05, + "loss": 1.408, + "step": 895 + }, + { + "epoch": 0.04898650957451171, + "grad_norm": 2.52425479888916, + "learning_rate": 1.632058287795993e-05, + "loss": 1.5587, + "step": 896 + }, + { + "epoch": 0.04904118201823326, + "grad_norm": 2.038750171661377, + "learning_rate": 1.633879781420765e-05, + "loss": 1.7929, + "step": 897 + }, + { + "epoch": 0.049095854461954816, + "grad_norm": 4.93236780166626, + "learning_rate": 1.6357012750455374e-05, + "loss": 1.6365, + "step": 898 + }, + { + "epoch": 0.04915052690567637, + "grad_norm": 3.5422043800354004, + "learning_rate": 1.6375227686703098e-05, + "loss": 1.5392, + "step": 899 + }, + { + "epoch": 0.04920519934939792, + "grad_norm": 1.8570632934570312, + "learning_rate": 1.639344262295082e-05, + "loss": 1.6752, + "step": 900 + }, + { + "epoch": 0.04925987179311947, + "grad_norm": 1.9238133430480957, + "learning_rate": 1.6411657559198544e-05, + "loss": 1.5635, + "step": 901 + }, + { + "epoch": 0.04931454423684103, + "grad_norm": 5.0700883865356445, + "learning_rate": 1.6429872495446268e-05, + "loss": 1.6929, + "step": 902 + }, + { + "epoch": 0.04936921668056258, + "grad_norm": 6.3009033203125, + "learning_rate": 1.644808743169399e-05, + "loss": 1.4278, + "step": 903 + }, + { + "epoch": 0.04942388912428413, + "grad_norm": 2.8622214794158936, + "learning_rate": 1.6466302367941714e-05, + "loss": 1.6498, + "step": 904 + }, + { + "epoch": 0.04947856156800569, + "grad_norm": 3.630887508392334, + "learning_rate": 1.6484517304189434e-05, + "loss": 1.6918, + "step": 905 + }, + { + "epoch": 0.04953323401172724, + "grad_norm": 2.7171058654785156, + "learning_rate": 1.650273224043716e-05, + "loss": 1.6057, + "step": 906 + }, + { + "epoch": 0.04958790645544879, + "grad_norm": 1.6451711654663086, + "learning_rate": 1.6520947176684884e-05, + "loss": 1.9116, + "step": 907 + }, + { + "epoch": 0.04964257889917035, + "grad_norm": 2.8321664333343506, + "learning_rate": 1.6539162112932604e-05, + "loss": 1.4081, + "step": 908 + }, + { + "epoch": 0.0496972513428919, + "grad_norm": 2.3084187507629395, + "learning_rate": 1.655737704918033e-05, + "loss": 1.7407, + "step": 909 + }, + { + "epoch": 0.04975192378661345, + "grad_norm": 5.045464992523193, + "learning_rate": 1.657559198542805e-05, + "loss": 1.588, + "step": 910 + }, + { + "epoch": 0.049806596230335, + "grad_norm": 3.9404139518737793, + "learning_rate": 1.6593806921675774e-05, + "loss": 1.6301, + "step": 911 + }, + { + "epoch": 0.04986126867405656, + "grad_norm": 3.556208848953247, + "learning_rate": 1.66120218579235e-05, + "loss": 1.4895, + "step": 912 + }, + { + "epoch": 0.04991594111777811, + "grad_norm": 2.9763407707214355, + "learning_rate": 1.663023679417122e-05, + "loss": 1.1494, + "step": 913 + }, + { + "epoch": 0.04997061356149966, + "grad_norm": 1.8631376028060913, + "learning_rate": 1.6648451730418944e-05, + "loss": 1.6293, + "step": 914 + }, + { + "epoch": 0.05002528600522122, + "grad_norm": 1.565271019935608, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.6296, + "step": 915 + }, + { + "epoch": 0.05007995844894277, + "grad_norm": 1.7461433410644531, + "learning_rate": 1.668488160291439e-05, + "loss": 1.6761, + "step": 916 + }, + { + "epoch": 0.050134630892664324, + "grad_norm": 4.231555938720703, + "learning_rate": 1.6703096539162114e-05, + "loss": 1.4673, + "step": 917 + }, + { + "epoch": 0.050189303336385875, + "grad_norm": 2.4062516689300537, + "learning_rate": 1.6721311475409837e-05, + "loss": 1.7348, + "step": 918 + }, + { + "epoch": 0.05024397578010743, + "grad_norm": 1.8989382982254028, + "learning_rate": 1.673952641165756e-05, + "loss": 1.5879, + "step": 919 + }, + { + "epoch": 0.050298648223828984, + "grad_norm": 1.794924259185791, + "learning_rate": 1.6757741347905284e-05, + "loss": 1.7125, + "step": 920 + }, + { + "epoch": 0.050353320667550536, + "grad_norm": 2.8212039470672607, + "learning_rate": 1.6775956284153007e-05, + "loss": 1.6634, + "step": 921 + }, + { + "epoch": 0.050407993111272094, + "grad_norm": 2.0903565883636475, + "learning_rate": 1.679417122040073e-05, + "loss": 1.6526, + "step": 922 + }, + { + "epoch": 0.050462665554993645, + "grad_norm": 2.039689064025879, + "learning_rate": 1.6812386156648454e-05, + "loss": 1.5238, + "step": 923 + }, + { + "epoch": 0.050517337998715196, + "grad_norm": 2.937307119369507, + "learning_rate": 1.6830601092896177e-05, + "loss": 1.7526, + "step": 924 + }, + { + "epoch": 0.05057201044243675, + "grad_norm": 2.825777530670166, + "learning_rate": 1.68488160291439e-05, + "loss": 1.5619, + "step": 925 + }, + { + "epoch": 0.050626682886158306, + "grad_norm": 2.712083101272583, + "learning_rate": 1.686703096539162e-05, + "loss": 1.5874, + "step": 926 + }, + { + "epoch": 0.05068135532987986, + "grad_norm": 2.466867446899414, + "learning_rate": 1.6885245901639347e-05, + "loss": 1.5934, + "step": 927 + }, + { + "epoch": 0.05073602777360141, + "grad_norm": 3.316906452178955, + "learning_rate": 1.690346083788707e-05, + "loss": 1.6878, + "step": 928 + }, + { + "epoch": 0.050790700217322966, + "grad_norm": 1.5599945783615112, + "learning_rate": 1.692167577413479e-05, + "loss": 1.4968, + "step": 929 + }, + { + "epoch": 0.05084537266104452, + "grad_norm": 1.9565238952636719, + "learning_rate": 1.6939890710382517e-05, + "loss": 1.7832, + "step": 930 + }, + { + "epoch": 0.05090004510476607, + "grad_norm": 2.2707667350769043, + "learning_rate": 1.6958105646630237e-05, + "loss": 1.4037, + "step": 931 + }, + { + "epoch": 0.05095471754848763, + "grad_norm": 1.5088948011398315, + "learning_rate": 1.697632058287796e-05, + "loss": 1.6299, + "step": 932 + }, + { + "epoch": 0.05100938999220918, + "grad_norm": 2.15570330619812, + "learning_rate": 1.6994535519125684e-05, + "loss": 1.7606, + "step": 933 + }, + { + "epoch": 0.05106406243593073, + "grad_norm": 1.922813892364502, + "learning_rate": 1.7012750455373407e-05, + "loss": 1.4682, + "step": 934 + }, + { + "epoch": 0.05111873487965228, + "grad_norm": 1.7128934860229492, + "learning_rate": 1.703096539162113e-05, + "loss": 1.4399, + "step": 935 + }, + { + "epoch": 0.05117340732337384, + "grad_norm": 1.4315885305404663, + "learning_rate": 1.7049180327868854e-05, + "loss": 1.6916, + "step": 936 + }, + { + "epoch": 0.05122807976709539, + "grad_norm": 1.6494004726409912, + "learning_rate": 1.7067395264116577e-05, + "loss": 1.3038, + "step": 937 + }, + { + "epoch": 0.05128275221081694, + "grad_norm": 2.199384927749634, + "learning_rate": 1.70856102003643e-05, + "loss": 1.5736, + "step": 938 + }, + { + "epoch": 0.0513374246545385, + "grad_norm": 1.627447485923767, + "learning_rate": 1.7103825136612024e-05, + "loss": 1.87, + "step": 939 + }, + { + "epoch": 0.05139209709826005, + "grad_norm": 1.699062466621399, + "learning_rate": 1.7122040072859747e-05, + "loss": 1.3316, + "step": 940 + }, + { + "epoch": 0.0514467695419816, + "grad_norm": 1.5647820234298706, + "learning_rate": 1.714025500910747e-05, + "loss": 1.603, + "step": 941 + }, + { + "epoch": 0.05150144198570315, + "grad_norm": 3.68096661567688, + "learning_rate": 1.715846994535519e-05, + "loss": 1.6899, + "step": 942 + }, + { + "epoch": 0.05155611442942471, + "grad_norm": 2.551361322402954, + "learning_rate": 1.7176684881602917e-05, + "loss": 1.7328, + "step": 943 + }, + { + "epoch": 0.05161078687314626, + "grad_norm": 3.2037527561187744, + "learning_rate": 1.719489981785064e-05, + "loss": 1.5985, + "step": 944 + }, + { + "epoch": 0.051665459316867814, + "grad_norm": 5.474474906921387, + "learning_rate": 1.721311475409836e-05, + "loss": 1.6211, + "step": 945 + }, + { + "epoch": 0.05172013176058937, + "grad_norm": 2.674117088317871, + "learning_rate": 1.7231329690346087e-05, + "loss": 1.5618, + "step": 946 + }, + { + "epoch": 0.05177480420431092, + "grad_norm": 2.140418529510498, + "learning_rate": 1.7249544626593807e-05, + "loss": 1.3224, + "step": 947 + }, + { + "epoch": 0.051829476648032474, + "grad_norm": 2.601842164993286, + "learning_rate": 1.726775956284153e-05, + "loss": 1.6155, + "step": 948 + }, + { + "epoch": 0.051884149091754025, + "grad_norm": 3.2556211948394775, + "learning_rate": 1.7285974499089254e-05, + "loss": 1.4541, + "step": 949 + }, + { + "epoch": 0.051938821535475584, + "grad_norm": 6.209150791168213, + "learning_rate": 1.7304189435336977e-05, + "loss": 1.4197, + "step": 950 + }, + { + "epoch": 0.051993493979197135, + "grad_norm": 4.165321350097656, + "learning_rate": 1.73224043715847e-05, + "loss": 1.6421, + "step": 951 + }, + { + "epoch": 0.052048166422918686, + "grad_norm": 2.3490147590637207, + "learning_rate": 1.7340619307832424e-05, + "loss": 1.5748, + "step": 952 + }, + { + "epoch": 0.052102838866640244, + "grad_norm": 4.193833827972412, + "learning_rate": 1.7358834244080147e-05, + "loss": 1.3633, + "step": 953 + }, + { + "epoch": 0.052157511310361795, + "grad_norm": 3.876671314239502, + "learning_rate": 1.737704918032787e-05, + "loss": 1.787, + "step": 954 + }, + { + "epoch": 0.05221218375408335, + "grad_norm": 1.9177958965301514, + "learning_rate": 1.7395264116575594e-05, + "loss": 1.3382, + "step": 955 + }, + { + "epoch": 0.052266856197804905, + "grad_norm": 3.0565826892852783, + "learning_rate": 1.7413479052823317e-05, + "loss": 1.5429, + "step": 956 + }, + { + "epoch": 0.052321528641526456, + "grad_norm": 2.192924976348877, + "learning_rate": 1.743169398907104e-05, + "loss": 1.5592, + "step": 957 + }, + { + "epoch": 0.05237620108524801, + "grad_norm": 1.8788261413574219, + "learning_rate": 1.7449908925318764e-05, + "loss": 1.4246, + "step": 958 + }, + { + "epoch": 0.05243087352896956, + "grad_norm": 1.7976441383361816, + "learning_rate": 1.7468123861566487e-05, + "loss": 1.4059, + "step": 959 + }, + { + "epoch": 0.05248554597269112, + "grad_norm": 2.355699062347412, + "learning_rate": 1.7486338797814207e-05, + "loss": 1.586, + "step": 960 + }, + { + "epoch": 0.05254021841641267, + "grad_norm": 3.565916061401367, + "learning_rate": 1.7504553734061934e-05, + "loss": 1.6528, + "step": 961 + }, + { + "epoch": 0.05259489086013422, + "grad_norm": 2.7488372325897217, + "learning_rate": 1.7522768670309657e-05, + "loss": 1.6992, + "step": 962 + }, + { + "epoch": 0.05264956330385578, + "grad_norm": 1.9379616975784302, + "learning_rate": 1.7540983606557377e-05, + "loss": 1.6758, + "step": 963 + }, + { + "epoch": 0.05270423574757733, + "grad_norm": 1.7633461952209473, + "learning_rate": 1.7559198542805104e-05, + "loss": 1.4845, + "step": 964 + }, + { + "epoch": 0.05275890819129888, + "grad_norm": 2.016842842102051, + "learning_rate": 1.7577413479052823e-05, + "loss": 1.6792, + "step": 965 + }, + { + "epoch": 0.05281358063502043, + "grad_norm": 1.716569423675537, + "learning_rate": 1.7595628415300547e-05, + "loss": 1.471, + "step": 966 + }, + { + "epoch": 0.05286825307874199, + "grad_norm": 2.4787182807922363, + "learning_rate": 1.7613843351548273e-05, + "loss": 1.5561, + "step": 967 + }, + { + "epoch": 0.05292292552246354, + "grad_norm": 1.9355578422546387, + "learning_rate": 1.7632058287795993e-05, + "loss": 1.8631, + "step": 968 + }, + { + "epoch": 0.05297759796618509, + "grad_norm": 2.3031837940216064, + "learning_rate": 1.7650273224043717e-05, + "loss": 1.5271, + "step": 969 + }, + { + "epoch": 0.05303227040990665, + "grad_norm": 1.8765790462493896, + "learning_rate": 1.766848816029144e-05, + "loss": 1.537, + "step": 970 + }, + { + "epoch": 0.0530869428536282, + "grad_norm": 3.319232940673828, + "learning_rate": 1.7686703096539163e-05, + "loss": 1.7121, + "step": 971 + }, + { + "epoch": 0.05314161529734975, + "grad_norm": 1.9775952100753784, + "learning_rate": 1.7704918032786887e-05, + "loss": 1.4915, + "step": 972 + }, + { + "epoch": 0.0531962877410713, + "grad_norm": 2.965593099594116, + "learning_rate": 1.772313296903461e-05, + "loss": 1.5316, + "step": 973 + }, + { + "epoch": 0.05325096018479286, + "grad_norm": 2.299473524093628, + "learning_rate": 1.7741347905282333e-05, + "loss": 1.8616, + "step": 974 + }, + { + "epoch": 0.05330563262851441, + "grad_norm": 2.1554927825927734, + "learning_rate": 1.7759562841530057e-05, + "loss": 1.4759, + "step": 975 + }, + { + "epoch": 0.053360305072235964, + "grad_norm": 2.0652525424957275, + "learning_rate": 1.7777777777777777e-05, + "loss": 1.3859, + "step": 976 + }, + { + "epoch": 0.05341497751595752, + "grad_norm": 1.6338690519332886, + "learning_rate": 1.7795992714025503e-05, + "loss": 1.7396, + "step": 977 + }, + { + "epoch": 0.05346964995967907, + "grad_norm": 3.920269250869751, + "learning_rate": 1.7814207650273227e-05, + "loss": 1.8321, + "step": 978 + }, + { + "epoch": 0.053524322403400625, + "grad_norm": 2.5900590419769287, + "learning_rate": 1.7832422586520947e-05, + "loss": 1.6051, + "step": 979 + }, + { + "epoch": 0.053578994847122176, + "grad_norm": 3.5914340019226074, + "learning_rate": 1.7850637522768673e-05, + "loss": 1.8238, + "step": 980 + }, + { + "epoch": 0.053633667290843734, + "grad_norm": 1.3633123636245728, + "learning_rate": 1.7868852459016393e-05, + "loss": 1.6602, + "step": 981 + }, + { + "epoch": 0.053688339734565285, + "grad_norm": 1.959674596786499, + "learning_rate": 1.7887067395264117e-05, + "loss": 1.5478, + "step": 982 + }, + { + "epoch": 0.053743012178286836, + "grad_norm": 2.7293405532836914, + "learning_rate": 1.7905282331511843e-05, + "loss": 1.3227, + "step": 983 + }, + { + "epoch": 0.053797684622008395, + "grad_norm": 2.0026705265045166, + "learning_rate": 1.7923497267759563e-05, + "loss": 1.5493, + "step": 984 + }, + { + "epoch": 0.053852357065729946, + "grad_norm": 3.0648066997528076, + "learning_rate": 1.7941712204007287e-05, + "loss": 1.6858, + "step": 985 + }, + { + "epoch": 0.0539070295094515, + "grad_norm": 3.3536529541015625, + "learning_rate": 1.795992714025501e-05, + "loss": 1.4773, + "step": 986 + }, + { + "epoch": 0.053961701953173055, + "grad_norm": 1.492261290550232, + "learning_rate": 1.7978142076502733e-05, + "loss": 1.4719, + "step": 987 + }, + { + "epoch": 0.054016374396894606, + "grad_norm": 6.326408386230469, + "learning_rate": 1.7996357012750456e-05, + "loss": 1.4966, + "step": 988 + }, + { + "epoch": 0.05407104684061616, + "grad_norm": 2.0717921257019043, + "learning_rate": 1.801457194899818e-05, + "loss": 1.7388, + "step": 989 + }, + { + "epoch": 0.05412571928433771, + "grad_norm": 2.880075216293335, + "learning_rate": 1.8032786885245903e-05, + "loss": 1.8364, + "step": 990 + }, + { + "epoch": 0.05418039172805927, + "grad_norm": 1.6246341466903687, + "learning_rate": 1.8051001821493626e-05, + "loss": 1.418, + "step": 991 + }, + { + "epoch": 0.05423506417178082, + "grad_norm": 1.9155960083007812, + "learning_rate": 1.806921675774135e-05, + "loss": 1.4856, + "step": 992 + }, + { + "epoch": 0.05428973661550237, + "grad_norm": 1.44743812084198, + "learning_rate": 1.8087431693989073e-05, + "loss": 1.4939, + "step": 993 + }, + { + "epoch": 0.05434440905922393, + "grad_norm": 2.492051601409912, + "learning_rate": 1.8105646630236796e-05, + "loss": 1.4252, + "step": 994 + }, + { + "epoch": 0.05439908150294548, + "grad_norm": 2.219905376434326, + "learning_rate": 1.812386156648452e-05, + "loss": 1.5234, + "step": 995 + }, + { + "epoch": 0.05445375394666703, + "grad_norm": 6.1376495361328125, + "learning_rate": 1.8142076502732243e-05, + "loss": 1.6073, + "step": 996 + }, + { + "epoch": 0.05450842639038858, + "grad_norm": 2.062927484512329, + "learning_rate": 1.8160291438979963e-05, + "loss": 1.7893, + "step": 997 + }, + { + "epoch": 0.05456309883411014, + "grad_norm": 1.8314250707626343, + "learning_rate": 1.817850637522769e-05, + "loss": 1.3966, + "step": 998 + }, + { + "epoch": 0.05461777127783169, + "grad_norm": 1.7837039232254028, + "learning_rate": 1.8196721311475413e-05, + "loss": 1.592, + "step": 999 + }, + { + "epoch": 0.05467244372155324, + "grad_norm": 1.7994149923324585, + "learning_rate": 1.8214936247723133e-05, + "loss": 1.7109, + "step": 1000 + }, + { + "epoch": 0.0547271161652748, + "grad_norm": 3.149341106414795, + "learning_rate": 1.823315118397086e-05, + "loss": 1.4277, + "step": 1001 + }, + { + "epoch": 0.05478178860899635, + "grad_norm": 5.719334602355957, + "learning_rate": 1.825136612021858e-05, + "loss": 1.28, + "step": 1002 + }, + { + "epoch": 0.0548364610527179, + "grad_norm": 2.6708619594573975, + "learning_rate": 1.8269581056466303e-05, + "loss": 1.3966, + "step": 1003 + }, + { + "epoch": 0.054891133496439454, + "grad_norm": 3.707876205444336, + "learning_rate": 1.828779599271403e-05, + "loss": 1.1891, + "step": 1004 + }, + { + "epoch": 0.05494580594016101, + "grad_norm": 4.38498067855835, + "learning_rate": 1.830601092896175e-05, + "loss": 1.6318, + "step": 1005 + }, + { + "epoch": 0.05500047838388256, + "grad_norm": 2.046452283859253, + "learning_rate": 1.8324225865209473e-05, + "loss": 1.5391, + "step": 1006 + }, + { + "epoch": 0.055055150827604114, + "grad_norm": 3.3421735763549805, + "learning_rate": 1.8342440801457196e-05, + "loss": 1.2313, + "step": 1007 + }, + { + "epoch": 0.05510982327132567, + "grad_norm": 2.865300416946411, + "learning_rate": 1.836065573770492e-05, + "loss": 1.687, + "step": 1008 + }, + { + "epoch": 0.055164495715047224, + "grad_norm": 4.855372905731201, + "learning_rate": 1.8378870673952643e-05, + "loss": 1.5206, + "step": 1009 + }, + { + "epoch": 0.055219168158768775, + "grad_norm": 4.944165229797363, + "learning_rate": 1.8397085610200366e-05, + "loss": 1.3955, + "step": 1010 + }, + { + "epoch": 0.05527384060249033, + "grad_norm": 6.63599967956543, + "learning_rate": 1.841530054644809e-05, + "loss": 1.4876, + "step": 1011 + }, + { + "epoch": 0.055328513046211884, + "grad_norm": 2.5565483570098877, + "learning_rate": 1.8433515482695813e-05, + "loss": 1.4473, + "step": 1012 + }, + { + "epoch": 0.055383185489933436, + "grad_norm": 8.898396492004395, + "learning_rate": 1.8451730418943533e-05, + "loss": 1.485, + "step": 1013 + }, + { + "epoch": 0.05543785793365499, + "grad_norm": 2.4452826976776123, + "learning_rate": 1.846994535519126e-05, + "loss": 1.6475, + "step": 1014 + }, + { + "epoch": 0.055492530377376545, + "grad_norm": 2.4291820526123047, + "learning_rate": 1.8488160291438983e-05, + "loss": 1.3727, + "step": 1015 + }, + { + "epoch": 0.055547202821098096, + "grad_norm": 1.7830654382705688, + "learning_rate": 1.8506375227686703e-05, + "loss": 1.3518, + "step": 1016 + }, + { + "epoch": 0.05560187526481965, + "grad_norm": 2.2720656394958496, + "learning_rate": 1.852459016393443e-05, + "loss": 1.47, + "step": 1017 + }, + { + "epoch": 0.055656547708541206, + "grad_norm": 2.072298765182495, + "learning_rate": 1.854280510018215e-05, + "loss": 1.5845, + "step": 1018 + }, + { + "epoch": 0.05571122015226276, + "grad_norm": 1.3260337114334106, + "learning_rate": 1.8561020036429873e-05, + "loss": 1.7098, + "step": 1019 + }, + { + "epoch": 0.05576589259598431, + "grad_norm": 2.1656899452209473, + "learning_rate": 1.85792349726776e-05, + "loss": 1.5509, + "step": 1020 + }, + { + "epoch": 0.05582056503970586, + "grad_norm": 1.3197749853134155, + "learning_rate": 1.859744990892532e-05, + "loss": 1.3817, + "step": 1021 + }, + { + "epoch": 0.05587523748342742, + "grad_norm": 2.1817290782928467, + "learning_rate": 1.8615664845173043e-05, + "loss": 1.655, + "step": 1022 + }, + { + "epoch": 0.05592990992714897, + "grad_norm": 1.4603151082992554, + "learning_rate": 1.8633879781420766e-05, + "loss": 1.445, + "step": 1023 + }, + { + "epoch": 0.05598458237087052, + "grad_norm": 1.3092000484466553, + "learning_rate": 1.865209471766849e-05, + "loss": 1.6284, + "step": 1024 + }, + { + "epoch": 0.05603925481459208, + "grad_norm": 1.8936012983322144, + "learning_rate": 1.8670309653916213e-05, + "loss": 1.5186, + "step": 1025 + }, + { + "epoch": 0.05609392725831363, + "grad_norm": 1.8892256021499634, + "learning_rate": 1.8688524590163936e-05, + "loss": 1.6829, + "step": 1026 + }, + { + "epoch": 0.05614859970203518, + "grad_norm": 1.9570438861846924, + "learning_rate": 1.870673952641166e-05, + "loss": 1.3589, + "step": 1027 + }, + { + "epoch": 0.05620327214575673, + "grad_norm": 1.9479519128799438, + "learning_rate": 1.8724954462659383e-05, + "loss": 1.535, + "step": 1028 + }, + { + "epoch": 0.05625794458947829, + "grad_norm": 2.321850538253784, + "learning_rate": 1.8743169398907106e-05, + "loss": 1.7679, + "step": 1029 + }, + { + "epoch": 0.05631261703319984, + "grad_norm": 2.117260694503784, + "learning_rate": 1.876138433515483e-05, + "loss": 1.6007, + "step": 1030 + }, + { + "epoch": 0.05636728947692139, + "grad_norm": 1.6165885925292969, + "learning_rate": 1.8779599271402553e-05, + "loss": 1.4637, + "step": 1031 + }, + { + "epoch": 0.05642196192064295, + "grad_norm": 1.569874882698059, + "learning_rate": 1.8797814207650276e-05, + "loss": 1.4357, + "step": 1032 + }, + { + "epoch": 0.0564766343643645, + "grad_norm": 3.1961429119110107, + "learning_rate": 1.8816029143898e-05, + "loss": 1.113, + "step": 1033 + }, + { + "epoch": 0.05653130680808605, + "grad_norm": 2.4478201866149902, + "learning_rate": 1.883424408014572e-05, + "loss": 1.7666, + "step": 1034 + }, + { + "epoch": 0.05658597925180761, + "grad_norm": 1.5885981321334839, + "learning_rate": 1.8852459016393446e-05, + "loss": 1.6434, + "step": 1035 + }, + { + "epoch": 0.05664065169552916, + "grad_norm": 1.3871926069259644, + "learning_rate": 1.887067395264117e-05, + "loss": 1.4566, + "step": 1036 + }, + { + "epoch": 0.05669532413925071, + "grad_norm": 1.6561360359191895, + "learning_rate": 1.888888888888889e-05, + "loss": 1.5231, + "step": 1037 + }, + { + "epoch": 0.056749996582972265, + "grad_norm": 1.5106233358383179, + "learning_rate": 1.8907103825136616e-05, + "loss": 1.5311, + "step": 1038 + }, + { + "epoch": 0.05680466902669382, + "grad_norm": 2.1074798107147217, + "learning_rate": 1.8925318761384336e-05, + "loss": 1.6951, + "step": 1039 + }, + { + "epoch": 0.056859341470415374, + "grad_norm": 1.450971245765686, + "learning_rate": 1.894353369763206e-05, + "loss": 1.6085, + "step": 1040 + }, + { + "epoch": 0.056914013914136925, + "grad_norm": 1.5187914371490479, + "learning_rate": 1.8961748633879782e-05, + "loss": 1.7764, + "step": 1041 + }, + { + "epoch": 0.05696868635785848, + "grad_norm": 1.6257349252700806, + "learning_rate": 1.8979963570127506e-05, + "loss": 1.5847, + "step": 1042 + }, + { + "epoch": 0.057023358801580035, + "grad_norm": 1.8515764474868774, + "learning_rate": 1.899817850637523e-05, + "loss": 1.6639, + "step": 1043 + }, + { + "epoch": 0.057078031245301586, + "grad_norm": 1.5572068691253662, + "learning_rate": 1.9016393442622952e-05, + "loss": 1.7041, + "step": 1044 + }, + { + "epoch": 0.05713270368902314, + "grad_norm": 2.018885374069214, + "learning_rate": 1.9034608378870676e-05, + "loss": 1.4611, + "step": 1045 + }, + { + "epoch": 0.057187376132744695, + "grad_norm": 2.1033661365509033, + "learning_rate": 1.90528233151184e-05, + "loss": 1.8637, + "step": 1046 + }, + { + "epoch": 0.057242048576466247, + "grad_norm": 2.01845121383667, + "learning_rate": 1.9071038251366122e-05, + "loss": 1.3772, + "step": 1047 + }, + { + "epoch": 0.0572967210201878, + "grad_norm": 1.552657961845398, + "learning_rate": 1.9089253187613846e-05, + "loss": 1.4362, + "step": 1048 + }, + { + "epoch": 0.057351393463909356, + "grad_norm": 1.5166974067687988, + "learning_rate": 1.910746812386157e-05, + "loss": 1.3429, + "step": 1049 + }, + { + "epoch": 0.05740606590763091, + "grad_norm": 2.348388195037842, + "learning_rate": 1.912568306010929e-05, + "loss": 1.5362, + "step": 1050 + }, + { + "epoch": 0.05746073835135246, + "grad_norm": 1.3900989294052124, + "learning_rate": 1.9143897996357016e-05, + "loss": 1.4918, + "step": 1051 + }, + { + "epoch": 0.05751541079507401, + "grad_norm": 1.59862220287323, + "learning_rate": 1.9162112932604736e-05, + "loss": 1.506, + "step": 1052 + }, + { + "epoch": 0.05757008323879557, + "grad_norm": 1.486366868019104, + "learning_rate": 1.918032786885246e-05, + "loss": 1.6937, + "step": 1053 + }, + { + "epoch": 0.05762475568251712, + "grad_norm": 2.2689902782440186, + "learning_rate": 1.9198542805100186e-05, + "loss": 1.4784, + "step": 1054 + }, + { + "epoch": 0.05767942812623867, + "grad_norm": 2.1362054347991943, + "learning_rate": 1.9216757741347906e-05, + "loss": 1.7287, + "step": 1055 + }, + { + "epoch": 0.05773410056996023, + "grad_norm": 1.860732078552246, + "learning_rate": 1.923497267759563e-05, + "loss": 1.4211, + "step": 1056 + }, + { + "epoch": 0.05778877301368178, + "grad_norm": 1.8274383544921875, + "learning_rate": 1.9253187613843352e-05, + "loss": 1.471, + "step": 1057 + }, + { + "epoch": 0.05784344545740333, + "grad_norm": 2.7739052772521973, + "learning_rate": 1.9271402550091076e-05, + "loss": 1.4575, + "step": 1058 + }, + { + "epoch": 0.05789811790112489, + "grad_norm": 1.5263198614120483, + "learning_rate": 1.92896174863388e-05, + "loss": 1.5684, + "step": 1059 + }, + { + "epoch": 0.05795279034484644, + "grad_norm": 2.172966480255127, + "learning_rate": 1.9307832422586522e-05, + "loss": 1.8694, + "step": 1060 + }, + { + "epoch": 0.05800746278856799, + "grad_norm": 1.9510854482650757, + "learning_rate": 1.9326047358834245e-05, + "loss": 1.3144, + "step": 1061 + }, + { + "epoch": 0.05806213523228954, + "grad_norm": 2.7731387615203857, + "learning_rate": 1.934426229508197e-05, + "loss": 1.5837, + "step": 1062 + }, + { + "epoch": 0.0581168076760111, + "grad_norm": 1.8601657152175903, + "learning_rate": 1.9362477231329692e-05, + "loss": 1.8122, + "step": 1063 + }, + { + "epoch": 0.05817148011973265, + "grad_norm": 4.007602214813232, + "learning_rate": 1.9380692167577415e-05, + "loss": 1.5329, + "step": 1064 + }, + { + "epoch": 0.0582261525634542, + "grad_norm": 3.9082517623901367, + "learning_rate": 1.939890710382514e-05, + "loss": 1.4998, + "step": 1065 + }, + { + "epoch": 0.05828082500717576, + "grad_norm": 2.3711564540863037, + "learning_rate": 1.9417122040072862e-05, + "loss": 1.6609, + "step": 1066 + }, + { + "epoch": 0.05833549745089731, + "grad_norm": 2.7766661643981934, + "learning_rate": 1.9435336976320585e-05, + "loss": 1.7892, + "step": 1067 + }, + { + "epoch": 0.058390169894618864, + "grad_norm": 2.3673439025878906, + "learning_rate": 1.9453551912568305e-05, + "loss": 1.8221, + "step": 1068 + }, + { + "epoch": 0.058444842338340415, + "grad_norm": 5.456075191497803, + "learning_rate": 1.9471766848816032e-05, + "loss": 1.5561, + "step": 1069 + }, + { + "epoch": 0.05849951478206197, + "grad_norm": 1.48649263381958, + "learning_rate": 1.9489981785063755e-05, + "loss": 1.5075, + "step": 1070 + }, + { + "epoch": 0.058554187225783524, + "grad_norm": 1.7726631164550781, + "learning_rate": 1.9508196721311475e-05, + "loss": 1.5166, + "step": 1071 + }, + { + "epoch": 0.058608859669505076, + "grad_norm": 2.1481375694274902, + "learning_rate": 1.9526411657559202e-05, + "loss": 1.5095, + "step": 1072 + }, + { + "epoch": 0.058663532113226634, + "grad_norm": 1.792392373085022, + "learning_rate": 1.9544626593806922e-05, + "loss": 1.4609, + "step": 1073 + }, + { + "epoch": 0.058718204556948185, + "grad_norm": 2.5192923545837402, + "learning_rate": 1.9562841530054645e-05, + "loss": 1.8426, + "step": 1074 + }, + { + "epoch": 0.058772877000669736, + "grad_norm": 2.237349510192871, + "learning_rate": 1.9581056466302372e-05, + "loss": 1.6949, + "step": 1075 + }, + { + "epoch": 0.05882754944439129, + "grad_norm": 2.3999080657958984, + "learning_rate": 1.9599271402550092e-05, + "loss": 1.7578, + "step": 1076 + }, + { + "epoch": 0.058882221888112846, + "grad_norm": 1.4406505823135376, + "learning_rate": 1.9617486338797815e-05, + "loss": 1.5841, + "step": 1077 + }, + { + "epoch": 0.0589368943318344, + "grad_norm": 1.676379919052124, + "learning_rate": 1.963570127504554e-05, + "loss": 1.5246, + "step": 1078 + }, + { + "epoch": 0.05899156677555595, + "grad_norm": 2.4243569374084473, + "learning_rate": 1.9653916211293262e-05, + "loss": 1.3498, + "step": 1079 + }, + { + "epoch": 0.059046239219277506, + "grad_norm": 1.689837098121643, + "learning_rate": 1.9672131147540985e-05, + "loss": 1.546, + "step": 1080 + }, + { + "epoch": 0.05910091166299906, + "grad_norm": 1.48982572555542, + "learning_rate": 1.969034608378871e-05, + "loss": 1.7219, + "step": 1081 + }, + { + "epoch": 0.05915558410672061, + "grad_norm": 2.395066261291504, + "learning_rate": 1.9708561020036432e-05, + "loss": 1.6832, + "step": 1082 + }, + { + "epoch": 0.05921025655044217, + "grad_norm": 1.978255271911621, + "learning_rate": 1.9726775956284155e-05, + "loss": 1.3514, + "step": 1083 + }, + { + "epoch": 0.05926492899416372, + "grad_norm": 1.4165939092636108, + "learning_rate": 1.9744990892531875e-05, + "loss": 1.5075, + "step": 1084 + }, + { + "epoch": 0.05931960143788527, + "grad_norm": 1.6581324338912964, + "learning_rate": 1.9763205828779602e-05, + "loss": 1.5569, + "step": 1085 + }, + { + "epoch": 0.05937427388160682, + "grad_norm": 1.8892263174057007, + "learning_rate": 1.9781420765027325e-05, + "loss": 1.5563, + "step": 1086 + }, + { + "epoch": 0.05942894632532838, + "grad_norm": 1.8536529541015625, + "learning_rate": 1.9799635701275045e-05, + "loss": 1.5232, + "step": 1087 + }, + { + "epoch": 0.05948361876904993, + "grad_norm": 3.3015785217285156, + "learning_rate": 1.9817850637522772e-05, + "loss": 1.4732, + "step": 1088 + }, + { + "epoch": 0.05953829121277148, + "grad_norm": 1.539364218711853, + "learning_rate": 1.9836065573770492e-05, + "loss": 1.6024, + "step": 1089 + }, + { + "epoch": 0.05959296365649304, + "grad_norm": 1.8659155368804932, + "learning_rate": 1.9854280510018215e-05, + "loss": 1.5308, + "step": 1090 + }, + { + "epoch": 0.05964763610021459, + "grad_norm": 1.957715630531311, + "learning_rate": 1.9872495446265942e-05, + "loss": 1.6114, + "step": 1091 + }, + { + "epoch": 0.05970230854393614, + "grad_norm": 1.7194281816482544, + "learning_rate": 1.9890710382513662e-05, + "loss": 1.6246, + "step": 1092 + }, + { + "epoch": 0.05975698098765769, + "grad_norm": 1.4848217964172363, + "learning_rate": 1.9908925318761385e-05, + "loss": 1.6439, + "step": 1093 + }, + { + "epoch": 0.05981165343137925, + "grad_norm": 1.721565842628479, + "learning_rate": 1.992714025500911e-05, + "loss": 1.8578, + "step": 1094 + }, + { + "epoch": 0.0598663258751008, + "grad_norm": 1.3069735765457153, + "learning_rate": 1.994535519125683e-05, + "loss": 1.6102, + "step": 1095 + }, + { + "epoch": 0.059920998318822354, + "grad_norm": 1.8001673221588135, + "learning_rate": 1.9963570127504555e-05, + "loss": 1.4187, + "step": 1096 + }, + { + "epoch": 0.05997567076254391, + "grad_norm": 1.7654433250427246, + "learning_rate": 1.998178506375228e-05, + "loss": 1.3138, + "step": 1097 + }, + { + "epoch": 0.06003034320626546, + "grad_norm": 1.881939172744751, + "learning_rate": 2e-05, + "loss": 1.6264, + "step": 1098 + }, + { + "epoch": 0.060085015649987014, + "grad_norm": 2.050600528717041, + "learning_rate": 1.9999999833038268e-05, + "loss": 1.2797, + "step": 1099 + }, + { + "epoch": 0.060139688093708565, + "grad_norm": 1.6630687713623047, + "learning_rate": 1.9999999332153076e-05, + "loss": 1.7588, + "step": 1100 + }, + { + "epoch": 0.060194360537430124, + "grad_norm": 2.338650941848755, + "learning_rate": 1.999999849734444e-05, + "loss": 1.4812, + "step": 1101 + }, + { + "epoch": 0.060249032981151675, + "grad_norm": 3.0153844356536865, + "learning_rate": 1.9999997328612388e-05, + "loss": 1.4317, + "step": 1102 + }, + { + "epoch": 0.060303705424873226, + "grad_norm": 1.6165691614151, + "learning_rate": 1.9999995825956956e-05, + "loss": 1.6497, + "step": 1103 + }, + { + "epoch": 0.060358377868594784, + "grad_norm": 1.735592007637024, + "learning_rate": 1.9999993989378202e-05, + "loss": 1.6145, + "step": 1104 + }, + { + "epoch": 0.060413050312316335, + "grad_norm": 3.670102834701538, + "learning_rate": 1.9999991818876183e-05, + "loss": 1.7531, + "step": 1105 + }, + { + "epoch": 0.06046772275603789, + "grad_norm": 1.6926250457763672, + "learning_rate": 1.999998931445097e-05, + "loss": 1.6314, + "step": 1106 + }, + { + "epoch": 0.06052239519975944, + "grad_norm": 3.032902479171753, + "learning_rate": 1.9999986476102647e-05, + "loss": 1.3121, + "step": 1107 + }, + { + "epoch": 0.060577067643480996, + "grad_norm": 2.308690309524536, + "learning_rate": 1.9999983303831313e-05, + "loss": 1.515, + "step": 1108 + }, + { + "epoch": 0.06063174008720255, + "grad_norm": 1.5411741733551025, + "learning_rate": 1.999997979763707e-05, + "loss": 1.5105, + "step": 1109 + }, + { + "epoch": 0.0606864125309241, + "grad_norm": 1.9144343137741089, + "learning_rate": 1.9999975957520033e-05, + "loss": 1.5423, + "step": 1110 + }, + { + "epoch": 0.06074108497464566, + "grad_norm": 2.2904839515686035, + "learning_rate": 1.999997178348034e-05, + "loss": 1.4075, + "step": 1111 + }, + { + "epoch": 0.06079575741836721, + "grad_norm": 1.5542795658111572, + "learning_rate": 1.9999967275518118e-05, + "loss": 1.648, + "step": 1112 + }, + { + "epoch": 0.06085042986208876, + "grad_norm": 2.0650551319122314, + "learning_rate": 1.999996243363352e-05, + "loss": 1.6307, + "step": 1113 + }, + { + "epoch": 0.06090510230581032, + "grad_norm": 2.6128122806549072, + "learning_rate": 1.9999957257826716e-05, + "loss": 1.47, + "step": 1114 + }, + { + "epoch": 0.06095977474953187, + "grad_norm": 1.949393391609192, + "learning_rate": 1.9999951748097874e-05, + "loss": 1.5678, + "step": 1115 + }, + { + "epoch": 0.06101444719325342, + "grad_norm": 1.5070838928222656, + "learning_rate": 1.9999945904447173e-05, + "loss": 1.534, + "step": 1116 + }, + { + "epoch": 0.06106911963697497, + "grad_norm": 2.4719862937927246, + "learning_rate": 1.9999939726874817e-05, + "loss": 1.3761, + "step": 1117 + }, + { + "epoch": 0.06112379208069653, + "grad_norm": 2.7928013801574707, + "learning_rate": 1.9999933215381005e-05, + "loss": 1.4127, + "step": 1118 + }, + { + "epoch": 0.06117846452441808, + "grad_norm": 1.793753743171692, + "learning_rate": 1.999992636996596e-05, + "loss": 1.5528, + "step": 1119 + }, + { + "epoch": 0.06123313696813963, + "grad_norm": 2.4151549339294434, + "learning_rate": 1.9999919190629905e-05, + "loss": 1.5705, + "step": 1120 + }, + { + "epoch": 0.06128780941186119, + "grad_norm": 3.436825752258301, + "learning_rate": 1.9999911677373083e-05, + "loss": 1.4962, + "step": 1121 + }, + { + "epoch": 0.06134248185558274, + "grad_norm": 4.362198352813721, + "learning_rate": 1.9999903830195748e-05, + "loss": 1.4716, + "step": 1122 + }, + { + "epoch": 0.06139715429930429, + "grad_norm": 1.3828624486923218, + "learning_rate": 1.9999895649098154e-05, + "loss": 1.2535, + "step": 1123 + }, + { + "epoch": 0.06145182674302584, + "grad_norm": 2.7093896865844727, + "learning_rate": 1.999988713408058e-05, + "loss": 1.6379, + "step": 1124 + }, + { + "epoch": 0.0615064991867474, + "grad_norm": 2.611001491546631, + "learning_rate": 1.999987828514331e-05, + "loss": 1.3544, + "step": 1125 + }, + { + "epoch": 0.06156117163046895, + "grad_norm": 1.596815586090088, + "learning_rate": 1.9999869102286638e-05, + "loss": 1.5664, + "step": 1126 + }, + { + "epoch": 0.061615844074190504, + "grad_norm": 3.3231937885284424, + "learning_rate": 1.9999859585510873e-05, + "loss": 1.6991, + "step": 1127 + }, + { + "epoch": 0.06167051651791206, + "grad_norm": 1.5783545970916748, + "learning_rate": 1.9999849734816334e-05, + "loss": 1.3794, + "step": 1128 + }, + { + "epoch": 0.06172518896163361, + "grad_norm": 2.289055585861206, + "learning_rate": 1.999983955020334e-05, + "loss": 1.608, + "step": 1129 + }, + { + "epoch": 0.061779861405355165, + "grad_norm": 1.9650911092758179, + "learning_rate": 1.999982903167224e-05, + "loss": 1.4628, + "step": 1130 + }, + { + "epoch": 0.061834533849076716, + "grad_norm": 1.9533259868621826, + "learning_rate": 1.9999818179223383e-05, + "loss": 1.5069, + "step": 1131 + }, + { + "epoch": 0.061889206292798274, + "grad_norm": 1.6862585544586182, + "learning_rate": 1.9999806992857138e-05, + "loss": 1.7397, + "step": 1132 + }, + { + "epoch": 0.061943878736519825, + "grad_norm": 2.232130289077759, + "learning_rate": 1.9999795472573865e-05, + "loss": 1.5034, + "step": 1133 + }, + { + "epoch": 0.061998551180241376, + "grad_norm": 1.7710036039352417, + "learning_rate": 1.9999783618373958e-05, + "loss": 1.6799, + "step": 1134 + }, + { + "epoch": 0.062053223623962935, + "grad_norm": 2.300151824951172, + "learning_rate": 1.999977143025781e-05, + "loss": 1.6116, + "step": 1135 + }, + { + "epoch": 0.062107896067684486, + "grad_norm": 2.14374041557312, + "learning_rate": 1.999975890822583e-05, + "loss": 1.7291, + "step": 1136 + }, + { + "epoch": 0.06216256851140604, + "grad_norm": 1.4311820268630981, + "learning_rate": 1.9999746052278433e-05, + "loss": 1.5527, + "step": 1137 + }, + { + "epoch": 0.062217240955127595, + "grad_norm": 1.6826976537704468, + "learning_rate": 1.9999732862416053e-05, + "loss": 1.6596, + "step": 1138 + }, + { + "epoch": 0.062271913398849146, + "grad_norm": 1.4777826070785522, + "learning_rate": 1.9999719338639127e-05, + "loss": 1.6607, + "step": 1139 + }, + { + "epoch": 0.0623265858425707, + "grad_norm": 1.3170051574707031, + "learning_rate": 1.9999705480948107e-05, + "loss": 1.5824, + "step": 1140 + }, + { + "epoch": 0.06238125828629225, + "grad_norm": 1.6457635164260864, + "learning_rate": 1.9999691289343456e-05, + "loss": 1.4332, + "step": 1141 + }, + { + "epoch": 0.06243593073001381, + "grad_norm": 1.5762465000152588, + "learning_rate": 1.9999676763825647e-05, + "loss": 1.5331, + "step": 1142 + }, + { + "epoch": 0.06249060317373536, + "grad_norm": 1.480153203010559, + "learning_rate": 1.9999661904395165e-05, + "loss": 1.7739, + "step": 1143 + }, + { + "epoch": 0.06254527561745692, + "grad_norm": 1.5314998626708984, + "learning_rate": 1.999964671105251e-05, + "loss": 1.7773, + "step": 1144 + }, + { + "epoch": 0.06259994806117847, + "grad_norm": 1.631931185722351, + "learning_rate": 1.9999631183798183e-05, + "loss": 1.4151, + "step": 1145 + }, + { + "epoch": 0.06265462050490002, + "grad_norm": 2.3287549018859863, + "learning_rate": 1.9999615322632707e-05, + "loss": 1.5851, + "step": 1146 + }, + { + "epoch": 0.06270929294862157, + "grad_norm": 1.7524687051773071, + "learning_rate": 1.9999599127556614e-05, + "loss": 1.4902, + "step": 1147 + }, + { + "epoch": 0.06276396539234312, + "grad_norm": 2.473414421081543, + "learning_rate": 1.9999582598570437e-05, + "loss": 1.486, + "step": 1148 + }, + { + "epoch": 0.06281863783606467, + "grad_norm": 2.890256881713867, + "learning_rate": 1.9999565735674734e-05, + "loss": 1.4517, + "step": 1149 + }, + { + "epoch": 0.06287331027978624, + "grad_norm": 2.91379976272583, + "learning_rate": 1.9999548538870067e-05, + "loss": 1.5246, + "step": 1150 + }, + { + "epoch": 0.06292798272350779, + "grad_norm": 3.221365451812744, + "learning_rate": 1.9999531008157007e-05, + "loss": 1.2856, + "step": 1151 + }, + { + "epoch": 0.06298265516722934, + "grad_norm": 5.254034996032715, + "learning_rate": 1.9999513143536146e-05, + "loss": 1.5668, + "step": 1152 + }, + { + "epoch": 0.06303732761095089, + "grad_norm": 1.8875000476837158, + "learning_rate": 1.999949494500807e-05, + "loss": 1.4098, + "step": 1153 + }, + { + "epoch": 0.06309200005467244, + "grad_norm": 1.8217780590057373, + "learning_rate": 1.99994764125734e-05, + "loss": 1.6899, + "step": 1154 + }, + { + "epoch": 0.063146672498394, + "grad_norm": 2.0160250663757324, + "learning_rate": 1.9999457546232747e-05, + "loss": 1.5212, + "step": 1155 + }, + { + "epoch": 0.06320134494211554, + "grad_norm": 2.416865110397339, + "learning_rate": 1.999943834598674e-05, + "loss": 1.6029, + "step": 1156 + }, + { + "epoch": 0.06325601738583711, + "grad_norm": 8.786148071289062, + "learning_rate": 1.999941881183602e-05, + "loss": 1.793, + "step": 1157 + }, + { + "epoch": 0.06331068982955866, + "grad_norm": 7.103291988372803, + "learning_rate": 1.9999398943781245e-05, + "loss": 1.4022, + "step": 1158 + }, + { + "epoch": 0.06336536227328021, + "grad_norm": 2.043765068054199, + "learning_rate": 1.9999378741823076e-05, + "loss": 1.7342, + "step": 1159 + }, + { + "epoch": 0.06342003471700176, + "grad_norm": 1.9914524555206299, + "learning_rate": 1.9999358205962186e-05, + "loss": 1.6708, + "step": 1160 + }, + { + "epoch": 0.06347470716072331, + "grad_norm": 1.350722074508667, + "learning_rate": 1.9999337336199257e-05, + "loss": 1.3301, + "step": 1161 + }, + { + "epoch": 0.06352937960444487, + "grad_norm": 1.8631303310394287, + "learning_rate": 1.9999316132534995e-05, + "loss": 1.6887, + "step": 1162 + }, + { + "epoch": 0.06358405204816642, + "grad_norm": 1.7120620012283325, + "learning_rate": 1.99992945949701e-05, + "loss": 1.4691, + "step": 1163 + }, + { + "epoch": 0.06363872449188798, + "grad_norm": 1.8400171995162964, + "learning_rate": 1.9999272723505298e-05, + "loss": 1.5931, + "step": 1164 + }, + { + "epoch": 0.06369339693560953, + "grad_norm": 2.6076419353485107, + "learning_rate": 1.9999250518141313e-05, + "loss": 1.5396, + "step": 1165 + }, + { + "epoch": 0.06374806937933108, + "grad_norm": 2.0073513984680176, + "learning_rate": 1.999922797887889e-05, + "loss": 1.3635, + "step": 1166 + }, + { + "epoch": 0.06380274182305264, + "grad_norm": 1.5793925523757935, + "learning_rate": 1.9999205105718782e-05, + "loss": 1.7167, + "step": 1167 + }, + { + "epoch": 0.06385741426677419, + "grad_norm": 1.433756947517395, + "learning_rate": 1.999918189866175e-05, + "loss": 1.6444, + "step": 1168 + }, + { + "epoch": 0.06391208671049574, + "grad_norm": 1.7956300973892212, + "learning_rate": 1.999915835770857e-05, + "loss": 1.5015, + "step": 1169 + }, + { + "epoch": 0.06396675915421729, + "grad_norm": 1.7272138595581055, + "learning_rate": 1.9999134482860028e-05, + "loss": 1.6115, + "step": 1170 + }, + { + "epoch": 0.06402143159793885, + "grad_norm": 1.6001769304275513, + "learning_rate": 1.9999110274116925e-05, + "loss": 1.5871, + "step": 1171 + }, + { + "epoch": 0.0640761040416604, + "grad_norm": 1.5687335729599, + "learning_rate": 1.9999085731480064e-05, + "loss": 1.6568, + "step": 1172 + }, + { + "epoch": 0.06413077648538196, + "grad_norm": 2.129528760910034, + "learning_rate": 1.999906085495027e-05, + "loss": 1.5537, + "step": 1173 + }, + { + "epoch": 0.06418544892910351, + "grad_norm": 2.0786590576171875, + "learning_rate": 1.9999035644528368e-05, + "loss": 1.6236, + "step": 1174 + }, + { + "epoch": 0.06424012137282506, + "grad_norm": 2.630916118621826, + "learning_rate": 1.9999010100215202e-05, + "loss": 1.346, + "step": 1175 + }, + { + "epoch": 0.06429479381654661, + "grad_norm": 1.809578776359558, + "learning_rate": 1.9998984222011627e-05, + "loss": 1.2858, + "step": 1176 + }, + { + "epoch": 0.06434946626026816, + "grad_norm": 1.8068840503692627, + "learning_rate": 1.9998958009918503e-05, + "loss": 1.5214, + "step": 1177 + }, + { + "epoch": 0.06440413870398973, + "grad_norm": 2.138418197631836, + "learning_rate": 1.9998931463936707e-05, + "loss": 1.6829, + "step": 1178 + }, + { + "epoch": 0.06445881114771128, + "grad_norm": 2.4118142127990723, + "learning_rate": 1.999890458406713e-05, + "loss": 1.6459, + "step": 1179 + }, + { + "epoch": 0.06451348359143283, + "grad_norm": 1.626362681388855, + "learning_rate": 1.9998877370310665e-05, + "loss": 1.4582, + "step": 1180 + }, + { + "epoch": 0.06456815603515438, + "grad_norm": 1.9415686130523682, + "learning_rate": 1.999884982266822e-05, + "loss": 1.5394, + "step": 1181 + }, + { + "epoch": 0.06462282847887593, + "grad_norm": 1.241233229637146, + "learning_rate": 1.9998821941140716e-05, + "loss": 1.7253, + "step": 1182 + }, + { + "epoch": 0.06467750092259748, + "grad_norm": 1.4952107667922974, + "learning_rate": 1.9998793725729088e-05, + "loss": 1.7455, + "step": 1183 + }, + { + "epoch": 0.06473217336631903, + "grad_norm": 1.4745464324951172, + "learning_rate": 1.999876517643427e-05, + "loss": 1.6026, + "step": 1184 + }, + { + "epoch": 0.0647868458100406, + "grad_norm": 1.4978954792022705, + "learning_rate": 1.999873629325722e-05, + "loss": 1.4232, + "step": 1185 + }, + { + "epoch": 0.06484151825376215, + "grad_norm": 1.7823805809020996, + "learning_rate": 1.9998707076198906e-05, + "loss": 1.8158, + "step": 1186 + }, + { + "epoch": 0.0648961906974837, + "grad_norm": 1.5436238050460815, + "learning_rate": 1.99986775252603e-05, + "loss": 1.6388, + "step": 1187 + }, + { + "epoch": 0.06495086314120525, + "grad_norm": 1.8410786390304565, + "learning_rate": 1.9998647640442384e-05, + "loss": 1.6684, + "step": 1188 + }, + { + "epoch": 0.0650055355849268, + "grad_norm": 1.9147272109985352, + "learning_rate": 1.9998617421746166e-05, + "loss": 1.7411, + "step": 1189 + }, + { + "epoch": 0.06506020802864836, + "grad_norm": 1.5073370933532715, + "learning_rate": 1.9998586869172647e-05, + "loss": 1.5106, + "step": 1190 + }, + { + "epoch": 0.06511488047236991, + "grad_norm": 1.5456501245498657, + "learning_rate": 1.999855598272285e-05, + "loss": 1.5956, + "step": 1191 + }, + { + "epoch": 0.06516955291609147, + "grad_norm": 2.970588207244873, + "learning_rate": 1.999852476239781e-05, + "loss": 1.5695, + "step": 1192 + }, + { + "epoch": 0.06522422535981302, + "grad_norm": 2.57142972946167, + "learning_rate": 1.999849320819856e-05, + "loss": 1.2888, + "step": 1193 + }, + { + "epoch": 0.06527889780353457, + "grad_norm": 2.1343488693237305, + "learning_rate": 1.9998461320126163e-05, + "loss": 1.9371, + "step": 1194 + }, + { + "epoch": 0.06533357024725613, + "grad_norm": 2.00439715385437, + "learning_rate": 1.999842909818168e-05, + "loss": 1.7101, + "step": 1195 + }, + { + "epoch": 0.06538824269097768, + "grad_norm": 1.6010857820510864, + "learning_rate": 1.9998396542366188e-05, + "loss": 1.6019, + "step": 1196 + }, + { + "epoch": 0.06544291513469923, + "grad_norm": 1.5112777948379517, + "learning_rate": 1.9998363652680774e-05, + "loss": 1.8038, + "step": 1197 + }, + { + "epoch": 0.0654975875784208, + "grad_norm": 1.8478440046310425, + "learning_rate": 1.9998330429126532e-05, + "loss": 1.4455, + "step": 1198 + }, + { + "epoch": 0.06555226002214234, + "grad_norm": 1.5312997102737427, + "learning_rate": 1.9998296871704578e-05, + "loss": 1.5504, + "step": 1199 + }, + { + "epoch": 0.0656069324658639, + "grad_norm": 1.5387616157531738, + "learning_rate": 1.999826298041603e-05, + "loss": 1.5551, + "step": 1200 + }, + { + "epoch": 0.06566160490958545, + "grad_norm": 1.6422808170318604, + "learning_rate": 1.999822875526202e-05, + "loss": 1.7424, + "step": 1201 + }, + { + "epoch": 0.065716277353307, + "grad_norm": 1.6188948154449463, + "learning_rate": 1.9998194196243688e-05, + "loss": 1.8816, + "step": 1202 + }, + { + "epoch": 0.06577094979702855, + "grad_norm": 1.8216135501861572, + "learning_rate": 1.9998159303362193e-05, + "loss": 1.6162, + "step": 1203 + }, + { + "epoch": 0.0658256222407501, + "grad_norm": 1.5276075601577759, + "learning_rate": 1.9998124076618694e-05, + "loss": 1.6135, + "step": 1204 + }, + { + "epoch": 0.06588029468447167, + "grad_norm": 1.366624116897583, + "learning_rate": 1.9998088516014374e-05, + "loss": 1.5568, + "step": 1205 + }, + { + "epoch": 0.06593496712819322, + "grad_norm": 1.7780858278274536, + "learning_rate": 1.9998052621550415e-05, + "loss": 1.5391, + "step": 1206 + }, + { + "epoch": 0.06598963957191477, + "grad_norm": 1.5288591384887695, + "learning_rate": 1.9998016393228016e-05, + "loss": 1.6477, + "step": 1207 + }, + { + "epoch": 0.06604431201563632, + "grad_norm": 1.7184721231460571, + "learning_rate": 1.9997979831048392e-05, + "loss": 1.614, + "step": 1208 + }, + { + "epoch": 0.06609898445935787, + "grad_norm": 1.9177277088165283, + "learning_rate": 1.999794293501276e-05, + "loss": 1.5421, + "step": 1209 + }, + { + "epoch": 0.06615365690307942, + "grad_norm": 1.8093398809432983, + "learning_rate": 1.9997905705122352e-05, + "loss": 1.5807, + "step": 1210 + }, + { + "epoch": 0.06620832934680097, + "grad_norm": 1.5132454633712769, + "learning_rate": 1.999786814137841e-05, + "loss": 1.6107, + "step": 1211 + }, + { + "epoch": 0.06626300179052254, + "grad_norm": 1.8003690242767334, + "learning_rate": 1.9997830243782193e-05, + "loss": 1.6191, + "step": 1212 + }, + { + "epoch": 0.06631767423424409, + "grad_norm": 1.3470360040664673, + "learning_rate": 1.9997792012334963e-05, + "loss": 1.7468, + "step": 1213 + }, + { + "epoch": 0.06637234667796564, + "grad_norm": 1.1724921464920044, + "learning_rate": 1.9997753447037997e-05, + "loss": 1.676, + "step": 1214 + }, + { + "epoch": 0.06642701912168719, + "grad_norm": 1.6767746210098267, + "learning_rate": 1.9997714547892584e-05, + "loss": 1.5952, + "step": 1215 + }, + { + "epoch": 0.06648169156540874, + "grad_norm": 2.5495002269744873, + "learning_rate": 1.9997675314900017e-05, + "loss": 1.6356, + "step": 1216 + }, + { + "epoch": 0.0665363640091303, + "grad_norm": 2.076714038848877, + "learning_rate": 1.9997635748061615e-05, + "loss": 1.4495, + "step": 1217 + }, + { + "epoch": 0.06659103645285185, + "grad_norm": 1.695499300956726, + "learning_rate": 1.9997595847378695e-05, + "loss": 1.4667, + "step": 1218 + }, + { + "epoch": 0.06664570889657341, + "grad_norm": 1.7340378761291504, + "learning_rate": 1.999755561285259e-05, + "loss": 1.4031, + "step": 1219 + }, + { + "epoch": 0.06670038134029496, + "grad_norm": 1.5239973068237305, + "learning_rate": 1.9997515044484643e-05, + "loss": 1.376, + "step": 1220 + }, + { + "epoch": 0.06675505378401651, + "grad_norm": 1.4231432676315308, + "learning_rate": 1.9997474142276204e-05, + "loss": 1.406, + "step": 1221 + }, + { + "epoch": 0.06680972622773806, + "grad_norm": 2.180480480194092, + "learning_rate": 1.999743290622865e-05, + "loss": 1.5423, + "step": 1222 + }, + { + "epoch": 0.06686439867145962, + "grad_norm": 1.600016474723816, + "learning_rate": 1.9997391336343347e-05, + "loss": 1.4837, + "step": 1223 + }, + { + "epoch": 0.06691907111518117, + "grad_norm": 1.5671663284301758, + "learning_rate": 1.999734943262169e-05, + "loss": 1.7183, + "step": 1224 + }, + { + "epoch": 0.06697374355890272, + "grad_norm": 1.5197784900665283, + "learning_rate": 1.999730719506508e-05, + "loss": 1.4081, + "step": 1225 + }, + { + "epoch": 0.06702841600262428, + "grad_norm": 1.7745404243469238, + "learning_rate": 1.9997264623674917e-05, + "loss": 1.4978, + "step": 1226 + }, + { + "epoch": 0.06708308844634583, + "grad_norm": 1.185049057006836, + "learning_rate": 1.9997221718452627e-05, + "loss": 1.5084, + "step": 1227 + }, + { + "epoch": 0.06713776089006739, + "grad_norm": 1.5901004076004028, + "learning_rate": 1.999717847939965e-05, + "loss": 1.3761, + "step": 1228 + }, + { + "epoch": 0.06719243333378894, + "grad_norm": 1.673304557800293, + "learning_rate": 1.9997134906517423e-05, + "loss": 1.6141, + "step": 1229 + }, + { + "epoch": 0.06724710577751049, + "grad_norm": 1.5806787014007568, + "learning_rate": 1.9997090999807406e-05, + "loss": 1.4889, + "step": 1230 + }, + { + "epoch": 0.06730177822123204, + "grad_norm": 1.3565750122070312, + "learning_rate": 1.9997046759271055e-05, + "loss": 1.6043, + "step": 1231 + }, + { + "epoch": 0.06735645066495359, + "grad_norm": 1.7313098907470703, + "learning_rate": 1.9997002184909858e-05, + "loss": 1.6215, + "step": 1232 + }, + { + "epoch": 0.06741112310867516, + "grad_norm": 1.4502475261688232, + "learning_rate": 1.99969572767253e-05, + "loss": 1.5031, + "step": 1233 + }, + { + "epoch": 0.0674657955523967, + "grad_norm": 1.6663099527359009, + "learning_rate": 1.9996912034718875e-05, + "loss": 1.5089, + "step": 1234 + }, + { + "epoch": 0.06752046799611826, + "grad_norm": 1.603837251663208, + "learning_rate": 1.9996866458892102e-05, + "loss": 1.6333, + "step": 1235 + }, + { + "epoch": 0.06757514043983981, + "grad_norm": 1.5547908544540405, + "learning_rate": 1.99968205492465e-05, + "loss": 1.3782, + "step": 1236 + }, + { + "epoch": 0.06762981288356136, + "grad_norm": 1.688398003578186, + "learning_rate": 1.99967743057836e-05, + "loss": 1.4504, + "step": 1237 + }, + { + "epoch": 0.06768448532728291, + "grad_norm": 1.7531839609146118, + "learning_rate": 1.999672772850495e-05, + "loss": 1.6674, + "step": 1238 + }, + { + "epoch": 0.06773915777100446, + "grad_norm": 2.170778274536133, + "learning_rate": 1.9996680817412103e-05, + "loss": 1.7887, + "step": 1239 + }, + { + "epoch": 0.06779383021472603, + "grad_norm": 1.4260436296463013, + "learning_rate": 1.9996633572506623e-05, + "loss": 1.6143, + "step": 1240 + }, + { + "epoch": 0.06784850265844758, + "grad_norm": 1.6468838453292847, + "learning_rate": 1.9996585993790092e-05, + "loss": 1.3406, + "step": 1241 + }, + { + "epoch": 0.06790317510216913, + "grad_norm": 1.4896361827850342, + "learning_rate": 1.9996538081264095e-05, + "loss": 1.6036, + "step": 1242 + }, + { + "epoch": 0.06795784754589068, + "grad_norm": 1.4604078531265259, + "learning_rate": 1.9996489834930236e-05, + "loss": 1.6253, + "step": 1243 + }, + { + "epoch": 0.06801251998961223, + "grad_norm": 1.770143747329712, + "learning_rate": 1.9996441254790122e-05, + "loss": 1.4994, + "step": 1244 + }, + { + "epoch": 0.06806719243333378, + "grad_norm": 1.7042440176010132, + "learning_rate": 1.999639234084538e-05, + "loss": 1.6054, + "step": 1245 + }, + { + "epoch": 0.06812186487705534, + "grad_norm": 1.609182357788086, + "learning_rate": 1.999634309309764e-05, + "loss": 1.7627, + "step": 1246 + }, + { + "epoch": 0.0681765373207769, + "grad_norm": 1.2330793142318726, + "learning_rate": 1.9996293511548545e-05, + "loss": 1.5512, + "step": 1247 + }, + { + "epoch": 0.06823120976449845, + "grad_norm": 1.3988476991653442, + "learning_rate": 1.999624359619975e-05, + "loss": 1.5554, + "step": 1248 + }, + { + "epoch": 0.06828588220822, + "grad_norm": 2.1224515438079834, + "learning_rate": 1.999619334705293e-05, + "loss": 1.3493, + "step": 1249 + }, + { + "epoch": 0.06834055465194155, + "grad_norm": 2.151954174041748, + "learning_rate": 1.9996142764109755e-05, + "loss": 1.6391, + "step": 1250 + }, + { + "epoch": 0.0683952270956631, + "grad_norm": 1.779919981956482, + "learning_rate": 1.9996091847371918e-05, + "loss": 1.7629, + "step": 1251 + }, + { + "epoch": 0.06844989953938466, + "grad_norm": 1.6294386386871338, + "learning_rate": 1.9996040596841118e-05, + "loss": 1.4307, + "step": 1252 + }, + { + "epoch": 0.06850457198310622, + "grad_norm": 1.32049560546875, + "learning_rate": 1.9995989012519065e-05, + "loss": 1.5734, + "step": 1253 + }, + { + "epoch": 0.06855924442682777, + "grad_norm": 1.9762009382247925, + "learning_rate": 1.999593709440748e-05, + "loss": 1.6675, + "step": 1254 + }, + { + "epoch": 0.06861391687054932, + "grad_norm": 1.9890016317367554, + "learning_rate": 1.99958848425081e-05, + "loss": 1.3935, + "step": 1255 + }, + { + "epoch": 0.06866858931427088, + "grad_norm": 2.118669033050537, + "learning_rate": 1.999583225682267e-05, + "loss": 1.5624, + "step": 1256 + }, + { + "epoch": 0.06872326175799243, + "grad_norm": 1.6634175777435303, + "learning_rate": 1.9995779337352947e-05, + "loss": 1.5211, + "step": 1257 + }, + { + "epoch": 0.06877793420171398, + "grad_norm": 1.6647289991378784, + "learning_rate": 1.9995726084100692e-05, + "loss": 1.709, + "step": 1258 + }, + { + "epoch": 0.06883260664543553, + "grad_norm": 1.920580267906189, + "learning_rate": 1.999567249706769e-05, + "loss": 1.4971, + "step": 1259 + }, + { + "epoch": 0.0688872790891571, + "grad_norm": 1.718300461769104, + "learning_rate": 1.999561857625573e-05, + "loss": 1.6064, + "step": 1260 + }, + { + "epoch": 0.06894195153287865, + "grad_norm": 1.3760813474655151, + "learning_rate": 1.9995564321666607e-05, + "loss": 1.5947, + "step": 1261 + }, + { + "epoch": 0.0689966239766002, + "grad_norm": 1.5974551439285278, + "learning_rate": 1.9995509733302135e-05, + "loss": 1.7647, + "step": 1262 + }, + { + "epoch": 0.06905129642032175, + "grad_norm": 1.6128883361816406, + "learning_rate": 1.999545481116414e-05, + "loss": 1.5486, + "step": 1263 + }, + { + "epoch": 0.0691059688640433, + "grad_norm": 1.5112366676330566, + "learning_rate": 1.999539955525445e-05, + "loss": 1.5592, + "step": 1264 + }, + { + "epoch": 0.06916064130776485, + "grad_norm": 2.0007095336914062, + "learning_rate": 1.999534396557492e-05, + "loss": 1.4348, + "step": 1265 + }, + { + "epoch": 0.0692153137514864, + "grad_norm": 1.408529281616211, + "learning_rate": 1.9995288042127396e-05, + "loss": 1.451, + "step": 1266 + }, + { + "epoch": 0.06926998619520797, + "grad_norm": 2.0439984798431396, + "learning_rate": 1.9995231784913753e-05, + "loss": 1.3542, + "step": 1267 + }, + { + "epoch": 0.06932465863892952, + "grad_norm": 1.5341423749923706, + "learning_rate": 1.999517519393586e-05, + "loss": 1.451, + "step": 1268 + }, + { + "epoch": 0.06937933108265107, + "grad_norm": 1.1680412292480469, + "learning_rate": 1.999511826919562e-05, + "loss": 1.5, + "step": 1269 + }, + { + "epoch": 0.06943400352637262, + "grad_norm": 1.7188011407852173, + "learning_rate": 1.9995061010694924e-05, + "loss": 1.8319, + "step": 1270 + }, + { + "epoch": 0.06948867597009417, + "grad_norm": 1.370627522468567, + "learning_rate": 1.9995003418435684e-05, + "loss": 1.4907, + "step": 1271 + }, + { + "epoch": 0.06954334841381572, + "grad_norm": 1.7246214151382446, + "learning_rate": 1.999494549241983e-05, + "loss": 1.5311, + "step": 1272 + }, + { + "epoch": 0.06959802085753727, + "grad_norm": 1.9152171611785889, + "learning_rate": 1.999488723264929e-05, + "loss": 1.5236, + "step": 1273 + }, + { + "epoch": 0.06965269330125884, + "grad_norm": 1.8623993396759033, + "learning_rate": 1.999482863912601e-05, + "loss": 1.5469, + "step": 1274 + }, + { + "epoch": 0.06970736574498039, + "grad_norm": 1.3305394649505615, + "learning_rate": 1.9994769711851953e-05, + "loss": 1.614, + "step": 1275 + }, + { + "epoch": 0.06976203818870194, + "grad_norm": 1.4999099969863892, + "learning_rate": 1.9994710450829076e-05, + "loss": 1.5685, + "step": 1276 + }, + { + "epoch": 0.06981671063242349, + "grad_norm": 1.488967776298523, + "learning_rate": 1.9994650856059364e-05, + "loss": 1.418, + "step": 1277 + }, + { + "epoch": 0.06987138307614504, + "grad_norm": 1.474774956703186, + "learning_rate": 1.999459092754481e-05, + "loss": 1.4679, + "step": 1278 + }, + { + "epoch": 0.0699260555198666, + "grad_norm": 1.4436702728271484, + "learning_rate": 1.9994530665287414e-05, + "loss": 1.4848, + "step": 1279 + }, + { + "epoch": 0.06998072796358815, + "grad_norm": 1.7552404403686523, + "learning_rate": 1.999447006928918e-05, + "loss": 1.508, + "step": 1280 + }, + { + "epoch": 0.07003540040730971, + "grad_norm": 1.7999608516693115, + "learning_rate": 1.999440913955214e-05, + "loss": 1.5579, + "step": 1281 + }, + { + "epoch": 0.07009007285103126, + "grad_norm": 1.95539128780365, + "learning_rate": 1.999434787607833e-05, + "loss": 1.7671, + "step": 1282 + }, + { + "epoch": 0.07014474529475281, + "grad_norm": 1.8290486335754395, + "learning_rate": 1.999428627886979e-05, + "loss": 1.4208, + "step": 1283 + }, + { + "epoch": 0.07019941773847437, + "grad_norm": 1.9239931106567383, + "learning_rate": 1.9994224347928576e-05, + "loss": 1.6337, + "step": 1284 + }, + { + "epoch": 0.07025409018219592, + "grad_norm": 1.9598575830459595, + "learning_rate": 1.999416208325676e-05, + "loss": 1.6817, + "step": 1285 + }, + { + "epoch": 0.07030876262591747, + "grad_norm": 1.805103063583374, + "learning_rate": 1.999409948485642e-05, + "loss": 1.4304, + "step": 1286 + }, + { + "epoch": 0.07036343506963902, + "grad_norm": 2.2211802005767822, + "learning_rate": 1.999403655272965e-05, + "loss": 1.4195, + "step": 1287 + }, + { + "epoch": 0.07041810751336058, + "grad_norm": 2.8500640392303467, + "learning_rate": 1.9993973286878544e-05, + "loss": 1.3211, + "step": 1288 + }, + { + "epoch": 0.07047277995708214, + "grad_norm": 1.6718471050262451, + "learning_rate": 1.999390968730522e-05, + "loss": 1.4816, + "step": 1289 + }, + { + "epoch": 0.07052745240080369, + "grad_norm": 1.9235436916351318, + "learning_rate": 1.9993845754011797e-05, + "loss": 1.8148, + "step": 1290 + }, + { + "epoch": 0.07058212484452524, + "grad_norm": 1.7415251731872559, + "learning_rate": 1.999378148700042e-05, + "loss": 1.5411, + "step": 1291 + }, + { + "epoch": 0.07063679728824679, + "grad_norm": 1.7373558282852173, + "learning_rate": 1.999371688627322e-05, + "loss": 1.5271, + "step": 1292 + }, + { + "epoch": 0.07069146973196834, + "grad_norm": 1.5746880769729614, + "learning_rate": 1.9993651951832364e-05, + "loss": 1.5617, + "step": 1293 + }, + { + "epoch": 0.07074614217568989, + "grad_norm": 1.4084603786468506, + "learning_rate": 1.999358668368002e-05, + "loss": 1.701, + "step": 1294 + }, + { + "epoch": 0.07080081461941146, + "grad_norm": 1.732176423072815, + "learning_rate": 1.9993521081818367e-05, + "loss": 1.705, + "step": 1295 + }, + { + "epoch": 0.07085548706313301, + "grad_norm": 1.931556224822998, + "learning_rate": 1.9993455146249594e-05, + "loss": 1.5438, + "step": 1296 + }, + { + "epoch": 0.07091015950685456, + "grad_norm": 1.510805368423462, + "learning_rate": 1.9993388876975902e-05, + "loss": 1.7864, + "step": 1297 + }, + { + "epoch": 0.07096483195057611, + "grad_norm": 2.2308011054992676, + "learning_rate": 1.9993322273999506e-05, + "loss": 1.5452, + "step": 1298 + }, + { + "epoch": 0.07101950439429766, + "grad_norm": 1.4002772569656372, + "learning_rate": 1.999325533732263e-05, + "loss": 1.6685, + "step": 1299 + }, + { + "epoch": 0.07107417683801921, + "grad_norm": 5.11482572555542, + "learning_rate": 1.999318806694751e-05, + "loss": 1.7813, + "step": 1300 + }, + { + "epoch": 0.07112884928174078, + "grad_norm": 1.7732288837432861, + "learning_rate": 1.9993120462876385e-05, + "loss": 1.7374, + "step": 1301 + }, + { + "epoch": 0.07118352172546233, + "grad_norm": 1.887714147567749, + "learning_rate": 1.9993052525111522e-05, + "loss": 1.5008, + "step": 1302 + }, + { + "epoch": 0.07123819416918388, + "grad_norm": 2.416557788848877, + "learning_rate": 1.9992984253655186e-05, + "loss": 1.5972, + "step": 1303 + }, + { + "epoch": 0.07129286661290543, + "grad_norm": 5.758327960968018, + "learning_rate": 1.9992915648509655e-05, + "loss": 1.3976, + "step": 1304 + }, + { + "epoch": 0.07134753905662698, + "grad_norm": 2.7125093936920166, + "learning_rate": 1.9992846709677222e-05, + "loss": 1.4519, + "step": 1305 + }, + { + "epoch": 0.07140221150034853, + "grad_norm": 3.1276021003723145, + "learning_rate": 1.999277743716019e-05, + "loss": 1.6794, + "step": 1306 + }, + { + "epoch": 0.07145688394407008, + "grad_norm": 1.7113430500030518, + "learning_rate": 1.9992707830960868e-05, + "loss": 1.5264, + "step": 1307 + }, + { + "epoch": 0.07151155638779165, + "grad_norm": 1.7856931686401367, + "learning_rate": 1.9992637891081585e-05, + "loss": 1.4411, + "step": 1308 + }, + { + "epoch": 0.0715662288315132, + "grad_norm": 1.4473592042922974, + "learning_rate": 1.999256761752467e-05, + "loss": 1.4257, + "step": 1309 + }, + { + "epoch": 0.07162090127523475, + "grad_norm": 1.5417540073394775, + "learning_rate": 1.999249701029248e-05, + "loss": 1.7444, + "step": 1310 + }, + { + "epoch": 0.0716755737189563, + "grad_norm": 2.208003520965576, + "learning_rate": 1.999242606938736e-05, + "loss": 1.5103, + "step": 1311 + }, + { + "epoch": 0.07173024616267785, + "grad_norm": 1.998907208442688, + "learning_rate": 1.999235479481169e-05, + "loss": 1.2553, + "step": 1312 + }, + { + "epoch": 0.0717849186063994, + "grad_norm": 1.766482949256897, + "learning_rate": 1.9992283186567848e-05, + "loss": 1.6996, + "step": 1313 + }, + { + "epoch": 0.07183959105012096, + "grad_norm": 1.6463733911514282, + "learning_rate": 1.9992211244658218e-05, + "loss": 1.4919, + "step": 1314 + }, + { + "epoch": 0.07189426349384252, + "grad_norm": 1.3493685722351074, + "learning_rate": 1.999213896908521e-05, + "loss": 1.4316, + "step": 1315 + }, + { + "epoch": 0.07194893593756407, + "grad_norm": 1.573265790939331, + "learning_rate": 1.9992066359851236e-05, + "loss": 1.4353, + "step": 1316 + }, + { + "epoch": 0.07200360838128562, + "grad_norm": 1.4751768112182617, + "learning_rate": 1.9991993416958713e-05, + "loss": 1.5121, + "step": 1317 + }, + { + "epoch": 0.07205828082500718, + "grad_norm": 1.3797111511230469, + "learning_rate": 1.999192014041009e-05, + "loss": 1.5782, + "step": 1318 + }, + { + "epoch": 0.07211295326872873, + "grad_norm": 1.4176898002624512, + "learning_rate": 1.9991846530207798e-05, + "loss": 1.5218, + "step": 1319 + }, + { + "epoch": 0.07216762571245028, + "grad_norm": 1.23614501953125, + "learning_rate": 1.999177258635431e-05, + "loss": 1.7158, + "step": 1320 + }, + { + "epoch": 0.07222229815617183, + "grad_norm": 1.591475248336792, + "learning_rate": 1.999169830885209e-05, + "loss": 1.5515, + "step": 1321 + }, + { + "epoch": 0.0722769705998934, + "grad_norm": 1.3009780645370483, + "learning_rate": 1.9991623697703613e-05, + "loss": 1.6586, + "step": 1322 + }, + { + "epoch": 0.07233164304361495, + "grad_norm": 1.5594565868377686, + "learning_rate": 1.999154875291138e-05, + "loss": 1.5635, + "step": 1323 + }, + { + "epoch": 0.0723863154873365, + "grad_norm": 1.8465018272399902, + "learning_rate": 1.999147347447788e-05, + "loss": 1.5523, + "step": 1324 + }, + { + "epoch": 0.07244098793105805, + "grad_norm": 1.8916280269622803, + "learning_rate": 1.9991397862405645e-05, + "loss": 1.6721, + "step": 1325 + }, + { + "epoch": 0.0724956603747796, + "grad_norm": 1.5384390354156494, + "learning_rate": 1.9991321916697182e-05, + "loss": 1.3299, + "step": 1326 + }, + { + "epoch": 0.07255033281850115, + "grad_norm": 1.2829277515411377, + "learning_rate": 1.999124563735504e-05, + "loss": 1.4423, + "step": 1327 + }, + { + "epoch": 0.0726050052622227, + "grad_norm": 1.9745832681655884, + "learning_rate": 1.9991169024381756e-05, + "loss": 1.7657, + "step": 1328 + }, + { + "epoch": 0.07265967770594427, + "grad_norm": 1.4718008041381836, + "learning_rate": 1.9991092077779895e-05, + "loss": 1.5299, + "step": 1329 + }, + { + "epoch": 0.07271435014966582, + "grad_norm": 1.2389439344406128, + "learning_rate": 1.9991014797552027e-05, + "loss": 1.5754, + "step": 1330 + }, + { + "epoch": 0.07276902259338737, + "grad_norm": 1.311673641204834, + "learning_rate": 1.9990937183700728e-05, + "loss": 1.6706, + "step": 1331 + }, + { + "epoch": 0.07282369503710892, + "grad_norm": 1.619079828262329, + "learning_rate": 1.999085923622859e-05, + "loss": 1.5958, + "step": 1332 + }, + { + "epoch": 0.07287836748083047, + "grad_norm": 1.6181288957595825, + "learning_rate": 1.999078095513822e-05, + "loss": 1.3576, + "step": 1333 + }, + { + "epoch": 0.07293303992455202, + "grad_norm": 1.503535509109497, + "learning_rate": 1.9990702340432232e-05, + "loss": 1.6611, + "step": 1334 + }, + { + "epoch": 0.07298771236827357, + "grad_norm": 1.4735519886016846, + "learning_rate": 1.999062339211325e-05, + "loss": 1.592, + "step": 1335 + }, + { + "epoch": 0.07304238481199514, + "grad_norm": 1.4545278549194336, + "learning_rate": 1.9990544110183907e-05, + "loss": 1.5198, + "step": 1336 + }, + { + "epoch": 0.07309705725571669, + "grad_norm": 2.1462769508361816, + "learning_rate": 1.999046449464685e-05, + "loss": 1.5, + "step": 1337 + }, + { + "epoch": 0.07315172969943824, + "grad_norm": 1.4213460683822632, + "learning_rate": 1.9990384545504743e-05, + "loss": 1.402, + "step": 1338 + }, + { + "epoch": 0.0732064021431598, + "grad_norm": 1.643217921257019, + "learning_rate": 1.999030426276025e-05, + "loss": 1.6088, + "step": 1339 + }, + { + "epoch": 0.07326107458688134, + "grad_norm": 1.4530137777328491, + "learning_rate": 1.999022364641606e-05, + "loss": 1.4301, + "step": 1340 + }, + { + "epoch": 0.0733157470306029, + "grad_norm": 1.4940093755722046, + "learning_rate": 1.9990142696474855e-05, + "loss": 1.28, + "step": 1341 + }, + { + "epoch": 0.07337041947432445, + "grad_norm": 1.8546972274780273, + "learning_rate": 1.9990061412939346e-05, + "loss": 1.3005, + "step": 1342 + }, + { + "epoch": 0.07342509191804601, + "grad_norm": 1.5201925039291382, + "learning_rate": 1.998997979581224e-05, + "loss": 1.21, + "step": 1343 + }, + { + "epoch": 0.07347976436176756, + "grad_norm": 1.710430383682251, + "learning_rate": 1.9989897845096272e-05, + "loss": 1.6418, + "step": 1344 + }, + { + "epoch": 0.07353443680548911, + "grad_norm": 1.6388494968414307, + "learning_rate": 1.998981556079417e-05, + "loss": 1.517, + "step": 1345 + }, + { + "epoch": 0.07358910924921067, + "grad_norm": 1.7378913164138794, + "learning_rate": 1.998973294290868e-05, + "loss": 1.4255, + "step": 1346 + }, + { + "epoch": 0.07364378169293222, + "grad_norm": 1.632462501525879, + "learning_rate": 1.998964999144257e-05, + "loss": 1.7752, + "step": 1347 + }, + { + "epoch": 0.07369845413665377, + "grad_norm": 1.4106452465057373, + "learning_rate": 1.9989566706398606e-05, + "loss": 1.5186, + "step": 1348 + }, + { + "epoch": 0.07375312658037532, + "grad_norm": 1.4673595428466797, + "learning_rate": 1.9989483087779565e-05, + "loss": 1.7956, + "step": 1349 + }, + { + "epoch": 0.07380779902409688, + "grad_norm": 1.5589605569839478, + "learning_rate": 1.9989399135588246e-05, + "loss": 1.5528, + "step": 1350 + }, + { + "epoch": 0.07386247146781844, + "grad_norm": 1.4541977643966675, + "learning_rate": 1.998931484982745e-05, + "loss": 1.3776, + "step": 1351 + }, + { + "epoch": 0.07391714391153999, + "grad_norm": 1.8501222133636475, + "learning_rate": 1.998923023049999e-05, + "loss": 1.5355, + "step": 1352 + }, + { + "epoch": 0.07397181635526154, + "grad_norm": 1.6615946292877197, + "learning_rate": 1.998914527760869e-05, + "loss": 1.2811, + "step": 1353 + }, + { + "epoch": 0.07402648879898309, + "grad_norm": 1.835571050643921, + "learning_rate": 1.998905999115639e-05, + "loss": 1.8053, + "step": 1354 + }, + { + "epoch": 0.07408116124270464, + "grad_norm": 1.2458820343017578, + "learning_rate": 1.9988974371145934e-05, + "loss": 1.3679, + "step": 1355 + }, + { + "epoch": 0.0741358336864262, + "grad_norm": 1.968124270439148, + "learning_rate": 1.9988888417580187e-05, + "loss": 1.5036, + "step": 1356 + }, + { + "epoch": 0.07419050613014776, + "grad_norm": 1.9428611993789673, + "learning_rate": 1.9988802130462017e-05, + "loss": 1.7747, + "step": 1357 + }, + { + "epoch": 0.07424517857386931, + "grad_norm": 1.1556172370910645, + "learning_rate": 1.99887155097943e-05, + "loss": 1.8152, + "step": 1358 + }, + { + "epoch": 0.07429985101759086, + "grad_norm": 2.175797700881958, + "learning_rate": 1.9988628555579935e-05, + "loss": 1.6756, + "step": 1359 + }, + { + "epoch": 0.07435452346131241, + "grad_norm": 1.202979564666748, + "learning_rate": 1.9988541267821825e-05, + "loss": 1.8215, + "step": 1360 + }, + { + "epoch": 0.07440919590503396, + "grad_norm": 1.4070569276809692, + "learning_rate": 1.9988453646522883e-05, + "loss": 1.5916, + "step": 1361 + }, + { + "epoch": 0.07446386834875551, + "grad_norm": 2.2527639865875244, + "learning_rate": 1.9988365691686035e-05, + "loss": 1.7755, + "step": 1362 + }, + { + "epoch": 0.07451854079247708, + "grad_norm": 1.4835149049758911, + "learning_rate": 1.9988277403314216e-05, + "loss": 1.6229, + "step": 1363 + }, + { + "epoch": 0.07457321323619863, + "grad_norm": 1.3869296312332153, + "learning_rate": 1.9988188781410377e-05, + "loss": 1.4841, + "step": 1364 + }, + { + "epoch": 0.07462788567992018, + "grad_norm": 2.0086138248443604, + "learning_rate": 1.9988099825977477e-05, + "loss": 1.6339, + "step": 1365 + }, + { + "epoch": 0.07468255812364173, + "grad_norm": 1.4427064657211304, + "learning_rate": 1.998801053701849e-05, + "loss": 1.4504, + "step": 1366 + }, + { + "epoch": 0.07473723056736328, + "grad_norm": 1.5546643733978271, + "learning_rate": 1.998792091453639e-05, + "loss": 1.457, + "step": 1367 + }, + { + "epoch": 0.07479190301108483, + "grad_norm": 1.8227593898773193, + "learning_rate": 1.998783095853417e-05, + "loss": 1.6237, + "step": 1368 + }, + { + "epoch": 0.07484657545480639, + "grad_norm": 1.49861478805542, + "learning_rate": 1.9987740669014843e-05, + "loss": 1.7417, + "step": 1369 + }, + { + "epoch": 0.07490124789852795, + "grad_norm": 1.54025137424469, + "learning_rate": 1.9987650045981412e-05, + "loss": 1.3121, + "step": 1370 + }, + { + "epoch": 0.0749559203422495, + "grad_norm": 2.3233542442321777, + "learning_rate": 1.9987559089436917e-05, + "loss": 1.3205, + "step": 1371 + }, + { + "epoch": 0.07501059278597105, + "grad_norm": 1.114280343055725, + "learning_rate": 1.998746779938438e-05, + "loss": 1.5266, + "step": 1372 + }, + { + "epoch": 0.0750652652296926, + "grad_norm": 1.5229973793029785, + "learning_rate": 1.9987376175826864e-05, + "loss": 1.6669, + "step": 1373 + }, + { + "epoch": 0.07511993767341416, + "grad_norm": 1.346289038658142, + "learning_rate": 1.9987284218767415e-05, + "loss": 1.5363, + "step": 1374 + }, + { + "epoch": 0.0751746101171357, + "grad_norm": 1.6187776327133179, + "learning_rate": 1.998719192820911e-05, + "loss": 1.4013, + "step": 1375 + }, + { + "epoch": 0.07522928256085726, + "grad_norm": 1.2442119121551514, + "learning_rate": 1.9987099304155035e-05, + "loss": 1.6468, + "step": 1376 + }, + { + "epoch": 0.07528395500457882, + "grad_norm": 1.6300995349884033, + "learning_rate": 1.9987006346608274e-05, + "loss": 1.5543, + "step": 1377 + }, + { + "epoch": 0.07533862744830037, + "grad_norm": 1.3929839134216309, + "learning_rate": 1.998691305557194e-05, + "loss": 1.4407, + "step": 1378 + }, + { + "epoch": 0.07539329989202193, + "grad_norm": 1.757246494293213, + "learning_rate": 1.9986819431049146e-05, + "loss": 1.5915, + "step": 1379 + }, + { + "epoch": 0.07544797233574348, + "grad_norm": 1.6361795663833618, + "learning_rate": 1.9986725473043013e-05, + "loss": 1.6406, + "step": 1380 + }, + { + "epoch": 0.07550264477946503, + "grad_norm": 1.2425823211669922, + "learning_rate": 1.998663118155668e-05, + "loss": 1.4393, + "step": 1381 + }, + { + "epoch": 0.07555731722318658, + "grad_norm": 1.5498889684677124, + "learning_rate": 1.9986536556593303e-05, + "loss": 1.8625, + "step": 1382 + }, + { + "epoch": 0.07561198966690813, + "grad_norm": 1.552069067955017, + "learning_rate": 1.998644159815603e-05, + "loss": 1.3954, + "step": 1383 + }, + { + "epoch": 0.0756666621106297, + "grad_norm": 1.6377636194229126, + "learning_rate": 1.998634630624804e-05, + "loss": 1.6498, + "step": 1384 + }, + { + "epoch": 0.07572133455435125, + "grad_norm": 1.2767021656036377, + "learning_rate": 1.9986250680872515e-05, + "loss": 1.4353, + "step": 1385 + }, + { + "epoch": 0.0757760069980728, + "grad_norm": 1.3770263195037842, + "learning_rate": 1.9986154722032646e-05, + "loss": 1.6689, + "step": 1386 + }, + { + "epoch": 0.07583067944179435, + "grad_norm": 1.7761425971984863, + "learning_rate": 1.998605842973164e-05, + "loss": 1.378, + "step": 1387 + }, + { + "epoch": 0.0758853518855159, + "grad_norm": 1.750681757926941, + "learning_rate": 1.9985961803972704e-05, + "loss": 1.4881, + "step": 1388 + }, + { + "epoch": 0.07594002432923745, + "grad_norm": 1.5114665031433105, + "learning_rate": 1.9985864844759073e-05, + "loss": 1.5791, + "step": 1389 + }, + { + "epoch": 0.075994696772959, + "grad_norm": 1.2786091566085815, + "learning_rate": 1.9985767552093982e-05, + "loss": 1.2696, + "step": 1390 + }, + { + "epoch": 0.07604936921668057, + "grad_norm": 1.5959999561309814, + "learning_rate": 1.9985669925980683e-05, + "loss": 1.4521, + "step": 1391 + }, + { + "epoch": 0.07610404166040212, + "grad_norm": 1.437170147895813, + "learning_rate": 1.998557196642243e-05, + "loss": 1.5523, + "step": 1392 + }, + { + "epoch": 0.07615871410412367, + "grad_norm": 1.6321206092834473, + "learning_rate": 1.99854736734225e-05, + "loss": 1.4285, + "step": 1393 + }, + { + "epoch": 0.07621338654784522, + "grad_norm": 1.7342352867126465, + "learning_rate": 1.9985375046984167e-05, + "loss": 1.6435, + "step": 1394 + }, + { + "epoch": 0.07626805899156677, + "grad_norm": 1.8534045219421387, + "learning_rate": 1.9985276087110733e-05, + "loss": 1.5765, + "step": 1395 + }, + { + "epoch": 0.07632273143528832, + "grad_norm": 1.6811858415603638, + "learning_rate": 1.9985176793805503e-05, + "loss": 1.3725, + "step": 1396 + }, + { + "epoch": 0.07637740387900988, + "grad_norm": 1.9128588438034058, + "learning_rate": 1.9985077167071784e-05, + "loss": 1.518, + "step": 1397 + }, + { + "epoch": 0.07643207632273144, + "grad_norm": 1.540700078010559, + "learning_rate": 1.9984977206912906e-05, + "loss": 1.4952, + "step": 1398 + }, + { + "epoch": 0.07648674876645299, + "grad_norm": 1.6563875675201416, + "learning_rate": 1.9984876913332215e-05, + "loss": 1.5261, + "step": 1399 + }, + { + "epoch": 0.07654142121017454, + "grad_norm": 1.409468173980713, + "learning_rate": 1.998477628633305e-05, + "loss": 1.3602, + "step": 1400 + }, + { + "epoch": 0.0765960936538961, + "grad_norm": 1.6620937585830688, + "learning_rate": 1.9984675325918776e-05, + "loss": 1.5412, + "step": 1401 + }, + { + "epoch": 0.07665076609761765, + "grad_norm": 1.7451437711715698, + "learning_rate": 1.998457403209276e-05, + "loss": 1.4184, + "step": 1402 + }, + { + "epoch": 0.0767054385413392, + "grad_norm": 1.6621379852294922, + "learning_rate": 1.998447240485839e-05, + "loss": 1.4904, + "step": 1403 + }, + { + "epoch": 0.07676011098506076, + "grad_norm": 2.5410947799682617, + "learning_rate": 1.998437044421906e-05, + "loss": 1.5698, + "step": 1404 + }, + { + "epoch": 0.07681478342878231, + "grad_norm": 2.036283254623413, + "learning_rate": 1.998426815017817e-05, + "loss": 1.7214, + "step": 1405 + }, + { + "epoch": 0.07686945587250386, + "grad_norm": 1.2867521047592163, + "learning_rate": 1.9984165522739135e-05, + "loss": 1.428, + "step": 1406 + }, + { + "epoch": 0.07692412831622542, + "grad_norm": 1.6318293809890747, + "learning_rate": 1.9984062561905384e-05, + "loss": 1.6449, + "step": 1407 + }, + { + "epoch": 0.07697880075994697, + "grad_norm": 1.2792149782180786, + "learning_rate": 1.9983959267680357e-05, + "loss": 1.5002, + "step": 1408 + }, + { + "epoch": 0.07703347320366852, + "grad_norm": 1.5758154392242432, + "learning_rate": 1.9983855640067505e-05, + "loss": 1.4105, + "step": 1409 + }, + { + "epoch": 0.07708814564739007, + "grad_norm": 1.5324223041534424, + "learning_rate": 1.9983751679070284e-05, + "loss": 1.569, + "step": 1410 + }, + { + "epoch": 0.07714281809111163, + "grad_norm": 1.58552086353302, + "learning_rate": 1.9983647384692164e-05, + "loss": 1.6422, + "step": 1411 + }, + { + "epoch": 0.07719749053483319, + "grad_norm": 1.4417541027069092, + "learning_rate": 1.9983542756936632e-05, + "loss": 1.4987, + "step": 1412 + }, + { + "epoch": 0.07725216297855474, + "grad_norm": 1.5088517665863037, + "learning_rate": 1.998343779580718e-05, + "loss": 1.3887, + "step": 1413 + }, + { + "epoch": 0.07730683542227629, + "grad_norm": 1.7122528553009033, + "learning_rate": 1.9983332501307315e-05, + "loss": 1.7413, + "step": 1414 + }, + { + "epoch": 0.07736150786599784, + "grad_norm": 1.6357976198196411, + "learning_rate": 1.9983226873440548e-05, + "loss": 1.4047, + "step": 1415 + }, + { + "epoch": 0.07741618030971939, + "grad_norm": 1.5222454071044922, + "learning_rate": 1.998312091221041e-05, + "loss": 1.5011, + "step": 1416 + }, + { + "epoch": 0.07747085275344094, + "grad_norm": 1.4219213724136353, + "learning_rate": 1.9983014617620443e-05, + "loss": 1.5303, + "step": 1417 + }, + { + "epoch": 0.0775255251971625, + "grad_norm": 1.326712965965271, + "learning_rate": 1.9982907989674187e-05, + "loss": 1.4973, + "step": 1418 + }, + { + "epoch": 0.07758019764088406, + "grad_norm": 1.8075969219207764, + "learning_rate": 1.9982801028375208e-05, + "loss": 1.3353, + "step": 1419 + }, + { + "epoch": 0.07763487008460561, + "grad_norm": 5.0382890701293945, + "learning_rate": 1.998269373372708e-05, + "loss": 1.5943, + "step": 1420 + }, + { + "epoch": 0.07768954252832716, + "grad_norm": 1.1993207931518555, + "learning_rate": 1.998258610573338e-05, + "loss": 1.6974, + "step": 1421 + }, + { + "epoch": 0.07774421497204871, + "grad_norm": 1.800008773803711, + "learning_rate": 1.998247814439771e-05, + "loss": 1.4822, + "step": 1422 + }, + { + "epoch": 0.07779888741577026, + "grad_norm": 1.5193781852722168, + "learning_rate": 1.9982369849723665e-05, + "loss": 1.308, + "step": 1423 + }, + { + "epoch": 0.07785355985949181, + "grad_norm": 1.5236345529556274, + "learning_rate": 1.998226122171487e-05, + "loss": 1.478, + "step": 1424 + }, + { + "epoch": 0.07790823230321338, + "grad_norm": 1.7010573148727417, + "learning_rate": 1.998215226037495e-05, + "loss": 1.6201, + "step": 1425 + }, + { + "epoch": 0.07796290474693493, + "grad_norm": 1.4789984226226807, + "learning_rate": 1.9982042965707536e-05, + "loss": 1.7106, + "step": 1426 + }, + { + "epoch": 0.07801757719065648, + "grad_norm": 2.7807955741882324, + "learning_rate": 1.9981933337716288e-05, + "loss": 1.4622, + "step": 1427 + }, + { + "epoch": 0.07807224963437803, + "grad_norm": 1.5102826356887817, + "learning_rate": 1.998182337640486e-05, + "loss": 1.6705, + "step": 1428 + }, + { + "epoch": 0.07812692207809958, + "grad_norm": 1.8030750751495361, + "learning_rate": 1.998171308177693e-05, + "loss": 1.735, + "step": 1429 + }, + { + "epoch": 0.07818159452182114, + "grad_norm": 1.5266687870025635, + "learning_rate": 1.9981602453836175e-05, + "loss": 1.6881, + "step": 1430 + }, + { + "epoch": 0.07823626696554269, + "grad_norm": 1.92497980594635, + "learning_rate": 1.998149149258629e-05, + "loss": 1.4857, + "step": 1431 + }, + { + "epoch": 0.07829093940926425, + "grad_norm": 1.359701156616211, + "learning_rate": 1.9981380198030984e-05, + "loss": 1.4373, + "step": 1432 + }, + { + "epoch": 0.0783456118529858, + "grad_norm": 1.5110993385314941, + "learning_rate": 1.9981268570173968e-05, + "loss": 1.198, + "step": 1433 + }, + { + "epoch": 0.07840028429670735, + "grad_norm": 1.592613697052002, + "learning_rate": 1.9981156609018977e-05, + "loss": 1.4508, + "step": 1434 + }, + { + "epoch": 0.0784549567404289, + "grad_norm": 1.5880794525146484, + "learning_rate": 1.9981044314569745e-05, + "loss": 1.3265, + "step": 1435 + }, + { + "epoch": 0.07850962918415046, + "grad_norm": 1.4265050888061523, + "learning_rate": 1.998093168683002e-05, + "loss": 1.5341, + "step": 1436 + }, + { + "epoch": 0.07856430162787201, + "grad_norm": 1.4180405139923096, + "learning_rate": 1.9980818725803565e-05, + "loss": 1.4296, + "step": 1437 + }, + { + "epoch": 0.07861897407159356, + "grad_norm": 1.5916224718093872, + "learning_rate": 1.9980705431494152e-05, + "loss": 1.5248, + "step": 1438 + }, + { + "epoch": 0.07867364651531512, + "grad_norm": 2.011519432067871, + "learning_rate": 1.9980591803905565e-05, + "loss": 1.5008, + "step": 1439 + }, + { + "epoch": 0.07872831895903667, + "grad_norm": 1.3759584426879883, + "learning_rate": 1.9980477843041596e-05, + "loss": 1.5733, + "step": 1440 + }, + { + "epoch": 0.07878299140275823, + "grad_norm": 2.0081911087036133, + "learning_rate": 1.9980363548906056e-05, + "loss": 1.7075, + "step": 1441 + }, + { + "epoch": 0.07883766384647978, + "grad_norm": 1.8536901473999023, + "learning_rate": 1.9980248921502753e-05, + "loss": 1.3283, + "step": 1442 + }, + { + "epoch": 0.07889233629020133, + "grad_norm": 1.5429751873016357, + "learning_rate": 1.9980133960835522e-05, + "loss": 1.2277, + "step": 1443 + }, + { + "epoch": 0.07894700873392288, + "grad_norm": 1.3373671770095825, + "learning_rate": 1.9980018666908197e-05, + "loss": 1.6307, + "step": 1444 + }, + { + "epoch": 0.07900168117764443, + "grad_norm": 1.3712570667266846, + "learning_rate": 1.9979903039724632e-05, + "loss": 1.6505, + "step": 1445 + }, + { + "epoch": 0.079056353621366, + "grad_norm": 1.8928627967834473, + "learning_rate": 1.9979787079288683e-05, + "loss": 1.3552, + "step": 1446 + }, + { + "epoch": 0.07911102606508755, + "grad_norm": 1.5024348497390747, + "learning_rate": 1.997967078560423e-05, + "loss": 1.3763, + "step": 1447 + }, + { + "epoch": 0.0791656985088091, + "grad_norm": 1.3344171047210693, + "learning_rate": 1.9979554158675145e-05, + "loss": 1.6381, + "step": 1448 + }, + { + "epoch": 0.07922037095253065, + "grad_norm": 1.3010424375534058, + "learning_rate": 1.997943719850533e-05, + "loss": 1.5571, + "step": 1449 + }, + { + "epoch": 0.0792750433962522, + "grad_norm": 1.2441554069519043, + "learning_rate": 1.9979319905098695e-05, + "loss": 1.6426, + "step": 1450 + }, + { + "epoch": 0.07932971583997375, + "grad_norm": 1.6943539381027222, + "learning_rate": 1.9979202278459143e-05, + "loss": 1.4832, + "step": 1451 + }, + { + "epoch": 0.0793843882836953, + "grad_norm": 2.1163547039031982, + "learning_rate": 1.997908431859062e-05, + "loss": 1.6129, + "step": 1452 + }, + { + "epoch": 0.07943906072741687, + "grad_norm": 2.161895990371704, + "learning_rate": 1.9978966025497047e-05, + "loss": 1.4526, + "step": 1453 + }, + { + "epoch": 0.07949373317113842, + "grad_norm": 1.6449568271636963, + "learning_rate": 1.9978847399182384e-05, + "loss": 1.4608, + "step": 1454 + }, + { + "epoch": 0.07954840561485997, + "grad_norm": 1.7780120372772217, + "learning_rate": 1.997872843965059e-05, + "loss": 1.2777, + "step": 1455 + }, + { + "epoch": 0.07960307805858152, + "grad_norm": 1.5372865200042725, + "learning_rate": 1.997860914690564e-05, + "loss": 1.4435, + "step": 1456 + }, + { + "epoch": 0.07965775050230307, + "grad_norm": 1.58393132686615, + "learning_rate": 1.9978489520951512e-05, + "loss": 1.545, + "step": 1457 + }, + { + "epoch": 0.07971242294602462, + "grad_norm": 1.5937480926513672, + "learning_rate": 1.9978369561792207e-05, + "loss": 1.6966, + "step": 1458 + }, + { + "epoch": 0.07976709538974619, + "grad_norm": 1.9731725454330444, + "learning_rate": 1.9978249269431723e-05, + "loss": 1.7453, + "step": 1459 + }, + { + "epoch": 0.07982176783346774, + "grad_norm": 1.6509324312210083, + "learning_rate": 1.9978128643874085e-05, + "loss": 1.6957, + "step": 1460 + }, + { + "epoch": 0.07987644027718929, + "grad_norm": 1.7001550197601318, + "learning_rate": 1.9978007685123314e-05, + "loss": 1.5736, + "step": 1461 + }, + { + "epoch": 0.07993111272091084, + "grad_norm": 1.3981633186340332, + "learning_rate": 1.9977886393183454e-05, + "loss": 1.5515, + "step": 1462 + }, + { + "epoch": 0.0799857851646324, + "grad_norm": 1.49921452999115, + "learning_rate": 1.9977764768058555e-05, + "loss": 1.4608, + "step": 1463 + }, + { + "epoch": 0.08004045760835395, + "grad_norm": 1.4555253982543945, + "learning_rate": 1.9977642809752674e-05, + "loss": 1.7006, + "step": 1464 + }, + { + "epoch": 0.0800951300520755, + "grad_norm": 2.073772430419922, + "learning_rate": 1.997752051826989e-05, + "loss": 1.4796, + "step": 1465 + }, + { + "epoch": 0.08014980249579706, + "grad_norm": 1.9564521312713623, + "learning_rate": 1.9977397893614282e-05, + "loss": 1.648, + "step": 1466 + }, + { + "epoch": 0.08020447493951861, + "grad_norm": 1.3661129474639893, + "learning_rate": 1.997727493578994e-05, + "loss": 1.2909, + "step": 1467 + }, + { + "epoch": 0.08025914738324016, + "grad_norm": 1.850281834602356, + "learning_rate": 1.997715164480098e-05, + "loss": 1.4377, + "step": 1468 + }, + { + "epoch": 0.08031381982696172, + "grad_norm": 1.9980746507644653, + "learning_rate": 1.9977028020651516e-05, + "loss": 1.4728, + "step": 1469 + }, + { + "epoch": 0.08036849227068327, + "grad_norm": 1.548340082168579, + "learning_rate": 1.9976904063345673e-05, + "loss": 1.4234, + "step": 1470 + }, + { + "epoch": 0.08042316471440482, + "grad_norm": 2.0132133960723877, + "learning_rate": 1.997677977288759e-05, + "loss": 1.5255, + "step": 1471 + }, + { + "epoch": 0.08047783715812637, + "grad_norm": 1.921310544013977, + "learning_rate": 1.997665514928142e-05, + "loss": 1.7182, + "step": 1472 + }, + { + "epoch": 0.08053250960184793, + "grad_norm": 1.6799784898757935, + "learning_rate": 1.9976530192531327e-05, + "loss": 1.5288, + "step": 1473 + }, + { + "epoch": 0.08058718204556949, + "grad_norm": 1.738147497177124, + "learning_rate": 1.997640490264148e-05, + "loss": 1.3298, + "step": 1474 + }, + { + "epoch": 0.08064185448929104, + "grad_norm": 1.5402734279632568, + "learning_rate": 1.997627927961606e-05, + "loss": 1.3733, + "step": 1475 + }, + { + "epoch": 0.08069652693301259, + "grad_norm": 1.6303566694259644, + "learning_rate": 1.9976153323459262e-05, + "loss": 1.5844, + "step": 1476 + }, + { + "epoch": 0.08075119937673414, + "grad_norm": 1.9436631202697754, + "learning_rate": 1.99760270341753e-05, + "loss": 1.2202, + "step": 1477 + }, + { + "epoch": 0.08080587182045569, + "grad_norm": 1.6072651147842407, + "learning_rate": 1.9975900411768384e-05, + "loss": 1.258, + "step": 1478 + }, + { + "epoch": 0.08086054426417724, + "grad_norm": 1.9282519817352295, + "learning_rate": 1.9975773456242742e-05, + "loss": 1.3898, + "step": 1479 + }, + { + "epoch": 0.08091521670789881, + "grad_norm": 1.5625959634780884, + "learning_rate": 1.9975646167602617e-05, + "loss": 1.5508, + "step": 1480 + }, + { + "epoch": 0.08096988915162036, + "grad_norm": 1.4742155075073242, + "learning_rate": 1.9975518545852258e-05, + "loss": 1.3071, + "step": 1481 + }, + { + "epoch": 0.08102456159534191, + "grad_norm": 10.476712226867676, + "learning_rate": 1.9975390590995925e-05, + "loss": 1.7869, + "step": 1482 + }, + { + "epoch": 0.08107923403906346, + "grad_norm": 3.1127147674560547, + "learning_rate": 1.9975262303037896e-05, + "loss": 1.4823, + "step": 1483 + }, + { + "epoch": 0.08113390648278501, + "grad_norm": 7.572952747344971, + "learning_rate": 1.997513368198245e-05, + "loss": 1.7829, + "step": 1484 + }, + { + "epoch": 0.08118857892650656, + "grad_norm": 2.278719902038574, + "learning_rate": 1.9975004727833882e-05, + "loss": 1.3796, + "step": 1485 + }, + { + "epoch": 0.08124325137022811, + "grad_norm": 1.6202667951583862, + "learning_rate": 1.9974875440596496e-05, + "loss": 1.4739, + "step": 1486 + }, + { + "epoch": 0.08129792381394968, + "grad_norm": 1.520383596420288, + "learning_rate": 1.9974745820274618e-05, + "loss": 1.2864, + "step": 1487 + }, + { + "epoch": 0.08135259625767123, + "grad_norm": 1.2259752750396729, + "learning_rate": 1.9974615866872567e-05, + "loss": 1.7091, + "step": 1488 + }, + { + "epoch": 0.08140726870139278, + "grad_norm": 2.1116158962249756, + "learning_rate": 1.9974485580394686e-05, + "loss": 1.6003, + "step": 1489 + }, + { + "epoch": 0.08146194114511433, + "grad_norm": 2.020763397216797, + "learning_rate": 1.9974354960845326e-05, + "loss": 1.5504, + "step": 1490 + }, + { + "epoch": 0.08151661358883588, + "grad_norm": 1.5378738641738892, + "learning_rate": 1.997422400822885e-05, + "loss": 1.546, + "step": 1491 + }, + { + "epoch": 0.08157128603255744, + "grad_norm": 1.697464108467102, + "learning_rate": 1.9974092722549628e-05, + "loss": 1.7912, + "step": 1492 + }, + { + "epoch": 0.08162595847627899, + "grad_norm": 1.966559886932373, + "learning_rate": 1.9973961103812045e-05, + "loss": 1.5682, + "step": 1493 + }, + { + "epoch": 0.08168063092000055, + "grad_norm": 1.2549552917480469, + "learning_rate": 1.9973829152020494e-05, + "loss": 1.8657, + "step": 1494 + }, + { + "epoch": 0.0817353033637221, + "grad_norm": 1.483189344406128, + "learning_rate": 1.9973696867179386e-05, + "loss": 1.5341, + "step": 1495 + }, + { + "epoch": 0.08178997580744365, + "grad_norm": 1.478447675704956, + "learning_rate": 1.9973564249293136e-05, + "loss": 1.6676, + "step": 1496 + }, + { + "epoch": 0.0818446482511652, + "grad_norm": 2.074294090270996, + "learning_rate": 1.997343129836617e-05, + "loss": 1.5733, + "step": 1497 + }, + { + "epoch": 0.08189932069488676, + "grad_norm": 1.2281745672225952, + "learning_rate": 1.997329801440293e-05, + "loss": 1.5499, + "step": 1498 + }, + { + "epoch": 0.08195399313860831, + "grad_norm": 2.2531912326812744, + "learning_rate": 1.9973164397407868e-05, + "loss": 1.4698, + "step": 1499 + }, + { + "epoch": 0.08200866558232986, + "grad_norm": 1.4772900342941284, + "learning_rate": 1.997303044738544e-05, + "loss": 1.3853, + "step": 1500 + }, + { + "epoch": 0.08206333802605142, + "grad_norm": 1.9389833211898804, + "learning_rate": 1.997289616434013e-05, + "loss": 1.4524, + "step": 1501 + }, + { + "epoch": 0.08211801046977298, + "grad_norm": 1.6024669408798218, + "learning_rate": 1.9972761548276407e-05, + "loss": 1.7372, + "step": 1502 + }, + { + "epoch": 0.08217268291349453, + "grad_norm": 1.624786376953125, + "learning_rate": 1.9972626599198778e-05, + "loss": 1.5627, + "step": 1503 + }, + { + "epoch": 0.08222735535721608, + "grad_norm": 1.6173421144485474, + "learning_rate": 1.9972491317111745e-05, + "loss": 1.8289, + "step": 1504 + }, + { + "epoch": 0.08228202780093763, + "grad_norm": 2.4811861515045166, + "learning_rate": 1.9972355702019825e-05, + "loss": 1.6162, + "step": 1505 + }, + { + "epoch": 0.08233670024465918, + "grad_norm": 2.171722173690796, + "learning_rate": 1.997221975392755e-05, + "loss": 1.6224, + "step": 1506 + }, + { + "epoch": 0.08239137268838075, + "grad_norm": 1.3494800329208374, + "learning_rate": 1.9972083472839454e-05, + "loss": 1.6018, + "step": 1507 + }, + { + "epoch": 0.0824460451321023, + "grad_norm": 1.9613078832626343, + "learning_rate": 1.9971946858760088e-05, + "loss": 1.3626, + "step": 1508 + }, + { + "epoch": 0.08250071757582385, + "grad_norm": 1.4945746660232544, + "learning_rate": 1.997180991169402e-05, + "loss": 1.4904, + "step": 1509 + }, + { + "epoch": 0.0825553900195454, + "grad_norm": 1.703938364982605, + "learning_rate": 1.997167263164582e-05, + "loss": 1.7092, + "step": 1510 + }, + { + "epoch": 0.08261006246326695, + "grad_norm": 1.6870235204696655, + "learning_rate": 1.997153501862007e-05, + "loss": 1.6295, + "step": 1511 + }, + { + "epoch": 0.0826647349069885, + "grad_norm": 1.6663899421691895, + "learning_rate": 1.997139707262137e-05, + "loss": 1.543, + "step": 1512 + }, + { + "epoch": 0.08271940735071005, + "grad_norm": 1.584061622619629, + "learning_rate": 1.997125879365432e-05, + "loss": 1.7339, + "step": 1513 + }, + { + "epoch": 0.08277407979443162, + "grad_norm": 1.4986975193023682, + "learning_rate": 1.9971120181723544e-05, + "loss": 1.5227, + "step": 1514 + }, + { + "epoch": 0.08282875223815317, + "grad_norm": 1.6567329168319702, + "learning_rate": 1.997098123683366e-05, + "loss": 1.4455, + "step": 1515 + }, + { + "epoch": 0.08288342468187472, + "grad_norm": 2.0390899181365967, + "learning_rate": 1.997084195898932e-05, + "loss": 1.0014, + "step": 1516 + }, + { + "epoch": 0.08293809712559627, + "grad_norm": 1.3754962682724, + "learning_rate": 1.9970702348195174e-05, + "loss": 1.5321, + "step": 1517 + }, + { + "epoch": 0.08299276956931782, + "grad_norm": 1.651336908340454, + "learning_rate": 1.9970562404455872e-05, + "loss": 1.5501, + "step": 1518 + }, + { + "epoch": 0.08304744201303937, + "grad_norm": 1.3758366107940674, + "learning_rate": 1.9970422127776097e-05, + "loss": 1.3529, + "step": 1519 + }, + { + "epoch": 0.08310211445676093, + "grad_norm": 1.5537562370300293, + "learning_rate": 1.997028151816053e-05, + "loss": 1.3207, + "step": 1520 + }, + { + "epoch": 0.08315678690048249, + "grad_norm": 1.4759633541107178, + "learning_rate": 1.997014057561387e-05, + "loss": 1.4841, + "step": 1521 + }, + { + "epoch": 0.08321145934420404, + "grad_norm": 1.6114734411239624, + "learning_rate": 1.996999930014082e-05, + "loss": 1.4303, + "step": 1522 + }, + { + "epoch": 0.08326613178792559, + "grad_norm": 1.588789939880371, + "learning_rate": 1.9969857691746095e-05, + "loss": 1.5405, + "step": 1523 + }, + { + "epoch": 0.08332080423164714, + "grad_norm": 1.5031124353408813, + "learning_rate": 1.9969715750434427e-05, + "loss": 1.5048, + "step": 1524 + }, + { + "epoch": 0.0833754766753687, + "grad_norm": 1.4591197967529297, + "learning_rate": 1.9969573476210558e-05, + "loss": 1.4622, + "step": 1525 + }, + { + "epoch": 0.08343014911909025, + "grad_norm": 1.413156270980835, + "learning_rate": 1.9969430869079237e-05, + "loss": 1.571, + "step": 1526 + }, + { + "epoch": 0.0834848215628118, + "grad_norm": 1.4118118286132812, + "learning_rate": 1.996928792904522e-05, + "loss": 1.4878, + "step": 1527 + }, + { + "epoch": 0.08353949400653336, + "grad_norm": 1.611005425453186, + "learning_rate": 1.996914465611329e-05, + "loss": 1.4341, + "step": 1528 + }, + { + "epoch": 0.08359416645025491, + "grad_norm": 1.4874924421310425, + "learning_rate": 1.996900105028823e-05, + "loss": 1.5743, + "step": 1529 + }, + { + "epoch": 0.08364883889397647, + "grad_norm": 1.5379890203475952, + "learning_rate": 1.9968857111574826e-05, + "loss": 1.6716, + "step": 1530 + }, + { + "epoch": 0.08370351133769802, + "grad_norm": 1.7126048803329468, + "learning_rate": 1.996871283997789e-05, + "loss": 1.3316, + "step": 1531 + }, + { + "epoch": 0.08375818378141957, + "grad_norm": 1.1250460147857666, + "learning_rate": 1.996856823550224e-05, + "loss": 1.4563, + "step": 1532 + }, + { + "epoch": 0.08381285622514112, + "grad_norm": 1.67509126663208, + "learning_rate": 1.9968423298152707e-05, + "loss": 1.4208, + "step": 1533 + }, + { + "epoch": 0.08386752866886267, + "grad_norm": 1.410383939743042, + "learning_rate": 1.996827802793413e-05, + "loss": 1.3505, + "step": 1534 + }, + { + "epoch": 0.08392220111258424, + "grad_norm": 1.808982014656067, + "learning_rate": 1.9968132424851357e-05, + "loss": 1.5998, + "step": 1535 + }, + { + "epoch": 0.08397687355630579, + "grad_norm": 2.1001973152160645, + "learning_rate": 1.9967986488909248e-05, + "loss": 1.5658, + "step": 1536 + }, + { + "epoch": 0.08403154600002734, + "grad_norm": 1.5674080848693848, + "learning_rate": 1.9967840220112684e-05, + "loss": 1.6592, + "step": 1537 + }, + { + "epoch": 0.08408621844374889, + "grad_norm": 1.8142515420913696, + "learning_rate": 1.996769361846654e-05, + "loss": 1.3857, + "step": 1538 + }, + { + "epoch": 0.08414089088747044, + "grad_norm": 1.7058755159378052, + "learning_rate": 1.996754668397572e-05, + "loss": 1.5302, + "step": 1539 + }, + { + "epoch": 0.08419556333119199, + "grad_norm": 2.164722442626953, + "learning_rate": 1.9967399416645124e-05, + "loss": 1.4759, + "step": 1540 + }, + { + "epoch": 0.08425023577491354, + "grad_norm": 1.6644266843795776, + "learning_rate": 1.9967251816479676e-05, + "loss": 1.6062, + "step": 1541 + }, + { + "epoch": 0.08430490821863511, + "grad_norm": 1.6059409379959106, + "learning_rate": 1.9967103883484297e-05, + "loss": 1.5879, + "step": 1542 + }, + { + "epoch": 0.08435958066235666, + "grad_norm": 1.3683308362960815, + "learning_rate": 1.9966955617663934e-05, + "loss": 1.5462, + "step": 1543 + }, + { + "epoch": 0.08441425310607821, + "grad_norm": 1.554993748664856, + "learning_rate": 1.9966807019023532e-05, + "loss": 1.6463, + "step": 1544 + }, + { + "epoch": 0.08446892554979976, + "grad_norm": 2.408597230911255, + "learning_rate": 1.9966658087568057e-05, + "loss": 1.2139, + "step": 1545 + }, + { + "epoch": 0.08452359799352131, + "grad_norm": 1.411595344543457, + "learning_rate": 1.9966508823302484e-05, + "loss": 1.4684, + "step": 1546 + }, + { + "epoch": 0.08457827043724286, + "grad_norm": 1.7894978523254395, + "learning_rate": 1.996635922623179e-05, + "loss": 1.5645, + "step": 1547 + }, + { + "epoch": 0.08463294288096442, + "grad_norm": 1.844016671180725, + "learning_rate": 1.9966209296360975e-05, + "loss": 1.6513, + "step": 1548 + }, + { + "epoch": 0.08468761532468598, + "grad_norm": 1.4432655572891235, + "learning_rate": 1.9966059033695048e-05, + "loss": 1.4113, + "step": 1549 + }, + { + "epoch": 0.08474228776840753, + "grad_norm": 1.9709687232971191, + "learning_rate": 1.9965908438239024e-05, + "loss": 1.5998, + "step": 1550 + }, + { + "epoch": 0.08479696021212908, + "grad_norm": 1.613953709602356, + "learning_rate": 1.9965757509997928e-05, + "loss": 1.5739, + "step": 1551 + }, + { + "epoch": 0.08485163265585063, + "grad_norm": 1.624128818511963, + "learning_rate": 1.9965606248976804e-05, + "loss": 1.5031, + "step": 1552 + }, + { + "epoch": 0.08490630509957219, + "grad_norm": 1.6111371517181396, + "learning_rate": 1.9965454655180704e-05, + "loss": 1.4846, + "step": 1553 + }, + { + "epoch": 0.08496097754329374, + "grad_norm": 1.247204303741455, + "learning_rate": 1.9965302728614687e-05, + "loss": 1.5855, + "step": 1554 + }, + { + "epoch": 0.0850156499870153, + "grad_norm": 1.6090561151504517, + "learning_rate": 1.996515046928383e-05, + "loss": 1.3626, + "step": 1555 + }, + { + "epoch": 0.08507032243073685, + "grad_norm": 1.4144928455352783, + "learning_rate": 1.996499787719321e-05, + "loss": 1.6201, + "step": 1556 + }, + { + "epoch": 0.0851249948744584, + "grad_norm": 1.4039170742034912, + "learning_rate": 1.996484495234793e-05, + "loss": 1.4444, + "step": 1557 + }, + { + "epoch": 0.08517966731817996, + "grad_norm": 1.5119787454605103, + "learning_rate": 1.9964691694753097e-05, + "loss": 1.7159, + "step": 1558 + }, + { + "epoch": 0.0852343397619015, + "grad_norm": 1.5626862049102783, + "learning_rate": 1.9964538104413818e-05, + "loss": 1.3767, + "step": 1559 + }, + { + "epoch": 0.08528901220562306, + "grad_norm": 1.3879029750823975, + "learning_rate": 1.9964384181335237e-05, + "loss": 1.808, + "step": 1560 + }, + { + "epoch": 0.08534368464934461, + "grad_norm": 1.953464388847351, + "learning_rate": 1.9964229925522483e-05, + "loss": 1.456, + "step": 1561 + }, + { + "epoch": 0.08539835709306617, + "grad_norm": 1.4591574668884277, + "learning_rate": 1.9964075336980707e-05, + "loss": 1.5521, + "step": 1562 + }, + { + "epoch": 0.08545302953678773, + "grad_norm": 2.0529251098632812, + "learning_rate": 1.996392041571508e-05, + "loss": 1.7358, + "step": 1563 + }, + { + "epoch": 0.08550770198050928, + "grad_norm": 1.6849250793457031, + "learning_rate": 1.9963765161730764e-05, + "loss": 1.2837, + "step": 1564 + }, + { + "epoch": 0.08556237442423083, + "grad_norm": 1.218431830406189, + "learning_rate": 1.9963609575032952e-05, + "loss": 1.3004, + "step": 1565 + }, + { + "epoch": 0.08561704686795238, + "grad_norm": 1.7762892246246338, + "learning_rate": 1.9963453655626833e-05, + "loss": 1.3263, + "step": 1566 + }, + { + "epoch": 0.08567171931167393, + "grad_norm": 1.3841736316680908, + "learning_rate": 1.996329740351762e-05, + "loss": 1.6189, + "step": 1567 + }, + { + "epoch": 0.08572639175539548, + "grad_norm": 1.4034793376922607, + "learning_rate": 1.9963140818710525e-05, + "loss": 1.4597, + "step": 1568 + }, + { + "epoch": 0.08578106419911705, + "grad_norm": 1.347916841506958, + "learning_rate": 1.9962983901210776e-05, + "loss": 1.5368, + "step": 1569 + }, + { + "epoch": 0.0858357366428386, + "grad_norm": 1.366502046585083, + "learning_rate": 1.996282665102362e-05, + "loss": 1.4905, + "step": 1570 + }, + { + "epoch": 0.08589040908656015, + "grad_norm": 1.5138397216796875, + "learning_rate": 1.9962669068154303e-05, + "loss": 1.5096, + "step": 1571 + }, + { + "epoch": 0.0859450815302817, + "grad_norm": 1.5149129629135132, + "learning_rate": 1.9962511152608087e-05, + "loss": 1.4772, + "step": 1572 + }, + { + "epoch": 0.08599975397400325, + "grad_norm": 2.4390783309936523, + "learning_rate": 1.9962352904390247e-05, + "loss": 1.4653, + "step": 1573 + }, + { + "epoch": 0.0860544264177248, + "grad_norm": 1.493735432624817, + "learning_rate": 1.9962194323506064e-05, + "loss": 1.4343, + "step": 1574 + }, + { + "epoch": 0.08610909886144635, + "grad_norm": 1.5066816806793213, + "learning_rate": 1.9962035409960835e-05, + "loss": 1.4942, + "step": 1575 + }, + { + "epoch": 0.08616377130516792, + "grad_norm": 1.5655211210250854, + "learning_rate": 1.996187616375987e-05, + "loss": 1.5644, + "step": 1576 + }, + { + "epoch": 0.08621844374888947, + "grad_norm": 1.261366367340088, + "learning_rate": 1.996171658490848e-05, + "loss": 1.5788, + "step": 1577 + }, + { + "epoch": 0.08627311619261102, + "grad_norm": 1.8425800800323486, + "learning_rate": 1.9961556673412e-05, + "loss": 1.6185, + "step": 1578 + }, + { + "epoch": 0.08632778863633257, + "grad_norm": 1.6496113538742065, + "learning_rate": 1.9961396429275766e-05, + "loss": 1.5648, + "step": 1579 + }, + { + "epoch": 0.08638246108005412, + "grad_norm": 1.6168984174728394, + "learning_rate": 1.996123585250513e-05, + "loss": 1.5013, + "step": 1580 + }, + { + "epoch": 0.08643713352377567, + "grad_norm": 1.2419233322143555, + "learning_rate": 1.9961074943105457e-05, + "loss": 1.3796, + "step": 1581 + }, + { + "epoch": 0.08649180596749723, + "grad_norm": 1.2004622220993042, + "learning_rate": 1.9960913701082112e-05, + "loss": 1.5179, + "step": 1582 + }, + { + "epoch": 0.08654647841121879, + "grad_norm": 1.3625574111938477, + "learning_rate": 1.996075212644049e-05, + "loss": 1.5454, + "step": 1583 + }, + { + "epoch": 0.08660115085494034, + "grad_norm": 1.5103979110717773, + "learning_rate": 1.9960590219185976e-05, + "loss": 1.559, + "step": 1584 + }, + { + "epoch": 0.0866558232986619, + "grad_norm": 1.7718653678894043, + "learning_rate": 1.9960427979323986e-05, + "loss": 1.4302, + "step": 1585 + }, + { + "epoch": 0.08671049574238344, + "grad_norm": 1.2814831733703613, + "learning_rate": 1.996026540685993e-05, + "loss": 1.5393, + "step": 1586 + }, + { + "epoch": 0.086765168186105, + "grad_norm": 1.7415378093719482, + "learning_rate": 1.996010250179924e-05, + "loss": 1.3508, + "step": 1587 + }, + { + "epoch": 0.08681984062982655, + "grad_norm": 1.4060771465301514, + "learning_rate": 1.9959939264147355e-05, + "loss": 1.5487, + "step": 1588 + }, + { + "epoch": 0.0868745130735481, + "grad_norm": 1.2037849426269531, + "learning_rate": 1.9959775693909726e-05, + "loss": 1.5905, + "step": 1589 + }, + { + "epoch": 0.08692918551726966, + "grad_norm": 1.5074191093444824, + "learning_rate": 1.9959611791091816e-05, + "loss": 1.4369, + "step": 1590 + }, + { + "epoch": 0.08698385796099121, + "grad_norm": 1.4320917129516602, + "learning_rate": 1.9959447555699095e-05, + "loss": 1.5897, + "step": 1591 + }, + { + "epoch": 0.08703853040471277, + "grad_norm": 1.448006272315979, + "learning_rate": 1.9959282987737054e-05, + "loss": 1.5795, + "step": 1592 + }, + { + "epoch": 0.08709320284843432, + "grad_norm": 1.3095695972442627, + "learning_rate": 1.9959118087211182e-05, + "loss": 1.5878, + "step": 1593 + }, + { + "epoch": 0.08714787529215587, + "grad_norm": 1.520589828491211, + "learning_rate": 1.9958952854126986e-05, + "loss": 1.4571, + "step": 1594 + }, + { + "epoch": 0.08720254773587742, + "grad_norm": 1.683666467666626, + "learning_rate": 1.9958787288489983e-05, + "loss": 1.6292, + "step": 1595 + }, + { + "epoch": 0.08725722017959897, + "grad_norm": 1.3453701734542847, + "learning_rate": 1.995862139030571e-05, + "loss": 1.474, + "step": 1596 + }, + { + "epoch": 0.08731189262332054, + "grad_norm": 2.0229408740997314, + "learning_rate": 1.9958455159579695e-05, + "loss": 1.3681, + "step": 1597 + }, + { + "epoch": 0.08736656506704209, + "grad_norm": 1.5268162488937378, + "learning_rate": 1.9958288596317496e-05, + "loss": 1.6693, + "step": 1598 + }, + { + "epoch": 0.08742123751076364, + "grad_norm": 1.4873249530792236, + "learning_rate": 1.9958121700524672e-05, + "loss": 1.508, + "step": 1599 + }, + { + "epoch": 0.08747590995448519, + "grad_norm": 1.4968748092651367, + "learning_rate": 1.99579544722068e-05, + "loss": 1.6564, + "step": 1600 + }, + { + "epoch": 0.08753058239820674, + "grad_norm": 1.2867308855056763, + "learning_rate": 1.9957786911369456e-05, + "loss": 1.7299, + "step": 1601 + }, + { + "epoch": 0.08758525484192829, + "grad_norm": 1.6348540782928467, + "learning_rate": 1.9957619018018243e-05, + "loss": 1.5239, + "step": 1602 + }, + { + "epoch": 0.08763992728564984, + "grad_norm": 1.3460144996643066, + "learning_rate": 1.995745079215876e-05, + "loss": 1.4314, + "step": 1603 + }, + { + "epoch": 0.08769459972937141, + "grad_norm": 1.5785597562789917, + "learning_rate": 1.9957282233796635e-05, + "loss": 1.5281, + "step": 1604 + }, + { + "epoch": 0.08774927217309296, + "grad_norm": 2.0432004928588867, + "learning_rate": 1.9957113342937492e-05, + "loss": 1.5072, + "step": 1605 + }, + { + "epoch": 0.08780394461681451, + "grad_norm": 1.6811572313308716, + "learning_rate": 1.9956944119586967e-05, + "loss": 1.3466, + "step": 1606 + }, + { + "epoch": 0.08785861706053606, + "grad_norm": 1.5277873277664185, + "learning_rate": 1.995677456375071e-05, + "loss": 1.355, + "step": 1607 + }, + { + "epoch": 0.08791328950425761, + "grad_norm": 1.2728925943374634, + "learning_rate": 1.9956604675434388e-05, + "loss": 1.4601, + "step": 1608 + }, + { + "epoch": 0.08796796194797916, + "grad_norm": 1.64427649974823, + "learning_rate": 1.9956434454643675e-05, + "loss": 1.5292, + "step": 1609 + }, + { + "epoch": 0.08802263439170073, + "grad_norm": 1.5454000234603882, + "learning_rate": 1.995626390138425e-05, + "loss": 1.4568, + "step": 1610 + }, + { + "epoch": 0.08807730683542228, + "grad_norm": 1.4717998504638672, + "learning_rate": 1.995609301566181e-05, + "loss": 1.6574, + "step": 1611 + }, + { + "epoch": 0.08813197927914383, + "grad_norm": 1.3838870525360107, + "learning_rate": 1.995592179748206e-05, + "loss": 1.2995, + "step": 1612 + }, + { + "epoch": 0.08818665172286538, + "grad_norm": 1.897501826286316, + "learning_rate": 1.995575024685072e-05, + "loss": 1.3518, + "step": 1613 + }, + { + "epoch": 0.08824132416658693, + "grad_norm": 1.1821835041046143, + "learning_rate": 1.9955578363773518e-05, + "loss": 1.4016, + "step": 1614 + }, + { + "epoch": 0.08829599661030849, + "grad_norm": 1.5416762828826904, + "learning_rate": 1.9955406148256192e-05, + "loss": 1.391, + "step": 1615 + }, + { + "epoch": 0.08835066905403004, + "grad_norm": 1.491653323173523, + "learning_rate": 1.9955233600304496e-05, + "loss": 1.7206, + "step": 1616 + }, + { + "epoch": 0.0884053414977516, + "grad_norm": 1.8274329900741577, + "learning_rate": 1.9955060719924187e-05, + "loss": 1.6298, + "step": 1617 + }, + { + "epoch": 0.08846001394147315, + "grad_norm": 1.4671971797943115, + "learning_rate": 1.995488750712104e-05, + "loss": 1.3853, + "step": 1618 + }, + { + "epoch": 0.0885146863851947, + "grad_norm": 1.5927143096923828, + "learning_rate": 1.995471396190084e-05, + "loss": 1.4121, + "step": 1619 + }, + { + "epoch": 0.08856935882891626, + "grad_norm": 1.558536410331726, + "learning_rate": 1.995454008426938e-05, + "loss": 1.3556, + "step": 1620 + }, + { + "epoch": 0.08862403127263781, + "grad_norm": 1.4611706733703613, + "learning_rate": 1.995436587423247e-05, + "loss": 1.4593, + "step": 1621 + }, + { + "epoch": 0.08867870371635936, + "grad_norm": 1.7322818040847778, + "learning_rate": 1.9954191331795926e-05, + "loss": 1.5987, + "step": 1622 + }, + { + "epoch": 0.08873337616008091, + "grad_norm": 1.5929226875305176, + "learning_rate": 1.995401645696557e-05, + "loss": 1.3749, + "step": 1623 + }, + { + "epoch": 0.08878804860380247, + "grad_norm": 1.351874828338623, + "learning_rate": 1.995384124974725e-05, + "loss": 1.4066, + "step": 1624 + }, + { + "epoch": 0.08884272104752403, + "grad_norm": 1.5202137231826782, + "learning_rate": 1.995366571014681e-05, + "loss": 1.801, + "step": 1625 + }, + { + "epoch": 0.08889739349124558, + "grad_norm": 1.5479397773742676, + "learning_rate": 1.995348983817012e-05, + "loss": 1.3387, + "step": 1626 + }, + { + "epoch": 0.08895206593496713, + "grad_norm": 1.5038297176361084, + "learning_rate": 1.9953313633823045e-05, + "loss": 1.3745, + "step": 1627 + }, + { + "epoch": 0.08900673837868868, + "grad_norm": 2.062901258468628, + "learning_rate": 1.9953137097111475e-05, + "loss": 1.4927, + "step": 1628 + }, + { + "epoch": 0.08906141082241023, + "grad_norm": 1.7623999118804932, + "learning_rate": 1.9952960228041297e-05, + "loss": 1.3663, + "step": 1629 + }, + { + "epoch": 0.08911608326613178, + "grad_norm": 1.6237915754318237, + "learning_rate": 1.9952783026618424e-05, + "loss": 1.4156, + "step": 1630 + }, + { + "epoch": 0.08917075570985335, + "grad_norm": 1.6153465509414673, + "learning_rate": 1.995260549284877e-05, + "loss": 1.3333, + "step": 1631 + }, + { + "epoch": 0.0892254281535749, + "grad_norm": 2.0237479209899902, + "learning_rate": 1.9952427626738264e-05, + "loss": 1.6851, + "step": 1632 + }, + { + "epoch": 0.08928010059729645, + "grad_norm": 1.27190101146698, + "learning_rate": 1.9952249428292844e-05, + "loss": 1.4264, + "step": 1633 + }, + { + "epoch": 0.089334773041018, + "grad_norm": 1.5019017457962036, + "learning_rate": 1.995207089751847e-05, + "loss": 1.453, + "step": 1634 + }, + { + "epoch": 0.08938944548473955, + "grad_norm": 1.7211792469024658, + "learning_rate": 1.9951892034421084e-05, + "loss": 1.4657, + "step": 1635 + }, + { + "epoch": 0.0894441179284611, + "grad_norm": 1.3795530796051025, + "learning_rate": 1.9951712839006677e-05, + "loss": 1.563, + "step": 1636 + }, + { + "epoch": 0.08949879037218265, + "grad_norm": 1.8444921970367432, + "learning_rate": 1.995153331128122e-05, + "loss": 1.285, + "step": 1637 + }, + { + "epoch": 0.08955346281590422, + "grad_norm": 1.9050953388214111, + "learning_rate": 1.9951353451250723e-05, + "loss": 1.6276, + "step": 1638 + }, + { + "epoch": 0.08960813525962577, + "grad_norm": 1.471533179283142, + "learning_rate": 1.9951173258921176e-05, + "loss": 1.5553, + "step": 1639 + }, + { + "epoch": 0.08966280770334732, + "grad_norm": 1.8775919675827026, + "learning_rate": 1.9950992734298606e-05, + "loss": 1.2574, + "step": 1640 + }, + { + "epoch": 0.08971748014706887, + "grad_norm": 1.5879223346710205, + "learning_rate": 1.9950811877389035e-05, + "loss": 1.534, + "step": 1641 + }, + { + "epoch": 0.08977215259079042, + "grad_norm": 1.764948844909668, + "learning_rate": 1.9950630688198505e-05, + "loss": 1.4978, + "step": 1642 + }, + { + "epoch": 0.08982682503451198, + "grad_norm": 1.4748433828353882, + "learning_rate": 1.9950449166733067e-05, + "loss": 1.4818, + "step": 1643 + }, + { + "epoch": 0.08988149747823353, + "grad_norm": 1.3726643323898315, + "learning_rate": 1.9950267312998783e-05, + "loss": 1.5264, + "step": 1644 + }, + { + "epoch": 0.08993616992195509, + "grad_norm": 1.6175169944763184, + "learning_rate": 1.9950085127001724e-05, + "loss": 1.576, + "step": 1645 + }, + { + "epoch": 0.08999084236567664, + "grad_norm": 1.518064260482788, + "learning_rate": 1.9949902608747972e-05, + "loss": 1.5066, + "step": 1646 + }, + { + "epoch": 0.0900455148093982, + "grad_norm": 1.4286092519760132, + "learning_rate": 1.9949719758243625e-05, + "loss": 1.2199, + "step": 1647 + }, + { + "epoch": 0.09010018725311975, + "grad_norm": 2.144718647003174, + "learning_rate": 1.994953657549479e-05, + "loss": 1.5191, + "step": 1648 + }, + { + "epoch": 0.0901548596968413, + "grad_norm": 1.48231041431427, + "learning_rate": 1.994935306050758e-05, + "loss": 1.578, + "step": 1649 + }, + { + "epoch": 0.09020953214056285, + "grad_norm": 1.2621092796325684, + "learning_rate": 1.9949169213288125e-05, + "loss": 1.224, + "step": 1650 + }, + { + "epoch": 0.0902642045842844, + "grad_norm": 2.0915465354919434, + "learning_rate": 1.994898503384256e-05, + "loss": 1.6455, + "step": 1651 + }, + { + "epoch": 0.09031887702800596, + "grad_norm": 1.9343072175979614, + "learning_rate": 1.9948800522177043e-05, + "loss": 1.5153, + "step": 1652 + }, + { + "epoch": 0.09037354947172752, + "grad_norm": 1.779002070426941, + "learning_rate": 1.9948615678297728e-05, + "loss": 1.6261, + "step": 1653 + }, + { + "epoch": 0.09042822191544907, + "grad_norm": 1.7495849132537842, + "learning_rate": 1.994843050221079e-05, + "loss": 1.288, + "step": 1654 + }, + { + "epoch": 0.09048289435917062, + "grad_norm": 1.650020956993103, + "learning_rate": 1.9948244993922413e-05, + "loss": 1.4764, + "step": 1655 + }, + { + "epoch": 0.09053756680289217, + "grad_norm": 1.7434049844741821, + "learning_rate": 1.9948059153438796e-05, + "loss": 1.515, + "step": 1656 + }, + { + "epoch": 0.09059223924661372, + "grad_norm": 1.3190313577651978, + "learning_rate": 1.9947872980766137e-05, + "loss": 1.4466, + "step": 1657 + }, + { + "epoch": 0.09064691169033529, + "grad_norm": 1.4511109590530396, + "learning_rate": 1.9947686475910656e-05, + "loss": 1.2644, + "step": 1658 + }, + { + "epoch": 0.09070158413405684, + "grad_norm": 1.6778180599212646, + "learning_rate": 1.9947499638878577e-05, + "loss": 1.4832, + "step": 1659 + }, + { + "epoch": 0.09075625657777839, + "grad_norm": 1.3683794736862183, + "learning_rate": 1.9947312469676148e-05, + "loss": 1.4252, + "step": 1660 + }, + { + "epoch": 0.09081092902149994, + "grad_norm": 1.2774654626846313, + "learning_rate": 1.994712496830961e-05, + "loss": 1.4912, + "step": 1661 + }, + { + "epoch": 0.09086560146522149, + "grad_norm": 1.2824443578720093, + "learning_rate": 1.994693713478523e-05, + "loss": 1.4566, + "step": 1662 + }, + { + "epoch": 0.09092027390894304, + "grad_norm": 1.465787410736084, + "learning_rate": 1.9946748969109275e-05, + "loss": 1.7518, + "step": 1663 + }, + { + "epoch": 0.09097494635266459, + "grad_norm": 2.1821846961975098, + "learning_rate": 1.994656047128803e-05, + "loss": 1.4461, + "step": 1664 + }, + { + "epoch": 0.09102961879638616, + "grad_norm": 2.0126445293426514, + "learning_rate": 1.9946371641327794e-05, + "loss": 1.564, + "step": 1665 + }, + { + "epoch": 0.09108429124010771, + "grad_norm": 1.5833719968795776, + "learning_rate": 1.994618247923487e-05, + "loss": 1.5187, + "step": 1666 + }, + { + "epoch": 0.09113896368382926, + "grad_norm": 1.6399699449539185, + "learning_rate": 1.9945992985015573e-05, + "loss": 1.586, + "step": 1667 + }, + { + "epoch": 0.09119363612755081, + "grad_norm": 1.8191697597503662, + "learning_rate": 1.994580315867623e-05, + "loss": 1.6757, + "step": 1668 + }, + { + "epoch": 0.09124830857127236, + "grad_norm": 1.2681807279586792, + "learning_rate": 1.994561300022318e-05, + "loss": 1.5816, + "step": 1669 + }, + { + "epoch": 0.09130298101499391, + "grad_norm": 1.9852992296218872, + "learning_rate": 1.9945422509662774e-05, + "loss": 1.5184, + "step": 1670 + }, + { + "epoch": 0.09135765345871547, + "grad_norm": 1.314630150794983, + "learning_rate": 1.9945231687001374e-05, + "loss": 1.4996, + "step": 1671 + }, + { + "epoch": 0.09141232590243703, + "grad_norm": 1.5647642612457275, + "learning_rate": 1.9945040532245352e-05, + "loss": 1.5775, + "step": 1672 + }, + { + "epoch": 0.09146699834615858, + "grad_norm": 1.757447600364685, + "learning_rate": 1.9944849045401088e-05, + "loss": 1.6382, + "step": 1673 + }, + { + "epoch": 0.09152167078988013, + "grad_norm": 1.9850865602493286, + "learning_rate": 1.9944657226474978e-05, + "loss": 1.5413, + "step": 1674 + }, + { + "epoch": 0.09157634323360168, + "grad_norm": 1.3536897897720337, + "learning_rate": 1.9944465075473427e-05, + "loss": 1.4136, + "step": 1675 + }, + { + "epoch": 0.09163101567732324, + "grad_norm": 1.7522871494293213, + "learning_rate": 1.9944272592402854e-05, + "loss": 1.5401, + "step": 1676 + }, + { + "epoch": 0.09168568812104479, + "grad_norm": 1.7064857482910156, + "learning_rate": 1.994407977726968e-05, + "loss": 1.6075, + "step": 1677 + }, + { + "epoch": 0.09174036056476634, + "grad_norm": 1.1687644720077515, + "learning_rate": 1.994388663008035e-05, + "loss": 1.5665, + "step": 1678 + }, + { + "epoch": 0.0917950330084879, + "grad_norm": 2.685781955718994, + "learning_rate": 1.9943693150841312e-05, + "loss": 1.6207, + "step": 1679 + }, + { + "epoch": 0.09184970545220945, + "grad_norm": 1.4428380727767944, + "learning_rate": 1.9943499339559026e-05, + "loss": 1.4328, + "step": 1680 + }, + { + "epoch": 0.091904377895931, + "grad_norm": 1.4570945501327515, + "learning_rate": 1.9943305196239963e-05, + "loss": 1.6468, + "step": 1681 + }, + { + "epoch": 0.09195905033965256, + "grad_norm": 1.5822176933288574, + "learning_rate": 1.9943110720890608e-05, + "loss": 1.4587, + "step": 1682 + }, + { + "epoch": 0.09201372278337411, + "grad_norm": 1.4002459049224854, + "learning_rate": 1.994291591351745e-05, + "loss": 1.7686, + "step": 1683 + }, + { + "epoch": 0.09206839522709566, + "grad_norm": 1.520400047302246, + "learning_rate": 1.9942720774127005e-05, + "loss": 1.4735, + "step": 1684 + }, + { + "epoch": 0.09212306767081721, + "grad_norm": 1.4169068336486816, + "learning_rate": 1.9942525302725773e-05, + "loss": 1.3377, + "step": 1685 + }, + { + "epoch": 0.09217774011453878, + "grad_norm": 1.415833592414856, + "learning_rate": 1.9942329499320298e-05, + "loss": 1.5268, + "step": 1686 + }, + { + "epoch": 0.09223241255826033, + "grad_norm": 1.4467637538909912, + "learning_rate": 1.9942133363917108e-05, + "loss": 1.5045, + "step": 1687 + }, + { + "epoch": 0.09228708500198188, + "grad_norm": 1.48541259765625, + "learning_rate": 1.9941936896522756e-05, + "loss": 1.4302, + "step": 1688 + }, + { + "epoch": 0.09234175744570343, + "grad_norm": 1.3883038759231567, + "learning_rate": 1.9941740097143802e-05, + "loss": 1.3977, + "step": 1689 + }, + { + "epoch": 0.09239642988942498, + "grad_norm": 1.5509294271469116, + "learning_rate": 1.9941542965786814e-05, + "loss": 1.6698, + "step": 1690 + }, + { + "epoch": 0.09245110233314653, + "grad_norm": 1.6423060894012451, + "learning_rate": 1.9941345502458377e-05, + "loss": 1.6031, + "step": 1691 + }, + { + "epoch": 0.09250577477686808, + "grad_norm": 1.4696259498596191, + "learning_rate": 1.994114770716509e-05, + "loss": 1.6066, + "step": 1692 + }, + { + "epoch": 0.09256044722058965, + "grad_norm": 1.9038739204406738, + "learning_rate": 1.994094957991355e-05, + "loss": 1.4787, + "step": 1693 + }, + { + "epoch": 0.0926151196643112, + "grad_norm": 1.9435968399047852, + "learning_rate": 1.994075112071038e-05, + "loss": 1.3815, + "step": 1694 + }, + { + "epoch": 0.09266979210803275, + "grad_norm": 1.979972004890442, + "learning_rate": 1.9940552329562202e-05, + "loss": 1.5015, + "step": 1695 + }, + { + "epoch": 0.0927244645517543, + "grad_norm": 2.1349236965179443, + "learning_rate": 1.9940353206475653e-05, + "loss": 1.574, + "step": 1696 + }, + { + "epoch": 0.09277913699547585, + "grad_norm": 2.1529009342193604, + "learning_rate": 1.9940153751457386e-05, + "loss": 1.4982, + "step": 1697 + }, + { + "epoch": 0.0928338094391974, + "grad_norm": 1.708659052848816, + "learning_rate": 1.993995396451406e-05, + "loss": 1.5826, + "step": 1698 + }, + { + "epoch": 0.09288848188291896, + "grad_norm": 2.215467691421509, + "learning_rate": 1.9939753845652348e-05, + "loss": 1.5957, + "step": 1699 + }, + { + "epoch": 0.09294315432664052, + "grad_norm": 1.4372518062591553, + "learning_rate": 1.9939553394878926e-05, + "loss": 1.4677, + "step": 1700 + }, + { + "epoch": 0.09299782677036207, + "grad_norm": 1.9082722663879395, + "learning_rate": 1.99393526122005e-05, + "loss": 1.2665, + "step": 1701 + }, + { + "epoch": 0.09305249921408362, + "grad_norm": 1.794187068939209, + "learning_rate": 1.993915149762376e-05, + "loss": 1.5771, + "step": 1702 + }, + { + "epoch": 0.09310717165780517, + "grad_norm": 2.1656346321105957, + "learning_rate": 1.993895005115543e-05, + "loss": 1.5424, + "step": 1703 + }, + { + "epoch": 0.09316184410152673, + "grad_norm": 2.311128616333008, + "learning_rate": 1.993874827280224e-05, + "loss": 1.3404, + "step": 1704 + }, + { + "epoch": 0.09321651654524828, + "grad_norm": 3.853316307067871, + "learning_rate": 1.9938546162570916e-05, + "loss": 1.5183, + "step": 1705 + }, + { + "epoch": 0.09327118898896983, + "grad_norm": 2.505786418914795, + "learning_rate": 1.9938343720468216e-05, + "loss": 1.7529, + "step": 1706 + }, + { + "epoch": 0.09332586143269139, + "grad_norm": 2.2447924613952637, + "learning_rate": 1.99381409465009e-05, + "loss": 1.0875, + "step": 1707 + }, + { + "epoch": 0.09338053387641294, + "grad_norm": 1.7783422470092773, + "learning_rate": 1.9937937840675737e-05, + "loss": 1.5174, + "step": 1708 + }, + { + "epoch": 0.0934352063201345, + "grad_norm": 2.6875357627868652, + "learning_rate": 1.993773440299951e-05, + "loss": 1.484, + "step": 1709 + }, + { + "epoch": 0.09348987876385605, + "grad_norm": 2.833354949951172, + "learning_rate": 1.9937530633479013e-05, + "loss": 1.3506, + "step": 1710 + }, + { + "epoch": 0.0935445512075776, + "grad_norm": 1.555580735206604, + "learning_rate": 1.9937326532121047e-05, + "loss": 1.4458, + "step": 1711 + }, + { + "epoch": 0.09359922365129915, + "grad_norm": 1.5199460983276367, + "learning_rate": 1.9937122098932428e-05, + "loss": 1.5436, + "step": 1712 + }, + { + "epoch": 0.09365389609502071, + "grad_norm": 1.9918133020401, + "learning_rate": 1.9936917333919983e-05, + "loss": 1.1949, + "step": 1713 + }, + { + "epoch": 0.09370856853874227, + "grad_norm": 1.6139552593231201, + "learning_rate": 1.9936712237090554e-05, + "loss": 1.3206, + "step": 1714 + }, + { + "epoch": 0.09376324098246382, + "grad_norm": 1.6741852760314941, + "learning_rate": 1.9936506808450984e-05, + "loss": 1.4827, + "step": 1715 + }, + { + "epoch": 0.09381791342618537, + "grad_norm": 1.32270085811615, + "learning_rate": 1.9936301048008137e-05, + "loss": 1.5467, + "step": 1716 + }, + { + "epoch": 0.09387258586990692, + "grad_norm": 1.4346816539764404, + "learning_rate": 1.993609495576888e-05, + "loss": 1.42, + "step": 1717 + }, + { + "epoch": 0.09392725831362847, + "grad_norm": 1.5773098468780518, + "learning_rate": 1.99358885317401e-05, + "loss": 1.5999, + "step": 1718 + }, + { + "epoch": 0.09398193075735002, + "grad_norm": 1.881520390510559, + "learning_rate": 1.9935681775928683e-05, + "loss": 1.9205, + "step": 1719 + }, + { + "epoch": 0.09403660320107159, + "grad_norm": 1.8677185773849487, + "learning_rate": 1.9935474688341536e-05, + "loss": 1.3377, + "step": 1720 + }, + { + "epoch": 0.09409127564479314, + "grad_norm": 1.3251813650131226, + "learning_rate": 1.9935267268985577e-05, + "loss": 1.61, + "step": 1721 + }, + { + "epoch": 0.09414594808851469, + "grad_norm": 1.2654324769973755, + "learning_rate": 1.993505951786773e-05, + "loss": 1.6118, + "step": 1722 + }, + { + "epoch": 0.09420062053223624, + "grad_norm": 1.6510061025619507, + "learning_rate": 1.993485143499493e-05, + "loss": 1.7604, + "step": 1723 + }, + { + "epoch": 0.09425529297595779, + "grad_norm": 1.4687657356262207, + "learning_rate": 1.993464302037413e-05, + "loss": 1.616, + "step": 1724 + }, + { + "epoch": 0.09430996541967934, + "grad_norm": 1.3821159601211548, + "learning_rate": 1.993443427401229e-05, + "loss": 1.6935, + "step": 1725 + }, + { + "epoch": 0.0943646378634009, + "grad_norm": 1.323785424232483, + "learning_rate": 1.993422519591637e-05, + "loss": 1.7314, + "step": 1726 + }, + { + "epoch": 0.09441931030712246, + "grad_norm": 1.2552438974380493, + "learning_rate": 1.9934015786093365e-05, + "loss": 1.5607, + "step": 1727 + }, + { + "epoch": 0.09447398275084401, + "grad_norm": 2.263951301574707, + "learning_rate": 1.9933806044550262e-05, + "loss": 1.4205, + "step": 1728 + }, + { + "epoch": 0.09452865519456556, + "grad_norm": 1.6668647527694702, + "learning_rate": 1.9933595971294064e-05, + "loss": 1.363, + "step": 1729 + }, + { + "epoch": 0.09458332763828711, + "grad_norm": 1.8494908809661865, + "learning_rate": 1.9933385566331785e-05, + "loss": 1.2513, + "step": 1730 + }, + { + "epoch": 0.09463800008200866, + "grad_norm": 1.5761901140213013, + "learning_rate": 1.9933174829670455e-05, + "loss": 1.4754, + "step": 1731 + }, + { + "epoch": 0.09469267252573021, + "grad_norm": 1.3868597745895386, + "learning_rate": 1.9932963761317105e-05, + "loss": 1.6932, + "step": 1732 + }, + { + "epoch": 0.09474734496945177, + "grad_norm": 1.4530638456344604, + "learning_rate": 1.993275236127879e-05, + "loss": 1.5547, + "step": 1733 + }, + { + "epoch": 0.09480201741317333, + "grad_norm": 1.5360432863235474, + "learning_rate": 1.9932540629562563e-05, + "loss": 1.5318, + "step": 1734 + }, + { + "epoch": 0.09485668985689488, + "grad_norm": 1.6872385740280151, + "learning_rate": 1.99323285661755e-05, + "loss": 1.4118, + "step": 1735 + }, + { + "epoch": 0.09491136230061643, + "grad_norm": 1.613949179649353, + "learning_rate": 1.9932116171124676e-05, + "loss": 1.5835, + "step": 1736 + }, + { + "epoch": 0.09496603474433798, + "grad_norm": 1.520357608795166, + "learning_rate": 1.9931903444417187e-05, + "loss": 1.5465, + "step": 1737 + }, + { + "epoch": 0.09502070718805954, + "grad_norm": 1.605879545211792, + "learning_rate": 1.993169038606014e-05, + "loss": 1.5187, + "step": 1738 + }, + { + "epoch": 0.09507537963178109, + "grad_norm": 1.5210539102554321, + "learning_rate": 1.9931476996060644e-05, + "loss": 1.6508, + "step": 1739 + }, + { + "epoch": 0.09513005207550264, + "grad_norm": 1.720354437828064, + "learning_rate": 1.9931263274425823e-05, + "loss": 1.5697, + "step": 1740 + }, + { + "epoch": 0.0951847245192242, + "grad_norm": 1.9032758474349976, + "learning_rate": 1.993104922116282e-05, + "loss": 1.3827, + "step": 1741 + }, + { + "epoch": 0.09523939696294575, + "grad_norm": 1.3109334707260132, + "learning_rate": 1.993083483627878e-05, + "loss": 1.5601, + "step": 1742 + }, + { + "epoch": 0.0952940694066673, + "grad_norm": 1.2695043087005615, + "learning_rate": 1.993062011978086e-05, + "loss": 1.3497, + "step": 1743 + }, + { + "epoch": 0.09534874185038886, + "grad_norm": 1.6482902765274048, + "learning_rate": 1.9930405071676228e-05, + "loss": 1.5903, + "step": 1744 + }, + { + "epoch": 0.09540341429411041, + "grad_norm": 1.9658033847808838, + "learning_rate": 1.993018969197207e-05, + "loss": 1.6168, + "step": 1745 + }, + { + "epoch": 0.09545808673783196, + "grad_norm": 1.6979827880859375, + "learning_rate": 1.992997398067558e-05, + "loss": 1.6714, + "step": 1746 + }, + { + "epoch": 0.09551275918155351, + "grad_norm": 1.3149948120117188, + "learning_rate": 1.9929757937793953e-05, + "loss": 1.596, + "step": 1747 + }, + { + "epoch": 0.09556743162527508, + "grad_norm": 1.1914567947387695, + "learning_rate": 1.992954156333441e-05, + "loss": 1.2877, + "step": 1748 + }, + { + "epoch": 0.09562210406899663, + "grad_norm": 1.5711084604263306, + "learning_rate": 1.9929324857304175e-05, + "loss": 1.394, + "step": 1749 + }, + { + "epoch": 0.09567677651271818, + "grad_norm": 1.6412163972854614, + "learning_rate": 1.992910781971048e-05, + "loss": 1.506, + "step": 1750 + }, + { + "epoch": 0.09573144895643973, + "grad_norm": 1.3164561986923218, + "learning_rate": 1.9928890450560576e-05, + "loss": 1.5328, + "step": 1751 + }, + { + "epoch": 0.09578612140016128, + "grad_norm": 1.8473644256591797, + "learning_rate": 1.9928672749861725e-05, + "loss": 1.6055, + "step": 1752 + }, + { + "epoch": 0.09584079384388283, + "grad_norm": 1.5731841325759888, + "learning_rate": 1.992845471762119e-05, + "loss": 1.3235, + "step": 1753 + }, + { + "epoch": 0.09589546628760438, + "grad_norm": 1.567803144454956, + "learning_rate": 1.992823635384625e-05, + "loss": 1.5657, + "step": 1754 + }, + { + "epoch": 0.09595013873132595, + "grad_norm": 1.7159382104873657, + "learning_rate": 1.9928017658544206e-05, + "loss": 1.358, + "step": 1755 + }, + { + "epoch": 0.0960048111750475, + "grad_norm": 1.5335745811462402, + "learning_rate": 1.9927798631722353e-05, + "loss": 1.4391, + "step": 1756 + }, + { + "epoch": 0.09605948361876905, + "grad_norm": 2.0221900939941406, + "learning_rate": 1.992757927338801e-05, + "loss": 1.6143, + "step": 1757 + }, + { + "epoch": 0.0961141560624906, + "grad_norm": 1.499112844467163, + "learning_rate": 1.9927359583548495e-05, + "loss": 1.3346, + "step": 1758 + }, + { + "epoch": 0.09616882850621215, + "grad_norm": 1.6623201370239258, + "learning_rate": 1.9927139562211154e-05, + "loss": 1.5735, + "step": 1759 + }, + { + "epoch": 0.0962235009499337, + "grad_norm": 1.5757696628570557, + "learning_rate": 1.9926919209383325e-05, + "loss": 1.3258, + "step": 1760 + }, + { + "epoch": 0.09627817339365527, + "grad_norm": 1.5396373271942139, + "learning_rate": 1.9926698525072368e-05, + "loss": 1.5576, + "step": 1761 + }, + { + "epoch": 0.09633284583737682, + "grad_norm": 1.3776769638061523, + "learning_rate": 1.9926477509285654e-05, + "loss": 1.4641, + "step": 1762 + }, + { + "epoch": 0.09638751828109837, + "grad_norm": 1.746182918548584, + "learning_rate": 1.9926256162030564e-05, + "loss": 1.423, + "step": 1763 + }, + { + "epoch": 0.09644219072481992, + "grad_norm": 1.3925483226776123, + "learning_rate": 1.992603448331449e-05, + "loss": 1.3538, + "step": 1764 + }, + { + "epoch": 0.09649686316854147, + "grad_norm": 1.7109605073928833, + "learning_rate": 1.9925812473144826e-05, + "loss": 1.544, + "step": 1765 + }, + { + "epoch": 0.09655153561226303, + "grad_norm": 1.341408610343933, + "learning_rate": 1.9925590131528998e-05, + "loss": 1.2481, + "step": 1766 + }, + { + "epoch": 0.09660620805598458, + "grad_norm": 1.5207229852676392, + "learning_rate": 1.9925367458474425e-05, + "loss": 1.5815, + "step": 1767 + }, + { + "epoch": 0.09666088049970614, + "grad_norm": 1.722517728805542, + "learning_rate": 1.992514445398854e-05, + "loss": 1.5422, + "step": 1768 + }, + { + "epoch": 0.0967155529434277, + "grad_norm": 1.5523806810379028, + "learning_rate": 1.9924921118078792e-05, + "loss": 1.708, + "step": 1769 + }, + { + "epoch": 0.09677022538714924, + "grad_norm": 1.6421241760253906, + "learning_rate": 1.9924697450752636e-05, + "loss": 1.3992, + "step": 1770 + }, + { + "epoch": 0.0968248978308708, + "grad_norm": 1.2706568241119385, + "learning_rate": 1.992447345201754e-05, + "loss": 1.5544, + "step": 1771 + }, + { + "epoch": 0.09687957027459235, + "grad_norm": 1.2883591651916504, + "learning_rate": 1.9924249121880993e-05, + "loss": 1.5576, + "step": 1772 + }, + { + "epoch": 0.0969342427183139, + "grad_norm": 1.8686045408248901, + "learning_rate": 1.992402446035048e-05, + "loss": 1.1654, + "step": 1773 + }, + { + "epoch": 0.09698891516203545, + "grad_norm": 2.1865928173065186, + "learning_rate": 1.99237994674335e-05, + "loss": 1.7203, + "step": 1774 + }, + { + "epoch": 0.09704358760575701, + "grad_norm": 1.6699177026748657, + "learning_rate": 1.992357414313757e-05, + "loss": 1.7919, + "step": 1775 + }, + { + "epoch": 0.09709826004947857, + "grad_norm": 1.5126926898956299, + "learning_rate": 1.9923348487470213e-05, + "loss": 1.3666, + "step": 1776 + }, + { + "epoch": 0.09715293249320012, + "grad_norm": 2.0696840286254883, + "learning_rate": 1.9923122500438964e-05, + "loss": 1.5757, + "step": 1777 + }, + { + "epoch": 0.09720760493692167, + "grad_norm": 1.6386672258377075, + "learning_rate": 1.992289618205137e-05, + "loss": 1.8223, + "step": 1778 + }, + { + "epoch": 0.09726227738064322, + "grad_norm": 1.4819684028625488, + "learning_rate": 1.9922669532314986e-05, + "loss": 1.343, + "step": 1779 + }, + { + "epoch": 0.09731694982436477, + "grad_norm": 1.6749706268310547, + "learning_rate": 1.9922442551237383e-05, + "loss": 1.556, + "step": 1780 + }, + { + "epoch": 0.09737162226808632, + "grad_norm": 1.3279800415039062, + "learning_rate": 1.9922215238826142e-05, + "loss": 1.4086, + "step": 1781 + }, + { + "epoch": 0.09742629471180789, + "grad_norm": 1.958772897720337, + "learning_rate": 1.9921987595088846e-05, + "loss": 1.3735, + "step": 1782 + }, + { + "epoch": 0.09748096715552944, + "grad_norm": 1.5829342603683472, + "learning_rate": 1.9921759620033105e-05, + "loss": 1.3656, + "step": 1783 + }, + { + "epoch": 0.09753563959925099, + "grad_norm": 1.5470174551010132, + "learning_rate": 1.9921531313666526e-05, + "loss": 1.48, + "step": 1784 + }, + { + "epoch": 0.09759031204297254, + "grad_norm": 1.7289918661117554, + "learning_rate": 1.9921302675996735e-05, + "loss": 1.6493, + "step": 1785 + }, + { + "epoch": 0.09764498448669409, + "grad_norm": 1.4466793537139893, + "learning_rate": 1.992107370703137e-05, + "loss": 1.7315, + "step": 1786 + }, + { + "epoch": 0.09769965693041564, + "grad_norm": 1.598184585571289, + "learning_rate": 1.992084440677807e-05, + "loss": 1.2979, + "step": 1787 + }, + { + "epoch": 0.0977543293741372, + "grad_norm": 1.419420838356018, + "learning_rate": 1.9920614775244495e-05, + "loss": 1.4191, + "step": 1788 + }, + { + "epoch": 0.09780900181785876, + "grad_norm": 1.5521754026412964, + "learning_rate": 1.9920384812438315e-05, + "loss": 1.4057, + "step": 1789 + }, + { + "epoch": 0.09786367426158031, + "grad_norm": 1.4221700429916382, + "learning_rate": 1.9920154518367206e-05, + "loss": 1.7167, + "step": 1790 + }, + { + "epoch": 0.09791834670530186, + "grad_norm": 1.3528461456298828, + "learning_rate": 1.9919923893038863e-05, + "loss": 1.5209, + "step": 1791 + }, + { + "epoch": 0.09797301914902341, + "grad_norm": 1.2933242321014404, + "learning_rate": 1.9919692936460978e-05, + "loss": 1.3302, + "step": 1792 + }, + { + "epoch": 0.09802769159274496, + "grad_norm": 1.4120041131973267, + "learning_rate": 1.991946164864127e-05, + "loss": 1.4029, + "step": 1793 + }, + { + "epoch": 0.09808236403646652, + "grad_norm": 1.6395173072814941, + "learning_rate": 1.9919230029587463e-05, + "loss": 1.4848, + "step": 1794 + }, + { + "epoch": 0.09813703648018807, + "grad_norm": 1.4251574277877808, + "learning_rate": 1.9918998079307286e-05, + "loss": 1.5539, + "step": 1795 + }, + { + "epoch": 0.09819170892390963, + "grad_norm": 1.8291261196136475, + "learning_rate": 1.9918765797808492e-05, + "loss": 1.2101, + "step": 1796 + }, + { + "epoch": 0.09824638136763118, + "grad_norm": 1.5372120141983032, + "learning_rate": 1.991853318509883e-05, + "loss": 1.7741, + "step": 1797 + }, + { + "epoch": 0.09830105381135273, + "grad_norm": 1.650926947593689, + "learning_rate": 1.991830024118607e-05, + "loss": 1.3348, + "step": 1798 + }, + { + "epoch": 0.09835572625507429, + "grad_norm": 1.3645391464233398, + "learning_rate": 1.9918066966077992e-05, + "loss": 1.4988, + "step": 1799 + }, + { + "epoch": 0.09841039869879584, + "grad_norm": 2.320249080657959, + "learning_rate": 1.9917833359782382e-05, + "loss": 1.4804, + "step": 1800 + }, + { + "epoch": 0.09846507114251739, + "grad_norm": 1.6973365545272827, + "learning_rate": 1.9917599422307047e-05, + "loss": 1.5744, + "step": 1801 + }, + { + "epoch": 0.09851974358623894, + "grad_norm": 2.1691582202911377, + "learning_rate": 1.9917365153659794e-05, + "loss": 1.2882, + "step": 1802 + }, + { + "epoch": 0.0985744160299605, + "grad_norm": 1.4036483764648438, + "learning_rate": 1.9917130553848445e-05, + "loss": 1.5605, + "step": 1803 + }, + { + "epoch": 0.09862908847368206, + "grad_norm": 1.5860317945480347, + "learning_rate": 1.9916895622880835e-05, + "loss": 1.5295, + "step": 1804 + }, + { + "epoch": 0.0986837609174036, + "grad_norm": 1.9985071420669556, + "learning_rate": 1.991666036076481e-05, + "loss": 1.6596, + "step": 1805 + }, + { + "epoch": 0.09873843336112516, + "grad_norm": 1.5699318647384644, + "learning_rate": 1.9916424767508226e-05, + "loss": 1.3979, + "step": 1806 + }, + { + "epoch": 0.09879310580484671, + "grad_norm": 1.3510178327560425, + "learning_rate": 1.991618884311895e-05, + "loss": 1.612, + "step": 1807 + }, + { + "epoch": 0.09884777824856826, + "grad_norm": 1.2920466661453247, + "learning_rate": 1.9915952587604857e-05, + "loss": 1.4012, + "step": 1808 + }, + { + "epoch": 0.09890245069228983, + "grad_norm": 1.652910828590393, + "learning_rate": 1.9915716000973844e-05, + "loss": 1.7308, + "step": 1809 + }, + { + "epoch": 0.09895712313601138, + "grad_norm": 1.5406733751296997, + "learning_rate": 1.9915479083233803e-05, + "loss": 1.3883, + "step": 1810 + }, + { + "epoch": 0.09901179557973293, + "grad_norm": 2.11116361618042, + "learning_rate": 1.991524183439265e-05, + "loss": 1.6188, + "step": 1811 + }, + { + "epoch": 0.09906646802345448, + "grad_norm": 1.3187652826309204, + "learning_rate": 1.99150042544583e-05, + "loss": 1.5449, + "step": 1812 + }, + { + "epoch": 0.09912114046717603, + "grad_norm": 1.7969388961791992, + "learning_rate": 1.9914766343438695e-05, + "loss": 1.8874, + "step": 1813 + }, + { + "epoch": 0.09917581291089758, + "grad_norm": 1.4672174453735352, + "learning_rate": 1.9914528101341773e-05, + "loss": 1.3793, + "step": 1814 + }, + { + "epoch": 0.09923048535461913, + "grad_norm": 1.4637141227722168, + "learning_rate": 1.9914289528175495e-05, + "loss": 1.429, + "step": 1815 + }, + { + "epoch": 0.0992851577983407, + "grad_norm": 1.2905339002609253, + "learning_rate": 1.9914050623947826e-05, + "loss": 1.2636, + "step": 1816 + }, + { + "epoch": 0.09933983024206225, + "grad_norm": 2.5875487327575684, + "learning_rate": 1.9913811388666742e-05, + "loss": 1.3668, + "step": 1817 + }, + { + "epoch": 0.0993945026857838, + "grad_norm": 1.3329970836639404, + "learning_rate": 1.991357182234023e-05, + "loss": 1.3659, + "step": 1818 + }, + { + "epoch": 0.09944917512950535, + "grad_norm": 1.633386492729187, + "learning_rate": 1.9913331924976295e-05, + "loss": 1.4671, + "step": 1819 + }, + { + "epoch": 0.0995038475732269, + "grad_norm": 1.4165468215942383, + "learning_rate": 1.9913091696582945e-05, + "loss": 1.4703, + "step": 1820 + }, + { + "epoch": 0.09955852001694845, + "grad_norm": 1.5559685230255127, + "learning_rate": 1.99128511371682e-05, + "loss": 1.578, + "step": 1821 + }, + { + "epoch": 0.09961319246067, + "grad_norm": 1.6190109252929688, + "learning_rate": 1.9912610246740095e-05, + "loss": 1.5799, + "step": 1822 + }, + { + "epoch": 0.09966786490439157, + "grad_norm": 1.3375427722930908, + "learning_rate": 1.991236902530667e-05, + "loss": 1.4691, + "step": 1823 + }, + { + "epoch": 0.09972253734811312, + "grad_norm": 1.638346791267395, + "learning_rate": 1.9912127472875986e-05, + "loss": 1.4836, + "step": 1824 + }, + { + "epoch": 0.09977720979183467, + "grad_norm": 1.782301664352417, + "learning_rate": 1.9911885589456107e-05, + "loss": 1.4234, + "step": 1825 + }, + { + "epoch": 0.09983188223555622, + "grad_norm": 1.3556112051010132, + "learning_rate": 1.991164337505511e-05, + "loss": 1.5004, + "step": 1826 + }, + { + "epoch": 0.09988655467927778, + "grad_norm": 1.999994158744812, + "learning_rate": 1.9911400829681075e-05, + "loss": 1.3773, + "step": 1827 + }, + { + "epoch": 0.09994122712299933, + "grad_norm": 1.1879932880401611, + "learning_rate": 1.9911157953342114e-05, + "loss": 1.2768, + "step": 1828 + }, + { + "epoch": 0.09999589956672088, + "grad_norm": 1.3970236778259277, + "learning_rate": 1.9910914746046333e-05, + "loss": 1.2469, + "step": 1829 + }, + { + "epoch": 0.10005057201044244, + "grad_norm": 1.3781545162200928, + "learning_rate": 1.9910671207801847e-05, + "loss": 1.4801, + "step": 1830 + }, + { + "epoch": 0.100105244454164, + "grad_norm": 1.4064985513687134, + "learning_rate": 1.9910427338616798e-05, + "loss": 1.496, + "step": 1831 + }, + { + "epoch": 0.10015991689788555, + "grad_norm": 1.6215204000473022, + "learning_rate": 1.9910183138499324e-05, + "loss": 1.3472, + "step": 1832 + }, + { + "epoch": 0.1002145893416071, + "grad_norm": 1.7643916606903076, + "learning_rate": 1.990993860745758e-05, + "loss": 1.5735, + "step": 1833 + }, + { + "epoch": 0.10026926178532865, + "grad_norm": 1.520466923713684, + "learning_rate": 1.990969374549973e-05, + "loss": 1.5593, + "step": 1834 + }, + { + "epoch": 0.1003239342290502, + "grad_norm": 1.6190546751022339, + "learning_rate": 1.9909448552633952e-05, + "loss": 1.4919, + "step": 1835 + }, + { + "epoch": 0.10037860667277175, + "grad_norm": 1.3602855205535889, + "learning_rate": 1.9909203028868432e-05, + "loss": 1.6262, + "step": 1836 + }, + { + "epoch": 0.10043327911649332, + "grad_norm": 1.2356830835342407, + "learning_rate": 1.9908957174211375e-05, + "loss": 1.6739, + "step": 1837 + }, + { + "epoch": 0.10048795156021487, + "grad_norm": 1.5200629234313965, + "learning_rate": 1.9908710988670983e-05, + "loss": 1.5454, + "step": 1838 + }, + { + "epoch": 0.10054262400393642, + "grad_norm": 1.47928786277771, + "learning_rate": 1.990846447225548e-05, + "loss": 1.4712, + "step": 1839 + }, + { + "epoch": 0.10059729644765797, + "grad_norm": 1.7835007905960083, + "learning_rate": 1.99082176249731e-05, + "loss": 1.5489, + "step": 1840 + }, + { + "epoch": 0.10065196889137952, + "grad_norm": 1.9779497385025024, + "learning_rate": 1.9907970446832076e-05, + "loss": 1.5858, + "step": 1841 + }, + { + "epoch": 0.10070664133510107, + "grad_norm": 1.787682056427002, + "learning_rate": 1.9907722937840676e-05, + "loss": 1.4256, + "step": 1842 + }, + { + "epoch": 0.10076131377882262, + "grad_norm": 1.2726397514343262, + "learning_rate": 1.9907475098007154e-05, + "loss": 1.6643, + "step": 1843 + }, + { + "epoch": 0.10081598622254419, + "grad_norm": 1.6254830360412598, + "learning_rate": 1.990722692733979e-05, + "loss": 1.4695, + "step": 1844 + }, + { + "epoch": 0.10087065866626574, + "grad_norm": 1.3130079507827759, + "learning_rate": 1.9906978425846876e-05, + "loss": 1.4725, + "step": 1845 + }, + { + "epoch": 0.10092533110998729, + "grad_norm": 1.1782548427581787, + "learning_rate": 1.9906729593536697e-05, + "loss": 1.5399, + "step": 1846 + }, + { + "epoch": 0.10098000355370884, + "grad_norm": 2.456939220428467, + "learning_rate": 1.9906480430417575e-05, + "loss": 1.3864, + "step": 1847 + }, + { + "epoch": 0.10103467599743039, + "grad_norm": 1.5808006525039673, + "learning_rate": 1.9906230936497825e-05, + "loss": 1.4651, + "step": 1848 + }, + { + "epoch": 0.10108934844115194, + "grad_norm": 1.4222190380096436, + "learning_rate": 1.9905981111785774e-05, + "loss": 1.5395, + "step": 1849 + }, + { + "epoch": 0.1011440208848735, + "grad_norm": 1.7769229412078857, + "learning_rate": 1.9905730956289772e-05, + "loss": 1.4358, + "step": 1850 + }, + { + "epoch": 0.10119869332859506, + "grad_norm": 1.7468829154968262, + "learning_rate": 1.9905480470018172e-05, + "loss": 1.3577, + "step": 1851 + }, + { + "epoch": 0.10125336577231661, + "grad_norm": 1.3842990398406982, + "learning_rate": 1.9905229652979332e-05, + "loss": 1.7013, + "step": 1852 + }, + { + "epoch": 0.10130803821603816, + "grad_norm": 1.5363842248916626, + "learning_rate": 1.990497850518163e-05, + "loss": 1.3709, + "step": 1853 + }, + { + "epoch": 0.10136271065975971, + "grad_norm": 1.6448169946670532, + "learning_rate": 1.9904727026633453e-05, + "loss": 1.6209, + "step": 1854 + }, + { + "epoch": 0.10141738310348127, + "grad_norm": 1.291101336479187, + "learning_rate": 1.99044752173432e-05, + "loss": 1.4597, + "step": 1855 + }, + { + "epoch": 0.10147205554720282, + "grad_norm": 1.5455677509307861, + "learning_rate": 1.9904223077319276e-05, + "loss": 1.6042, + "step": 1856 + }, + { + "epoch": 0.10152672799092437, + "grad_norm": 1.524499535560608, + "learning_rate": 1.99039706065701e-05, + "loss": 1.4888, + "step": 1857 + }, + { + "epoch": 0.10158140043464593, + "grad_norm": 2.072284698486328, + "learning_rate": 1.9903717805104112e-05, + "loss": 1.485, + "step": 1858 + }, + { + "epoch": 0.10163607287836748, + "grad_norm": 1.9991518259048462, + "learning_rate": 1.990346467292974e-05, + "loss": 1.583, + "step": 1859 + }, + { + "epoch": 0.10169074532208904, + "grad_norm": 1.3282586336135864, + "learning_rate": 1.990321121005545e-05, + "loss": 1.4885, + "step": 1860 + }, + { + "epoch": 0.10174541776581059, + "grad_norm": 1.298799991607666, + "learning_rate": 1.9902957416489693e-05, + "loss": 1.5427, + "step": 1861 + }, + { + "epoch": 0.10180009020953214, + "grad_norm": 1.6543488502502441, + "learning_rate": 1.9902703292240953e-05, + "loss": 1.5759, + "step": 1862 + }, + { + "epoch": 0.10185476265325369, + "grad_norm": 1.4751211404800415, + "learning_rate": 1.9902448837317712e-05, + "loss": 1.479, + "step": 1863 + }, + { + "epoch": 0.10190943509697525, + "grad_norm": 1.4142334461212158, + "learning_rate": 1.9902194051728466e-05, + "loss": 1.4756, + "step": 1864 + }, + { + "epoch": 0.1019641075406968, + "grad_norm": 3.468895435333252, + "learning_rate": 1.9901938935481727e-05, + "loss": 1.6977, + "step": 1865 + }, + { + "epoch": 0.10201877998441836, + "grad_norm": 1.992977261543274, + "learning_rate": 1.990168348858601e-05, + "loss": 1.2973, + "step": 1866 + }, + { + "epoch": 0.10207345242813991, + "grad_norm": 1.234774112701416, + "learning_rate": 1.9901427711049847e-05, + "loss": 1.6625, + "step": 1867 + }, + { + "epoch": 0.10212812487186146, + "grad_norm": 1.5017261505126953, + "learning_rate": 1.9901171602881778e-05, + "loss": 1.4466, + "step": 1868 + }, + { + "epoch": 0.10218279731558301, + "grad_norm": 1.5512694120407104, + "learning_rate": 1.9900915164090352e-05, + "loss": 1.3189, + "step": 1869 + }, + { + "epoch": 0.10223746975930456, + "grad_norm": 1.5173217058181763, + "learning_rate": 1.990065839468414e-05, + "loss": 1.5032, + "step": 1870 + }, + { + "epoch": 0.10229214220302613, + "grad_norm": 1.5251480340957642, + "learning_rate": 1.990040129467171e-05, + "loss": 1.4437, + "step": 1871 + }, + { + "epoch": 0.10234681464674768, + "grad_norm": 1.3362590074539185, + "learning_rate": 1.990014386406165e-05, + "loss": 1.5263, + "step": 1872 + }, + { + "epoch": 0.10240148709046923, + "grad_norm": 1.6607164144515991, + "learning_rate": 1.9899886102862554e-05, + "loss": 1.289, + "step": 1873 + }, + { + "epoch": 0.10245615953419078, + "grad_norm": 1.6196666955947876, + "learning_rate": 1.9899628011083028e-05, + "loss": 1.4849, + "step": 1874 + }, + { + "epoch": 0.10251083197791233, + "grad_norm": 1.8068593740463257, + "learning_rate": 1.9899369588731697e-05, + "loss": 1.7603, + "step": 1875 + }, + { + "epoch": 0.10256550442163388, + "grad_norm": 1.232242226600647, + "learning_rate": 1.9899110835817182e-05, + "loss": 1.4612, + "step": 1876 + }, + { + "epoch": 0.10262017686535543, + "grad_norm": 1.8396286964416504, + "learning_rate": 1.9898851752348128e-05, + "loss": 1.64, + "step": 1877 + }, + { + "epoch": 0.102674849309077, + "grad_norm": 1.588092565536499, + "learning_rate": 1.9898592338333187e-05, + "loss": 1.5692, + "step": 1878 + }, + { + "epoch": 0.10272952175279855, + "grad_norm": 2.3794422149658203, + "learning_rate": 1.989833259378102e-05, + "loss": 1.7307, + "step": 1879 + }, + { + "epoch": 0.1027841941965201, + "grad_norm": 1.7609647512435913, + "learning_rate": 1.98980725187003e-05, + "loss": 1.5798, + "step": 1880 + }, + { + "epoch": 0.10283886664024165, + "grad_norm": 2.0822904109954834, + "learning_rate": 1.989781211309971e-05, + "loss": 1.4635, + "step": 1881 + }, + { + "epoch": 0.1028935390839632, + "grad_norm": 1.427248239517212, + "learning_rate": 1.9897551376987948e-05, + "loss": 1.3029, + "step": 1882 + }, + { + "epoch": 0.10294821152768475, + "grad_norm": 2.082834005355835, + "learning_rate": 1.9897290310373722e-05, + "loss": 1.4991, + "step": 1883 + }, + { + "epoch": 0.1030028839714063, + "grad_norm": 1.6678619384765625, + "learning_rate": 1.989702891326575e-05, + "loss": 1.4678, + "step": 1884 + }, + { + "epoch": 0.10305755641512787, + "grad_norm": 1.557556390762329, + "learning_rate": 1.9896767185672755e-05, + "loss": 1.3074, + "step": 1885 + }, + { + "epoch": 0.10311222885884942, + "grad_norm": 1.8216004371643066, + "learning_rate": 1.989650512760348e-05, + "loss": 1.5519, + "step": 1886 + }, + { + "epoch": 0.10316690130257097, + "grad_norm": 1.4905747175216675, + "learning_rate": 1.9896242739066678e-05, + "loss": 1.4707, + "step": 1887 + }, + { + "epoch": 0.10322157374629252, + "grad_norm": 1.5599100589752197, + "learning_rate": 1.9895980020071106e-05, + "loss": 1.2333, + "step": 1888 + }, + { + "epoch": 0.10327624619001408, + "grad_norm": 1.6520663499832153, + "learning_rate": 1.9895716970625544e-05, + "loss": 1.3961, + "step": 1889 + }, + { + "epoch": 0.10333091863373563, + "grad_norm": 1.591796875, + "learning_rate": 1.9895453590738766e-05, + "loss": 1.582, + "step": 1890 + }, + { + "epoch": 0.10338559107745718, + "grad_norm": 1.581419825553894, + "learning_rate": 1.9895189880419576e-05, + "loss": 1.6571, + "step": 1891 + }, + { + "epoch": 0.10344026352117874, + "grad_norm": 1.403673768043518, + "learning_rate": 1.9894925839676774e-05, + "loss": 1.3764, + "step": 1892 + }, + { + "epoch": 0.1034949359649003, + "grad_norm": 1.4066708087921143, + "learning_rate": 1.989466146851918e-05, + "loss": 1.425, + "step": 1893 + }, + { + "epoch": 0.10354960840862185, + "grad_norm": 1.5084216594696045, + "learning_rate": 1.989439676695562e-05, + "loss": 1.6879, + "step": 1894 + }, + { + "epoch": 0.1036042808523434, + "grad_norm": 1.7873976230621338, + "learning_rate": 1.9894131734994935e-05, + "loss": 1.4415, + "step": 1895 + }, + { + "epoch": 0.10365895329606495, + "grad_norm": 1.2859348058700562, + "learning_rate": 1.9893866372645975e-05, + "loss": 1.4216, + "step": 1896 + }, + { + "epoch": 0.1037136257397865, + "grad_norm": 1.6825690269470215, + "learning_rate": 1.98936006799176e-05, + "loss": 1.3367, + "step": 1897 + }, + { + "epoch": 0.10376829818350805, + "grad_norm": 1.644470453262329, + "learning_rate": 1.989333465681868e-05, + "loss": 1.5398, + "step": 1898 + }, + { + "epoch": 0.10382297062722962, + "grad_norm": 1.7459882497787476, + "learning_rate": 1.98930683033581e-05, + "loss": 1.4494, + "step": 1899 + }, + { + "epoch": 0.10387764307095117, + "grad_norm": 1.3342891931533813, + "learning_rate": 1.9892801619544756e-05, + "loss": 1.6318, + "step": 1900 + }, + { + "epoch": 0.10393231551467272, + "grad_norm": 1.4402107000350952, + "learning_rate": 1.9892534605387555e-05, + "loss": 1.3846, + "step": 1901 + }, + { + "epoch": 0.10398698795839427, + "grad_norm": 1.4923096895217896, + "learning_rate": 1.9892267260895407e-05, + "loss": 1.6075, + "step": 1902 + }, + { + "epoch": 0.10404166040211582, + "grad_norm": 1.390077829360962, + "learning_rate": 1.989199958607724e-05, + "loss": 1.3879, + "step": 1903 + }, + { + "epoch": 0.10409633284583737, + "grad_norm": 1.3582713603973389, + "learning_rate": 1.9891731580942e-05, + "loss": 1.8431, + "step": 1904 + }, + { + "epoch": 0.10415100528955892, + "grad_norm": 1.3708487749099731, + "learning_rate": 1.9891463245498625e-05, + "loss": 1.5424, + "step": 1905 + }, + { + "epoch": 0.10420567773328049, + "grad_norm": 1.5430163145065308, + "learning_rate": 1.9891194579756082e-05, + "loss": 1.4499, + "step": 1906 + }, + { + "epoch": 0.10426035017700204, + "grad_norm": 1.660691261291504, + "learning_rate": 1.9890925583723345e-05, + "loss": 1.5574, + "step": 1907 + }, + { + "epoch": 0.10431502262072359, + "grad_norm": 1.7750089168548584, + "learning_rate": 1.9890656257409388e-05, + "loss": 1.4768, + "step": 1908 + }, + { + "epoch": 0.10436969506444514, + "grad_norm": 1.281873345375061, + "learning_rate": 1.9890386600823214e-05, + "loss": 1.5298, + "step": 1909 + }, + { + "epoch": 0.1044243675081667, + "grad_norm": 1.3206462860107422, + "learning_rate": 1.9890116613973822e-05, + "loss": 1.6014, + "step": 1910 + }, + { + "epoch": 0.10447903995188824, + "grad_norm": 2.001828193664551, + "learning_rate": 1.9889846296870228e-05, + "loss": 1.0937, + "step": 1911 + }, + { + "epoch": 0.10453371239560981, + "grad_norm": 1.3642475605010986, + "learning_rate": 1.9889575649521457e-05, + "loss": 1.5186, + "step": 1912 + }, + { + "epoch": 0.10458838483933136, + "grad_norm": 1.2979789972305298, + "learning_rate": 1.988930467193655e-05, + "loss": 1.3641, + "step": 1913 + }, + { + "epoch": 0.10464305728305291, + "grad_norm": 1.6791332960128784, + "learning_rate": 1.9889033364124555e-05, + "loss": 1.6666, + "step": 1914 + }, + { + "epoch": 0.10469772972677446, + "grad_norm": 1.5501844882965088, + "learning_rate": 1.988876172609453e-05, + "loss": 1.4691, + "step": 1915 + }, + { + "epoch": 0.10475240217049601, + "grad_norm": 1.3279465436935425, + "learning_rate": 1.9888489757855548e-05, + "loss": 1.4253, + "step": 1916 + }, + { + "epoch": 0.10480707461421757, + "grad_norm": 1.483973503112793, + "learning_rate": 1.9888217459416685e-05, + "loss": 1.7022, + "step": 1917 + }, + { + "epoch": 0.10486174705793912, + "grad_norm": 1.6170936822891235, + "learning_rate": 1.9887944830787042e-05, + "loss": 1.2433, + "step": 1918 + }, + { + "epoch": 0.10491641950166068, + "grad_norm": 1.277882695198059, + "learning_rate": 1.9887671871975716e-05, + "loss": 1.4088, + "step": 1919 + }, + { + "epoch": 0.10497109194538223, + "grad_norm": 1.9573445320129395, + "learning_rate": 1.9887398582991825e-05, + "loss": 1.436, + "step": 1920 + }, + { + "epoch": 0.10502576438910378, + "grad_norm": 1.3841543197631836, + "learning_rate": 1.988712496384449e-05, + "loss": 1.7111, + "step": 1921 + }, + { + "epoch": 0.10508043683282534, + "grad_norm": 1.770498275756836, + "learning_rate": 1.9886851014542855e-05, + "loss": 1.4209, + "step": 1922 + }, + { + "epoch": 0.10513510927654689, + "grad_norm": 1.505649209022522, + "learning_rate": 1.9886576735096064e-05, + "loss": 1.2562, + "step": 1923 + }, + { + "epoch": 0.10518978172026844, + "grad_norm": 1.2867196798324585, + "learning_rate": 1.9886302125513276e-05, + "loss": 1.6046, + "step": 1924 + }, + { + "epoch": 0.10524445416398999, + "grad_norm": 1.7925083637237549, + "learning_rate": 1.988602718580366e-05, + "loss": 1.6019, + "step": 1925 + }, + { + "epoch": 0.10529912660771155, + "grad_norm": 1.7155938148498535, + "learning_rate": 1.9885751915976402e-05, + "loss": 1.4376, + "step": 1926 + }, + { + "epoch": 0.1053537990514331, + "grad_norm": 1.8914211988449097, + "learning_rate": 1.9885476316040683e-05, + "loss": 1.4935, + "step": 1927 + }, + { + "epoch": 0.10540847149515466, + "grad_norm": 1.058283805847168, + "learning_rate": 1.988520038600572e-05, + "loss": 1.6977, + "step": 1928 + }, + { + "epoch": 0.10546314393887621, + "grad_norm": 2.0817065238952637, + "learning_rate": 1.9884924125880713e-05, + "loss": 1.6194, + "step": 1929 + }, + { + "epoch": 0.10551781638259776, + "grad_norm": 1.7531912326812744, + "learning_rate": 1.98846475356749e-05, + "loss": 1.5137, + "step": 1930 + }, + { + "epoch": 0.10557248882631931, + "grad_norm": 1.871216058731079, + "learning_rate": 1.9884370615397507e-05, + "loss": 1.2821, + "step": 1931 + }, + { + "epoch": 0.10562716127004086, + "grad_norm": 1.6149303913116455, + "learning_rate": 1.9884093365057786e-05, + "loss": 1.7199, + "step": 1932 + }, + { + "epoch": 0.10568183371376243, + "grad_norm": 1.4858341217041016, + "learning_rate": 1.9883815784664992e-05, + "loss": 1.737, + "step": 1933 + }, + { + "epoch": 0.10573650615748398, + "grad_norm": 1.6470320224761963, + "learning_rate": 1.9883537874228402e-05, + "loss": 1.424, + "step": 1934 + }, + { + "epoch": 0.10579117860120553, + "grad_norm": 1.5587856769561768, + "learning_rate": 1.9883259633757282e-05, + "loss": 1.3876, + "step": 1935 + }, + { + "epoch": 0.10584585104492708, + "grad_norm": 1.8033496141433716, + "learning_rate": 1.9882981063260934e-05, + "loss": 1.4657, + "step": 1936 + }, + { + "epoch": 0.10590052348864863, + "grad_norm": 1.409440517425537, + "learning_rate": 1.9882702162748657e-05, + "loss": 1.4171, + "step": 1937 + }, + { + "epoch": 0.10595519593237018, + "grad_norm": 1.5167649984359741, + "learning_rate": 1.9882422932229765e-05, + "loss": 1.6058, + "step": 1938 + }, + { + "epoch": 0.10600986837609173, + "grad_norm": 1.6418956518173218, + "learning_rate": 1.9882143371713583e-05, + "loss": 1.7459, + "step": 1939 + }, + { + "epoch": 0.1060645408198133, + "grad_norm": 1.8523911237716675, + "learning_rate": 1.9881863481209442e-05, + "loss": 1.6215, + "step": 1940 + }, + { + "epoch": 0.10611921326353485, + "grad_norm": 1.3087043762207031, + "learning_rate": 1.9881583260726692e-05, + "loss": 1.4779, + "step": 1941 + }, + { + "epoch": 0.1061738857072564, + "grad_norm": 2.2791523933410645, + "learning_rate": 1.988130271027469e-05, + "loss": 1.4668, + "step": 1942 + }, + { + "epoch": 0.10622855815097795, + "grad_norm": 1.5564756393432617, + "learning_rate": 1.9881021829862802e-05, + "loss": 1.242, + "step": 1943 + }, + { + "epoch": 0.1062832305946995, + "grad_norm": 1.5333411693572998, + "learning_rate": 1.9880740619500406e-05, + "loss": 1.5141, + "step": 1944 + }, + { + "epoch": 0.10633790303842106, + "grad_norm": 1.8144499063491821, + "learning_rate": 1.9880459079196898e-05, + "loss": 1.4011, + "step": 1945 + }, + { + "epoch": 0.1063925754821426, + "grad_norm": 1.4512852430343628, + "learning_rate": 1.9880177208961676e-05, + "loss": 1.1836, + "step": 1946 + }, + { + "epoch": 0.10644724792586417, + "grad_norm": 1.5164874792099, + "learning_rate": 1.9879895008804154e-05, + "loss": 1.4249, + "step": 1947 + }, + { + "epoch": 0.10650192036958572, + "grad_norm": 1.3745375871658325, + "learning_rate": 1.9879612478733753e-05, + "loss": 1.5694, + "step": 1948 + }, + { + "epoch": 0.10655659281330727, + "grad_norm": 1.490525722503662, + "learning_rate": 1.9879329618759903e-05, + "loss": 1.6197, + "step": 1949 + }, + { + "epoch": 0.10661126525702883, + "grad_norm": 1.6287577152252197, + "learning_rate": 1.987904642889206e-05, + "loss": 1.4735, + "step": 1950 + }, + { + "epoch": 0.10666593770075038, + "grad_norm": 1.3787232637405396, + "learning_rate": 1.9878762909139673e-05, + "loss": 1.6855, + "step": 1951 + }, + { + "epoch": 0.10672061014447193, + "grad_norm": 1.4952980279922485, + "learning_rate": 1.9878479059512212e-05, + "loss": 1.3355, + "step": 1952 + }, + { + "epoch": 0.10677528258819348, + "grad_norm": 1.525322437286377, + "learning_rate": 1.9878194880019154e-05, + "loss": 1.6066, + "step": 1953 + }, + { + "epoch": 0.10682995503191504, + "grad_norm": 1.2198233604431152, + "learning_rate": 1.987791037066999e-05, + "loss": 1.6105, + "step": 1954 + }, + { + "epoch": 0.1068846274756366, + "grad_norm": 1.7355738878250122, + "learning_rate": 1.9877625531474217e-05, + "loss": 1.5543, + "step": 1955 + }, + { + "epoch": 0.10693929991935815, + "grad_norm": 1.660089373588562, + "learning_rate": 1.9877340362441352e-05, + "loss": 1.3806, + "step": 1956 + }, + { + "epoch": 0.1069939723630797, + "grad_norm": 2.3591580390930176, + "learning_rate": 1.9877054863580912e-05, + "loss": 1.2683, + "step": 1957 + }, + { + "epoch": 0.10704864480680125, + "grad_norm": 1.8722151517868042, + "learning_rate": 1.987676903490243e-05, + "loss": 1.5407, + "step": 1958 + }, + { + "epoch": 0.1071033172505228, + "grad_norm": 1.5725215673446655, + "learning_rate": 1.987648287641546e-05, + "loss": 1.5885, + "step": 1959 + }, + { + "epoch": 0.10715798969424435, + "grad_norm": 1.3819406032562256, + "learning_rate": 1.9876196388129548e-05, + "loss": 1.4914, + "step": 1960 + }, + { + "epoch": 0.10721266213796592, + "grad_norm": 1.449741005897522, + "learning_rate": 1.9875909570054263e-05, + "loss": 1.6986, + "step": 1961 + }, + { + "epoch": 0.10726733458168747, + "grad_norm": 1.2723464965820312, + "learning_rate": 1.9875622422199185e-05, + "loss": 1.4863, + "step": 1962 + }, + { + "epoch": 0.10732200702540902, + "grad_norm": 1.4490132331848145, + "learning_rate": 1.98753349445739e-05, + "loss": 1.45, + "step": 1963 + }, + { + "epoch": 0.10737667946913057, + "grad_norm": 1.2839951515197754, + "learning_rate": 1.9875047137188005e-05, + "loss": 1.5723, + "step": 1964 + }, + { + "epoch": 0.10743135191285212, + "grad_norm": 2.0710246562957764, + "learning_rate": 1.9874759000051113e-05, + "loss": 1.4764, + "step": 1965 + }, + { + "epoch": 0.10748602435657367, + "grad_norm": 1.4215987920761108, + "learning_rate": 1.987447053317285e-05, + "loss": 1.4593, + "step": 1966 + }, + { + "epoch": 0.10754069680029524, + "grad_norm": 1.5398881435394287, + "learning_rate": 1.9874181736562844e-05, + "loss": 1.5755, + "step": 1967 + }, + { + "epoch": 0.10759536924401679, + "grad_norm": 1.3654991388320923, + "learning_rate": 1.987389261023074e-05, + "loss": 1.5639, + "step": 1968 + }, + { + "epoch": 0.10765004168773834, + "grad_norm": 1.6712336540222168, + "learning_rate": 1.9873603154186187e-05, + "loss": 1.226, + "step": 1969 + }, + { + "epoch": 0.10770471413145989, + "grad_norm": 1.6026581525802612, + "learning_rate": 1.987331336843886e-05, + "loss": 1.6237, + "step": 1970 + }, + { + "epoch": 0.10775938657518144, + "grad_norm": 1.4360454082489014, + "learning_rate": 1.9873023252998432e-05, + "loss": 1.4259, + "step": 1971 + }, + { + "epoch": 0.107814059018903, + "grad_norm": 1.4054926633834839, + "learning_rate": 1.9872732807874588e-05, + "loss": 1.2498, + "step": 1972 + }, + { + "epoch": 0.10786873146262455, + "grad_norm": 1.642930030822754, + "learning_rate": 1.9872442033077027e-05, + "loss": 1.4641, + "step": 1973 + }, + { + "epoch": 0.10792340390634611, + "grad_norm": 1.4812490940093994, + "learning_rate": 1.987215092861546e-05, + "loss": 1.4924, + "step": 1974 + }, + { + "epoch": 0.10797807635006766, + "grad_norm": 1.7584837675094604, + "learning_rate": 1.9871859494499613e-05, + "loss": 1.3393, + "step": 1975 + }, + { + "epoch": 0.10803274879378921, + "grad_norm": 1.7084780931472778, + "learning_rate": 1.9871567730739207e-05, + "loss": 1.2297, + "step": 1976 + }, + { + "epoch": 0.10808742123751076, + "grad_norm": 1.7051128149032593, + "learning_rate": 1.987127563734399e-05, + "loss": 1.1538, + "step": 1977 + }, + { + "epoch": 0.10814209368123232, + "grad_norm": 1.2738937139511108, + "learning_rate": 1.987098321432372e-05, + "loss": 1.4833, + "step": 1978 + }, + { + "epoch": 0.10819676612495387, + "grad_norm": 1.649613857269287, + "learning_rate": 1.9870690461688154e-05, + "loss": 1.6781, + "step": 1979 + }, + { + "epoch": 0.10825143856867542, + "grad_norm": 1.7914597988128662, + "learning_rate": 1.9870397379447074e-05, + "loss": 1.5661, + "step": 1980 + }, + { + "epoch": 0.10830611101239698, + "grad_norm": 1.4925309419631958, + "learning_rate": 1.9870103967610262e-05, + "loss": 1.4092, + "step": 1981 + }, + { + "epoch": 0.10836078345611853, + "grad_norm": 1.8544085025787354, + "learning_rate": 1.9869810226187516e-05, + "loss": 1.632, + "step": 1982 + }, + { + "epoch": 0.10841545589984009, + "grad_norm": 1.940078854560852, + "learning_rate": 1.9869516155188647e-05, + "loss": 1.5667, + "step": 1983 + }, + { + "epoch": 0.10847012834356164, + "grad_norm": 2.0641887187957764, + "learning_rate": 1.986922175462348e-05, + "loss": 1.5259, + "step": 1984 + }, + { + "epoch": 0.10852480078728319, + "grad_norm": 1.7192715406417847, + "learning_rate": 1.9868927024501833e-05, + "loss": 1.7167, + "step": 1985 + }, + { + "epoch": 0.10857947323100474, + "grad_norm": 1.3301938772201538, + "learning_rate": 1.9868631964833556e-05, + "loss": 1.4773, + "step": 1986 + }, + { + "epoch": 0.10863414567472629, + "grad_norm": 1.4619770050048828, + "learning_rate": 1.98683365756285e-05, + "loss": 1.4883, + "step": 1987 + }, + { + "epoch": 0.10868881811844786, + "grad_norm": 1.8541144132614136, + "learning_rate": 1.986804085689653e-05, + "loss": 1.6216, + "step": 1988 + }, + { + "epoch": 0.1087434905621694, + "grad_norm": 1.5250048637390137, + "learning_rate": 1.9867744808647518e-05, + "loss": 1.6333, + "step": 1989 + }, + { + "epoch": 0.10879816300589096, + "grad_norm": 1.7292821407318115, + "learning_rate": 1.9867448430891353e-05, + "loss": 1.5681, + "step": 1990 + }, + { + "epoch": 0.10885283544961251, + "grad_norm": 1.5866307020187378, + "learning_rate": 1.986715172363793e-05, + "loss": 1.4887, + "step": 1991 + }, + { + "epoch": 0.10890750789333406, + "grad_norm": 1.6618609428405762, + "learning_rate": 1.9866854686897156e-05, + "loss": 1.6184, + "step": 1992 + }, + { + "epoch": 0.10896218033705561, + "grad_norm": 1.903910517692566, + "learning_rate": 1.9866557320678952e-05, + "loss": 1.4971, + "step": 1993 + }, + { + "epoch": 0.10901685278077716, + "grad_norm": 1.8692442178726196, + "learning_rate": 1.9866259624993246e-05, + "loss": 1.6093, + "step": 1994 + }, + { + "epoch": 0.10907152522449873, + "grad_norm": 2.102043628692627, + "learning_rate": 1.986596159984998e-05, + "loss": 1.6427, + "step": 1995 + }, + { + "epoch": 0.10912619766822028, + "grad_norm": 1.2392053604125977, + "learning_rate": 1.9865663245259105e-05, + "loss": 1.4966, + "step": 1996 + }, + { + "epoch": 0.10918087011194183, + "grad_norm": 1.5467495918273926, + "learning_rate": 1.9865364561230583e-05, + "loss": 1.5707, + "step": 1997 + }, + { + "epoch": 0.10923554255566338, + "grad_norm": 1.5546190738677979, + "learning_rate": 1.9865065547774386e-05, + "loss": 1.3284, + "step": 1998 + }, + { + "epoch": 0.10929021499938493, + "grad_norm": 1.685261845588684, + "learning_rate": 1.9864766204900506e-05, + "loss": 1.3546, + "step": 1999 + }, + { + "epoch": 0.10934488744310648, + "grad_norm": 1.7556487321853638, + "learning_rate": 1.986446653261893e-05, + "loss": 1.621, + "step": 2000 + }, + { + "epoch": 0.10939955988682804, + "grad_norm": 1.653738021850586, + "learning_rate": 1.986416653093967e-05, + "loss": 1.5974, + "step": 2001 + }, + { + "epoch": 0.1094542323305496, + "grad_norm": 1.4268683195114136, + "learning_rate": 1.9863866199872747e-05, + "loss": 1.6652, + "step": 2002 + }, + { + "epoch": 0.10950890477427115, + "grad_norm": 1.527908444404602, + "learning_rate": 1.9863565539428177e-05, + "loss": 1.6076, + "step": 2003 + }, + { + "epoch": 0.1095635772179927, + "grad_norm": 1.5152937173843384, + "learning_rate": 1.9863264549616015e-05, + "loss": 1.6974, + "step": 2004 + }, + { + "epoch": 0.10961824966171425, + "grad_norm": 1.709021806716919, + "learning_rate": 1.9862963230446303e-05, + "loss": 1.5193, + "step": 2005 + }, + { + "epoch": 0.1096729221054358, + "grad_norm": 1.3694864511489868, + "learning_rate": 1.9862661581929103e-05, + "loss": 1.5212, + "step": 2006 + }, + { + "epoch": 0.10972759454915736, + "grad_norm": 1.6903427839279175, + "learning_rate": 1.986235960407449e-05, + "loss": 1.7547, + "step": 2007 + }, + { + "epoch": 0.10978226699287891, + "grad_norm": 1.238234519958496, + "learning_rate": 1.9862057296892546e-05, + "loss": 1.5018, + "step": 2008 + }, + { + "epoch": 0.10983693943660047, + "grad_norm": 1.5540999174118042, + "learning_rate": 1.986175466039337e-05, + "loss": 1.3569, + "step": 2009 + }, + { + "epoch": 0.10989161188032202, + "grad_norm": 1.8067352771759033, + "learning_rate": 1.9861451694587063e-05, + "loss": 1.5459, + "step": 2010 + }, + { + "epoch": 0.10994628432404357, + "grad_norm": 2.119537830352783, + "learning_rate": 1.9861148399483743e-05, + "loss": 1.5694, + "step": 2011 + }, + { + "epoch": 0.11000095676776513, + "grad_norm": 1.674353837966919, + "learning_rate": 1.9860844775093536e-05, + "loss": 1.6889, + "step": 2012 + }, + { + "epoch": 0.11005562921148668, + "grad_norm": 1.7046860456466675, + "learning_rate": 1.9860540821426582e-05, + "loss": 1.4794, + "step": 2013 + }, + { + "epoch": 0.11011030165520823, + "grad_norm": 1.6580415964126587, + "learning_rate": 1.9860236538493036e-05, + "loss": 1.5969, + "step": 2014 + }, + { + "epoch": 0.1101649740989298, + "grad_norm": 1.9132863283157349, + "learning_rate": 1.985993192630305e-05, + "loss": 1.473, + "step": 2015 + }, + { + "epoch": 0.11021964654265134, + "grad_norm": 1.1810195446014404, + "learning_rate": 1.9859626984866804e-05, + "loss": 1.5098, + "step": 2016 + }, + { + "epoch": 0.1102743189863729, + "grad_norm": 1.3607548475265503, + "learning_rate": 1.9859321714194477e-05, + "loss": 1.5369, + "step": 2017 + }, + { + "epoch": 0.11032899143009445, + "grad_norm": 1.8367305994033813, + "learning_rate": 1.985901611429626e-05, + "loss": 1.2316, + "step": 2018 + }, + { + "epoch": 0.110383663873816, + "grad_norm": 1.4988521337509155, + "learning_rate": 1.985871018518236e-05, + "loss": 1.3677, + "step": 2019 + }, + { + "epoch": 0.11043833631753755, + "grad_norm": 1.4123139381408691, + "learning_rate": 1.985840392686299e-05, + "loss": 1.5134, + "step": 2020 + }, + { + "epoch": 0.1104930087612591, + "grad_norm": 1.4151808023452759, + "learning_rate": 1.9858097339348386e-05, + "loss": 1.4265, + "step": 2021 + }, + { + "epoch": 0.11054768120498067, + "grad_norm": 1.1807364225387573, + "learning_rate": 1.9857790422648774e-05, + "loss": 1.4532, + "step": 2022 + }, + { + "epoch": 0.11060235364870222, + "grad_norm": 1.439867615699768, + "learning_rate": 1.9857483176774412e-05, + "loss": 1.3182, + "step": 2023 + }, + { + "epoch": 0.11065702609242377, + "grad_norm": 1.8043886423110962, + "learning_rate": 1.9857175601735548e-05, + "loss": 1.4052, + "step": 2024 + }, + { + "epoch": 0.11071169853614532, + "grad_norm": 1.4147120714187622, + "learning_rate": 1.9856867697542467e-05, + "loss": 1.3637, + "step": 2025 + }, + { + "epoch": 0.11076637097986687, + "grad_norm": 1.5430105924606323, + "learning_rate": 1.9856559464205443e-05, + "loss": 1.5892, + "step": 2026 + }, + { + "epoch": 0.11082104342358842, + "grad_norm": 1.431301474571228, + "learning_rate": 1.9856250901734767e-05, + "loss": 1.6436, + "step": 2027 + }, + { + "epoch": 0.11087571586730997, + "grad_norm": 1.7195186614990234, + "learning_rate": 1.985594201014075e-05, + "loss": 1.2582, + "step": 2028 + }, + { + "epoch": 0.11093038831103154, + "grad_norm": 1.7885342836380005, + "learning_rate": 1.9855632789433695e-05, + "loss": 1.4478, + "step": 2029 + }, + { + "epoch": 0.11098506075475309, + "grad_norm": 1.441620945930481, + "learning_rate": 1.9855323239623936e-05, + "loss": 1.4521, + "step": 2030 + }, + { + "epoch": 0.11103973319847464, + "grad_norm": 1.4034147262573242, + "learning_rate": 1.9855013360721806e-05, + "loss": 1.3848, + "step": 2031 + }, + { + "epoch": 0.11109440564219619, + "grad_norm": 1.4102839231491089, + "learning_rate": 1.985470315273766e-05, + "loss": 1.5078, + "step": 2032 + }, + { + "epoch": 0.11114907808591774, + "grad_norm": 1.1605979204177856, + "learning_rate": 1.9854392615681845e-05, + "loss": 1.4279, + "step": 2033 + }, + { + "epoch": 0.1112037505296393, + "grad_norm": 1.222146987915039, + "learning_rate": 1.985408174956474e-05, + "loss": 1.5348, + "step": 2034 + }, + { + "epoch": 0.11125842297336085, + "grad_norm": 1.905659556388855, + "learning_rate": 1.9853770554396722e-05, + "loss": 1.8059, + "step": 2035 + }, + { + "epoch": 0.11131309541708241, + "grad_norm": 1.6361976861953735, + "learning_rate": 1.9853459030188183e-05, + "loss": 1.5305, + "step": 2036 + }, + { + "epoch": 0.11136776786080396, + "grad_norm": 1.4142802953720093, + "learning_rate": 1.9853147176949523e-05, + "loss": 1.6311, + "step": 2037 + }, + { + "epoch": 0.11142244030452551, + "grad_norm": 1.6125863790512085, + "learning_rate": 1.985283499469116e-05, + "loss": 1.4707, + "step": 2038 + }, + { + "epoch": 0.11147711274824706, + "grad_norm": 1.3306326866149902, + "learning_rate": 1.9852522483423513e-05, + "loss": 1.6045, + "step": 2039 + }, + { + "epoch": 0.11153178519196862, + "grad_norm": 1.5955272912979126, + "learning_rate": 1.985220964315702e-05, + "loss": 1.506, + "step": 2040 + }, + { + "epoch": 0.11158645763569017, + "grad_norm": 1.4516217708587646, + "learning_rate": 1.985189647390213e-05, + "loss": 1.3509, + "step": 2041 + }, + { + "epoch": 0.11164113007941172, + "grad_norm": 1.6359407901763916, + "learning_rate": 1.9851582975669302e-05, + "loss": 1.2423, + "step": 2042 + }, + { + "epoch": 0.11169580252313328, + "grad_norm": 1.8766796588897705, + "learning_rate": 1.9851269148468998e-05, + "loss": 1.3698, + "step": 2043 + }, + { + "epoch": 0.11175047496685483, + "grad_norm": 1.506264328956604, + "learning_rate": 1.98509549923117e-05, + "loss": 1.5484, + "step": 2044 + }, + { + "epoch": 0.11180514741057639, + "grad_norm": 1.6138224601745605, + "learning_rate": 1.9850640507207898e-05, + "loss": 1.3606, + "step": 2045 + }, + { + "epoch": 0.11185981985429794, + "grad_norm": 1.5309417247772217, + "learning_rate": 1.9850325693168098e-05, + "loss": 1.5063, + "step": 2046 + }, + { + "epoch": 0.11191449229801949, + "grad_norm": 1.776298999786377, + "learning_rate": 1.9850010550202806e-05, + "loss": 1.4306, + "step": 2047 + }, + { + "epoch": 0.11196916474174104, + "grad_norm": 1.62631094455719, + "learning_rate": 1.9849695078322545e-05, + "loss": 1.5946, + "step": 2048 + }, + { + "epoch": 0.11202383718546259, + "grad_norm": 1.566521167755127, + "learning_rate": 1.9849379277537856e-05, + "loss": 1.6732, + "step": 2049 + }, + { + "epoch": 0.11207850962918416, + "grad_norm": 1.3238890171051025, + "learning_rate": 1.9849063147859282e-05, + "loss": 1.5466, + "step": 2050 + }, + { + "epoch": 0.11213318207290571, + "grad_norm": 1.7273297309875488, + "learning_rate": 1.9848746689297375e-05, + "loss": 1.4634, + "step": 2051 + }, + { + "epoch": 0.11218785451662726, + "grad_norm": 2.2386398315429688, + "learning_rate": 1.9848429901862705e-05, + "loss": 1.5805, + "step": 2052 + }, + { + "epoch": 0.11224252696034881, + "grad_norm": 2.047318696975708, + "learning_rate": 1.984811278556585e-05, + "loss": 1.6889, + "step": 2053 + }, + { + "epoch": 0.11229719940407036, + "grad_norm": 1.7086714506149292, + "learning_rate": 1.9847795340417405e-05, + "loss": 1.365, + "step": 2054 + }, + { + "epoch": 0.11235187184779191, + "grad_norm": 1.6823269128799438, + "learning_rate": 1.984747756642796e-05, + "loss": 1.6357, + "step": 2055 + }, + { + "epoch": 0.11240654429151346, + "grad_norm": 1.6030070781707764, + "learning_rate": 1.9847159463608132e-05, + "loss": 1.458, + "step": 2056 + }, + { + "epoch": 0.11246121673523503, + "grad_norm": 1.2017884254455566, + "learning_rate": 1.9846841031968545e-05, + "loss": 1.3652, + "step": 2057 + }, + { + "epoch": 0.11251588917895658, + "grad_norm": 1.9270284175872803, + "learning_rate": 1.9846522271519827e-05, + "loss": 1.5838, + "step": 2058 + }, + { + "epoch": 0.11257056162267813, + "grad_norm": 1.3303565979003906, + "learning_rate": 1.9846203182272625e-05, + "loss": 1.3567, + "step": 2059 + }, + { + "epoch": 0.11262523406639968, + "grad_norm": 1.4311931133270264, + "learning_rate": 1.9845883764237594e-05, + "loss": 1.6174, + "step": 2060 + }, + { + "epoch": 0.11267990651012123, + "grad_norm": 1.7621846199035645, + "learning_rate": 1.98455640174254e-05, + "loss": 1.3783, + "step": 2061 + }, + { + "epoch": 0.11273457895384278, + "grad_norm": 1.3259841203689575, + "learning_rate": 1.984524394184672e-05, + "loss": 1.5557, + "step": 2062 + }, + { + "epoch": 0.11278925139756434, + "grad_norm": 1.5767921209335327, + "learning_rate": 1.9844923537512245e-05, + "loss": 1.3342, + "step": 2063 + }, + { + "epoch": 0.1128439238412859, + "grad_norm": 1.3794829845428467, + "learning_rate": 1.9844602804432667e-05, + "loss": 1.6597, + "step": 2064 + }, + { + "epoch": 0.11289859628500745, + "grad_norm": 1.1796433925628662, + "learning_rate": 1.98442817426187e-05, + "loss": 1.4652, + "step": 2065 + }, + { + "epoch": 0.112953268728729, + "grad_norm": 1.4543876647949219, + "learning_rate": 1.9843960352081068e-05, + "loss": 1.6197, + "step": 2066 + }, + { + "epoch": 0.11300794117245055, + "grad_norm": 2.151524305343628, + "learning_rate": 1.9843638632830504e-05, + "loss": 1.5211, + "step": 2067 + }, + { + "epoch": 0.1130626136161721, + "grad_norm": 1.4451994895935059, + "learning_rate": 1.9843316584877738e-05, + "loss": 1.4694, + "step": 2068 + }, + { + "epoch": 0.11311728605989366, + "grad_norm": 1.4879003763198853, + "learning_rate": 1.9842994208233543e-05, + "loss": 1.5428, + "step": 2069 + }, + { + "epoch": 0.11317195850361522, + "grad_norm": 1.3844022750854492, + "learning_rate": 1.9842671502908665e-05, + "loss": 1.5028, + "step": 2070 + }, + { + "epoch": 0.11322663094733677, + "grad_norm": 1.6953930854797363, + "learning_rate": 1.9842348468913895e-05, + "loss": 1.8825, + "step": 2071 + }, + { + "epoch": 0.11328130339105832, + "grad_norm": 1.3885602951049805, + "learning_rate": 1.984202510626001e-05, + "loss": 1.3923, + "step": 2072 + }, + { + "epoch": 0.11333597583477988, + "grad_norm": 1.2985951900482178, + "learning_rate": 1.9841701414957815e-05, + "loss": 1.5142, + "step": 2073 + }, + { + "epoch": 0.11339064827850143, + "grad_norm": 1.4009872674942017, + "learning_rate": 1.9841377395018114e-05, + "loss": 1.427, + "step": 2074 + }, + { + "epoch": 0.11344532072222298, + "grad_norm": 1.3317384719848633, + "learning_rate": 1.9841053046451728e-05, + "loss": 1.404, + "step": 2075 + }, + { + "epoch": 0.11349999316594453, + "grad_norm": 1.6107007265090942, + "learning_rate": 1.984072836926949e-05, + "loss": 1.4625, + "step": 2076 + }, + { + "epoch": 0.1135546656096661, + "grad_norm": 1.2963685989379883, + "learning_rate": 1.9840403363482236e-05, + "loss": 1.5763, + "step": 2077 + }, + { + "epoch": 0.11360933805338765, + "grad_norm": 1.4295670986175537, + "learning_rate": 1.9840078029100826e-05, + "loss": 1.3702, + "step": 2078 + }, + { + "epoch": 0.1136640104971092, + "grad_norm": 1.9465328454971313, + "learning_rate": 1.983975236613612e-05, + "loss": 1.3191, + "step": 2079 + }, + { + "epoch": 0.11371868294083075, + "grad_norm": 1.6340664625167847, + "learning_rate": 1.983942637459899e-05, + "loss": 1.5678, + "step": 2080 + }, + { + "epoch": 0.1137733553845523, + "grad_norm": 1.887239694595337, + "learning_rate": 1.9839100054500324e-05, + "loss": 1.3828, + "step": 2081 + }, + { + "epoch": 0.11382802782827385, + "grad_norm": 2.162567138671875, + "learning_rate": 1.9838773405851022e-05, + "loss": 1.546, + "step": 2082 + }, + { + "epoch": 0.1138827002719954, + "grad_norm": 1.3604918718338013, + "learning_rate": 1.9838446428661988e-05, + "loss": 1.4962, + "step": 2083 + }, + { + "epoch": 0.11393737271571697, + "grad_norm": 1.2877616882324219, + "learning_rate": 1.9838119122944143e-05, + "loss": 1.4671, + "step": 2084 + }, + { + "epoch": 0.11399204515943852, + "grad_norm": 1.566432237625122, + "learning_rate": 1.983779148870841e-05, + "loss": 1.4016, + "step": 2085 + }, + { + "epoch": 0.11404671760316007, + "grad_norm": 1.1513621807098389, + "learning_rate": 1.9837463525965735e-05, + "loss": 1.6298, + "step": 2086 + }, + { + "epoch": 0.11410139004688162, + "grad_norm": 1.8664813041687012, + "learning_rate": 1.983713523472707e-05, + "loss": 1.4574, + "step": 2087 + }, + { + "epoch": 0.11415606249060317, + "grad_norm": 1.8783109188079834, + "learning_rate": 1.983680661500338e-05, + "loss": 1.3956, + "step": 2088 + }, + { + "epoch": 0.11421073493432472, + "grad_norm": 1.823887586593628, + "learning_rate": 1.9836477666805626e-05, + "loss": 1.4947, + "step": 2089 + }, + { + "epoch": 0.11426540737804627, + "grad_norm": 1.3804293870925903, + "learning_rate": 1.9836148390144805e-05, + "loss": 1.5005, + "step": 2090 + }, + { + "epoch": 0.11432007982176784, + "grad_norm": 1.673789143562317, + "learning_rate": 1.9835818785031907e-05, + "loss": 1.3724, + "step": 2091 + }, + { + "epoch": 0.11437475226548939, + "grad_norm": 1.4786725044250488, + "learning_rate": 1.9835488851477943e-05, + "loss": 1.5627, + "step": 2092 + }, + { + "epoch": 0.11442942470921094, + "grad_norm": 1.2430716753005981, + "learning_rate": 1.9835158589493923e-05, + "loss": 1.5095, + "step": 2093 + }, + { + "epoch": 0.11448409715293249, + "grad_norm": 1.4683377742767334, + "learning_rate": 1.983482799909088e-05, + "loss": 1.4482, + "step": 2094 + }, + { + "epoch": 0.11453876959665404, + "grad_norm": 1.383358120918274, + "learning_rate": 1.983449708027985e-05, + "loss": 1.3232, + "step": 2095 + }, + { + "epoch": 0.1145934420403756, + "grad_norm": 1.3752278089523315, + "learning_rate": 1.9834165833071887e-05, + "loss": 1.4472, + "step": 2096 + }, + { + "epoch": 0.11464811448409715, + "grad_norm": 1.5781081914901733, + "learning_rate": 1.983383425747805e-05, + "loss": 1.2668, + "step": 2097 + }, + { + "epoch": 0.11470278692781871, + "grad_norm": 1.0660488605499268, + "learning_rate": 1.9833502353509412e-05, + "loss": 1.6604, + "step": 2098 + }, + { + "epoch": 0.11475745937154026, + "grad_norm": 1.6529691219329834, + "learning_rate": 1.983317012117705e-05, + "loss": 1.6012, + "step": 2099 + }, + { + "epoch": 0.11481213181526181, + "grad_norm": 1.9044864177703857, + "learning_rate": 1.983283756049207e-05, + "loss": 1.3698, + "step": 2100 + }, + { + "epoch": 0.11486680425898337, + "grad_norm": 1.4127596616744995, + "learning_rate": 1.983250467146557e-05, + "loss": 1.2431, + "step": 2101 + }, + { + "epoch": 0.11492147670270492, + "grad_norm": 1.4747300148010254, + "learning_rate": 1.9832171454108665e-05, + "loss": 1.5658, + "step": 2102 + }, + { + "epoch": 0.11497614914642647, + "grad_norm": 3.7169394493103027, + "learning_rate": 1.9831837908432482e-05, + "loss": 1.6382, + "step": 2103 + }, + { + "epoch": 0.11503082159014802, + "grad_norm": 1.3171812295913696, + "learning_rate": 1.983150403444816e-05, + "loss": 1.5987, + "step": 2104 + }, + { + "epoch": 0.11508549403386958, + "grad_norm": 1.5340561866760254, + "learning_rate": 1.983116983216685e-05, + "loss": 1.7138, + "step": 2105 + }, + { + "epoch": 0.11514016647759114, + "grad_norm": 1.5955939292907715, + "learning_rate": 1.983083530159971e-05, + "loss": 1.4487, + "step": 2106 + }, + { + "epoch": 0.11519483892131269, + "grad_norm": 1.324059247970581, + "learning_rate": 1.9830500442757907e-05, + "loss": 1.642, + "step": 2107 + }, + { + "epoch": 0.11524951136503424, + "grad_norm": 1.0058523416519165, + "learning_rate": 1.983016525565263e-05, + "loss": 1.732, + "step": 2108 + }, + { + "epoch": 0.11530418380875579, + "grad_norm": 1.3163927793502808, + "learning_rate": 1.9829829740295067e-05, + "loss": 1.5504, + "step": 2109 + }, + { + "epoch": 0.11535885625247734, + "grad_norm": 1.550382137298584, + "learning_rate": 1.982949389669642e-05, + "loss": 1.367, + "step": 2110 + }, + { + "epoch": 0.11541352869619889, + "grad_norm": 1.4394056797027588, + "learning_rate": 1.9829157724867908e-05, + "loss": 1.6032, + "step": 2111 + }, + { + "epoch": 0.11546820113992046, + "grad_norm": 1.6256401538848877, + "learning_rate": 1.9828821224820755e-05, + "loss": 1.605, + "step": 2112 + }, + { + "epoch": 0.11552287358364201, + "grad_norm": 1.4102097749710083, + "learning_rate": 1.9828484396566197e-05, + "loss": 1.3886, + "step": 2113 + }, + { + "epoch": 0.11557754602736356, + "grad_norm": 1.3625547885894775, + "learning_rate": 1.9828147240115483e-05, + "loss": 1.5347, + "step": 2114 + }, + { + "epoch": 0.11563221847108511, + "grad_norm": 1.5389783382415771, + "learning_rate": 1.982780975547987e-05, + "loss": 1.4638, + "step": 2115 + }, + { + "epoch": 0.11568689091480666, + "grad_norm": 1.2762991189956665, + "learning_rate": 1.9827471942670624e-05, + "loss": 1.518, + "step": 2116 + }, + { + "epoch": 0.11574156335852821, + "grad_norm": 1.616515874862671, + "learning_rate": 1.982713380169903e-05, + "loss": 1.379, + "step": 2117 + }, + { + "epoch": 0.11579623580224978, + "grad_norm": 1.680812120437622, + "learning_rate": 1.982679533257638e-05, + "loss": 1.6481, + "step": 2118 + }, + { + "epoch": 0.11585090824597133, + "grad_norm": 1.5962066650390625, + "learning_rate": 1.9826456535313978e-05, + "loss": 1.4638, + "step": 2119 + }, + { + "epoch": 0.11590558068969288, + "grad_norm": 1.7725645303726196, + "learning_rate": 1.982611740992313e-05, + "loss": 1.5714, + "step": 2120 + }, + { + "epoch": 0.11596025313341443, + "grad_norm": 1.2986353635787964, + "learning_rate": 1.9825777956415163e-05, + "loss": 1.4322, + "step": 2121 + }, + { + "epoch": 0.11601492557713598, + "grad_norm": 1.753456950187683, + "learning_rate": 1.9825438174801412e-05, + "loss": 1.564, + "step": 2122 + }, + { + "epoch": 0.11606959802085753, + "grad_norm": 1.9472347497940063, + "learning_rate": 1.982509806509323e-05, + "loss": 1.5442, + "step": 2123 + }, + { + "epoch": 0.11612427046457909, + "grad_norm": 1.3269509077072144, + "learning_rate": 1.982475762730196e-05, + "loss": 1.3349, + "step": 2124 + }, + { + "epoch": 0.11617894290830065, + "grad_norm": 1.5968490839004517, + "learning_rate": 1.9824416861438985e-05, + "loss": 1.8554, + "step": 2125 + }, + { + "epoch": 0.1162336153520222, + "grad_norm": 1.1792750358581543, + "learning_rate": 1.9824075767515677e-05, + "loss": 1.3457, + "step": 2126 + }, + { + "epoch": 0.11628828779574375, + "grad_norm": 1.5933088064193726, + "learning_rate": 1.9823734345543422e-05, + "loss": 1.5829, + "step": 2127 + }, + { + "epoch": 0.1163429602394653, + "grad_norm": 1.7010213136672974, + "learning_rate": 1.982339259553363e-05, + "loss": 1.5394, + "step": 2128 + }, + { + "epoch": 0.11639763268318686, + "grad_norm": 1.9799562692642212, + "learning_rate": 1.9823050517497703e-05, + "loss": 1.5322, + "step": 2129 + }, + { + "epoch": 0.1164523051269084, + "grad_norm": 1.683759331703186, + "learning_rate": 1.9822708111447074e-05, + "loss": 1.3997, + "step": 2130 + }, + { + "epoch": 0.11650697757062996, + "grad_norm": 1.4872674942016602, + "learning_rate": 1.9822365377393168e-05, + "loss": 1.6256, + "step": 2131 + }, + { + "epoch": 0.11656165001435152, + "grad_norm": 1.034798264503479, + "learning_rate": 1.9822022315347433e-05, + "loss": 1.6354, + "step": 2132 + }, + { + "epoch": 0.11661632245807307, + "grad_norm": 1.323437213897705, + "learning_rate": 1.9821678925321326e-05, + "loss": 1.4013, + "step": 2133 + }, + { + "epoch": 0.11667099490179463, + "grad_norm": 1.2934093475341797, + "learning_rate": 1.982133520732631e-05, + "loss": 1.6187, + "step": 2134 + }, + { + "epoch": 0.11672566734551618, + "grad_norm": 1.4183202981948853, + "learning_rate": 1.982099116137387e-05, + "loss": 1.7282, + "step": 2135 + }, + { + "epoch": 0.11678033978923773, + "grad_norm": 1.2202115058898926, + "learning_rate": 1.9820646787475483e-05, + "loss": 1.4785, + "step": 2136 + }, + { + "epoch": 0.11683501223295928, + "grad_norm": 1.3665000200271606, + "learning_rate": 1.982030208564266e-05, + "loss": 1.3545, + "step": 2137 + }, + { + "epoch": 0.11688968467668083, + "grad_norm": 1.9380353689193726, + "learning_rate": 1.9819957055886904e-05, + "loss": 1.6045, + "step": 2138 + }, + { + "epoch": 0.1169443571204024, + "grad_norm": 1.3430495262145996, + "learning_rate": 1.981961169821974e-05, + "loss": 1.4578, + "step": 2139 + }, + { + "epoch": 0.11699902956412395, + "grad_norm": 1.3623323440551758, + "learning_rate": 1.98192660126527e-05, + "loss": 1.3787, + "step": 2140 + }, + { + "epoch": 0.1170537020078455, + "grad_norm": 1.2415351867675781, + "learning_rate": 1.981891999919732e-05, + "loss": 1.5869, + "step": 2141 + }, + { + "epoch": 0.11710837445156705, + "grad_norm": 1.5498374700546265, + "learning_rate": 1.9818573657865167e-05, + "loss": 1.7435, + "step": 2142 + }, + { + "epoch": 0.1171630468952886, + "grad_norm": 1.2794122695922852, + "learning_rate": 1.9818226988667797e-05, + "loss": 1.5251, + "step": 2143 + }, + { + "epoch": 0.11721771933901015, + "grad_norm": 1.6356455087661743, + "learning_rate": 1.981787999161679e-05, + "loss": 1.486, + "step": 2144 + }, + { + "epoch": 0.1172723917827317, + "grad_norm": 1.7392864227294922, + "learning_rate": 1.981753266672373e-05, + "loss": 1.4127, + "step": 2145 + }, + { + "epoch": 0.11732706422645327, + "grad_norm": 1.597644567489624, + "learning_rate": 1.9817185014000218e-05, + "loss": 1.3507, + "step": 2146 + }, + { + "epoch": 0.11738173667017482, + "grad_norm": 1.795045018196106, + "learning_rate": 1.981683703345786e-05, + "loss": 1.5685, + "step": 2147 + }, + { + "epoch": 0.11743640911389637, + "grad_norm": 1.3202393054962158, + "learning_rate": 1.981648872510828e-05, + "loss": 1.6519, + "step": 2148 + }, + { + "epoch": 0.11749108155761792, + "grad_norm": 1.257291316986084, + "learning_rate": 1.9816140088963106e-05, + "loss": 1.6605, + "step": 2149 + }, + { + "epoch": 0.11754575400133947, + "grad_norm": 1.2424163818359375, + "learning_rate": 1.981579112503398e-05, + "loss": 1.2474, + "step": 2150 + }, + { + "epoch": 0.11760042644506102, + "grad_norm": 2.384920120239258, + "learning_rate": 1.981544183333255e-05, + "loss": 1.4041, + "step": 2151 + }, + { + "epoch": 0.11765509888878257, + "grad_norm": 1.4462132453918457, + "learning_rate": 1.9815092213870487e-05, + "loss": 1.2564, + "step": 2152 + }, + { + "epoch": 0.11770977133250414, + "grad_norm": 1.8284201622009277, + "learning_rate": 1.9814742266659467e-05, + "loss": 1.4036, + "step": 2153 + }, + { + "epoch": 0.11776444377622569, + "grad_norm": 1.6408586502075195, + "learning_rate": 1.9814391991711172e-05, + "loss": 1.7621, + "step": 2154 + }, + { + "epoch": 0.11781911621994724, + "grad_norm": 1.2937549352645874, + "learning_rate": 1.9814041389037292e-05, + "loss": 1.5665, + "step": 2155 + }, + { + "epoch": 0.1178737886636688, + "grad_norm": 1.5018497705459595, + "learning_rate": 1.9813690458649546e-05, + "loss": 1.4522, + "step": 2156 + }, + { + "epoch": 0.11792846110739034, + "grad_norm": 1.6738691329956055, + "learning_rate": 1.9813339200559644e-05, + "loss": 1.6531, + "step": 2157 + }, + { + "epoch": 0.1179831335511119, + "grad_norm": 1.3558908700942993, + "learning_rate": 1.981298761477932e-05, + "loss": 1.4286, + "step": 2158 + }, + { + "epoch": 0.11803780599483345, + "grad_norm": 1.744380235671997, + "learning_rate": 1.9812635701320312e-05, + "loss": 1.5735, + "step": 2159 + }, + { + "epoch": 0.11809247843855501, + "grad_norm": 1.5214967727661133, + "learning_rate": 1.9812283460194373e-05, + "loss": 1.5348, + "step": 2160 + }, + { + "epoch": 0.11814715088227656, + "grad_norm": 1.5330896377563477, + "learning_rate": 1.9811930891413263e-05, + "loss": 1.474, + "step": 2161 + }, + { + "epoch": 0.11820182332599811, + "grad_norm": 1.6531410217285156, + "learning_rate": 1.9811577994988755e-05, + "loss": 1.4768, + "step": 2162 + }, + { + "epoch": 0.11825649576971967, + "grad_norm": 1.575439453125, + "learning_rate": 1.9811224770932634e-05, + "loss": 1.4534, + "step": 2163 + }, + { + "epoch": 0.11831116821344122, + "grad_norm": 2.0241854190826416, + "learning_rate": 1.98108712192567e-05, + "loss": 1.406, + "step": 2164 + }, + { + "epoch": 0.11836584065716277, + "grad_norm": 1.4741922616958618, + "learning_rate": 1.981051733997275e-05, + "loss": 1.3152, + "step": 2165 + }, + { + "epoch": 0.11842051310088433, + "grad_norm": 1.3254117965698242, + "learning_rate": 1.9810163133092604e-05, + "loss": 1.502, + "step": 2166 + }, + { + "epoch": 0.11847518554460588, + "grad_norm": 1.417764663696289, + "learning_rate": 1.9809808598628094e-05, + "loss": 1.6112, + "step": 2167 + }, + { + "epoch": 0.11852985798832744, + "grad_norm": 1.310428500175476, + "learning_rate": 1.9809453736591054e-05, + "loss": 1.6095, + "step": 2168 + }, + { + "epoch": 0.11858453043204899, + "grad_norm": 1.824138879776001, + "learning_rate": 1.9809098546993333e-05, + "loss": 1.6574, + "step": 2169 + }, + { + "epoch": 0.11863920287577054, + "grad_norm": 1.5750163793563843, + "learning_rate": 1.9808743029846795e-05, + "loss": 1.4765, + "step": 2170 + }, + { + "epoch": 0.11869387531949209, + "grad_norm": 1.5590815544128418, + "learning_rate": 1.9808387185163313e-05, + "loss": 1.533, + "step": 2171 + }, + { + "epoch": 0.11874854776321364, + "grad_norm": 1.251142978668213, + "learning_rate": 1.980803101295476e-05, + "loss": 1.5087, + "step": 2172 + }, + { + "epoch": 0.1188032202069352, + "grad_norm": 1.607230544090271, + "learning_rate": 1.9807674513233044e-05, + "loss": 1.4059, + "step": 2173 + }, + { + "epoch": 0.11885789265065676, + "grad_norm": 1.4014958143234253, + "learning_rate": 1.9807317686010055e-05, + "loss": 1.5092, + "step": 2174 + }, + { + "epoch": 0.11891256509437831, + "grad_norm": 1.4509198665618896, + "learning_rate": 1.9806960531297722e-05, + "loss": 1.5165, + "step": 2175 + }, + { + "epoch": 0.11896723753809986, + "grad_norm": 1.696474552154541, + "learning_rate": 1.980660304910796e-05, + "loss": 1.6772, + "step": 2176 + }, + { + "epoch": 0.11902190998182141, + "grad_norm": 1.516533613204956, + "learning_rate": 1.980624523945271e-05, + "loss": 1.5021, + "step": 2177 + }, + { + "epoch": 0.11907658242554296, + "grad_norm": 1.6590665578842163, + "learning_rate": 1.9805887102343922e-05, + "loss": 1.4518, + "step": 2178 + }, + { + "epoch": 0.11913125486926451, + "grad_norm": 1.76829195022583, + "learning_rate": 1.980552863779355e-05, + "loss": 1.396, + "step": 2179 + }, + { + "epoch": 0.11918592731298608, + "grad_norm": 1.4405219554901123, + "learning_rate": 1.9805169845813572e-05, + "loss": 1.4872, + "step": 2180 + }, + { + "epoch": 0.11924059975670763, + "grad_norm": 1.4819337129592896, + "learning_rate": 1.980481072641596e-05, + "loss": 1.3241, + "step": 2181 + }, + { + "epoch": 0.11929527220042918, + "grad_norm": 1.770919919013977, + "learning_rate": 1.9804451279612714e-05, + "loss": 1.498, + "step": 2182 + }, + { + "epoch": 0.11934994464415073, + "grad_norm": 1.73201322555542, + "learning_rate": 1.9804091505415833e-05, + "loss": 1.2808, + "step": 2183 + }, + { + "epoch": 0.11940461708787228, + "grad_norm": 2.684366226196289, + "learning_rate": 1.9803731403837326e-05, + "loss": 1.7223, + "step": 2184 + }, + { + "epoch": 0.11945928953159383, + "grad_norm": 1.4788849353790283, + "learning_rate": 1.9803370974889225e-05, + "loss": 1.4736, + "step": 2185 + }, + { + "epoch": 0.11951396197531539, + "grad_norm": 1.2849724292755127, + "learning_rate": 1.9803010218583565e-05, + "loss": 1.544, + "step": 2186 + }, + { + "epoch": 0.11956863441903695, + "grad_norm": 1.8434767723083496, + "learning_rate": 1.980264913493239e-05, + "loss": 1.8705, + "step": 2187 + }, + { + "epoch": 0.1196233068627585, + "grad_norm": 1.2692753076553345, + "learning_rate": 1.9802287723947753e-05, + "loss": 1.5328, + "step": 2188 + }, + { + "epoch": 0.11967797930648005, + "grad_norm": 1.775232195854187, + "learning_rate": 1.9801925985641733e-05, + "loss": 1.4646, + "step": 2189 + }, + { + "epoch": 0.1197326517502016, + "grad_norm": 1.4059679508209229, + "learning_rate": 1.98015639200264e-05, + "loss": 1.4639, + "step": 2190 + }, + { + "epoch": 0.11978732419392316, + "grad_norm": 1.5357905626296997, + "learning_rate": 1.9801201527113843e-05, + "loss": 1.7798, + "step": 2191 + }, + { + "epoch": 0.11984199663764471, + "grad_norm": 1.7334703207015991, + "learning_rate": 1.9800838806916175e-05, + "loss": 1.4159, + "step": 2192 + }, + { + "epoch": 0.11989666908136626, + "grad_norm": 1.9965875148773193, + "learning_rate": 1.9800475759445498e-05, + "loss": 1.2991, + "step": 2193 + }, + { + "epoch": 0.11995134152508782, + "grad_norm": 1.671713948249817, + "learning_rate": 1.980011238471394e-05, + "loss": 1.6186, + "step": 2194 + }, + { + "epoch": 0.12000601396880937, + "grad_norm": 1.3639640808105469, + "learning_rate": 1.9799748682733632e-05, + "loss": 1.4967, + "step": 2195 + }, + { + "epoch": 0.12006068641253093, + "grad_norm": 1.4479432106018066, + "learning_rate": 1.979938465351672e-05, + "loss": 1.6626, + "step": 2196 + }, + { + "epoch": 0.12011535885625248, + "grad_norm": 1.4879260063171387, + "learning_rate": 1.979902029707536e-05, + "loss": 1.3748, + "step": 2197 + }, + { + "epoch": 0.12017003129997403, + "grad_norm": 1.5836442708969116, + "learning_rate": 1.9798655613421712e-05, + "loss": 1.3077, + "step": 2198 + }, + { + "epoch": 0.12022470374369558, + "grad_norm": 1.6549975872039795, + "learning_rate": 1.9798290602567965e-05, + "loss": 1.7923, + "step": 2199 + }, + { + "epoch": 0.12027937618741713, + "grad_norm": 1.3318393230438232, + "learning_rate": 1.9797925264526303e-05, + "loss": 1.6626, + "step": 2200 + }, + { + "epoch": 0.1203340486311387, + "grad_norm": 1.5183582305908203, + "learning_rate": 1.9797559599308922e-05, + "loss": 1.2197, + "step": 2201 + }, + { + "epoch": 0.12038872107486025, + "grad_norm": 1.6083290576934814, + "learning_rate": 1.9797193606928036e-05, + "loss": 1.6008, + "step": 2202 + }, + { + "epoch": 0.1204433935185818, + "grad_norm": 1.3745166063308716, + "learning_rate": 1.979682728739587e-05, + "loss": 1.4794, + "step": 2203 + }, + { + "epoch": 0.12049806596230335, + "grad_norm": 1.7237669229507446, + "learning_rate": 1.9796460640724646e-05, + "loss": 1.5147, + "step": 2204 + }, + { + "epoch": 0.1205527384060249, + "grad_norm": 1.260608434677124, + "learning_rate": 1.9796093666926617e-05, + "loss": 1.4891, + "step": 2205 + }, + { + "epoch": 0.12060741084974645, + "grad_norm": 1.7252625226974487, + "learning_rate": 1.979572636601403e-05, + "loss": 1.3679, + "step": 2206 + }, + { + "epoch": 0.120662083293468, + "grad_norm": 1.2299672365188599, + "learning_rate": 1.9795358737999155e-05, + "loss": 1.414, + "step": 2207 + }, + { + "epoch": 0.12071675573718957, + "grad_norm": 1.8250874280929565, + "learning_rate": 1.979499078289427e-05, + "loss": 1.5498, + "step": 2208 + }, + { + "epoch": 0.12077142818091112, + "grad_norm": 1.646746277809143, + "learning_rate": 1.979462250071165e-05, + "loss": 1.5064, + "step": 2209 + }, + { + "epoch": 0.12082610062463267, + "grad_norm": 1.6556870937347412, + "learning_rate": 1.9794253891463606e-05, + "loss": 1.5237, + "step": 2210 + }, + { + "epoch": 0.12088077306835422, + "grad_norm": 1.4664558172225952, + "learning_rate": 1.9793884955162442e-05, + "loss": 1.5237, + "step": 2211 + }, + { + "epoch": 0.12093544551207577, + "grad_norm": 1.6079596281051636, + "learning_rate": 1.979351569182048e-05, + "loss": 1.6504, + "step": 2212 + }, + { + "epoch": 0.12099011795579732, + "grad_norm": 1.5576897859573364, + "learning_rate": 1.9793146101450042e-05, + "loss": 1.3852, + "step": 2213 + }, + { + "epoch": 0.12104479039951888, + "grad_norm": 1.6389485597610474, + "learning_rate": 1.9792776184063477e-05, + "loss": 1.4826, + "step": 2214 + }, + { + "epoch": 0.12109946284324044, + "grad_norm": 1.6127594709396362, + "learning_rate": 1.9792405939673135e-05, + "loss": 1.6916, + "step": 2215 + }, + { + "epoch": 0.12115413528696199, + "grad_norm": 1.6366896629333496, + "learning_rate": 1.979203536829138e-05, + "loss": 1.3724, + "step": 2216 + }, + { + "epoch": 0.12120880773068354, + "grad_norm": 1.4989615678787231, + "learning_rate": 1.979166446993059e-05, + "loss": 1.3985, + "step": 2217 + }, + { + "epoch": 0.1212634801744051, + "grad_norm": 1.6096817255020142, + "learning_rate": 1.979129324460314e-05, + "loss": 1.5327, + "step": 2218 + }, + { + "epoch": 0.12131815261812665, + "grad_norm": 1.7146228551864624, + "learning_rate": 1.979092169232144e-05, + "loss": 1.3339, + "step": 2219 + }, + { + "epoch": 0.1213728250618482, + "grad_norm": 1.3154703378677368, + "learning_rate": 1.9790549813097884e-05, + "loss": 1.4681, + "step": 2220 + }, + { + "epoch": 0.12142749750556976, + "grad_norm": 1.3958613872528076, + "learning_rate": 1.9790177606944897e-05, + "loss": 1.751, + "step": 2221 + }, + { + "epoch": 0.12148216994929131, + "grad_norm": 1.3193024396896362, + "learning_rate": 1.978980507387491e-05, + "loss": 1.314, + "step": 2222 + }, + { + "epoch": 0.12153684239301286, + "grad_norm": 1.3665273189544678, + "learning_rate": 1.9789432213900354e-05, + "loss": 1.5903, + "step": 2223 + }, + { + "epoch": 0.12159151483673442, + "grad_norm": 1.3339101076126099, + "learning_rate": 1.9789059027033688e-05, + "loss": 1.5437, + "step": 2224 + }, + { + "epoch": 0.12164618728045597, + "grad_norm": 2.173123598098755, + "learning_rate": 1.9788685513287368e-05, + "loss": 1.6362, + "step": 2225 + }, + { + "epoch": 0.12170085972417752, + "grad_norm": 1.4028681516647339, + "learning_rate": 1.978831167267387e-05, + "loss": 1.5864, + "step": 2226 + }, + { + "epoch": 0.12175553216789907, + "grad_norm": 1.583361268043518, + "learning_rate": 1.9787937505205678e-05, + "loss": 1.4995, + "step": 2227 + }, + { + "epoch": 0.12181020461162063, + "grad_norm": 2.952585220336914, + "learning_rate": 1.9787563010895282e-05, + "loss": 1.5553, + "step": 2228 + }, + { + "epoch": 0.12186487705534219, + "grad_norm": 1.5202633142471313, + "learning_rate": 1.978718818975519e-05, + "loss": 1.4733, + "step": 2229 + }, + { + "epoch": 0.12191954949906374, + "grad_norm": 1.6941205263137817, + "learning_rate": 1.9786813041797915e-05, + "loss": 1.3756, + "step": 2230 + }, + { + "epoch": 0.12197422194278529, + "grad_norm": 1.825928807258606, + "learning_rate": 1.9786437567035993e-05, + "loss": 1.5233, + "step": 2231 + }, + { + "epoch": 0.12202889438650684, + "grad_norm": 1.6786606311798096, + "learning_rate": 1.9786061765481954e-05, + "loss": 1.2855, + "step": 2232 + }, + { + "epoch": 0.12208356683022839, + "grad_norm": 1.2744513750076294, + "learning_rate": 1.978568563714835e-05, + "loss": 1.4119, + "step": 2233 + }, + { + "epoch": 0.12213823927394994, + "grad_norm": 1.5231143236160278, + "learning_rate": 1.978530918204774e-05, + "loss": 1.3912, + "step": 2234 + }, + { + "epoch": 0.1221929117176715, + "grad_norm": 2.3304882049560547, + "learning_rate": 1.9784932400192688e-05, + "loss": 1.7591, + "step": 2235 + }, + { + "epoch": 0.12224758416139306, + "grad_norm": 1.2321511507034302, + "learning_rate": 1.978455529159579e-05, + "loss": 1.49, + "step": 2236 + }, + { + "epoch": 0.12230225660511461, + "grad_norm": 1.5068556070327759, + "learning_rate": 1.9784177856269628e-05, + "loss": 1.6692, + "step": 2237 + }, + { + "epoch": 0.12235692904883616, + "grad_norm": 1.4367939233779907, + "learning_rate": 1.9783800094226807e-05, + "loss": 1.5323, + "step": 2238 + }, + { + "epoch": 0.12241160149255771, + "grad_norm": 1.4282959699630737, + "learning_rate": 1.9783422005479942e-05, + "loss": 1.444, + "step": 2239 + }, + { + "epoch": 0.12246627393627926, + "grad_norm": 1.3831864595413208, + "learning_rate": 1.978304359004166e-05, + "loss": 1.3561, + "step": 2240 + }, + { + "epoch": 0.12252094638000081, + "grad_norm": 1.2500190734863281, + "learning_rate": 1.9782664847924596e-05, + "loss": 1.4579, + "step": 2241 + }, + { + "epoch": 0.12257561882372238, + "grad_norm": 1.394028663635254, + "learning_rate": 1.9782285779141397e-05, + "loss": 1.4164, + "step": 2242 + }, + { + "epoch": 0.12263029126744393, + "grad_norm": 1.3891788721084595, + "learning_rate": 1.978190638370472e-05, + "loss": 1.6685, + "step": 2243 + }, + { + "epoch": 0.12268496371116548, + "grad_norm": 2.0963454246520996, + "learning_rate": 1.9781526661627235e-05, + "loss": 1.4606, + "step": 2244 + }, + { + "epoch": 0.12273963615488703, + "grad_norm": 1.2025643587112427, + "learning_rate": 1.978114661292162e-05, + "loss": 1.3988, + "step": 2245 + }, + { + "epoch": 0.12279430859860858, + "grad_norm": 1.537782073020935, + "learning_rate": 1.9780766237600574e-05, + "loss": 1.4557, + "step": 2246 + }, + { + "epoch": 0.12284898104233014, + "grad_norm": 1.693563461303711, + "learning_rate": 1.978038553567679e-05, + "loss": 1.5429, + "step": 2247 + }, + { + "epoch": 0.12290365348605169, + "grad_norm": 1.3819531202316284, + "learning_rate": 1.9780004507162974e-05, + "loss": 1.2394, + "step": 2248 + }, + { + "epoch": 0.12295832592977325, + "grad_norm": 1.4557260274887085, + "learning_rate": 1.9779623152071866e-05, + "loss": 1.5206, + "step": 2249 + }, + { + "epoch": 0.1230129983734948, + "grad_norm": 1.6241339445114136, + "learning_rate": 1.9779241470416194e-05, + "loss": 1.5875, + "step": 2250 + }, + { + "epoch": 0.12306767081721635, + "grad_norm": 1.6574292182922363, + "learning_rate": 1.9778859462208694e-05, + "loss": 1.244, + "step": 2251 + }, + { + "epoch": 0.1231223432609379, + "grad_norm": 1.9668349027633667, + "learning_rate": 1.9778477127462135e-05, + "loss": 1.4388, + "step": 2252 + }, + { + "epoch": 0.12317701570465946, + "grad_norm": 1.5989834070205688, + "learning_rate": 1.977809446618928e-05, + "loss": 1.4299, + "step": 2253 + }, + { + "epoch": 0.12323168814838101, + "grad_norm": 3.090162992477417, + "learning_rate": 1.97777114784029e-05, + "loss": 1.4126, + "step": 2254 + }, + { + "epoch": 0.12328636059210256, + "grad_norm": 1.6093071699142456, + "learning_rate": 1.9777328164115796e-05, + "loss": 1.8288, + "step": 2255 + }, + { + "epoch": 0.12334103303582412, + "grad_norm": 1.3969024419784546, + "learning_rate": 1.9776944523340757e-05, + "loss": 1.4379, + "step": 2256 + }, + { + "epoch": 0.12339570547954568, + "grad_norm": 1.6636064052581787, + "learning_rate": 1.97765605560906e-05, + "loss": 1.5764, + "step": 2257 + }, + { + "epoch": 0.12345037792326723, + "grad_norm": 1.8323752880096436, + "learning_rate": 1.9776176262378145e-05, + "loss": 1.302, + "step": 2258 + }, + { + "epoch": 0.12350505036698878, + "grad_norm": 1.8695847988128662, + "learning_rate": 1.9775791642216223e-05, + "loss": 1.5363, + "step": 2259 + }, + { + "epoch": 0.12355972281071033, + "grad_norm": 1.5299293994903564, + "learning_rate": 1.9775406695617677e-05, + "loss": 1.2597, + "step": 2260 + }, + { + "epoch": 0.12361439525443188, + "grad_norm": 1.5655733346939087, + "learning_rate": 1.9775021422595366e-05, + "loss": 1.4339, + "step": 2261 + }, + { + "epoch": 0.12366906769815343, + "grad_norm": 1.3241184949874878, + "learning_rate": 1.977463582316215e-05, + "loss": 1.523, + "step": 2262 + }, + { + "epoch": 0.123723740141875, + "grad_norm": 1.4531874656677246, + "learning_rate": 1.9774249897330907e-05, + "loss": 1.5476, + "step": 2263 + }, + { + "epoch": 0.12377841258559655, + "grad_norm": 1.3100241422653198, + "learning_rate": 1.9773863645114525e-05, + "loss": 1.6037, + "step": 2264 + }, + { + "epoch": 0.1238330850293181, + "grad_norm": 1.5607290267944336, + "learning_rate": 1.97734770665259e-05, + "loss": 1.403, + "step": 2265 + }, + { + "epoch": 0.12388775747303965, + "grad_norm": 2.4037325382232666, + "learning_rate": 1.9773090161577943e-05, + "loss": 1.1713, + "step": 2266 + }, + { + "epoch": 0.1239424299167612, + "grad_norm": 1.5347084999084473, + "learning_rate": 1.977270293028357e-05, + "loss": 1.3386, + "step": 2267 + }, + { + "epoch": 0.12399710236048275, + "grad_norm": 1.597427487373352, + "learning_rate": 1.9772315372655714e-05, + "loss": 1.6658, + "step": 2268 + }, + { + "epoch": 0.12405177480420432, + "grad_norm": 1.7052805423736572, + "learning_rate": 1.9771927488707318e-05, + "loss": 1.5416, + "step": 2269 + }, + { + "epoch": 0.12410644724792587, + "grad_norm": 1.3383897542953491, + "learning_rate": 1.977153927845133e-05, + "loss": 1.5516, + "step": 2270 + }, + { + "epoch": 0.12416111969164742, + "grad_norm": 1.3917127847671509, + "learning_rate": 1.977115074190072e-05, + "loss": 1.537, + "step": 2271 + }, + { + "epoch": 0.12421579213536897, + "grad_norm": 1.5249624252319336, + "learning_rate": 1.9770761879068455e-05, + "loss": 1.5305, + "step": 2272 + }, + { + "epoch": 0.12427046457909052, + "grad_norm": 1.4067144393920898, + "learning_rate": 1.9770372689967523e-05, + "loss": 1.375, + "step": 2273 + }, + { + "epoch": 0.12432513702281207, + "grad_norm": 1.4047818183898926, + "learning_rate": 1.9769983174610918e-05, + "loss": 1.4739, + "step": 2274 + }, + { + "epoch": 0.12437980946653363, + "grad_norm": 1.2178595066070557, + "learning_rate": 1.9769593333011652e-05, + "loss": 1.3144, + "step": 2275 + }, + { + "epoch": 0.12443448191025519, + "grad_norm": 1.3632766008377075, + "learning_rate": 1.976920316518274e-05, + "loss": 1.3257, + "step": 2276 + }, + { + "epoch": 0.12448915435397674, + "grad_norm": 1.507507085800171, + "learning_rate": 1.9768812671137207e-05, + "loss": 1.527, + "step": 2277 + }, + { + "epoch": 0.12454382679769829, + "grad_norm": 1.6187636852264404, + "learning_rate": 1.97684218508881e-05, + "loss": 1.4626, + "step": 2278 + }, + { + "epoch": 0.12459849924141984, + "grad_norm": 1.1662899255752563, + "learning_rate": 1.9768030704448462e-05, + "loss": 1.4836, + "step": 2279 + }, + { + "epoch": 0.1246531716851414, + "grad_norm": 1.0995826721191406, + "learning_rate": 1.976763923183136e-05, + "loss": 1.6226, + "step": 2280 + }, + { + "epoch": 0.12470784412886295, + "grad_norm": 1.811429500579834, + "learning_rate": 1.9767247433049858e-05, + "loss": 1.5539, + "step": 2281 + }, + { + "epoch": 0.1247625165725845, + "grad_norm": 2.0573246479034424, + "learning_rate": 1.976685530811705e-05, + "loss": 1.7923, + "step": 2282 + }, + { + "epoch": 0.12481718901630606, + "grad_norm": 1.2719224691390991, + "learning_rate": 1.9766462857046022e-05, + "loss": 1.3126, + "step": 2283 + }, + { + "epoch": 0.12487186146002761, + "grad_norm": 1.7612820863723755, + "learning_rate": 1.9766070079849882e-05, + "loss": 1.4577, + "step": 2284 + }, + { + "epoch": 0.12492653390374917, + "grad_norm": 1.2224186658859253, + "learning_rate": 1.9765676976541748e-05, + "loss": 1.5909, + "step": 2285 + }, + { + "epoch": 0.12498120634747072, + "grad_norm": 1.4540303945541382, + "learning_rate": 1.976528354713474e-05, + "loss": 1.6267, + "step": 2286 + }, + { + "epoch": 0.12503587879119227, + "grad_norm": 1.4359360933303833, + "learning_rate": 1.9764889791642e-05, + "loss": 1.468, + "step": 2287 + }, + { + "epoch": 0.12509055123491383, + "grad_norm": 1.5034606456756592, + "learning_rate": 1.9764495710076678e-05, + "loss": 1.322, + "step": 2288 + }, + { + "epoch": 0.12514522367863537, + "grad_norm": 1.344545841217041, + "learning_rate": 1.976410130245193e-05, + "loss": 1.5801, + "step": 2289 + }, + { + "epoch": 0.12519989612235694, + "grad_norm": 1.3797292709350586, + "learning_rate": 1.9763706568780928e-05, + "loss": 1.5064, + "step": 2290 + }, + { + "epoch": 0.12525456856607847, + "grad_norm": 1.6395140886306763, + "learning_rate": 1.976331150907685e-05, + "loss": 1.6619, + "step": 2291 + }, + { + "epoch": 0.12530924100980004, + "grad_norm": 1.286025047302246, + "learning_rate": 1.976291612335289e-05, + "loss": 1.4184, + "step": 2292 + }, + { + "epoch": 0.1253639134535216, + "grad_norm": 1.533369779586792, + "learning_rate": 1.9762520411622255e-05, + "loss": 1.439, + "step": 2293 + }, + { + "epoch": 0.12541858589724314, + "grad_norm": 1.488183856010437, + "learning_rate": 1.9762124373898155e-05, + "loss": 1.5654, + "step": 2294 + }, + { + "epoch": 0.1254732583409647, + "grad_norm": 2.3513894081115723, + "learning_rate": 1.9761728010193812e-05, + "loss": 1.6389, + "step": 2295 + }, + { + "epoch": 0.12552793078468624, + "grad_norm": 1.2902214527130127, + "learning_rate": 1.9761331320522466e-05, + "loss": 1.639, + "step": 2296 + }, + { + "epoch": 0.1255826032284078, + "grad_norm": 1.584539771080017, + "learning_rate": 1.976093430489736e-05, + "loss": 1.4486, + "step": 2297 + }, + { + "epoch": 0.12563727567212934, + "grad_norm": 1.3361467123031616, + "learning_rate": 1.976053696333175e-05, + "loss": 1.5147, + "step": 2298 + }, + { + "epoch": 0.1256919481158509, + "grad_norm": 1.1404266357421875, + "learning_rate": 1.9760139295838912e-05, + "loss": 1.3282, + "step": 2299 + }, + { + "epoch": 0.12574662055957248, + "grad_norm": 1.311617136001587, + "learning_rate": 1.975974130243212e-05, + "loss": 1.3046, + "step": 2300 + }, + { + "epoch": 0.125801293003294, + "grad_norm": 1.951587438583374, + "learning_rate": 1.975934298312466e-05, + "loss": 1.3756, + "step": 2301 + }, + { + "epoch": 0.12585596544701558, + "grad_norm": 1.447426676750183, + "learning_rate": 1.975894433792984e-05, + "loss": 1.5674, + "step": 2302 + }, + { + "epoch": 0.12591063789073711, + "grad_norm": 1.4004061222076416, + "learning_rate": 1.9758545366860968e-05, + "loss": 1.4797, + "step": 2303 + }, + { + "epoch": 0.12596531033445868, + "grad_norm": 1.5852763652801514, + "learning_rate": 1.9758146069931364e-05, + "loss": 1.6944, + "step": 2304 + }, + { + "epoch": 0.12601998277818022, + "grad_norm": 1.6451023817062378, + "learning_rate": 1.975774644715437e-05, + "loss": 1.455, + "step": 2305 + }, + { + "epoch": 0.12607465522190178, + "grad_norm": 1.4356884956359863, + "learning_rate": 1.975734649854332e-05, + "loss": 1.3926, + "step": 2306 + }, + { + "epoch": 0.12612932766562335, + "grad_norm": 1.5828810930252075, + "learning_rate": 1.975694622411158e-05, + "loss": 1.2031, + "step": 2307 + }, + { + "epoch": 0.12618400010934488, + "grad_norm": 4.764293193817139, + "learning_rate": 1.97565456238725e-05, + "loss": 1.6581, + "step": 2308 + }, + { + "epoch": 0.12623867255306645, + "grad_norm": 1.1289180517196655, + "learning_rate": 1.9756144697839477e-05, + "loss": 1.3578, + "step": 2309 + }, + { + "epoch": 0.126293344996788, + "grad_norm": 1.5831561088562012, + "learning_rate": 1.9755743446025882e-05, + "loss": 1.5667, + "step": 2310 + }, + { + "epoch": 0.12634801744050955, + "grad_norm": 1.3436024188995361, + "learning_rate": 1.9755341868445126e-05, + "loss": 1.5467, + "step": 2311 + }, + { + "epoch": 0.1264026898842311, + "grad_norm": 1.696900486946106, + "learning_rate": 1.975493996511061e-05, + "loss": 1.6616, + "step": 2312 + }, + { + "epoch": 0.12645736232795265, + "grad_norm": 1.5763386487960815, + "learning_rate": 1.975453773603576e-05, + "loss": 1.6894, + "step": 2313 + }, + { + "epoch": 0.12651203477167422, + "grad_norm": 1.6130945682525635, + "learning_rate": 1.9754135181234005e-05, + "loss": 1.5992, + "step": 2314 + }, + { + "epoch": 0.12656670721539576, + "grad_norm": 1.7465238571166992, + "learning_rate": 1.9753732300718788e-05, + "loss": 1.4342, + "step": 2315 + }, + { + "epoch": 0.12662137965911732, + "grad_norm": 1.2523523569107056, + "learning_rate": 1.9753329094503563e-05, + "loss": 1.3361, + "step": 2316 + }, + { + "epoch": 0.12667605210283886, + "grad_norm": 1.4834682941436768, + "learning_rate": 1.975292556260179e-05, + "loss": 1.635, + "step": 2317 + }, + { + "epoch": 0.12673072454656042, + "grad_norm": 1.370724081993103, + "learning_rate": 1.9752521705026947e-05, + "loss": 1.3326, + "step": 2318 + }, + { + "epoch": 0.12678539699028196, + "grad_norm": 1.7773091793060303, + "learning_rate": 1.975211752179252e-05, + "loss": 1.4727, + "step": 2319 + }, + { + "epoch": 0.12684006943400353, + "grad_norm": 1.4782559871673584, + "learning_rate": 1.9751713012912002e-05, + "loss": 1.5585, + "step": 2320 + }, + { + "epoch": 0.1268947418777251, + "grad_norm": 1.3027273416519165, + "learning_rate": 1.975130817839891e-05, + "loss": 1.5123, + "step": 2321 + }, + { + "epoch": 0.12694941432144663, + "grad_norm": 1.4454766511917114, + "learning_rate": 1.9750903018266746e-05, + "loss": 1.5976, + "step": 2322 + }, + { + "epoch": 0.1270040867651682, + "grad_norm": 2.2325892448425293, + "learning_rate": 1.9750497532529053e-05, + "loss": 1.5758, + "step": 2323 + }, + { + "epoch": 0.12705875920888973, + "grad_norm": 1.5540193319320679, + "learning_rate": 1.975009172119937e-05, + "loss": 1.5257, + "step": 2324 + }, + { + "epoch": 0.1271134316526113, + "grad_norm": 1.6229135990142822, + "learning_rate": 1.9749685584291246e-05, + "loss": 1.4285, + "step": 2325 + }, + { + "epoch": 0.12716810409633283, + "grad_norm": 1.2235006093978882, + "learning_rate": 1.9749279121818235e-05, + "loss": 1.3818, + "step": 2326 + }, + { + "epoch": 0.1272227765400544, + "grad_norm": 1.8546907901763916, + "learning_rate": 1.9748872333793923e-05, + "loss": 1.516, + "step": 2327 + }, + { + "epoch": 0.12727744898377596, + "grad_norm": 1.844117283821106, + "learning_rate": 1.9748465220231887e-05, + "loss": 1.612, + "step": 2328 + }, + { + "epoch": 0.1273321214274975, + "grad_norm": 1.5279028415679932, + "learning_rate": 1.974805778114572e-05, + "loss": 1.3837, + "step": 2329 + }, + { + "epoch": 0.12738679387121907, + "grad_norm": 1.780888319015503, + "learning_rate": 1.974765001654903e-05, + "loss": 1.3287, + "step": 2330 + }, + { + "epoch": 0.1274414663149406, + "grad_norm": 1.4502723217010498, + "learning_rate": 1.974724192645543e-05, + "loss": 1.7192, + "step": 2331 + }, + { + "epoch": 0.12749613875866217, + "grad_norm": 1.3549083471298218, + "learning_rate": 1.9746833510878553e-05, + "loss": 1.5229, + "step": 2332 + }, + { + "epoch": 0.1275508112023837, + "grad_norm": 1.1984819173812866, + "learning_rate": 1.974642476983203e-05, + "loss": 1.4426, + "step": 2333 + }, + { + "epoch": 0.12760548364610527, + "grad_norm": 1.4965016841888428, + "learning_rate": 1.9746015703329516e-05, + "loss": 1.6526, + "step": 2334 + }, + { + "epoch": 0.12766015608982684, + "grad_norm": 1.4930238723754883, + "learning_rate": 1.974560631138467e-05, + "loss": 1.4108, + "step": 2335 + }, + { + "epoch": 0.12771482853354837, + "grad_norm": 1.5614384412765503, + "learning_rate": 1.9745196594011156e-05, + "loss": 1.5723, + "step": 2336 + }, + { + "epoch": 0.12776950097726994, + "grad_norm": 1.3772392272949219, + "learning_rate": 1.9744786551222658e-05, + "loss": 1.6685, + "step": 2337 + }, + { + "epoch": 0.12782417342099148, + "grad_norm": 1.2275549173355103, + "learning_rate": 1.9744376183032874e-05, + "loss": 1.6315, + "step": 2338 + }, + { + "epoch": 0.12787884586471304, + "grad_norm": 2.2692036628723145, + "learning_rate": 1.9743965489455505e-05, + "loss": 1.651, + "step": 2339 + }, + { + "epoch": 0.12793351830843458, + "grad_norm": 1.413353443145752, + "learning_rate": 1.974355447050426e-05, + "loss": 1.3008, + "step": 2340 + }, + { + "epoch": 0.12798819075215614, + "grad_norm": 1.3652989864349365, + "learning_rate": 1.9743143126192868e-05, + "loss": 1.4645, + "step": 2341 + }, + { + "epoch": 0.1280428631958777, + "grad_norm": 1.3510493040084839, + "learning_rate": 1.9742731456535066e-05, + "loss": 1.276, + "step": 2342 + }, + { + "epoch": 0.12809753563959925, + "grad_norm": 1.5686192512512207, + "learning_rate": 1.9742319461544598e-05, + "loss": 1.3057, + "step": 2343 + }, + { + "epoch": 0.1281522080833208, + "grad_norm": 1.1761547327041626, + "learning_rate": 1.974190714123522e-05, + "loss": 1.4471, + "step": 2344 + }, + { + "epoch": 0.12820688052704235, + "grad_norm": 1.6911592483520508, + "learning_rate": 1.9741494495620703e-05, + "loss": 1.7151, + "step": 2345 + }, + { + "epoch": 0.12826155297076391, + "grad_norm": 1.6486530303955078, + "learning_rate": 1.9741081524714828e-05, + "loss": 1.5461, + "step": 2346 + }, + { + "epoch": 0.12831622541448545, + "grad_norm": 1.7570996284484863, + "learning_rate": 1.9740668228531378e-05, + "loss": 1.6034, + "step": 2347 + }, + { + "epoch": 0.12837089785820702, + "grad_norm": 1.7614108324050903, + "learning_rate": 1.9740254607084165e-05, + "loss": 1.5422, + "step": 2348 + }, + { + "epoch": 0.12842557030192858, + "grad_norm": 1.911727786064148, + "learning_rate": 1.9739840660386987e-05, + "loss": 1.2741, + "step": 2349 + }, + { + "epoch": 0.12848024274565012, + "grad_norm": 1.4621410369873047, + "learning_rate": 1.973942638845368e-05, + "loss": 1.3992, + "step": 2350 + }, + { + "epoch": 0.12853491518937168, + "grad_norm": 1.5376160144805908, + "learning_rate": 1.9739011791298073e-05, + "loss": 1.4572, + "step": 2351 + }, + { + "epoch": 0.12858958763309322, + "grad_norm": 1.7065576314926147, + "learning_rate": 1.9738596868934007e-05, + "loss": 1.4508, + "step": 2352 + }, + { + "epoch": 0.1286442600768148, + "grad_norm": 1.7906171083450317, + "learning_rate": 1.9738181621375335e-05, + "loss": 1.522, + "step": 2353 + }, + { + "epoch": 0.12869893252053632, + "grad_norm": 1.7115626335144043, + "learning_rate": 1.973776604863593e-05, + "loss": 1.5143, + "step": 2354 + }, + { + "epoch": 0.1287536049642579, + "grad_norm": 1.5433955192565918, + "learning_rate": 1.9737350150729667e-05, + "loss": 1.3952, + "step": 2355 + }, + { + "epoch": 0.12880827740797945, + "grad_norm": 1.8932127952575684, + "learning_rate": 1.9736933927670434e-05, + "loss": 1.6832, + "step": 2356 + }, + { + "epoch": 0.128862949851701, + "grad_norm": 1.615719199180603, + "learning_rate": 1.9736517379472125e-05, + "loss": 1.6143, + "step": 2357 + }, + { + "epoch": 0.12891762229542256, + "grad_norm": 1.2895398139953613, + "learning_rate": 1.9736100506148657e-05, + "loss": 1.5545, + "step": 2358 + }, + { + "epoch": 0.1289722947391441, + "grad_norm": 1.1707091331481934, + "learning_rate": 1.9735683307713946e-05, + "loss": 1.5573, + "step": 2359 + }, + { + "epoch": 0.12902696718286566, + "grad_norm": 1.8587101697921753, + "learning_rate": 1.973526578418192e-05, + "loss": 1.4509, + "step": 2360 + }, + { + "epoch": 0.1290816396265872, + "grad_norm": 1.242002248764038, + "learning_rate": 1.973484793556653e-05, + "loss": 1.4317, + "step": 2361 + }, + { + "epoch": 0.12913631207030876, + "grad_norm": 1.9837855100631714, + "learning_rate": 1.9734429761881723e-05, + "loss": 1.8011, + "step": 2362 + }, + { + "epoch": 0.12919098451403033, + "grad_norm": 1.1803640127182007, + "learning_rate": 1.9734011263141462e-05, + "loss": 1.5881, + "step": 2363 + }, + { + "epoch": 0.12924565695775186, + "grad_norm": 1.6587331295013428, + "learning_rate": 1.9733592439359722e-05, + "loss": 1.4051, + "step": 2364 + }, + { + "epoch": 0.12930032940147343, + "grad_norm": 1.6957975625991821, + "learning_rate": 1.9733173290550494e-05, + "loss": 1.5831, + "step": 2365 + }, + { + "epoch": 0.12935500184519497, + "grad_norm": 1.5004810094833374, + "learning_rate": 1.973275381672777e-05, + "loss": 1.6045, + "step": 2366 + }, + { + "epoch": 0.12940967428891653, + "grad_norm": 1.9574795961380005, + "learning_rate": 1.9732334017905555e-05, + "loss": 1.4811, + "step": 2367 + }, + { + "epoch": 0.12946434673263807, + "grad_norm": 1.3405097723007202, + "learning_rate": 1.973191389409787e-05, + "loss": 1.5337, + "step": 2368 + }, + { + "epoch": 0.12951901917635963, + "grad_norm": 1.3333072662353516, + "learning_rate": 1.9731493445318742e-05, + "loss": 1.5579, + "step": 2369 + }, + { + "epoch": 0.1295736916200812, + "grad_norm": 2.417630434036255, + "learning_rate": 1.9731072671582214e-05, + "loss": 1.4084, + "step": 2370 + }, + { + "epoch": 0.12962836406380274, + "grad_norm": 1.61158287525177, + "learning_rate": 1.9730651572902335e-05, + "loss": 1.5378, + "step": 2371 + }, + { + "epoch": 0.1296830365075243, + "grad_norm": 1.8383492231369019, + "learning_rate": 1.9730230149293167e-05, + "loss": 1.5567, + "step": 2372 + }, + { + "epoch": 0.12973770895124584, + "grad_norm": 1.698169231414795, + "learning_rate": 1.972980840076878e-05, + "loss": 1.3291, + "step": 2373 + }, + { + "epoch": 0.1297923813949674, + "grad_norm": 1.9084296226501465, + "learning_rate": 1.9729386327343258e-05, + "loss": 1.5016, + "step": 2374 + }, + { + "epoch": 0.12984705383868894, + "grad_norm": 1.728013515472412, + "learning_rate": 1.97289639290307e-05, + "loss": 1.3846, + "step": 2375 + }, + { + "epoch": 0.1299017262824105, + "grad_norm": 1.2886641025543213, + "learning_rate": 1.9728541205845203e-05, + "loss": 1.2537, + "step": 2376 + }, + { + "epoch": 0.12995639872613207, + "grad_norm": 1.520508885383606, + "learning_rate": 1.9728118157800887e-05, + "loss": 1.5043, + "step": 2377 + }, + { + "epoch": 0.1300110711698536, + "grad_norm": 1.5385642051696777, + "learning_rate": 1.9727694784911877e-05, + "loss": 1.333, + "step": 2378 + }, + { + "epoch": 0.13006574361357517, + "grad_norm": 1.5676707029342651, + "learning_rate": 1.9727271087192312e-05, + "loss": 1.614, + "step": 2379 + }, + { + "epoch": 0.1301204160572967, + "grad_norm": 2.0250372886657715, + "learning_rate": 1.972684706465634e-05, + "loss": 1.6018, + "step": 2380 + }, + { + "epoch": 0.13017508850101828, + "grad_norm": 1.7269335985183716, + "learning_rate": 1.972642271731812e-05, + "loss": 1.4581, + "step": 2381 + }, + { + "epoch": 0.13022976094473981, + "grad_norm": 1.8097134828567505, + "learning_rate": 1.9725998045191822e-05, + "loss": 1.4836, + "step": 2382 + }, + { + "epoch": 0.13028443338846138, + "grad_norm": 1.2579282522201538, + "learning_rate": 1.9725573048291628e-05, + "loss": 1.5641, + "step": 2383 + }, + { + "epoch": 0.13033910583218294, + "grad_norm": 1.3651913404464722, + "learning_rate": 1.9725147726631724e-05, + "loss": 1.5846, + "step": 2384 + }, + { + "epoch": 0.13039377827590448, + "grad_norm": 1.6727772951126099, + "learning_rate": 1.972472208022632e-05, + "loss": 1.3352, + "step": 2385 + }, + { + "epoch": 0.13044845071962605, + "grad_norm": 1.7602049112319946, + "learning_rate": 1.9724296109089623e-05, + "loss": 1.4236, + "step": 2386 + }, + { + "epoch": 0.13050312316334758, + "grad_norm": 1.7678099870681763, + "learning_rate": 1.9723869813235863e-05, + "loss": 1.622, + "step": 2387 + }, + { + "epoch": 0.13055779560706915, + "grad_norm": 1.6225003004074097, + "learning_rate": 1.972344319267927e-05, + "loss": 1.8108, + "step": 2388 + }, + { + "epoch": 0.1306124680507907, + "grad_norm": 1.4779572486877441, + "learning_rate": 1.9723016247434093e-05, + "loss": 1.3465, + "step": 2389 + }, + { + "epoch": 0.13066714049451225, + "grad_norm": 1.4158377647399902, + "learning_rate": 1.972258897751459e-05, + "loss": 1.6631, + "step": 2390 + }, + { + "epoch": 0.13072181293823382, + "grad_norm": 1.3841723203659058, + "learning_rate": 1.9722161382935022e-05, + "loss": 1.4873, + "step": 2391 + }, + { + "epoch": 0.13077648538195535, + "grad_norm": 1.5834232568740845, + "learning_rate": 1.9721733463709673e-05, + "loss": 1.4617, + "step": 2392 + }, + { + "epoch": 0.13083115782567692, + "grad_norm": 1.456494927406311, + "learning_rate": 1.9721305219852833e-05, + "loss": 1.2717, + "step": 2393 + }, + { + "epoch": 0.13088583026939846, + "grad_norm": 1.5943751335144043, + "learning_rate": 1.9720876651378796e-05, + "loss": 1.5294, + "step": 2394 + }, + { + "epoch": 0.13094050271312002, + "grad_norm": 1.7259620428085327, + "learning_rate": 1.9720447758301882e-05, + "loss": 1.6115, + "step": 2395 + }, + { + "epoch": 0.1309951751568416, + "grad_norm": 1.879896640777588, + "learning_rate": 1.9720018540636404e-05, + "loss": 1.4705, + "step": 2396 + }, + { + "epoch": 0.13104984760056312, + "grad_norm": 1.5724061727523804, + "learning_rate": 1.97195889983967e-05, + "loss": 1.4555, + "step": 2397 + }, + { + "epoch": 0.1311045200442847, + "grad_norm": 1.8851159811019897, + "learning_rate": 1.9719159131597113e-05, + "loss": 1.3998, + "step": 2398 + }, + { + "epoch": 0.13115919248800623, + "grad_norm": 1.3653334379196167, + "learning_rate": 1.971872894025199e-05, + "loss": 1.3033, + "step": 2399 + }, + { + "epoch": 0.1312138649317278, + "grad_norm": 1.383117437362671, + "learning_rate": 1.971829842437571e-05, + "loss": 1.3459, + "step": 2400 + }, + { + "epoch": 0.13126853737544933, + "grad_norm": 1.586778998374939, + "learning_rate": 1.9717867583982637e-05, + "loss": 1.2886, + "step": 2401 + }, + { + "epoch": 0.1313232098191709, + "grad_norm": 2.174294948577881, + "learning_rate": 1.971743641908716e-05, + "loss": 1.3607, + "step": 2402 + }, + { + "epoch": 0.13137788226289246, + "grad_norm": 1.4748185873031616, + "learning_rate": 1.9717004929703677e-05, + "loss": 1.2695, + "step": 2403 + }, + { + "epoch": 0.131432554706614, + "grad_norm": 1.2981653213500977, + "learning_rate": 1.9716573115846602e-05, + "loss": 1.4143, + "step": 2404 + }, + { + "epoch": 0.13148722715033556, + "grad_norm": 1.7443220615386963, + "learning_rate": 1.971614097753035e-05, + "loss": 1.542, + "step": 2405 + }, + { + "epoch": 0.1315418995940571, + "grad_norm": 1.8465999364852905, + "learning_rate": 1.9715708514769346e-05, + "loss": 1.4647, + "step": 2406 + }, + { + "epoch": 0.13159657203777866, + "grad_norm": 1.6275655031204224, + "learning_rate": 1.971527572757804e-05, + "loss": 1.481, + "step": 2407 + }, + { + "epoch": 0.1316512444815002, + "grad_norm": 1.7073590755462646, + "learning_rate": 1.9714842615970878e-05, + "loss": 1.2986, + "step": 2408 + }, + { + "epoch": 0.13170591692522177, + "grad_norm": 1.3050272464752197, + "learning_rate": 1.9714409179962328e-05, + "loss": 1.4423, + "step": 2409 + }, + { + "epoch": 0.13176058936894333, + "grad_norm": 1.4541682004928589, + "learning_rate": 1.971397541956686e-05, + "loss": 1.4456, + "step": 2410 + }, + { + "epoch": 0.13181526181266487, + "grad_norm": 1.625257134437561, + "learning_rate": 1.9713541334798957e-05, + "loss": 1.5646, + "step": 2411 + }, + { + "epoch": 0.13186993425638643, + "grad_norm": 1.2107633352279663, + "learning_rate": 1.971310692567311e-05, + "loss": 1.5604, + "step": 2412 + }, + { + "epoch": 0.13192460670010797, + "grad_norm": 1.5664401054382324, + "learning_rate": 1.9712672192203836e-05, + "loss": 1.3167, + "step": 2413 + }, + { + "epoch": 0.13197927914382954, + "grad_norm": 1.251358985900879, + "learning_rate": 1.971223713440564e-05, + "loss": 1.7648, + "step": 2414 + }, + { + "epoch": 0.13203395158755107, + "grad_norm": 1.677809476852417, + "learning_rate": 1.9711801752293064e-05, + "loss": 1.6546, + "step": 2415 + }, + { + "epoch": 0.13208862403127264, + "grad_norm": 1.3358677625656128, + "learning_rate": 1.9711366045880633e-05, + "loss": 1.5434, + "step": 2416 + }, + { + "epoch": 0.1321432964749942, + "grad_norm": 1.745737910270691, + "learning_rate": 1.9710930015182903e-05, + "loss": 1.5829, + "step": 2417 + }, + { + "epoch": 0.13219796891871574, + "grad_norm": 1.5190775394439697, + "learning_rate": 1.9710493660214435e-05, + "loss": 1.3679, + "step": 2418 + }, + { + "epoch": 0.1322526413624373, + "grad_norm": 1.749758005142212, + "learning_rate": 1.971005698098979e-05, + "loss": 1.2607, + "step": 2419 + }, + { + "epoch": 0.13230731380615884, + "grad_norm": 1.2020530700683594, + "learning_rate": 1.9709619977523564e-05, + "loss": 1.437, + "step": 2420 + }, + { + "epoch": 0.1323619862498804, + "grad_norm": 1.6016976833343506, + "learning_rate": 1.970918264983034e-05, + "loss": 1.4011, + "step": 2421 + }, + { + "epoch": 0.13241665869360195, + "grad_norm": 1.8088730573654175, + "learning_rate": 1.970874499792472e-05, + "loss": 1.6162, + "step": 2422 + }, + { + "epoch": 0.1324713311373235, + "grad_norm": 1.7383404970169067, + "learning_rate": 1.970830702182133e-05, + "loss": 1.3034, + "step": 2423 + }, + { + "epoch": 0.13252600358104508, + "grad_norm": 1.4187681674957275, + "learning_rate": 1.970786872153478e-05, + "loss": 1.7168, + "step": 2424 + }, + { + "epoch": 0.1325806760247666, + "grad_norm": 1.9748449325561523, + "learning_rate": 1.970743009707972e-05, + "loss": 1.4893, + "step": 2425 + }, + { + "epoch": 0.13263534846848818, + "grad_norm": 1.6270489692687988, + "learning_rate": 1.9706991148470783e-05, + "loss": 1.1868, + "step": 2426 + }, + { + "epoch": 0.13269002091220972, + "grad_norm": 1.5515056848526, + "learning_rate": 1.970655187572264e-05, + "loss": 1.53, + "step": 2427 + }, + { + "epoch": 0.13274469335593128, + "grad_norm": 1.5495015382766724, + "learning_rate": 1.970611227884995e-05, + "loss": 1.3336, + "step": 2428 + }, + { + "epoch": 0.13279936579965282, + "grad_norm": 1.235771894454956, + "learning_rate": 1.9705672357867392e-05, + "loss": 1.7081, + "step": 2429 + }, + { + "epoch": 0.13285403824337438, + "grad_norm": 1.6889926195144653, + "learning_rate": 1.970523211278966e-05, + "loss": 1.6053, + "step": 2430 + }, + { + "epoch": 0.13290871068709595, + "grad_norm": 1.500515103340149, + "learning_rate": 1.9704791543631455e-05, + "loss": 1.4686, + "step": 2431 + }, + { + "epoch": 0.1329633831308175, + "grad_norm": 1.4784671068191528, + "learning_rate": 1.970435065040749e-05, + "loss": 1.4224, + "step": 2432 + }, + { + "epoch": 0.13301805557453905, + "grad_norm": 1.5484589338302612, + "learning_rate": 1.9703909433132484e-05, + "loss": 1.478, + "step": 2433 + }, + { + "epoch": 0.1330727280182606, + "grad_norm": 1.4533621072769165, + "learning_rate": 1.9703467891821165e-05, + "loss": 1.4361, + "step": 2434 + }, + { + "epoch": 0.13312740046198215, + "grad_norm": 1.4972054958343506, + "learning_rate": 1.9703026026488288e-05, + "loss": 1.4903, + "step": 2435 + }, + { + "epoch": 0.1331820729057037, + "grad_norm": 1.4949742555618286, + "learning_rate": 1.9702583837148605e-05, + "loss": 1.5926, + "step": 2436 + }, + { + "epoch": 0.13323674534942526, + "grad_norm": 1.446578860282898, + "learning_rate": 1.9702141323816875e-05, + "loss": 1.3009, + "step": 2437 + }, + { + "epoch": 0.13329141779314682, + "grad_norm": 2.2195065021514893, + "learning_rate": 1.9701698486507884e-05, + "loss": 1.3067, + "step": 2438 + }, + { + "epoch": 0.13334609023686836, + "grad_norm": 1.6031310558319092, + "learning_rate": 1.970125532523641e-05, + "loss": 1.3964, + "step": 2439 + }, + { + "epoch": 0.13340076268058992, + "grad_norm": 1.6002211570739746, + "learning_rate": 1.970081184001726e-05, + "loss": 1.2769, + "step": 2440 + }, + { + "epoch": 0.13345543512431146, + "grad_norm": 1.5712929964065552, + "learning_rate": 1.9700368030865235e-05, + "loss": 1.507, + "step": 2441 + }, + { + "epoch": 0.13351010756803303, + "grad_norm": 1.5363794565200806, + "learning_rate": 1.9699923897795165e-05, + "loss": 1.3837, + "step": 2442 + }, + { + "epoch": 0.13356478001175456, + "grad_norm": 1.5227073431015015, + "learning_rate": 1.9699479440821868e-05, + "loss": 1.4779, + "step": 2443 + }, + { + "epoch": 0.13361945245547613, + "grad_norm": 1.4529964923858643, + "learning_rate": 1.9699034659960197e-05, + "loss": 1.6217, + "step": 2444 + }, + { + "epoch": 0.1336741248991977, + "grad_norm": 1.408094048500061, + "learning_rate": 1.9698589555224992e-05, + "loss": 1.6283, + "step": 2445 + }, + { + "epoch": 0.13372879734291923, + "grad_norm": 2.0162322521209717, + "learning_rate": 1.969814412663113e-05, + "loss": 1.3807, + "step": 2446 + }, + { + "epoch": 0.1337834697866408, + "grad_norm": 2.02569842338562, + "learning_rate": 1.9697698374193478e-05, + "loss": 1.5516, + "step": 2447 + }, + { + "epoch": 0.13383814223036233, + "grad_norm": 1.2646065950393677, + "learning_rate": 1.969725229792692e-05, + "loss": 1.2511, + "step": 2448 + }, + { + "epoch": 0.1338928146740839, + "grad_norm": 1.667157769203186, + "learning_rate": 1.9696805897846353e-05, + "loss": 1.5066, + "step": 2449 + }, + { + "epoch": 0.13394748711780544, + "grad_norm": 1.4122557640075684, + "learning_rate": 1.969635917396668e-05, + "loss": 1.4337, + "step": 2450 + }, + { + "epoch": 0.134002159561527, + "grad_norm": 1.3789833784103394, + "learning_rate": 1.9695912126302823e-05, + "loss": 1.4646, + "step": 2451 + }, + { + "epoch": 0.13405683200524857, + "grad_norm": 1.7729754447937012, + "learning_rate": 1.9695464754869708e-05, + "loss": 1.4389, + "step": 2452 + }, + { + "epoch": 0.1341115044489701, + "grad_norm": 1.5463348627090454, + "learning_rate": 1.9695017059682274e-05, + "loss": 1.6084, + "step": 2453 + }, + { + "epoch": 0.13416617689269167, + "grad_norm": 1.5131789445877075, + "learning_rate": 1.9694569040755466e-05, + "loss": 1.4493, + "step": 2454 + }, + { + "epoch": 0.1342208493364132, + "grad_norm": 1.5923134088516235, + "learning_rate": 1.9694120698104253e-05, + "loss": 1.4736, + "step": 2455 + }, + { + "epoch": 0.13427552178013477, + "grad_norm": 1.2669020891189575, + "learning_rate": 1.9693672031743604e-05, + "loss": 1.756, + "step": 2456 + }, + { + "epoch": 0.1343301942238563, + "grad_norm": 1.6944218873977661, + "learning_rate": 1.9693223041688494e-05, + "loss": 1.5185, + "step": 2457 + }, + { + "epoch": 0.13438486666757787, + "grad_norm": 4.288828372955322, + "learning_rate": 1.9692773727953923e-05, + "loss": 1.3784, + "step": 2458 + }, + { + "epoch": 0.13443953911129944, + "grad_norm": 2.4448297023773193, + "learning_rate": 1.9692324090554893e-05, + "loss": 1.2598, + "step": 2459 + }, + { + "epoch": 0.13449421155502098, + "grad_norm": 2.510939121246338, + "learning_rate": 1.9691874129506417e-05, + "loss": 1.436, + "step": 2460 + }, + { + "epoch": 0.13454888399874254, + "grad_norm": 1.6901780366897583, + "learning_rate": 1.969142384482352e-05, + "loss": 1.4006, + "step": 2461 + }, + { + "epoch": 0.13460355644246408, + "grad_norm": 2.249464988708496, + "learning_rate": 1.969097323652124e-05, + "loss": 1.5678, + "step": 2462 + }, + { + "epoch": 0.13465822888618564, + "grad_norm": 1.586021065711975, + "learning_rate": 1.9690522304614624e-05, + "loss": 1.1411, + "step": 2463 + }, + { + "epoch": 0.13471290132990718, + "grad_norm": 1.535662055015564, + "learning_rate": 1.969007104911873e-05, + "loss": 1.2463, + "step": 2464 + }, + { + "epoch": 0.13476757377362875, + "grad_norm": 1.770341396331787, + "learning_rate": 1.9689619470048626e-05, + "loss": 1.6105, + "step": 2465 + }, + { + "epoch": 0.1348222462173503, + "grad_norm": 1.5801540613174438, + "learning_rate": 1.9689167567419386e-05, + "loss": 1.4419, + "step": 2466 + }, + { + "epoch": 0.13487691866107185, + "grad_norm": 1.5491560697555542, + "learning_rate": 1.9688715341246106e-05, + "loss": 1.5003, + "step": 2467 + }, + { + "epoch": 0.1349315911047934, + "grad_norm": 1.2343357801437378, + "learning_rate": 1.9688262791543885e-05, + "loss": 1.5294, + "step": 2468 + }, + { + "epoch": 0.13498626354851495, + "grad_norm": 1.625324010848999, + "learning_rate": 1.968780991832784e-05, + "loss": 1.3504, + "step": 2469 + }, + { + "epoch": 0.13504093599223652, + "grad_norm": 1.4264780282974243, + "learning_rate": 1.9687356721613084e-05, + "loss": 1.5077, + "step": 2470 + }, + { + "epoch": 0.13509560843595805, + "grad_norm": 1.6219781637191772, + "learning_rate": 1.9686903201414753e-05, + "loss": 1.5752, + "step": 2471 + }, + { + "epoch": 0.13515028087967962, + "grad_norm": 1.6524996757507324, + "learning_rate": 1.9686449357747996e-05, + "loss": 1.5684, + "step": 2472 + }, + { + "epoch": 0.13520495332340118, + "grad_norm": 2.048863410949707, + "learning_rate": 1.9685995190627967e-05, + "loss": 1.6382, + "step": 2473 + }, + { + "epoch": 0.13525962576712272, + "grad_norm": 1.7793760299682617, + "learning_rate": 1.9685540700069827e-05, + "loss": 1.3133, + "step": 2474 + }, + { + "epoch": 0.13531429821084429, + "grad_norm": 1.230262279510498, + "learning_rate": 1.9685085886088753e-05, + "loss": 1.4325, + "step": 2475 + }, + { + "epoch": 0.13536897065456582, + "grad_norm": 1.4955881834030151, + "learning_rate": 1.9684630748699937e-05, + "loss": 1.2488, + "step": 2476 + }, + { + "epoch": 0.1354236430982874, + "grad_norm": 1.9673196077346802, + "learning_rate": 1.9684175287918576e-05, + "loss": 1.3945, + "step": 2477 + }, + { + "epoch": 0.13547831554200893, + "grad_norm": 2.1387972831726074, + "learning_rate": 1.9683719503759877e-05, + "loss": 1.4932, + "step": 2478 + }, + { + "epoch": 0.1355329879857305, + "grad_norm": 1.4583258628845215, + "learning_rate": 1.968326339623906e-05, + "loss": 1.5911, + "step": 2479 + }, + { + "epoch": 0.13558766042945206, + "grad_norm": 1.629014015197754, + "learning_rate": 1.9682806965371355e-05, + "loss": 1.5513, + "step": 2480 + }, + { + "epoch": 0.1356423328731736, + "grad_norm": 1.6472762823104858, + "learning_rate": 1.9682350211172005e-05, + "loss": 1.592, + "step": 2481 + }, + { + "epoch": 0.13569700531689516, + "grad_norm": 1.5897501707077026, + "learning_rate": 1.968189313365626e-05, + "loss": 1.461, + "step": 2482 + }, + { + "epoch": 0.1357516777606167, + "grad_norm": 2.171119213104248, + "learning_rate": 1.9681435732839384e-05, + "loss": 1.5956, + "step": 2483 + }, + { + "epoch": 0.13580635020433826, + "grad_norm": 1.6529045104980469, + "learning_rate": 1.968097800873665e-05, + "loss": 1.5708, + "step": 2484 + }, + { + "epoch": 0.1358610226480598, + "grad_norm": 1.2666878700256348, + "learning_rate": 1.9680519961363345e-05, + "loss": 1.3606, + "step": 2485 + }, + { + "epoch": 0.13591569509178136, + "grad_norm": 1.4783687591552734, + "learning_rate": 1.9680061590734762e-05, + "loss": 1.3359, + "step": 2486 + }, + { + "epoch": 0.13597036753550293, + "grad_norm": 1.4059737920761108, + "learning_rate": 1.967960289686621e-05, + "loss": 1.5361, + "step": 2487 + }, + { + "epoch": 0.13602503997922447, + "grad_norm": 1.477210521697998, + "learning_rate": 1.9679143879772997e-05, + "loss": 1.6143, + "step": 2488 + }, + { + "epoch": 0.13607971242294603, + "grad_norm": 1.4045931100845337, + "learning_rate": 1.967868453947046e-05, + "loss": 1.6024, + "step": 2489 + }, + { + "epoch": 0.13613438486666757, + "grad_norm": 1.8130955696105957, + "learning_rate": 1.967822487597394e-05, + "loss": 1.4047, + "step": 2490 + }, + { + "epoch": 0.13618905731038913, + "grad_norm": 1.4782239198684692, + "learning_rate": 1.9677764889298775e-05, + "loss": 1.5163, + "step": 2491 + }, + { + "epoch": 0.13624372975411067, + "grad_norm": 1.449273705482483, + "learning_rate": 1.9677304579460328e-05, + "loss": 1.4566, + "step": 2492 + }, + { + "epoch": 0.13629840219783224, + "grad_norm": 1.330162763595581, + "learning_rate": 1.9676843946473977e-05, + "loss": 1.5233, + "step": 2493 + }, + { + "epoch": 0.1363530746415538, + "grad_norm": 1.9761053323745728, + "learning_rate": 1.9676382990355097e-05, + "loss": 1.5317, + "step": 2494 + }, + { + "epoch": 0.13640774708527534, + "grad_norm": 1.2423744201660156, + "learning_rate": 1.9675921711119087e-05, + "loss": 1.5554, + "step": 2495 + }, + { + "epoch": 0.1364624195289969, + "grad_norm": 1.426223635673523, + "learning_rate": 1.967546010878134e-05, + "loss": 1.5979, + "step": 2496 + }, + { + "epoch": 0.13651709197271844, + "grad_norm": 1.4715933799743652, + "learning_rate": 1.9674998183357278e-05, + "loss": 1.4913, + "step": 2497 + }, + { + "epoch": 0.13657176441644, + "grad_norm": 1.5319987535476685, + "learning_rate": 1.9674535934862327e-05, + "loss": 1.3364, + "step": 2498 + }, + { + "epoch": 0.13662643686016157, + "grad_norm": 1.550160527229309, + "learning_rate": 1.9674073363311918e-05, + "loss": 1.4151, + "step": 2499 + }, + { + "epoch": 0.1366811093038831, + "grad_norm": 1.585174560546875, + "learning_rate": 1.9673610468721492e-05, + "loss": 1.8128, + "step": 2500 + }, + { + "epoch": 0.13673578174760467, + "grad_norm": 1.3416951894760132, + "learning_rate": 1.967314725110652e-05, + "loss": 1.6447, + "step": 2501 + }, + { + "epoch": 0.1367904541913262, + "grad_norm": 1.5148967504501343, + "learning_rate": 1.967268371048246e-05, + "loss": 1.583, + "step": 2502 + }, + { + "epoch": 0.13684512663504778, + "grad_norm": 1.6968108415603638, + "learning_rate": 1.9672219846864794e-05, + "loss": 1.527, + "step": 2503 + }, + { + "epoch": 0.1368997990787693, + "grad_norm": 1.862839937210083, + "learning_rate": 1.9671755660269007e-05, + "loss": 1.2174, + "step": 2504 + }, + { + "epoch": 0.13695447152249088, + "grad_norm": 1.7566983699798584, + "learning_rate": 1.967129115071061e-05, + "loss": 1.7125, + "step": 2505 + }, + { + "epoch": 0.13700914396621244, + "grad_norm": 1.6157724857330322, + "learning_rate": 1.96708263182051e-05, + "loss": 1.4751, + "step": 2506 + }, + { + "epoch": 0.13706381640993398, + "grad_norm": 1.3253446817398071, + "learning_rate": 1.967036116276801e-05, + "loss": 1.3718, + "step": 2507 + }, + { + "epoch": 0.13711848885365555, + "grad_norm": 1.5624693632125854, + "learning_rate": 1.966989568441487e-05, + "loss": 1.3086, + "step": 2508 + }, + { + "epoch": 0.13717316129737708, + "grad_norm": 2.143728494644165, + "learning_rate": 1.9669429883161214e-05, + "loss": 1.3484, + "step": 2509 + }, + { + "epoch": 0.13722783374109865, + "grad_norm": 2.2018656730651855, + "learning_rate": 1.9668963759022612e-05, + "loss": 1.3915, + "step": 2510 + }, + { + "epoch": 0.13728250618482019, + "grad_norm": 1.3490291833877563, + "learning_rate": 1.9668497312014617e-05, + "loss": 1.5, + "step": 2511 + }, + { + "epoch": 0.13733717862854175, + "grad_norm": 1.164352297782898, + "learning_rate": 1.966803054215281e-05, + "loss": 1.639, + "step": 2512 + }, + { + "epoch": 0.13739185107226332, + "grad_norm": 1.3871697187423706, + "learning_rate": 1.9667563449452782e-05, + "loss": 1.3422, + "step": 2513 + }, + { + "epoch": 0.13744652351598485, + "grad_norm": 1.7848244905471802, + "learning_rate": 1.9667096033930116e-05, + "loss": 1.4991, + "step": 2514 + }, + { + "epoch": 0.13750119595970642, + "grad_norm": 1.498423457145691, + "learning_rate": 1.9666628295600433e-05, + "loss": 1.4792, + "step": 2515 + }, + { + "epoch": 0.13755586840342796, + "grad_norm": 1.6188348531723022, + "learning_rate": 1.966616023447935e-05, + "loss": 1.6106, + "step": 2516 + }, + { + "epoch": 0.13761054084714952, + "grad_norm": 1.4432159662246704, + "learning_rate": 1.9665691850582492e-05, + "loss": 1.6701, + "step": 2517 + }, + { + "epoch": 0.13766521329087106, + "grad_norm": 1.972272276878357, + "learning_rate": 1.96652231439255e-05, + "loss": 1.4006, + "step": 2518 + }, + { + "epoch": 0.13771988573459262, + "grad_norm": 1.7272000312805176, + "learning_rate": 1.966475411452403e-05, + "loss": 1.5661, + "step": 2519 + }, + { + "epoch": 0.1377745581783142, + "grad_norm": 1.5499130487442017, + "learning_rate": 1.966428476239374e-05, + "loss": 1.5341, + "step": 2520 + }, + { + "epoch": 0.13782923062203573, + "grad_norm": 1.6536346673965454, + "learning_rate": 1.9663815087550305e-05, + "loss": 1.5591, + "step": 2521 + }, + { + "epoch": 0.1378839030657573, + "grad_norm": 1.2231223583221436, + "learning_rate": 1.9663345090009406e-05, + "loss": 1.3829, + "step": 2522 + }, + { + "epoch": 0.13793857550947883, + "grad_norm": 1.6210579872131348, + "learning_rate": 1.9662874769786734e-05, + "loss": 1.2258, + "step": 2523 + }, + { + "epoch": 0.1379932479532004, + "grad_norm": 1.5581566095352173, + "learning_rate": 1.9662404126898008e-05, + "loss": 1.5517, + "step": 2524 + }, + { + "epoch": 0.13804792039692193, + "grad_norm": 1.617989420890808, + "learning_rate": 1.966193316135893e-05, + "loss": 1.3181, + "step": 2525 + }, + { + "epoch": 0.1381025928406435, + "grad_norm": 1.5575212240219116, + "learning_rate": 1.966146187318523e-05, + "loss": 1.306, + "step": 2526 + }, + { + "epoch": 0.13815726528436506, + "grad_norm": 1.3565707206726074, + "learning_rate": 1.966099026239265e-05, + "loss": 1.5427, + "step": 2527 + }, + { + "epoch": 0.1382119377280866, + "grad_norm": 1.6429399251937866, + "learning_rate": 1.9660518328996933e-05, + "loss": 1.4712, + "step": 2528 + }, + { + "epoch": 0.13826661017180816, + "grad_norm": 1.3429585695266724, + "learning_rate": 1.9660046073013838e-05, + "loss": 1.5118, + "step": 2529 + }, + { + "epoch": 0.1383212826155297, + "grad_norm": 1.513349175453186, + "learning_rate": 1.9659573494459142e-05, + "loss": 1.5904, + "step": 2530 + }, + { + "epoch": 0.13837595505925127, + "grad_norm": 1.7244385480880737, + "learning_rate": 1.9659100593348615e-05, + "loss": 1.4022, + "step": 2531 + }, + { + "epoch": 0.1384306275029728, + "grad_norm": 1.5815539360046387, + "learning_rate": 1.9658627369698052e-05, + "loss": 1.4064, + "step": 2532 + }, + { + "epoch": 0.13848529994669437, + "grad_norm": 1.9533398151397705, + "learning_rate": 1.9658153823523262e-05, + "loss": 1.5938, + "step": 2533 + }, + { + "epoch": 0.13853997239041593, + "grad_norm": 1.4767546653747559, + "learning_rate": 1.965767995484005e-05, + "loss": 1.5974, + "step": 2534 + }, + { + "epoch": 0.13859464483413747, + "grad_norm": 1.7968635559082031, + "learning_rate": 1.9657205763664244e-05, + "loss": 1.3315, + "step": 2535 + }, + { + "epoch": 0.13864931727785904, + "grad_norm": 1.3146188259124756, + "learning_rate": 1.965673125001167e-05, + "loss": 1.6369, + "step": 2536 + }, + { + "epoch": 0.13870398972158057, + "grad_norm": 1.5174239873886108, + "learning_rate": 1.9656256413898184e-05, + "loss": 1.4143, + "step": 2537 + }, + { + "epoch": 0.13875866216530214, + "grad_norm": 1.8157330751419067, + "learning_rate": 1.9655781255339638e-05, + "loss": 1.5403, + "step": 2538 + }, + { + "epoch": 0.13881333460902368, + "grad_norm": 1.238908052444458, + "learning_rate": 1.9655305774351898e-05, + "loss": 1.5668, + "step": 2539 + }, + { + "epoch": 0.13886800705274524, + "grad_norm": 1.4682608842849731, + "learning_rate": 1.9654829970950838e-05, + "loss": 1.6095, + "step": 2540 + }, + { + "epoch": 0.1389226794964668, + "grad_norm": 1.4509050846099854, + "learning_rate": 1.9654353845152352e-05, + "loss": 1.5887, + "step": 2541 + }, + { + "epoch": 0.13897735194018834, + "grad_norm": 1.4984679222106934, + "learning_rate": 1.9653877396972337e-05, + "loss": 1.6414, + "step": 2542 + }, + { + "epoch": 0.1390320243839099, + "grad_norm": 1.5927653312683105, + "learning_rate": 1.96534006264267e-05, + "loss": 1.628, + "step": 2543 + }, + { + "epoch": 0.13908669682763145, + "grad_norm": 1.7389239072799683, + "learning_rate": 1.9652923533531365e-05, + "loss": 1.31, + "step": 2544 + }, + { + "epoch": 0.139141369271353, + "grad_norm": 1.133284568786621, + "learning_rate": 1.965244611830226e-05, + "loss": 1.2758, + "step": 2545 + }, + { + "epoch": 0.13919604171507455, + "grad_norm": 1.720874547958374, + "learning_rate": 1.965196838075533e-05, + "loss": 1.3571, + "step": 2546 + }, + { + "epoch": 0.1392507141587961, + "grad_norm": 1.4532185792922974, + "learning_rate": 1.965149032090653e-05, + "loss": 1.4631, + "step": 2547 + }, + { + "epoch": 0.13930538660251768, + "grad_norm": 1.4017982482910156, + "learning_rate": 1.9651011938771815e-05, + "loss": 1.3405, + "step": 2548 + }, + { + "epoch": 0.13936005904623922, + "grad_norm": 1.4047292470932007, + "learning_rate": 1.9650533234367167e-05, + "loss": 1.5103, + "step": 2549 + }, + { + "epoch": 0.13941473148996078, + "grad_norm": 1.6174856424331665, + "learning_rate": 1.965005420770857e-05, + "loss": 1.4548, + "step": 2550 + }, + { + "epoch": 0.13946940393368232, + "grad_norm": 1.5584542751312256, + "learning_rate": 1.9649574858812016e-05, + "loss": 1.5254, + "step": 2551 + }, + { + "epoch": 0.13952407637740388, + "grad_norm": 1.5859335660934448, + "learning_rate": 1.9649095187693517e-05, + "loss": 1.5406, + "step": 2552 + }, + { + "epoch": 0.13957874882112542, + "grad_norm": 1.2523438930511475, + "learning_rate": 1.9648615194369082e-05, + "loss": 1.3891, + "step": 2553 + }, + { + "epoch": 0.13963342126484699, + "grad_norm": 1.774501085281372, + "learning_rate": 1.9648134878854747e-05, + "loss": 1.1961, + "step": 2554 + }, + { + "epoch": 0.13968809370856855, + "grad_norm": 2.07779598236084, + "learning_rate": 1.9647654241166552e-05, + "loss": 1.716, + "step": 2555 + }, + { + "epoch": 0.1397427661522901, + "grad_norm": 1.2971959114074707, + "learning_rate": 1.964717328132054e-05, + "loss": 1.5748, + "step": 2556 + }, + { + "epoch": 0.13979743859601165, + "grad_norm": 1.6197590827941895, + "learning_rate": 1.9646691999332773e-05, + "loss": 1.5171, + "step": 2557 + }, + { + "epoch": 0.1398521110397332, + "grad_norm": 1.3771419525146484, + "learning_rate": 1.9646210395219328e-05, + "loss": 1.4716, + "step": 2558 + }, + { + "epoch": 0.13990678348345476, + "grad_norm": 1.3423523902893066, + "learning_rate": 1.964572846899628e-05, + "loss": 1.5979, + "step": 2559 + }, + { + "epoch": 0.1399614559271763, + "grad_norm": 1.6348894834518433, + "learning_rate": 1.9645246220679722e-05, + "loss": 1.4788, + "step": 2560 + }, + { + "epoch": 0.14001612837089786, + "grad_norm": 2.9920177459716797, + "learning_rate": 1.9644763650285758e-05, + "loss": 1.3721, + "step": 2561 + }, + { + "epoch": 0.14007080081461942, + "grad_norm": 1.3813997507095337, + "learning_rate": 1.9644280757830508e-05, + "loss": 1.6444, + "step": 2562 + }, + { + "epoch": 0.14012547325834096, + "grad_norm": 1.6872905492782593, + "learning_rate": 1.9643797543330092e-05, + "loss": 1.629, + "step": 2563 + }, + { + "epoch": 0.14018014570206253, + "grad_norm": 1.3987150192260742, + "learning_rate": 1.9643314006800645e-05, + "loss": 1.6608, + "step": 2564 + }, + { + "epoch": 0.14023481814578406, + "grad_norm": 2.056896448135376, + "learning_rate": 1.9642830148258314e-05, + "loss": 1.3895, + "step": 2565 + }, + { + "epoch": 0.14028949058950563, + "grad_norm": 1.5547384023666382, + "learning_rate": 1.9642345967719255e-05, + "loss": 1.5676, + "step": 2566 + }, + { + "epoch": 0.14034416303322717, + "grad_norm": 1.6145727634429932, + "learning_rate": 1.964186146519964e-05, + "loss": 1.4557, + "step": 2567 + }, + { + "epoch": 0.14039883547694873, + "grad_norm": 1.5535345077514648, + "learning_rate": 1.9641376640715646e-05, + "loss": 1.489, + "step": 2568 + }, + { + "epoch": 0.1404535079206703, + "grad_norm": 1.2452045679092407, + "learning_rate": 1.9640891494283463e-05, + "loss": 1.7858, + "step": 2569 + }, + { + "epoch": 0.14050818036439183, + "grad_norm": 1.5502755641937256, + "learning_rate": 1.964040602591929e-05, + "loss": 1.3303, + "step": 2570 + }, + { + "epoch": 0.1405628528081134, + "grad_norm": 1.8261324167251587, + "learning_rate": 1.9639920235639334e-05, + "loss": 1.6561, + "step": 2571 + }, + { + "epoch": 0.14061752525183494, + "grad_norm": 1.4370843172073364, + "learning_rate": 1.9639434123459823e-05, + "loss": 1.7043, + "step": 2572 + }, + { + "epoch": 0.1406721976955565, + "grad_norm": 1.7985284328460693, + "learning_rate": 1.9638947689396986e-05, + "loss": 1.6688, + "step": 2573 + }, + { + "epoch": 0.14072687013927804, + "grad_norm": 1.560604214668274, + "learning_rate": 1.9638460933467068e-05, + "loss": 1.359, + "step": 2574 + }, + { + "epoch": 0.1407815425829996, + "grad_norm": 2.903599739074707, + "learning_rate": 1.963797385568632e-05, + "loss": 1.5952, + "step": 2575 + }, + { + "epoch": 0.14083621502672117, + "grad_norm": 1.2607558965682983, + "learning_rate": 1.963748645607101e-05, + "loss": 1.4517, + "step": 2576 + }, + { + "epoch": 0.1408908874704427, + "grad_norm": 1.8362467288970947, + "learning_rate": 1.9636998734637414e-05, + "loss": 1.5238, + "step": 2577 + }, + { + "epoch": 0.14094555991416427, + "grad_norm": 2.395655870437622, + "learning_rate": 1.9636510691401812e-05, + "loss": 1.4949, + "step": 2578 + }, + { + "epoch": 0.1410002323578858, + "grad_norm": 1.6892602443695068, + "learning_rate": 1.963602232638051e-05, + "loss": 1.3536, + "step": 2579 + }, + { + "epoch": 0.14105490480160737, + "grad_norm": 1.5761607885360718, + "learning_rate": 1.963553363958981e-05, + "loss": 1.5843, + "step": 2580 + }, + { + "epoch": 0.1411095772453289, + "grad_norm": 1.5310815572738647, + "learning_rate": 1.963504463104603e-05, + "loss": 1.6745, + "step": 2581 + }, + { + "epoch": 0.14116424968905047, + "grad_norm": 1.8874359130859375, + "learning_rate": 1.96345553007655e-05, + "loss": 1.3073, + "step": 2582 + }, + { + "epoch": 0.14121892213277204, + "grad_norm": 1.088392972946167, + "learning_rate": 1.963406564876456e-05, + "loss": 1.5841, + "step": 2583 + }, + { + "epoch": 0.14127359457649358, + "grad_norm": 1.5567879676818848, + "learning_rate": 1.9633575675059563e-05, + "loss": 1.414, + "step": 2584 + }, + { + "epoch": 0.14132826702021514, + "grad_norm": 1.5623087882995605, + "learning_rate": 1.9633085379666868e-05, + "loss": 1.2179, + "step": 2585 + }, + { + "epoch": 0.14138293946393668, + "grad_norm": 1.7593059539794922, + "learning_rate": 1.9632594762602847e-05, + "loss": 1.5437, + "step": 2586 + }, + { + "epoch": 0.14143761190765824, + "grad_norm": 1.370698094367981, + "learning_rate": 1.9632103823883882e-05, + "loss": 1.3787, + "step": 2587 + }, + { + "epoch": 0.14149228435137978, + "grad_norm": 1.4341905117034912, + "learning_rate": 1.963161256352637e-05, + "loss": 1.5684, + "step": 2588 + }, + { + "epoch": 0.14154695679510135, + "grad_norm": 1.256774663925171, + "learning_rate": 1.9631120981546713e-05, + "loss": 1.3134, + "step": 2589 + }, + { + "epoch": 0.1416016292388229, + "grad_norm": 1.569602370262146, + "learning_rate": 1.9630629077961327e-05, + "loss": 1.4458, + "step": 2590 + }, + { + "epoch": 0.14165630168254445, + "grad_norm": 1.6322277784347534, + "learning_rate": 1.963013685278663e-05, + "loss": 1.5504, + "step": 2591 + }, + { + "epoch": 0.14171097412626601, + "grad_norm": 1.4703210592269897, + "learning_rate": 1.962964430603907e-05, + "loss": 1.4238, + "step": 2592 + }, + { + "epoch": 0.14176564656998755, + "grad_norm": 1.3117592334747314, + "learning_rate": 1.9629151437735095e-05, + "loss": 1.4115, + "step": 2593 + }, + { + "epoch": 0.14182031901370912, + "grad_norm": 1.5676579475402832, + "learning_rate": 1.9628658247891154e-05, + "loss": 1.5112, + "step": 2594 + }, + { + "epoch": 0.14187499145743065, + "grad_norm": 1.3364847898483276, + "learning_rate": 1.9628164736523717e-05, + "loss": 1.6508, + "step": 2595 + }, + { + "epoch": 0.14192966390115222, + "grad_norm": 1.3091844320297241, + "learning_rate": 1.9627670903649273e-05, + "loss": 1.3973, + "step": 2596 + }, + { + "epoch": 0.14198433634487378, + "grad_norm": 1.4540406465530396, + "learning_rate": 1.96271767492843e-05, + "loss": 1.6486, + "step": 2597 + }, + { + "epoch": 0.14203900878859532, + "grad_norm": 1.385706901550293, + "learning_rate": 1.962668227344531e-05, + "loss": 1.1812, + "step": 2598 + }, + { + "epoch": 0.1420936812323169, + "grad_norm": 1.24259352684021, + "learning_rate": 1.96261874761488e-05, + "loss": 1.3709, + "step": 2599 + }, + { + "epoch": 0.14214835367603842, + "grad_norm": 1.4490299224853516, + "learning_rate": 1.962569235741131e-05, + "loss": 1.3172, + "step": 2600 + }, + { + "epoch": 0.14220302611976, + "grad_norm": 1.5021156072616577, + "learning_rate": 1.9625196917249362e-05, + "loss": 1.5801, + "step": 2601 + }, + { + "epoch": 0.14225769856348155, + "grad_norm": 1.3817505836486816, + "learning_rate": 1.96247011556795e-05, + "loss": 1.4621, + "step": 2602 + }, + { + "epoch": 0.1423123710072031, + "grad_norm": 1.835261583328247, + "learning_rate": 1.9624205072718285e-05, + "loss": 1.5575, + "step": 2603 + }, + { + "epoch": 0.14236704345092466, + "grad_norm": 1.785698652267456, + "learning_rate": 1.9623708668382276e-05, + "loss": 1.5295, + "step": 2604 + }, + { + "epoch": 0.1424217158946462, + "grad_norm": 1.7299842834472656, + "learning_rate": 1.9623211942688055e-05, + "loss": 1.5828, + "step": 2605 + }, + { + "epoch": 0.14247638833836776, + "grad_norm": 1.6030619144439697, + "learning_rate": 1.9622714895652204e-05, + "loss": 1.718, + "step": 2606 + }, + { + "epoch": 0.1425310607820893, + "grad_norm": 1.2383472919464111, + "learning_rate": 1.962221752729132e-05, + "loss": 1.5637, + "step": 2607 + }, + { + "epoch": 0.14258573322581086, + "grad_norm": 1.409923791885376, + "learning_rate": 1.962171983762202e-05, + "loss": 1.6907, + "step": 2608 + }, + { + "epoch": 0.14264040566953243, + "grad_norm": 1.7362446784973145, + "learning_rate": 1.962122182666091e-05, + "loss": 1.4089, + "step": 2609 + }, + { + "epoch": 0.14269507811325396, + "grad_norm": 1.4937152862548828, + "learning_rate": 1.9620723494424627e-05, + "loss": 1.2377, + "step": 2610 + }, + { + "epoch": 0.14274975055697553, + "grad_norm": 2.675489902496338, + "learning_rate": 1.9620224840929812e-05, + "loss": 1.2243, + "step": 2611 + }, + { + "epoch": 0.14280442300069707, + "grad_norm": 1.742765188217163, + "learning_rate": 1.9619725866193117e-05, + "loss": 1.3077, + "step": 2612 + }, + { + "epoch": 0.14285909544441863, + "grad_norm": 1.3001271486282349, + "learning_rate": 1.96192265702312e-05, + "loss": 1.4669, + "step": 2613 + }, + { + "epoch": 0.14291376788814017, + "grad_norm": 1.329521894454956, + "learning_rate": 1.9618726953060734e-05, + "loss": 1.8777, + "step": 2614 + }, + { + "epoch": 0.14296844033186173, + "grad_norm": 1.4634222984313965, + "learning_rate": 1.961822701469841e-05, + "loss": 1.4991, + "step": 2615 + }, + { + "epoch": 0.1430231127755833, + "grad_norm": 1.3943212032318115, + "learning_rate": 1.961772675516091e-05, + "loss": 1.5782, + "step": 2616 + }, + { + "epoch": 0.14307778521930484, + "grad_norm": 1.7820508480072021, + "learning_rate": 1.9617226174464945e-05, + "loss": 1.5938, + "step": 2617 + }, + { + "epoch": 0.1431324576630264, + "grad_norm": 1.8556417226791382, + "learning_rate": 1.9616725272627234e-05, + "loss": 1.6668, + "step": 2618 + }, + { + "epoch": 0.14318713010674794, + "grad_norm": 1.5750352144241333, + "learning_rate": 1.9616224049664495e-05, + "loss": 1.4354, + "step": 2619 + }, + { + "epoch": 0.1432418025504695, + "grad_norm": 1.7516969442367554, + "learning_rate": 1.9615722505593474e-05, + "loss": 1.5536, + "step": 2620 + }, + { + "epoch": 0.14329647499419104, + "grad_norm": 2.3303070068359375, + "learning_rate": 1.9615220640430915e-05, + "loss": 1.4416, + "step": 2621 + }, + { + "epoch": 0.1433511474379126, + "grad_norm": 1.4787119626998901, + "learning_rate": 1.9614718454193574e-05, + "loss": 1.2237, + "step": 2622 + }, + { + "epoch": 0.14340581988163417, + "grad_norm": 1.4725826978683472, + "learning_rate": 1.9614215946898224e-05, + "loss": 1.4059, + "step": 2623 + }, + { + "epoch": 0.1434604923253557, + "grad_norm": 1.3561301231384277, + "learning_rate": 1.9613713118561638e-05, + "loss": 1.6324, + "step": 2624 + }, + { + "epoch": 0.14351516476907727, + "grad_norm": 1.4883432388305664, + "learning_rate": 1.9613209969200616e-05, + "loss": 1.6109, + "step": 2625 + }, + { + "epoch": 0.1435698372127988, + "grad_norm": 1.3427529335021973, + "learning_rate": 1.9612706498831956e-05, + "loss": 1.3348, + "step": 2626 + }, + { + "epoch": 0.14362450965652038, + "grad_norm": 1.6326285600662231, + "learning_rate": 1.961220270747247e-05, + "loss": 1.5706, + "step": 2627 + }, + { + "epoch": 0.14367918210024191, + "grad_norm": 1.4452869892120361, + "learning_rate": 1.9611698595138974e-05, + "loss": 1.6104, + "step": 2628 + }, + { + "epoch": 0.14373385454396348, + "grad_norm": 1.7107963562011719, + "learning_rate": 1.961119416184831e-05, + "loss": 1.5027, + "step": 2629 + }, + { + "epoch": 0.14378852698768504, + "grad_norm": 1.6470470428466797, + "learning_rate": 1.961068940761732e-05, + "loss": 1.4935, + "step": 2630 + }, + { + "epoch": 0.14384319943140658, + "grad_norm": 1.6189905405044556, + "learning_rate": 1.961018433246286e-05, + "loss": 1.301, + "step": 2631 + }, + { + "epoch": 0.14389787187512815, + "grad_norm": 1.1573131084442139, + "learning_rate": 1.9609678936401794e-05, + "loss": 1.5299, + "step": 2632 + }, + { + "epoch": 0.14395254431884968, + "grad_norm": 1.7932428121566772, + "learning_rate": 1.9609173219450998e-05, + "loss": 1.5001, + "step": 2633 + }, + { + "epoch": 0.14400721676257125, + "grad_norm": 1.3074805736541748, + "learning_rate": 1.9608667181627358e-05, + "loss": 1.465, + "step": 2634 + }, + { + "epoch": 0.1440618892062928, + "grad_norm": 1.0845136642456055, + "learning_rate": 1.9608160822947772e-05, + "loss": 1.6851, + "step": 2635 + }, + { + "epoch": 0.14411656165001435, + "grad_norm": 1.652076244354248, + "learning_rate": 1.9607654143429156e-05, + "loss": 1.4739, + "step": 2636 + }, + { + "epoch": 0.14417123409373592, + "grad_norm": 1.4403184652328491, + "learning_rate": 1.9607147143088418e-05, + "loss": 1.5963, + "step": 2637 + }, + { + "epoch": 0.14422590653745745, + "grad_norm": 1.5291944742202759, + "learning_rate": 1.9606639821942496e-05, + "loss": 1.5666, + "step": 2638 + }, + { + "epoch": 0.14428057898117902, + "grad_norm": 1.544158697128296, + "learning_rate": 1.9606132180008324e-05, + "loss": 1.5368, + "step": 2639 + }, + { + "epoch": 0.14433525142490056, + "grad_norm": 1.1913743019104004, + "learning_rate": 1.960562421730286e-05, + "loss": 1.4885, + "step": 2640 + }, + { + "epoch": 0.14438992386862212, + "grad_norm": 1.3541431427001953, + "learning_rate": 1.960511593384306e-05, + "loss": 1.6568, + "step": 2641 + }, + { + "epoch": 0.14444459631234366, + "grad_norm": 1.4633240699768066, + "learning_rate": 1.9604607329645905e-05, + "loss": 1.5213, + "step": 2642 + }, + { + "epoch": 0.14449926875606522, + "grad_norm": 1.627467155456543, + "learning_rate": 1.960409840472837e-05, + "loss": 1.3637, + "step": 2643 + }, + { + "epoch": 0.1445539411997868, + "grad_norm": 1.4086514711380005, + "learning_rate": 1.960358915910745e-05, + "loss": 1.4333, + "step": 2644 + }, + { + "epoch": 0.14460861364350833, + "grad_norm": 1.3991012573242188, + "learning_rate": 1.9603079592800157e-05, + "loss": 1.5728, + "step": 2645 + }, + { + "epoch": 0.1446632860872299, + "grad_norm": 1.4714275598526, + "learning_rate": 1.96025697058235e-05, + "loss": 1.4124, + "step": 2646 + }, + { + "epoch": 0.14471795853095143, + "grad_norm": 1.473479151725769, + "learning_rate": 1.9602059498194508e-05, + "loss": 1.2326, + "step": 2647 + }, + { + "epoch": 0.144772630974673, + "grad_norm": 1.5283968448638916, + "learning_rate": 1.9601548969930214e-05, + "loss": 1.2729, + "step": 2648 + }, + { + "epoch": 0.14482730341839453, + "grad_norm": 1.0894277095794678, + "learning_rate": 1.9601038121047674e-05, + "loss": 1.4516, + "step": 2649 + }, + { + "epoch": 0.1448819758621161, + "grad_norm": 1.6791291236877441, + "learning_rate": 1.960052695156394e-05, + "loss": 1.4916, + "step": 2650 + }, + { + "epoch": 0.14493664830583766, + "grad_norm": 1.9271529912948608, + "learning_rate": 1.9600015461496086e-05, + "loss": 1.5088, + "step": 2651 + }, + { + "epoch": 0.1449913207495592, + "grad_norm": 1.5724282264709473, + "learning_rate": 1.9599503650861183e-05, + "loss": 1.4689, + "step": 2652 + }, + { + "epoch": 0.14504599319328076, + "grad_norm": 1.5401198863983154, + "learning_rate": 1.9598991519676328e-05, + "loss": 1.4876, + "step": 2653 + }, + { + "epoch": 0.1451006656370023, + "grad_norm": 1.2708712816238403, + "learning_rate": 1.9598479067958624e-05, + "loss": 1.3732, + "step": 2654 + }, + { + "epoch": 0.14515533808072387, + "grad_norm": 1.3070068359375, + "learning_rate": 1.959796629572518e-05, + "loss": 1.5074, + "step": 2655 + }, + { + "epoch": 0.1452100105244454, + "grad_norm": 1.3323419094085693, + "learning_rate": 1.9597453202993118e-05, + "loss": 1.5326, + "step": 2656 + }, + { + "epoch": 0.14526468296816697, + "grad_norm": 1.245705485343933, + "learning_rate": 1.9596939789779573e-05, + "loss": 1.5829, + "step": 2657 + }, + { + "epoch": 0.14531935541188853, + "grad_norm": 1.2300912141799927, + "learning_rate": 1.9596426056101688e-05, + "loss": 1.7247, + "step": 2658 + }, + { + "epoch": 0.14537402785561007, + "grad_norm": 1.2513489723205566, + "learning_rate": 1.959591200197662e-05, + "loss": 1.3203, + "step": 2659 + }, + { + "epoch": 0.14542870029933164, + "grad_norm": 2.409891128540039, + "learning_rate": 1.959539762742153e-05, + "loss": 1.6502, + "step": 2660 + }, + { + "epoch": 0.14548337274305317, + "grad_norm": 1.2883824110031128, + "learning_rate": 1.9594882932453596e-05, + "loss": 1.6799, + "step": 2661 + }, + { + "epoch": 0.14553804518677474, + "grad_norm": 1.9854342937469482, + "learning_rate": 1.959436791709001e-05, + "loss": 1.6697, + "step": 2662 + }, + { + "epoch": 0.14559271763049628, + "grad_norm": 1.258474588394165, + "learning_rate": 1.9593852581347962e-05, + "loss": 1.3969, + "step": 2663 + }, + { + "epoch": 0.14564739007421784, + "grad_norm": 1.8043646812438965, + "learning_rate": 1.9593336925244662e-05, + "loss": 1.4035, + "step": 2664 + }, + { + "epoch": 0.1457020625179394, + "grad_norm": 1.598469614982605, + "learning_rate": 1.9592820948797337e-05, + "loss": 1.4855, + "step": 2665 + }, + { + "epoch": 0.14575673496166094, + "grad_norm": 1.8115723133087158, + "learning_rate": 1.9592304652023208e-05, + "loss": 1.392, + "step": 2666 + }, + { + "epoch": 0.1458114074053825, + "grad_norm": 1.5192986726760864, + "learning_rate": 1.9591788034939518e-05, + "loss": 1.3507, + "step": 2667 + }, + { + "epoch": 0.14586607984910405, + "grad_norm": 1.693808674812317, + "learning_rate": 1.9591271097563512e-05, + "loss": 1.4837, + "step": 2668 + }, + { + "epoch": 0.1459207522928256, + "grad_norm": 1.8611568212509155, + "learning_rate": 1.9590753839912463e-05, + "loss": 1.5978, + "step": 2669 + }, + { + "epoch": 0.14597542473654715, + "grad_norm": 1.5685235261917114, + "learning_rate": 1.9590236262003634e-05, + "loss": 1.4168, + "step": 2670 + }, + { + "epoch": 0.14603009718026871, + "grad_norm": 1.4435508251190186, + "learning_rate": 1.9589718363854315e-05, + "loss": 1.5932, + "step": 2671 + }, + { + "epoch": 0.14608476962399028, + "grad_norm": 1.7984668016433716, + "learning_rate": 1.9589200145481797e-05, + "loss": 1.6118, + "step": 2672 + }, + { + "epoch": 0.14613944206771182, + "grad_norm": 1.81229567527771, + "learning_rate": 1.9588681606903385e-05, + "loss": 1.5949, + "step": 2673 + }, + { + "epoch": 0.14619411451143338, + "grad_norm": 1.1786168813705444, + "learning_rate": 1.958816274813639e-05, + "loss": 1.4284, + "step": 2674 + }, + { + "epoch": 0.14624878695515492, + "grad_norm": 1.5399738550186157, + "learning_rate": 1.9587643569198144e-05, + "loss": 1.4479, + "step": 2675 + }, + { + "epoch": 0.14630345939887648, + "grad_norm": 1.4457752704620361, + "learning_rate": 1.958712407010598e-05, + "loss": 1.4766, + "step": 2676 + }, + { + "epoch": 0.14635813184259802, + "grad_norm": 1.3401402235031128, + "learning_rate": 1.9586604250877248e-05, + "loss": 1.7048, + "step": 2677 + }, + { + "epoch": 0.1464128042863196, + "grad_norm": 1.1402506828308105, + "learning_rate": 1.9586084111529304e-05, + "loss": 1.3804, + "step": 2678 + }, + { + "epoch": 0.14646747673004115, + "grad_norm": 1.7450233697891235, + "learning_rate": 1.958556365207952e-05, + "loss": 1.6531, + "step": 2679 + }, + { + "epoch": 0.1465221491737627, + "grad_norm": 1.3204283714294434, + "learning_rate": 1.9585042872545266e-05, + "loss": 1.5384, + "step": 2680 + }, + { + "epoch": 0.14657682161748425, + "grad_norm": 1.5281637907028198, + "learning_rate": 1.9584521772943944e-05, + "loss": 1.3037, + "step": 2681 + }, + { + "epoch": 0.1466314940612058, + "grad_norm": 1.6232571601867676, + "learning_rate": 1.9584000353292944e-05, + "loss": 1.7014, + "step": 2682 + }, + { + "epoch": 0.14668616650492736, + "grad_norm": 1.5333011150360107, + "learning_rate": 1.9583478613609684e-05, + "loss": 1.5217, + "step": 2683 + }, + { + "epoch": 0.1467408389486489, + "grad_norm": 1.8362526893615723, + "learning_rate": 1.958295655391159e-05, + "loss": 1.49, + "step": 2684 + }, + { + "epoch": 0.14679551139237046, + "grad_norm": 1.2708649635314941, + "learning_rate": 1.9582434174216084e-05, + "loss": 1.5182, + "step": 2685 + }, + { + "epoch": 0.14685018383609202, + "grad_norm": 1.4042023420333862, + "learning_rate": 1.9581911474540617e-05, + "loss": 1.4009, + "step": 2686 + }, + { + "epoch": 0.14690485627981356, + "grad_norm": 1.2720637321472168, + "learning_rate": 1.958138845490264e-05, + "loss": 1.6375, + "step": 2687 + }, + { + "epoch": 0.14695952872353513, + "grad_norm": 2.3392622470855713, + "learning_rate": 1.958086511531962e-05, + "loss": 1.6048, + "step": 2688 + }, + { + "epoch": 0.14701420116725666, + "grad_norm": 1.630307674407959, + "learning_rate": 1.958034145580903e-05, + "loss": 1.45, + "step": 2689 + }, + { + "epoch": 0.14706887361097823, + "grad_norm": 1.730611801147461, + "learning_rate": 1.957981747638836e-05, + "loss": 1.3541, + "step": 2690 + }, + { + "epoch": 0.14712354605469977, + "grad_norm": 1.3993253707885742, + "learning_rate": 1.9579293177075106e-05, + "loss": 1.1092, + "step": 2691 + }, + { + "epoch": 0.14717821849842133, + "grad_norm": 1.5140854120254517, + "learning_rate": 1.957876855788677e-05, + "loss": 1.4866, + "step": 2692 + }, + { + "epoch": 0.1472328909421429, + "grad_norm": 1.568335771560669, + "learning_rate": 1.957824361884088e-05, + "loss": 1.2932, + "step": 2693 + }, + { + "epoch": 0.14728756338586443, + "grad_norm": 1.3172482252120972, + "learning_rate": 1.9577718359954955e-05, + "loss": 1.3486, + "step": 2694 + }, + { + "epoch": 0.147342235829586, + "grad_norm": 1.7670693397521973, + "learning_rate": 1.9577192781246542e-05, + "loss": 1.323, + "step": 2695 + }, + { + "epoch": 0.14739690827330754, + "grad_norm": 1.6064114570617676, + "learning_rate": 1.9576666882733186e-05, + "loss": 1.3827, + "step": 2696 + }, + { + "epoch": 0.1474515807170291, + "grad_norm": 1.7516783475875854, + "learning_rate": 1.9576140664432454e-05, + "loss": 1.411, + "step": 2697 + }, + { + "epoch": 0.14750625316075064, + "grad_norm": 1.5996910333633423, + "learning_rate": 1.957561412636191e-05, + "loss": 1.586, + "step": 2698 + }, + { + "epoch": 0.1475609256044722, + "grad_norm": 1.746519684791565, + "learning_rate": 1.9575087268539144e-05, + "loss": 1.4436, + "step": 2699 + }, + { + "epoch": 0.14761559804819377, + "grad_norm": 1.7408232688903809, + "learning_rate": 1.957456009098174e-05, + "loss": 1.4148, + "step": 2700 + }, + { + "epoch": 0.1476702704919153, + "grad_norm": 2.1297755241394043, + "learning_rate": 1.9574032593707314e-05, + "loss": 1.5814, + "step": 2701 + }, + { + "epoch": 0.14772494293563687, + "grad_norm": 1.4798660278320312, + "learning_rate": 1.9573504776733467e-05, + "loss": 1.6809, + "step": 2702 + }, + { + "epoch": 0.1477796153793584, + "grad_norm": 1.4937011003494263, + "learning_rate": 1.9572976640077836e-05, + "loss": 1.46, + "step": 2703 + }, + { + "epoch": 0.14783428782307997, + "grad_norm": 1.6723699569702148, + "learning_rate": 1.957244818375805e-05, + "loss": 1.3229, + "step": 2704 + }, + { + "epoch": 0.14788896026680154, + "grad_norm": 1.7542403936386108, + "learning_rate": 1.9571919407791754e-05, + "loss": 1.6006, + "step": 2705 + }, + { + "epoch": 0.14794363271052308, + "grad_norm": 1.3434736728668213, + "learning_rate": 1.9571390312196608e-05, + "loss": 1.6103, + "step": 2706 + }, + { + "epoch": 0.14799830515424464, + "grad_norm": 1.2894448041915894, + "learning_rate": 1.9570860896990283e-05, + "loss": 1.5641, + "step": 2707 + }, + { + "epoch": 0.14805297759796618, + "grad_norm": 1.7815053462982178, + "learning_rate": 1.957033116219045e-05, + "loss": 1.6062, + "step": 2708 + }, + { + "epoch": 0.14810765004168774, + "grad_norm": 1.2941291332244873, + "learning_rate": 1.956980110781481e-05, + "loss": 1.4999, + "step": 2709 + }, + { + "epoch": 0.14816232248540928, + "grad_norm": 1.6473850011825562, + "learning_rate": 1.9569270733881045e-05, + "loss": 1.4151, + "step": 2710 + }, + { + "epoch": 0.14821699492913085, + "grad_norm": 1.6498922109603882, + "learning_rate": 1.956874004040688e-05, + "loss": 1.3299, + "step": 2711 + }, + { + "epoch": 0.1482716673728524, + "grad_norm": 1.5028141736984253, + "learning_rate": 1.956820902741003e-05, + "loss": 1.6647, + "step": 2712 + }, + { + "epoch": 0.14832633981657395, + "grad_norm": 1.7103071212768555, + "learning_rate": 1.9567677694908228e-05, + "loss": 1.4789, + "step": 2713 + }, + { + "epoch": 0.14838101226029551, + "grad_norm": 1.275362253189087, + "learning_rate": 1.9567146042919217e-05, + "loss": 1.4505, + "step": 2714 + }, + { + "epoch": 0.14843568470401705, + "grad_norm": 1.6313906908035278, + "learning_rate": 1.956661407146075e-05, + "loss": 1.3481, + "step": 2715 + }, + { + "epoch": 0.14849035714773862, + "grad_norm": 1.5646727085113525, + "learning_rate": 1.956608178055059e-05, + "loss": 1.6215, + "step": 2716 + }, + { + "epoch": 0.14854502959146015, + "grad_norm": 1.2574666738510132, + "learning_rate": 1.956554917020651e-05, + "loss": 1.3351, + "step": 2717 + }, + { + "epoch": 0.14859970203518172, + "grad_norm": 1.795337438583374, + "learning_rate": 1.95650162404463e-05, + "loss": 1.2933, + "step": 2718 + }, + { + "epoch": 0.14865437447890328, + "grad_norm": 1.5921729803085327, + "learning_rate": 1.9564482991287753e-05, + "loss": 1.5339, + "step": 2719 + }, + { + "epoch": 0.14870904692262482, + "grad_norm": 1.7304553985595703, + "learning_rate": 1.956394942274867e-05, + "loss": 1.6489, + "step": 2720 + }, + { + "epoch": 0.1487637193663464, + "grad_norm": 1.4850367307662964, + "learning_rate": 1.9563415534846877e-05, + "loss": 1.5974, + "step": 2721 + }, + { + "epoch": 0.14881839181006792, + "grad_norm": 1.3624038696289062, + "learning_rate": 1.9562881327600197e-05, + "loss": 1.4979, + "step": 2722 + }, + { + "epoch": 0.1488730642537895, + "grad_norm": 1.2367371320724487, + "learning_rate": 1.956234680102647e-05, + "loss": 1.3116, + "step": 2723 + }, + { + "epoch": 0.14892773669751103, + "grad_norm": 1.6891189813613892, + "learning_rate": 1.9561811955143547e-05, + "loss": 1.3174, + "step": 2724 + }, + { + "epoch": 0.1489824091412326, + "grad_norm": 1.4626493453979492, + "learning_rate": 1.9561276789969282e-05, + "loss": 1.3528, + "step": 2725 + }, + { + "epoch": 0.14903708158495416, + "grad_norm": 1.62235689163208, + "learning_rate": 1.956074130552155e-05, + "loss": 1.6177, + "step": 2726 + }, + { + "epoch": 0.1490917540286757, + "grad_norm": 1.193548560142517, + "learning_rate": 1.956020550181823e-05, + "loss": 1.4922, + "step": 2727 + }, + { + "epoch": 0.14914642647239726, + "grad_norm": 1.3973336219787598, + "learning_rate": 1.9559669378877218e-05, + "loss": 1.368, + "step": 2728 + }, + { + "epoch": 0.1492010989161188, + "grad_norm": 1.6819206476211548, + "learning_rate": 1.955913293671641e-05, + "loss": 1.4093, + "step": 2729 + }, + { + "epoch": 0.14925577135984036, + "grad_norm": 1.8899511098861694, + "learning_rate": 1.955859617535372e-05, + "loss": 1.5094, + "step": 2730 + }, + { + "epoch": 0.1493104438035619, + "grad_norm": 1.746523141860962, + "learning_rate": 1.955805909480708e-05, + "loss": 1.4221, + "step": 2731 + }, + { + "epoch": 0.14936511624728346, + "grad_norm": 1.2347389459609985, + "learning_rate": 1.955752169509441e-05, + "loss": 1.3812, + "step": 2732 + }, + { + "epoch": 0.14941978869100503, + "grad_norm": 1.6066519021987915, + "learning_rate": 1.955698397623367e-05, + "loss": 1.485, + "step": 2733 + }, + { + "epoch": 0.14947446113472657, + "grad_norm": 1.6170560121536255, + "learning_rate": 1.9556445938242805e-05, + "loss": 1.393, + "step": 2734 + }, + { + "epoch": 0.14952913357844813, + "grad_norm": 1.4318395853042603, + "learning_rate": 1.9555907581139787e-05, + "loss": 1.5157, + "step": 2735 + }, + { + "epoch": 0.14958380602216967, + "grad_norm": 1.5819758176803589, + "learning_rate": 1.9555368904942593e-05, + "loss": 1.3953, + "step": 2736 + }, + { + "epoch": 0.14963847846589123, + "grad_norm": 1.471929907798767, + "learning_rate": 1.9554829909669205e-05, + "loss": 1.2365, + "step": 2737 + }, + { + "epoch": 0.14969315090961277, + "grad_norm": 1.3727030754089355, + "learning_rate": 1.9554290595337625e-05, + "loss": 1.4216, + "step": 2738 + }, + { + "epoch": 0.14974782335333434, + "grad_norm": 1.6154820919036865, + "learning_rate": 1.9553750961965864e-05, + "loss": 1.5185, + "step": 2739 + }, + { + "epoch": 0.1498024957970559, + "grad_norm": 1.5462749004364014, + "learning_rate": 1.955321100957194e-05, + "loss": 1.4621, + "step": 2740 + }, + { + "epoch": 0.14985716824077744, + "grad_norm": 1.364641785621643, + "learning_rate": 1.9552670738173884e-05, + "loss": 1.5695, + "step": 2741 + }, + { + "epoch": 0.149911840684499, + "grad_norm": 1.7523092031478882, + "learning_rate": 1.9552130147789733e-05, + "loss": 1.5625, + "step": 2742 + }, + { + "epoch": 0.14996651312822054, + "grad_norm": 1.588987946510315, + "learning_rate": 1.9551589238437546e-05, + "loss": 1.6202, + "step": 2743 + }, + { + "epoch": 0.1500211855719421, + "grad_norm": 1.8422341346740723, + "learning_rate": 1.9551048010135377e-05, + "loss": 1.2982, + "step": 2744 + }, + { + "epoch": 0.15007585801566364, + "grad_norm": 1.5661181211471558, + "learning_rate": 1.9550506462901305e-05, + "loss": 1.3931, + "step": 2745 + }, + { + "epoch": 0.1501305304593852, + "grad_norm": 1.6353744268417358, + "learning_rate": 1.954996459675341e-05, + "loss": 1.2413, + "step": 2746 + }, + { + "epoch": 0.15018520290310677, + "grad_norm": 1.5723563432693481, + "learning_rate": 1.954942241170979e-05, + "loss": 1.5955, + "step": 2747 + }, + { + "epoch": 0.1502398753468283, + "grad_norm": 1.7017713785171509, + "learning_rate": 1.954887990778854e-05, + "loss": 1.3451, + "step": 2748 + }, + { + "epoch": 0.15029454779054988, + "grad_norm": 1.567170262336731, + "learning_rate": 1.9548337085007788e-05, + "loss": 1.5293, + "step": 2749 + }, + { + "epoch": 0.1503492202342714, + "grad_norm": 1.5006502866744995, + "learning_rate": 1.954779394338566e-05, + "loss": 1.3841, + "step": 2750 + }, + { + "epoch": 0.15040389267799298, + "grad_norm": 1.128812313079834, + "learning_rate": 1.954725048294028e-05, + "loss": 1.8104, + "step": 2751 + }, + { + "epoch": 0.15045856512171452, + "grad_norm": 1.6534147262573242, + "learning_rate": 1.9546706703689802e-05, + "loss": 1.5411, + "step": 2752 + }, + { + "epoch": 0.15051323756543608, + "grad_norm": 1.3511890172958374, + "learning_rate": 1.954616260565239e-05, + "loss": 1.6543, + "step": 2753 + }, + { + "epoch": 0.15056791000915765, + "grad_norm": 1.4492024183273315, + "learning_rate": 1.9545618188846206e-05, + "loss": 1.3062, + "step": 2754 + }, + { + "epoch": 0.15062258245287918, + "grad_norm": 2.0497887134552, + "learning_rate": 1.954507345328943e-05, + "loss": 1.6882, + "step": 2755 + }, + { + "epoch": 0.15067725489660075, + "grad_norm": 1.323164701461792, + "learning_rate": 1.9544528399000256e-05, + "loss": 1.642, + "step": 2756 + }, + { + "epoch": 0.15073192734032229, + "grad_norm": 1.3265098333358765, + "learning_rate": 1.954398302599688e-05, + "loss": 1.3997, + "step": 2757 + }, + { + "epoch": 0.15078659978404385, + "grad_norm": 1.6618002653121948, + "learning_rate": 1.9543437334297515e-05, + "loss": 1.5492, + "step": 2758 + }, + { + "epoch": 0.1508412722277654, + "grad_norm": 2.7133631706237793, + "learning_rate": 1.9542891323920386e-05, + "loss": 1.3363, + "step": 2759 + }, + { + "epoch": 0.15089594467148695, + "grad_norm": 1.551042079925537, + "learning_rate": 1.954234499488372e-05, + "loss": 1.4548, + "step": 2760 + }, + { + "epoch": 0.15095061711520852, + "grad_norm": 2.103708267211914, + "learning_rate": 1.9541798347205762e-05, + "loss": 1.4397, + "step": 2761 + }, + { + "epoch": 0.15100528955893006, + "grad_norm": 1.885468602180481, + "learning_rate": 1.9541251380904768e-05, + "loss": 1.4595, + "step": 2762 + }, + { + "epoch": 0.15105996200265162, + "grad_norm": 1.8500287532806396, + "learning_rate": 1.9540704095999e-05, + "loss": 1.6018, + "step": 2763 + }, + { + "epoch": 0.15111463444637316, + "grad_norm": 1.7820398807525635, + "learning_rate": 1.9540156492506734e-05, + "loss": 1.4676, + "step": 2764 + }, + { + "epoch": 0.15116930689009472, + "grad_norm": 1.1652556657791138, + "learning_rate": 1.9539608570446255e-05, + "loss": 1.492, + "step": 2765 + }, + { + "epoch": 0.15122397933381626, + "grad_norm": 1.668034315109253, + "learning_rate": 1.9539060329835864e-05, + "loss": 1.4254, + "step": 2766 + }, + { + "epoch": 0.15127865177753783, + "grad_norm": 1.3174896240234375, + "learning_rate": 1.9538511770693862e-05, + "loss": 1.6154, + "step": 2767 + }, + { + "epoch": 0.1513333242212594, + "grad_norm": 1.469348669052124, + "learning_rate": 1.953796289303857e-05, + "loss": 1.4237, + "step": 2768 + }, + { + "epoch": 0.15138799666498093, + "grad_norm": 1.3936545848846436, + "learning_rate": 1.9537413696888317e-05, + "loss": 1.3313, + "step": 2769 + }, + { + "epoch": 0.1514426691087025, + "grad_norm": 2.3110790252685547, + "learning_rate": 1.953686418226144e-05, + "loss": 1.4026, + "step": 2770 + }, + { + "epoch": 0.15149734155242403, + "grad_norm": 1.7681535482406616, + "learning_rate": 1.9536314349176288e-05, + "loss": 1.7359, + "step": 2771 + }, + { + "epoch": 0.1515520139961456, + "grad_norm": 1.2668601274490356, + "learning_rate": 1.953576419765122e-05, + "loss": 1.4728, + "step": 2772 + }, + { + "epoch": 0.15160668643986713, + "grad_norm": 1.917839527130127, + "learning_rate": 1.953521372770461e-05, + "loss": 1.5362, + "step": 2773 + }, + { + "epoch": 0.1516613588835887, + "grad_norm": 1.627002239227295, + "learning_rate": 1.9534662939354843e-05, + "loss": 1.354, + "step": 2774 + }, + { + "epoch": 0.15171603132731026, + "grad_norm": 1.6094136238098145, + "learning_rate": 1.9534111832620302e-05, + "loss": 1.2889, + "step": 2775 + }, + { + "epoch": 0.1517707037710318, + "grad_norm": 1.6148144006729126, + "learning_rate": 1.9533560407519395e-05, + "loss": 1.4215, + "step": 2776 + }, + { + "epoch": 0.15182537621475337, + "grad_norm": 1.884999394416809, + "learning_rate": 1.9533008664070537e-05, + "loss": 1.5993, + "step": 2777 + }, + { + "epoch": 0.1518800486584749, + "grad_norm": 1.7163339853286743, + "learning_rate": 1.9532456602292148e-05, + "loss": 1.2714, + "step": 2778 + }, + { + "epoch": 0.15193472110219647, + "grad_norm": 1.375525712966919, + "learning_rate": 1.9531904222202664e-05, + "loss": 1.3212, + "step": 2779 + }, + { + "epoch": 0.151989393545918, + "grad_norm": 1.3926852941513062, + "learning_rate": 1.9531351523820533e-05, + "loss": 1.4807, + "step": 2780 + }, + { + "epoch": 0.15204406598963957, + "grad_norm": 1.7772979736328125, + "learning_rate": 1.9530798507164207e-05, + "loss": 1.2821, + "step": 2781 + }, + { + "epoch": 0.15209873843336114, + "grad_norm": 1.6527222394943237, + "learning_rate": 1.9530245172252154e-05, + "loss": 1.6476, + "step": 2782 + }, + { + "epoch": 0.15215341087708267, + "grad_norm": 1.563949465751648, + "learning_rate": 1.952969151910285e-05, + "loss": 1.348, + "step": 2783 + }, + { + "epoch": 0.15220808332080424, + "grad_norm": 1.5919479131698608, + "learning_rate": 1.9529137547734787e-05, + "loss": 1.4431, + "step": 2784 + }, + { + "epoch": 0.15226275576452578, + "grad_norm": 1.7081135511398315, + "learning_rate": 1.952858325816646e-05, + "loss": 1.3587, + "step": 2785 + }, + { + "epoch": 0.15231742820824734, + "grad_norm": 1.5757720470428467, + "learning_rate": 1.9528028650416376e-05, + "loss": 1.1612, + "step": 2786 + }, + { + "epoch": 0.15237210065196888, + "grad_norm": 1.9378749132156372, + "learning_rate": 1.952747372450306e-05, + "loss": 1.561, + "step": 2787 + }, + { + "epoch": 0.15242677309569044, + "grad_norm": 1.6384398937225342, + "learning_rate": 1.952691848044504e-05, + "loss": 1.4257, + "step": 2788 + }, + { + "epoch": 0.152481445539412, + "grad_norm": 1.4178876876831055, + "learning_rate": 1.9526362918260852e-05, + "loss": 1.4808, + "step": 2789 + }, + { + "epoch": 0.15253611798313355, + "grad_norm": 1.5347505807876587, + "learning_rate": 1.9525807037969056e-05, + "loss": 1.4816, + "step": 2790 + }, + { + "epoch": 0.1525907904268551, + "grad_norm": 1.6895028352737427, + "learning_rate": 1.9525250839588206e-05, + "loss": 1.5118, + "step": 2791 + }, + { + "epoch": 0.15264546287057665, + "grad_norm": 1.40770423412323, + "learning_rate": 1.9524694323136883e-05, + "loss": 1.477, + "step": 2792 + }, + { + "epoch": 0.1527001353142982, + "grad_norm": 1.2529168128967285, + "learning_rate": 1.9524137488633662e-05, + "loss": 1.5716, + "step": 2793 + }, + { + "epoch": 0.15275480775801975, + "grad_norm": 1.6914000511169434, + "learning_rate": 1.9523580336097147e-05, + "loss": 1.4004, + "step": 2794 + }, + { + "epoch": 0.15280948020174132, + "grad_norm": 1.743617296218872, + "learning_rate": 1.952302286554593e-05, + "loss": 1.4568, + "step": 2795 + }, + { + "epoch": 0.15286415264546288, + "grad_norm": 1.621233344078064, + "learning_rate": 1.9522465076998638e-05, + "loss": 1.688, + "step": 2796 + }, + { + "epoch": 0.15291882508918442, + "grad_norm": 1.2314403057098389, + "learning_rate": 1.952190697047389e-05, + "loss": 1.5315, + "step": 2797 + }, + { + "epoch": 0.15297349753290598, + "grad_norm": 1.4897488355636597, + "learning_rate": 1.9521348545990323e-05, + "loss": 1.7105, + "step": 2798 + }, + { + "epoch": 0.15302816997662752, + "grad_norm": 1.936442255973816, + "learning_rate": 1.952078980356659e-05, + "loss": 1.2761, + "step": 2799 + }, + { + "epoch": 0.15308284242034909, + "grad_norm": 1.8745709657669067, + "learning_rate": 1.952023074322134e-05, + "loss": 1.5449, + "step": 2800 + }, + { + "epoch": 0.15313751486407062, + "grad_norm": 1.929141879081726, + "learning_rate": 1.9519671364973245e-05, + "loss": 1.4784, + "step": 2801 + }, + { + "epoch": 0.1531921873077922, + "grad_norm": 1.6530835628509521, + "learning_rate": 1.9519111668840987e-05, + "loss": 1.5451, + "step": 2802 + }, + { + "epoch": 0.15324685975151375, + "grad_norm": 1.3246043920516968, + "learning_rate": 1.951855165484325e-05, + "loss": 1.6965, + "step": 2803 + }, + { + "epoch": 0.1533015321952353, + "grad_norm": 1.4162927865982056, + "learning_rate": 1.9517991322998742e-05, + "loss": 1.3616, + "step": 2804 + }, + { + "epoch": 0.15335620463895686, + "grad_norm": 1.6551227569580078, + "learning_rate": 1.9517430673326167e-05, + "loss": 1.448, + "step": 2805 + }, + { + "epoch": 0.1534108770826784, + "grad_norm": 1.8990010023117065, + "learning_rate": 1.951686970584425e-05, + "loss": 1.4693, + "step": 2806 + }, + { + "epoch": 0.15346554952639996, + "grad_norm": 1.645599365234375, + "learning_rate": 1.951630842057172e-05, + "loss": 1.4766, + "step": 2807 + }, + { + "epoch": 0.15352022197012152, + "grad_norm": 1.8766416311264038, + "learning_rate": 1.951574681752732e-05, + "loss": 1.6868, + "step": 2808 + }, + { + "epoch": 0.15357489441384306, + "grad_norm": 1.299578309059143, + "learning_rate": 1.9515184896729805e-05, + "loss": 1.4895, + "step": 2809 + }, + { + "epoch": 0.15362956685756463, + "grad_norm": 1.4858424663543701, + "learning_rate": 1.9514622658197937e-05, + "loss": 1.3516, + "step": 2810 + }, + { + "epoch": 0.15368423930128616, + "grad_norm": 1.379682183265686, + "learning_rate": 1.9514060101950492e-05, + "loss": 1.2771, + "step": 2811 + }, + { + "epoch": 0.15373891174500773, + "grad_norm": 1.472010612487793, + "learning_rate": 1.9513497228006257e-05, + "loss": 1.5607, + "step": 2812 + }, + { + "epoch": 0.15379358418872927, + "grad_norm": 1.5528830289840698, + "learning_rate": 1.9512934036384026e-05, + "loss": 1.4793, + "step": 2813 + }, + { + "epoch": 0.15384825663245083, + "grad_norm": 1.8225514888763428, + "learning_rate": 1.9512370527102604e-05, + "loss": 1.3656, + "step": 2814 + }, + { + "epoch": 0.1539029290761724, + "grad_norm": 1.314798355102539, + "learning_rate": 1.9511806700180807e-05, + "loss": 1.5572, + "step": 2815 + }, + { + "epoch": 0.15395760151989393, + "grad_norm": 1.3251994848251343, + "learning_rate": 1.9511242555637464e-05, + "loss": 1.3877, + "step": 2816 + }, + { + "epoch": 0.1540122739636155, + "grad_norm": 1.4737409353256226, + "learning_rate": 1.9510678093491413e-05, + "loss": 1.5366, + "step": 2817 + }, + { + "epoch": 0.15406694640733704, + "grad_norm": 2.365212917327881, + "learning_rate": 1.9510113313761506e-05, + "loss": 1.5327, + "step": 2818 + }, + { + "epoch": 0.1541216188510586, + "grad_norm": 1.398374319076538, + "learning_rate": 1.9509548216466596e-05, + "loss": 1.7187, + "step": 2819 + }, + { + "epoch": 0.15417629129478014, + "grad_norm": 1.5347862243652344, + "learning_rate": 1.9508982801625557e-05, + "loss": 1.4865, + "step": 2820 + }, + { + "epoch": 0.1542309637385017, + "grad_norm": 1.777205228805542, + "learning_rate": 1.950841706925727e-05, + "loss": 1.5218, + "step": 2821 + }, + { + "epoch": 0.15428563618222327, + "grad_norm": 1.6030161380767822, + "learning_rate": 1.9507851019380625e-05, + "loss": 1.3549, + "step": 2822 + }, + { + "epoch": 0.1543403086259448, + "grad_norm": 1.1387274265289307, + "learning_rate": 1.950728465201452e-05, + "loss": 1.5917, + "step": 2823 + }, + { + "epoch": 0.15439498106966637, + "grad_norm": 1.80415940284729, + "learning_rate": 1.9506717967177876e-05, + "loss": 1.5279, + "step": 2824 + }, + { + "epoch": 0.1544496535133879, + "grad_norm": 1.227742314338684, + "learning_rate": 1.9506150964889606e-05, + "loss": 1.4261, + "step": 2825 + }, + { + "epoch": 0.15450432595710947, + "grad_norm": 1.8534587621688843, + "learning_rate": 1.9505583645168654e-05, + "loss": 1.5666, + "step": 2826 + }, + { + "epoch": 0.154558998400831, + "grad_norm": 1.5830310583114624, + "learning_rate": 1.9505016008033953e-05, + "loss": 1.5686, + "step": 2827 + }, + { + "epoch": 0.15461367084455258, + "grad_norm": 1.7360152006149292, + "learning_rate": 1.9504448053504466e-05, + "loss": 1.1866, + "step": 2828 + }, + { + "epoch": 0.15466834328827414, + "grad_norm": 2.506722927093506, + "learning_rate": 1.9503879781599155e-05, + "loss": 1.6605, + "step": 2829 + }, + { + "epoch": 0.15472301573199568, + "grad_norm": 1.6343717575073242, + "learning_rate": 1.9503311192336998e-05, + "loss": 1.5698, + "step": 2830 + }, + { + "epoch": 0.15477768817571724, + "grad_norm": 1.3413273096084595, + "learning_rate": 1.9502742285736977e-05, + "loss": 1.4405, + "step": 2831 + }, + { + "epoch": 0.15483236061943878, + "grad_norm": 1.656655192375183, + "learning_rate": 1.9502173061818095e-05, + "loss": 1.6401, + "step": 2832 + }, + { + "epoch": 0.15488703306316035, + "grad_norm": 1.8302195072174072, + "learning_rate": 1.9501603520599356e-05, + "loss": 1.3256, + "step": 2833 + }, + { + "epoch": 0.15494170550688188, + "grad_norm": 1.9528372287750244, + "learning_rate": 1.950103366209978e-05, + "loss": 1.474, + "step": 2834 + }, + { + "epoch": 0.15499637795060345, + "grad_norm": 1.539759874343872, + "learning_rate": 1.9500463486338393e-05, + "loss": 1.3104, + "step": 2835 + }, + { + "epoch": 0.155051050394325, + "grad_norm": 1.4037978649139404, + "learning_rate": 1.949989299333424e-05, + "loss": 1.6004, + "step": 2836 + }, + { + "epoch": 0.15510572283804655, + "grad_norm": 1.4914664030075073, + "learning_rate": 1.9499322183106363e-05, + "loss": 1.4271, + "step": 2837 + }, + { + "epoch": 0.15516039528176812, + "grad_norm": 1.8183497190475464, + "learning_rate": 1.949875105567383e-05, + "loss": 1.5641, + "step": 2838 + }, + { + "epoch": 0.15521506772548965, + "grad_norm": 1.4549272060394287, + "learning_rate": 1.9498179611055713e-05, + "loss": 1.4037, + "step": 2839 + }, + { + "epoch": 0.15526974016921122, + "grad_norm": 1.2940536737442017, + "learning_rate": 1.9497607849271086e-05, + "loss": 1.5044, + "step": 2840 + }, + { + "epoch": 0.15532441261293276, + "grad_norm": 1.2748867273330688, + "learning_rate": 1.949703577033905e-05, + "loss": 1.5424, + "step": 2841 + }, + { + "epoch": 0.15537908505665432, + "grad_norm": 1.4477177858352661, + "learning_rate": 1.94964633742787e-05, + "loss": 1.5041, + "step": 2842 + }, + { + "epoch": 0.15543375750037589, + "grad_norm": 1.3932539224624634, + "learning_rate": 1.9495890661109154e-05, + "loss": 1.446, + "step": 2843 + }, + { + "epoch": 0.15548842994409742, + "grad_norm": 1.2819417715072632, + "learning_rate": 1.949531763084954e-05, + "loss": 1.581, + "step": 2844 + }, + { + "epoch": 0.155543102387819, + "grad_norm": 1.3024241924285889, + "learning_rate": 1.9494744283518985e-05, + "loss": 1.4629, + "step": 2845 + }, + { + "epoch": 0.15559777483154053, + "grad_norm": 1.9516854286193848, + "learning_rate": 1.949417061913664e-05, + "loss": 1.5546, + "step": 2846 + }, + { + "epoch": 0.1556524472752621, + "grad_norm": 1.6709771156311035, + "learning_rate": 1.9493596637721658e-05, + "loss": 1.3453, + "step": 2847 + }, + { + "epoch": 0.15570711971898363, + "grad_norm": 1.2713446617126465, + "learning_rate": 1.9493022339293207e-05, + "loss": 1.5296, + "step": 2848 + }, + { + "epoch": 0.1557617921627052, + "grad_norm": 1.645976185798645, + "learning_rate": 1.9492447723870466e-05, + "loss": 1.8008, + "step": 2849 + }, + { + "epoch": 0.15581646460642676, + "grad_norm": 1.5894804000854492, + "learning_rate": 1.9491872791472623e-05, + "loss": 1.5892, + "step": 2850 + }, + { + "epoch": 0.1558711370501483, + "grad_norm": 1.3776493072509766, + "learning_rate": 1.9491297542118866e-05, + "loss": 1.6973, + "step": 2851 + }, + { + "epoch": 0.15592580949386986, + "grad_norm": 1.479350209236145, + "learning_rate": 1.949072197582842e-05, + "loss": 1.6531, + "step": 2852 + }, + { + "epoch": 0.1559804819375914, + "grad_norm": 1.718353271484375, + "learning_rate": 1.9490146092620492e-05, + "loss": 1.4334, + "step": 2853 + }, + { + "epoch": 0.15603515438131296, + "grad_norm": 1.259840965270996, + "learning_rate": 1.948956989251432e-05, + "loss": 1.5089, + "step": 2854 + }, + { + "epoch": 0.1560898268250345, + "grad_norm": 1.5545086860656738, + "learning_rate": 1.9488993375529137e-05, + "loss": 1.5916, + "step": 2855 + }, + { + "epoch": 0.15614449926875607, + "grad_norm": 1.3444379568099976, + "learning_rate": 1.9488416541684202e-05, + "loss": 1.6218, + "step": 2856 + }, + { + "epoch": 0.15619917171247763, + "grad_norm": 1.3815022706985474, + "learning_rate": 1.948783939099877e-05, + "loss": 1.4935, + "step": 2857 + }, + { + "epoch": 0.15625384415619917, + "grad_norm": 1.498307466506958, + "learning_rate": 1.948726192349212e-05, + "loss": 1.2801, + "step": 2858 + }, + { + "epoch": 0.15630851659992073, + "grad_norm": 1.8877227306365967, + "learning_rate": 1.9486684139183533e-05, + "loss": 1.7357, + "step": 2859 + }, + { + "epoch": 0.15636318904364227, + "grad_norm": 1.394875407218933, + "learning_rate": 1.9486106038092298e-05, + "loss": 1.455, + "step": 2860 + }, + { + "epoch": 0.15641786148736384, + "grad_norm": 1.907778024673462, + "learning_rate": 1.9485527620237723e-05, + "loss": 1.6395, + "step": 2861 + }, + { + "epoch": 0.15647253393108537, + "grad_norm": 1.706649661064148, + "learning_rate": 1.9484948885639122e-05, + "loss": 1.3785, + "step": 2862 + }, + { + "epoch": 0.15652720637480694, + "grad_norm": 1.6994718313217163, + "learning_rate": 1.9484369834315823e-05, + "loss": 1.541, + "step": 2863 + }, + { + "epoch": 0.1565818788185285, + "grad_norm": 1.7007173299789429, + "learning_rate": 1.948379046628716e-05, + "loss": 1.5599, + "step": 2864 + }, + { + "epoch": 0.15663655126225004, + "grad_norm": 1.9961122274398804, + "learning_rate": 1.9483210781572473e-05, + "loss": 1.2714, + "step": 2865 + }, + { + "epoch": 0.1566912237059716, + "grad_norm": 1.617627739906311, + "learning_rate": 1.948263078019113e-05, + "loss": 1.5732, + "step": 2866 + }, + { + "epoch": 0.15674589614969314, + "grad_norm": 1.69572114944458, + "learning_rate": 1.9482050462162495e-05, + "loss": 1.672, + "step": 2867 + }, + { + "epoch": 0.1568005685934147, + "grad_norm": 1.5624226331710815, + "learning_rate": 1.9481469827505943e-05, + "loss": 1.6821, + "step": 2868 + }, + { + "epoch": 0.15685524103713624, + "grad_norm": 1.50586998462677, + "learning_rate": 1.948088887624086e-05, + "loss": 1.5493, + "step": 2869 + }, + { + "epoch": 0.1569099134808578, + "grad_norm": 1.5275720357894897, + "learning_rate": 1.9480307608386655e-05, + "loss": 1.2571, + "step": 2870 + }, + { + "epoch": 0.15696458592457938, + "grad_norm": 1.531685709953308, + "learning_rate": 1.9479726023962732e-05, + "loss": 1.8376, + "step": 2871 + }, + { + "epoch": 0.1570192583683009, + "grad_norm": 1.4912668466567993, + "learning_rate": 1.947914412298851e-05, + "loss": 1.5473, + "step": 2872 + }, + { + "epoch": 0.15707393081202248, + "grad_norm": 1.692282795906067, + "learning_rate": 1.9478561905483425e-05, + "loss": 1.2862, + "step": 2873 + }, + { + "epoch": 0.15712860325574401, + "grad_norm": 1.256549596786499, + "learning_rate": 1.9477979371466914e-05, + "loss": 1.4185, + "step": 2874 + }, + { + "epoch": 0.15718327569946558, + "grad_norm": 1.3441303968429565, + "learning_rate": 1.9477396520958432e-05, + "loss": 1.4328, + "step": 2875 + }, + { + "epoch": 0.15723794814318712, + "grad_norm": 1.5684034824371338, + "learning_rate": 1.9476813353977442e-05, + "loss": 1.8436, + "step": 2876 + }, + { + "epoch": 0.15729262058690868, + "grad_norm": 1.5445657968521118, + "learning_rate": 1.947622987054341e-05, + "loss": 1.4321, + "step": 2877 + }, + { + "epoch": 0.15734729303063025, + "grad_norm": 1.935687780380249, + "learning_rate": 1.9475646070675832e-05, + "loss": 1.6397, + "step": 2878 + }, + { + "epoch": 0.15740196547435178, + "grad_norm": 1.3843388557434082, + "learning_rate": 1.9475061954394196e-05, + "loss": 1.3008, + "step": 2879 + }, + { + "epoch": 0.15745663791807335, + "grad_norm": 1.41263747215271, + "learning_rate": 1.9474477521718006e-05, + "loss": 1.365, + "step": 2880 + }, + { + "epoch": 0.1575113103617949, + "grad_norm": 1.6240302324295044, + "learning_rate": 1.947389277266678e-05, + "loss": 1.2593, + "step": 2881 + }, + { + "epoch": 0.15756598280551645, + "grad_norm": 1.292647361755371, + "learning_rate": 1.947330770726004e-05, + "loss": 1.368, + "step": 2882 + }, + { + "epoch": 0.157620655249238, + "grad_norm": 1.3167766332626343, + "learning_rate": 1.947272232551733e-05, + "loss": 1.3292, + "step": 2883 + }, + { + "epoch": 0.15767532769295955, + "grad_norm": 1.3891124725341797, + "learning_rate": 1.947213662745819e-05, + "loss": 1.2945, + "step": 2884 + }, + { + "epoch": 0.15773000013668112, + "grad_norm": 1.264167308807373, + "learning_rate": 1.9471550613102185e-05, + "loss": 1.4995, + "step": 2885 + }, + { + "epoch": 0.15778467258040266, + "grad_norm": 2.1383986473083496, + "learning_rate": 1.9470964282468874e-05, + "loss": 1.4856, + "step": 2886 + }, + { + "epoch": 0.15783934502412422, + "grad_norm": 1.5255045890808105, + "learning_rate": 1.9470377635577843e-05, + "loss": 1.4663, + "step": 2887 + }, + { + "epoch": 0.15789401746784576, + "grad_norm": 1.4268803596496582, + "learning_rate": 1.9469790672448683e-05, + "loss": 1.6214, + "step": 2888 + }, + { + "epoch": 0.15794868991156732, + "grad_norm": 1.55556321144104, + "learning_rate": 1.946920339310099e-05, + "loss": 1.3342, + "step": 2889 + }, + { + "epoch": 0.15800336235528886, + "grad_norm": 1.6133971214294434, + "learning_rate": 1.9468615797554374e-05, + "loss": 1.3396, + "step": 2890 + }, + { + "epoch": 0.15805803479901043, + "grad_norm": 2.013921022415161, + "learning_rate": 1.9468027885828457e-05, + "loss": 1.3489, + "step": 2891 + }, + { + "epoch": 0.158112707242732, + "grad_norm": 1.2624090909957886, + "learning_rate": 1.946743965794287e-05, + "loss": 1.4541, + "step": 2892 + }, + { + "epoch": 0.15816737968645353, + "grad_norm": 1.1502172946929932, + "learning_rate": 1.946685111391726e-05, + "loss": 1.5685, + "step": 2893 + }, + { + "epoch": 0.1582220521301751, + "grad_norm": 1.6468474864959717, + "learning_rate": 1.9466262253771274e-05, + "loss": 1.4328, + "step": 2894 + }, + { + "epoch": 0.15827672457389663, + "grad_norm": 2.271470069885254, + "learning_rate": 1.9465673077524584e-05, + "loss": 1.2145, + "step": 2895 + }, + { + "epoch": 0.1583313970176182, + "grad_norm": 1.5284119844436646, + "learning_rate": 1.946508358519685e-05, + "loss": 1.5941, + "step": 2896 + }, + { + "epoch": 0.15838606946133973, + "grad_norm": 1.4554829597473145, + "learning_rate": 1.946449377680777e-05, + "loss": 1.6488, + "step": 2897 + }, + { + "epoch": 0.1584407419050613, + "grad_norm": 1.3238555192947388, + "learning_rate": 1.946390365237703e-05, + "loss": 1.3098, + "step": 2898 + }, + { + "epoch": 0.15849541434878286, + "grad_norm": 1.4720228910446167, + "learning_rate": 1.9463313211924343e-05, + "loss": 1.4929, + "step": 2899 + }, + { + "epoch": 0.1585500867925044, + "grad_norm": 1.3835958242416382, + "learning_rate": 1.9462722455469422e-05, + "loss": 1.4589, + "step": 2900 + }, + { + "epoch": 0.15860475923622597, + "grad_norm": 1.4937697649002075, + "learning_rate": 1.9462131383031988e-05, + "loss": 1.476, + "step": 2901 + }, + { + "epoch": 0.1586594316799475, + "grad_norm": 1.8144437074661255, + "learning_rate": 1.946153999463179e-05, + "loss": 1.3585, + "step": 2902 + }, + { + "epoch": 0.15871410412366907, + "grad_norm": 1.4093573093414307, + "learning_rate": 1.9460948290288565e-05, + "loss": 1.5233, + "step": 2903 + }, + { + "epoch": 0.1587687765673906, + "grad_norm": 1.2623155117034912, + "learning_rate": 1.9460356270022073e-05, + "loss": 1.3913, + "step": 2904 + }, + { + "epoch": 0.15882344901111217, + "grad_norm": 1.586272954940796, + "learning_rate": 1.945976393385209e-05, + "loss": 1.4395, + "step": 2905 + }, + { + "epoch": 0.15887812145483374, + "grad_norm": 1.595996379852295, + "learning_rate": 1.9459171281798394e-05, + "loss": 1.3935, + "step": 2906 + }, + { + "epoch": 0.15893279389855527, + "grad_norm": 2.031773567199707, + "learning_rate": 1.9458578313880768e-05, + "loss": 1.4715, + "step": 2907 + }, + { + "epoch": 0.15898746634227684, + "grad_norm": 1.476617693901062, + "learning_rate": 1.9457985030119016e-05, + "loss": 1.6449, + "step": 2908 + }, + { + "epoch": 0.15904213878599838, + "grad_norm": 2.097294569015503, + "learning_rate": 1.9457391430532952e-05, + "loss": 1.3776, + "step": 2909 + }, + { + "epoch": 0.15909681122971994, + "grad_norm": 1.8570636510849, + "learning_rate": 1.9456797515142397e-05, + "loss": 1.6176, + "step": 2910 + }, + { + "epoch": 0.1591514836734415, + "grad_norm": 1.6884303092956543, + "learning_rate": 1.945620328396718e-05, + "loss": 1.3341, + "step": 2911 + }, + { + "epoch": 0.15920615611716304, + "grad_norm": 1.8327454328536987, + "learning_rate": 1.9455608737027144e-05, + "loss": 1.4704, + "step": 2912 + }, + { + "epoch": 0.1592608285608846, + "grad_norm": 2.056091785430908, + "learning_rate": 1.9455013874342148e-05, + "loss": 1.6377, + "step": 2913 + }, + { + "epoch": 0.15931550100460615, + "grad_norm": 2.2560179233551025, + "learning_rate": 1.9454418695932048e-05, + "loss": 1.5865, + "step": 2914 + }, + { + "epoch": 0.1593701734483277, + "grad_norm": 2.2100679874420166, + "learning_rate": 1.9453823201816722e-05, + "loss": 1.3796, + "step": 2915 + }, + { + "epoch": 0.15942484589204925, + "grad_norm": 1.4614909887313843, + "learning_rate": 1.945322739201606e-05, + "loss": 1.3479, + "step": 2916 + }, + { + "epoch": 0.15947951833577081, + "grad_norm": 2.1313321590423584, + "learning_rate": 1.945263126654995e-05, + "loss": 1.5382, + "step": 2917 + }, + { + "epoch": 0.15953419077949238, + "grad_norm": 2.5395760536193848, + "learning_rate": 1.9452034825438302e-05, + "loss": 1.4904, + "step": 2918 + }, + { + "epoch": 0.15958886322321392, + "grad_norm": 1.8326257467269897, + "learning_rate": 1.945143806870103e-05, + "loss": 1.5923, + "step": 2919 + }, + { + "epoch": 0.15964353566693548, + "grad_norm": 1.7150812149047852, + "learning_rate": 1.9450840996358062e-05, + "loss": 1.4514, + "step": 2920 + }, + { + "epoch": 0.15969820811065702, + "grad_norm": 1.39237380027771, + "learning_rate": 1.9450243608429336e-05, + "loss": 1.4714, + "step": 2921 + }, + { + "epoch": 0.15975288055437858, + "grad_norm": 1.804503321647644, + "learning_rate": 1.9449645904934802e-05, + "loss": 1.4343, + "step": 2922 + }, + { + "epoch": 0.15980755299810012, + "grad_norm": 1.9880012273788452, + "learning_rate": 1.9449047885894414e-05, + "loss": 1.4436, + "step": 2923 + }, + { + "epoch": 0.1598622254418217, + "grad_norm": 1.5678468942642212, + "learning_rate": 1.9448449551328147e-05, + "loss": 1.6232, + "step": 2924 + }, + { + "epoch": 0.15991689788554325, + "grad_norm": 1.5520185232162476, + "learning_rate": 1.9447850901255975e-05, + "loss": 1.4261, + "step": 2925 + }, + { + "epoch": 0.1599715703292648, + "grad_norm": 1.4170352220535278, + "learning_rate": 1.9447251935697895e-05, + "loss": 1.372, + "step": 2926 + }, + { + "epoch": 0.16002624277298635, + "grad_norm": 2.3266360759735107, + "learning_rate": 1.94466526546739e-05, + "loss": 1.4069, + "step": 2927 + }, + { + "epoch": 0.1600809152167079, + "grad_norm": 1.504142165184021, + "learning_rate": 1.944605305820401e-05, + "loss": 1.3994, + "step": 2928 + }, + { + "epoch": 0.16013558766042946, + "grad_norm": 1.548862338066101, + "learning_rate": 1.944545314630824e-05, + "loss": 1.355, + "step": 2929 + }, + { + "epoch": 0.160190260104151, + "grad_norm": 1.5050389766693115, + "learning_rate": 1.9444852919006627e-05, + "loss": 1.2901, + "step": 2930 + }, + { + "epoch": 0.16024493254787256, + "grad_norm": 1.5616941452026367, + "learning_rate": 1.944425237631921e-05, + "loss": 1.5213, + "step": 2931 + }, + { + "epoch": 0.16029960499159412, + "grad_norm": 1.80536687374115, + "learning_rate": 1.9443651518266044e-05, + "loss": 1.5411, + "step": 2932 + }, + { + "epoch": 0.16035427743531566, + "grad_norm": 1.8086353540420532, + "learning_rate": 1.9443050344867195e-05, + "loss": 1.3903, + "step": 2933 + }, + { + "epoch": 0.16040894987903723, + "grad_norm": 1.1642160415649414, + "learning_rate": 1.9442448856142736e-05, + "loss": 1.6317, + "step": 2934 + }, + { + "epoch": 0.16046362232275876, + "grad_norm": 1.5000027418136597, + "learning_rate": 1.9441847052112753e-05, + "loss": 1.4518, + "step": 2935 + }, + { + "epoch": 0.16051829476648033, + "grad_norm": 1.2241766452789307, + "learning_rate": 1.9441244932797337e-05, + "loss": 1.4503, + "step": 2936 + }, + { + "epoch": 0.16057296721020187, + "grad_norm": 1.5743064880371094, + "learning_rate": 1.9440642498216604e-05, + "loss": 1.3887, + "step": 2937 + }, + { + "epoch": 0.16062763965392343, + "grad_norm": 2.1676409244537354, + "learning_rate": 1.944003974839066e-05, + "loss": 1.2435, + "step": 2938 + }, + { + "epoch": 0.160682312097645, + "grad_norm": 1.760830044746399, + "learning_rate": 1.943943668333964e-05, + "loss": 1.5837, + "step": 2939 + }, + { + "epoch": 0.16073698454136653, + "grad_norm": 1.8846514225006104, + "learning_rate": 1.9438833303083677e-05, + "loss": 1.4989, + "step": 2940 + }, + { + "epoch": 0.1607916569850881, + "grad_norm": 1.540242075920105, + "learning_rate": 1.9438229607642923e-05, + "loss": 1.1804, + "step": 2941 + }, + { + "epoch": 0.16084632942880964, + "grad_norm": 2.7904388904571533, + "learning_rate": 1.9437625597037532e-05, + "loss": 1.4187, + "step": 2942 + }, + { + "epoch": 0.1609010018725312, + "grad_norm": 1.3995752334594727, + "learning_rate": 1.943702127128768e-05, + "loss": 1.6125, + "step": 2943 + }, + { + "epoch": 0.16095567431625274, + "grad_norm": 1.2821311950683594, + "learning_rate": 1.943641663041354e-05, + "loss": 1.5988, + "step": 2944 + }, + { + "epoch": 0.1610103467599743, + "grad_norm": 1.4769959449768066, + "learning_rate": 1.9435811674435308e-05, + "loss": 1.2374, + "step": 2945 + }, + { + "epoch": 0.16106501920369587, + "grad_norm": 1.6258399486541748, + "learning_rate": 1.943520640337318e-05, + "loss": 1.434, + "step": 2946 + }, + { + "epoch": 0.1611196916474174, + "grad_norm": 1.3542473316192627, + "learning_rate": 1.9434600817247368e-05, + "loss": 1.3967, + "step": 2947 + }, + { + "epoch": 0.16117436409113897, + "grad_norm": 1.2718983888626099, + "learning_rate": 1.94339949160781e-05, + "loss": 1.488, + "step": 2948 + }, + { + "epoch": 0.1612290365348605, + "grad_norm": 1.429550051689148, + "learning_rate": 1.94333886998856e-05, + "loss": 1.6153, + "step": 2949 + }, + { + "epoch": 0.16128370897858207, + "grad_norm": 1.4008820056915283, + "learning_rate": 1.943278216869012e-05, + "loss": 1.4494, + "step": 2950 + }, + { + "epoch": 0.1613383814223036, + "grad_norm": 1.601761817932129, + "learning_rate": 1.943217532251191e-05, + "loss": 1.4372, + "step": 2951 + }, + { + "epoch": 0.16139305386602518, + "grad_norm": 1.6986496448516846, + "learning_rate": 1.9431568161371226e-05, + "loss": 1.745, + "step": 2952 + }, + { + "epoch": 0.16144772630974674, + "grad_norm": 1.753475546836853, + "learning_rate": 1.9430960685288355e-05, + "loss": 1.6317, + "step": 2953 + }, + { + "epoch": 0.16150239875346828, + "grad_norm": 1.3728086948394775, + "learning_rate": 1.943035289428357e-05, + "loss": 1.2953, + "step": 2954 + }, + { + "epoch": 0.16155707119718984, + "grad_norm": 1.1990617513656616, + "learning_rate": 1.9429744788377178e-05, + "loss": 1.4537, + "step": 2955 + }, + { + "epoch": 0.16161174364091138, + "grad_norm": 1.6702474355697632, + "learning_rate": 1.942913636758948e-05, + "loss": 1.2114, + "step": 2956 + }, + { + "epoch": 0.16166641608463295, + "grad_norm": 1.4698245525360107, + "learning_rate": 1.942852763194079e-05, + "loss": 1.4242, + "step": 2957 + }, + { + "epoch": 0.16172108852835448, + "grad_norm": 1.444946527481079, + "learning_rate": 1.942791858145144e-05, + "loss": 1.7511, + "step": 2958 + }, + { + "epoch": 0.16177576097207605, + "grad_norm": 1.7891674041748047, + "learning_rate": 1.9427309216141762e-05, + "loss": 1.5691, + "step": 2959 + }, + { + "epoch": 0.16183043341579761, + "grad_norm": 1.5604535341262817, + "learning_rate": 1.942669953603211e-05, + "loss": 1.5421, + "step": 2960 + }, + { + "epoch": 0.16188510585951915, + "grad_norm": 1.2771636247634888, + "learning_rate": 1.9426089541142838e-05, + "loss": 1.4826, + "step": 2961 + }, + { + "epoch": 0.16193977830324072, + "grad_norm": 1.1622397899627686, + "learning_rate": 1.942547923149432e-05, + "loss": 1.6104, + "step": 2962 + }, + { + "epoch": 0.16199445074696225, + "grad_norm": 1.5930923223495483, + "learning_rate": 1.942486860710693e-05, + "loss": 1.2975, + "step": 2963 + }, + { + "epoch": 0.16204912319068382, + "grad_norm": 1.493922233581543, + "learning_rate": 1.9424257668001064e-05, + "loss": 1.4635, + "step": 2964 + }, + { + "epoch": 0.16210379563440536, + "grad_norm": 1.3821995258331299, + "learning_rate": 1.9423646414197116e-05, + "loss": 1.6655, + "step": 2965 + }, + { + "epoch": 0.16215846807812692, + "grad_norm": 1.2558246850967407, + "learning_rate": 1.9423034845715506e-05, + "loss": 1.4165, + "step": 2966 + }, + { + "epoch": 0.1622131405218485, + "grad_norm": 1.5780304670333862, + "learning_rate": 1.9422422962576646e-05, + "loss": 1.5004, + "step": 2967 + }, + { + "epoch": 0.16226781296557002, + "grad_norm": 1.3586695194244385, + "learning_rate": 1.9421810764800978e-05, + "loss": 1.5963, + "step": 2968 + }, + { + "epoch": 0.1623224854092916, + "grad_norm": 1.5924283266067505, + "learning_rate": 1.9421198252408934e-05, + "loss": 1.7022, + "step": 2969 + }, + { + "epoch": 0.16237715785301313, + "grad_norm": 2.083353281021118, + "learning_rate": 1.9420585425420974e-05, + "loss": 1.5368, + "step": 2970 + }, + { + "epoch": 0.1624318302967347, + "grad_norm": 1.4521909952163696, + "learning_rate": 1.9419972283857563e-05, + "loss": 1.4152, + "step": 2971 + }, + { + "epoch": 0.16248650274045623, + "grad_norm": 1.3288148641586304, + "learning_rate": 1.941935882773917e-05, + "loss": 1.3983, + "step": 2972 + }, + { + "epoch": 0.1625411751841778, + "grad_norm": 1.5003793239593506, + "learning_rate": 1.9418745057086284e-05, + "loss": 1.458, + "step": 2973 + }, + { + "epoch": 0.16259584762789936, + "grad_norm": 1.1503868103027344, + "learning_rate": 1.94181309719194e-05, + "loss": 1.5168, + "step": 2974 + }, + { + "epoch": 0.1626505200716209, + "grad_norm": 3.044553518295288, + "learning_rate": 1.9417516572259022e-05, + "loss": 1.191, + "step": 2975 + }, + { + "epoch": 0.16270519251534246, + "grad_norm": 1.4828412532806396, + "learning_rate": 1.9416901858125663e-05, + "loss": 1.3705, + "step": 2976 + }, + { + "epoch": 0.162759864959064, + "grad_norm": 1.4201014041900635, + "learning_rate": 1.9416286829539858e-05, + "loss": 1.4265, + "step": 2977 + }, + { + "epoch": 0.16281453740278556, + "grad_norm": 1.6535860300064087, + "learning_rate": 1.9415671486522137e-05, + "loss": 1.3085, + "step": 2978 + }, + { + "epoch": 0.1628692098465071, + "grad_norm": 1.4409607648849487, + "learning_rate": 1.9415055829093054e-05, + "loss": 1.5535, + "step": 2979 + }, + { + "epoch": 0.16292388229022867, + "grad_norm": 1.8112503290176392, + "learning_rate": 1.941443985727316e-05, + "loss": 1.2867, + "step": 2980 + }, + { + "epoch": 0.16297855473395023, + "grad_norm": 1.7291656732559204, + "learning_rate": 1.941382357108303e-05, + "loss": 1.4712, + "step": 2981 + }, + { + "epoch": 0.16303322717767177, + "grad_norm": 1.1772103309631348, + "learning_rate": 1.9413206970543238e-05, + "loss": 1.4651, + "step": 2982 + }, + { + "epoch": 0.16308789962139333, + "grad_norm": 1.3987596035003662, + "learning_rate": 1.9412590055674378e-05, + "loss": 1.4318, + "step": 2983 + }, + { + "epoch": 0.16314257206511487, + "grad_norm": 1.6583417654037476, + "learning_rate": 1.941197282649705e-05, + "loss": 1.4602, + "step": 2984 + }, + { + "epoch": 0.16319724450883644, + "grad_norm": 1.2500516176223755, + "learning_rate": 1.9411355283031864e-05, + "loss": 1.5394, + "step": 2985 + }, + { + "epoch": 0.16325191695255797, + "grad_norm": 1.5902702808380127, + "learning_rate": 1.941073742529944e-05, + "loss": 1.3562, + "step": 2986 + }, + { + "epoch": 0.16330658939627954, + "grad_norm": 2.721428632736206, + "learning_rate": 1.9410119253320406e-05, + "loss": 1.2388, + "step": 2987 + }, + { + "epoch": 0.1633612618400011, + "grad_norm": 1.283828854560852, + "learning_rate": 1.9409500767115414e-05, + "loss": 1.7534, + "step": 2988 + }, + { + "epoch": 0.16341593428372264, + "grad_norm": 1.5297259092330933, + "learning_rate": 1.9408881966705107e-05, + "loss": 1.3753, + "step": 2989 + }, + { + "epoch": 0.1634706067274442, + "grad_norm": 1.4926596879959106, + "learning_rate": 1.940826285211016e-05, + "loss": 1.2934, + "step": 2990 + }, + { + "epoch": 0.16352527917116574, + "grad_norm": 1.374869465827942, + "learning_rate": 1.940764342335123e-05, + "loss": 1.4744, + "step": 2991 + }, + { + "epoch": 0.1635799516148873, + "grad_norm": 1.6708797216415405, + "learning_rate": 1.9407023680449012e-05, + "loss": 1.5387, + "step": 2992 + }, + { + "epoch": 0.16363462405860885, + "grad_norm": 1.4390288591384888, + "learning_rate": 1.9406403623424204e-05, + "loss": 1.5726, + "step": 2993 + }, + { + "epoch": 0.1636892965023304, + "grad_norm": 1.1983693838119507, + "learning_rate": 1.9405783252297505e-05, + "loss": 1.5905, + "step": 2994 + }, + { + "epoch": 0.16374396894605198, + "grad_norm": 1.4346269369125366, + "learning_rate": 1.9405162567089627e-05, + "loss": 1.3528, + "step": 2995 + }, + { + "epoch": 0.1637986413897735, + "grad_norm": 1.5084056854248047, + "learning_rate": 1.9404541567821305e-05, + "loss": 1.359, + "step": 2996 + }, + { + "epoch": 0.16385331383349508, + "grad_norm": 1.396955966949463, + "learning_rate": 1.9403920254513272e-05, + "loss": 1.4921, + "step": 2997 + }, + { + "epoch": 0.16390798627721662, + "grad_norm": 1.5066412687301636, + "learning_rate": 1.9403298627186277e-05, + "loss": 1.6141, + "step": 2998 + }, + { + "epoch": 0.16396265872093818, + "grad_norm": 1.3528639078140259, + "learning_rate": 1.940267668586107e-05, + "loss": 1.5181, + "step": 2999 + }, + { + "epoch": 0.16401733116465972, + "grad_norm": 1.6686880588531494, + "learning_rate": 1.9402054430558427e-05, + "loss": 1.5499, + "step": 3000 + }, + { + "epoch": 0.16407200360838128, + "grad_norm": 1.2461391687393188, + "learning_rate": 1.9401431861299122e-05, + "loss": 1.6696, + "step": 3001 + }, + { + "epoch": 0.16412667605210285, + "grad_norm": 1.4591553211212158, + "learning_rate": 1.9400808978103948e-05, + "loss": 1.259, + "step": 3002 + }, + { + "epoch": 0.1641813484958244, + "grad_norm": 1.5598564147949219, + "learning_rate": 1.94001857809937e-05, + "loss": 1.4062, + "step": 3003 + }, + { + "epoch": 0.16423602093954595, + "grad_norm": 1.9330321550369263, + "learning_rate": 1.9399562269989193e-05, + "loss": 1.499, + "step": 3004 + }, + { + "epoch": 0.1642906933832675, + "grad_norm": 2.116716146469116, + "learning_rate": 1.9398938445111245e-05, + "loss": 1.6801, + "step": 3005 + }, + { + "epoch": 0.16434536582698905, + "grad_norm": 2.449875831604004, + "learning_rate": 1.939831430638069e-05, + "loss": 1.376, + "step": 3006 + }, + { + "epoch": 0.16440003827071062, + "grad_norm": 1.667380452156067, + "learning_rate": 1.939768985381836e-05, + "loss": 1.3582, + "step": 3007 + }, + { + "epoch": 0.16445471071443216, + "grad_norm": 1.4617202281951904, + "learning_rate": 1.939706508744512e-05, + "loss": 1.3095, + "step": 3008 + }, + { + "epoch": 0.16450938315815372, + "grad_norm": 1.5037696361541748, + "learning_rate": 1.939644000728182e-05, + "loss": 1.3808, + "step": 3009 + }, + { + "epoch": 0.16456405560187526, + "grad_norm": 1.4128530025482178, + "learning_rate": 1.939581461334934e-05, + "loss": 1.4274, + "step": 3010 + }, + { + "epoch": 0.16461872804559682, + "grad_norm": 1.79700767993927, + "learning_rate": 1.9395188905668563e-05, + "loss": 1.4559, + "step": 3011 + }, + { + "epoch": 0.16467340048931836, + "grad_norm": 1.4911346435546875, + "learning_rate": 1.9394562884260382e-05, + "loss": 1.9639, + "step": 3012 + }, + { + "epoch": 0.16472807293303993, + "grad_norm": 1.623079538345337, + "learning_rate": 1.9393936549145703e-05, + "loss": 1.7449, + "step": 3013 + }, + { + "epoch": 0.1647827453767615, + "grad_norm": 1.5332130193710327, + "learning_rate": 1.9393309900345436e-05, + "loss": 1.533, + "step": 3014 + }, + { + "epoch": 0.16483741782048303, + "grad_norm": 1.3567187786102295, + "learning_rate": 1.939268293788051e-05, + "loss": 1.3696, + "step": 3015 + }, + { + "epoch": 0.1648920902642046, + "grad_norm": 1.935801386833191, + "learning_rate": 1.939205566177186e-05, + "loss": 1.3264, + "step": 3016 + }, + { + "epoch": 0.16494676270792613, + "grad_norm": 1.778605341911316, + "learning_rate": 1.9391428072040432e-05, + "loss": 1.6975, + "step": 3017 + }, + { + "epoch": 0.1650014351516477, + "grad_norm": 1.5016435384750366, + "learning_rate": 1.9390800168707185e-05, + "loss": 1.4414, + "step": 3018 + }, + { + "epoch": 0.16505610759536923, + "grad_norm": 1.4284390211105347, + "learning_rate": 1.939017195179308e-05, + "loss": 1.4671, + "step": 3019 + }, + { + "epoch": 0.1651107800390908, + "grad_norm": 1.5254812240600586, + "learning_rate": 1.9389543421319106e-05, + "loss": 1.4437, + "step": 3020 + }, + { + "epoch": 0.16516545248281236, + "grad_norm": 1.5030527114868164, + "learning_rate": 1.938891457730624e-05, + "loss": 1.5906, + "step": 3021 + }, + { + "epoch": 0.1652201249265339, + "grad_norm": 1.836498737335205, + "learning_rate": 1.9388285419775482e-05, + "loss": 1.3868, + "step": 3022 + }, + { + "epoch": 0.16527479737025547, + "grad_norm": 1.6859855651855469, + "learning_rate": 1.938765594874785e-05, + "loss": 1.6398, + "step": 3023 + }, + { + "epoch": 0.165329469813977, + "grad_norm": 1.7381324768066406, + "learning_rate": 1.9387026164244347e-05, + "loss": 1.5532, + "step": 3024 + }, + { + "epoch": 0.16538414225769857, + "grad_norm": 1.7637310028076172, + "learning_rate": 1.9386396066286024e-05, + "loss": 1.5661, + "step": 3025 + }, + { + "epoch": 0.1654388147014201, + "grad_norm": 1.9261845350265503, + "learning_rate": 1.9385765654893905e-05, + "loss": 1.4873, + "step": 3026 + }, + { + "epoch": 0.16549348714514167, + "grad_norm": 2.0983328819274902, + "learning_rate": 1.9385134930089046e-05, + "loss": 1.516, + "step": 3027 + }, + { + "epoch": 0.16554815958886324, + "grad_norm": 1.8908144235610962, + "learning_rate": 1.938450389189251e-05, + "loss": 1.5207, + "step": 3028 + }, + { + "epoch": 0.16560283203258477, + "grad_norm": 1.8914282321929932, + "learning_rate": 1.9383872540325366e-05, + "loss": 1.2451, + "step": 3029 + }, + { + "epoch": 0.16565750447630634, + "grad_norm": 2.457886219024658, + "learning_rate": 1.93832408754087e-05, + "loss": 1.4861, + "step": 3030 + }, + { + "epoch": 0.16571217692002788, + "grad_norm": 2.0223898887634277, + "learning_rate": 1.93826088971636e-05, + "loss": 1.552, + "step": 3031 + }, + { + "epoch": 0.16576684936374944, + "grad_norm": 1.6843526363372803, + "learning_rate": 1.9381976605611176e-05, + "loss": 1.5317, + "step": 3032 + }, + { + "epoch": 0.16582152180747098, + "grad_norm": 1.330568790435791, + "learning_rate": 1.9381344000772535e-05, + "loss": 1.485, + "step": 3033 + }, + { + "epoch": 0.16587619425119254, + "grad_norm": 1.7773728370666504, + "learning_rate": 1.9380711082668805e-05, + "loss": 1.3043, + "step": 3034 + }, + { + "epoch": 0.1659308666949141, + "grad_norm": 2.380366325378418, + "learning_rate": 1.9380077851321117e-05, + "loss": 1.7482, + "step": 3035 + }, + { + "epoch": 0.16598553913863565, + "grad_norm": 1.2891788482666016, + "learning_rate": 1.937944430675062e-05, + "loss": 1.4124, + "step": 3036 + }, + { + "epoch": 0.1660402115823572, + "grad_norm": 2.1071598529815674, + "learning_rate": 1.937881044897847e-05, + "loss": 1.6218, + "step": 3037 + }, + { + "epoch": 0.16609488402607875, + "grad_norm": 1.6929887533187866, + "learning_rate": 1.937817627802583e-05, + "loss": 1.5572, + "step": 3038 + }, + { + "epoch": 0.1661495564698003, + "grad_norm": 1.333242416381836, + "learning_rate": 1.9377541793913876e-05, + "loss": 1.4223, + "step": 3039 + }, + { + "epoch": 0.16620422891352185, + "grad_norm": 1.8188179731369019, + "learning_rate": 1.9376906996663795e-05, + "loss": 1.6296, + "step": 3040 + }, + { + "epoch": 0.16625890135724342, + "grad_norm": 1.4532395601272583, + "learning_rate": 1.937627188629679e-05, + "loss": 1.6156, + "step": 3041 + }, + { + "epoch": 0.16631357380096498, + "grad_norm": 1.5437201261520386, + "learning_rate": 1.9375636462834062e-05, + "loss": 1.3578, + "step": 3042 + }, + { + "epoch": 0.16636824624468652, + "grad_norm": 1.968035101890564, + "learning_rate": 1.9375000726296834e-05, + "loss": 1.4319, + "step": 3043 + }, + { + "epoch": 0.16642291868840808, + "grad_norm": 1.8434683084487915, + "learning_rate": 1.937436467670633e-05, + "loss": 1.4023, + "step": 3044 + }, + { + "epoch": 0.16647759113212962, + "grad_norm": 1.8007043600082397, + "learning_rate": 1.937372831408379e-05, + "loss": 1.2476, + "step": 3045 + }, + { + "epoch": 0.16653226357585119, + "grad_norm": 1.6490200757980347, + "learning_rate": 1.9373091638450472e-05, + "loss": 1.542, + "step": 3046 + }, + { + "epoch": 0.16658693601957272, + "grad_norm": 1.2298974990844727, + "learning_rate": 1.9372454649827626e-05, + "loss": 1.5699, + "step": 3047 + }, + { + "epoch": 0.1666416084632943, + "grad_norm": 1.899327039718628, + "learning_rate": 1.9371817348236525e-05, + "loss": 1.0467, + "step": 3048 + }, + { + "epoch": 0.16669628090701585, + "grad_norm": 1.2843220233917236, + "learning_rate": 1.937117973369845e-05, + "loss": 1.4789, + "step": 3049 + }, + { + "epoch": 0.1667509533507374, + "grad_norm": 1.3068859577178955, + "learning_rate": 1.93705418062347e-05, + "loss": 1.5919, + "step": 3050 + }, + { + "epoch": 0.16680562579445896, + "grad_norm": 1.6323587894439697, + "learning_rate": 1.9369903565866565e-05, + "loss": 1.4817, + "step": 3051 + }, + { + "epoch": 0.1668602982381805, + "grad_norm": 1.4983118772506714, + "learning_rate": 1.9369265012615362e-05, + "loss": 1.5352, + "step": 3052 + }, + { + "epoch": 0.16691497068190206, + "grad_norm": 1.4310681819915771, + "learning_rate": 1.9368626146502416e-05, + "loss": 1.5069, + "step": 3053 + }, + { + "epoch": 0.1669696431256236, + "grad_norm": 1.0704452991485596, + "learning_rate": 1.936798696754906e-05, + "loss": 1.4322, + "step": 3054 + }, + { + "epoch": 0.16702431556934516, + "grad_norm": 1.5754344463348389, + "learning_rate": 1.9367347475776633e-05, + "loss": 1.437, + "step": 3055 + }, + { + "epoch": 0.16707898801306673, + "grad_norm": 2.0662853717803955, + "learning_rate": 1.9366707671206496e-05, + "loss": 1.4166, + "step": 3056 + }, + { + "epoch": 0.16713366045678826, + "grad_norm": 1.5312504768371582, + "learning_rate": 1.936606755386001e-05, + "loss": 1.5283, + "step": 3057 + }, + { + "epoch": 0.16718833290050983, + "grad_norm": 1.271127462387085, + "learning_rate": 1.936542712375855e-05, + "loss": 1.5567, + "step": 3058 + }, + { + "epoch": 0.16724300534423137, + "grad_norm": 1.302716851234436, + "learning_rate": 1.9364786380923503e-05, + "loss": 1.6549, + "step": 3059 + }, + { + "epoch": 0.16729767778795293, + "grad_norm": 1.2980035543441772, + "learning_rate": 1.936414532537626e-05, + "loss": 1.5811, + "step": 3060 + }, + { + "epoch": 0.16735235023167447, + "grad_norm": 1.6364247798919678, + "learning_rate": 1.9363503957138235e-05, + "loss": 1.6713, + "step": 3061 + }, + { + "epoch": 0.16740702267539603, + "grad_norm": 1.5148025751113892, + "learning_rate": 1.9362862276230837e-05, + "loss": 1.3975, + "step": 3062 + }, + { + "epoch": 0.1674616951191176, + "grad_norm": 1.435078740119934, + "learning_rate": 1.9362220282675498e-05, + "loss": 1.5577, + "step": 3063 + }, + { + "epoch": 0.16751636756283914, + "grad_norm": 1.3254634141921997, + "learning_rate": 1.9361577976493654e-05, + "loss": 1.517, + "step": 3064 + }, + { + "epoch": 0.1675710400065607, + "grad_norm": 1.228071689605713, + "learning_rate": 1.9360935357706756e-05, + "loss": 1.4257, + "step": 3065 + }, + { + "epoch": 0.16762571245028224, + "grad_norm": 1.4035472869873047, + "learning_rate": 1.9360292426336263e-05, + "loss": 1.4809, + "step": 3066 + }, + { + "epoch": 0.1676803848940038, + "grad_norm": 1.3281298875808716, + "learning_rate": 1.9359649182403633e-05, + "loss": 1.5607, + "step": 3067 + }, + { + "epoch": 0.16773505733772534, + "grad_norm": 1.4426748752593994, + "learning_rate": 1.935900562593036e-05, + "loss": 1.5548, + "step": 3068 + }, + { + "epoch": 0.1677897297814469, + "grad_norm": 1.3928881883621216, + "learning_rate": 1.9358361756937926e-05, + "loss": 1.4853, + "step": 3069 + }, + { + "epoch": 0.16784440222516847, + "grad_norm": 1.7229942083358765, + "learning_rate": 1.935771757544783e-05, + "loss": 1.488, + "step": 3070 + }, + { + "epoch": 0.16789907466889, + "grad_norm": 1.9389572143554688, + "learning_rate": 1.935707308148159e-05, + "loss": 1.4474, + "step": 3071 + }, + { + "epoch": 0.16795374711261157, + "grad_norm": 1.9239528179168701, + "learning_rate": 1.9356428275060722e-05, + "loss": 1.3524, + "step": 3072 + }, + { + "epoch": 0.1680084195563331, + "grad_norm": 1.7786401510238647, + "learning_rate": 1.9355783156206755e-05, + "loss": 1.5178, + "step": 3073 + }, + { + "epoch": 0.16806309200005468, + "grad_norm": 1.5896676778793335, + "learning_rate": 1.9355137724941237e-05, + "loss": 1.6251, + "step": 3074 + }, + { + "epoch": 0.1681177644437762, + "grad_norm": 1.878134846687317, + "learning_rate": 1.935449198128572e-05, + "loss": 1.4628, + "step": 3075 + }, + { + "epoch": 0.16817243688749778, + "grad_norm": 1.3429179191589355, + "learning_rate": 1.935384592526176e-05, + "loss": 1.3788, + "step": 3076 + }, + { + "epoch": 0.16822710933121934, + "grad_norm": 1.6732711791992188, + "learning_rate": 1.935319955689094e-05, + "loss": 1.6279, + "step": 3077 + }, + { + "epoch": 0.16828178177494088, + "grad_norm": 1.79249906539917, + "learning_rate": 1.9352552876194835e-05, + "loss": 1.525, + "step": 3078 + }, + { + "epoch": 0.16833645421866245, + "grad_norm": 1.4803141355514526, + "learning_rate": 1.9351905883195044e-05, + "loss": 1.6745, + "step": 3079 + }, + { + "epoch": 0.16839112666238398, + "grad_norm": 1.4704283475875854, + "learning_rate": 1.935125857791317e-05, + "loss": 1.558, + "step": 3080 + }, + { + "epoch": 0.16844579910610555, + "grad_norm": 1.751530647277832, + "learning_rate": 1.935061096037083e-05, + "loss": 1.3398, + "step": 3081 + }, + { + "epoch": 0.16850047154982709, + "grad_norm": 1.9059169292449951, + "learning_rate": 1.9349963030589648e-05, + "loss": 1.4655, + "step": 3082 + }, + { + "epoch": 0.16855514399354865, + "grad_norm": 1.1809487342834473, + "learning_rate": 1.9349314788591258e-05, + "loss": 1.5129, + "step": 3083 + }, + { + "epoch": 0.16860981643727022, + "grad_norm": 1.3275114297866821, + "learning_rate": 1.934866623439731e-05, + "loss": 1.5721, + "step": 3084 + }, + { + "epoch": 0.16866448888099175, + "grad_norm": 1.7439683675765991, + "learning_rate": 1.9348017368029458e-05, + "loss": 1.2788, + "step": 3085 + }, + { + "epoch": 0.16871916132471332, + "grad_norm": 1.9172227382659912, + "learning_rate": 1.934736818950937e-05, + "loss": 1.462, + "step": 3086 + }, + { + "epoch": 0.16877383376843486, + "grad_norm": 1.3674726486206055, + "learning_rate": 1.9346718698858728e-05, + "loss": 1.7915, + "step": 3087 + }, + { + "epoch": 0.16882850621215642, + "grad_norm": 1.5309780836105347, + "learning_rate": 1.934606889609921e-05, + "loss": 1.3542, + "step": 3088 + }, + { + "epoch": 0.16888317865587796, + "grad_norm": 1.393235683441162, + "learning_rate": 1.9345418781252527e-05, + "loss": 1.5528, + "step": 3089 + }, + { + "epoch": 0.16893785109959952, + "grad_norm": 1.3432896137237549, + "learning_rate": 1.9344768354340378e-05, + "loss": 1.3604, + "step": 3090 + }, + { + "epoch": 0.1689925235433211, + "grad_norm": 1.5062990188598633, + "learning_rate": 1.9344117615384483e-05, + "loss": 1.3777, + "step": 3091 + }, + { + "epoch": 0.16904719598704263, + "grad_norm": 1.3834000825881958, + "learning_rate": 1.9343466564406576e-05, + "loss": 1.5555, + "step": 3092 + }, + { + "epoch": 0.1691018684307642, + "grad_norm": 1.8492558002471924, + "learning_rate": 1.9342815201428394e-05, + "loss": 1.3536, + "step": 3093 + }, + { + "epoch": 0.16915654087448573, + "grad_norm": 2.0964579582214355, + "learning_rate": 1.934216352647169e-05, + "loss": 1.4894, + "step": 3094 + }, + { + "epoch": 0.1692112133182073, + "grad_norm": 1.4693478345870972, + "learning_rate": 1.9341511539558227e-05, + "loss": 1.527, + "step": 3095 + }, + { + "epoch": 0.16926588576192883, + "grad_norm": 2.2813808917999268, + "learning_rate": 1.934085924070977e-05, + "loss": 1.5681, + "step": 3096 + }, + { + "epoch": 0.1693205582056504, + "grad_norm": 1.308854103088379, + "learning_rate": 1.9340206629948104e-05, + "loss": 1.4854, + "step": 3097 + }, + { + "epoch": 0.16937523064937196, + "grad_norm": 1.6844288110733032, + "learning_rate": 1.933955370729502e-05, + "loss": 1.4198, + "step": 3098 + }, + { + "epoch": 0.1694299030930935, + "grad_norm": 1.66063392162323, + "learning_rate": 1.9338900472772323e-05, + "loss": 1.341, + "step": 3099 + }, + { + "epoch": 0.16948457553681506, + "grad_norm": 1.377745509147644, + "learning_rate": 1.9338246926401828e-05, + "loss": 1.5677, + "step": 3100 + }, + { + "epoch": 0.1695392479805366, + "grad_norm": 1.6490269899368286, + "learning_rate": 1.9337593068205353e-05, + "loss": 1.5164, + "step": 3101 + }, + { + "epoch": 0.16959392042425817, + "grad_norm": 1.6470216512680054, + "learning_rate": 1.933693889820473e-05, + "loss": 1.7168, + "step": 3102 + }, + { + "epoch": 0.1696485928679797, + "grad_norm": 1.1970065832138062, + "learning_rate": 1.933628441642181e-05, + "loss": 1.5567, + "step": 3103 + }, + { + "epoch": 0.16970326531170127, + "grad_norm": 1.5611083507537842, + "learning_rate": 1.9335629622878445e-05, + "loss": 1.5166, + "step": 3104 + }, + { + "epoch": 0.16975793775542283, + "grad_norm": 1.762465000152588, + "learning_rate": 1.9334974517596504e-05, + "loss": 1.7025, + "step": 3105 + }, + { + "epoch": 0.16981261019914437, + "grad_norm": 1.5442931652069092, + "learning_rate": 1.9334319100597855e-05, + "loss": 1.4961, + "step": 3106 + }, + { + "epoch": 0.16986728264286594, + "grad_norm": 1.529619812965393, + "learning_rate": 1.9333663371904388e-05, + "loss": 1.5856, + "step": 3107 + }, + { + "epoch": 0.16992195508658747, + "grad_norm": 1.4157172441482544, + "learning_rate": 1.9333007331537998e-05, + "loss": 1.3978, + "step": 3108 + }, + { + "epoch": 0.16997662753030904, + "grad_norm": 1.4084872007369995, + "learning_rate": 1.9332350979520594e-05, + "loss": 1.5193, + "step": 3109 + }, + { + "epoch": 0.1700312999740306, + "grad_norm": 1.3035317659378052, + "learning_rate": 1.933169431587409e-05, + "loss": 1.5999, + "step": 3110 + }, + { + "epoch": 0.17008597241775214, + "grad_norm": 1.3091024160385132, + "learning_rate": 1.9331037340620418e-05, + "loss": 1.4178, + "step": 3111 + }, + { + "epoch": 0.1701406448614737, + "grad_norm": 1.7417991161346436, + "learning_rate": 1.9330380053781512e-05, + "loss": 1.7037, + "step": 3112 + }, + { + "epoch": 0.17019531730519524, + "grad_norm": 2.005087375640869, + "learning_rate": 1.932972245537932e-05, + "loss": 1.6466, + "step": 3113 + }, + { + "epoch": 0.1702499897489168, + "grad_norm": 1.6346591711044312, + "learning_rate": 1.9329064545435803e-05, + "loss": 1.5334, + "step": 3114 + }, + { + "epoch": 0.17030466219263835, + "grad_norm": 1.1970402002334595, + "learning_rate": 1.932840632397293e-05, + "loss": 1.4938, + "step": 3115 + }, + { + "epoch": 0.1703593346363599, + "grad_norm": 1.481830358505249, + "learning_rate": 1.932774779101268e-05, + "loss": 1.632, + "step": 3116 + }, + { + "epoch": 0.17041400708008148, + "grad_norm": 1.6723536252975464, + "learning_rate": 1.9327088946577042e-05, + "loss": 1.6203, + "step": 3117 + }, + { + "epoch": 0.170468679523803, + "grad_norm": 1.489465594291687, + "learning_rate": 1.9326429790688022e-05, + "loss": 1.3744, + "step": 3118 + }, + { + "epoch": 0.17052335196752458, + "grad_norm": 1.5255831480026245, + "learning_rate": 1.932577032336762e-05, + "loss": 1.2426, + "step": 3119 + }, + { + "epoch": 0.17057802441124612, + "grad_norm": 1.3905680179595947, + "learning_rate": 1.9325110544637868e-05, + "loss": 1.3449, + "step": 3120 + }, + { + "epoch": 0.17063269685496768, + "grad_norm": 1.511493444442749, + "learning_rate": 1.932445045452079e-05, + "loss": 1.4778, + "step": 3121 + }, + { + "epoch": 0.17068736929868922, + "grad_norm": 1.3191237449645996, + "learning_rate": 1.9323790053038434e-05, + "loss": 1.3768, + "step": 3122 + }, + { + "epoch": 0.17074204174241078, + "grad_norm": 1.9568686485290527, + "learning_rate": 1.9323129340212844e-05, + "loss": 1.3286, + "step": 3123 + }, + { + "epoch": 0.17079671418613235, + "grad_norm": 1.4397655725479126, + "learning_rate": 1.932246831606609e-05, + "loss": 1.4657, + "step": 3124 + }, + { + "epoch": 0.17085138662985389, + "grad_norm": 1.8057756423950195, + "learning_rate": 1.9321806980620246e-05, + "loss": 1.5002, + "step": 3125 + }, + { + "epoch": 0.17090605907357545, + "grad_norm": 1.6423299312591553, + "learning_rate": 1.932114533389739e-05, + "loss": 1.2245, + "step": 3126 + }, + { + "epoch": 0.170960731517297, + "grad_norm": 1.9289761781692505, + "learning_rate": 1.932048337591962e-05, + "loss": 1.3428, + "step": 3127 + }, + { + "epoch": 0.17101540396101855, + "grad_norm": 1.7772958278656006, + "learning_rate": 1.931982110670904e-05, + "loss": 1.5414, + "step": 3128 + }, + { + "epoch": 0.1710700764047401, + "grad_norm": 1.4748001098632812, + "learning_rate": 1.931915852628776e-05, + "loss": 1.4339, + "step": 3129 + }, + { + "epoch": 0.17112474884846166, + "grad_norm": 1.5960925817489624, + "learning_rate": 1.931849563467791e-05, + "loss": 1.6332, + "step": 3130 + }, + { + "epoch": 0.17117942129218322, + "grad_norm": 2.0493133068084717, + "learning_rate": 1.9317832431901623e-05, + "loss": 1.4446, + "step": 3131 + }, + { + "epoch": 0.17123409373590476, + "grad_norm": 1.5845820903778076, + "learning_rate": 1.9317168917981048e-05, + "loss": 1.7426, + "step": 3132 + }, + { + "epoch": 0.17128876617962632, + "grad_norm": 1.6221719980239868, + "learning_rate": 1.931650509293834e-05, + "loss": 1.4301, + "step": 3133 + }, + { + "epoch": 0.17134343862334786, + "grad_norm": 1.4620343446731567, + "learning_rate": 1.9315840956795663e-05, + "loss": 1.4945, + "step": 3134 + }, + { + "epoch": 0.17139811106706943, + "grad_norm": 1.4665517807006836, + "learning_rate": 1.9315176509575196e-05, + "loss": 1.398, + "step": 3135 + }, + { + "epoch": 0.17145278351079096, + "grad_norm": 1.3346710205078125, + "learning_rate": 1.9314511751299128e-05, + "loss": 1.4334, + "step": 3136 + }, + { + "epoch": 0.17150745595451253, + "grad_norm": 1.5265414714813232, + "learning_rate": 1.9313846681989655e-05, + "loss": 1.7504, + "step": 3137 + }, + { + "epoch": 0.1715621283982341, + "grad_norm": 1.019304871559143, + "learning_rate": 1.931318130166899e-05, + "loss": 1.5467, + "step": 3138 + }, + { + "epoch": 0.17161680084195563, + "grad_norm": 1.4617949724197388, + "learning_rate": 1.931251561035934e-05, + "loss": 1.5481, + "step": 3139 + }, + { + "epoch": 0.1716714732856772, + "grad_norm": 1.3039498329162598, + "learning_rate": 1.9311849608082945e-05, + "loss": 1.3898, + "step": 3140 + }, + { + "epoch": 0.17172614572939873, + "grad_norm": 1.972407341003418, + "learning_rate": 1.931118329486204e-05, + "loss": 1.4501, + "step": 3141 + }, + { + "epoch": 0.1717808181731203, + "grad_norm": 1.5053396224975586, + "learning_rate": 1.9310516670718877e-05, + "loss": 1.4487, + "step": 3142 + }, + { + "epoch": 0.17183549061684184, + "grad_norm": 1.504928469657898, + "learning_rate": 1.930984973567571e-05, + "loss": 1.4996, + "step": 3143 + }, + { + "epoch": 0.1718901630605634, + "grad_norm": 1.4565448760986328, + "learning_rate": 1.9309182489754818e-05, + "loss": 1.4434, + "step": 3144 + }, + { + "epoch": 0.17194483550428497, + "grad_norm": 1.333269476890564, + "learning_rate": 1.930851493297848e-05, + "loss": 1.5284, + "step": 3145 + }, + { + "epoch": 0.1719995079480065, + "grad_norm": 2.056644916534424, + "learning_rate": 1.9307847065368982e-05, + "loss": 1.2677, + "step": 3146 + }, + { + "epoch": 0.17205418039172807, + "grad_norm": 1.5421801805496216, + "learning_rate": 1.9307178886948626e-05, + "loss": 1.2055, + "step": 3147 + }, + { + "epoch": 0.1721088528354496, + "grad_norm": 1.3173203468322754, + "learning_rate": 1.9306510397739733e-05, + "loss": 1.466, + "step": 3148 + }, + { + "epoch": 0.17216352527917117, + "grad_norm": 1.5367733240127563, + "learning_rate": 1.9305841597764615e-05, + "loss": 1.3903, + "step": 3149 + }, + { + "epoch": 0.1722181977228927, + "grad_norm": 1.3273769617080688, + "learning_rate": 1.930517248704561e-05, + "loss": 1.3604, + "step": 3150 + }, + { + "epoch": 0.17227287016661427, + "grad_norm": 1.5097126960754395, + "learning_rate": 1.9304503065605066e-05, + "loss": 1.603, + "step": 3151 + }, + { + "epoch": 0.17232754261033584, + "grad_norm": 1.4010618925094604, + "learning_rate": 1.930383333346532e-05, + "loss": 1.4587, + "step": 3152 + }, + { + "epoch": 0.17238221505405737, + "grad_norm": 1.5101701021194458, + "learning_rate": 1.930316329064876e-05, + "loss": 1.3765, + "step": 3153 + }, + { + "epoch": 0.17243688749777894, + "grad_norm": 1.4630080461502075, + "learning_rate": 1.9302492937177736e-05, + "loss": 1.4591, + "step": 3154 + }, + { + "epoch": 0.17249155994150048, + "grad_norm": 1.7711553573608398, + "learning_rate": 1.930182227307465e-05, + "loss": 1.4819, + "step": 3155 + }, + { + "epoch": 0.17254623238522204, + "grad_norm": 2.1108508110046387, + "learning_rate": 1.9301151298361887e-05, + "loss": 1.6829, + "step": 3156 + }, + { + "epoch": 0.17260090482894358, + "grad_norm": 1.514237403869629, + "learning_rate": 1.9300480013061863e-05, + "loss": 1.4847, + "step": 3157 + }, + { + "epoch": 0.17265557727266514, + "grad_norm": 1.5815564393997192, + "learning_rate": 1.929980841719698e-05, + "loss": 1.4147, + "step": 3158 + }, + { + "epoch": 0.1727102497163867, + "grad_norm": 1.7810730934143066, + "learning_rate": 1.9299136510789675e-05, + "loss": 1.5176, + "step": 3159 + }, + { + "epoch": 0.17276492216010825, + "grad_norm": 1.5655782222747803, + "learning_rate": 1.9298464293862377e-05, + "loss": 1.1961, + "step": 3160 + }, + { + "epoch": 0.1728195946038298, + "grad_norm": 1.619165062904358, + "learning_rate": 1.9297791766437538e-05, + "loss": 1.3949, + "step": 3161 + }, + { + "epoch": 0.17287426704755135, + "grad_norm": 1.2863025665283203, + "learning_rate": 1.929711892853762e-05, + "loss": 1.4981, + "step": 3162 + }, + { + "epoch": 0.17292893949127291, + "grad_norm": 1.3323827981948853, + "learning_rate": 1.9296445780185077e-05, + "loss": 1.4296, + "step": 3163 + }, + { + "epoch": 0.17298361193499445, + "grad_norm": 1.7692010402679443, + "learning_rate": 1.92957723214024e-05, + "loss": 1.3622, + "step": 3164 + }, + { + "epoch": 0.17303828437871602, + "grad_norm": 1.5927460193634033, + "learning_rate": 1.9295098552212067e-05, + "loss": 1.6138, + "step": 3165 + }, + { + "epoch": 0.17309295682243758, + "grad_norm": 1.7242463827133179, + "learning_rate": 1.9294424472636587e-05, + "loss": 1.5384, + "step": 3166 + }, + { + "epoch": 0.17314762926615912, + "grad_norm": 1.238429069519043, + "learning_rate": 1.9293750082698457e-05, + "loss": 1.6715, + "step": 3167 + }, + { + "epoch": 0.17320230170988068, + "grad_norm": 1.5376235246658325, + "learning_rate": 1.929307538242021e-05, + "loss": 1.3182, + "step": 3168 + }, + { + "epoch": 0.17325697415360222, + "grad_norm": 1.2318739891052246, + "learning_rate": 1.9292400371824365e-05, + "loss": 1.4683, + "step": 3169 + }, + { + "epoch": 0.1733116465973238, + "grad_norm": 1.7218809127807617, + "learning_rate": 1.929172505093347e-05, + "loss": 1.451, + "step": 3170 + }, + { + "epoch": 0.17336631904104532, + "grad_norm": 1.6974908113479614, + "learning_rate": 1.9291049419770067e-05, + "loss": 1.3273, + "step": 3171 + }, + { + "epoch": 0.1734209914847669, + "grad_norm": 1.8820066452026367, + "learning_rate": 1.9290373478356724e-05, + "loss": 1.6182, + "step": 3172 + }, + { + "epoch": 0.17347566392848845, + "grad_norm": 1.4464977979660034, + "learning_rate": 1.928969722671601e-05, + "loss": 1.5379, + "step": 3173 + }, + { + "epoch": 0.17353033637221, + "grad_norm": 1.4760277271270752, + "learning_rate": 1.9289020664870505e-05, + "loss": 1.3343, + "step": 3174 + }, + { + "epoch": 0.17358500881593156, + "grad_norm": 1.781968593597412, + "learning_rate": 1.928834379284281e-05, + "loss": 1.7464, + "step": 3175 + }, + { + "epoch": 0.1736396812596531, + "grad_norm": 1.4150248765945435, + "learning_rate": 1.928766661065551e-05, + "loss": 1.4519, + "step": 3176 + }, + { + "epoch": 0.17369435370337466, + "grad_norm": 1.6286324262619019, + "learning_rate": 1.928698911833123e-05, + "loss": 1.3446, + "step": 3177 + }, + { + "epoch": 0.1737490261470962, + "grad_norm": 1.7663987874984741, + "learning_rate": 1.9286311315892595e-05, + "loss": 1.4637, + "step": 3178 + }, + { + "epoch": 0.17380369859081776, + "grad_norm": 1.4140697717666626, + "learning_rate": 1.9285633203362234e-05, + "loss": 1.6076, + "step": 3179 + }, + { + "epoch": 0.17385837103453933, + "grad_norm": 1.4212020635604858, + "learning_rate": 1.9284954780762787e-05, + "loss": 1.4302, + "step": 3180 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 1.2697545289993286, + "learning_rate": 1.9284276048116915e-05, + "loss": 1.4974, + "step": 3181 + }, + { + "epoch": 0.17396771592198243, + "grad_norm": 1.558129906654358, + "learning_rate": 1.9283597005447277e-05, + "loss": 1.6667, + "step": 3182 + }, + { + "epoch": 0.17402238836570397, + "grad_norm": 1.3632019758224487, + "learning_rate": 1.9282917652776554e-05, + "loss": 1.499, + "step": 3183 + }, + { + "epoch": 0.17407706080942553, + "grad_norm": 1.6385568380355835, + "learning_rate": 1.9282237990127425e-05, + "loss": 1.4251, + "step": 3184 + }, + { + "epoch": 0.17413173325314707, + "grad_norm": 1.196079134941101, + "learning_rate": 1.9281558017522588e-05, + "loss": 1.4389, + "step": 3185 + }, + { + "epoch": 0.17418640569686863, + "grad_norm": 1.5430525541305542, + "learning_rate": 1.928087773498475e-05, + "loss": 1.3654, + "step": 3186 + }, + { + "epoch": 0.1742410781405902, + "grad_norm": 1.462515115737915, + "learning_rate": 1.9280197142536625e-05, + "loss": 1.2792, + "step": 3187 + }, + { + "epoch": 0.17429575058431174, + "grad_norm": 1.5380769968032837, + "learning_rate": 1.927951624020094e-05, + "loss": 1.4821, + "step": 3188 + }, + { + "epoch": 0.1743504230280333, + "grad_norm": 1.6558401584625244, + "learning_rate": 1.9278835028000434e-05, + "loss": 1.4931, + "step": 3189 + }, + { + "epoch": 0.17440509547175484, + "grad_norm": 1.4043374061584473, + "learning_rate": 1.9278153505957854e-05, + "loss": 1.449, + "step": 3190 + }, + { + "epoch": 0.1744597679154764, + "grad_norm": 1.3324021100997925, + "learning_rate": 1.9277471674095952e-05, + "loss": 1.4113, + "step": 3191 + }, + { + "epoch": 0.17451444035919794, + "grad_norm": 1.2585868835449219, + "learning_rate": 1.9276789532437506e-05, + "loss": 1.306, + "step": 3192 + }, + { + "epoch": 0.1745691128029195, + "grad_norm": 1.455937385559082, + "learning_rate": 1.9276107081005287e-05, + "loss": 1.5666, + "step": 3193 + }, + { + "epoch": 0.17462378524664107, + "grad_norm": 1.5852676630020142, + "learning_rate": 1.9275424319822088e-05, + "loss": 1.4339, + "step": 3194 + }, + { + "epoch": 0.1746784576903626, + "grad_norm": 1.4437249898910522, + "learning_rate": 1.92747412489107e-05, + "loss": 1.3606, + "step": 3195 + }, + { + "epoch": 0.17473313013408417, + "grad_norm": 1.6774979829788208, + "learning_rate": 1.927405786829394e-05, + "loss": 1.3776, + "step": 3196 + }, + { + "epoch": 0.1747878025778057, + "grad_norm": 1.512324571609497, + "learning_rate": 1.927337417799463e-05, + "loss": 1.386, + "step": 3197 + }, + { + "epoch": 0.17484247502152728, + "grad_norm": 1.5638595819473267, + "learning_rate": 1.927269017803559e-05, + "loss": 1.3535, + "step": 3198 + }, + { + "epoch": 0.17489714746524881, + "grad_norm": 1.4142882823944092, + "learning_rate": 1.9272005868439667e-05, + "loss": 1.3241, + "step": 3199 + }, + { + "epoch": 0.17495181990897038, + "grad_norm": 1.293529987335205, + "learning_rate": 1.927132124922971e-05, + "loss": 1.5399, + "step": 3200 + }, + { + "epoch": 0.17500649235269194, + "grad_norm": 1.822123646736145, + "learning_rate": 1.9270636320428583e-05, + "loss": 1.4062, + "step": 3201 + }, + { + "epoch": 0.17506116479641348, + "grad_norm": 1.6922852993011475, + "learning_rate": 1.9269951082059152e-05, + "loss": 1.4019, + "step": 3202 + }, + { + "epoch": 0.17511583724013505, + "grad_norm": 1.4010461568832397, + "learning_rate": 1.9269265534144306e-05, + "loss": 1.4335, + "step": 3203 + }, + { + "epoch": 0.17517050968385658, + "grad_norm": 1.6201215982437134, + "learning_rate": 1.926857967670693e-05, + "loss": 1.5343, + "step": 3204 + }, + { + "epoch": 0.17522518212757815, + "grad_norm": 1.3011380434036255, + "learning_rate": 1.9267893509769927e-05, + "loss": 1.6614, + "step": 3205 + }, + { + "epoch": 0.1752798545712997, + "grad_norm": 1.732466220855713, + "learning_rate": 1.9267207033356215e-05, + "loss": 1.4607, + "step": 3206 + }, + { + "epoch": 0.17533452701502125, + "grad_norm": 1.5225168466567993, + "learning_rate": 1.9266520247488716e-05, + "loss": 1.4382, + "step": 3207 + }, + { + "epoch": 0.17538919945874282, + "grad_norm": 1.5443854331970215, + "learning_rate": 1.9265833152190358e-05, + "loss": 1.5643, + "step": 3208 + }, + { + "epoch": 0.17544387190246435, + "grad_norm": 1.925004243850708, + "learning_rate": 1.926514574748409e-05, + "loss": 1.4663, + "step": 3209 + }, + { + "epoch": 0.17549854434618592, + "grad_norm": 1.5835984945297241, + "learning_rate": 1.926445803339286e-05, + "loss": 1.3862, + "step": 3210 + }, + { + "epoch": 0.17555321678990746, + "grad_norm": 1.8369262218475342, + "learning_rate": 1.926377000993964e-05, + "loss": 1.5639, + "step": 3211 + }, + { + "epoch": 0.17560788923362902, + "grad_norm": 1.4752120971679688, + "learning_rate": 1.92630816771474e-05, + "loss": 1.4438, + "step": 3212 + }, + { + "epoch": 0.1756625616773506, + "grad_norm": 1.1177431344985962, + "learning_rate": 1.9262393035039132e-05, + "loss": 1.503, + "step": 3213 + }, + { + "epoch": 0.17571723412107212, + "grad_norm": 1.9506056308746338, + "learning_rate": 1.926170408363782e-05, + "loss": 1.5885, + "step": 3214 + }, + { + "epoch": 0.1757719065647937, + "grad_norm": 1.3282222747802734, + "learning_rate": 1.9261014822966474e-05, + "loss": 1.3842, + "step": 3215 + }, + { + "epoch": 0.17582657900851523, + "grad_norm": 1.5114939212799072, + "learning_rate": 1.9260325253048116e-05, + "loss": 1.5405, + "step": 3216 + }, + { + "epoch": 0.1758812514522368, + "grad_norm": 1.365532398223877, + "learning_rate": 1.9259635373905764e-05, + "loss": 1.5992, + "step": 3217 + }, + { + "epoch": 0.17593592389595833, + "grad_norm": 1.5314362049102783, + "learning_rate": 1.925894518556246e-05, + "loss": 1.4728, + "step": 3218 + }, + { + "epoch": 0.1759905963396799, + "grad_norm": 1.43369722366333, + "learning_rate": 1.925825468804125e-05, + "loss": 1.8396, + "step": 3219 + }, + { + "epoch": 0.17604526878340146, + "grad_norm": 1.8491290807724, + "learning_rate": 1.925756388136519e-05, + "loss": 1.7625, + "step": 3220 + }, + { + "epoch": 0.176099941227123, + "grad_norm": 1.5815578699111938, + "learning_rate": 1.9256872765557348e-05, + "loss": 1.3981, + "step": 3221 + }, + { + "epoch": 0.17615461367084456, + "grad_norm": 1.9333621263504028, + "learning_rate": 1.9256181340640803e-05, + "loss": 1.5778, + "step": 3222 + }, + { + "epoch": 0.1762092861145661, + "grad_norm": 1.13008713722229, + "learning_rate": 1.9255489606638643e-05, + "loss": 1.6113, + "step": 3223 + }, + { + "epoch": 0.17626395855828766, + "grad_norm": 1.43746817111969, + "learning_rate": 1.9254797563573967e-05, + "loss": 1.5113, + "step": 3224 + }, + { + "epoch": 0.1763186310020092, + "grad_norm": 1.5538219213485718, + "learning_rate": 1.9254105211469883e-05, + "loss": 1.3749, + "step": 3225 + }, + { + "epoch": 0.17637330344573077, + "grad_norm": 1.5740238428115845, + "learning_rate": 1.9253412550349507e-05, + "loss": 1.4594, + "step": 3226 + }, + { + "epoch": 0.17642797588945233, + "grad_norm": 1.4998937845230103, + "learning_rate": 1.9252719580235978e-05, + "loss": 1.1972, + "step": 3227 + }, + { + "epoch": 0.17648264833317387, + "grad_norm": 1.4642369747161865, + "learning_rate": 1.9252026301152423e-05, + "loss": 1.4748, + "step": 3228 + }, + { + "epoch": 0.17653732077689543, + "grad_norm": 1.4304090738296509, + "learning_rate": 1.9251332713122006e-05, + "loss": 1.4224, + "step": 3229 + }, + { + "epoch": 0.17659199322061697, + "grad_norm": 1.9025511741638184, + "learning_rate": 1.925063881616788e-05, + "loss": 1.6747, + "step": 3230 + }, + { + "epoch": 0.17664666566433854, + "grad_norm": 2.427675485610962, + "learning_rate": 1.9249944610313212e-05, + "loss": 1.247, + "step": 3231 + }, + { + "epoch": 0.17670133810806007, + "grad_norm": 1.5467519760131836, + "learning_rate": 1.9249250095581192e-05, + "loss": 1.5206, + "step": 3232 + }, + { + "epoch": 0.17675601055178164, + "grad_norm": 1.8104939460754395, + "learning_rate": 1.9248555271995006e-05, + "loss": 1.591, + "step": 3233 + }, + { + "epoch": 0.1768106829955032, + "grad_norm": 2.3739874362945557, + "learning_rate": 1.9247860139577856e-05, + "loss": 1.2661, + "step": 3234 + }, + { + "epoch": 0.17686535543922474, + "grad_norm": 1.518894910812378, + "learning_rate": 1.9247164698352955e-05, + "loss": 1.711, + "step": 3235 + }, + { + "epoch": 0.1769200278829463, + "grad_norm": 2.137528896331787, + "learning_rate": 1.9246468948343528e-05, + "loss": 1.5067, + "step": 3236 + }, + { + "epoch": 0.17697470032666784, + "grad_norm": 1.3351250886917114, + "learning_rate": 1.9245772889572803e-05, + "loss": 1.559, + "step": 3237 + }, + { + "epoch": 0.1770293727703894, + "grad_norm": 1.3900727033615112, + "learning_rate": 1.9245076522064025e-05, + "loss": 1.6353, + "step": 3238 + }, + { + "epoch": 0.17708404521411095, + "grad_norm": 1.717495083808899, + "learning_rate": 1.924437984584045e-05, + "loss": 1.3241, + "step": 3239 + }, + { + "epoch": 0.1771387176578325, + "grad_norm": 1.676015019416809, + "learning_rate": 1.924368286092534e-05, + "loss": 1.5013, + "step": 3240 + }, + { + "epoch": 0.17719339010155408, + "grad_norm": 1.3337774276733398, + "learning_rate": 1.9242985567341964e-05, + "loss": 1.602, + "step": 3241 + }, + { + "epoch": 0.17724806254527561, + "grad_norm": 1.827361822128296, + "learning_rate": 1.9242287965113614e-05, + "loss": 1.291, + "step": 3242 + }, + { + "epoch": 0.17730273498899718, + "grad_norm": 1.9240174293518066, + "learning_rate": 1.924159005426358e-05, + "loss": 1.4748, + "step": 3243 + }, + { + "epoch": 0.17735740743271872, + "grad_norm": 1.9033247232437134, + "learning_rate": 1.924089183481517e-05, + "loss": 1.4077, + "step": 3244 + }, + { + "epoch": 0.17741207987644028, + "grad_norm": 1.4221951961517334, + "learning_rate": 1.9240193306791695e-05, + "loss": 1.461, + "step": 3245 + }, + { + "epoch": 0.17746675232016182, + "grad_norm": 1.5455210208892822, + "learning_rate": 1.923949447021648e-05, + "loss": 1.51, + "step": 3246 + }, + { + "epoch": 0.17752142476388338, + "grad_norm": 1.397103190422058, + "learning_rate": 1.9238795325112867e-05, + "loss": 1.4563, + "step": 3247 + }, + { + "epoch": 0.17757609720760495, + "grad_norm": 2.7586987018585205, + "learning_rate": 1.9238095871504198e-05, + "loss": 1.4069, + "step": 3248 + }, + { + "epoch": 0.1776307696513265, + "grad_norm": 1.5134668350219727, + "learning_rate": 1.9237396109413833e-05, + "loss": 1.563, + "step": 3249 + }, + { + "epoch": 0.17768544209504805, + "grad_norm": 1.2740963697433472, + "learning_rate": 1.9236696038865132e-05, + "loss": 1.5688, + "step": 3250 + }, + { + "epoch": 0.1777401145387696, + "grad_norm": 1.6335242986679077, + "learning_rate": 1.9235995659881478e-05, + "loss": 1.3778, + "step": 3251 + }, + { + "epoch": 0.17779478698249115, + "grad_norm": 1.2872391939163208, + "learning_rate": 1.9235294972486254e-05, + "loss": 1.2591, + "step": 3252 + }, + { + "epoch": 0.1778494594262127, + "grad_norm": 1.4486865997314453, + "learning_rate": 1.923459397670286e-05, + "loss": 1.5803, + "step": 3253 + }, + { + "epoch": 0.17790413186993426, + "grad_norm": 1.5502188205718994, + "learning_rate": 1.92338926725547e-05, + "loss": 1.5161, + "step": 3254 + }, + { + "epoch": 0.17795880431365582, + "grad_norm": 1.4851622581481934, + "learning_rate": 1.9233191060065203e-05, + "loss": 1.3248, + "step": 3255 + }, + { + "epoch": 0.17801347675737736, + "grad_norm": 1.765122413635254, + "learning_rate": 1.9232489139257782e-05, + "loss": 1.5491, + "step": 3256 + }, + { + "epoch": 0.17806814920109892, + "grad_norm": 1.780561089515686, + "learning_rate": 1.9231786910155892e-05, + "loss": 1.276, + "step": 3257 + }, + { + "epoch": 0.17812282164482046, + "grad_norm": 1.35300874710083, + "learning_rate": 1.923108437278297e-05, + "loss": 1.4535, + "step": 3258 + }, + { + "epoch": 0.17817749408854203, + "grad_norm": 1.685369610786438, + "learning_rate": 1.9230381527162477e-05, + "loss": 1.3222, + "step": 3259 + }, + { + "epoch": 0.17823216653226356, + "grad_norm": 1.4313349723815918, + "learning_rate": 1.922967837331789e-05, + "loss": 1.4517, + "step": 3260 + }, + { + "epoch": 0.17828683897598513, + "grad_norm": 1.2636795043945312, + "learning_rate": 1.9228974911272682e-05, + "loss": 1.5213, + "step": 3261 + }, + { + "epoch": 0.1783415114197067, + "grad_norm": 1.4834251403808594, + "learning_rate": 1.9228271141050346e-05, + "loss": 1.431, + "step": 3262 + }, + { + "epoch": 0.17839618386342823, + "grad_norm": 1.5427751541137695, + "learning_rate": 1.922756706267438e-05, + "loss": 1.4308, + "step": 3263 + }, + { + "epoch": 0.1784508563071498, + "grad_norm": 1.198046088218689, + "learning_rate": 1.92268626761683e-05, + "loss": 1.5281, + "step": 3264 + }, + { + "epoch": 0.17850552875087133, + "grad_norm": 1.6541039943695068, + "learning_rate": 1.9226157981555618e-05, + "loss": 1.6354, + "step": 3265 + }, + { + "epoch": 0.1785602011945929, + "grad_norm": 1.5922882556915283, + "learning_rate": 1.9225452978859876e-05, + "loss": 1.4277, + "step": 3266 + }, + { + "epoch": 0.17861487363831444, + "grad_norm": 1.8309385776519775, + "learning_rate": 1.9224747668104607e-05, + "loss": 1.4644, + "step": 3267 + }, + { + "epoch": 0.178669546082036, + "grad_norm": 1.7843718528747559, + "learning_rate": 1.922404204931337e-05, + "loss": 1.6377, + "step": 3268 + }, + { + "epoch": 0.17872421852575757, + "grad_norm": 1.3984532356262207, + "learning_rate": 1.9223336122509724e-05, + "loss": 1.2522, + "step": 3269 + }, + { + "epoch": 0.1787788909694791, + "grad_norm": 2.0832841396331787, + "learning_rate": 1.9222629887717238e-05, + "loss": 1.6826, + "step": 3270 + }, + { + "epoch": 0.17883356341320067, + "grad_norm": 1.3855361938476562, + "learning_rate": 1.92219233449595e-05, + "loss": 1.5154, + "step": 3271 + }, + { + "epoch": 0.1788882358569222, + "grad_norm": 1.6570221185684204, + "learning_rate": 1.9221216494260105e-05, + "loss": 1.3243, + "step": 3272 + }, + { + "epoch": 0.17894290830064377, + "grad_norm": 1.5317400693893433, + "learning_rate": 1.922050933564265e-05, + "loss": 1.5616, + "step": 3273 + }, + { + "epoch": 0.1789975807443653, + "grad_norm": 1.429810881614685, + "learning_rate": 1.921980186913075e-05, + "loss": 1.5108, + "step": 3274 + }, + { + "epoch": 0.17905225318808687, + "grad_norm": 1.2949539422988892, + "learning_rate": 1.9219094094748035e-05, + "loss": 1.2533, + "step": 3275 + }, + { + "epoch": 0.17910692563180844, + "grad_norm": 1.4069973230361938, + "learning_rate": 1.9218386012518134e-05, + "loss": 1.6701, + "step": 3276 + }, + { + "epoch": 0.17916159807552998, + "grad_norm": 1.3533855676651, + "learning_rate": 1.921767762246469e-05, + "loss": 1.4335, + "step": 3277 + }, + { + "epoch": 0.17921627051925154, + "grad_norm": 1.4791748523712158, + "learning_rate": 1.921696892461136e-05, + "loss": 1.5287, + "step": 3278 + }, + { + "epoch": 0.17927094296297308, + "grad_norm": 1.4950282573699951, + "learning_rate": 1.9216259918981812e-05, + "loss": 1.5378, + "step": 3279 + }, + { + "epoch": 0.17932561540669464, + "grad_norm": 1.8650727272033691, + "learning_rate": 1.9215550605599717e-05, + "loss": 1.7629, + "step": 3280 + }, + { + "epoch": 0.17938028785041618, + "grad_norm": 1.9055904150009155, + "learning_rate": 1.9214840984488763e-05, + "loss": 1.3125, + "step": 3281 + }, + { + "epoch": 0.17943496029413775, + "grad_norm": 1.6153271198272705, + "learning_rate": 1.9214131055672648e-05, + "loss": 1.2435, + "step": 3282 + }, + { + "epoch": 0.1794896327378593, + "grad_norm": 1.259818196296692, + "learning_rate": 1.921342081917507e-05, + "loss": 1.5042, + "step": 3283 + }, + { + "epoch": 0.17954430518158085, + "grad_norm": 1.3838963508605957, + "learning_rate": 1.9212710275019754e-05, + "loss": 1.603, + "step": 3284 + }, + { + "epoch": 0.17959897762530241, + "grad_norm": 1.299625277519226, + "learning_rate": 1.921199942323042e-05, + "loss": 1.5355, + "step": 3285 + }, + { + "epoch": 0.17965365006902395, + "grad_norm": 1.5989367961883545, + "learning_rate": 1.9211288263830814e-05, + "loss": 1.5068, + "step": 3286 + }, + { + "epoch": 0.17970832251274552, + "grad_norm": 2.225985527038574, + "learning_rate": 1.9210576796844676e-05, + "loss": 1.6948, + "step": 3287 + }, + { + "epoch": 0.17976299495646705, + "grad_norm": 1.5469245910644531, + "learning_rate": 1.9209865022295765e-05, + "loss": 1.3634, + "step": 3288 + }, + { + "epoch": 0.17981766740018862, + "grad_norm": 1.5397893190383911, + "learning_rate": 1.9209152940207846e-05, + "loss": 1.4139, + "step": 3289 + }, + { + "epoch": 0.17987233984391018, + "grad_norm": 1.8980765342712402, + "learning_rate": 1.9208440550604704e-05, + "loss": 1.6012, + "step": 3290 + }, + { + "epoch": 0.17992701228763172, + "grad_norm": 1.7539405822753906, + "learning_rate": 1.9207727853510126e-05, + "loss": 1.5625, + "step": 3291 + }, + { + "epoch": 0.1799816847313533, + "grad_norm": 1.2640330791473389, + "learning_rate": 1.9207014848947905e-05, + "loss": 1.4332, + "step": 3292 + }, + { + "epoch": 0.18003635717507482, + "grad_norm": 1.541965126991272, + "learning_rate": 1.9206301536941855e-05, + "loss": 1.4593, + "step": 3293 + }, + { + "epoch": 0.1800910296187964, + "grad_norm": 1.721304178237915, + "learning_rate": 1.920558791751579e-05, + "loss": 1.3988, + "step": 3294 + }, + { + "epoch": 0.18014570206251793, + "grad_norm": 2.447507381439209, + "learning_rate": 1.9204873990693546e-05, + "loss": 1.3122, + "step": 3295 + }, + { + "epoch": 0.1802003745062395, + "grad_norm": 1.2798514366149902, + "learning_rate": 1.920415975649896e-05, + "loss": 1.5286, + "step": 3296 + }, + { + "epoch": 0.18025504694996106, + "grad_norm": 1.7727090120315552, + "learning_rate": 1.9203445214955877e-05, + "loss": 1.3015, + "step": 3297 + }, + { + "epoch": 0.1803097193936826, + "grad_norm": 1.9355049133300781, + "learning_rate": 1.9202730366088165e-05, + "loss": 1.2777, + "step": 3298 + }, + { + "epoch": 0.18036439183740416, + "grad_norm": 1.5570006370544434, + "learning_rate": 1.9202015209919692e-05, + "loss": 1.6319, + "step": 3299 + }, + { + "epoch": 0.1804190642811257, + "grad_norm": 1.4739863872528076, + "learning_rate": 1.9201299746474337e-05, + "loss": 1.462, + "step": 3300 + }, + { + "epoch": 0.18047373672484726, + "grad_norm": 1.4051371812820435, + "learning_rate": 1.9200583975775996e-05, + "loss": 1.3653, + "step": 3301 + }, + { + "epoch": 0.1805284091685688, + "grad_norm": 3.4784531593322754, + "learning_rate": 1.919986789784856e-05, + "loss": 1.4795, + "step": 3302 + }, + { + "epoch": 0.18058308161229036, + "grad_norm": 1.43917715549469, + "learning_rate": 1.9199151512715948e-05, + "loss": 1.4844, + "step": 3303 + }, + { + "epoch": 0.18063775405601193, + "grad_norm": 2.132559299468994, + "learning_rate": 1.919843482040208e-05, + "loss": 1.5672, + "step": 3304 + }, + { + "epoch": 0.18069242649973347, + "grad_norm": 2.0881357192993164, + "learning_rate": 1.919771782093089e-05, + "loss": 1.3639, + "step": 3305 + }, + { + "epoch": 0.18074709894345503, + "grad_norm": 1.7254432439804077, + "learning_rate": 1.919700051432632e-05, + "loss": 1.3553, + "step": 3306 + }, + { + "epoch": 0.18080177138717657, + "grad_norm": 1.4924169778823853, + "learning_rate": 1.919628290061232e-05, + "loss": 1.5267, + "step": 3307 + }, + { + "epoch": 0.18085644383089813, + "grad_norm": 1.7627571821212769, + "learning_rate": 1.9195564979812854e-05, + "loss": 1.5716, + "step": 3308 + }, + { + "epoch": 0.18091111627461967, + "grad_norm": 1.5069000720977783, + "learning_rate": 1.9194846751951897e-05, + "loss": 1.6015, + "step": 3309 + }, + { + "epoch": 0.18096578871834124, + "grad_norm": 1.9053940773010254, + "learning_rate": 1.919412821705343e-05, + "loss": 1.2008, + "step": 3310 + }, + { + "epoch": 0.1810204611620628, + "grad_norm": 3.3829166889190674, + "learning_rate": 1.9193409375141446e-05, + "loss": 1.3516, + "step": 3311 + }, + { + "epoch": 0.18107513360578434, + "grad_norm": 1.5904197692871094, + "learning_rate": 1.919269022623995e-05, + "loss": 1.4753, + "step": 3312 + }, + { + "epoch": 0.1811298060495059, + "grad_norm": 1.4372161626815796, + "learning_rate": 1.9191970770372955e-05, + "loss": 1.3239, + "step": 3313 + }, + { + "epoch": 0.18118447849322744, + "grad_norm": 1.3366291522979736, + "learning_rate": 1.919125100756449e-05, + "loss": 1.2631, + "step": 3314 + }, + { + "epoch": 0.181239150936949, + "grad_norm": 1.5912655591964722, + "learning_rate": 1.919053093783858e-05, + "loss": 1.5807, + "step": 3315 + }, + { + "epoch": 0.18129382338067057, + "grad_norm": 1.35109543800354, + "learning_rate": 1.9189810561219282e-05, + "loss": 1.4987, + "step": 3316 + }, + { + "epoch": 0.1813484958243921, + "grad_norm": 1.843475103378296, + "learning_rate": 1.918908987773064e-05, + "loss": 1.2186, + "step": 3317 + }, + { + "epoch": 0.18140316826811367, + "grad_norm": 1.4113112688064575, + "learning_rate": 1.918836888739673e-05, + "loss": 1.6351, + "step": 3318 + }, + { + "epoch": 0.1814578407118352, + "grad_norm": 1.409198522567749, + "learning_rate": 1.9187647590241615e-05, + "loss": 1.562, + "step": 3319 + }, + { + "epoch": 0.18151251315555678, + "grad_norm": 2.434901475906372, + "learning_rate": 1.918692598628939e-05, + "loss": 1.2182, + "step": 3320 + }, + { + "epoch": 0.1815671855992783, + "grad_norm": 1.840852975845337, + "learning_rate": 1.918620407556415e-05, + "loss": 1.6342, + "step": 3321 + }, + { + "epoch": 0.18162185804299988, + "grad_norm": 1.29874587059021, + "learning_rate": 1.918548185809e-05, + "loss": 1.589, + "step": 3322 + }, + { + "epoch": 0.18167653048672144, + "grad_norm": 1.6525452136993408, + "learning_rate": 1.918475933389105e-05, + "loss": 1.4607, + "step": 3323 + }, + { + "epoch": 0.18173120293044298, + "grad_norm": 1.5034356117248535, + "learning_rate": 1.918403650299144e-05, + "loss": 1.3815, + "step": 3324 + }, + { + "epoch": 0.18178587537416455, + "grad_norm": 1.7682437896728516, + "learning_rate": 1.91833133654153e-05, + "loss": 1.3573, + "step": 3325 + }, + { + "epoch": 0.18184054781788608, + "grad_norm": 1.6323617696762085, + "learning_rate": 1.9182589921186777e-05, + "loss": 1.3887, + "step": 3326 + }, + { + "epoch": 0.18189522026160765, + "grad_norm": 1.4020980596542358, + "learning_rate": 1.9181866170330025e-05, + "loss": 1.4361, + "step": 3327 + }, + { + "epoch": 0.18194989270532919, + "grad_norm": 1.5253915786743164, + "learning_rate": 1.918114211286922e-05, + "loss": 1.2625, + "step": 3328 + }, + { + "epoch": 0.18200456514905075, + "grad_norm": 1.511802315711975, + "learning_rate": 1.9180417748828534e-05, + "loss": 1.5762, + "step": 3329 + }, + { + "epoch": 0.18205923759277232, + "grad_norm": 1.7379175424575806, + "learning_rate": 1.917969307823216e-05, + "loss": 1.2631, + "step": 3330 + }, + { + "epoch": 0.18211391003649385, + "grad_norm": 1.6505980491638184, + "learning_rate": 1.9178968101104285e-05, + "loss": 1.5326, + "step": 3331 + }, + { + "epoch": 0.18216858248021542, + "grad_norm": 1.3909515142440796, + "learning_rate": 1.9178242817469133e-05, + "loss": 1.6283, + "step": 3332 + }, + { + "epoch": 0.18222325492393696, + "grad_norm": 1.3827451467514038, + "learning_rate": 1.9177517227350914e-05, + "loss": 1.5503, + "step": 3333 + }, + { + "epoch": 0.18227792736765852, + "grad_norm": 1.4345653057098389, + "learning_rate": 1.9176791330773858e-05, + "loss": 1.2849, + "step": 3334 + }, + { + "epoch": 0.18233259981138006, + "grad_norm": 1.4410405158996582, + "learning_rate": 1.9176065127762206e-05, + "loss": 1.7317, + "step": 3335 + }, + { + "epoch": 0.18238727225510162, + "grad_norm": 1.5690194368362427, + "learning_rate": 1.9175338618340208e-05, + "loss": 1.4767, + "step": 3336 + }, + { + "epoch": 0.1824419446988232, + "grad_norm": 1.6544238328933716, + "learning_rate": 1.9174611802532124e-05, + "loss": 1.3748, + "step": 3337 + }, + { + "epoch": 0.18249661714254473, + "grad_norm": 1.600403070449829, + "learning_rate": 1.917388468036222e-05, + "loss": 1.7484, + "step": 3338 + }, + { + "epoch": 0.1825512895862663, + "grad_norm": 1.7901115417480469, + "learning_rate": 1.917315725185478e-05, + "loss": 1.2992, + "step": 3339 + }, + { + "epoch": 0.18260596202998783, + "grad_norm": 1.4278093576431274, + "learning_rate": 1.9172429517034094e-05, + "loss": 1.4582, + "step": 3340 + }, + { + "epoch": 0.1826606344737094, + "grad_norm": 1.5291253328323364, + "learning_rate": 1.9171701475924466e-05, + "loss": 1.4804, + "step": 3341 + }, + { + "epoch": 0.18271530691743093, + "grad_norm": 1.784932255744934, + "learning_rate": 1.91709731285502e-05, + "loss": 1.5633, + "step": 3342 + }, + { + "epoch": 0.1827699793611525, + "grad_norm": 1.5845814943313599, + "learning_rate": 1.917024447493562e-05, + "loss": 1.1827, + "step": 3343 + }, + { + "epoch": 0.18282465180487406, + "grad_norm": 1.951209306716919, + "learning_rate": 1.9169515515105063e-05, + "loss": 1.6011, + "step": 3344 + }, + { + "epoch": 0.1828793242485956, + "grad_norm": 1.3347384929656982, + "learning_rate": 1.9168786249082862e-05, + "loss": 1.3881, + "step": 3345 + }, + { + "epoch": 0.18293399669231716, + "grad_norm": 1.5671006441116333, + "learning_rate": 1.9168056676893373e-05, + "loss": 1.6939, + "step": 3346 + }, + { + "epoch": 0.1829886691360387, + "grad_norm": 1.4984257221221924, + "learning_rate": 1.916732679856096e-05, + "loss": 1.7131, + "step": 3347 + }, + { + "epoch": 0.18304334157976027, + "grad_norm": 1.5090737342834473, + "learning_rate": 1.9166596614109993e-05, + "loss": 1.4815, + "step": 3348 + }, + { + "epoch": 0.1830980140234818, + "grad_norm": 1.888500690460205, + "learning_rate": 1.9165866123564854e-05, + "loss": 1.5726, + "step": 3349 + }, + { + "epoch": 0.18315268646720337, + "grad_norm": 1.346730351448059, + "learning_rate": 1.9165135326949937e-05, + "loss": 1.3516, + "step": 3350 + }, + { + "epoch": 0.18320735891092493, + "grad_norm": 1.6677134037017822, + "learning_rate": 1.9164404224289645e-05, + "loss": 1.4075, + "step": 3351 + }, + { + "epoch": 0.18326203135464647, + "grad_norm": 1.446661114692688, + "learning_rate": 1.9163672815608392e-05, + "loss": 1.656, + "step": 3352 + }, + { + "epoch": 0.18331670379836804, + "grad_norm": 1.812454104423523, + "learning_rate": 1.9162941100930597e-05, + "loss": 1.7391, + "step": 3353 + }, + { + "epoch": 0.18337137624208957, + "grad_norm": 1.6173847913742065, + "learning_rate": 1.9162209080280704e-05, + "loss": 1.5671, + "step": 3354 + }, + { + "epoch": 0.18342604868581114, + "grad_norm": 1.472896933555603, + "learning_rate": 1.9161476753683144e-05, + "loss": 1.343, + "step": 3355 + }, + { + "epoch": 0.18348072112953268, + "grad_norm": 1.400084376335144, + "learning_rate": 1.916074412116238e-05, + "loss": 1.1718, + "step": 3356 + }, + { + "epoch": 0.18353539357325424, + "grad_norm": 1.4167190790176392, + "learning_rate": 1.916001118274287e-05, + "loss": 1.589, + "step": 3357 + }, + { + "epoch": 0.1835900660169758, + "grad_norm": 1.429132342338562, + "learning_rate": 1.9159277938449095e-05, + "loss": 1.4617, + "step": 3358 + }, + { + "epoch": 0.18364473846069734, + "grad_norm": 2.2118611335754395, + "learning_rate": 1.9158544388305534e-05, + "loss": 1.5306, + "step": 3359 + }, + { + "epoch": 0.1836994109044189, + "grad_norm": 1.6091748476028442, + "learning_rate": 1.9157810532336686e-05, + "loss": 1.4609, + "step": 3360 + }, + { + "epoch": 0.18375408334814045, + "grad_norm": 1.2905519008636475, + "learning_rate": 1.9157076370567056e-05, + "loss": 1.7437, + "step": 3361 + }, + { + "epoch": 0.183808755791862, + "grad_norm": 2.0340158939361572, + "learning_rate": 1.9156341903021158e-05, + "loss": 1.3822, + "step": 3362 + }, + { + "epoch": 0.18386342823558355, + "grad_norm": 1.6496366262435913, + "learning_rate": 1.915560712972352e-05, + "loss": 1.3749, + "step": 3363 + }, + { + "epoch": 0.1839181006793051, + "grad_norm": 1.45815908908844, + "learning_rate": 1.915487205069867e-05, + "loss": 1.504, + "step": 3364 + }, + { + "epoch": 0.18397277312302668, + "grad_norm": 1.4465391635894775, + "learning_rate": 1.9154136665971163e-05, + "loss": 1.7552, + "step": 3365 + }, + { + "epoch": 0.18402744556674822, + "grad_norm": 1.6013439893722534, + "learning_rate": 1.915340097556555e-05, + "loss": 1.3724, + "step": 3366 + }, + { + "epoch": 0.18408211801046978, + "grad_norm": 2.0461232662200928, + "learning_rate": 1.9152664979506405e-05, + "loss": 1.6319, + "step": 3367 + }, + { + "epoch": 0.18413679045419132, + "grad_norm": 1.2604738473892212, + "learning_rate": 1.915192867781829e-05, + "loss": 1.4437, + "step": 3368 + }, + { + "epoch": 0.18419146289791288, + "grad_norm": 1.619309663772583, + "learning_rate": 1.915119207052581e-05, + "loss": 1.4002, + "step": 3369 + }, + { + "epoch": 0.18424613534163442, + "grad_norm": 1.6141358613967896, + "learning_rate": 1.9150455157653546e-05, + "loss": 1.4316, + "step": 3370 + }, + { + "epoch": 0.18430080778535599, + "grad_norm": 1.4060410261154175, + "learning_rate": 1.9149717939226114e-05, + "loss": 1.5061, + "step": 3371 + }, + { + "epoch": 0.18435548022907755, + "grad_norm": 2.314542055130005, + "learning_rate": 1.9148980415268132e-05, + "loss": 1.4739, + "step": 3372 + }, + { + "epoch": 0.1844101526727991, + "grad_norm": 1.3194115161895752, + "learning_rate": 1.914824258580422e-05, + "loss": 1.3307, + "step": 3373 + }, + { + "epoch": 0.18446482511652065, + "grad_norm": 1.4840080738067627, + "learning_rate": 1.9147504450859026e-05, + "loss": 1.5069, + "step": 3374 + }, + { + "epoch": 0.1845194975602422, + "grad_norm": 1.3533554077148438, + "learning_rate": 1.9146766010457192e-05, + "loss": 1.4493, + "step": 3375 + }, + { + "epoch": 0.18457417000396376, + "grad_norm": 1.5156699419021606, + "learning_rate": 1.9146027264623376e-05, + "loss": 1.5324, + "step": 3376 + }, + { + "epoch": 0.1846288424476853, + "grad_norm": 1.4197410345077515, + "learning_rate": 1.9145288213382248e-05, + "loss": 1.3683, + "step": 3377 + }, + { + "epoch": 0.18468351489140686, + "grad_norm": 1.3331983089447021, + "learning_rate": 1.914454885675849e-05, + "loss": 1.6673, + "step": 3378 + }, + { + "epoch": 0.18473818733512842, + "grad_norm": 1.2230205535888672, + "learning_rate": 1.914380919477678e-05, + "loss": 1.4834, + "step": 3379 + }, + { + "epoch": 0.18479285977884996, + "grad_norm": 1.2795162200927734, + "learning_rate": 1.914306922746183e-05, + "loss": 1.7546, + "step": 3380 + }, + { + "epoch": 0.18484753222257153, + "grad_norm": 1.5837029218673706, + "learning_rate": 1.9142328954838342e-05, + "loss": 1.5393, + "step": 3381 + }, + { + "epoch": 0.18490220466629306, + "grad_norm": 1.4846121072769165, + "learning_rate": 1.914158837693104e-05, + "loss": 1.3412, + "step": 3382 + }, + { + "epoch": 0.18495687711001463, + "grad_norm": 1.5496145486831665, + "learning_rate": 1.9140847493764646e-05, + "loss": 1.5529, + "step": 3383 + }, + { + "epoch": 0.18501154955373617, + "grad_norm": 1.328006625175476, + "learning_rate": 1.9140106305363905e-05, + "loss": 1.5822, + "step": 3384 + }, + { + "epoch": 0.18506622199745773, + "grad_norm": 1.3085969686508179, + "learning_rate": 1.913936481175357e-05, + "loss": 1.6657, + "step": 3385 + }, + { + "epoch": 0.1851208944411793, + "grad_norm": 1.5083677768707275, + "learning_rate": 1.9138623012958394e-05, + "loss": 1.2355, + "step": 3386 + }, + { + "epoch": 0.18517556688490083, + "grad_norm": 1.51302170753479, + "learning_rate": 1.9137880909003155e-05, + "loss": 1.2006, + "step": 3387 + }, + { + "epoch": 0.1852302393286224, + "grad_norm": 1.5713907480239868, + "learning_rate": 1.9137138499912626e-05, + "loss": 1.5619, + "step": 3388 + }, + { + "epoch": 0.18528491177234394, + "grad_norm": 1.7342817783355713, + "learning_rate": 1.9136395785711602e-05, + "loss": 1.4675, + "step": 3389 + }, + { + "epoch": 0.1853395842160655, + "grad_norm": 1.7318912744522095, + "learning_rate": 1.9135652766424887e-05, + "loss": 1.4597, + "step": 3390 + }, + { + "epoch": 0.18539425665978704, + "grad_norm": 1.540672779083252, + "learning_rate": 1.9134909442077286e-05, + "loss": 1.5337, + "step": 3391 + }, + { + "epoch": 0.1854489291035086, + "grad_norm": 1.5148842334747314, + "learning_rate": 1.9134165812693623e-05, + "loss": 1.322, + "step": 3392 + }, + { + "epoch": 0.18550360154723017, + "grad_norm": 1.4084275960922241, + "learning_rate": 1.913342187829873e-05, + "loss": 1.3816, + "step": 3393 + }, + { + "epoch": 0.1855582739909517, + "grad_norm": 1.1876728534698486, + "learning_rate": 1.913267763891745e-05, + "loss": 1.4809, + "step": 3394 + }, + { + "epoch": 0.18561294643467327, + "grad_norm": 1.2226970195770264, + "learning_rate": 1.913193309457463e-05, + "loss": 1.5199, + "step": 3395 + }, + { + "epoch": 0.1856676188783948, + "grad_norm": 1.7787103652954102, + "learning_rate": 1.9131188245295135e-05, + "loss": 1.6986, + "step": 3396 + }, + { + "epoch": 0.18572229132211637, + "grad_norm": 1.544915795326233, + "learning_rate": 1.9130443091103838e-05, + "loss": 1.55, + "step": 3397 + }, + { + "epoch": 0.1857769637658379, + "grad_norm": 2.119281530380249, + "learning_rate": 1.9129697632025623e-05, + "loss": 1.385, + "step": 3398 + }, + { + "epoch": 0.18583163620955948, + "grad_norm": 1.462871789932251, + "learning_rate": 1.912895186808538e-05, + "loss": 1.4466, + "step": 3399 + }, + { + "epoch": 0.18588630865328104, + "grad_norm": 1.3621313571929932, + "learning_rate": 1.9128205799308012e-05, + "loss": 1.4184, + "step": 3400 + }, + { + "epoch": 0.18594098109700258, + "grad_norm": 1.6498647928237915, + "learning_rate": 1.912745942571843e-05, + "loss": 1.5137, + "step": 3401 + }, + { + "epoch": 0.18599565354072414, + "grad_norm": 1.2973061800003052, + "learning_rate": 1.9126712747341564e-05, + "loss": 1.4497, + "step": 3402 + }, + { + "epoch": 0.18605032598444568, + "grad_norm": 1.5156092643737793, + "learning_rate": 1.912596576420234e-05, + "loss": 1.5973, + "step": 3403 + }, + { + "epoch": 0.18610499842816725, + "grad_norm": 1.8710601329803467, + "learning_rate": 1.9125218476325705e-05, + "loss": 1.5493, + "step": 3404 + }, + { + "epoch": 0.18615967087188878, + "grad_norm": 1.4419198036193848, + "learning_rate": 1.9124470883736614e-05, + "loss": 1.7293, + "step": 3405 + }, + { + "epoch": 0.18621434331561035, + "grad_norm": 1.6464959383010864, + "learning_rate": 1.9123722986460023e-05, + "loss": 1.4952, + "step": 3406 + }, + { + "epoch": 0.1862690157593319, + "grad_norm": 1.3839919567108154, + "learning_rate": 1.9122974784520918e-05, + "loss": 1.479, + "step": 3407 + }, + { + "epoch": 0.18632368820305345, + "grad_norm": 1.7450575828552246, + "learning_rate": 1.9122226277944276e-05, + "loss": 1.4894, + "step": 3408 + }, + { + "epoch": 0.18637836064677502, + "grad_norm": 1.6884589195251465, + "learning_rate": 1.912147746675509e-05, + "loss": 1.4629, + "step": 3409 + }, + { + "epoch": 0.18643303309049655, + "grad_norm": 1.867673397064209, + "learning_rate": 1.912072835097837e-05, + "loss": 1.3861, + "step": 3410 + }, + { + "epoch": 0.18648770553421812, + "grad_norm": 1.8583717346191406, + "learning_rate": 1.9119978930639127e-05, + "loss": 1.3895, + "step": 3411 + }, + { + "epoch": 0.18654237797793966, + "grad_norm": 1.6268526315689087, + "learning_rate": 1.9119229205762386e-05, + "loss": 1.3728, + "step": 3412 + }, + { + "epoch": 0.18659705042166122, + "grad_norm": 1.6758079528808594, + "learning_rate": 1.9118479176373184e-05, + "loss": 1.4425, + "step": 3413 + }, + { + "epoch": 0.18665172286538279, + "grad_norm": 1.2790405750274658, + "learning_rate": 1.9117728842496562e-05, + "loss": 1.454, + "step": 3414 + }, + { + "epoch": 0.18670639530910432, + "grad_norm": 1.5380148887634277, + "learning_rate": 1.9116978204157583e-05, + "loss": 1.7493, + "step": 3415 + }, + { + "epoch": 0.1867610677528259, + "grad_norm": 1.5196874141693115, + "learning_rate": 1.9116227261381305e-05, + "loss": 1.4899, + "step": 3416 + }, + { + "epoch": 0.18681574019654743, + "grad_norm": 1.7729175090789795, + "learning_rate": 1.9115476014192807e-05, + "loss": 1.3411, + "step": 3417 + }, + { + "epoch": 0.186870412640269, + "grad_norm": 1.5007703304290771, + "learning_rate": 1.911472446261718e-05, + "loss": 1.4476, + "step": 3418 + }, + { + "epoch": 0.18692508508399056, + "grad_norm": 1.642708420753479, + "learning_rate": 1.911397260667951e-05, + "loss": 1.5604, + "step": 3419 + }, + { + "epoch": 0.1869797575277121, + "grad_norm": 1.4495059251785278, + "learning_rate": 1.9113220446404906e-05, + "loss": 1.3746, + "step": 3420 + }, + { + "epoch": 0.18703442997143366, + "grad_norm": 1.4472689628601074, + "learning_rate": 1.911246798181849e-05, + "loss": 1.4399, + "step": 3421 + }, + { + "epoch": 0.1870891024151552, + "grad_norm": 1.306727409362793, + "learning_rate": 1.9111715212945384e-05, + "loss": 1.7218, + "step": 3422 + }, + { + "epoch": 0.18714377485887676, + "grad_norm": 1.449446201324463, + "learning_rate": 1.9110962139810726e-05, + "loss": 1.5329, + "step": 3423 + }, + { + "epoch": 0.1871984473025983, + "grad_norm": 1.489635705947876, + "learning_rate": 1.9110208762439662e-05, + "loss": 1.5831, + "step": 3424 + }, + { + "epoch": 0.18725311974631986, + "grad_norm": 1.4527931213378906, + "learning_rate": 1.910945508085735e-05, + "loss": 1.6614, + "step": 3425 + }, + { + "epoch": 0.18730779219004143, + "grad_norm": 1.6067321300506592, + "learning_rate": 1.910870109508896e-05, + "loss": 1.3578, + "step": 3426 + }, + { + "epoch": 0.18736246463376297, + "grad_norm": 1.5336707830429077, + "learning_rate": 1.910794680515966e-05, + "loss": 1.611, + "step": 3427 + }, + { + "epoch": 0.18741713707748453, + "grad_norm": 1.6033697128295898, + "learning_rate": 1.9107192211094648e-05, + "loss": 1.5708, + "step": 3428 + }, + { + "epoch": 0.18747180952120607, + "grad_norm": 1.2585095167160034, + "learning_rate": 1.9106437312919116e-05, + "loss": 1.6552, + "step": 3429 + }, + { + "epoch": 0.18752648196492763, + "grad_norm": 1.1341807842254639, + "learning_rate": 1.9105682110658275e-05, + "loss": 1.4986, + "step": 3430 + }, + { + "epoch": 0.18758115440864917, + "grad_norm": 1.3472760915756226, + "learning_rate": 1.910492660433734e-05, + "loss": 1.4364, + "step": 3431 + }, + { + "epoch": 0.18763582685237074, + "grad_norm": 1.2320257425308228, + "learning_rate": 1.910417079398154e-05, + "loss": 1.7133, + "step": 3432 + }, + { + "epoch": 0.1876904992960923, + "grad_norm": 1.8307774066925049, + "learning_rate": 1.9103414679616112e-05, + "loss": 1.4245, + "step": 3433 + }, + { + "epoch": 0.18774517173981384, + "grad_norm": 1.2148959636688232, + "learning_rate": 1.9102658261266307e-05, + "loss": 1.5165, + "step": 3434 + }, + { + "epoch": 0.1877998441835354, + "grad_norm": 1.253710150718689, + "learning_rate": 1.9101901538957385e-05, + "loss": 1.5187, + "step": 3435 + }, + { + "epoch": 0.18785451662725694, + "grad_norm": 2.126946210861206, + "learning_rate": 1.910114451271461e-05, + "loss": 1.5485, + "step": 3436 + }, + { + "epoch": 0.1879091890709785, + "grad_norm": 1.4969432353973389, + "learning_rate": 1.9100387182563263e-05, + "loss": 1.6348, + "step": 3437 + }, + { + "epoch": 0.18796386151470004, + "grad_norm": 1.514009714126587, + "learning_rate": 1.909962954852863e-05, + "loss": 1.1342, + "step": 3438 + }, + { + "epoch": 0.1880185339584216, + "grad_norm": 1.7954516410827637, + "learning_rate": 1.909887161063602e-05, + "loss": 1.5971, + "step": 3439 + }, + { + "epoch": 0.18807320640214317, + "grad_norm": 1.4560810327529907, + "learning_rate": 1.9098113368910734e-05, + "loss": 1.5836, + "step": 3440 + }, + { + "epoch": 0.1881278788458647, + "grad_norm": 1.3636008501052856, + "learning_rate": 1.9097354823378094e-05, + "loss": 1.5926, + "step": 3441 + }, + { + "epoch": 0.18818255128958628, + "grad_norm": 1.8828555345535278, + "learning_rate": 1.9096595974063426e-05, + "loss": 1.4695, + "step": 3442 + }, + { + "epoch": 0.1882372237333078, + "grad_norm": 1.5854731798171997, + "learning_rate": 1.9095836820992074e-05, + "loss": 1.4124, + "step": 3443 + }, + { + "epoch": 0.18829189617702938, + "grad_norm": 1.2778760194778442, + "learning_rate": 1.9095077364189388e-05, + "loss": 1.5382, + "step": 3444 + }, + { + "epoch": 0.18834656862075091, + "grad_norm": 1.4812716245651245, + "learning_rate": 1.9094317603680725e-05, + "loss": 1.358, + "step": 3445 + }, + { + "epoch": 0.18840124106447248, + "grad_norm": 1.337017297744751, + "learning_rate": 1.9093557539491458e-05, + "loss": 1.2651, + "step": 3446 + }, + { + "epoch": 0.18845591350819405, + "grad_norm": 1.3703755140304565, + "learning_rate": 1.909279717164696e-05, + "loss": 1.6115, + "step": 3447 + }, + { + "epoch": 0.18851058595191558, + "grad_norm": 2.142211437225342, + "learning_rate": 1.9092036500172635e-05, + "loss": 1.3417, + "step": 3448 + }, + { + "epoch": 0.18856525839563715, + "grad_norm": 1.4279791116714478, + "learning_rate": 1.909127552509387e-05, + "loss": 1.3954, + "step": 3449 + }, + { + "epoch": 0.18861993083935868, + "grad_norm": 1.7467707395553589, + "learning_rate": 1.9090514246436085e-05, + "loss": 1.5362, + "step": 3450 + }, + { + "epoch": 0.18867460328308025, + "grad_norm": 1.8337268829345703, + "learning_rate": 1.9089752664224697e-05, + "loss": 1.5233, + "step": 3451 + }, + { + "epoch": 0.1887292757268018, + "grad_norm": 1.4889376163482666, + "learning_rate": 1.908899077848514e-05, + "loss": 1.4464, + "step": 3452 + }, + { + "epoch": 0.18878394817052335, + "grad_norm": 1.5354238748550415, + "learning_rate": 1.9088228589242855e-05, + "loss": 1.3045, + "step": 3453 + }, + { + "epoch": 0.18883862061424492, + "grad_norm": 1.52393639087677, + "learning_rate": 1.9087466096523287e-05, + "loss": 1.3951, + "step": 3454 + }, + { + "epoch": 0.18889329305796645, + "grad_norm": 1.4212796688079834, + "learning_rate": 1.9086703300351903e-05, + "loss": 1.4385, + "step": 3455 + }, + { + "epoch": 0.18894796550168802, + "grad_norm": 1.6745541095733643, + "learning_rate": 1.908594020075417e-05, + "loss": 1.6959, + "step": 3456 + }, + { + "epoch": 0.18900263794540956, + "grad_norm": 1.5249885320663452, + "learning_rate": 1.9085176797755575e-05, + "loss": 1.2185, + "step": 3457 + }, + { + "epoch": 0.18905731038913112, + "grad_norm": 1.472884178161621, + "learning_rate": 1.908441309138161e-05, + "loss": 1.4497, + "step": 3458 + }, + { + "epoch": 0.18911198283285266, + "grad_norm": 1.5570663213729858, + "learning_rate": 1.9083649081657773e-05, + "loss": 1.4595, + "step": 3459 + }, + { + "epoch": 0.18916665527657422, + "grad_norm": 1.802646279335022, + "learning_rate": 1.9082884768609578e-05, + "loss": 1.3905, + "step": 3460 + }, + { + "epoch": 0.1892213277202958, + "grad_norm": 1.2880007028579712, + "learning_rate": 1.9082120152262544e-05, + "loss": 1.2571, + "step": 3461 + }, + { + "epoch": 0.18927600016401733, + "grad_norm": 1.5402919054031372, + "learning_rate": 1.908135523264221e-05, + "loss": 1.3891, + "step": 3462 + }, + { + "epoch": 0.1893306726077389, + "grad_norm": 1.599230170249939, + "learning_rate": 1.9080590009774114e-05, + "loss": 1.5273, + "step": 3463 + }, + { + "epoch": 0.18938534505146043, + "grad_norm": 1.4754821062088013, + "learning_rate": 1.9079824483683808e-05, + "loss": 1.3307, + "step": 3464 + }, + { + "epoch": 0.189440017495182, + "grad_norm": 1.7228623628616333, + "learning_rate": 1.907905865439686e-05, + "loss": 1.4977, + "step": 3465 + }, + { + "epoch": 0.18949468993890353, + "grad_norm": 1.5406652688980103, + "learning_rate": 1.9078292521938834e-05, + "loss": 1.5431, + "step": 3466 + }, + { + "epoch": 0.1895493623826251, + "grad_norm": 1.4736583232879639, + "learning_rate": 1.907752608633532e-05, + "loss": 1.4506, + "step": 3467 + }, + { + "epoch": 0.18960403482634666, + "grad_norm": 1.700006365776062, + "learning_rate": 1.9076759347611908e-05, + "loss": 1.4421, + "step": 3468 + }, + { + "epoch": 0.1896587072700682, + "grad_norm": 1.4830505847930908, + "learning_rate": 1.90759923057942e-05, + "loss": 1.5617, + "step": 3469 + }, + { + "epoch": 0.18971337971378976, + "grad_norm": 1.453869342803955, + "learning_rate": 1.9075224960907815e-05, + "loss": 1.5109, + "step": 3470 + }, + { + "epoch": 0.1897680521575113, + "grad_norm": 1.2015756368637085, + "learning_rate": 1.9074457312978373e-05, + "loss": 1.5229, + "step": 3471 + }, + { + "epoch": 0.18982272460123287, + "grad_norm": 1.4882715940475464, + "learning_rate": 1.9073689362031504e-05, + "loss": 1.2892, + "step": 3472 + }, + { + "epoch": 0.1898773970449544, + "grad_norm": 1.3120334148406982, + "learning_rate": 1.907292110809286e-05, + "loss": 1.3601, + "step": 3473 + }, + { + "epoch": 0.18993206948867597, + "grad_norm": 1.706580638885498, + "learning_rate": 1.9072152551188085e-05, + "loss": 1.3063, + "step": 3474 + }, + { + "epoch": 0.18998674193239753, + "grad_norm": 1.8983404636383057, + "learning_rate": 1.907138369134285e-05, + "loss": 1.496, + "step": 3475 + }, + { + "epoch": 0.19004141437611907, + "grad_norm": 1.8825441598892212, + "learning_rate": 1.907061452858283e-05, + "loss": 1.3882, + "step": 3476 + }, + { + "epoch": 0.19009608681984064, + "grad_norm": 1.462575912475586, + "learning_rate": 1.90698450629337e-05, + "loss": 1.6632, + "step": 3477 + }, + { + "epoch": 0.19015075926356217, + "grad_norm": 1.653853178024292, + "learning_rate": 1.9069075294421163e-05, + "loss": 1.2919, + "step": 3478 + }, + { + "epoch": 0.19020543170728374, + "grad_norm": 1.4191863536834717, + "learning_rate": 1.9068305223070923e-05, + "loss": 1.3868, + "step": 3479 + }, + { + "epoch": 0.19026010415100528, + "grad_norm": 1.8875428438186646, + "learning_rate": 1.9067534848908688e-05, + "loss": 1.2452, + "step": 3480 + }, + { + "epoch": 0.19031477659472684, + "grad_norm": 1.7905610799789429, + "learning_rate": 1.9066764171960187e-05, + "loss": 1.452, + "step": 3481 + }, + { + "epoch": 0.1903694490384484, + "grad_norm": 1.451296091079712, + "learning_rate": 1.9065993192251157e-05, + "loss": 1.5375, + "step": 3482 + }, + { + "epoch": 0.19042412148216994, + "grad_norm": 1.6760879755020142, + "learning_rate": 1.9065221909807337e-05, + "loss": 1.4774, + "step": 3483 + }, + { + "epoch": 0.1904787939258915, + "grad_norm": 1.297042727470398, + "learning_rate": 1.906445032465449e-05, + "loss": 1.7166, + "step": 3484 + }, + { + "epoch": 0.19053346636961305, + "grad_norm": 1.5842530727386475, + "learning_rate": 1.9063678436818372e-05, + "loss": 1.4972, + "step": 3485 + }, + { + "epoch": 0.1905881388133346, + "grad_norm": 1.4761584997177124, + "learning_rate": 1.9062906246324767e-05, + "loss": 1.4377, + "step": 3486 + }, + { + "epoch": 0.19064281125705615, + "grad_norm": 1.4117100238800049, + "learning_rate": 1.9062133753199452e-05, + "loss": 1.3991, + "step": 3487 + }, + { + "epoch": 0.19069748370077771, + "grad_norm": 1.5843571424484253, + "learning_rate": 1.906136095746823e-05, + "loss": 1.4268, + "step": 3488 + }, + { + "epoch": 0.19075215614449928, + "grad_norm": 1.4169384241104126, + "learning_rate": 1.90605878591569e-05, + "loss": 1.4714, + "step": 3489 + }, + { + "epoch": 0.19080682858822082, + "grad_norm": 1.3945852518081665, + "learning_rate": 1.9059814458291277e-05, + "loss": 1.3702, + "step": 3490 + }, + { + "epoch": 0.19086150103194238, + "grad_norm": 1.6001776456832886, + "learning_rate": 1.9059040754897194e-05, + "loss": 1.7457, + "step": 3491 + }, + { + "epoch": 0.19091617347566392, + "grad_norm": 1.5901851654052734, + "learning_rate": 1.905826674900048e-05, + "loss": 1.6811, + "step": 3492 + }, + { + "epoch": 0.19097084591938548, + "grad_norm": 1.6377756595611572, + "learning_rate": 1.9057492440626983e-05, + "loss": 1.308, + "step": 3493 + }, + { + "epoch": 0.19102551836310702, + "grad_norm": 1.5153090953826904, + "learning_rate": 1.9056717829802562e-05, + "loss": 1.3533, + "step": 3494 + }, + { + "epoch": 0.1910801908068286, + "grad_norm": 1.3397449254989624, + "learning_rate": 1.9055942916553078e-05, + "loss": 1.463, + "step": 3495 + }, + { + "epoch": 0.19113486325055015, + "grad_norm": 1.317204236984253, + "learning_rate": 1.9055167700904412e-05, + "loss": 1.302, + "step": 3496 + }, + { + "epoch": 0.1911895356942717, + "grad_norm": 1.1755748987197876, + "learning_rate": 1.9054392182882446e-05, + "loss": 1.6788, + "step": 3497 + }, + { + "epoch": 0.19124420813799325, + "grad_norm": 1.6745105981826782, + "learning_rate": 1.905361636251308e-05, + "loss": 1.5251, + "step": 3498 + }, + { + "epoch": 0.1912988805817148, + "grad_norm": 1.401665210723877, + "learning_rate": 1.9052840239822218e-05, + "loss": 1.2966, + "step": 3499 + }, + { + "epoch": 0.19135355302543636, + "grad_norm": 1.165949821472168, + "learning_rate": 1.9052063814835774e-05, + "loss": 1.524, + "step": 3500 + }, + { + "epoch": 0.1914082254691579, + "grad_norm": 1.9890329837799072, + "learning_rate": 1.905128708757968e-05, + "loss": 1.4748, + "step": 3501 + }, + { + "epoch": 0.19146289791287946, + "grad_norm": 1.4226189851760864, + "learning_rate": 1.905051005807987e-05, + "loss": 1.4239, + "step": 3502 + }, + { + "epoch": 0.19151757035660102, + "grad_norm": 1.645899772644043, + "learning_rate": 1.9049732726362295e-05, + "loss": 1.4147, + "step": 3503 + }, + { + "epoch": 0.19157224280032256, + "grad_norm": 1.32676100730896, + "learning_rate": 1.9048955092452905e-05, + "loss": 1.494, + "step": 3504 + }, + { + "epoch": 0.19162691524404413, + "grad_norm": 1.4970805644989014, + "learning_rate": 1.9048177156377673e-05, + "loss": 1.4741, + "step": 3505 + }, + { + "epoch": 0.19168158768776566, + "grad_norm": 1.8282662630081177, + "learning_rate": 1.904739891816257e-05, + "loss": 1.2934, + "step": 3506 + }, + { + "epoch": 0.19173626013148723, + "grad_norm": 1.649390697479248, + "learning_rate": 1.9046620377833588e-05, + "loss": 1.2639, + "step": 3507 + }, + { + "epoch": 0.19179093257520877, + "grad_norm": 1.3538563251495361, + "learning_rate": 1.9045841535416724e-05, + "loss": 1.6583, + "step": 3508 + }, + { + "epoch": 0.19184560501893033, + "grad_norm": 1.5144422054290771, + "learning_rate": 1.9045062390937985e-05, + "loss": 1.1778, + "step": 3509 + }, + { + "epoch": 0.1919002774626519, + "grad_norm": 1.298651099205017, + "learning_rate": 1.9044282944423387e-05, + "loss": 1.3779, + "step": 3510 + }, + { + "epoch": 0.19195494990637343, + "grad_norm": 1.6223198175430298, + "learning_rate": 1.9043503195898957e-05, + "loss": 1.3011, + "step": 3511 + }, + { + "epoch": 0.192009622350095, + "grad_norm": 1.2054330110549927, + "learning_rate": 1.9042723145390738e-05, + "loss": 1.4149, + "step": 3512 + }, + { + "epoch": 0.19206429479381654, + "grad_norm": 1.3205583095550537, + "learning_rate": 1.9041942792924767e-05, + "loss": 1.3717, + "step": 3513 + }, + { + "epoch": 0.1921189672375381, + "grad_norm": 1.4480483531951904, + "learning_rate": 1.9041162138527115e-05, + "loss": 1.5072, + "step": 3514 + }, + { + "epoch": 0.19217363968125964, + "grad_norm": 1.457366943359375, + "learning_rate": 1.904038118222384e-05, + "loss": 1.6203, + "step": 3515 + }, + { + "epoch": 0.1922283121249812, + "grad_norm": 1.201584815979004, + "learning_rate": 1.9039599924041026e-05, + "loss": 1.3879, + "step": 3516 + }, + { + "epoch": 0.19228298456870277, + "grad_norm": 1.3452106714248657, + "learning_rate": 1.903881836400476e-05, + "loss": 1.5746, + "step": 3517 + }, + { + "epoch": 0.1923376570124243, + "grad_norm": 1.6685712337493896, + "learning_rate": 1.9038036502141133e-05, + "loss": 1.2898, + "step": 3518 + }, + { + "epoch": 0.19239232945614587, + "grad_norm": 1.5376312732696533, + "learning_rate": 1.9037254338476262e-05, + "loss": 1.4153, + "step": 3519 + }, + { + "epoch": 0.1924470018998674, + "grad_norm": 1.6993545293807983, + "learning_rate": 1.903647187303626e-05, + "loss": 1.3355, + "step": 3520 + }, + { + "epoch": 0.19250167434358897, + "grad_norm": 1.4684169292449951, + "learning_rate": 1.9035689105847258e-05, + "loss": 1.5088, + "step": 3521 + }, + { + "epoch": 0.19255634678731054, + "grad_norm": 1.789528250694275, + "learning_rate": 1.9034906036935393e-05, + "loss": 1.4999, + "step": 3522 + }, + { + "epoch": 0.19261101923103208, + "grad_norm": 1.5393407344818115, + "learning_rate": 1.9034122666326818e-05, + "loss": 1.2027, + "step": 3523 + }, + { + "epoch": 0.19266569167475364, + "grad_norm": 1.5484390258789062, + "learning_rate": 1.903333899404769e-05, + "loss": 1.7288, + "step": 3524 + }, + { + "epoch": 0.19272036411847518, + "grad_norm": 1.5369712114334106, + "learning_rate": 1.903255502012417e-05, + "loss": 1.4586, + "step": 3525 + }, + { + "epoch": 0.19277503656219674, + "grad_norm": 1.2231124639511108, + "learning_rate": 1.9031770744582443e-05, + "loss": 1.4939, + "step": 3526 + }, + { + "epoch": 0.19282970900591828, + "grad_norm": 1.7649824619293213, + "learning_rate": 1.9030986167448696e-05, + "loss": 1.5097, + "step": 3527 + }, + { + "epoch": 0.19288438144963985, + "grad_norm": 1.5753425359725952, + "learning_rate": 1.9030201288749133e-05, + "loss": 1.7031, + "step": 3528 + }, + { + "epoch": 0.1929390538933614, + "grad_norm": 1.6427595615386963, + "learning_rate": 1.9029416108509953e-05, + "loss": 1.4436, + "step": 3529 + }, + { + "epoch": 0.19299372633708295, + "grad_norm": 1.9052214622497559, + "learning_rate": 1.902863062675739e-05, + "loss": 1.4405, + "step": 3530 + }, + { + "epoch": 0.19304839878080451, + "grad_norm": 1.452168583869934, + "learning_rate": 1.9027844843517657e-05, + "loss": 1.4536, + "step": 3531 + }, + { + "epoch": 0.19310307122452605, + "grad_norm": 1.5167666673660278, + "learning_rate": 1.9027058758817e-05, + "loss": 1.5458, + "step": 3532 + }, + { + "epoch": 0.19315774366824762, + "grad_norm": 1.754012107849121, + "learning_rate": 1.9026272372681674e-05, + "loss": 1.371, + "step": 3533 + }, + { + "epoch": 0.19321241611196915, + "grad_norm": 1.7687275409698486, + "learning_rate": 1.902548568513793e-05, + "loss": 1.3132, + "step": 3534 + }, + { + "epoch": 0.19326708855569072, + "grad_norm": 1.552406668663025, + "learning_rate": 1.9024698696212035e-05, + "loss": 1.4189, + "step": 3535 + }, + { + "epoch": 0.19332176099941228, + "grad_norm": 1.6847426891326904, + "learning_rate": 1.902391140593028e-05, + "loss": 1.4708, + "step": 3536 + }, + { + "epoch": 0.19337643344313382, + "grad_norm": 1.6187832355499268, + "learning_rate": 1.9023123814318943e-05, + "loss": 1.4314, + "step": 3537 + }, + { + "epoch": 0.1934311058868554, + "grad_norm": 1.8356883525848389, + "learning_rate": 1.9022335921404332e-05, + "loss": 1.3347, + "step": 3538 + }, + { + "epoch": 0.19348577833057692, + "grad_norm": 1.502838134765625, + "learning_rate": 1.9021547727212753e-05, + "loss": 1.6986, + "step": 3539 + }, + { + "epoch": 0.1935404507742985, + "grad_norm": 1.6226322650909424, + "learning_rate": 1.9020759231770526e-05, + "loss": 1.4932, + "step": 3540 + }, + { + "epoch": 0.19359512321802003, + "grad_norm": 1.1282726526260376, + "learning_rate": 1.9019970435103978e-05, + "loss": 1.5728, + "step": 3541 + }, + { + "epoch": 0.1936497956617416, + "grad_norm": 1.6416312456130981, + "learning_rate": 1.9019181337239453e-05, + "loss": 1.2119, + "step": 3542 + }, + { + "epoch": 0.19370446810546316, + "grad_norm": 1.8058310747146606, + "learning_rate": 1.9018391938203294e-05, + "loss": 1.412, + "step": 3543 + }, + { + "epoch": 0.1937591405491847, + "grad_norm": 1.238856315612793, + "learning_rate": 1.9017602238021873e-05, + "loss": 1.2205, + "step": 3544 + }, + { + "epoch": 0.19381381299290626, + "grad_norm": 1.5285619497299194, + "learning_rate": 1.9016812236721548e-05, + "loss": 1.4608, + "step": 3545 + }, + { + "epoch": 0.1938684854366278, + "grad_norm": 1.9043607711791992, + "learning_rate": 1.9016021934328708e-05, + "loss": 1.2168, + "step": 3546 + }, + { + "epoch": 0.19392315788034936, + "grad_norm": 1.5138943195343018, + "learning_rate": 1.9015231330869736e-05, + "loss": 1.2644, + "step": 3547 + }, + { + "epoch": 0.1939778303240709, + "grad_norm": 1.618690848350525, + "learning_rate": 1.9014440426371034e-05, + "loss": 1.4067, + "step": 3548 + }, + { + "epoch": 0.19403250276779246, + "grad_norm": 1.5149879455566406, + "learning_rate": 1.9013649220859017e-05, + "loss": 1.5161, + "step": 3549 + }, + { + "epoch": 0.19408717521151403, + "grad_norm": 1.6010054349899292, + "learning_rate": 1.9012857714360094e-05, + "loss": 1.5726, + "step": 3550 + }, + { + "epoch": 0.19414184765523557, + "grad_norm": 1.328732967376709, + "learning_rate": 1.901206590690071e-05, + "loss": 1.4424, + "step": 3551 + }, + { + "epoch": 0.19419652009895713, + "grad_norm": 1.5646620988845825, + "learning_rate": 1.9011273798507295e-05, + "loss": 1.5928, + "step": 3552 + }, + { + "epoch": 0.19425119254267867, + "grad_norm": 1.5460883378982544, + "learning_rate": 1.90104813892063e-05, + "loss": 1.6628, + "step": 3553 + }, + { + "epoch": 0.19430586498640023, + "grad_norm": 2.0961990356445312, + "learning_rate": 1.900968867902419e-05, + "loss": 1.5458, + "step": 3554 + }, + { + "epoch": 0.19436053743012177, + "grad_norm": 1.5936014652252197, + "learning_rate": 1.9008895667987434e-05, + "loss": 1.5471, + "step": 3555 + }, + { + "epoch": 0.19441520987384334, + "grad_norm": 1.9480276107788086, + "learning_rate": 1.9008102356122515e-05, + "loss": 1.6219, + "step": 3556 + }, + { + "epoch": 0.1944698823175649, + "grad_norm": 1.5385206937789917, + "learning_rate": 1.9007308743455914e-05, + "loss": 1.3312, + "step": 3557 + }, + { + "epoch": 0.19452455476128644, + "grad_norm": 1.9798071384429932, + "learning_rate": 1.900651483001414e-05, + "loss": 1.302, + "step": 3558 + }, + { + "epoch": 0.194579227205008, + "grad_norm": 1.3232731819152832, + "learning_rate": 1.9005720615823698e-05, + "loss": 1.4194, + "step": 3559 + }, + { + "epoch": 0.19463389964872954, + "grad_norm": 1.207486629486084, + "learning_rate": 1.9004926100911117e-05, + "loss": 1.4811, + "step": 3560 + }, + { + "epoch": 0.1946885720924511, + "grad_norm": 2.5501043796539307, + "learning_rate": 1.9004131285302924e-05, + "loss": 1.6088, + "step": 3561 + }, + { + "epoch": 0.19474324453617264, + "grad_norm": 1.4215281009674072, + "learning_rate": 1.9003336169025655e-05, + "loss": 1.3996, + "step": 3562 + }, + { + "epoch": 0.1947979169798942, + "grad_norm": 1.5823923349380493, + "learning_rate": 1.9002540752105862e-05, + "loss": 1.5273, + "step": 3563 + }, + { + "epoch": 0.19485258942361577, + "grad_norm": 1.4444434642791748, + "learning_rate": 1.9001745034570113e-05, + "loss": 1.5666, + "step": 3564 + }, + { + "epoch": 0.1949072618673373, + "grad_norm": 2.0529062747955322, + "learning_rate": 1.9000949016444972e-05, + "loss": 1.4113, + "step": 3565 + }, + { + "epoch": 0.19496193431105888, + "grad_norm": 1.7671774625778198, + "learning_rate": 1.900015269775702e-05, + "loss": 1.4164, + "step": 3566 + }, + { + "epoch": 0.1950166067547804, + "grad_norm": 1.584179401397705, + "learning_rate": 1.8999356078532852e-05, + "loss": 1.4571, + "step": 3567 + }, + { + "epoch": 0.19507127919850198, + "grad_norm": 1.5643539428710938, + "learning_rate": 1.899855915879907e-05, + "loss": 1.3133, + "step": 3568 + }, + { + "epoch": 0.19512595164222352, + "grad_norm": 1.641041874885559, + "learning_rate": 1.8997761938582277e-05, + "loss": 1.6399, + "step": 3569 + }, + { + "epoch": 0.19518062408594508, + "grad_norm": 1.6450377702713013, + "learning_rate": 1.89969644179091e-05, + "loss": 1.5398, + "step": 3570 + }, + { + "epoch": 0.19523529652966665, + "grad_norm": 1.4167137145996094, + "learning_rate": 1.899616659680617e-05, + "loss": 1.5801, + "step": 3571 + }, + { + "epoch": 0.19528996897338818, + "grad_norm": 1.4938009977340698, + "learning_rate": 1.8995368475300128e-05, + "loss": 1.3684, + "step": 3572 + }, + { + "epoch": 0.19534464141710975, + "grad_norm": 2.033663749694824, + "learning_rate": 1.8994570053417622e-05, + "loss": 1.3433, + "step": 3573 + }, + { + "epoch": 0.1953993138608313, + "grad_norm": 1.2700108289718628, + "learning_rate": 1.8993771331185317e-05, + "loss": 1.5528, + "step": 3574 + }, + { + "epoch": 0.19545398630455285, + "grad_norm": 1.3692196607589722, + "learning_rate": 1.899297230862988e-05, + "loss": 1.4454, + "step": 3575 + }, + { + "epoch": 0.1955086587482744, + "grad_norm": 1.4768929481506348, + "learning_rate": 1.8992172985778002e-05, + "loss": 1.4614, + "step": 3576 + }, + { + "epoch": 0.19556333119199595, + "grad_norm": 1.798924446105957, + "learning_rate": 1.899137336265636e-05, + "loss": 1.4361, + "step": 3577 + }, + { + "epoch": 0.19561800363571752, + "grad_norm": 1.3729621171951294, + "learning_rate": 1.8990573439291666e-05, + "loss": 1.5849, + "step": 3578 + }, + { + "epoch": 0.19567267607943906, + "grad_norm": 1.8653099536895752, + "learning_rate": 1.8989773215710627e-05, + "loss": 1.4795, + "step": 3579 + }, + { + "epoch": 0.19572734852316062, + "grad_norm": 1.399235725402832, + "learning_rate": 1.8988972691939965e-05, + "loss": 1.5977, + "step": 3580 + }, + { + "epoch": 0.19578202096688216, + "grad_norm": 1.3860130310058594, + "learning_rate": 1.898817186800641e-05, + "loss": 1.5533, + "step": 3581 + }, + { + "epoch": 0.19583669341060372, + "grad_norm": 1.2227345705032349, + "learning_rate": 1.8987370743936707e-05, + "loss": 1.4452, + "step": 3582 + }, + { + "epoch": 0.19589136585432526, + "grad_norm": 1.5988872051239014, + "learning_rate": 1.8986569319757605e-05, + "loss": 1.5217, + "step": 3583 + }, + { + "epoch": 0.19594603829804683, + "grad_norm": 1.2619800567626953, + "learning_rate": 1.8985767595495868e-05, + "loss": 1.5615, + "step": 3584 + }, + { + "epoch": 0.1960007107417684, + "grad_norm": 1.571465253829956, + "learning_rate": 1.898496557117826e-05, + "loss": 1.61, + "step": 3585 + }, + { + "epoch": 0.19605538318548993, + "grad_norm": 1.2782765626907349, + "learning_rate": 1.898416324683157e-05, + "loss": 1.572, + "step": 3586 + }, + { + "epoch": 0.1961100556292115, + "grad_norm": 1.736011028289795, + "learning_rate": 1.898336062248259e-05, + "loss": 1.5021, + "step": 3587 + }, + { + "epoch": 0.19616472807293303, + "grad_norm": 1.5546263456344604, + "learning_rate": 1.8982557698158114e-05, + "loss": 1.32, + "step": 3588 + }, + { + "epoch": 0.1962194005166546, + "grad_norm": 1.7298763990402222, + "learning_rate": 1.8981754473884962e-05, + "loss": 1.4835, + "step": 3589 + }, + { + "epoch": 0.19627407296037613, + "grad_norm": 1.141677737236023, + "learning_rate": 1.8980950949689952e-05, + "loss": 1.5102, + "step": 3590 + }, + { + "epoch": 0.1963287454040977, + "grad_norm": 1.389379858970642, + "learning_rate": 1.8980147125599912e-05, + "loss": 1.5691, + "step": 3591 + }, + { + "epoch": 0.19638341784781926, + "grad_norm": 1.7461497783660889, + "learning_rate": 1.897934300164169e-05, + "loss": 1.4579, + "step": 3592 + }, + { + "epoch": 0.1964380902915408, + "grad_norm": 1.8134368658065796, + "learning_rate": 1.897853857784213e-05, + "loss": 1.4989, + "step": 3593 + }, + { + "epoch": 0.19649276273526237, + "grad_norm": 1.8994206190109253, + "learning_rate": 1.8977733854228102e-05, + "loss": 1.6057, + "step": 3594 + }, + { + "epoch": 0.1965474351789839, + "grad_norm": 1.1670957803726196, + "learning_rate": 1.8976928830826474e-05, + "loss": 1.643, + "step": 3595 + }, + { + "epoch": 0.19660210762270547, + "grad_norm": 1.547371506690979, + "learning_rate": 1.8976123507664127e-05, + "loss": 1.6466, + "step": 3596 + }, + { + "epoch": 0.196656780066427, + "grad_norm": 1.4460163116455078, + "learning_rate": 1.897531788476795e-05, + "loss": 1.3708, + "step": 3597 + }, + { + "epoch": 0.19671145251014857, + "grad_norm": 1.2053645849227905, + "learning_rate": 1.8974511962164853e-05, + "loss": 1.4659, + "step": 3598 + }, + { + "epoch": 0.19676612495387014, + "grad_norm": 1.420696496963501, + "learning_rate": 1.8973705739881736e-05, + "loss": 1.6625, + "step": 3599 + }, + { + "epoch": 0.19682079739759167, + "grad_norm": 1.5047460794448853, + "learning_rate": 1.897289921794553e-05, + "loss": 1.5691, + "step": 3600 + }, + { + "epoch": 0.19687546984131324, + "grad_norm": 1.6689870357513428, + "learning_rate": 1.8972092396383165e-05, + "loss": 1.6137, + "step": 3601 + }, + { + "epoch": 0.19693014228503478, + "grad_norm": 1.279528021812439, + "learning_rate": 1.897128527522158e-05, + "loss": 1.3388, + "step": 3602 + }, + { + "epoch": 0.19698481472875634, + "grad_norm": 1.4483572244644165, + "learning_rate": 1.8970477854487726e-05, + "loss": 1.4939, + "step": 3603 + }, + { + "epoch": 0.19703948717247788, + "grad_norm": 1.4967806339263916, + "learning_rate": 1.896967013420857e-05, + "loss": 1.4844, + "step": 3604 + }, + { + "epoch": 0.19709415961619944, + "grad_norm": 1.3930354118347168, + "learning_rate": 1.8968862114411078e-05, + "loss": 1.5211, + "step": 3605 + }, + { + "epoch": 0.197148832059921, + "grad_norm": 1.483640193939209, + "learning_rate": 1.8968053795122232e-05, + "loss": 1.7737, + "step": 3606 + }, + { + "epoch": 0.19720350450364255, + "grad_norm": 1.3370507955551147, + "learning_rate": 1.8967245176369028e-05, + "loss": 1.4964, + "step": 3607 + }, + { + "epoch": 0.1972581769473641, + "grad_norm": 1.2475656270980835, + "learning_rate": 1.8966436258178465e-05, + "loss": 1.4999, + "step": 3608 + }, + { + "epoch": 0.19731284939108565, + "grad_norm": 1.6658488512039185, + "learning_rate": 1.8965627040577558e-05, + "loss": 1.2772, + "step": 3609 + }, + { + "epoch": 0.1973675218348072, + "grad_norm": 1.5969773530960083, + "learning_rate": 1.896481752359332e-05, + "loss": 1.3953, + "step": 3610 + }, + { + "epoch": 0.19742219427852875, + "grad_norm": 1.6299612522125244, + "learning_rate": 1.896400770725279e-05, + "loss": 1.461, + "step": 3611 + }, + { + "epoch": 0.19747686672225032, + "grad_norm": 1.538278579711914, + "learning_rate": 1.896319759158301e-05, + "loss": 1.6943, + "step": 3612 + }, + { + "epoch": 0.19753153916597188, + "grad_norm": 1.2492365837097168, + "learning_rate": 1.896238717661103e-05, + "loss": 1.4754, + "step": 3613 + }, + { + "epoch": 0.19758621160969342, + "grad_norm": 1.971964955329895, + "learning_rate": 1.8961576462363908e-05, + "loss": 1.2064, + "step": 3614 + }, + { + "epoch": 0.19764088405341498, + "grad_norm": 1.6544935703277588, + "learning_rate": 1.896076544886872e-05, + "loss": 1.2407, + "step": 3615 + }, + { + "epoch": 0.19769555649713652, + "grad_norm": 1.364334225654602, + "learning_rate": 1.8959954136152546e-05, + "loss": 1.6094, + "step": 3616 + }, + { + "epoch": 0.19775022894085809, + "grad_norm": 1.660792350769043, + "learning_rate": 1.8959142524242482e-05, + "loss": 1.1587, + "step": 3617 + }, + { + "epoch": 0.19780490138457965, + "grad_norm": 1.4585890769958496, + "learning_rate": 1.8958330613165622e-05, + "loss": 1.2446, + "step": 3618 + }, + { + "epoch": 0.1978595738283012, + "grad_norm": 1.4258733987808228, + "learning_rate": 1.8957518402949082e-05, + "loss": 1.6373, + "step": 3619 + }, + { + "epoch": 0.19791424627202275, + "grad_norm": 1.7735345363616943, + "learning_rate": 1.8956705893619984e-05, + "loss": 1.6466, + "step": 3620 + }, + { + "epoch": 0.1979689187157443, + "grad_norm": 1.6002358198165894, + "learning_rate": 1.895589308520546e-05, + "loss": 1.3636, + "step": 3621 + }, + { + "epoch": 0.19802359115946586, + "grad_norm": 1.363898754119873, + "learning_rate": 1.895507997773265e-05, + "loss": 1.2371, + "step": 3622 + }, + { + "epoch": 0.1980782636031874, + "grad_norm": 1.5952491760253906, + "learning_rate": 1.8954266571228702e-05, + "loss": 1.7404, + "step": 3623 + }, + { + "epoch": 0.19813293604690896, + "grad_norm": 1.928565502166748, + "learning_rate": 1.8953452865720784e-05, + "loss": 1.4582, + "step": 3624 + }, + { + "epoch": 0.19818760849063052, + "grad_norm": 1.9089354276657104, + "learning_rate": 1.8952638861236066e-05, + "loss": 1.6593, + "step": 3625 + }, + { + "epoch": 0.19824228093435206, + "grad_norm": 1.6089823246002197, + "learning_rate": 1.8951824557801726e-05, + "loss": 1.3272, + "step": 3626 + }, + { + "epoch": 0.19829695337807363, + "grad_norm": 1.6507691144943237, + "learning_rate": 1.895100995544496e-05, + "loss": 1.1451, + "step": 3627 + }, + { + "epoch": 0.19835162582179516, + "grad_norm": 1.2844526767730713, + "learning_rate": 1.8950195054192965e-05, + "loss": 1.5884, + "step": 3628 + }, + { + "epoch": 0.19840629826551673, + "grad_norm": 1.6174299716949463, + "learning_rate": 1.8949379854072954e-05, + "loss": 1.3066, + "step": 3629 + }, + { + "epoch": 0.19846097070923827, + "grad_norm": 1.641088604927063, + "learning_rate": 1.8948564355112154e-05, + "loss": 1.7681, + "step": 3630 + }, + { + "epoch": 0.19851564315295983, + "grad_norm": 1.5646822452545166, + "learning_rate": 1.8947748557337792e-05, + "loss": 1.3494, + "step": 3631 + }, + { + "epoch": 0.1985703155966814, + "grad_norm": 1.4624321460723877, + "learning_rate": 1.8946932460777105e-05, + "loss": 1.5103, + "step": 3632 + }, + { + "epoch": 0.19862498804040293, + "grad_norm": 1.5415985584259033, + "learning_rate": 1.894611606545735e-05, + "loss": 1.6268, + "step": 3633 + }, + { + "epoch": 0.1986796604841245, + "grad_norm": 2.8793931007385254, + "learning_rate": 1.8945299371405784e-05, + "loss": 1.481, + "step": 3634 + }, + { + "epoch": 0.19873433292784604, + "grad_norm": 2.067258834838867, + "learning_rate": 1.8944482378649686e-05, + "loss": 1.4569, + "step": 3635 + }, + { + "epoch": 0.1987890053715676, + "grad_norm": 1.6028627157211304, + "learning_rate": 1.8943665087216327e-05, + "loss": 1.4596, + "step": 3636 + }, + { + "epoch": 0.19884367781528914, + "grad_norm": 1.7462011575698853, + "learning_rate": 1.8942847497133008e-05, + "loss": 1.6329, + "step": 3637 + }, + { + "epoch": 0.1988983502590107, + "grad_norm": 1.1693925857543945, + "learning_rate": 1.8942029608427027e-05, + "loss": 1.4367, + "step": 3638 + }, + { + "epoch": 0.19895302270273227, + "grad_norm": 1.5599538087844849, + "learning_rate": 1.894121142112569e-05, + "loss": 1.628, + "step": 3639 + }, + { + "epoch": 0.1990076951464538, + "grad_norm": 1.6828442811965942, + "learning_rate": 1.8940392935256325e-05, + "loss": 1.3074, + "step": 3640 + }, + { + "epoch": 0.19906236759017537, + "grad_norm": 1.696722388267517, + "learning_rate": 1.8939574150846264e-05, + "loss": 1.345, + "step": 3641 + }, + { + "epoch": 0.1991170400338969, + "grad_norm": 1.3733307123184204, + "learning_rate": 1.893875506792284e-05, + "loss": 1.4483, + "step": 3642 + }, + { + "epoch": 0.19917171247761847, + "grad_norm": 1.422431468963623, + "learning_rate": 1.893793568651341e-05, + "loss": 1.4849, + "step": 3643 + }, + { + "epoch": 0.19922638492134, + "grad_norm": 1.9122909307479858, + "learning_rate": 1.8937116006645332e-05, + "loss": 1.4403, + "step": 3644 + }, + { + "epoch": 0.19928105736506158, + "grad_norm": 1.5451682806015015, + "learning_rate": 1.893629602834598e-05, + "loss": 1.5193, + "step": 3645 + }, + { + "epoch": 0.19933572980878314, + "grad_norm": 1.6781833171844482, + "learning_rate": 1.8935475751642736e-05, + "loss": 1.4745, + "step": 3646 + }, + { + "epoch": 0.19939040225250468, + "grad_norm": 1.2617841958999634, + "learning_rate": 1.8934655176562988e-05, + "loss": 1.6331, + "step": 3647 + }, + { + "epoch": 0.19944507469622624, + "grad_norm": 1.7601573467254639, + "learning_rate": 1.8933834303134136e-05, + "loss": 1.4007, + "step": 3648 + }, + { + "epoch": 0.19949974713994778, + "grad_norm": 1.8382649421691895, + "learning_rate": 1.8933013131383594e-05, + "loss": 1.2881, + "step": 3649 + }, + { + "epoch": 0.19955441958366935, + "grad_norm": 1.583475112915039, + "learning_rate": 1.8932191661338785e-05, + "loss": 1.4114, + "step": 3650 + }, + { + "epoch": 0.19960909202739088, + "grad_norm": 1.367856502532959, + "learning_rate": 1.893136989302713e-05, + "loss": 1.3101, + "step": 3651 + }, + { + "epoch": 0.19966376447111245, + "grad_norm": 1.8539960384368896, + "learning_rate": 1.893054782647608e-05, + "loss": 1.6115, + "step": 3652 + }, + { + "epoch": 0.199718436914834, + "grad_norm": 1.560724139213562, + "learning_rate": 1.8929725461713083e-05, + "loss": 1.4367, + "step": 3653 + }, + { + "epoch": 0.19977310935855555, + "grad_norm": 1.5094335079193115, + "learning_rate": 1.8928902798765594e-05, + "loss": 1.5593, + "step": 3654 + }, + { + "epoch": 0.19982778180227712, + "grad_norm": 2.3347795009613037, + "learning_rate": 1.8928079837661092e-05, + "loss": 1.461, + "step": 3655 + }, + { + "epoch": 0.19988245424599865, + "grad_norm": 1.6276919841766357, + "learning_rate": 1.8927256578427054e-05, + "loss": 1.3951, + "step": 3656 + }, + { + "epoch": 0.19993712668972022, + "grad_norm": 1.5086376667022705, + "learning_rate": 1.8926433021090967e-05, + "loss": 1.5464, + "step": 3657 + }, + { + "epoch": 0.19999179913344176, + "grad_norm": 1.7212765216827393, + "learning_rate": 1.8925609165680338e-05, + "loss": 1.5783, + "step": 3658 + }, + { + "epoch": 0.20004647157716332, + "grad_norm": 1.702863335609436, + "learning_rate": 1.8924785012222676e-05, + "loss": 1.4848, + "step": 3659 + }, + { + "epoch": 0.20010114402088489, + "grad_norm": 1.5111311674118042, + "learning_rate": 1.8923960560745495e-05, + "loss": 1.4793, + "step": 3660 + }, + { + "epoch": 0.20015581646460642, + "grad_norm": 1.4166284799575806, + "learning_rate": 1.8923135811276333e-05, + "loss": 1.6596, + "step": 3661 + }, + { + "epoch": 0.200210488908328, + "grad_norm": 1.4416584968566895, + "learning_rate": 1.8922310763842725e-05, + "loss": 1.4585, + "step": 3662 + }, + { + "epoch": 0.20026516135204953, + "grad_norm": 1.5942963361740112, + "learning_rate": 1.8921485418472227e-05, + "loss": 1.3487, + "step": 3663 + }, + { + "epoch": 0.2003198337957711, + "grad_norm": 1.446855902671814, + "learning_rate": 1.8920659775192394e-05, + "loss": 1.4433, + "step": 3664 + }, + { + "epoch": 0.20037450623949263, + "grad_norm": 1.856153964996338, + "learning_rate": 1.89198338340308e-05, + "loss": 1.3957, + "step": 3665 + }, + { + "epoch": 0.2004291786832142, + "grad_norm": 1.1619149446487427, + "learning_rate": 1.891900759501502e-05, + "loss": 1.5835, + "step": 3666 + }, + { + "epoch": 0.20048385112693576, + "grad_norm": 1.434013843536377, + "learning_rate": 1.891818105817265e-05, + "loss": 1.3866, + "step": 3667 + }, + { + "epoch": 0.2005385235706573, + "grad_norm": 1.4708582162857056, + "learning_rate": 1.8917354223531287e-05, + "loss": 1.3774, + "step": 3668 + }, + { + "epoch": 0.20059319601437886, + "grad_norm": 1.3916091918945312, + "learning_rate": 1.891652709111854e-05, + "loss": 1.5174, + "step": 3669 + }, + { + "epoch": 0.2006478684581004, + "grad_norm": 1.3077400922775269, + "learning_rate": 1.891569966096203e-05, + "loss": 1.4626, + "step": 3670 + }, + { + "epoch": 0.20070254090182196, + "grad_norm": 1.1983861923217773, + "learning_rate": 1.891487193308939e-05, + "loss": 1.5431, + "step": 3671 + }, + { + "epoch": 0.2007572133455435, + "grad_norm": 1.410390853881836, + "learning_rate": 1.8914043907528254e-05, + "loss": 1.342, + "step": 3672 + }, + { + "epoch": 0.20081188578926507, + "grad_norm": 1.6819738149642944, + "learning_rate": 1.8913215584306276e-05, + "loss": 1.372, + "step": 3673 + }, + { + "epoch": 0.20086655823298663, + "grad_norm": 1.5696300268173218, + "learning_rate": 1.891238696345111e-05, + "loss": 1.4516, + "step": 3674 + }, + { + "epoch": 0.20092123067670817, + "grad_norm": 1.2517197132110596, + "learning_rate": 1.8911558044990435e-05, + "loss": 1.567, + "step": 3675 + }, + { + "epoch": 0.20097590312042973, + "grad_norm": 1.5456762313842773, + "learning_rate": 1.8910728828951926e-05, + "loss": 1.2341, + "step": 3676 + }, + { + "epoch": 0.20103057556415127, + "grad_norm": 1.4933408498764038, + "learning_rate": 1.8909899315363265e-05, + "loss": 1.3839, + "step": 3677 + }, + { + "epoch": 0.20108524800787284, + "grad_norm": 1.494857668876648, + "learning_rate": 1.8909069504252165e-05, + "loss": 1.6413, + "step": 3678 + }, + { + "epoch": 0.20113992045159437, + "grad_norm": 1.5239249467849731, + "learning_rate": 1.8908239395646325e-05, + "loss": 1.6719, + "step": 3679 + }, + { + "epoch": 0.20119459289531594, + "grad_norm": 1.3748276233673096, + "learning_rate": 1.8907408989573467e-05, + "loss": 1.3777, + "step": 3680 + }, + { + "epoch": 0.2012492653390375, + "grad_norm": 1.3289196491241455, + "learning_rate": 1.8906578286061325e-05, + "loss": 1.3315, + "step": 3681 + }, + { + "epoch": 0.20130393778275904, + "grad_norm": 1.6465240716934204, + "learning_rate": 1.890574728513763e-05, + "loss": 1.6848, + "step": 3682 + }, + { + "epoch": 0.2013586102264806, + "grad_norm": 1.3967961072921753, + "learning_rate": 1.8904915986830135e-05, + "loss": 1.331, + "step": 3683 + }, + { + "epoch": 0.20141328267020214, + "grad_norm": 1.354052186012268, + "learning_rate": 1.89040843911666e-05, + "loss": 1.6088, + "step": 3684 + }, + { + "epoch": 0.2014679551139237, + "grad_norm": 1.53536856174469, + "learning_rate": 1.8903252498174796e-05, + "loss": 1.5758, + "step": 3685 + }, + { + "epoch": 0.20152262755764525, + "grad_norm": 1.4086729288101196, + "learning_rate": 1.8902420307882495e-05, + "loss": 1.3973, + "step": 3686 + }, + { + "epoch": 0.2015773000013668, + "grad_norm": 1.6337181329727173, + "learning_rate": 1.8901587820317494e-05, + "loss": 1.6791, + "step": 3687 + }, + { + "epoch": 0.20163197244508838, + "grad_norm": 1.5427533388137817, + "learning_rate": 1.890075503550758e-05, + "loss": 1.6488, + "step": 3688 + }, + { + "epoch": 0.2016866448888099, + "grad_norm": 1.604088306427002, + "learning_rate": 1.8899921953480576e-05, + "loss": 1.5293, + "step": 3689 + }, + { + "epoch": 0.20174131733253148, + "grad_norm": 1.3295942544937134, + "learning_rate": 1.8899088574264293e-05, + "loss": 1.6721, + "step": 3690 + }, + { + "epoch": 0.20179598977625302, + "grad_norm": 1.1055958271026611, + "learning_rate": 1.8898254897886558e-05, + "loss": 1.6072, + "step": 3691 + }, + { + "epoch": 0.20185066221997458, + "grad_norm": 1.3718233108520508, + "learning_rate": 1.889742092437521e-05, + "loss": 1.0784, + "step": 3692 + }, + { + "epoch": 0.20190533466369612, + "grad_norm": 1.4504469633102417, + "learning_rate": 1.8896586653758104e-05, + "loss": 1.7342, + "step": 3693 + }, + { + "epoch": 0.20196000710741768, + "grad_norm": 1.6454898118972778, + "learning_rate": 1.889575208606309e-05, + "loss": 1.4948, + "step": 3694 + }, + { + "epoch": 0.20201467955113925, + "grad_norm": 1.5197745561599731, + "learning_rate": 1.8894917221318038e-05, + "loss": 1.3459, + "step": 3695 + }, + { + "epoch": 0.20206935199486079, + "grad_norm": 1.964827060699463, + "learning_rate": 1.8894082059550828e-05, + "loss": 1.4089, + "step": 3696 + }, + { + "epoch": 0.20212402443858235, + "grad_norm": 2.1574833393096924, + "learning_rate": 1.889324660078935e-05, + "loss": 1.3602, + "step": 3697 + }, + { + "epoch": 0.2021786968823039, + "grad_norm": 1.474680781364441, + "learning_rate": 1.8892410845061498e-05, + "loss": 1.5199, + "step": 3698 + }, + { + "epoch": 0.20223336932602545, + "grad_norm": 1.606268048286438, + "learning_rate": 1.889157479239518e-05, + "loss": 1.3274, + "step": 3699 + }, + { + "epoch": 0.202288041769747, + "grad_norm": 2.2515878677368164, + "learning_rate": 1.8890738442818317e-05, + "loss": 1.4222, + "step": 3700 + }, + { + "epoch": 0.20234271421346856, + "grad_norm": 1.214245319366455, + "learning_rate": 1.8889901796358835e-05, + "loss": 1.5373, + "step": 3701 + }, + { + "epoch": 0.20239738665719012, + "grad_norm": 1.8017468452453613, + "learning_rate": 1.888906485304467e-05, + "loss": 1.4637, + "step": 3702 + }, + { + "epoch": 0.20245205910091166, + "grad_norm": 1.7115471363067627, + "learning_rate": 1.8888227612903768e-05, + "loss": 1.5292, + "step": 3703 + }, + { + "epoch": 0.20250673154463322, + "grad_norm": 1.9487870931625366, + "learning_rate": 1.888739007596409e-05, + "loss": 1.4207, + "step": 3704 + }, + { + "epoch": 0.20256140398835476, + "grad_norm": 1.7665609121322632, + "learning_rate": 1.8886552242253607e-05, + "loss": 1.33, + "step": 3705 + }, + { + "epoch": 0.20261607643207633, + "grad_norm": 1.5017778873443604, + "learning_rate": 1.8885714111800288e-05, + "loss": 1.5009, + "step": 3706 + }, + { + "epoch": 0.20267074887579786, + "grad_norm": 1.8129874467849731, + "learning_rate": 1.8884875684632124e-05, + "loss": 1.4596, + "step": 3707 + }, + { + "epoch": 0.20272542131951943, + "grad_norm": 1.3304712772369385, + "learning_rate": 1.8884036960777115e-05, + "loss": 1.5106, + "step": 3708 + }, + { + "epoch": 0.202780093763241, + "grad_norm": 1.4358718395233154, + "learning_rate": 1.888319794026326e-05, + "loss": 1.1882, + "step": 3709 + }, + { + "epoch": 0.20283476620696253, + "grad_norm": 1.8177433013916016, + "learning_rate": 1.8882358623118584e-05, + "loss": 1.7576, + "step": 3710 + }, + { + "epoch": 0.2028894386506841, + "grad_norm": 1.474729061126709, + "learning_rate": 1.888151900937111e-05, + "loss": 1.7159, + "step": 3711 + }, + { + "epoch": 0.20294411109440563, + "grad_norm": 1.5195623636245728, + "learning_rate": 1.8880679099048875e-05, + "loss": 1.513, + "step": 3712 + }, + { + "epoch": 0.2029987835381272, + "grad_norm": 1.366709589958191, + "learning_rate": 1.8879838892179924e-05, + "loss": 1.4852, + "step": 3713 + }, + { + "epoch": 0.20305345598184874, + "grad_norm": 1.6490342617034912, + "learning_rate": 1.8878998388792315e-05, + "loss": 1.5757, + "step": 3714 + }, + { + "epoch": 0.2031081284255703, + "grad_norm": 1.599607229232788, + "learning_rate": 1.8878157588914118e-05, + "loss": 1.5554, + "step": 3715 + }, + { + "epoch": 0.20316280086929187, + "grad_norm": 1.8114572763442993, + "learning_rate": 1.88773164925734e-05, + "loss": 1.349, + "step": 3716 + }, + { + "epoch": 0.2032174733130134, + "grad_norm": 2.006974220275879, + "learning_rate": 1.8876475099798258e-05, + "loss": 1.5574, + "step": 3717 + }, + { + "epoch": 0.20327214575673497, + "grad_norm": 1.393688678741455, + "learning_rate": 1.887563341061678e-05, + "loss": 1.4979, + "step": 3718 + }, + { + "epoch": 0.2033268182004565, + "grad_norm": 1.3487131595611572, + "learning_rate": 1.887479142505708e-05, + "loss": 1.3383, + "step": 3719 + }, + { + "epoch": 0.20338149064417807, + "grad_norm": 1.57515549659729, + "learning_rate": 1.8873949143147267e-05, + "loss": 1.2588, + "step": 3720 + }, + { + "epoch": 0.20343616308789964, + "grad_norm": 1.6663002967834473, + "learning_rate": 1.887310656491547e-05, + "loss": 1.592, + "step": 3721 + }, + { + "epoch": 0.20349083553162117, + "grad_norm": 1.4173301458358765, + "learning_rate": 1.8872263690389817e-05, + "loss": 1.5075, + "step": 3722 + }, + { + "epoch": 0.20354550797534274, + "grad_norm": 1.8841384649276733, + "learning_rate": 1.887142051959847e-05, + "loss": 1.6573, + "step": 3723 + }, + { + "epoch": 0.20360018041906427, + "grad_norm": 1.5438497066497803, + "learning_rate": 1.8870577052569564e-05, + "loss": 1.6318, + "step": 3724 + }, + { + "epoch": 0.20365485286278584, + "grad_norm": 1.642189860343933, + "learning_rate": 1.886973328933128e-05, + "loss": 1.5611, + "step": 3725 + }, + { + "epoch": 0.20370952530650738, + "grad_norm": 1.4945803880691528, + "learning_rate": 1.8868889229911787e-05, + "loss": 1.4798, + "step": 3726 + }, + { + "epoch": 0.20376419775022894, + "grad_norm": 1.4031246900558472, + "learning_rate": 1.8868044874339274e-05, + "loss": 1.4138, + "step": 3727 + }, + { + "epoch": 0.2038188701939505, + "grad_norm": 1.4929362535476685, + "learning_rate": 1.8867200222641927e-05, + "loss": 1.4172, + "step": 3728 + }, + { + "epoch": 0.20387354263767204, + "grad_norm": 1.736009120941162, + "learning_rate": 1.8866355274847964e-05, + "loss": 1.486, + "step": 3729 + }, + { + "epoch": 0.2039282150813936, + "grad_norm": 1.4257556200027466, + "learning_rate": 1.8865510030985588e-05, + "loss": 1.5273, + "step": 3730 + }, + { + "epoch": 0.20398288752511515, + "grad_norm": 1.6219682693481445, + "learning_rate": 1.8864664491083032e-05, + "loss": 1.5094, + "step": 3731 + }, + { + "epoch": 0.2040375599688367, + "grad_norm": 1.6109039783477783, + "learning_rate": 1.8863818655168522e-05, + "loss": 1.3012, + "step": 3732 + }, + { + "epoch": 0.20409223241255825, + "grad_norm": 1.331015944480896, + "learning_rate": 1.886297252327031e-05, + "loss": 1.4862, + "step": 3733 + }, + { + "epoch": 0.20414690485627981, + "grad_norm": 1.4973416328430176, + "learning_rate": 1.886212609541665e-05, + "loss": 1.4236, + "step": 3734 + }, + { + "epoch": 0.20420157730000138, + "grad_norm": 1.6137768030166626, + "learning_rate": 1.8861279371635805e-05, + "loss": 1.6772, + "step": 3735 + }, + { + "epoch": 0.20425624974372292, + "grad_norm": 1.268900990486145, + "learning_rate": 1.8860432351956044e-05, + "loss": 1.504, + "step": 3736 + }, + { + "epoch": 0.20431092218744448, + "grad_norm": 1.4981778860092163, + "learning_rate": 1.8859585036405653e-05, + "loss": 1.3697, + "step": 3737 + }, + { + "epoch": 0.20436559463116602, + "grad_norm": 1.54237961769104, + "learning_rate": 1.8858737425012934e-05, + "loss": 1.3148, + "step": 3738 + }, + { + "epoch": 0.20442026707488758, + "grad_norm": 1.2507588863372803, + "learning_rate": 1.8857889517806183e-05, + "loss": 1.3526, + "step": 3739 + }, + { + "epoch": 0.20447493951860912, + "grad_norm": 1.9441419839859009, + "learning_rate": 1.8857041314813715e-05, + "loss": 1.5062, + "step": 3740 + }, + { + "epoch": 0.2045296119623307, + "grad_norm": 1.6279433965682983, + "learning_rate": 1.8856192816063853e-05, + "loss": 1.4296, + "step": 3741 + }, + { + "epoch": 0.20458428440605225, + "grad_norm": 1.7274199724197388, + "learning_rate": 1.8855344021584933e-05, + "loss": 1.5408, + "step": 3742 + }, + { + "epoch": 0.2046389568497738, + "grad_norm": 1.3325932025909424, + "learning_rate": 1.8854494931405293e-05, + "loss": 1.3411, + "step": 3743 + }, + { + "epoch": 0.20469362929349535, + "grad_norm": 1.5472036600112915, + "learning_rate": 1.885364554555329e-05, + "loss": 1.3089, + "step": 3744 + }, + { + "epoch": 0.2047483017372169, + "grad_norm": 1.1886358261108398, + "learning_rate": 1.885279586405729e-05, + "loss": 1.6042, + "step": 3745 + }, + { + "epoch": 0.20480297418093846, + "grad_norm": 1.6432602405548096, + "learning_rate": 1.885194588694566e-05, + "loss": 1.5351, + "step": 3746 + }, + { + "epoch": 0.20485764662466, + "grad_norm": 1.4566247463226318, + "learning_rate": 1.8851095614246785e-05, + "loss": 1.676, + "step": 3747 + }, + { + "epoch": 0.20491231906838156, + "grad_norm": 1.724465012550354, + "learning_rate": 1.885024504598906e-05, + "loss": 1.4013, + "step": 3748 + }, + { + "epoch": 0.20496699151210312, + "grad_norm": 1.4713270664215088, + "learning_rate": 1.8849394182200883e-05, + "loss": 1.3983, + "step": 3749 + }, + { + "epoch": 0.20502166395582466, + "grad_norm": 1.707453966140747, + "learning_rate": 1.8848543022910668e-05, + "loss": 1.8332, + "step": 3750 + }, + { + "epoch": 0.20507633639954623, + "grad_norm": 1.3793983459472656, + "learning_rate": 1.884769156814684e-05, + "loss": 1.5392, + "step": 3751 + }, + { + "epoch": 0.20513100884326776, + "grad_norm": 1.8211826086044312, + "learning_rate": 1.8846839817937827e-05, + "loss": 1.6403, + "step": 3752 + }, + { + "epoch": 0.20518568128698933, + "grad_norm": 1.517295479774475, + "learning_rate": 1.884598777231207e-05, + "loss": 1.3152, + "step": 3753 + }, + { + "epoch": 0.20524035373071087, + "grad_norm": 1.5946295261383057, + "learning_rate": 1.8845135431298026e-05, + "loss": 1.0882, + "step": 3754 + }, + { + "epoch": 0.20529502617443243, + "grad_norm": 1.3396357297897339, + "learning_rate": 1.8844282794924157e-05, + "loss": 1.5943, + "step": 3755 + }, + { + "epoch": 0.205349698618154, + "grad_norm": 1.4168310165405273, + "learning_rate": 1.884342986321893e-05, + "loss": 1.4895, + "step": 3756 + }, + { + "epoch": 0.20540437106187553, + "grad_norm": 1.4491641521453857, + "learning_rate": 1.8842576636210827e-05, + "loss": 1.1801, + "step": 3757 + }, + { + "epoch": 0.2054590435055971, + "grad_norm": 1.679106593132019, + "learning_rate": 1.884172311392834e-05, + "loss": 1.508, + "step": 3758 + }, + { + "epoch": 0.20551371594931864, + "grad_norm": 1.650896668434143, + "learning_rate": 1.8840869296399972e-05, + "loss": 1.4332, + "step": 3759 + }, + { + "epoch": 0.2055683883930402, + "grad_norm": 1.4141845703125, + "learning_rate": 1.8840015183654233e-05, + "loss": 1.3474, + "step": 3760 + }, + { + "epoch": 0.20562306083676174, + "grad_norm": 1.4788930416107178, + "learning_rate": 1.883916077571964e-05, + "loss": 1.3789, + "step": 3761 + }, + { + "epoch": 0.2056777332804833, + "grad_norm": 1.3272156715393066, + "learning_rate": 1.883830607262473e-05, + "loss": 1.6418, + "step": 3762 + }, + { + "epoch": 0.20573240572420487, + "grad_norm": 2.0329787731170654, + "learning_rate": 1.8837451074398038e-05, + "loss": 1.604, + "step": 3763 + }, + { + "epoch": 0.2057870781679264, + "grad_norm": 1.5970772504806519, + "learning_rate": 1.883659578106812e-05, + "loss": 1.462, + "step": 3764 + }, + { + "epoch": 0.20584175061164797, + "grad_norm": 1.554189682006836, + "learning_rate": 1.883574019266353e-05, + "loss": 1.4193, + "step": 3765 + }, + { + "epoch": 0.2058964230553695, + "grad_norm": 1.687595248222351, + "learning_rate": 1.8834884309212845e-05, + "loss": 1.6048, + "step": 3766 + }, + { + "epoch": 0.20595109549909107, + "grad_norm": 1.403653860092163, + "learning_rate": 1.8834028130744637e-05, + "loss": 1.3075, + "step": 3767 + }, + { + "epoch": 0.2060057679428126, + "grad_norm": 1.5115394592285156, + "learning_rate": 1.8833171657287503e-05, + "loss": 1.5324, + "step": 3768 + }, + { + "epoch": 0.20606044038653418, + "grad_norm": 1.4797431230545044, + "learning_rate": 1.8832314888870037e-05, + "loss": 1.6468, + "step": 3769 + }, + { + "epoch": 0.20611511283025574, + "grad_norm": 1.1331157684326172, + "learning_rate": 1.8831457825520855e-05, + "loss": 1.4984, + "step": 3770 + }, + { + "epoch": 0.20616978527397728, + "grad_norm": 1.4819369316101074, + "learning_rate": 1.883060046726857e-05, + "loss": 1.5066, + "step": 3771 + }, + { + "epoch": 0.20622445771769884, + "grad_norm": 1.3833640813827515, + "learning_rate": 1.8829742814141813e-05, + "loss": 1.5692, + "step": 3772 + }, + { + "epoch": 0.20627913016142038, + "grad_norm": 1.221052885055542, + "learning_rate": 1.882888486616923e-05, + "loss": 1.2932, + "step": 3773 + }, + { + "epoch": 0.20633380260514195, + "grad_norm": 1.8041647672653198, + "learning_rate": 1.8828026623379455e-05, + "loss": 1.4596, + "step": 3774 + }, + { + "epoch": 0.20638847504886348, + "grad_norm": 1.1981881856918335, + "learning_rate": 1.882716808580116e-05, + "loss": 1.5429, + "step": 3775 + }, + { + "epoch": 0.20644314749258505, + "grad_norm": 1.588498592376709, + "learning_rate": 1.882630925346301e-05, + "loss": 1.3501, + "step": 3776 + }, + { + "epoch": 0.20649781993630661, + "grad_norm": 1.2571033239364624, + "learning_rate": 1.8825450126393678e-05, + "loss": 1.2851, + "step": 3777 + }, + { + "epoch": 0.20655249238002815, + "grad_norm": 1.4718431234359741, + "learning_rate": 1.882459070462186e-05, + "loss": 1.4071, + "step": 3778 + }, + { + "epoch": 0.20660716482374972, + "grad_norm": 1.3277989625930786, + "learning_rate": 1.882373098817625e-05, + "loss": 1.5072, + "step": 3779 + }, + { + "epoch": 0.20666183726747125, + "grad_norm": 1.4438778162002563, + "learning_rate": 1.8822870977085556e-05, + "loss": 1.4208, + "step": 3780 + }, + { + "epoch": 0.20671650971119282, + "grad_norm": 1.6161483526229858, + "learning_rate": 1.8822010671378498e-05, + "loss": 1.7423, + "step": 3781 + }, + { + "epoch": 0.20677118215491436, + "grad_norm": 1.3523221015930176, + "learning_rate": 1.8821150071083803e-05, + "loss": 1.4565, + "step": 3782 + }, + { + "epoch": 0.20682585459863592, + "grad_norm": 1.5612549781799316, + "learning_rate": 1.8820289176230206e-05, + "loss": 1.2422, + "step": 3783 + }, + { + "epoch": 0.2068805270423575, + "grad_norm": 1.5947293043136597, + "learning_rate": 1.8819427986846457e-05, + "loss": 1.5002, + "step": 3784 + }, + { + "epoch": 0.20693519948607902, + "grad_norm": 1.3601645231246948, + "learning_rate": 1.881856650296131e-05, + "loss": 1.5615, + "step": 3785 + }, + { + "epoch": 0.2069898719298006, + "grad_norm": 1.4817770719528198, + "learning_rate": 1.8817704724603536e-05, + "loss": 1.5659, + "step": 3786 + }, + { + "epoch": 0.20704454437352213, + "grad_norm": 1.8849636316299438, + "learning_rate": 1.8816842651801906e-05, + "loss": 1.6107, + "step": 3787 + }, + { + "epoch": 0.2070992168172437, + "grad_norm": 1.5091720819473267, + "learning_rate": 1.8815980284585218e-05, + "loss": 1.4598, + "step": 3788 + }, + { + "epoch": 0.20715388926096523, + "grad_norm": 1.8765677213668823, + "learning_rate": 1.8815117622982255e-05, + "loss": 1.6019, + "step": 3789 + }, + { + "epoch": 0.2072085617046868, + "grad_norm": 1.5432140827178955, + "learning_rate": 1.8814254667021832e-05, + "loss": 1.4432, + "step": 3790 + }, + { + "epoch": 0.20726323414840836, + "grad_norm": 1.3829432725906372, + "learning_rate": 1.881339141673276e-05, + "loss": 1.4802, + "step": 3791 + }, + { + "epoch": 0.2073179065921299, + "grad_norm": 1.6179031133651733, + "learning_rate": 1.881252787214387e-05, + "loss": 1.737, + "step": 3792 + }, + { + "epoch": 0.20737257903585146, + "grad_norm": 1.3969430923461914, + "learning_rate": 1.8811664033283993e-05, + "loss": 1.2615, + "step": 3793 + }, + { + "epoch": 0.207427251479573, + "grad_norm": 1.3931846618652344, + "learning_rate": 1.8810799900181978e-05, + "loss": 1.6959, + "step": 3794 + }, + { + "epoch": 0.20748192392329456, + "grad_norm": 1.94855535030365, + "learning_rate": 1.880993547286668e-05, + "loss": 1.4667, + "step": 3795 + }, + { + "epoch": 0.2075365963670161, + "grad_norm": 1.5225813388824463, + "learning_rate": 1.880907075136696e-05, + "loss": 1.4672, + "step": 3796 + }, + { + "epoch": 0.20759126881073767, + "grad_norm": 1.9131114482879639, + "learning_rate": 1.8808205735711697e-05, + "loss": 1.7151, + "step": 3797 + }, + { + "epoch": 0.20764594125445923, + "grad_norm": 1.4323517084121704, + "learning_rate": 1.880734042592978e-05, + "loss": 1.5535, + "step": 3798 + }, + { + "epoch": 0.20770061369818077, + "grad_norm": 1.3420345783233643, + "learning_rate": 1.8806474822050096e-05, + "loss": 1.6015, + "step": 3799 + }, + { + "epoch": 0.20775528614190233, + "grad_norm": 1.3755720853805542, + "learning_rate": 1.880560892410155e-05, + "loss": 1.7402, + "step": 3800 + }, + { + "epoch": 0.20780995858562387, + "grad_norm": 1.6566232442855835, + "learning_rate": 1.880474273211306e-05, + "loss": 1.621, + "step": 3801 + }, + { + "epoch": 0.20786463102934544, + "grad_norm": 1.4140335321426392, + "learning_rate": 1.8803876246113553e-05, + "loss": 1.3518, + "step": 3802 + }, + { + "epoch": 0.20791930347306697, + "grad_norm": 1.5209912061691284, + "learning_rate": 1.880300946613196e-05, + "loss": 1.5243, + "step": 3803 + }, + { + "epoch": 0.20797397591678854, + "grad_norm": 1.5442639589309692, + "learning_rate": 1.880214239219722e-05, + "loss": 1.4018, + "step": 3804 + }, + { + "epoch": 0.2080286483605101, + "grad_norm": 1.5371944904327393, + "learning_rate": 1.880127502433829e-05, + "loss": 1.3728, + "step": 3805 + }, + { + "epoch": 0.20808332080423164, + "grad_norm": 2.916748523712158, + "learning_rate": 1.8800407362584135e-05, + "loss": 1.3417, + "step": 3806 + }, + { + "epoch": 0.2081379932479532, + "grad_norm": 1.6501514911651611, + "learning_rate": 1.879953940696373e-05, + "loss": 1.5792, + "step": 3807 + }, + { + "epoch": 0.20819266569167474, + "grad_norm": 1.3156033754348755, + "learning_rate": 1.8798671157506052e-05, + "loss": 1.4423, + "step": 3808 + }, + { + "epoch": 0.2082473381353963, + "grad_norm": 1.9311940670013428, + "learning_rate": 1.87978026142401e-05, + "loss": 1.6741, + "step": 3809 + }, + { + "epoch": 0.20830201057911785, + "grad_norm": 1.8504999876022339, + "learning_rate": 1.8796933777194874e-05, + "loss": 1.4576, + "step": 3810 + }, + { + "epoch": 0.2083566830228394, + "grad_norm": 1.2753818035125732, + "learning_rate": 1.8796064646399386e-05, + "loss": 1.5209, + "step": 3811 + }, + { + "epoch": 0.20841135546656098, + "grad_norm": 1.7691216468811035, + "learning_rate": 1.8795195221882658e-05, + "loss": 1.5469, + "step": 3812 + }, + { + "epoch": 0.20846602791028251, + "grad_norm": 1.5875459909439087, + "learning_rate": 1.879432550367372e-05, + "loss": 1.7915, + "step": 3813 + }, + { + "epoch": 0.20852070035400408, + "grad_norm": 1.4550307989120483, + "learning_rate": 1.8793455491801623e-05, + "loss": 1.4591, + "step": 3814 + }, + { + "epoch": 0.20857537279772562, + "grad_norm": 1.3895097970962524, + "learning_rate": 1.879258518629541e-05, + "loss": 1.4705, + "step": 3815 + }, + { + "epoch": 0.20863004524144718, + "grad_norm": 1.4301713705062866, + "learning_rate": 1.8791714587184144e-05, + "loss": 1.3726, + "step": 3816 + }, + { + "epoch": 0.20868471768516872, + "grad_norm": 1.5009735822677612, + "learning_rate": 1.87908436944969e-05, + "loss": 1.392, + "step": 3817 + }, + { + "epoch": 0.20873939012889028, + "grad_norm": 1.4888293743133545, + "learning_rate": 1.8789972508262755e-05, + "loss": 1.4701, + "step": 3818 + }, + { + "epoch": 0.20879406257261185, + "grad_norm": 1.2610009908676147, + "learning_rate": 1.8789101028510803e-05, + "loss": 1.6508, + "step": 3819 + }, + { + "epoch": 0.2088487350163334, + "grad_norm": 1.5789459943771362, + "learning_rate": 1.878822925527014e-05, + "loss": 1.362, + "step": 3820 + }, + { + "epoch": 0.20890340746005495, + "grad_norm": 1.4372332096099854, + "learning_rate": 1.878735718856988e-05, + "loss": 1.5309, + "step": 3821 + }, + { + "epoch": 0.2089580799037765, + "grad_norm": 1.6087652444839478, + "learning_rate": 1.8786484828439148e-05, + "loss": 1.4373, + "step": 3822 + }, + { + "epoch": 0.20901275234749805, + "grad_norm": 1.7145737409591675, + "learning_rate": 1.8785612174907067e-05, + "loss": 1.3163, + "step": 3823 + }, + { + "epoch": 0.20906742479121962, + "grad_norm": 1.6991465091705322, + "learning_rate": 1.878473922800278e-05, + "loss": 1.3379, + "step": 3824 + }, + { + "epoch": 0.20912209723494116, + "grad_norm": 1.90249502658844, + "learning_rate": 1.8783865987755432e-05, + "loss": 1.4542, + "step": 3825 + }, + { + "epoch": 0.20917676967866272, + "grad_norm": 1.7741788625717163, + "learning_rate": 1.8782992454194192e-05, + "loss": 1.3281, + "step": 3826 + }, + { + "epoch": 0.20923144212238426, + "grad_norm": 1.6554430723190308, + "learning_rate": 1.8782118627348224e-05, + "loss": 1.6975, + "step": 3827 + }, + { + "epoch": 0.20928611456610582, + "grad_norm": 1.4261689186096191, + "learning_rate": 1.8781244507246706e-05, + "loss": 1.3839, + "step": 3828 + }, + { + "epoch": 0.20934078700982736, + "grad_norm": 2.020644187927246, + "learning_rate": 1.8780370093918825e-05, + "loss": 1.5859, + "step": 3829 + }, + { + "epoch": 0.20939545945354893, + "grad_norm": 1.2996035814285278, + "learning_rate": 1.8779495387393786e-05, + "loss": 1.7563, + "step": 3830 + }, + { + "epoch": 0.2094501318972705, + "grad_norm": 1.1415342092514038, + "learning_rate": 1.877862038770079e-05, + "loss": 1.6143, + "step": 3831 + }, + { + "epoch": 0.20950480434099203, + "grad_norm": 1.3675026893615723, + "learning_rate": 1.8777745094869067e-05, + "loss": 1.6095, + "step": 3832 + }, + { + "epoch": 0.2095594767847136, + "grad_norm": 1.4894404411315918, + "learning_rate": 1.8776869508927832e-05, + "loss": 1.4057, + "step": 3833 + }, + { + "epoch": 0.20961414922843513, + "grad_norm": 1.5124905109405518, + "learning_rate": 1.8775993629906333e-05, + "loss": 1.4922, + "step": 3834 + }, + { + "epoch": 0.2096688216721567, + "grad_norm": 1.5200636386871338, + "learning_rate": 1.877511745783381e-05, + "loss": 1.7577, + "step": 3835 + }, + { + "epoch": 0.20972349411587823, + "grad_norm": 2.0326080322265625, + "learning_rate": 1.8774240992739524e-05, + "loss": 1.5329, + "step": 3836 + }, + { + "epoch": 0.2097781665595998, + "grad_norm": 1.5123207569122314, + "learning_rate": 1.877336423465274e-05, + "loss": 1.6164, + "step": 3837 + }, + { + "epoch": 0.20983283900332136, + "grad_norm": 1.5908180475234985, + "learning_rate": 1.8772487183602738e-05, + "loss": 1.485, + "step": 3838 + }, + { + "epoch": 0.2098875114470429, + "grad_norm": 1.8528072834014893, + "learning_rate": 1.8771609839618806e-05, + "loss": 1.4423, + "step": 3839 + }, + { + "epoch": 0.20994218389076447, + "grad_norm": 1.6058169603347778, + "learning_rate": 1.877073220273024e-05, + "loss": 1.3721, + "step": 3840 + }, + { + "epoch": 0.209996856334486, + "grad_norm": 1.5971863269805908, + "learning_rate": 1.8769854272966337e-05, + "loss": 1.1425, + "step": 3841 + }, + { + "epoch": 0.21005152877820757, + "grad_norm": 1.412822961807251, + "learning_rate": 1.8768976050356428e-05, + "loss": 1.4274, + "step": 3842 + }, + { + "epoch": 0.2101062012219291, + "grad_norm": 1.4929964542388916, + "learning_rate": 1.8768097534929827e-05, + "loss": 1.4294, + "step": 3843 + }, + { + "epoch": 0.21016087366565067, + "grad_norm": 1.928521752357483, + "learning_rate": 1.8767218726715876e-05, + "loss": 1.3193, + "step": 3844 + }, + { + "epoch": 0.21021554610937224, + "grad_norm": 1.4784024953842163, + "learning_rate": 1.876633962574392e-05, + "loss": 1.4103, + "step": 3845 + }, + { + "epoch": 0.21027021855309377, + "grad_norm": 1.7809418439865112, + "learning_rate": 1.876546023204331e-05, + "loss": 1.5981, + "step": 3846 + }, + { + "epoch": 0.21032489099681534, + "grad_norm": 1.7177737951278687, + "learning_rate": 1.8764580545643417e-05, + "loss": 1.665, + "step": 3847 + }, + { + "epoch": 0.21037956344053688, + "grad_norm": 1.7991011142730713, + "learning_rate": 1.876370056657361e-05, + "loss": 1.5228, + "step": 3848 + }, + { + "epoch": 0.21043423588425844, + "grad_norm": 1.8756376504898071, + "learning_rate": 1.876282029486328e-05, + "loss": 1.5171, + "step": 3849 + }, + { + "epoch": 0.21048890832797998, + "grad_norm": 1.3800824880599976, + "learning_rate": 1.8761939730541815e-05, + "loss": 1.2853, + "step": 3850 + }, + { + "epoch": 0.21054358077170154, + "grad_norm": 1.9971067905426025, + "learning_rate": 1.876105887363862e-05, + "loss": 1.5618, + "step": 3851 + }, + { + "epoch": 0.2105982532154231, + "grad_norm": 2.039508104324341, + "learning_rate": 1.8760177724183115e-05, + "loss": 1.3583, + "step": 3852 + }, + { + "epoch": 0.21065292565914465, + "grad_norm": 1.4884421825408936, + "learning_rate": 1.8759296282204718e-05, + "loss": 1.4876, + "step": 3853 + }, + { + "epoch": 0.2107075981028662, + "grad_norm": 1.2130941152572632, + "learning_rate": 1.8758414547732864e-05, + "loss": 1.5096, + "step": 3854 + }, + { + "epoch": 0.21076227054658775, + "grad_norm": 1.709412693977356, + "learning_rate": 1.8757532520796993e-05, + "loss": 1.5316, + "step": 3855 + }, + { + "epoch": 0.21081694299030931, + "grad_norm": 1.3295631408691406, + "learning_rate": 1.8756650201426565e-05, + "loss": 1.4871, + "step": 3856 + }, + { + "epoch": 0.21087161543403085, + "grad_norm": 1.5487642288208008, + "learning_rate": 1.8755767589651036e-05, + "loss": 1.2686, + "step": 3857 + }, + { + "epoch": 0.21092628787775242, + "grad_norm": 1.8807179927825928, + "learning_rate": 1.8754884685499887e-05, + "loss": 1.5187, + "step": 3858 + }, + { + "epoch": 0.21098096032147398, + "grad_norm": 1.8202683925628662, + "learning_rate": 1.8754001489002586e-05, + "loss": 1.1948, + "step": 3859 + }, + { + "epoch": 0.21103563276519552, + "grad_norm": 1.3262325525283813, + "learning_rate": 1.875311800018864e-05, + "loss": 1.6146, + "step": 3860 + }, + { + "epoch": 0.21109030520891708, + "grad_norm": 1.5593880414962769, + "learning_rate": 1.8752234219087538e-05, + "loss": 1.6762, + "step": 3861 + }, + { + "epoch": 0.21114497765263862, + "grad_norm": 1.309090495109558, + "learning_rate": 1.87513501457288e-05, + "loss": 1.5761, + "step": 3862 + }, + { + "epoch": 0.2111996500963602, + "grad_norm": 2.055344581604004, + "learning_rate": 1.8750465780141946e-05, + "loss": 1.5108, + "step": 3863 + }, + { + "epoch": 0.21125432254008172, + "grad_norm": 1.5027868747711182, + "learning_rate": 1.8749581122356507e-05, + "loss": 1.3552, + "step": 3864 + }, + { + "epoch": 0.2113089949838033, + "grad_norm": 1.6984754800796509, + "learning_rate": 1.874869617240202e-05, + "loss": 1.5364, + "step": 3865 + }, + { + "epoch": 0.21136366742752485, + "grad_norm": 2.63519287109375, + "learning_rate": 1.874781093030804e-05, + "loss": 1.8302, + "step": 3866 + }, + { + "epoch": 0.2114183398712464, + "grad_norm": 1.2955721616744995, + "learning_rate": 1.8746925396104126e-05, + "loss": 1.5734, + "step": 3867 + }, + { + "epoch": 0.21147301231496796, + "grad_norm": 1.6613651514053345, + "learning_rate": 1.874603956981985e-05, + "loss": 1.3635, + "step": 3868 + }, + { + "epoch": 0.2115276847586895, + "grad_norm": 1.5706926584243774, + "learning_rate": 1.8745153451484786e-05, + "loss": 1.2713, + "step": 3869 + }, + { + "epoch": 0.21158235720241106, + "grad_norm": 1.7969133853912354, + "learning_rate": 1.8744267041128528e-05, + "loss": 1.4701, + "step": 3870 + }, + { + "epoch": 0.2116370296461326, + "grad_norm": 1.843205213546753, + "learning_rate": 1.8743380338780676e-05, + "loss": 1.4239, + "step": 3871 + }, + { + "epoch": 0.21169170208985416, + "grad_norm": 2.675976514816284, + "learning_rate": 1.8742493344470834e-05, + "loss": 1.286, + "step": 3872 + }, + { + "epoch": 0.21174637453357573, + "grad_norm": 1.6896556615829468, + "learning_rate": 1.8741606058228626e-05, + "loss": 1.3348, + "step": 3873 + }, + { + "epoch": 0.21180104697729726, + "grad_norm": 1.218516230583191, + "learning_rate": 1.874071848008368e-05, + "loss": 1.5189, + "step": 3874 + }, + { + "epoch": 0.21185571942101883, + "grad_norm": 1.7413421869277954, + "learning_rate": 1.873983061006563e-05, + "loss": 1.3158, + "step": 3875 + }, + { + "epoch": 0.21191039186474037, + "grad_norm": 1.5045487880706787, + "learning_rate": 1.873894244820413e-05, + "loss": 1.591, + "step": 3876 + }, + { + "epoch": 0.21196506430846193, + "grad_norm": 2.2521607875823975, + "learning_rate": 1.8738053994528835e-05, + "loss": 1.3446, + "step": 3877 + }, + { + "epoch": 0.21201973675218347, + "grad_norm": 1.348280429840088, + "learning_rate": 1.873716524906941e-05, + "loss": 1.3161, + "step": 3878 + }, + { + "epoch": 0.21207440919590503, + "grad_norm": 1.4110714197158813, + "learning_rate": 1.873627621185554e-05, + "loss": 1.5016, + "step": 3879 + }, + { + "epoch": 0.2121290816396266, + "grad_norm": 1.3692959547042847, + "learning_rate": 1.8735386882916904e-05, + "loss": 1.1627, + "step": 3880 + }, + { + "epoch": 0.21218375408334814, + "grad_norm": 1.2424014806747437, + "learning_rate": 1.8734497262283203e-05, + "loss": 1.4953, + "step": 3881 + }, + { + "epoch": 0.2122384265270697, + "grad_norm": 1.7104016542434692, + "learning_rate": 1.873360734998414e-05, + "loss": 1.4545, + "step": 3882 + }, + { + "epoch": 0.21229309897079124, + "grad_norm": 1.4898065328598022, + "learning_rate": 1.8732717146049437e-05, + "loss": 1.5807, + "step": 3883 + }, + { + "epoch": 0.2123477714145128, + "grad_norm": 1.3261539936065674, + "learning_rate": 1.8731826650508812e-05, + "loss": 1.5818, + "step": 3884 + }, + { + "epoch": 0.21240244385823434, + "grad_norm": 1.2264288663864136, + "learning_rate": 1.873093586339201e-05, + "loss": 1.4165, + "step": 3885 + }, + { + "epoch": 0.2124571163019559, + "grad_norm": 1.4789024591445923, + "learning_rate": 1.8730044784728767e-05, + "loss": 1.4911, + "step": 3886 + }, + { + "epoch": 0.21251178874567747, + "grad_norm": 1.755315899848938, + "learning_rate": 1.8729153414548843e-05, + "loss": 1.4921, + "step": 3887 + }, + { + "epoch": 0.212566461189399, + "grad_norm": 1.3631881475448608, + "learning_rate": 1.8728261752882008e-05, + "loss": 1.456, + "step": 3888 + }, + { + "epoch": 0.21262113363312057, + "grad_norm": 1.4321449995040894, + "learning_rate": 1.8727369799758027e-05, + "loss": 1.5374, + "step": 3889 + }, + { + "epoch": 0.2126758060768421, + "grad_norm": 1.442763328552246, + "learning_rate": 1.872647755520669e-05, + "loss": 1.3912, + "step": 3890 + }, + { + "epoch": 0.21273047852056368, + "grad_norm": 1.3308640718460083, + "learning_rate": 1.8725585019257794e-05, + "loss": 1.8544, + "step": 3891 + }, + { + "epoch": 0.2127851509642852, + "grad_norm": 1.659828782081604, + "learning_rate": 1.8724692191941134e-05, + "loss": 1.473, + "step": 3892 + }, + { + "epoch": 0.21283982340800678, + "grad_norm": 1.5849217176437378, + "learning_rate": 1.872379907328653e-05, + "loss": 1.6075, + "step": 3893 + }, + { + "epoch": 0.21289449585172834, + "grad_norm": 1.8133950233459473, + "learning_rate": 1.8722905663323804e-05, + "loss": 1.3993, + "step": 3894 + }, + { + "epoch": 0.21294916829544988, + "grad_norm": 1.3484435081481934, + "learning_rate": 1.872201196208279e-05, + "loss": 1.7017, + "step": 3895 + }, + { + "epoch": 0.21300384073917145, + "grad_norm": 1.4784860610961914, + "learning_rate": 1.872111796959333e-05, + "loss": 1.3981, + "step": 3896 + }, + { + "epoch": 0.21305851318289298, + "grad_norm": 1.334078073501587, + "learning_rate": 1.8720223685885275e-05, + "loss": 1.6572, + "step": 3897 + }, + { + "epoch": 0.21311318562661455, + "grad_norm": 1.883331298828125, + "learning_rate": 1.8719329110988487e-05, + "loss": 1.2391, + "step": 3898 + }, + { + "epoch": 0.21316785807033609, + "grad_norm": 1.5497126579284668, + "learning_rate": 1.871843424493284e-05, + "loss": 1.2707, + "step": 3899 + }, + { + "epoch": 0.21322253051405765, + "grad_norm": 1.4371204376220703, + "learning_rate": 1.8717539087748217e-05, + "loss": 1.4896, + "step": 3900 + }, + { + "epoch": 0.21327720295777922, + "grad_norm": 1.603821039199829, + "learning_rate": 1.871664363946451e-05, + "loss": 1.534, + "step": 3901 + }, + { + "epoch": 0.21333187540150075, + "grad_norm": 1.5007641315460205, + "learning_rate": 1.8715747900111613e-05, + "loss": 1.5556, + "step": 3902 + }, + { + "epoch": 0.21338654784522232, + "grad_norm": 1.2610780000686646, + "learning_rate": 1.8714851869719443e-05, + "loss": 1.4494, + "step": 3903 + }, + { + "epoch": 0.21344122028894386, + "grad_norm": 1.6659702062606812, + "learning_rate": 1.871395554831792e-05, + "loss": 1.285, + "step": 3904 + }, + { + "epoch": 0.21349589273266542, + "grad_norm": 1.5946037769317627, + "learning_rate": 1.871305893593697e-05, + "loss": 1.502, + "step": 3905 + }, + { + "epoch": 0.21355056517638696, + "grad_norm": 1.7122091054916382, + "learning_rate": 1.8712162032606536e-05, + "loss": 1.4466, + "step": 3906 + }, + { + "epoch": 0.21360523762010852, + "grad_norm": 1.983314037322998, + "learning_rate": 1.871126483835657e-05, + "loss": 1.3145, + "step": 3907 + }, + { + "epoch": 0.2136599100638301, + "grad_norm": 1.4200639724731445, + "learning_rate": 1.8710367353217034e-05, + "loss": 1.6195, + "step": 3908 + }, + { + "epoch": 0.21371458250755163, + "grad_norm": 1.5105574131011963, + "learning_rate": 1.8709469577217886e-05, + "loss": 1.5614, + "step": 3909 + }, + { + "epoch": 0.2137692549512732, + "grad_norm": 1.3138463497161865, + "learning_rate": 1.8708571510389114e-05, + "loss": 1.4031, + "step": 3910 + }, + { + "epoch": 0.21382392739499473, + "grad_norm": 1.523833155632019, + "learning_rate": 1.8707673152760705e-05, + "loss": 1.4935, + "step": 3911 + }, + { + "epoch": 0.2138785998387163, + "grad_norm": 1.8330804109573364, + "learning_rate": 1.8706774504362655e-05, + "loss": 1.5344, + "step": 3912 + }, + { + "epoch": 0.21393327228243783, + "grad_norm": 1.9353002309799194, + "learning_rate": 1.8705875565224975e-05, + "loss": 1.5087, + "step": 3913 + }, + { + "epoch": 0.2139879447261594, + "grad_norm": 1.482541799545288, + "learning_rate": 1.8704976335377677e-05, + "loss": 1.3099, + "step": 3914 + }, + { + "epoch": 0.21404261716988096, + "grad_norm": 1.6821671724319458, + "learning_rate": 1.8704076814850795e-05, + "loss": 1.3661, + "step": 3915 + }, + { + "epoch": 0.2140972896136025, + "grad_norm": 1.4940242767333984, + "learning_rate": 1.8703177003674362e-05, + "loss": 1.3794, + "step": 3916 + }, + { + "epoch": 0.21415196205732406, + "grad_norm": 1.3394962549209595, + "learning_rate": 1.870227690187843e-05, + "loss": 1.5178, + "step": 3917 + }, + { + "epoch": 0.2142066345010456, + "grad_norm": 1.5903531312942505, + "learning_rate": 1.8701376509493046e-05, + "loss": 1.3523, + "step": 3918 + }, + { + "epoch": 0.21426130694476717, + "grad_norm": 1.2602788209915161, + "learning_rate": 1.8700475826548285e-05, + "loss": 1.3922, + "step": 3919 + }, + { + "epoch": 0.2143159793884887, + "grad_norm": 1.2317355871200562, + "learning_rate": 1.8699574853074222e-05, + "loss": 1.3928, + "step": 3920 + }, + { + "epoch": 0.21437065183221027, + "grad_norm": 1.4783098697662354, + "learning_rate": 1.8698673589100936e-05, + "loss": 1.6931, + "step": 3921 + }, + { + "epoch": 0.21442532427593183, + "grad_norm": 1.3645853996276855, + "learning_rate": 1.8697772034658527e-05, + "loss": 1.6388, + "step": 3922 + }, + { + "epoch": 0.21447999671965337, + "grad_norm": 1.3776652812957764, + "learning_rate": 1.8696870189777107e-05, + "loss": 1.4913, + "step": 3923 + }, + { + "epoch": 0.21453466916337494, + "grad_norm": 1.4314239025115967, + "learning_rate": 1.8695968054486774e-05, + "loss": 1.6809, + "step": 3924 + }, + { + "epoch": 0.21458934160709647, + "grad_norm": 1.3555632829666138, + "learning_rate": 1.8695065628817667e-05, + "loss": 1.5946, + "step": 3925 + }, + { + "epoch": 0.21464401405081804, + "grad_norm": 1.6254193782806396, + "learning_rate": 1.8694162912799917e-05, + "loss": 1.4564, + "step": 3926 + }, + { + "epoch": 0.2146986864945396, + "grad_norm": 1.4660731554031372, + "learning_rate": 1.8693259906463663e-05, + "loss": 1.4982, + "step": 3927 + }, + { + "epoch": 0.21475335893826114, + "grad_norm": 1.3562361001968384, + "learning_rate": 1.869235660983906e-05, + "loss": 1.4761, + "step": 3928 + }, + { + "epoch": 0.2148080313819827, + "grad_norm": 1.6004817485809326, + "learning_rate": 1.8691453022956274e-05, + "loss": 1.3278, + "step": 3929 + }, + { + "epoch": 0.21486270382570424, + "grad_norm": 1.5348089933395386, + "learning_rate": 1.8690549145845474e-05, + "loss": 1.3055, + "step": 3930 + }, + { + "epoch": 0.2149173762694258, + "grad_norm": 1.3425304889678955, + "learning_rate": 1.8689644978536847e-05, + "loss": 1.6299, + "step": 3931 + }, + { + "epoch": 0.21497204871314735, + "grad_norm": 1.730008840560913, + "learning_rate": 1.8688740521060587e-05, + "loss": 1.2915, + "step": 3932 + }, + { + "epoch": 0.2150267211568689, + "grad_norm": 1.5201696157455444, + "learning_rate": 1.868783577344689e-05, + "loss": 1.281, + "step": 3933 + }, + { + "epoch": 0.21508139360059048, + "grad_norm": 1.4953116178512573, + "learning_rate": 1.8686930735725965e-05, + "loss": 1.3675, + "step": 3934 + }, + { + "epoch": 0.215136066044312, + "grad_norm": 1.4110198020935059, + "learning_rate": 1.868602540792804e-05, + "loss": 1.3242, + "step": 3935 + }, + { + "epoch": 0.21519073848803358, + "grad_norm": 1.6768079996109009, + "learning_rate": 1.8685119790083348e-05, + "loss": 1.6604, + "step": 3936 + }, + { + "epoch": 0.21524541093175512, + "grad_norm": 1.2883009910583496, + "learning_rate": 1.8684213882222123e-05, + "loss": 1.5202, + "step": 3937 + }, + { + "epoch": 0.21530008337547668, + "grad_norm": 1.6103432178497314, + "learning_rate": 1.8683307684374622e-05, + "loss": 1.4852, + "step": 3938 + }, + { + "epoch": 0.21535475581919822, + "grad_norm": 1.593085527420044, + "learning_rate": 1.8682401196571097e-05, + "loss": 1.5037, + "step": 3939 + }, + { + "epoch": 0.21540942826291978, + "grad_norm": 1.6174181699752808, + "learning_rate": 1.8681494418841825e-05, + "loss": 1.3747, + "step": 3940 + }, + { + "epoch": 0.21546410070664135, + "grad_norm": 1.42770516872406, + "learning_rate": 1.8680587351217082e-05, + "loss": 1.4846, + "step": 3941 + }, + { + "epoch": 0.21551877315036289, + "grad_norm": 1.6266257762908936, + "learning_rate": 1.8679679993727157e-05, + "loss": 1.1968, + "step": 3942 + }, + { + "epoch": 0.21557344559408445, + "grad_norm": 1.7312958240509033, + "learning_rate": 1.867877234640235e-05, + "loss": 1.612, + "step": 3943 + }, + { + "epoch": 0.215628118037806, + "grad_norm": 1.3667455911636353, + "learning_rate": 1.867786440927297e-05, + "loss": 1.3759, + "step": 3944 + }, + { + "epoch": 0.21568279048152755, + "grad_norm": 1.4255167245864868, + "learning_rate": 1.867695618236933e-05, + "loss": 1.42, + "step": 3945 + }, + { + "epoch": 0.2157374629252491, + "grad_norm": 1.8455313444137573, + "learning_rate": 1.8676047665721764e-05, + "loss": 1.5704, + "step": 3946 + }, + { + "epoch": 0.21579213536897066, + "grad_norm": 1.6155993938446045, + "learning_rate": 1.867513885936061e-05, + "loss": 1.5455, + "step": 3947 + }, + { + "epoch": 0.21584680781269222, + "grad_norm": 1.6048946380615234, + "learning_rate": 1.867422976331621e-05, + "loss": 1.569, + "step": 3948 + }, + { + "epoch": 0.21590148025641376, + "grad_norm": 1.8236949443817139, + "learning_rate": 1.8673320377618927e-05, + "loss": 1.4383, + "step": 3949 + }, + { + "epoch": 0.21595615270013532, + "grad_norm": 1.3222742080688477, + "learning_rate": 1.8672410702299118e-05, + "loss": 1.6141, + "step": 3950 + }, + { + "epoch": 0.21601082514385686, + "grad_norm": 1.29142427444458, + "learning_rate": 1.867150073738717e-05, + "loss": 1.5369, + "step": 3951 + }, + { + "epoch": 0.21606549758757843, + "grad_norm": 1.268871784210205, + "learning_rate": 1.8670590482913463e-05, + "loss": 1.4661, + "step": 3952 + }, + { + "epoch": 0.21612017003129996, + "grad_norm": 1.4868649244308472, + "learning_rate": 1.8669679938908393e-05, + "loss": 1.472, + "step": 3953 + }, + { + "epoch": 0.21617484247502153, + "grad_norm": 1.5293079614639282, + "learning_rate": 1.8668769105402366e-05, + "loss": 1.6885, + "step": 3954 + }, + { + "epoch": 0.2162295149187431, + "grad_norm": 1.9149553775787354, + "learning_rate": 1.8667857982425797e-05, + "loss": 1.2573, + "step": 3955 + }, + { + "epoch": 0.21628418736246463, + "grad_norm": 1.4978835582733154, + "learning_rate": 1.866694657000911e-05, + "loss": 1.4575, + "step": 3956 + }, + { + "epoch": 0.2163388598061862, + "grad_norm": 1.42157781124115, + "learning_rate": 1.866603486818274e-05, + "loss": 1.4912, + "step": 3957 + }, + { + "epoch": 0.21639353224990773, + "grad_norm": 1.7389081716537476, + "learning_rate": 1.866512287697713e-05, + "loss": 1.3168, + "step": 3958 + }, + { + "epoch": 0.2164482046936293, + "grad_norm": 1.6358532905578613, + "learning_rate": 1.8664210596422733e-05, + "loss": 1.2491, + "step": 3959 + }, + { + "epoch": 0.21650287713735084, + "grad_norm": 1.3682252168655396, + "learning_rate": 1.8663298026550013e-05, + "loss": 1.4866, + "step": 3960 + }, + { + "epoch": 0.2165575495810724, + "grad_norm": 1.4850088357925415, + "learning_rate": 1.8662385167389443e-05, + "loss": 1.4046, + "step": 3961 + }, + { + "epoch": 0.21661222202479397, + "grad_norm": 1.3496167659759521, + "learning_rate": 1.8661472018971506e-05, + "loss": 1.3375, + "step": 3962 + }, + { + "epoch": 0.2166668944685155, + "grad_norm": 1.4392454624176025, + "learning_rate": 1.8660558581326695e-05, + "loss": 1.5269, + "step": 3963 + }, + { + "epoch": 0.21672156691223707, + "grad_norm": 1.298045039176941, + "learning_rate": 1.8659644854485506e-05, + "loss": 1.2326, + "step": 3964 + }, + { + "epoch": 0.2167762393559586, + "grad_norm": 1.434424877166748, + "learning_rate": 1.8658730838478457e-05, + "loss": 1.4268, + "step": 3965 + }, + { + "epoch": 0.21683091179968017, + "grad_norm": 1.3708200454711914, + "learning_rate": 1.8657816533336067e-05, + "loss": 1.619, + "step": 3966 + }, + { + "epoch": 0.2168855842434017, + "grad_norm": 1.5389105081558228, + "learning_rate": 1.8656901939088868e-05, + "loss": 1.5715, + "step": 3967 + }, + { + "epoch": 0.21694025668712327, + "grad_norm": 2.164813995361328, + "learning_rate": 1.8655987055767396e-05, + "loss": 1.5019, + "step": 3968 + }, + { + "epoch": 0.21699492913084484, + "grad_norm": 1.5962748527526855, + "learning_rate": 1.865507188340221e-05, + "loss": 1.4723, + "step": 3969 + }, + { + "epoch": 0.21704960157456638, + "grad_norm": 1.5357075929641724, + "learning_rate": 1.865415642202386e-05, + "loss": 1.4031, + "step": 3970 + }, + { + "epoch": 0.21710427401828794, + "grad_norm": 1.6241998672485352, + "learning_rate": 1.8653240671662916e-05, + "loss": 1.4958, + "step": 3971 + }, + { + "epoch": 0.21715894646200948, + "grad_norm": 1.476944923400879, + "learning_rate": 1.8652324632349967e-05, + "loss": 1.4332, + "step": 3972 + }, + { + "epoch": 0.21721361890573104, + "grad_norm": 1.542913556098938, + "learning_rate": 1.8651408304115593e-05, + "loss": 1.5045, + "step": 3973 + }, + { + "epoch": 0.21726829134945258, + "grad_norm": 1.495831847190857, + "learning_rate": 1.8650491686990394e-05, + "loss": 1.3286, + "step": 3974 + }, + { + "epoch": 0.21732296379317415, + "grad_norm": 1.3063085079193115, + "learning_rate": 1.864957478100498e-05, + "loss": 1.4574, + "step": 3975 + }, + { + "epoch": 0.2173776362368957, + "grad_norm": 2.0100982189178467, + "learning_rate": 1.8648657586189963e-05, + "loss": 1.5685, + "step": 3976 + }, + { + "epoch": 0.21743230868061725, + "grad_norm": 1.6866424083709717, + "learning_rate": 1.864774010257598e-05, + "loss": 1.3354, + "step": 3977 + }, + { + "epoch": 0.2174869811243388, + "grad_norm": 1.3564430475234985, + "learning_rate": 1.864682233019366e-05, + "loss": 1.5362, + "step": 3978 + }, + { + "epoch": 0.21754165356806035, + "grad_norm": 1.4353578090667725, + "learning_rate": 1.864590426907365e-05, + "loss": 1.4648, + "step": 3979 + }, + { + "epoch": 0.21759632601178192, + "grad_norm": 1.8707903623580933, + "learning_rate": 1.8644985919246613e-05, + "loss": 1.9916, + "step": 3980 + }, + { + "epoch": 0.21765099845550345, + "grad_norm": 1.1980676651000977, + "learning_rate": 1.8644067280743202e-05, + "loss": 1.3566, + "step": 3981 + }, + { + "epoch": 0.21770567089922502, + "grad_norm": 1.657166600227356, + "learning_rate": 1.864314835359411e-05, + "loss": 1.5351, + "step": 3982 + }, + { + "epoch": 0.21776034334294658, + "grad_norm": 1.5567257404327393, + "learning_rate": 1.864222913783001e-05, + "loss": 1.4845, + "step": 3983 + }, + { + "epoch": 0.21781501578666812, + "grad_norm": 1.7092028856277466, + "learning_rate": 1.8641309633481594e-05, + "loss": 1.7914, + "step": 3984 + }, + { + "epoch": 0.21786968823038969, + "grad_norm": 1.285555124282837, + "learning_rate": 1.8640389840579578e-05, + "loss": 1.458, + "step": 3985 + }, + { + "epoch": 0.21792436067411122, + "grad_norm": 1.5657744407653809, + "learning_rate": 1.8639469759154667e-05, + "loss": 1.8579, + "step": 3986 + }, + { + "epoch": 0.2179790331178328, + "grad_norm": 1.6224007606506348, + "learning_rate": 1.8638549389237587e-05, + "loss": 1.5306, + "step": 3987 + }, + { + "epoch": 0.21803370556155433, + "grad_norm": 1.5154614448547363, + "learning_rate": 1.863762873085907e-05, + "loss": 1.8442, + "step": 3988 + }, + { + "epoch": 0.2180883780052759, + "grad_norm": 1.5727219581604004, + "learning_rate": 1.8636707784049867e-05, + "loss": 1.4279, + "step": 3989 + }, + { + "epoch": 0.21814305044899746, + "grad_norm": 1.5372064113616943, + "learning_rate": 1.863578654884072e-05, + "loss": 1.4746, + "step": 3990 + }, + { + "epoch": 0.218197722892719, + "grad_norm": 1.636034607887268, + "learning_rate": 1.86348650252624e-05, + "loss": 1.3746, + "step": 3991 + }, + { + "epoch": 0.21825239533644056, + "grad_norm": 1.3596569299697876, + "learning_rate": 1.863394321334567e-05, + "loss": 1.4995, + "step": 3992 + }, + { + "epoch": 0.2183070677801621, + "grad_norm": 1.655949354171753, + "learning_rate": 1.8633021113121318e-05, + "loss": 1.3925, + "step": 3993 + }, + { + "epoch": 0.21836174022388366, + "grad_norm": 2.605494976043701, + "learning_rate": 1.8632098724620134e-05, + "loss": 1.5811, + "step": 3994 + }, + { + "epoch": 0.2184164126676052, + "grad_norm": 1.529522180557251, + "learning_rate": 1.8631176047872913e-05, + "loss": 1.4686, + "step": 3995 + }, + { + "epoch": 0.21847108511132676, + "grad_norm": 1.6003497838974, + "learning_rate": 1.8630253082910473e-05, + "loss": 1.2323, + "step": 3996 + }, + { + "epoch": 0.21852575755504833, + "grad_norm": 1.6311876773834229, + "learning_rate": 1.862932982976363e-05, + "loss": 1.4683, + "step": 3997 + }, + { + "epoch": 0.21858042999876987, + "grad_norm": 1.5706807374954224, + "learning_rate": 1.862840628846322e-05, + "loss": 1.3315, + "step": 3998 + }, + { + "epoch": 0.21863510244249143, + "grad_norm": 1.659889817237854, + "learning_rate": 1.8627482459040068e-05, + "loss": 1.4931, + "step": 3999 + }, + { + "epoch": 0.21868977488621297, + "grad_norm": 1.4956809282302856, + "learning_rate": 1.8626558341525037e-05, + "loss": 1.4593, + "step": 4000 + }, + { + "epoch": 0.21874444732993453, + "grad_norm": 1.63871169090271, + "learning_rate": 1.862563393594898e-05, + "loss": 1.5139, + "step": 4001 + }, + { + "epoch": 0.21879911977365607, + "grad_norm": 1.2603493928909302, + "learning_rate": 1.8624709242342766e-05, + "loss": 1.5207, + "step": 4002 + }, + { + "epoch": 0.21885379221737764, + "grad_norm": 1.8210152387619019, + "learning_rate": 1.8623784260737272e-05, + "loss": 1.4756, + "step": 4003 + }, + { + "epoch": 0.2189084646610992, + "grad_norm": 1.4417670965194702, + "learning_rate": 1.8622858991163385e-05, + "loss": 1.6553, + "step": 4004 + }, + { + "epoch": 0.21896313710482074, + "grad_norm": 1.4474496841430664, + "learning_rate": 1.8621933433652e-05, + "loss": 1.6307, + "step": 4005 + }, + { + "epoch": 0.2190178095485423, + "grad_norm": 1.6808476448059082, + "learning_rate": 1.8621007588234023e-05, + "loss": 1.459, + "step": 4006 + }, + { + "epoch": 0.21907248199226384, + "grad_norm": 1.2198479175567627, + "learning_rate": 1.862008145494038e-05, + "loss": 1.3469, + "step": 4007 + }, + { + "epoch": 0.2191271544359854, + "grad_norm": 1.4048058986663818, + "learning_rate": 1.8619155033801985e-05, + "loss": 1.2728, + "step": 4008 + }, + { + "epoch": 0.21918182687970694, + "grad_norm": 1.621429681777954, + "learning_rate": 1.8618228324849777e-05, + "loss": 1.5377, + "step": 4009 + }, + { + "epoch": 0.2192364993234285, + "grad_norm": 1.4781869649887085, + "learning_rate": 1.8617301328114704e-05, + "loss": 1.6936, + "step": 4010 + }, + { + "epoch": 0.21929117176715007, + "grad_norm": 1.9561161994934082, + "learning_rate": 1.861637404362772e-05, + "loss": 1.2028, + "step": 4011 + }, + { + "epoch": 0.2193458442108716, + "grad_norm": 1.5301505327224731, + "learning_rate": 1.8615446471419786e-05, + "loss": 1.5268, + "step": 4012 + }, + { + "epoch": 0.21940051665459318, + "grad_norm": 1.3320649862289429, + "learning_rate": 1.8614518611521876e-05, + "loss": 1.4915, + "step": 4013 + }, + { + "epoch": 0.2194551890983147, + "grad_norm": 2.186134099960327, + "learning_rate": 1.8613590463964977e-05, + "loss": 1.4582, + "step": 4014 + }, + { + "epoch": 0.21950986154203628, + "grad_norm": 1.4840922355651855, + "learning_rate": 1.861266202878008e-05, + "loss": 1.3614, + "step": 4015 + }, + { + "epoch": 0.21956453398575781, + "grad_norm": 1.9686182737350464, + "learning_rate": 1.8611733305998188e-05, + "loss": 1.3953, + "step": 4016 + }, + { + "epoch": 0.21961920642947938, + "grad_norm": 1.2836674451828003, + "learning_rate": 1.861080429565031e-05, + "loss": 1.5082, + "step": 4017 + }, + { + "epoch": 0.21967387887320095, + "grad_norm": 1.37799870967865, + "learning_rate": 1.8609874997767473e-05, + "loss": 1.5465, + "step": 4018 + }, + { + "epoch": 0.21972855131692248, + "grad_norm": 1.491904854774475, + "learning_rate": 1.8608945412380705e-05, + "loss": 1.5094, + "step": 4019 + }, + { + "epoch": 0.21978322376064405, + "grad_norm": 1.8426976203918457, + "learning_rate": 1.860801553952105e-05, + "loss": 1.4208, + "step": 4020 + }, + { + "epoch": 0.21983789620436558, + "grad_norm": 3.4984021186828613, + "learning_rate": 1.8607085379219557e-05, + "loss": 1.48, + "step": 4021 + }, + { + "epoch": 0.21989256864808715, + "grad_norm": 1.1481024026870728, + "learning_rate": 1.8606154931507285e-05, + "loss": 1.5929, + "step": 4022 + }, + { + "epoch": 0.2199472410918087, + "grad_norm": 1.660731554031372, + "learning_rate": 1.8605224196415304e-05, + "loss": 1.5259, + "step": 4023 + }, + { + "epoch": 0.22000191353553025, + "grad_norm": 1.5906223058700562, + "learning_rate": 1.8604293173974694e-05, + "loss": 1.5038, + "step": 4024 + }, + { + "epoch": 0.22005658597925182, + "grad_norm": 1.97526216506958, + "learning_rate": 1.8603361864216544e-05, + "loss": 1.3961, + "step": 4025 + }, + { + "epoch": 0.22011125842297335, + "grad_norm": 1.2836432456970215, + "learning_rate": 1.8602430267171954e-05, + "loss": 1.4622, + "step": 4026 + }, + { + "epoch": 0.22016593086669492, + "grad_norm": 1.3526153564453125, + "learning_rate": 1.860149838287203e-05, + "loss": 1.3898, + "step": 4027 + }, + { + "epoch": 0.22022060331041646, + "grad_norm": 1.6382087469100952, + "learning_rate": 1.8600566211347897e-05, + "loss": 1.4202, + "step": 4028 + }, + { + "epoch": 0.22027527575413802, + "grad_norm": 1.7087103128433228, + "learning_rate": 1.859963375263067e-05, + "loss": 1.4172, + "step": 4029 + }, + { + "epoch": 0.2203299481978596, + "grad_norm": 1.2201690673828125, + "learning_rate": 1.8598701006751494e-05, + "loss": 1.5993, + "step": 4030 + }, + { + "epoch": 0.22038462064158112, + "grad_norm": 1.825149416923523, + "learning_rate": 1.8597767973741514e-05, + "loss": 1.6193, + "step": 4031 + }, + { + "epoch": 0.2204392930853027, + "grad_norm": 1.6027792692184448, + "learning_rate": 1.8596834653631887e-05, + "loss": 1.2895, + "step": 4032 + }, + { + "epoch": 0.22049396552902423, + "grad_norm": 1.5435138940811157, + "learning_rate": 1.8595901046453776e-05, + "loss": 1.3934, + "step": 4033 + }, + { + "epoch": 0.2205486379727458, + "grad_norm": 1.6178869009017944, + "learning_rate": 1.859496715223836e-05, + "loss": 1.479, + "step": 4034 + }, + { + "epoch": 0.22060331041646733, + "grad_norm": 1.645349383354187, + "learning_rate": 1.8594032971016818e-05, + "loss": 1.4896, + "step": 4035 + }, + { + "epoch": 0.2206579828601889, + "grad_norm": 1.9128869771957397, + "learning_rate": 1.859309850282035e-05, + "loss": 1.4982, + "step": 4036 + }, + { + "epoch": 0.22071265530391046, + "grad_norm": 1.8596012592315674, + "learning_rate": 1.8592163747680164e-05, + "loss": 1.735, + "step": 4037 + }, + { + "epoch": 0.220767327747632, + "grad_norm": 1.298614501953125, + "learning_rate": 1.8591228705627464e-05, + "loss": 1.5233, + "step": 4038 + }, + { + "epoch": 0.22082200019135356, + "grad_norm": 1.8642632961273193, + "learning_rate": 1.8590293376693476e-05, + "loss": 1.4035, + "step": 4039 + }, + { + "epoch": 0.2208766726350751, + "grad_norm": 1.7757574319839478, + "learning_rate": 1.858935776090944e-05, + "loss": 1.3906, + "step": 4040 + }, + { + "epoch": 0.22093134507879666, + "grad_norm": 1.5550702810287476, + "learning_rate": 1.8588421858306587e-05, + "loss": 1.6075, + "step": 4041 + }, + { + "epoch": 0.2209860175225182, + "grad_norm": 1.5963608026504517, + "learning_rate": 1.8587485668916176e-05, + "loss": 1.4772, + "step": 4042 + }, + { + "epoch": 0.22104068996623977, + "grad_norm": 1.5297757387161255, + "learning_rate": 1.858654919276947e-05, + "loss": 1.4242, + "step": 4043 + }, + { + "epoch": 0.22109536240996133, + "grad_norm": 1.3001378774642944, + "learning_rate": 1.8585612429897735e-05, + "loss": 1.567, + "step": 4044 + }, + { + "epoch": 0.22115003485368287, + "grad_norm": 1.2644850015640259, + "learning_rate": 1.8584675380332254e-05, + "loss": 1.358, + "step": 4045 + }, + { + "epoch": 0.22120470729740443, + "grad_norm": 1.4374418258666992, + "learning_rate": 1.858373804410432e-05, + "loss": 1.4307, + "step": 4046 + }, + { + "epoch": 0.22125937974112597, + "grad_norm": 1.7227566242218018, + "learning_rate": 1.8582800421245227e-05, + "loss": 1.4084, + "step": 4047 + }, + { + "epoch": 0.22131405218484754, + "grad_norm": 2.165661334991455, + "learning_rate": 1.858186251178629e-05, + "loss": 1.1748, + "step": 4048 + }, + { + "epoch": 0.22136872462856907, + "grad_norm": 1.6709437370300293, + "learning_rate": 1.8580924315758825e-05, + "loss": 1.4067, + "step": 4049 + }, + { + "epoch": 0.22142339707229064, + "grad_norm": 1.4159249067306519, + "learning_rate": 1.8579985833194164e-05, + "loss": 1.4233, + "step": 4050 + }, + { + "epoch": 0.2214780695160122, + "grad_norm": 1.5867873430252075, + "learning_rate": 1.8579047064123638e-05, + "loss": 1.2635, + "step": 4051 + }, + { + "epoch": 0.22153274195973374, + "grad_norm": 1.5867301225662231, + "learning_rate": 1.8578108008578603e-05, + "loss": 1.2925, + "step": 4052 + }, + { + "epoch": 0.2215874144034553, + "grad_norm": 1.8247334957122803, + "learning_rate": 1.857716866659041e-05, + "loss": 1.5694, + "step": 4053 + }, + { + "epoch": 0.22164208684717684, + "grad_norm": 1.18817138671875, + "learning_rate": 1.857622903819043e-05, + "loss": 1.3748, + "step": 4054 + }, + { + "epoch": 0.2216967592908984, + "grad_norm": 1.3081796169281006, + "learning_rate": 1.8575289123410035e-05, + "loss": 1.37, + "step": 4055 + }, + { + "epoch": 0.22175143173461995, + "grad_norm": 1.7706154584884644, + "learning_rate": 1.8574348922280617e-05, + "loss": 1.2657, + "step": 4056 + }, + { + "epoch": 0.2218061041783415, + "grad_norm": 1.247761845588684, + "learning_rate": 1.8573408434833565e-05, + "loss": 1.3186, + "step": 4057 + }, + { + "epoch": 0.22186077662206308, + "grad_norm": 1.4731206893920898, + "learning_rate": 1.857246766110029e-05, + "loss": 1.3826, + "step": 4058 + }, + { + "epoch": 0.22191544906578461, + "grad_norm": 1.3569303750991821, + "learning_rate": 1.8571526601112202e-05, + "loss": 1.5396, + "step": 4059 + }, + { + "epoch": 0.22197012150950618, + "grad_norm": 1.3613594770431519, + "learning_rate": 1.857058525490073e-05, + "loss": 1.4305, + "step": 4060 + }, + { + "epoch": 0.22202479395322772, + "grad_norm": 1.3704286813735962, + "learning_rate": 1.85696436224973e-05, + "loss": 1.375, + "step": 4061 + }, + { + "epoch": 0.22207946639694928, + "grad_norm": 1.6464284658432007, + "learning_rate": 1.856870170393336e-05, + "loss": 1.423, + "step": 4062 + }, + { + "epoch": 0.22213413884067082, + "grad_norm": 1.5958828926086426, + "learning_rate": 1.856775949924037e-05, + "loss": 1.4124, + "step": 4063 + }, + { + "epoch": 0.22218881128439238, + "grad_norm": 1.3663609027862549, + "learning_rate": 1.856681700844978e-05, + "loss": 1.5137, + "step": 4064 + }, + { + "epoch": 0.22224348372811395, + "grad_norm": 1.410372257232666, + "learning_rate": 1.8565874231593065e-05, + "loss": 1.4673, + "step": 4065 + }, + { + "epoch": 0.2222981561718355, + "grad_norm": 1.2481476068496704, + "learning_rate": 1.8564931168701713e-05, + "loss": 1.5388, + "step": 4066 + }, + { + "epoch": 0.22235282861555705, + "grad_norm": 1.5208345651626587, + "learning_rate": 1.8563987819807213e-05, + "loss": 1.4967, + "step": 4067 + }, + { + "epoch": 0.2224075010592786, + "grad_norm": 1.662079095840454, + "learning_rate": 1.856304418494106e-05, + "loss": 1.4896, + "step": 4068 + }, + { + "epoch": 0.22246217350300015, + "grad_norm": 1.6828900575637817, + "learning_rate": 1.856210026413477e-05, + "loss": 1.5099, + "step": 4069 + }, + { + "epoch": 0.2225168459467217, + "grad_norm": 1.657701849937439, + "learning_rate": 1.856115605741986e-05, + "loss": 1.5106, + "step": 4070 + }, + { + "epoch": 0.22257151839044326, + "grad_norm": 1.542000651359558, + "learning_rate": 1.856021156482786e-05, + "loss": 1.5415, + "step": 4071 + }, + { + "epoch": 0.22262619083416482, + "grad_norm": 2.0495047569274902, + "learning_rate": 1.8559266786390305e-05, + "loss": 1.2585, + "step": 4072 + }, + { + "epoch": 0.22268086327788636, + "grad_norm": 1.4696571826934814, + "learning_rate": 1.855832172213875e-05, + "loss": 1.4693, + "step": 4073 + }, + { + "epoch": 0.22273553572160792, + "grad_norm": 1.323722243309021, + "learning_rate": 1.8557376372104752e-05, + "loss": 1.518, + "step": 4074 + }, + { + "epoch": 0.22279020816532946, + "grad_norm": 1.728509783744812, + "learning_rate": 1.8556430736319876e-05, + "loss": 1.4307, + "step": 4075 + }, + { + "epoch": 0.22284488060905103, + "grad_norm": 1.6926530599594116, + "learning_rate": 1.8555484814815695e-05, + "loss": 1.3656, + "step": 4076 + }, + { + "epoch": 0.22289955305277256, + "grad_norm": 1.6160452365875244, + "learning_rate": 1.8554538607623805e-05, + "loss": 1.524, + "step": 4077 + }, + { + "epoch": 0.22295422549649413, + "grad_norm": 2.1711409091949463, + "learning_rate": 1.8553592114775796e-05, + "loss": 1.5176, + "step": 4078 + }, + { + "epoch": 0.2230088979402157, + "grad_norm": 1.5396027565002441, + "learning_rate": 1.8552645336303274e-05, + "loss": 1.5093, + "step": 4079 + }, + { + "epoch": 0.22306357038393723, + "grad_norm": 1.4191066026687622, + "learning_rate": 1.855169827223785e-05, + "loss": 1.4971, + "step": 4080 + }, + { + "epoch": 0.2231182428276588, + "grad_norm": 1.4821096658706665, + "learning_rate": 1.855075092261116e-05, + "loss": 1.463, + "step": 4081 + }, + { + "epoch": 0.22317291527138033, + "grad_norm": 2.1103017330169678, + "learning_rate": 1.8549803287454832e-05, + "loss": 1.4933, + "step": 4082 + }, + { + "epoch": 0.2232275877151019, + "grad_norm": 1.5980713367462158, + "learning_rate": 1.8548855366800508e-05, + "loss": 1.3928, + "step": 4083 + }, + { + "epoch": 0.22328226015882344, + "grad_norm": 1.5037323236465454, + "learning_rate": 1.854790716067984e-05, + "loss": 1.471, + "step": 4084 + }, + { + "epoch": 0.223336932602545, + "grad_norm": 1.8640247583389282, + "learning_rate": 1.8546958669124494e-05, + "loss": 1.6913, + "step": 4085 + }, + { + "epoch": 0.22339160504626657, + "grad_norm": 1.126706838607788, + "learning_rate": 1.8546009892166145e-05, + "loss": 1.6246, + "step": 4086 + }, + { + "epoch": 0.2234462774899881, + "grad_norm": 1.2092376947402954, + "learning_rate": 1.854506082983647e-05, + "loss": 1.5528, + "step": 4087 + }, + { + "epoch": 0.22350094993370967, + "grad_norm": 1.9526698589324951, + "learning_rate": 1.8544111482167164e-05, + "loss": 1.2284, + "step": 4088 + }, + { + "epoch": 0.2235556223774312, + "grad_norm": 1.6582547426223755, + "learning_rate": 1.8543161849189926e-05, + "loss": 1.3883, + "step": 4089 + }, + { + "epoch": 0.22361029482115277, + "grad_norm": 1.5709710121154785, + "learning_rate": 1.8542211930936464e-05, + "loss": 1.4903, + "step": 4090 + }, + { + "epoch": 0.2236649672648743, + "grad_norm": 1.6490848064422607, + "learning_rate": 1.8541261727438503e-05, + "loss": 1.4091, + "step": 4091 + }, + { + "epoch": 0.22371963970859587, + "grad_norm": 1.328457236289978, + "learning_rate": 1.854031123872777e-05, + "loss": 1.6254, + "step": 4092 + }, + { + "epoch": 0.22377431215231744, + "grad_norm": 1.6236594915390015, + "learning_rate": 1.8539360464836003e-05, + "loss": 1.52, + "step": 4093 + }, + { + "epoch": 0.22382898459603898, + "grad_norm": 1.7155654430389404, + "learning_rate": 1.8538409405794952e-05, + "loss": 1.3243, + "step": 4094 + }, + { + "epoch": 0.22388365703976054, + "grad_norm": 1.5526156425476074, + "learning_rate": 1.8537458061636377e-05, + "loss": 1.4735, + "step": 4095 + }, + { + "epoch": 0.22393832948348208, + "grad_norm": 1.765008568763733, + "learning_rate": 1.853650643239204e-05, + "loss": 1.4154, + "step": 4096 + }, + { + "epoch": 0.22399300192720364, + "grad_norm": 1.6222639083862305, + "learning_rate": 1.8535554518093723e-05, + "loss": 1.4331, + "step": 4097 + }, + { + "epoch": 0.22404767437092518, + "grad_norm": 1.3857439756393433, + "learning_rate": 1.8534602318773215e-05, + "loss": 1.5891, + "step": 4098 + }, + { + "epoch": 0.22410234681464675, + "grad_norm": 1.5865275859832764, + "learning_rate": 1.8533649834462303e-05, + "loss": 1.3198, + "step": 4099 + }, + { + "epoch": 0.2241570192583683, + "grad_norm": 1.5991605520248413, + "learning_rate": 1.85326970651928e-05, + "loss": 1.3374, + "step": 4100 + }, + { + "epoch": 0.22421169170208985, + "grad_norm": 1.7193129062652588, + "learning_rate": 1.8531744010996525e-05, + "loss": 1.2991, + "step": 4101 + }, + { + "epoch": 0.22426636414581141, + "grad_norm": 1.2399317026138306, + "learning_rate": 1.853079067190529e-05, + "loss": 1.6353, + "step": 4102 + }, + { + "epoch": 0.22432103658953295, + "grad_norm": 1.353675127029419, + "learning_rate": 1.852983704795094e-05, + "loss": 1.4476, + "step": 4103 + }, + { + "epoch": 0.22437570903325452, + "grad_norm": 1.535766363143921, + "learning_rate": 1.8528883139165313e-05, + "loss": 1.5761, + "step": 4104 + }, + { + "epoch": 0.22443038147697605, + "grad_norm": 1.3853834867477417, + "learning_rate": 1.8527928945580266e-05, + "loss": 1.4516, + "step": 4105 + }, + { + "epoch": 0.22448505392069762, + "grad_norm": 1.713083028793335, + "learning_rate": 1.852697446722766e-05, + "loss": 1.4447, + "step": 4106 + }, + { + "epoch": 0.22453972636441918, + "grad_norm": 1.3699133396148682, + "learning_rate": 1.8526019704139364e-05, + "loss": 1.5416, + "step": 4107 + }, + { + "epoch": 0.22459439880814072, + "grad_norm": 1.5704632997512817, + "learning_rate": 1.8525064656347265e-05, + "loss": 1.5033, + "step": 4108 + }, + { + "epoch": 0.2246490712518623, + "grad_norm": 1.3550653457641602, + "learning_rate": 1.8524109323883253e-05, + "loss": 1.6779, + "step": 4109 + }, + { + "epoch": 0.22470374369558382, + "grad_norm": 1.8176196813583374, + "learning_rate": 1.852315370677923e-05, + "loss": 1.2191, + "step": 4110 + }, + { + "epoch": 0.2247584161393054, + "grad_norm": 1.5092058181762695, + "learning_rate": 1.8522197805067105e-05, + "loss": 1.3179, + "step": 4111 + }, + { + "epoch": 0.22481308858302693, + "grad_norm": 1.2990275621414185, + "learning_rate": 1.852124161877879e-05, + "loss": 1.3229, + "step": 4112 + }, + { + "epoch": 0.2248677610267485, + "grad_norm": 1.7087730169296265, + "learning_rate": 1.852028514794623e-05, + "loss": 1.3146, + "step": 4113 + }, + { + "epoch": 0.22492243347047006, + "grad_norm": 1.5453712940216064, + "learning_rate": 1.851932839260135e-05, + "loss": 1.6708, + "step": 4114 + }, + { + "epoch": 0.2249771059141916, + "grad_norm": 1.201501488685608, + "learning_rate": 1.8518371352776107e-05, + "loss": 1.565, + "step": 4115 + }, + { + "epoch": 0.22503177835791316, + "grad_norm": 1.4349335432052612, + "learning_rate": 1.8517414028502454e-05, + "loss": 1.695, + "step": 4116 + }, + { + "epoch": 0.2250864508016347, + "grad_norm": 1.2992568016052246, + "learning_rate": 1.8516456419812362e-05, + "loss": 1.3965, + "step": 4117 + }, + { + "epoch": 0.22514112324535626, + "grad_norm": 1.6750550270080566, + "learning_rate": 1.8515498526737806e-05, + "loss": 1.5444, + "step": 4118 + }, + { + "epoch": 0.2251957956890778, + "grad_norm": 1.7960213422775269, + "learning_rate": 1.8514540349310766e-05, + "loss": 1.1769, + "step": 4119 + }, + { + "epoch": 0.22525046813279936, + "grad_norm": 1.6857348680496216, + "learning_rate": 1.851358188756325e-05, + "loss": 1.4965, + "step": 4120 + }, + { + "epoch": 0.22530514057652093, + "grad_norm": 1.4138671159744263, + "learning_rate": 1.8512623141527255e-05, + "loss": 1.3591, + "step": 4121 + }, + { + "epoch": 0.22535981302024247, + "grad_norm": 1.6840559244155884, + "learning_rate": 1.85116641112348e-05, + "loss": 1.6331, + "step": 4122 + }, + { + "epoch": 0.22541448546396403, + "grad_norm": 1.868686556816101, + "learning_rate": 1.8510704796717902e-05, + "loss": 1.6231, + "step": 4123 + }, + { + "epoch": 0.22546915790768557, + "grad_norm": 1.492197036743164, + "learning_rate": 1.85097451980086e-05, + "loss": 1.3565, + "step": 4124 + }, + { + "epoch": 0.22552383035140713, + "grad_norm": 1.4125597476959229, + "learning_rate": 1.850878531513894e-05, + "loss": 1.519, + "step": 4125 + }, + { + "epoch": 0.22557850279512867, + "grad_norm": 1.636623740196228, + "learning_rate": 1.8507825148140974e-05, + "loss": 1.6795, + "step": 4126 + }, + { + "epoch": 0.22563317523885024, + "grad_norm": 1.9724818468093872, + "learning_rate": 1.850686469704676e-05, + "loss": 1.6168, + "step": 4127 + }, + { + "epoch": 0.2256878476825718, + "grad_norm": 1.430824875831604, + "learning_rate": 1.850590396188837e-05, + "loss": 1.4142, + "step": 4128 + }, + { + "epoch": 0.22574252012629334, + "grad_norm": 1.1473698616027832, + "learning_rate": 1.850494294269789e-05, + "loss": 1.5117, + "step": 4129 + }, + { + "epoch": 0.2257971925700149, + "grad_norm": 1.6078330278396606, + "learning_rate": 1.8503981639507404e-05, + "loss": 1.2843, + "step": 4130 + }, + { + "epoch": 0.22585186501373644, + "grad_norm": 1.8786766529083252, + "learning_rate": 1.8503020052349018e-05, + "loss": 1.4385, + "step": 4131 + }, + { + "epoch": 0.225906537457458, + "grad_norm": 2.010653257369995, + "learning_rate": 1.850205818125484e-05, + "loss": 1.6377, + "step": 4132 + }, + { + "epoch": 0.22596120990117957, + "grad_norm": 1.3901094198226929, + "learning_rate": 1.8501096026256985e-05, + "loss": 1.5098, + "step": 4133 + }, + { + "epoch": 0.2260158823449011, + "grad_norm": 1.821137547492981, + "learning_rate": 1.850013358738759e-05, + "loss": 1.4524, + "step": 4134 + }, + { + "epoch": 0.22607055478862267, + "grad_norm": 1.3474584817886353, + "learning_rate": 1.8499170864678787e-05, + "loss": 1.6747, + "step": 4135 + }, + { + "epoch": 0.2261252272323442, + "grad_norm": 1.720786452293396, + "learning_rate": 1.8498207858162724e-05, + "loss": 1.4216, + "step": 4136 + }, + { + "epoch": 0.22617989967606578, + "grad_norm": 1.885172963142395, + "learning_rate": 1.849724456787156e-05, + "loss": 1.4372, + "step": 4137 + }, + { + "epoch": 0.2262345721197873, + "grad_norm": 2.0665764808654785, + "learning_rate": 1.849628099383746e-05, + "loss": 1.4646, + "step": 4138 + }, + { + "epoch": 0.22628924456350888, + "grad_norm": 1.7447535991668701, + "learning_rate": 1.84953171360926e-05, + "loss": 1.3464, + "step": 4139 + }, + { + "epoch": 0.22634391700723044, + "grad_norm": 1.6850790977478027, + "learning_rate": 1.8494352994669166e-05, + "loss": 1.6207, + "step": 4140 + }, + { + "epoch": 0.22639858945095198, + "grad_norm": 1.460789680480957, + "learning_rate": 1.8493388569599352e-05, + "loss": 1.7241, + "step": 4141 + }, + { + "epoch": 0.22645326189467355, + "grad_norm": 1.300305962562561, + "learning_rate": 1.8492423860915366e-05, + "loss": 1.434, + "step": 4142 + }, + { + "epoch": 0.22650793433839508, + "grad_norm": 1.9548237323760986, + "learning_rate": 1.8491458868649417e-05, + "loss": 1.5709, + "step": 4143 + }, + { + "epoch": 0.22656260678211665, + "grad_norm": 1.4423679113388062, + "learning_rate": 1.849049359283373e-05, + "loss": 1.5322, + "step": 4144 + }, + { + "epoch": 0.2266172792258382, + "grad_norm": 1.5959994792938232, + "learning_rate": 1.8489528033500542e-05, + "loss": 1.4652, + "step": 4145 + }, + { + "epoch": 0.22667195166955975, + "grad_norm": 1.1338616609573364, + "learning_rate": 1.848856219068209e-05, + "loss": 1.3781, + "step": 4146 + }, + { + "epoch": 0.22672662411328132, + "grad_norm": 2.174682378768921, + "learning_rate": 1.8487596064410623e-05, + "loss": 1.6005, + "step": 4147 + }, + { + "epoch": 0.22678129655700285, + "grad_norm": 1.1733450889587402, + "learning_rate": 1.8486629654718412e-05, + "loss": 1.4256, + "step": 4148 + }, + { + "epoch": 0.22683596900072442, + "grad_norm": 1.4664950370788574, + "learning_rate": 1.848566296163772e-05, + "loss": 1.5657, + "step": 4149 + }, + { + "epoch": 0.22689064144444596, + "grad_norm": 2.2130701541900635, + "learning_rate": 1.8484695985200832e-05, + "loss": 1.6764, + "step": 4150 + }, + { + "epoch": 0.22694531388816752, + "grad_norm": 1.2693639993667603, + "learning_rate": 1.8483728725440033e-05, + "loss": 1.5283, + "step": 4151 + }, + { + "epoch": 0.22699998633188906, + "grad_norm": 1.7295899391174316, + "learning_rate": 1.8482761182387622e-05, + "loss": 1.5688, + "step": 4152 + }, + { + "epoch": 0.22705465877561062, + "grad_norm": 1.5963619947433472, + "learning_rate": 1.848179335607591e-05, + "loss": 1.3895, + "step": 4153 + }, + { + "epoch": 0.2271093312193322, + "grad_norm": 2.2639923095703125, + "learning_rate": 1.848082524653722e-05, + "loss": 1.5617, + "step": 4154 + }, + { + "epoch": 0.22716400366305373, + "grad_norm": 1.7612335681915283, + "learning_rate": 1.847985685380387e-05, + "loss": 1.375, + "step": 4155 + }, + { + "epoch": 0.2272186761067753, + "grad_norm": 1.735479712486267, + "learning_rate": 1.8478888177908202e-05, + "loss": 1.6175, + "step": 4156 + }, + { + "epoch": 0.22727334855049683, + "grad_norm": 1.898122787475586, + "learning_rate": 1.847791921888256e-05, + "loss": 1.5685, + "step": 4157 + }, + { + "epoch": 0.2273280209942184, + "grad_norm": 1.3635259866714478, + "learning_rate": 1.8476949976759302e-05, + "loss": 1.5964, + "step": 4158 + }, + { + "epoch": 0.22738269343793993, + "grad_norm": 1.4841715097427368, + "learning_rate": 1.8475980451570797e-05, + "loss": 1.4929, + "step": 4159 + }, + { + "epoch": 0.2274373658816615, + "grad_norm": 1.7896523475646973, + "learning_rate": 1.847501064334941e-05, + "loss": 1.0499, + "step": 4160 + }, + { + "epoch": 0.22749203832538306, + "grad_norm": 2.7659037113189697, + "learning_rate": 1.8474040552127533e-05, + "loss": 1.6577, + "step": 4161 + }, + { + "epoch": 0.2275467107691046, + "grad_norm": 1.455170750617981, + "learning_rate": 1.8473070177937552e-05, + "loss": 1.5527, + "step": 4162 + }, + { + "epoch": 0.22760138321282616, + "grad_norm": 1.4381617307662964, + "learning_rate": 1.847209952081188e-05, + "loss": 1.5792, + "step": 4163 + }, + { + "epoch": 0.2276560556565477, + "grad_norm": 1.6960049867630005, + "learning_rate": 1.8471128580782923e-05, + "loss": 1.5687, + "step": 4164 + }, + { + "epoch": 0.22771072810026927, + "grad_norm": 1.4936449527740479, + "learning_rate": 1.8470157357883106e-05, + "loss": 1.611, + "step": 4165 + }, + { + "epoch": 0.2277654005439908, + "grad_norm": 1.3427504301071167, + "learning_rate": 1.8469185852144854e-05, + "loss": 1.5214, + "step": 4166 + }, + { + "epoch": 0.22782007298771237, + "grad_norm": 2.1396594047546387, + "learning_rate": 1.846821406360062e-05, + "loss": 1.4765, + "step": 4167 + }, + { + "epoch": 0.22787474543143393, + "grad_norm": 1.5422961711883545, + "learning_rate": 1.8467241992282842e-05, + "loss": 1.5632, + "step": 4168 + }, + { + "epoch": 0.22792941787515547, + "grad_norm": 1.7648919820785522, + "learning_rate": 1.846626963822399e-05, + "loss": 1.5241, + "step": 4169 + }, + { + "epoch": 0.22798409031887704, + "grad_norm": 1.9467579126358032, + "learning_rate": 1.846529700145652e-05, + "loss": 1.5242, + "step": 4170 + }, + { + "epoch": 0.22803876276259857, + "grad_norm": 1.3245059251785278, + "learning_rate": 1.8464324082012926e-05, + "loss": 1.4689, + "step": 4171 + }, + { + "epoch": 0.22809343520632014, + "grad_norm": 1.6093997955322266, + "learning_rate": 1.8463350879925686e-05, + "loss": 1.4687, + "step": 4172 + }, + { + "epoch": 0.22814810765004168, + "grad_norm": 1.3549340963363647, + "learning_rate": 1.84623773952273e-05, + "loss": 1.3331, + "step": 4173 + }, + { + "epoch": 0.22820278009376324, + "grad_norm": 1.5974501371383667, + "learning_rate": 1.8461403627950275e-05, + "loss": 1.5129, + "step": 4174 + }, + { + "epoch": 0.2282574525374848, + "grad_norm": 1.7055853605270386, + "learning_rate": 1.8460429578127132e-05, + "loss": 1.4368, + "step": 4175 + }, + { + "epoch": 0.22831212498120634, + "grad_norm": 1.5278369188308716, + "learning_rate": 1.8459455245790386e-05, + "loss": 1.3608, + "step": 4176 + }, + { + "epoch": 0.2283667974249279, + "grad_norm": 1.8140888214111328, + "learning_rate": 1.845848063097258e-05, + "loss": 1.3368, + "step": 4177 + }, + { + "epoch": 0.22842146986864945, + "grad_norm": 1.4796295166015625, + "learning_rate": 1.845750573370626e-05, + "loss": 1.3163, + "step": 4178 + }, + { + "epoch": 0.228476142312371, + "grad_norm": 1.4428277015686035, + "learning_rate": 1.8456530554023973e-05, + "loss": 1.2897, + "step": 4179 + }, + { + "epoch": 0.22853081475609255, + "grad_norm": 1.4737648963928223, + "learning_rate": 1.845555509195829e-05, + "loss": 1.4609, + "step": 4180 + }, + { + "epoch": 0.2285854871998141, + "grad_norm": 1.2029600143432617, + "learning_rate": 1.8454579347541783e-05, + "loss": 1.6018, + "step": 4181 + }, + { + "epoch": 0.22864015964353568, + "grad_norm": 1.3619414567947388, + "learning_rate": 1.845360332080703e-05, + "loss": 1.4714, + "step": 4182 + }, + { + "epoch": 0.22869483208725722, + "grad_norm": 2.0369575023651123, + "learning_rate": 1.8452627011786623e-05, + "loss": 1.6173, + "step": 4183 + }, + { + "epoch": 0.22874950453097878, + "grad_norm": 1.4231692552566528, + "learning_rate": 1.8451650420513167e-05, + "loss": 1.6023, + "step": 4184 + }, + { + "epoch": 0.22880417697470032, + "grad_norm": 1.14035964012146, + "learning_rate": 1.8450673547019273e-05, + "loss": 1.8331, + "step": 4185 + }, + { + "epoch": 0.22885884941842188, + "grad_norm": 1.4888666868209839, + "learning_rate": 1.8449696391337556e-05, + "loss": 1.3175, + "step": 4186 + }, + { + "epoch": 0.22891352186214342, + "grad_norm": 2.0476338863372803, + "learning_rate": 1.8448718953500653e-05, + "loss": 1.489, + "step": 4187 + }, + { + "epoch": 0.22896819430586499, + "grad_norm": 1.203067421913147, + "learning_rate": 1.8447741233541195e-05, + "loss": 1.467, + "step": 4188 + }, + { + "epoch": 0.22902286674958655, + "grad_norm": 2.0363657474517822, + "learning_rate": 1.8446763231491834e-05, + "loss": 1.455, + "step": 4189 + }, + { + "epoch": 0.2290775391933081, + "grad_norm": 1.4208637475967407, + "learning_rate": 1.844578494738523e-05, + "loss": 1.3349, + "step": 4190 + }, + { + "epoch": 0.22913221163702965, + "grad_norm": 1.7939808368682861, + "learning_rate": 1.8444806381254046e-05, + "loss": 1.4376, + "step": 4191 + }, + { + "epoch": 0.2291868840807512, + "grad_norm": 1.8448830842971802, + "learning_rate": 1.844382753313096e-05, + "loss": 1.5436, + "step": 4192 + }, + { + "epoch": 0.22924155652447276, + "grad_norm": 1.857378363609314, + "learning_rate": 1.8442848403048658e-05, + "loss": 1.8014, + "step": 4193 + }, + { + "epoch": 0.2292962289681943, + "grad_norm": 1.591251254081726, + "learning_rate": 1.844186899103984e-05, + "loss": 1.4122, + "step": 4194 + }, + { + "epoch": 0.22935090141191586, + "grad_norm": 1.3349025249481201, + "learning_rate": 1.8440889297137204e-05, + "loss": 1.396, + "step": 4195 + }, + { + "epoch": 0.22940557385563742, + "grad_norm": 1.653059720993042, + "learning_rate": 1.843990932137347e-05, + "loss": 1.3878, + "step": 4196 + }, + { + "epoch": 0.22946024629935896, + "grad_norm": 1.6018298864364624, + "learning_rate": 1.8438929063781354e-05, + "loss": 1.4516, + "step": 4197 + }, + { + "epoch": 0.22951491874308053, + "grad_norm": 1.489970326423645, + "learning_rate": 1.8437948524393598e-05, + "loss": 1.4415, + "step": 4198 + }, + { + "epoch": 0.22956959118680206, + "grad_norm": 1.5365896224975586, + "learning_rate": 1.8436967703242938e-05, + "loss": 1.4501, + "step": 4199 + }, + { + "epoch": 0.22962426363052363, + "grad_norm": 1.3824830055236816, + "learning_rate": 1.843598660036213e-05, + "loss": 1.5516, + "step": 4200 + }, + { + "epoch": 0.22967893607424517, + "grad_norm": 1.316392183303833, + "learning_rate": 1.8435005215783933e-05, + "loss": 1.535, + "step": 4201 + }, + { + "epoch": 0.22973360851796673, + "grad_norm": 1.710883617401123, + "learning_rate": 1.843402354954112e-05, + "loss": 1.5553, + "step": 4202 + }, + { + "epoch": 0.2297882809616883, + "grad_norm": 1.6593313217163086, + "learning_rate": 1.8433041601666466e-05, + "loss": 1.4873, + "step": 4203 + }, + { + "epoch": 0.22984295340540983, + "grad_norm": 1.3746041059494019, + "learning_rate": 1.843205937219277e-05, + "loss": 1.417, + "step": 4204 + }, + { + "epoch": 0.2298976258491314, + "grad_norm": 1.3618148565292358, + "learning_rate": 1.843107686115282e-05, + "loss": 1.5773, + "step": 4205 + }, + { + "epoch": 0.22995229829285294, + "grad_norm": 1.4426231384277344, + "learning_rate": 1.843009406857943e-05, + "loss": 1.2157, + "step": 4206 + }, + { + "epoch": 0.2300069707365745, + "grad_norm": 1.6032558679580688, + "learning_rate": 1.842911099450542e-05, + "loss": 1.7821, + "step": 4207 + }, + { + "epoch": 0.23006164318029604, + "grad_norm": 1.496329665184021, + "learning_rate": 1.8428127638963615e-05, + "loss": 1.4404, + "step": 4208 + }, + { + "epoch": 0.2301163156240176, + "grad_norm": 1.4987754821777344, + "learning_rate": 1.8427144001986847e-05, + "loss": 1.599, + "step": 4209 + }, + { + "epoch": 0.23017098806773917, + "grad_norm": 1.573966383934021, + "learning_rate": 1.8426160083607966e-05, + "loss": 1.478, + "step": 4210 + }, + { + "epoch": 0.2302256605114607, + "grad_norm": 1.5499331951141357, + "learning_rate": 1.842517588385983e-05, + "loss": 1.5125, + "step": 4211 + }, + { + "epoch": 0.23028033295518227, + "grad_norm": 2.324805974960327, + "learning_rate": 1.8424191402775297e-05, + "loss": 1.4968, + "step": 4212 + }, + { + "epoch": 0.2303350053989038, + "grad_norm": 1.3911247253417969, + "learning_rate": 1.842320664038725e-05, + "loss": 1.4293, + "step": 4213 + }, + { + "epoch": 0.23038967784262537, + "grad_norm": 1.5456316471099854, + "learning_rate": 1.8422221596728564e-05, + "loss": 1.4365, + "step": 4214 + }, + { + "epoch": 0.2304443502863469, + "grad_norm": 1.3913037776947021, + "learning_rate": 1.842123627183214e-05, + "loss": 1.4442, + "step": 4215 + }, + { + "epoch": 0.23049902273006848, + "grad_norm": 1.5711824893951416, + "learning_rate": 1.842025066573087e-05, + "loss": 1.4006, + "step": 4216 + }, + { + "epoch": 0.23055369517379004, + "grad_norm": 1.446954369544983, + "learning_rate": 1.8419264778457675e-05, + "loss": 1.3515, + "step": 4217 + }, + { + "epoch": 0.23060836761751158, + "grad_norm": 1.5628095865249634, + "learning_rate": 1.841827861004547e-05, + "loss": 1.3753, + "step": 4218 + }, + { + "epoch": 0.23066304006123314, + "grad_norm": 1.5808463096618652, + "learning_rate": 1.8417292160527193e-05, + "loss": 1.2067, + "step": 4219 + }, + { + "epoch": 0.23071771250495468, + "grad_norm": 1.4693762063980103, + "learning_rate": 1.8416305429935776e-05, + "loss": 1.4298, + "step": 4220 + }, + { + "epoch": 0.23077238494867625, + "grad_norm": 1.332908272743225, + "learning_rate": 1.8415318418304167e-05, + "loss": 1.484, + "step": 4221 + }, + { + "epoch": 0.23082705739239778, + "grad_norm": 1.3474576473236084, + "learning_rate": 1.8414331125665336e-05, + "loss": 1.4526, + "step": 4222 + }, + { + "epoch": 0.23088172983611935, + "grad_norm": 1.2504324913024902, + "learning_rate": 1.8413343552052242e-05, + "loss": 1.64, + "step": 4223 + }, + { + "epoch": 0.2309364022798409, + "grad_norm": 1.408079981803894, + "learning_rate": 1.8412355697497863e-05, + "loss": 1.3882, + "step": 4224 + }, + { + "epoch": 0.23099107472356245, + "grad_norm": 1.7435423135757446, + "learning_rate": 1.8411367562035188e-05, + "loss": 1.4309, + "step": 4225 + }, + { + "epoch": 0.23104574716728402, + "grad_norm": 2.4409804344177246, + "learning_rate": 1.841037914569721e-05, + "loss": 1.4382, + "step": 4226 + }, + { + "epoch": 0.23110041961100555, + "grad_norm": 1.4896398782730103, + "learning_rate": 1.840939044851694e-05, + "loss": 1.5607, + "step": 4227 + }, + { + "epoch": 0.23115509205472712, + "grad_norm": 1.7540862560272217, + "learning_rate": 1.8408401470527386e-05, + "loss": 1.2392, + "step": 4228 + }, + { + "epoch": 0.23120976449844868, + "grad_norm": 1.6142964363098145, + "learning_rate": 1.840741221176158e-05, + "loss": 1.5158, + "step": 4229 + }, + { + "epoch": 0.23126443694217022, + "grad_norm": 1.8431097269058228, + "learning_rate": 1.8406422672252548e-05, + "loss": 1.7555, + "step": 4230 + }, + { + "epoch": 0.23131910938589179, + "grad_norm": 1.3392002582550049, + "learning_rate": 1.8405432852033338e-05, + "loss": 1.2844, + "step": 4231 + }, + { + "epoch": 0.23137378182961332, + "grad_norm": 1.4594354629516602, + "learning_rate": 1.8404442751137e-05, + "loss": 1.3111, + "step": 4232 + }, + { + "epoch": 0.2314284542733349, + "grad_norm": 1.4446665048599243, + "learning_rate": 1.84034523695966e-05, + "loss": 1.3583, + "step": 4233 + }, + { + "epoch": 0.23148312671705643, + "grad_norm": 1.7781957387924194, + "learning_rate": 1.8402461707445206e-05, + "loss": 1.393, + "step": 4234 + }, + { + "epoch": 0.231537799160778, + "grad_norm": 1.4447581768035889, + "learning_rate": 1.8401470764715898e-05, + "loss": 1.7168, + "step": 4235 + }, + { + "epoch": 0.23159247160449956, + "grad_norm": 1.193052053451538, + "learning_rate": 1.8400479541441763e-05, + "loss": 1.487, + "step": 4236 + }, + { + "epoch": 0.2316471440482211, + "grad_norm": 1.639060139656067, + "learning_rate": 1.8399488037655906e-05, + "loss": 1.6103, + "step": 4237 + }, + { + "epoch": 0.23170181649194266, + "grad_norm": 1.5653935670852661, + "learning_rate": 1.8398496253391433e-05, + "loss": 1.4803, + "step": 4238 + }, + { + "epoch": 0.2317564889356642, + "grad_norm": 1.5188223123550415, + "learning_rate": 1.8397504188681463e-05, + "loss": 1.303, + "step": 4239 + }, + { + "epoch": 0.23181116137938576, + "grad_norm": 1.3110438585281372, + "learning_rate": 1.8396511843559125e-05, + "loss": 1.5708, + "step": 4240 + }, + { + "epoch": 0.2318658338231073, + "grad_norm": 1.4025230407714844, + "learning_rate": 1.839551921805755e-05, + "loss": 1.723, + "step": 4241 + }, + { + "epoch": 0.23192050626682886, + "grad_norm": 1.7176246643066406, + "learning_rate": 1.839452631220989e-05, + "loss": 1.4505, + "step": 4242 + }, + { + "epoch": 0.23197517871055043, + "grad_norm": 1.6242328882217407, + "learning_rate": 1.8393533126049293e-05, + "loss": 1.4672, + "step": 4243 + }, + { + "epoch": 0.23202985115427197, + "grad_norm": 1.355054497718811, + "learning_rate": 1.8392539659608934e-05, + "loss": 1.3234, + "step": 4244 + }, + { + "epoch": 0.23208452359799353, + "grad_norm": 1.6202151775360107, + "learning_rate": 1.839154591292198e-05, + "loss": 1.3192, + "step": 4245 + }, + { + "epoch": 0.23213919604171507, + "grad_norm": 1.4652514457702637, + "learning_rate": 1.839055188602162e-05, + "loss": 1.5332, + "step": 4246 + }, + { + "epoch": 0.23219386848543663, + "grad_norm": 1.394033432006836, + "learning_rate": 1.838955757894104e-05, + "loss": 1.629, + "step": 4247 + }, + { + "epoch": 0.23224854092915817, + "grad_norm": 1.4845229387283325, + "learning_rate": 1.8388562991713447e-05, + "loss": 1.4664, + "step": 4248 + }, + { + "epoch": 0.23230321337287974, + "grad_norm": 1.8068684339523315, + "learning_rate": 1.838756812437205e-05, + "loss": 1.2153, + "step": 4249 + }, + { + "epoch": 0.2323578858166013, + "grad_norm": 1.3547371625900269, + "learning_rate": 1.8386572976950072e-05, + "loss": 1.3119, + "step": 4250 + }, + { + "epoch": 0.23241255826032284, + "grad_norm": 1.767750859260559, + "learning_rate": 1.838557754948074e-05, + "loss": 1.4403, + "step": 4251 + }, + { + "epoch": 0.2324672307040444, + "grad_norm": 1.790687918663025, + "learning_rate": 1.83845818419973e-05, + "loss": 1.3555, + "step": 4252 + }, + { + "epoch": 0.23252190314776594, + "grad_norm": 1.3614535331726074, + "learning_rate": 1.8383585854533e-05, + "loss": 1.2999, + "step": 4253 + }, + { + "epoch": 0.2325765755914875, + "grad_norm": 1.6828781366348267, + "learning_rate": 1.838258958712109e-05, + "loss": 1.3416, + "step": 4254 + }, + { + "epoch": 0.23263124803520904, + "grad_norm": 1.3844058513641357, + "learning_rate": 1.8381593039794846e-05, + "loss": 1.2846, + "step": 4255 + }, + { + "epoch": 0.2326859204789306, + "grad_norm": 1.498164176940918, + "learning_rate": 1.8380596212587544e-05, + "loss": 1.4491, + "step": 4256 + }, + { + "epoch": 0.23274059292265217, + "grad_norm": 1.4151487350463867, + "learning_rate": 1.8379599105532465e-05, + "loss": 1.6624, + "step": 4257 + }, + { + "epoch": 0.2327952653663737, + "grad_norm": 1.583901047706604, + "learning_rate": 1.837860171866291e-05, + "loss": 1.4791, + "step": 4258 + }, + { + "epoch": 0.23284993781009528, + "grad_norm": 1.3758697509765625, + "learning_rate": 1.8377604052012183e-05, + "loss": 1.5903, + "step": 4259 + }, + { + "epoch": 0.2329046102538168, + "grad_norm": 1.3587367534637451, + "learning_rate": 1.8376606105613593e-05, + "loss": 1.4536, + "step": 4260 + }, + { + "epoch": 0.23295928269753838, + "grad_norm": 2.009284257888794, + "learning_rate": 1.8375607879500476e-05, + "loss": 1.4793, + "step": 4261 + }, + { + "epoch": 0.23301395514125992, + "grad_norm": 1.8722447156906128, + "learning_rate": 1.8374609373706156e-05, + "loss": 1.5582, + "step": 4262 + }, + { + "epoch": 0.23306862758498148, + "grad_norm": 1.532156229019165, + "learning_rate": 1.8373610588263976e-05, + "loss": 1.4762, + "step": 4263 + }, + { + "epoch": 0.23312330002870305, + "grad_norm": 2.1865177154541016, + "learning_rate": 1.837261152320729e-05, + "loss": 1.4867, + "step": 4264 + }, + { + "epoch": 0.23317797247242458, + "grad_norm": 1.8695101737976074, + "learning_rate": 1.837161217856946e-05, + "loss": 1.45, + "step": 4265 + }, + { + "epoch": 0.23323264491614615, + "grad_norm": 1.492288589477539, + "learning_rate": 1.837061255438385e-05, + "loss": 1.5425, + "step": 4266 + }, + { + "epoch": 0.23328731735986769, + "grad_norm": 1.5713926553726196, + "learning_rate": 1.8369612650683846e-05, + "loss": 1.358, + "step": 4267 + }, + { + "epoch": 0.23334198980358925, + "grad_norm": 1.2424399852752686, + "learning_rate": 1.836861246750284e-05, + "loss": 1.3952, + "step": 4268 + }, + { + "epoch": 0.2333966622473108, + "grad_norm": 1.692936897277832, + "learning_rate": 1.8367612004874224e-05, + "loss": 1.3558, + "step": 4269 + }, + { + "epoch": 0.23345133469103235, + "grad_norm": 1.942862629890442, + "learning_rate": 1.8366611262831408e-05, + "loss": 1.3037, + "step": 4270 + }, + { + "epoch": 0.23350600713475392, + "grad_norm": 1.7360551357269287, + "learning_rate": 1.836561024140781e-05, + "loss": 1.3591, + "step": 4271 + }, + { + "epoch": 0.23356067957847546, + "grad_norm": 1.3713864088058472, + "learning_rate": 1.8364608940636853e-05, + "loss": 1.4878, + "step": 4272 + }, + { + "epoch": 0.23361535202219702, + "grad_norm": 1.7178831100463867, + "learning_rate": 1.8363607360551975e-05, + "loss": 1.6311, + "step": 4273 + }, + { + "epoch": 0.23367002446591856, + "grad_norm": 2.03977632522583, + "learning_rate": 1.836260550118662e-05, + "loss": 1.3985, + "step": 4274 + }, + { + "epoch": 0.23372469690964012, + "grad_norm": 1.2989912033081055, + "learning_rate": 1.8361603362574247e-05, + "loss": 1.5647, + "step": 4275 + }, + { + "epoch": 0.23377936935336166, + "grad_norm": 1.8113337755203247, + "learning_rate": 1.8360600944748316e-05, + "loss": 1.4741, + "step": 4276 + }, + { + "epoch": 0.23383404179708323, + "grad_norm": 1.3716168403625488, + "learning_rate": 1.8359598247742305e-05, + "loss": 1.7018, + "step": 4277 + }, + { + "epoch": 0.2338887142408048, + "grad_norm": 1.7987595796585083, + "learning_rate": 1.8358595271589683e-05, + "loss": 1.5895, + "step": 4278 + }, + { + "epoch": 0.23394338668452633, + "grad_norm": 1.3262617588043213, + "learning_rate": 1.8357592016323958e-05, + "loss": 1.2069, + "step": 4279 + }, + { + "epoch": 0.2339980591282479, + "grad_norm": 1.3230767250061035, + "learning_rate": 1.835658848197862e-05, + "loss": 1.5693, + "step": 4280 + }, + { + "epoch": 0.23405273157196943, + "grad_norm": 1.7293535470962524, + "learning_rate": 1.8355584668587185e-05, + "loss": 1.2828, + "step": 4281 + }, + { + "epoch": 0.234107404015691, + "grad_norm": 1.3392846584320068, + "learning_rate": 1.835458057618317e-05, + "loss": 1.5001, + "step": 4282 + }, + { + "epoch": 0.23416207645941253, + "grad_norm": 1.3985393047332764, + "learning_rate": 1.8353576204800106e-05, + "loss": 1.4505, + "step": 4283 + }, + { + "epoch": 0.2342167489031341, + "grad_norm": 1.6296510696411133, + "learning_rate": 1.835257155447153e-05, + "loss": 1.4574, + "step": 4284 + }, + { + "epoch": 0.23427142134685566, + "grad_norm": 1.4291110038757324, + "learning_rate": 1.835156662523099e-05, + "loss": 1.2853, + "step": 4285 + }, + { + "epoch": 0.2343260937905772, + "grad_norm": 1.2603014707565308, + "learning_rate": 1.8350561417112044e-05, + "loss": 1.5568, + "step": 4286 + }, + { + "epoch": 0.23438076623429877, + "grad_norm": 1.4419690370559692, + "learning_rate": 1.8349555930148253e-05, + "loss": 1.2652, + "step": 4287 + }, + { + "epoch": 0.2344354386780203, + "grad_norm": 1.516692042350769, + "learning_rate": 1.83485501643732e-05, + "loss": 1.216, + "step": 4288 + }, + { + "epoch": 0.23449011112174187, + "grad_norm": 1.4768965244293213, + "learning_rate": 1.8347544119820465e-05, + "loss": 1.8542, + "step": 4289 + }, + { + "epoch": 0.2345447835654634, + "grad_norm": 1.9683130979537964, + "learning_rate": 1.8346537796523643e-05, + "loss": 1.3563, + "step": 4290 + }, + { + "epoch": 0.23459945600918497, + "grad_norm": 1.549028754234314, + "learning_rate": 1.8345531194516343e-05, + "loss": 1.2841, + "step": 4291 + }, + { + "epoch": 0.23465412845290654, + "grad_norm": 1.359979271888733, + "learning_rate": 1.8344524313832167e-05, + "loss": 1.4987, + "step": 4292 + }, + { + "epoch": 0.23470880089662807, + "grad_norm": 1.2366464138031006, + "learning_rate": 1.8343517154504747e-05, + "loss": 1.5124, + "step": 4293 + }, + { + "epoch": 0.23476347334034964, + "grad_norm": 1.5735297203063965, + "learning_rate": 1.8342509716567712e-05, + "loss": 1.4665, + "step": 4294 + }, + { + "epoch": 0.23481814578407117, + "grad_norm": 1.6131600141525269, + "learning_rate": 1.8341502000054697e-05, + "loss": 1.4722, + "step": 4295 + }, + { + "epoch": 0.23487281822779274, + "grad_norm": 1.4721570014953613, + "learning_rate": 1.834049400499936e-05, + "loss": 1.5323, + "step": 4296 + }, + { + "epoch": 0.23492749067151428, + "grad_norm": 1.2413469552993774, + "learning_rate": 1.833948573143535e-05, + "loss": 1.5464, + "step": 4297 + }, + { + "epoch": 0.23498216311523584, + "grad_norm": 1.6488022804260254, + "learning_rate": 1.833847717939635e-05, + "loss": 1.393, + "step": 4298 + }, + { + "epoch": 0.2350368355589574, + "grad_norm": 1.5110143423080444, + "learning_rate": 1.8337468348916026e-05, + "loss": 1.2064, + "step": 4299 + }, + { + "epoch": 0.23509150800267894, + "grad_norm": 1.4202114343643188, + "learning_rate": 1.833645924002807e-05, + "loss": 1.3368, + "step": 4300 + }, + { + "epoch": 0.2351461804464005, + "grad_norm": 1.385157823562622, + "learning_rate": 1.8335449852766176e-05, + "loss": 1.6518, + "step": 4301 + }, + { + "epoch": 0.23520085289012205, + "grad_norm": 1.3083782196044922, + "learning_rate": 1.8334440187164054e-05, + "loss": 1.5702, + "step": 4302 + }, + { + "epoch": 0.2352555253338436, + "grad_norm": 1.6026219129562378, + "learning_rate": 1.8333430243255416e-05, + "loss": 1.5592, + "step": 4303 + }, + { + "epoch": 0.23531019777756515, + "grad_norm": 1.8674174547195435, + "learning_rate": 1.8332420021073992e-05, + "loss": 1.6027, + "step": 4304 + }, + { + "epoch": 0.23536487022128671, + "grad_norm": 1.647970199584961, + "learning_rate": 1.8331409520653504e-05, + "loss": 1.473, + "step": 4305 + }, + { + "epoch": 0.23541954266500828, + "grad_norm": 1.4630297422409058, + "learning_rate": 1.8330398742027704e-05, + "loss": 1.6588, + "step": 4306 + }, + { + "epoch": 0.23547421510872982, + "grad_norm": 1.5940433740615845, + "learning_rate": 1.832938768523034e-05, + "loss": 1.4687, + "step": 4307 + }, + { + "epoch": 0.23552888755245138, + "grad_norm": 1.3283308744430542, + "learning_rate": 1.832837635029518e-05, + "loss": 1.3899, + "step": 4308 + }, + { + "epoch": 0.23558355999617292, + "grad_norm": 1.4744973182678223, + "learning_rate": 1.832736473725599e-05, + "loss": 1.4604, + "step": 4309 + }, + { + "epoch": 0.23563823243989448, + "grad_norm": 1.5700205564498901, + "learning_rate": 1.8326352846146548e-05, + "loss": 1.2716, + "step": 4310 + }, + { + "epoch": 0.23569290488361602, + "grad_norm": 1.5140188932418823, + "learning_rate": 1.8325340677000646e-05, + "loss": 1.3094, + "step": 4311 + }, + { + "epoch": 0.2357475773273376, + "grad_norm": 1.7375789880752563, + "learning_rate": 1.8324328229852087e-05, + "loss": 1.7215, + "step": 4312 + }, + { + "epoch": 0.23580224977105915, + "grad_norm": 1.5663621425628662, + "learning_rate": 1.832331550473467e-05, + "loss": 1.3846, + "step": 4313 + }, + { + "epoch": 0.2358569222147807, + "grad_norm": 1.5492795705795288, + "learning_rate": 1.832230250168222e-05, + "loss": 1.21, + "step": 4314 + }, + { + "epoch": 0.23591159465850225, + "grad_norm": 1.5741201639175415, + "learning_rate": 1.832128922072856e-05, + "loss": 1.4496, + "step": 4315 + }, + { + "epoch": 0.2359662671022238, + "grad_norm": 1.5821295976638794, + "learning_rate": 1.8320275661907527e-05, + "loss": 1.5676, + "step": 4316 + }, + { + "epoch": 0.23602093954594536, + "grad_norm": 1.3912794589996338, + "learning_rate": 1.831926182525296e-05, + "loss": 1.2593, + "step": 4317 + }, + { + "epoch": 0.2360756119896669, + "grad_norm": 1.8995773792266846, + "learning_rate": 1.8318247710798728e-05, + "loss": 1.4013, + "step": 4318 + }, + { + "epoch": 0.23613028443338846, + "grad_norm": 1.4313346147537231, + "learning_rate": 1.8317233318578678e-05, + "loss": 1.5199, + "step": 4319 + }, + { + "epoch": 0.23618495687711002, + "grad_norm": 1.9817557334899902, + "learning_rate": 1.8316218648626693e-05, + "loss": 1.4543, + "step": 4320 + }, + { + "epoch": 0.23623962932083156, + "grad_norm": 2.260892152786255, + "learning_rate": 1.8315203700976653e-05, + "loss": 1.2328, + "step": 4321 + }, + { + "epoch": 0.23629430176455313, + "grad_norm": 1.439423680305481, + "learning_rate": 1.831418847566245e-05, + "loss": 1.4838, + "step": 4322 + }, + { + "epoch": 0.23634897420827466, + "grad_norm": 1.4472969770431519, + "learning_rate": 1.8313172972717982e-05, + "loss": 1.3455, + "step": 4323 + }, + { + "epoch": 0.23640364665199623, + "grad_norm": 1.5731464624404907, + "learning_rate": 1.8312157192177166e-05, + "loss": 1.2468, + "step": 4324 + }, + { + "epoch": 0.23645831909571777, + "grad_norm": 1.3132286071777344, + "learning_rate": 1.831114113407391e-05, + "loss": 1.4333, + "step": 4325 + }, + { + "epoch": 0.23651299153943933, + "grad_norm": 2.3293235301971436, + "learning_rate": 1.8310124798442152e-05, + "loss": 1.399, + "step": 4326 + }, + { + "epoch": 0.2365676639831609, + "grad_norm": 1.4777628183364868, + "learning_rate": 1.8309108185315826e-05, + "loss": 1.3618, + "step": 4327 + }, + { + "epoch": 0.23662233642688243, + "grad_norm": 1.737929105758667, + "learning_rate": 1.830809129472888e-05, + "loss": 1.1629, + "step": 4328 + }, + { + "epoch": 0.236677008870604, + "grad_norm": 1.4557172060012817, + "learning_rate": 1.8307074126715267e-05, + "loss": 1.5736, + "step": 4329 + }, + { + "epoch": 0.23673168131432554, + "grad_norm": 1.4770638942718506, + "learning_rate": 1.830605668130896e-05, + "loss": 1.44, + "step": 4330 + }, + { + "epoch": 0.2367863537580471, + "grad_norm": 1.2331651449203491, + "learning_rate": 1.830503895854393e-05, + "loss": 1.4365, + "step": 4331 + }, + { + "epoch": 0.23684102620176867, + "grad_norm": 1.7034788131713867, + "learning_rate": 1.8304020958454156e-05, + "loss": 1.4169, + "step": 4332 + }, + { + "epoch": 0.2368956986454902, + "grad_norm": 1.4945744276046753, + "learning_rate": 1.830300268107364e-05, + "loss": 1.3489, + "step": 4333 + }, + { + "epoch": 0.23695037108921177, + "grad_norm": 1.3653128147125244, + "learning_rate": 1.830198412643638e-05, + "loss": 1.4053, + "step": 4334 + }, + { + "epoch": 0.2370050435329333, + "grad_norm": 1.537984848022461, + "learning_rate": 1.830096529457639e-05, + "loss": 1.4767, + "step": 4335 + }, + { + "epoch": 0.23705971597665487, + "grad_norm": 1.4655238389968872, + "learning_rate": 1.829994618552769e-05, + "loss": 1.3571, + "step": 4336 + }, + { + "epoch": 0.2371143884203764, + "grad_norm": 1.984017014503479, + "learning_rate": 1.8298926799324307e-05, + "loss": 1.4609, + "step": 4337 + }, + { + "epoch": 0.23716906086409797, + "grad_norm": 1.451350212097168, + "learning_rate": 1.8297907136000287e-05, + "loss": 1.4418, + "step": 4338 + }, + { + "epoch": 0.23722373330781954, + "grad_norm": 1.5324656963348389, + "learning_rate": 1.8296887195589678e-05, + "loss": 1.5368, + "step": 4339 + }, + { + "epoch": 0.23727840575154108, + "grad_norm": 1.4530632495880127, + "learning_rate": 1.829586697812653e-05, + "loss": 1.4858, + "step": 4340 + }, + { + "epoch": 0.23733307819526264, + "grad_norm": 1.3133325576782227, + "learning_rate": 1.8294846483644922e-05, + "loss": 1.4961, + "step": 4341 + }, + { + "epoch": 0.23738775063898418, + "grad_norm": 1.2337902784347534, + "learning_rate": 1.829382571217892e-05, + "loss": 1.3085, + "step": 4342 + }, + { + "epoch": 0.23744242308270574, + "grad_norm": 1.3839774131774902, + "learning_rate": 1.8292804663762624e-05, + "loss": 1.5879, + "step": 4343 + }, + { + "epoch": 0.23749709552642728, + "grad_norm": 1.7126766443252563, + "learning_rate": 1.8291783338430113e-05, + "loss": 1.5977, + "step": 4344 + }, + { + "epoch": 0.23755176797014885, + "grad_norm": 1.621975064277649, + "learning_rate": 1.8290761736215503e-05, + "loss": 1.4682, + "step": 4345 + }, + { + "epoch": 0.2376064404138704, + "grad_norm": 1.4547199010849, + "learning_rate": 1.8289739857152903e-05, + "loss": 1.5654, + "step": 4346 + }, + { + "epoch": 0.23766111285759195, + "grad_norm": 1.27836012840271, + "learning_rate": 1.8288717701276436e-05, + "loss": 1.7441, + "step": 4347 + }, + { + "epoch": 0.23771578530131351, + "grad_norm": 1.2681314945220947, + "learning_rate": 1.8287695268620237e-05, + "loss": 1.5657, + "step": 4348 + }, + { + "epoch": 0.23777045774503505, + "grad_norm": 1.2444732189178467, + "learning_rate": 1.8286672559218442e-05, + "loss": 1.5044, + "step": 4349 + }, + { + "epoch": 0.23782513018875662, + "grad_norm": 1.6340872049331665, + "learning_rate": 1.828564957310521e-05, + "loss": 1.446, + "step": 4350 + }, + { + "epoch": 0.23787980263247815, + "grad_norm": 2.5651559829711914, + "learning_rate": 1.828462631031469e-05, + "loss": 1.4311, + "step": 4351 + }, + { + "epoch": 0.23793447507619972, + "grad_norm": 1.8563286066055298, + "learning_rate": 1.8283602770881058e-05, + "loss": 1.572, + "step": 4352 + }, + { + "epoch": 0.23798914751992128, + "grad_norm": 1.8092782497406006, + "learning_rate": 1.8282578954838493e-05, + "loss": 1.6015, + "step": 4353 + }, + { + "epoch": 0.23804381996364282, + "grad_norm": 1.4651066064834595, + "learning_rate": 1.8281554862221182e-05, + "loss": 1.308, + "step": 4354 + }, + { + "epoch": 0.2380984924073644, + "grad_norm": 1.253187656402588, + "learning_rate": 1.8280530493063318e-05, + "loss": 1.5736, + "step": 4355 + }, + { + "epoch": 0.23815316485108592, + "grad_norm": 1.439171552658081, + "learning_rate": 1.8279505847399113e-05, + "loss": 1.747, + "step": 4356 + }, + { + "epoch": 0.2382078372948075, + "grad_norm": 1.4477956295013428, + "learning_rate": 1.827848092526278e-05, + "loss": 1.6862, + "step": 4357 + }, + { + "epoch": 0.23826250973852903, + "grad_norm": 1.5847457647323608, + "learning_rate": 1.8277455726688538e-05, + "loss": 1.3155, + "step": 4358 + }, + { + "epoch": 0.2383171821822506, + "grad_norm": 1.0070490837097168, + "learning_rate": 1.8276430251710628e-05, + "loss": 1.5291, + "step": 4359 + }, + { + "epoch": 0.23837185462597216, + "grad_norm": 1.632210612297058, + "learning_rate": 1.8275404500363293e-05, + "loss": 1.3031, + "step": 4360 + }, + { + "epoch": 0.2384265270696937, + "grad_norm": 1.5433902740478516, + "learning_rate": 1.8274378472680782e-05, + "loss": 1.5191, + "step": 4361 + }, + { + "epoch": 0.23848119951341526, + "grad_norm": 1.8851898908615112, + "learning_rate": 1.8273352168697354e-05, + "loss": 1.5319, + "step": 4362 + }, + { + "epoch": 0.2385358719571368, + "grad_norm": 1.6865589618682861, + "learning_rate": 1.8272325588447286e-05, + "loss": 1.4607, + "step": 4363 + }, + { + "epoch": 0.23859054440085836, + "grad_norm": 1.5429251194000244, + "learning_rate": 1.8271298731964853e-05, + "loss": 1.2889, + "step": 4364 + }, + { + "epoch": 0.2386452168445799, + "grad_norm": 1.6108647584915161, + "learning_rate": 1.827027159928435e-05, + "loss": 1.5767, + "step": 4365 + }, + { + "epoch": 0.23869988928830146, + "grad_norm": 1.483099341392517, + "learning_rate": 1.826924419044007e-05, + "loss": 1.3996, + "step": 4366 + }, + { + "epoch": 0.23875456173202303, + "grad_norm": 1.410265564918518, + "learning_rate": 1.8268216505466318e-05, + "loss": 1.5722, + "step": 4367 + }, + { + "epoch": 0.23880923417574457, + "grad_norm": 1.647821068763733, + "learning_rate": 1.8267188544397417e-05, + "loss": 1.6405, + "step": 4368 + }, + { + "epoch": 0.23886390661946613, + "grad_norm": 1.5690460205078125, + "learning_rate": 1.8266160307267692e-05, + "loss": 1.498, + "step": 4369 + }, + { + "epoch": 0.23891857906318767, + "grad_norm": 1.431566834449768, + "learning_rate": 1.8265131794111478e-05, + "loss": 1.5232, + "step": 4370 + }, + { + "epoch": 0.23897325150690923, + "grad_norm": 1.4808932542800903, + "learning_rate": 1.826410300496312e-05, + "loss": 1.4847, + "step": 4371 + }, + { + "epoch": 0.23902792395063077, + "grad_norm": 1.920005202293396, + "learning_rate": 1.8263073939856965e-05, + "loss": 1.6209, + "step": 4372 + }, + { + "epoch": 0.23908259639435234, + "grad_norm": 1.7941986322402954, + "learning_rate": 1.8262044598827387e-05, + "loss": 1.4765, + "step": 4373 + }, + { + "epoch": 0.2391372688380739, + "grad_norm": 1.220805287361145, + "learning_rate": 1.826101498190875e-05, + "loss": 1.5228, + "step": 4374 + }, + { + "epoch": 0.23919194128179544, + "grad_norm": 1.211711049079895, + "learning_rate": 1.8259985089135436e-05, + "loss": 1.7175, + "step": 4375 + }, + { + "epoch": 0.239246613725517, + "grad_norm": 1.865556240081787, + "learning_rate": 1.8258954920541838e-05, + "loss": 1.4451, + "step": 4376 + }, + { + "epoch": 0.23930128616923854, + "grad_norm": 1.917999029159546, + "learning_rate": 1.8257924476162355e-05, + "loss": 1.4648, + "step": 4377 + }, + { + "epoch": 0.2393559586129601, + "grad_norm": 1.436911940574646, + "learning_rate": 1.82568937560314e-05, + "loss": 1.5264, + "step": 4378 + }, + { + "epoch": 0.23941063105668164, + "grad_norm": 1.1524658203125, + "learning_rate": 1.825586276018338e-05, + "loss": 1.3835, + "step": 4379 + }, + { + "epoch": 0.2394653035004032, + "grad_norm": 1.355096697807312, + "learning_rate": 1.8254831488652733e-05, + "loss": 1.4638, + "step": 4380 + }, + { + "epoch": 0.23951997594412477, + "grad_norm": 1.8090887069702148, + "learning_rate": 1.8253799941473894e-05, + "loss": 1.702, + "step": 4381 + }, + { + "epoch": 0.2395746483878463, + "grad_norm": 1.562937617301941, + "learning_rate": 1.8252768118681305e-05, + "loss": 1.5763, + "step": 4382 + }, + { + "epoch": 0.23962932083156788, + "grad_norm": 1.3210004568099976, + "learning_rate": 1.825173602030942e-05, + "loss": 1.4714, + "step": 4383 + }, + { + "epoch": 0.23968399327528941, + "grad_norm": 1.5264856815338135, + "learning_rate": 1.8250703646392712e-05, + "loss": 1.1751, + "step": 4384 + }, + { + "epoch": 0.23973866571901098, + "grad_norm": 1.9742460250854492, + "learning_rate": 1.8249670996965647e-05, + "loss": 1.1962, + "step": 4385 + }, + { + "epoch": 0.23979333816273252, + "grad_norm": 1.3127996921539307, + "learning_rate": 1.8248638072062704e-05, + "loss": 1.3849, + "step": 4386 + }, + { + "epoch": 0.23984801060645408, + "grad_norm": 1.335302472114563, + "learning_rate": 1.824760487171838e-05, + "loss": 1.5066, + "step": 4387 + }, + { + "epoch": 0.23990268305017565, + "grad_norm": 1.4493441581726074, + "learning_rate": 1.824657139596718e-05, + "loss": 1.5685, + "step": 4388 + }, + { + "epoch": 0.23995735549389718, + "grad_norm": 1.3147460222244263, + "learning_rate": 1.824553764484361e-05, + "loss": 1.628, + "step": 4389 + }, + { + "epoch": 0.24001202793761875, + "grad_norm": 1.5463649034500122, + "learning_rate": 1.8244503618382186e-05, + "loss": 1.5923, + "step": 4390 + }, + { + "epoch": 0.2400667003813403, + "grad_norm": 1.5225977897644043, + "learning_rate": 1.824346931661744e-05, + "loss": 1.4843, + "step": 4391 + }, + { + "epoch": 0.24012137282506185, + "grad_norm": 2.0720374584198, + "learning_rate": 1.8242434739583914e-05, + "loss": 1.391, + "step": 4392 + }, + { + "epoch": 0.2401760452687834, + "grad_norm": 1.6701675653457642, + "learning_rate": 1.8241399887316145e-05, + "loss": 1.5384, + "step": 4393 + }, + { + "epoch": 0.24023071771250495, + "grad_norm": 4.0083231925964355, + "learning_rate": 1.82403647598487e-05, + "loss": 1.5523, + "step": 4394 + }, + { + "epoch": 0.24028539015622652, + "grad_norm": 1.4865267276763916, + "learning_rate": 1.8239329357216135e-05, + "loss": 1.5692, + "step": 4395 + }, + { + "epoch": 0.24034006259994806, + "grad_norm": 1.8935694694519043, + "learning_rate": 1.823829367945303e-05, + "loss": 1.5449, + "step": 4396 + }, + { + "epoch": 0.24039473504366962, + "grad_norm": 1.315659999847412, + "learning_rate": 1.8237257726593967e-05, + "loss": 1.4554, + "step": 4397 + }, + { + "epoch": 0.24044940748739116, + "grad_norm": 1.5599384307861328, + "learning_rate": 1.8236221498673542e-05, + "loss": 1.3511, + "step": 4398 + }, + { + "epoch": 0.24050407993111272, + "grad_norm": 1.6924405097961426, + "learning_rate": 1.823518499572635e-05, + "loss": 1.6382, + "step": 4399 + }, + { + "epoch": 0.24055875237483426, + "grad_norm": 1.3537243604660034, + "learning_rate": 1.8234148217787007e-05, + "loss": 1.4253, + "step": 4400 + }, + { + "epoch": 0.24061342481855583, + "grad_norm": 1.3707959651947021, + "learning_rate": 1.8233111164890135e-05, + "loss": 1.4826, + "step": 4401 + }, + { + "epoch": 0.2406680972622774, + "grad_norm": 1.4412848949432373, + "learning_rate": 1.823207383707036e-05, + "loss": 1.5017, + "step": 4402 + }, + { + "epoch": 0.24072276970599893, + "grad_norm": 2.0516421794891357, + "learning_rate": 1.823103623436232e-05, + "loss": 1.4713, + "step": 4403 + }, + { + "epoch": 0.2407774421497205, + "grad_norm": 1.7061139345169067, + "learning_rate": 1.822999835680067e-05, + "loss": 1.3217, + "step": 4404 + }, + { + "epoch": 0.24083211459344203, + "grad_norm": 1.3332171440124512, + "learning_rate": 1.8228960204420063e-05, + "loss": 1.5904, + "step": 4405 + }, + { + "epoch": 0.2408867870371636, + "grad_norm": 1.4228239059448242, + "learning_rate": 1.8227921777255165e-05, + "loss": 1.5336, + "step": 4406 + }, + { + "epoch": 0.24094145948088513, + "grad_norm": 1.9030276536941528, + "learning_rate": 1.822688307534065e-05, + "loss": 1.4774, + "step": 4407 + }, + { + "epoch": 0.2409961319246067, + "grad_norm": 1.7092382907867432, + "learning_rate": 1.82258440987112e-05, + "loss": 1.4285, + "step": 4408 + }, + { + "epoch": 0.24105080436832826, + "grad_norm": 1.7782855033874512, + "learning_rate": 1.8224804847401518e-05, + "loss": 1.2659, + "step": 4409 + }, + { + "epoch": 0.2411054768120498, + "grad_norm": 1.3308942317962646, + "learning_rate": 1.82237653214463e-05, + "loss": 1.5877, + "step": 4410 + }, + { + "epoch": 0.24116014925577137, + "grad_norm": 1.7833107709884644, + "learning_rate": 1.822272552088026e-05, + "loss": 1.3497, + "step": 4411 + }, + { + "epoch": 0.2412148216994929, + "grad_norm": 1.6002501249313354, + "learning_rate": 1.8221685445738123e-05, + "loss": 1.3008, + "step": 4412 + }, + { + "epoch": 0.24126949414321447, + "grad_norm": 1.681188941001892, + "learning_rate": 1.8220645096054613e-05, + "loss": 1.4078, + "step": 4413 + }, + { + "epoch": 0.241324166586936, + "grad_norm": 1.404900312423706, + "learning_rate": 1.8219604471864472e-05, + "loss": 1.5078, + "step": 4414 + }, + { + "epoch": 0.24137883903065757, + "grad_norm": 1.452004075050354, + "learning_rate": 1.8218563573202453e-05, + "loss": 1.2485, + "step": 4415 + }, + { + "epoch": 0.24143351147437914, + "grad_norm": 1.4066064357757568, + "learning_rate": 1.821752240010331e-05, + "loss": 1.4893, + "step": 4416 + }, + { + "epoch": 0.24148818391810067, + "grad_norm": 2.8114209175109863, + "learning_rate": 1.821648095260181e-05, + "loss": 1.3636, + "step": 4417 + }, + { + "epoch": 0.24154285636182224, + "grad_norm": 1.4372916221618652, + "learning_rate": 1.821543923073273e-05, + "loss": 1.2726, + "step": 4418 + }, + { + "epoch": 0.24159752880554378, + "grad_norm": 1.6234583854675293, + "learning_rate": 1.8214397234530855e-05, + "loss": 1.5362, + "step": 4419 + }, + { + "epoch": 0.24165220124926534, + "grad_norm": 1.2610124349594116, + "learning_rate": 1.8213354964030984e-05, + "loss": 1.6744, + "step": 4420 + }, + { + "epoch": 0.24170687369298688, + "grad_norm": 1.3236496448516846, + "learning_rate": 1.8212312419267917e-05, + "loss": 1.5913, + "step": 4421 + }, + { + "epoch": 0.24176154613670844, + "grad_norm": 1.3167610168457031, + "learning_rate": 1.8211269600276466e-05, + "loss": 1.432, + "step": 4422 + }, + { + "epoch": 0.24181621858043, + "grad_norm": 1.6292670965194702, + "learning_rate": 1.8210226507091454e-05, + "loss": 1.5388, + "step": 4423 + }, + { + "epoch": 0.24187089102415155, + "grad_norm": 1.781249761581421, + "learning_rate": 1.8209183139747716e-05, + "loss": 1.3317, + "step": 4424 + }, + { + "epoch": 0.2419255634678731, + "grad_norm": 1.3878480195999146, + "learning_rate": 1.8208139498280087e-05, + "loss": 1.4153, + "step": 4425 + }, + { + "epoch": 0.24198023591159465, + "grad_norm": 1.4986674785614014, + "learning_rate": 1.8207095582723418e-05, + "loss": 1.4356, + "step": 4426 + }, + { + "epoch": 0.24203490835531621, + "grad_norm": 1.3116464614868164, + "learning_rate": 1.820605139311257e-05, + "loss": 1.7206, + "step": 4427 + }, + { + "epoch": 0.24208958079903775, + "grad_norm": 1.3690458536148071, + "learning_rate": 1.8205006929482412e-05, + "loss": 1.4284, + "step": 4428 + }, + { + "epoch": 0.24214425324275932, + "grad_norm": 1.5726062059402466, + "learning_rate": 1.8203962191867816e-05, + "loss": 1.6212, + "step": 4429 + }, + { + "epoch": 0.24219892568648088, + "grad_norm": 1.6337575912475586, + "learning_rate": 1.8202917180303673e-05, + "loss": 1.3533, + "step": 4430 + }, + { + "epoch": 0.24225359813020242, + "grad_norm": 1.2503881454467773, + "learning_rate": 1.8201871894824877e-05, + "loss": 1.2025, + "step": 4431 + }, + { + "epoch": 0.24230827057392398, + "grad_norm": 1.6388598680496216, + "learning_rate": 1.820082633546633e-05, + "loss": 1.3421, + "step": 4432 + }, + { + "epoch": 0.24236294301764552, + "grad_norm": 1.1304070949554443, + "learning_rate": 1.8199780502262948e-05, + "loss": 1.7342, + "step": 4433 + }, + { + "epoch": 0.2424176154613671, + "grad_norm": 1.734156847000122, + "learning_rate": 1.8198734395249658e-05, + "loss": 1.4404, + "step": 4434 + }, + { + "epoch": 0.24247228790508865, + "grad_norm": 1.7899092435836792, + "learning_rate": 1.8197688014461384e-05, + "loss": 1.4619, + "step": 4435 + }, + { + "epoch": 0.2425269603488102, + "grad_norm": 1.4207490682601929, + "learning_rate": 1.8196641359933068e-05, + "loss": 1.4544, + "step": 4436 + }, + { + "epoch": 0.24258163279253175, + "grad_norm": 1.5364412069320679, + "learning_rate": 1.819559443169967e-05, + "loss": 1.4156, + "step": 4437 + }, + { + "epoch": 0.2426363052362533, + "grad_norm": 1.398687720298767, + "learning_rate": 1.8194547229796137e-05, + "loss": 1.4942, + "step": 4438 + }, + { + "epoch": 0.24269097767997486, + "grad_norm": 1.8475288152694702, + "learning_rate": 1.8193499754257444e-05, + "loss": 1.4361, + "step": 4439 + }, + { + "epoch": 0.2427456501236964, + "grad_norm": 1.7756823301315308, + "learning_rate": 1.8192452005118567e-05, + "loss": 1.4076, + "step": 4440 + }, + { + "epoch": 0.24280032256741796, + "grad_norm": 1.3053852319717407, + "learning_rate": 1.8191403982414496e-05, + "loss": 1.2607, + "step": 4441 + }, + { + "epoch": 0.24285499501113952, + "grad_norm": 1.298339605331421, + "learning_rate": 1.819035568618022e-05, + "loss": 1.4367, + "step": 4442 + }, + { + "epoch": 0.24290966745486106, + "grad_norm": 1.535028100013733, + "learning_rate": 1.8189307116450755e-05, + "loss": 1.6763, + "step": 4443 + }, + { + "epoch": 0.24296433989858263, + "grad_norm": 1.3852319717407227, + "learning_rate": 1.8188258273261104e-05, + "loss": 1.4426, + "step": 4444 + }, + { + "epoch": 0.24301901234230416, + "grad_norm": 1.5774705410003662, + "learning_rate": 1.8187209156646294e-05, + "loss": 1.4316, + "step": 4445 + }, + { + "epoch": 0.24307368478602573, + "grad_norm": 1.3098254203796387, + "learning_rate": 1.8186159766641362e-05, + "loss": 1.5014, + "step": 4446 + }, + { + "epoch": 0.24312835722974727, + "grad_norm": 1.4274144172668457, + "learning_rate": 1.8185110103281343e-05, + "loss": 1.3845, + "step": 4447 + }, + { + "epoch": 0.24318302967346883, + "grad_norm": 1.541902780532837, + "learning_rate": 1.818406016660129e-05, + "loss": 1.3548, + "step": 4448 + }, + { + "epoch": 0.2432377021171904, + "grad_norm": 1.572117805480957, + "learning_rate": 1.8183009956636266e-05, + "loss": 1.5466, + "step": 4449 + }, + { + "epoch": 0.24329237456091193, + "grad_norm": 1.9249001741409302, + "learning_rate": 1.8181959473421335e-05, + "loss": 1.1959, + "step": 4450 + }, + { + "epoch": 0.2433470470046335, + "grad_norm": 1.41232168674469, + "learning_rate": 1.818090871699158e-05, + "loss": 1.2585, + "step": 4451 + }, + { + "epoch": 0.24340171944835504, + "grad_norm": 1.582842230796814, + "learning_rate": 1.8179857687382084e-05, + "loss": 1.5948, + "step": 4452 + }, + { + "epoch": 0.2434563918920766, + "grad_norm": 1.5178600549697876, + "learning_rate": 1.8178806384627947e-05, + "loss": 1.4278, + "step": 4453 + }, + { + "epoch": 0.24351106433579814, + "grad_norm": 1.7253928184509277, + "learning_rate": 1.8177754808764275e-05, + "loss": 1.5382, + "step": 4454 + }, + { + "epoch": 0.2435657367795197, + "grad_norm": 1.330122947692871, + "learning_rate": 1.8176702959826172e-05, + "loss": 1.623, + "step": 4455 + }, + { + "epoch": 0.24362040922324127, + "grad_norm": 1.4162546396255493, + "learning_rate": 1.817565083784878e-05, + "loss": 1.3284, + "step": 4456 + }, + { + "epoch": 0.2436750816669628, + "grad_norm": 1.8620656728744507, + "learning_rate": 1.8174598442867214e-05, + "loss": 1.5324, + "step": 4457 + }, + { + "epoch": 0.24372975411068437, + "grad_norm": 2.0763790607452393, + "learning_rate": 1.8173545774916628e-05, + "loss": 1.4362, + "step": 4458 + }, + { + "epoch": 0.2437844265544059, + "grad_norm": 1.3705519437789917, + "learning_rate": 1.8172492834032165e-05, + "loss": 1.5873, + "step": 4459 + }, + { + "epoch": 0.24383909899812747, + "grad_norm": 1.8282884359359741, + "learning_rate": 1.8171439620248993e-05, + "loss": 1.3915, + "step": 4460 + }, + { + "epoch": 0.243893771441849, + "grad_norm": 1.3562127351760864, + "learning_rate": 1.8170386133602273e-05, + "loss": 1.5894, + "step": 4461 + }, + { + "epoch": 0.24394844388557058, + "grad_norm": 1.566723346710205, + "learning_rate": 1.8169332374127192e-05, + "loss": 1.3383, + "step": 4462 + }, + { + "epoch": 0.24400311632929214, + "grad_norm": 1.411350965499878, + "learning_rate": 1.816827834185893e-05, + "loss": 1.5016, + "step": 4463 + }, + { + "epoch": 0.24405778877301368, + "grad_norm": 1.3598048686981201, + "learning_rate": 1.816722403683269e-05, + "loss": 1.5776, + "step": 4464 + }, + { + "epoch": 0.24411246121673524, + "grad_norm": 3.1155943870544434, + "learning_rate": 1.8166169459083673e-05, + "loss": 1.5162, + "step": 4465 + }, + { + "epoch": 0.24416713366045678, + "grad_norm": 1.4335048198699951, + "learning_rate": 1.8165114608647093e-05, + "loss": 1.4899, + "step": 4466 + }, + { + "epoch": 0.24422180610417835, + "grad_norm": 1.2194007635116577, + "learning_rate": 1.8164059485558177e-05, + "loss": 1.5837, + "step": 4467 + }, + { + "epoch": 0.24427647854789988, + "grad_norm": 1.7493599653244019, + "learning_rate": 1.8163004089852157e-05, + "loss": 1.3785, + "step": 4468 + }, + { + "epoch": 0.24433115099162145, + "grad_norm": 1.3697112798690796, + "learning_rate": 1.8161948421564277e-05, + "loss": 1.3762, + "step": 4469 + }, + { + "epoch": 0.244385823435343, + "grad_norm": 1.7147058248519897, + "learning_rate": 1.8160892480729787e-05, + "loss": 1.614, + "step": 4470 + }, + { + "epoch": 0.24444049587906455, + "grad_norm": 2.1097638607025146, + "learning_rate": 1.8159836267383944e-05, + "loss": 1.5844, + "step": 4471 + }, + { + "epoch": 0.24449516832278612, + "grad_norm": 1.6763325929641724, + "learning_rate": 1.8158779781562022e-05, + "loss": 1.4154, + "step": 4472 + }, + { + "epoch": 0.24454984076650765, + "grad_norm": 1.638491153717041, + "learning_rate": 1.8157723023299295e-05, + "loss": 1.2845, + "step": 4473 + }, + { + "epoch": 0.24460451321022922, + "grad_norm": 1.425336241722107, + "learning_rate": 1.815666599263106e-05, + "loss": 1.4044, + "step": 4474 + }, + { + "epoch": 0.24465918565395076, + "grad_norm": 1.68820059299469, + "learning_rate": 1.8155608689592604e-05, + "loss": 1.458, + "step": 4475 + }, + { + "epoch": 0.24471385809767232, + "grad_norm": 1.597923994064331, + "learning_rate": 1.8154551114219235e-05, + "loss": 1.6989, + "step": 4476 + }, + { + "epoch": 0.24476853054139389, + "grad_norm": 1.1526222229003906, + "learning_rate": 1.815349326654627e-05, + "loss": 1.5732, + "step": 4477 + }, + { + "epoch": 0.24482320298511542, + "grad_norm": 1.3186293840408325, + "learning_rate": 1.815243514660903e-05, + "loss": 1.3624, + "step": 4478 + }, + { + "epoch": 0.244877875428837, + "grad_norm": 1.6770198345184326, + "learning_rate": 1.8151376754442856e-05, + "loss": 1.5045, + "step": 4479 + }, + { + "epoch": 0.24493254787255853, + "grad_norm": 1.577692985534668, + "learning_rate": 1.815031809008308e-05, + "loss": 1.7025, + "step": 4480 + }, + { + "epoch": 0.2449872203162801, + "grad_norm": 2.027815818786621, + "learning_rate": 1.8149259153565058e-05, + "loss": 1.4644, + "step": 4481 + }, + { + "epoch": 0.24504189276000163, + "grad_norm": 1.5668035745620728, + "learning_rate": 1.814819994492415e-05, + "loss": 1.3878, + "step": 4482 + }, + { + "epoch": 0.2450965652037232, + "grad_norm": 1.3710311651229858, + "learning_rate": 1.8147140464195726e-05, + "loss": 1.2424, + "step": 4483 + }, + { + "epoch": 0.24515123764744476, + "grad_norm": 1.1838996410369873, + "learning_rate": 1.814608071141516e-05, + "loss": 1.4521, + "step": 4484 + }, + { + "epoch": 0.2452059100911663, + "grad_norm": 1.4881101846694946, + "learning_rate": 1.8145020686617848e-05, + "loss": 1.596, + "step": 4485 + }, + { + "epoch": 0.24526058253488786, + "grad_norm": 1.831723928451538, + "learning_rate": 1.8143960389839184e-05, + "loss": 1.3529, + "step": 4486 + }, + { + "epoch": 0.2453152549786094, + "grad_norm": 1.3778793811798096, + "learning_rate": 1.814289982111457e-05, + "loss": 1.2845, + "step": 4487 + }, + { + "epoch": 0.24536992742233096, + "grad_norm": 2.028007984161377, + "learning_rate": 1.8141838980479424e-05, + "loss": 1.4141, + "step": 4488 + }, + { + "epoch": 0.2454245998660525, + "grad_norm": 2.192185640335083, + "learning_rate": 1.8140777867969167e-05, + "loss": 1.4626, + "step": 4489 + }, + { + "epoch": 0.24547927230977407, + "grad_norm": 1.4933706521987915, + "learning_rate": 1.8139716483619234e-05, + "loss": 1.5439, + "step": 4490 + }, + { + "epoch": 0.24553394475349563, + "grad_norm": 1.3065309524536133, + "learning_rate": 1.813865482746507e-05, + "loss": 1.378, + "step": 4491 + }, + { + "epoch": 0.24558861719721717, + "grad_norm": 1.9466880559921265, + "learning_rate": 1.813759289954212e-05, + "loss": 1.3055, + "step": 4492 + }, + { + "epoch": 0.24564328964093873, + "grad_norm": 2.7912886142730713, + "learning_rate": 1.8136530699885852e-05, + "loss": 1.1606, + "step": 4493 + }, + { + "epoch": 0.24569796208466027, + "grad_norm": 1.5713002681732178, + "learning_rate": 1.8135468228531727e-05, + "loss": 1.1461, + "step": 4494 + }, + { + "epoch": 0.24575263452838184, + "grad_norm": 1.727418065071106, + "learning_rate": 1.813440548551523e-05, + "loss": 1.1921, + "step": 4495 + }, + { + "epoch": 0.24580730697210337, + "grad_norm": 1.692345380783081, + "learning_rate": 1.8133342470871846e-05, + "loss": 1.6094, + "step": 4496 + }, + { + "epoch": 0.24586197941582494, + "grad_norm": 1.4125840663909912, + "learning_rate": 1.8132279184637064e-05, + "loss": 1.42, + "step": 4497 + }, + { + "epoch": 0.2459166518595465, + "grad_norm": 1.470534086227417, + "learning_rate": 1.8131215626846405e-05, + "loss": 1.5366, + "step": 4498 + }, + { + "epoch": 0.24597132430326804, + "grad_norm": 1.5073018074035645, + "learning_rate": 1.8130151797535375e-05, + "loss": 1.3738, + "step": 4499 + }, + { + "epoch": 0.2460259967469896, + "grad_norm": 1.6097201108932495, + "learning_rate": 1.8129087696739497e-05, + "loss": 1.3492, + "step": 4500 + }, + { + "epoch": 0.24608066919071114, + "grad_norm": 1.8278266191482544, + "learning_rate": 1.8128023324494303e-05, + "loss": 1.3908, + "step": 4501 + }, + { + "epoch": 0.2461353416344327, + "grad_norm": 1.9581501483917236, + "learning_rate": 1.812695868083534e-05, + "loss": 1.4568, + "step": 4502 + }, + { + "epoch": 0.24619001407815425, + "grad_norm": 1.3367767333984375, + "learning_rate": 1.8125893765798157e-05, + "loss": 1.5049, + "step": 4503 + }, + { + "epoch": 0.2462446865218758, + "grad_norm": 1.5314511060714722, + "learning_rate": 1.812482857941831e-05, + "loss": 1.5956, + "step": 4504 + }, + { + "epoch": 0.24629935896559738, + "grad_norm": 1.3844155073165894, + "learning_rate": 1.8123763121731374e-05, + "loss": 1.5656, + "step": 4505 + }, + { + "epoch": 0.2463540314093189, + "grad_norm": 2.0906403064727783, + "learning_rate": 1.8122697392772925e-05, + "loss": 1.537, + "step": 4506 + }, + { + "epoch": 0.24640870385304048, + "grad_norm": 1.4954595565795898, + "learning_rate": 1.8121631392578545e-05, + "loss": 1.6569, + "step": 4507 + }, + { + "epoch": 0.24646337629676202, + "grad_norm": 1.7364856004714966, + "learning_rate": 1.812056512118384e-05, + "loss": 1.543, + "step": 4508 + }, + { + "epoch": 0.24651804874048358, + "grad_norm": 1.3395835161209106, + "learning_rate": 1.8119498578624408e-05, + "loss": 1.4624, + "step": 4509 + }, + { + "epoch": 0.24657272118420512, + "grad_norm": 1.9472867250442505, + "learning_rate": 1.8118431764935865e-05, + "loss": 1.41, + "step": 4510 + }, + { + "epoch": 0.24662739362792668, + "grad_norm": 1.443447232246399, + "learning_rate": 1.8117364680153838e-05, + "loss": 1.4283, + "step": 4511 + }, + { + "epoch": 0.24668206607164825, + "grad_norm": 1.5102126598358154, + "learning_rate": 1.8116297324313953e-05, + "loss": 1.5053, + "step": 4512 + }, + { + "epoch": 0.24673673851536979, + "grad_norm": 1.7663731575012207, + "learning_rate": 1.8115229697451853e-05, + "loss": 1.5119, + "step": 4513 + }, + { + "epoch": 0.24679141095909135, + "grad_norm": 1.3309409618377686, + "learning_rate": 1.8114161799603195e-05, + "loss": 1.2672, + "step": 4514 + }, + { + "epoch": 0.2468460834028129, + "grad_norm": 1.50417160987854, + "learning_rate": 1.8113093630803632e-05, + "loss": 1.608, + "step": 4515 + }, + { + "epoch": 0.24690075584653445, + "grad_norm": 1.318892478942871, + "learning_rate": 1.8112025191088834e-05, + "loss": 1.4626, + "step": 4516 + }, + { + "epoch": 0.246955428290256, + "grad_norm": 1.7004441022872925, + "learning_rate": 1.811095648049448e-05, + "loss": 1.5713, + "step": 4517 + }, + { + "epoch": 0.24701010073397756, + "grad_norm": 1.5651260614395142, + "learning_rate": 1.8109887499056256e-05, + "loss": 1.4899, + "step": 4518 + }, + { + "epoch": 0.24706477317769912, + "grad_norm": 1.6087430715560913, + "learning_rate": 1.8108818246809857e-05, + "loss": 1.4237, + "step": 4519 + }, + { + "epoch": 0.24711944562142066, + "grad_norm": 1.4871376752853394, + "learning_rate": 1.8107748723790993e-05, + "loss": 1.4842, + "step": 4520 + }, + { + "epoch": 0.24717411806514222, + "grad_norm": 1.8597002029418945, + "learning_rate": 1.810667893003537e-05, + "loss": 1.4074, + "step": 4521 + }, + { + "epoch": 0.24722879050886376, + "grad_norm": 1.4121681451797485, + "learning_rate": 1.8105608865578713e-05, + "loss": 1.7353, + "step": 4522 + }, + { + "epoch": 0.24728346295258533, + "grad_norm": 1.2868014574050903, + "learning_rate": 1.8104538530456758e-05, + "loss": 1.4418, + "step": 4523 + }, + { + "epoch": 0.24733813539630686, + "grad_norm": 1.4135098457336426, + "learning_rate": 1.8103467924705245e-05, + "loss": 1.531, + "step": 4524 + }, + { + "epoch": 0.24739280784002843, + "grad_norm": 1.5052870512008667, + "learning_rate": 1.810239704835992e-05, + "loss": 1.3343, + "step": 4525 + }, + { + "epoch": 0.24744748028375, + "grad_norm": 1.716912865638733, + "learning_rate": 1.8101325901456546e-05, + "loss": 1.3476, + "step": 4526 + }, + { + "epoch": 0.24750215272747153, + "grad_norm": 1.8216711282730103, + "learning_rate": 1.810025448403089e-05, + "loss": 1.802, + "step": 4527 + }, + { + "epoch": 0.2475568251711931, + "grad_norm": 1.4257946014404297, + "learning_rate": 1.8099182796118727e-05, + "loss": 1.4827, + "step": 4528 + }, + { + "epoch": 0.24761149761491463, + "grad_norm": 1.5353981256484985, + "learning_rate": 1.8098110837755845e-05, + "loss": 1.6009, + "step": 4529 + }, + { + "epoch": 0.2476661700586362, + "grad_norm": 1.2802218198776245, + "learning_rate": 1.809703860897804e-05, + "loss": 1.6689, + "step": 4530 + }, + { + "epoch": 0.24772084250235774, + "grad_norm": 1.5733487606048584, + "learning_rate": 1.809596610982112e-05, + "loss": 1.5291, + "step": 4531 + }, + { + "epoch": 0.2477755149460793, + "grad_norm": 1.3780193328857422, + "learning_rate": 1.809489334032089e-05, + "loss": 1.6223, + "step": 4532 + }, + { + "epoch": 0.24783018738980087, + "grad_norm": 1.784723162651062, + "learning_rate": 1.8093820300513176e-05, + "loss": 1.6202, + "step": 4533 + }, + { + "epoch": 0.2478848598335224, + "grad_norm": 1.1961987018585205, + "learning_rate": 1.809274699043381e-05, + "loss": 1.4614, + "step": 4534 + }, + { + "epoch": 0.24793953227724397, + "grad_norm": 1.4930357933044434, + "learning_rate": 1.8091673410118633e-05, + "loss": 1.6164, + "step": 4535 + }, + { + "epoch": 0.2479942047209655, + "grad_norm": 1.6423695087432861, + "learning_rate": 1.809059955960349e-05, + "loss": 1.2852, + "step": 4536 + }, + { + "epoch": 0.24804887716468707, + "grad_norm": 1.3059322834014893, + "learning_rate": 1.8089525438924242e-05, + "loss": 1.4595, + "step": 4537 + }, + { + "epoch": 0.24810354960840864, + "grad_norm": 1.5384865999221802, + "learning_rate": 1.808845104811676e-05, + "loss": 1.319, + "step": 4538 + }, + { + "epoch": 0.24815822205213017, + "grad_norm": 1.4639805555343628, + "learning_rate": 1.8087376387216917e-05, + "loss": 1.2488, + "step": 4539 + }, + { + "epoch": 0.24821289449585174, + "grad_norm": 1.9047329425811768, + "learning_rate": 1.80863014562606e-05, + "loss": 1.3936, + "step": 4540 + }, + { + "epoch": 0.24826756693957328, + "grad_norm": 1.5681949853897095, + "learning_rate": 1.80852262552837e-05, + "loss": 1.586, + "step": 4541 + }, + { + "epoch": 0.24832223938329484, + "grad_norm": 1.4823487997055054, + "learning_rate": 1.8084150784322123e-05, + "loss": 1.5812, + "step": 4542 + }, + { + "epoch": 0.24837691182701638, + "grad_norm": 1.2543545961380005, + "learning_rate": 1.808307504341178e-05, + "loss": 1.4992, + "step": 4543 + }, + { + "epoch": 0.24843158427073794, + "grad_norm": 1.5293796062469482, + "learning_rate": 1.8081999032588594e-05, + "loss": 1.1956, + "step": 4544 + }, + { + "epoch": 0.2484862567144595, + "grad_norm": 2.751765489578247, + "learning_rate": 1.80809227518885e-05, + "loss": 1.6273, + "step": 4545 + }, + { + "epoch": 0.24854092915818105, + "grad_norm": 1.5750291347503662, + "learning_rate": 1.8079846201347428e-05, + "loss": 1.4408, + "step": 4546 + }, + { + "epoch": 0.2485956016019026, + "grad_norm": 2.3259079456329346, + "learning_rate": 1.8078769381001334e-05, + "loss": 1.3959, + "step": 4547 + }, + { + "epoch": 0.24865027404562415, + "grad_norm": 1.3065835237503052, + "learning_rate": 1.807769229088617e-05, + "loss": 1.4477, + "step": 4548 + }, + { + "epoch": 0.2487049464893457, + "grad_norm": 1.653353214263916, + "learning_rate": 1.8076614931037908e-05, + "loss": 1.53, + "step": 4549 + }, + { + "epoch": 0.24875961893306725, + "grad_norm": 1.794535517692566, + "learning_rate": 1.8075537301492522e-05, + "loss": 1.279, + "step": 4550 + }, + { + "epoch": 0.24881429137678882, + "grad_norm": 1.6923580169677734, + "learning_rate": 1.807445940228599e-05, + "loss": 1.5749, + "step": 4551 + }, + { + "epoch": 0.24886896382051038, + "grad_norm": 1.4641797542572021, + "learning_rate": 1.807338123345432e-05, + "loss": 1.6214, + "step": 4552 + }, + { + "epoch": 0.24892363626423192, + "grad_norm": 1.3797062635421753, + "learning_rate": 1.80723027950335e-05, + "loss": 1.5666, + "step": 4553 + }, + { + "epoch": 0.24897830870795348, + "grad_norm": 1.7836573123931885, + "learning_rate": 1.8071224087059547e-05, + "loss": 1.3991, + "step": 4554 + }, + { + "epoch": 0.24903298115167502, + "grad_norm": 1.476547122001648, + "learning_rate": 1.8070145109568484e-05, + "loss": 1.4617, + "step": 4555 + }, + { + "epoch": 0.24908765359539659, + "grad_norm": 1.5421595573425293, + "learning_rate": 1.8069065862596338e-05, + "loss": 1.2647, + "step": 4556 + }, + { + "epoch": 0.24914232603911812, + "grad_norm": 1.4371594190597534, + "learning_rate": 1.806798634617915e-05, + "loss": 1.6456, + "step": 4557 + }, + { + "epoch": 0.2491969984828397, + "grad_norm": 1.583432674407959, + "learning_rate": 1.806690656035296e-05, + "loss": 1.569, + "step": 4558 + }, + { + "epoch": 0.24925167092656125, + "grad_norm": 1.339599847793579, + "learning_rate": 1.8065826505153835e-05, + "loss": 1.5443, + "step": 4559 + }, + { + "epoch": 0.2493063433702828, + "grad_norm": 1.4223941564559937, + "learning_rate": 1.806474618061784e-05, + "loss": 1.8483, + "step": 4560 + }, + { + "epoch": 0.24936101581400436, + "grad_norm": 1.4642863273620605, + "learning_rate": 1.806366558678104e-05, + "loss": 1.6662, + "step": 4561 + }, + { + "epoch": 0.2494156882577259, + "grad_norm": 1.298715591430664, + "learning_rate": 1.8062584723679524e-05, + "loss": 1.397, + "step": 4562 + }, + { + "epoch": 0.24947036070144746, + "grad_norm": 1.7083592414855957, + "learning_rate": 1.8061503591349386e-05, + "loss": 1.5686, + "step": 4563 + }, + { + "epoch": 0.249525033145169, + "grad_norm": 1.3934305906295776, + "learning_rate": 1.8060422189826725e-05, + "loss": 1.2611, + "step": 4564 + }, + { + "epoch": 0.24957970558889056, + "grad_norm": 1.7308286428451538, + "learning_rate": 1.8059340519147653e-05, + "loss": 1.5036, + "step": 4565 + }, + { + "epoch": 0.24963437803261213, + "grad_norm": 1.5149123668670654, + "learning_rate": 1.805825857934829e-05, + "loss": 1.4065, + "step": 4566 + }, + { + "epoch": 0.24968905047633366, + "grad_norm": 1.5329147577285767, + "learning_rate": 1.8057176370464765e-05, + "loss": 1.3462, + "step": 4567 + }, + { + "epoch": 0.24974372292005523, + "grad_norm": 1.8229329586029053, + "learning_rate": 1.805609389253321e-05, + "loss": 1.5075, + "step": 4568 + }, + { + "epoch": 0.24979839536377677, + "grad_norm": 1.608276605606079, + "learning_rate": 1.805501114558978e-05, + "loss": 1.244, + "step": 4569 + }, + { + "epoch": 0.24985306780749833, + "grad_norm": 1.3999745845794678, + "learning_rate": 1.8053928129670624e-05, + "loss": 1.4483, + "step": 4570 + }, + { + "epoch": 0.24990774025121987, + "grad_norm": 1.7329338788986206, + "learning_rate": 1.805284484481191e-05, + "loss": 1.2938, + "step": 4571 + }, + { + "epoch": 0.24996241269494143, + "grad_norm": 1.145881175994873, + "learning_rate": 1.805176129104981e-05, + "loss": 1.56, + "step": 4572 + }, + { + "epoch": 0.250017085138663, + "grad_norm": 1.5220776796340942, + "learning_rate": 1.8050677468420503e-05, + "loss": 1.3916, + "step": 4573 + }, + { + "epoch": 0.25007175758238454, + "grad_norm": 1.6924889087677002, + "learning_rate": 1.8049593376960187e-05, + "loss": 1.3965, + "step": 4574 + }, + { + "epoch": 0.2501264300261061, + "grad_norm": 1.6471574306488037, + "learning_rate": 1.804850901670506e-05, + "loss": 1.3888, + "step": 4575 + }, + { + "epoch": 0.25018110246982767, + "grad_norm": 1.917211651802063, + "learning_rate": 1.804742438769133e-05, + "loss": 1.3783, + "step": 4576 + }, + { + "epoch": 0.2502357749135492, + "grad_norm": 1.2947256565093994, + "learning_rate": 1.8046339489955214e-05, + "loss": 1.7466, + "step": 4577 + }, + { + "epoch": 0.25029044735727074, + "grad_norm": 1.4968403577804565, + "learning_rate": 1.804525432353294e-05, + "loss": 1.5021, + "step": 4578 + }, + { + "epoch": 0.2503451198009923, + "grad_norm": 1.3595656156539917, + "learning_rate": 1.8044168888460748e-05, + "loss": 1.5556, + "step": 4579 + }, + { + "epoch": 0.25039979224471387, + "grad_norm": 1.3473864793777466, + "learning_rate": 1.804308318477488e-05, + "loss": 1.4202, + "step": 4580 + }, + { + "epoch": 0.2504544646884354, + "grad_norm": 1.9165891408920288, + "learning_rate": 1.8041997212511594e-05, + "loss": 1.5182, + "step": 4581 + }, + { + "epoch": 0.25050913713215694, + "grad_norm": 1.3975189924240112, + "learning_rate": 1.8040910971707143e-05, + "loss": 1.5809, + "step": 4582 + }, + { + "epoch": 0.25056380957587854, + "grad_norm": 1.2884491682052612, + "learning_rate": 1.8039824462397812e-05, + "loss": 1.4482, + "step": 4583 + }, + { + "epoch": 0.2506184820196001, + "grad_norm": 1.6121565103530884, + "learning_rate": 1.8038737684619874e-05, + "loss": 1.4398, + "step": 4584 + }, + { + "epoch": 0.2506731544633216, + "grad_norm": 1.519930362701416, + "learning_rate": 1.8037650638409622e-05, + "loss": 1.6826, + "step": 4585 + }, + { + "epoch": 0.2507278269070432, + "grad_norm": 1.537805438041687, + "learning_rate": 1.803656332380335e-05, + "loss": 1.5356, + "step": 4586 + }, + { + "epoch": 0.25078249935076474, + "grad_norm": 1.2022521495819092, + "learning_rate": 1.8035475740837376e-05, + "loss": 1.6927, + "step": 4587 + }, + { + "epoch": 0.2508371717944863, + "grad_norm": 1.5551949739456177, + "learning_rate": 1.803438788954801e-05, + "loss": 1.2835, + "step": 4588 + }, + { + "epoch": 0.2508918442382078, + "grad_norm": 1.4134583473205566, + "learning_rate": 1.8033299769971577e-05, + "loss": 1.5486, + "step": 4589 + }, + { + "epoch": 0.2509465166819294, + "grad_norm": 1.4105706214904785, + "learning_rate": 1.8032211382144416e-05, + "loss": 1.4382, + "step": 4590 + }, + { + "epoch": 0.25100118912565095, + "grad_norm": 1.3821731805801392, + "learning_rate": 1.8031122726102868e-05, + "loss": 1.352, + "step": 4591 + }, + { + "epoch": 0.2510558615693725, + "grad_norm": 1.604786992073059, + "learning_rate": 1.8030033801883285e-05, + "loss": 1.4382, + "step": 4592 + }, + { + "epoch": 0.2511105340130941, + "grad_norm": 2.0500402450561523, + "learning_rate": 1.802894460952203e-05, + "loss": 1.1547, + "step": 4593 + }, + { + "epoch": 0.2511652064568156, + "grad_norm": 1.4183831214904785, + "learning_rate": 1.802785514905548e-05, + "loss": 1.4399, + "step": 4594 + }, + { + "epoch": 0.25121987890053715, + "grad_norm": 1.464235544204712, + "learning_rate": 1.8026765420520002e-05, + "loss": 1.2363, + "step": 4595 + }, + { + "epoch": 0.2512745513442587, + "grad_norm": 1.1888766288757324, + "learning_rate": 1.8025675423951995e-05, + "loss": 1.3712, + "step": 4596 + }, + { + "epoch": 0.2513292237879803, + "grad_norm": 1.279093623161316, + "learning_rate": 1.802458515938785e-05, + "loss": 1.7176, + "step": 4597 + }, + { + "epoch": 0.2513838962317018, + "grad_norm": 1.3930152654647827, + "learning_rate": 1.8023494626863976e-05, + "loss": 1.3542, + "step": 4598 + }, + { + "epoch": 0.25143856867542336, + "grad_norm": 1.5836182832717896, + "learning_rate": 1.8022403826416792e-05, + "loss": 1.6204, + "step": 4599 + }, + { + "epoch": 0.25149324111914495, + "grad_norm": 1.3954499959945679, + "learning_rate": 1.8021312758082717e-05, + "loss": 1.6394, + "step": 4600 + }, + { + "epoch": 0.2515479135628665, + "grad_norm": 1.5166420936584473, + "learning_rate": 1.8020221421898185e-05, + "loss": 1.4564, + "step": 4601 + }, + { + "epoch": 0.251602586006588, + "grad_norm": 1.3240267038345337, + "learning_rate": 1.8019129817899643e-05, + "loss": 1.4354, + "step": 4602 + }, + { + "epoch": 0.25165725845030956, + "grad_norm": 1.2539756298065186, + "learning_rate": 1.8018037946123534e-05, + "loss": 1.4088, + "step": 4603 + }, + { + "epoch": 0.25171193089403116, + "grad_norm": 1.4313851594924927, + "learning_rate": 1.8016945806606328e-05, + "loss": 1.3933, + "step": 4604 + }, + { + "epoch": 0.2517666033377527, + "grad_norm": 1.3743536472320557, + "learning_rate": 1.8015853399384488e-05, + "loss": 1.3995, + "step": 4605 + }, + { + "epoch": 0.25182127578147423, + "grad_norm": 1.260909914970398, + "learning_rate": 1.8014760724494493e-05, + "loss": 1.4613, + "step": 4606 + }, + { + "epoch": 0.2518759482251958, + "grad_norm": 1.5717453956604004, + "learning_rate": 1.801366778197283e-05, + "loss": 1.3905, + "step": 4607 + }, + { + "epoch": 0.25193062066891736, + "grad_norm": 2.1817009449005127, + "learning_rate": 1.8012574571855995e-05, + "loss": 1.4832, + "step": 4608 + }, + { + "epoch": 0.2519852931126389, + "grad_norm": 1.386486530303955, + "learning_rate": 1.8011481094180492e-05, + "loss": 1.4028, + "step": 4609 + }, + { + "epoch": 0.25203996555636043, + "grad_norm": 1.8014432191848755, + "learning_rate": 1.8010387348982837e-05, + "loss": 1.3206, + "step": 4610 + }, + { + "epoch": 0.252094638000082, + "grad_norm": 1.3808403015136719, + "learning_rate": 1.800929333629955e-05, + "loss": 1.2952, + "step": 4611 + }, + { + "epoch": 0.25214931044380356, + "grad_norm": 1.4505671262741089, + "learning_rate": 1.8008199056167167e-05, + "loss": 1.3554, + "step": 4612 + }, + { + "epoch": 0.2522039828875251, + "grad_norm": 1.227272391319275, + "learning_rate": 1.800710450862222e-05, + "loss": 1.5678, + "step": 4613 + }, + { + "epoch": 0.2522586553312467, + "grad_norm": 2.2115190029144287, + "learning_rate": 1.800600969370127e-05, + "loss": 1.6763, + "step": 4614 + }, + { + "epoch": 0.25231332777496823, + "grad_norm": 1.599632740020752, + "learning_rate": 1.8004914611440866e-05, + "loss": 1.4336, + "step": 4615 + }, + { + "epoch": 0.25236800021868977, + "grad_norm": 1.534437656402588, + "learning_rate": 1.8003819261877584e-05, + "loss": 1.422, + "step": 4616 + }, + { + "epoch": 0.2524226726624113, + "grad_norm": 1.6578043699264526, + "learning_rate": 1.800272364504799e-05, + "loss": 1.5717, + "step": 4617 + }, + { + "epoch": 0.2524773451061329, + "grad_norm": 1.7172073125839233, + "learning_rate": 1.800162776098868e-05, + "loss": 1.3319, + "step": 4618 + }, + { + "epoch": 0.25253201754985444, + "grad_norm": 1.485256314277649, + "learning_rate": 1.8000531609736236e-05, + "loss": 1.5605, + "step": 4619 + }, + { + "epoch": 0.252586689993576, + "grad_norm": 2.1080822944641113, + "learning_rate": 1.7999435191327272e-05, + "loss": 1.4131, + "step": 4620 + }, + { + "epoch": 0.25264136243729757, + "grad_norm": 1.6243218183517456, + "learning_rate": 1.7998338505798393e-05, + "loss": 1.6294, + "step": 4621 + }, + { + "epoch": 0.2526960348810191, + "grad_norm": 1.3257254362106323, + "learning_rate": 1.7997241553186223e-05, + "loss": 1.664, + "step": 4622 + }, + { + "epoch": 0.25275070732474064, + "grad_norm": 1.6328235864639282, + "learning_rate": 1.7996144333527394e-05, + "loss": 1.1698, + "step": 4623 + }, + { + "epoch": 0.2528053797684622, + "grad_norm": 1.5891635417938232, + "learning_rate": 1.7995046846858542e-05, + "loss": 1.4098, + "step": 4624 + }, + { + "epoch": 0.2528600522121838, + "grad_norm": 1.4119170904159546, + "learning_rate": 1.7993949093216313e-05, + "loss": 1.4356, + "step": 4625 + }, + { + "epoch": 0.2529147246559053, + "grad_norm": 1.6607493162155151, + "learning_rate": 1.7992851072637366e-05, + "loss": 1.4101, + "step": 4626 + }, + { + "epoch": 0.25296939709962685, + "grad_norm": 1.527591586112976, + "learning_rate": 1.7991752785158364e-05, + "loss": 1.5887, + "step": 4627 + }, + { + "epoch": 0.25302406954334844, + "grad_norm": 1.684169054031372, + "learning_rate": 1.7990654230815985e-05, + "loss": 1.7058, + "step": 4628 + }, + { + "epoch": 0.25307874198707, + "grad_norm": 1.436435580253601, + "learning_rate": 1.798955540964691e-05, + "loss": 1.7119, + "step": 4629 + }, + { + "epoch": 0.2531334144307915, + "grad_norm": 1.1983873844146729, + "learning_rate": 1.798845632168783e-05, + "loss": 1.5048, + "step": 4630 + }, + { + "epoch": 0.25318808687451305, + "grad_norm": 1.7154645919799805, + "learning_rate": 1.7987356966975455e-05, + "loss": 1.37, + "step": 4631 + }, + { + "epoch": 0.25324275931823464, + "grad_norm": 1.722623348236084, + "learning_rate": 1.7986257345546484e-05, + "loss": 1.4924, + "step": 4632 + }, + { + "epoch": 0.2532974317619562, + "grad_norm": 1.567765235900879, + "learning_rate": 1.798515745743764e-05, + "loss": 1.4901, + "step": 4633 + }, + { + "epoch": 0.2533521042056777, + "grad_norm": 1.7875627279281616, + "learning_rate": 1.7984057302685647e-05, + "loss": 1.2522, + "step": 4634 + }, + { + "epoch": 0.2534067766493993, + "grad_norm": 1.5332403182983398, + "learning_rate": 1.7982956881327248e-05, + "loss": 1.5221, + "step": 4635 + }, + { + "epoch": 0.25346144909312085, + "grad_norm": 1.733299970626831, + "learning_rate": 1.798185619339919e-05, + "loss": 1.6812, + "step": 4636 + }, + { + "epoch": 0.2535161215368424, + "grad_norm": 1.4350863695144653, + "learning_rate": 1.7980755238938216e-05, + "loss": 1.204, + "step": 4637 + }, + { + "epoch": 0.2535707939805639, + "grad_norm": 1.4109135866165161, + "learning_rate": 1.79796540179811e-05, + "loss": 1.4011, + "step": 4638 + }, + { + "epoch": 0.2536254664242855, + "grad_norm": 1.4485881328582764, + "learning_rate": 1.7978552530564616e-05, + "loss": 1.3652, + "step": 4639 + }, + { + "epoch": 0.25368013886800705, + "grad_norm": 1.8270726203918457, + "learning_rate": 1.797745077672554e-05, + "loss": 1.2539, + "step": 4640 + }, + { + "epoch": 0.2537348113117286, + "grad_norm": 1.5550963878631592, + "learning_rate": 1.797634875650066e-05, + "loss": 1.4221, + "step": 4641 + }, + { + "epoch": 0.2537894837554502, + "grad_norm": 1.6853783130645752, + "learning_rate": 1.7975246469926774e-05, + "loss": 1.3943, + "step": 4642 + }, + { + "epoch": 0.2538441561991717, + "grad_norm": 1.6551976203918457, + "learning_rate": 1.79741439170407e-05, + "loss": 1.6502, + "step": 4643 + }, + { + "epoch": 0.25389882864289326, + "grad_norm": 1.5601515769958496, + "learning_rate": 1.7973041097879246e-05, + "loss": 1.6001, + "step": 4644 + }, + { + "epoch": 0.2539535010866148, + "grad_norm": 1.5294573307037354, + "learning_rate": 1.7971938012479242e-05, + "loss": 1.4896, + "step": 4645 + }, + { + "epoch": 0.2540081735303364, + "grad_norm": 1.13804292678833, + "learning_rate": 1.7970834660877522e-05, + "loss": 1.4822, + "step": 4646 + }, + { + "epoch": 0.2540628459740579, + "grad_norm": 1.2839891910552979, + "learning_rate": 1.7969731043110928e-05, + "loss": 1.4287, + "step": 4647 + }, + { + "epoch": 0.25411751841777946, + "grad_norm": 1.3788424730300903, + "learning_rate": 1.796862715921631e-05, + "loss": 1.5437, + "step": 4648 + }, + { + "epoch": 0.25417219086150106, + "grad_norm": 1.3619804382324219, + "learning_rate": 1.7967523009230535e-05, + "loss": 1.5599, + "step": 4649 + }, + { + "epoch": 0.2542268633052226, + "grad_norm": 1.5921870470046997, + "learning_rate": 1.796641859319047e-05, + "loss": 1.5327, + "step": 4650 + }, + { + "epoch": 0.25428153574894413, + "grad_norm": 1.6855783462524414, + "learning_rate": 1.796531391113299e-05, + "loss": 1.2979, + "step": 4651 + }, + { + "epoch": 0.25433620819266567, + "grad_norm": 1.0686641931533813, + "learning_rate": 1.7964208963094993e-05, + "loss": 1.5242, + "step": 4652 + }, + { + "epoch": 0.25439088063638726, + "grad_norm": 1.319861650466919, + "learning_rate": 1.796310374911337e-05, + "loss": 1.4, + "step": 4653 + }, + { + "epoch": 0.2544455530801088, + "grad_norm": 1.582353115081787, + "learning_rate": 1.7961998269225024e-05, + "loss": 1.6443, + "step": 4654 + }, + { + "epoch": 0.25450022552383034, + "grad_norm": 1.487415075302124, + "learning_rate": 1.7960892523466874e-05, + "loss": 1.2446, + "step": 4655 + }, + { + "epoch": 0.25455489796755193, + "grad_norm": 1.4630118608474731, + "learning_rate": 1.795978651187584e-05, + "loss": 1.4094, + "step": 4656 + }, + { + "epoch": 0.25460957041127347, + "grad_norm": 1.3520277738571167, + "learning_rate": 1.7958680234488857e-05, + "loss": 1.5606, + "step": 4657 + }, + { + "epoch": 0.254664242854995, + "grad_norm": 1.3262805938720703, + "learning_rate": 1.7957573691342866e-05, + "loss": 1.3151, + "step": 4658 + }, + { + "epoch": 0.25471891529871654, + "grad_norm": 1.4988869428634644, + "learning_rate": 1.7956466882474815e-05, + "loss": 1.4939, + "step": 4659 + }, + { + "epoch": 0.25477358774243813, + "grad_norm": 2.1527163982391357, + "learning_rate": 1.7955359807921667e-05, + "loss": 1.4822, + "step": 4660 + }, + { + "epoch": 0.25482826018615967, + "grad_norm": 1.5628613233566284, + "learning_rate": 1.7954252467720386e-05, + "loss": 1.4557, + "step": 4661 + }, + { + "epoch": 0.2548829326298812, + "grad_norm": 1.8883633613586426, + "learning_rate": 1.795314486190795e-05, + "loss": 1.6795, + "step": 4662 + }, + { + "epoch": 0.2549376050736028, + "grad_norm": 1.416656255722046, + "learning_rate": 1.7952036990521344e-05, + "loss": 1.4098, + "step": 4663 + }, + { + "epoch": 0.25499227751732434, + "grad_norm": 1.5949231386184692, + "learning_rate": 1.7950928853597562e-05, + "loss": 1.6058, + "step": 4664 + }, + { + "epoch": 0.2550469499610459, + "grad_norm": 1.6264917850494385, + "learning_rate": 1.7949820451173607e-05, + "loss": 1.4528, + "step": 4665 + }, + { + "epoch": 0.2551016224047674, + "grad_norm": 1.3831095695495605, + "learning_rate": 1.7948711783286498e-05, + "loss": 1.4517, + "step": 4666 + }, + { + "epoch": 0.255156294848489, + "grad_norm": 1.3991285562515259, + "learning_rate": 1.7947602849973245e-05, + "loss": 1.4166, + "step": 4667 + }, + { + "epoch": 0.25521096729221054, + "grad_norm": 1.5626569986343384, + "learning_rate": 1.7946493651270883e-05, + "loss": 1.284, + "step": 4668 + }, + { + "epoch": 0.2552656397359321, + "grad_norm": 1.3923873901367188, + "learning_rate": 1.7945384187216456e-05, + "loss": 1.6376, + "step": 4669 + }, + { + "epoch": 0.2553203121796537, + "grad_norm": 1.3213462829589844, + "learning_rate": 1.7944274457847003e-05, + "loss": 1.5092, + "step": 4670 + }, + { + "epoch": 0.2553749846233752, + "grad_norm": 1.4299399852752686, + "learning_rate": 1.7943164463199584e-05, + "loss": 1.3099, + "step": 4671 + }, + { + "epoch": 0.25542965706709675, + "grad_norm": 1.6233924627304077, + "learning_rate": 1.7942054203311265e-05, + "loss": 1.6276, + "step": 4672 + }, + { + "epoch": 0.2554843295108183, + "grad_norm": 1.960430383682251, + "learning_rate": 1.794094367821912e-05, + "loss": 1.3958, + "step": 4673 + }, + { + "epoch": 0.2555390019545399, + "grad_norm": 1.5258712768554688, + "learning_rate": 1.793983288796023e-05, + "loss": 1.4918, + "step": 4674 + }, + { + "epoch": 0.2555936743982614, + "grad_norm": 1.614457130432129, + "learning_rate": 1.7938721832571688e-05, + "loss": 1.3276, + "step": 4675 + }, + { + "epoch": 0.25564834684198295, + "grad_norm": 1.6554946899414062, + "learning_rate": 1.7937610512090597e-05, + "loss": 1.6089, + "step": 4676 + }, + { + "epoch": 0.25570301928570455, + "grad_norm": 1.1968674659729004, + "learning_rate": 1.7936498926554065e-05, + "loss": 1.3197, + "step": 4677 + }, + { + "epoch": 0.2557576917294261, + "grad_norm": 1.8019012212753296, + "learning_rate": 1.793538707599921e-05, + "loss": 1.3226, + "step": 4678 + }, + { + "epoch": 0.2558123641731476, + "grad_norm": 1.4502524137496948, + "learning_rate": 1.7934274960463155e-05, + "loss": 1.6186, + "step": 4679 + }, + { + "epoch": 0.25586703661686916, + "grad_norm": 1.3444340229034424, + "learning_rate": 1.7933162579983045e-05, + "loss": 1.4743, + "step": 4680 + }, + { + "epoch": 0.25592170906059075, + "grad_norm": 1.233153223991394, + "learning_rate": 1.7932049934596023e-05, + "loss": 1.5041, + "step": 4681 + }, + { + "epoch": 0.2559763815043123, + "grad_norm": 1.7753024101257324, + "learning_rate": 1.793093702433924e-05, + "loss": 1.4391, + "step": 4682 + }, + { + "epoch": 0.2560310539480338, + "grad_norm": 1.6149731874465942, + "learning_rate": 1.7929823849249858e-05, + "loss": 1.4704, + "step": 4683 + }, + { + "epoch": 0.2560857263917554, + "grad_norm": 1.287176489830017, + "learning_rate": 1.7928710409365044e-05, + "loss": 1.356, + "step": 4684 + }, + { + "epoch": 0.25614039883547696, + "grad_norm": 1.5021694898605347, + "learning_rate": 1.792759670472199e-05, + "loss": 1.3921, + "step": 4685 + }, + { + "epoch": 0.2561950712791985, + "grad_norm": 1.7342532873153687, + "learning_rate": 1.792648273535788e-05, + "loss": 1.4156, + "step": 4686 + }, + { + "epoch": 0.25624974372292003, + "grad_norm": 1.6502490043640137, + "learning_rate": 1.792536850130991e-05, + "loss": 1.5752, + "step": 4687 + }, + { + "epoch": 0.2563044161666416, + "grad_norm": 1.3181434869766235, + "learning_rate": 1.792425400261529e-05, + "loss": 1.3234, + "step": 4688 + }, + { + "epoch": 0.25635908861036316, + "grad_norm": 1.4892581701278687, + "learning_rate": 1.792313923931123e-05, + "loss": 1.7552, + "step": 4689 + }, + { + "epoch": 0.2564137610540847, + "grad_norm": 1.3648521900177002, + "learning_rate": 1.792202421143496e-05, + "loss": 1.4159, + "step": 4690 + }, + { + "epoch": 0.2564684334978063, + "grad_norm": 1.1135716438293457, + "learning_rate": 1.7920908919023712e-05, + "loss": 1.7905, + "step": 4691 + }, + { + "epoch": 0.25652310594152783, + "grad_norm": 1.3144290447235107, + "learning_rate": 1.791979336211473e-05, + "loss": 1.3744, + "step": 4692 + }, + { + "epoch": 0.25657777838524937, + "grad_norm": 2.3983123302459717, + "learning_rate": 1.7918677540745263e-05, + "loss": 1.4582, + "step": 4693 + }, + { + "epoch": 0.2566324508289709, + "grad_norm": 1.5106589794158936, + "learning_rate": 1.791756145495257e-05, + "loss": 1.4436, + "step": 4694 + }, + { + "epoch": 0.2566871232726925, + "grad_norm": 2.2008275985717773, + "learning_rate": 1.7916445104773923e-05, + "loss": 1.5274, + "step": 4695 + }, + { + "epoch": 0.25674179571641403, + "grad_norm": 1.521753191947937, + "learning_rate": 1.7915328490246594e-05, + "loss": 1.4546, + "step": 4696 + }, + { + "epoch": 0.25679646816013557, + "grad_norm": 1.5547720193862915, + "learning_rate": 1.7914211611407875e-05, + "loss": 1.7203, + "step": 4697 + }, + { + "epoch": 0.25685114060385716, + "grad_norm": 1.8380800485610962, + "learning_rate": 1.7913094468295057e-05, + "loss": 1.5307, + "step": 4698 + }, + { + "epoch": 0.2569058130475787, + "grad_norm": 1.33706533908844, + "learning_rate": 1.7911977060945448e-05, + "loss": 1.6952, + "step": 4699 + }, + { + "epoch": 0.25696048549130024, + "grad_norm": 1.14395010471344, + "learning_rate": 1.7910859389396356e-05, + "loss": 1.5185, + "step": 4700 + }, + { + "epoch": 0.2570151579350218, + "grad_norm": 1.4122086763381958, + "learning_rate": 1.790974145368511e-05, + "loss": 1.4637, + "step": 4701 + }, + { + "epoch": 0.25706983037874337, + "grad_norm": 1.3689738512039185, + "learning_rate": 1.7908623253849035e-05, + "loss": 1.5967, + "step": 4702 + }, + { + "epoch": 0.2571245028224649, + "grad_norm": 1.6154179573059082, + "learning_rate": 1.7907504789925473e-05, + "loss": 1.4683, + "step": 4703 + }, + { + "epoch": 0.25717917526618644, + "grad_norm": 1.3666303157806396, + "learning_rate": 1.7906386061951766e-05, + "loss": 1.753, + "step": 4704 + }, + { + "epoch": 0.25723384770990804, + "grad_norm": 1.5695881843566895, + "learning_rate": 1.7905267069965276e-05, + "loss": 1.4587, + "step": 4705 + }, + { + "epoch": 0.2572885201536296, + "grad_norm": 1.4642484188079834, + "learning_rate": 1.790414781400337e-05, + "loss": 1.3561, + "step": 4706 + }, + { + "epoch": 0.2573431925973511, + "grad_norm": 1.3726688623428345, + "learning_rate": 1.790302829410342e-05, + "loss": 1.2716, + "step": 4707 + }, + { + "epoch": 0.25739786504107265, + "grad_norm": 1.3571840524673462, + "learning_rate": 1.7901908510302813e-05, + "loss": 1.5042, + "step": 4708 + }, + { + "epoch": 0.25745253748479424, + "grad_norm": 1.201842188835144, + "learning_rate": 1.7900788462638937e-05, + "loss": 1.4695, + "step": 4709 + }, + { + "epoch": 0.2575072099285158, + "grad_norm": 2.1483001708984375, + "learning_rate": 1.789966815114919e-05, + "loss": 1.3048, + "step": 4710 + }, + { + "epoch": 0.2575618823722373, + "grad_norm": 1.2565233707427979, + "learning_rate": 1.7898547575870992e-05, + "loss": 1.5036, + "step": 4711 + }, + { + "epoch": 0.2576165548159589, + "grad_norm": 1.870903730392456, + "learning_rate": 1.7897426736841754e-05, + "loss": 1.3152, + "step": 4712 + }, + { + "epoch": 0.25767122725968045, + "grad_norm": 1.5463205575942993, + "learning_rate": 1.7896305634098904e-05, + "loss": 1.5084, + "step": 4713 + }, + { + "epoch": 0.257725899703402, + "grad_norm": 1.3742226362228394, + "learning_rate": 1.7895184267679885e-05, + "loss": 1.3974, + "step": 4714 + }, + { + "epoch": 0.2577805721471235, + "grad_norm": 1.1914986371994019, + "learning_rate": 1.789406263762213e-05, + "loss": 1.6022, + "step": 4715 + }, + { + "epoch": 0.2578352445908451, + "grad_norm": 1.3607243299484253, + "learning_rate": 1.78929407439631e-05, + "loss": 1.8605, + "step": 4716 + }, + { + "epoch": 0.25788991703456665, + "grad_norm": 1.2700238227844238, + "learning_rate": 1.789181858674026e-05, + "loss": 1.5608, + "step": 4717 + }, + { + "epoch": 0.2579445894782882, + "grad_norm": 1.3488459587097168, + "learning_rate": 1.789069616599108e-05, + "loss": 1.3044, + "step": 4718 + }, + { + "epoch": 0.2579992619220098, + "grad_norm": 1.5394216775894165, + "learning_rate": 1.7889573481753036e-05, + "loss": 1.5127, + "step": 4719 + }, + { + "epoch": 0.2580539343657313, + "grad_norm": 1.639664649963379, + "learning_rate": 1.788845053406362e-05, + "loss": 1.4195, + "step": 4720 + }, + { + "epoch": 0.25810860680945286, + "grad_norm": 1.4186595678329468, + "learning_rate": 1.7887327322960332e-05, + "loss": 1.5703, + "step": 4721 + }, + { + "epoch": 0.2581632792531744, + "grad_norm": 1.5049819946289062, + "learning_rate": 1.7886203848480675e-05, + "loss": 1.5448, + "step": 4722 + }, + { + "epoch": 0.258217951696896, + "grad_norm": 1.3976936340332031, + "learning_rate": 1.7885080110662166e-05, + "loss": 1.6454, + "step": 4723 + }, + { + "epoch": 0.2582726241406175, + "grad_norm": 1.6171201467514038, + "learning_rate": 1.788395610954233e-05, + "loss": 1.4643, + "step": 4724 + }, + { + "epoch": 0.25832729658433906, + "grad_norm": 1.4207950830459595, + "learning_rate": 1.7882831845158696e-05, + "loss": 1.6033, + "step": 4725 + }, + { + "epoch": 0.25838196902806065, + "grad_norm": 1.5205276012420654, + "learning_rate": 1.7881707317548814e-05, + "loss": 1.3206, + "step": 4726 + }, + { + "epoch": 0.2584366414717822, + "grad_norm": 1.5862420797348022, + "learning_rate": 1.7880582526750227e-05, + "loss": 1.4967, + "step": 4727 + }, + { + "epoch": 0.25849131391550373, + "grad_norm": 1.2825678586959839, + "learning_rate": 1.7879457472800496e-05, + "loss": 1.6507, + "step": 4728 + }, + { + "epoch": 0.25854598635922527, + "grad_norm": 1.5112913846969604, + "learning_rate": 1.787833215573719e-05, + "loss": 1.521, + "step": 4729 + }, + { + "epoch": 0.25860065880294686, + "grad_norm": 1.6696432828903198, + "learning_rate": 1.787720657559789e-05, + "loss": 1.5698, + "step": 4730 + }, + { + "epoch": 0.2586553312466684, + "grad_norm": 1.565740942955017, + "learning_rate": 1.7876080732420176e-05, + "loss": 1.4636, + "step": 4731 + }, + { + "epoch": 0.25871000369038993, + "grad_norm": 1.6788290739059448, + "learning_rate": 1.7874954626241644e-05, + "loss": 1.2665, + "step": 4732 + }, + { + "epoch": 0.2587646761341115, + "grad_norm": 1.3811808824539185, + "learning_rate": 1.78738282570999e-05, + "loss": 1.3299, + "step": 4733 + }, + { + "epoch": 0.25881934857783306, + "grad_norm": 1.6612684726715088, + "learning_rate": 1.787270162503255e-05, + "loss": 1.3037, + "step": 4734 + }, + { + "epoch": 0.2588740210215546, + "grad_norm": 1.7014858722686768, + "learning_rate": 1.7871574730077222e-05, + "loss": 1.5348, + "step": 4735 + }, + { + "epoch": 0.25892869346527614, + "grad_norm": 1.3178682327270508, + "learning_rate": 1.7870447572271542e-05, + "loss": 1.5888, + "step": 4736 + }, + { + "epoch": 0.25898336590899773, + "grad_norm": 1.8288451433181763, + "learning_rate": 1.7869320151653148e-05, + "loss": 1.4583, + "step": 4737 + }, + { + "epoch": 0.25903803835271927, + "grad_norm": 1.4422856569290161, + "learning_rate": 1.7868192468259686e-05, + "loss": 1.571, + "step": 4738 + }, + { + "epoch": 0.2590927107964408, + "grad_norm": 1.6636086702346802, + "learning_rate": 1.7867064522128817e-05, + "loss": 1.5621, + "step": 4739 + }, + { + "epoch": 0.2591473832401624, + "grad_norm": 1.4558749198913574, + "learning_rate": 1.7865936313298205e-05, + "loss": 1.3973, + "step": 4740 + }, + { + "epoch": 0.25920205568388394, + "grad_norm": 1.3724117279052734, + "learning_rate": 1.786480784180552e-05, + "loss": 1.5024, + "step": 4741 + }, + { + "epoch": 0.2592567281276055, + "grad_norm": 1.8085105419158936, + "learning_rate": 1.7863679107688444e-05, + "loss": 1.3445, + "step": 4742 + }, + { + "epoch": 0.259311400571327, + "grad_norm": 1.5723769664764404, + "learning_rate": 1.7862550110984674e-05, + "loss": 1.4113, + "step": 4743 + }, + { + "epoch": 0.2593660730150486, + "grad_norm": 1.3127342462539673, + "learning_rate": 1.7861420851731903e-05, + "loss": 1.2958, + "step": 4744 + }, + { + "epoch": 0.25942074545877014, + "grad_norm": 1.3307337760925293, + "learning_rate": 1.7860291329967842e-05, + "loss": 1.5033, + "step": 4745 + }, + { + "epoch": 0.2594754179024917, + "grad_norm": 1.7638918161392212, + "learning_rate": 1.7859161545730206e-05, + "loss": 1.3999, + "step": 4746 + }, + { + "epoch": 0.25953009034621327, + "grad_norm": 1.511784315109253, + "learning_rate": 1.785803149905673e-05, + "loss": 1.3738, + "step": 4747 + }, + { + "epoch": 0.2595847627899348, + "grad_norm": 1.4263060092926025, + "learning_rate": 1.7856901189985137e-05, + "loss": 1.4287, + "step": 4748 + }, + { + "epoch": 0.25963943523365635, + "grad_norm": 1.6918718814849854, + "learning_rate": 1.785577061855318e-05, + "loss": 1.4718, + "step": 4749 + }, + { + "epoch": 0.2596941076773779, + "grad_norm": 1.3109854459762573, + "learning_rate": 1.7854639784798608e-05, + "loss": 1.4467, + "step": 4750 + }, + { + "epoch": 0.2597487801210995, + "grad_norm": 1.8051328659057617, + "learning_rate": 1.785350868875918e-05, + "loss": 1.4158, + "step": 4751 + }, + { + "epoch": 0.259803452564821, + "grad_norm": 2.0072526931762695, + "learning_rate": 1.7852377330472668e-05, + "loss": 1.5516, + "step": 4752 + }, + { + "epoch": 0.25985812500854255, + "grad_norm": 1.3317791223526, + "learning_rate": 1.7851245709976853e-05, + "loss": 1.5691, + "step": 4753 + }, + { + "epoch": 0.25991279745226414, + "grad_norm": 1.9732576608657837, + "learning_rate": 1.785011382730952e-05, + "loss": 1.3969, + "step": 4754 + }, + { + "epoch": 0.2599674698959857, + "grad_norm": 1.7708137035369873, + "learning_rate": 1.7848981682508465e-05, + "loss": 1.6341, + "step": 4755 + }, + { + "epoch": 0.2600221423397072, + "grad_norm": 1.2997791767120361, + "learning_rate": 1.784784927561149e-05, + "loss": 1.5358, + "step": 4756 + }, + { + "epoch": 0.26007681478342876, + "grad_norm": 1.9620240926742554, + "learning_rate": 1.7846716606656415e-05, + "loss": 1.1976, + "step": 4757 + }, + { + "epoch": 0.26013148722715035, + "grad_norm": 1.477043628692627, + "learning_rate": 1.784558367568106e-05, + "loss": 1.8264, + "step": 4758 + }, + { + "epoch": 0.2601861596708719, + "grad_norm": 1.885176420211792, + "learning_rate": 1.7844450482723258e-05, + "loss": 1.4315, + "step": 4759 + }, + { + "epoch": 0.2602408321145934, + "grad_norm": 1.642066478729248, + "learning_rate": 1.784331702782084e-05, + "loss": 1.5391, + "step": 4760 + }, + { + "epoch": 0.260295504558315, + "grad_norm": 1.8353750705718994, + "learning_rate": 1.7842183311011667e-05, + "loss": 1.5056, + "step": 4761 + }, + { + "epoch": 0.26035017700203655, + "grad_norm": 1.80369234085083, + "learning_rate": 1.7841049332333592e-05, + "loss": 1.3298, + "step": 4762 + }, + { + "epoch": 0.2604048494457581, + "grad_norm": 1.6549296379089355, + "learning_rate": 1.7839915091824476e-05, + "loss": 1.5773, + "step": 4763 + }, + { + "epoch": 0.26045952188947963, + "grad_norm": 1.695876955986023, + "learning_rate": 1.7838780589522203e-05, + "loss": 1.533, + "step": 4764 + }, + { + "epoch": 0.2605141943332012, + "grad_norm": 1.5971015691757202, + "learning_rate": 1.7837645825464646e-05, + "loss": 1.4807, + "step": 4765 + }, + { + "epoch": 0.26056886677692276, + "grad_norm": 1.663848876953125, + "learning_rate": 1.783651079968971e-05, + "loss": 1.5905, + "step": 4766 + }, + { + "epoch": 0.2606235392206443, + "grad_norm": 1.3795212507247925, + "learning_rate": 1.783537551223528e-05, + "loss": 1.5489, + "step": 4767 + }, + { + "epoch": 0.2606782116643659, + "grad_norm": 1.5246964693069458, + "learning_rate": 1.7834239963139283e-05, + "loss": 1.2729, + "step": 4768 + }, + { + "epoch": 0.2607328841080874, + "grad_norm": 1.5281627178192139, + "learning_rate": 1.783310415243963e-05, + "loss": 1.5386, + "step": 4769 + }, + { + "epoch": 0.26078755655180896, + "grad_norm": 1.947576880455017, + "learning_rate": 1.7831968080174247e-05, + "loss": 1.3676, + "step": 4770 + }, + { + "epoch": 0.2608422289955305, + "grad_norm": 1.4658746719360352, + "learning_rate": 1.783083174638107e-05, + "loss": 1.4712, + "step": 4771 + }, + { + "epoch": 0.2608969014392521, + "grad_norm": 1.316608190536499, + "learning_rate": 1.7829695151098046e-05, + "loss": 1.5778, + "step": 4772 + }, + { + "epoch": 0.26095157388297363, + "grad_norm": 1.4500477313995361, + "learning_rate": 1.782855829436313e-05, + "loss": 1.4185, + "step": 4773 + }, + { + "epoch": 0.26100624632669517, + "grad_norm": 1.7438405752182007, + "learning_rate": 1.782742117621428e-05, + "loss": 1.4048, + "step": 4774 + }, + { + "epoch": 0.26106091877041676, + "grad_norm": 1.8445954322814941, + "learning_rate": 1.782628379668947e-05, + "loss": 1.5544, + "step": 4775 + }, + { + "epoch": 0.2611155912141383, + "grad_norm": 1.4926493167877197, + "learning_rate": 1.7825146155826682e-05, + "loss": 1.3941, + "step": 4776 + }, + { + "epoch": 0.26117026365785984, + "grad_norm": 1.5127443075180054, + "learning_rate": 1.7824008253663897e-05, + "loss": 1.0211, + "step": 4777 + }, + { + "epoch": 0.2612249361015814, + "grad_norm": 1.5876163244247437, + "learning_rate": 1.782287009023912e-05, + "loss": 1.4894, + "step": 4778 + }, + { + "epoch": 0.26127960854530297, + "grad_norm": 1.7045836448669434, + "learning_rate": 1.782173166559035e-05, + "loss": 1.5852, + "step": 4779 + }, + { + "epoch": 0.2613342809890245, + "grad_norm": 1.4350892305374146, + "learning_rate": 1.7820592979755605e-05, + "loss": 1.331, + "step": 4780 + }, + { + "epoch": 0.26138895343274604, + "grad_norm": 1.9266929626464844, + "learning_rate": 1.7819454032772913e-05, + "loss": 1.5028, + "step": 4781 + }, + { + "epoch": 0.26144362587646763, + "grad_norm": 1.764664649963379, + "learning_rate": 1.78183148246803e-05, + "loss": 1.796, + "step": 4782 + }, + { + "epoch": 0.26149829832018917, + "grad_norm": 1.4546639919281006, + "learning_rate": 1.781717535551581e-05, + "loss": 1.2407, + "step": 4783 + }, + { + "epoch": 0.2615529707639107, + "grad_norm": 2.038416862487793, + "learning_rate": 1.781603562531749e-05, + "loss": 1.1363, + "step": 4784 + }, + { + "epoch": 0.26160764320763225, + "grad_norm": 1.3854541778564453, + "learning_rate": 1.7814895634123397e-05, + "loss": 1.3896, + "step": 4785 + }, + { + "epoch": 0.26166231565135384, + "grad_norm": 1.3397663831710815, + "learning_rate": 1.7813755381971604e-05, + "loss": 1.6585, + "step": 4786 + }, + { + "epoch": 0.2617169880950754, + "grad_norm": 1.3598084449768066, + "learning_rate": 1.7812614868900185e-05, + "loss": 1.339, + "step": 4787 + }, + { + "epoch": 0.2617716605387969, + "grad_norm": 1.8139327764511108, + "learning_rate": 1.7811474094947222e-05, + "loss": 1.6626, + "step": 4788 + }, + { + "epoch": 0.2618263329825185, + "grad_norm": 1.4618760347366333, + "learning_rate": 1.7810333060150803e-05, + "loss": 1.4422, + "step": 4789 + }, + { + "epoch": 0.26188100542624004, + "grad_norm": 1.9515970945358276, + "learning_rate": 1.7809191764549042e-05, + "loss": 1.2196, + "step": 4790 + }, + { + "epoch": 0.2619356778699616, + "grad_norm": 1.5216726064682007, + "learning_rate": 1.7808050208180037e-05, + "loss": 1.4816, + "step": 4791 + }, + { + "epoch": 0.2619903503136832, + "grad_norm": 1.5262668132781982, + "learning_rate": 1.780690839108192e-05, + "loss": 1.4448, + "step": 4792 + }, + { + "epoch": 0.2620450227574047, + "grad_norm": 1.4947131872177124, + "learning_rate": 1.780576631329281e-05, + "loss": 1.7534, + "step": 4793 + }, + { + "epoch": 0.26209969520112625, + "grad_norm": 1.5330592393875122, + "learning_rate": 1.7804623974850844e-05, + "loss": 1.5707, + "step": 4794 + }, + { + "epoch": 0.2621543676448478, + "grad_norm": 1.8438429832458496, + "learning_rate": 1.7803481375794174e-05, + "loss": 1.2803, + "step": 4795 + }, + { + "epoch": 0.2622090400885694, + "grad_norm": 1.5929548740386963, + "learning_rate": 1.7802338516160947e-05, + "loss": 1.6202, + "step": 4796 + }, + { + "epoch": 0.2622637125322909, + "grad_norm": 1.608821988105774, + "learning_rate": 1.7801195395989327e-05, + "loss": 1.3898, + "step": 4797 + }, + { + "epoch": 0.26231838497601245, + "grad_norm": 1.5202218294143677, + "learning_rate": 1.7800052015317488e-05, + "loss": 1.5203, + "step": 4798 + }, + { + "epoch": 0.26237305741973405, + "grad_norm": 1.4171462059020996, + "learning_rate": 1.7798908374183606e-05, + "loss": 1.2732, + "step": 4799 + }, + { + "epoch": 0.2624277298634556, + "grad_norm": 1.1205403804779053, + "learning_rate": 1.7797764472625874e-05, + "loss": 1.6407, + "step": 4800 + }, + { + "epoch": 0.2624824023071771, + "grad_norm": 1.600131630897522, + "learning_rate": 1.779662031068249e-05, + "loss": 1.5606, + "step": 4801 + }, + { + "epoch": 0.26253707475089866, + "grad_norm": 1.54376220703125, + "learning_rate": 1.7795475888391656e-05, + "loss": 1.465, + "step": 4802 + }, + { + "epoch": 0.26259174719462025, + "grad_norm": 1.66549551486969, + "learning_rate": 1.7794331205791593e-05, + "loss": 1.7294, + "step": 4803 + }, + { + "epoch": 0.2626464196383418, + "grad_norm": 1.3756352663040161, + "learning_rate": 1.7793186262920517e-05, + "loss": 1.394, + "step": 4804 + }, + { + "epoch": 0.2627010920820633, + "grad_norm": 2.229153633117676, + "learning_rate": 1.7792041059816668e-05, + "loss": 1.5813, + "step": 4805 + }, + { + "epoch": 0.2627557645257849, + "grad_norm": 1.6728445291519165, + "learning_rate": 1.779089559651828e-05, + "loss": 1.4083, + "step": 4806 + }, + { + "epoch": 0.26281043696950646, + "grad_norm": 1.4001621007919312, + "learning_rate": 1.778974987306361e-05, + "loss": 1.4717, + "step": 4807 + }, + { + "epoch": 0.262865109413228, + "grad_norm": 1.402232050895691, + "learning_rate": 1.7788603889490907e-05, + "loss": 1.4759, + "step": 4808 + }, + { + "epoch": 0.26291978185694953, + "grad_norm": 1.4214391708374023, + "learning_rate": 1.778745764583845e-05, + "loss": 1.1092, + "step": 4809 + }, + { + "epoch": 0.2629744543006711, + "grad_norm": 2.5817487239837646, + "learning_rate": 1.7786311142144505e-05, + "loss": 1.5752, + "step": 4810 + }, + { + "epoch": 0.26302912674439266, + "grad_norm": 2.0383894443511963, + "learning_rate": 1.778516437844736e-05, + "loss": 1.5499, + "step": 4811 + }, + { + "epoch": 0.2630837991881142, + "grad_norm": 1.4648674726486206, + "learning_rate": 1.7784017354785307e-05, + "loss": 1.3, + "step": 4812 + }, + { + "epoch": 0.2631384716318358, + "grad_norm": 1.417848825454712, + "learning_rate": 1.778287007119665e-05, + "loss": 1.2091, + "step": 4813 + }, + { + "epoch": 0.26319314407555733, + "grad_norm": 1.5896754264831543, + "learning_rate": 1.77817225277197e-05, + "loss": 1.4672, + "step": 4814 + }, + { + "epoch": 0.26324781651927887, + "grad_norm": 1.5104957818984985, + "learning_rate": 1.778057472439277e-05, + "loss": 1.3357, + "step": 4815 + }, + { + "epoch": 0.2633024889630004, + "grad_norm": 1.596266508102417, + "learning_rate": 1.7779426661254196e-05, + "loss": 1.3916, + "step": 4816 + }, + { + "epoch": 0.263357161406722, + "grad_norm": 1.6384763717651367, + "learning_rate": 1.777827833834231e-05, + "loss": 1.6984, + "step": 4817 + }, + { + "epoch": 0.26341183385044353, + "grad_norm": 2.135786294937134, + "learning_rate": 1.7777129755695456e-05, + "loss": 1.2847, + "step": 4818 + }, + { + "epoch": 0.26346650629416507, + "grad_norm": 1.2562097311019897, + "learning_rate": 1.7775980913351994e-05, + "loss": 1.55, + "step": 4819 + }, + { + "epoch": 0.26352117873788666, + "grad_norm": 1.4976317882537842, + "learning_rate": 1.7774831811350278e-05, + "loss": 1.4048, + "step": 4820 + }, + { + "epoch": 0.2635758511816082, + "grad_norm": 1.6949684619903564, + "learning_rate": 1.7773682449728684e-05, + "loss": 1.2801, + "step": 4821 + }, + { + "epoch": 0.26363052362532974, + "grad_norm": 1.1740052700042725, + "learning_rate": 1.7772532828525593e-05, + "loss": 1.5173, + "step": 4822 + }, + { + "epoch": 0.2636851960690513, + "grad_norm": 1.5579726696014404, + "learning_rate": 1.7771382947779393e-05, + "loss": 1.5275, + "step": 4823 + }, + { + "epoch": 0.26373986851277287, + "grad_norm": 1.7437900304794312, + "learning_rate": 1.7770232807528478e-05, + "loss": 1.6195, + "step": 4824 + }, + { + "epoch": 0.2637945409564944, + "grad_norm": 1.4533051252365112, + "learning_rate": 1.776908240781126e-05, + "loss": 1.2546, + "step": 4825 + }, + { + "epoch": 0.26384921340021594, + "grad_norm": 1.6445047855377197, + "learning_rate": 1.7767931748666145e-05, + "loss": 1.3005, + "step": 4826 + }, + { + "epoch": 0.26390388584393754, + "grad_norm": 1.2788752317428589, + "learning_rate": 1.7766780830131563e-05, + "loss": 1.6315, + "step": 4827 + }, + { + "epoch": 0.2639585582876591, + "grad_norm": 1.7875179052352905, + "learning_rate": 1.7765629652245945e-05, + "loss": 1.4558, + "step": 4828 + }, + { + "epoch": 0.2640132307313806, + "grad_norm": 1.3936001062393188, + "learning_rate": 1.7764478215047725e-05, + "loss": 1.4757, + "step": 4829 + }, + { + "epoch": 0.26406790317510215, + "grad_norm": 1.416349172592163, + "learning_rate": 1.7763326518575364e-05, + "loss": 1.4386, + "step": 4830 + }, + { + "epoch": 0.26412257561882374, + "grad_norm": 1.3070162534713745, + "learning_rate": 1.776217456286731e-05, + "loss": 1.385, + "step": 4831 + }, + { + "epoch": 0.2641772480625453, + "grad_norm": 1.4106069803237915, + "learning_rate": 1.7761022347962034e-05, + "loss": 1.5834, + "step": 4832 + }, + { + "epoch": 0.2642319205062668, + "grad_norm": 1.2700364589691162, + "learning_rate": 1.7759869873898008e-05, + "loss": 1.6091, + "step": 4833 + }, + { + "epoch": 0.2642865929499884, + "grad_norm": 1.465627670288086, + "learning_rate": 1.775871714071372e-05, + "loss": 1.4378, + "step": 4834 + }, + { + "epoch": 0.26434126539370995, + "grad_norm": 1.4821332693099976, + "learning_rate": 1.7757564148447663e-05, + "loss": 1.5808, + "step": 4835 + }, + { + "epoch": 0.2643959378374315, + "grad_norm": 1.7229065895080566, + "learning_rate": 1.7756410897138326e-05, + "loss": 1.4727, + "step": 4836 + }, + { + "epoch": 0.264450610281153, + "grad_norm": 1.3081518411636353, + "learning_rate": 1.7755257386824238e-05, + "loss": 1.3234, + "step": 4837 + }, + { + "epoch": 0.2645052827248746, + "grad_norm": 1.1314420700073242, + "learning_rate": 1.7754103617543903e-05, + "loss": 1.3661, + "step": 4838 + }, + { + "epoch": 0.26455995516859615, + "grad_norm": 1.6380283832550049, + "learning_rate": 1.7752949589335853e-05, + "loss": 1.4698, + "step": 4839 + }, + { + "epoch": 0.2646146276123177, + "grad_norm": 1.373273491859436, + "learning_rate": 1.7751795302238623e-05, + "loss": 1.3808, + "step": 4840 + }, + { + "epoch": 0.2646693000560393, + "grad_norm": 1.6337398290634155, + "learning_rate": 1.775064075629076e-05, + "loss": 1.3073, + "step": 4841 + }, + { + "epoch": 0.2647239724997608, + "grad_norm": 1.6490510702133179, + "learning_rate": 1.7749485951530815e-05, + "loss": 1.3835, + "step": 4842 + }, + { + "epoch": 0.26477864494348236, + "grad_norm": 1.611162781715393, + "learning_rate": 1.7748330887997344e-05, + "loss": 1.6346, + "step": 4843 + }, + { + "epoch": 0.2648333173872039, + "grad_norm": 1.3160291910171509, + "learning_rate": 1.7747175565728928e-05, + "loss": 1.4934, + "step": 4844 + }, + { + "epoch": 0.2648879898309255, + "grad_norm": 1.5079344511032104, + "learning_rate": 1.7746019984764138e-05, + "loss": 1.2289, + "step": 4845 + }, + { + "epoch": 0.264942662274647, + "grad_norm": 1.335890531539917, + "learning_rate": 1.7744864145141564e-05, + "loss": 1.5288, + "step": 4846 + }, + { + "epoch": 0.26499733471836856, + "grad_norm": 1.4559605121612549, + "learning_rate": 1.7743708046899804e-05, + "loss": 1.3602, + "step": 4847 + }, + { + "epoch": 0.26505200716209015, + "grad_norm": 1.4479413032531738, + "learning_rate": 1.774255169007746e-05, + "loss": 1.5898, + "step": 4848 + }, + { + "epoch": 0.2651066796058117, + "grad_norm": 1.9079301357269287, + "learning_rate": 1.7741395074713146e-05, + "loss": 1.5464, + "step": 4849 + }, + { + "epoch": 0.2651613520495332, + "grad_norm": 1.4805934429168701, + "learning_rate": 1.7740238200845485e-05, + "loss": 1.2764, + "step": 4850 + }, + { + "epoch": 0.26521602449325477, + "grad_norm": 1.6527512073516846, + "learning_rate": 1.773908106851311e-05, + "loss": 1.6655, + "step": 4851 + }, + { + "epoch": 0.26527069693697636, + "grad_norm": 1.4961638450622559, + "learning_rate": 1.7737923677754657e-05, + "loss": 1.7963, + "step": 4852 + }, + { + "epoch": 0.2653253693806979, + "grad_norm": 1.470170259475708, + "learning_rate": 1.7736766028608768e-05, + "loss": 1.5456, + "step": 4853 + }, + { + "epoch": 0.26538004182441943, + "grad_norm": 1.4207763671875, + "learning_rate": 1.7735608121114112e-05, + "loss": 1.5928, + "step": 4854 + }, + { + "epoch": 0.265434714268141, + "grad_norm": 1.8204115629196167, + "learning_rate": 1.7734449955309353e-05, + "loss": 1.7361, + "step": 4855 + }, + { + "epoch": 0.26548938671186256, + "grad_norm": 1.4091681241989136, + "learning_rate": 1.7733291531233156e-05, + "loss": 1.429, + "step": 4856 + }, + { + "epoch": 0.2655440591555841, + "grad_norm": 1.360445499420166, + "learning_rate": 1.7732132848924206e-05, + "loss": 1.5592, + "step": 4857 + }, + { + "epoch": 0.26559873159930564, + "grad_norm": 1.156298279762268, + "learning_rate": 1.77309739084212e-05, + "loss": 1.5337, + "step": 4858 + }, + { + "epoch": 0.26565340404302723, + "grad_norm": 1.706160068511963, + "learning_rate": 1.772981470976283e-05, + "loss": 1.4195, + "step": 4859 + }, + { + "epoch": 0.26570807648674877, + "grad_norm": 1.6298693418502808, + "learning_rate": 1.7728655252987808e-05, + "loss": 1.4813, + "step": 4860 + }, + { + "epoch": 0.2657627489304703, + "grad_norm": 1.9922986030578613, + "learning_rate": 1.7727495538134857e-05, + "loss": 1.695, + "step": 4861 + }, + { + "epoch": 0.2658174213741919, + "grad_norm": 1.3868488073349, + "learning_rate": 1.7726335565242693e-05, + "loss": 1.2859, + "step": 4862 + }, + { + "epoch": 0.26587209381791344, + "grad_norm": 1.4584554433822632, + "learning_rate": 1.7725175334350057e-05, + "loss": 1.3229, + "step": 4863 + }, + { + "epoch": 0.265926766261635, + "grad_norm": 1.2414535284042358, + "learning_rate": 1.7724014845495684e-05, + "loss": 1.6836, + "step": 4864 + }, + { + "epoch": 0.2659814387053565, + "grad_norm": 2.1334891319274902, + "learning_rate": 1.7722854098718333e-05, + "loss": 1.4276, + "step": 4865 + }, + { + "epoch": 0.2660361111490781, + "grad_norm": 1.527337908744812, + "learning_rate": 1.7721693094056762e-05, + "loss": 1.1668, + "step": 4866 + }, + { + "epoch": 0.26609078359279964, + "grad_norm": 1.4787677526474, + "learning_rate": 1.772053183154974e-05, + "loss": 1.7242, + "step": 4867 + }, + { + "epoch": 0.2661454560365212, + "grad_norm": 1.4654392004013062, + "learning_rate": 1.7719370311236042e-05, + "loss": 1.3394, + "step": 4868 + }, + { + "epoch": 0.26620012848024277, + "grad_norm": 1.519832730293274, + "learning_rate": 1.7718208533154454e-05, + "loss": 1.3465, + "step": 4869 + }, + { + "epoch": 0.2662548009239643, + "grad_norm": 1.8754328489303589, + "learning_rate": 1.7717046497343773e-05, + "loss": 1.2584, + "step": 4870 + }, + { + "epoch": 0.26630947336768584, + "grad_norm": 1.4203301668167114, + "learning_rate": 1.77158842038428e-05, + "loss": 1.2614, + "step": 4871 + }, + { + "epoch": 0.2663641458114074, + "grad_norm": 1.9289944171905518, + "learning_rate": 1.7714721652690347e-05, + "loss": 1.3237, + "step": 4872 + }, + { + "epoch": 0.266418818255129, + "grad_norm": 2.878373146057129, + "learning_rate": 1.7713558843925235e-05, + "loss": 1.2268, + "step": 4873 + }, + { + "epoch": 0.2664734906988505, + "grad_norm": 1.2901073694229126, + "learning_rate": 1.7712395777586294e-05, + "loss": 1.4245, + "step": 4874 + }, + { + "epoch": 0.26652816314257205, + "grad_norm": 1.9989054203033447, + "learning_rate": 1.7711232453712363e-05, + "loss": 1.3246, + "step": 4875 + }, + { + "epoch": 0.26658283558629364, + "grad_norm": 1.5944420099258423, + "learning_rate": 1.771006887234228e-05, + "loss": 1.3339, + "step": 4876 + }, + { + "epoch": 0.2666375080300152, + "grad_norm": 1.682448148727417, + "learning_rate": 1.7708905033514908e-05, + "loss": 1.5481, + "step": 4877 + }, + { + "epoch": 0.2666921804737367, + "grad_norm": 1.7751461267471313, + "learning_rate": 1.7707740937269108e-05, + "loss": 1.1933, + "step": 4878 + }, + { + "epoch": 0.26674685291745825, + "grad_norm": 1.468902587890625, + "learning_rate": 1.7706576583643748e-05, + "loss": 1.363, + "step": 4879 + }, + { + "epoch": 0.26680152536117985, + "grad_norm": 1.861428141593933, + "learning_rate": 1.7705411972677713e-05, + "loss": 1.3462, + "step": 4880 + }, + { + "epoch": 0.2668561978049014, + "grad_norm": 1.6458730697631836, + "learning_rate": 1.7704247104409893e-05, + "loss": 1.3275, + "step": 4881 + }, + { + "epoch": 0.2669108702486229, + "grad_norm": 1.3379486799240112, + "learning_rate": 1.7703081978879183e-05, + "loss": 1.3443, + "step": 4882 + }, + { + "epoch": 0.2669655426923445, + "grad_norm": 1.7787516117095947, + "learning_rate": 1.770191659612449e-05, + "loss": 1.2809, + "step": 4883 + }, + { + "epoch": 0.26702021513606605, + "grad_norm": 1.23789644241333, + "learning_rate": 1.7700750956184728e-05, + "loss": 1.5783, + "step": 4884 + }, + { + "epoch": 0.2670748875797876, + "grad_norm": 1.6845568418502808, + "learning_rate": 1.769958505909882e-05, + "loss": 1.4451, + "step": 4885 + }, + { + "epoch": 0.2671295600235091, + "grad_norm": 1.5210245847702026, + "learning_rate": 1.76984189049057e-05, + "loss": 1.3431, + "step": 4886 + }, + { + "epoch": 0.2671842324672307, + "grad_norm": 1.6472339630126953, + "learning_rate": 1.769725249364431e-05, + "loss": 1.3467, + "step": 4887 + }, + { + "epoch": 0.26723890491095226, + "grad_norm": 1.5480927228927612, + "learning_rate": 1.7696085825353593e-05, + "loss": 1.5787, + "step": 4888 + }, + { + "epoch": 0.2672935773546738, + "grad_norm": 1.6491929292678833, + "learning_rate": 1.7694918900072515e-05, + "loss": 1.5317, + "step": 4889 + }, + { + "epoch": 0.2673482497983954, + "grad_norm": 1.3576000928878784, + "learning_rate": 1.769375171784004e-05, + "loss": 1.5171, + "step": 4890 + }, + { + "epoch": 0.2674029222421169, + "grad_norm": 1.4315139055252075, + "learning_rate": 1.7692584278695134e-05, + "loss": 1.4621, + "step": 4891 + }, + { + "epoch": 0.26745759468583846, + "grad_norm": 1.4911112785339355, + "learning_rate": 1.7691416582676792e-05, + "loss": 1.6017, + "step": 4892 + }, + { + "epoch": 0.26751226712956, + "grad_norm": 1.3504700660705566, + "learning_rate": 1.7690248629824003e-05, + "loss": 1.4836, + "step": 4893 + }, + { + "epoch": 0.2675669395732816, + "grad_norm": 1.5989422798156738, + "learning_rate": 1.7689080420175764e-05, + "loss": 1.2838, + "step": 4894 + }, + { + "epoch": 0.26762161201700313, + "grad_norm": 1.409529447555542, + "learning_rate": 1.7687911953771086e-05, + "loss": 1.4573, + "step": 4895 + }, + { + "epoch": 0.26767628446072467, + "grad_norm": 1.6089739799499512, + "learning_rate": 1.768674323064899e-05, + "loss": 1.5436, + "step": 4896 + }, + { + "epoch": 0.26773095690444626, + "grad_norm": 1.9805911779403687, + "learning_rate": 1.76855742508485e-05, + "loss": 1.59, + "step": 4897 + }, + { + "epoch": 0.2677856293481678, + "grad_norm": 1.9226642847061157, + "learning_rate": 1.768440501440865e-05, + "loss": 1.5587, + "step": 4898 + }, + { + "epoch": 0.26784030179188933, + "grad_norm": 1.4256037473678589, + "learning_rate": 1.7683235521368484e-05, + "loss": 1.5235, + "step": 4899 + }, + { + "epoch": 0.26789497423561087, + "grad_norm": 1.5366382598876953, + "learning_rate": 1.7682065771767055e-05, + "loss": 1.3697, + "step": 4900 + }, + { + "epoch": 0.26794964667933246, + "grad_norm": 1.4176181554794312, + "learning_rate": 1.7680895765643423e-05, + "loss": 1.5041, + "step": 4901 + }, + { + "epoch": 0.268004319123054, + "grad_norm": 1.471592903137207, + "learning_rate": 1.767972550303666e-05, + "loss": 1.4499, + "step": 4902 + }, + { + "epoch": 0.26805899156677554, + "grad_norm": 1.4393178224563599, + "learning_rate": 1.767855498398584e-05, + "loss": 1.1523, + "step": 4903 + }, + { + "epoch": 0.26811366401049713, + "grad_norm": 1.208349585533142, + "learning_rate": 1.767738420853005e-05, + "loss": 1.596, + "step": 4904 + }, + { + "epoch": 0.26816833645421867, + "grad_norm": 1.4068855047225952, + "learning_rate": 1.7676213176708387e-05, + "loss": 1.2514, + "step": 4905 + }, + { + "epoch": 0.2682230088979402, + "grad_norm": 1.4920138120651245, + "learning_rate": 1.7675041888559952e-05, + "loss": 1.38, + "step": 4906 + }, + { + "epoch": 0.26827768134166174, + "grad_norm": 1.3129581212997437, + "learning_rate": 1.767387034412386e-05, + "loss": 1.4264, + "step": 4907 + }, + { + "epoch": 0.26833235378538334, + "grad_norm": 1.4025721549987793, + "learning_rate": 1.7672698543439228e-05, + "loss": 1.3677, + "step": 4908 + }, + { + "epoch": 0.2683870262291049, + "grad_norm": 1.6435002088546753, + "learning_rate": 1.7671526486545188e-05, + "loss": 1.6447, + "step": 4909 + }, + { + "epoch": 0.2684416986728264, + "grad_norm": 1.2217323780059814, + "learning_rate": 1.7670354173480876e-05, + "loss": 1.6879, + "step": 4910 + }, + { + "epoch": 0.268496371116548, + "grad_norm": 1.7162939310073853, + "learning_rate": 1.766918160428544e-05, + "loss": 1.5657, + "step": 4911 + }, + { + "epoch": 0.26855104356026954, + "grad_norm": 1.9730627536773682, + "learning_rate": 1.7668008778998034e-05, + "loss": 1.6127, + "step": 4912 + }, + { + "epoch": 0.2686057160039911, + "grad_norm": 1.2710071802139282, + "learning_rate": 1.7666835697657824e-05, + "loss": 1.673, + "step": 4913 + }, + { + "epoch": 0.2686603884477126, + "grad_norm": 1.8085414171218872, + "learning_rate": 1.7665662360303972e-05, + "loss": 1.3706, + "step": 4914 + }, + { + "epoch": 0.2687150608914342, + "grad_norm": 1.7080849409103394, + "learning_rate": 1.7664488766975673e-05, + "loss": 1.4905, + "step": 4915 + }, + { + "epoch": 0.26876973333515575, + "grad_norm": 1.6835638284683228, + "learning_rate": 1.7663314917712103e-05, + "loss": 1.236, + "step": 4916 + }, + { + "epoch": 0.2688244057788773, + "grad_norm": 2.3246278762817383, + "learning_rate": 1.766214081255247e-05, + "loss": 1.6333, + "step": 4917 + }, + { + "epoch": 0.2688790782225989, + "grad_norm": 1.7987779378890991, + "learning_rate": 1.7660966451535974e-05, + "loss": 1.3888, + "step": 4918 + }, + { + "epoch": 0.2689337506663204, + "grad_norm": 1.5907070636749268, + "learning_rate": 1.765979183470183e-05, + "loss": 1.4999, + "step": 4919 + }, + { + "epoch": 0.26898842311004195, + "grad_norm": 1.3416712284088135, + "learning_rate": 1.7658616962089262e-05, + "loss": 1.6814, + "step": 4920 + }, + { + "epoch": 0.2690430955537635, + "grad_norm": 1.5714056491851807, + "learning_rate": 1.7657441833737505e-05, + "loss": 1.3952, + "step": 4921 + }, + { + "epoch": 0.2690977679974851, + "grad_norm": 1.4863625764846802, + "learning_rate": 1.7656266449685796e-05, + "loss": 1.2736, + "step": 4922 + }, + { + "epoch": 0.2691524404412066, + "grad_norm": 1.290935754776001, + "learning_rate": 1.765509080997338e-05, + "loss": 1.4082, + "step": 4923 + }, + { + "epoch": 0.26920711288492816, + "grad_norm": 1.9340296983718872, + "learning_rate": 1.7653914914639524e-05, + "loss": 1.5622, + "step": 4924 + }, + { + "epoch": 0.26926178532864975, + "grad_norm": 1.442223072052002, + "learning_rate": 1.7652738763723484e-05, + "loss": 1.4657, + "step": 4925 + }, + { + "epoch": 0.2693164577723713, + "grad_norm": 1.6326241493225098, + "learning_rate": 1.7651562357264543e-05, + "loss": 1.5077, + "step": 4926 + }, + { + "epoch": 0.2693711302160928, + "grad_norm": 1.5943089723587036, + "learning_rate": 1.765038569530198e-05, + "loss": 1.5324, + "step": 4927 + }, + { + "epoch": 0.26942580265981436, + "grad_norm": 1.8898669481277466, + "learning_rate": 1.764920877787508e-05, + "loss": 1.5784, + "step": 4928 + }, + { + "epoch": 0.26948047510353595, + "grad_norm": 1.280031442642212, + "learning_rate": 1.764803160502316e-05, + "loss": 1.478, + "step": 4929 + }, + { + "epoch": 0.2695351475472575, + "grad_norm": 1.3995484113693237, + "learning_rate": 1.764685417678551e-05, + "loss": 1.265, + "step": 4930 + }, + { + "epoch": 0.26958981999097903, + "grad_norm": 1.3524943590164185, + "learning_rate": 1.7645676493201455e-05, + "loss": 1.6854, + "step": 4931 + }, + { + "epoch": 0.2696444924347006, + "grad_norm": 1.460638403892517, + "learning_rate": 1.7644498554310322e-05, + "loss": 1.3375, + "step": 4932 + }, + { + "epoch": 0.26969916487842216, + "grad_norm": 1.43340003490448, + "learning_rate": 1.764332036015145e-05, + "loss": 1.4842, + "step": 4933 + }, + { + "epoch": 0.2697538373221437, + "grad_norm": 1.5152134895324707, + "learning_rate": 1.7642141910764164e-05, + "loss": 1.1674, + "step": 4934 + }, + { + "epoch": 0.26980850976586523, + "grad_norm": 1.3884683847427368, + "learning_rate": 1.7640963206187835e-05, + "loss": 1.4879, + "step": 4935 + }, + { + "epoch": 0.2698631822095868, + "grad_norm": 1.7648909091949463, + "learning_rate": 1.7639784246461813e-05, + "loss": 1.5183, + "step": 4936 + }, + { + "epoch": 0.26991785465330836, + "grad_norm": 1.7494233846664429, + "learning_rate": 1.7638605031625467e-05, + "loss": 1.3993, + "step": 4937 + }, + { + "epoch": 0.2699725270970299, + "grad_norm": 1.577858328819275, + "learning_rate": 1.7637425561718176e-05, + "loss": 1.3717, + "step": 4938 + }, + { + "epoch": 0.2700271995407515, + "grad_norm": 1.5335357189178467, + "learning_rate": 1.763624583677932e-05, + "loss": 1.5975, + "step": 4939 + }, + { + "epoch": 0.27008187198447303, + "grad_norm": 1.721850037574768, + "learning_rate": 1.76350658568483e-05, + "loss": 1.337, + "step": 4940 + }, + { + "epoch": 0.27013654442819457, + "grad_norm": 1.3936381340026855, + "learning_rate": 1.7633885621964516e-05, + "loss": 1.2585, + "step": 4941 + }, + { + "epoch": 0.2701912168719161, + "grad_norm": 1.326910376548767, + "learning_rate": 1.7632705132167377e-05, + "loss": 1.4835, + "step": 4942 + }, + { + "epoch": 0.2702458893156377, + "grad_norm": 1.2203559875488281, + "learning_rate": 1.76315243874963e-05, + "loss": 1.3301, + "step": 4943 + }, + { + "epoch": 0.27030056175935924, + "grad_norm": 1.831658124923706, + "learning_rate": 1.7630343387990713e-05, + "loss": 1.5421, + "step": 4944 + }, + { + "epoch": 0.2703552342030808, + "grad_norm": 1.3260447978973389, + "learning_rate": 1.7629162133690063e-05, + "loss": 1.5653, + "step": 4945 + }, + { + "epoch": 0.27040990664680237, + "grad_norm": 1.192801833152771, + "learning_rate": 1.762798062463378e-05, + "loss": 1.5352, + "step": 4946 + }, + { + "epoch": 0.2704645790905239, + "grad_norm": 1.367666244506836, + "learning_rate": 1.762679886086133e-05, + "loss": 1.4133, + "step": 4947 + }, + { + "epoch": 0.27051925153424544, + "grad_norm": 1.478309988975525, + "learning_rate": 1.7625616842412166e-05, + "loss": 1.6135, + "step": 4948 + }, + { + "epoch": 0.270573923977967, + "grad_norm": 1.3489261865615845, + "learning_rate": 1.7624434569325762e-05, + "loss": 1.4172, + "step": 4949 + }, + { + "epoch": 0.27062859642168857, + "grad_norm": 2.003322124481201, + "learning_rate": 1.7623252041641596e-05, + "loss": 1.6973, + "step": 4950 + }, + { + "epoch": 0.2706832688654101, + "grad_norm": 1.3544548749923706, + "learning_rate": 1.7622069259399158e-05, + "loss": 1.5701, + "step": 4951 + }, + { + "epoch": 0.27073794130913165, + "grad_norm": 1.6673184633255005, + "learning_rate": 1.762088622263794e-05, + "loss": 1.5347, + "step": 4952 + }, + { + "epoch": 0.27079261375285324, + "grad_norm": 1.431174635887146, + "learning_rate": 1.7619702931397448e-05, + "loss": 1.3386, + "step": 4953 + }, + { + "epoch": 0.2708472861965748, + "grad_norm": 1.5123841762542725, + "learning_rate": 1.7618519385717194e-05, + "loss": 1.5001, + "step": 4954 + }, + { + "epoch": 0.2709019586402963, + "grad_norm": 1.43544602394104, + "learning_rate": 1.76173355856367e-05, + "loss": 1.3902, + "step": 4955 + }, + { + "epoch": 0.27095663108401785, + "grad_norm": 1.4206606149673462, + "learning_rate": 1.76161515311955e-05, + "loss": 1.3102, + "step": 4956 + }, + { + "epoch": 0.27101130352773944, + "grad_norm": 2.5335795879364014, + "learning_rate": 1.7614967222433125e-05, + "loss": 1.283, + "step": 4957 + }, + { + "epoch": 0.271065975971461, + "grad_norm": 1.669156551361084, + "learning_rate": 1.761378265938913e-05, + "loss": 1.2143, + "step": 4958 + }, + { + "epoch": 0.2711206484151825, + "grad_norm": 1.6422395706176758, + "learning_rate": 1.761259784210306e-05, + "loss": 1.5545, + "step": 4959 + }, + { + "epoch": 0.2711753208589041, + "grad_norm": 1.2805944681167603, + "learning_rate": 1.7611412770614487e-05, + "loss": 1.5698, + "step": 4960 + }, + { + "epoch": 0.27122999330262565, + "grad_norm": 1.818114995956421, + "learning_rate": 1.761022744496298e-05, + "loss": 1.5414, + "step": 4961 + }, + { + "epoch": 0.2712846657463472, + "grad_norm": 1.2699707746505737, + "learning_rate": 1.7609041865188122e-05, + "loss": 1.2206, + "step": 4962 + }, + { + "epoch": 0.2713393381900687, + "grad_norm": 1.8911164999008179, + "learning_rate": 1.7607856031329497e-05, + "loss": 1.4933, + "step": 4963 + }, + { + "epoch": 0.2713940106337903, + "grad_norm": 1.3893883228302002, + "learning_rate": 1.760666994342671e-05, + "loss": 1.501, + "step": 4964 + }, + { + "epoch": 0.27144868307751185, + "grad_norm": 1.7728745937347412, + "learning_rate": 1.7605483601519366e-05, + "loss": 1.4309, + "step": 4965 + }, + { + "epoch": 0.2715033555212334, + "grad_norm": 2.134413242340088, + "learning_rate": 1.7604297005647076e-05, + "loss": 1.4776, + "step": 4966 + }, + { + "epoch": 0.271558027964955, + "grad_norm": 1.5759636163711548, + "learning_rate": 1.7603110155849463e-05, + "loss": 1.4328, + "step": 4967 + }, + { + "epoch": 0.2716127004086765, + "grad_norm": 1.6058745384216309, + "learning_rate": 1.7601923052166162e-05, + "loss": 1.4556, + "step": 4968 + }, + { + "epoch": 0.27166737285239806, + "grad_norm": 1.3097907304763794, + "learning_rate": 1.7600735694636814e-05, + "loss": 1.6367, + "step": 4969 + }, + { + "epoch": 0.2717220452961196, + "grad_norm": 1.686509609222412, + "learning_rate": 1.759954808330106e-05, + "loss": 1.4892, + "step": 4970 + }, + { + "epoch": 0.2717767177398412, + "grad_norm": 1.5192960500717163, + "learning_rate": 1.759836021819857e-05, + "loss": 1.597, + "step": 4971 + }, + { + "epoch": 0.2718313901835627, + "grad_norm": 1.4612940549850464, + "learning_rate": 1.7597172099368998e-05, + "loss": 1.3016, + "step": 4972 + }, + { + "epoch": 0.27188606262728426, + "grad_norm": 1.7877280712127686, + "learning_rate": 1.7595983726852022e-05, + "loss": 1.2686, + "step": 4973 + }, + { + "epoch": 0.27194073507100586, + "grad_norm": 1.590567708015442, + "learning_rate": 1.7594795100687324e-05, + "loss": 1.4141, + "step": 4974 + }, + { + "epoch": 0.2719954075147274, + "grad_norm": 1.994118332862854, + "learning_rate": 1.75936062209146e-05, + "loss": 1.6079, + "step": 4975 + }, + { + "epoch": 0.27205007995844893, + "grad_norm": 1.597395420074463, + "learning_rate": 1.759241708757354e-05, + "loss": 1.4063, + "step": 4976 + }, + { + "epoch": 0.27210475240217047, + "grad_norm": 1.4457952976226807, + "learning_rate": 1.7591227700703858e-05, + "loss": 1.451, + "step": 4977 + }, + { + "epoch": 0.27215942484589206, + "grad_norm": 1.7129336595535278, + "learning_rate": 1.7590038060345277e-05, + "loss": 1.5597, + "step": 4978 + }, + { + "epoch": 0.2722140972896136, + "grad_norm": 1.5061402320861816, + "learning_rate": 1.7588848166537507e-05, + "loss": 1.4443, + "step": 4979 + }, + { + "epoch": 0.27226876973333514, + "grad_norm": 1.422021746635437, + "learning_rate": 1.758765801932029e-05, + "loss": 1.4796, + "step": 4980 + }, + { + "epoch": 0.27232344217705673, + "grad_norm": 2.222479820251465, + "learning_rate": 1.7586467618733368e-05, + "loss": 1.5307, + "step": 4981 + }, + { + "epoch": 0.27237811462077827, + "grad_norm": 1.908524513244629, + "learning_rate": 1.758527696481649e-05, + "loss": 1.4925, + "step": 4982 + }, + { + "epoch": 0.2724327870644998, + "grad_norm": 1.485072374343872, + "learning_rate": 1.7584086057609413e-05, + "loss": 1.5673, + "step": 4983 + }, + { + "epoch": 0.27248745950822134, + "grad_norm": 1.3926429748535156, + "learning_rate": 1.7582894897151908e-05, + "loss": 1.7984, + "step": 4984 + }, + { + "epoch": 0.27254213195194293, + "grad_norm": 1.525830864906311, + "learning_rate": 1.758170348348375e-05, + "loss": 1.5824, + "step": 4985 + }, + { + "epoch": 0.27259680439566447, + "grad_norm": 1.7665393352508545, + "learning_rate": 1.7580511816644718e-05, + "loss": 1.6733, + "step": 4986 + }, + { + "epoch": 0.272651476839386, + "grad_norm": 1.3408474922180176, + "learning_rate": 1.7579319896674612e-05, + "loss": 1.5698, + "step": 4987 + }, + { + "epoch": 0.2727061492831076, + "grad_norm": 1.6377314329147339, + "learning_rate": 1.7578127723613224e-05, + "loss": 1.2174, + "step": 4988 + }, + { + "epoch": 0.27276082172682914, + "grad_norm": 1.8399068117141724, + "learning_rate": 1.7576935297500374e-05, + "loss": 1.2188, + "step": 4989 + }, + { + "epoch": 0.2728154941705507, + "grad_norm": 1.6015101671218872, + "learning_rate": 1.757574261837587e-05, + "loss": 1.4368, + "step": 4990 + }, + { + "epoch": 0.2728701666142722, + "grad_norm": 1.3174091577529907, + "learning_rate": 1.7574549686279545e-05, + "loss": 1.2591, + "step": 4991 + }, + { + "epoch": 0.2729248390579938, + "grad_norm": 1.579028844833374, + "learning_rate": 1.7573356501251235e-05, + "loss": 1.5378, + "step": 4992 + }, + { + "epoch": 0.27297951150171534, + "grad_norm": 1.8839596509933472, + "learning_rate": 1.7572163063330773e-05, + "loss": 1.5096, + "step": 4993 + }, + { + "epoch": 0.2730341839454369, + "grad_norm": 1.1861947774887085, + "learning_rate": 1.7570969372558023e-05, + "loss": 1.5559, + "step": 4994 + }, + { + "epoch": 0.2730888563891585, + "grad_norm": 1.5448182821273804, + "learning_rate": 1.756977542897284e-05, + "loss": 1.5267, + "step": 4995 + }, + { + "epoch": 0.27314352883288, + "grad_norm": 1.3619283437728882, + "learning_rate": 1.756858123261509e-05, + "loss": 1.2209, + "step": 4996 + }, + { + "epoch": 0.27319820127660155, + "grad_norm": 1.368538737297058, + "learning_rate": 1.7567386783524655e-05, + "loss": 1.5344, + "step": 4997 + }, + { + "epoch": 0.27325287372032314, + "grad_norm": 1.5050935745239258, + "learning_rate": 1.7566192081741416e-05, + "loss": 1.544, + "step": 4998 + }, + { + "epoch": 0.2733075461640447, + "grad_norm": 1.251407265663147, + "learning_rate": 1.7564997127305268e-05, + "loss": 1.4152, + "step": 4999 + }, + { + "epoch": 0.2733622186077662, + "grad_norm": 1.7875871658325195, + "learning_rate": 1.7563801920256115e-05, + "loss": 1.2288, + "step": 5000 + }, + { + "epoch": 0.27341689105148775, + "grad_norm": 1.8968404531478882, + "learning_rate": 1.7562606460633867e-05, + "loss": 1.4219, + "step": 5001 + }, + { + "epoch": 0.27347156349520935, + "grad_norm": 1.5862276554107666, + "learning_rate": 1.7561410748478443e-05, + "loss": 1.5488, + "step": 5002 + }, + { + "epoch": 0.2735262359389309, + "grad_norm": 1.7669655084609985, + "learning_rate": 1.756021478382977e-05, + "loss": 1.5628, + "step": 5003 + }, + { + "epoch": 0.2735809083826524, + "grad_norm": 1.4815008640289307, + "learning_rate": 1.7559018566727788e-05, + "loss": 1.2789, + "step": 5004 + }, + { + "epoch": 0.273635580826374, + "grad_norm": 1.8438282012939453, + "learning_rate": 1.7557822097212433e-05, + "loss": 1.6898, + "step": 5005 + }, + { + "epoch": 0.27369025327009555, + "grad_norm": 1.5546648502349854, + "learning_rate": 1.755662537532367e-05, + "loss": 1.4028, + "step": 5006 + }, + { + "epoch": 0.2737449257138171, + "grad_norm": 1.7778711318969727, + "learning_rate": 1.7555428401101445e-05, + "loss": 1.5503, + "step": 5007 + }, + { + "epoch": 0.2737995981575386, + "grad_norm": 1.4305341243743896, + "learning_rate": 1.7554231174585742e-05, + "loss": 1.415, + "step": 5008 + }, + { + "epoch": 0.2738542706012602, + "grad_norm": 1.5141816139221191, + "learning_rate": 1.755303369581653e-05, + "loss": 1.511, + "step": 5009 + }, + { + "epoch": 0.27390894304498176, + "grad_norm": 1.5445648431777954, + "learning_rate": 1.7551835964833803e-05, + "loss": 1.3865, + "step": 5010 + }, + { + "epoch": 0.2739636154887033, + "grad_norm": 1.4566867351531982, + "learning_rate": 1.755063798167755e-05, + "loss": 1.3921, + "step": 5011 + }, + { + "epoch": 0.2740182879324249, + "grad_norm": 1.619996190071106, + "learning_rate": 1.7549439746387776e-05, + "loss": 1.3812, + "step": 5012 + }, + { + "epoch": 0.2740729603761464, + "grad_norm": 1.2795169353485107, + "learning_rate": 1.7548241259004496e-05, + "loss": 1.4402, + "step": 5013 + }, + { + "epoch": 0.27412763281986796, + "grad_norm": 1.319917917251587, + "learning_rate": 1.7547042519567728e-05, + "loss": 1.5425, + "step": 5014 + }, + { + "epoch": 0.2741823052635895, + "grad_norm": 1.530895471572876, + "learning_rate": 1.75458435281175e-05, + "loss": 1.4373, + "step": 5015 + }, + { + "epoch": 0.2742369777073111, + "grad_norm": 1.1462253332138062, + "learning_rate": 1.7544644284693847e-05, + "loss": 1.3329, + "step": 5016 + }, + { + "epoch": 0.27429165015103263, + "grad_norm": 1.7655307054519653, + "learning_rate": 1.754344478933682e-05, + "loss": 1.774, + "step": 5017 + }, + { + "epoch": 0.27434632259475417, + "grad_norm": 1.3872535228729248, + "learning_rate": 1.754224504208647e-05, + "loss": 1.5371, + "step": 5018 + }, + { + "epoch": 0.27440099503847576, + "grad_norm": 1.4587730169296265, + "learning_rate": 1.754104504298286e-05, + "loss": 1.4888, + "step": 5019 + }, + { + "epoch": 0.2744556674821973, + "grad_norm": 1.6562490463256836, + "learning_rate": 1.753984479206606e-05, + "loss": 1.386, + "step": 5020 + }, + { + "epoch": 0.27451033992591883, + "grad_norm": 1.2282257080078125, + "learning_rate": 1.7538644289376147e-05, + "loss": 1.4131, + "step": 5021 + }, + { + "epoch": 0.27456501236964037, + "grad_norm": 1.237426519393921, + "learning_rate": 1.7537443534953213e-05, + "loss": 1.4979, + "step": 5022 + }, + { + "epoch": 0.27461968481336196, + "grad_norm": 1.5615054368972778, + "learning_rate": 1.753624252883735e-05, + "loss": 1.3776, + "step": 5023 + }, + { + "epoch": 0.2746743572570835, + "grad_norm": 1.6500229835510254, + "learning_rate": 1.753504127106867e-05, + "loss": 1.3574, + "step": 5024 + }, + { + "epoch": 0.27472902970080504, + "grad_norm": 1.5050575733184814, + "learning_rate": 1.7533839761687278e-05, + "loss": 1.4992, + "step": 5025 + }, + { + "epoch": 0.27478370214452663, + "grad_norm": 1.4506889581680298, + "learning_rate": 1.7532638000733296e-05, + "loss": 1.3783, + "step": 5026 + }, + { + "epoch": 0.27483837458824817, + "grad_norm": 2.580312967300415, + "learning_rate": 1.7531435988246857e-05, + "loss": 1.0966, + "step": 5027 + }, + { + "epoch": 0.2748930470319697, + "grad_norm": 1.9669934511184692, + "learning_rate": 1.75302337242681e-05, + "loss": 1.2031, + "step": 5028 + }, + { + "epoch": 0.27494771947569124, + "grad_norm": 1.5590554475784302, + "learning_rate": 1.7529031208837165e-05, + "loss": 1.6106, + "step": 5029 + }, + { + "epoch": 0.27500239191941284, + "grad_norm": 1.6142020225524902, + "learning_rate": 1.752782844199421e-05, + "loss": 1.1191, + "step": 5030 + }, + { + "epoch": 0.2750570643631344, + "grad_norm": 1.4533579349517822, + "learning_rate": 1.75266254237794e-05, + "loss": 1.3309, + "step": 5031 + }, + { + "epoch": 0.2751117368068559, + "grad_norm": 1.3028345108032227, + "learning_rate": 1.7525422154232906e-05, + "loss": 1.5026, + "step": 5032 + }, + { + "epoch": 0.2751664092505775, + "grad_norm": 1.5833700895309448, + "learning_rate": 1.7524218633394904e-05, + "loss": 1.7407, + "step": 5033 + }, + { + "epoch": 0.27522108169429904, + "grad_norm": 1.4665720462799072, + "learning_rate": 1.7523014861305588e-05, + "loss": 1.4719, + "step": 5034 + }, + { + "epoch": 0.2752757541380206, + "grad_norm": 1.2627657651901245, + "learning_rate": 1.7521810838005154e-05, + "loss": 1.5679, + "step": 5035 + }, + { + "epoch": 0.2753304265817421, + "grad_norm": 1.3343783617019653, + "learning_rate": 1.75206065635338e-05, + "loss": 1.3994, + "step": 5036 + }, + { + "epoch": 0.2753850990254637, + "grad_norm": 1.620030164718628, + "learning_rate": 1.751940203793175e-05, + "loss": 1.266, + "step": 5037 + }, + { + "epoch": 0.27543977146918525, + "grad_norm": 1.5484371185302734, + "learning_rate": 1.751819726123922e-05, + "loss": 1.5251, + "step": 5038 + }, + { + "epoch": 0.2754944439129068, + "grad_norm": 1.5984094142913818, + "learning_rate": 1.7516992233496443e-05, + "loss": 1.471, + "step": 5039 + }, + { + "epoch": 0.2755491163566284, + "grad_norm": 1.611484408378601, + "learning_rate": 1.7515786954743657e-05, + "loss": 1.5481, + "step": 5040 + }, + { + "epoch": 0.2756037888003499, + "grad_norm": 1.439035177230835, + "learning_rate": 1.7514581425021107e-05, + "loss": 1.3353, + "step": 5041 + }, + { + "epoch": 0.27565846124407145, + "grad_norm": 1.6967220306396484, + "learning_rate": 1.7513375644369048e-05, + "loss": 1.6445, + "step": 5042 + }, + { + "epoch": 0.275713133687793, + "grad_norm": 1.7656986713409424, + "learning_rate": 1.7512169612827748e-05, + "loss": 1.3493, + "step": 5043 + }, + { + "epoch": 0.2757678061315146, + "grad_norm": 2.3180947303771973, + "learning_rate": 1.7510963330437474e-05, + "loss": 1.4241, + "step": 5044 + }, + { + "epoch": 0.2758224785752361, + "grad_norm": 1.3247917890548706, + "learning_rate": 1.7509756797238512e-05, + "loss": 1.5716, + "step": 5045 + }, + { + "epoch": 0.27587715101895766, + "grad_norm": 1.5617756843566895, + "learning_rate": 1.7508550013271146e-05, + "loss": 1.5919, + "step": 5046 + }, + { + "epoch": 0.27593182346267925, + "grad_norm": 1.2626290321350098, + "learning_rate": 1.7507342978575676e-05, + "loss": 1.4379, + "step": 5047 + }, + { + "epoch": 0.2759864959064008, + "grad_norm": 1.5131587982177734, + "learning_rate": 1.750613569319241e-05, + "loss": 1.443, + "step": 5048 + }, + { + "epoch": 0.2760411683501223, + "grad_norm": 2.1037726402282715, + "learning_rate": 1.7504928157161657e-05, + "loss": 1.7281, + "step": 5049 + }, + { + "epoch": 0.27609584079384386, + "grad_norm": 1.6635305881500244, + "learning_rate": 1.7503720370523742e-05, + "loss": 1.4833, + "step": 5050 + }, + { + "epoch": 0.27615051323756545, + "grad_norm": 1.5515080690383911, + "learning_rate": 1.7502512333318998e-05, + "loss": 1.2403, + "step": 5051 + }, + { + "epoch": 0.276205185681287, + "grad_norm": 1.195518136024475, + "learning_rate": 1.7501304045587756e-05, + "loss": 1.5826, + "step": 5052 + }, + { + "epoch": 0.27625985812500853, + "grad_norm": 1.375617504119873, + "learning_rate": 1.7500095507370376e-05, + "loss": 1.5208, + "step": 5053 + }, + { + "epoch": 0.2763145305687301, + "grad_norm": 1.3748016357421875, + "learning_rate": 1.7498886718707203e-05, + "loss": 1.3412, + "step": 5054 + }, + { + "epoch": 0.27636920301245166, + "grad_norm": 1.623327612876892, + "learning_rate": 1.749767767963861e-05, + "loss": 1.5466, + "step": 5055 + }, + { + "epoch": 0.2764238754561732, + "grad_norm": 1.4796018600463867, + "learning_rate": 1.7496468390204965e-05, + "loss": 1.5647, + "step": 5056 + }, + { + "epoch": 0.27647854789989473, + "grad_norm": 1.2816691398620605, + "learning_rate": 1.7495258850446646e-05, + "loss": 1.4315, + "step": 5057 + }, + { + "epoch": 0.2765332203436163, + "grad_norm": 1.1851773262023926, + "learning_rate": 1.7494049060404047e-05, + "loss": 1.4395, + "step": 5058 + }, + { + "epoch": 0.27658789278733786, + "grad_norm": 1.6871657371520996, + "learning_rate": 1.7492839020117567e-05, + "loss": 1.4024, + "step": 5059 + }, + { + "epoch": 0.2766425652310594, + "grad_norm": 1.4832764863967896, + "learning_rate": 1.749162872962761e-05, + "loss": 1.5844, + "step": 5060 + }, + { + "epoch": 0.276697237674781, + "grad_norm": 1.6680018901824951, + "learning_rate": 1.7490418188974586e-05, + "loss": 1.3763, + "step": 5061 + }, + { + "epoch": 0.27675191011850253, + "grad_norm": 1.5794941186904907, + "learning_rate": 1.7489207398198924e-05, + "loss": 1.5186, + "step": 5062 + }, + { + "epoch": 0.27680658256222407, + "grad_norm": 1.3677269220352173, + "learning_rate": 1.7487996357341054e-05, + "loss": 1.5895, + "step": 5063 + }, + { + "epoch": 0.2768612550059456, + "grad_norm": 1.5146515369415283, + "learning_rate": 1.7486785066441412e-05, + "loss": 1.3401, + "step": 5064 + }, + { + "epoch": 0.2769159274496672, + "grad_norm": 2.0118958950042725, + "learning_rate": 1.748557352554045e-05, + "loss": 1.585, + "step": 5065 + }, + { + "epoch": 0.27697059989338874, + "grad_norm": 1.5106525421142578, + "learning_rate": 1.7484361734678623e-05, + "loss": 1.4869, + "step": 5066 + }, + { + "epoch": 0.2770252723371103, + "grad_norm": 1.889025330543518, + "learning_rate": 1.7483149693896396e-05, + "loss": 1.605, + "step": 5067 + }, + { + "epoch": 0.27707994478083187, + "grad_norm": 1.3549151420593262, + "learning_rate": 1.7481937403234236e-05, + "loss": 1.6936, + "step": 5068 + }, + { + "epoch": 0.2771346172245534, + "grad_norm": 1.2883135080337524, + "learning_rate": 1.7480724862732634e-05, + "loss": 1.6409, + "step": 5069 + }, + { + "epoch": 0.27718928966827494, + "grad_norm": 1.9500864744186401, + "learning_rate": 1.747951207243207e-05, + "loss": 1.1751, + "step": 5070 + }, + { + "epoch": 0.2772439621119965, + "grad_norm": 1.6023545265197754, + "learning_rate": 1.7478299032373053e-05, + "loss": 1.3127, + "step": 5071 + }, + { + "epoch": 0.27729863455571807, + "grad_norm": 1.6267398595809937, + "learning_rate": 1.747708574259608e-05, + "loss": 1.4546, + "step": 5072 + }, + { + "epoch": 0.2773533069994396, + "grad_norm": 1.689576268196106, + "learning_rate": 1.747587220314167e-05, + "loss": 1.3252, + "step": 5073 + }, + { + "epoch": 0.27740797944316115, + "grad_norm": 2.2005815505981445, + "learning_rate": 1.7474658414050344e-05, + "loss": 1.396, + "step": 5074 + }, + { + "epoch": 0.27746265188688274, + "grad_norm": 1.5338242053985596, + "learning_rate": 1.747344437536263e-05, + "loss": 1.4053, + "step": 5075 + }, + { + "epoch": 0.2775173243306043, + "grad_norm": 1.3248987197875977, + "learning_rate": 1.7472230087119074e-05, + "loss": 1.3664, + "step": 5076 + }, + { + "epoch": 0.2775719967743258, + "grad_norm": 1.3134154081344604, + "learning_rate": 1.747101554936022e-05, + "loss": 1.6867, + "step": 5077 + }, + { + "epoch": 0.27762666921804735, + "grad_norm": 1.3268580436706543, + "learning_rate": 1.746980076212663e-05, + "loss": 1.6839, + "step": 5078 + }, + { + "epoch": 0.27768134166176894, + "grad_norm": 1.4226760864257812, + "learning_rate": 1.746858572545886e-05, + "loss": 1.5471, + "step": 5079 + }, + { + "epoch": 0.2777360141054905, + "grad_norm": 1.450514554977417, + "learning_rate": 1.7467370439397487e-05, + "loss": 1.363, + "step": 5080 + }, + { + "epoch": 0.277790686549212, + "grad_norm": 1.5338201522827148, + "learning_rate": 1.7466154903983092e-05, + "loss": 1.7174, + "step": 5081 + }, + { + "epoch": 0.2778453589929336, + "grad_norm": 1.363319754600525, + "learning_rate": 1.746493911925627e-05, + "loss": 1.4826, + "step": 5082 + }, + { + "epoch": 0.27790003143665515, + "grad_norm": 1.9021944999694824, + "learning_rate": 1.7463723085257606e-05, + "loss": 1.496, + "step": 5083 + }, + { + "epoch": 0.2779547038803767, + "grad_norm": 2.1240339279174805, + "learning_rate": 1.746250680202772e-05, + "loss": 1.3315, + "step": 5084 + }, + { + "epoch": 0.2780093763240982, + "grad_norm": 3.109851360321045, + "learning_rate": 1.7461290269607217e-05, + "loss": 1.3078, + "step": 5085 + }, + { + "epoch": 0.2780640487678198, + "grad_norm": 1.5030238628387451, + "learning_rate": 1.7460073488036723e-05, + "loss": 1.4155, + "step": 5086 + }, + { + "epoch": 0.27811872121154135, + "grad_norm": 1.6980332136154175, + "learning_rate": 1.7458856457356873e-05, + "loss": 1.4788, + "step": 5087 + }, + { + "epoch": 0.2781733936552629, + "grad_norm": 1.3860145807266235, + "learning_rate": 1.74576391776083e-05, + "loss": 1.2906, + "step": 5088 + }, + { + "epoch": 0.2782280660989845, + "grad_norm": 1.3875770568847656, + "learning_rate": 1.7456421648831658e-05, + "loss": 1.3007, + "step": 5089 + }, + { + "epoch": 0.278282738542706, + "grad_norm": 1.4999006986618042, + "learning_rate": 1.74552038710676e-05, + "loss": 1.7085, + "step": 5090 + }, + { + "epoch": 0.27833741098642756, + "grad_norm": 1.6192119121551514, + "learning_rate": 1.7453985844356786e-05, + "loss": 1.4875, + "step": 5091 + }, + { + "epoch": 0.2783920834301491, + "grad_norm": 1.2496856451034546, + "learning_rate": 1.74527675687399e-05, + "loss": 1.4264, + "step": 5092 + }, + { + "epoch": 0.2784467558738707, + "grad_norm": 1.196745753288269, + "learning_rate": 1.7451549044257608e-05, + "loss": 1.8237, + "step": 5093 + }, + { + "epoch": 0.2785014283175922, + "grad_norm": 1.3717544078826904, + "learning_rate": 1.7450330270950614e-05, + "loss": 1.3676, + "step": 5094 + }, + { + "epoch": 0.27855610076131376, + "grad_norm": 1.9866243600845337, + "learning_rate": 1.744911124885961e-05, + "loss": 1.5083, + "step": 5095 + }, + { + "epoch": 0.27861077320503536, + "grad_norm": 1.2495722770690918, + "learning_rate": 1.74478919780253e-05, + "loss": 1.3836, + "step": 5096 + }, + { + "epoch": 0.2786654456487569, + "grad_norm": 1.386962652206421, + "learning_rate": 1.7446672458488398e-05, + "loss": 1.478, + "step": 5097 + }, + { + "epoch": 0.27872011809247843, + "grad_norm": 2.0265390872955322, + "learning_rate": 1.7445452690289632e-05, + "loss": 1.5072, + "step": 5098 + }, + { + "epoch": 0.27877479053619997, + "grad_norm": 1.7404568195343018, + "learning_rate": 1.7444232673469726e-05, + "loss": 1.6076, + "step": 5099 + }, + { + "epoch": 0.27882946297992156, + "grad_norm": 1.2443568706512451, + "learning_rate": 1.7443012408069427e-05, + "loss": 1.3721, + "step": 5100 + }, + { + "epoch": 0.2788841354236431, + "grad_norm": 1.2547944784164429, + "learning_rate": 1.744179189412947e-05, + "loss": 1.4838, + "step": 5101 + }, + { + "epoch": 0.27893880786736464, + "grad_norm": 1.8968364000320435, + "learning_rate": 1.7440571131690626e-05, + "loss": 1.3838, + "step": 5102 + }, + { + "epoch": 0.27899348031108623, + "grad_norm": 1.4851725101470947, + "learning_rate": 1.7439350120793652e-05, + "loss": 1.4378, + "step": 5103 + }, + { + "epoch": 0.27904815275480777, + "grad_norm": 1.6259123086929321, + "learning_rate": 1.7438128861479316e-05, + "loss": 1.5519, + "step": 5104 + }, + { + "epoch": 0.2791028251985293, + "grad_norm": 1.5723763704299927, + "learning_rate": 1.7436907353788404e-05, + "loss": 1.3414, + "step": 5105 + }, + { + "epoch": 0.27915749764225084, + "grad_norm": 1.4335966110229492, + "learning_rate": 1.743568559776171e-05, + "loss": 1.5687, + "step": 5106 + }, + { + "epoch": 0.27921217008597243, + "grad_norm": 1.1393799781799316, + "learning_rate": 1.743446359344002e-05, + "loss": 1.3816, + "step": 5107 + }, + { + "epoch": 0.27926684252969397, + "grad_norm": 1.3704856634140015, + "learning_rate": 1.7433241340864147e-05, + "loss": 1.5159, + "step": 5108 + }, + { + "epoch": 0.2793215149734155, + "grad_norm": 1.6540964841842651, + "learning_rate": 1.7432018840074905e-05, + "loss": 1.2824, + "step": 5109 + }, + { + "epoch": 0.2793761874171371, + "grad_norm": 1.4601014852523804, + "learning_rate": 1.743079609111311e-05, + "loss": 1.5889, + "step": 5110 + }, + { + "epoch": 0.27943085986085864, + "grad_norm": 1.9972821474075317, + "learning_rate": 1.74295730940196e-05, + "loss": 1.4296, + "step": 5111 + }, + { + "epoch": 0.2794855323045802, + "grad_norm": 1.373531699180603, + "learning_rate": 1.7428349848835208e-05, + "loss": 1.5869, + "step": 5112 + }, + { + "epoch": 0.2795402047483017, + "grad_norm": 1.9616692066192627, + "learning_rate": 1.742712635560078e-05, + "loss": 1.1991, + "step": 5113 + }, + { + "epoch": 0.2795948771920233, + "grad_norm": 1.8620679378509521, + "learning_rate": 1.7425902614357182e-05, + "loss": 1.5068, + "step": 5114 + }, + { + "epoch": 0.27964954963574484, + "grad_norm": 1.3888134956359863, + "learning_rate": 1.7424678625145266e-05, + "loss": 1.293, + "step": 5115 + }, + { + "epoch": 0.2797042220794664, + "grad_norm": 1.5410009622573853, + "learning_rate": 1.742345438800591e-05, + "loss": 1.2846, + "step": 5116 + }, + { + "epoch": 0.279758894523188, + "grad_norm": 1.4248782396316528, + "learning_rate": 1.7422229902979992e-05, + "loss": 1.4037, + "step": 5117 + }, + { + "epoch": 0.2798135669669095, + "grad_norm": 1.4917612075805664, + "learning_rate": 1.7421005170108402e-05, + "loss": 1.3638, + "step": 5118 + }, + { + "epoch": 0.27986823941063105, + "grad_norm": 1.2055351734161377, + "learning_rate": 1.741978018943203e-05, + "loss": 1.5563, + "step": 5119 + }, + { + "epoch": 0.2799229118543526, + "grad_norm": 1.1647611856460571, + "learning_rate": 1.741855496099179e-05, + "loss": 1.778, + "step": 5120 + }, + { + "epoch": 0.2799775842980742, + "grad_norm": 1.6351655721664429, + "learning_rate": 1.7417329484828594e-05, + "loss": 1.3553, + "step": 5121 + }, + { + "epoch": 0.2800322567417957, + "grad_norm": 1.3950883150100708, + "learning_rate": 1.7416103760983357e-05, + "loss": 1.3718, + "step": 5122 + }, + { + "epoch": 0.28008692918551725, + "grad_norm": 1.6289665699005127, + "learning_rate": 1.7414877789497017e-05, + "loss": 1.4357, + "step": 5123 + }, + { + "epoch": 0.28014160162923885, + "grad_norm": 1.3299474716186523, + "learning_rate": 1.7413651570410504e-05, + "loss": 1.8733, + "step": 5124 + }, + { + "epoch": 0.2801962740729604, + "grad_norm": 1.3059226274490356, + "learning_rate": 1.7412425103764773e-05, + "loss": 1.7164, + "step": 5125 + }, + { + "epoch": 0.2802509465166819, + "grad_norm": 1.5461419820785522, + "learning_rate": 1.741119838960077e-05, + "loss": 1.3981, + "step": 5126 + }, + { + "epoch": 0.28030561896040346, + "grad_norm": 1.3639979362487793, + "learning_rate": 1.7409971427959465e-05, + "loss": 1.4259, + "step": 5127 + }, + { + "epoch": 0.28036029140412505, + "grad_norm": 1.9640988111495972, + "learning_rate": 1.7408744218881823e-05, + "loss": 1.2467, + "step": 5128 + }, + { + "epoch": 0.2804149638478466, + "grad_norm": 1.7853120565414429, + "learning_rate": 1.7407516762408826e-05, + "loss": 1.4476, + "step": 5129 + }, + { + "epoch": 0.2804696362915681, + "grad_norm": 1.650462031364441, + "learning_rate": 1.7406289058581466e-05, + "loss": 1.4307, + "step": 5130 + }, + { + "epoch": 0.2805243087352897, + "grad_norm": 1.6901918649673462, + "learning_rate": 1.740506110744073e-05, + "loss": 1.5599, + "step": 5131 + }, + { + "epoch": 0.28057898117901126, + "grad_norm": 1.7263340950012207, + "learning_rate": 1.7403832909027633e-05, + "loss": 1.5948, + "step": 5132 + }, + { + "epoch": 0.2806336536227328, + "grad_norm": 1.4450253248214722, + "learning_rate": 1.7402604463383176e-05, + "loss": 1.5457, + "step": 5133 + }, + { + "epoch": 0.28068832606645433, + "grad_norm": 1.745153784751892, + "learning_rate": 1.7401375770548387e-05, + "loss": 1.3187, + "step": 5134 + }, + { + "epoch": 0.2807429985101759, + "grad_norm": 1.3101375102996826, + "learning_rate": 1.7400146830564295e-05, + "loss": 1.4142, + "step": 5135 + }, + { + "epoch": 0.28079767095389746, + "grad_norm": 1.2665128707885742, + "learning_rate": 1.7398917643471933e-05, + "loss": 1.5016, + "step": 5136 + }, + { + "epoch": 0.280852343397619, + "grad_norm": 1.679267406463623, + "learning_rate": 1.739768820931235e-05, + "loss": 1.3968, + "step": 5137 + }, + { + "epoch": 0.2809070158413406, + "grad_norm": 1.315100908279419, + "learning_rate": 1.7396458528126595e-05, + "loss": 1.452, + "step": 5138 + }, + { + "epoch": 0.28096168828506213, + "grad_norm": 1.350100040435791, + "learning_rate": 1.739522859995574e-05, + "loss": 1.2741, + "step": 5139 + }, + { + "epoch": 0.28101636072878367, + "grad_norm": 1.4877734184265137, + "learning_rate": 1.7393998424840845e-05, + "loss": 1.4245, + "step": 5140 + }, + { + "epoch": 0.2810710331725052, + "grad_norm": 1.3972216844558716, + "learning_rate": 1.739276800282299e-05, + "loss": 1.3638, + "step": 5141 + }, + { + "epoch": 0.2811257056162268, + "grad_norm": 1.648511290550232, + "learning_rate": 1.7391537333943267e-05, + "loss": 1.708, + "step": 5142 + }, + { + "epoch": 0.28118037805994833, + "grad_norm": 1.4881881475448608, + "learning_rate": 1.7390306418242767e-05, + "loss": 1.4533, + "step": 5143 + }, + { + "epoch": 0.28123505050366987, + "grad_norm": 1.3510249853134155, + "learning_rate": 1.7389075255762592e-05, + "loss": 1.7002, + "step": 5144 + }, + { + "epoch": 0.28128972294739146, + "grad_norm": 1.4103264808654785, + "learning_rate": 1.7387843846543858e-05, + "loss": 1.571, + "step": 5145 + }, + { + "epoch": 0.281344395391113, + "grad_norm": 1.3821020126342773, + "learning_rate": 1.7386612190627682e-05, + "loss": 1.6295, + "step": 5146 + }, + { + "epoch": 0.28139906783483454, + "grad_norm": 1.2628849744796753, + "learning_rate": 1.7385380288055187e-05, + "loss": 1.4368, + "step": 5147 + }, + { + "epoch": 0.2814537402785561, + "grad_norm": 1.9803255796432495, + "learning_rate": 1.7384148138867518e-05, + "loss": 1.5145, + "step": 5148 + }, + { + "epoch": 0.28150841272227767, + "grad_norm": 1.3821136951446533, + "learning_rate": 1.7382915743105813e-05, + "loss": 1.4644, + "step": 5149 + }, + { + "epoch": 0.2815630851659992, + "grad_norm": 1.5810855627059937, + "learning_rate": 1.738168310081123e-05, + "loss": 1.2533, + "step": 5150 + }, + { + "epoch": 0.28161775760972074, + "grad_norm": 1.7592661380767822, + "learning_rate": 1.7380450212024924e-05, + "loss": 1.3978, + "step": 5151 + }, + { + "epoch": 0.28167243005344234, + "grad_norm": 1.5376209020614624, + "learning_rate": 1.7379217076788068e-05, + "loss": 1.58, + "step": 5152 + }, + { + "epoch": 0.2817271024971639, + "grad_norm": 1.3462153673171997, + "learning_rate": 1.7377983695141836e-05, + "loss": 1.4763, + "step": 5153 + }, + { + "epoch": 0.2817817749408854, + "grad_norm": 1.5649402141571045, + "learning_rate": 1.7376750067127415e-05, + "loss": 1.4529, + "step": 5154 + }, + { + "epoch": 0.28183644738460695, + "grad_norm": 1.362610101699829, + "learning_rate": 1.7375516192786e-05, + "loss": 1.3541, + "step": 5155 + }, + { + "epoch": 0.28189111982832854, + "grad_norm": 1.703599452972412, + "learning_rate": 1.7374282072158796e-05, + "loss": 1.529, + "step": 5156 + }, + { + "epoch": 0.2819457922720501, + "grad_norm": 1.4788646697998047, + "learning_rate": 1.7373047705287004e-05, + "loss": 1.6398, + "step": 5157 + }, + { + "epoch": 0.2820004647157716, + "grad_norm": 2.5260658264160156, + "learning_rate": 1.737181309221185e-05, + "loss": 1.4479, + "step": 5158 + }, + { + "epoch": 0.2820551371594932, + "grad_norm": 1.2673814296722412, + "learning_rate": 1.7370578232974558e-05, + "loss": 1.3829, + "step": 5159 + }, + { + "epoch": 0.28210980960321475, + "grad_norm": 2.1412363052368164, + "learning_rate": 1.7369343127616367e-05, + "loss": 1.2293, + "step": 5160 + }, + { + "epoch": 0.2821644820469363, + "grad_norm": 1.3720213174819946, + "learning_rate": 1.736810777617851e-05, + "loss": 1.5499, + "step": 5161 + }, + { + "epoch": 0.2822191544906578, + "grad_norm": 1.469547986984253, + "learning_rate": 1.736687217870225e-05, + "loss": 1.4481, + "step": 5162 + }, + { + "epoch": 0.2822738269343794, + "grad_norm": 1.437307596206665, + "learning_rate": 1.7365636335228834e-05, + "loss": 1.5014, + "step": 5163 + }, + { + "epoch": 0.28232849937810095, + "grad_norm": 2.448955774307251, + "learning_rate": 1.736440024579954e-05, + "loss": 1.4817, + "step": 5164 + }, + { + "epoch": 0.2823831718218225, + "grad_norm": 1.9481178522109985, + "learning_rate": 1.7363163910455646e-05, + "loss": 1.3089, + "step": 5165 + }, + { + "epoch": 0.2824378442655441, + "grad_norm": 1.053277611732483, + "learning_rate": 1.7361927329238425e-05, + "loss": 1.5611, + "step": 5166 + }, + { + "epoch": 0.2824925167092656, + "grad_norm": 1.3561166524887085, + "learning_rate": 1.7360690502189176e-05, + "loss": 1.5726, + "step": 5167 + }, + { + "epoch": 0.28254718915298715, + "grad_norm": 2.236356258392334, + "learning_rate": 1.73594534293492e-05, + "loss": 1.559, + "step": 5168 + }, + { + "epoch": 0.2826018615967087, + "grad_norm": 1.3043594360351562, + "learning_rate": 1.7358216110759803e-05, + "loss": 1.4051, + "step": 5169 + }, + { + "epoch": 0.2826565340404303, + "grad_norm": 1.5314030647277832, + "learning_rate": 1.7356978546462306e-05, + "loss": 1.4333, + "step": 5170 + }, + { + "epoch": 0.2827112064841518, + "grad_norm": 1.9079219102859497, + "learning_rate": 1.735574073649803e-05, + "loss": 1.6446, + "step": 5171 + }, + { + "epoch": 0.28276587892787336, + "grad_norm": 1.8145445585250854, + "learning_rate": 1.735450268090831e-05, + "loss": 1.511, + "step": 5172 + }, + { + "epoch": 0.28282055137159495, + "grad_norm": 1.468375325202942, + "learning_rate": 1.7353264379734486e-05, + "loss": 1.2072, + "step": 5173 + }, + { + "epoch": 0.2828752238153165, + "grad_norm": 1.2194592952728271, + "learning_rate": 1.735202583301791e-05, + "loss": 1.3392, + "step": 5174 + }, + { + "epoch": 0.282929896259038, + "grad_norm": 1.4678088426589966, + "learning_rate": 1.7350787040799945e-05, + "loss": 1.3198, + "step": 5175 + }, + { + "epoch": 0.28298456870275956, + "grad_norm": 1.678443431854248, + "learning_rate": 1.7349548003121945e-05, + "loss": 1.5723, + "step": 5176 + }, + { + "epoch": 0.28303924114648116, + "grad_norm": 1.40395188331604, + "learning_rate": 1.7348308720025293e-05, + "loss": 1.3069, + "step": 5177 + }, + { + "epoch": 0.2830939135902027, + "grad_norm": 1.400475263595581, + "learning_rate": 1.734706919155137e-05, + "loss": 1.488, + "step": 5178 + }, + { + "epoch": 0.28314858603392423, + "grad_norm": 1.447616457939148, + "learning_rate": 1.7345829417741564e-05, + "loss": 1.4134, + "step": 5179 + }, + { + "epoch": 0.2832032584776458, + "grad_norm": 1.5044721364974976, + "learning_rate": 1.734458939863728e-05, + "loss": 1.4299, + "step": 5180 + }, + { + "epoch": 0.28325793092136736, + "grad_norm": 1.3734532594680786, + "learning_rate": 1.7343349134279917e-05, + "loss": 1.4874, + "step": 5181 + }, + { + "epoch": 0.2833126033650889, + "grad_norm": 1.1649397611618042, + "learning_rate": 1.7342108624710898e-05, + "loss": 1.5625, + "step": 5182 + }, + { + "epoch": 0.28336727580881044, + "grad_norm": 2.578176736831665, + "learning_rate": 1.734086786997164e-05, + "loss": 1.115, + "step": 5183 + }, + { + "epoch": 0.28342194825253203, + "grad_norm": 1.8593502044677734, + "learning_rate": 1.733962687010358e-05, + "loss": 1.4278, + "step": 5184 + }, + { + "epoch": 0.28347662069625357, + "grad_norm": 1.9782291650772095, + "learning_rate": 1.7338385625148156e-05, + "loss": 1.5602, + "step": 5185 + }, + { + "epoch": 0.2835312931399751, + "grad_norm": 1.4977904558181763, + "learning_rate": 1.7337144135146818e-05, + "loss": 1.4469, + "step": 5186 + }, + { + "epoch": 0.2835859655836967, + "grad_norm": 1.2294431924819946, + "learning_rate": 1.7335902400141017e-05, + "loss": 1.5499, + "step": 5187 + }, + { + "epoch": 0.28364063802741823, + "grad_norm": 1.6851633787155151, + "learning_rate": 1.7334660420172224e-05, + "loss": 1.3727, + "step": 5188 + }, + { + "epoch": 0.28369531047113977, + "grad_norm": 1.5072451829910278, + "learning_rate": 1.7333418195281906e-05, + "loss": 1.4832, + "step": 5189 + }, + { + "epoch": 0.2837499829148613, + "grad_norm": 1.596657633781433, + "learning_rate": 1.7332175725511544e-05, + "loss": 1.3715, + "step": 5190 + }, + { + "epoch": 0.2838046553585829, + "grad_norm": 1.879837155342102, + "learning_rate": 1.733093301090263e-05, + "loss": 1.5255, + "step": 5191 + }, + { + "epoch": 0.28385932780230444, + "grad_norm": 1.9117664098739624, + "learning_rate": 1.7329690051496663e-05, + "loss": 1.2348, + "step": 5192 + }, + { + "epoch": 0.283914000246026, + "grad_norm": 1.2765722274780273, + "learning_rate": 1.7328446847335142e-05, + "loss": 1.6411, + "step": 5193 + }, + { + "epoch": 0.28396867268974757, + "grad_norm": 1.1641002893447876, + "learning_rate": 1.7327203398459586e-05, + "loss": 1.5745, + "step": 5194 + }, + { + "epoch": 0.2840233451334691, + "grad_norm": 1.4712886810302734, + "learning_rate": 1.7325959704911516e-05, + "loss": 1.4456, + "step": 5195 + }, + { + "epoch": 0.28407801757719064, + "grad_norm": 1.4921082258224487, + "learning_rate": 1.732471576673246e-05, + "loss": 1.6349, + "step": 5196 + }, + { + "epoch": 0.28413269002091224, + "grad_norm": 1.5505322217941284, + "learning_rate": 1.7323471583963953e-05, + "loss": 1.4255, + "step": 5197 + }, + { + "epoch": 0.2841873624646338, + "grad_norm": 1.39344322681427, + "learning_rate": 1.7322227156647548e-05, + "loss": 1.4827, + "step": 5198 + }, + { + "epoch": 0.2842420349083553, + "grad_norm": 1.5869499444961548, + "learning_rate": 1.7320982484824796e-05, + "loss": 1.4729, + "step": 5199 + }, + { + "epoch": 0.28429670735207685, + "grad_norm": 1.8519213199615479, + "learning_rate": 1.731973756853726e-05, + "loss": 1.3199, + "step": 5200 + }, + { + "epoch": 0.28435137979579844, + "grad_norm": 2.779874086380005, + "learning_rate": 1.7318492407826508e-05, + "loss": 1.3651, + "step": 5201 + }, + { + "epoch": 0.28440605223952, + "grad_norm": 1.5187231302261353, + "learning_rate": 1.731724700273412e-05, + "loss": 1.1836, + "step": 5202 + }, + { + "epoch": 0.2844607246832415, + "grad_norm": 1.4068193435668945, + "learning_rate": 1.731600135330169e-05, + "loss": 1.4878, + "step": 5203 + }, + { + "epoch": 0.2845153971269631, + "grad_norm": 1.6059443950653076, + "learning_rate": 1.7314755459570803e-05, + "loss": 1.4463, + "step": 5204 + }, + { + "epoch": 0.28457006957068465, + "grad_norm": 1.5138858556747437, + "learning_rate": 1.7313509321583066e-05, + "loss": 1.6411, + "step": 5205 + }, + { + "epoch": 0.2846247420144062, + "grad_norm": 1.5917574167251587, + "learning_rate": 1.7312262939380094e-05, + "loss": 1.517, + "step": 5206 + }, + { + "epoch": 0.2846794144581277, + "grad_norm": 1.5216931104660034, + "learning_rate": 1.73110163130035e-05, + "loss": 1.5296, + "step": 5207 + }, + { + "epoch": 0.2847340869018493, + "grad_norm": 1.5826369524002075, + "learning_rate": 1.7309769442494918e-05, + "loss": 1.4898, + "step": 5208 + }, + { + "epoch": 0.28478875934557085, + "grad_norm": 1.647473931312561, + "learning_rate": 1.7308522327895984e-05, + "loss": 1.4539, + "step": 5209 + }, + { + "epoch": 0.2848434317892924, + "grad_norm": 1.4960181713104248, + "learning_rate": 1.7307274969248334e-05, + "loss": 1.3781, + "step": 5210 + }, + { + "epoch": 0.284898104233014, + "grad_norm": 1.4147268533706665, + "learning_rate": 1.7306027366593627e-05, + "loss": 1.5316, + "step": 5211 + }, + { + "epoch": 0.2849527766767355, + "grad_norm": 1.3678483963012695, + "learning_rate": 1.7304779519973526e-05, + "loss": 1.5097, + "step": 5212 + }, + { + "epoch": 0.28500744912045706, + "grad_norm": 1.236788034439087, + "learning_rate": 1.730353142942969e-05, + "loss": 1.5463, + "step": 5213 + }, + { + "epoch": 0.2850621215641786, + "grad_norm": 1.247601866722107, + "learning_rate": 1.7302283095003807e-05, + "loss": 1.5216, + "step": 5214 + }, + { + "epoch": 0.2851167940079002, + "grad_norm": 1.497256875038147, + "learning_rate": 1.730103451673755e-05, + "loss": 1.2967, + "step": 5215 + }, + { + "epoch": 0.2851714664516217, + "grad_norm": 1.5204477310180664, + "learning_rate": 1.7299785694672624e-05, + "loss": 1.5408, + "step": 5216 + }, + { + "epoch": 0.28522613889534326, + "grad_norm": 1.681789755821228, + "learning_rate": 1.729853662885072e-05, + "loss": 1.4108, + "step": 5217 + }, + { + "epoch": 0.28528081133906485, + "grad_norm": 2.6966536045074463, + "learning_rate": 1.7297287319313554e-05, + "loss": 1.3288, + "step": 5218 + }, + { + "epoch": 0.2853354837827864, + "grad_norm": 1.5164330005645752, + "learning_rate": 1.7296037766102842e-05, + "loss": 1.2099, + "step": 5219 + }, + { + "epoch": 0.28539015622650793, + "grad_norm": 1.8428916931152344, + "learning_rate": 1.7294787969260303e-05, + "loss": 1.3225, + "step": 5220 + }, + { + "epoch": 0.28544482867022947, + "grad_norm": 1.4793609380722046, + "learning_rate": 1.729353792882768e-05, + "loss": 1.2042, + "step": 5221 + }, + { + "epoch": 0.28549950111395106, + "grad_norm": 1.8406444787979126, + "learning_rate": 1.729228764484671e-05, + "loss": 1.618, + "step": 5222 + }, + { + "epoch": 0.2855541735576726, + "grad_norm": 1.8336514234542847, + "learning_rate": 1.7291037117359144e-05, + "loss": 1.4451, + "step": 5223 + }, + { + "epoch": 0.28560884600139413, + "grad_norm": 1.9544858932495117, + "learning_rate": 1.728978634640674e-05, + "loss": 1.3028, + "step": 5224 + }, + { + "epoch": 0.2856635184451157, + "grad_norm": 1.5304784774780273, + "learning_rate": 1.7288535332031262e-05, + "loss": 1.5709, + "step": 5225 + }, + { + "epoch": 0.28571819088883726, + "grad_norm": 1.5113062858581543, + "learning_rate": 1.7287284074274485e-05, + "loss": 1.4444, + "step": 5226 + }, + { + "epoch": 0.2857728633325588, + "grad_norm": 1.5341874361038208, + "learning_rate": 1.7286032573178198e-05, + "loss": 1.5525, + "step": 5227 + }, + { + "epoch": 0.28582753577628034, + "grad_norm": 1.7811503410339355, + "learning_rate": 1.728478082878418e-05, + "loss": 1.2914, + "step": 5228 + }, + { + "epoch": 0.28588220822000193, + "grad_norm": 1.9049516916275024, + "learning_rate": 1.7283528841134242e-05, + "loss": 1.446, + "step": 5229 + }, + { + "epoch": 0.28593688066372347, + "grad_norm": 1.225527048110962, + "learning_rate": 1.7282276610270183e-05, + "loss": 1.4564, + "step": 5230 + }, + { + "epoch": 0.285991553107445, + "grad_norm": 1.5018603801727295, + "learning_rate": 1.7281024136233816e-05, + "loss": 1.3446, + "step": 5231 + }, + { + "epoch": 0.2860462255511666, + "grad_norm": 1.3645075559616089, + "learning_rate": 1.727977141906697e-05, + "loss": 1.603, + "step": 5232 + }, + { + "epoch": 0.28610089799488814, + "grad_norm": 1.382973313331604, + "learning_rate": 1.7278518458811472e-05, + "loss": 1.3831, + "step": 5233 + }, + { + "epoch": 0.2861555704386097, + "grad_norm": 1.8394508361816406, + "learning_rate": 1.7277265255509165e-05, + "loss": 1.5361, + "step": 5234 + }, + { + "epoch": 0.2862102428823312, + "grad_norm": 1.4726653099060059, + "learning_rate": 1.7276011809201896e-05, + "loss": 1.3794, + "step": 5235 + }, + { + "epoch": 0.2862649153260528, + "grad_norm": 1.4538298845291138, + "learning_rate": 1.7274758119931515e-05, + "loss": 1.062, + "step": 5236 + }, + { + "epoch": 0.28631958776977434, + "grad_norm": 1.532543659210205, + "learning_rate": 1.7273504187739893e-05, + "loss": 1.4272, + "step": 5237 + }, + { + "epoch": 0.2863742602134959, + "grad_norm": 1.1752893924713135, + "learning_rate": 1.7272250012668896e-05, + "loss": 1.4826, + "step": 5238 + }, + { + "epoch": 0.28642893265721747, + "grad_norm": 1.612038254737854, + "learning_rate": 1.7270995594760407e-05, + "loss": 1.4051, + "step": 5239 + }, + { + "epoch": 0.286483605100939, + "grad_norm": 2.086984395980835, + "learning_rate": 1.7269740934056317e-05, + "loss": 1.31, + "step": 5240 + }, + { + "epoch": 0.28653827754466055, + "grad_norm": 1.3735805749893188, + "learning_rate": 1.726848603059851e-05, + "loss": 1.7202, + "step": 5241 + }, + { + "epoch": 0.2865929499883821, + "grad_norm": 1.3898391723632812, + "learning_rate": 1.7267230884428905e-05, + "loss": 1.7113, + "step": 5242 + }, + { + "epoch": 0.2866476224321037, + "grad_norm": 1.5084127187728882, + "learning_rate": 1.7265975495589408e-05, + "loss": 1.8265, + "step": 5243 + }, + { + "epoch": 0.2867022948758252, + "grad_norm": 1.3697961568832397, + "learning_rate": 1.7264719864121935e-05, + "loss": 1.2669, + "step": 5244 + }, + { + "epoch": 0.28675696731954675, + "grad_norm": 1.3332571983337402, + "learning_rate": 1.726346399006842e-05, + "loss": 1.6134, + "step": 5245 + }, + { + "epoch": 0.28681163976326834, + "grad_norm": 1.813918948173523, + "learning_rate": 1.72622078734708e-05, + "loss": 1.4077, + "step": 5246 + }, + { + "epoch": 0.2868663122069899, + "grad_norm": 1.590884804725647, + "learning_rate": 1.7260951514371015e-05, + "loss": 1.6319, + "step": 5247 + }, + { + "epoch": 0.2869209846507114, + "grad_norm": 1.86717689037323, + "learning_rate": 1.7259694912811022e-05, + "loss": 1.4644, + "step": 5248 + }, + { + "epoch": 0.28697565709443296, + "grad_norm": 2.9196102619171143, + "learning_rate": 1.725843806883278e-05, + "loss": 1.2296, + "step": 5249 + }, + { + "epoch": 0.28703032953815455, + "grad_norm": 1.3776739835739136, + "learning_rate": 1.7257180982478256e-05, + "loss": 1.6226, + "step": 5250 + }, + { + "epoch": 0.2870850019818761, + "grad_norm": 1.604300618171692, + "learning_rate": 1.7255923653789436e-05, + "loss": 1.5203, + "step": 5251 + }, + { + "epoch": 0.2871396744255976, + "grad_norm": 1.8565343618392944, + "learning_rate": 1.7254666082808292e-05, + "loss": 1.565, + "step": 5252 + }, + { + "epoch": 0.2871943468693192, + "grad_norm": 1.2831965684890747, + "learning_rate": 1.725340826957683e-05, + "loss": 1.4038, + "step": 5253 + }, + { + "epoch": 0.28724901931304075, + "grad_norm": 1.7546565532684326, + "learning_rate": 1.725215021413704e-05, + "loss": 1.5218, + "step": 5254 + }, + { + "epoch": 0.2873036917567623, + "grad_norm": 1.7480356693267822, + "learning_rate": 1.725089191653094e-05, + "loss": 1.2852, + "step": 5255 + }, + { + "epoch": 0.28735836420048383, + "grad_norm": 1.5411577224731445, + "learning_rate": 1.7249633376800542e-05, + "loss": 1.4254, + "step": 5256 + }, + { + "epoch": 0.2874130366442054, + "grad_norm": 1.0672607421875, + "learning_rate": 1.7248374594987873e-05, + "loss": 1.6702, + "step": 5257 + }, + { + "epoch": 0.28746770908792696, + "grad_norm": 1.8300222158432007, + "learning_rate": 1.724711557113497e-05, + "loss": 1.3185, + "step": 5258 + }, + { + "epoch": 0.2875223815316485, + "grad_norm": 1.5205214023590088, + "learning_rate": 1.724585630528387e-05, + "loss": 1.4615, + "step": 5259 + }, + { + "epoch": 0.2875770539753701, + "grad_norm": 1.4770574569702148, + "learning_rate": 1.7244596797476627e-05, + "loss": 1.6229, + "step": 5260 + }, + { + "epoch": 0.2876317264190916, + "grad_norm": 1.4049712419509888, + "learning_rate": 1.7243337047755297e-05, + "loss": 1.3777, + "step": 5261 + }, + { + "epoch": 0.28768639886281316, + "grad_norm": 1.8102561235427856, + "learning_rate": 1.7242077056161943e-05, + "loss": 1.5297, + "step": 5262 + }, + { + "epoch": 0.2877410713065347, + "grad_norm": 1.458398461341858, + "learning_rate": 1.7240816822738646e-05, + "loss": 1.3789, + "step": 5263 + }, + { + "epoch": 0.2877957437502563, + "grad_norm": 1.510000228881836, + "learning_rate": 1.723955634752748e-05, + "loss": 1.2665, + "step": 5264 + }, + { + "epoch": 0.28785041619397783, + "grad_norm": 1.4045240879058838, + "learning_rate": 1.7238295630570544e-05, + "loss": 1.7093, + "step": 5265 + }, + { + "epoch": 0.28790508863769937, + "grad_norm": 1.4863778352737427, + "learning_rate": 1.723703467190993e-05, + "loss": 1.6425, + "step": 5266 + }, + { + "epoch": 0.28795976108142096, + "grad_norm": 1.4876694679260254, + "learning_rate": 1.7235773471587743e-05, + "loss": 1.7345, + "step": 5267 + }, + { + "epoch": 0.2880144335251425, + "grad_norm": 1.8033140897750854, + "learning_rate": 1.7234512029646104e-05, + "loss": 1.446, + "step": 5268 + }, + { + "epoch": 0.28806910596886404, + "grad_norm": 1.3633077144622803, + "learning_rate": 1.7233250346127132e-05, + "loss": 1.4431, + "step": 5269 + }, + { + "epoch": 0.2881237784125856, + "grad_norm": 1.3082131147384644, + "learning_rate": 1.7231988421072957e-05, + "loss": 1.4714, + "step": 5270 + }, + { + "epoch": 0.28817845085630717, + "grad_norm": 1.5578949451446533, + "learning_rate": 1.723072625452572e-05, + "loss": 1.2578, + "step": 5271 + }, + { + "epoch": 0.2882331233000287, + "grad_norm": 1.3440520763397217, + "learning_rate": 1.722946384652756e-05, + "loss": 1.4249, + "step": 5272 + }, + { + "epoch": 0.28828779574375024, + "grad_norm": 1.2832034826278687, + "learning_rate": 1.7228201197120642e-05, + "loss": 1.9026, + "step": 5273 + }, + { + "epoch": 0.28834246818747183, + "grad_norm": 1.7910791635513306, + "learning_rate": 1.7226938306347124e-05, + "loss": 1.4111, + "step": 5274 + }, + { + "epoch": 0.28839714063119337, + "grad_norm": 1.3206233978271484, + "learning_rate": 1.722567517424918e-05, + "loss": 1.2453, + "step": 5275 + }, + { + "epoch": 0.2884518130749149, + "grad_norm": 1.6563243865966797, + "learning_rate": 1.722441180086898e-05, + "loss": 1.4813, + "step": 5276 + }, + { + "epoch": 0.28850648551863645, + "grad_norm": 1.4043593406677246, + "learning_rate": 1.722314818624872e-05, + "loss": 1.3986, + "step": 5277 + }, + { + "epoch": 0.28856115796235804, + "grad_norm": 1.5318083763122559, + "learning_rate": 1.7221884330430593e-05, + "loss": 1.5432, + "step": 5278 + }, + { + "epoch": 0.2886158304060796, + "grad_norm": 1.5324938297271729, + "learning_rate": 1.7220620233456806e-05, + "loss": 1.8016, + "step": 5279 + }, + { + "epoch": 0.2886705028498011, + "grad_norm": 1.4375145435333252, + "learning_rate": 1.721935589536956e-05, + "loss": 1.4731, + "step": 5280 + }, + { + "epoch": 0.2887251752935227, + "grad_norm": 1.1910687685012817, + "learning_rate": 1.7218091316211083e-05, + "loss": 1.5661, + "step": 5281 + }, + { + "epoch": 0.28877984773724424, + "grad_norm": 1.4195305109024048, + "learning_rate": 1.7216826496023594e-05, + "loss": 1.5552, + "step": 5282 + }, + { + "epoch": 0.2888345201809658, + "grad_norm": 1.3484371900558472, + "learning_rate": 1.721556143484934e-05, + "loss": 1.4793, + "step": 5283 + }, + { + "epoch": 0.2888891926246873, + "grad_norm": 1.6740978956222534, + "learning_rate": 1.7214296132730555e-05, + "loss": 1.365, + "step": 5284 + }, + { + "epoch": 0.2889438650684089, + "grad_norm": 1.5386719703674316, + "learning_rate": 1.7213030589709493e-05, + "loss": 1.5652, + "step": 5285 + }, + { + "epoch": 0.28899853751213045, + "grad_norm": 1.2297016382217407, + "learning_rate": 1.7211764805828415e-05, + "loss": 1.4782, + "step": 5286 + }, + { + "epoch": 0.289053209955852, + "grad_norm": 1.4507920742034912, + "learning_rate": 1.7210498781129585e-05, + "loss": 1.381, + "step": 5287 + }, + { + "epoch": 0.2891078823995736, + "grad_norm": 1.4592727422714233, + "learning_rate": 1.7209232515655283e-05, + "loss": 1.5498, + "step": 5288 + }, + { + "epoch": 0.2891625548432951, + "grad_norm": 1.2815667390823364, + "learning_rate": 1.720796600944779e-05, + "loss": 1.5269, + "step": 5289 + }, + { + "epoch": 0.28921722728701665, + "grad_norm": 1.8441786766052246, + "learning_rate": 1.7206699262549395e-05, + "loss": 1.3523, + "step": 5290 + }, + { + "epoch": 0.2892718997307382, + "grad_norm": 1.4590691328048706, + "learning_rate": 1.7205432275002403e-05, + "loss": 1.4426, + "step": 5291 + }, + { + "epoch": 0.2893265721744598, + "grad_norm": 2.107696533203125, + "learning_rate": 1.720416504684912e-05, + "loss": 1.6604, + "step": 5292 + }, + { + "epoch": 0.2893812446181813, + "grad_norm": 1.3328591585159302, + "learning_rate": 1.7202897578131858e-05, + "loss": 1.3566, + "step": 5293 + }, + { + "epoch": 0.28943591706190286, + "grad_norm": 1.4846850633621216, + "learning_rate": 1.7201629868892947e-05, + "loss": 1.4166, + "step": 5294 + }, + { + "epoch": 0.28949058950562445, + "grad_norm": 1.8964117765426636, + "learning_rate": 1.7200361919174715e-05, + "loss": 1.3644, + "step": 5295 + }, + { + "epoch": 0.289545261949346, + "grad_norm": 1.5416301488876343, + "learning_rate": 1.7199093729019503e-05, + "loss": 1.5936, + "step": 5296 + }, + { + "epoch": 0.2895999343930675, + "grad_norm": 1.5646589994430542, + "learning_rate": 1.7197825298469655e-05, + "loss": 1.5879, + "step": 5297 + }, + { + "epoch": 0.28965460683678906, + "grad_norm": 1.7383743524551392, + "learning_rate": 1.719655662756753e-05, + "loss": 1.443, + "step": 5298 + }, + { + "epoch": 0.28970927928051066, + "grad_norm": 1.3433282375335693, + "learning_rate": 1.7195287716355495e-05, + "loss": 1.3092, + "step": 5299 + }, + { + "epoch": 0.2897639517242322, + "grad_norm": 1.814107060432434, + "learning_rate": 1.719401856487592e-05, + "loss": 1.4947, + "step": 5300 + }, + { + "epoch": 0.28981862416795373, + "grad_norm": 1.4388052225112915, + "learning_rate": 1.7192749173171183e-05, + "loss": 1.3767, + "step": 5301 + }, + { + "epoch": 0.2898732966116753, + "grad_norm": 1.5879170894622803, + "learning_rate": 1.7191479541283668e-05, + "loss": 1.7454, + "step": 5302 + }, + { + "epoch": 0.28992796905539686, + "grad_norm": 2.115110397338867, + "learning_rate": 1.719020966925578e-05, + "loss": 1.4156, + "step": 5303 + }, + { + "epoch": 0.2899826414991184, + "grad_norm": 1.2624176740646362, + "learning_rate": 1.7188939557129918e-05, + "loss": 1.343, + "step": 5304 + }, + { + "epoch": 0.29003731394283994, + "grad_norm": 1.7163337469100952, + "learning_rate": 1.7187669204948495e-05, + "loss": 1.4878, + "step": 5305 + }, + { + "epoch": 0.29009198638656153, + "grad_norm": 1.3775629997253418, + "learning_rate": 1.718639861275393e-05, + "loss": 1.731, + "step": 5306 + }, + { + "epoch": 0.29014665883028307, + "grad_norm": 1.5687111616134644, + "learning_rate": 1.7185127780588654e-05, + "loss": 1.6079, + "step": 5307 + }, + { + "epoch": 0.2902013312740046, + "grad_norm": 1.6668645143508911, + "learning_rate": 1.7183856708495098e-05, + "loss": 1.4476, + "step": 5308 + }, + { + "epoch": 0.2902560037177262, + "grad_norm": 1.6037665605545044, + "learning_rate": 1.7182585396515712e-05, + "loss": 1.5387, + "step": 5309 + }, + { + "epoch": 0.29031067616144773, + "grad_norm": 1.4272878170013428, + "learning_rate": 1.7181313844692944e-05, + "loss": 1.5286, + "step": 5310 + }, + { + "epoch": 0.29036534860516927, + "grad_norm": 1.2933542728424072, + "learning_rate": 1.7180042053069253e-05, + "loss": 1.3544, + "step": 5311 + }, + { + "epoch": 0.2904200210488908, + "grad_norm": 1.6454694271087646, + "learning_rate": 1.7178770021687113e-05, + "loss": 1.4277, + "step": 5312 + }, + { + "epoch": 0.2904746934926124, + "grad_norm": 1.6040499210357666, + "learning_rate": 1.7177497750588994e-05, + "loss": 1.4666, + "step": 5313 + }, + { + "epoch": 0.29052936593633394, + "grad_norm": 1.3310484886169434, + "learning_rate": 1.717622523981738e-05, + "loss": 1.4023, + "step": 5314 + }, + { + "epoch": 0.2905840383800555, + "grad_norm": 1.4006290435791016, + "learning_rate": 1.7174952489414772e-05, + "loss": 1.4118, + "step": 5315 + }, + { + "epoch": 0.29063871082377707, + "grad_norm": 1.191307544708252, + "learning_rate": 1.717367949942366e-05, + "loss": 1.3983, + "step": 5316 + }, + { + "epoch": 0.2906933832674986, + "grad_norm": 1.3604134321212769, + "learning_rate": 1.7172406269886555e-05, + "loss": 1.5746, + "step": 5317 + }, + { + "epoch": 0.29074805571122014, + "grad_norm": 1.742578148841858, + "learning_rate": 1.717113280084598e-05, + "loss": 1.4282, + "step": 5318 + }, + { + "epoch": 0.2908027281549417, + "grad_norm": 1.532340168952942, + "learning_rate": 1.7169859092344448e-05, + "loss": 1.5447, + "step": 5319 + }, + { + "epoch": 0.2908574005986633, + "grad_norm": 1.4005584716796875, + "learning_rate": 1.71685851444245e-05, + "loss": 1.6341, + "step": 5320 + }, + { + "epoch": 0.2909120730423848, + "grad_norm": 1.88258957862854, + "learning_rate": 1.716731095712867e-05, + "loss": 1.3192, + "step": 5321 + }, + { + "epoch": 0.29096674548610635, + "grad_norm": 1.8597668409347534, + "learning_rate": 1.7166036530499503e-05, + "loss": 1.6405, + "step": 5322 + }, + { + "epoch": 0.29102141792982794, + "grad_norm": 1.5700416564941406, + "learning_rate": 1.716476186457957e-05, + "loss": 1.1837, + "step": 5323 + }, + { + "epoch": 0.2910760903735495, + "grad_norm": 1.5301822423934937, + "learning_rate": 1.7163486959411418e-05, + "loss": 1.3973, + "step": 5324 + }, + { + "epoch": 0.291130762817271, + "grad_norm": 1.6450930833816528, + "learning_rate": 1.7162211815037633e-05, + "loss": 1.2911, + "step": 5325 + }, + { + "epoch": 0.29118543526099255, + "grad_norm": 1.6373695135116577, + "learning_rate": 1.7160936431500785e-05, + "loss": 1.627, + "step": 5326 + }, + { + "epoch": 0.29124010770471415, + "grad_norm": 1.4760476350784302, + "learning_rate": 1.715966080884347e-05, + "loss": 1.3851, + "step": 5327 + }, + { + "epoch": 0.2912947801484357, + "grad_norm": 1.1585437059402466, + "learning_rate": 1.715838494710827e-05, + "loss": 1.6647, + "step": 5328 + }, + { + "epoch": 0.2913494525921572, + "grad_norm": 1.2659400701522827, + "learning_rate": 1.7157108846337812e-05, + "loss": 1.4936, + "step": 5329 + }, + { + "epoch": 0.2914041250358788, + "grad_norm": 1.2723565101623535, + "learning_rate": 1.7155832506574688e-05, + "loss": 1.5281, + "step": 5330 + }, + { + "epoch": 0.29145879747960035, + "grad_norm": 1.2431936264038086, + "learning_rate": 1.7154555927861524e-05, + "loss": 1.3607, + "step": 5331 + }, + { + "epoch": 0.2915134699233219, + "grad_norm": 1.4214951992034912, + "learning_rate": 1.715327911024095e-05, + "loss": 1.3017, + "step": 5332 + }, + { + "epoch": 0.2915681423670434, + "grad_norm": 1.9156768321990967, + "learning_rate": 1.7152002053755604e-05, + "loss": 1.6138, + "step": 5333 + }, + { + "epoch": 0.291622814810765, + "grad_norm": 1.5391680002212524, + "learning_rate": 1.715072475844812e-05, + "loss": 1.4845, + "step": 5334 + }, + { + "epoch": 0.29167748725448656, + "grad_norm": 1.415575385093689, + "learning_rate": 1.7149447224361163e-05, + "loss": 1.7928, + "step": 5335 + }, + { + "epoch": 0.2917321596982081, + "grad_norm": 1.5864871740341187, + "learning_rate": 1.7148169451537385e-05, + "loss": 1.6367, + "step": 5336 + }, + { + "epoch": 0.2917868321419297, + "grad_norm": 1.44679594039917, + "learning_rate": 1.7146891440019456e-05, + "loss": 1.5107, + "step": 5337 + }, + { + "epoch": 0.2918415045856512, + "grad_norm": 1.7464239597320557, + "learning_rate": 1.714561318985005e-05, + "loss": 1.602, + "step": 5338 + }, + { + "epoch": 0.29189617702937276, + "grad_norm": 2.077528715133667, + "learning_rate": 1.714433470107185e-05, + "loss": 1.5861, + "step": 5339 + }, + { + "epoch": 0.2919508494730943, + "grad_norm": 1.3123797178268433, + "learning_rate": 1.714305597372755e-05, + "loss": 1.615, + "step": 5340 + }, + { + "epoch": 0.2920055219168159, + "grad_norm": 1.7153140306472778, + "learning_rate": 1.7141777007859852e-05, + "loss": 1.3974, + "step": 5341 + }, + { + "epoch": 0.29206019436053743, + "grad_norm": 1.7098283767700195, + "learning_rate": 1.7140497803511457e-05, + "loss": 1.4043, + "step": 5342 + }, + { + "epoch": 0.29211486680425897, + "grad_norm": 1.6425241231918335, + "learning_rate": 1.713921836072509e-05, + "loss": 1.4641, + "step": 5343 + }, + { + "epoch": 0.29216953924798056, + "grad_norm": 1.6505047082901, + "learning_rate": 1.7137938679543466e-05, + "loss": 1.5664, + "step": 5344 + }, + { + "epoch": 0.2922242116917021, + "grad_norm": 1.5942789316177368, + "learning_rate": 1.713665876000932e-05, + "loss": 1.4434, + "step": 5345 + }, + { + "epoch": 0.29227888413542363, + "grad_norm": 2.2044713497161865, + "learning_rate": 1.713537860216539e-05, + "loss": 1.3777, + "step": 5346 + }, + { + "epoch": 0.29233355657914517, + "grad_norm": 1.9788199663162231, + "learning_rate": 1.713409820605443e-05, + "loss": 1.2786, + "step": 5347 + }, + { + "epoch": 0.29238822902286676, + "grad_norm": 1.6294422149658203, + "learning_rate": 1.7132817571719185e-05, + "loss": 1.1666, + "step": 5348 + }, + { + "epoch": 0.2924429014665883, + "grad_norm": 1.7352325916290283, + "learning_rate": 1.7131536699202427e-05, + "loss": 1.5324, + "step": 5349 + }, + { + "epoch": 0.29249757391030984, + "grad_norm": 1.746728777885437, + "learning_rate": 1.713025558854692e-05, + "loss": 1.4892, + "step": 5350 + }, + { + "epoch": 0.29255224635403143, + "grad_norm": 1.353890061378479, + "learning_rate": 1.7128974239795448e-05, + "loss": 1.3803, + "step": 5351 + }, + { + "epoch": 0.29260691879775297, + "grad_norm": 1.3510215282440186, + "learning_rate": 1.71276926529908e-05, + "loss": 1.0271, + "step": 5352 + }, + { + "epoch": 0.2926615912414745, + "grad_norm": 1.3844959735870361, + "learning_rate": 1.7126410828175768e-05, + "loss": 1.4605, + "step": 5353 + }, + { + "epoch": 0.29271626368519604, + "grad_norm": 1.4759531021118164, + "learning_rate": 1.7125128765393157e-05, + "loss": 1.3627, + "step": 5354 + }, + { + "epoch": 0.29277093612891764, + "grad_norm": 1.6234434843063354, + "learning_rate": 1.7123846464685774e-05, + "loss": 1.4429, + "step": 5355 + }, + { + "epoch": 0.2928256085726392, + "grad_norm": 1.4657686948776245, + "learning_rate": 1.712256392609644e-05, + "loss": 1.9119, + "step": 5356 + }, + { + "epoch": 0.2928802810163607, + "grad_norm": 1.6655980348587036, + "learning_rate": 1.7121281149667987e-05, + "loss": 1.2944, + "step": 5357 + }, + { + "epoch": 0.2929349534600823, + "grad_norm": 1.775428295135498, + "learning_rate": 1.7119998135443245e-05, + "loss": 1.3875, + "step": 5358 + }, + { + "epoch": 0.29298962590380384, + "grad_norm": 1.2413262128829956, + "learning_rate": 1.7118714883465055e-05, + "loss": 1.646, + "step": 5359 + }, + { + "epoch": 0.2930442983475254, + "grad_norm": 2.1784417629241943, + "learning_rate": 1.711743139377627e-05, + "loss": 1.7406, + "step": 5360 + }, + { + "epoch": 0.2930989707912469, + "grad_norm": 1.4709676504135132, + "learning_rate": 1.7116147666419755e-05, + "loss": 1.3319, + "step": 5361 + }, + { + "epoch": 0.2931536432349685, + "grad_norm": 1.425687313079834, + "learning_rate": 1.7114863701438365e-05, + "loss": 1.3703, + "step": 5362 + }, + { + "epoch": 0.29320831567869005, + "grad_norm": 1.4421452283859253, + "learning_rate": 1.711357949887498e-05, + "loss": 1.3931, + "step": 5363 + }, + { + "epoch": 0.2932629881224116, + "grad_norm": 1.5639671087265015, + "learning_rate": 1.7112295058772487e-05, + "loss": 1.4267, + "step": 5364 + }, + { + "epoch": 0.2933176605661332, + "grad_norm": 1.7330678701400757, + "learning_rate": 1.711101038117377e-05, + "loss": 1.2477, + "step": 5365 + }, + { + "epoch": 0.2933723330098547, + "grad_norm": 1.543703556060791, + "learning_rate": 1.7109725466121734e-05, + "loss": 1.6583, + "step": 5366 + }, + { + "epoch": 0.29342700545357625, + "grad_norm": 1.267654538154602, + "learning_rate": 1.7108440313659275e-05, + "loss": 1.5885, + "step": 5367 + }, + { + "epoch": 0.2934816778972978, + "grad_norm": 1.2994258403778076, + "learning_rate": 1.7107154923829317e-05, + "loss": 1.5896, + "step": 5368 + }, + { + "epoch": 0.2935363503410194, + "grad_norm": 1.712193250656128, + "learning_rate": 1.710586929667478e-05, + "loss": 1.5076, + "step": 5369 + }, + { + "epoch": 0.2935910227847409, + "grad_norm": 2.067056179046631, + "learning_rate": 1.7104583432238588e-05, + "loss": 1.1891, + "step": 5370 + }, + { + "epoch": 0.29364569522846246, + "grad_norm": 1.5311686992645264, + "learning_rate": 1.710329733056369e-05, + "loss": 1.4618, + "step": 5371 + }, + { + "epoch": 0.29370036767218405, + "grad_norm": 1.362605094909668, + "learning_rate": 1.710201099169302e-05, + "loss": 1.622, + "step": 5372 + }, + { + "epoch": 0.2937550401159056, + "grad_norm": 1.4364181756973267, + "learning_rate": 1.710072441566954e-05, + "loss": 1.7034, + "step": 5373 + }, + { + "epoch": 0.2938097125596271, + "grad_norm": 1.6302475929260254, + "learning_rate": 1.7099437602536208e-05, + "loss": 1.5782, + "step": 5374 + }, + { + "epoch": 0.29386438500334866, + "grad_norm": 1.2982937097549438, + "learning_rate": 1.7098150552335997e-05, + "loss": 1.6211, + "step": 5375 + }, + { + "epoch": 0.29391905744707025, + "grad_norm": 1.5365110635757446, + "learning_rate": 1.709686326511188e-05, + "loss": 1.657, + "step": 5376 + }, + { + "epoch": 0.2939737298907918, + "grad_norm": 1.2122807502746582, + "learning_rate": 1.709557574090685e-05, + "loss": 1.6139, + "step": 5377 + }, + { + "epoch": 0.29402840233451333, + "grad_norm": 1.7863154411315918, + "learning_rate": 1.7094287979763892e-05, + "loss": 1.3414, + "step": 5378 + }, + { + "epoch": 0.2940830747782349, + "grad_norm": 1.4522157907485962, + "learning_rate": 1.7092999981726013e-05, + "loss": 1.4275, + "step": 5379 + }, + { + "epoch": 0.29413774722195646, + "grad_norm": 1.8528807163238525, + "learning_rate": 1.7091711746836218e-05, + "loss": 1.3553, + "step": 5380 + }, + { + "epoch": 0.294192419665678, + "grad_norm": 1.8895028829574585, + "learning_rate": 1.709042327513753e-05, + "loss": 1.4586, + "step": 5381 + }, + { + "epoch": 0.29424709210939953, + "grad_norm": 1.6861988306045532, + "learning_rate": 1.708913456667297e-05, + "loss": 1.4842, + "step": 5382 + }, + { + "epoch": 0.2943017645531211, + "grad_norm": 1.415016531944275, + "learning_rate": 1.708784562148557e-05, + "loss": 1.518, + "step": 5383 + }, + { + "epoch": 0.29435643699684266, + "grad_norm": 1.4338703155517578, + "learning_rate": 1.7086556439618373e-05, + "loss": 1.374, + "step": 5384 + }, + { + "epoch": 0.2944111094405642, + "grad_norm": 1.795968770980835, + "learning_rate": 1.7085267021114424e-05, + "loss": 1.5594, + "step": 5385 + }, + { + "epoch": 0.2944657818842858, + "grad_norm": 1.7032129764556885, + "learning_rate": 1.7083977366016785e-05, + "loss": 1.6089, + "step": 5386 + }, + { + "epoch": 0.29452045432800733, + "grad_norm": 1.705583095550537, + "learning_rate": 1.7082687474368523e-05, + "loss": 1.3184, + "step": 5387 + }, + { + "epoch": 0.29457512677172887, + "grad_norm": 1.57439386844635, + "learning_rate": 1.7081397346212703e-05, + "loss": 1.3901, + "step": 5388 + }, + { + "epoch": 0.2946297992154504, + "grad_norm": 1.316441297531128, + "learning_rate": 1.7080106981592407e-05, + "loss": 1.3394, + "step": 5389 + }, + { + "epoch": 0.294684471659172, + "grad_norm": 1.531137228012085, + "learning_rate": 1.7078816380550728e-05, + "loss": 1.392, + "step": 5390 + }, + { + "epoch": 0.29473914410289354, + "grad_norm": 1.2375391721725464, + "learning_rate": 1.707752554313076e-05, + "loss": 1.5269, + "step": 5391 + }, + { + "epoch": 0.2947938165466151, + "grad_norm": 1.7312525510787964, + "learning_rate": 1.7076234469375604e-05, + "loss": 1.3873, + "step": 5392 + }, + { + "epoch": 0.29484848899033667, + "grad_norm": 1.2860374450683594, + "learning_rate": 1.707494315932837e-05, + "loss": 1.6035, + "step": 5393 + }, + { + "epoch": 0.2949031614340582, + "grad_norm": 1.4402450323104858, + "learning_rate": 1.7073651613032186e-05, + "loss": 1.4326, + "step": 5394 + }, + { + "epoch": 0.29495783387777974, + "grad_norm": 1.3896201848983765, + "learning_rate": 1.7072359830530178e-05, + "loss": 1.6184, + "step": 5395 + }, + { + "epoch": 0.2950125063215013, + "grad_norm": 1.3099290132522583, + "learning_rate": 1.7071067811865477e-05, + "loss": 1.5473, + "step": 5396 + }, + { + "epoch": 0.29506717876522287, + "grad_norm": 1.136452555656433, + "learning_rate": 1.706977555708123e-05, + "loss": 1.6032, + "step": 5397 + }, + { + "epoch": 0.2951218512089444, + "grad_norm": 1.5190069675445557, + "learning_rate": 1.7068483066220586e-05, + "loss": 1.466, + "step": 5398 + }, + { + "epoch": 0.29517652365266595, + "grad_norm": 1.7533761262893677, + "learning_rate": 1.7067190339326705e-05, + "loss": 1.5101, + "step": 5399 + }, + { + "epoch": 0.29523119609638754, + "grad_norm": 1.3980696201324463, + "learning_rate": 1.7065897376442757e-05, + "loss": 1.4377, + "step": 5400 + }, + { + "epoch": 0.2952858685401091, + "grad_norm": 1.7723551988601685, + "learning_rate": 1.7064604177611913e-05, + "loss": 1.5786, + "step": 5401 + }, + { + "epoch": 0.2953405409838306, + "grad_norm": 1.3341960906982422, + "learning_rate": 1.7063310742877362e-05, + "loss": 1.3209, + "step": 5402 + }, + { + "epoch": 0.2953952134275522, + "grad_norm": 1.5503188371658325, + "learning_rate": 1.7062017072282285e-05, + "loss": 1.3998, + "step": 5403 + }, + { + "epoch": 0.29544988587127374, + "grad_norm": 1.4386614561080933, + "learning_rate": 1.7060723165869892e-05, + "loss": 1.4157, + "step": 5404 + }, + { + "epoch": 0.2955045583149953, + "grad_norm": 1.3649227619171143, + "learning_rate": 1.7059429023683384e-05, + "loss": 1.4066, + "step": 5405 + }, + { + "epoch": 0.2955592307587168, + "grad_norm": 1.4617661237716675, + "learning_rate": 1.705813464576597e-05, + "loss": 1.5919, + "step": 5406 + }, + { + "epoch": 0.2956139032024384, + "grad_norm": 1.5470764636993408, + "learning_rate": 1.705684003216088e-05, + "loss": 1.4445, + "step": 5407 + }, + { + "epoch": 0.29566857564615995, + "grad_norm": 1.3186246156692505, + "learning_rate": 1.7055545182911343e-05, + "loss": 1.4227, + "step": 5408 + }, + { + "epoch": 0.2957232480898815, + "grad_norm": 1.3920735120773315, + "learning_rate": 1.7054250098060598e-05, + "loss": 1.367, + "step": 5409 + }, + { + "epoch": 0.2957779205336031, + "grad_norm": 1.6361322402954102, + "learning_rate": 1.705295477765188e-05, + "loss": 1.4616, + "step": 5410 + }, + { + "epoch": 0.2958325929773246, + "grad_norm": 2.1639504432678223, + "learning_rate": 1.705165922172846e-05, + "loss": 1.2952, + "step": 5411 + }, + { + "epoch": 0.29588726542104615, + "grad_norm": 1.8876866102218628, + "learning_rate": 1.705036343033359e-05, + "loss": 1.3841, + "step": 5412 + }, + { + "epoch": 0.2959419378647677, + "grad_norm": 1.443161129951477, + "learning_rate": 1.7049067403510544e-05, + "loss": 1.5587, + "step": 5413 + }, + { + "epoch": 0.2959966103084893, + "grad_norm": 1.3831619024276733, + "learning_rate": 1.704777114130259e-05, + "loss": 1.4591, + "step": 5414 + }, + { + "epoch": 0.2960512827522108, + "grad_norm": 1.6881035566329956, + "learning_rate": 1.7046474643753018e-05, + "loss": 1.612, + "step": 5415 + }, + { + "epoch": 0.29610595519593236, + "grad_norm": 1.4137247800827026, + "learning_rate": 1.7045177910905128e-05, + "loss": 1.4526, + "step": 5416 + }, + { + "epoch": 0.29616062763965395, + "grad_norm": 1.9286762475967407, + "learning_rate": 1.7043880942802212e-05, + "loss": 1.4698, + "step": 5417 + }, + { + "epoch": 0.2962153000833755, + "grad_norm": 1.485256314277649, + "learning_rate": 1.7042583739487585e-05, + "loss": 1.5341, + "step": 5418 + }, + { + "epoch": 0.296269972527097, + "grad_norm": 1.7546002864837646, + "learning_rate": 1.7041286301004563e-05, + "loss": 1.5755, + "step": 5419 + }, + { + "epoch": 0.29632464497081856, + "grad_norm": 1.508816123008728, + "learning_rate": 1.7039988627396464e-05, + "loss": 1.5042, + "step": 5420 + }, + { + "epoch": 0.29637931741454016, + "grad_norm": 1.2304377555847168, + "learning_rate": 1.703869071870663e-05, + "loss": 1.4875, + "step": 5421 + }, + { + "epoch": 0.2964339898582617, + "grad_norm": 1.717263102531433, + "learning_rate": 1.703739257497839e-05, + "loss": 1.4491, + "step": 5422 + }, + { + "epoch": 0.29648866230198323, + "grad_norm": 2.2719836235046387, + "learning_rate": 1.7036094196255103e-05, + "loss": 1.4047, + "step": 5423 + }, + { + "epoch": 0.2965433347457048, + "grad_norm": 1.3843705654144287, + "learning_rate": 1.7034795582580118e-05, + "loss": 1.4759, + "step": 5424 + }, + { + "epoch": 0.29659800718942636, + "grad_norm": 1.6136424541473389, + "learning_rate": 1.7033496733996798e-05, + "loss": 1.2842, + "step": 5425 + }, + { + "epoch": 0.2966526796331479, + "grad_norm": 1.4518707990646362, + "learning_rate": 1.703219765054852e-05, + "loss": 1.6596, + "step": 5426 + }, + { + "epoch": 0.29670735207686944, + "grad_norm": 1.7041376829147339, + "learning_rate": 1.7030898332278663e-05, + "loss": 1.3456, + "step": 5427 + }, + { + "epoch": 0.29676202452059103, + "grad_norm": 1.6100465059280396, + "learning_rate": 1.702959877923061e-05, + "loss": 1.5377, + "step": 5428 + }, + { + "epoch": 0.29681669696431257, + "grad_norm": 1.6900883913040161, + "learning_rate": 1.702829899144776e-05, + "loss": 1.2599, + "step": 5429 + }, + { + "epoch": 0.2968713694080341, + "grad_norm": 1.517014503479004, + "learning_rate": 1.702699896897351e-05, + "loss": 1.5581, + "step": 5430 + }, + { + "epoch": 0.2969260418517557, + "grad_norm": 1.8059265613555908, + "learning_rate": 1.7025698711851283e-05, + "loss": 1.3676, + "step": 5431 + }, + { + "epoch": 0.29698071429547723, + "grad_norm": 1.7643961906433105, + "learning_rate": 1.7024398220124483e-05, + "loss": 1.3094, + "step": 5432 + }, + { + "epoch": 0.29703538673919877, + "grad_norm": 1.6676586866378784, + "learning_rate": 1.7023097493836544e-05, + "loss": 1.5777, + "step": 5433 + }, + { + "epoch": 0.2970900591829203, + "grad_norm": 1.784808874130249, + "learning_rate": 1.70217965330309e-05, + "loss": 1.303, + "step": 5434 + }, + { + "epoch": 0.2971447316266419, + "grad_norm": 1.346002459526062, + "learning_rate": 1.7020495337750997e-05, + "loss": 1.5016, + "step": 5435 + }, + { + "epoch": 0.29719940407036344, + "grad_norm": 1.1269357204437256, + "learning_rate": 1.7019193908040274e-05, + "loss": 1.7394, + "step": 5436 + }, + { + "epoch": 0.297254076514085, + "grad_norm": 1.7461459636688232, + "learning_rate": 1.7017892243942195e-05, + "loss": 1.662, + "step": 5437 + }, + { + "epoch": 0.29730874895780657, + "grad_norm": 1.388379693031311, + "learning_rate": 1.701659034550023e-05, + "loss": 1.2933, + "step": 5438 + }, + { + "epoch": 0.2973634214015281, + "grad_norm": 1.4495735168457031, + "learning_rate": 1.7015288212757848e-05, + "loss": 1.2918, + "step": 5439 + }, + { + "epoch": 0.29741809384524964, + "grad_norm": 1.537166714668274, + "learning_rate": 1.701398584575853e-05, + "loss": 1.5861, + "step": 5440 + }, + { + "epoch": 0.2974727662889712, + "grad_norm": 1.5560827255249023, + "learning_rate": 1.7012683244545768e-05, + "loss": 1.4004, + "step": 5441 + }, + { + "epoch": 0.2975274387326928, + "grad_norm": 1.9470210075378418, + "learning_rate": 1.7011380409163053e-05, + "loss": 1.4914, + "step": 5442 + }, + { + "epoch": 0.2975821111764143, + "grad_norm": 1.9175169467926025, + "learning_rate": 1.7010077339653895e-05, + "loss": 1.4326, + "step": 5443 + }, + { + "epoch": 0.29763678362013585, + "grad_norm": 1.842829704284668, + "learning_rate": 1.7008774036061802e-05, + "loss": 1.6308, + "step": 5444 + }, + { + "epoch": 0.29769145606385744, + "grad_norm": 1.553868055343628, + "learning_rate": 1.70074704984303e-05, + "loss": 1.4628, + "step": 5445 + }, + { + "epoch": 0.297746128507579, + "grad_norm": 1.5516610145568848, + "learning_rate": 1.7006166726802916e-05, + "loss": 1.5311, + "step": 5446 + }, + { + "epoch": 0.2978008009513005, + "grad_norm": 1.2652524709701538, + "learning_rate": 1.7004862721223184e-05, + "loss": 1.5721, + "step": 5447 + }, + { + "epoch": 0.29785547339502205, + "grad_norm": 1.6104692220687866, + "learning_rate": 1.7003558481734647e-05, + "loss": 1.3046, + "step": 5448 + }, + { + "epoch": 0.29791014583874365, + "grad_norm": 1.6537868976593018, + "learning_rate": 1.7002254008380858e-05, + "loss": 1.6191, + "step": 5449 + }, + { + "epoch": 0.2979648182824652, + "grad_norm": 1.8773659467697144, + "learning_rate": 1.7000949301205376e-05, + "loss": 1.4261, + "step": 5450 + }, + { + "epoch": 0.2980194907261867, + "grad_norm": 1.515204906463623, + "learning_rate": 1.6999644360251772e-05, + "loss": 1.4652, + "step": 5451 + }, + { + "epoch": 0.2980741631699083, + "grad_norm": 1.3565778732299805, + "learning_rate": 1.6998339185563614e-05, + "loss": 1.3052, + "step": 5452 + }, + { + "epoch": 0.29812883561362985, + "grad_norm": 1.456555962562561, + "learning_rate": 1.699703377718449e-05, + "loss": 1.2577, + "step": 5453 + }, + { + "epoch": 0.2981835080573514, + "grad_norm": 1.764095664024353, + "learning_rate": 1.699572813515799e-05, + "loss": 1.1888, + "step": 5454 + }, + { + "epoch": 0.2982381805010729, + "grad_norm": 2.6477932929992676, + "learning_rate": 1.6994422259527708e-05, + "loss": 1.2658, + "step": 5455 + }, + { + "epoch": 0.2982928529447945, + "grad_norm": 1.4848084449768066, + "learning_rate": 1.6993116150337258e-05, + "loss": 1.1832, + "step": 5456 + }, + { + "epoch": 0.29834752538851605, + "grad_norm": 1.4513468742370605, + "learning_rate": 1.6991809807630245e-05, + "loss": 1.5782, + "step": 5457 + }, + { + "epoch": 0.2984021978322376, + "grad_norm": 1.9920011758804321, + "learning_rate": 1.69905032314503e-05, + "loss": 1.3289, + "step": 5458 + }, + { + "epoch": 0.2984568702759592, + "grad_norm": 1.6251457929611206, + "learning_rate": 1.6989196421841045e-05, + "loss": 1.4835, + "step": 5459 + }, + { + "epoch": 0.2985115427196807, + "grad_norm": 1.6049796342849731, + "learning_rate": 1.698788937884612e-05, + "loss": 1.2966, + "step": 5460 + }, + { + "epoch": 0.29856621516340226, + "grad_norm": 1.428581953048706, + "learning_rate": 1.6986582102509176e-05, + "loss": 0.9857, + "step": 5461 + }, + { + "epoch": 0.2986208876071238, + "grad_norm": 1.4413516521453857, + "learning_rate": 1.698527459287386e-05, + "loss": 1.1842, + "step": 5462 + }, + { + "epoch": 0.2986755600508454, + "grad_norm": 1.5474239587783813, + "learning_rate": 1.6983966849983833e-05, + "loss": 1.7064, + "step": 5463 + }, + { + "epoch": 0.2987302324945669, + "grad_norm": 1.8706676959991455, + "learning_rate": 1.6982658873882764e-05, + "loss": 1.5443, + "step": 5464 + }, + { + "epoch": 0.29878490493828846, + "grad_norm": 1.4367536306381226, + "learning_rate": 1.6981350664614332e-05, + "loss": 1.4023, + "step": 5465 + }, + { + "epoch": 0.29883957738201006, + "grad_norm": 1.2892415523529053, + "learning_rate": 1.6980042222222216e-05, + "loss": 1.8747, + "step": 5466 + }, + { + "epoch": 0.2988942498257316, + "grad_norm": 1.7435332536697388, + "learning_rate": 1.6978733546750112e-05, + "loss": 1.5247, + "step": 5467 + }, + { + "epoch": 0.29894892226945313, + "grad_norm": 1.1983790397644043, + "learning_rate": 1.697742463824172e-05, + "loss": 1.6803, + "step": 5468 + }, + { + "epoch": 0.29900359471317467, + "grad_norm": 1.6477329730987549, + "learning_rate": 1.6976115496740747e-05, + "loss": 1.7715, + "step": 5469 + }, + { + "epoch": 0.29905826715689626, + "grad_norm": 1.5153112411499023, + "learning_rate": 1.6974806122290902e-05, + "loss": 1.5749, + "step": 5470 + }, + { + "epoch": 0.2991129396006178, + "grad_norm": 1.806231141090393, + "learning_rate": 1.6973496514935918e-05, + "loss": 1.5632, + "step": 5471 + }, + { + "epoch": 0.29916761204433934, + "grad_norm": 1.342335820198059, + "learning_rate": 1.6972186674719522e-05, + "loss": 1.4319, + "step": 5472 + }, + { + "epoch": 0.29922228448806093, + "grad_norm": 1.6655807495117188, + "learning_rate": 1.697087660168545e-05, + "loss": 1.3786, + "step": 5473 + }, + { + "epoch": 0.29927695693178247, + "grad_norm": 1.273193597793579, + "learning_rate": 1.6969566295877453e-05, + "loss": 1.4632, + "step": 5474 + }, + { + "epoch": 0.299331629375504, + "grad_norm": 1.4882116317749023, + "learning_rate": 1.6968255757339282e-05, + "loss": 1.6396, + "step": 5475 + }, + { + "epoch": 0.29938630181922554, + "grad_norm": 1.1497023105621338, + "learning_rate": 1.69669449861147e-05, + "loss": 1.468, + "step": 5476 + }, + { + "epoch": 0.29944097426294713, + "grad_norm": 1.5164906978607178, + "learning_rate": 1.6965633982247472e-05, + "loss": 1.4584, + "step": 5477 + }, + { + "epoch": 0.2994956467066687, + "grad_norm": 1.5306200981140137, + "learning_rate": 1.6964322745781386e-05, + "loss": 1.4264, + "step": 5478 + }, + { + "epoch": 0.2995503191503902, + "grad_norm": 1.8118209838867188, + "learning_rate": 1.696301127676022e-05, + "loss": 1.3535, + "step": 5479 + }, + { + "epoch": 0.2996049915941118, + "grad_norm": 1.6634716987609863, + "learning_rate": 1.6961699575227767e-05, + "loss": 1.3214, + "step": 5480 + }, + { + "epoch": 0.29965966403783334, + "grad_norm": 1.922917366027832, + "learning_rate": 1.696038764122783e-05, + "loss": 1.1658, + "step": 5481 + }, + { + "epoch": 0.2997143364815549, + "grad_norm": 1.608165979385376, + "learning_rate": 1.6959075474804217e-05, + "loss": 1.1162, + "step": 5482 + }, + { + "epoch": 0.2997690089252764, + "grad_norm": 1.3517309427261353, + "learning_rate": 1.695776307600074e-05, + "loss": 1.5478, + "step": 5483 + }, + { + "epoch": 0.299823681368998, + "grad_norm": 1.4863754510879517, + "learning_rate": 1.6956450444861232e-05, + "loss": 1.3652, + "step": 5484 + }, + { + "epoch": 0.29987835381271954, + "grad_norm": 1.521386981010437, + "learning_rate": 1.6955137581429518e-05, + "loss": 1.5668, + "step": 5485 + }, + { + "epoch": 0.2999330262564411, + "grad_norm": 1.474103331565857, + "learning_rate": 1.695382448574944e-05, + "loss": 1.4765, + "step": 5486 + }, + { + "epoch": 0.2999876987001627, + "grad_norm": 1.6515300273895264, + "learning_rate": 1.695251115786484e-05, + "loss": 1.363, + "step": 5487 + }, + { + "epoch": 0.3000423711438842, + "grad_norm": 1.43770432472229, + "learning_rate": 1.695119759781958e-05, + "loss": 1.3852, + "step": 5488 + }, + { + "epoch": 0.30009704358760575, + "grad_norm": 1.5434339046478271, + "learning_rate": 1.6949883805657524e-05, + "loss": 1.38, + "step": 5489 + }, + { + "epoch": 0.3001517160313273, + "grad_norm": 1.3512815237045288, + "learning_rate": 1.694856978142254e-05, + "loss": 1.1406, + "step": 5490 + }, + { + "epoch": 0.3002063884750489, + "grad_norm": 1.986851453781128, + "learning_rate": 1.6947255525158503e-05, + "loss": 1.2909, + "step": 5491 + }, + { + "epoch": 0.3002610609187704, + "grad_norm": 1.5807713270187378, + "learning_rate": 1.69459410369093e-05, + "loss": 1.4355, + "step": 5492 + }, + { + "epoch": 0.30031573336249195, + "grad_norm": 1.7224606275558472, + "learning_rate": 1.694462631671883e-05, + "loss": 1.1277, + "step": 5493 + }, + { + "epoch": 0.30037040580621355, + "grad_norm": 1.4426754713058472, + "learning_rate": 1.694331136463099e-05, + "loss": 1.6626, + "step": 5494 + }, + { + "epoch": 0.3004250782499351, + "grad_norm": 1.4338215589523315, + "learning_rate": 1.6941996180689692e-05, + "loss": 1.2067, + "step": 5495 + }, + { + "epoch": 0.3004797506936566, + "grad_norm": 1.4482262134552002, + "learning_rate": 1.694068076493885e-05, + "loss": 1.5079, + "step": 5496 + }, + { + "epoch": 0.30053442313737816, + "grad_norm": 1.768988847732544, + "learning_rate": 1.6939365117422392e-05, + "loss": 1.4505, + "step": 5497 + }, + { + "epoch": 0.30058909558109975, + "grad_norm": 1.4480148553848267, + "learning_rate": 1.6938049238184245e-05, + "loss": 1.4008, + "step": 5498 + }, + { + "epoch": 0.3006437680248213, + "grad_norm": 1.324517011642456, + "learning_rate": 1.6936733127268357e-05, + "loss": 1.5002, + "step": 5499 + }, + { + "epoch": 0.3006984404685428, + "grad_norm": 2.1153695583343506, + "learning_rate": 1.693541678471867e-05, + "loss": 1.5254, + "step": 5500 + }, + { + "epoch": 0.3007531129122644, + "grad_norm": 1.5732730627059937, + "learning_rate": 1.6934100210579144e-05, + "loss": 1.5172, + "step": 5501 + }, + { + "epoch": 0.30080778535598596, + "grad_norm": 1.4341914653778076, + "learning_rate": 1.693278340489374e-05, + "loss": 1.5006, + "step": 5502 + }, + { + "epoch": 0.3008624577997075, + "grad_norm": 1.9600647687911987, + "learning_rate": 1.693146636770643e-05, + "loss": 1.1785, + "step": 5503 + }, + { + "epoch": 0.30091713024342903, + "grad_norm": 1.5954227447509766, + "learning_rate": 1.6930149099061194e-05, + "loss": 1.4092, + "step": 5504 + }, + { + "epoch": 0.3009718026871506, + "grad_norm": 1.789715051651001, + "learning_rate": 1.6928831599002013e-05, + "loss": 1.5128, + "step": 5505 + }, + { + "epoch": 0.30102647513087216, + "grad_norm": 1.6489155292510986, + "learning_rate": 1.692751386757289e-05, + "loss": 1.4183, + "step": 5506 + }, + { + "epoch": 0.3010811475745937, + "grad_norm": 1.2216780185699463, + "learning_rate": 1.6926195904817823e-05, + "loss": 1.481, + "step": 5507 + }, + { + "epoch": 0.3011358200183153, + "grad_norm": 1.6909852027893066, + "learning_rate": 1.6924877710780818e-05, + "loss": 1.6947, + "step": 5508 + }, + { + "epoch": 0.30119049246203683, + "grad_norm": 1.833969235420227, + "learning_rate": 1.69235592855059e-05, + "loss": 1.6374, + "step": 5509 + }, + { + "epoch": 0.30124516490575837, + "grad_norm": 1.2019615173339844, + "learning_rate": 1.6922240629037094e-05, + "loss": 1.6389, + "step": 5510 + }, + { + "epoch": 0.3012998373494799, + "grad_norm": 1.605237603187561, + "learning_rate": 1.6920921741418425e-05, + "loss": 1.4253, + "step": 5511 + }, + { + "epoch": 0.3013545097932015, + "grad_norm": 1.2335801124572754, + "learning_rate": 1.691960262269394e-05, + "loss": 1.6002, + "step": 5512 + }, + { + "epoch": 0.30140918223692303, + "grad_norm": 1.17859947681427, + "learning_rate": 1.6918283272907684e-05, + "loss": 1.6049, + "step": 5513 + }, + { + "epoch": 0.30146385468064457, + "grad_norm": 1.3088021278381348, + "learning_rate": 1.6916963692103716e-05, + "loss": 1.455, + "step": 5514 + }, + { + "epoch": 0.30151852712436616, + "grad_norm": 1.5866718292236328, + "learning_rate": 1.69156438803261e-05, + "loss": 1.6545, + "step": 5515 + }, + { + "epoch": 0.3015731995680877, + "grad_norm": 1.5411450862884521, + "learning_rate": 1.6914323837618906e-05, + "loss": 1.3608, + "step": 5516 + }, + { + "epoch": 0.30162787201180924, + "grad_norm": 1.272140622138977, + "learning_rate": 1.691300356402622e-05, + "loss": 1.3915, + "step": 5517 + }, + { + "epoch": 0.3016825444555308, + "grad_norm": 1.6545277833938599, + "learning_rate": 1.6911683059592115e-05, + "loss": 1.302, + "step": 5518 + }, + { + "epoch": 0.30173721689925237, + "grad_norm": 1.455256700515747, + "learning_rate": 1.6910362324360695e-05, + "loss": 1.6083, + "step": 5519 + }, + { + "epoch": 0.3017918893429739, + "grad_norm": 1.4001303911209106, + "learning_rate": 1.6909041358376065e-05, + "loss": 1.5198, + "step": 5520 + }, + { + "epoch": 0.30184656178669544, + "grad_norm": 1.2514374256134033, + "learning_rate": 1.6907720161682332e-05, + "loss": 1.3355, + "step": 5521 + }, + { + "epoch": 0.30190123423041704, + "grad_norm": 1.4322887659072876, + "learning_rate": 1.690639873432361e-05, + "loss": 1.4214, + "step": 5522 + }, + { + "epoch": 0.3019559066741386, + "grad_norm": 1.7349762916564941, + "learning_rate": 1.6905077076344024e-05, + "loss": 1.5055, + "step": 5523 + }, + { + "epoch": 0.3020105791178601, + "grad_norm": 1.957811713218689, + "learning_rate": 1.6903755187787722e-05, + "loss": 1.4999, + "step": 5524 + }, + { + "epoch": 0.30206525156158165, + "grad_norm": 1.7034714221954346, + "learning_rate": 1.6902433068698827e-05, + "loss": 1.3081, + "step": 5525 + }, + { + "epoch": 0.30211992400530324, + "grad_norm": 1.6925257444381714, + "learning_rate": 1.6901110719121493e-05, + "loss": 1.2148, + "step": 5526 + }, + { + "epoch": 0.3021745964490248, + "grad_norm": 1.2020832300186157, + "learning_rate": 1.689978813909988e-05, + "loss": 1.626, + "step": 5527 + }, + { + "epoch": 0.3022292688927463, + "grad_norm": 1.5642977952957153, + "learning_rate": 1.6898465328678154e-05, + "loss": 1.0496, + "step": 5528 + }, + { + "epoch": 0.3022839413364679, + "grad_norm": 1.67409086227417, + "learning_rate": 1.6897142287900477e-05, + "loss": 1.4491, + "step": 5529 + }, + { + "epoch": 0.30233861378018945, + "grad_norm": 1.541440725326538, + "learning_rate": 1.6895819016811043e-05, + "loss": 1.5615, + "step": 5530 + }, + { + "epoch": 0.302393286223911, + "grad_norm": 1.4250518083572388, + "learning_rate": 1.6894495515454025e-05, + "loss": 1.4121, + "step": 5531 + }, + { + "epoch": 0.3024479586676325, + "grad_norm": 1.6052277088165283, + "learning_rate": 1.6893171783873624e-05, + "loss": 1.4168, + "step": 5532 + }, + { + "epoch": 0.3025026311113541, + "grad_norm": 1.7618186473846436, + "learning_rate": 1.689184782211404e-05, + "loss": 1.469, + "step": 5533 + }, + { + "epoch": 0.30255730355507565, + "grad_norm": 1.5113149881362915, + "learning_rate": 1.6890523630219486e-05, + "loss": 1.672, + "step": 5534 + }, + { + "epoch": 0.3026119759987972, + "grad_norm": 1.4049947261810303, + "learning_rate": 1.6889199208234178e-05, + "loss": 1.6407, + "step": 5535 + }, + { + "epoch": 0.3026666484425188, + "grad_norm": 1.5483721494674683, + "learning_rate": 1.6887874556202342e-05, + "loss": 1.5704, + "step": 5536 + }, + { + "epoch": 0.3027213208862403, + "grad_norm": 1.8171900510787964, + "learning_rate": 1.6886549674168213e-05, + "loss": 1.5752, + "step": 5537 + }, + { + "epoch": 0.30277599332996186, + "grad_norm": 1.5840903520584106, + "learning_rate": 1.6885224562176033e-05, + "loss": 1.2163, + "step": 5538 + }, + { + "epoch": 0.3028306657736834, + "grad_norm": 1.169566035270691, + "learning_rate": 1.6883899220270047e-05, + "loss": 1.5382, + "step": 5539 + }, + { + "epoch": 0.302885338217405, + "grad_norm": 1.3864232301712036, + "learning_rate": 1.688257364849451e-05, + "loss": 1.5561, + "step": 5540 + }, + { + "epoch": 0.3029400106611265, + "grad_norm": 1.4951199293136597, + "learning_rate": 1.688124784689369e-05, + "loss": 1.2956, + "step": 5541 + }, + { + "epoch": 0.30299468310484806, + "grad_norm": 1.7573564052581787, + "learning_rate": 1.6879921815511858e-05, + "loss": 1.1528, + "step": 5542 + }, + { + "epoch": 0.30304935554856965, + "grad_norm": 1.5161828994750977, + "learning_rate": 1.687859555439329e-05, + "loss": 1.4298, + "step": 5543 + }, + { + "epoch": 0.3031040279922912, + "grad_norm": 1.516656756401062, + "learning_rate": 1.6877269063582274e-05, + "loss": 1.3415, + "step": 5544 + }, + { + "epoch": 0.30315870043601273, + "grad_norm": 1.8822952508926392, + "learning_rate": 1.687594234312311e-05, + "loss": 1.7026, + "step": 5545 + }, + { + "epoch": 0.30321337287973427, + "grad_norm": 1.829384684562683, + "learning_rate": 1.6874615393060093e-05, + "loss": 1.5652, + "step": 5546 + }, + { + "epoch": 0.30326804532345586, + "grad_norm": 1.7894474267959595, + "learning_rate": 1.687328821343754e-05, + "loss": 1.3973, + "step": 5547 + }, + { + "epoch": 0.3033227177671774, + "grad_norm": 1.4892470836639404, + "learning_rate": 1.687196080429976e-05, + "loss": 1.5492, + "step": 5548 + }, + { + "epoch": 0.30337739021089893, + "grad_norm": 1.9465442895889282, + "learning_rate": 1.6870633165691087e-05, + "loss": 1.3612, + "step": 5549 + }, + { + "epoch": 0.3034320626546205, + "grad_norm": 1.4931784868240356, + "learning_rate": 1.686930529765585e-05, + "loss": 1.5516, + "step": 5550 + }, + { + "epoch": 0.30348673509834206, + "grad_norm": 1.7061302661895752, + "learning_rate": 1.6867977200238388e-05, + "loss": 1.5609, + "step": 5551 + }, + { + "epoch": 0.3035414075420636, + "grad_norm": 1.824856162071228, + "learning_rate": 1.6866648873483052e-05, + "loss": 1.5003, + "step": 5552 + }, + { + "epoch": 0.30359607998578514, + "grad_norm": 1.1252021789550781, + "learning_rate": 1.6865320317434197e-05, + "loss": 1.5839, + "step": 5553 + }, + { + "epoch": 0.30365075242950673, + "grad_norm": 1.6359292268753052, + "learning_rate": 1.6863991532136186e-05, + "loss": 1.414, + "step": 5554 + }, + { + "epoch": 0.30370542487322827, + "grad_norm": 1.3586634397506714, + "learning_rate": 1.6862662517633394e-05, + "loss": 1.5316, + "step": 5555 + }, + { + "epoch": 0.3037600973169498, + "grad_norm": 1.5657718181610107, + "learning_rate": 1.6861333273970192e-05, + "loss": 1.283, + "step": 5556 + }, + { + "epoch": 0.3038147697606714, + "grad_norm": 1.5960564613342285, + "learning_rate": 1.6860003801190975e-05, + "loss": 1.6344, + "step": 5557 + }, + { + "epoch": 0.30386944220439294, + "grad_norm": 1.5318093299865723, + "learning_rate": 1.685867409934013e-05, + "loss": 1.4773, + "step": 5558 + }, + { + "epoch": 0.3039241146481145, + "grad_norm": 1.5753668546676636, + "learning_rate": 1.6857344168462065e-05, + "loss": 1.4046, + "step": 5559 + }, + { + "epoch": 0.303978787091836, + "grad_norm": 1.230104923248291, + "learning_rate": 1.6856014008601187e-05, + "loss": 1.4823, + "step": 5560 + }, + { + "epoch": 0.3040334595355576, + "grad_norm": 1.5126267671585083, + "learning_rate": 1.6854683619801915e-05, + "loss": 1.5471, + "step": 5561 + }, + { + "epoch": 0.30408813197927914, + "grad_norm": 1.4523777961730957, + "learning_rate": 1.6853353002108667e-05, + "loss": 1.4577, + "step": 5562 + }, + { + "epoch": 0.3041428044230007, + "grad_norm": 2.6262898445129395, + "learning_rate": 1.6852022155565882e-05, + "loss": 1.3659, + "step": 5563 + }, + { + "epoch": 0.30419747686672227, + "grad_norm": 1.6913272142410278, + "learning_rate": 1.6850691080218e-05, + "loss": 1.3211, + "step": 5564 + }, + { + "epoch": 0.3042521493104438, + "grad_norm": 1.6390596628189087, + "learning_rate": 1.684935977610947e-05, + "loss": 1.5028, + "step": 5565 + }, + { + "epoch": 0.30430682175416535, + "grad_norm": 1.5107636451721191, + "learning_rate": 1.684802824328474e-05, + "loss": 1.77, + "step": 5566 + }, + { + "epoch": 0.3043614941978869, + "grad_norm": 1.2543823719024658, + "learning_rate": 1.6846696481788276e-05, + "loss": 1.4676, + "step": 5567 + }, + { + "epoch": 0.3044161666416085, + "grad_norm": 1.314897060394287, + "learning_rate": 1.6845364491664555e-05, + "loss": 1.6545, + "step": 5568 + }, + { + "epoch": 0.30447083908533, + "grad_norm": 1.9589314460754395, + "learning_rate": 1.684403227295805e-05, + "loss": 1.7906, + "step": 5569 + }, + { + "epoch": 0.30452551152905155, + "grad_norm": 1.5268707275390625, + "learning_rate": 1.6842699825713244e-05, + "loss": 1.5277, + "step": 5570 + }, + { + "epoch": 0.30458018397277314, + "grad_norm": 1.558721899986267, + "learning_rate": 1.6841367149974638e-05, + "loss": 1.5658, + "step": 5571 + }, + { + "epoch": 0.3046348564164947, + "grad_norm": 1.475574016571045, + "learning_rate": 1.6840034245786726e-05, + "loss": 1.4573, + "step": 5572 + }, + { + "epoch": 0.3046895288602162, + "grad_norm": 1.603130578994751, + "learning_rate": 1.6838701113194022e-05, + "loss": 1.4231, + "step": 5573 + }, + { + "epoch": 0.30474420130393776, + "grad_norm": 1.63948655128479, + "learning_rate": 1.6837367752241035e-05, + "loss": 1.4417, + "step": 5574 + }, + { + "epoch": 0.30479887374765935, + "grad_norm": 1.4604861736297607, + "learning_rate": 1.68360341629723e-05, + "loss": 1.608, + "step": 5575 + }, + { + "epoch": 0.3048535461913809, + "grad_norm": 2.167135000228882, + "learning_rate": 1.683470034543234e-05, + "loss": 1.4244, + "step": 5576 + }, + { + "epoch": 0.3049082186351024, + "grad_norm": 2.03774356842041, + "learning_rate": 1.68333662996657e-05, + "loss": 1.5232, + "step": 5577 + }, + { + "epoch": 0.304962891078824, + "grad_norm": 1.4565931558609009, + "learning_rate": 1.683203202571692e-05, + "loss": 1.5075, + "step": 5578 + }, + { + "epoch": 0.30501756352254555, + "grad_norm": 1.5738415718078613, + "learning_rate": 1.6830697523630564e-05, + "loss": 1.5145, + "step": 5579 + }, + { + "epoch": 0.3050722359662671, + "grad_norm": 2.2363779544830322, + "learning_rate": 1.6829362793451186e-05, + "loss": 1.4846, + "step": 5580 + }, + { + "epoch": 0.30512690840998863, + "grad_norm": 1.6189820766448975, + "learning_rate": 1.6828027835223363e-05, + "loss": 1.1938, + "step": 5581 + }, + { + "epoch": 0.3051815808537102, + "grad_norm": 1.468650221824646, + "learning_rate": 1.682669264899166e-05, + "loss": 1.3811, + "step": 5582 + }, + { + "epoch": 0.30523625329743176, + "grad_norm": 1.5907938480377197, + "learning_rate": 1.6825357234800676e-05, + "loss": 1.4732, + "step": 5583 + }, + { + "epoch": 0.3052909257411533, + "grad_norm": 1.1900813579559326, + "learning_rate": 1.6824021592694995e-05, + "loss": 1.2878, + "step": 5584 + }, + { + "epoch": 0.3053455981848749, + "grad_norm": 1.646169662475586, + "learning_rate": 1.6822685722719224e-05, + "loss": 1.5513, + "step": 5585 + }, + { + "epoch": 0.3054002706285964, + "grad_norm": 1.749133586883545, + "learning_rate": 1.6821349624917962e-05, + "loss": 1.5729, + "step": 5586 + }, + { + "epoch": 0.30545494307231796, + "grad_norm": 1.3167393207550049, + "learning_rate": 1.6820013299335833e-05, + "loss": 1.5222, + "step": 5587 + }, + { + "epoch": 0.3055096155160395, + "grad_norm": 2.1887214183807373, + "learning_rate": 1.6818676746017455e-05, + "loss": 1.4499, + "step": 5588 + }, + { + "epoch": 0.3055642879597611, + "grad_norm": 1.4009581804275513, + "learning_rate": 1.681733996500746e-05, + "loss": 1.7076, + "step": 5589 + }, + { + "epoch": 0.30561896040348263, + "grad_norm": 1.2322876453399658, + "learning_rate": 1.6816002956350486e-05, + "loss": 1.4384, + "step": 5590 + }, + { + "epoch": 0.30567363284720417, + "grad_norm": 1.8937227725982666, + "learning_rate": 1.681466572009118e-05, + "loss": 1.5599, + "step": 5591 + }, + { + "epoch": 0.30572830529092576, + "grad_norm": 2.217830181121826, + "learning_rate": 1.681332825627419e-05, + "loss": 1.5172, + "step": 5592 + }, + { + "epoch": 0.3057829777346473, + "grad_norm": 1.410913348197937, + "learning_rate": 1.6811990564944186e-05, + "loss": 1.3114, + "step": 5593 + }, + { + "epoch": 0.30583765017836884, + "grad_norm": 1.667126178741455, + "learning_rate": 1.681065264614583e-05, + "loss": 1.4114, + "step": 5594 + }, + { + "epoch": 0.3058923226220904, + "grad_norm": 1.2960937023162842, + "learning_rate": 1.6809314499923802e-05, + "loss": 1.6714, + "step": 5595 + }, + { + "epoch": 0.30594699506581197, + "grad_norm": 1.7101112604141235, + "learning_rate": 1.6807976126322784e-05, + "loss": 1.3122, + "step": 5596 + }, + { + "epoch": 0.3060016675095335, + "grad_norm": 1.8174548149108887, + "learning_rate": 1.680663752538747e-05, + "loss": 1.3144, + "step": 5597 + }, + { + "epoch": 0.30605633995325504, + "grad_norm": 1.7118656635284424, + "learning_rate": 1.680529869716255e-05, + "loss": 1.2907, + "step": 5598 + }, + { + "epoch": 0.30611101239697663, + "grad_norm": 1.4112623929977417, + "learning_rate": 1.680395964169274e-05, + "loss": 1.4544, + "step": 5599 + }, + { + "epoch": 0.30616568484069817, + "grad_norm": 1.1956329345703125, + "learning_rate": 1.6802620359022757e-05, + "loss": 1.4226, + "step": 5600 + }, + { + "epoch": 0.3062203572844197, + "grad_norm": 1.2822648286819458, + "learning_rate": 1.6801280849197316e-05, + "loss": 1.3109, + "step": 5601 + }, + { + "epoch": 0.30627502972814125, + "grad_norm": 1.8166850805282593, + "learning_rate": 1.6799941112261143e-05, + "loss": 1.4931, + "step": 5602 + }, + { + "epoch": 0.30632970217186284, + "grad_norm": 1.5355994701385498, + "learning_rate": 1.679860114825898e-05, + "loss": 1.6028, + "step": 5603 + }, + { + "epoch": 0.3063843746155844, + "grad_norm": 1.3251969814300537, + "learning_rate": 1.6797260957235576e-05, + "loss": 1.5048, + "step": 5604 + }, + { + "epoch": 0.3064390470593059, + "grad_norm": 1.2200958728790283, + "learning_rate": 1.6795920539235676e-05, + "loss": 1.4347, + "step": 5605 + }, + { + "epoch": 0.3064937195030275, + "grad_norm": 1.0308911800384521, + "learning_rate": 1.6794579894304043e-05, + "loss": 1.4791, + "step": 5606 + }, + { + "epoch": 0.30654839194674904, + "grad_norm": 1.542178988456726, + "learning_rate": 1.679323902248544e-05, + "loss": 1.3914, + "step": 5607 + }, + { + "epoch": 0.3066030643904706, + "grad_norm": 1.6968070268630981, + "learning_rate": 1.679189792382465e-05, + "loss": 1.3878, + "step": 5608 + }, + { + "epoch": 0.3066577368341922, + "grad_norm": 1.276521921157837, + "learning_rate": 1.6790556598366447e-05, + "loss": 1.5781, + "step": 5609 + }, + { + "epoch": 0.3067124092779137, + "grad_norm": 1.3684755563735962, + "learning_rate": 1.678921504615562e-05, + "loss": 1.4642, + "step": 5610 + }, + { + "epoch": 0.30676708172163525, + "grad_norm": 1.4366079568862915, + "learning_rate": 1.678787326723698e-05, + "loss": 1.2963, + "step": 5611 + }, + { + "epoch": 0.3068217541653568, + "grad_norm": 1.6617707014083862, + "learning_rate": 1.6786531261655322e-05, + "loss": 1.4432, + "step": 5612 + }, + { + "epoch": 0.3068764266090784, + "grad_norm": 1.5684758424758911, + "learning_rate": 1.6785189029455455e-05, + "loss": 1.1776, + "step": 5613 + }, + { + "epoch": 0.3069310990527999, + "grad_norm": 1.084092617034912, + "learning_rate": 1.6783846570682207e-05, + "loss": 1.3825, + "step": 5614 + }, + { + "epoch": 0.30698577149652145, + "grad_norm": 1.8610261678695679, + "learning_rate": 1.6782503885380404e-05, + "loss": 1.2099, + "step": 5615 + }, + { + "epoch": 0.30704044394024305, + "grad_norm": 1.4688953161239624, + "learning_rate": 1.6781160973594884e-05, + "loss": 1.5195, + "step": 5616 + }, + { + "epoch": 0.3070951163839646, + "grad_norm": 1.5512441396713257, + "learning_rate": 1.677981783537048e-05, + "loss": 1.5779, + "step": 5617 + }, + { + "epoch": 0.3071497888276861, + "grad_norm": 1.1363338232040405, + "learning_rate": 1.6778474470752053e-05, + "loss": 1.3678, + "step": 5618 + }, + { + "epoch": 0.30720446127140766, + "grad_norm": 1.543418049812317, + "learning_rate": 1.6777130879784456e-05, + "loss": 1.4071, + "step": 5619 + }, + { + "epoch": 0.30725913371512925, + "grad_norm": 1.93231201171875, + "learning_rate": 1.6775787062512557e-05, + "loss": 1.6383, + "step": 5620 + }, + { + "epoch": 0.3073138061588508, + "grad_norm": 1.563227653503418, + "learning_rate": 1.6774443018981227e-05, + "loss": 1.4037, + "step": 5621 + }, + { + "epoch": 0.3073684786025723, + "grad_norm": 1.7474581003189087, + "learning_rate": 1.6773098749235348e-05, + "loss": 1.2906, + "step": 5622 + }, + { + "epoch": 0.3074231510462939, + "grad_norm": 1.5975335836410522, + "learning_rate": 1.677175425331981e-05, + "loss": 1.5617, + "step": 5623 + }, + { + "epoch": 0.30747782349001546, + "grad_norm": 1.5224428176879883, + "learning_rate": 1.6770409531279504e-05, + "loss": 1.0589, + "step": 5624 + }, + { + "epoch": 0.307532495933737, + "grad_norm": 1.724183201789856, + "learning_rate": 1.6769064583159338e-05, + "loss": 1.7587, + "step": 5625 + }, + { + "epoch": 0.30758716837745853, + "grad_norm": 1.3541228771209717, + "learning_rate": 1.676771940900422e-05, + "loss": 1.5376, + "step": 5626 + }, + { + "epoch": 0.3076418408211801, + "grad_norm": 2.3776192665100098, + "learning_rate": 1.676637400885907e-05, + "loss": 1.5541, + "step": 5627 + }, + { + "epoch": 0.30769651326490166, + "grad_norm": 1.5012638568878174, + "learning_rate": 1.6765028382768815e-05, + "loss": 1.4702, + "step": 5628 + }, + { + "epoch": 0.3077511857086232, + "grad_norm": 1.36568284034729, + "learning_rate": 1.6763682530778388e-05, + "loss": 1.4124, + "step": 5629 + }, + { + "epoch": 0.3078058581523448, + "grad_norm": 1.3340858221054077, + "learning_rate": 1.6762336452932734e-05, + "loss": 1.3496, + "step": 5630 + }, + { + "epoch": 0.30786053059606633, + "grad_norm": 1.8577817678451538, + "learning_rate": 1.676099014927679e-05, + "loss": 1.5433, + "step": 5631 + }, + { + "epoch": 0.30791520303978787, + "grad_norm": 1.8127039670944214, + "learning_rate": 1.6759643619855525e-05, + "loss": 1.6515, + "step": 5632 + }, + { + "epoch": 0.3079698754835094, + "grad_norm": 1.5503771305084229, + "learning_rate": 1.6758296864713897e-05, + "loss": 1.4491, + "step": 5633 + }, + { + "epoch": 0.308024547927231, + "grad_norm": 1.7260584831237793, + "learning_rate": 1.6756949883896874e-05, + "loss": 1.921, + "step": 5634 + }, + { + "epoch": 0.30807922037095253, + "grad_norm": 1.5759403705596924, + "learning_rate": 1.6755602677449445e-05, + "loss": 1.409, + "step": 5635 + }, + { + "epoch": 0.30813389281467407, + "grad_norm": 1.4155627489089966, + "learning_rate": 1.6754255245416585e-05, + "loss": 1.3079, + "step": 5636 + }, + { + "epoch": 0.30818856525839566, + "grad_norm": 1.4241127967834473, + "learning_rate": 1.6752907587843294e-05, + "loss": 1.3124, + "step": 5637 + }, + { + "epoch": 0.3082432377021172, + "grad_norm": 1.495445966720581, + "learning_rate": 1.6751559704774572e-05, + "loss": 1.5222, + "step": 5638 + }, + { + "epoch": 0.30829791014583874, + "grad_norm": 1.4552794694900513, + "learning_rate": 1.6750211596255427e-05, + "loss": 1.505, + "step": 5639 + }, + { + "epoch": 0.3083525825895603, + "grad_norm": 1.5207583904266357, + "learning_rate": 1.674886326233088e-05, + "loss": 1.5936, + "step": 5640 + }, + { + "epoch": 0.30840725503328187, + "grad_norm": 1.3605163097381592, + "learning_rate": 1.6747514703045952e-05, + "loss": 1.5484, + "step": 5641 + }, + { + "epoch": 0.3084619274770034, + "grad_norm": 1.3503748178482056, + "learning_rate": 1.6746165918445675e-05, + "loss": 1.4928, + "step": 5642 + }, + { + "epoch": 0.30851659992072494, + "grad_norm": 1.5826092958450317, + "learning_rate": 1.6744816908575085e-05, + "loss": 1.4733, + "step": 5643 + }, + { + "epoch": 0.30857127236444654, + "grad_norm": 2.817927837371826, + "learning_rate": 1.6743467673479233e-05, + "loss": 1.3324, + "step": 5644 + }, + { + "epoch": 0.3086259448081681, + "grad_norm": 1.7672452926635742, + "learning_rate": 1.6742118213203173e-05, + "loss": 1.5442, + "step": 5645 + }, + { + "epoch": 0.3086806172518896, + "grad_norm": 1.28427255153656, + "learning_rate": 1.6740768527791962e-05, + "loss": 1.4358, + "step": 5646 + }, + { + "epoch": 0.30873528969561115, + "grad_norm": 1.6285185813903809, + "learning_rate": 1.673941861729067e-05, + "loss": 1.516, + "step": 5647 + }, + { + "epoch": 0.30878996213933274, + "grad_norm": 1.2106304168701172, + "learning_rate": 1.673806848174438e-05, + "loss": 1.4594, + "step": 5648 + }, + { + "epoch": 0.3088446345830543, + "grad_norm": 1.4658167362213135, + "learning_rate": 1.673671812119817e-05, + "loss": 1.4399, + "step": 5649 + }, + { + "epoch": 0.3088993070267758, + "grad_norm": 1.7013970613479614, + "learning_rate": 1.6735367535697136e-05, + "loss": 1.3603, + "step": 5650 + }, + { + "epoch": 0.3089539794704974, + "grad_norm": 1.3760570287704468, + "learning_rate": 1.6734016725286374e-05, + "loss": 1.3726, + "step": 5651 + }, + { + "epoch": 0.30900865191421895, + "grad_norm": 1.5382280349731445, + "learning_rate": 1.673266569001099e-05, + "loss": 1.3771, + "step": 5652 + }, + { + "epoch": 0.3090633243579405, + "grad_norm": 1.6827846765518188, + "learning_rate": 1.6731314429916104e-05, + "loss": 1.2778, + "step": 5653 + }, + { + "epoch": 0.309117996801662, + "grad_norm": 1.628423810005188, + "learning_rate": 1.672996294504683e-05, + "loss": 1.3669, + "step": 5654 + }, + { + "epoch": 0.3091726692453836, + "grad_norm": 1.3710910081863403, + "learning_rate": 1.67286112354483e-05, + "loss": 1.5486, + "step": 5655 + }, + { + "epoch": 0.30922734168910515, + "grad_norm": 1.3048462867736816, + "learning_rate": 1.6727259301165654e-05, + "loss": 1.4483, + "step": 5656 + }, + { + "epoch": 0.3092820141328267, + "grad_norm": 1.2925167083740234, + "learning_rate": 1.6725907142244033e-05, + "loss": 1.5003, + "step": 5657 + }, + { + "epoch": 0.3093366865765483, + "grad_norm": 1.7408108711242676, + "learning_rate": 1.6724554758728587e-05, + "loss": 1.3266, + "step": 5658 + }, + { + "epoch": 0.3093913590202698, + "grad_norm": 1.6185587644577026, + "learning_rate": 1.6723202150664483e-05, + "loss": 1.351, + "step": 5659 + }, + { + "epoch": 0.30944603146399136, + "grad_norm": 1.497179627418518, + "learning_rate": 1.672184931809688e-05, + "loss": 1.4257, + "step": 5660 + }, + { + "epoch": 0.3095007039077129, + "grad_norm": 1.3599811792373657, + "learning_rate": 1.6720496261070956e-05, + "loss": 1.389, + "step": 5661 + }, + { + "epoch": 0.3095553763514345, + "grad_norm": 1.4567558765411377, + "learning_rate": 1.671914297963189e-05, + "loss": 1.389, + "step": 5662 + }, + { + "epoch": 0.309610048795156, + "grad_norm": 1.7913322448730469, + "learning_rate": 1.6717789473824875e-05, + "loss": 1.495, + "step": 5663 + }, + { + "epoch": 0.30966472123887756, + "grad_norm": 1.742207646369934, + "learning_rate": 1.6716435743695104e-05, + "loss": 1.4273, + "step": 5664 + }, + { + "epoch": 0.30971939368259915, + "grad_norm": 1.5569751262664795, + "learning_rate": 1.6715081789287784e-05, + "loss": 1.5556, + "step": 5665 + }, + { + "epoch": 0.3097740661263207, + "grad_norm": 2.148820161819458, + "learning_rate": 1.6713727610648125e-05, + "loss": 1.5379, + "step": 5666 + }, + { + "epoch": 0.30982873857004223, + "grad_norm": 1.3778588771820068, + "learning_rate": 1.671237320782135e-05, + "loss": 1.6068, + "step": 5667 + }, + { + "epoch": 0.30988341101376377, + "grad_norm": 1.4977000951766968, + "learning_rate": 1.6711018580852677e-05, + "loss": 1.2477, + "step": 5668 + }, + { + "epoch": 0.30993808345748536, + "grad_norm": 1.6034953594207764, + "learning_rate": 1.670966372978735e-05, + "loss": 1.1995, + "step": 5669 + }, + { + "epoch": 0.3099927559012069, + "grad_norm": 1.5899590253829956, + "learning_rate": 1.6708308654670605e-05, + "loss": 1.4761, + "step": 5670 + }, + { + "epoch": 0.31004742834492843, + "grad_norm": 1.7298247814178467, + "learning_rate": 1.6706953355547693e-05, + "loss": 1.4405, + "step": 5671 + }, + { + "epoch": 0.31010210078865, + "grad_norm": 1.8537074327468872, + "learning_rate": 1.670559783246387e-05, + "loss": 1.2992, + "step": 5672 + }, + { + "epoch": 0.31015677323237156, + "grad_norm": 1.922317385673523, + "learning_rate": 1.6704242085464398e-05, + "loss": 1.307, + "step": 5673 + }, + { + "epoch": 0.3102114456760931, + "grad_norm": 1.524655818939209, + "learning_rate": 1.6702886114594553e-05, + "loss": 1.5007, + "step": 5674 + }, + { + "epoch": 0.31026611811981464, + "grad_norm": 1.8650954961776733, + "learning_rate": 1.670152991989961e-05, + "loss": 1.5588, + "step": 5675 + }, + { + "epoch": 0.31032079056353623, + "grad_norm": 1.4997811317443848, + "learning_rate": 1.670017350142486e-05, + "loss": 1.453, + "step": 5676 + }, + { + "epoch": 0.31037546300725777, + "grad_norm": 1.5144965648651123, + "learning_rate": 1.669881685921559e-05, + "loss": 1.4911, + "step": 5677 + }, + { + "epoch": 0.3104301354509793, + "grad_norm": 1.4824843406677246, + "learning_rate": 1.669745999331711e-05, + "loss": 1.3274, + "step": 5678 + }, + { + "epoch": 0.3104848078947009, + "grad_norm": 1.5485527515411377, + "learning_rate": 1.6696102903774725e-05, + "loss": 1.3898, + "step": 5679 + }, + { + "epoch": 0.31053948033842244, + "grad_norm": 1.6648509502410889, + "learning_rate": 1.6694745590633744e-05, + "loss": 1.4319, + "step": 5680 + }, + { + "epoch": 0.310594152782144, + "grad_norm": 1.5466374158859253, + "learning_rate": 1.6693388053939508e-05, + "loss": 1.4249, + "step": 5681 + }, + { + "epoch": 0.3106488252258655, + "grad_norm": 1.4442873001098633, + "learning_rate": 1.6692030293737332e-05, + "loss": 1.5259, + "step": 5682 + }, + { + "epoch": 0.3107034976695871, + "grad_norm": 2.3114917278289795, + "learning_rate": 1.669067231007256e-05, + "loss": 1.4693, + "step": 5683 + }, + { + "epoch": 0.31075817011330864, + "grad_norm": 1.5629925727844238, + "learning_rate": 1.6689314102990544e-05, + "loss": 1.1871, + "step": 5684 + }, + { + "epoch": 0.3108128425570302, + "grad_norm": 1.6279356479644775, + "learning_rate": 1.6687955672536635e-05, + "loss": 1.3919, + "step": 5685 + }, + { + "epoch": 0.31086751500075177, + "grad_norm": 1.37820303440094, + "learning_rate": 1.6686597018756188e-05, + "loss": 1.7841, + "step": 5686 + }, + { + "epoch": 0.3109221874444733, + "grad_norm": 1.5134035348892212, + "learning_rate": 1.6685238141694576e-05, + "loss": 1.4601, + "step": 5687 + }, + { + "epoch": 0.31097685988819485, + "grad_norm": 1.5114374160766602, + "learning_rate": 1.6683879041397174e-05, + "loss": 1.66, + "step": 5688 + }, + { + "epoch": 0.3110315323319164, + "grad_norm": 1.8195277452468872, + "learning_rate": 1.668251971790937e-05, + "loss": 1.7499, + "step": 5689 + }, + { + "epoch": 0.311086204775638, + "grad_norm": 2.205582857131958, + "learning_rate": 1.668116017127655e-05, + "loss": 1.1783, + "step": 5690 + }, + { + "epoch": 0.3111408772193595, + "grad_norm": 1.6743865013122559, + "learning_rate": 1.6679800401544116e-05, + "loss": 1.6061, + "step": 5691 + }, + { + "epoch": 0.31119554966308105, + "grad_norm": 1.5684477090835571, + "learning_rate": 1.667844040875747e-05, + "loss": 1.3467, + "step": 5692 + }, + { + "epoch": 0.31125022210680264, + "grad_norm": 2.1151115894317627, + "learning_rate": 1.667708019296203e-05, + "loss": 1.3946, + "step": 5693 + }, + { + "epoch": 0.3113048945505242, + "grad_norm": 1.5877634286880493, + "learning_rate": 1.6675719754203207e-05, + "loss": 1.5071, + "step": 5694 + }, + { + "epoch": 0.3113595669942457, + "grad_norm": 1.8116036653518677, + "learning_rate": 1.6674359092526442e-05, + "loss": 1.4593, + "step": 5695 + }, + { + "epoch": 0.31141423943796726, + "grad_norm": 1.25163733959198, + "learning_rate": 1.6672998207977165e-05, + "loss": 1.483, + "step": 5696 + }, + { + "epoch": 0.31146891188168885, + "grad_norm": 1.6399773359298706, + "learning_rate": 1.667163710060082e-05, + "loss": 1.7509, + "step": 5697 + }, + { + "epoch": 0.3115235843254104, + "grad_norm": 1.5806182622909546, + "learning_rate": 1.667027577044285e-05, + "loss": 1.4666, + "step": 5698 + }, + { + "epoch": 0.3115782567691319, + "grad_norm": 1.707316517829895, + "learning_rate": 1.6668914217548727e-05, + "loss": 1.6463, + "step": 5699 + }, + { + "epoch": 0.3116329292128535, + "grad_norm": 1.8729251623153687, + "learning_rate": 1.6667552441963904e-05, + "loss": 1.3256, + "step": 5700 + }, + { + "epoch": 0.31168760165657505, + "grad_norm": 1.8471101522445679, + "learning_rate": 1.666619044373386e-05, + "loss": 1.5484, + "step": 5701 + }, + { + "epoch": 0.3117422741002966, + "grad_norm": 1.5967702865600586, + "learning_rate": 1.666482822290408e-05, + "loss": 1.6142, + "step": 5702 + }, + { + "epoch": 0.3117969465440181, + "grad_norm": 1.6443365812301636, + "learning_rate": 1.6663465779520042e-05, + "loss": 1.479, + "step": 5703 + }, + { + "epoch": 0.3118516189877397, + "grad_norm": 1.5947661399841309, + "learning_rate": 1.6662103113627246e-05, + "loss": 1.3924, + "step": 5704 + }, + { + "epoch": 0.31190629143146126, + "grad_norm": 1.388566255569458, + "learning_rate": 1.666074022527119e-05, + "loss": 1.5075, + "step": 5705 + }, + { + "epoch": 0.3119609638751828, + "grad_norm": 1.3134994506835938, + "learning_rate": 1.6659377114497393e-05, + "loss": 1.4283, + "step": 5706 + }, + { + "epoch": 0.3120156363189044, + "grad_norm": 1.6883138418197632, + "learning_rate": 1.6658013781351367e-05, + "loss": 1.342, + "step": 5707 + }, + { + "epoch": 0.3120703087626259, + "grad_norm": 1.393064022064209, + "learning_rate": 1.6656650225878634e-05, + "loss": 1.3546, + "step": 5708 + }, + { + "epoch": 0.31212498120634746, + "grad_norm": 2.1348469257354736, + "learning_rate": 1.6655286448124734e-05, + "loss": 1.3113, + "step": 5709 + }, + { + "epoch": 0.312179653650069, + "grad_norm": 1.4859346151351929, + "learning_rate": 1.6653922448135202e-05, + "loss": 1.1583, + "step": 5710 + }, + { + "epoch": 0.3122343260937906, + "grad_norm": 1.4165140390396118, + "learning_rate": 1.6652558225955582e-05, + "loss": 1.6661, + "step": 5711 + }, + { + "epoch": 0.31228899853751213, + "grad_norm": 1.1639654636383057, + "learning_rate": 1.665119378163143e-05, + "loss": 1.57, + "step": 5712 + }, + { + "epoch": 0.31234367098123367, + "grad_norm": 1.875962495803833, + "learning_rate": 1.6649829115208316e-05, + "loss": 1.2207, + "step": 5713 + }, + { + "epoch": 0.31239834342495526, + "grad_norm": 1.4277130365371704, + "learning_rate": 1.66484642267318e-05, + "loss": 1.3697, + "step": 5714 + }, + { + "epoch": 0.3124530158686768, + "grad_norm": 2.5634961128234863, + "learning_rate": 1.6647099116247465e-05, + "loss": 1.2809, + "step": 5715 + }, + { + "epoch": 0.31250768831239834, + "grad_norm": 2.9277381896972656, + "learning_rate": 1.6645733783800893e-05, + "loss": 1.3522, + "step": 5716 + }, + { + "epoch": 0.3125623607561199, + "grad_norm": 1.2748606204986572, + "learning_rate": 1.6644368229437673e-05, + "loss": 1.5178, + "step": 5717 + }, + { + "epoch": 0.31261703319984147, + "grad_norm": 1.774991750717163, + "learning_rate": 1.6643002453203405e-05, + "loss": 1.5095, + "step": 5718 + }, + { + "epoch": 0.312671705643563, + "grad_norm": 2.003493547439575, + "learning_rate": 1.66416364551437e-05, + "loss": 1.4238, + "step": 5719 + }, + { + "epoch": 0.31272637808728454, + "grad_norm": 1.5107351541519165, + "learning_rate": 1.664027023530417e-05, + "loss": 1.4404, + "step": 5720 + }, + { + "epoch": 0.31278105053100613, + "grad_norm": 1.3495070934295654, + "learning_rate": 1.6638903793730434e-05, + "loss": 1.4883, + "step": 5721 + }, + { + "epoch": 0.31283572297472767, + "grad_norm": 1.5981144905090332, + "learning_rate": 1.6637537130468115e-05, + "loss": 1.3111, + "step": 5722 + }, + { + "epoch": 0.3128903954184492, + "grad_norm": 1.5408207178115845, + "learning_rate": 1.6636170245562864e-05, + "loss": 1.6984, + "step": 5723 + }, + { + "epoch": 0.31294506786217074, + "grad_norm": 2.132077693939209, + "learning_rate": 1.6634803139060313e-05, + "loss": 1.5082, + "step": 5724 + }, + { + "epoch": 0.31299974030589234, + "grad_norm": 1.5170738697052002, + "learning_rate": 1.6633435811006117e-05, + "loss": 1.4262, + "step": 5725 + }, + { + "epoch": 0.3130544127496139, + "grad_norm": 1.5291332006454468, + "learning_rate": 1.663206826144593e-05, + "loss": 1.4626, + "step": 5726 + }, + { + "epoch": 0.3131090851933354, + "grad_norm": 1.3631703853607178, + "learning_rate": 1.6630700490425425e-05, + "loss": 1.5854, + "step": 5727 + }, + { + "epoch": 0.313163757637057, + "grad_norm": 1.4618855714797974, + "learning_rate": 1.6629332497990268e-05, + "loss": 1.661, + "step": 5728 + }, + { + "epoch": 0.31321843008077854, + "grad_norm": 1.5501735210418701, + "learning_rate": 1.6627964284186146e-05, + "loss": 1.4123, + "step": 5729 + }, + { + "epoch": 0.3132731025245001, + "grad_norm": 1.7109723091125488, + "learning_rate": 1.6626595849058742e-05, + "loss": 1.5296, + "step": 5730 + }, + { + "epoch": 0.3133277749682216, + "grad_norm": 1.506648063659668, + "learning_rate": 1.6625227192653756e-05, + "loss": 1.3697, + "step": 5731 + }, + { + "epoch": 0.3133824474119432, + "grad_norm": 1.4231045246124268, + "learning_rate": 1.662385831501688e-05, + "loss": 1.3968, + "step": 5732 + }, + { + "epoch": 0.31343711985566475, + "grad_norm": 1.621425747871399, + "learning_rate": 1.6622489216193835e-05, + "loss": 1.4334, + "step": 5733 + }, + { + "epoch": 0.3134917922993863, + "grad_norm": 1.7054352760314941, + "learning_rate": 1.6621119896230336e-05, + "loss": 1.2567, + "step": 5734 + }, + { + "epoch": 0.3135464647431079, + "grad_norm": 1.395897388458252, + "learning_rate": 1.661975035517211e-05, + "loss": 1.2309, + "step": 5735 + }, + { + "epoch": 0.3136011371868294, + "grad_norm": 1.932619333267212, + "learning_rate": 1.6618380593064882e-05, + "loss": 1.4203, + "step": 5736 + }, + { + "epoch": 0.31365580963055095, + "grad_norm": 2.026306390762329, + "learning_rate": 1.6617010609954396e-05, + "loss": 1.457, + "step": 5737 + }, + { + "epoch": 0.3137104820742725, + "grad_norm": 1.6044217348098755, + "learning_rate": 1.6615640405886398e-05, + "loss": 1.3574, + "step": 5738 + }, + { + "epoch": 0.3137651545179941, + "grad_norm": 1.8016873598098755, + "learning_rate": 1.661426998090664e-05, + "loss": 1.6474, + "step": 5739 + }, + { + "epoch": 0.3138198269617156, + "grad_norm": 1.3456913232803345, + "learning_rate": 1.661289933506089e-05, + "loss": 1.5084, + "step": 5740 + }, + { + "epoch": 0.31387449940543716, + "grad_norm": 1.7710200548171997, + "learning_rate": 1.6611528468394913e-05, + "loss": 1.5933, + "step": 5741 + }, + { + "epoch": 0.31392917184915875, + "grad_norm": 1.7526038885116577, + "learning_rate": 1.661015738095449e-05, + "loss": 1.4739, + "step": 5742 + }, + { + "epoch": 0.3139838442928803, + "grad_norm": 1.47505784034729, + "learning_rate": 1.6608786072785393e-05, + "loss": 1.4557, + "step": 5743 + }, + { + "epoch": 0.3140385167366018, + "grad_norm": 1.8357609510421753, + "learning_rate": 1.660741454393343e-05, + "loss": 1.6268, + "step": 5744 + }, + { + "epoch": 0.31409318918032336, + "grad_norm": 1.2729145288467407, + "learning_rate": 1.6606042794444383e-05, + "loss": 1.5093, + "step": 5745 + }, + { + "epoch": 0.31414786162404496, + "grad_norm": 1.8482741117477417, + "learning_rate": 1.6604670824364067e-05, + "loss": 1.4493, + "step": 5746 + }, + { + "epoch": 0.3142025340677665, + "grad_norm": 1.5997130870819092, + "learning_rate": 1.6603298633738293e-05, + "loss": 1.4492, + "step": 5747 + }, + { + "epoch": 0.31425720651148803, + "grad_norm": 1.373069405555725, + "learning_rate": 1.660192622261289e-05, + "loss": 1.3466, + "step": 5748 + }, + { + "epoch": 0.3143118789552096, + "grad_norm": 1.2864038944244385, + "learning_rate": 1.660055359103367e-05, + "loss": 1.4092, + "step": 5749 + }, + { + "epoch": 0.31436655139893116, + "grad_norm": 1.4994033575057983, + "learning_rate": 1.6599180739046483e-05, + "loss": 1.4958, + "step": 5750 + }, + { + "epoch": 0.3144212238426527, + "grad_norm": 1.8205833435058594, + "learning_rate": 1.6597807666697157e-05, + "loss": 1.5251, + "step": 5751 + }, + { + "epoch": 0.31447589628637423, + "grad_norm": 1.7459423542022705, + "learning_rate": 1.659643437403156e-05, + "loss": 1.4116, + "step": 5752 + }, + { + "epoch": 0.3145305687300958, + "grad_norm": 1.7435827255249023, + "learning_rate": 1.6595060861095534e-05, + "loss": 1.3735, + "step": 5753 + }, + { + "epoch": 0.31458524117381736, + "grad_norm": 1.3951668739318848, + "learning_rate": 1.6593687127934953e-05, + "loss": 1.5659, + "step": 5754 + }, + { + "epoch": 0.3146399136175389, + "grad_norm": 1.712458610534668, + "learning_rate": 1.6592313174595685e-05, + "loss": 1.4393, + "step": 5755 + }, + { + "epoch": 0.3146945860612605, + "grad_norm": 1.3968095779418945, + "learning_rate": 1.6590939001123614e-05, + "loss": 1.4434, + "step": 5756 + }, + { + "epoch": 0.31474925850498203, + "grad_norm": 2.2343382835388184, + "learning_rate": 1.658956460756462e-05, + "loss": 1.5001, + "step": 5757 + }, + { + "epoch": 0.31480393094870357, + "grad_norm": 1.4764268398284912, + "learning_rate": 1.6588189993964603e-05, + "loss": 1.4152, + "step": 5758 + }, + { + "epoch": 0.3148586033924251, + "grad_norm": 1.7680784463882446, + "learning_rate": 1.658681516036946e-05, + "loss": 1.4546, + "step": 5759 + }, + { + "epoch": 0.3149132758361467, + "grad_norm": 1.4287279844284058, + "learning_rate": 1.6585440106825107e-05, + "loss": 1.4964, + "step": 5760 + }, + { + "epoch": 0.31496794827986824, + "grad_norm": 1.3654053211212158, + "learning_rate": 1.658406483337745e-05, + "loss": 1.3803, + "step": 5761 + }, + { + "epoch": 0.3150226207235898, + "grad_norm": 1.637569785118103, + "learning_rate": 1.6582689340072418e-05, + "loss": 1.5069, + "step": 5762 + }, + { + "epoch": 0.31507729316731137, + "grad_norm": 1.2997201681137085, + "learning_rate": 1.6581313626955948e-05, + "loss": 1.6001, + "step": 5763 + }, + { + "epoch": 0.3151319656110329, + "grad_norm": 1.5451147556304932, + "learning_rate": 1.6579937694073967e-05, + "loss": 1.2612, + "step": 5764 + }, + { + "epoch": 0.31518663805475444, + "grad_norm": 1.4430917501449585, + "learning_rate": 1.657856154147243e-05, + "loss": 1.5012, + "step": 5765 + }, + { + "epoch": 0.315241310498476, + "grad_norm": 1.4962540864944458, + "learning_rate": 1.6577185169197284e-05, + "loss": 1.46, + "step": 5766 + }, + { + "epoch": 0.3152959829421976, + "grad_norm": 1.4424786567687988, + "learning_rate": 1.6575808577294492e-05, + "loss": 1.5677, + "step": 5767 + }, + { + "epoch": 0.3153506553859191, + "grad_norm": 1.465861439704895, + "learning_rate": 1.6574431765810023e-05, + "loss": 1.4352, + "step": 5768 + }, + { + "epoch": 0.31540532782964065, + "grad_norm": 1.5177456140518188, + "learning_rate": 1.6573054734789846e-05, + "loss": 1.5992, + "step": 5769 + }, + { + "epoch": 0.31546000027336224, + "grad_norm": 1.8394269943237305, + "learning_rate": 1.657167748427995e-05, + "loss": 1.2799, + "step": 5770 + }, + { + "epoch": 0.3155146727170838, + "grad_norm": 1.8345543146133423, + "learning_rate": 1.657030001432632e-05, + "loss": 1.455, + "step": 5771 + }, + { + "epoch": 0.3155693451608053, + "grad_norm": 1.3232601881027222, + "learning_rate": 1.6568922324974958e-05, + "loss": 1.4686, + "step": 5772 + }, + { + "epoch": 0.31562401760452685, + "grad_norm": 1.4476523399353027, + "learning_rate": 1.6567544416271862e-05, + "loss": 1.5223, + "step": 5773 + }, + { + "epoch": 0.31567869004824844, + "grad_norm": 1.6075290441513062, + "learning_rate": 1.6566166288263046e-05, + "loss": 1.3875, + "step": 5774 + }, + { + "epoch": 0.31573336249197, + "grad_norm": 1.8110610246658325, + "learning_rate": 1.656478794099453e-05, + "loss": 1.5019, + "step": 5775 + }, + { + "epoch": 0.3157880349356915, + "grad_norm": 2.496516227722168, + "learning_rate": 1.6563409374512344e-05, + "loss": 1.2954, + "step": 5776 + }, + { + "epoch": 0.3158427073794131, + "grad_norm": 2.0874476432800293, + "learning_rate": 1.6562030588862513e-05, + "loss": 1.4703, + "step": 5777 + }, + { + "epoch": 0.31589737982313465, + "grad_norm": 1.7965909242630005, + "learning_rate": 1.6560651584091083e-05, + "loss": 1.563, + "step": 5778 + }, + { + "epoch": 0.3159520522668562, + "grad_norm": 1.4738322496414185, + "learning_rate": 1.6559272360244104e-05, + "loss": 1.3849, + "step": 5779 + }, + { + "epoch": 0.3160067247105777, + "grad_norm": 1.2309372425079346, + "learning_rate": 1.655789291736763e-05, + "loss": 1.3237, + "step": 5780 + }, + { + "epoch": 0.3160613971542993, + "grad_norm": 1.6620007753372192, + "learning_rate": 1.6556513255507714e-05, + "loss": 1.7703, + "step": 5781 + }, + { + "epoch": 0.31611606959802085, + "grad_norm": 1.5651357173919678, + "learning_rate": 1.6555133374710442e-05, + "loss": 1.3053, + "step": 5782 + }, + { + "epoch": 0.3161707420417424, + "grad_norm": 1.8235851526260376, + "learning_rate": 1.655375327502189e-05, + "loss": 1.4825, + "step": 5783 + }, + { + "epoch": 0.316225414485464, + "grad_norm": 1.34381103515625, + "learning_rate": 1.6552372956488128e-05, + "loss": 1.5347, + "step": 5784 + }, + { + "epoch": 0.3162800869291855, + "grad_norm": 1.7539050579071045, + "learning_rate": 1.655099241915526e-05, + "loss": 1.2811, + "step": 5785 + }, + { + "epoch": 0.31633475937290706, + "grad_norm": 1.5876325368881226, + "learning_rate": 1.6549611663069383e-05, + "loss": 1.7185, + "step": 5786 + }, + { + "epoch": 0.3163894318166286, + "grad_norm": 1.55436372756958, + "learning_rate": 1.6548230688276605e-05, + "loss": 1.7147, + "step": 5787 + }, + { + "epoch": 0.3164441042603502, + "grad_norm": 1.331621766090393, + "learning_rate": 1.6546849494823037e-05, + "loss": 1.5817, + "step": 5788 + }, + { + "epoch": 0.3164987767040717, + "grad_norm": 1.3766465187072754, + "learning_rate": 1.6545468082754802e-05, + "loss": 1.3957, + "step": 5789 + }, + { + "epoch": 0.31655344914779326, + "grad_norm": 1.5918580293655396, + "learning_rate": 1.654408645211803e-05, + "loss": 1.6005, + "step": 5790 + }, + { + "epoch": 0.31660812159151486, + "grad_norm": 1.6317253112792969, + "learning_rate": 1.654270460295885e-05, + "loss": 1.5572, + "step": 5791 + }, + { + "epoch": 0.3166627940352364, + "grad_norm": 1.5809732675552368, + "learning_rate": 1.6541322535323417e-05, + "loss": 1.2859, + "step": 5792 + }, + { + "epoch": 0.31671746647895793, + "grad_norm": 1.4834377765655518, + "learning_rate": 1.653994024925787e-05, + "loss": 1.4952, + "step": 5793 + }, + { + "epoch": 0.31677213892267947, + "grad_norm": 1.6369779109954834, + "learning_rate": 1.6538557744808373e-05, + "loss": 1.5137, + "step": 5794 + }, + { + "epoch": 0.31682681136640106, + "grad_norm": 2.192450523376465, + "learning_rate": 1.653717502202109e-05, + "loss": 1.3965, + "step": 5795 + }, + { + "epoch": 0.3168814838101226, + "grad_norm": 2.320101261138916, + "learning_rate": 1.6535792080942194e-05, + "loss": 1.4582, + "step": 5796 + }, + { + "epoch": 0.31693615625384414, + "grad_norm": 1.5593100786209106, + "learning_rate": 1.653440892161786e-05, + "loss": 1.2855, + "step": 5797 + }, + { + "epoch": 0.31699082869756573, + "grad_norm": 2.040018081665039, + "learning_rate": 1.6533025544094284e-05, + "loss": 1.5353, + "step": 5798 + }, + { + "epoch": 0.31704550114128727, + "grad_norm": 1.9068423509597778, + "learning_rate": 1.653164194841765e-05, + "loss": 1.4319, + "step": 5799 + }, + { + "epoch": 0.3171001735850088, + "grad_norm": 1.717298150062561, + "learning_rate": 1.6530258134634168e-05, + "loss": 1.2856, + "step": 5800 + }, + { + "epoch": 0.31715484602873034, + "grad_norm": 1.491400957107544, + "learning_rate": 1.652887410279004e-05, + "loss": 1.3605, + "step": 5801 + }, + { + "epoch": 0.31720951847245193, + "grad_norm": 1.9480496644973755, + "learning_rate": 1.652748985293149e-05, + "loss": 1.4709, + "step": 5802 + }, + { + "epoch": 0.31726419091617347, + "grad_norm": 1.8214280605316162, + "learning_rate": 1.652610538510473e-05, + "loss": 1.4985, + "step": 5803 + }, + { + "epoch": 0.317318863359895, + "grad_norm": 1.2628570795059204, + "learning_rate": 1.6524720699356e-05, + "loss": 1.6194, + "step": 5804 + }, + { + "epoch": 0.3173735358036166, + "grad_norm": 2.251311779022217, + "learning_rate": 1.652333579573154e-05, + "loss": 1.3996, + "step": 5805 + }, + { + "epoch": 0.31742820824733814, + "grad_norm": 1.2616595029830933, + "learning_rate": 1.6521950674277585e-05, + "loss": 1.5827, + "step": 5806 + }, + { + "epoch": 0.3174828806910597, + "grad_norm": 1.6198179721832275, + "learning_rate": 1.6520565335040392e-05, + "loss": 1.2287, + "step": 5807 + }, + { + "epoch": 0.3175375531347812, + "grad_norm": 2.0165910720825195, + "learning_rate": 1.6519179778066226e-05, + "loss": 1.3049, + "step": 5808 + }, + { + "epoch": 0.3175922255785028, + "grad_norm": 1.6210967302322388, + "learning_rate": 1.6517794003401345e-05, + "loss": 1.4794, + "step": 5809 + }, + { + "epoch": 0.31764689802222434, + "grad_norm": 1.250365972518921, + "learning_rate": 1.651640801109203e-05, + "loss": 1.497, + "step": 5810 + }, + { + "epoch": 0.3177015704659459, + "grad_norm": 1.605764389038086, + "learning_rate": 1.651502180118456e-05, + "loss": 1.6344, + "step": 5811 + }, + { + "epoch": 0.3177562429096675, + "grad_norm": 1.7318577766418457, + "learning_rate": 1.6513635373725224e-05, + "loss": 1.3566, + "step": 5812 + }, + { + "epoch": 0.317810915353389, + "grad_norm": 1.3641326427459717, + "learning_rate": 1.6512248728760316e-05, + "loss": 1.6836, + "step": 5813 + }, + { + "epoch": 0.31786558779711055, + "grad_norm": 1.318778395652771, + "learning_rate": 1.6510861866336145e-05, + "loss": 1.5594, + "step": 5814 + }, + { + "epoch": 0.31792026024083214, + "grad_norm": 1.3115595579147339, + "learning_rate": 1.6509474786499017e-05, + "loss": 1.4101, + "step": 5815 + }, + { + "epoch": 0.3179749326845537, + "grad_norm": 1.6657803058624268, + "learning_rate": 1.650808748929525e-05, + "loss": 1.6471, + "step": 5816 + }, + { + "epoch": 0.3180296051282752, + "grad_norm": 1.369043231010437, + "learning_rate": 1.6506699974771174e-05, + "loss": 1.4943, + "step": 5817 + }, + { + "epoch": 0.31808427757199675, + "grad_norm": 1.2213836908340454, + "learning_rate": 1.650531224297311e-05, + "loss": 1.3909, + "step": 5818 + }, + { + "epoch": 0.31813895001571835, + "grad_norm": 1.7958632707595825, + "learning_rate": 1.6503924293947408e-05, + "loss": 1.6626, + "step": 5819 + }, + { + "epoch": 0.3181936224594399, + "grad_norm": 1.605946660041809, + "learning_rate": 1.6502536127740414e-05, + "loss": 1.4687, + "step": 5820 + }, + { + "epoch": 0.3182482949031614, + "grad_norm": 1.5615991353988647, + "learning_rate": 1.650114774439848e-05, + "loss": 1.4499, + "step": 5821 + }, + { + "epoch": 0.318302967346883, + "grad_norm": 1.4232996702194214, + "learning_rate": 1.6499759143967966e-05, + "loss": 1.4823, + "step": 5822 + }, + { + "epoch": 0.31835763979060455, + "grad_norm": 1.5701905488967896, + "learning_rate": 1.6498370326495242e-05, + "loss": 1.4264, + "step": 5823 + }, + { + "epoch": 0.3184123122343261, + "grad_norm": 1.4258816242218018, + "learning_rate": 1.6496981292026687e-05, + "loss": 1.3919, + "step": 5824 + }, + { + "epoch": 0.3184669846780476, + "grad_norm": 2.1093685626983643, + "learning_rate": 1.6495592040608677e-05, + "loss": 1.2893, + "step": 5825 + }, + { + "epoch": 0.3185216571217692, + "grad_norm": 1.2020189762115479, + "learning_rate": 1.6494202572287607e-05, + "loss": 1.6082, + "step": 5826 + }, + { + "epoch": 0.31857632956549076, + "grad_norm": 1.5557538270950317, + "learning_rate": 1.6492812887109876e-05, + "loss": 1.1604, + "step": 5827 + }, + { + "epoch": 0.3186310020092123, + "grad_norm": 1.312145471572876, + "learning_rate": 1.6491422985121882e-05, + "loss": 1.6176, + "step": 5828 + }, + { + "epoch": 0.3186856744529339, + "grad_norm": 1.8217015266418457, + "learning_rate": 1.6490032866370046e-05, + "loss": 1.7657, + "step": 5829 + }, + { + "epoch": 0.3187403468966554, + "grad_norm": 1.6675519943237305, + "learning_rate": 1.648864253090078e-05, + "loss": 1.4414, + "step": 5830 + }, + { + "epoch": 0.31879501934037696, + "grad_norm": 2.1066951751708984, + "learning_rate": 1.648725197876052e-05, + "loss": 1.4647, + "step": 5831 + }, + { + "epoch": 0.3188496917840985, + "grad_norm": 1.1772336959838867, + "learning_rate": 1.648586120999569e-05, + "loss": 1.5233, + "step": 5832 + }, + { + "epoch": 0.3189043642278201, + "grad_norm": 1.5028200149536133, + "learning_rate": 1.6484470224652734e-05, + "loss": 1.2291, + "step": 5833 + }, + { + "epoch": 0.31895903667154163, + "grad_norm": 1.4346224069595337, + "learning_rate": 1.6483079022778102e-05, + "loss": 1.4995, + "step": 5834 + }, + { + "epoch": 0.31901370911526317, + "grad_norm": 1.6425400972366333, + "learning_rate": 1.648168760441825e-05, + "loss": 1.2549, + "step": 5835 + }, + { + "epoch": 0.31906838155898476, + "grad_norm": 1.4147257804870605, + "learning_rate": 1.6480295969619636e-05, + "loss": 1.5485, + "step": 5836 + }, + { + "epoch": 0.3191230540027063, + "grad_norm": 1.7047321796417236, + "learning_rate": 1.6478904118428735e-05, + "loss": 1.4772, + "step": 5837 + }, + { + "epoch": 0.31917772644642783, + "grad_norm": 1.3449469804763794, + "learning_rate": 1.647751205089202e-05, + "loss": 1.4579, + "step": 5838 + }, + { + "epoch": 0.31923239889014937, + "grad_norm": 1.6510019302368164, + "learning_rate": 1.647611976705598e-05, + "loss": 1.4283, + "step": 5839 + }, + { + "epoch": 0.31928707133387096, + "grad_norm": 1.8364864587783813, + "learning_rate": 1.64747272669671e-05, + "loss": 1.1827, + "step": 5840 + }, + { + "epoch": 0.3193417437775925, + "grad_norm": 1.3264044523239136, + "learning_rate": 1.6473334550671887e-05, + "loss": 1.7148, + "step": 5841 + }, + { + "epoch": 0.31939641622131404, + "grad_norm": 1.479718804359436, + "learning_rate": 1.6471941618216845e-05, + "loss": 1.2956, + "step": 5842 + }, + { + "epoch": 0.31945108866503563, + "grad_norm": 1.4936575889587402, + "learning_rate": 1.6470548469648486e-05, + "loss": 1.4995, + "step": 5843 + }, + { + "epoch": 0.31950576110875717, + "grad_norm": 1.427658200263977, + "learning_rate": 1.6469155105013324e-05, + "loss": 1.3493, + "step": 5844 + }, + { + "epoch": 0.3195604335524787, + "grad_norm": 1.487788438796997, + "learning_rate": 1.6467761524357896e-05, + "loss": 1.5895, + "step": 5845 + }, + { + "epoch": 0.31961510599620024, + "grad_norm": 1.189601182937622, + "learning_rate": 1.6466367727728735e-05, + "loss": 1.5563, + "step": 5846 + }, + { + "epoch": 0.31966977843992184, + "grad_norm": 1.2543516159057617, + "learning_rate": 1.646497371517238e-05, + "loss": 1.8113, + "step": 5847 + }, + { + "epoch": 0.3197244508836434, + "grad_norm": 1.447181224822998, + "learning_rate": 1.6463579486735383e-05, + "loss": 1.4255, + "step": 5848 + }, + { + "epoch": 0.3197791233273649, + "grad_norm": 1.805130958557129, + "learning_rate": 1.6462185042464298e-05, + "loss": 1.3444, + "step": 5849 + }, + { + "epoch": 0.3198337957710865, + "grad_norm": 1.655932068824768, + "learning_rate": 1.6460790382405688e-05, + "loss": 1.4254, + "step": 5850 + }, + { + "epoch": 0.31988846821480804, + "grad_norm": 1.415208101272583, + "learning_rate": 1.6459395506606133e-05, + "loss": 1.3761, + "step": 5851 + }, + { + "epoch": 0.3199431406585296, + "grad_norm": 1.3051152229309082, + "learning_rate": 1.64580004151122e-05, + "loss": 1.2037, + "step": 5852 + }, + { + "epoch": 0.3199978131022511, + "grad_norm": 1.5193973779678345, + "learning_rate": 1.645660510797048e-05, + "loss": 1.3064, + "step": 5853 + }, + { + "epoch": 0.3200524855459727, + "grad_norm": 1.3028883934020996, + "learning_rate": 1.6455209585227568e-05, + "loss": 1.5448, + "step": 5854 + }, + { + "epoch": 0.32010715798969425, + "grad_norm": 1.7911208868026733, + "learning_rate": 1.6453813846930057e-05, + "loss": 1.6871, + "step": 5855 + }, + { + "epoch": 0.3201618304334158, + "grad_norm": 1.8693159818649292, + "learning_rate": 1.645241789312456e-05, + "loss": 1.5984, + "step": 5856 + }, + { + "epoch": 0.3202165028771374, + "grad_norm": 1.546687126159668, + "learning_rate": 1.6451021723857683e-05, + "loss": 1.2153, + "step": 5857 + }, + { + "epoch": 0.3202711753208589, + "grad_norm": 1.7817530632019043, + "learning_rate": 1.6449625339176056e-05, + "loss": 1.4163, + "step": 5858 + }, + { + "epoch": 0.32032584776458045, + "grad_norm": 1.1899880170822144, + "learning_rate": 1.6448228739126302e-05, + "loss": 1.5055, + "step": 5859 + }, + { + "epoch": 0.320380520208302, + "grad_norm": 1.625530481338501, + "learning_rate": 1.6446831923755065e-05, + "loss": 1.376, + "step": 5860 + }, + { + "epoch": 0.3204351926520236, + "grad_norm": 1.3670192956924438, + "learning_rate": 1.6445434893108978e-05, + "loss": 1.4069, + "step": 5861 + }, + { + "epoch": 0.3204898650957451, + "grad_norm": 1.596150517463684, + "learning_rate": 1.6444037647234695e-05, + "loss": 1.3745, + "step": 5862 + }, + { + "epoch": 0.32054453753946666, + "grad_norm": 1.507590889930725, + "learning_rate": 1.6442640186178875e-05, + "loss": 1.3622, + "step": 5863 + }, + { + "epoch": 0.32059920998318825, + "grad_norm": 1.4340671300888062, + "learning_rate": 1.644124250998818e-05, + "loss": 1.5045, + "step": 5864 + }, + { + "epoch": 0.3206538824269098, + "grad_norm": 1.796250581741333, + "learning_rate": 1.6439844618709285e-05, + "loss": 1.5182, + "step": 5865 + }, + { + "epoch": 0.3207085548706313, + "grad_norm": 2.0030579566955566, + "learning_rate": 1.6438446512388862e-05, + "loss": 1.4696, + "step": 5866 + }, + { + "epoch": 0.32076322731435286, + "grad_norm": 1.660475730895996, + "learning_rate": 1.6437048191073608e-05, + "loss": 1.3742, + "step": 5867 + }, + { + "epoch": 0.32081789975807445, + "grad_norm": 1.5916005373001099, + "learning_rate": 1.6435649654810204e-05, + "loss": 1.2698, + "step": 5868 + }, + { + "epoch": 0.320872572201796, + "grad_norm": 1.6243122816085815, + "learning_rate": 1.6434250903645356e-05, + "loss": 1.2347, + "step": 5869 + }, + { + "epoch": 0.32092724464551753, + "grad_norm": 1.6062231063842773, + "learning_rate": 1.6432851937625776e-05, + "loss": 1.5006, + "step": 5870 + }, + { + "epoch": 0.3209819170892391, + "grad_norm": 1.6805064678192139, + "learning_rate": 1.6431452756798174e-05, + "loss": 1.2726, + "step": 5871 + }, + { + "epoch": 0.32103658953296066, + "grad_norm": 1.3994314670562744, + "learning_rate": 1.6430053361209274e-05, + "loss": 1.5209, + "step": 5872 + }, + { + "epoch": 0.3210912619766822, + "grad_norm": 1.6194164752960205, + "learning_rate": 1.64286537509058e-05, + "loss": 1.5464, + "step": 5873 + }, + { + "epoch": 0.32114593442040373, + "grad_norm": 1.6726906299591064, + "learning_rate": 1.6427253925934496e-05, + "loss": 1.188, + "step": 5874 + }, + { + "epoch": 0.3212006068641253, + "grad_norm": 1.8461153507232666, + "learning_rate": 1.6425853886342096e-05, + "loss": 1.223, + "step": 5875 + }, + { + "epoch": 0.32125527930784686, + "grad_norm": 1.3911124467849731, + "learning_rate": 1.642445363217536e-05, + "loss": 1.584, + "step": 5876 + }, + { + "epoch": 0.3213099517515684, + "grad_norm": 2.0033767223358154, + "learning_rate": 1.6423053163481042e-05, + "loss": 1.39, + "step": 5877 + }, + { + "epoch": 0.32136462419529, + "grad_norm": 1.3960609436035156, + "learning_rate": 1.6421652480305904e-05, + "loss": 1.2409, + "step": 5878 + }, + { + "epoch": 0.32141929663901153, + "grad_norm": 1.8475284576416016, + "learning_rate": 1.642025158269672e-05, + "loss": 1.721, + "step": 5879 + }, + { + "epoch": 0.32147396908273307, + "grad_norm": 1.5772515535354614, + "learning_rate": 1.6418850470700274e-05, + "loss": 1.5452, + "step": 5880 + }, + { + "epoch": 0.3215286415264546, + "grad_norm": 1.218491792678833, + "learning_rate": 1.6417449144363346e-05, + "loss": 1.5506, + "step": 5881 + }, + { + "epoch": 0.3215833139701762, + "grad_norm": 1.6131445169448853, + "learning_rate": 1.6416047603732734e-05, + "loss": 1.3562, + "step": 5882 + }, + { + "epoch": 0.32163798641389774, + "grad_norm": 1.6504064798355103, + "learning_rate": 1.6414645848855234e-05, + "loss": 1.5019, + "step": 5883 + }, + { + "epoch": 0.3216926588576193, + "grad_norm": 1.4898868799209595, + "learning_rate": 1.6413243879777657e-05, + "loss": 1.5001, + "step": 5884 + }, + { + "epoch": 0.32174733130134087, + "grad_norm": 1.6594946384429932, + "learning_rate": 1.641184169654682e-05, + "loss": 1.524, + "step": 5885 + }, + { + "epoch": 0.3218020037450624, + "grad_norm": 1.3069629669189453, + "learning_rate": 1.641043929920954e-05, + "loss": 1.4783, + "step": 5886 + }, + { + "epoch": 0.32185667618878394, + "grad_norm": 1.4685348272323608, + "learning_rate": 1.6409036687812654e-05, + "loss": 1.4564, + "step": 5887 + }, + { + "epoch": 0.3219113486325055, + "grad_norm": 1.4284093379974365, + "learning_rate": 1.640763386240299e-05, + "loss": 1.5237, + "step": 5888 + }, + { + "epoch": 0.32196602107622707, + "grad_norm": 1.4464727640151978, + "learning_rate": 1.6406230823027398e-05, + "loss": 1.4202, + "step": 5889 + }, + { + "epoch": 0.3220206935199486, + "grad_norm": 1.368114709854126, + "learning_rate": 1.640482756973272e-05, + "loss": 1.3844, + "step": 5890 + }, + { + "epoch": 0.32207536596367015, + "grad_norm": 1.4663636684417725, + "learning_rate": 1.6403424102565826e-05, + "loss": 1.4138, + "step": 5891 + }, + { + "epoch": 0.32213003840739174, + "grad_norm": 1.5902433395385742, + "learning_rate": 1.640202042157357e-05, + "loss": 1.5335, + "step": 5892 + }, + { + "epoch": 0.3221847108511133, + "grad_norm": 1.1215516328811646, + "learning_rate": 1.6400616526802835e-05, + "loss": 1.5796, + "step": 5893 + }, + { + "epoch": 0.3222393832948348, + "grad_norm": 1.3819677829742432, + "learning_rate": 1.6399212418300496e-05, + "loss": 1.4409, + "step": 5894 + }, + { + "epoch": 0.32229405573855635, + "grad_norm": 1.5084264278411865, + "learning_rate": 1.639780809611343e-05, + "loss": 1.5433, + "step": 5895 + }, + { + "epoch": 0.32234872818227794, + "grad_norm": 1.6950873136520386, + "learning_rate": 1.6396403560288546e-05, + "loss": 1.2019, + "step": 5896 + }, + { + "epoch": 0.3224034006259995, + "grad_norm": 1.5814874172210693, + "learning_rate": 1.6394998810872734e-05, + "loss": 1.4406, + "step": 5897 + }, + { + "epoch": 0.322458073069721, + "grad_norm": 1.8047724962234497, + "learning_rate": 1.6393593847912905e-05, + "loss": 1.5226, + "step": 5898 + }, + { + "epoch": 0.3225127455134426, + "grad_norm": 1.6045498847961426, + "learning_rate": 1.6392188671455976e-05, + "loss": 1.3761, + "step": 5899 + }, + { + "epoch": 0.32256741795716415, + "grad_norm": 1.2546709775924683, + "learning_rate": 1.6390783281548865e-05, + "loss": 1.4466, + "step": 5900 + }, + { + "epoch": 0.3226220904008857, + "grad_norm": 1.3909907341003418, + "learning_rate": 1.6389377678238508e-05, + "loss": 1.4241, + "step": 5901 + }, + { + "epoch": 0.3226767628446072, + "grad_norm": 1.3747189044952393, + "learning_rate": 1.6387971861571834e-05, + "loss": 1.4095, + "step": 5902 + }, + { + "epoch": 0.3227314352883288, + "grad_norm": 1.694595456123352, + "learning_rate": 1.638656583159579e-05, + "loss": 1.5994, + "step": 5903 + }, + { + "epoch": 0.32278610773205035, + "grad_norm": 1.9099870920181274, + "learning_rate": 1.6385159588357327e-05, + "loss": 1.5659, + "step": 5904 + }, + { + "epoch": 0.3228407801757719, + "grad_norm": 1.2921922206878662, + "learning_rate": 1.6383753131903405e-05, + "loss": 1.5526, + "step": 5905 + }, + { + "epoch": 0.3228954526194935, + "grad_norm": 1.3074696063995361, + "learning_rate": 1.638234646228098e-05, + "loss": 1.4809, + "step": 5906 + }, + { + "epoch": 0.322950125063215, + "grad_norm": 2.253556728363037, + "learning_rate": 1.6380939579537033e-05, + "loss": 1.4688, + "step": 5907 + }, + { + "epoch": 0.32300479750693656, + "grad_norm": 1.4052650928497314, + "learning_rate": 1.6379532483718543e-05, + "loss": 1.4674, + "step": 5908 + }, + { + "epoch": 0.3230594699506581, + "grad_norm": 1.454760193824768, + "learning_rate": 1.6378125174872486e-05, + "loss": 1.3774, + "step": 5909 + }, + { + "epoch": 0.3231141423943797, + "grad_norm": 1.4209767580032349, + "learning_rate": 1.637671765304587e-05, + "loss": 1.3472, + "step": 5910 + }, + { + "epoch": 0.3231688148381012, + "grad_norm": 1.157456874847412, + "learning_rate": 1.637530991828568e-05, + "loss": 1.5684, + "step": 5911 + }, + { + "epoch": 0.32322348728182276, + "grad_norm": 1.297343373298645, + "learning_rate": 1.6373901970638943e-05, + "loss": 1.4071, + "step": 5912 + }, + { + "epoch": 0.32327815972554436, + "grad_norm": 1.4741777181625366, + "learning_rate": 1.6372493810152655e-05, + "loss": 1.5681, + "step": 5913 + }, + { + "epoch": 0.3233328321692659, + "grad_norm": 1.7242238521575928, + "learning_rate": 1.6371085436873847e-05, + "loss": 1.5686, + "step": 5914 + }, + { + "epoch": 0.32338750461298743, + "grad_norm": 1.3227479457855225, + "learning_rate": 1.6369676850849547e-05, + "loss": 1.4211, + "step": 5915 + }, + { + "epoch": 0.32344217705670897, + "grad_norm": 1.830773949623108, + "learning_rate": 1.6368268052126787e-05, + "loss": 1.5753, + "step": 5916 + }, + { + "epoch": 0.32349684950043056, + "grad_norm": 1.758670449256897, + "learning_rate": 1.6366859040752614e-05, + "loss": 1.5251, + "step": 5917 + }, + { + "epoch": 0.3235515219441521, + "grad_norm": 1.9845021963119507, + "learning_rate": 1.6365449816774076e-05, + "loss": 1.3948, + "step": 5918 + }, + { + "epoch": 0.32360619438787364, + "grad_norm": 1.5377963781356812, + "learning_rate": 1.6364040380238234e-05, + "loss": 1.5326, + "step": 5919 + }, + { + "epoch": 0.32366086683159523, + "grad_norm": 1.7229623794555664, + "learning_rate": 1.6362630731192152e-05, + "loss": 1.2215, + "step": 5920 + }, + { + "epoch": 0.32371553927531677, + "grad_norm": 1.3778223991394043, + "learning_rate": 1.6361220869682896e-05, + "loss": 1.5055, + "step": 5921 + }, + { + "epoch": 0.3237702117190383, + "grad_norm": 1.5744743347167969, + "learning_rate": 1.635981079575755e-05, + "loss": 1.3724, + "step": 5922 + }, + { + "epoch": 0.32382488416275984, + "grad_norm": 1.422885775566101, + "learning_rate": 1.6358400509463198e-05, + "loss": 1.4335, + "step": 5923 + }, + { + "epoch": 0.32387955660648143, + "grad_norm": 1.6847443580627441, + "learning_rate": 1.635699001084693e-05, + "loss": 1.6578, + "step": 5924 + }, + { + "epoch": 0.32393422905020297, + "grad_norm": 2.483816146850586, + "learning_rate": 1.635557929995585e-05, + "loss": 1.2376, + "step": 5925 + }, + { + "epoch": 0.3239889014939245, + "grad_norm": 1.6153591871261597, + "learning_rate": 1.6354168376837063e-05, + "loss": 1.5151, + "step": 5926 + }, + { + "epoch": 0.3240435739376461, + "grad_norm": 1.3823977708816528, + "learning_rate": 1.6352757241537682e-05, + "loss": 1.4867, + "step": 5927 + }, + { + "epoch": 0.32409824638136764, + "grad_norm": 1.4226419925689697, + "learning_rate": 1.635134589410483e-05, + "loss": 1.244, + "step": 5928 + }, + { + "epoch": 0.3241529188250892, + "grad_norm": 1.2371376752853394, + "learning_rate": 1.634993433458564e-05, + "loss": 1.4531, + "step": 5929 + }, + { + "epoch": 0.3242075912688107, + "grad_norm": 1.8745771646499634, + "learning_rate": 1.6348522563027236e-05, + "loss": 1.6457, + "step": 5930 + }, + { + "epoch": 0.3242622637125323, + "grad_norm": 1.603891134262085, + "learning_rate": 1.634711057947677e-05, + "loss": 1.6691, + "step": 5931 + }, + { + "epoch": 0.32431693615625384, + "grad_norm": 1.563238501548767, + "learning_rate": 1.6345698383981387e-05, + "loss": 1.4412, + "step": 5932 + }, + { + "epoch": 0.3243716085999754, + "grad_norm": 1.474300503730774, + "learning_rate": 1.634428597658824e-05, + "loss": 1.5911, + "step": 5933 + }, + { + "epoch": 0.324426281043697, + "grad_norm": 1.4002948999404907, + "learning_rate": 1.6342873357344503e-05, + "loss": 1.5293, + "step": 5934 + }, + { + "epoch": 0.3244809534874185, + "grad_norm": 1.4771777391433716, + "learning_rate": 1.6341460526297335e-05, + "loss": 1.5459, + "step": 5935 + }, + { + "epoch": 0.32453562593114005, + "grad_norm": 1.5076656341552734, + "learning_rate": 1.6340047483493923e-05, + "loss": 1.3451, + "step": 5936 + }, + { + "epoch": 0.3245902983748616, + "grad_norm": 1.5860246419906616, + "learning_rate": 1.633863422898145e-05, + "loss": 1.574, + "step": 5937 + }, + { + "epoch": 0.3246449708185832, + "grad_norm": 1.8386356830596924, + "learning_rate": 1.63372207628071e-05, + "loss": 1.4398, + "step": 5938 + }, + { + "epoch": 0.3246996432623047, + "grad_norm": 1.435640811920166, + "learning_rate": 1.6335807085018082e-05, + "loss": 1.1287, + "step": 5939 + }, + { + "epoch": 0.32475431570602625, + "grad_norm": 1.1950008869171143, + "learning_rate": 1.6334393195661597e-05, + "loss": 1.5108, + "step": 5940 + }, + { + "epoch": 0.32480898814974785, + "grad_norm": 1.5561752319335938, + "learning_rate": 1.6332979094784857e-05, + "loss": 1.4462, + "step": 5941 + }, + { + "epoch": 0.3248636605934694, + "grad_norm": 1.8555152416229248, + "learning_rate": 1.6331564782435087e-05, + "loss": 1.4033, + "step": 5942 + }, + { + "epoch": 0.3249183330371909, + "grad_norm": 1.2394565343856812, + "learning_rate": 1.6330150258659513e-05, + "loss": 1.5887, + "step": 5943 + }, + { + "epoch": 0.32497300548091246, + "grad_norm": 1.3754161596298218, + "learning_rate": 1.6328735523505366e-05, + "loss": 1.6551, + "step": 5944 + }, + { + "epoch": 0.32502767792463405, + "grad_norm": 1.4546616077423096, + "learning_rate": 1.6327320577019887e-05, + "loss": 1.3712, + "step": 5945 + }, + { + "epoch": 0.3250823503683556, + "grad_norm": 1.5848721265792847, + "learning_rate": 1.6325905419250327e-05, + "loss": 1.2896, + "step": 5946 + }, + { + "epoch": 0.3251370228120771, + "grad_norm": 1.4583696126937866, + "learning_rate": 1.6324490050243943e-05, + "loss": 1.4775, + "step": 5947 + }, + { + "epoch": 0.3251916952557987, + "grad_norm": 1.2419092655181885, + "learning_rate": 1.6323074470047993e-05, + "loss": 1.4351, + "step": 5948 + }, + { + "epoch": 0.32524636769952026, + "grad_norm": 1.4045954942703247, + "learning_rate": 1.6321658678709752e-05, + "loss": 1.6347, + "step": 5949 + }, + { + "epoch": 0.3253010401432418, + "grad_norm": 2.0222935676574707, + "learning_rate": 1.632024267627649e-05, + "loss": 1.2568, + "step": 5950 + }, + { + "epoch": 0.32535571258696333, + "grad_norm": 1.6759454011917114, + "learning_rate": 1.6318826462795497e-05, + "loss": 1.4599, + "step": 5951 + }, + { + "epoch": 0.3254103850306849, + "grad_norm": 1.789064884185791, + "learning_rate": 1.631741003831406e-05, + "loss": 1.5308, + "step": 5952 + }, + { + "epoch": 0.32546505747440646, + "grad_norm": 1.5616536140441895, + "learning_rate": 1.6315993402879477e-05, + "loss": 1.6327, + "step": 5953 + }, + { + "epoch": 0.325519729918128, + "grad_norm": 1.649428129196167, + "learning_rate": 1.6314576556539053e-05, + "loss": 1.3383, + "step": 5954 + }, + { + "epoch": 0.3255744023618496, + "grad_norm": 1.4267042875289917, + "learning_rate": 1.63131594993401e-05, + "loss": 1.3651, + "step": 5955 + }, + { + "epoch": 0.32562907480557113, + "grad_norm": 1.5469027757644653, + "learning_rate": 1.6311742231329936e-05, + "loss": 1.6999, + "step": 5956 + }, + { + "epoch": 0.32568374724929267, + "grad_norm": 1.9685754776000977, + "learning_rate": 1.6310324752555893e-05, + "loss": 1.322, + "step": 5957 + }, + { + "epoch": 0.3257384196930142, + "grad_norm": 2.2693657875061035, + "learning_rate": 1.6308907063065294e-05, + "loss": 1.2857, + "step": 5958 + }, + { + "epoch": 0.3257930921367358, + "grad_norm": 1.9068039655685425, + "learning_rate": 1.6307489162905485e-05, + "loss": 1.3348, + "step": 5959 + }, + { + "epoch": 0.32584776458045733, + "grad_norm": 1.350164532661438, + "learning_rate": 1.630607105212381e-05, + "loss": 1.4875, + "step": 5960 + }, + { + "epoch": 0.32590243702417887, + "grad_norm": 2.331821918487549, + "learning_rate": 1.6304652730767628e-05, + "loss": 1.2727, + "step": 5961 + }, + { + "epoch": 0.32595710946790046, + "grad_norm": 1.2651468515396118, + "learning_rate": 1.6303234198884298e-05, + "loss": 1.296, + "step": 5962 + }, + { + "epoch": 0.326011781911622, + "grad_norm": 1.5397212505340576, + "learning_rate": 1.6301815456521185e-05, + "loss": 1.4821, + "step": 5963 + }, + { + "epoch": 0.32606645435534354, + "grad_norm": 2.0640621185302734, + "learning_rate": 1.6300396503725665e-05, + "loss": 1.6496, + "step": 5964 + }, + { + "epoch": 0.3261211267990651, + "grad_norm": 1.5553789138793945, + "learning_rate": 1.6298977340545126e-05, + "loss": 1.3197, + "step": 5965 + }, + { + "epoch": 0.32617579924278667, + "grad_norm": 1.5211446285247803, + "learning_rate": 1.629755796702695e-05, + "loss": 1.4675, + "step": 5966 + }, + { + "epoch": 0.3262304716865082, + "grad_norm": 1.1621954441070557, + "learning_rate": 1.6296138383218534e-05, + "loss": 1.4768, + "step": 5967 + }, + { + "epoch": 0.32628514413022974, + "grad_norm": 1.3226227760314941, + "learning_rate": 1.6294718589167283e-05, + "loss": 1.5321, + "step": 5968 + }, + { + "epoch": 0.32633981657395134, + "grad_norm": 2.502268075942993, + "learning_rate": 1.629329858492061e-05, + "loss": 1.0286, + "step": 5969 + }, + { + "epoch": 0.3263944890176729, + "grad_norm": 1.5575119256973267, + "learning_rate": 1.6291878370525925e-05, + "loss": 1.5025, + "step": 5970 + }, + { + "epoch": 0.3264491614613944, + "grad_norm": 1.7716927528381348, + "learning_rate": 1.6290457946030663e-05, + "loss": 1.2632, + "step": 5971 + }, + { + "epoch": 0.32650383390511595, + "grad_norm": 1.8056230545043945, + "learning_rate": 1.6289037311482245e-05, + "loss": 1.6756, + "step": 5972 + }, + { + "epoch": 0.32655850634883754, + "grad_norm": 1.2951143980026245, + "learning_rate": 1.6287616466928112e-05, + "loss": 1.5531, + "step": 5973 + }, + { + "epoch": 0.3266131787925591, + "grad_norm": 1.9783079624176025, + "learning_rate": 1.6286195412415714e-05, + "loss": 1.2477, + "step": 5974 + }, + { + "epoch": 0.3266678512362806, + "grad_norm": 1.2854588031768799, + "learning_rate": 1.62847741479925e-05, + "loss": 1.9488, + "step": 5975 + }, + { + "epoch": 0.3267225236800022, + "grad_norm": 1.4913558959960938, + "learning_rate": 1.628335267370593e-05, + "loss": 1.5122, + "step": 5976 + }, + { + "epoch": 0.32677719612372375, + "grad_norm": 1.8351476192474365, + "learning_rate": 1.6281930989603466e-05, + "loss": 1.5954, + "step": 5977 + }, + { + "epoch": 0.3268318685674453, + "grad_norm": 1.7368512153625488, + "learning_rate": 1.628050909573259e-05, + "loss": 1.53, + "step": 5978 + }, + { + "epoch": 0.3268865410111668, + "grad_norm": 1.6197547912597656, + "learning_rate": 1.6279086992140777e-05, + "loss": 1.3667, + "step": 5979 + }, + { + "epoch": 0.3269412134548884, + "grad_norm": 1.5114425420761108, + "learning_rate": 1.6277664678875514e-05, + "loss": 1.6027, + "step": 5980 + }, + { + "epoch": 0.32699588589860995, + "grad_norm": 1.7311809062957764, + "learning_rate": 1.6276242155984295e-05, + "loss": 1.3901, + "step": 5981 + }, + { + "epoch": 0.3270505583423315, + "grad_norm": 1.2341984510421753, + "learning_rate": 1.6274819423514624e-05, + "loss": 1.4358, + "step": 5982 + }, + { + "epoch": 0.3271052307860531, + "grad_norm": 1.5656088590621948, + "learning_rate": 1.6273396481514007e-05, + "loss": 1.5761, + "step": 5983 + }, + { + "epoch": 0.3271599032297746, + "grad_norm": 1.625704288482666, + "learning_rate": 1.627197333002996e-05, + "loss": 1.3612, + "step": 5984 + }, + { + "epoch": 0.32721457567349616, + "grad_norm": 1.532485008239746, + "learning_rate": 1.6270549969110012e-05, + "loss": 1.5029, + "step": 5985 + }, + { + "epoch": 0.3272692481172177, + "grad_norm": 1.97719407081604, + "learning_rate": 1.626912639880168e-05, + "loss": 1.4479, + "step": 5986 + }, + { + "epoch": 0.3273239205609393, + "grad_norm": 1.9732948541641235, + "learning_rate": 1.6267702619152508e-05, + "loss": 1.3814, + "step": 5987 + }, + { + "epoch": 0.3273785930046608, + "grad_norm": 1.4113465547561646, + "learning_rate": 1.6266278630210036e-05, + "loss": 1.2841, + "step": 5988 + }, + { + "epoch": 0.32743326544838236, + "grad_norm": 1.1643115282058716, + "learning_rate": 1.626485443202182e-05, + "loss": 1.5545, + "step": 5989 + }, + { + "epoch": 0.32748793789210395, + "grad_norm": 1.4053397178649902, + "learning_rate": 1.626343002463541e-05, + "loss": 1.5978, + "step": 5990 + }, + { + "epoch": 0.3275426103358255, + "grad_norm": 1.4151653051376343, + "learning_rate": 1.6262005408098378e-05, + "loss": 1.5804, + "step": 5991 + }, + { + "epoch": 0.327597282779547, + "grad_norm": 1.5909370183944702, + "learning_rate": 1.626058058245829e-05, + "loss": 1.3093, + "step": 5992 + }, + { + "epoch": 0.32765195522326857, + "grad_norm": 1.634979486465454, + "learning_rate": 1.625915554776272e-05, + "loss": 1.5919, + "step": 5993 + }, + { + "epoch": 0.32770662766699016, + "grad_norm": 1.4495446681976318, + "learning_rate": 1.6257730304059265e-05, + "loss": 1.2106, + "step": 5994 + }, + { + "epoch": 0.3277613001107117, + "grad_norm": 1.0562894344329834, + "learning_rate": 1.625630485139551e-05, + "loss": 1.6611, + "step": 5995 + }, + { + "epoch": 0.32781597255443323, + "grad_norm": 1.425374150276184, + "learning_rate": 1.6254879189819055e-05, + "loss": 1.5934, + "step": 5996 + }, + { + "epoch": 0.3278706449981548, + "grad_norm": 1.9463754892349243, + "learning_rate": 1.6253453319377504e-05, + "loss": 1.142, + "step": 5997 + }, + { + "epoch": 0.32792531744187636, + "grad_norm": 1.6999380588531494, + "learning_rate": 1.6252027240118472e-05, + "loss": 1.6324, + "step": 5998 + }, + { + "epoch": 0.3279799898855979, + "grad_norm": 1.5281206369400024, + "learning_rate": 1.6250600952089586e-05, + "loss": 1.4755, + "step": 5999 + }, + { + "epoch": 0.32803466232931944, + "grad_norm": 1.440613031387329, + "learning_rate": 1.624917445533846e-05, + "loss": 1.4037, + "step": 6000 + }, + { + "epoch": 0.32808933477304103, + "grad_norm": 1.5604127645492554, + "learning_rate": 1.6247747749912738e-05, + "loss": 1.6212, + "step": 6001 + }, + { + "epoch": 0.32814400721676257, + "grad_norm": 1.1082630157470703, + "learning_rate": 1.6246320835860053e-05, + "loss": 1.4002, + "step": 6002 + }, + { + "epoch": 0.3281986796604841, + "grad_norm": 1.4000383615493774, + "learning_rate": 1.6244893713228066e-05, + "loss": 1.6241, + "step": 6003 + }, + { + "epoch": 0.3282533521042057, + "grad_norm": 1.7968581914901733, + "learning_rate": 1.6243466382064418e-05, + "loss": 1.4022, + "step": 6004 + }, + { + "epoch": 0.32830802454792724, + "grad_norm": 2.3940634727478027, + "learning_rate": 1.624203884241678e-05, + "loss": 1.1539, + "step": 6005 + }, + { + "epoch": 0.3283626969916488, + "grad_norm": 1.5110255479812622, + "learning_rate": 1.6240611094332814e-05, + "loss": 1.4768, + "step": 6006 + }, + { + "epoch": 0.3284173694353703, + "grad_norm": 1.7314671277999878, + "learning_rate": 1.62391831378602e-05, + "loss": 1.3345, + "step": 6007 + }, + { + "epoch": 0.3284720418790919, + "grad_norm": 1.448157548904419, + "learning_rate": 1.6237754973046625e-05, + "loss": 1.4933, + "step": 6008 + }, + { + "epoch": 0.32852671432281344, + "grad_norm": 1.4119153022766113, + "learning_rate": 1.623632659993977e-05, + "loss": 1.3835, + "step": 6009 + }, + { + "epoch": 0.328581386766535, + "grad_norm": 1.5559113025665283, + "learning_rate": 1.6234898018587336e-05, + "loss": 1.469, + "step": 6010 + }, + { + "epoch": 0.32863605921025657, + "grad_norm": 1.6408624649047852, + "learning_rate": 1.6233469229037026e-05, + "loss": 1.5397, + "step": 6011 + }, + { + "epoch": 0.3286907316539781, + "grad_norm": 1.4426385164260864, + "learning_rate": 1.6232040231336556e-05, + "loss": 1.5122, + "step": 6012 + }, + { + "epoch": 0.32874540409769964, + "grad_norm": 1.3875378370285034, + "learning_rate": 1.6230611025533632e-05, + "loss": 1.4434, + "step": 6013 + }, + { + "epoch": 0.32880007654142124, + "grad_norm": 1.5983508825302124, + "learning_rate": 1.622918161167599e-05, + "loss": 1.2096, + "step": 6014 + }, + { + "epoch": 0.3288547489851428, + "grad_norm": 1.918715000152588, + "learning_rate": 1.6227751989811355e-05, + "loss": 1.3852, + "step": 6015 + }, + { + "epoch": 0.3289094214288643, + "grad_norm": 2.8980655670166016, + "learning_rate": 1.622632215998747e-05, + "loss": 1.417, + "step": 6016 + }, + { + "epoch": 0.32896409387258585, + "grad_norm": 1.4649361371994019, + "learning_rate": 1.622489212225207e-05, + "loss": 1.447, + "step": 6017 + }, + { + "epoch": 0.32901876631630744, + "grad_norm": 1.3239058256149292, + "learning_rate": 1.6223461876652922e-05, + "loss": 1.3463, + "step": 6018 + }, + { + "epoch": 0.329073438760029, + "grad_norm": 1.7337194681167603, + "learning_rate": 1.6222031423237776e-05, + "loss": 1.411, + "step": 6019 + }, + { + "epoch": 0.3291281112037505, + "grad_norm": 2.6641831398010254, + "learning_rate": 1.6220600762054403e-05, + "loss": 1.5415, + "step": 6020 + }, + { + "epoch": 0.3291827836474721, + "grad_norm": 1.430055022239685, + "learning_rate": 1.6219169893150568e-05, + "loss": 1.3489, + "step": 6021 + }, + { + "epoch": 0.32923745609119365, + "grad_norm": 1.587340235710144, + "learning_rate": 1.621773881657406e-05, + "loss": 1.4775, + "step": 6022 + }, + { + "epoch": 0.3292921285349152, + "grad_norm": 1.678181767463684, + "learning_rate": 1.621630753237266e-05, + "loss": 1.6315, + "step": 6023 + }, + { + "epoch": 0.3293468009786367, + "grad_norm": 1.5956225395202637, + "learning_rate": 1.6214876040594166e-05, + "loss": 1.4276, + "step": 6024 + }, + { + "epoch": 0.3294014734223583, + "grad_norm": 2.2131919860839844, + "learning_rate": 1.6213444341286376e-05, + "loss": 1.533, + "step": 6025 + }, + { + "epoch": 0.32945614586607985, + "grad_norm": 1.8777661323547363, + "learning_rate": 1.6212012434497103e-05, + "loss": 1.5909, + "step": 6026 + }, + { + "epoch": 0.3295108183098014, + "grad_norm": 1.104090929031372, + "learning_rate": 1.6210580320274157e-05, + "loss": 1.4634, + "step": 6027 + }, + { + "epoch": 0.329565490753523, + "grad_norm": 1.7158658504486084, + "learning_rate": 1.620914799866536e-05, + "loss": 1.598, + "step": 6028 + }, + { + "epoch": 0.3296201631972445, + "grad_norm": 2.7313790321350098, + "learning_rate": 1.6207715469718538e-05, + "loss": 1.2596, + "step": 6029 + }, + { + "epoch": 0.32967483564096606, + "grad_norm": 1.6080107688903809, + "learning_rate": 1.620628273348153e-05, + "loss": 1.6345, + "step": 6030 + }, + { + "epoch": 0.3297295080846876, + "grad_norm": 1.2699053287506104, + "learning_rate": 1.620484979000218e-05, + "loss": 1.6813, + "step": 6031 + }, + { + "epoch": 0.3297841805284092, + "grad_norm": 1.3535528182983398, + "learning_rate": 1.6203416639328334e-05, + "loss": 1.639, + "step": 6032 + }, + { + "epoch": 0.3298388529721307, + "grad_norm": 1.3694911003112793, + "learning_rate": 1.620198328150785e-05, + "loss": 1.2725, + "step": 6033 + }, + { + "epoch": 0.32989352541585226, + "grad_norm": 1.447665810585022, + "learning_rate": 1.6200549716588595e-05, + "loss": 1.1775, + "step": 6034 + }, + { + "epoch": 0.32994819785957386, + "grad_norm": 2.0367045402526855, + "learning_rate": 1.619911594461843e-05, + "loss": 1.4626, + "step": 6035 + }, + { + "epoch": 0.3300028703032954, + "grad_norm": 1.3873211145401, + "learning_rate": 1.619768196564524e-05, + "loss": 1.5801, + "step": 6036 + }, + { + "epoch": 0.33005754274701693, + "grad_norm": 1.4139379262924194, + "learning_rate": 1.6196247779716902e-05, + "loss": 1.5773, + "step": 6037 + }, + { + "epoch": 0.33011221519073847, + "grad_norm": 1.9437769651412964, + "learning_rate": 1.6194813386881314e-05, + "loss": 1.4953, + "step": 6038 + }, + { + "epoch": 0.33016688763446006, + "grad_norm": 1.647168755531311, + "learning_rate": 1.619337878718637e-05, + "loss": 1.678, + "step": 6039 + }, + { + "epoch": 0.3302215600781816, + "grad_norm": 1.451060175895691, + "learning_rate": 1.6191943980679975e-05, + "loss": 1.5577, + "step": 6040 + }, + { + "epoch": 0.33027623252190313, + "grad_norm": 1.525464415550232, + "learning_rate": 1.6190508967410043e-05, + "loss": 1.3821, + "step": 6041 + }, + { + "epoch": 0.3303309049656247, + "grad_norm": 1.280103087425232, + "learning_rate": 1.6189073747424485e-05, + "loss": 1.4701, + "step": 6042 + }, + { + "epoch": 0.33038557740934626, + "grad_norm": 1.4729294776916504, + "learning_rate": 1.6187638320771233e-05, + "loss": 1.4665, + "step": 6043 + }, + { + "epoch": 0.3304402498530678, + "grad_norm": 1.2811568975448608, + "learning_rate": 1.6186202687498218e-05, + "loss": 1.4072, + "step": 6044 + }, + { + "epoch": 0.33049492229678934, + "grad_norm": 1.7564966678619385, + "learning_rate": 1.618476684765338e-05, + "loss": 1.5167, + "step": 6045 + }, + { + "epoch": 0.33054959474051093, + "grad_norm": 1.8666568994522095, + "learning_rate": 1.6183330801284664e-05, + "loss": 1.3242, + "step": 6046 + }, + { + "epoch": 0.33060426718423247, + "grad_norm": 1.4187078475952148, + "learning_rate": 1.6181894548440022e-05, + "loss": 1.4131, + "step": 6047 + }, + { + "epoch": 0.330658939627954, + "grad_norm": 1.5320378541946411, + "learning_rate": 1.6180458089167413e-05, + "loss": 1.3339, + "step": 6048 + }, + { + "epoch": 0.3307136120716756, + "grad_norm": 1.6660587787628174, + "learning_rate": 1.617902142351481e-05, + "loss": 1.3696, + "step": 6049 + }, + { + "epoch": 0.33076828451539714, + "grad_norm": 1.5530827045440674, + "learning_rate": 1.6177584551530178e-05, + "loss": 1.5449, + "step": 6050 + }, + { + "epoch": 0.3308229569591187, + "grad_norm": 1.4303934574127197, + "learning_rate": 1.6176147473261503e-05, + "loss": 1.4606, + "step": 6051 + }, + { + "epoch": 0.3308776294028402, + "grad_norm": 1.4212703704833984, + "learning_rate": 1.6174710188756773e-05, + "loss": 1.3994, + "step": 6052 + }, + { + "epoch": 0.3309323018465618, + "grad_norm": 1.547921061515808, + "learning_rate": 1.617327269806398e-05, + "loss": 1.3835, + "step": 6053 + }, + { + "epoch": 0.33098697429028334, + "grad_norm": 1.7158795595169067, + "learning_rate": 1.617183500123112e-05, + "loss": 1.3328, + "step": 6054 + }, + { + "epoch": 0.3310416467340049, + "grad_norm": 1.6104817390441895, + "learning_rate": 1.6170397098306212e-05, + "loss": 1.6284, + "step": 6055 + }, + { + "epoch": 0.3310963191777265, + "grad_norm": 1.2895087003707886, + "learning_rate": 1.6168958989337266e-05, + "loss": 1.5708, + "step": 6056 + }, + { + "epoch": 0.331150991621448, + "grad_norm": 1.377555012702942, + "learning_rate": 1.61675206743723e-05, + "loss": 1.6643, + "step": 6057 + }, + { + "epoch": 0.33120566406516955, + "grad_norm": 1.4740757942199707, + "learning_rate": 1.616608215345935e-05, + "loss": 1.4378, + "step": 6058 + }, + { + "epoch": 0.3312603365088911, + "grad_norm": 1.124656081199646, + "learning_rate": 1.6164643426646445e-05, + "loss": 1.5438, + "step": 6059 + }, + { + "epoch": 0.3313150089526127, + "grad_norm": 1.8163330554962158, + "learning_rate": 1.616320449398163e-05, + "loss": 1.2648, + "step": 6060 + }, + { + "epoch": 0.3313696813963342, + "grad_norm": 1.348542332649231, + "learning_rate": 1.6161765355512958e-05, + "loss": 1.3118, + "step": 6061 + }, + { + "epoch": 0.33142435384005575, + "grad_norm": 1.68814218044281, + "learning_rate": 1.616032601128848e-05, + "loss": 1.5562, + "step": 6062 + }, + { + "epoch": 0.33147902628377734, + "grad_norm": 1.459236979484558, + "learning_rate": 1.615888646135626e-05, + "loss": 1.3828, + "step": 6063 + }, + { + "epoch": 0.3315336987274989, + "grad_norm": 1.4342659711837769, + "learning_rate": 1.6157446705764367e-05, + "loss": 1.3291, + "step": 6064 + }, + { + "epoch": 0.3315883711712204, + "grad_norm": 1.7194750308990479, + "learning_rate": 1.6156006744560882e-05, + "loss": 1.372, + "step": 6065 + }, + { + "epoch": 0.33164304361494196, + "grad_norm": 1.2654874324798584, + "learning_rate": 1.6154566577793886e-05, + "loss": 1.403, + "step": 6066 + }, + { + "epoch": 0.33169771605866355, + "grad_norm": 1.6881568431854248, + "learning_rate": 1.6153126205511468e-05, + "loss": 1.6555, + "step": 6067 + }, + { + "epoch": 0.3317523885023851, + "grad_norm": 2.0046820640563965, + "learning_rate": 1.615168562776173e-05, + "loss": 1.6411, + "step": 6068 + }, + { + "epoch": 0.3318070609461066, + "grad_norm": 1.417531132698059, + "learning_rate": 1.6150244844592774e-05, + "loss": 1.488, + "step": 6069 + }, + { + "epoch": 0.3318617333898282, + "grad_norm": 1.5003265142440796, + "learning_rate": 1.6148803856052708e-05, + "loss": 1.6568, + "step": 6070 + }, + { + "epoch": 0.33191640583354975, + "grad_norm": 1.322737693786621, + "learning_rate": 1.6147362662189653e-05, + "loss": 1.4908, + "step": 6071 + }, + { + "epoch": 0.3319710782772713, + "grad_norm": 1.4415351152420044, + "learning_rate": 1.6145921263051735e-05, + "loss": 1.314, + "step": 6072 + }, + { + "epoch": 0.33202575072099283, + "grad_norm": 1.6742032766342163, + "learning_rate": 1.614447965868708e-05, + "loss": 1.4383, + "step": 6073 + }, + { + "epoch": 0.3320804231647144, + "grad_norm": 2.2894623279571533, + "learning_rate": 1.6143037849143834e-05, + "loss": 1.656, + "step": 6074 + }, + { + "epoch": 0.33213509560843596, + "grad_norm": 1.3718509674072266, + "learning_rate": 1.6141595834470142e-05, + "loss": 1.4393, + "step": 6075 + }, + { + "epoch": 0.3321897680521575, + "grad_norm": 1.8188611268997192, + "learning_rate": 1.6140153614714148e-05, + "loss": 1.5759, + "step": 6076 + }, + { + "epoch": 0.3322444404958791, + "grad_norm": 1.4353965520858765, + "learning_rate": 1.613871118992402e-05, + "loss": 1.3372, + "step": 6077 + }, + { + "epoch": 0.3322991129396006, + "grad_norm": 1.3908796310424805, + "learning_rate": 1.613726856014792e-05, + "loss": 1.3047, + "step": 6078 + }, + { + "epoch": 0.33235378538332216, + "grad_norm": 1.8032509088516235, + "learning_rate": 1.613582572543402e-05, + "loss": 1.8244, + "step": 6079 + }, + { + "epoch": 0.3324084578270437, + "grad_norm": 1.6441200971603394, + "learning_rate": 1.6134382685830502e-05, + "loss": 1.342, + "step": 6080 + }, + { + "epoch": 0.3324631302707653, + "grad_norm": 1.5089668035507202, + "learning_rate": 1.613293944138555e-05, + "loss": 1.5236, + "step": 6081 + }, + { + "epoch": 0.33251780271448683, + "grad_norm": 1.50893235206604, + "learning_rate": 1.6131495992147363e-05, + "loss": 1.6955, + "step": 6082 + }, + { + "epoch": 0.33257247515820837, + "grad_norm": 1.199289321899414, + "learning_rate": 1.6130052338164133e-05, + "loss": 1.5202, + "step": 6083 + }, + { + "epoch": 0.33262714760192996, + "grad_norm": 1.8036173582077026, + "learning_rate": 1.612860847948407e-05, + "loss": 1.6735, + "step": 6084 + }, + { + "epoch": 0.3326818200456515, + "grad_norm": 1.6790623664855957, + "learning_rate": 1.6127164416155387e-05, + "loss": 1.3268, + "step": 6085 + }, + { + "epoch": 0.33273649248937304, + "grad_norm": 1.485020637512207, + "learning_rate": 1.612572014822631e-05, + "loss": 1.4727, + "step": 6086 + }, + { + "epoch": 0.3327911649330946, + "grad_norm": 1.7255247831344604, + "learning_rate": 1.6124275675745063e-05, + "loss": 1.6125, + "step": 6087 + }, + { + "epoch": 0.33284583737681617, + "grad_norm": 1.2349457740783691, + "learning_rate": 1.612283099875988e-05, + "loss": 1.707, + "step": 6088 + }, + { + "epoch": 0.3329005098205377, + "grad_norm": 1.3872718811035156, + "learning_rate": 1.6121386117319e-05, + "loss": 1.42, + "step": 6089 + }, + { + "epoch": 0.33295518226425924, + "grad_norm": 1.356957197189331, + "learning_rate": 1.6119941031470676e-05, + "loss": 1.4985, + "step": 6090 + }, + { + "epoch": 0.33300985470798083, + "grad_norm": 1.6863011121749878, + "learning_rate": 1.611849574126316e-05, + "loss": 1.2769, + "step": 6091 + }, + { + "epoch": 0.33306452715170237, + "grad_norm": 1.5107663869857788, + "learning_rate": 1.6117050246744708e-05, + "loss": 1.3721, + "step": 6092 + }, + { + "epoch": 0.3331191995954239, + "grad_norm": 1.554687261581421, + "learning_rate": 1.6115604547963597e-05, + "loss": 1.4431, + "step": 6093 + }, + { + "epoch": 0.33317387203914545, + "grad_norm": 1.4834365844726562, + "learning_rate": 1.6114158644968102e-05, + "loss": 1.3951, + "step": 6094 + }, + { + "epoch": 0.33322854448286704, + "grad_norm": 1.616764783859253, + "learning_rate": 1.6112712537806502e-05, + "loss": 1.4321, + "step": 6095 + }, + { + "epoch": 0.3332832169265886, + "grad_norm": 1.3361172676086426, + "learning_rate": 1.6111266226527086e-05, + "loss": 1.4978, + "step": 6096 + }, + { + "epoch": 0.3333378893703101, + "grad_norm": 1.6580023765563965, + "learning_rate": 1.610981971117815e-05, + "loss": 1.1136, + "step": 6097 + }, + { + "epoch": 0.3333925618140317, + "grad_norm": 1.3839261531829834, + "learning_rate": 1.6108372991807998e-05, + "loss": 1.8132, + "step": 6098 + }, + { + "epoch": 0.33344723425775324, + "grad_norm": 1.0930759906768799, + "learning_rate": 1.6106926068464936e-05, + "loss": 1.2932, + "step": 6099 + }, + { + "epoch": 0.3335019067014748, + "grad_norm": 1.8202159404754639, + "learning_rate": 1.610547894119728e-05, + "loss": 1.362, + "step": 6100 + }, + { + "epoch": 0.3335565791451963, + "grad_norm": 1.6974660158157349, + "learning_rate": 1.610403161005336e-05, + "loss": 1.5934, + "step": 6101 + }, + { + "epoch": 0.3336112515889179, + "grad_norm": 1.770453691482544, + "learning_rate": 1.61025840750815e-05, + "loss": 1.371, + "step": 6102 + }, + { + "epoch": 0.33366592403263945, + "grad_norm": 1.6996824741363525, + "learning_rate": 1.6101136336330037e-05, + "loss": 1.5768, + "step": 6103 + }, + { + "epoch": 0.333720596476361, + "grad_norm": 1.7422845363616943, + "learning_rate": 1.6099688393847313e-05, + "loss": 1.4222, + "step": 6104 + }, + { + "epoch": 0.3337752689200826, + "grad_norm": 1.4090927839279175, + "learning_rate": 1.6098240247681684e-05, + "loss": 1.4893, + "step": 6105 + }, + { + "epoch": 0.3338299413638041, + "grad_norm": 1.174648642539978, + "learning_rate": 1.60967918978815e-05, + "loss": 1.3824, + "step": 6106 + }, + { + "epoch": 0.33388461380752565, + "grad_norm": 1.2045490741729736, + "learning_rate": 1.609534334449513e-05, + "loss": 1.6404, + "step": 6107 + }, + { + "epoch": 0.3339392862512472, + "grad_norm": 1.2899876832962036, + "learning_rate": 1.6093894587570942e-05, + "loss": 1.5104, + "step": 6108 + }, + { + "epoch": 0.3339939586949688, + "grad_norm": 1.8210229873657227, + "learning_rate": 1.6092445627157314e-05, + "loss": 1.5889, + "step": 6109 + }, + { + "epoch": 0.3340486311386903, + "grad_norm": 1.8676174879074097, + "learning_rate": 1.609099646330263e-05, + "loss": 1.1419, + "step": 6110 + }, + { + "epoch": 0.33410330358241186, + "grad_norm": 1.7238354682922363, + "learning_rate": 1.608954709605528e-05, + "loss": 1.3101, + "step": 6111 + }, + { + "epoch": 0.33415797602613345, + "grad_norm": 1.4692362546920776, + "learning_rate": 1.6088097525463663e-05, + "loss": 1.6128, + "step": 6112 + }, + { + "epoch": 0.334212648469855, + "grad_norm": 1.3438994884490967, + "learning_rate": 1.6086647751576184e-05, + "loss": 1.533, + "step": 6113 + }, + { + "epoch": 0.3342673209135765, + "grad_norm": 1.4851981401443481, + "learning_rate": 1.6085197774441253e-05, + "loss": 1.7005, + "step": 6114 + }, + { + "epoch": 0.33432199335729806, + "grad_norm": 1.4722079038619995, + "learning_rate": 1.608374759410729e-05, + "loss": 1.4102, + "step": 6115 + }, + { + "epoch": 0.33437666580101966, + "grad_norm": 1.3895654678344727, + "learning_rate": 1.608229721062272e-05, + "loss": 1.4092, + "step": 6116 + }, + { + "epoch": 0.3344313382447412, + "grad_norm": 1.4008469581604004, + "learning_rate": 1.6080846624035972e-05, + "loss": 1.3078, + "step": 6117 + }, + { + "epoch": 0.33448601068846273, + "grad_norm": 1.390910029411316, + "learning_rate": 1.6079395834395487e-05, + "loss": 1.4563, + "step": 6118 + }, + { + "epoch": 0.3345406831321843, + "grad_norm": 1.7263662815093994, + "learning_rate": 1.6077944841749706e-05, + "loss": 1.5082, + "step": 6119 + }, + { + "epoch": 0.33459535557590586, + "grad_norm": 1.4564422369003296, + "learning_rate": 1.6076493646147088e-05, + "loss": 1.4811, + "step": 6120 + }, + { + "epoch": 0.3346500280196274, + "grad_norm": 1.722330927848816, + "learning_rate": 1.607504224763609e-05, + "loss": 1.4603, + "step": 6121 + }, + { + "epoch": 0.33470470046334894, + "grad_norm": 1.78285813331604, + "learning_rate": 1.607359064626517e-05, + "loss": 1.4178, + "step": 6122 + }, + { + "epoch": 0.33475937290707053, + "grad_norm": 1.7766016721725464, + "learning_rate": 1.607213884208281e-05, + "loss": 1.3357, + "step": 6123 + }, + { + "epoch": 0.33481404535079207, + "grad_norm": 1.635114312171936, + "learning_rate": 1.6070686835137484e-05, + "loss": 1.6022, + "step": 6124 + }, + { + "epoch": 0.3348687177945136, + "grad_norm": 1.3048322200775146, + "learning_rate": 1.606923462547768e-05, + "loss": 1.5462, + "step": 6125 + }, + { + "epoch": 0.3349233902382352, + "grad_norm": 1.6294063329696655, + "learning_rate": 1.606778221315189e-05, + "loss": 1.3458, + "step": 6126 + }, + { + "epoch": 0.33497806268195673, + "grad_norm": 1.6210086345672607, + "learning_rate": 1.6066329598208615e-05, + "loss": 1.2191, + "step": 6127 + }, + { + "epoch": 0.33503273512567827, + "grad_norm": 1.8282727003097534, + "learning_rate": 1.6064876780696356e-05, + "loss": 1.4911, + "step": 6128 + }, + { + "epoch": 0.3350874075693998, + "grad_norm": 1.5851526260375977, + "learning_rate": 1.6063423760663633e-05, + "loss": 1.3472, + "step": 6129 + }, + { + "epoch": 0.3351420800131214, + "grad_norm": 1.5069944858551025, + "learning_rate": 1.606197053815896e-05, + "loss": 1.7654, + "step": 6130 + }, + { + "epoch": 0.33519675245684294, + "grad_norm": 1.5231544971466064, + "learning_rate": 1.6060517113230866e-05, + "loss": 1.6493, + "step": 6131 + }, + { + "epoch": 0.3352514249005645, + "grad_norm": 1.66508150100708, + "learning_rate": 1.6059063485927886e-05, + "loss": 1.2756, + "step": 6132 + }, + { + "epoch": 0.33530609734428607, + "grad_norm": 1.4526026248931885, + "learning_rate": 1.6057609656298558e-05, + "loss": 1.3835, + "step": 6133 + }, + { + "epoch": 0.3353607697880076, + "grad_norm": 1.5881311893463135, + "learning_rate": 1.605615562439143e-05, + "loss": 1.2876, + "step": 6134 + }, + { + "epoch": 0.33541544223172914, + "grad_norm": 1.2895127534866333, + "learning_rate": 1.605470139025505e-05, + "loss": 1.4278, + "step": 6135 + }, + { + "epoch": 0.3354701146754507, + "grad_norm": 1.6654621362686157, + "learning_rate": 1.6053246953937985e-05, + "loss": 1.4234, + "step": 6136 + }, + { + "epoch": 0.3355247871191723, + "grad_norm": 1.622192621231079, + "learning_rate": 1.6051792315488798e-05, + "loss": 1.688, + "step": 6137 + }, + { + "epoch": 0.3355794595628938, + "grad_norm": 1.263502836227417, + "learning_rate": 1.605033747495607e-05, + "loss": 1.5392, + "step": 6138 + }, + { + "epoch": 0.33563413200661535, + "grad_norm": 1.2736858129501343, + "learning_rate": 1.604888243238837e-05, + "loss": 1.3074, + "step": 6139 + }, + { + "epoch": 0.33568880445033694, + "grad_norm": 1.383109211921692, + "learning_rate": 1.6047427187834295e-05, + "loss": 1.5536, + "step": 6140 + }, + { + "epoch": 0.3357434768940585, + "grad_norm": 1.5975772142410278, + "learning_rate": 1.6045971741342435e-05, + "loss": 1.6868, + "step": 6141 + }, + { + "epoch": 0.33579814933778, + "grad_norm": 1.3693689107894897, + "learning_rate": 1.604451609296139e-05, + "loss": 1.6029, + "step": 6142 + }, + { + "epoch": 0.33585282178150155, + "grad_norm": 1.442976474761963, + "learning_rate": 1.604306024273977e-05, + "loss": 1.3437, + "step": 6143 + }, + { + "epoch": 0.33590749422522315, + "grad_norm": 1.588809847831726, + "learning_rate": 1.604160419072619e-05, + "loss": 1.2201, + "step": 6144 + }, + { + "epoch": 0.3359621666689447, + "grad_norm": 1.4385457038879395, + "learning_rate": 1.6040147936969263e-05, + "loss": 1.4249, + "step": 6145 + }, + { + "epoch": 0.3360168391126662, + "grad_norm": 1.423038363456726, + "learning_rate": 1.603869148151763e-05, + "loss": 1.5294, + "step": 6146 + }, + { + "epoch": 0.3360715115563878, + "grad_norm": 1.5983036756515503, + "learning_rate": 1.6037234824419915e-05, + "loss": 1.6305, + "step": 6147 + }, + { + "epoch": 0.33612618400010935, + "grad_norm": 1.3263959884643555, + "learning_rate": 1.603577796572476e-05, + "loss": 1.4371, + "step": 6148 + }, + { + "epoch": 0.3361808564438309, + "grad_norm": 1.5766581296920776, + "learning_rate": 1.6034320905480817e-05, + "loss": 1.6187, + "step": 6149 + }, + { + "epoch": 0.3362355288875524, + "grad_norm": 1.5251942873001099, + "learning_rate": 1.603286364373674e-05, + "loss": 1.3294, + "step": 6150 + }, + { + "epoch": 0.336290201331274, + "grad_norm": 1.6643136739730835, + "learning_rate": 1.603140618054119e-05, + "loss": 1.4214, + "step": 6151 + }, + { + "epoch": 0.33634487377499556, + "grad_norm": 1.6157691478729248, + "learning_rate": 1.602994851594283e-05, + "loss": 1.5146, + "step": 6152 + }, + { + "epoch": 0.3363995462187171, + "grad_norm": 1.4956190586090088, + "learning_rate": 1.6028490649990346e-05, + "loss": 1.4734, + "step": 6153 + }, + { + "epoch": 0.3364542186624387, + "grad_norm": 1.471232295036316, + "learning_rate": 1.6027032582732408e-05, + "loss": 1.7419, + "step": 6154 + }, + { + "epoch": 0.3365088911061602, + "grad_norm": 1.866313099861145, + "learning_rate": 1.602557431421771e-05, + "loss": 1.4428, + "step": 6155 + }, + { + "epoch": 0.33656356354988176, + "grad_norm": 1.4770700931549072, + "learning_rate": 1.6024115844494948e-05, + "loss": 1.3303, + "step": 6156 + }, + { + "epoch": 0.3366182359936033, + "grad_norm": 1.3791486024856567, + "learning_rate": 1.602265717361282e-05, + "loss": 1.3563, + "step": 6157 + }, + { + "epoch": 0.3366729084373249, + "grad_norm": 1.580520510673523, + "learning_rate": 1.6021198301620036e-05, + "loss": 1.6825, + "step": 6158 + }, + { + "epoch": 0.33672758088104643, + "grad_norm": 1.6120604276657104, + "learning_rate": 1.6019739228565314e-05, + "loss": 1.4322, + "step": 6159 + }, + { + "epoch": 0.33678225332476797, + "grad_norm": 1.7388614416122437, + "learning_rate": 1.6018279954497374e-05, + "loss": 1.538, + "step": 6160 + }, + { + "epoch": 0.33683692576848956, + "grad_norm": 1.3517740964889526, + "learning_rate": 1.601682047946494e-05, + "loss": 1.6444, + "step": 6161 + }, + { + "epoch": 0.3368915982122111, + "grad_norm": 1.3522623777389526, + "learning_rate": 1.6015360803516755e-05, + "loss": 1.5213, + "step": 6162 + }, + { + "epoch": 0.33694627065593263, + "grad_norm": 1.7210345268249512, + "learning_rate": 1.6013900926701555e-05, + "loss": 1.4054, + "step": 6163 + }, + { + "epoch": 0.33700094309965417, + "grad_norm": 1.5924112796783447, + "learning_rate": 1.6012440849068092e-05, + "loss": 1.4113, + "step": 6164 + }, + { + "epoch": 0.33705561554337576, + "grad_norm": 1.2142826318740845, + "learning_rate": 1.601098057066512e-05, + "loss": 1.4597, + "step": 6165 + }, + { + "epoch": 0.3371102879870973, + "grad_norm": 1.4333178997039795, + "learning_rate": 1.6009520091541403e-05, + "loss": 1.2765, + "step": 6166 + }, + { + "epoch": 0.33716496043081884, + "grad_norm": 1.2131155729293823, + "learning_rate": 1.6008059411745705e-05, + "loss": 1.2802, + "step": 6167 + }, + { + "epoch": 0.33721963287454043, + "grad_norm": 2.418673515319824, + "learning_rate": 1.6006598531326808e-05, + "loss": 1.3737, + "step": 6168 + }, + { + "epoch": 0.33727430531826197, + "grad_norm": 1.5787849426269531, + "learning_rate": 1.6005137450333487e-05, + "loss": 1.6896, + "step": 6169 + }, + { + "epoch": 0.3373289777619835, + "grad_norm": 1.1331368684768677, + "learning_rate": 1.600367616881454e-05, + "loss": 1.4965, + "step": 6170 + }, + { + "epoch": 0.33738365020570504, + "grad_norm": 1.4770652055740356, + "learning_rate": 1.6002214686818755e-05, + "loss": 1.3563, + "step": 6171 + }, + { + "epoch": 0.33743832264942664, + "grad_norm": 1.724311351776123, + "learning_rate": 1.6000753004394938e-05, + "loss": 1.4404, + "step": 6172 + }, + { + "epoch": 0.3374929950931482, + "grad_norm": 1.736505150794983, + "learning_rate": 1.5999291121591894e-05, + "loss": 1.421, + "step": 6173 + }, + { + "epoch": 0.3375476675368697, + "grad_norm": 1.4959385395050049, + "learning_rate": 1.5997829038458447e-05, + "loss": 1.5295, + "step": 6174 + }, + { + "epoch": 0.3376023399805913, + "grad_norm": 1.2395100593566895, + "learning_rate": 1.5996366755043413e-05, + "loss": 1.5887, + "step": 6175 + }, + { + "epoch": 0.33765701242431284, + "grad_norm": 1.525604486465454, + "learning_rate": 1.599490427139562e-05, + "loss": 1.4104, + "step": 6176 + }, + { + "epoch": 0.3377116848680344, + "grad_norm": 1.5450655221939087, + "learning_rate": 1.5993441587563906e-05, + "loss": 1.3647, + "step": 6177 + }, + { + "epoch": 0.3377663573117559, + "grad_norm": 1.3270264863967896, + "learning_rate": 1.599197870359711e-05, + "loss": 1.5908, + "step": 6178 + }, + { + "epoch": 0.3378210297554775, + "grad_norm": 1.5549393892288208, + "learning_rate": 1.5990515619544092e-05, + "loss": 1.5326, + "step": 6179 + }, + { + "epoch": 0.33787570219919905, + "grad_norm": 1.1925735473632812, + "learning_rate": 1.5989052335453695e-05, + "loss": 1.5835, + "step": 6180 + }, + { + "epoch": 0.3379303746429206, + "grad_norm": 1.8088678121566772, + "learning_rate": 1.598758885137479e-05, + "loss": 1.4545, + "step": 6181 + }, + { + "epoch": 0.3379850470866422, + "grad_norm": 1.7851725816726685, + "learning_rate": 1.598612516735624e-05, + "loss": 1.435, + "step": 6182 + }, + { + "epoch": 0.3380397195303637, + "grad_norm": 1.889687418937683, + "learning_rate": 1.5984661283446924e-05, + "loss": 1.3665, + "step": 6183 + }, + { + "epoch": 0.33809439197408525, + "grad_norm": 1.340194821357727, + "learning_rate": 1.5983197199695727e-05, + "loss": 1.5637, + "step": 6184 + }, + { + "epoch": 0.3381490644178068, + "grad_norm": 1.5628361701965332, + "learning_rate": 1.5981732916151534e-05, + "loss": 1.5408, + "step": 6185 + }, + { + "epoch": 0.3382037368615284, + "grad_norm": 2.024118423461914, + "learning_rate": 1.598026843286324e-05, + "loss": 1.4903, + "step": 6186 + }, + { + "epoch": 0.3382584093052499, + "grad_norm": 1.1771423816680908, + "learning_rate": 1.5978803749879754e-05, + "loss": 1.4292, + "step": 6187 + }, + { + "epoch": 0.33831308174897146, + "grad_norm": 1.5062708854675293, + "learning_rate": 1.5977338867249978e-05, + "loss": 1.4429, + "step": 6188 + }, + { + "epoch": 0.33836775419269305, + "grad_norm": 1.363720417022705, + "learning_rate": 1.5975873785022834e-05, + "loss": 1.4687, + "step": 6189 + }, + { + "epoch": 0.3384224266364146, + "grad_norm": 1.758246898651123, + "learning_rate": 1.5974408503247237e-05, + "loss": 1.4591, + "step": 6190 + }, + { + "epoch": 0.3384770990801361, + "grad_norm": 1.5270445346832275, + "learning_rate": 1.5972943021972125e-05, + "loss": 1.5505, + "step": 6191 + }, + { + "epoch": 0.33853177152385766, + "grad_norm": 1.4940952062606812, + "learning_rate": 1.5971477341246425e-05, + "loss": 1.2353, + "step": 6192 + }, + { + "epoch": 0.33858644396757925, + "grad_norm": 1.784889817237854, + "learning_rate": 1.597001146111909e-05, + "loss": 1.4855, + "step": 6193 + }, + { + "epoch": 0.3386411164113008, + "grad_norm": 1.9083833694458008, + "learning_rate": 1.596854538163906e-05, + "loss": 1.6605, + "step": 6194 + }, + { + "epoch": 0.33869578885502233, + "grad_norm": 1.3843779563903809, + "learning_rate": 1.5967079102855293e-05, + "loss": 1.4194, + "step": 6195 + }, + { + "epoch": 0.3387504612987439, + "grad_norm": 1.7113516330718994, + "learning_rate": 1.5965612624816755e-05, + "loss": 1.3696, + "step": 6196 + }, + { + "epoch": 0.33880513374246546, + "grad_norm": 2.6469645500183105, + "learning_rate": 1.5964145947572412e-05, + "loss": 1.6311, + "step": 6197 + }, + { + "epoch": 0.338859806186187, + "grad_norm": 1.3452798128128052, + "learning_rate": 1.596267907117124e-05, + "loss": 1.2593, + "step": 6198 + }, + { + "epoch": 0.33891447862990853, + "grad_norm": 1.2903566360473633, + "learning_rate": 1.596121199566222e-05, + "loss": 1.2734, + "step": 6199 + }, + { + "epoch": 0.3389691510736301, + "grad_norm": 1.7159065008163452, + "learning_rate": 1.5959744721094343e-05, + "loss": 1.4197, + "step": 6200 + }, + { + "epoch": 0.33902382351735166, + "grad_norm": 1.4041188955307007, + "learning_rate": 1.595827724751661e-05, + "loss": 1.4578, + "step": 6201 + }, + { + "epoch": 0.3390784959610732, + "grad_norm": 1.6766088008880615, + "learning_rate": 1.5956809574978014e-05, + "loss": 1.4568, + "step": 6202 + }, + { + "epoch": 0.3391331684047948, + "grad_norm": 1.7398329973220825, + "learning_rate": 1.595534170352757e-05, + "loss": 1.5625, + "step": 6203 + }, + { + "epoch": 0.33918784084851633, + "grad_norm": 1.745830774307251, + "learning_rate": 1.595387363321429e-05, + "loss": 1.3495, + "step": 6204 + }, + { + "epoch": 0.33924251329223787, + "grad_norm": 1.4049092531204224, + "learning_rate": 1.59524053640872e-05, + "loss": 1.5332, + "step": 6205 + }, + { + "epoch": 0.3392971857359594, + "grad_norm": 1.6453989744186401, + "learning_rate": 1.5950936896195328e-05, + "loss": 1.4225, + "step": 6206 + }, + { + "epoch": 0.339351858179681, + "grad_norm": 1.2753534317016602, + "learning_rate": 1.5949468229587704e-05, + "loss": 1.5653, + "step": 6207 + }, + { + "epoch": 0.33940653062340254, + "grad_norm": 1.7838131189346313, + "learning_rate": 1.5947999364313378e-05, + "loss": 1.4558, + "step": 6208 + }, + { + "epoch": 0.3394612030671241, + "grad_norm": 1.4682351350784302, + "learning_rate": 1.5946530300421396e-05, + "loss": 1.3775, + "step": 6209 + }, + { + "epoch": 0.33951587551084567, + "grad_norm": 1.2897740602493286, + "learning_rate": 1.5945061037960812e-05, + "loss": 1.4206, + "step": 6210 + }, + { + "epoch": 0.3395705479545672, + "grad_norm": 1.4858043193817139, + "learning_rate": 1.594359157698069e-05, + "loss": 1.5057, + "step": 6211 + }, + { + "epoch": 0.33962522039828874, + "grad_norm": 2.256558656692505, + "learning_rate": 1.59421219175301e-05, + "loss": 1.4042, + "step": 6212 + }, + { + "epoch": 0.3396798928420103, + "grad_norm": 1.8670239448547363, + "learning_rate": 1.5940652059658116e-05, + "loss": 1.1559, + "step": 6213 + }, + { + "epoch": 0.33973456528573187, + "grad_norm": 1.5230729579925537, + "learning_rate": 1.5939182003413816e-05, + "loss": 1.4271, + "step": 6214 + }, + { + "epoch": 0.3397892377294534, + "grad_norm": 1.4109541177749634, + "learning_rate": 1.5937711748846292e-05, + "loss": 1.2973, + "step": 6215 + }, + { + "epoch": 0.33984391017317495, + "grad_norm": 1.6480205059051514, + "learning_rate": 1.5936241296004646e-05, + "loss": 1.4903, + "step": 6216 + }, + { + "epoch": 0.33989858261689654, + "grad_norm": 1.2494838237762451, + "learning_rate": 1.5934770644937967e-05, + "loss": 1.4985, + "step": 6217 + }, + { + "epoch": 0.3399532550606181, + "grad_norm": 1.4323110580444336, + "learning_rate": 1.593329979569537e-05, + "loss": 1.5229, + "step": 6218 + }, + { + "epoch": 0.3400079275043396, + "grad_norm": 1.7604310512542725, + "learning_rate": 1.5931828748325974e-05, + "loss": 1.7622, + "step": 6219 + }, + { + "epoch": 0.3400625999480612, + "grad_norm": 1.5222392082214355, + "learning_rate": 1.5930357502878892e-05, + "loss": 1.4968, + "step": 6220 + }, + { + "epoch": 0.34011727239178274, + "grad_norm": 1.4281376600265503, + "learning_rate": 1.592888605940326e-05, + "loss": 1.3147, + "step": 6221 + }, + { + "epoch": 0.3401719448355043, + "grad_norm": 1.4284104108810425, + "learning_rate": 1.5927414417948205e-05, + "loss": 1.4062, + "step": 6222 + }, + { + "epoch": 0.3402266172792258, + "grad_norm": 1.4307565689086914, + "learning_rate": 1.592594257856288e-05, + "loss": 1.6432, + "step": 6223 + }, + { + "epoch": 0.3402812897229474, + "grad_norm": 1.3542060852050781, + "learning_rate": 1.5924470541296423e-05, + "loss": 1.4331, + "step": 6224 + }, + { + "epoch": 0.34033596216666895, + "grad_norm": 1.464511513710022, + "learning_rate": 1.5922998306197993e-05, + "loss": 1.4019, + "step": 6225 + }, + { + "epoch": 0.3403906346103905, + "grad_norm": 1.379042387008667, + "learning_rate": 1.5921525873316754e-05, + "loss": 1.3501, + "step": 6226 + }, + { + "epoch": 0.3404453070541121, + "grad_norm": 1.2989474534988403, + "learning_rate": 1.5920053242701867e-05, + "loss": 1.5255, + "step": 6227 + }, + { + "epoch": 0.3404999794978336, + "grad_norm": 1.8433372974395752, + "learning_rate": 1.591858041440251e-05, + "loss": 1.2615, + "step": 6228 + }, + { + "epoch": 0.34055465194155515, + "grad_norm": 1.339243769645691, + "learning_rate": 1.5917107388467866e-05, + "loss": 1.615, + "step": 6229 + }, + { + "epoch": 0.3406093243852767, + "grad_norm": 1.3635538816452026, + "learning_rate": 1.591563416494712e-05, + "loss": 1.45, + "step": 6230 + }, + { + "epoch": 0.3406639968289983, + "grad_norm": 1.6687525510787964, + "learning_rate": 1.591416074388947e-05, + "loss": 1.5334, + "step": 6231 + }, + { + "epoch": 0.3407186692727198, + "grad_norm": 1.4167070388793945, + "learning_rate": 1.5912687125344114e-05, + "loss": 1.3615, + "step": 6232 + }, + { + "epoch": 0.34077334171644136, + "grad_norm": 1.6412702798843384, + "learning_rate": 1.591121330936026e-05, + "loss": 1.602, + "step": 6233 + }, + { + "epoch": 0.34082801416016295, + "grad_norm": 3.2192740440368652, + "learning_rate": 1.5909739295987123e-05, + "loss": 1.2352, + "step": 6234 + }, + { + "epoch": 0.3408826866038845, + "grad_norm": 1.8147119283676147, + "learning_rate": 1.5908265085273923e-05, + "loss": 1.319, + "step": 6235 + }, + { + "epoch": 0.340937359047606, + "grad_norm": 1.8669407367706299, + "learning_rate": 1.5906790677269887e-05, + "loss": 1.4296, + "step": 6236 + }, + { + "epoch": 0.34099203149132756, + "grad_norm": 1.6536544561386108, + "learning_rate": 1.590531607202425e-05, + "loss": 1.601, + "step": 6237 + }, + { + "epoch": 0.34104670393504916, + "grad_norm": 1.4176995754241943, + "learning_rate": 1.5903841269586254e-05, + "loss": 1.6312, + "step": 6238 + }, + { + "epoch": 0.3411013763787707, + "grad_norm": 1.4468451738357544, + "learning_rate": 1.590236627000514e-05, + "loss": 1.8293, + "step": 6239 + }, + { + "epoch": 0.34115604882249223, + "grad_norm": 1.5404348373413086, + "learning_rate": 1.590089107333017e-05, + "loss": 1.6333, + "step": 6240 + }, + { + "epoch": 0.3412107212662138, + "grad_norm": 1.9422341585159302, + "learning_rate": 1.5899415679610597e-05, + "loss": 1.7762, + "step": 6241 + }, + { + "epoch": 0.34126539370993536, + "grad_norm": 1.7519711256027222, + "learning_rate": 1.5897940088895693e-05, + "loss": 1.4375, + "step": 6242 + }, + { + "epoch": 0.3413200661536569, + "grad_norm": 1.2677496671676636, + "learning_rate": 1.589646430123473e-05, + "loss": 1.282, + "step": 6243 + }, + { + "epoch": 0.34137473859737844, + "grad_norm": 1.7074204683303833, + "learning_rate": 1.5894988316676986e-05, + "loss": 1.3783, + "step": 6244 + }, + { + "epoch": 0.34142941104110003, + "grad_norm": 1.5300700664520264, + "learning_rate": 1.589351213527175e-05, + "loss": 1.4358, + "step": 6245 + }, + { + "epoch": 0.34148408348482157, + "grad_norm": 1.9636037349700928, + "learning_rate": 1.5892035757068313e-05, + "loss": 1.1859, + "step": 6246 + }, + { + "epoch": 0.3415387559285431, + "grad_norm": 1.4621766805648804, + "learning_rate": 1.5890559182115978e-05, + "loss": 1.3675, + "step": 6247 + }, + { + "epoch": 0.3415934283722647, + "grad_norm": 1.653929352760315, + "learning_rate": 1.5889082410464046e-05, + "loss": 1.5535, + "step": 6248 + }, + { + "epoch": 0.34164810081598623, + "grad_norm": 1.4367889165878296, + "learning_rate": 1.5887605442161834e-05, + "loss": 1.4903, + "step": 6249 + }, + { + "epoch": 0.34170277325970777, + "grad_norm": 1.113204002380371, + "learning_rate": 1.5886128277258665e-05, + "loss": 1.3149, + "step": 6250 + }, + { + "epoch": 0.3417574457034293, + "grad_norm": 1.3252809047698975, + "learning_rate": 1.5884650915803858e-05, + "loss": 1.5354, + "step": 6251 + }, + { + "epoch": 0.3418121181471509, + "grad_norm": 1.541913390159607, + "learning_rate": 1.5883173357846745e-05, + "loss": 1.5748, + "step": 6252 + }, + { + "epoch": 0.34186679059087244, + "grad_norm": 1.6556096076965332, + "learning_rate": 1.5881695603436674e-05, + "loss": 1.3035, + "step": 6253 + }, + { + "epoch": 0.341921463034594, + "grad_norm": 1.4214082956314087, + "learning_rate": 1.588021765262298e-05, + "loss": 1.6268, + "step": 6254 + }, + { + "epoch": 0.34197613547831557, + "grad_norm": 1.645946741104126, + "learning_rate": 1.5878739505455023e-05, + "loss": 1.6368, + "step": 6255 + }, + { + "epoch": 0.3420308079220371, + "grad_norm": 1.6944432258605957, + "learning_rate": 1.5877261161982157e-05, + "loss": 1.4716, + "step": 6256 + }, + { + "epoch": 0.34208548036575864, + "grad_norm": 1.3972828388214111, + "learning_rate": 1.587578262225375e-05, + "loss": 1.4308, + "step": 6257 + }, + { + "epoch": 0.3421401528094802, + "grad_norm": 1.4322060346603394, + "learning_rate": 1.5874303886319175e-05, + "loss": 1.6406, + "step": 6258 + }, + { + "epoch": 0.3421948252532018, + "grad_norm": 2.3729159832000732, + "learning_rate": 1.5872824954227807e-05, + "loss": 1.3909, + "step": 6259 + }, + { + "epoch": 0.3422494976969233, + "grad_norm": 1.264851689338684, + "learning_rate": 1.5871345826029032e-05, + "loss": 1.7182, + "step": 6260 + }, + { + "epoch": 0.34230417014064485, + "grad_norm": 1.5137501955032349, + "learning_rate": 1.5869866501772247e-05, + "loss": 1.2977, + "step": 6261 + }, + { + "epoch": 0.34235884258436644, + "grad_norm": 1.6380404233932495, + "learning_rate": 1.586838698150684e-05, + "loss": 1.5462, + "step": 6262 + }, + { + "epoch": 0.342413515028088, + "grad_norm": 1.3759359121322632, + "learning_rate": 1.586690726528222e-05, + "loss": 1.4622, + "step": 6263 + }, + { + "epoch": 0.3424681874718095, + "grad_norm": 1.44450843334198, + "learning_rate": 1.5865427353147805e-05, + "loss": 1.5447, + "step": 6264 + }, + { + "epoch": 0.34252285991553105, + "grad_norm": 1.4501575231552124, + "learning_rate": 1.5863947245153006e-05, + "loss": 1.5072, + "step": 6265 + }, + { + "epoch": 0.34257753235925265, + "grad_norm": 2.0708844661712646, + "learning_rate": 1.5862466941347247e-05, + "loss": 1.4502, + "step": 6266 + }, + { + "epoch": 0.3426322048029742, + "grad_norm": 1.427024245262146, + "learning_rate": 1.586098644177996e-05, + "loss": 1.5198, + "step": 6267 + }, + { + "epoch": 0.3426868772466957, + "grad_norm": 1.3693112134933472, + "learning_rate": 1.5859505746500582e-05, + "loss": 1.4631, + "step": 6268 + }, + { + "epoch": 0.3427415496904173, + "grad_norm": 1.8142523765563965, + "learning_rate": 1.585802485555856e-05, + "loss": 1.5252, + "step": 6269 + }, + { + "epoch": 0.34279622213413885, + "grad_norm": 1.3332493305206299, + "learning_rate": 1.5856543769003338e-05, + "loss": 1.6041, + "step": 6270 + }, + { + "epoch": 0.3428508945778604, + "grad_norm": 1.797271966934204, + "learning_rate": 1.5855062486884377e-05, + "loss": 1.5541, + "step": 6271 + }, + { + "epoch": 0.3429055670215819, + "grad_norm": 1.4806230068206787, + "learning_rate": 1.585358100925114e-05, + "loss": 1.4155, + "step": 6272 + }, + { + "epoch": 0.3429602394653035, + "grad_norm": 1.4518630504608154, + "learning_rate": 1.58520993361531e-05, + "loss": 1.5859, + "step": 6273 + }, + { + "epoch": 0.34301491190902506, + "grad_norm": 1.3980908393859863, + "learning_rate": 1.5850617467639728e-05, + "loss": 1.5432, + "step": 6274 + }, + { + "epoch": 0.3430695843527466, + "grad_norm": 1.6517689228057861, + "learning_rate": 1.5849135403760514e-05, + "loss": 1.563, + "step": 6275 + }, + { + "epoch": 0.3431242567964682, + "grad_norm": 1.4359502792358398, + "learning_rate": 1.5847653144564938e-05, + "loss": 1.299, + "step": 6276 + }, + { + "epoch": 0.3431789292401897, + "grad_norm": 1.3573663234710693, + "learning_rate": 1.5846170690102505e-05, + "loss": 1.365, + "step": 6277 + }, + { + "epoch": 0.34323360168391126, + "grad_norm": 1.235065221786499, + "learning_rate": 1.5844688040422714e-05, + "loss": 1.6454, + "step": 6278 + }, + { + "epoch": 0.3432882741276328, + "grad_norm": 1.3867536783218384, + "learning_rate": 1.5843205195575074e-05, + "loss": 1.3986, + "step": 6279 + }, + { + "epoch": 0.3433429465713544, + "grad_norm": 1.3559931516647339, + "learning_rate": 1.58417221556091e-05, + "loss": 1.3975, + "step": 6280 + }, + { + "epoch": 0.34339761901507593, + "grad_norm": 1.8471630811691284, + "learning_rate": 1.5840238920574315e-05, + "loss": 1.5108, + "step": 6281 + }, + { + "epoch": 0.34345229145879747, + "grad_norm": 1.9690407514572144, + "learning_rate": 1.583875549052025e-05, + "loss": 1.3928, + "step": 6282 + }, + { + "epoch": 0.34350696390251906, + "grad_norm": 1.6685457229614258, + "learning_rate": 1.5837271865496435e-05, + "loss": 1.3364, + "step": 6283 + }, + { + "epoch": 0.3435616363462406, + "grad_norm": 4.2206807136535645, + "learning_rate": 1.5835788045552418e-05, + "loss": 1.2382, + "step": 6284 + }, + { + "epoch": 0.34361630878996213, + "grad_norm": 1.3029330968856812, + "learning_rate": 1.5834304030737744e-05, + "loss": 1.4709, + "step": 6285 + }, + { + "epoch": 0.34367098123368367, + "grad_norm": 1.86164128780365, + "learning_rate": 1.583281982110197e-05, + "loss": 1.4545, + "step": 6286 + }, + { + "epoch": 0.34372565367740526, + "grad_norm": 1.2037208080291748, + "learning_rate": 1.5831335416694648e-05, + "loss": 1.5583, + "step": 6287 + }, + { + "epoch": 0.3437803261211268, + "grad_norm": 1.3070067167282104, + "learning_rate": 1.5829850817565358e-05, + "loss": 1.4296, + "step": 6288 + }, + { + "epoch": 0.34383499856484834, + "grad_norm": 1.440934419631958, + "learning_rate": 1.5828366023763665e-05, + "loss": 1.1397, + "step": 6289 + }, + { + "epoch": 0.34388967100856993, + "grad_norm": 1.3518980741500854, + "learning_rate": 1.5826881035339157e-05, + "loss": 1.7217, + "step": 6290 + }, + { + "epoch": 0.34394434345229147, + "grad_norm": 1.5787773132324219, + "learning_rate": 1.582539585234142e-05, + "loss": 1.5292, + "step": 6291 + }, + { + "epoch": 0.343999015896013, + "grad_norm": 1.6557300090789795, + "learning_rate": 1.582391047482004e-05, + "loss": 1.5011, + "step": 6292 + }, + { + "epoch": 0.34405368833973454, + "grad_norm": 1.2301274538040161, + "learning_rate": 1.5822424902824627e-05, + "loss": 1.305, + "step": 6293 + }, + { + "epoch": 0.34410836078345614, + "grad_norm": 1.4641070365905762, + "learning_rate": 1.5820939136404783e-05, + "loss": 1.4222, + "step": 6294 + }, + { + "epoch": 0.3441630332271777, + "grad_norm": 1.5445820093154907, + "learning_rate": 1.581945317561012e-05, + "loss": 1.5763, + "step": 6295 + }, + { + "epoch": 0.3442177056708992, + "grad_norm": 1.4091956615447998, + "learning_rate": 1.5817967020490262e-05, + "loss": 1.7555, + "step": 6296 + }, + { + "epoch": 0.3442723781146208, + "grad_norm": 1.5932542085647583, + "learning_rate": 1.5816480671094835e-05, + "loss": 1.2805, + "step": 6297 + }, + { + "epoch": 0.34432705055834234, + "grad_norm": 1.4940999746322632, + "learning_rate": 1.5814994127473465e-05, + "loss": 1.4105, + "step": 6298 + }, + { + "epoch": 0.3443817230020639, + "grad_norm": 1.6080540418624878, + "learning_rate": 1.5813507389675796e-05, + "loss": 1.3794, + "step": 6299 + }, + { + "epoch": 0.3444363954457854, + "grad_norm": 1.5371414422988892, + "learning_rate": 1.581202045775148e-05, + "loss": 1.4678, + "step": 6300 + }, + { + "epoch": 0.344491067889507, + "grad_norm": 1.4598709344863892, + "learning_rate": 1.5810533331750155e-05, + "loss": 1.3619, + "step": 6301 + }, + { + "epoch": 0.34454574033322855, + "grad_norm": 1.3798710107803345, + "learning_rate": 1.580904601172149e-05, + "loss": 1.379, + "step": 6302 + }, + { + "epoch": 0.3446004127769501, + "grad_norm": 1.3798736333847046, + "learning_rate": 1.580755849771515e-05, + "loss": 1.1933, + "step": 6303 + }, + { + "epoch": 0.3446550852206717, + "grad_norm": 1.8303391933441162, + "learning_rate": 1.58060707897808e-05, + "loss": 1.4203, + "step": 6304 + }, + { + "epoch": 0.3447097576643932, + "grad_norm": 1.4393856525421143, + "learning_rate": 1.580458288796812e-05, + "loss": 1.1292, + "step": 6305 + }, + { + "epoch": 0.34476443010811475, + "grad_norm": 1.4639781713485718, + "learning_rate": 1.58030947923268e-05, + "loss": 1.3508, + "step": 6306 + }, + { + "epoch": 0.3448191025518363, + "grad_norm": 1.4173328876495361, + "learning_rate": 1.580160650290653e-05, + "loss": 1.2484, + "step": 6307 + }, + { + "epoch": 0.3448737749955579, + "grad_norm": 1.5586544275283813, + "learning_rate": 1.5800118019757e-05, + "loss": 1.5713, + "step": 6308 + }, + { + "epoch": 0.3449284474392794, + "grad_norm": 1.4066405296325684, + "learning_rate": 1.5798629342927923e-05, + "loss": 1.4106, + "step": 6309 + }, + { + "epoch": 0.34498311988300095, + "grad_norm": 1.4086098670959473, + "learning_rate": 1.5797140472469002e-05, + "loss": 1.3861, + "step": 6310 + }, + { + "epoch": 0.34503779232672255, + "grad_norm": 1.6723657846450806, + "learning_rate": 1.579565140842996e-05, + "loss": 1.3924, + "step": 6311 + }, + { + "epoch": 0.3450924647704441, + "grad_norm": 1.718144178390503, + "learning_rate": 1.5794162150860513e-05, + "loss": 1.5856, + "step": 6312 + }, + { + "epoch": 0.3451471372141656, + "grad_norm": 1.3794653415679932, + "learning_rate": 1.57926726998104e-05, + "loss": 1.3426, + "step": 6313 + }, + { + "epoch": 0.34520180965788716, + "grad_norm": 2.6809797286987305, + "learning_rate": 1.5791183055329353e-05, + "loss": 1.4772, + "step": 6314 + }, + { + "epoch": 0.34525648210160875, + "grad_norm": 1.7330806255340576, + "learning_rate": 1.578969321746711e-05, + "loss": 1.3999, + "step": 6315 + }, + { + "epoch": 0.3453111545453303, + "grad_norm": 1.5675665140151978, + "learning_rate": 1.578820318627343e-05, + "loss": 1.1635, + "step": 6316 + }, + { + "epoch": 0.3453658269890518, + "grad_norm": 1.1857749223709106, + "learning_rate": 1.578671296179806e-05, + "loss": 1.3564, + "step": 6317 + }, + { + "epoch": 0.3454204994327734, + "grad_norm": 1.4306484460830688, + "learning_rate": 1.5785222544090766e-05, + "loss": 1.3303, + "step": 6318 + }, + { + "epoch": 0.34547517187649496, + "grad_norm": 1.5267422199249268, + "learning_rate": 1.5783731933201315e-05, + "loss": 1.4346, + "step": 6319 + }, + { + "epoch": 0.3455298443202165, + "grad_norm": 1.4574024677276611, + "learning_rate": 1.5782241129179482e-05, + "loss": 1.5658, + "step": 6320 + }, + { + "epoch": 0.34558451676393803, + "grad_norm": 1.5912196636199951, + "learning_rate": 1.5780750132075052e-05, + "loss": 1.4518, + "step": 6321 + }, + { + "epoch": 0.3456391892076596, + "grad_norm": 1.5747829675674438, + "learning_rate": 1.5779258941937803e-05, + "loss": 1.3354, + "step": 6322 + }, + { + "epoch": 0.34569386165138116, + "grad_norm": 1.3256210088729858, + "learning_rate": 1.5777767558817545e-05, + "loss": 1.5691, + "step": 6323 + }, + { + "epoch": 0.3457485340951027, + "grad_norm": 1.5477088689804077, + "learning_rate": 1.577627598276407e-05, + "loss": 1.5042, + "step": 6324 + }, + { + "epoch": 0.3458032065388243, + "grad_norm": 1.4020938873291016, + "learning_rate": 1.577478421382718e-05, + "loss": 1.4506, + "step": 6325 + }, + { + "epoch": 0.34585787898254583, + "grad_norm": 1.2894929647445679, + "learning_rate": 1.5773292252056695e-05, + "loss": 1.6154, + "step": 6326 + }, + { + "epoch": 0.34591255142626737, + "grad_norm": 1.8284571170806885, + "learning_rate": 1.5771800097502437e-05, + "loss": 1.3317, + "step": 6327 + }, + { + "epoch": 0.3459672238699889, + "grad_norm": 1.4423414468765259, + "learning_rate": 1.5770307750214228e-05, + "loss": 1.4189, + "step": 6328 + }, + { + "epoch": 0.3460218963137105, + "grad_norm": 1.5256716012954712, + "learning_rate": 1.5768815210241907e-05, + "loss": 1.5245, + "step": 6329 + }, + { + "epoch": 0.34607656875743203, + "grad_norm": 1.6072337627410889, + "learning_rate": 1.5767322477635303e-05, + "loss": 1.6038, + "step": 6330 + }, + { + "epoch": 0.34613124120115357, + "grad_norm": 1.5815340280532837, + "learning_rate": 1.5765829552444273e-05, + "loss": 1.6268, + "step": 6331 + }, + { + "epoch": 0.34618591364487517, + "grad_norm": 1.5234910249710083, + "learning_rate": 1.576433643471866e-05, + "loss": 1.4711, + "step": 6332 + }, + { + "epoch": 0.3462405860885967, + "grad_norm": 1.3402336835861206, + "learning_rate": 1.5762843124508333e-05, + "loss": 1.2989, + "step": 6333 + }, + { + "epoch": 0.34629525853231824, + "grad_norm": 1.9772685766220093, + "learning_rate": 1.5761349621863145e-05, + "loss": 1.2882, + "step": 6334 + }, + { + "epoch": 0.3463499309760398, + "grad_norm": 1.4210439920425415, + "learning_rate": 1.5759855926832973e-05, + "loss": 1.4394, + "step": 6335 + }, + { + "epoch": 0.34640460341976137, + "grad_norm": 1.562715768814087, + "learning_rate": 1.57583620394677e-05, + "loss": 1.2291, + "step": 6336 + }, + { + "epoch": 0.3464592758634829, + "grad_norm": 1.5399595499038696, + "learning_rate": 1.5756867959817205e-05, + "loss": 1.2889, + "step": 6337 + }, + { + "epoch": 0.34651394830720444, + "grad_norm": 1.2916241884231567, + "learning_rate": 1.5755373687931382e-05, + "loss": 1.443, + "step": 6338 + }, + { + "epoch": 0.34656862075092604, + "grad_norm": 1.9907283782958984, + "learning_rate": 1.5753879223860123e-05, + "loss": 1.5576, + "step": 6339 + }, + { + "epoch": 0.3466232931946476, + "grad_norm": 1.372471809387207, + "learning_rate": 1.5752384567653334e-05, + "loss": 1.5578, + "step": 6340 + }, + { + "epoch": 0.3466779656383691, + "grad_norm": 1.5988450050354004, + "learning_rate": 1.5750889719360927e-05, + "loss": 1.3305, + "step": 6341 + }, + { + "epoch": 0.34673263808209065, + "grad_norm": 1.3761835098266602, + "learning_rate": 1.5749394679032818e-05, + "loss": 1.57, + "step": 6342 + }, + { + "epoch": 0.34678731052581224, + "grad_norm": 1.5030616521835327, + "learning_rate": 1.574789944671893e-05, + "loss": 1.3231, + "step": 6343 + }, + { + "epoch": 0.3468419829695338, + "grad_norm": 1.6076576709747314, + "learning_rate": 1.5746404022469192e-05, + "loss": 1.6849, + "step": 6344 + }, + { + "epoch": 0.3468966554132553, + "grad_norm": 1.6876956224441528, + "learning_rate": 1.5744908406333537e-05, + "loss": 1.4133, + "step": 6345 + }, + { + "epoch": 0.3469513278569769, + "grad_norm": 1.40708327293396, + "learning_rate": 1.574341259836191e-05, + "loss": 1.4864, + "step": 6346 + }, + { + "epoch": 0.34700600030069845, + "grad_norm": 1.7089701890945435, + "learning_rate": 1.574191659860426e-05, + "loss": 1.4889, + "step": 6347 + }, + { + "epoch": 0.34706067274442, + "grad_norm": 1.513465404510498, + "learning_rate": 1.574042040711054e-05, + "loss": 1.3913, + "step": 6348 + }, + { + "epoch": 0.3471153451881415, + "grad_norm": 1.5045944452285767, + "learning_rate": 1.5738924023930712e-05, + "loss": 1.3795, + "step": 6349 + }, + { + "epoch": 0.3471700176318631, + "grad_norm": 1.448637843132019, + "learning_rate": 1.5737427449114744e-05, + "loss": 1.4723, + "step": 6350 + }, + { + "epoch": 0.34722469007558465, + "grad_norm": 2.474825859069824, + "learning_rate": 1.5735930682712613e-05, + "loss": 1.4018, + "step": 6351 + }, + { + "epoch": 0.3472793625193062, + "grad_norm": 1.5715054273605347, + "learning_rate": 1.5734433724774295e-05, + "loss": 1.6413, + "step": 6352 + }, + { + "epoch": 0.3473340349630278, + "grad_norm": 1.7431070804595947, + "learning_rate": 1.5732936575349777e-05, + "loss": 1.1182, + "step": 6353 + }, + { + "epoch": 0.3473887074067493, + "grad_norm": 1.3321834802627563, + "learning_rate": 1.5731439234489054e-05, + "loss": 1.5907, + "step": 6354 + }, + { + "epoch": 0.34744337985047086, + "grad_norm": 1.3725776672363281, + "learning_rate": 1.572994170224213e-05, + "loss": 1.6431, + "step": 6355 + }, + { + "epoch": 0.3474980522941924, + "grad_norm": 1.5631465911865234, + "learning_rate": 1.5728443978659002e-05, + "loss": 1.2534, + "step": 6356 + }, + { + "epoch": 0.347552724737914, + "grad_norm": 1.675947904586792, + "learning_rate": 1.572694606378969e-05, + "loss": 1.1127, + "step": 6357 + }, + { + "epoch": 0.3476073971816355, + "grad_norm": 1.6340333223342896, + "learning_rate": 1.572544795768421e-05, + "loss": 1.6109, + "step": 6358 + }, + { + "epoch": 0.34766206962535706, + "grad_norm": 1.3509697914123535, + "learning_rate": 1.572394966039259e-05, + "loss": 1.5125, + "step": 6359 + }, + { + "epoch": 0.34771674206907865, + "grad_norm": 1.8483095169067383, + "learning_rate": 1.5722451171964853e-05, + "loss": 1.4684, + "step": 6360 + }, + { + "epoch": 0.3477714145128002, + "grad_norm": 1.307050347328186, + "learning_rate": 1.5720952492451047e-05, + "loss": 1.4969, + "step": 6361 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 1.7179505825042725, + "learning_rate": 1.571945362190121e-05, + "loss": 1.3714, + "step": 6362 + }, + { + "epoch": 0.34788075940024327, + "grad_norm": 1.674291968345642, + "learning_rate": 1.5717954560365402e-05, + "loss": 1.4102, + "step": 6363 + }, + { + "epoch": 0.34793543184396486, + "grad_norm": 1.948537826538086, + "learning_rate": 1.5716455307893665e-05, + "loss": 1.2781, + "step": 6364 + }, + { + "epoch": 0.3479901042876864, + "grad_norm": 1.6794990301132202, + "learning_rate": 1.5714955864536078e-05, + "loss": 1.3371, + "step": 6365 + }, + { + "epoch": 0.34804477673140793, + "grad_norm": 1.5542150735855103, + "learning_rate": 1.57134562303427e-05, + "loss": 1.5741, + "step": 6366 + }, + { + "epoch": 0.3480994491751295, + "grad_norm": 1.5417671203613281, + "learning_rate": 1.5711956405363613e-05, + "loss": 1.6949, + "step": 6367 + }, + { + "epoch": 0.34815412161885106, + "grad_norm": 1.5001015663146973, + "learning_rate": 1.57104563896489e-05, + "loss": 1.5001, + "step": 6368 + }, + { + "epoch": 0.3482087940625726, + "grad_norm": 1.2990736961364746, + "learning_rate": 1.5708956183248644e-05, + "loss": 1.4567, + "step": 6369 + }, + { + "epoch": 0.34826346650629414, + "grad_norm": 1.516006350517273, + "learning_rate": 1.5707455786212948e-05, + "loss": 1.4334, + "step": 6370 + }, + { + "epoch": 0.34831813895001573, + "grad_norm": 1.480922818183899, + "learning_rate": 1.5705955198591908e-05, + "loss": 1.4276, + "step": 6371 + }, + { + "epoch": 0.34837281139373727, + "grad_norm": 1.3111133575439453, + "learning_rate": 1.5704454420435635e-05, + "loss": 1.6793, + "step": 6372 + }, + { + "epoch": 0.3484274838374588, + "grad_norm": 1.4380995035171509, + "learning_rate": 1.5702953451794245e-05, + "loss": 1.6469, + "step": 6373 + }, + { + "epoch": 0.3484821562811804, + "grad_norm": 1.2723873853683472, + "learning_rate": 1.5701452292717853e-05, + "loss": 1.6734, + "step": 6374 + }, + { + "epoch": 0.34853682872490194, + "grad_norm": 1.4045331478118896, + "learning_rate": 1.5699950943256593e-05, + "loss": 1.4295, + "step": 6375 + }, + { + "epoch": 0.3485915011686235, + "grad_norm": 1.7547833919525146, + "learning_rate": 1.5698449403460593e-05, + "loss": 1.3211, + "step": 6376 + }, + { + "epoch": 0.348646173612345, + "grad_norm": 1.5681546926498413, + "learning_rate": 1.569694767338e-05, + "loss": 1.3254, + "step": 6377 + }, + { + "epoch": 0.3487008460560666, + "grad_norm": 1.4100874662399292, + "learning_rate": 1.5695445753064954e-05, + "loss": 1.6694, + "step": 6378 + }, + { + "epoch": 0.34875551849978814, + "grad_norm": 2.219574451446533, + "learning_rate": 1.5693943642565604e-05, + "loss": 1.3895, + "step": 6379 + }, + { + "epoch": 0.3488101909435097, + "grad_norm": 1.8822828531265259, + "learning_rate": 1.569244134193212e-05, + "loss": 1.3651, + "step": 6380 + }, + { + "epoch": 0.34886486338723127, + "grad_norm": 1.4539309740066528, + "learning_rate": 1.5690938851214664e-05, + "loss": 1.2739, + "step": 6381 + }, + { + "epoch": 0.3489195358309528, + "grad_norm": 1.727907657623291, + "learning_rate": 1.5689436170463403e-05, + "loss": 1.5156, + "step": 6382 + }, + { + "epoch": 0.34897420827467435, + "grad_norm": 1.0312058925628662, + "learning_rate": 1.5687933299728517e-05, + "loss": 1.8809, + "step": 6383 + }, + { + "epoch": 0.3490288807183959, + "grad_norm": 1.326017141342163, + "learning_rate": 1.5686430239060194e-05, + "loss": 1.4293, + "step": 6384 + }, + { + "epoch": 0.3490835531621175, + "grad_norm": 1.1574273109436035, + "learning_rate": 1.568492698850862e-05, + "loss": 1.6217, + "step": 6385 + }, + { + "epoch": 0.349138225605839, + "grad_norm": 1.316048502922058, + "learning_rate": 1.568342354812399e-05, + "loss": 1.4043, + "step": 6386 + }, + { + "epoch": 0.34919289804956055, + "grad_norm": 1.6811234951019287, + "learning_rate": 1.5681919917956515e-05, + "loss": 1.2242, + "step": 6387 + }, + { + "epoch": 0.34924757049328214, + "grad_norm": 1.8015230894088745, + "learning_rate": 1.56804160980564e-05, + "loss": 1.4954, + "step": 6388 + }, + { + "epoch": 0.3493022429370037, + "grad_norm": 1.3946295976638794, + "learning_rate": 1.567891208847386e-05, + "loss": 1.4837, + "step": 6389 + }, + { + "epoch": 0.3493569153807252, + "grad_norm": 1.7370699644088745, + "learning_rate": 1.567740788925912e-05, + "loss": 1.2916, + "step": 6390 + }, + { + "epoch": 0.34941158782444676, + "grad_norm": 2.033799886703491, + "learning_rate": 1.5675903500462412e-05, + "loss": 1.3633, + "step": 6391 + }, + { + "epoch": 0.34946626026816835, + "grad_norm": 1.5787732601165771, + "learning_rate": 1.5674398922133963e-05, + "loss": 1.4697, + "step": 6392 + }, + { + "epoch": 0.3495209327118899, + "grad_norm": 1.0821483135223389, + "learning_rate": 1.567289415432402e-05, + "loss": 1.4552, + "step": 6393 + }, + { + "epoch": 0.3495756051556114, + "grad_norm": 1.687017560005188, + "learning_rate": 1.5671389197082832e-05, + "loss": 1.3742, + "step": 6394 + }, + { + "epoch": 0.349630277599333, + "grad_norm": 1.4583643674850464, + "learning_rate": 1.5669884050460646e-05, + "loss": 1.4184, + "step": 6395 + }, + { + "epoch": 0.34968495004305455, + "grad_norm": 1.3894634246826172, + "learning_rate": 1.566837871450773e-05, + "loss": 1.4582, + "step": 6396 + }, + { + "epoch": 0.3497396224867761, + "grad_norm": 1.5104762315750122, + "learning_rate": 1.5666873189274344e-05, + "loss": 1.7444, + "step": 6397 + }, + { + "epoch": 0.34979429493049763, + "grad_norm": 1.8834739923477173, + "learning_rate": 1.566536747481077e-05, + "loss": 1.3884, + "step": 6398 + }, + { + "epoch": 0.3498489673742192, + "grad_norm": 1.5008798837661743, + "learning_rate": 1.5663861571167277e-05, + "loss": 1.3424, + "step": 6399 + }, + { + "epoch": 0.34990363981794076, + "grad_norm": 1.5362763404846191, + "learning_rate": 1.5662355478394157e-05, + "loss": 1.5866, + "step": 6400 + }, + { + "epoch": 0.3499583122616623, + "grad_norm": 1.4930682182312012, + "learning_rate": 1.56608491965417e-05, + "loss": 1.4762, + "step": 6401 + }, + { + "epoch": 0.3500129847053839, + "grad_norm": 1.5547744035720825, + "learning_rate": 1.5659342725660206e-05, + "loss": 1.6312, + "step": 6402 + }, + { + "epoch": 0.3500676571491054, + "grad_norm": 2.0171730518341064, + "learning_rate": 1.5657836065799975e-05, + "loss": 1.5154, + "step": 6403 + }, + { + "epoch": 0.35012232959282696, + "grad_norm": 1.8811242580413818, + "learning_rate": 1.5656329217011322e-05, + "loss": 1.2679, + "step": 6404 + }, + { + "epoch": 0.3501770020365485, + "grad_norm": 1.7534162998199463, + "learning_rate": 1.5654822179344568e-05, + "loss": 1.5775, + "step": 6405 + }, + { + "epoch": 0.3502316744802701, + "grad_norm": 1.418160080909729, + "learning_rate": 1.565331495285003e-05, + "loss": 1.4003, + "step": 6406 + }, + { + "epoch": 0.35028634692399163, + "grad_norm": 1.2887626886367798, + "learning_rate": 1.565180753757804e-05, + "loss": 1.4741, + "step": 6407 + }, + { + "epoch": 0.35034101936771317, + "grad_norm": 1.3617838621139526, + "learning_rate": 1.565029993357893e-05, + "loss": 1.4008, + "step": 6408 + }, + { + "epoch": 0.35039569181143476, + "grad_norm": 1.566627025604248, + "learning_rate": 1.564879214090305e-05, + "loss": 1.2712, + "step": 6409 + }, + { + "epoch": 0.3504503642551563, + "grad_norm": 1.547827959060669, + "learning_rate": 1.5647284159600744e-05, + "loss": 1.3321, + "step": 6410 + }, + { + "epoch": 0.35050503669887784, + "grad_norm": 1.7838819026947021, + "learning_rate": 1.5645775989722366e-05, + "loss": 1.261, + "step": 6411 + }, + { + "epoch": 0.3505597091425994, + "grad_norm": 1.5897572040557861, + "learning_rate": 1.5644267631318286e-05, + "loss": 1.9343, + "step": 6412 + }, + { + "epoch": 0.35061438158632097, + "grad_norm": 1.2895992994308472, + "learning_rate": 1.5642759084438858e-05, + "loss": 1.3265, + "step": 6413 + }, + { + "epoch": 0.3506690540300425, + "grad_norm": 1.514318823814392, + "learning_rate": 1.5641250349134467e-05, + "loss": 1.4994, + "step": 6414 + }, + { + "epoch": 0.35072372647376404, + "grad_norm": 1.5130904912948608, + "learning_rate": 1.563974142545549e-05, + "loss": 1.4665, + "step": 6415 + }, + { + "epoch": 0.35077839891748563, + "grad_norm": 1.555832028388977, + "learning_rate": 1.563823231345231e-05, + "loss": 1.1847, + "step": 6416 + }, + { + "epoch": 0.35083307136120717, + "grad_norm": 1.6998953819274902, + "learning_rate": 1.5636723013175325e-05, + "loss": 1.5853, + "step": 6417 + }, + { + "epoch": 0.3508877438049287, + "grad_norm": 1.2943909168243408, + "learning_rate": 1.563521352467493e-05, + "loss": 1.4397, + "step": 6418 + }, + { + "epoch": 0.35094241624865025, + "grad_norm": 1.4926810264587402, + "learning_rate": 1.563370384800153e-05, + "loss": 1.216, + "step": 6419 + }, + { + "epoch": 0.35099708869237184, + "grad_norm": 1.5328494310379028, + "learning_rate": 1.5632193983205542e-05, + "loss": 1.5548, + "step": 6420 + }, + { + "epoch": 0.3510517611360934, + "grad_norm": 1.2326035499572754, + "learning_rate": 1.563068393033738e-05, + "loss": 1.4622, + "step": 6421 + }, + { + "epoch": 0.3511064335798149, + "grad_norm": 1.6647919416427612, + "learning_rate": 1.5629173689447467e-05, + "loss": 1.513, + "step": 6422 + }, + { + "epoch": 0.3511611060235365, + "grad_norm": 2.011956214904785, + "learning_rate": 1.5627663260586235e-05, + "loss": 1.4622, + "step": 6423 + }, + { + "epoch": 0.35121577846725804, + "grad_norm": 1.4045196771621704, + "learning_rate": 1.5626152643804124e-05, + "loss": 1.6774, + "step": 6424 + }, + { + "epoch": 0.3512704509109796, + "grad_norm": 1.3228402137756348, + "learning_rate": 1.562464183915157e-05, + "loss": 1.2438, + "step": 6425 + }, + { + "epoch": 0.3513251233547012, + "grad_norm": 1.6687467098236084, + "learning_rate": 1.562313084667903e-05, + "loss": 1.3406, + "step": 6426 + }, + { + "epoch": 0.3513797957984227, + "grad_norm": 1.6866904497146606, + "learning_rate": 1.5621619666436956e-05, + "loss": 1.4972, + "step": 6427 + }, + { + "epoch": 0.35143446824214425, + "grad_norm": 1.9675114154815674, + "learning_rate": 1.5620108298475808e-05, + "loss": 1.3544, + "step": 6428 + }, + { + "epoch": 0.3514891406858658, + "grad_norm": 1.4747182130813599, + "learning_rate": 1.5618596742846057e-05, + "loss": 1.7518, + "step": 6429 + }, + { + "epoch": 0.3515438131295874, + "grad_norm": 1.3024581670761108, + "learning_rate": 1.5617084999598177e-05, + "loss": 1.4587, + "step": 6430 + }, + { + "epoch": 0.3515984855733089, + "grad_norm": 1.5424233675003052, + "learning_rate": 1.5615573068782643e-05, + "loss": 1.3919, + "step": 6431 + }, + { + "epoch": 0.35165315801703045, + "grad_norm": 1.6273852586746216, + "learning_rate": 1.5614060950449948e-05, + "loss": 1.6824, + "step": 6432 + }, + { + "epoch": 0.35170783046075205, + "grad_norm": 1.2766833305358887, + "learning_rate": 1.561254864465059e-05, + "loss": 1.6085, + "step": 6433 + }, + { + "epoch": 0.3517625029044736, + "grad_norm": 1.6462242603302002, + "learning_rate": 1.561103615143506e-05, + "loss": 1.2786, + "step": 6434 + }, + { + "epoch": 0.3518171753481951, + "grad_norm": 1.360878348350525, + "learning_rate": 1.5609523470853862e-05, + "loss": 1.4762, + "step": 6435 + }, + { + "epoch": 0.35187184779191666, + "grad_norm": 1.612741231918335, + "learning_rate": 1.5608010602957518e-05, + "loss": 1.6244, + "step": 6436 + }, + { + "epoch": 0.35192652023563825, + "grad_norm": 1.6452397108078003, + "learning_rate": 1.5606497547796538e-05, + "loss": 1.3864, + "step": 6437 + }, + { + "epoch": 0.3519811926793598, + "grad_norm": 1.9412660598754883, + "learning_rate": 1.5604984305421446e-05, + "loss": 1.6422, + "step": 6438 + }, + { + "epoch": 0.3520358651230813, + "grad_norm": 1.5844613313674927, + "learning_rate": 1.560347087588278e-05, + "loss": 1.2422, + "step": 6439 + }, + { + "epoch": 0.3520905375668029, + "grad_norm": 1.3765099048614502, + "learning_rate": 1.5601957259231072e-05, + "loss": 1.6717, + "step": 6440 + }, + { + "epoch": 0.35214521001052446, + "grad_norm": 1.1701561212539673, + "learning_rate": 1.560044345551686e-05, + "loss": 1.4771, + "step": 6441 + }, + { + "epoch": 0.352199882454246, + "grad_norm": 1.7423473596572876, + "learning_rate": 1.5598929464790705e-05, + "loss": 1.2075, + "step": 6442 + }, + { + "epoch": 0.35225455489796753, + "grad_norm": 1.426856517791748, + "learning_rate": 1.5597415287103158e-05, + "loss": 1.3102, + "step": 6443 + }, + { + "epoch": 0.3523092273416891, + "grad_norm": 1.4985982179641724, + "learning_rate": 1.5595900922504776e-05, + "loss": 1.2992, + "step": 6444 + }, + { + "epoch": 0.35236389978541066, + "grad_norm": 1.686938762664795, + "learning_rate": 1.559438637104613e-05, + "loss": 1.3, + "step": 6445 + }, + { + "epoch": 0.3524185722291322, + "grad_norm": 1.4827287197113037, + "learning_rate": 1.5592871632777798e-05, + "loss": 1.3491, + "step": 6446 + }, + { + "epoch": 0.3524732446728538, + "grad_norm": 1.4629510641098022, + "learning_rate": 1.559135670775036e-05, + "loss": 1.399, + "step": 6447 + }, + { + "epoch": 0.35252791711657533, + "grad_norm": 1.493793249130249, + "learning_rate": 1.5589841596014398e-05, + "loss": 1.4575, + "step": 6448 + }, + { + "epoch": 0.35258258956029687, + "grad_norm": 1.3858439922332764, + "learning_rate": 1.558832629762051e-05, + "loss": 1.5521, + "step": 6449 + }, + { + "epoch": 0.3526372620040184, + "grad_norm": 1.438399314880371, + "learning_rate": 1.558681081261929e-05, + "loss": 1.5585, + "step": 6450 + }, + { + "epoch": 0.35269193444774, + "grad_norm": 1.5472437143325806, + "learning_rate": 1.558529514106135e-05, + "loss": 1.4906, + "step": 6451 + }, + { + "epoch": 0.35274660689146153, + "grad_norm": 1.5068870782852173, + "learning_rate": 1.5583779282997296e-05, + "loss": 1.3987, + "step": 6452 + }, + { + "epoch": 0.35280127933518307, + "grad_norm": 1.3859772682189941, + "learning_rate": 1.5582263238477753e-05, + "loss": 1.3279, + "step": 6453 + }, + { + "epoch": 0.35285595177890466, + "grad_norm": 1.6235814094543457, + "learning_rate": 1.5580747007553342e-05, + "loss": 1.4328, + "step": 6454 + }, + { + "epoch": 0.3529106242226262, + "grad_norm": 2.8694727420806885, + "learning_rate": 1.557923059027469e-05, + "loss": 1.4455, + "step": 6455 + }, + { + "epoch": 0.35296529666634774, + "grad_norm": 1.6400660276412964, + "learning_rate": 1.5577713986692435e-05, + "loss": 1.4354, + "step": 6456 + }, + { + "epoch": 0.3530199691100693, + "grad_norm": 1.834671139717102, + "learning_rate": 1.5576197196857227e-05, + "loss": 1.4135, + "step": 6457 + }, + { + "epoch": 0.35307464155379087, + "grad_norm": 1.6307264566421509, + "learning_rate": 1.55746802208197e-05, + "loss": 1.2845, + "step": 6458 + }, + { + "epoch": 0.3531293139975124, + "grad_norm": 1.1475138664245605, + "learning_rate": 1.557316305863053e-05, + "loss": 1.6096, + "step": 6459 + }, + { + "epoch": 0.35318398644123394, + "grad_norm": 1.590308427810669, + "learning_rate": 1.557164571034036e-05, + "loss": 1.662, + "step": 6460 + }, + { + "epoch": 0.35323865888495554, + "grad_norm": 1.4149030447006226, + "learning_rate": 1.557012817599987e-05, + "loss": 1.4134, + "step": 6461 + }, + { + "epoch": 0.3532933313286771, + "grad_norm": 1.8893229961395264, + "learning_rate": 1.5568610455659727e-05, + "loss": 1.444, + "step": 6462 + }, + { + "epoch": 0.3533480037723986, + "grad_norm": 1.6697348356246948, + "learning_rate": 1.5567092549370615e-05, + "loss": 1.7565, + "step": 6463 + }, + { + "epoch": 0.35340267621612015, + "grad_norm": 1.8943060636520386, + "learning_rate": 1.5565574457183215e-05, + "loss": 1.4962, + "step": 6464 + }, + { + "epoch": 0.35345734865984174, + "grad_norm": 1.7112634181976318, + "learning_rate": 1.556405617914823e-05, + "loss": 1.3017, + "step": 6465 + }, + { + "epoch": 0.3535120211035633, + "grad_norm": 1.5530568361282349, + "learning_rate": 1.556253771531635e-05, + "loss": 1.5788, + "step": 6466 + }, + { + "epoch": 0.3535666935472848, + "grad_norm": 1.803756594657898, + "learning_rate": 1.5561019065738282e-05, + "loss": 1.4688, + "step": 6467 + }, + { + "epoch": 0.3536213659910064, + "grad_norm": 1.2624038457870483, + "learning_rate": 1.5559500230464738e-05, + "loss": 1.5595, + "step": 6468 + }, + { + "epoch": 0.35367603843472795, + "grad_norm": 2.0066936016082764, + "learning_rate": 1.555798120954644e-05, + "loss": 1.4169, + "step": 6469 + }, + { + "epoch": 0.3537307108784495, + "grad_norm": 1.838923692703247, + "learning_rate": 1.5556462003034104e-05, + "loss": 1.3737, + "step": 6470 + }, + { + "epoch": 0.353785383322171, + "grad_norm": 1.51115083694458, + "learning_rate": 1.5554942610978462e-05, + "loss": 1.5473, + "step": 6471 + }, + { + "epoch": 0.3538400557658926, + "grad_norm": 1.4109418392181396, + "learning_rate": 1.555342303343025e-05, + "loss": 1.4491, + "step": 6472 + }, + { + "epoch": 0.35389472820961415, + "grad_norm": 1.4191386699676514, + "learning_rate": 1.5551903270440213e-05, + "loss": 1.3918, + "step": 6473 + }, + { + "epoch": 0.3539494006533357, + "grad_norm": 1.2522938251495361, + "learning_rate": 1.55503833220591e-05, + "loss": 1.5243, + "step": 6474 + }, + { + "epoch": 0.3540040730970573, + "grad_norm": 1.6841864585876465, + "learning_rate": 1.554886318833766e-05, + "loss": 1.6602, + "step": 6475 + }, + { + "epoch": 0.3540587455407788, + "grad_norm": 1.2756527662277222, + "learning_rate": 1.554734286932666e-05, + "loss": 1.5983, + "step": 6476 + }, + { + "epoch": 0.35411341798450036, + "grad_norm": 1.5567578077316284, + "learning_rate": 1.5545822365076865e-05, + "loss": 1.289, + "step": 6477 + }, + { + "epoch": 0.3541680904282219, + "grad_norm": 1.635095477104187, + "learning_rate": 1.5544301675639045e-05, + "loss": 1.3303, + "step": 6478 + }, + { + "epoch": 0.3542227628719435, + "grad_norm": 2.2330524921417236, + "learning_rate": 1.5542780801063983e-05, + "loss": 1.3152, + "step": 6479 + }, + { + "epoch": 0.354277435315665, + "grad_norm": 1.605098009109497, + "learning_rate": 1.554125974140246e-05, + "loss": 1.4361, + "step": 6480 + }, + { + "epoch": 0.35433210775938656, + "grad_norm": 1.621701955795288, + "learning_rate": 1.5539738496705277e-05, + "loss": 1.4855, + "step": 6481 + }, + { + "epoch": 0.35438678020310815, + "grad_norm": 1.3066123723983765, + "learning_rate": 1.5538217067023223e-05, + "loss": 1.413, + "step": 6482 + }, + { + "epoch": 0.3544414526468297, + "grad_norm": 1.6347755193710327, + "learning_rate": 1.5536695452407107e-05, + "loss": 1.4457, + "step": 6483 + }, + { + "epoch": 0.35449612509055123, + "grad_norm": 1.8155182600021362, + "learning_rate": 1.5535173652907737e-05, + "loss": 1.3774, + "step": 6484 + }, + { + "epoch": 0.35455079753427277, + "grad_norm": 1.3123053312301636, + "learning_rate": 1.553365166857593e-05, + "loss": 1.6136, + "step": 6485 + }, + { + "epoch": 0.35460546997799436, + "grad_norm": 1.2356773614883423, + "learning_rate": 1.5532129499462507e-05, + "loss": 1.2828, + "step": 6486 + }, + { + "epoch": 0.3546601424217159, + "grad_norm": 1.5807592868804932, + "learning_rate": 1.55306071456183e-05, + "loss": 1.436, + "step": 6487 + }, + { + "epoch": 0.35471481486543743, + "grad_norm": 1.350256085395813, + "learning_rate": 1.5529084607094144e-05, + "loss": 1.517, + "step": 6488 + }, + { + "epoch": 0.354769487309159, + "grad_norm": 2.016364097595215, + "learning_rate": 1.5527561883940877e-05, + "loss": 1.6162, + "step": 6489 + }, + { + "epoch": 0.35482415975288056, + "grad_norm": 1.9426347017288208, + "learning_rate": 1.5526038976209345e-05, + "loss": 1.3848, + "step": 6490 + }, + { + "epoch": 0.3548788321966021, + "grad_norm": 1.769240140914917, + "learning_rate": 1.5524515883950405e-05, + "loss": 1.5881, + "step": 6491 + }, + { + "epoch": 0.35493350464032364, + "grad_norm": 1.4492846727371216, + "learning_rate": 1.5522992607214923e-05, + "loss": 1.6381, + "step": 6492 + }, + { + "epoch": 0.35498817708404523, + "grad_norm": 1.6522547006607056, + "learning_rate": 1.552146914605375e-05, + "loss": 1.6447, + "step": 6493 + }, + { + "epoch": 0.35504284952776677, + "grad_norm": 1.7222505807876587, + "learning_rate": 1.551994550051777e-05, + "loss": 1.1853, + "step": 6494 + }, + { + "epoch": 0.3550975219714883, + "grad_norm": 1.81973135471344, + "learning_rate": 1.5518421670657856e-05, + "loss": 1.5317, + "step": 6495 + }, + { + "epoch": 0.3551521944152099, + "grad_norm": 1.6514501571655273, + "learning_rate": 1.5516897656524892e-05, + "loss": 1.3813, + "step": 6496 + }, + { + "epoch": 0.35520686685893144, + "grad_norm": 1.8117135763168335, + "learning_rate": 1.5515373458169767e-05, + "loss": 1.3198, + "step": 6497 + }, + { + "epoch": 0.355261539302653, + "grad_norm": 1.5542360544204712, + "learning_rate": 1.5513849075643384e-05, + "loss": 1.4793, + "step": 6498 + }, + { + "epoch": 0.3553162117463745, + "grad_norm": 1.7880955934524536, + "learning_rate": 1.5512324508996643e-05, + "loss": 1.3599, + "step": 6499 + }, + { + "epoch": 0.3553708841900961, + "grad_norm": 1.314307689666748, + "learning_rate": 1.5510799758280447e-05, + "loss": 1.4063, + "step": 6500 + }, + { + "epoch": 0.35542555663381764, + "grad_norm": 1.7677011489868164, + "learning_rate": 1.5509274823545716e-05, + "loss": 1.4553, + "step": 6501 + }, + { + "epoch": 0.3554802290775392, + "grad_norm": 1.874002456665039, + "learning_rate": 1.550774970484337e-05, + "loss": 1.3665, + "step": 6502 + }, + { + "epoch": 0.35553490152126077, + "grad_norm": 1.9374775886535645, + "learning_rate": 1.5506224402224342e-05, + "loss": 1.4434, + "step": 6503 + }, + { + "epoch": 0.3555895739649823, + "grad_norm": 1.8829232454299927, + "learning_rate": 1.550469891573956e-05, + "loss": 1.5653, + "step": 6504 + }, + { + "epoch": 0.35564424640870385, + "grad_norm": 1.4630309343338013, + "learning_rate": 1.550317324543996e-05, + "loss": 1.3461, + "step": 6505 + }, + { + "epoch": 0.3556989188524254, + "grad_norm": 1.888490915298462, + "learning_rate": 1.5501647391376492e-05, + "loss": 1.5774, + "step": 6506 + }, + { + "epoch": 0.355753591296147, + "grad_norm": 1.6757903099060059, + "learning_rate": 1.550012135360011e-05, + "loss": 1.5019, + "step": 6507 + }, + { + "epoch": 0.3558082637398685, + "grad_norm": 2.014817714691162, + "learning_rate": 1.549859513216177e-05, + "loss": 1.3037, + "step": 6508 + }, + { + "epoch": 0.35586293618359005, + "grad_norm": 1.529549241065979, + "learning_rate": 1.5497068727112435e-05, + "loss": 1.6378, + "step": 6509 + }, + { + "epoch": 0.35591760862731164, + "grad_norm": 1.582224726676941, + "learning_rate": 1.5495542138503073e-05, + "loss": 1.3747, + "step": 6510 + }, + { + "epoch": 0.3559722810710332, + "grad_norm": 1.4512293338775635, + "learning_rate": 1.5494015366384662e-05, + "loss": 1.4317, + "step": 6511 + }, + { + "epoch": 0.3560269535147547, + "grad_norm": 1.5373934507369995, + "learning_rate": 1.5492488410808193e-05, + "loss": 1.3782, + "step": 6512 + }, + { + "epoch": 0.35608162595847626, + "grad_norm": 1.525252103805542, + "learning_rate": 1.5490961271824644e-05, + "loss": 1.3916, + "step": 6513 + }, + { + "epoch": 0.35613629840219785, + "grad_norm": 1.6876020431518555, + "learning_rate": 1.5489433949485012e-05, + "loss": 1.1707, + "step": 6514 + }, + { + "epoch": 0.3561909708459194, + "grad_norm": 1.4368507862091064, + "learning_rate": 1.54879064438403e-05, + "loss": 1.3256, + "step": 6515 + }, + { + "epoch": 0.3562456432896409, + "grad_norm": 1.5459067821502686, + "learning_rate": 1.5486378754941514e-05, + "loss": 1.5016, + "step": 6516 + }, + { + "epoch": 0.3563003157333625, + "grad_norm": 1.5627835988998413, + "learning_rate": 1.5484850882839667e-05, + "loss": 1.6257, + "step": 6517 + }, + { + "epoch": 0.35635498817708405, + "grad_norm": 1.2582045793533325, + "learning_rate": 1.5483322827585777e-05, + "loss": 1.4108, + "step": 6518 + }, + { + "epoch": 0.3564096606208056, + "grad_norm": 1.6292145252227783, + "learning_rate": 1.5481794589230875e-05, + "loss": 1.496, + "step": 6519 + }, + { + "epoch": 0.35646433306452713, + "grad_norm": 1.2605857849121094, + "learning_rate": 1.5480266167825987e-05, + "loss": 1.4685, + "step": 6520 + }, + { + "epoch": 0.3565190055082487, + "grad_norm": 1.6594840288162231, + "learning_rate": 1.5478737563422148e-05, + "loss": 1.2237, + "step": 6521 + }, + { + "epoch": 0.35657367795197026, + "grad_norm": 1.3941930532455444, + "learning_rate": 1.547720877607041e-05, + "loss": 1.653, + "step": 6522 + }, + { + "epoch": 0.3566283503956918, + "grad_norm": 1.5766512155532837, + "learning_rate": 1.5475679805821814e-05, + "loss": 1.4729, + "step": 6523 + }, + { + "epoch": 0.3566830228394134, + "grad_norm": 1.4288241863250732, + "learning_rate": 1.5474150652727423e-05, + "loss": 1.5359, + "step": 6524 + }, + { + "epoch": 0.3567376952831349, + "grad_norm": 1.4179651737213135, + "learning_rate": 1.5472621316838297e-05, + "loss": 1.465, + "step": 6525 + }, + { + "epoch": 0.35679236772685646, + "grad_norm": 1.6946609020233154, + "learning_rate": 1.54710917982055e-05, + "loss": 1.4348, + "step": 6526 + }, + { + "epoch": 0.356847040170578, + "grad_norm": 1.2935128211975098, + "learning_rate": 1.5469562096880113e-05, + "loss": 1.3876, + "step": 6527 + }, + { + "epoch": 0.3569017126142996, + "grad_norm": 1.3266628980636597, + "learning_rate": 1.546803221291321e-05, + "loss": 1.5462, + "step": 6528 + }, + { + "epoch": 0.35695638505802113, + "grad_norm": 1.4745787382125854, + "learning_rate": 1.5466502146355883e-05, + "loss": 1.3438, + "step": 6529 + }, + { + "epoch": 0.35701105750174267, + "grad_norm": 1.3784505128860474, + "learning_rate": 1.546497189725922e-05, + "loss": 1.5388, + "step": 6530 + }, + { + "epoch": 0.35706572994546426, + "grad_norm": 1.339673399925232, + "learning_rate": 1.546344146567432e-05, + "loss": 1.448, + "step": 6531 + }, + { + "epoch": 0.3571204023891858, + "grad_norm": 1.450893521308899, + "learning_rate": 1.546191085165229e-05, + "loss": 1.6764, + "step": 6532 + }, + { + "epoch": 0.35717507483290734, + "grad_norm": 1.497205376625061, + "learning_rate": 1.546038005524424e-05, + "loss": 1.2218, + "step": 6533 + }, + { + "epoch": 0.3572297472766289, + "grad_norm": 1.8202828168869019, + "learning_rate": 1.545884907650129e-05, + "loss": 1.6053, + "step": 6534 + }, + { + "epoch": 0.35728441972035047, + "grad_norm": 1.9006314277648926, + "learning_rate": 1.5457317915474556e-05, + "loss": 1.45, + "step": 6535 + }, + { + "epoch": 0.357339092164072, + "grad_norm": 1.5280401706695557, + "learning_rate": 1.545578657221517e-05, + "loss": 1.3512, + "step": 6536 + }, + { + "epoch": 0.35739376460779354, + "grad_norm": 1.5181963443756104, + "learning_rate": 1.5454255046774273e-05, + "loss": 1.2884, + "step": 6537 + }, + { + "epoch": 0.35744843705151513, + "grad_norm": 1.6952942609786987, + "learning_rate": 1.5452723339203e-05, + "loss": 1.5464, + "step": 6538 + }, + { + "epoch": 0.35750310949523667, + "grad_norm": 1.3053048849105835, + "learning_rate": 1.54511914495525e-05, + "loss": 1.5745, + "step": 6539 + }, + { + "epoch": 0.3575577819389582, + "grad_norm": 1.3525447845458984, + "learning_rate": 1.544965937787392e-05, + "loss": 1.4942, + "step": 6540 + }, + { + "epoch": 0.35761245438267975, + "grad_norm": 2.0030200481414795, + "learning_rate": 1.544812712421843e-05, + "loss": 1.4072, + "step": 6541 + }, + { + "epoch": 0.35766712682640134, + "grad_norm": 1.9889655113220215, + "learning_rate": 1.544659468863719e-05, + "loss": 1.6043, + "step": 6542 + }, + { + "epoch": 0.3577217992701229, + "grad_norm": 2.012726306915283, + "learning_rate": 1.544506207118137e-05, + "loss": 1.1983, + "step": 6543 + }, + { + "epoch": 0.3577764717138444, + "grad_norm": 1.873502254486084, + "learning_rate": 1.5443529271902155e-05, + "loss": 1.1763, + "step": 6544 + }, + { + "epoch": 0.357831144157566, + "grad_norm": 1.6126580238342285, + "learning_rate": 1.544199629085072e-05, + "loss": 1.2611, + "step": 6545 + }, + { + "epoch": 0.35788581660128754, + "grad_norm": 1.6665384769439697, + "learning_rate": 1.5440463128078264e-05, + "loss": 1.6777, + "step": 6546 + }, + { + "epoch": 0.3579404890450091, + "grad_norm": 1.4109909534454346, + "learning_rate": 1.5438929783635968e-05, + "loss": 1.4505, + "step": 6547 + }, + { + "epoch": 0.3579951614887306, + "grad_norm": 1.376733422279358, + "learning_rate": 1.5437396257575053e-05, + "loss": 1.3425, + "step": 6548 + }, + { + "epoch": 0.3580498339324522, + "grad_norm": 1.4149144887924194, + "learning_rate": 1.5435862549946714e-05, + "loss": 1.4839, + "step": 6549 + }, + { + "epoch": 0.35810450637617375, + "grad_norm": 1.3006048202514648, + "learning_rate": 1.543432866080217e-05, + "loss": 1.3403, + "step": 6550 + }, + { + "epoch": 0.3581591788198953, + "grad_norm": 1.4506871700286865, + "learning_rate": 1.543279459019264e-05, + "loss": 1.4675, + "step": 6551 + }, + { + "epoch": 0.3582138512636169, + "grad_norm": 1.3263901472091675, + "learning_rate": 1.5431260338169345e-05, + "loss": 1.4056, + "step": 6552 + }, + { + "epoch": 0.3582685237073384, + "grad_norm": 1.7831547260284424, + "learning_rate": 1.542972590478353e-05, + "loss": 1.5703, + "step": 6553 + }, + { + "epoch": 0.35832319615105995, + "grad_norm": 1.36477530002594, + "learning_rate": 1.5428191290086424e-05, + "loss": 1.6204, + "step": 6554 + }, + { + "epoch": 0.3583778685947815, + "grad_norm": 1.8434439897537231, + "learning_rate": 1.542665649412927e-05, + "loss": 1.5372, + "step": 6555 + }, + { + "epoch": 0.3584325410385031, + "grad_norm": 1.4806073904037476, + "learning_rate": 1.5425121516963323e-05, + "loss": 1.5298, + "step": 6556 + }, + { + "epoch": 0.3584872134822246, + "grad_norm": 1.38801109790802, + "learning_rate": 1.542358635863984e-05, + "loss": 1.4657, + "step": 6557 + }, + { + "epoch": 0.35854188592594616, + "grad_norm": 1.8429877758026123, + "learning_rate": 1.5422051019210082e-05, + "loss": 1.3166, + "step": 6558 + }, + { + "epoch": 0.35859655836966775, + "grad_norm": 1.660188913345337, + "learning_rate": 1.5420515498725315e-05, + "loss": 1.1998, + "step": 6559 + }, + { + "epoch": 0.3586512308133893, + "grad_norm": 1.2610887289047241, + "learning_rate": 1.5418979797236814e-05, + "loss": 1.393, + "step": 6560 + }, + { + "epoch": 0.3587059032571108, + "grad_norm": 2.1298859119415283, + "learning_rate": 1.5417443914795864e-05, + "loss": 1.3656, + "step": 6561 + }, + { + "epoch": 0.35876057570083236, + "grad_norm": 1.5478496551513672, + "learning_rate": 1.541590785145375e-05, + "loss": 1.4134, + "step": 6562 + }, + { + "epoch": 0.35881524814455396, + "grad_norm": 1.4305075407028198, + "learning_rate": 1.5414371607261762e-05, + "loss": 1.4715, + "step": 6563 + }, + { + "epoch": 0.3588699205882755, + "grad_norm": 1.2762845754623413, + "learning_rate": 1.5412835182271202e-05, + "loss": 1.1909, + "step": 6564 + }, + { + "epoch": 0.35892459303199703, + "grad_norm": 1.8445485830307007, + "learning_rate": 1.5411298576533376e-05, + "loss": 1.469, + "step": 6565 + }, + { + "epoch": 0.3589792654757186, + "grad_norm": 2.0711138248443604, + "learning_rate": 1.5409761790099586e-05, + "loss": 1.6461, + "step": 6566 + }, + { + "epoch": 0.35903393791944016, + "grad_norm": 1.2840044498443604, + "learning_rate": 1.540822482302116e-05, + "loss": 1.626, + "step": 6567 + }, + { + "epoch": 0.3590886103631617, + "grad_norm": 1.071934461593628, + "learning_rate": 1.5406687675349415e-05, + "loss": 1.6736, + "step": 6568 + }, + { + "epoch": 0.35914328280688324, + "grad_norm": 1.610072135925293, + "learning_rate": 1.5405150347135684e-05, + "loss": 1.1803, + "step": 6569 + }, + { + "epoch": 0.35919795525060483, + "grad_norm": 1.5621765851974487, + "learning_rate": 1.54036128384313e-05, + "loss": 1.4836, + "step": 6570 + }, + { + "epoch": 0.35925262769432637, + "grad_norm": 1.3276973962783813, + "learning_rate": 1.54020751492876e-05, + "loss": 1.6689, + "step": 6571 + }, + { + "epoch": 0.3593073001380479, + "grad_norm": 1.7100666761398315, + "learning_rate": 1.5400537279755935e-05, + "loss": 1.4859, + "step": 6572 + }, + { + "epoch": 0.3593619725817695, + "grad_norm": 1.2305762767791748, + "learning_rate": 1.5398999229887656e-05, + "loss": 1.5059, + "step": 6573 + }, + { + "epoch": 0.35941664502549103, + "grad_norm": 1.528730034828186, + "learning_rate": 1.5397460999734126e-05, + "loss": 1.5349, + "step": 6574 + }, + { + "epoch": 0.35947131746921257, + "grad_norm": 1.5843175649642944, + "learning_rate": 1.5395922589346707e-05, + "loss": 1.4894, + "step": 6575 + }, + { + "epoch": 0.3595259899129341, + "grad_norm": 1.4153810739517212, + "learning_rate": 1.5394383998776768e-05, + "loss": 1.5034, + "step": 6576 + }, + { + "epoch": 0.3595806623566557, + "grad_norm": 1.2834892272949219, + "learning_rate": 1.5392845228075692e-05, + "loss": 1.4982, + "step": 6577 + }, + { + "epoch": 0.35963533480037724, + "grad_norm": 1.395222544670105, + "learning_rate": 1.5391306277294862e-05, + "loss": 1.6009, + "step": 6578 + }, + { + "epoch": 0.3596900072440988, + "grad_norm": 1.5829616785049438, + "learning_rate": 1.538976714648566e-05, + "loss": 1.3341, + "step": 6579 + }, + { + "epoch": 0.35974467968782037, + "grad_norm": 1.4797062873840332, + "learning_rate": 1.5388227835699487e-05, + "loss": 1.8393, + "step": 6580 + }, + { + "epoch": 0.3597993521315419, + "grad_norm": 1.5283132791519165, + "learning_rate": 1.538668834498774e-05, + "loss": 1.6183, + "step": 6581 + }, + { + "epoch": 0.35985402457526344, + "grad_norm": 1.9977003335952759, + "learning_rate": 1.5385148674401833e-05, + "loss": 1.3914, + "step": 6582 + }, + { + "epoch": 0.359908697018985, + "grad_norm": 1.3164221048355103, + "learning_rate": 1.5383608823993175e-05, + "loss": 1.5549, + "step": 6583 + }, + { + "epoch": 0.3599633694627066, + "grad_norm": 1.7649601697921753, + "learning_rate": 1.538206879381318e-05, + "loss": 1.3736, + "step": 6584 + }, + { + "epoch": 0.3600180419064281, + "grad_norm": 1.318599820137024, + "learning_rate": 1.5380528583913285e-05, + "loss": 1.6112, + "step": 6585 + }, + { + "epoch": 0.36007271435014965, + "grad_norm": 1.4302321672439575, + "learning_rate": 1.5378988194344913e-05, + "loss": 1.4465, + "step": 6586 + }, + { + "epoch": 0.36012738679387124, + "grad_norm": 1.4881880283355713, + "learning_rate": 1.5377447625159502e-05, + "loss": 1.2298, + "step": 6587 + }, + { + "epoch": 0.3601820592375928, + "grad_norm": 1.3714381456375122, + "learning_rate": 1.5375906876408496e-05, + "loss": 1.645, + "step": 6588 + }, + { + "epoch": 0.3602367316813143, + "grad_norm": 1.1083590984344482, + "learning_rate": 1.5374365948143345e-05, + "loss": 1.5808, + "step": 6589 + }, + { + "epoch": 0.36029140412503585, + "grad_norm": 1.5483027696609497, + "learning_rate": 1.5372824840415504e-05, + "loss": 1.5755, + "step": 6590 + }, + { + "epoch": 0.36034607656875745, + "grad_norm": 1.512582778930664, + "learning_rate": 1.5371283553276432e-05, + "loss": 1.5188, + "step": 6591 + }, + { + "epoch": 0.360400749012479, + "grad_norm": 1.5111654996871948, + "learning_rate": 1.5369742086777598e-05, + "loss": 1.4108, + "step": 6592 + }, + { + "epoch": 0.3604554214562005, + "grad_norm": 1.2729352712631226, + "learning_rate": 1.5368200440970478e-05, + "loss": 1.6573, + "step": 6593 + }, + { + "epoch": 0.3605100938999221, + "grad_norm": 1.7011263370513916, + "learning_rate": 1.5366658615906548e-05, + "loss": 1.3508, + "step": 6594 + }, + { + "epoch": 0.36056476634364365, + "grad_norm": 1.2633484601974487, + "learning_rate": 1.536511661163729e-05, + "loss": 1.5442, + "step": 6595 + }, + { + "epoch": 0.3606194387873652, + "grad_norm": 1.454836368560791, + "learning_rate": 1.53635744282142e-05, + "loss": 1.4046, + "step": 6596 + }, + { + "epoch": 0.3606741112310867, + "grad_norm": 1.278607726097107, + "learning_rate": 1.5362032065688778e-05, + "loss": 1.4148, + "step": 6597 + }, + { + "epoch": 0.3607287836748083, + "grad_norm": 1.781720757484436, + "learning_rate": 1.536048952411252e-05, + "loss": 1.6836, + "step": 6598 + }, + { + "epoch": 0.36078345611852985, + "grad_norm": 1.5278899669647217, + "learning_rate": 1.5358946803536937e-05, + "loss": 1.4517, + "step": 6599 + }, + { + "epoch": 0.3608381285622514, + "grad_norm": 1.6150552034378052, + "learning_rate": 1.5357403904013546e-05, + "loss": 1.5721, + "step": 6600 + }, + { + "epoch": 0.360892801005973, + "grad_norm": 1.7152081727981567, + "learning_rate": 1.5355860825593866e-05, + "loss": 1.3957, + "step": 6601 + }, + { + "epoch": 0.3609474734496945, + "grad_norm": 1.4321168661117554, + "learning_rate": 1.5354317568329427e-05, + "loss": 1.3936, + "step": 6602 + }, + { + "epoch": 0.36100214589341606, + "grad_norm": 1.664001703262329, + "learning_rate": 1.5352774132271756e-05, + "loss": 1.2324, + "step": 6603 + }, + { + "epoch": 0.3610568183371376, + "grad_norm": 1.5078874826431274, + "learning_rate": 1.5351230517472402e-05, + "loss": 1.2955, + "step": 6604 + }, + { + "epoch": 0.3611114907808592, + "grad_norm": 1.8456865549087524, + "learning_rate": 1.53496867239829e-05, + "loss": 1.4777, + "step": 6605 + }, + { + "epoch": 0.3611661632245807, + "grad_norm": 1.5075433254241943, + "learning_rate": 1.5348142751854807e-05, + "loss": 1.3752, + "step": 6606 + }, + { + "epoch": 0.36122083566830226, + "grad_norm": 1.591794729232788, + "learning_rate": 1.5346598601139677e-05, + "loss": 1.5574, + "step": 6607 + }, + { + "epoch": 0.36127550811202386, + "grad_norm": 1.4647221565246582, + "learning_rate": 1.5345054271889073e-05, + "loss": 1.5843, + "step": 6608 + }, + { + "epoch": 0.3613301805557454, + "grad_norm": 1.609055995941162, + "learning_rate": 1.5343509764154566e-05, + "loss": 1.3025, + "step": 6609 + }, + { + "epoch": 0.36138485299946693, + "grad_norm": 1.5429109334945679, + "learning_rate": 1.5341965077987727e-05, + "loss": 1.4991, + "step": 6610 + }, + { + "epoch": 0.36143952544318847, + "grad_norm": 1.4655518531799316, + "learning_rate": 1.534042021344014e-05, + "loss": 1.4119, + "step": 6611 + }, + { + "epoch": 0.36149419788691006, + "grad_norm": 1.4380868673324585, + "learning_rate": 1.533887517056339e-05, + "loss": 1.555, + "step": 6612 + }, + { + "epoch": 0.3615488703306316, + "grad_norm": 1.3484381437301636, + "learning_rate": 1.533732994940907e-05, + "loss": 1.3889, + "step": 6613 + }, + { + "epoch": 0.36160354277435314, + "grad_norm": 1.5085116624832153, + "learning_rate": 1.533578455002878e-05, + "loss": 1.4084, + "step": 6614 + }, + { + "epoch": 0.36165821521807473, + "grad_norm": 1.633010983467102, + "learning_rate": 1.533423897247412e-05, + "loss": 1.3043, + "step": 6615 + }, + { + "epoch": 0.36171288766179627, + "grad_norm": 1.522028923034668, + "learning_rate": 1.5332693216796704e-05, + "loss": 1.5618, + "step": 6616 + }, + { + "epoch": 0.3617675601055178, + "grad_norm": 1.51535165309906, + "learning_rate": 1.533114728304815e-05, + "loss": 1.3974, + "step": 6617 + }, + { + "epoch": 0.36182223254923934, + "grad_norm": 1.4085040092468262, + "learning_rate": 1.5329601171280076e-05, + "loss": 1.2312, + "step": 6618 + }, + { + "epoch": 0.36187690499296093, + "grad_norm": 1.9578744173049927, + "learning_rate": 1.5328054881544113e-05, + "loss": 1.2641, + "step": 6619 + }, + { + "epoch": 0.3619315774366825, + "grad_norm": 1.5732202529907227, + "learning_rate": 1.5326508413891894e-05, + "loss": 1.577, + "step": 6620 + }, + { + "epoch": 0.361986249880404, + "grad_norm": 1.5091941356658936, + "learning_rate": 1.5324961768375065e-05, + "loss": 1.4111, + "step": 6621 + }, + { + "epoch": 0.3620409223241256, + "grad_norm": 1.518134355545044, + "learning_rate": 1.5323414945045262e-05, + "loss": 1.2535, + "step": 6622 + }, + { + "epoch": 0.36209559476784714, + "grad_norm": 1.4543437957763672, + "learning_rate": 1.5321867943954143e-05, + "loss": 1.4623, + "step": 6623 + }, + { + "epoch": 0.3621502672115687, + "grad_norm": 1.2833253145217896, + "learning_rate": 1.5320320765153367e-05, + "loss": 1.3412, + "step": 6624 + }, + { + "epoch": 0.36220493965529027, + "grad_norm": 1.5138163566589355, + "learning_rate": 1.5318773408694596e-05, + "loss": 1.6527, + "step": 6625 + }, + { + "epoch": 0.3622596120990118, + "grad_norm": 1.3683655261993408, + "learning_rate": 1.53172258746295e-05, + "loss": 1.302, + "step": 6626 + }, + { + "epoch": 0.36231428454273334, + "grad_norm": 1.800534963607788, + "learning_rate": 1.5315678163009752e-05, + "loss": 1.672, + "step": 6627 + }, + { + "epoch": 0.3623689569864549, + "grad_norm": 1.7392812967300415, + "learning_rate": 1.531413027388704e-05, + "loss": 1.652, + "step": 6628 + }, + { + "epoch": 0.3624236294301765, + "grad_norm": 1.438163161277771, + "learning_rate": 1.5312582207313046e-05, + "loss": 1.3946, + "step": 6629 + }, + { + "epoch": 0.362478301873898, + "grad_norm": 1.4255205392837524, + "learning_rate": 1.5311033963339465e-05, + "loss": 1.5316, + "step": 6630 + }, + { + "epoch": 0.36253297431761955, + "grad_norm": 1.218307614326477, + "learning_rate": 1.5309485542018e-05, + "loss": 1.4231, + "step": 6631 + }, + { + "epoch": 0.36258764676134114, + "grad_norm": 1.868044137954712, + "learning_rate": 1.5307936943400355e-05, + "loss": 1.3931, + "step": 6632 + }, + { + "epoch": 0.3626423192050627, + "grad_norm": 1.494140625, + "learning_rate": 1.5306388167538235e-05, + "loss": 1.3666, + "step": 6633 + }, + { + "epoch": 0.3626969916487842, + "grad_norm": 1.3557019233703613, + "learning_rate": 1.530483921448336e-05, + "loss": 1.4757, + "step": 6634 + }, + { + "epoch": 0.36275166409250575, + "grad_norm": 1.4709259271621704, + "learning_rate": 1.5303290084287465e-05, + "loss": 1.369, + "step": 6635 + }, + { + "epoch": 0.36280633653622735, + "grad_norm": 1.9142053127288818, + "learning_rate": 1.5301740777002265e-05, + "loss": 1.333, + "step": 6636 + }, + { + "epoch": 0.3628610089799489, + "grad_norm": 1.3348461389541626, + "learning_rate": 1.5300191292679497e-05, + "loss": 1.6504, + "step": 6637 + }, + { + "epoch": 0.3629156814236704, + "grad_norm": 1.6439307928085327, + "learning_rate": 1.5298641631370907e-05, + "loss": 1.6791, + "step": 6638 + }, + { + "epoch": 0.362970353867392, + "grad_norm": 1.6972954273223877, + "learning_rate": 1.529709179312824e-05, + "loss": 1.3422, + "step": 6639 + }, + { + "epoch": 0.36302502631111355, + "grad_norm": 1.974664568901062, + "learning_rate": 1.5295541778003243e-05, + "loss": 1.3844, + "step": 6640 + }, + { + "epoch": 0.3630796987548351, + "grad_norm": 2.085223913192749, + "learning_rate": 1.5293991586047685e-05, + "loss": 1.4377, + "step": 6641 + }, + { + "epoch": 0.3631343711985566, + "grad_norm": 1.3888354301452637, + "learning_rate": 1.5292441217313324e-05, + "loss": 1.407, + "step": 6642 + }, + { + "epoch": 0.3631890436422782, + "grad_norm": 1.528250813484192, + "learning_rate": 1.529089067185193e-05, + "loss": 1.5857, + "step": 6643 + }, + { + "epoch": 0.36324371608599976, + "grad_norm": 1.564153790473938, + "learning_rate": 1.5289339949715285e-05, + "loss": 1.1612, + "step": 6644 + }, + { + "epoch": 0.3632983885297213, + "grad_norm": 1.4498052597045898, + "learning_rate": 1.5287789050955164e-05, + "loss": 1.4585, + "step": 6645 + }, + { + "epoch": 0.3633530609734429, + "grad_norm": 1.1894505023956299, + "learning_rate": 1.528623797562336e-05, + "loss": 1.3104, + "step": 6646 + }, + { + "epoch": 0.3634077334171644, + "grad_norm": 1.7386683225631714, + "learning_rate": 1.5284686723771664e-05, + "loss": 1.5046, + "step": 6647 + }, + { + "epoch": 0.36346240586088596, + "grad_norm": 1.9655576944351196, + "learning_rate": 1.5283135295451877e-05, + "loss": 1.5681, + "step": 6648 + }, + { + "epoch": 0.3635170783046075, + "grad_norm": 2.2513575553894043, + "learning_rate": 1.5281583690715805e-05, + "loss": 1.7319, + "step": 6649 + }, + { + "epoch": 0.3635717507483291, + "grad_norm": 1.8412984609603882, + "learning_rate": 1.5280031909615263e-05, + "loss": 1.5503, + "step": 6650 + }, + { + "epoch": 0.36362642319205063, + "grad_norm": 2.5145390033721924, + "learning_rate": 1.5278479952202064e-05, + "loss": 1.509, + "step": 6651 + }, + { + "epoch": 0.36368109563577217, + "grad_norm": 1.4750933647155762, + "learning_rate": 1.5276927818528032e-05, + "loss": 1.486, + "step": 6652 + }, + { + "epoch": 0.36373576807949376, + "grad_norm": 1.3923814296722412, + "learning_rate": 1.5275375508644997e-05, + "loss": 1.5482, + "step": 6653 + }, + { + "epoch": 0.3637904405232153, + "grad_norm": 1.753318190574646, + "learning_rate": 1.5273823022604798e-05, + "loss": 1.1658, + "step": 6654 + }, + { + "epoch": 0.36384511296693683, + "grad_norm": 1.4730379581451416, + "learning_rate": 1.5272270360459266e-05, + "loss": 1.3493, + "step": 6655 + }, + { + "epoch": 0.36389978541065837, + "grad_norm": 1.6240153312683105, + "learning_rate": 1.5270717522260264e-05, + "loss": 1.5235, + "step": 6656 + }, + { + "epoch": 0.36395445785437996, + "grad_norm": 1.7050448656082153, + "learning_rate": 1.526916450805963e-05, + "loss": 1.4047, + "step": 6657 + }, + { + "epoch": 0.3640091302981015, + "grad_norm": 1.4971014261245728, + "learning_rate": 1.526761131790923e-05, + "loss": 1.8521, + "step": 6658 + }, + { + "epoch": 0.36406380274182304, + "grad_norm": 1.184495449066162, + "learning_rate": 1.5266057951860927e-05, + "loss": 1.279, + "step": 6659 + }, + { + "epoch": 0.36411847518554463, + "grad_norm": 1.4376262426376343, + "learning_rate": 1.5264504409966593e-05, + "loss": 1.552, + "step": 6660 + }, + { + "epoch": 0.36417314762926617, + "grad_norm": 1.281082272529602, + "learning_rate": 1.52629506922781e-05, + "loss": 1.3842, + "step": 6661 + }, + { + "epoch": 0.3642278200729877, + "grad_norm": 1.4609135389328003, + "learning_rate": 1.5261396798847335e-05, + "loss": 1.3907, + "step": 6662 + }, + { + "epoch": 0.36428249251670924, + "grad_norm": 1.4348070621490479, + "learning_rate": 1.5259842729726186e-05, + "loss": 1.3101, + "step": 6663 + }, + { + "epoch": 0.36433716496043084, + "grad_norm": 1.7459743022918701, + "learning_rate": 1.5258288484966545e-05, + "loss": 1.4899, + "step": 6664 + }, + { + "epoch": 0.3643918374041524, + "grad_norm": 1.5969115495681763, + "learning_rate": 1.5256734064620313e-05, + "loss": 1.7294, + "step": 6665 + }, + { + "epoch": 0.3644465098478739, + "grad_norm": 1.2359514236450195, + "learning_rate": 1.5255179468739393e-05, + "loss": 1.3188, + "step": 6666 + }, + { + "epoch": 0.3645011822915955, + "grad_norm": 1.4378002882003784, + "learning_rate": 1.5253624697375702e-05, + "loss": 1.3096, + "step": 6667 + }, + { + "epoch": 0.36455585473531704, + "grad_norm": 1.4645377397537231, + "learning_rate": 1.525206975058115e-05, + "loss": 1.5891, + "step": 6668 + }, + { + "epoch": 0.3646105271790386, + "grad_norm": 1.6191192865371704, + "learning_rate": 1.5250514628407671e-05, + "loss": 1.4856, + "step": 6669 + }, + { + "epoch": 0.3646651996227601, + "grad_norm": 1.755118727684021, + "learning_rate": 1.5248959330907186e-05, + "loss": 1.25, + "step": 6670 + }, + { + "epoch": 0.3647198720664817, + "grad_norm": 1.5257105827331543, + "learning_rate": 1.5247403858131629e-05, + "loss": 1.4054, + "step": 6671 + }, + { + "epoch": 0.36477454451020325, + "grad_norm": 1.528442621231079, + "learning_rate": 1.5245848210132943e-05, + "loss": 1.3918, + "step": 6672 + }, + { + "epoch": 0.3648292169539248, + "grad_norm": 1.6950312852859497, + "learning_rate": 1.5244292386963077e-05, + "loss": 1.3919, + "step": 6673 + }, + { + "epoch": 0.3648838893976464, + "grad_norm": 1.2500418424606323, + "learning_rate": 1.5242736388673984e-05, + "loss": 1.512, + "step": 6674 + }, + { + "epoch": 0.3649385618413679, + "grad_norm": 1.719772458076477, + "learning_rate": 1.524118021531762e-05, + "loss": 1.3616, + "step": 6675 + }, + { + "epoch": 0.36499323428508945, + "grad_norm": 1.6299549341201782, + "learning_rate": 1.523962386694595e-05, + "loss": 1.5337, + "step": 6676 + }, + { + "epoch": 0.365047906728811, + "grad_norm": 1.6010688543319702, + "learning_rate": 1.5238067343610943e-05, + "loss": 1.1865, + "step": 6677 + }, + { + "epoch": 0.3651025791725326, + "grad_norm": 1.6933343410491943, + "learning_rate": 1.5236510645364575e-05, + "loss": 1.9499, + "step": 6678 + }, + { + "epoch": 0.3651572516162541, + "grad_norm": 1.4251136779785156, + "learning_rate": 1.5234953772258827e-05, + "loss": 1.5382, + "step": 6679 + }, + { + "epoch": 0.36521192405997566, + "grad_norm": 1.9183855056762695, + "learning_rate": 1.5233396724345691e-05, + "loss": 1.7031, + "step": 6680 + }, + { + "epoch": 0.36526659650369725, + "grad_norm": 1.4246917963027954, + "learning_rate": 1.523183950167716e-05, + "loss": 1.5076, + "step": 6681 + }, + { + "epoch": 0.3653212689474188, + "grad_norm": 1.6950840950012207, + "learning_rate": 1.5230282104305227e-05, + "loss": 1.2969, + "step": 6682 + }, + { + "epoch": 0.3653759413911403, + "grad_norm": 1.1450985670089722, + "learning_rate": 1.5228724532281904e-05, + "loss": 1.6629, + "step": 6683 + }, + { + "epoch": 0.36543061383486186, + "grad_norm": 1.9755661487579346, + "learning_rate": 1.52271667856592e-05, + "loss": 1.2126, + "step": 6684 + }, + { + "epoch": 0.36548528627858345, + "grad_norm": 1.3898591995239258, + "learning_rate": 1.5225608864489128e-05, + "loss": 1.529, + "step": 6685 + }, + { + "epoch": 0.365539958722305, + "grad_norm": 1.961755633354187, + "learning_rate": 1.5224050768823716e-05, + "loss": 1.4391, + "step": 6686 + }, + { + "epoch": 0.36559463116602653, + "grad_norm": 1.4053642749786377, + "learning_rate": 1.5222492498714986e-05, + "loss": 1.373, + "step": 6687 + }, + { + "epoch": 0.3656493036097481, + "grad_norm": 1.8063106536865234, + "learning_rate": 1.5220934054214982e-05, + "loss": 1.4363, + "step": 6688 + }, + { + "epoch": 0.36570397605346966, + "grad_norm": 1.707661747932434, + "learning_rate": 1.5219375435375736e-05, + "loss": 1.2212, + "step": 6689 + }, + { + "epoch": 0.3657586484971912, + "grad_norm": 1.3034083843231201, + "learning_rate": 1.5217816642249297e-05, + "loss": 1.3923, + "step": 6690 + }, + { + "epoch": 0.36581332094091273, + "grad_norm": 1.5722134113311768, + "learning_rate": 1.5216257674887718e-05, + "loss": 1.4322, + "step": 6691 + }, + { + "epoch": 0.3658679933846343, + "grad_norm": 1.5533045530319214, + "learning_rate": 1.5214698533343053e-05, + "loss": 1.328, + "step": 6692 + }, + { + "epoch": 0.36592266582835586, + "grad_norm": 1.285443663597107, + "learning_rate": 1.5213139217667366e-05, + "loss": 1.6288, + "step": 6693 + }, + { + "epoch": 0.3659773382720774, + "grad_norm": 1.4974644184112549, + "learning_rate": 1.5211579727912728e-05, + "loss": 1.3818, + "step": 6694 + }, + { + "epoch": 0.366032010715799, + "grad_norm": 1.7187035083770752, + "learning_rate": 1.5210020064131217e-05, + "loss": 1.6061, + "step": 6695 + }, + { + "epoch": 0.36608668315952053, + "grad_norm": 1.5071762800216675, + "learning_rate": 1.5208460226374907e-05, + "loss": 1.6055, + "step": 6696 + }, + { + "epoch": 0.36614135560324207, + "grad_norm": 1.31297767162323, + "learning_rate": 1.520690021469589e-05, + "loss": 1.6084, + "step": 6697 + }, + { + "epoch": 0.3661960280469636, + "grad_norm": 1.6458925008773804, + "learning_rate": 1.5205340029146256e-05, + "loss": 1.468, + "step": 6698 + }, + { + "epoch": 0.3662507004906852, + "grad_norm": 1.4521986246109009, + "learning_rate": 1.5203779669778102e-05, + "loss": 1.3276, + "step": 6699 + }, + { + "epoch": 0.36630537293440674, + "grad_norm": 1.7988678216934204, + "learning_rate": 1.5202219136643535e-05, + "loss": 1.4036, + "step": 6700 + }, + { + "epoch": 0.3663600453781283, + "grad_norm": 1.1040481328964233, + "learning_rate": 1.5200658429794662e-05, + "loss": 1.5964, + "step": 6701 + }, + { + "epoch": 0.36641471782184987, + "grad_norm": 1.378545880317688, + "learning_rate": 1.5199097549283604e-05, + "loss": 1.4894, + "step": 6702 + }, + { + "epoch": 0.3664693902655714, + "grad_norm": 1.3613338470458984, + "learning_rate": 1.5197536495162478e-05, + "loss": 1.5708, + "step": 6703 + }, + { + "epoch": 0.36652406270929294, + "grad_norm": 1.6190346479415894, + "learning_rate": 1.5195975267483408e-05, + "loss": 1.4677, + "step": 6704 + }, + { + "epoch": 0.3665787351530145, + "grad_norm": 1.3165147304534912, + "learning_rate": 1.5194413866298536e-05, + "loss": 1.42, + "step": 6705 + }, + { + "epoch": 0.36663340759673607, + "grad_norm": 1.7532665729522705, + "learning_rate": 1.5192852291659992e-05, + "loss": 1.1835, + "step": 6706 + }, + { + "epoch": 0.3666880800404576, + "grad_norm": 1.744246006011963, + "learning_rate": 1.5191290543619925e-05, + "loss": 1.3366, + "step": 6707 + }, + { + "epoch": 0.36674275248417915, + "grad_norm": 1.3160302639007568, + "learning_rate": 1.5189728622230489e-05, + "loss": 1.3875, + "step": 6708 + }, + { + "epoch": 0.36679742492790074, + "grad_norm": 1.7497923374176025, + "learning_rate": 1.5188166527543832e-05, + "loss": 1.2708, + "step": 6709 + }, + { + "epoch": 0.3668520973716223, + "grad_norm": 1.654109239578247, + "learning_rate": 1.5186604259612123e-05, + "loss": 1.4128, + "step": 6710 + }, + { + "epoch": 0.3669067698153438, + "grad_norm": 1.6126958131790161, + "learning_rate": 1.5185041818487525e-05, + "loss": 1.4333, + "step": 6711 + }, + { + "epoch": 0.36696144225906535, + "grad_norm": 1.7441900968551636, + "learning_rate": 1.5183479204222216e-05, + "loss": 1.2481, + "step": 6712 + }, + { + "epoch": 0.36701611470278694, + "grad_norm": 1.6769139766693115, + "learning_rate": 1.518191641686837e-05, + "loss": 1.5854, + "step": 6713 + }, + { + "epoch": 0.3670707871465085, + "grad_norm": 1.365701675415039, + "learning_rate": 1.5180353456478174e-05, + "loss": 1.5279, + "step": 6714 + }, + { + "epoch": 0.36712545959023, + "grad_norm": 1.9271283149719238, + "learning_rate": 1.5178790323103825e-05, + "loss": 1.3995, + "step": 6715 + }, + { + "epoch": 0.3671801320339516, + "grad_norm": 1.6613177061080933, + "learning_rate": 1.5177227016797514e-05, + "loss": 1.5248, + "step": 6716 + }, + { + "epoch": 0.36723480447767315, + "grad_norm": 1.3711888790130615, + "learning_rate": 1.517566353761144e-05, + "loss": 1.4869, + "step": 6717 + }, + { + "epoch": 0.3672894769213947, + "grad_norm": 2.0650177001953125, + "learning_rate": 1.5174099885597817e-05, + "loss": 1.4573, + "step": 6718 + }, + { + "epoch": 0.3673441493651162, + "grad_norm": 1.808968424797058, + "learning_rate": 1.5172536060808857e-05, + "loss": 1.4491, + "step": 6719 + }, + { + "epoch": 0.3673988218088378, + "grad_norm": 1.5515143871307373, + "learning_rate": 1.5170972063296783e-05, + "loss": 1.4299, + "step": 6720 + }, + { + "epoch": 0.36745349425255935, + "grad_norm": 1.541411280632019, + "learning_rate": 1.5169407893113816e-05, + "loss": 1.2693, + "step": 6721 + }, + { + "epoch": 0.3675081666962809, + "grad_norm": 1.610363245010376, + "learning_rate": 1.516784355031219e-05, + "loss": 1.5456, + "step": 6722 + }, + { + "epoch": 0.3675628391400025, + "grad_norm": 1.4760500192642212, + "learning_rate": 1.5166279034944141e-05, + "loss": 1.4227, + "step": 6723 + }, + { + "epoch": 0.367617511583724, + "grad_norm": 1.397477626800537, + "learning_rate": 1.5164714347061908e-05, + "loss": 1.3732, + "step": 6724 + }, + { + "epoch": 0.36767218402744556, + "grad_norm": 1.7033635377883911, + "learning_rate": 1.5163149486717747e-05, + "loss": 1.6744, + "step": 6725 + }, + { + "epoch": 0.3677268564711671, + "grad_norm": 1.1896843910217285, + "learning_rate": 1.5161584453963908e-05, + "loss": 1.5267, + "step": 6726 + }, + { + "epoch": 0.3677815289148887, + "grad_norm": 1.6424826383590698, + "learning_rate": 1.5160019248852655e-05, + "loss": 1.3837, + "step": 6727 + }, + { + "epoch": 0.3678362013586102, + "grad_norm": 1.7167797088623047, + "learning_rate": 1.515845387143625e-05, + "loss": 1.2326, + "step": 6728 + }, + { + "epoch": 0.36789087380233176, + "grad_norm": 1.406341552734375, + "learning_rate": 1.515688832176696e-05, + "loss": 1.5448, + "step": 6729 + }, + { + "epoch": 0.36794554624605336, + "grad_norm": 1.7792414426803589, + "learning_rate": 1.5155322599897076e-05, + "loss": 1.2767, + "step": 6730 + }, + { + "epoch": 0.3680002186897749, + "grad_norm": 1.6175276041030884, + "learning_rate": 1.5153756705878867e-05, + "loss": 1.3448, + "step": 6731 + }, + { + "epoch": 0.36805489113349643, + "grad_norm": 1.1828243732452393, + "learning_rate": 1.515219063976463e-05, + "loss": 1.4651, + "step": 6732 + }, + { + "epoch": 0.36810956357721797, + "grad_norm": 1.3636265993118286, + "learning_rate": 1.5150624401606658e-05, + "loss": 1.5199, + "step": 6733 + }, + { + "epoch": 0.36816423602093956, + "grad_norm": 1.7496825456619263, + "learning_rate": 1.514905799145725e-05, + "loss": 1.464, + "step": 6734 + }, + { + "epoch": 0.3682189084646611, + "grad_norm": 1.421126127243042, + "learning_rate": 1.5147491409368713e-05, + "loss": 1.3952, + "step": 6735 + }, + { + "epoch": 0.36827358090838264, + "grad_norm": 1.3794429302215576, + "learning_rate": 1.514592465539336e-05, + "loss": 1.8598, + "step": 6736 + }, + { + "epoch": 0.36832825335210423, + "grad_norm": 1.4515659809112549, + "learning_rate": 1.514435772958351e-05, + "loss": 1.1274, + "step": 6737 + }, + { + "epoch": 0.36838292579582577, + "grad_norm": 1.585089087486267, + "learning_rate": 1.514279063199148e-05, + "loss": 1.5106, + "step": 6738 + }, + { + "epoch": 0.3684375982395473, + "grad_norm": 2.4743077754974365, + "learning_rate": 1.5141223362669602e-05, + "loss": 1.2985, + "step": 6739 + }, + { + "epoch": 0.36849227068326884, + "grad_norm": 1.6363701820373535, + "learning_rate": 1.5139655921670213e-05, + "loss": 1.3255, + "step": 6740 + }, + { + "epoch": 0.36854694312699043, + "grad_norm": 1.698384404182434, + "learning_rate": 1.5138088309045653e-05, + "loss": 1.4556, + "step": 6741 + }, + { + "epoch": 0.36860161557071197, + "grad_norm": 1.5943013429641724, + "learning_rate": 1.5136520524848266e-05, + "loss": 1.3913, + "step": 6742 + }, + { + "epoch": 0.3686562880144335, + "grad_norm": 1.455349326133728, + "learning_rate": 1.5134952569130406e-05, + "loss": 1.6241, + "step": 6743 + }, + { + "epoch": 0.3687109604581551, + "grad_norm": 1.4120333194732666, + "learning_rate": 1.5133384441944432e-05, + "loss": 1.4448, + "step": 6744 + }, + { + "epoch": 0.36876563290187664, + "grad_norm": 1.62314772605896, + "learning_rate": 1.5131816143342701e-05, + "loss": 1.3993, + "step": 6745 + }, + { + "epoch": 0.3688203053455982, + "grad_norm": 1.3391592502593994, + "learning_rate": 1.513024767337759e-05, + "loss": 1.4145, + "step": 6746 + }, + { + "epoch": 0.3688749777893197, + "grad_norm": 1.7091470956802368, + "learning_rate": 1.5128679032101472e-05, + "loss": 1.4931, + "step": 6747 + }, + { + "epoch": 0.3689296502330413, + "grad_norm": 1.502690315246582, + "learning_rate": 1.5127110219566725e-05, + "loss": 1.5862, + "step": 6748 + }, + { + "epoch": 0.36898432267676284, + "grad_norm": 1.2912532091140747, + "learning_rate": 1.5125541235825738e-05, + "loss": 1.4412, + "step": 6749 + }, + { + "epoch": 0.3690389951204844, + "grad_norm": 1.5173100233078003, + "learning_rate": 1.51239720809309e-05, + "loss": 1.4038, + "step": 6750 + }, + { + "epoch": 0.369093667564206, + "grad_norm": 1.4635251760482788, + "learning_rate": 1.512240275493461e-05, + "loss": 1.4262, + "step": 6751 + }, + { + "epoch": 0.3691483400079275, + "grad_norm": 1.5560592412948608, + "learning_rate": 1.5120833257889272e-05, + "loss": 1.1486, + "step": 6752 + }, + { + "epoch": 0.36920301245164905, + "grad_norm": 1.484297513961792, + "learning_rate": 1.5119263589847295e-05, + "loss": 1.3649, + "step": 6753 + }, + { + "epoch": 0.3692576848953706, + "grad_norm": 1.5025330781936646, + "learning_rate": 1.5117693750861096e-05, + "loss": 1.4056, + "step": 6754 + }, + { + "epoch": 0.3693123573390922, + "grad_norm": 1.4213625192642212, + "learning_rate": 1.5116123740983093e-05, + "loss": 1.5602, + "step": 6755 + }, + { + "epoch": 0.3693670297828137, + "grad_norm": 1.8317584991455078, + "learning_rate": 1.5114553560265712e-05, + "loss": 1.199, + "step": 6756 + }, + { + "epoch": 0.36942170222653525, + "grad_norm": 1.0901378393173218, + "learning_rate": 1.5112983208761384e-05, + "loss": 1.3724, + "step": 6757 + }, + { + "epoch": 0.36947637467025685, + "grad_norm": 1.2839573621749878, + "learning_rate": 1.511141268652255e-05, + "loss": 1.5276, + "step": 6758 + }, + { + "epoch": 0.3695310471139784, + "grad_norm": 1.5624855756759644, + "learning_rate": 1.5109841993601654e-05, + "loss": 1.0057, + "step": 6759 + }, + { + "epoch": 0.3695857195576999, + "grad_norm": 1.6478747129440308, + "learning_rate": 1.5108271130051141e-05, + "loss": 1.5395, + "step": 6760 + }, + { + "epoch": 0.36964039200142146, + "grad_norm": 1.370626449584961, + "learning_rate": 1.5106700095923471e-05, + "loss": 1.4335, + "step": 6761 + }, + { + "epoch": 0.36969506444514305, + "grad_norm": 1.314193606376648, + "learning_rate": 1.5105128891271102e-05, + "loss": 1.5349, + "step": 6762 + }, + { + "epoch": 0.3697497368888646, + "grad_norm": 2.010145425796509, + "learning_rate": 1.5103557516146494e-05, + "loss": 1.2681, + "step": 6763 + }, + { + "epoch": 0.3698044093325861, + "grad_norm": 1.4387565851211548, + "learning_rate": 1.510198597060213e-05, + "loss": 1.63, + "step": 6764 + }, + { + "epoch": 0.3698590817763077, + "grad_norm": 1.671975016593933, + "learning_rate": 1.5100414254690478e-05, + "loss": 1.4107, + "step": 6765 + }, + { + "epoch": 0.36991375422002926, + "grad_norm": 1.6584066152572632, + "learning_rate": 1.5098842368464031e-05, + "loss": 1.4584, + "step": 6766 + }, + { + "epoch": 0.3699684266637508, + "grad_norm": 1.6056225299835205, + "learning_rate": 1.5097270311975267e-05, + "loss": 1.5575, + "step": 6767 + }, + { + "epoch": 0.37002309910747233, + "grad_norm": 1.7791392803192139, + "learning_rate": 1.5095698085276692e-05, + "loss": 1.6218, + "step": 6768 + }, + { + "epoch": 0.3700777715511939, + "grad_norm": 1.548086166381836, + "learning_rate": 1.5094125688420795e-05, + "loss": 1.4747, + "step": 6769 + }, + { + "epoch": 0.37013244399491546, + "grad_norm": 1.319367527961731, + "learning_rate": 1.509255312146009e-05, + "loss": 1.4854, + "step": 6770 + }, + { + "epoch": 0.370187116438637, + "grad_norm": 1.7171880006790161, + "learning_rate": 1.5090980384447083e-05, + "loss": 1.4577, + "step": 6771 + }, + { + "epoch": 0.3702417888823586, + "grad_norm": 1.502520203590393, + "learning_rate": 1.5089407477434299e-05, + "loss": 1.439, + "step": 6772 + }, + { + "epoch": 0.37029646132608013, + "grad_norm": 1.5288194417953491, + "learning_rate": 1.5087834400474255e-05, + "loss": 1.5736, + "step": 6773 + }, + { + "epoch": 0.37035113376980167, + "grad_norm": 1.6138814687728882, + "learning_rate": 1.508626115361948e-05, + "loss": 1.3332, + "step": 6774 + }, + { + "epoch": 0.3704058062135232, + "grad_norm": 1.61287260055542, + "learning_rate": 1.5084687736922514e-05, + "loss": 1.3389, + "step": 6775 + }, + { + "epoch": 0.3704604786572448, + "grad_norm": 1.7023792266845703, + "learning_rate": 1.5083114150435889e-05, + "loss": 1.5694, + "step": 6776 + }, + { + "epoch": 0.37051515110096633, + "grad_norm": 5.055995941162109, + "learning_rate": 1.5081540394212155e-05, + "loss": 1.2893, + "step": 6777 + }, + { + "epoch": 0.37056982354468787, + "grad_norm": 1.3912326097488403, + "learning_rate": 1.5079966468303866e-05, + "loss": 1.4151, + "step": 6778 + }, + { + "epoch": 0.37062449598840946, + "grad_norm": 2.1022331714630127, + "learning_rate": 1.5078392372763573e-05, + "loss": 1.5505, + "step": 6779 + }, + { + "epoch": 0.370679168432131, + "grad_norm": 1.3980052471160889, + "learning_rate": 1.5076818107643844e-05, + "loss": 1.5044, + "step": 6780 + }, + { + "epoch": 0.37073384087585254, + "grad_norm": 1.3661823272705078, + "learning_rate": 1.5075243672997242e-05, + "loss": 1.4718, + "step": 6781 + }, + { + "epoch": 0.3707885133195741, + "grad_norm": 1.6214779615402222, + "learning_rate": 1.5073669068876348e-05, + "loss": 1.43, + "step": 6782 + }, + { + "epoch": 0.37084318576329567, + "grad_norm": 1.7555267810821533, + "learning_rate": 1.5072094295333734e-05, + "loss": 1.3785, + "step": 6783 + }, + { + "epoch": 0.3708978582070172, + "grad_norm": 1.3064069747924805, + "learning_rate": 1.5070519352421993e-05, + "loss": 1.589, + "step": 6784 + }, + { + "epoch": 0.37095253065073874, + "grad_norm": 1.10560941696167, + "learning_rate": 1.5068944240193713e-05, + "loss": 1.5939, + "step": 6785 + }, + { + "epoch": 0.37100720309446034, + "grad_norm": 1.7947505712509155, + "learning_rate": 1.5067368958701487e-05, + "loss": 1.2039, + "step": 6786 + }, + { + "epoch": 0.3710618755381819, + "grad_norm": 1.9261040687561035, + "learning_rate": 1.5065793507997923e-05, + "loss": 1.0436, + "step": 6787 + }, + { + "epoch": 0.3711165479819034, + "grad_norm": 1.8454744815826416, + "learning_rate": 1.5064217888135626e-05, + "loss": 1.3992, + "step": 6788 + }, + { + "epoch": 0.37117122042562495, + "grad_norm": 1.4551526308059692, + "learning_rate": 1.5062642099167208e-05, + "loss": 1.5865, + "step": 6789 + }, + { + "epoch": 0.37122589286934654, + "grad_norm": 1.5167059898376465, + "learning_rate": 1.5061066141145294e-05, + "loss": 1.3579, + "step": 6790 + }, + { + "epoch": 0.3712805653130681, + "grad_norm": 1.3503776788711548, + "learning_rate": 1.5059490014122502e-05, + "loss": 1.5698, + "step": 6791 + }, + { + "epoch": 0.3713352377567896, + "grad_norm": 1.1322814226150513, + "learning_rate": 1.5057913718151468e-05, + "loss": 1.3212, + "step": 6792 + }, + { + "epoch": 0.3713899102005112, + "grad_norm": 1.4388023614883423, + "learning_rate": 1.5056337253284825e-05, + "loss": 1.3674, + "step": 6793 + }, + { + "epoch": 0.37144458264423275, + "grad_norm": 1.383577585220337, + "learning_rate": 1.5054760619575217e-05, + "loss": 1.6857, + "step": 6794 + }, + { + "epoch": 0.3714992550879543, + "grad_norm": 1.2113494873046875, + "learning_rate": 1.505318381707529e-05, + "loss": 1.5302, + "step": 6795 + }, + { + "epoch": 0.3715539275316758, + "grad_norm": 2.444333076477051, + "learning_rate": 1.5051606845837699e-05, + "loss": 1.4349, + "step": 6796 + }, + { + "epoch": 0.3716085999753974, + "grad_norm": 1.335412621498108, + "learning_rate": 1.5050029705915101e-05, + "loss": 1.4115, + "step": 6797 + }, + { + "epoch": 0.37166327241911895, + "grad_norm": 1.623543620109558, + "learning_rate": 1.5048452397360158e-05, + "loss": 1.1581, + "step": 6798 + }, + { + "epoch": 0.3717179448628405, + "grad_norm": 1.330867052078247, + "learning_rate": 1.5046874920225544e-05, + "loss": 1.5187, + "step": 6799 + }, + { + "epoch": 0.3717726173065621, + "grad_norm": 1.3880338668823242, + "learning_rate": 1.5045297274563937e-05, + "loss": 1.3818, + "step": 6800 + }, + { + "epoch": 0.3718272897502836, + "grad_norm": 1.740825891494751, + "learning_rate": 1.5043719460428013e-05, + "loss": 1.4774, + "step": 6801 + }, + { + "epoch": 0.37188196219400516, + "grad_norm": 1.2630417346954346, + "learning_rate": 1.504214147787046e-05, + "loss": 1.5098, + "step": 6802 + }, + { + "epoch": 0.3719366346377267, + "grad_norm": 1.5440456867218018, + "learning_rate": 1.5040563326943974e-05, + "loss": 1.3687, + "step": 6803 + }, + { + "epoch": 0.3719913070814483, + "grad_norm": 1.3619625568389893, + "learning_rate": 1.5038985007701246e-05, + "loss": 1.4917, + "step": 6804 + }, + { + "epoch": 0.3720459795251698, + "grad_norm": 1.7686488628387451, + "learning_rate": 1.5037406520194985e-05, + "loss": 1.5069, + "step": 6805 + }, + { + "epoch": 0.37210065196889136, + "grad_norm": 1.4500036239624023, + "learning_rate": 1.50358278644779e-05, + "loss": 1.256, + "step": 6806 + }, + { + "epoch": 0.37215532441261295, + "grad_norm": 1.3196014165878296, + "learning_rate": 1.5034249040602709e-05, + "loss": 1.5021, + "step": 6807 + }, + { + "epoch": 0.3722099968563345, + "grad_norm": 1.3264737129211426, + "learning_rate": 1.5032670048622126e-05, + "loss": 1.6365, + "step": 6808 + }, + { + "epoch": 0.37226466930005603, + "grad_norm": 2.1915159225463867, + "learning_rate": 1.5031090888588882e-05, + "loss": 1.462, + "step": 6809 + }, + { + "epoch": 0.37231934174377757, + "grad_norm": 1.4296133518218994, + "learning_rate": 1.5029511560555707e-05, + "loss": 1.5081, + "step": 6810 + }, + { + "epoch": 0.37237401418749916, + "grad_norm": 1.3907006978988647, + "learning_rate": 1.5027932064575339e-05, + "loss": 1.6642, + "step": 6811 + }, + { + "epoch": 0.3724286866312207, + "grad_norm": 1.7449167966842651, + "learning_rate": 1.5026352400700517e-05, + "loss": 1.3847, + "step": 6812 + }, + { + "epoch": 0.37248335907494223, + "grad_norm": 2.0846779346466064, + "learning_rate": 1.5024772568983998e-05, + "loss": 1.5735, + "step": 6813 + }, + { + "epoch": 0.3725380315186638, + "grad_norm": 1.4524890184402466, + "learning_rate": 1.5023192569478533e-05, + "loss": 1.2114, + "step": 6814 + }, + { + "epoch": 0.37259270396238536, + "grad_norm": 1.5513415336608887, + "learning_rate": 1.5021612402236878e-05, + "loss": 1.7861, + "step": 6815 + }, + { + "epoch": 0.3726473764061069, + "grad_norm": 1.9913691282272339, + "learning_rate": 1.5020032067311804e-05, + "loss": 1.33, + "step": 6816 + }, + { + "epoch": 0.37270204884982844, + "grad_norm": 1.572628378868103, + "learning_rate": 1.5018451564756078e-05, + "loss": 1.3425, + "step": 6817 + }, + { + "epoch": 0.37275672129355003, + "grad_norm": 2.231083393096924, + "learning_rate": 1.5016870894622475e-05, + "loss": 1.6026, + "step": 6818 + }, + { + "epoch": 0.37281139373727157, + "grad_norm": 1.3069727420806885, + "learning_rate": 1.5015290056963787e-05, + "loss": 1.4689, + "step": 6819 + }, + { + "epoch": 0.3728660661809931, + "grad_norm": 1.551588773727417, + "learning_rate": 1.5013709051832792e-05, + "loss": 1.3283, + "step": 6820 + }, + { + "epoch": 0.3729207386247147, + "grad_norm": 1.5013911724090576, + "learning_rate": 1.5012127879282284e-05, + "loss": 1.5536, + "step": 6821 + }, + { + "epoch": 0.37297541106843624, + "grad_norm": 1.7409495115280151, + "learning_rate": 1.5010546539365067e-05, + "loss": 1.3918, + "step": 6822 + }, + { + "epoch": 0.3730300835121578, + "grad_norm": 1.9630331993103027, + "learning_rate": 1.5008965032133942e-05, + "loss": 1.4918, + "step": 6823 + }, + { + "epoch": 0.3730847559558793, + "grad_norm": 1.3263533115386963, + "learning_rate": 1.5007383357641723e-05, + "loss": 1.4759, + "step": 6824 + }, + { + "epoch": 0.3731394283996009, + "grad_norm": 1.6945505142211914, + "learning_rate": 1.5005801515941221e-05, + "loss": 1.3511, + "step": 6825 + }, + { + "epoch": 0.37319410084332244, + "grad_norm": 1.6476727724075317, + "learning_rate": 1.5004219507085264e-05, + "loss": 1.3552, + "step": 6826 + }, + { + "epoch": 0.373248773287044, + "grad_norm": 1.6107819080352783, + "learning_rate": 1.5002637331126672e-05, + "loss": 1.5385, + "step": 6827 + }, + { + "epoch": 0.37330344573076557, + "grad_norm": 1.3220244646072388, + "learning_rate": 1.500105498811828e-05, + "loss": 1.2486, + "step": 6828 + }, + { + "epoch": 0.3733581181744871, + "grad_norm": 1.4555805921554565, + "learning_rate": 1.4999472478112927e-05, + "loss": 1.3175, + "step": 6829 + }, + { + "epoch": 0.37341279061820865, + "grad_norm": 1.4595065116882324, + "learning_rate": 1.4997889801163456e-05, + "loss": 1.5842, + "step": 6830 + }, + { + "epoch": 0.37346746306193024, + "grad_norm": 1.8587316274642944, + "learning_rate": 1.499630695732272e-05, + "loss": 1.3326, + "step": 6831 + }, + { + "epoch": 0.3735221355056518, + "grad_norm": 1.8334879875183105, + "learning_rate": 1.4994723946643568e-05, + "loss": 1.4681, + "step": 6832 + }, + { + "epoch": 0.3735768079493733, + "grad_norm": 1.4710063934326172, + "learning_rate": 1.4993140769178862e-05, + "loss": 1.4971, + "step": 6833 + }, + { + "epoch": 0.37363148039309485, + "grad_norm": 1.2888786792755127, + "learning_rate": 1.4991557424981471e-05, + "loss": 1.5373, + "step": 6834 + }, + { + "epoch": 0.37368615283681644, + "grad_norm": 1.3635671138763428, + "learning_rate": 1.4989973914104262e-05, + "loss": 1.2551, + "step": 6835 + }, + { + "epoch": 0.373740825280538, + "grad_norm": 1.3801630735397339, + "learning_rate": 1.4988390236600117e-05, + "loss": 1.4499, + "step": 6836 + }, + { + "epoch": 0.3737954977242595, + "grad_norm": 1.6162893772125244, + "learning_rate": 1.4986806392521913e-05, + "loss": 1.6495, + "step": 6837 + }, + { + "epoch": 0.3738501701679811, + "grad_norm": 1.7170231342315674, + "learning_rate": 1.4985222381922543e-05, + "loss": 1.4476, + "step": 6838 + }, + { + "epoch": 0.37390484261170265, + "grad_norm": 1.4761953353881836, + "learning_rate": 1.4983638204854902e-05, + "loss": 1.6517, + "step": 6839 + }, + { + "epoch": 0.3739595150554242, + "grad_norm": 1.588178038597107, + "learning_rate": 1.4982053861371885e-05, + "loss": 1.4694, + "step": 6840 + }, + { + "epoch": 0.3740141874991457, + "grad_norm": 1.275499939918518, + "learning_rate": 1.4980469351526402e-05, + "loss": 1.5584, + "step": 6841 + }, + { + "epoch": 0.3740688599428673, + "grad_norm": 1.540328025817871, + "learning_rate": 1.4978884675371354e-05, + "loss": 1.5027, + "step": 6842 + }, + { + "epoch": 0.37412353238658885, + "grad_norm": 1.1532306671142578, + "learning_rate": 1.4977299832959666e-05, + "loss": 1.4151, + "step": 6843 + }, + { + "epoch": 0.3741782048303104, + "grad_norm": 1.4785367250442505, + "learning_rate": 1.4975714824344258e-05, + "loss": 1.3634, + "step": 6844 + }, + { + "epoch": 0.374232877274032, + "grad_norm": 1.439255714416504, + "learning_rate": 1.4974129649578058e-05, + "loss": 1.48, + "step": 6845 + }, + { + "epoch": 0.3742875497177535, + "grad_norm": 1.6276497840881348, + "learning_rate": 1.4972544308713995e-05, + "loss": 1.4869, + "step": 6846 + }, + { + "epoch": 0.37434222216147506, + "grad_norm": 1.3511296510696411, + "learning_rate": 1.497095880180501e-05, + "loss": 1.2007, + "step": 6847 + }, + { + "epoch": 0.3743968946051966, + "grad_norm": 1.6661711931228638, + "learning_rate": 1.4969373128904043e-05, + "loss": 1.7716, + "step": 6848 + }, + { + "epoch": 0.3744515670489182, + "grad_norm": 2.045109748840332, + "learning_rate": 1.4967787290064048e-05, + "loss": 1.5721, + "step": 6849 + }, + { + "epoch": 0.3745062394926397, + "grad_norm": 1.396393060684204, + "learning_rate": 1.4966201285337978e-05, + "loss": 1.4726, + "step": 6850 + }, + { + "epoch": 0.37456091193636126, + "grad_norm": 1.400280475616455, + "learning_rate": 1.4964615114778794e-05, + "loss": 1.4948, + "step": 6851 + }, + { + "epoch": 0.37461558438008286, + "grad_norm": 1.3412615060806274, + "learning_rate": 1.496302877843946e-05, + "loss": 1.4494, + "step": 6852 + }, + { + "epoch": 0.3746702568238044, + "grad_norm": 1.500717282295227, + "learning_rate": 1.4961442276372951e-05, + "loss": 1.3316, + "step": 6853 + }, + { + "epoch": 0.37472492926752593, + "grad_norm": 1.5692627429962158, + "learning_rate": 1.495985560863224e-05, + "loss": 1.5597, + "step": 6854 + }, + { + "epoch": 0.37477960171124747, + "grad_norm": 2.367035150527954, + "learning_rate": 1.4958268775270315e-05, + "loss": 1.4338, + "step": 6855 + }, + { + "epoch": 0.37483427415496906, + "grad_norm": 1.281152606010437, + "learning_rate": 1.4956681776340157e-05, + "loss": 1.5289, + "step": 6856 + }, + { + "epoch": 0.3748889465986906, + "grad_norm": 1.4713737964630127, + "learning_rate": 1.4955094611894763e-05, + "loss": 1.2763, + "step": 6857 + }, + { + "epoch": 0.37494361904241214, + "grad_norm": 1.4972565174102783, + "learning_rate": 1.4953507281987137e-05, + "loss": 1.5387, + "step": 6858 + }, + { + "epoch": 0.37499829148613373, + "grad_norm": 1.854544758796692, + "learning_rate": 1.4951919786670274e-05, + "loss": 1.5402, + "step": 6859 + }, + { + "epoch": 0.37505296392985527, + "grad_norm": 1.8671777248382568, + "learning_rate": 1.4950332125997192e-05, + "loss": 1.5691, + "step": 6860 + }, + { + "epoch": 0.3751076363735768, + "grad_norm": 1.4069650173187256, + "learning_rate": 1.4948744300020903e-05, + "loss": 1.4164, + "step": 6861 + }, + { + "epoch": 0.37516230881729834, + "grad_norm": 1.4400280714035034, + "learning_rate": 1.494715630879443e-05, + "loss": 1.479, + "step": 6862 + }, + { + "epoch": 0.37521698126101993, + "grad_norm": 1.8423950672149658, + "learning_rate": 1.4945568152370797e-05, + "loss": 1.3342, + "step": 6863 + }, + { + "epoch": 0.37527165370474147, + "grad_norm": 1.4443327188491821, + "learning_rate": 1.494397983080304e-05, + "loss": 1.7703, + "step": 6864 + }, + { + "epoch": 0.375326326148463, + "grad_norm": 1.59872305393219, + "learning_rate": 1.4942391344144196e-05, + "loss": 1.4515, + "step": 6865 + }, + { + "epoch": 0.3753809985921846, + "grad_norm": 1.3361616134643555, + "learning_rate": 1.4940802692447306e-05, + "loss": 1.4639, + "step": 6866 + }, + { + "epoch": 0.37543567103590614, + "grad_norm": 1.3910810947418213, + "learning_rate": 1.493921387576542e-05, + "loss": 1.5232, + "step": 6867 + }, + { + "epoch": 0.3754903434796277, + "grad_norm": 1.432551622390747, + "learning_rate": 1.4937624894151592e-05, + "loss": 1.2996, + "step": 6868 + }, + { + "epoch": 0.3755450159233492, + "grad_norm": 1.8263143301010132, + "learning_rate": 1.4936035747658884e-05, + "loss": 1.579, + "step": 6869 + }, + { + "epoch": 0.3755996883670708, + "grad_norm": 1.3565610647201538, + "learning_rate": 1.4934446436340357e-05, + "loss": 1.311, + "step": 6870 + }, + { + "epoch": 0.37565436081079234, + "grad_norm": 2.002131462097168, + "learning_rate": 1.4932856960249087e-05, + "loss": 1.3385, + "step": 6871 + }, + { + "epoch": 0.3757090332545139, + "grad_norm": 1.287316083908081, + "learning_rate": 1.4931267319438148e-05, + "loss": 1.4995, + "step": 6872 + }, + { + "epoch": 0.3757637056982355, + "grad_norm": 1.6328338384628296, + "learning_rate": 1.4929677513960621e-05, + "loss": 1.2584, + "step": 6873 + }, + { + "epoch": 0.375818378141957, + "grad_norm": 1.7313772439956665, + "learning_rate": 1.4928087543869594e-05, + "loss": 1.463, + "step": 6874 + }, + { + "epoch": 0.37587305058567855, + "grad_norm": 1.8581429719924927, + "learning_rate": 1.4926497409218156e-05, + "loss": 1.4456, + "step": 6875 + }, + { + "epoch": 0.3759277230294001, + "grad_norm": 1.9652634859085083, + "learning_rate": 1.4924907110059415e-05, + "loss": 1.3378, + "step": 6876 + }, + { + "epoch": 0.3759823954731217, + "grad_norm": 1.5879429578781128, + "learning_rate": 1.4923316646446466e-05, + "loss": 1.4855, + "step": 6877 + }, + { + "epoch": 0.3760370679168432, + "grad_norm": 1.7318034172058105, + "learning_rate": 1.492172601843242e-05, + "loss": 1.5547, + "step": 6878 + }, + { + "epoch": 0.37609174036056475, + "grad_norm": 1.539480209350586, + "learning_rate": 1.4920135226070395e-05, + "loss": 1.4197, + "step": 6879 + }, + { + "epoch": 0.37614641280428635, + "grad_norm": 1.7184877395629883, + "learning_rate": 1.4918544269413511e-05, + "loss": 1.4044, + "step": 6880 + }, + { + "epoch": 0.3762010852480079, + "grad_norm": 1.3374072313308716, + "learning_rate": 1.491695314851489e-05, + "loss": 1.496, + "step": 6881 + }, + { + "epoch": 0.3762557576917294, + "grad_norm": 1.2997620105743408, + "learning_rate": 1.4915361863427662e-05, + "loss": 1.5037, + "step": 6882 + }, + { + "epoch": 0.37631043013545096, + "grad_norm": 1.536975622177124, + "learning_rate": 1.4913770414204973e-05, + "loss": 1.3222, + "step": 6883 + }, + { + "epoch": 0.37636510257917255, + "grad_norm": 1.62147855758667, + "learning_rate": 1.4912178800899954e-05, + "loss": 1.617, + "step": 6884 + }, + { + "epoch": 0.3764197750228941, + "grad_norm": 1.4365936517715454, + "learning_rate": 1.4910587023565763e-05, + "loss": 1.4521, + "step": 6885 + }, + { + "epoch": 0.3764744474666156, + "grad_norm": 1.5282964706420898, + "learning_rate": 1.4908995082255546e-05, + "loss": 1.4773, + "step": 6886 + }, + { + "epoch": 0.3765291199103372, + "grad_norm": 1.4318221807479858, + "learning_rate": 1.4907402977022465e-05, + "loss": 1.4869, + "step": 6887 + }, + { + "epoch": 0.37658379235405876, + "grad_norm": 1.30866277217865, + "learning_rate": 1.4905810707919681e-05, + "loss": 1.3276, + "step": 6888 + }, + { + "epoch": 0.3766384647977803, + "grad_norm": 1.337232232093811, + "learning_rate": 1.4904218275000366e-05, + "loss": 1.4621, + "step": 6889 + }, + { + "epoch": 0.37669313724150183, + "grad_norm": 1.5116654634475708, + "learning_rate": 1.4902625678317696e-05, + "loss": 1.5493, + "step": 6890 + }, + { + "epoch": 0.3767478096852234, + "grad_norm": 1.3463603258132935, + "learning_rate": 1.490103291792485e-05, + "loss": 1.4152, + "step": 6891 + }, + { + "epoch": 0.37680248212894496, + "grad_norm": 1.4765976667404175, + "learning_rate": 1.4899439993875016e-05, + "loss": 1.2307, + "step": 6892 + }, + { + "epoch": 0.3768571545726665, + "grad_norm": 2.119904041290283, + "learning_rate": 1.4897846906221381e-05, + "loss": 1.6248, + "step": 6893 + }, + { + "epoch": 0.3769118270163881, + "grad_norm": 1.8144772052764893, + "learning_rate": 1.4896253655017146e-05, + "loss": 1.3801, + "step": 6894 + }, + { + "epoch": 0.3769664994601096, + "grad_norm": 2.1030468940734863, + "learning_rate": 1.4894660240315508e-05, + "loss": 1.5668, + "step": 6895 + }, + { + "epoch": 0.37702117190383116, + "grad_norm": 1.4494425058364868, + "learning_rate": 1.4893066662169684e-05, + "loss": 1.4556, + "step": 6896 + }, + { + "epoch": 0.3770758443475527, + "grad_norm": 1.6248018741607666, + "learning_rate": 1.489147292063288e-05, + "loss": 1.6168, + "step": 6897 + }, + { + "epoch": 0.3771305167912743, + "grad_norm": 1.8197717666625977, + "learning_rate": 1.488987901575832e-05, + "loss": 1.2342, + "step": 6898 + }, + { + "epoch": 0.37718518923499583, + "grad_norm": 1.7838022708892822, + "learning_rate": 1.4888284947599222e-05, + "loss": 1.581, + "step": 6899 + }, + { + "epoch": 0.37723986167871737, + "grad_norm": 1.3779516220092773, + "learning_rate": 1.4886690716208816e-05, + "loss": 1.3543, + "step": 6900 + }, + { + "epoch": 0.37729453412243896, + "grad_norm": 1.8150420188903809, + "learning_rate": 1.4885096321640346e-05, + "loss": 1.4444, + "step": 6901 + }, + { + "epoch": 0.3773492065661605, + "grad_norm": 1.600318193435669, + "learning_rate": 1.4883501763947043e-05, + "loss": 1.4558, + "step": 6902 + }, + { + "epoch": 0.37740387900988204, + "grad_norm": 1.3729041814804077, + "learning_rate": 1.4881907043182158e-05, + "loss": 1.8275, + "step": 6903 + }, + { + "epoch": 0.3774585514536036, + "grad_norm": 1.5515002012252808, + "learning_rate": 1.488031215939894e-05, + "loss": 1.5477, + "step": 6904 + }, + { + "epoch": 0.37751322389732517, + "grad_norm": 1.9071381092071533, + "learning_rate": 1.4878717112650649e-05, + "loss": 1.4748, + "step": 6905 + }, + { + "epoch": 0.3775678963410467, + "grad_norm": 1.6363933086395264, + "learning_rate": 1.4877121902990543e-05, + "loss": 1.3445, + "step": 6906 + }, + { + "epoch": 0.37762256878476824, + "grad_norm": 1.503326177597046, + "learning_rate": 1.4875526530471893e-05, + "loss": 1.2826, + "step": 6907 + }, + { + "epoch": 0.37767724122848984, + "grad_norm": 1.498329520225525, + "learning_rate": 1.4873930995147971e-05, + "loss": 1.2701, + "step": 6908 + }, + { + "epoch": 0.3777319136722114, + "grad_norm": 1.1546827554702759, + "learning_rate": 1.4872335297072057e-05, + "loss": 1.4427, + "step": 6909 + }, + { + "epoch": 0.3777865861159329, + "grad_norm": 1.5047457218170166, + "learning_rate": 1.4870739436297435e-05, + "loss": 1.4133, + "step": 6910 + }, + { + "epoch": 0.37784125855965445, + "grad_norm": 1.6081079244613647, + "learning_rate": 1.4869143412877393e-05, + "loss": 1.5025, + "step": 6911 + }, + { + "epoch": 0.37789593100337604, + "grad_norm": 1.2512507438659668, + "learning_rate": 1.4867547226865227e-05, + "loss": 1.5929, + "step": 6912 + }, + { + "epoch": 0.3779506034470976, + "grad_norm": 1.8907984495162964, + "learning_rate": 1.4865950878314234e-05, + "loss": 1.5, + "step": 6913 + }, + { + "epoch": 0.3780052758908191, + "grad_norm": 1.6918076276779175, + "learning_rate": 1.4864354367277725e-05, + "loss": 1.4976, + "step": 6914 + }, + { + "epoch": 0.3780599483345407, + "grad_norm": 1.7846254110336304, + "learning_rate": 1.4862757693809009e-05, + "loss": 1.5158, + "step": 6915 + }, + { + "epoch": 0.37811462077826224, + "grad_norm": 1.6594511270523071, + "learning_rate": 1.48611608579614e-05, + "loss": 1.4016, + "step": 6916 + }, + { + "epoch": 0.3781692932219838, + "grad_norm": 1.5859004259109497, + "learning_rate": 1.4859563859788228e-05, + "loss": 1.4913, + "step": 6917 + }, + { + "epoch": 0.3782239656657053, + "grad_norm": 1.3083164691925049, + "learning_rate": 1.4857966699342817e-05, + "loss": 1.5016, + "step": 6918 + }, + { + "epoch": 0.3782786381094269, + "grad_norm": 1.3641434907913208, + "learning_rate": 1.4856369376678492e-05, + "loss": 1.6006, + "step": 6919 + }, + { + "epoch": 0.37833331055314845, + "grad_norm": 1.6097583770751953, + "learning_rate": 1.4854771891848598e-05, + "loss": 1.5688, + "step": 6920 + }, + { + "epoch": 0.37838798299687, + "grad_norm": 1.3268519639968872, + "learning_rate": 1.485317424490648e-05, + "loss": 1.5025, + "step": 6921 + }, + { + "epoch": 0.3784426554405916, + "grad_norm": 1.997286081314087, + "learning_rate": 1.4851576435905489e-05, + "loss": 1.6834, + "step": 6922 + }, + { + "epoch": 0.3784973278843131, + "grad_norm": 1.8957045078277588, + "learning_rate": 1.4849978464898971e-05, + "loss": 1.3379, + "step": 6923 + }, + { + "epoch": 0.37855200032803465, + "grad_norm": 2.0413787364959717, + "learning_rate": 1.4848380331940295e-05, + "loss": 1.6911, + "step": 6924 + }, + { + "epoch": 0.3786066727717562, + "grad_norm": 1.3739482164382935, + "learning_rate": 1.4846782037082824e-05, + "loss": 1.7398, + "step": 6925 + }, + { + "epoch": 0.3786613452154778, + "grad_norm": 1.431179404258728, + "learning_rate": 1.4845183580379923e-05, + "loss": 1.4624, + "step": 6926 + }, + { + "epoch": 0.3787160176591993, + "grad_norm": 1.909140706062317, + "learning_rate": 1.4843584961884973e-05, + "loss": 1.0867, + "step": 6927 + }, + { + "epoch": 0.37877069010292086, + "grad_norm": 1.642857551574707, + "learning_rate": 1.4841986181651355e-05, + "loss": 1.2286, + "step": 6928 + }, + { + "epoch": 0.37882536254664245, + "grad_norm": 1.3068848848342896, + "learning_rate": 1.484038723973246e-05, + "loss": 1.5071, + "step": 6929 + }, + { + "epoch": 0.378880034990364, + "grad_norm": 1.5250194072723389, + "learning_rate": 1.4838788136181676e-05, + "loss": 1.3257, + "step": 6930 + }, + { + "epoch": 0.3789347074340855, + "grad_norm": 1.4690477848052979, + "learning_rate": 1.4837188871052399e-05, + "loss": 1.6015, + "step": 6931 + }, + { + "epoch": 0.37898937987780706, + "grad_norm": 1.5182164907455444, + "learning_rate": 1.4835589444398037e-05, + "loss": 1.1478, + "step": 6932 + }, + { + "epoch": 0.37904405232152866, + "grad_norm": 1.3457931280136108, + "learning_rate": 1.4833989856271995e-05, + "loss": 1.5161, + "step": 6933 + }, + { + "epoch": 0.3790987247652502, + "grad_norm": 1.9236607551574707, + "learning_rate": 1.4832390106727688e-05, + "loss": 1.3869, + "step": 6934 + }, + { + "epoch": 0.37915339720897173, + "grad_norm": 1.6523454189300537, + "learning_rate": 1.4830790195818537e-05, + "loss": 1.5542, + "step": 6935 + }, + { + "epoch": 0.3792080696526933, + "grad_norm": 1.3460307121276855, + "learning_rate": 1.4829190123597965e-05, + "loss": 1.3678, + "step": 6936 + }, + { + "epoch": 0.37926274209641486, + "grad_norm": 1.3905569314956665, + "learning_rate": 1.4827589890119404e-05, + "loss": 1.449, + "step": 6937 + }, + { + "epoch": 0.3793174145401364, + "grad_norm": 1.3736861944198608, + "learning_rate": 1.4825989495436286e-05, + "loss": 1.4624, + "step": 6938 + }, + { + "epoch": 0.37937208698385794, + "grad_norm": 1.5613900423049927, + "learning_rate": 1.4824388939602056e-05, + "loss": 1.3107, + "step": 6939 + }, + { + "epoch": 0.37942675942757953, + "grad_norm": 1.571496605873108, + "learning_rate": 1.482278822267016e-05, + "loss": 1.5843, + "step": 6940 + }, + { + "epoch": 0.37948143187130107, + "grad_norm": 1.4762074947357178, + "learning_rate": 1.4821187344694043e-05, + "loss": 1.5361, + "step": 6941 + }, + { + "epoch": 0.3795361043150226, + "grad_norm": 2.4888625144958496, + "learning_rate": 1.4819586305727169e-05, + "loss": 1.633, + "step": 6942 + }, + { + "epoch": 0.3795907767587442, + "grad_norm": 1.2259100675582886, + "learning_rate": 1.4817985105823003e-05, + "loss": 1.7423, + "step": 6943 + }, + { + "epoch": 0.37964544920246573, + "grad_norm": 1.5599817037582397, + "learning_rate": 1.4816383745035006e-05, + "loss": 1.3995, + "step": 6944 + }, + { + "epoch": 0.37970012164618727, + "grad_norm": 1.7060025930404663, + "learning_rate": 1.4814782223416653e-05, + "loss": 1.0666, + "step": 6945 + }, + { + "epoch": 0.3797547940899088, + "grad_norm": 1.4277160167694092, + "learning_rate": 1.4813180541021425e-05, + "loss": 1.5149, + "step": 6946 + }, + { + "epoch": 0.3798094665336304, + "grad_norm": 1.2778736352920532, + "learning_rate": 1.4811578697902802e-05, + "loss": 1.5413, + "step": 6947 + }, + { + "epoch": 0.37986413897735194, + "grad_norm": 1.8393707275390625, + "learning_rate": 1.4809976694114276e-05, + "loss": 1.3874, + "step": 6948 + }, + { + "epoch": 0.3799188114210735, + "grad_norm": 1.4420892000198364, + "learning_rate": 1.4808374529709344e-05, + "loss": 1.5901, + "step": 6949 + }, + { + "epoch": 0.37997348386479507, + "grad_norm": 1.3326573371887207, + "learning_rate": 1.4806772204741503e-05, + "loss": 1.4328, + "step": 6950 + }, + { + "epoch": 0.3800281563085166, + "grad_norm": 1.768367886543274, + "learning_rate": 1.4805169719264255e-05, + "loss": 1.3728, + "step": 6951 + }, + { + "epoch": 0.38008282875223814, + "grad_norm": 1.1524890661239624, + "learning_rate": 1.4803567073331115e-05, + "loss": 1.4633, + "step": 6952 + }, + { + "epoch": 0.3801375011959597, + "grad_norm": 1.6840651035308838, + "learning_rate": 1.4801964266995601e-05, + "loss": 1.4821, + "step": 6953 + }, + { + "epoch": 0.3801921736396813, + "grad_norm": 1.3279024362564087, + "learning_rate": 1.480036130031123e-05, + "loss": 1.4359, + "step": 6954 + }, + { + "epoch": 0.3802468460834028, + "grad_norm": 1.30191969871521, + "learning_rate": 1.4798758173331528e-05, + "loss": 1.4008, + "step": 6955 + }, + { + "epoch": 0.38030151852712435, + "grad_norm": 1.5847549438476562, + "learning_rate": 1.4797154886110037e-05, + "loss": 1.6376, + "step": 6956 + }, + { + "epoch": 0.38035619097084594, + "grad_norm": 1.5342968702316284, + "learning_rate": 1.4795551438700283e-05, + "loss": 1.578, + "step": 6957 + }, + { + "epoch": 0.3804108634145675, + "grad_norm": 1.2683879137039185, + "learning_rate": 1.4793947831155815e-05, + "loss": 1.6276, + "step": 6958 + }, + { + "epoch": 0.380465535858289, + "grad_norm": 1.9469002485275269, + "learning_rate": 1.4792344063530177e-05, + "loss": 1.4715, + "step": 6959 + }, + { + "epoch": 0.38052020830201055, + "grad_norm": 1.755374789237976, + "learning_rate": 1.4790740135876929e-05, + "loss": 1.3721, + "step": 6960 + }, + { + "epoch": 0.38057488074573215, + "grad_norm": 1.630825400352478, + "learning_rate": 1.4789136048249621e-05, + "loss": 1.7169, + "step": 6961 + }, + { + "epoch": 0.3806295531894537, + "grad_norm": 1.5660369396209717, + "learning_rate": 1.4787531800701826e-05, + "loss": 1.285, + "step": 6962 + }, + { + "epoch": 0.3806842256331752, + "grad_norm": 1.1465494632720947, + "learning_rate": 1.478592739328711e-05, + "loss": 1.6642, + "step": 6963 + }, + { + "epoch": 0.3807388980768968, + "grad_norm": 1.1996369361877441, + "learning_rate": 1.4784322826059048e-05, + "loss": 1.6128, + "step": 6964 + }, + { + "epoch": 0.38079357052061835, + "grad_norm": 1.611863374710083, + "learning_rate": 1.4782718099071219e-05, + "loss": 1.3515, + "step": 6965 + }, + { + "epoch": 0.3808482429643399, + "grad_norm": 1.5977388620376587, + "learning_rate": 1.4781113212377207e-05, + "loss": 1.4252, + "step": 6966 + }, + { + "epoch": 0.3809029154080614, + "grad_norm": 1.6778641939163208, + "learning_rate": 1.4779508166030609e-05, + "loss": 1.4467, + "step": 6967 + }, + { + "epoch": 0.380957587851783, + "grad_norm": 1.543745756149292, + "learning_rate": 1.4777902960085017e-05, + "loss": 1.5436, + "step": 6968 + }, + { + "epoch": 0.38101226029550456, + "grad_norm": 1.9720436334609985, + "learning_rate": 1.4776297594594033e-05, + "loss": 1.4539, + "step": 6969 + }, + { + "epoch": 0.3810669327392261, + "grad_norm": 1.4691612720489502, + "learning_rate": 1.4774692069611267e-05, + "loss": 1.5272, + "step": 6970 + }, + { + "epoch": 0.3811216051829477, + "grad_norm": 1.5932419300079346, + "learning_rate": 1.4773086385190328e-05, + "loss": 1.2001, + "step": 6971 + }, + { + "epoch": 0.3811762776266692, + "grad_norm": 1.6119154691696167, + "learning_rate": 1.4771480541384831e-05, + "loss": 1.3265, + "step": 6972 + }, + { + "epoch": 0.38123095007039076, + "grad_norm": 1.5390818119049072, + "learning_rate": 1.4769874538248404e-05, + "loss": 1.4896, + "step": 6973 + }, + { + "epoch": 0.3812856225141123, + "grad_norm": 1.4487336874008179, + "learning_rate": 1.4768268375834673e-05, + "loss": 1.4456, + "step": 6974 + }, + { + "epoch": 0.3813402949578339, + "grad_norm": 1.4393633604049683, + "learning_rate": 1.476666205419727e-05, + "loss": 1.4432, + "step": 6975 + }, + { + "epoch": 0.38139496740155543, + "grad_norm": 1.5405369997024536, + "learning_rate": 1.476505557338984e-05, + "loss": 1.5365, + "step": 6976 + }, + { + "epoch": 0.38144963984527697, + "grad_norm": 1.72196626663208, + "learning_rate": 1.4763448933466018e-05, + "loss": 1.4643, + "step": 6977 + }, + { + "epoch": 0.38150431228899856, + "grad_norm": 1.3648947477340698, + "learning_rate": 1.4761842134479463e-05, + "loss": 1.5584, + "step": 6978 + }, + { + "epoch": 0.3815589847327201, + "grad_norm": 1.4580918550491333, + "learning_rate": 1.4760235176483821e-05, + "loss": 1.5056, + "step": 6979 + }, + { + "epoch": 0.38161365717644163, + "grad_norm": 1.5001238584518433, + "learning_rate": 1.475862805953276e-05, + "loss": 1.6647, + "step": 6980 + }, + { + "epoch": 0.38166832962016317, + "grad_norm": 1.6604326963424683, + "learning_rate": 1.475702078367994e-05, + "loss": 1.7494, + "step": 6981 + }, + { + "epoch": 0.38172300206388476, + "grad_norm": 1.605187177658081, + "learning_rate": 1.4755413348979034e-05, + "loss": 1.1529, + "step": 6982 + }, + { + "epoch": 0.3817776745076063, + "grad_norm": 1.4649220705032349, + "learning_rate": 1.4753805755483717e-05, + "loss": 1.4667, + "step": 6983 + }, + { + "epoch": 0.38183234695132784, + "grad_norm": 1.4975745677947998, + "learning_rate": 1.4752198003247669e-05, + "loss": 1.3985, + "step": 6984 + }, + { + "epoch": 0.38188701939504943, + "grad_norm": 1.2217860221862793, + "learning_rate": 1.4750590092324579e-05, + "loss": 1.5407, + "step": 6985 + }, + { + "epoch": 0.38194169183877097, + "grad_norm": 1.6092253923416138, + "learning_rate": 1.4748982022768139e-05, + "loss": 1.2847, + "step": 6986 + }, + { + "epoch": 0.3819963642824925, + "grad_norm": 1.4243605136871338, + "learning_rate": 1.4747373794632043e-05, + "loss": 1.3268, + "step": 6987 + }, + { + "epoch": 0.38205103672621404, + "grad_norm": 1.6093586683273315, + "learning_rate": 1.474576540797e-05, + "loss": 1.4852, + "step": 6988 + }, + { + "epoch": 0.38210570916993564, + "grad_norm": 1.2129120826721191, + "learning_rate": 1.4744156862835712e-05, + "loss": 1.7696, + "step": 6989 + }, + { + "epoch": 0.3821603816136572, + "grad_norm": 1.3222148418426514, + "learning_rate": 1.4742548159282892e-05, + "loss": 1.5001, + "step": 6990 + }, + { + "epoch": 0.3822150540573787, + "grad_norm": 1.724045753479004, + "learning_rate": 1.4740939297365261e-05, + "loss": 1.7963, + "step": 6991 + }, + { + "epoch": 0.3822697265011003, + "grad_norm": 1.62369704246521, + "learning_rate": 1.4739330277136546e-05, + "loss": 1.4265, + "step": 6992 + }, + { + "epoch": 0.38232439894482184, + "grad_norm": 1.440560221672058, + "learning_rate": 1.4737721098650468e-05, + "loss": 1.1757, + "step": 6993 + }, + { + "epoch": 0.3823790713885434, + "grad_norm": 1.5255308151245117, + "learning_rate": 1.4736111761960766e-05, + "loss": 1.4542, + "step": 6994 + }, + { + "epoch": 0.3824337438322649, + "grad_norm": 1.58225679397583, + "learning_rate": 1.4734502267121177e-05, + "loss": 1.4525, + "step": 6995 + }, + { + "epoch": 0.3824884162759865, + "grad_norm": 1.4901467561721802, + "learning_rate": 1.473289261418545e-05, + "loss": 1.4365, + "step": 6996 + }, + { + "epoch": 0.38254308871970805, + "grad_norm": 1.5742803812026978, + "learning_rate": 1.473128280320733e-05, + "loss": 1.4709, + "step": 6997 + }, + { + "epoch": 0.3825977611634296, + "grad_norm": 1.7561379671096802, + "learning_rate": 1.4729672834240575e-05, + "loss": 1.139, + "step": 6998 + }, + { + "epoch": 0.3826524336071512, + "grad_norm": 1.6477843523025513, + "learning_rate": 1.4728062707338949e-05, + "loss": 1.3696, + "step": 6999 + }, + { + "epoch": 0.3827071060508727, + "grad_norm": 1.9647576808929443, + "learning_rate": 1.4726452422556212e-05, + "loss": 1.4007, + "step": 7000 + }, + { + "epoch": 0.38276177849459425, + "grad_norm": 1.5126396417617798, + "learning_rate": 1.4724841979946139e-05, + "loss": 1.2752, + "step": 7001 + }, + { + "epoch": 0.3828164509383158, + "grad_norm": 1.505480408668518, + "learning_rate": 1.4723231379562504e-05, + "loss": 1.5183, + "step": 7002 + }, + { + "epoch": 0.3828711233820374, + "grad_norm": 1.3944612741470337, + "learning_rate": 1.472162062145909e-05, + "loss": 1.4329, + "step": 7003 + }, + { + "epoch": 0.3829257958257589, + "grad_norm": 1.7417651414871216, + "learning_rate": 1.4720009705689682e-05, + "loss": 1.4678, + "step": 7004 + }, + { + "epoch": 0.38298046826948046, + "grad_norm": 1.484724760055542, + "learning_rate": 1.4718398632308075e-05, + "loss": 1.5609, + "step": 7005 + }, + { + "epoch": 0.38303514071320205, + "grad_norm": 1.2030538320541382, + "learning_rate": 1.4716787401368067e-05, + "loss": 1.5231, + "step": 7006 + }, + { + "epoch": 0.3830898131569236, + "grad_norm": 1.3916820287704468, + "learning_rate": 1.4715176012923458e-05, + "loss": 1.3823, + "step": 7007 + }, + { + "epoch": 0.3831444856006451, + "grad_norm": 1.894790768623352, + "learning_rate": 1.471356446702806e-05, + "loss": 1.3606, + "step": 7008 + }, + { + "epoch": 0.38319915804436666, + "grad_norm": 1.5714023113250732, + "learning_rate": 1.4711952763735683e-05, + "loss": 1.2485, + "step": 7009 + }, + { + "epoch": 0.38325383048808825, + "grad_norm": 1.349298357963562, + "learning_rate": 1.4710340903100145e-05, + "loss": 1.5998, + "step": 7010 + }, + { + "epoch": 0.3833085029318098, + "grad_norm": 1.3316974639892578, + "learning_rate": 1.470872888517527e-05, + "loss": 1.5954, + "step": 7011 + }, + { + "epoch": 0.38336317537553133, + "grad_norm": 1.5195090770721436, + "learning_rate": 1.4707116710014887e-05, + "loss": 1.7014, + "step": 7012 + }, + { + "epoch": 0.3834178478192529, + "grad_norm": 1.2037534713745117, + "learning_rate": 1.4705504377672834e-05, + "loss": 1.4789, + "step": 7013 + }, + { + "epoch": 0.38347252026297446, + "grad_norm": 1.4742014408111572, + "learning_rate": 1.4703891888202948e-05, + "loss": 1.4274, + "step": 7014 + }, + { + "epoch": 0.383527192706696, + "grad_norm": 1.7396841049194336, + "learning_rate": 1.4702279241659075e-05, + "loss": 1.3033, + "step": 7015 + }, + { + "epoch": 0.38358186515041753, + "grad_norm": 1.4824973344802856, + "learning_rate": 1.4700666438095064e-05, + "loss": 1.2599, + "step": 7016 + }, + { + "epoch": 0.3836365375941391, + "grad_norm": 1.5941402912139893, + "learning_rate": 1.4699053477564768e-05, + "loss": 1.6563, + "step": 7017 + }, + { + "epoch": 0.38369121003786066, + "grad_norm": 1.609783411026001, + "learning_rate": 1.4697440360122048e-05, + "loss": 1.3963, + "step": 7018 + }, + { + "epoch": 0.3837458824815822, + "grad_norm": 1.7037101984024048, + "learning_rate": 1.4695827085820775e-05, + "loss": 1.5715, + "step": 7019 + }, + { + "epoch": 0.3838005549253038, + "grad_norm": 1.7870827913284302, + "learning_rate": 1.4694213654714816e-05, + "loss": 1.3603, + "step": 7020 + }, + { + "epoch": 0.38385522736902533, + "grad_norm": 1.4130505323410034, + "learning_rate": 1.4692600066858048e-05, + "loss": 1.3291, + "step": 7021 + }, + { + "epoch": 0.38390989981274687, + "grad_norm": 1.5132598876953125, + "learning_rate": 1.469098632230435e-05, + "loss": 1.5028, + "step": 7022 + }, + { + "epoch": 0.3839645722564684, + "grad_norm": 1.327199935913086, + "learning_rate": 1.4689372421107612e-05, + "loss": 1.6146, + "step": 7023 + }, + { + "epoch": 0.38401924470019, + "grad_norm": 1.7967034578323364, + "learning_rate": 1.4687758363321725e-05, + "loss": 1.1512, + "step": 7024 + }, + { + "epoch": 0.38407391714391154, + "grad_norm": 1.4685204029083252, + "learning_rate": 1.4686144149000585e-05, + "loss": 1.6001, + "step": 7025 + }, + { + "epoch": 0.3841285895876331, + "grad_norm": 1.6120535135269165, + "learning_rate": 1.4684529778198097e-05, + "loss": 1.4488, + "step": 7026 + }, + { + "epoch": 0.38418326203135467, + "grad_norm": 1.7374192476272583, + "learning_rate": 1.4682915250968169e-05, + "loss": 1.2782, + "step": 7027 + }, + { + "epoch": 0.3842379344750762, + "grad_norm": 1.6490501165390015, + "learning_rate": 1.468130056736471e-05, + "loss": 1.3523, + "step": 7028 + }, + { + "epoch": 0.38429260691879774, + "grad_norm": 1.233478307723999, + "learning_rate": 1.467968572744164e-05, + "loss": 1.3921, + "step": 7029 + }, + { + "epoch": 0.3843472793625193, + "grad_norm": 1.3523919582366943, + "learning_rate": 1.4678070731252883e-05, + "loss": 1.3372, + "step": 7030 + }, + { + "epoch": 0.38440195180624087, + "grad_norm": 1.5382349491119385, + "learning_rate": 1.4676455578852365e-05, + "loss": 1.4281, + "step": 7031 + }, + { + "epoch": 0.3844566242499624, + "grad_norm": 1.7374345064163208, + "learning_rate": 1.4674840270294022e-05, + "loss": 1.5382, + "step": 7032 + }, + { + "epoch": 0.38451129669368395, + "grad_norm": 1.6818795204162598, + "learning_rate": 1.4673224805631792e-05, + "loss": 1.5191, + "step": 7033 + }, + { + "epoch": 0.38456596913740554, + "grad_norm": 1.822108268737793, + "learning_rate": 1.4671609184919622e-05, + "loss": 1.4684, + "step": 7034 + }, + { + "epoch": 0.3846206415811271, + "grad_norm": 1.3236607313156128, + "learning_rate": 1.4669993408211458e-05, + "loss": 1.2958, + "step": 7035 + }, + { + "epoch": 0.3846753140248486, + "grad_norm": 2.106252431869507, + "learning_rate": 1.4668377475561255e-05, + "loss": 1.2184, + "step": 7036 + }, + { + "epoch": 0.3847299864685702, + "grad_norm": 1.1395745277404785, + "learning_rate": 1.4666761387022974e-05, + "loss": 1.7065, + "step": 7037 + }, + { + "epoch": 0.38478465891229174, + "grad_norm": 1.4192880392074585, + "learning_rate": 1.4665145142650578e-05, + "loss": 1.5115, + "step": 7038 + }, + { + "epoch": 0.3848393313560133, + "grad_norm": 1.4892369508743286, + "learning_rate": 1.466352874249804e-05, + "loss": 1.3134, + "step": 7039 + }, + { + "epoch": 0.3848940037997348, + "grad_norm": 1.1853126287460327, + "learning_rate": 1.4661912186619336e-05, + "loss": 1.5889, + "step": 7040 + }, + { + "epoch": 0.3849486762434564, + "grad_norm": 1.399848461151123, + "learning_rate": 1.4660295475068443e-05, + "loss": 1.4045, + "step": 7041 + }, + { + "epoch": 0.38500334868717795, + "grad_norm": 1.6708743572235107, + "learning_rate": 1.4658678607899348e-05, + "loss": 1.4238, + "step": 7042 + }, + { + "epoch": 0.3850580211308995, + "grad_norm": 1.938430666923523, + "learning_rate": 1.465706158516604e-05, + "loss": 1.3797, + "step": 7043 + }, + { + "epoch": 0.3851126935746211, + "grad_norm": 1.2799221277236938, + "learning_rate": 1.4655444406922521e-05, + "loss": 1.4443, + "step": 7044 + }, + { + "epoch": 0.3851673660183426, + "grad_norm": 1.3711965084075928, + "learning_rate": 1.4653827073222785e-05, + "loss": 1.6071, + "step": 7045 + }, + { + "epoch": 0.38522203846206415, + "grad_norm": 1.66279137134552, + "learning_rate": 1.4652209584120847e-05, + "loss": 1.2157, + "step": 7046 + }, + { + "epoch": 0.3852767109057857, + "grad_norm": 1.9007402658462524, + "learning_rate": 1.4650591939670713e-05, + "loss": 1.5069, + "step": 7047 + }, + { + "epoch": 0.3853313833495073, + "grad_norm": 1.4310460090637207, + "learning_rate": 1.4648974139926403e-05, + "loss": 1.6196, + "step": 7048 + }, + { + "epoch": 0.3853860557932288, + "grad_norm": 1.4652148485183716, + "learning_rate": 1.4647356184941932e-05, + "loss": 1.4715, + "step": 7049 + }, + { + "epoch": 0.38544072823695036, + "grad_norm": 1.4994440078735352, + "learning_rate": 1.4645738074771334e-05, + "loss": 1.482, + "step": 7050 + }, + { + "epoch": 0.38549540068067195, + "grad_norm": 1.5236293077468872, + "learning_rate": 1.4644119809468645e-05, + "loss": 1.49, + "step": 7051 + }, + { + "epoch": 0.3855500731243935, + "grad_norm": 1.208574891090393, + "learning_rate": 1.4642501389087891e-05, + "loss": 1.398, + "step": 7052 + }, + { + "epoch": 0.385604745568115, + "grad_norm": 1.2645256519317627, + "learning_rate": 1.4640882813683125e-05, + "loss": 1.3759, + "step": 7053 + }, + { + "epoch": 0.38565941801183656, + "grad_norm": 1.4941219091415405, + "learning_rate": 1.4639264083308393e-05, + "loss": 1.5269, + "step": 7054 + }, + { + "epoch": 0.38571409045555816, + "grad_norm": 1.5148993730545044, + "learning_rate": 1.4637645198017745e-05, + "loss": 1.2835, + "step": 7055 + }, + { + "epoch": 0.3857687628992797, + "grad_norm": 1.6301300525665283, + "learning_rate": 1.4636026157865242e-05, + "loss": 1.4232, + "step": 7056 + }, + { + "epoch": 0.38582343534300123, + "grad_norm": 1.4371367692947388, + "learning_rate": 1.4634406962904945e-05, + "loss": 1.372, + "step": 7057 + }, + { + "epoch": 0.3858781077867228, + "grad_norm": 1.7005031108856201, + "learning_rate": 1.4632787613190928e-05, + "loss": 1.3902, + "step": 7058 + }, + { + "epoch": 0.38593278023044436, + "grad_norm": 1.4204516410827637, + "learning_rate": 1.463116810877726e-05, + "loss": 1.4352, + "step": 7059 + }, + { + "epoch": 0.3859874526741659, + "grad_norm": 1.624139428138733, + "learning_rate": 1.462954844971802e-05, + "loss": 1.5138, + "step": 7060 + }, + { + "epoch": 0.38604212511788744, + "grad_norm": 1.8401713371276855, + "learning_rate": 1.4627928636067295e-05, + "loss": 1.2519, + "step": 7061 + }, + { + "epoch": 0.38609679756160903, + "grad_norm": 1.5333913564682007, + "learning_rate": 1.4626308667879175e-05, + "loss": 1.3621, + "step": 7062 + }, + { + "epoch": 0.38615147000533057, + "grad_norm": 1.3324079513549805, + "learning_rate": 1.4624688545207749e-05, + "loss": 1.5278, + "step": 7063 + }, + { + "epoch": 0.3862061424490521, + "grad_norm": 2.0264041423797607, + "learning_rate": 1.4623068268107119e-05, + "loss": 1.698, + "step": 7064 + }, + { + "epoch": 0.3862608148927737, + "grad_norm": 1.390016794204712, + "learning_rate": 1.4621447836631395e-05, + "loss": 1.3909, + "step": 7065 + }, + { + "epoch": 0.38631548733649523, + "grad_norm": 1.5065184831619263, + "learning_rate": 1.4619827250834681e-05, + "loss": 1.162, + "step": 7066 + }, + { + "epoch": 0.38637015978021677, + "grad_norm": 1.4686866998672485, + "learning_rate": 1.4618206510771097e-05, + "loss": 1.3339, + "step": 7067 + }, + { + "epoch": 0.3864248322239383, + "grad_norm": 1.3810347318649292, + "learning_rate": 1.4616585616494759e-05, + "loss": 1.2558, + "step": 7068 + }, + { + "epoch": 0.3864795046676599, + "grad_norm": 1.4382542371749878, + "learning_rate": 1.4614964568059795e-05, + "loss": 1.5475, + "step": 7069 + }, + { + "epoch": 0.38653417711138144, + "grad_norm": 1.464514970779419, + "learning_rate": 1.4613343365520333e-05, + "loss": 1.188, + "step": 7070 + }, + { + "epoch": 0.386588849555103, + "grad_norm": 1.343736171722412, + "learning_rate": 1.4611722008930512e-05, + "loss": 1.4286, + "step": 7071 + }, + { + "epoch": 0.38664352199882457, + "grad_norm": 1.5593349933624268, + "learning_rate": 1.4610100498344471e-05, + "loss": 1.3997, + "step": 7072 + }, + { + "epoch": 0.3866981944425461, + "grad_norm": 1.291773796081543, + "learning_rate": 1.4608478833816356e-05, + "loss": 1.5106, + "step": 7073 + }, + { + "epoch": 0.38675286688626764, + "grad_norm": 2.024069309234619, + "learning_rate": 1.4606857015400317e-05, + "loss": 1.4867, + "step": 7074 + }, + { + "epoch": 0.3868075393299892, + "grad_norm": 1.427493691444397, + "learning_rate": 1.4605235043150514e-05, + "loss": 1.8079, + "step": 7075 + }, + { + "epoch": 0.3868622117737108, + "grad_norm": 1.2502484321594238, + "learning_rate": 1.4603612917121107e-05, + "loss": 1.3798, + "step": 7076 + }, + { + "epoch": 0.3869168842174323, + "grad_norm": 1.2786941528320312, + "learning_rate": 1.460199063736626e-05, + "loss": 1.382, + "step": 7077 + }, + { + "epoch": 0.38697155666115385, + "grad_norm": 1.3191874027252197, + "learning_rate": 1.4600368203940147e-05, + "loss": 1.3419, + "step": 7078 + }, + { + "epoch": 0.38702622910487544, + "grad_norm": 1.3674970865249634, + "learning_rate": 1.4598745616896946e-05, + "loss": 1.474, + "step": 7079 + }, + { + "epoch": 0.387080901548597, + "grad_norm": 1.2350491285324097, + "learning_rate": 1.4597122876290839e-05, + "loss": 1.5522, + "step": 7080 + }, + { + "epoch": 0.3871355739923185, + "grad_norm": 1.3650753498077393, + "learning_rate": 1.4595499982176007e-05, + "loss": 1.3899, + "step": 7081 + }, + { + "epoch": 0.38719024643604005, + "grad_norm": 1.6606669425964355, + "learning_rate": 1.459387693460665e-05, + "loss": 1.5585, + "step": 7082 + }, + { + "epoch": 0.38724491887976165, + "grad_norm": 1.6902354955673218, + "learning_rate": 1.4592253733636961e-05, + "loss": 1.5873, + "step": 7083 + }, + { + "epoch": 0.3872995913234832, + "grad_norm": 1.5514689683914185, + "learning_rate": 1.4590630379321145e-05, + "loss": 1.4763, + "step": 7084 + }, + { + "epoch": 0.3873542637672047, + "grad_norm": 1.6923896074295044, + "learning_rate": 1.4589006871713407e-05, + "loss": 1.3338, + "step": 7085 + }, + { + "epoch": 0.3874089362109263, + "grad_norm": 1.5572409629821777, + "learning_rate": 1.4587383210867963e-05, + "loss": 1.5118, + "step": 7086 + }, + { + "epoch": 0.38746360865464785, + "grad_norm": 1.6193046569824219, + "learning_rate": 1.458575939683903e-05, + "loss": 1.303, + "step": 7087 + }, + { + "epoch": 0.3875182810983694, + "grad_norm": 1.7118042707443237, + "learning_rate": 1.4584135429680826e-05, + "loss": 1.462, + "step": 7088 + }, + { + "epoch": 0.3875729535420909, + "grad_norm": 1.6593483686447144, + "learning_rate": 1.4582511309447585e-05, + "loss": 1.4911, + "step": 7089 + }, + { + "epoch": 0.3876276259858125, + "grad_norm": 1.5598106384277344, + "learning_rate": 1.4580887036193539e-05, + "loss": 1.3259, + "step": 7090 + }, + { + "epoch": 0.38768229842953406, + "grad_norm": 1.7252936363220215, + "learning_rate": 1.4579262609972922e-05, + "loss": 1.5217, + "step": 7091 + }, + { + "epoch": 0.3877369708732556, + "grad_norm": 1.4175281524658203, + "learning_rate": 1.4577638030839985e-05, + "loss": 1.1624, + "step": 7092 + }, + { + "epoch": 0.3877916433169772, + "grad_norm": 1.398702621459961, + "learning_rate": 1.4576013298848971e-05, + "loss": 1.3402, + "step": 7093 + }, + { + "epoch": 0.3878463157606987, + "grad_norm": 1.955368995666504, + "learning_rate": 1.4574388414054134e-05, + "loss": 1.3007, + "step": 7094 + }, + { + "epoch": 0.38790098820442026, + "grad_norm": 1.4819079637527466, + "learning_rate": 1.4572763376509732e-05, + "loss": 1.5789, + "step": 7095 + }, + { + "epoch": 0.3879556606481418, + "grad_norm": 1.5879634618759155, + "learning_rate": 1.4571138186270037e-05, + "loss": 1.2887, + "step": 7096 + }, + { + "epoch": 0.3880103330918634, + "grad_norm": 1.466881513595581, + "learning_rate": 1.4569512843389306e-05, + "loss": 1.5029, + "step": 7097 + }, + { + "epoch": 0.38806500553558493, + "grad_norm": 1.7093605995178223, + "learning_rate": 1.4567887347921818e-05, + "loss": 1.3037, + "step": 7098 + }, + { + "epoch": 0.38811967797930647, + "grad_norm": 1.4079707860946655, + "learning_rate": 1.4566261699921857e-05, + "loss": 1.5656, + "step": 7099 + }, + { + "epoch": 0.38817435042302806, + "grad_norm": 1.5464062690734863, + "learning_rate": 1.4564635899443702e-05, + "loss": 1.4024, + "step": 7100 + }, + { + "epoch": 0.3882290228667496, + "grad_norm": 1.835857629776001, + "learning_rate": 1.456300994654164e-05, + "loss": 1.4341, + "step": 7101 + }, + { + "epoch": 0.38828369531047113, + "grad_norm": 1.5067894458770752, + "learning_rate": 1.4561383841269967e-05, + "loss": 1.5508, + "step": 7102 + }, + { + "epoch": 0.38833836775419267, + "grad_norm": 1.5958582162857056, + "learning_rate": 1.4559757583682989e-05, + "loss": 1.4926, + "step": 7103 + }, + { + "epoch": 0.38839304019791426, + "grad_norm": 1.3936363458633423, + "learning_rate": 1.4558131173835002e-05, + "loss": 1.6521, + "step": 7104 + }, + { + "epoch": 0.3884477126416358, + "grad_norm": 1.3574235439300537, + "learning_rate": 1.455650461178032e-05, + "loss": 1.5148, + "step": 7105 + }, + { + "epoch": 0.38850238508535734, + "grad_norm": 2.0232112407684326, + "learning_rate": 1.4554877897573259e-05, + "loss": 1.2732, + "step": 7106 + }, + { + "epoch": 0.38855705752907893, + "grad_norm": 1.337833285331726, + "learning_rate": 1.4553251031268134e-05, + "loss": 1.4299, + "step": 7107 + }, + { + "epoch": 0.38861172997280047, + "grad_norm": 1.6892218589782715, + "learning_rate": 1.4551624012919274e-05, + "loss": 1.4526, + "step": 7108 + }, + { + "epoch": 0.388666402416522, + "grad_norm": 1.8626935482025146, + "learning_rate": 1.4549996842581005e-05, + "loss": 1.4806, + "step": 7109 + }, + { + "epoch": 0.38872107486024354, + "grad_norm": 1.528942346572876, + "learning_rate": 1.4548369520307669e-05, + "loss": 1.7333, + "step": 7110 + }, + { + "epoch": 0.38877574730396514, + "grad_norm": 1.3898024559020996, + "learning_rate": 1.4546742046153596e-05, + "loss": 1.5124, + "step": 7111 + }, + { + "epoch": 0.3888304197476867, + "grad_norm": 1.172122597694397, + "learning_rate": 1.454511442017314e-05, + "loss": 1.5979, + "step": 7112 + }, + { + "epoch": 0.3888850921914082, + "grad_norm": 1.7097198963165283, + "learning_rate": 1.4543486642420647e-05, + "loss": 1.6513, + "step": 7113 + }, + { + "epoch": 0.3889397646351298, + "grad_norm": 1.4294902086257935, + "learning_rate": 1.4541858712950477e-05, + "loss": 1.2615, + "step": 7114 + }, + { + "epoch": 0.38899443707885134, + "grad_norm": 1.2896252870559692, + "learning_rate": 1.4540230631816984e-05, + "loss": 1.2239, + "step": 7115 + }, + { + "epoch": 0.3890491095225729, + "grad_norm": 1.7379779815673828, + "learning_rate": 1.4538602399074532e-05, + "loss": 1.52, + "step": 7116 + }, + { + "epoch": 0.3891037819662944, + "grad_norm": 1.752001166343689, + "learning_rate": 1.4536974014777503e-05, + "loss": 1.4503, + "step": 7117 + }, + { + "epoch": 0.389158454410016, + "grad_norm": 1.172844409942627, + "learning_rate": 1.453534547898026e-05, + "loss": 1.4529, + "step": 7118 + }, + { + "epoch": 0.38921312685373755, + "grad_norm": 1.9018439054489136, + "learning_rate": 1.4533716791737193e-05, + "loss": 1.4338, + "step": 7119 + }, + { + "epoch": 0.3892677992974591, + "grad_norm": 2.001767158508301, + "learning_rate": 1.453208795310268e-05, + "loss": 1.2519, + "step": 7120 + }, + { + "epoch": 0.3893224717411807, + "grad_norm": 1.4341320991516113, + "learning_rate": 1.453045896313112e-05, + "loss": 1.2644, + "step": 7121 + }, + { + "epoch": 0.3893771441849022, + "grad_norm": 1.2234376668930054, + "learning_rate": 1.45288298218769e-05, + "loss": 1.4401, + "step": 7122 + }, + { + "epoch": 0.38943181662862375, + "grad_norm": 1.294748306274414, + "learning_rate": 1.4527200529394425e-05, + "loss": 1.4725, + "step": 7123 + }, + { + "epoch": 0.3894864890723453, + "grad_norm": 1.398561954498291, + "learning_rate": 1.4525571085738104e-05, + "loss": 1.3029, + "step": 7124 + }, + { + "epoch": 0.3895411615160669, + "grad_norm": 1.7415701150894165, + "learning_rate": 1.4523941490962342e-05, + "loss": 1.4512, + "step": 7125 + }, + { + "epoch": 0.3895958339597884, + "grad_norm": 1.5237269401550293, + "learning_rate": 1.452231174512156e-05, + "loss": 1.5504, + "step": 7126 + }, + { + "epoch": 0.38965050640350996, + "grad_norm": 1.681316614151001, + "learning_rate": 1.4520681848270176e-05, + "loss": 1.5167, + "step": 7127 + }, + { + "epoch": 0.38970517884723155, + "grad_norm": 1.5757689476013184, + "learning_rate": 1.4519051800462617e-05, + "loss": 1.6116, + "step": 7128 + }, + { + "epoch": 0.3897598512909531, + "grad_norm": 1.641998291015625, + "learning_rate": 1.4517421601753312e-05, + "loss": 1.3636, + "step": 7129 + }, + { + "epoch": 0.3898145237346746, + "grad_norm": 1.5843398571014404, + "learning_rate": 1.45157912521967e-05, + "loss": 1.3468, + "step": 7130 + }, + { + "epoch": 0.38986919617839616, + "grad_norm": 1.8892121315002441, + "learning_rate": 1.4514160751847226e-05, + "loss": 1.4991, + "step": 7131 + }, + { + "epoch": 0.38992386862211775, + "grad_norm": 1.4199880361557007, + "learning_rate": 1.451253010075933e-05, + "loss": 1.6259, + "step": 7132 + }, + { + "epoch": 0.3899785410658393, + "grad_norm": 1.4772602319717407, + "learning_rate": 1.4510899298987463e-05, + "loss": 1.5193, + "step": 7133 + }, + { + "epoch": 0.3900332135095608, + "grad_norm": 1.8058981895446777, + "learning_rate": 1.4509268346586081e-05, + "loss": 1.5821, + "step": 7134 + }, + { + "epoch": 0.3900878859532824, + "grad_norm": 1.5731850862503052, + "learning_rate": 1.4507637243609651e-05, + "loss": 1.5569, + "step": 7135 + }, + { + "epoch": 0.39014255839700396, + "grad_norm": 1.4901372194290161, + "learning_rate": 1.4506005990112635e-05, + "loss": 1.4869, + "step": 7136 + }, + { + "epoch": 0.3901972308407255, + "grad_norm": 1.686104416847229, + "learning_rate": 1.4504374586149503e-05, + "loss": 1.6403, + "step": 7137 + }, + { + "epoch": 0.39025190328444703, + "grad_norm": 1.4692542552947998, + "learning_rate": 1.4502743031774737e-05, + "loss": 1.567, + "step": 7138 + }, + { + "epoch": 0.3903065757281686, + "grad_norm": 2.134075880050659, + "learning_rate": 1.4501111327042817e-05, + "loss": 1.5043, + "step": 7139 + }, + { + "epoch": 0.39036124817189016, + "grad_norm": 1.8115248680114746, + "learning_rate": 1.4499479472008222e-05, + "loss": 1.3076, + "step": 7140 + }, + { + "epoch": 0.3904159206156117, + "grad_norm": 1.5772424936294556, + "learning_rate": 1.4497847466725453e-05, + "loss": 1.4233, + "step": 7141 + }, + { + "epoch": 0.3904705930593333, + "grad_norm": 1.511938214302063, + "learning_rate": 1.4496215311249002e-05, + "loss": 1.2801, + "step": 7142 + }, + { + "epoch": 0.39052526550305483, + "grad_norm": 1.363091230392456, + "learning_rate": 1.4494583005633369e-05, + "loss": 1.4578, + "step": 7143 + }, + { + "epoch": 0.39057993794677637, + "grad_norm": 1.5888140201568604, + "learning_rate": 1.4492950549933063e-05, + "loss": 1.2905, + "step": 7144 + }, + { + "epoch": 0.3906346103904979, + "grad_norm": 1.2197409868240356, + "learning_rate": 1.4491317944202598e-05, + "loss": 1.6968, + "step": 7145 + }, + { + "epoch": 0.3906892828342195, + "grad_norm": 1.510544776916504, + "learning_rate": 1.4489685188496488e-05, + "loss": 1.5585, + "step": 7146 + }, + { + "epoch": 0.39074395527794104, + "grad_norm": 1.6645100116729736, + "learning_rate": 1.448805228286925e-05, + "loss": 1.3142, + "step": 7147 + }, + { + "epoch": 0.3907986277216626, + "grad_norm": 1.2946670055389404, + "learning_rate": 1.4486419227375415e-05, + "loss": 1.6195, + "step": 7148 + }, + { + "epoch": 0.39085330016538417, + "grad_norm": 1.430091142654419, + "learning_rate": 1.4484786022069517e-05, + "loss": 1.3876, + "step": 7149 + }, + { + "epoch": 0.3909079726091057, + "grad_norm": 1.4771441221237183, + "learning_rate": 1.4483152667006088e-05, + "loss": 1.6079, + "step": 7150 + }, + { + "epoch": 0.39096264505282724, + "grad_norm": 2.505187511444092, + "learning_rate": 1.4481519162239675e-05, + "loss": 1.5131, + "step": 7151 + }, + { + "epoch": 0.3910173174965488, + "grad_norm": 1.435122013092041, + "learning_rate": 1.4479885507824818e-05, + "loss": 1.5748, + "step": 7152 + }, + { + "epoch": 0.39107198994027037, + "grad_norm": 1.319975733757019, + "learning_rate": 1.447825170381607e-05, + "loss": 1.7617, + "step": 7153 + }, + { + "epoch": 0.3911266623839919, + "grad_norm": 1.4285881519317627, + "learning_rate": 1.4476617750267991e-05, + "loss": 1.2797, + "step": 7154 + }, + { + "epoch": 0.39118133482771345, + "grad_norm": 1.384398102760315, + "learning_rate": 1.447498364723514e-05, + "loss": 1.4378, + "step": 7155 + }, + { + "epoch": 0.39123600727143504, + "grad_norm": 1.391271948814392, + "learning_rate": 1.4473349394772085e-05, + "loss": 1.5786, + "step": 7156 + }, + { + "epoch": 0.3912906797151566, + "grad_norm": 1.6314631700515747, + "learning_rate": 1.4471714992933397e-05, + "loss": 1.6506, + "step": 7157 + }, + { + "epoch": 0.3913453521588781, + "grad_norm": 1.3932701349258423, + "learning_rate": 1.4470080441773651e-05, + "loss": 1.5383, + "step": 7158 + }, + { + "epoch": 0.39140002460259965, + "grad_norm": 1.6014944314956665, + "learning_rate": 1.4468445741347432e-05, + "loss": 1.4379, + "step": 7159 + }, + { + "epoch": 0.39145469704632124, + "grad_norm": 1.5079466104507446, + "learning_rate": 1.446681089170932e-05, + "loss": 1.4092, + "step": 7160 + }, + { + "epoch": 0.3915093694900428, + "grad_norm": 1.3236889839172363, + "learning_rate": 1.4465175892913915e-05, + "loss": 1.4475, + "step": 7161 + }, + { + "epoch": 0.3915640419337643, + "grad_norm": 1.8923563957214355, + "learning_rate": 1.4463540745015805e-05, + "loss": 1.6761, + "step": 7162 + }, + { + "epoch": 0.3916187143774859, + "grad_norm": 3.1397855281829834, + "learning_rate": 1.4461905448069597e-05, + "loss": 1.5215, + "step": 7163 + }, + { + "epoch": 0.39167338682120745, + "grad_norm": 1.4792553186416626, + "learning_rate": 1.4460270002129897e-05, + "loss": 1.5781, + "step": 7164 + }, + { + "epoch": 0.391728059264929, + "grad_norm": 1.1638493537902832, + "learning_rate": 1.4458634407251315e-05, + "loss": 1.3167, + "step": 7165 + }, + { + "epoch": 0.3917827317086505, + "grad_norm": 1.8213987350463867, + "learning_rate": 1.4456998663488468e-05, + "loss": 1.5317, + "step": 7166 + }, + { + "epoch": 0.3918374041523721, + "grad_norm": 1.3883848190307617, + "learning_rate": 1.4455362770895976e-05, + "loss": 1.4909, + "step": 7167 + }, + { + "epoch": 0.39189207659609365, + "grad_norm": 1.8280751705169678, + "learning_rate": 1.4453726729528466e-05, + "loss": 1.4771, + "step": 7168 + }, + { + "epoch": 0.3919467490398152, + "grad_norm": 1.534651756286621, + "learning_rate": 1.4452090539440569e-05, + "loss": 1.5102, + "step": 7169 + }, + { + "epoch": 0.3920014214835368, + "grad_norm": 1.504590630531311, + "learning_rate": 1.4450454200686922e-05, + "loss": 1.2604, + "step": 7170 + }, + { + "epoch": 0.3920560939272583, + "grad_norm": 1.603700041770935, + "learning_rate": 1.4448817713322169e-05, + "loss": 1.4763, + "step": 7171 + }, + { + "epoch": 0.39211076637097986, + "grad_norm": 2.202216386795044, + "learning_rate": 1.4447181077400948e-05, + "loss": 1.3674, + "step": 7172 + }, + { + "epoch": 0.3921654388147014, + "grad_norm": 1.4389921426773071, + "learning_rate": 1.444554429297792e-05, + "loss": 1.5935, + "step": 7173 + }, + { + "epoch": 0.392220111258423, + "grad_norm": 1.8419963121414185, + "learning_rate": 1.4443907360107734e-05, + "loss": 1.3904, + "step": 7174 + }, + { + "epoch": 0.3922747837021445, + "grad_norm": 1.5118143558502197, + "learning_rate": 1.4442270278845052e-05, + "loss": 1.5634, + "step": 7175 + }, + { + "epoch": 0.39232945614586606, + "grad_norm": 1.3007203340530396, + "learning_rate": 1.4440633049244541e-05, + "loss": 1.422, + "step": 7176 + }, + { + "epoch": 0.39238412858958766, + "grad_norm": 1.3075331449508667, + "learning_rate": 1.4438995671360875e-05, + "loss": 1.4307, + "step": 7177 + }, + { + "epoch": 0.3924388010333092, + "grad_norm": 1.4658360481262207, + "learning_rate": 1.4437358145248727e-05, + "loss": 1.3896, + "step": 7178 + }, + { + "epoch": 0.39249347347703073, + "grad_norm": 1.601346492767334, + "learning_rate": 1.4435720470962778e-05, + "loss": 1.5383, + "step": 7179 + }, + { + "epoch": 0.39254814592075227, + "grad_norm": 1.318084955215454, + "learning_rate": 1.4434082648557712e-05, + "loss": 1.3384, + "step": 7180 + }, + { + "epoch": 0.39260281836447386, + "grad_norm": 1.153830647468567, + "learning_rate": 1.4432444678088222e-05, + "loss": 1.5405, + "step": 7181 + }, + { + "epoch": 0.3926574908081954, + "grad_norm": 1.5002909898757935, + "learning_rate": 1.4430806559609e-05, + "loss": 1.6469, + "step": 7182 + }, + { + "epoch": 0.39271216325191693, + "grad_norm": 1.6157770156860352, + "learning_rate": 1.4429168293174756e-05, + "loss": 1.5521, + "step": 7183 + }, + { + "epoch": 0.3927668356956385, + "grad_norm": 1.716727614402771, + "learning_rate": 1.4427529878840184e-05, + "loss": 1.3343, + "step": 7184 + }, + { + "epoch": 0.39282150813936006, + "grad_norm": 1.3675230741500854, + "learning_rate": 1.4425891316660005e-05, + "loss": 1.4812, + "step": 7185 + }, + { + "epoch": 0.3928761805830816, + "grad_norm": 1.6440869569778442, + "learning_rate": 1.4424252606688924e-05, + "loss": 1.6208, + "step": 7186 + }, + { + "epoch": 0.39293085302680314, + "grad_norm": 1.995560884475708, + "learning_rate": 1.442261374898167e-05, + "loss": 1.2501, + "step": 7187 + }, + { + "epoch": 0.39298552547052473, + "grad_norm": 2.0933825969696045, + "learning_rate": 1.4420974743592964e-05, + "loss": 1.4141, + "step": 7188 + }, + { + "epoch": 0.39304019791424627, + "grad_norm": 1.7017762660980225, + "learning_rate": 1.4419335590577537e-05, + "loss": 1.738, + "step": 7189 + }, + { + "epoch": 0.3930948703579678, + "grad_norm": 1.6214581727981567, + "learning_rate": 1.4417696289990127e-05, + "loss": 1.3191, + "step": 7190 + }, + { + "epoch": 0.3931495428016894, + "grad_norm": 1.7610825300216675, + "learning_rate": 1.4416056841885469e-05, + "loss": 1.5306, + "step": 7191 + }, + { + "epoch": 0.39320421524541094, + "grad_norm": 1.3853521347045898, + "learning_rate": 1.4414417246318308e-05, + "loss": 1.497, + "step": 7192 + }, + { + "epoch": 0.3932588876891325, + "grad_norm": 1.5529248714447021, + "learning_rate": 1.44127775033434e-05, + "loss": 1.634, + "step": 7193 + }, + { + "epoch": 0.393313560132854, + "grad_norm": 1.2920618057250977, + "learning_rate": 1.4411137613015496e-05, + "loss": 1.6222, + "step": 7194 + }, + { + "epoch": 0.3933682325765756, + "grad_norm": 1.7311441898345947, + "learning_rate": 1.4409497575389352e-05, + "loss": 1.3251, + "step": 7195 + }, + { + "epoch": 0.39342290502029714, + "grad_norm": 1.2031452655792236, + "learning_rate": 1.440785739051974e-05, + "loss": 1.5638, + "step": 7196 + }, + { + "epoch": 0.3934775774640187, + "grad_norm": 1.5281777381896973, + "learning_rate": 1.4406217058461427e-05, + "loss": 1.6235, + "step": 7197 + }, + { + "epoch": 0.3935322499077403, + "grad_norm": 1.5204230546951294, + "learning_rate": 1.4404576579269187e-05, + "loss": 1.2574, + "step": 7198 + }, + { + "epoch": 0.3935869223514618, + "grad_norm": 1.3877832889556885, + "learning_rate": 1.4402935952997799e-05, + "loss": 1.5359, + "step": 7199 + }, + { + "epoch": 0.39364159479518335, + "grad_norm": 1.3798067569732666, + "learning_rate": 1.4401295179702046e-05, + "loss": 1.5397, + "step": 7200 + }, + { + "epoch": 0.3936962672389049, + "grad_norm": 1.4255008697509766, + "learning_rate": 1.4399654259436721e-05, + "loss": 1.6583, + "step": 7201 + }, + { + "epoch": 0.3937509396826265, + "grad_norm": 1.35444176197052, + "learning_rate": 1.4398013192256615e-05, + "loss": 1.5636, + "step": 7202 + }, + { + "epoch": 0.393805612126348, + "grad_norm": 1.4658218622207642, + "learning_rate": 1.4396371978216528e-05, + "loss": 1.3679, + "step": 7203 + }, + { + "epoch": 0.39386028457006955, + "grad_norm": 1.404677152633667, + "learning_rate": 1.4394730617371266e-05, + "loss": 1.6008, + "step": 7204 + }, + { + "epoch": 0.39391495701379114, + "grad_norm": 1.2534180879592896, + "learning_rate": 1.4393089109775635e-05, + "loss": 1.5192, + "step": 7205 + }, + { + "epoch": 0.3939696294575127, + "grad_norm": 1.4010289907455444, + "learning_rate": 1.4391447455484448e-05, + "loss": 1.5887, + "step": 7206 + }, + { + "epoch": 0.3940243019012342, + "grad_norm": 1.1114047765731812, + "learning_rate": 1.438980565455253e-05, + "loss": 1.6503, + "step": 7207 + }, + { + "epoch": 0.39407897434495576, + "grad_norm": 1.516746163368225, + "learning_rate": 1.4388163707034697e-05, + "loss": 1.5899, + "step": 7208 + }, + { + "epoch": 0.39413364678867735, + "grad_norm": 1.905005931854248, + "learning_rate": 1.438652161298578e-05, + "loss": 1.5285, + "step": 7209 + }, + { + "epoch": 0.3941883192323989, + "grad_norm": 1.47809636592865, + "learning_rate": 1.4384879372460617e-05, + "loss": 1.5215, + "step": 7210 + }, + { + "epoch": 0.3942429916761204, + "grad_norm": 1.3377318382263184, + "learning_rate": 1.4383236985514037e-05, + "loss": 1.4078, + "step": 7211 + }, + { + "epoch": 0.394297664119842, + "grad_norm": 3.972628116607666, + "learning_rate": 1.4381594452200894e-05, + "loss": 1.7562, + "step": 7212 + }, + { + "epoch": 0.39435233656356355, + "grad_norm": 1.6220721006393433, + "learning_rate": 1.4379951772576024e-05, + "loss": 1.366, + "step": 7213 + }, + { + "epoch": 0.3944070090072851, + "grad_norm": 1.5630860328674316, + "learning_rate": 1.4378308946694291e-05, + "loss": 1.1439, + "step": 7214 + }, + { + "epoch": 0.39446168145100663, + "grad_norm": 1.3629167079925537, + "learning_rate": 1.437666597461055e-05, + "loss": 1.8215, + "step": 7215 + }, + { + "epoch": 0.3945163538947282, + "grad_norm": 1.444922685623169, + "learning_rate": 1.4375022856379657e-05, + "loss": 1.4513, + "step": 7216 + }, + { + "epoch": 0.39457102633844976, + "grad_norm": 1.5555686950683594, + "learning_rate": 1.4373379592056487e-05, + "loss": 1.5246, + "step": 7217 + }, + { + "epoch": 0.3946256987821713, + "grad_norm": 1.5315498113632202, + "learning_rate": 1.4371736181695908e-05, + "loss": 1.2978, + "step": 7218 + }, + { + "epoch": 0.3946803712258929, + "grad_norm": 1.5830713510513306, + "learning_rate": 1.4370092625352803e-05, + "loss": 1.7836, + "step": 7219 + }, + { + "epoch": 0.3947350436696144, + "grad_norm": 1.4603850841522217, + "learning_rate": 1.4368448923082048e-05, + "loss": 1.2796, + "step": 7220 + }, + { + "epoch": 0.39478971611333596, + "grad_norm": 1.4943039417266846, + "learning_rate": 1.4366805074938533e-05, + "loss": 1.4205, + "step": 7221 + }, + { + "epoch": 0.3948443885570575, + "grad_norm": 1.6681885719299316, + "learning_rate": 1.436516108097715e-05, + "loss": 1.4482, + "step": 7222 + }, + { + "epoch": 0.3948990610007791, + "grad_norm": 1.5073676109313965, + "learning_rate": 1.4363516941252795e-05, + "loss": 1.4671, + "step": 7223 + }, + { + "epoch": 0.39495373344450063, + "grad_norm": 1.7015661001205444, + "learning_rate": 1.4361872655820371e-05, + "loss": 1.3864, + "step": 7224 + }, + { + "epoch": 0.39500840588822217, + "grad_norm": 1.6783684492111206, + "learning_rate": 1.436022822473478e-05, + "loss": 1.5783, + "step": 7225 + }, + { + "epoch": 0.39506307833194376, + "grad_norm": 1.5878629684448242, + "learning_rate": 1.435858364805094e-05, + "loss": 1.279, + "step": 7226 + }, + { + "epoch": 0.3951177507756653, + "grad_norm": 1.355950951576233, + "learning_rate": 1.4356938925823764e-05, + "loss": 1.5172, + "step": 7227 + }, + { + "epoch": 0.39517242321938684, + "grad_norm": 1.5477027893066406, + "learning_rate": 1.4355294058108173e-05, + "loss": 1.438, + "step": 7228 + }, + { + "epoch": 0.3952270956631084, + "grad_norm": 1.2646121978759766, + "learning_rate": 1.4353649044959094e-05, + "loss": 1.4002, + "step": 7229 + }, + { + "epoch": 0.39528176810682997, + "grad_norm": 1.5096888542175293, + "learning_rate": 1.4352003886431459e-05, + "loss": 1.7476, + "step": 7230 + }, + { + "epoch": 0.3953364405505515, + "grad_norm": 1.628758430480957, + "learning_rate": 1.4350358582580197e-05, + "loss": 1.4893, + "step": 7231 + }, + { + "epoch": 0.39539111299427304, + "grad_norm": 1.4053456783294678, + "learning_rate": 1.4348713133460257e-05, + "loss": 1.4331, + "step": 7232 + }, + { + "epoch": 0.39544578543799463, + "grad_norm": 1.558099627494812, + "learning_rate": 1.4347067539126581e-05, + "loss": 1.5521, + "step": 7233 + }, + { + "epoch": 0.39550045788171617, + "grad_norm": 1.5695420503616333, + "learning_rate": 1.4345421799634118e-05, + "loss": 1.402, + "step": 7234 + }, + { + "epoch": 0.3955551303254377, + "grad_norm": 1.5936102867126465, + "learning_rate": 1.4343775915037822e-05, + "loss": 1.3232, + "step": 7235 + }, + { + "epoch": 0.3956098027691593, + "grad_norm": 1.5772212743759155, + "learning_rate": 1.434212988539266e-05, + "loss": 1.5229, + "step": 7236 + }, + { + "epoch": 0.39566447521288084, + "grad_norm": 1.5916680097579956, + "learning_rate": 1.434048371075359e-05, + "loss": 1.5389, + "step": 7237 + }, + { + "epoch": 0.3957191476566024, + "grad_norm": 2.0566065311431885, + "learning_rate": 1.4338837391175582e-05, + "loss": 1.5453, + "step": 7238 + }, + { + "epoch": 0.3957738201003239, + "grad_norm": 2.062883138656616, + "learning_rate": 1.4337190926713613e-05, + "loss": 1.4818, + "step": 7239 + }, + { + "epoch": 0.3958284925440455, + "grad_norm": 1.219281554222107, + "learning_rate": 1.4335544317422663e-05, + "loss": 1.6228, + "step": 7240 + }, + { + "epoch": 0.39588316498776704, + "grad_norm": 1.4703830480575562, + "learning_rate": 1.4333897563357712e-05, + "loss": 1.5334, + "step": 7241 + }, + { + "epoch": 0.3959378374314886, + "grad_norm": 1.3900890350341797, + "learning_rate": 1.4332250664573754e-05, + "loss": 1.4925, + "step": 7242 + }, + { + "epoch": 0.3959925098752102, + "grad_norm": 1.6623258590698242, + "learning_rate": 1.433060362112578e-05, + "loss": 1.3182, + "step": 7243 + }, + { + "epoch": 0.3960471823189317, + "grad_norm": 1.5029202699661255, + "learning_rate": 1.4328956433068789e-05, + "loss": 1.3172, + "step": 7244 + }, + { + "epoch": 0.39610185476265325, + "grad_norm": 1.3872562646865845, + "learning_rate": 1.4327309100457783e-05, + "loss": 1.5021, + "step": 7245 + }, + { + "epoch": 0.3961565272063748, + "grad_norm": 1.874162197113037, + "learning_rate": 1.4325661623347772e-05, + "loss": 1.2522, + "step": 7246 + }, + { + "epoch": 0.3962111996500964, + "grad_norm": 1.9983360767364502, + "learning_rate": 1.432401400179377e-05, + "loss": 1.555, + "step": 7247 + }, + { + "epoch": 0.3962658720938179, + "grad_norm": 1.6998319625854492, + "learning_rate": 1.4322366235850794e-05, + "loss": 1.5686, + "step": 7248 + }, + { + "epoch": 0.39632054453753945, + "grad_norm": 1.2625257968902588, + "learning_rate": 1.4320718325573865e-05, + "loss": 1.397, + "step": 7249 + }, + { + "epoch": 0.39637521698126105, + "grad_norm": 1.6075674295425415, + "learning_rate": 1.4319070271018016e-05, + "loss": 1.4706, + "step": 7250 + }, + { + "epoch": 0.3964298894249826, + "grad_norm": 2.373232364654541, + "learning_rate": 1.4317422072238271e-05, + "loss": 1.4154, + "step": 7251 + }, + { + "epoch": 0.3964845618687041, + "grad_norm": 1.4354206323623657, + "learning_rate": 1.4315773729289673e-05, + "loss": 1.4126, + "step": 7252 + }, + { + "epoch": 0.39653923431242566, + "grad_norm": 1.7709509134292603, + "learning_rate": 1.4314125242227263e-05, + "loss": 1.531, + "step": 7253 + }, + { + "epoch": 0.39659390675614725, + "grad_norm": 1.3605722188949585, + "learning_rate": 1.431247661110609e-05, + "loss": 1.2085, + "step": 7254 + }, + { + "epoch": 0.3966485791998688, + "grad_norm": 1.2790625095367432, + "learning_rate": 1.4310827835981203e-05, + "loss": 1.506, + "step": 7255 + }, + { + "epoch": 0.3967032516435903, + "grad_norm": 1.775255799293518, + "learning_rate": 1.4309178916907658e-05, + "loss": 1.3314, + "step": 7256 + }, + { + "epoch": 0.3967579240873119, + "grad_norm": 1.395192265510559, + "learning_rate": 1.4307529853940519e-05, + "loss": 1.4993, + "step": 7257 + }, + { + "epoch": 0.39681259653103346, + "grad_norm": 1.4177215099334717, + "learning_rate": 1.4305880647134847e-05, + "loss": 1.3472, + "step": 7258 + }, + { + "epoch": 0.396867268974755, + "grad_norm": 1.3836860656738281, + "learning_rate": 1.4304231296545714e-05, + "loss": 1.7358, + "step": 7259 + }, + { + "epoch": 0.39692194141847653, + "grad_norm": 1.6524457931518555, + "learning_rate": 1.4302581802228202e-05, + "loss": 1.3964, + "step": 7260 + }, + { + "epoch": 0.3969766138621981, + "grad_norm": 1.7571521997451782, + "learning_rate": 1.4300932164237386e-05, + "loss": 1.4897, + "step": 7261 + }, + { + "epoch": 0.39703128630591966, + "grad_norm": 1.3092433214187622, + "learning_rate": 1.4299282382628355e-05, + "loss": 1.4004, + "step": 7262 + }, + { + "epoch": 0.3970859587496412, + "grad_norm": 1.4669220447540283, + "learning_rate": 1.4297632457456194e-05, + "loss": 1.5142, + "step": 7263 + }, + { + "epoch": 0.3971406311933628, + "grad_norm": 1.2432270050048828, + "learning_rate": 1.4295982388776003e-05, + "loss": 1.331, + "step": 7264 + }, + { + "epoch": 0.39719530363708433, + "grad_norm": 1.6740249395370483, + "learning_rate": 1.4294332176642875e-05, + "loss": 1.394, + "step": 7265 + }, + { + "epoch": 0.39724997608080587, + "grad_norm": 1.5176653861999512, + "learning_rate": 1.429268182111192e-05, + "loss": 1.6524, + "step": 7266 + }, + { + "epoch": 0.3973046485245274, + "grad_norm": 1.6684993505477905, + "learning_rate": 1.4291031322238247e-05, + "loss": 1.6222, + "step": 7267 + }, + { + "epoch": 0.397359320968249, + "grad_norm": 1.9775503873825073, + "learning_rate": 1.428938068007697e-05, + "loss": 1.1926, + "step": 7268 + }, + { + "epoch": 0.39741399341197053, + "grad_norm": 1.4224071502685547, + "learning_rate": 1.4287729894683207e-05, + "loss": 1.4727, + "step": 7269 + }, + { + "epoch": 0.39746866585569207, + "grad_norm": 1.6308008432388306, + "learning_rate": 1.4286078966112078e-05, + "loss": 1.5967, + "step": 7270 + }, + { + "epoch": 0.39752333829941366, + "grad_norm": 1.403316617012024, + "learning_rate": 1.4284427894418717e-05, + "loss": 1.4232, + "step": 7271 + }, + { + "epoch": 0.3975780107431352, + "grad_norm": 1.4631654024124146, + "learning_rate": 1.4282776679658255e-05, + "loss": 1.404, + "step": 7272 + }, + { + "epoch": 0.39763268318685674, + "grad_norm": 1.6160471439361572, + "learning_rate": 1.4281125321885826e-05, + "loss": 1.624, + "step": 7273 + }, + { + "epoch": 0.3976873556305783, + "grad_norm": 1.7470343112945557, + "learning_rate": 1.427947382115658e-05, + "loss": 1.6461, + "step": 7274 + }, + { + "epoch": 0.39774202807429987, + "grad_norm": 1.604243516921997, + "learning_rate": 1.4277822177525664e-05, + "loss": 1.3434, + "step": 7275 + }, + { + "epoch": 0.3977967005180214, + "grad_norm": 1.5443141460418701, + "learning_rate": 1.4276170391048224e-05, + "loss": 1.5243, + "step": 7276 + }, + { + "epoch": 0.39785137296174294, + "grad_norm": 1.300886869430542, + "learning_rate": 1.4274518461779421e-05, + "loss": 1.6348, + "step": 7277 + }, + { + "epoch": 0.39790604540546454, + "grad_norm": 1.144425868988037, + "learning_rate": 1.4272866389774415e-05, + "loss": 1.3831, + "step": 7278 + }, + { + "epoch": 0.3979607178491861, + "grad_norm": 1.389764428138733, + "learning_rate": 1.4271214175088374e-05, + "loss": 1.4115, + "step": 7279 + }, + { + "epoch": 0.3980153902929076, + "grad_norm": 1.527969479560852, + "learning_rate": 1.426956181777647e-05, + "loss": 1.6678, + "step": 7280 + }, + { + "epoch": 0.39807006273662915, + "grad_norm": 1.8061339855194092, + "learning_rate": 1.4267909317893875e-05, + "loss": 1.4676, + "step": 7281 + }, + { + "epoch": 0.39812473518035074, + "grad_norm": 1.5917524099349976, + "learning_rate": 1.4266256675495777e-05, + "loss": 1.3643, + "step": 7282 + }, + { + "epoch": 0.3981794076240723, + "grad_norm": 1.458054542541504, + "learning_rate": 1.4264603890637357e-05, + "loss": 1.4189, + "step": 7283 + }, + { + "epoch": 0.3982340800677938, + "grad_norm": 1.42207932472229, + "learning_rate": 1.4262950963373802e-05, + "loss": 1.342, + "step": 7284 + }, + { + "epoch": 0.3982887525115154, + "grad_norm": 1.2532958984375, + "learning_rate": 1.4261297893760315e-05, + "loss": 1.4473, + "step": 7285 + }, + { + "epoch": 0.39834342495523695, + "grad_norm": 1.4357026815414429, + "learning_rate": 1.425964468185209e-05, + "loss": 1.4526, + "step": 7286 + }, + { + "epoch": 0.3983980973989585, + "grad_norm": 1.3210899829864502, + "learning_rate": 1.4257991327704332e-05, + "loss": 1.4572, + "step": 7287 + }, + { + "epoch": 0.39845276984268, + "grad_norm": 1.8368384838104248, + "learning_rate": 1.4256337831372256e-05, + "loss": 1.4686, + "step": 7288 + }, + { + "epoch": 0.3985074422864016, + "grad_norm": 1.4848744869232178, + "learning_rate": 1.425468419291107e-05, + "loss": 1.5928, + "step": 7289 + }, + { + "epoch": 0.39856211473012315, + "grad_norm": 1.397538423538208, + "learning_rate": 1.4253030412375994e-05, + "loss": 1.4703, + "step": 7290 + }, + { + "epoch": 0.3986167871738447, + "grad_norm": 1.5026137828826904, + "learning_rate": 1.4251376489822253e-05, + "loss": 1.569, + "step": 7291 + }, + { + "epoch": 0.3986714596175663, + "grad_norm": 1.31200110912323, + "learning_rate": 1.4249722425305077e-05, + "loss": 1.3258, + "step": 7292 + }, + { + "epoch": 0.3987261320612878, + "grad_norm": 1.7724690437316895, + "learning_rate": 1.4248068218879691e-05, + "loss": 1.3307, + "step": 7293 + }, + { + "epoch": 0.39878080450500936, + "grad_norm": 1.3632627725601196, + "learning_rate": 1.4246413870601343e-05, + "loss": 1.5201, + "step": 7294 + }, + { + "epoch": 0.3988354769487309, + "grad_norm": 1.5098835229873657, + "learning_rate": 1.4244759380525273e-05, + "loss": 1.3185, + "step": 7295 + }, + { + "epoch": 0.3988901493924525, + "grad_norm": 1.6220945119857788, + "learning_rate": 1.4243104748706724e-05, + "loss": 1.3629, + "step": 7296 + }, + { + "epoch": 0.398944821836174, + "grad_norm": 1.0446796417236328, + "learning_rate": 1.4241449975200951e-05, + "loss": 1.2928, + "step": 7297 + }, + { + "epoch": 0.39899949427989556, + "grad_norm": 2.184225559234619, + "learning_rate": 1.4239795060063211e-05, + "loss": 1.5909, + "step": 7298 + }, + { + "epoch": 0.39905416672361715, + "grad_norm": 2.3592355251312256, + "learning_rate": 1.4238140003348766e-05, + "loss": 1.4689, + "step": 7299 + }, + { + "epoch": 0.3991088391673387, + "grad_norm": 1.395532250404358, + "learning_rate": 1.4236484805112878e-05, + "loss": 1.42, + "step": 7300 + }, + { + "epoch": 0.39916351161106023, + "grad_norm": 1.3983064889907837, + "learning_rate": 1.4234829465410824e-05, + "loss": 1.323, + "step": 7301 + }, + { + "epoch": 0.39921818405478177, + "grad_norm": 1.6426870822906494, + "learning_rate": 1.4233173984297876e-05, + "loss": 1.1913, + "step": 7302 + }, + { + "epoch": 0.39927285649850336, + "grad_norm": 1.3797683715820312, + "learning_rate": 1.4231518361829317e-05, + "loss": 1.5923, + "step": 7303 + }, + { + "epoch": 0.3993275289422249, + "grad_norm": 1.6204032897949219, + "learning_rate": 1.4229862598060426e-05, + "loss": 1.3, + "step": 7304 + }, + { + "epoch": 0.39938220138594643, + "grad_norm": 1.3464281558990479, + "learning_rate": 1.42282066930465e-05, + "loss": 1.5024, + "step": 7305 + }, + { + "epoch": 0.399436873829668, + "grad_norm": 1.5984156131744385, + "learning_rate": 1.4226550646842831e-05, + "loss": 1.4006, + "step": 7306 + }, + { + "epoch": 0.39949154627338956, + "grad_norm": 1.1549863815307617, + "learning_rate": 1.4224894459504717e-05, + "loss": 1.6008, + "step": 7307 + }, + { + "epoch": 0.3995462187171111, + "grad_norm": 1.3773597478866577, + "learning_rate": 1.4223238131087465e-05, + "loss": 1.2719, + "step": 7308 + }, + { + "epoch": 0.39960089116083264, + "grad_norm": 1.8468081951141357, + "learning_rate": 1.4221581661646377e-05, + "loss": 1.4126, + "step": 7309 + }, + { + "epoch": 0.39965556360455423, + "grad_norm": 1.7444332838058472, + "learning_rate": 1.4219925051236777e-05, + "loss": 1.4291, + "step": 7310 + }, + { + "epoch": 0.39971023604827577, + "grad_norm": 1.3756951093673706, + "learning_rate": 1.4218268299913973e-05, + "loss": 1.7006, + "step": 7311 + }, + { + "epoch": 0.3997649084919973, + "grad_norm": 2.1422066688537598, + "learning_rate": 1.4216611407733292e-05, + "loss": 1.4968, + "step": 7312 + }, + { + "epoch": 0.3998195809357189, + "grad_norm": 1.9607288837432861, + "learning_rate": 1.4214954374750062e-05, + "loss": 1.6182, + "step": 7313 + }, + { + "epoch": 0.39987425337944044, + "grad_norm": 1.4749534130096436, + "learning_rate": 1.4213297201019618e-05, + "loss": 1.5366, + "step": 7314 + }, + { + "epoch": 0.399928925823162, + "grad_norm": 1.4261068105697632, + "learning_rate": 1.421163988659729e-05, + "loss": 1.4848, + "step": 7315 + }, + { + "epoch": 0.3999835982668835, + "grad_norm": 1.3359169960021973, + "learning_rate": 1.4209982431538425e-05, + "loss": 1.5613, + "step": 7316 + }, + { + "epoch": 0.4000382707106051, + "grad_norm": 1.448944330215454, + "learning_rate": 1.4208324835898367e-05, + "loss": 1.3633, + "step": 7317 + }, + { + "epoch": 0.40009294315432664, + "grad_norm": 1.711954951286316, + "learning_rate": 1.4206667099732467e-05, + "loss": 1.4582, + "step": 7318 + }, + { + "epoch": 0.4001476155980482, + "grad_norm": 1.625491738319397, + "learning_rate": 1.420500922309608e-05, + "loss": 1.3547, + "step": 7319 + }, + { + "epoch": 0.40020228804176977, + "grad_norm": 1.4938875436782837, + "learning_rate": 1.420335120604457e-05, + "loss": 1.4119, + "step": 7320 + }, + { + "epoch": 0.4002569604854913, + "grad_norm": 1.302093505859375, + "learning_rate": 1.4201693048633302e-05, + "loss": 1.6309, + "step": 7321 + }, + { + "epoch": 0.40031163292921285, + "grad_norm": 1.3482428789138794, + "learning_rate": 1.420003475091764e-05, + "loss": 1.5699, + "step": 7322 + }, + { + "epoch": 0.4003663053729344, + "grad_norm": 1.3552559614181519, + "learning_rate": 1.4198376312952962e-05, + "loss": 1.541, + "step": 7323 + }, + { + "epoch": 0.400420977816656, + "grad_norm": 1.6623281240463257, + "learning_rate": 1.4196717734794647e-05, + "loss": 1.4314, + "step": 7324 + }, + { + "epoch": 0.4004756502603775, + "grad_norm": 1.588773488998413, + "learning_rate": 1.4195059016498081e-05, + "loss": 1.4281, + "step": 7325 + }, + { + "epoch": 0.40053032270409905, + "grad_norm": 1.6454297304153442, + "learning_rate": 1.419340015811865e-05, + "loss": 1.4848, + "step": 7326 + }, + { + "epoch": 0.40058499514782064, + "grad_norm": 1.6012924909591675, + "learning_rate": 1.4191741159711746e-05, + "loss": 1.6793, + "step": 7327 + }, + { + "epoch": 0.4006396675915422, + "grad_norm": 1.4030325412750244, + "learning_rate": 1.419008202133277e-05, + "loss": 1.3594, + "step": 7328 + }, + { + "epoch": 0.4006943400352637, + "grad_norm": 1.1829054355621338, + "learning_rate": 1.418842274303712e-05, + "loss": 1.4698, + "step": 7329 + }, + { + "epoch": 0.40074901247898526, + "grad_norm": 1.4518502950668335, + "learning_rate": 1.4186763324880208e-05, + "loss": 1.4654, + "step": 7330 + }, + { + "epoch": 0.40080368492270685, + "grad_norm": 1.6131575107574463, + "learning_rate": 1.4185103766917445e-05, + "loss": 1.3998, + "step": 7331 + }, + { + "epoch": 0.4008583573664284, + "grad_norm": 1.733506679534912, + "learning_rate": 1.4183444069204246e-05, + "loss": 1.5373, + "step": 7332 + }, + { + "epoch": 0.4009130298101499, + "grad_norm": 1.4479010105133057, + "learning_rate": 1.4181784231796034e-05, + "loss": 1.6208, + "step": 7333 + }, + { + "epoch": 0.4009677022538715, + "grad_norm": 1.3783133029937744, + "learning_rate": 1.4180124254748233e-05, + "loss": 1.4585, + "step": 7334 + }, + { + "epoch": 0.40102237469759305, + "grad_norm": 1.826237440109253, + "learning_rate": 1.4178464138116272e-05, + "loss": 1.6551, + "step": 7335 + }, + { + "epoch": 0.4010770471413146, + "grad_norm": 1.3659923076629639, + "learning_rate": 1.4176803881955592e-05, + "loss": 1.5368, + "step": 7336 + }, + { + "epoch": 0.40113171958503613, + "grad_norm": 1.6205801963806152, + "learning_rate": 1.4175143486321626e-05, + "loss": 1.6008, + "step": 7337 + }, + { + "epoch": 0.4011863920287577, + "grad_norm": 1.8905390501022339, + "learning_rate": 1.4173482951269823e-05, + "loss": 1.5624, + "step": 7338 + }, + { + "epoch": 0.40124106447247926, + "grad_norm": 1.3910380601882935, + "learning_rate": 1.417182227685563e-05, + "loss": 1.2971, + "step": 7339 + }, + { + "epoch": 0.4012957369162008, + "grad_norm": 1.4981553554534912, + "learning_rate": 1.4170161463134502e-05, + "loss": 1.541, + "step": 7340 + }, + { + "epoch": 0.4013504093599224, + "grad_norm": 1.3899469375610352, + "learning_rate": 1.41685005101619e-05, + "loss": 1.2751, + "step": 7341 + }, + { + "epoch": 0.4014050818036439, + "grad_norm": 1.4423576593399048, + "learning_rate": 1.4166839417993281e-05, + "loss": 1.3133, + "step": 7342 + }, + { + "epoch": 0.40145975424736546, + "grad_norm": 1.4626805782318115, + "learning_rate": 1.4165178186684116e-05, + "loss": 1.4988, + "step": 7343 + }, + { + "epoch": 0.401514426691087, + "grad_norm": 1.3119868040084839, + "learning_rate": 1.4163516816289878e-05, + "loss": 1.6597, + "step": 7344 + }, + { + "epoch": 0.4015690991348086, + "grad_norm": 1.7646890878677368, + "learning_rate": 1.4161855306866043e-05, + "loss": 1.1472, + "step": 7345 + }, + { + "epoch": 0.40162377157853013, + "grad_norm": 1.255588173866272, + "learning_rate": 1.4160193658468093e-05, + "loss": 1.5709, + "step": 7346 + }, + { + "epoch": 0.40167844402225167, + "grad_norm": 1.4570845365524292, + "learning_rate": 1.4158531871151517e-05, + "loss": 1.5463, + "step": 7347 + }, + { + "epoch": 0.40173311646597326, + "grad_norm": 1.1936149597167969, + "learning_rate": 1.4156869944971804e-05, + "loss": 1.469, + "step": 7348 + }, + { + "epoch": 0.4017877889096948, + "grad_norm": 1.568843126296997, + "learning_rate": 1.4155207879984447e-05, + "loss": 1.5947, + "step": 7349 + }, + { + "epoch": 0.40184246135341634, + "grad_norm": 1.285942554473877, + "learning_rate": 1.415354567624495e-05, + "loss": 1.5618, + "step": 7350 + }, + { + "epoch": 0.4018971337971379, + "grad_norm": 1.4683173894882202, + "learning_rate": 1.4151883333808811e-05, + "loss": 1.4166, + "step": 7351 + }, + { + "epoch": 0.40195180624085947, + "grad_norm": 1.5933140516281128, + "learning_rate": 1.4150220852731551e-05, + "loss": 1.3584, + "step": 7352 + }, + { + "epoch": 0.402006478684581, + "grad_norm": 1.3567770719528198, + "learning_rate": 1.4148558233068677e-05, + "loss": 1.4784, + "step": 7353 + }, + { + "epoch": 0.40206115112830254, + "grad_norm": 1.4097959995269775, + "learning_rate": 1.4146895474875706e-05, + "loss": 1.3523, + "step": 7354 + }, + { + "epoch": 0.40211582357202413, + "grad_norm": 1.4108753204345703, + "learning_rate": 1.4145232578208165e-05, + "loss": 1.3791, + "step": 7355 + }, + { + "epoch": 0.40217049601574567, + "grad_norm": 1.5498948097229004, + "learning_rate": 1.414356954312158e-05, + "loss": 1.2888, + "step": 7356 + }, + { + "epoch": 0.4022251684594672, + "grad_norm": 1.5535056591033936, + "learning_rate": 1.4141906369671488e-05, + "loss": 1.2895, + "step": 7357 + }, + { + "epoch": 0.40227984090318875, + "grad_norm": 1.7726242542266846, + "learning_rate": 1.4140243057913418e-05, + "loss": 1.3312, + "step": 7358 + }, + { + "epoch": 0.40233451334691034, + "grad_norm": 1.3071449995040894, + "learning_rate": 1.4138579607902922e-05, + "loss": 1.5569, + "step": 7359 + }, + { + "epoch": 0.4023891857906319, + "grad_norm": 2.4068281650543213, + "learning_rate": 1.4136916019695541e-05, + "loss": 1.1876, + "step": 7360 + }, + { + "epoch": 0.4024438582343534, + "grad_norm": 1.701702356338501, + "learning_rate": 1.4135252293346824e-05, + "loss": 1.6153, + "step": 7361 + }, + { + "epoch": 0.402498530678075, + "grad_norm": 1.261853814125061, + "learning_rate": 1.4133588428912333e-05, + "loss": 1.61, + "step": 7362 + }, + { + "epoch": 0.40255320312179654, + "grad_norm": 1.7735505104064941, + "learning_rate": 1.4131924426447621e-05, + "loss": 1.1739, + "step": 7363 + }, + { + "epoch": 0.4026078755655181, + "grad_norm": 1.807984471321106, + "learning_rate": 1.4130260286008257e-05, + "loss": 1.4306, + "step": 7364 + }, + { + "epoch": 0.4026625480092396, + "grad_norm": 1.8442456722259521, + "learning_rate": 1.4128596007649808e-05, + "loss": 1.5852, + "step": 7365 + }, + { + "epoch": 0.4027172204529612, + "grad_norm": 1.3361034393310547, + "learning_rate": 1.4126931591427855e-05, + "loss": 1.4642, + "step": 7366 + }, + { + "epoch": 0.40277189289668275, + "grad_norm": 2.0199882984161377, + "learning_rate": 1.4125267037397972e-05, + "loss": 1.146, + "step": 7367 + }, + { + "epoch": 0.4028265653404043, + "grad_norm": 1.6648802757263184, + "learning_rate": 1.412360234561574e-05, + "loss": 1.5858, + "step": 7368 + }, + { + "epoch": 0.4028812377841259, + "grad_norm": 1.4846512079238892, + "learning_rate": 1.4121937516136747e-05, + "loss": 1.6466, + "step": 7369 + }, + { + "epoch": 0.4029359102278474, + "grad_norm": 1.6079189777374268, + "learning_rate": 1.4120272549016591e-05, + "loss": 1.8714, + "step": 7370 + }, + { + "epoch": 0.40299058267156895, + "grad_norm": 1.266954779624939, + "learning_rate": 1.4118607444310866e-05, + "loss": 1.509, + "step": 7371 + }, + { + "epoch": 0.4030452551152905, + "grad_norm": 1.4316827058792114, + "learning_rate": 1.4116942202075175e-05, + "loss": 1.4839, + "step": 7372 + }, + { + "epoch": 0.4030999275590121, + "grad_norm": 1.675977110862732, + "learning_rate": 1.4115276822365123e-05, + "loss": 1.4501, + "step": 7373 + }, + { + "epoch": 0.4031546000027336, + "grad_norm": 1.3502978086471558, + "learning_rate": 1.4113611305236317e-05, + "loss": 1.3511, + "step": 7374 + }, + { + "epoch": 0.40320927244645516, + "grad_norm": 1.571675181388855, + "learning_rate": 1.4111945650744379e-05, + "loss": 1.3925, + "step": 7375 + }, + { + "epoch": 0.40326394489017675, + "grad_norm": 1.5254307985305786, + "learning_rate": 1.4110279858944928e-05, + "loss": 1.5125, + "step": 7376 + }, + { + "epoch": 0.4033186173338983, + "grad_norm": 1.3436901569366455, + "learning_rate": 1.4108613929893586e-05, + "loss": 1.3397, + "step": 7377 + }, + { + "epoch": 0.4033732897776198, + "grad_norm": 1.4335379600524902, + "learning_rate": 1.4106947863645983e-05, + "loss": 1.3248, + "step": 7378 + }, + { + "epoch": 0.40342796222134136, + "grad_norm": 1.5759507417678833, + "learning_rate": 1.4105281660257757e-05, + "loss": 1.5008, + "step": 7379 + }, + { + "epoch": 0.40348263466506296, + "grad_norm": 1.2042279243469238, + "learning_rate": 1.410361531978454e-05, + "loss": 1.6326, + "step": 7380 + }, + { + "epoch": 0.4035373071087845, + "grad_norm": 1.1996796131134033, + "learning_rate": 1.4101948842281978e-05, + "loss": 1.4743, + "step": 7381 + }, + { + "epoch": 0.40359197955250603, + "grad_norm": 1.2877349853515625, + "learning_rate": 1.410028222780572e-05, + "loss": 1.4716, + "step": 7382 + }, + { + "epoch": 0.4036466519962276, + "grad_norm": 1.4685378074645996, + "learning_rate": 1.4098615476411416e-05, + "loss": 1.323, + "step": 7383 + }, + { + "epoch": 0.40370132443994916, + "grad_norm": 1.3899675607681274, + "learning_rate": 1.4096948588154723e-05, + "loss": 1.3964, + "step": 7384 + }, + { + "epoch": 0.4037559968836707, + "grad_norm": 1.5984946489334106, + "learning_rate": 1.4095281563091303e-05, + "loss": 1.4396, + "step": 7385 + }, + { + "epoch": 0.40381066932739224, + "grad_norm": 1.451952576637268, + "learning_rate": 1.4093614401276826e-05, + "loss": 1.4663, + "step": 7386 + }, + { + "epoch": 0.40386534177111383, + "grad_norm": 1.566056489944458, + "learning_rate": 1.4091947102766953e-05, + "loss": 1.6654, + "step": 7387 + }, + { + "epoch": 0.40392001421483537, + "grad_norm": 1.3111486434936523, + "learning_rate": 1.4090279667617366e-05, + "loss": 1.3934, + "step": 7388 + }, + { + "epoch": 0.4039746866585569, + "grad_norm": 1.390470266342163, + "learning_rate": 1.408861209588374e-05, + "loss": 1.2775, + "step": 7389 + }, + { + "epoch": 0.4040293591022785, + "grad_norm": 1.5536664724349976, + "learning_rate": 1.4086944387621766e-05, + "loss": 1.1307, + "step": 7390 + }, + { + "epoch": 0.40408403154600003, + "grad_norm": 1.488789439201355, + "learning_rate": 1.4085276542887128e-05, + "loss": 1.4944, + "step": 7391 + }, + { + "epoch": 0.40413870398972157, + "grad_norm": 1.4022642374038696, + "learning_rate": 1.4083608561735517e-05, + "loss": 1.6067, + "step": 7392 + }, + { + "epoch": 0.4041933764334431, + "grad_norm": 1.3465700149536133, + "learning_rate": 1.4081940444222637e-05, + "loss": 1.5008, + "step": 7393 + }, + { + "epoch": 0.4042480488771647, + "grad_norm": 1.4428421258926392, + "learning_rate": 1.4080272190404185e-05, + "loss": 1.4529, + "step": 7394 + }, + { + "epoch": 0.40430272132088624, + "grad_norm": 1.1374701261520386, + "learning_rate": 1.4078603800335871e-05, + "loss": 1.5309, + "step": 7395 + }, + { + "epoch": 0.4043573937646078, + "grad_norm": 1.4970580339431763, + "learning_rate": 1.4076935274073402e-05, + "loss": 1.4637, + "step": 7396 + }, + { + "epoch": 0.40441206620832937, + "grad_norm": 1.3006640672683716, + "learning_rate": 1.4075266611672502e-05, + "loss": 1.4418, + "step": 7397 + }, + { + "epoch": 0.4044667386520509, + "grad_norm": 1.8517948389053345, + "learning_rate": 1.4073597813188884e-05, + "loss": 1.4022, + "step": 7398 + }, + { + "epoch": 0.40452141109577244, + "grad_norm": 1.4575117826461792, + "learning_rate": 1.4071928878678278e-05, + "loss": 1.2139, + "step": 7399 + }, + { + "epoch": 0.404576083539494, + "grad_norm": 1.557366132736206, + "learning_rate": 1.4070259808196411e-05, + "loss": 1.5186, + "step": 7400 + }, + { + "epoch": 0.4046307559832156, + "grad_norm": 1.324807047843933, + "learning_rate": 1.4068590601799018e-05, + "loss": 1.6666, + "step": 7401 + }, + { + "epoch": 0.4046854284269371, + "grad_norm": 1.6726263761520386, + "learning_rate": 1.4066921259541837e-05, + "loss": 1.5294, + "step": 7402 + }, + { + "epoch": 0.40474010087065865, + "grad_norm": 1.701254963874817, + "learning_rate": 1.4065251781480612e-05, + "loss": 1.501, + "step": 7403 + }, + { + "epoch": 0.40479477331438024, + "grad_norm": 2.1835036277770996, + "learning_rate": 1.4063582167671091e-05, + "loss": 1.4744, + "step": 7404 + }, + { + "epoch": 0.4048494457581018, + "grad_norm": 1.3898303508758545, + "learning_rate": 1.4061912418169024e-05, + "loss": 1.6005, + "step": 7405 + }, + { + "epoch": 0.4049041182018233, + "grad_norm": 1.6469066143035889, + "learning_rate": 1.4060242533030173e-05, + "loss": 1.5059, + "step": 7406 + }, + { + "epoch": 0.40495879064554485, + "grad_norm": 1.5340741872787476, + "learning_rate": 1.4058572512310293e-05, + "loss": 1.4119, + "step": 7407 + }, + { + "epoch": 0.40501346308926645, + "grad_norm": 1.4086681604385376, + "learning_rate": 1.4056902356065154e-05, + "loss": 1.5313, + "step": 7408 + }, + { + "epoch": 0.405068135532988, + "grad_norm": 2.242154359817505, + "learning_rate": 1.4055232064350526e-05, + "loss": 1.3879, + "step": 7409 + }, + { + "epoch": 0.4051228079767095, + "grad_norm": 1.373396635055542, + "learning_rate": 1.4053561637222182e-05, + "loss": 1.4089, + "step": 7410 + }, + { + "epoch": 0.4051774804204311, + "grad_norm": 1.5865622758865356, + "learning_rate": 1.4051891074735906e-05, + "loss": 1.3563, + "step": 7411 + }, + { + "epoch": 0.40523215286415265, + "grad_norm": 1.2975393533706665, + "learning_rate": 1.405022037694748e-05, + "loss": 1.4853, + "step": 7412 + }, + { + "epoch": 0.4052868253078742, + "grad_norm": 1.3181310892105103, + "learning_rate": 1.4048549543912687e-05, + "loss": 1.4266, + "step": 7413 + }, + { + "epoch": 0.4053414977515957, + "grad_norm": 1.4046919345855713, + "learning_rate": 1.4046878575687326e-05, + "loss": 1.574, + "step": 7414 + }, + { + "epoch": 0.4053961701953173, + "grad_norm": 1.7441761493682861, + "learning_rate": 1.4045207472327194e-05, + "loss": 1.4295, + "step": 7415 + }, + { + "epoch": 0.40545084263903886, + "grad_norm": 1.4302202463150024, + "learning_rate": 1.4043536233888091e-05, + "loss": 1.5071, + "step": 7416 + }, + { + "epoch": 0.4055055150827604, + "grad_norm": 1.5526491403579712, + "learning_rate": 1.4041864860425822e-05, + "loss": 1.4091, + "step": 7417 + }, + { + "epoch": 0.405560187526482, + "grad_norm": 1.5717599391937256, + "learning_rate": 1.4040193351996206e-05, + "loss": 1.4804, + "step": 7418 + }, + { + "epoch": 0.4056148599702035, + "grad_norm": 1.8117263317108154, + "learning_rate": 1.4038521708655054e-05, + "loss": 1.3056, + "step": 7419 + }, + { + "epoch": 0.40566953241392506, + "grad_norm": 1.3939416408538818, + "learning_rate": 1.4036849930458181e-05, + "loss": 1.7289, + "step": 7420 + }, + { + "epoch": 0.4057242048576466, + "grad_norm": 2.0106699466705322, + "learning_rate": 1.4035178017461419e-05, + "loss": 1.3199, + "step": 7421 + }, + { + "epoch": 0.4057788773013682, + "grad_norm": 1.8926775455474854, + "learning_rate": 1.4033505969720592e-05, + "loss": 1.5794, + "step": 7422 + }, + { + "epoch": 0.40583354974508973, + "grad_norm": 1.419091820716858, + "learning_rate": 1.4031833787291536e-05, + "loss": 1.4643, + "step": 7423 + }, + { + "epoch": 0.40588822218881127, + "grad_norm": 1.8217281103134155, + "learning_rate": 1.4030161470230088e-05, + "loss": 1.4588, + "step": 7424 + }, + { + "epoch": 0.40594289463253286, + "grad_norm": 1.466501235961914, + "learning_rate": 1.4028489018592095e-05, + "loss": 1.5196, + "step": 7425 + }, + { + "epoch": 0.4059975670762544, + "grad_norm": 1.543951153755188, + "learning_rate": 1.40268164324334e-05, + "loss": 1.7183, + "step": 7426 + }, + { + "epoch": 0.40605223951997593, + "grad_norm": 1.6229338645935059, + "learning_rate": 1.4025143711809853e-05, + "loss": 1.3702, + "step": 7427 + }, + { + "epoch": 0.40610691196369747, + "grad_norm": 1.659999132156372, + "learning_rate": 1.4023470856777313e-05, + "loss": 1.4547, + "step": 7428 + }, + { + "epoch": 0.40616158440741906, + "grad_norm": 1.5445003509521484, + "learning_rate": 1.402179786739164e-05, + "loss": 1.2625, + "step": 7429 + }, + { + "epoch": 0.4062162568511406, + "grad_norm": 1.5837241411209106, + "learning_rate": 1.4020124743708696e-05, + "loss": 1.5384, + "step": 7430 + }, + { + "epoch": 0.40627092929486214, + "grad_norm": 1.2450941801071167, + "learning_rate": 1.4018451485784357e-05, + "loss": 1.3334, + "step": 7431 + }, + { + "epoch": 0.40632560173858373, + "grad_norm": 1.4989805221557617, + "learning_rate": 1.4016778093674493e-05, + "loss": 1.4062, + "step": 7432 + }, + { + "epoch": 0.40638027418230527, + "grad_norm": 1.671254277229309, + "learning_rate": 1.4015104567434981e-05, + "loss": 1.0775, + "step": 7433 + }, + { + "epoch": 0.4064349466260268, + "grad_norm": 1.522674322128296, + "learning_rate": 1.4013430907121706e-05, + "loss": 1.541, + "step": 7434 + }, + { + "epoch": 0.40648961906974834, + "grad_norm": 1.3047131299972534, + "learning_rate": 1.4011757112790556e-05, + "loss": 1.4911, + "step": 7435 + }, + { + "epoch": 0.40654429151346994, + "grad_norm": 1.5849344730377197, + "learning_rate": 1.401008318449742e-05, + "loss": 1.2928, + "step": 7436 + }, + { + "epoch": 0.4065989639571915, + "grad_norm": 1.330362319946289, + "learning_rate": 1.4008409122298199e-05, + "loss": 1.4176, + "step": 7437 + }, + { + "epoch": 0.406653636400913, + "grad_norm": 1.6918046474456787, + "learning_rate": 1.400673492624879e-05, + "loss": 1.483, + "step": 7438 + }, + { + "epoch": 0.4067083088446346, + "grad_norm": 1.3310589790344238, + "learning_rate": 1.4005060596405102e-05, + "loss": 1.267, + "step": 7439 + }, + { + "epoch": 0.40676298128835614, + "grad_norm": 1.4434466361999512, + "learning_rate": 1.400338613282304e-05, + "loss": 1.7176, + "step": 7440 + }, + { + "epoch": 0.4068176537320777, + "grad_norm": 1.5338947772979736, + "learning_rate": 1.4001711535558523e-05, + "loss": 1.3085, + "step": 7441 + }, + { + "epoch": 0.40687232617579927, + "grad_norm": 1.8659460544586182, + "learning_rate": 1.4000036804667464e-05, + "loss": 1.3175, + "step": 7442 + }, + { + "epoch": 0.4069269986195208, + "grad_norm": 1.6251797676086426, + "learning_rate": 1.3998361940205794e-05, + "loss": 1.7394, + "step": 7443 + }, + { + "epoch": 0.40698167106324235, + "grad_norm": 1.5500259399414062, + "learning_rate": 1.3996686942229435e-05, + "loss": 1.3776, + "step": 7444 + }, + { + "epoch": 0.4070363435069639, + "grad_norm": 1.5346641540527344, + "learning_rate": 1.3995011810794319e-05, + "loss": 1.1883, + "step": 7445 + }, + { + "epoch": 0.4070910159506855, + "grad_norm": 1.4618966579437256, + "learning_rate": 1.3993336545956386e-05, + "loss": 1.3594, + "step": 7446 + }, + { + "epoch": 0.407145688394407, + "grad_norm": 1.2365756034851074, + "learning_rate": 1.3991661147771574e-05, + "loss": 1.3537, + "step": 7447 + }, + { + "epoch": 0.40720036083812855, + "grad_norm": 1.5957331657409668, + "learning_rate": 1.3989985616295826e-05, + "loss": 1.5261, + "step": 7448 + }, + { + "epoch": 0.40725503328185014, + "grad_norm": 1.479835867881775, + "learning_rate": 1.3988309951585101e-05, + "loss": 1.4437, + "step": 7449 + }, + { + "epoch": 0.4073097057255717, + "grad_norm": 1.5382275581359863, + "learning_rate": 1.3986634153695343e-05, + "loss": 1.3215, + "step": 7450 + }, + { + "epoch": 0.4073643781692932, + "grad_norm": 1.2974194288253784, + "learning_rate": 1.3984958222682522e-05, + "loss": 1.5096, + "step": 7451 + }, + { + "epoch": 0.40741905061301475, + "grad_norm": 1.3224244117736816, + "learning_rate": 1.3983282158602589e-05, + "loss": 1.435, + "step": 7452 + }, + { + "epoch": 0.40747372305673635, + "grad_norm": 1.360091209411621, + "learning_rate": 1.3981605961511522e-05, + "loss": 1.3627, + "step": 7453 + }, + { + "epoch": 0.4075283955004579, + "grad_norm": 2.0193192958831787, + "learning_rate": 1.3979929631465286e-05, + "loss": 1.4092, + "step": 7454 + }, + { + "epoch": 0.4075830679441794, + "grad_norm": 1.4262726306915283, + "learning_rate": 1.3978253168519859e-05, + "loss": 1.5752, + "step": 7455 + }, + { + "epoch": 0.407637740387901, + "grad_norm": 1.6300171613693237, + "learning_rate": 1.3976576572731228e-05, + "loss": 1.5369, + "step": 7456 + }, + { + "epoch": 0.40769241283162255, + "grad_norm": 1.3211814165115356, + "learning_rate": 1.3974899844155373e-05, + "loss": 1.3444, + "step": 7457 + }, + { + "epoch": 0.4077470852753441, + "grad_norm": 1.3347744941711426, + "learning_rate": 1.3973222982848282e-05, + "loss": 1.6383, + "step": 7458 + }, + { + "epoch": 0.4078017577190656, + "grad_norm": 1.58376944065094, + "learning_rate": 1.3971545988865953e-05, + "loss": 1.4348, + "step": 7459 + }, + { + "epoch": 0.4078564301627872, + "grad_norm": 1.7164959907531738, + "learning_rate": 1.3969868862264386e-05, + "loss": 1.209, + "step": 7460 + }, + { + "epoch": 0.40791110260650876, + "grad_norm": 1.443773627281189, + "learning_rate": 1.396819160309958e-05, + "loss": 1.3074, + "step": 7461 + }, + { + "epoch": 0.4079657750502303, + "grad_norm": 1.4983059167861938, + "learning_rate": 1.3966514211427544e-05, + "loss": 1.558, + "step": 7462 + }, + { + "epoch": 0.4080204474939519, + "grad_norm": 1.5636337995529175, + "learning_rate": 1.3964836687304293e-05, + "loss": 1.6971, + "step": 7463 + }, + { + "epoch": 0.4080751199376734, + "grad_norm": 1.3831578493118286, + "learning_rate": 1.3963159030785843e-05, + "loss": 1.5866, + "step": 7464 + }, + { + "epoch": 0.40812979238139496, + "grad_norm": 1.8813450336456299, + "learning_rate": 1.3961481241928207e-05, + "loss": 1.4021, + "step": 7465 + }, + { + "epoch": 0.4081844648251165, + "grad_norm": 1.6406185626983643, + "learning_rate": 1.395980332078742e-05, + "loss": 1.5937, + "step": 7466 + }, + { + "epoch": 0.4082391372688381, + "grad_norm": 1.9929245710372925, + "learning_rate": 1.3958125267419509e-05, + "loss": 1.1619, + "step": 7467 + }, + { + "epoch": 0.40829380971255963, + "grad_norm": 1.5897691249847412, + "learning_rate": 1.3956447081880506e-05, + "loss": 1.7383, + "step": 7468 + }, + { + "epoch": 0.40834848215628117, + "grad_norm": 1.6211649179458618, + "learning_rate": 1.3954768764226449e-05, + "loss": 1.3594, + "step": 7469 + }, + { + "epoch": 0.40840315460000276, + "grad_norm": 1.6519306898117065, + "learning_rate": 1.3953090314513387e-05, + "loss": 1.3685, + "step": 7470 + }, + { + "epoch": 0.4084578270437243, + "grad_norm": 1.632639765739441, + "learning_rate": 1.3951411732797363e-05, + "loss": 1.2452, + "step": 7471 + }, + { + "epoch": 0.40851249948744583, + "grad_norm": 2.166398763656616, + "learning_rate": 1.3949733019134427e-05, + "loss": 1.4413, + "step": 7472 + }, + { + "epoch": 0.40856717193116737, + "grad_norm": 1.609758973121643, + "learning_rate": 1.3948054173580636e-05, + "loss": 1.483, + "step": 7473 + }, + { + "epoch": 0.40862184437488897, + "grad_norm": 1.8781194686889648, + "learning_rate": 1.3946375196192052e-05, + "loss": 1.5019, + "step": 7474 + }, + { + "epoch": 0.4086765168186105, + "grad_norm": 1.4866225719451904, + "learning_rate": 1.394469608702474e-05, + "loss": 1.3911, + "step": 7475 + }, + { + "epoch": 0.40873118926233204, + "grad_norm": 1.422333836555481, + "learning_rate": 1.394301684613477e-05, + "loss": 1.4621, + "step": 7476 + }, + { + "epoch": 0.40878586170605363, + "grad_norm": 1.4858285188674927, + "learning_rate": 1.3941337473578216e-05, + "loss": 1.2469, + "step": 7477 + }, + { + "epoch": 0.40884053414977517, + "grad_norm": 1.4845811128616333, + "learning_rate": 1.3939657969411155e-05, + "loss": 1.504, + "step": 7478 + }, + { + "epoch": 0.4088952065934967, + "grad_norm": 1.3952293395996094, + "learning_rate": 1.3937978333689667e-05, + "loss": 1.703, + "step": 7479 + }, + { + "epoch": 0.40894987903721824, + "grad_norm": 2.105907917022705, + "learning_rate": 1.3936298566469843e-05, + "loss": 1.3114, + "step": 7480 + }, + { + "epoch": 0.40900455148093984, + "grad_norm": 1.8305494785308838, + "learning_rate": 1.3934618667807773e-05, + "loss": 1.4651, + "step": 7481 + }, + { + "epoch": 0.4090592239246614, + "grad_norm": 1.3271152973175049, + "learning_rate": 1.3932938637759555e-05, + "loss": 1.5748, + "step": 7482 + }, + { + "epoch": 0.4091138963683829, + "grad_norm": 1.7673894166946411, + "learning_rate": 1.3931258476381284e-05, + "loss": 1.564, + "step": 7483 + }, + { + "epoch": 0.4091685688121045, + "grad_norm": 1.6030043363571167, + "learning_rate": 1.392957818372907e-05, + "loss": 1.4803, + "step": 7484 + }, + { + "epoch": 0.40922324125582604, + "grad_norm": 1.4576308727264404, + "learning_rate": 1.3927897759859018e-05, + "loss": 1.5032, + "step": 7485 + }, + { + "epoch": 0.4092779136995476, + "grad_norm": 1.4160184860229492, + "learning_rate": 1.3926217204827241e-05, + "loss": 1.4301, + "step": 7486 + }, + { + "epoch": 0.4093325861432691, + "grad_norm": 1.6681439876556396, + "learning_rate": 1.392453651868986e-05, + "loss": 1.5731, + "step": 7487 + }, + { + "epoch": 0.4093872585869907, + "grad_norm": 1.1953924894332886, + "learning_rate": 1.3922855701502997e-05, + "loss": 1.593, + "step": 7488 + }, + { + "epoch": 0.40944193103071225, + "grad_norm": 1.82209050655365, + "learning_rate": 1.3921174753322775e-05, + "loss": 1.3837, + "step": 7489 + }, + { + "epoch": 0.4094966034744338, + "grad_norm": 1.4780224561691284, + "learning_rate": 1.3919493674205326e-05, + "loss": 1.4564, + "step": 7490 + }, + { + "epoch": 0.4095512759181554, + "grad_norm": 1.5255184173583984, + "learning_rate": 1.391781246420679e-05, + "loss": 1.3109, + "step": 7491 + }, + { + "epoch": 0.4096059483618769, + "grad_norm": 1.7913739681243896, + "learning_rate": 1.3916131123383298e-05, + "loss": 1.5374, + "step": 7492 + }, + { + "epoch": 0.40966062080559845, + "grad_norm": 1.7213490009307861, + "learning_rate": 1.3914449651790998e-05, + "loss": 1.619, + "step": 7493 + }, + { + "epoch": 0.40971529324932, + "grad_norm": 1.837601661682129, + "learning_rate": 1.3912768049486039e-05, + "loss": 1.4768, + "step": 7494 + }, + { + "epoch": 0.4097699656930416, + "grad_norm": 1.2706618309020996, + "learning_rate": 1.3911086316524576e-05, + "loss": 1.4749, + "step": 7495 + }, + { + "epoch": 0.4098246381367631, + "grad_norm": 1.4166290760040283, + "learning_rate": 1.390940445296276e-05, + "loss": 1.3449, + "step": 7496 + }, + { + "epoch": 0.40987931058048466, + "grad_norm": 1.9802695512771606, + "learning_rate": 1.3907722458856758e-05, + "loss": 1.4256, + "step": 7497 + }, + { + "epoch": 0.40993398302420625, + "grad_norm": 1.891804575920105, + "learning_rate": 1.3906040334262733e-05, + "loss": 1.2594, + "step": 7498 + }, + { + "epoch": 0.4099886554679278, + "grad_norm": 1.8955553770065308, + "learning_rate": 1.3904358079236854e-05, + "loss": 1.5671, + "step": 7499 + }, + { + "epoch": 0.4100433279116493, + "grad_norm": 1.2355114221572876, + "learning_rate": 1.3902675693835299e-05, + "loss": 1.4279, + "step": 7500 + }, + { + "epoch": 0.41009800035537086, + "grad_norm": 1.1238588094711304, + "learning_rate": 1.3900993178114241e-05, + "loss": 1.4727, + "step": 7501 + }, + { + "epoch": 0.41015267279909245, + "grad_norm": 1.5841468572616577, + "learning_rate": 1.3899310532129872e-05, + "loss": 1.6107, + "step": 7502 + }, + { + "epoch": 0.410207345242814, + "grad_norm": 1.523137092590332, + "learning_rate": 1.3897627755938372e-05, + "loss": 1.4737, + "step": 7503 + }, + { + "epoch": 0.41026201768653553, + "grad_norm": 1.5534621477127075, + "learning_rate": 1.3895944849595934e-05, + "loss": 1.5846, + "step": 7504 + }, + { + "epoch": 0.4103166901302571, + "grad_norm": 1.5625958442687988, + "learning_rate": 1.3894261813158758e-05, + "loss": 1.3208, + "step": 7505 + }, + { + "epoch": 0.41037136257397866, + "grad_norm": 1.310986042022705, + "learning_rate": 1.389257864668304e-05, + "loss": 1.3601, + "step": 7506 + }, + { + "epoch": 0.4104260350177002, + "grad_norm": 1.3991798162460327, + "learning_rate": 1.3890895350224984e-05, + "loss": 1.4401, + "step": 7507 + }, + { + "epoch": 0.41048070746142173, + "grad_norm": 1.7956494092941284, + "learning_rate": 1.3889211923840805e-05, + "loss": 1.5628, + "step": 7508 + }, + { + "epoch": 0.4105353799051433, + "grad_norm": 1.1898622512817383, + "learning_rate": 1.3887528367586714e-05, + "loss": 1.4658, + "step": 7509 + }, + { + "epoch": 0.41059005234886486, + "grad_norm": 1.3725134134292603, + "learning_rate": 1.388584468151893e-05, + "loss": 1.3929, + "step": 7510 + }, + { + "epoch": 0.4106447247925864, + "grad_norm": 1.2056872844696045, + "learning_rate": 1.388416086569367e-05, + "loss": 1.3486, + "step": 7511 + }, + { + "epoch": 0.410699397236308, + "grad_norm": 1.1958638429641724, + "learning_rate": 1.3882476920167167e-05, + "loss": 1.628, + "step": 7512 + }, + { + "epoch": 0.41075406968002953, + "grad_norm": 1.5865432024002075, + "learning_rate": 1.3880792844995646e-05, + "loss": 1.2808, + "step": 7513 + }, + { + "epoch": 0.41080874212375107, + "grad_norm": 1.4715324640274048, + "learning_rate": 1.3879108640235346e-05, + "loss": 1.3108, + "step": 7514 + }, + { + "epoch": 0.4108634145674726, + "grad_norm": 1.3870488405227661, + "learning_rate": 1.3877424305942506e-05, + "loss": 1.424, + "step": 7515 + }, + { + "epoch": 0.4109180870111942, + "grad_norm": 1.518601894378662, + "learning_rate": 1.3875739842173372e-05, + "loss": 1.526, + "step": 7516 + }, + { + "epoch": 0.41097275945491574, + "grad_norm": 1.8974668979644775, + "learning_rate": 1.3874055248984191e-05, + "loss": 1.3317, + "step": 7517 + }, + { + "epoch": 0.4110274318986373, + "grad_norm": 2.465975284576416, + "learning_rate": 1.387237052643121e-05, + "loss": 1.6633, + "step": 7518 + }, + { + "epoch": 0.41108210434235887, + "grad_norm": 1.7022799253463745, + "learning_rate": 1.3870685674570695e-05, + "loss": 1.4543, + "step": 7519 + }, + { + "epoch": 0.4111367767860804, + "grad_norm": 1.2623456716537476, + "learning_rate": 1.3869000693458898e-05, + "loss": 1.395, + "step": 7520 + }, + { + "epoch": 0.41119144922980194, + "grad_norm": 1.9032195806503296, + "learning_rate": 1.386731558315209e-05, + "loss": 1.1652, + "step": 7521 + }, + { + "epoch": 0.4112461216735235, + "grad_norm": 1.5357613563537598, + "learning_rate": 1.3865630343706543e-05, + "loss": 1.2806, + "step": 7522 + }, + { + "epoch": 0.41130079411724507, + "grad_norm": 1.5042051076889038, + "learning_rate": 1.3863944975178525e-05, + "loss": 1.4979, + "step": 7523 + }, + { + "epoch": 0.4113554665609666, + "grad_norm": 1.583094835281372, + "learning_rate": 1.3862259477624317e-05, + "loss": 1.4789, + "step": 7524 + }, + { + "epoch": 0.41141013900468815, + "grad_norm": 1.1842082738876343, + "learning_rate": 1.3860573851100203e-05, + "loss": 1.599, + "step": 7525 + }, + { + "epoch": 0.41146481144840974, + "grad_norm": 1.4019466638565063, + "learning_rate": 1.385888809566247e-05, + "loss": 1.4726, + "step": 7526 + }, + { + "epoch": 0.4115194838921313, + "grad_norm": 1.9063142538070679, + "learning_rate": 1.3857202211367406e-05, + "loss": 1.3286, + "step": 7527 + }, + { + "epoch": 0.4115741563358528, + "grad_norm": 1.5700113773345947, + "learning_rate": 1.3855516198271307e-05, + "loss": 1.4242, + "step": 7528 + }, + { + "epoch": 0.41162882877957435, + "grad_norm": 1.8116118907928467, + "learning_rate": 1.385383005643048e-05, + "loss": 1.3372, + "step": 7529 + }, + { + "epoch": 0.41168350122329594, + "grad_norm": 1.429931402206421, + "learning_rate": 1.3852143785901224e-05, + "loss": 1.3439, + "step": 7530 + }, + { + "epoch": 0.4117381736670175, + "grad_norm": 1.7425503730773926, + "learning_rate": 1.3850457386739846e-05, + "loss": 1.3478, + "step": 7531 + }, + { + "epoch": 0.411792846110739, + "grad_norm": 2.055054187774658, + "learning_rate": 1.3848770859002658e-05, + "loss": 1.2236, + "step": 7532 + }, + { + "epoch": 0.4118475185544606, + "grad_norm": 1.9898180961608887, + "learning_rate": 1.3847084202745982e-05, + "loss": 1.4616, + "step": 7533 + }, + { + "epoch": 0.41190219099818215, + "grad_norm": 1.871288776397705, + "learning_rate": 1.3845397418026136e-05, + "loss": 1.3165, + "step": 7534 + }, + { + "epoch": 0.4119568634419037, + "grad_norm": 2.2513957023620605, + "learning_rate": 1.3843710504899448e-05, + "loss": 1.0745, + "step": 7535 + }, + { + "epoch": 0.4120115358856252, + "grad_norm": 1.9171959161758423, + "learning_rate": 1.3842023463422247e-05, + "loss": 1.1288, + "step": 7536 + }, + { + "epoch": 0.4120662083293468, + "grad_norm": 1.6185513734817505, + "learning_rate": 1.3840336293650867e-05, + "loss": 1.4247, + "step": 7537 + }, + { + "epoch": 0.41212088077306835, + "grad_norm": 1.6830681562423706, + "learning_rate": 1.3838648995641645e-05, + "loss": 1.3821, + "step": 7538 + }, + { + "epoch": 0.4121755532167899, + "grad_norm": 1.5894865989685059, + "learning_rate": 1.3836961569450924e-05, + "loss": 1.4369, + "step": 7539 + }, + { + "epoch": 0.4122302256605115, + "grad_norm": 1.9099839925765991, + "learning_rate": 1.3835274015135056e-05, + "loss": 1.3269, + "step": 7540 + }, + { + "epoch": 0.412284898104233, + "grad_norm": 2.134178400039673, + "learning_rate": 1.3833586332750386e-05, + "loss": 1.4362, + "step": 7541 + }, + { + "epoch": 0.41233957054795456, + "grad_norm": 1.3502863645553589, + "learning_rate": 1.3831898522353275e-05, + "loss": 1.4074, + "step": 7542 + }, + { + "epoch": 0.4123942429916761, + "grad_norm": 1.7273449897766113, + "learning_rate": 1.3830210584000078e-05, + "loss": 1.4578, + "step": 7543 + }, + { + "epoch": 0.4124489154353977, + "grad_norm": 1.6296606063842773, + "learning_rate": 1.3828522517747164e-05, + "loss": 1.3656, + "step": 7544 + }, + { + "epoch": 0.4125035878791192, + "grad_norm": 1.7371100187301636, + "learning_rate": 1.3826834323650899e-05, + "loss": 1.6246, + "step": 7545 + }, + { + "epoch": 0.41255826032284076, + "grad_norm": 1.6994826793670654, + "learning_rate": 1.3825146001767656e-05, + "loss": 1.5503, + "step": 7546 + }, + { + "epoch": 0.41261293276656236, + "grad_norm": 1.5686066150665283, + "learning_rate": 1.3823457552153812e-05, + "loss": 1.8081, + "step": 7547 + }, + { + "epoch": 0.4126676052102839, + "grad_norm": 1.5977447032928467, + "learning_rate": 1.3821768974865746e-05, + "loss": 1.3139, + "step": 7548 + }, + { + "epoch": 0.41272227765400543, + "grad_norm": 1.5612759590148926, + "learning_rate": 1.3820080269959848e-05, + "loss": 1.4748, + "step": 7549 + }, + { + "epoch": 0.41277695009772697, + "grad_norm": 1.4275356531143188, + "learning_rate": 1.3818391437492504e-05, + "loss": 1.4726, + "step": 7550 + }, + { + "epoch": 0.41283162254144856, + "grad_norm": 1.4024221897125244, + "learning_rate": 1.3816702477520113e-05, + "loss": 1.476, + "step": 7551 + }, + { + "epoch": 0.4128862949851701, + "grad_norm": 1.706297516822815, + "learning_rate": 1.3815013390099068e-05, + "loss": 1.5117, + "step": 7552 + }, + { + "epoch": 0.41294096742889164, + "grad_norm": 1.5669548511505127, + "learning_rate": 1.3813324175285772e-05, + "loss": 1.3897, + "step": 7553 + }, + { + "epoch": 0.41299563987261323, + "grad_norm": 1.5296021699905396, + "learning_rate": 1.3811634833136638e-05, + "loss": 1.5225, + "step": 7554 + }, + { + "epoch": 0.41305031231633477, + "grad_norm": 1.5500768423080444, + "learning_rate": 1.3809945363708071e-05, + "loss": 1.3174, + "step": 7555 + }, + { + "epoch": 0.4131049847600563, + "grad_norm": 1.2677241563796997, + "learning_rate": 1.3808255767056484e-05, + "loss": 1.3012, + "step": 7556 + }, + { + "epoch": 0.41315965720377784, + "grad_norm": 1.6708009243011475, + "learning_rate": 1.3806566043238302e-05, + "loss": 1.35, + "step": 7557 + }, + { + "epoch": 0.41321432964749943, + "grad_norm": 1.5054548978805542, + "learning_rate": 1.3804876192309952e-05, + "loss": 1.3714, + "step": 7558 + }, + { + "epoch": 0.41326900209122097, + "grad_norm": 1.5628021955490112, + "learning_rate": 1.3803186214327852e-05, + "loss": 1.3975, + "step": 7559 + }, + { + "epoch": 0.4133236745349425, + "grad_norm": 1.848334550857544, + "learning_rate": 1.3801496109348442e-05, + "loss": 1.4674, + "step": 7560 + }, + { + "epoch": 0.4133783469786641, + "grad_norm": 1.4811826944351196, + "learning_rate": 1.3799805877428159e-05, + "loss": 1.3886, + "step": 7561 + }, + { + "epoch": 0.41343301942238564, + "grad_norm": 1.2478362321853638, + "learning_rate": 1.379811551862344e-05, + "loss": 1.5735, + "step": 7562 + }, + { + "epoch": 0.4134876918661072, + "grad_norm": 1.444819450378418, + "learning_rate": 1.379642503299073e-05, + "loss": 1.8032, + "step": 7563 + }, + { + "epoch": 0.4135423643098287, + "grad_norm": 1.6130213737487793, + "learning_rate": 1.379473442058648e-05, + "loss": 1.4965, + "step": 7564 + }, + { + "epoch": 0.4135970367535503, + "grad_norm": 1.68278169631958, + "learning_rate": 1.3793043681467141e-05, + "loss": 1.3018, + "step": 7565 + }, + { + "epoch": 0.41365170919727184, + "grad_norm": 1.454595685005188, + "learning_rate": 1.3791352815689174e-05, + "loss": 1.3494, + "step": 7566 + }, + { + "epoch": 0.4137063816409934, + "grad_norm": 1.1736068725585938, + "learning_rate": 1.3789661823309041e-05, + "loss": 1.7414, + "step": 7567 + }, + { + "epoch": 0.413761054084715, + "grad_norm": 1.517223596572876, + "learning_rate": 1.3787970704383207e-05, + "loss": 1.4341, + "step": 7568 + }, + { + "epoch": 0.4138157265284365, + "grad_norm": 1.694697618484497, + "learning_rate": 1.3786279458968143e-05, + "loss": 1.4687, + "step": 7569 + }, + { + "epoch": 0.41387039897215805, + "grad_norm": 1.6911875009536743, + "learning_rate": 1.3784588087120323e-05, + "loss": 1.4853, + "step": 7570 + }, + { + "epoch": 0.4139250714158796, + "grad_norm": 1.3952594995498657, + "learning_rate": 1.3782896588896222e-05, + "loss": 1.5176, + "step": 7571 + }, + { + "epoch": 0.4139797438596012, + "grad_norm": 1.717255711555481, + "learning_rate": 1.3781204964352332e-05, + "loss": 1.5226, + "step": 7572 + }, + { + "epoch": 0.4140344163033227, + "grad_norm": 1.4932643175125122, + "learning_rate": 1.3779513213545132e-05, + "loss": 1.2959, + "step": 7573 + }, + { + "epoch": 0.41408908874704425, + "grad_norm": 1.7093876600265503, + "learning_rate": 1.3777821336531121e-05, + "loss": 1.5272, + "step": 7574 + }, + { + "epoch": 0.41414376119076585, + "grad_norm": 1.2561430931091309, + "learning_rate": 1.3776129333366787e-05, + "loss": 1.5183, + "step": 7575 + }, + { + "epoch": 0.4141984336344874, + "grad_norm": 1.3356369733810425, + "learning_rate": 1.3774437204108634e-05, + "loss": 1.4974, + "step": 7576 + }, + { + "epoch": 0.4142531060782089, + "grad_norm": 1.5467244386672974, + "learning_rate": 1.3772744948813166e-05, + "loss": 1.3933, + "step": 7577 + }, + { + "epoch": 0.41430777852193046, + "grad_norm": 1.43560791015625, + "learning_rate": 1.377105256753689e-05, + "loss": 1.3377, + "step": 7578 + }, + { + "epoch": 0.41436245096565205, + "grad_norm": 1.8164342641830444, + "learning_rate": 1.3769360060336323e-05, + "loss": 1.4512, + "step": 7579 + }, + { + "epoch": 0.4144171234093736, + "grad_norm": 1.6401740312576294, + "learning_rate": 1.3767667427267976e-05, + "loss": 1.3037, + "step": 7580 + }, + { + "epoch": 0.4144717958530951, + "grad_norm": 1.438740849494934, + "learning_rate": 1.3765974668388373e-05, + "loss": 1.3491, + "step": 7581 + }, + { + "epoch": 0.4145264682968167, + "grad_norm": 1.6696739196777344, + "learning_rate": 1.376428178375404e-05, + "loss": 1.5097, + "step": 7582 + }, + { + "epoch": 0.41458114074053826, + "grad_norm": 1.7776660919189453, + "learning_rate": 1.3762588773421506e-05, + "loss": 1.2171, + "step": 7583 + }, + { + "epoch": 0.4146358131842598, + "grad_norm": 1.5216095447540283, + "learning_rate": 1.37608956374473e-05, + "loss": 1.3715, + "step": 7584 + }, + { + "epoch": 0.41469048562798133, + "grad_norm": 1.3109209537506104, + "learning_rate": 1.3759202375887963e-05, + "loss": 1.4393, + "step": 7585 + }, + { + "epoch": 0.4147451580717029, + "grad_norm": 1.4631444215774536, + "learning_rate": 1.3757508988800042e-05, + "loss": 1.6964, + "step": 7586 + }, + { + "epoch": 0.41479983051542446, + "grad_norm": 2.174044609069824, + "learning_rate": 1.3755815476240076e-05, + "loss": 1.3758, + "step": 7587 + }, + { + "epoch": 0.414854502959146, + "grad_norm": 1.4422506093978882, + "learning_rate": 1.3754121838264618e-05, + "loss": 1.4802, + "step": 7588 + }, + { + "epoch": 0.4149091754028676, + "grad_norm": 3.9060137271881104, + "learning_rate": 1.3752428074930224e-05, + "loss": 1.6832, + "step": 7589 + }, + { + "epoch": 0.41496384784658913, + "grad_norm": 1.721658706665039, + "learning_rate": 1.3750734186293448e-05, + "loss": 1.3477, + "step": 7590 + }, + { + "epoch": 0.41501852029031067, + "grad_norm": 1.7138992547988892, + "learning_rate": 1.3749040172410857e-05, + "loss": 1.4659, + "step": 7591 + }, + { + "epoch": 0.4150731927340322, + "grad_norm": 1.527883768081665, + "learning_rate": 1.3747346033339017e-05, + "loss": 1.4717, + "step": 7592 + }, + { + "epoch": 0.4151278651777538, + "grad_norm": 1.773754358291626, + "learning_rate": 1.3745651769134502e-05, + "loss": 1.2406, + "step": 7593 + }, + { + "epoch": 0.41518253762147533, + "grad_norm": 1.268386960029602, + "learning_rate": 1.3743957379853885e-05, + "loss": 1.4309, + "step": 7594 + }, + { + "epoch": 0.41523721006519687, + "grad_norm": 1.2703272104263306, + "learning_rate": 1.3742262865553744e-05, + "loss": 1.5201, + "step": 7595 + }, + { + "epoch": 0.41529188250891846, + "grad_norm": 1.5564934015274048, + "learning_rate": 1.3740568226290665e-05, + "loss": 1.4682, + "step": 7596 + }, + { + "epoch": 0.41534655495264, + "grad_norm": 1.2244222164154053, + "learning_rate": 1.3738873462121235e-05, + "loss": 1.6497, + "step": 7597 + }, + { + "epoch": 0.41540122739636154, + "grad_norm": 1.5182820558547974, + "learning_rate": 1.3737178573102044e-05, + "loss": 1.646, + "step": 7598 + }, + { + "epoch": 0.4154558998400831, + "grad_norm": 1.4908963441848755, + "learning_rate": 1.3735483559289693e-05, + "loss": 1.856, + "step": 7599 + }, + { + "epoch": 0.41551057228380467, + "grad_norm": 1.1754862070083618, + "learning_rate": 1.3733788420740783e-05, + "loss": 1.5515, + "step": 7600 + }, + { + "epoch": 0.4155652447275262, + "grad_norm": 1.7354682683944702, + "learning_rate": 1.3732093157511914e-05, + "loss": 1.3596, + "step": 7601 + }, + { + "epoch": 0.41561991717124774, + "grad_norm": 1.890094518661499, + "learning_rate": 1.3730397769659696e-05, + "loss": 1.4754, + "step": 7602 + }, + { + "epoch": 0.41567458961496934, + "grad_norm": 1.6929411888122559, + "learning_rate": 1.3728702257240745e-05, + "loss": 1.3443, + "step": 7603 + }, + { + "epoch": 0.4157292620586909, + "grad_norm": 1.703754186630249, + "learning_rate": 1.3727006620311674e-05, + "loss": 1.3493, + "step": 7604 + }, + { + "epoch": 0.4157839345024124, + "grad_norm": 1.4868919849395752, + "learning_rate": 1.3725310858929106e-05, + "loss": 1.3219, + "step": 7605 + }, + { + "epoch": 0.41583860694613395, + "grad_norm": 1.397002100944519, + "learning_rate": 1.3723614973149667e-05, + "loss": 1.4154, + "step": 7606 + }, + { + "epoch": 0.41589327938985554, + "grad_norm": 1.404613733291626, + "learning_rate": 1.3721918963029987e-05, + "loss": 1.7542, + "step": 7607 + }, + { + "epoch": 0.4159479518335771, + "grad_norm": 1.6097261905670166, + "learning_rate": 1.3720222828626699e-05, + "loss": 1.3875, + "step": 7608 + }, + { + "epoch": 0.4160026242772986, + "grad_norm": 1.3949724435806274, + "learning_rate": 1.3718526569996441e-05, + "loss": 1.5295, + "step": 7609 + }, + { + "epoch": 0.4160572967210202, + "grad_norm": 1.2109935283660889, + "learning_rate": 1.3716830187195856e-05, + "loss": 1.7293, + "step": 7610 + }, + { + "epoch": 0.41611196916474175, + "grad_norm": 1.6639477014541626, + "learning_rate": 1.3715133680281586e-05, + "loss": 1.5098, + "step": 7611 + }, + { + "epoch": 0.4161666416084633, + "grad_norm": 1.482471227645874, + "learning_rate": 1.3713437049310287e-05, + "loss": 1.371, + "step": 7612 + }, + { + "epoch": 0.4162213140521848, + "grad_norm": 1.7029801607131958, + "learning_rate": 1.3711740294338612e-05, + "loss": 1.4199, + "step": 7613 + }, + { + "epoch": 0.4162759864959064, + "grad_norm": 1.5422240495681763, + "learning_rate": 1.3710043415423218e-05, + "loss": 1.4077, + "step": 7614 + }, + { + "epoch": 0.41633065893962795, + "grad_norm": 1.2909563779830933, + "learning_rate": 1.3708346412620768e-05, + "loss": 1.3661, + "step": 7615 + }, + { + "epoch": 0.4163853313833495, + "grad_norm": 1.2997432947158813, + "learning_rate": 1.3706649285987928e-05, + "loss": 1.3944, + "step": 7616 + }, + { + "epoch": 0.4164400038270711, + "grad_norm": 1.5587468147277832, + "learning_rate": 1.3704952035581371e-05, + "loss": 1.2563, + "step": 7617 + }, + { + "epoch": 0.4164946762707926, + "grad_norm": 1.1984531879425049, + "learning_rate": 1.3703254661457775e-05, + "loss": 1.4439, + "step": 7618 + }, + { + "epoch": 0.41654934871451416, + "grad_norm": 1.583970308303833, + "learning_rate": 1.3701557163673811e-05, + "loss": 1.5928, + "step": 7619 + }, + { + "epoch": 0.4166040211582357, + "grad_norm": 1.4161409139633179, + "learning_rate": 1.3699859542286168e-05, + "loss": 1.212, + "step": 7620 + }, + { + "epoch": 0.4166586936019573, + "grad_norm": 1.6558279991149902, + "learning_rate": 1.3698161797351536e-05, + "loss": 1.6223, + "step": 7621 + }, + { + "epoch": 0.4167133660456788, + "grad_norm": 1.53261399269104, + "learning_rate": 1.3696463928926602e-05, + "loss": 1.3425, + "step": 7622 + }, + { + "epoch": 0.41676803848940036, + "grad_norm": 1.7294985055923462, + "learning_rate": 1.3694765937068063e-05, + "loss": 1.201, + "step": 7623 + }, + { + "epoch": 0.41682271093312195, + "grad_norm": 1.7522950172424316, + "learning_rate": 1.3693067821832622e-05, + "loss": 1.4387, + "step": 7624 + }, + { + "epoch": 0.4168773833768435, + "grad_norm": 1.842948079109192, + "learning_rate": 1.3691369583276977e-05, + "loss": 1.5828, + "step": 7625 + }, + { + "epoch": 0.41693205582056503, + "grad_norm": 1.4680280685424805, + "learning_rate": 1.368967122145784e-05, + "loss": 1.4324, + "step": 7626 + }, + { + "epoch": 0.41698672826428657, + "grad_norm": 1.4223709106445312, + "learning_rate": 1.3687972736431925e-05, + "loss": 1.1912, + "step": 7627 + }, + { + "epoch": 0.41704140070800816, + "grad_norm": 1.4803249835968018, + "learning_rate": 1.3686274128255945e-05, + "loss": 1.2604, + "step": 7628 + }, + { + "epoch": 0.4170960731517297, + "grad_norm": 1.3046908378601074, + "learning_rate": 1.3684575396986622e-05, + "loss": 1.4294, + "step": 7629 + }, + { + "epoch": 0.41715074559545123, + "grad_norm": 1.5161943435668945, + "learning_rate": 1.3682876542680677e-05, + "loss": 1.5755, + "step": 7630 + }, + { + "epoch": 0.4172054180391728, + "grad_norm": 1.3900362253189087, + "learning_rate": 1.3681177565394845e-05, + "loss": 1.3113, + "step": 7631 + }, + { + "epoch": 0.41726009048289436, + "grad_norm": 1.4835196733474731, + "learning_rate": 1.3679478465185856e-05, + "loss": 1.5598, + "step": 7632 + }, + { + "epoch": 0.4173147629266159, + "grad_norm": 1.5209426879882812, + "learning_rate": 1.3677779242110447e-05, + "loss": 1.5495, + "step": 7633 + }, + { + "epoch": 0.41736943537033744, + "grad_norm": 1.6582902669906616, + "learning_rate": 1.3676079896225358e-05, + "loss": 1.4463, + "step": 7634 + }, + { + "epoch": 0.41742410781405903, + "grad_norm": 1.5228989124298096, + "learning_rate": 1.3674380427587337e-05, + "loss": 1.6707, + "step": 7635 + }, + { + "epoch": 0.41747878025778057, + "grad_norm": 1.5979435443878174, + "learning_rate": 1.3672680836253129e-05, + "loss": 1.4209, + "step": 7636 + }, + { + "epoch": 0.4175334527015021, + "grad_norm": 1.569198489189148, + "learning_rate": 1.367098112227949e-05, + "loss": 1.2384, + "step": 7637 + }, + { + "epoch": 0.4175881251452237, + "grad_norm": 1.6276772022247314, + "learning_rate": 1.366928128572318e-05, + "loss": 1.6238, + "step": 7638 + }, + { + "epoch": 0.41764279758894524, + "grad_norm": 1.4117066860198975, + "learning_rate": 1.3667581326640954e-05, + "loss": 1.3879, + "step": 7639 + }, + { + "epoch": 0.4176974700326668, + "grad_norm": 1.768768310546875, + "learning_rate": 1.3665881245089585e-05, + "loss": 1.3571, + "step": 7640 + }, + { + "epoch": 0.4177521424763883, + "grad_norm": 1.2424137592315674, + "learning_rate": 1.3664181041125835e-05, + "loss": 1.5289, + "step": 7641 + }, + { + "epoch": 0.4178068149201099, + "grad_norm": 1.7962415218353271, + "learning_rate": 1.3662480714806483e-05, + "loss": 1.2366, + "step": 7642 + }, + { + "epoch": 0.41786148736383144, + "grad_norm": 1.2768445014953613, + "learning_rate": 1.3660780266188306e-05, + "loss": 1.4883, + "step": 7643 + }, + { + "epoch": 0.417916159807553, + "grad_norm": 1.6641548871994019, + "learning_rate": 1.3659079695328086e-05, + "loss": 1.2476, + "step": 7644 + }, + { + "epoch": 0.41797083225127457, + "grad_norm": 1.7325637340545654, + "learning_rate": 1.365737900228261e-05, + "loss": 1.48, + "step": 7645 + }, + { + "epoch": 0.4180255046949961, + "grad_norm": 1.3741446733474731, + "learning_rate": 1.3655678187108663e-05, + "loss": 1.523, + "step": 7646 + }, + { + "epoch": 0.41808017713871765, + "grad_norm": 1.3754870891571045, + "learning_rate": 1.3653977249863046e-05, + "loss": 1.4296, + "step": 7647 + }, + { + "epoch": 0.41813484958243924, + "grad_norm": 1.654816746711731, + "learning_rate": 1.3652276190602551e-05, + "loss": 1.3213, + "step": 7648 + }, + { + "epoch": 0.4181895220261608, + "grad_norm": 1.5170851945877075, + "learning_rate": 1.3650575009383988e-05, + "loss": 1.5419, + "step": 7649 + }, + { + "epoch": 0.4182441944698823, + "grad_norm": 1.6153942346572876, + "learning_rate": 1.3648873706264159e-05, + "loss": 1.5183, + "step": 7650 + }, + { + "epoch": 0.41829886691360385, + "grad_norm": 1.6790094375610352, + "learning_rate": 1.364717228129987e-05, + "loss": 1.3291, + "step": 7651 + }, + { + "epoch": 0.41835353935732544, + "grad_norm": 1.397254467010498, + "learning_rate": 1.3645470734547946e-05, + "loss": 1.3288, + "step": 7652 + }, + { + "epoch": 0.418408211801047, + "grad_norm": 1.366060495376587, + "learning_rate": 1.3643769066065199e-05, + "loss": 1.5188, + "step": 7653 + }, + { + "epoch": 0.4184628842447685, + "grad_norm": 1.4645907878875732, + "learning_rate": 1.3642067275908449e-05, + "loss": 1.5204, + "step": 7654 + }, + { + "epoch": 0.4185175566884901, + "grad_norm": 1.6605451107025146, + "learning_rate": 1.3640365364134524e-05, + "loss": 1.4592, + "step": 7655 + }, + { + "epoch": 0.41857222913221165, + "grad_norm": 1.4712586402893066, + "learning_rate": 1.3638663330800262e-05, + "loss": 1.4188, + "step": 7656 + }, + { + "epoch": 0.4186269015759332, + "grad_norm": 1.810657262802124, + "learning_rate": 1.363696117596249e-05, + "loss": 1.3483, + "step": 7657 + }, + { + "epoch": 0.4186815740196547, + "grad_norm": 1.3318394422531128, + "learning_rate": 1.3635258899678052e-05, + "loss": 1.1946, + "step": 7658 + }, + { + "epoch": 0.4187362464633763, + "grad_norm": 1.6955841779708862, + "learning_rate": 1.3633556502003789e-05, + "loss": 1.4481, + "step": 7659 + }, + { + "epoch": 0.41879091890709785, + "grad_norm": 1.4663807153701782, + "learning_rate": 1.363185398299655e-05, + "loss": 1.5596, + "step": 7660 + }, + { + "epoch": 0.4188455913508194, + "grad_norm": 1.4199260473251343, + "learning_rate": 1.363015134271318e-05, + "loss": 1.5238, + "step": 7661 + }, + { + "epoch": 0.418900263794541, + "grad_norm": 1.9792965650558472, + "learning_rate": 1.3628448581210538e-05, + "loss": 1.194, + "step": 7662 + }, + { + "epoch": 0.4189549362382625, + "grad_norm": 1.349974513053894, + "learning_rate": 1.3626745698545487e-05, + "loss": 1.4732, + "step": 7663 + }, + { + "epoch": 0.41900960868198406, + "grad_norm": 1.508671522140503, + "learning_rate": 1.3625042694774886e-05, + "loss": 1.2933, + "step": 7664 + }, + { + "epoch": 0.4190642811257056, + "grad_norm": 1.3891849517822266, + "learning_rate": 1.3623339569955603e-05, + "loss": 1.5022, + "step": 7665 + }, + { + "epoch": 0.4191189535694272, + "grad_norm": 1.4502830505371094, + "learning_rate": 1.362163632414451e-05, + "loss": 1.3769, + "step": 7666 + }, + { + "epoch": 0.4191736260131487, + "grad_norm": 1.562842845916748, + "learning_rate": 1.3619932957398478e-05, + "loss": 1.545, + "step": 7667 + }, + { + "epoch": 0.41922829845687026, + "grad_norm": 1.9417941570281982, + "learning_rate": 1.3618229469774392e-05, + "loss": 1.2212, + "step": 7668 + }, + { + "epoch": 0.41928297090059186, + "grad_norm": 1.5418651103973389, + "learning_rate": 1.3616525861329133e-05, + "loss": 1.6711, + "step": 7669 + }, + { + "epoch": 0.4193376433443134, + "grad_norm": 1.637630581855774, + "learning_rate": 1.361482213211959e-05, + "loss": 1.5877, + "step": 7670 + }, + { + "epoch": 0.41939231578803493, + "grad_norm": 1.4662328958511353, + "learning_rate": 1.3613118282202653e-05, + "loss": 1.3873, + "step": 7671 + }, + { + "epoch": 0.41944698823175647, + "grad_norm": 1.4155402183532715, + "learning_rate": 1.3611414311635219e-05, + "loss": 1.6328, + "step": 7672 + }, + { + "epoch": 0.41950166067547806, + "grad_norm": 1.5170425176620483, + "learning_rate": 1.3609710220474187e-05, + "loss": 1.5963, + "step": 7673 + }, + { + "epoch": 0.4195563331191996, + "grad_norm": 1.7948139905929565, + "learning_rate": 1.360800600877646e-05, + "loss": 1.4936, + "step": 7674 + }, + { + "epoch": 0.41961100556292114, + "grad_norm": 1.5345629453659058, + "learning_rate": 1.3606301676598942e-05, + "loss": 1.4824, + "step": 7675 + }, + { + "epoch": 0.41966567800664273, + "grad_norm": 1.76844322681427, + "learning_rate": 1.3604597223998553e-05, + "loss": 1.2633, + "step": 7676 + }, + { + "epoch": 0.41972035045036427, + "grad_norm": 1.754332184791565, + "learning_rate": 1.3602892651032205e-05, + "loss": 1.5677, + "step": 7677 + }, + { + "epoch": 0.4197750228940858, + "grad_norm": 1.6031179428100586, + "learning_rate": 1.3601187957756814e-05, + "loss": 1.5602, + "step": 7678 + }, + { + "epoch": 0.41982969533780734, + "grad_norm": 1.6221067905426025, + "learning_rate": 1.3599483144229309e-05, + "loss": 1.6259, + "step": 7679 + }, + { + "epoch": 0.41988436778152893, + "grad_norm": 1.5787702798843384, + "learning_rate": 1.3597778210506615e-05, + "loss": 1.4709, + "step": 7680 + }, + { + "epoch": 0.41993904022525047, + "grad_norm": 1.568062663078308, + "learning_rate": 1.3596073156645662e-05, + "loss": 1.7158, + "step": 7681 + }, + { + "epoch": 0.419993712668972, + "grad_norm": 1.529668927192688, + "learning_rate": 1.359436798270339e-05, + "loss": 1.4884, + "step": 7682 + }, + { + "epoch": 0.4200483851126936, + "grad_norm": 1.4930959939956665, + "learning_rate": 1.3592662688736734e-05, + "loss": 1.5148, + "step": 7683 + }, + { + "epoch": 0.42010305755641514, + "grad_norm": 1.2962262630462646, + "learning_rate": 1.3590957274802641e-05, + "loss": 1.8139, + "step": 7684 + }, + { + "epoch": 0.4201577300001367, + "grad_norm": 1.7854068279266357, + "learning_rate": 1.358925174095806e-05, + "loss": 1.4502, + "step": 7685 + }, + { + "epoch": 0.4202124024438582, + "grad_norm": 1.5266046524047852, + "learning_rate": 1.3587546087259939e-05, + "loss": 1.4172, + "step": 7686 + }, + { + "epoch": 0.4202670748875798, + "grad_norm": 1.328974723815918, + "learning_rate": 1.358584031376524e-05, + "loss": 1.2612, + "step": 7687 + }, + { + "epoch": 0.42032174733130134, + "grad_norm": 1.5396184921264648, + "learning_rate": 1.3584134420530915e-05, + "loss": 1.5295, + "step": 7688 + }, + { + "epoch": 0.4203764197750229, + "grad_norm": 1.3701101541519165, + "learning_rate": 1.358242840761393e-05, + "loss": 1.4631, + "step": 7689 + }, + { + "epoch": 0.4204310922187445, + "grad_norm": 1.5049082040786743, + "learning_rate": 1.3580722275071255e-05, + "loss": 1.4538, + "step": 7690 + }, + { + "epoch": 0.420485764662466, + "grad_norm": 1.6873739957809448, + "learning_rate": 1.3579016022959862e-05, + "loss": 1.4602, + "step": 7691 + }, + { + "epoch": 0.42054043710618755, + "grad_norm": 1.8476881980895996, + "learning_rate": 1.3577309651336728e-05, + "loss": 1.498, + "step": 7692 + }, + { + "epoch": 0.4205951095499091, + "grad_norm": 1.9455775022506714, + "learning_rate": 1.3575603160258824e-05, + "loss": 1.264, + "step": 7693 + }, + { + "epoch": 0.4206497819936307, + "grad_norm": 1.5185649394989014, + "learning_rate": 1.3573896549783146e-05, + "loss": 1.4067, + "step": 7694 + }, + { + "epoch": 0.4207044544373522, + "grad_norm": 1.5234941244125366, + "learning_rate": 1.3572189819966672e-05, + "loss": 1.4697, + "step": 7695 + }, + { + "epoch": 0.42075912688107375, + "grad_norm": 1.6559603214263916, + "learning_rate": 1.3570482970866397e-05, + "loss": 1.5201, + "step": 7696 + }, + { + "epoch": 0.42081379932479535, + "grad_norm": 1.5911461114883423, + "learning_rate": 1.3568776002539319e-05, + "loss": 1.5371, + "step": 7697 + }, + { + "epoch": 0.4208684717685169, + "grad_norm": 1.4509857892990112, + "learning_rate": 1.3567068915042436e-05, + "loss": 1.5497, + "step": 7698 + }, + { + "epoch": 0.4209231442122384, + "grad_norm": 1.5501941442489624, + "learning_rate": 1.3565361708432754e-05, + "loss": 1.5009, + "step": 7699 + }, + { + "epoch": 0.42097781665595996, + "grad_norm": 1.6716638803482056, + "learning_rate": 1.3563654382767273e-05, + "loss": 1.3803, + "step": 7700 + }, + { + "epoch": 0.42103248909968155, + "grad_norm": 1.4233635663986206, + "learning_rate": 1.3561946938103015e-05, + "loss": 1.373, + "step": 7701 + }, + { + "epoch": 0.4210871615434031, + "grad_norm": 1.6355291604995728, + "learning_rate": 1.3560239374496986e-05, + "loss": 1.215, + "step": 7702 + }, + { + "epoch": 0.4211418339871246, + "grad_norm": 1.3501108884811401, + "learning_rate": 1.355853169200621e-05, + "loss": 1.4738, + "step": 7703 + }, + { + "epoch": 0.4211965064308462, + "grad_norm": 1.3617550134658813, + "learning_rate": 1.3556823890687714e-05, + "loss": 1.4247, + "step": 7704 + }, + { + "epoch": 0.42125117887456776, + "grad_norm": 2.9064433574676514, + "learning_rate": 1.355511597059852e-05, + "loss": 1.4634, + "step": 7705 + }, + { + "epoch": 0.4213058513182893, + "grad_norm": 1.64652681350708, + "learning_rate": 1.3553407931795662e-05, + "loss": 1.5149, + "step": 7706 + }, + { + "epoch": 0.42136052376201083, + "grad_norm": 1.5946677923202515, + "learning_rate": 1.3551699774336173e-05, + "loss": 1.3129, + "step": 7707 + }, + { + "epoch": 0.4214151962057324, + "grad_norm": 1.8099640607833862, + "learning_rate": 1.3549991498277095e-05, + "loss": 1.3787, + "step": 7708 + }, + { + "epoch": 0.42146986864945396, + "grad_norm": 1.6564881801605225, + "learning_rate": 1.354828310367547e-05, + "loss": 1.388, + "step": 7709 + }, + { + "epoch": 0.4215245410931755, + "grad_norm": 1.466230034828186, + "learning_rate": 1.3546574590588346e-05, + "loss": 1.0565, + "step": 7710 + }, + { + "epoch": 0.4215792135368971, + "grad_norm": 1.6076990365982056, + "learning_rate": 1.3544865959072777e-05, + "loss": 1.3303, + "step": 7711 + }, + { + "epoch": 0.42163388598061863, + "grad_norm": 1.3790860176086426, + "learning_rate": 1.3543157209185813e-05, + "loss": 1.8783, + "step": 7712 + }, + { + "epoch": 0.42168855842434017, + "grad_norm": 1.5010689496994019, + "learning_rate": 1.3541448340984516e-05, + "loss": 1.5989, + "step": 7713 + }, + { + "epoch": 0.4217432308680617, + "grad_norm": 1.524266004562378, + "learning_rate": 1.3539739354525947e-05, + "loss": 1.6086, + "step": 7714 + }, + { + "epoch": 0.4217979033117833, + "grad_norm": 1.5035170316696167, + "learning_rate": 1.3538030249867178e-05, + "loss": 1.6192, + "step": 7715 + }, + { + "epoch": 0.42185257575550483, + "grad_norm": 2.1801984310150146, + "learning_rate": 1.3536321027065273e-05, + "loss": 1.1772, + "step": 7716 + }, + { + "epoch": 0.42190724819922637, + "grad_norm": 1.7527823448181152, + "learning_rate": 1.3534611686177312e-05, + "loss": 1.3761, + "step": 7717 + }, + { + "epoch": 0.42196192064294796, + "grad_norm": 2.164790391921997, + "learning_rate": 1.3532902227260374e-05, + "loss": 1.5626, + "step": 7718 + }, + { + "epoch": 0.4220165930866695, + "grad_norm": 1.36393404006958, + "learning_rate": 1.3531192650371541e-05, + "loss": 1.4352, + "step": 7719 + }, + { + "epoch": 0.42207126553039104, + "grad_norm": 1.6353707313537598, + "learning_rate": 1.3529482955567896e-05, + "loss": 1.5692, + "step": 7720 + }, + { + "epoch": 0.4221259379741126, + "grad_norm": 1.5427931547164917, + "learning_rate": 1.3527773142906532e-05, + "loss": 1.3373, + "step": 7721 + }, + { + "epoch": 0.42218061041783417, + "grad_norm": 1.1936256885528564, + "learning_rate": 1.3526063212444552e-05, + "loss": 1.847, + "step": 7722 + }, + { + "epoch": 0.4222352828615557, + "grad_norm": 1.5018484592437744, + "learning_rate": 1.352435316423904e-05, + "loss": 1.4505, + "step": 7723 + }, + { + "epoch": 0.42228995530527724, + "grad_norm": 1.4474895000457764, + "learning_rate": 1.352264299834711e-05, + "loss": 1.4362, + "step": 7724 + }, + { + "epoch": 0.42234462774899884, + "grad_norm": 1.2501789331436157, + "learning_rate": 1.3520932714825863e-05, + "loss": 1.365, + "step": 7725 + }, + { + "epoch": 0.4223993001927204, + "grad_norm": 1.3962013721466064, + "learning_rate": 1.3519222313732407e-05, + "loss": 1.5685, + "step": 7726 + }, + { + "epoch": 0.4224539726364419, + "grad_norm": 1.8243244886398315, + "learning_rate": 1.3517511795123864e-05, + "loss": 1.408, + "step": 7727 + }, + { + "epoch": 0.42250864508016345, + "grad_norm": 1.3011423349380493, + "learning_rate": 1.3515801159057344e-05, + "loss": 1.1653, + "step": 7728 + }, + { + "epoch": 0.42256331752388504, + "grad_norm": 1.362151861190796, + "learning_rate": 1.3514090405589978e-05, + "loss": 1.4768, + "step": 7729 + }, + { + "epoch": 0.4226179899676066, + "grad_norm": 2.038435220718384, + "learning_rate": 1.3512379534778883e-05, + "loss": 1.3324, + "step": 7730 + }, + { + "epoch": 0.4226726624113281, + "grad_norm": 1.4698997735977173, + "learning_rate": 1.3510668546681198e-05, + "loss": 1.2549, + "step": 7731 + }, + { + "epoch": 0.4227273348550497, + "grad_norm": 1.3935097455978394, + "learning_rate": 1.3508957441354049e-05, + "loss": 1.5678, + "step": 7732 + }, + { + "epoch": 0.42278200729877125, + "grad_norm": 1.3678934574127197, + "learning_rate": 1.3507246218854576e-05, + "loss": 1.3438, + "step": 7733 + }, + { + "epoch": 0.4228366797424928, + "grad_norm": 1.188361406326294, + "learning_rate": 1.3505534879239923e-05, + "loss": 1.4862, + "step": 7734 + }, + { + "epoch": 0.4228913521862143, + "grad_norm": 1.696642279624939, + "learning_rate": 1.3503823422567235e-05, + "loss": 1.6417, + "step": 7735 + }, + { + "epoch": 0.4229460246299359, + "grad_norm": 1.4140297174453735, + "learning_rate": 1.3502111848893663e-05, + "loss": 1.4725, + "step": 7736 + }, + { + "epoch": 0.42300069707365745, + "grad_norm": 1.3281774520874023, + "learning_rate": 1.3500400158276352e-05, + "loss": 1.5129, + "step": 7737 + }, + { + "epoch": 0.423055369517379, + "grad_norm": 1.6422944068908691, + "learning_rate": 1.3498688350772473e-05, + "loss": 1.4513, + "step": 7738 + }, + { + "epoch": 0.4231100419611006, + "grad_norm": 1.3021364212036133, + "learning_rate": 1.3496976426439177e-05, + "loss": 1.409, + "step": 7739 + }, + { + "epoch": 0.4231647144048221, + "grad_norm": 1.8651705980300903, + "learning_rate": 1.349526438533363e-05, + "loss": 1.6373, + "step": 7740 + }, + { + "epoch": 0.42321938684854365, + "grad_norm": 1.2800102233886719, + "learning_rate": 1.3493552227513007e-05, + "loss": 1.5064, + "step": 7741 + }, + { + "epoch": 0.4232740592922652, + "grad_norm": 1.7319802045822144, + "learning_rate": 1.3491839953034474e-05, + "loss": 1.4429, + "step": 7742 + }, + { + "epoch": 0.4233287317359868, + "grad_norm": 1.3636600971221924, + "learning_rate": 1.3490127561955214e-05, + "loss": 1.432, + "step": 7743 + }, + { + "epoch": 0.4233834041797083, + "grad_norm": 1.6288609504699707, + "learning_rate": 1.3488415054332404e-05, + "loss": 1.3175, + "step": 7744 + }, + { + "epoch": 0.42343807662342986, + "grad_norm": 1.4227442741394043, + "learning_rate": 1.348670243022323e-05, + "loss": 1.5478, + "step": 7745 + }, + { + "epoch": 0.42349274906715145, + "grad_norm": 1.7590513229370117, + "learning_rate": 1.348498968968488e-05, + "loss": 1.2598, + "step": 7746 + }, + { + "epoch": 0.423547421510873, + "grad_norm": 1.327715516090393, + "learning_rate": 1.3483276832774543e-05, + "loss": 1.5161, + "step": 7747 + }, + { + "epoch": 0.4236020939545945, + "grad_norm": 1.490089774131775, + "learning_rate": 1.348156385954942e-05, + "loss": 1.5682, + "step": 7748 + }, + { + "epoch": 0.42365676639831606, + "grad_norm": 1.9091943502426147, + "learning_rate": 1.3479850770066712e-05, + "loss": 1.4584, + "step": 7749 + }, + { + "epoch": 0.42371143884203766, + "grad_norm": 1.3341525793075562, + "learning_rate": 1.3478137564383621e-05, + "loss": 1.3992, + "step": 7750 + }, + { + "epoch": 0.4237661112857592, + "grad_norm": 1.4186136722564697, + "learning_rate": 1.3476424242557355e-05, + "loss": 1.335, + "step": 7751 + }, + { + "epoch": 0.42382078372948073, + "grad_norm": 1.4004210233688354, + "learning_rate": 1.3474710804645125e-05, + "loss": 1.2832, + "step": 7752 + }, + { + "epoch": 0.4238754561732023, + "grad_norm": 1.7653446197509766, + "learning_rate": 1.3472997250704149e-05, + "loss": 1.3858, + "step": 7753 + }, + { + "epoch": 0.42393012861692386, + "grad_norm": 1.5653489828109741, + "learning_rate": 1.3471283580791643e-05, + "loss": 1.418, + "step": 7754 + }, + { + "epoch": 0.4239848010606454, + "grad_norm": 1.4213231801986694, + "learning_rate": 1.3469569794964832e-05, + "loss": 1.3494, + "step": 7755 + }, + { + "epoch": 0.42403947350436694, + "grad_norm": 1.821759581565857, + "learning_rate": 1.3467855893280945e-05, + "loss": 1.2769, + "step": 7756 + }, + { + "epoch": 0.42409414594808853, + "grad_norm": 1.5187175273895264, + "learning_rate": 1.3466141875797214e-05, + "loss": 1.5211, + "step": 7757 + }, + { + "epoch": 0.42414881839181007, + "grad_norm": 1.499180555343628, + "learning_rate": 1.346442774257087e-05, + "loss": 1.524, + "step": 7758 + }, + { + "epoch": 0.4242034908355316, + "grad_norm": 1.426738977432251, + "learning_rate": 1.3462713493659156e-05, + "loss": 1.3975, + "step": 7759 + }, + { + "epoch": 0.4242581632792532, + "grad_norm": 1.433397889137268, + "learning_rate": 1.3460999129119315e-05, + "loss": 1.4195, + "step": 7760 + }, + { + "epoch": 0.42431283572297473, + "grad_norm": 1.788912296295166, + "learning_rate": 1.3459284649008585e-05, + "loss": 1.3804, + "step": 7761 + }, + { + "epoch": 0.4243675081666963, + "grad_norm": 2.4585986137390137, + "learning_rate": 1.3457570053384225e-05, + "loss": 1.6917, + "step": 7762 + }, + { + "epoch": 0.4244221806104178, + "grad_norm": 1.4192057847976685, + "learning_rate": 1.3455855342303491e-05, + "loss": 1.5375, + "step": 7763 + }, + { + "epoch": 0.4244768530541394, + "grad_norm": 1.361799955368042, + "learning_rate": 1.3454140515823637e-05, + "loss": 1.689, + "step": 7764 + }, + { + "epoch": 0.42453152549786094, + "grad_norm": 1.6133207082748413, + "learning_rate": 1.3452425574001926e-05, + "loss": 1.3717, + "step": 7765 + }, + { + "epoch": 0.4245861979415825, + "grad_norm": 1.5181902647018433, + "learning_rate": 1.3450710516895619e-05, + "loss": 1.4852, + "step": 7766 + }, + { + "epoch": 0.42464087038530407, + "grad_norm": 2.138831853866577, + "learning_rate": 1.3448995344561997e-05, + "loss": 1.2611, + "step": 7767 + }, + { + "epoch": 0.4246955428290256, + "grad_norm": 1.409912347793579, + "learning_rate": 1.3447280057058322e-05, + "loss": 1.6021, + "step": 7768 + }, + { + "epoch": 0.42475021527274714, + "grad_norm": 1.7315073013305664, + "learning_rate": 1.3445564654441879e-05, + "loss": 1.3588, + "step": 7769 + }, + { + "epoch": 0.4248048877164687, + "grad_norm": 1.7077502012252808, + "learning_rate": 1.3443849136769946e-05, + "loss": 1.5128, + "step": 7770 + }, + { + "epoch": 0.4248595601601903, + "grad_norm": 1.3554826974868774, + "learning_rate": 1.3442133504099812e-05, + "loss": 1.3306, + "step": 7771 + }, + { + "epoch": 0.4249142326039118, + "grad_norm": 1.4165380001068115, + "learning_rate": 1.344041775648876e-05, + "loss": 1.3929, + "step": 7772 + }, + { + "epoch": 0.42496890504763335, + "grad_norm": 1.6670682430267334, + "learning_rate": 1.3438701893994087e-05, + "loss": 1.2666, + "step": 7773 + }, + { + "epoch": 0.42502357749135494, + "grad_norm": 1.559424638748169, + "learning_rate": 1.3436985916673088e-05, + "loss": 1.5971, + "step": 7774 + }, + { + "epoch": 0.4250782499350765, + "grad_norm": 1.586517333984375, + "learning_rate": 1.3435269824583064e-05, + "loss": 1.6297, + "step": 7775 + }, + { + "epoch": 0.425132922378798, + "grad_norm": 1.667402744293213, + "learning_rate": 1.3433553617781318e-05, + "loss": 1.5389, + "step": 7776 + }, + { + "epoch": 0.42518759482251955, + "grad_norm": 1.6586453914642334, + "learning_rate": 1.3431837296325163e-05, + "loss": 1.4057, + "step": 7777 + }, + { + "epoch": 0.42524226726624115, + "grad_norm": 1.582279086112976, + "learning_rate": 1.3430120860271906e-05, + "loss": 1.5154, + "step": 7778 + }, + { + "epoch": 0.4252969397099627, + "grad_norm": 1.3798879384994507, + "learning_rate": 1.3428404309678863e-05, + "loss": 1.3538, + "step": 7779 + }, + { + "epoch": 0.4253516121536842, + "grad_norm": 1.7156484127044678, + "learning_rate": 1.3426687644603358e-05, + "loss": 1.1843, + "step": 7780 + }, + { + "epoch": 0.4254062845974058, + "grad_norm": 1.3946119546890259, + "learning_rate": 1.3424970865102709e-05, + "loss": 1.6858, + "step": 7781 + }, + { + "epoch": 0.42546095704112735, + "grad_norm": 1.5946508646011353, + "learning_rate": 1.3423253971234248e-05, + "loss": 1.5099, + "step": 7782 + }, + { + "epoch": 0.4255156294848489, + "grad_norm": 1.5352925062179565, + "learning_rate": 1.3421536963055304e-05, + "loss": 1.4784, + "step": 7783 + }, + { + "epoch": 0.4255703019285704, + "grad_norm": 2.0241215229034424, + "learning_rate": 1.341981984062321e-05, + "loss": 1.429, + "step": 7784 + }, + { + "epoch": 0.425624974372292, + "grad_norm": 1.6758097410202026, + "learning_rate": 1.3418102603995307e-05, + "loss": 1.4332, + "step": 7785 + }, + { + "epoch": 0.42567964681601356, + "grad_norm": 1.4475994110107422, + "learning_rate": 1.341638525322894e-05, + "loss": 1.2287, + "step": 7786 + }, + { + "epoch": 0.4257343192597351, + "grad_norm": 1.6120599508285522, + "learning_rate": 1.3414667788381449e-05, + "loss": 1.343, + "step": 7787 + }, + { + "epoch": 0.4257889917034567, + "grad_norm": 1.486696720123291, + "learning_rate": 1.341295020951019e-05, + "loss": 1.392, + "step": 7788 + }, + { + "epoch": 0.4258436641471782, + "grad_norm": 1.000968337059021, + "learning_rate": 1.3411232516672512e-05, + "loss": 1.7921, + "step": 7789 + }, + { + "epoch": 0.42589833659089976, + "grad_norm": 1.618175745010376, + "learning_rate": 1.3409514709925777e-05, + "loss": 1.2477, + "step": 7790 + }, + { + "epoch": 0.4259530090346213, + "grad_norm": 1.230542778968811, + "learning_rate": 1.3407796789327345e-05, + "loss": 1.4294, + "step": 7791 + }, + { + "epoch": 0.4260076814783429, + "grad_norm": 1.6673916578292847, + "learning_rate": 1.3406078754934584e-05, + "loss": 1.5013, + "step": 7792 + }, + { + "epoch": 0.42606235392206443, + "grad_norm": 1.467341423034668, + "learning_rate": 1.3404360606804858e-05, + "loss": 1.4225, + "step": 7793 + }, + { + "epoch": 0.42611702636578597, + "grad_norm": 1.6801996231079102, + "learning_rate": 1.3402642344995543e-05, + "loss": 1.6228, + "step": 7794 + }, + { + "epoch": 0.42617169880950756, + "grad_norm": 1.506030797958374, + "learning_rate": 1.3400923969564017e-05, + "loss": 1.4858, + "step": 7795 + }, + { + "epoch": 0.4262263712532291, + "grad_norm": 1.3548920154571533, + "learning_rate": 1.3399205480567659e-05, + "loss": 1.6357, + "step": 7796 + }, + { + "epoch": 0.42628104369695063, + "grad_norm": 1.6588116884231567, + "learning_rate": 1.3397486878063852e-05, + "loss": 1.2342, + "step": 7797 + }, + { + "epoch": 0.42633571614067217, + "grad_norm": 1.4579850435256958, + "learning_rate": 1.3395768162109986e-05, + "loss": 1.4852, + "step": 7798 + }, + { + "epoch": 0.42639038858439376, + "grad_norm": 1.4615098237991333, + "learning_rate": 1.3394049332763454e-05, + "loss": 1.2727, + "step": 7799 + }, + { + "epoch": 0.4264450610281153, + "grad_norm": 1.6817386150360107, + "learning_rate": 1.339233039008165e-05, + "loss": 1.5075, + "step": 7800 + }, + { + "epoch": 0.42649973347183684, + "grad_norm": 1.3639174699783325, + "learning_rate": 1.339061133412197e-05, + "loss": 1.3404, + "step": 7801 + }, + { + "epoch": 0.42655440591555843, + "grad_norm": 1.7088969945907593, + "learning_rate": 1.3388892164941828e-05, + "loss": 1.5034, + "step": 7802 + }, + { + "epoch": 0.42660907835927997, + "grad_norm": 2.1709702014923096, + "learning_rate": 1.3387172882598622e-05, + "loss": 1.3218, + "step": 7803 + }, + { + "epoch": 0.4266637508030015, + "grad_norm": 1.439139723777771, + "learning_rate": 1.3385453487149765e-05, + "loss": 1.4827, + "step": 7804 + }, + { + "epoch": 0.42671842324672304, + "grad_norm": 1.5050016641616821, + "learning_rate": 1.3383733978652669e-05, + "loss": 1.1792, + "step": 7805 + }, + { + "epoch": 0.42677309569044464, + "grad_norm": 1.8018975257873535, + "learning_rate": 1.3382014357164756e-05, + "loss": 1.4458, + "step": 7806 + }, + { + "epoch": 0.4268277681341662, + "grad_norm": 1.5233021974563599, + "learning_rate": 1.338029462274345e-05, + "loss": 1.2965, + "step": 7807 + }, + { + "epoch": 0.4268824405778877, + "grad_norm": 1.400643229484558, + "learning_rate": 1.3378574775446171e-05, + "loss": 1.5528, + "step": 7808 + }, + { + "epoch": 0.4269371130216093, + "grad_norm": 1.3394159078598022, + "learning_rate": 1.3376854815330357e-05, + "loss": 1.478, + "step": 7809 + }, + { + "epoch": 0.42699178546533084, + "grad_norm": 1.6162784099578857, + "learning_rate": 1.3375134742453435e-05, + "loss": 1.5219, + "step": 7810 + }, + { + "epoch": 0.4270464579090524, + "grad_norm": 1.4425015449523926, + "learning_rate": 1.3373414556872844e-05, + "loss": 1.2555, + "step": 7811 + }, + { + "epoch": 0.4271011303527739, + "grad_norm": 1.4880127906799316, + "learning_rate": 1.3371694258646021e-05, + "loss": 1.5695, + "step": 7812 + }, + { + "epoch": 0.4271558027964955, + "grad_norm": 1.789220929145813, + "learning_rate": 1.336997384783042e-05, + "loss": 1.4226, + "step": 7813 + }, + { + "epoch": 0.42721047524021705, + "grad_norm": 1.6033176183700562, + "learning_rate": 1.336825332448348e-05, + "loss": 1.5529, + "step": 7814 + }, + { + "epoch": 0.4272651476839386, + "grad_norm": 1.2898118495941162, + "learning_rate": 1.336653268866266e-05, + "loss": 1.5013, + "step": 7815 + }, + { + "epoch": 0.4273198201276602, + "grad_norm": 1.1804438829421997, + "learning_rate": 1.3364811940425417e-05, + "loss": 1.5195, + "step": 7816 + }, + { + "epoch": 0.4273744925713817, + "grad_norm": 1.5056360960006714, + "learning_rate": 1.3363091079829202e-05, + "loss": 1.4793, + "step": 7817 + }, + { + "epoch": 0.42742916501510325, + "grad_norm": 1.532439112663269, + "learning_rate": 1.3361370106931486e-05, + "loss": 1.6317, + "step": 7818 + }, + { + "epoch": 0.4274838374588248, + "grad_norm": 1.380676031112671, + "learning_rate": 1.3359649021789734e-05, + "loss": 1.4156, + "step": 7819 + }, + { + "epoch": 0.4275385099025464, + "grad_norm": 1.7958714962005615, + "learning_rate": 1.3357927824461418e-05, + "loss": 1.4485, + "step": 7820 + }, + { + "epoch": 0.4275931823462679, + "grad_norm": 1.4526335000991821, + "learning_rate": 1.3356206515004013e-05, + "loss": 1.3021, + "step": 7821 + }, + { + "epoch": 0.42764785478998946, + "grad_norm": 1.3292075395584106, + "learning_rate": 1.3354485093474998e-05, + "loss": 1.5267, + "step": 7822 + }, + { + "epoch": 0.42770252723371105, + "grad_norm": 1.4445240497589111, + "learning_rate": 1.3352763559931852e-05, + "loss": 1.5097, + "step": 7823 + }, + { + "epoch": 0.4277571996774326, + "grad_norm": 1.6964226961135864, + "learning_rate": 1.3351041914432064e-05, + "loss": 1.4259, + "step": 7824 + }, + { + "epoch": 0.4278118721211541, + "grad_norm": 1.5118178129196167, + "learning_rate": 1.3349320157033121e-05, + "loss": 1.3949, + "step": 7825 + }, + { + "epoch": 0.42786654456487566, + "grad_norm": 1.7519996166229248, + "learning_rate": 1.334759828779252e-05, + "loss": 1.311, + "step": 7826 + }, + { + "epoch": 0.42792121700859725, + "grad_norm": 1.5808993577957153, + "learning_rate": 1.3345876306767757e-05, + "loss": 1.6779, + "step": 7827 + }, + { + "epoch": 0.4279758894523188, + "grad_norm": 1.558193325996399, + "learning_rate": 1.3344154214016331e-05, + "loss": 1.5447, + "step": 7828 + }, + { + "epoch": 0.42803056189604033, + "grad_norm": 2.027606964111328, + "learning_rate": 1.3342432009595754e-05, + "loss": 1.3526, + "step": 7829 + }, + { + "epoch": 0.4280852343397619, + "grad_norm": 1.7794582843780518, + "learning_rate": 1.3340709693563525e-05, + "loss": 1.2657, + "step": 7830 + }, + { + "epoch": 0.42813990678348346, + "grad_norm": 1.261723279953003, + "learning_rate": 1.333898726597716e-05, + "loss": 1.3616, + "step": 7831 + }, + { + "epoch": 0.428194579227205, + "grad_norm": 1.6200226545333862, + "learning_rate": 1.3337264726894175e-05, + "loss": 1.488, + "step": 7832 + }, + { + "epoch": 0.42824925167092653, + "grad_norm": 1.379638433456421, + "learning_rate": 1.3335542076372088e-05, + "loss": 1.3191, + "step": 7833 + }, + { + "epoch": 0.4283039241146481, + "grad_norm": 1.1606504917144775, + "learning_rate": 1.3333819314468428e-05, + "loss": 1.4206, + "step": 7834 + }, + { + "epoch": 0.42835859655836966, + "grad_norm": 1.9121702909469604, + "learning_rate": 1.3332096441240716e-05, + "loss": 1.1963, + "step": 7835 + }, + { + "epoch": 0.4284132690020912, + "grad_norm": 1.7232671976089478, + "learning_rate": 1.3330373456746486e-05, + "loss": 1.4914, + "step": 7836 + }, + { + "epoch": 0.4284679414458128, + "grad_norm": 1.6109615564346313, + "learning_rate": 1.3328650361043269e-05, + "loss": 1.3543, + "step": 7837 + }, + { + "epoch": 0.42852261388953433, + "grad_norm": 1.9411377906799316, + "learning_rate": 1.3326927154188607e-05, + "loss": 1.2876, + "step": 7838 + }, + { + "epoch": 0.42857728633325587, + "grad_norm": 1.9072054624557495, + "learning_rate": 1.3325203836240039e-05, + "loss": 1.4429, + "step": 7839 + }, + { + "epoch": 0.4286319587769774, + "grad_norm": 1.85276460647583, + "learning_rate": 1.3323480407255112e-05, + "loss": 1.3582, + "step": 7840 + }, + { + "epoch": 0.428686631220699, + "grad_norm": 1.6716045141220093, + "learning_rate": 1.3321756867291378e-05, + "loss": 1.3979, + "step": 7841 + }, + { + "epoch": 0.42874130366442054, + "grad_norm": 1.3833388090133667, + "learning_rate": 1.3320033216406388e-05, + "loss": 1.4971, + "step": 7842 + }, + { + "epoch": 0.4287959761081421, + "grad_norm": 1.5063523054122925, + "learning_rate": 1.3318309454657695e-05, + "loss": 1.278, + "step": 7843 + }, + { + "epoch": 0.42885064855186367, + "grad_norm": 1.9712659120559692, + "learning_rate": 1.3316585582102865e-05, + "loss": 1.4919, + "step": 7844 + }, + { + "epoch": 0.4289053209955852, + "grad_norm": 1.8323692083358765, + "learning_rate": 1.3314861598799458e-05, + "loss": 1.5346, + "step": 7845 + }, + { + "epoch": 0.42895999343930674, + "grad_norm": 1.9540200233459473, + "learning_rate": 1.3313137504805042e-05, + "loss": 1.213, + "step": 7846 + }, + { + "epoch": 0.42901466588302833, + "grad_norm": 1.527015209197998, + "learning_rate": 1.3311413300177192e-05, + "loss": 1.3875, + "step": 7847 + }, + { + "epoch": 0.42906933832674987, + "grad_norm": 1.4213956594467163, + "learning_rate": 1.3309688984973484e-05, + "loss": 1.7153, + "step": 7848 + }, + { + "epoch": 0.4291240107704714, + "grad_norm": 1.731685757637024, + "learning_rate": 1.3307964559251494e-05, + "loss": 1.5119, + "step": 7849 + }, + { + "epoch": 0.42917868321419295, + "grad_norm": 1.4444525241851807, + "learning_rate": 1.3306240023068801e-05, + "loss": 1.4455, + "step": 7850 + }, + { + "epoch": 0.42923335565791454, + "grad_norm": 1.6632239818572998, + "learning_rate": 1.3304515376482998e-05, + "loss": 1.391, + "step": 7851 + }, + { + "epoch": 0.4292880281016361, + "grad_norm": 1.581851601600647, + "learning_rate": 1.3302790619551673e-05, + "loss": 1.0815, + "step": 7852 + }, + { + "epoch": 0.4293427005453576, + "grad_norm": 1.4843814373016357, + "learning_rate": 1.3301065752332415e-05, + "loss": 1.3958, + "step": 7853 + }, + { + "epoch": 0.4293973729890792, + "grad_norm": 1.3833755254745483, + "learning_rate": 1.3299340774882833e-05, + "loss": 1.332, + "step": 7854 + }, + { + "epoch": 0.42945204543280074, + "grad_norm": 1.581243634223938, + "learning_rate": 1.3297615687260515e-05, + "loss": 1.2888, + "step": 7855 + }, + { + "epoch": 0.4295067178765223, + "grad_norm": 1.5387985706329346, + "learning_rate": 1.3295890489523071e-05, + "loss": 1.4141, + "step": 7856 + }, + { + "epoch": 0.4295613903202438, + "grad_norm": 1.447860598564148, + "learning_rate": 1.329416518172811e-05, + "loss": 1.4778, + "step": 7857 + }, + { + "epoch": 0.4296160627639654, + "grad_norm": 1.927183985710144, + "learning_rate": 1.3292439763933245e-05, + "loss": 1.6549, + "step": 7858 + }, + { + "epoch": 0.42967073520768695, + "grad_norm": 1.1806904077529907, + "learning_rate": 1.3290714236196087e-05, + "loss": 1.4701, + "step": 7859 + }, + { + "epoch": 0.4297254076514085, + "grad_norm": 1.588576316833496, + "learning_rate": 1.328898859857426e-05, + "loss": 1.3884, + "step": 7860 + }, + { + "epoch": 0.4297800800951301, + "grad_norm": 2.0369861125946045, + "learning_rate": 1.3287262851125387e-05, + "loss": 1.5039, + "step": 7861 + }, + { + "epoch": 0.4298347525388516, + "grad_norm": 1.3238085508346558, + "learning_rate": 1.3285536993907095e-05, + "loss": 1.2946, + "step": 7862 + }, + { + "epoch": 0.42988942498257315, + "grad_norm": 1.4680800437927246, + "learning_rate": 1.328381102697701e-05, + "loss": 1.1739, + "step": 7863 + }, + { + "epoch": 0.4299440974262947, + "grad_norm": 1.6864193677902222, + "learning_rate": 1.328208495039277e-05, + "loss": 1.3271, + "step": 7864 + }, + { + "epoch": 0.4299987698700163, + "grad_norm": 1.6677181720733643, + "learning_rate": 1.3280358764212013e-05, + "loss": 1.4585, + "step": 7865 + }, + { + "epoch": 0.4300534423137378, + "grad_norm": 1.8750813007354736, + "learning_rate": 1.327863246849238e-05, + "loss": 1.4203, + "step": 7866 + }, + { + "epoch": 0.43010811475745936, + "grad_norm": 1.4386814832687378, + "learning_rate": 1.3276906063291511e-05, + "loss": 1.4254, + "step": 7867 + }, + { + "epoch": 0.43016278720118095, + "grad_norm": 1.6095225811004639, + "learning_rate": 1.3275179548667062e-05, + "loss": 1.4366, + "step": 7868 + }, + { + "epoch": 0.4302174596449025, + "grad_norm": 1.4769184589385986, + "learning_rate": 1.3273452924676684e-05, + "loss": 1.4776, + "step": 7869 + }, + { + "epoch": 0.430272132088624, + "grad_norm": 1.4246398210525513, + "learning_rate": 1.327172619137803e-05, + "loss": 1.5119, + "step": 7870 + }, + { + "epoch": 0.43032680453234556, + "grad_norm": 1.7144346237182617, + "learning_rate": 1.326999934882876e-05, + "loss": 1.5955, + "step": 7871 + }, + { + "epoch": 0.43038147697606716, + "grad_norm": 1.534530520439148, + "learning_rate": 1.3268272397086542e-05, + "loss": 1.7133, + "step": 7872 + }, + { + "epoch": 0.4304361494197887, + "grad_norm": 2.1010730266571045, + "learning_rate": 1.3266545336209034e-05, + "loss": 1.2346, + "step": 7873 + }, + { + "epoch": 0.43049082186351023, + "grad_norm": 1.8063244819641113, + "learning_rate": 1.3264818166253917e-05, + "loss": 1.4932, + "step": 7874 + }, + { + "epoch": 0.4305454943072318, + "grad_norm": 1.3981609344482422, + "learning_rate": 1.3263090887278855e-05, + "loss": 1.2839, + "step": 7875 + }, + { + "epoch": 0.43060016675095336, + "grad_norm": 1.2550005912780762, + "learning_rate": 1.3261363499341537e-05, + "loss": 1.4746, + "step": 7876 + }, + { + "epoch": 0.4306548391946749, + "grad_norm": 1.2626333236694336, + "learning_rate": 1.3259636002499634e-05, + "loss": 1.3701, + "step": 7877 + }, + { + "epoch": 0.43070951163839644, + "grad_norm": 1.5683445930480957, + "learning_rate": 1.3257908396810838e-05, + "loss": 1.4269, + "step": 7878 + }, + { + "epoch": 0.43076418408211803, + "grad_norm": 1.6988575458526611, + "learning_rate": 1.3256180682332836e-05, + "loss": 1.4914, + "step": 7879 + }, + { + "epoch": 0.43081885652583957, + "grad_norm": 1.3796024322509766, + "learning_rate": 1.325445285912332e-05, + "loss": 1.3801, + "step": 7880 + }, + { + "epoch": 0.4308735289695611, + "grad_norm": 1.4726730585098267, + "learning_rate": 1.3252724927239986e-05, + "loss": 1.4278, + "step": 7881 + }, + { + "epoch": 0.4309282014132827, + "grad_norm": 1.479769229888916, + "learning_rate": 1.3250996886740532e-05, + "loss": 1.4035, + "step": 7882 + }, + { + "epoch": 0.43098287385700423, + "grad_norm": 1.4669550657272339, + "learning_rate": 1.3249268737682669e-05, + "loss": 1.2253, + "step": 7883 + }, + { + "epoch": 0.43103754630072577, + "grad_norm": 1.2679173946380615, + "learning_rate": 1.3247540480124093e-05, + "loss": 1.5075, + "step": 7884 + }, + { + "epoch": 0.4310922187444473, + "grad_norm": 1.178490161895752, + "learning_rate": 1.324581211412252e-05, + "loss": 1.485, + "step": 7885 + }, + { + "epoch": 0.4311468911881689, + "grad_norm": 1.558617115020752, + "learning_rate": 1.3244083639735665e-05, + "loss": 1.601, + "step": 7886 + }, + { + "epoch": 0.43120156363189044, + "grad_norm": 1.3162277936935425, + "learning_rate": 1.3242355057021246e-05, + "loss": 1.5559, + "step": 7887 + }, + { + "epoch": 0.431256236075612, + "grad_norm": 1.6276177167892456, + "learning_rate": 1.3240626366036982e-05, + "loss": 1.5095, + "step": 7888 + }, + { + "epoch": 0.43131090851933357, + "grad_norm": 1.697239637374878, + "learning_rate": 1.32388975668406e-05, + "loss": 1.2536, + "step": 7889 + }, + { + "epoch": 0.4313655809630551, + "grad_norm": 1.305841088294983, + "learning_rate": 1.3237168659489827e-05, + "loss": 1.4312, + "step": 7890 + }, + { + "epoch": 0.43142025340677664, + "grad_norm": 1.501678466796875, + "learning_rate": 1.3235439644042396e-05, + "loss": 1.4684, + "step": 7891 + }, + { + "epoch": 0.4314749258504982, + "grad_norm": 1.7119406461715698, + "learning_rate": 1.3233710520556042e-05, + "loss": 1.3433, + "step": 7892 + }, + { + "epoch": 0.4315295982942198, + "grad_norm": 1.511143445968628, + "learning_rate": 1.3231981289088509e-05, + "loss": 1.5993, + "step": 7893 + }, + { + "epoch": 0.4315842707379413, + "grad_norm": 1.8522732257843018, + "learning_rate": 1.3230251949697537e-05, + "loss": 1.5229, + "step": 7894 + }, + { + "epoch": 0.43163894318166285, + "grad_norm": 1.513258934020996, + "learning_rate": 1.3228522502440868e-05, + "loss": 1.2744, + "step": 7895 + }, + { + "epoch": 0.43169361562538444, + "grad_norm": 1.203568458557129, + "learning_rate": 1.322679294737626e-05, + "loss": 1.2844, + "step": 7896 + }, + { + "epoch": 0.431748288069106, + "grad_norm": 1.2233251333236694, + "learning_rate": 1.3225063284561461e-05, + "loss": 1.4367, + "step": 7897 + }, + { + "epoch": 0.4318029605128275, + "grad_norm": 1.763957142829895, + "learning_rate": 1.3223333514054232e-05, + "loss": 1.3347, + "step": 7898 + }, + { + "epoch": 0.43185763295654905, + "grad_norm": 1.8858914375305176, + "learning_rate": 1.3221603635912335e-05, + "loss": 1.1392, + "step": 7899 + }, + { + "epoch": 0.43191230540027065, + "grad_norm": 1.504284143447876, + "learning_rate": 1.321987365019353e-05, + "loss": 1.5099, + "step": 7900 + }, + { + "epoch": 0.4319669778439922, + "grad_norm": 1.9837403297424316, + "learning_rate": 1.3218143556955592e-05, + "loss": 1.7284, + "step": 7901 + }, + { + "epoch": 0.4320216502877137, + "grad_norm": 1.462955355644226, + "learning_rate": 1.3216413356256286e-05, + "loss": 1.5183, + "step": 7902 + }, + { + "epoch": 0.4320763227314353, + "grad_norm": 2.3657987117767334, + "learning_rate": 1.3214683048153392e-05, + "loss": 1.4858, + "step": 7903 + }, + { + "epoch": 0.43213099517515685, + "grad_norm": 2.7442667484283447, + "learning_rate": 1.3212952632704688e-05, + "loss": 1.3906, + "step": 7904 + }, + { + "epoch": 0.4321856676188784, + "grad_norm": 1.3944246768951416, + "learning_rate": 1.3211222109967953e-05, + "loss": 1.589, + "step": 7905 + }, + { + "epoch": 0.4322403400625999, + "grad_norm": 1.5480307340621948, + "learning_rate": 1.3209491480000979e-05, + "loss": 1.5359, + "step": 7906 + }, + { + "epoch": 0.4322950125063215, + "grad_norm": 1.4972392320632935, + "learning_rate": 1.3207760742861555e-05, + "loss": 1.4533, + "step": 7907 + }, + { + "epoch": 0.43234968495004306, + "grad_norm": 1.7078042030334473, + "learning_rate": 1.3206029898607468e-05, + "loss": 1.5514, + "step": 7908 + }, + { + "epoch": 0.4324043573937646, + "grad_norm": 1.421466588973999, + "learning_rate": 1.3204298947296521e-05, + "loss": 1.5413, + "step": 7909 + }, + { + "epoch": 0.4324590298374862, + "grad_norm": 1.6177067756652832, + "learning_rate": 1.3202567888986512e-05, + "loss": 1.5275, + "step": 7910 + }, + { + "epoch": 0.4325137022812077, + "grad_norm": 1.3701096773147583, + "learning_rate": 1.320083672373525e-05, + "loss": 1.48, + "step": 7911 + }, + { + "epoch": 0.43256837472492926, + "grad_norm": 1.1058927774429321, + "learning_rate": 1.3199105451600536e-05, + "loss": 1.7541, + "step": 7912 + }, + { + "epoch": 0.4326230471686508, + "grad_norm": 1.483001708984375, + "learning_rate": 1.3197374072640186e-05, + "loss": 1.6006, + "step": 7913 + }, + { + "epoch": 0.4326777196123724, + "grad_norm": 1.6609505414962769, + "learning_rate": 1.3195642586912012e-05, + "loss": 1.5423, + "step": 7914 + }, + { + "epoch": 0.43273239205609393, + "grad_norm": 1.5502384901046753, + "learning_rate": 1.3193910994473831e-05, + "loss": 1.6016, + "step": 7915 + }, + { + "epoch": 0.43278706449981547, + "grad_norm": 1.4679465293884277, + "learning_rate": 1.319217929538347e-05, + "loss": 1.7467, + "step": 7916 + }, + { + "epoch": 0.43284173694353706, + "grad_norm": 1.1976284980773926, + "learning_rate": 1.3190447489698748e-05, + "loss": 1.3504, + "step": 7917 + }, + { + "epoch": 0.4328964093872586, + "grad_norm": 1.6221154928207397, + "learning_rate": 1.31887155774775e-05, + "loss": 1.5164, + "step": 7918 + }, + { + "epoch": 0.43295108183098013, + "grad_norm": 1.3200371265411377, + "learning_rate": 1.3186983558777557e-05, + "loss": 1.5616, + "step": 7919 + }, + { + "epoch": 0.43300575427470167, + "grad_norm": 1.5620577335357666, + "learning_rate": 1.3185251433656756e-05, + "loss": 1.5436, + "step": 7920 + }, + { + "epoch": 0.43306042671842326, + "grad_norm": 1.4334012269973755, + "learning_rate": 1.3183519202172935e-05, + "loss": 1.5357, + "step": 7921 + }, + { + "epoch": 0.4331150991621448, + "grad_norm": 1.423068642616272, + "learning_rate": 1.3181786864383934e-05, + "loss": 1.4083, + "step": 7922 + }, + { + "epoch": 0.43316977160586634, + "grad_norm": 1.6917566061019897, + "learning_rate": 1.3180054420347603e-05, + "loss": 1.3691, + "step": 7923 + }, + { + "epoch": 0.43322444404958793, + "grad_norm": 1.874115228652954, + "learning_rate": 1.3178321870121793e-05, + "loss": 1.4711, + "step": 7924 + }, + { + "epoch": 0.43327911649330947, + "grad_norm": 1.8485077619552612, + "learning_rate": 1.3176589213764362e-05, + "loss": 1.4961, + "step": 7925 + }, + { + "epoch": 0.433333788937031, + "grad_norm": 1.6964565515518188, + "learning_rate": 1.317485645133316e-05, + "loss": 1.5434, + "step": 7926 + }, + { + "epoch": 0.43338846138075254, + "grad_norm": 1.5310845375061035, + "learning_rate": 1.3173123582886052e-05, + "loss": 1.394, + "step": 7927 + }, + { + "epoch": 0.43344313382447414, + "grad_norm": 2.1145453453063965, + "learning_rate": 1.31713906084809e-05, + "loss": 1.4013, + "step": 7928 + }, + { + "epoch": 0.4334978062681957, + "grad_norm": 1.631354570388794, + "learning_rate": 1.3169657528175574e-05, + "loss": 1.3098, + "step": 7929 + }, + { + "epoch": 0.4335524787119172, + "grad_norm": 1.4064902067184448, + "learning_rate": 1.3167924342027947e-05, + "loss": 1.4753, + "step": 7930 + }, + { + "epoch": 0.4336071511556388, + "grad_norm": 1.5428874492645264, + "learning_rate": 1.3166191050095888e-05, + "loss": 1.6301, + "step": 7931 + }, + { + "epoch": 0.43366182359936034, + "grad_norm": 1.289516806602478, + "learning_rate": 1.3164457652437285e-05, + "loss": 1.4912, + "step": 7932 + }, + { + "epoch": 0.4337164960430819, + "grad_norm": 1.3857452869415283, + "learning_rate": 1.3162724149110016e-05, + "loss": 1.5449, + "step": 7933 + }, + { + "epoch": 0.4337711684868034, + "grad_norm": 1.5421695709228516, + "learning_rate": 1.3160990540171963e-05, + "loss": 1.5014, + "step": 7934 + }, + { + "epoch": 0.433825840930525, + "grad_norm": 1.5058538913726807, + "learning_rate": 1.315925682568102e-05, + "loss": 1.4177, + "step": 7935 + }, + { + "epoch": 0.43388051337424655, + "grad_norm": 1.2998119592666626, + "learning_rate": 1.3157523005695077e-05, + "loss": 1.5008, + "step": 7936 + }, + { + "epoch": 0.4339351858179681, + "grad_norm": 1.845200777053833, + "learning_rate": 1.315578908027203e-05, + "loss": 1.2848, + "step": 7937 + }, + { + "epoch": 0.4339898582616897, + "grad_norm": 1.2370152473449707, + "learning_rate": 1.3154055049469782e-05, + "loss": 1.5729, + "step": 7938 + }, + { + "epoch": 0.4340445307054112, + "grad_norm": 1.4025548696517944, + "learning_rate": 1.3152320913346234e-05, + "loss": 1.5593, + "step": 7939 + }, + { + "epoch": 0.43409920314913275, + "grad_norm": 1.4096531867980957, + "learning_rate": 1.3150586671959298e-05, + "loss": 1.5916, + "step": 7940 + }, + { + "epoch": 0.4341538755928543, + "grad_norm": 1.4436815977096558, + "learning_rate": 1.3148852325366874e-05, + "loss": 1.3122, + "step": 7941 + }, + { + "epoch": 0.4342085480365759, + "grad_norm": 1.6792570352554321, + "learning_rate": 1.3147117873626886e-05, + "loss": 1.3522, + "step": 7942 + }, + { + "epoch": 0.4342632204802974, + "grad_norm": 1.3572015762329102, + "learning_rate": 1.3145383316797244e-05, + "loss": 1.301, + "step": 7943 + }, + { + "epoch": 0.43431789292401896, + "grad_norm": 1.4196815490722656, + "learning_rate": 1.3143648654935875e-05, + "loss": 1.4672, + "step": 7944 + }, + { + "epoch": 0.43437256536774055, + "grad_norm": 2.0070111751556396, + "learning_rate": 1.3141913888100699e-05, + "loss": 1.2801, + "step": 7945 + }, + { + "epoch": 0.4344272378114621, + "grad_norm": 1.687574863433838, + "learning_rate": 1.3140179016349648e-05, + "loss": 1.3662, + "step": 7946 + }, + { + "epoch": 0.4344819102551836, + "grad_norm": 1.4930163621902466, + "learning_rate": 1.3138444039740648e-05, + "loss": 1.4385, + "step": 7947 + }, + { + "epoch": 0.43453658269890516, + "grad_norm": 1.0718982219696045, + "learning_rate": 1.3136708958331636e-05, + "loss": 1.5343, + "step": 7948 + }, + { + "epoch": 0.43459125514262675, + "grad_norm": 1.2657756805419922, + "learning_rate": 1.3134973772180554e-05, + "loss": 1.5035, + "step": 7949 + }, + { + "epoch": 0.4346459275863483, + "grad_norm": 1.9662495851516724, + "learning_rate": 1.3133238481345341e-05, + "loss": 1.525, + "step": 7950 + }, + { + "epoch": 0.43470060003006983, + "grad_norm": 1.6782675981521606, + "learning_rate": 1.313150308588394e-05, + "loss": 1.2688, + "step": 7951 + }, + { + "epoch": 0.4347552724737914, + "grad_norm": 1.7351614236831665, + "learning_rate": 1.3129767585854304e-05, + "loss": 1.5296, + "step": 7952 + }, + { + "epoch": 0.43480994491751296, + "grad_norm": 1.4734182357788086, + "learning_rate": 1.3128031981314388e-05, + "loss": 1.6898, + "step": 7953 + }, + { + "epoch": 0.4348646173612345, + "grad_norm": 1.4819661378860474, + "learning_rate": 1.312629627232214e-05, + "loss": 1.4152, + "step": 7954 + }, + { + "epoch": 0.43491928980495603, + "grad_norm": 2.2526443004608154, + "learning_rate": 1.3124560458935522e-05, + "loss": 1.2139, + "step": 7955 + }, + { + "epoch": 0.4349739622486776, + "grad_norm": 1.1022708415985107, + "learning_rate": 1.3122824541212503e-05, + "loss": 1.5428, + "step": 7956 + }, + { + "epoch": 0.43502863469239916, + "grad_norm": 1.5943683385849, + "learning_rate": 1.3121088519211043e-05, + "loss": 1.2717, + "step": 7957 + }, + { + "epoch": 0.4350833071361207, + "grad_norm": 1.1000699996948242, + "learning_rate": 1.311935239298911e-05, + "loss": 1.4479, + "step": 7958 + }, + { + "epoch": 0.4351379795798423, + "grad_norm": 1.7810734510421753, + "learning_rate": 1.3117616162604684e-05, + "loss": 1.4142, + "step": 7959 + }, + { + "epoch": 0.43519265202356383, + "grad_norm": 1.5766388177871704, + "learning_rate": 1.311587982811574e-05, + "loss": 1.1824, + "step": 7960 + }, + { + "epoch": 0.43524732446728537, + "grad_norm": 1.4603058099746704, + "learning_rate": 1.3114143389580254e-05, + "loss": 1.587, + "step": 7961 + }, + { + "epoch": 0.4353019969110069, + "grad_norm": 1.3398041725158691, + "learning_rate": 1.3112406847056213e-05, + "loss": 1.4176, + "step": 7962 + }, + { + "epoch": 0.4353566693547285, + "grad_norm": 1.552003264427185, + "learning_rate": 1.3110670200601604e-05, + "loss": 1.4798, + "step": 7963 + }, + { + "epoch": 0.43541134179845004, + "grad_norm": 1.746003270149231, + "learning_rate": 1.310893345027442e-05, + "loss": 1.2425, + "step": 7964 + }, + { + "epoch": 0.4354660142421716, + "grad_norm": 1.5650519132614136, + "learning_rate": 1.310719659613265e-05, + "loss": 1.4854, + "step": 7965 + }, + { + "epoch": 0.43552068668589317, + "grad_norm": 1.1859936714172363, + "learning_rate": 1.3105459638234294e-05, + "loss": 1.4614, + "step": 7966 + }, + { + "epoch": 0.4355753591296147, + "grad_norm": 1.6562548875808716, + "learning_rate": 1.3103722576637357e-05, + "loss": 1.5827, + "step": 7967 + }, + { + "epoch": 0.43563003157333624, + "grad_norm": 1.9419225454330444, + "learning_rate": 1.3101985411399838e-05, + "loss": 1.5912, + "step": 7968 + }, + { + "epoch": 0.4356847040170578, + "grad_norm": 1.4716405868530273, + "learning_rate": 1.3100248142579743e-05, + "loss": 1.526, + "step": 7969 + }, + { + "epoch": 0.43573937646077937, + "grad_norm": 1.662550449371338, + "learning_rate": 1.3098510770235093e-05, + "loss": 1.3272, + "step": 7970 + }, + { + "epoch": 0.4357940489045009, + "grad_norm": 1.7414203882217407, + "learning_rate": 1.3096773294423896e-05, + "loss": 1.4371, + "step": 7971 + }, + { + "epoch": 0.43584872134822245, + "grad_norm": 1.6071090698242188, + "learning_rate": 1.3095035715204171e-05, + "loss": 1.4656, + "step": 7972 + }, + { + "epoch": 0.43590339379194404, + "grad_norm": 1.7617849111557007, + "learning_rate": 1.3093298032633943e-05, + "loss": 1.4694, + "step": 7973 + }, + { + "epoch": 0.4359580662356656, + "grad_norm": 1.1598265171051025, + "learning_rate": 1.3091560246771234e-05, + "loss": 1.539, + "step": 7974 + }, + { + "epoch": 0.4360127386793871, + "grad_norm": 1.5186183452606201, + "learning_rate": 1.3089822357674073e-05, + "loss": 1.3161, + "step": 7975 + }, + { + "epoch": 0.43606741112310865, + "grad_norm": 1.56089448928833, + "learning_rate": 1.3088084365400493e-05, + "loss": 1.3012, + "step": 7976 + }, + { + "epoch": 0.43612208356683024, + "grad_norm": 1.594698429107666, + "learning_rate": 1.308634627000853e-05, + "loss": 1.4472, + "step": 7977 + }, + { + "epoch": 0.4361767560105518, + "grad_norm": 1.5759395360946655, + "learning_rate": 1.3084608071556222e-05, + "loss": 1.6819, + "step": 7978 + }, + { + "epoch": 0.4362314284542733, + "grad_norm": 1.643673062324524, + "learning_rate": 1.3082869770101613e-05, + "loss": 1.4626, + "step": 7979 + }, + { + "epoch": 0.4362861008979949, + "grad_norm": 1.601363182067871, + "learning_rate": 1.3081131365702749e-05, + "loss": 1.2938, + "step": 7980 + }, + { + "epoch": 0.43634077334171645, + "grad_norm": 1.4633147716522217, + "learning_rate": 1.3079392858417679e-05, + "loss": 1.4583, + "step": 7981 + }, + { + "epoch": 0.436395445785438, + "grad_norm": 1.446134328842163, + "learning_rate": 1.3077654248304452e-05, + "loss": 1.489, + "step": 7982 + }, + { + "epoch": 0.4364501182291595, + "grad_norm": 1.4666799306869507, + "learning_rate": 1.307591553542113e-05, + "loss": 1.6356, + "step": 7983 + }, + { + "epoch": 0.4365047906728811, + "grad_norm": 1.414751410484314, + "learning_rate": 1.307417671982577e-05, + "loss": 1.5209, + "step": 7984 + }, + { + "epoch": 0.43655946311660265, + "grad_norm": 1.552795648574829, + "learning_rate": 1.3072437801576438e-05, + "loss": 1.3349, + "step": 7985 + }, + { + "epoch": 0.4366141355603242, + "grad_norm": 1.4293928146362305, + "learning_rate": 1.3070698780731194e-05, + "loss": 1.4775, + "step": 7986 + }, + { + "epoch": 0.4366688080040458, + "grad_norm": 1.5619577169418335, + "learning_rate": 1.3068959657348112e-05, + "loss": 1.559, + "step": 7987 + }, + { + "epoch": 0.4367234804477673, + "grad_norm": 1.8122988939285278, + "learning_rate": 1.306722043148527e-05, + "loss": 1.4202, + "step": 7988 + }, + { + "epoch": 0.43677815289148886, + "grad_norm": 1.4852317571640015, + "learning_rate": 1.3065481103200736e-05, + "loss": 1.5513, + "step": 7989 + }, + { + "epoch": 0.4368328253352104, + "grad_norm": 2.4488115310668945, + "learning_rate": 1.3063741672552597e-05, + "loss": 1.3887, + "step": 7990 + }, + { + "epoch": 0.436887497778932, + "grad_norm": 2.0974698066711426, + "learning_rate": 1.3062002139598934e-05, + "loss": 1.4757, + "step": 7991 + }, + { + "epoch": 0.4369421702226535, + "grad_norm": 1.1968412399291992, + "learning_rate": 1.3060262504397836e-05, + "loss": 1.4275, + "step": 7992 + }, + { + "epoch": 0.43699684266637506, + "grad_norm": 1.7194857597351074, + "learning_rate": 1.305852276700739e-05, + "loss": 1.3574, + "step": 7993 + }, + { + "epoch": 0.43705151511009666, + "grad_norm": 1.5051885843276978, + "learning_rate": 1.305678292748569e-05, + "loss": 1.2528, + "step": 7994 + }, + { + "epoch": 0.4371061875538182, + "grad_norm": 1.6352863311767578, + "learning_rate": 1.3055042985890837e-05, + "loss": 1.5232, + "step": 7995 + }, + { + "epoch": 0.43716085999753973, + "grad_norm": 2.0232059955596924, + "learning_rate": 1.305330294228093e-05, + "loss": 1.2906, + "step": 7996 + }, + { + "epoch": 0.43721553244126127, + "grad_norm": 1.655968189239502, + "learning_rate": 1.305156279671407e-05, + "loss": 1.6087, + "step": 7997 + }, + { + "epoch": 0.43727020488498286, + "grad_norm": 1.463921308517456, + "learning_rate": 1.3049822549248372e-05, + "loss": 1.3807, + "step": 7998 + }, + { + "epoch": 0.4373248773287044, + "grad_norm": 1.4473774433135986, + "learning_rate": 1.3048082199941941e-05, + "loss": 1.61, + "step": 7999 + }, + { + "epoch": 0.43737954977242594, + "grad_norm": 1.7587614059448242, + "learning_rate": 1.304634174885289e-05, + "loss": 1.1795, + "step": 8000 + }, + { + "epoch": 0.43743422221614753, + "grad_norm": 1.7571375370025635, + "learning_rate": 1.3044601196039341e-05, + "loss": 1.4806, + "step": 8001 + }, + { + "epoch": 0.43748889465986907, + "grad_norm": 1.624485731124878, + "learning_rate": 1.3042860541559416e-05, + "loss": 1.5221, + "step": 8002 + }, + { + "epoch": 0.4375435671035906, + "grad_norm": 1.2045515775680542, + "learning_rate": 1.3041119785471236e-05, + "loss": 1.5157, + "step": 8003 + }, + { + "epoch": 0.43759823954731214, + "grad_norm": 2.1388349533081055, + "learning_rate": 1.303937892783293e-05, + "loss": 1.6276, + "step": 8004 + }, + { + "epoch": 0.43765291199103373, + "grad_norm": 1.408665418624878, + "learning_rate": 1.3037637968702632e-05, + "loss": 1.3317, + "step": 8005 + }, + { + "epoch": 0.43770758443475527, + "grad_norm": 1.4489293098449707, + "learning_rate": 1.303589690813847e-05, + "loss": 1.6387, + "step": 8006 + }, + { + "epoch": 0.4377622568784768, + "grad_norm": 1.6468851566314697, + "learning_rate": 1.3034155746198588e-05, + "loss": 1.3034, + "step": 8007 + }, + { + "epoch": 0.4378169293221984, + "grad_norm": 1.393418788909912, + "learning_rate": 1.3032414482941125e-05, + "loss": 1.4607, + "step": 8008 + }, + { + "epoch": 0.43787160176591994, + "grad_norm": 1.2109824419021606, + "learning_rate": 1.3030673118424227e-05, + "loss": 1.5594, + "step": 8009 + }, + { + "epoch": 0.4379262742096415, + "grad_norm": 1.2740631103515625, + "learning_rate": 1.302893165270604e-05, + "loss": 1.4087, + "step": 8010 + }, + { + "epoch": 0.437980946653363, + "grad_norm": 1.617801547050476, + "learning_rate": 1.3027190085844721e-05, + "loss": 1.5227, + "step": 8011 + }, + { + "epoch": 0.4380356190970846, + "grad_norm": 1.6893993616104126, + "learning_rate": 1.3025448417898421e-05, + "loss": 1.4101, + "step": 8012 + }, + { + "epoch": 0.43809029154080614, + "grad_norm": 1.6102781295776367, + "learning_rate": 1.3023706648925299e-05, + "loss": 1.7067, + "step": 8013 + }, + { + "epoch": 0.4381449639845277, + "grad_norm": 1.3613183498382568, + "learning_rate": 1.3021964778983513e-05, + "loss": 1.7153, + "step": 8014 + }, + { + "epoch": 0.4381996364282493, + "grad_norm": 1.2472304105758667, + "learning_rate": 1.3020222808131236e-05, + "loss": 1.4922, + "step": 8015 + }, + { + "epoch": 0.4382543088719708, + "grad_norm": 1.9083877801895142, + "learning_rate": 1.301848073642663e-05, + "loss": 1.4898, + "step": 8016 + }, + { + "epoch": 0.43830898131569235, + "grad_norm": 1.5571743249893188, + "learning_rate": 1.301673856392787e-05, + "loss": 1.3384, + "step": 8017 + }, + { + "epoch": 0.4383636537594139, + "grad_norm": 1.3811122179031372, + "learning_rate": 1.301499629069313e-05, + "loss": 1.4468, + "step": 8018 + }, + { + "epoch": 0.4384183262031355, + "grad_norm": 1.552744746208191, + "learning_rate": 1.301325391678059e-05, + "loss": 1.4735, + "step": 8019 + }, + { + "epoch": 0.438472998646857, + "grad_norm": 1.8009295463562012, + "learning_rate": 1.301151144224843e-05, + "loss": 1.1278, + "step": 8020 + }, + { + "epoch": 0.43852767109057855, + "grad_norm": 1.487383246421814, + "learning_rate": 1.3009768867154834e-05, + "loss": 1.3423, + "step": 8021 + }, + { + "epoch": 0.43858234353430015, + "grad_norm": 1.2314083576202393, + "learning_rate": 1.3008026191557996e-05, + "loss": 1.3323, + "step": 8022 + }, + { + "epoch": 0.4386370159780217, + "grad_norm": 1.2835650444030762, + "learning_rate": 1.3006283415516103e-05, + "loss": 1.341, + "step": 8023 + }, + { + "epoch": 0.4386916884217432, + "grad_norm": 1.317692518234253, + "learning_rate": 1.3004540539087357e-05, + "loss": 1.4805, + "step": 8024 + }, + { + "epoch": 0.43874636086546476, + "grad_norm": 1.5539722442626953, + "learning_rate": 1.3002797562329944e-05, + "loss": 1.4722, + "step": 8025 + }, + { + "epoch": 0.43880103330918635, + "grad_norm": 1.425225853919983, + "learning_rate": 1.300105448530208e-05, + "loss": 1.2799, + "step": 8026 + }, + { + "epoch": 0.4388557057529079, + "grad_norm": 1.486557960510254, + "learning_rate": 1.2999311308061964e-05, + "loss": 1.547, + "step": 8027 + }, + { + "epoch": 0.4389103781966294, + "grad_norm": 1.5230633020401, + "learning_rate": 1.2997568030667802e-05, + "loss": 1.3874, + "step": 8028 + }, + { + "epoch": 0.438965050640351, + "grad_norm": 1.1834924221038818, + "learning_rate": 1.2995824653177813e-05, + "loss": 1.6382, + "step": 8029 + }, + { + "epoch": 0.43901972308407256, + "grad_norm": 1.6618623733520508, + "learning_rate": 1.2994081175650206e-05, + "loss": 1.5219, + "step": 8030 + }, + { + "epoch": 0.4390743955277941, + "grad_norm": 1.762713074684143, + "learning_rate": 1.2992337598143206e-05, + "loss": 1.5029, + "step": 8031 + }, + { + "epoch": 0.43912906797151563, + "grad_norm": 1.5695552825927734, + "learning_rate": 1.2990593920715032e-05, + "loss": 1.5145, + "step": 8032 + }, + { + "epoch": 0.4391837404152372, + "grad_norm": 1.5366038084030151, + "learning_rate": 1.2988850143423908e-05, + "loss": 1.3612, + "step": 8033 + }, + { + "epoch": 0.43923841285895876, + "grad_norm": 1.427353024482727, + "learning_rate": 1.298710626632806e-05, + "loss": 1.6037, + "step": 8034 + }, + { + "epoch": 0.4392930853026803, + "grad_norm": 1.6134169101715088, + "learning_rate": 1.2985362289485728e-05, + "loss": 1.3302, + "step": 8035 + }, + { + "epoch": 0.4393477577464019, + "grad_norm": 1.452877402305603, + "learning_rate": 1.2983618212955145e-05, + "loss": 1.3714, + "step": 8036 + }, + { + "epoch": 0.4394024301901234, + "grad_norm": 1.5261150598526, + "learning_rate": 1.2981874036794548e-05, + "loss": 1.4354, + "step": 8037 + }, + { + "epoch": 0.43945710263384496, + "grad_norm": 1.2748724222183228, + "learning_rate": 1.2980129761062178e-05, + "loss": 1.4332, + "step": 8038 + }, + { + "epoch": 0.4395117750775665, + "grad_norm": 1.6587997674942017, + "learning_rate": 1.2978385385816284e-05, + "loss": 1.4147, + "step": 8039 + }, + { + "epoch": 0.4395664475212881, + "grad_norm": 1.7333463430404663, + "learning_rate": 1.2976640911115113e-05, + "loss": 1.5737, + "step": 8040 + }, + { + "epoch": 0.43962111996500963, + "grad_norm": 1.525197982788086, + "learning_rate": 1.2974896337016914e-05, + "loss": 1.5287, + "step": 8041 + }, + { + "epoch": 0.43967579240873117, + "grad_norm": 1.6652889251708984, + "learning_rate": 1.2973151663579948e-05, + "loss": 1.5007, + "step": 8042 + }, + { + "epoch": 0.43973046485245276, + "grad_norm": 1.6004962921142578, + "learning_rate": 1.2971406890862473e-05, + "loss": 1.2997, + "step": 8043 + }, + { + "epoch": 0.4397851372961743, + "grad_norm": 1.8425679206848145, + "learning_rate": 1.2969662018922748e-05, + "loss": 1.3485, + "step": 8044 + }, + { + "epoch": 0.43983980973989584, + "grad_norm": 1.500597357749939, + "learning_rate": 1.2967917047819038e-05, + "loss": 1.5501, + "step": 8045 + }, + { + "epoch": 0.4398944821836174, + "grad_norm": 1.7031091451644897, + "learning_rate": 1.2966171977609614e-05, + "loss": 1.5267, + "step": 8046 + }, + { + "epoch": 0.43994915462733897, + "grad_norm": 1.9746205806732178, + "learning_rate": 1.2964426808352747e-05, + "loss": 1.5903, + "step": 8047 + }, + { + "epoch": 0.4400038270710605, + "grad_norm": 1.3452162742614746, + "learning_rate": 1.2962681540106713e-05, + "loss": 1.701, + "step": 8048 + }, + { + "epoch": 0.44005849951478204, + "grad_norm": 1.291957139968872, + "learning_rate": 1.296093617292979e-05, + "loss": 1.3657, + "step": 8049 + }, + { + "epoch": 0.44011317195850364, + "grad_norm": 1.607384204864502, + "learning_rate": 1.295919070688026e-05, + "loss": 1.1477, + "step": 8050 + }, + { + "epoch": 0.4401678444022252, + "grad_norm": 1.7330864667892456, + "learning_rate": 1.2957445142016412e-05, + "loss": 1.5722, + "step": 8051 + }, + { + "epoch": 0.4402225168459467, + "grad_norm": 1.5548391342163086, + "learning_rate": 1.2955699478396527e-05, + "loss": 1.2254, + "step": 8052 + }, + { + "epoch": 0.4402771892896683, + "grad_norm": 1.3436617851257324, + "learning_rate": 1.29539537160789e-05, + "loss": 1.4986, + "step": 8053 + }, + { + "epoch": 0.44033186173338984, + "grad_norm": 1.3335740566253662, + "learning_rate": 1.295220785512183e-05, + "loss": 1.2462, + "step": 8054 + }, + { + "epoch": 0.4403865341771114, + "grad_norm": 1.2817219495773315, + "learning_rate": 1.2950461895583608e-05, + "loss": 1.5259, + "step": 8055 + }, + { + "epoch": 0.4404412066208329, + "grad_norm": 1.588191270828247, + "learning_rate": 1.2948715837522542e-05, + "loss": 1.4215, + "step": 8056 + }, + { + "epoch": 0.4404958790645545, + "grad_norm": 1.270366907119751, + "learning_rate": 1.2946969680996939e-05, + "loss": 1.3546, + "step": 8057 + }, + { + "epoch": 0.44055055150827604, + "grad_norm": 1.997812271118164, + "learning_rate": 1.2945223426065096e-05, + "loss": 1.429, + "step": 8058 + }, + { + "epoch": 0.4406052239519976, + "grad_norm": 1.4300410747528076, + "learning_rate": 1.2943477072785336e-05, + "loss": 1.4445, + "step": 8059 + }, + { + "epoch": 0.4406598963957192, + "grad_norm": 1.6569336652755737, + "learning_rate": 1.2941730621215966e-05, + "loss": 1.3362, + "step": 8060 + }, + { + "epoch": 0.4407145688394407, + "grad_norm": 1.8176023960113525, + "learning_rate": 1.293998407141531e-05, + "loss": 1.5488, + "step": 8061 + }, + { + "epoch": 0.44076924128316225, + "grad_norm": 1.33528733253479, + "learning_rate": 1.2938237423441686e-05, + "loss": 1.4218, + "step": 8062 + }, + { + "epoch": 0.4408239137268838, + "grad_norm": 1.9585232734680176, + "learning_rate": 1.2936490677353422e-05, + "loss": 1.5603, + "step": 8063 + }, + { + "epoch": 0.4408785861706054, + "grad_norm": 1.644315242767334, + "learning_rate": 1.2934743833208842e-05, + "loss": 1.598, + "step": 8064 + }, + { + "epoch": 0.4409332586143269, + "grad_norm": 1.5690281391143799, + "learning_rate": 1.2932996891066279e-05, + "loss": 1.5316, + "step": 8065 + }, + { + "epoch": 0.44098793105804845, + "grad_norm": 1.4484105110168457, + "learning_rate": 1.2931249850984066e-05, + "loss": 1.3131, + "step": 8066 + }, + { + "epoch": 0.44104260350177005, + "grad_norm": 1.725439190864563, + "learning_rate": 1.292950271302054e-05, + "loss": 1.6056, + "step": 8067 + }, + { + "epoch": 0.4410972759454916, + "grad_norm": 1.9181344509124756, + "learning_rate": 1.292775547723405e-05, + "loss": 1.3168, + "step": 8068 + }, + { + "epoch": 0.4411519483892131, + "grad_norm": 1.6868451833724976, + "learning_rate": 1.292600814368293e-05, + "loss": 1.5803, + "step": 8069 + }, + { + "epoch": 0.44120662083293466, + "grad_norm": 1.7687948942184448, + "learning_rate": 1.2924260712425536e-05, + "loss": 1.5991, + "step": 8070 + }, + { + "epoch": 0.44126129327665625, + "grad_norm": 1.605661153793335, + "learning_rate": 1.2922513183520212e-05, + "loss": 1.4439, + "step": 8071 + }, + { + "epoch": 0.4413159657203778, + "grad_norm": 1.3713908195495605, + "learning_rate": 1.2920765557025316e-05, + "loss": 1.568, + "step": 8072 + }, + { + "epoch": 0.4413706381640993, + "grad_norm": 1.4632527828216553, + "learning_rate": 1.2919017832999203e-05, + "loss": 1.1363, + "step": 8073 + }, + { + "epoch": 0.4414253106078209, + "grad_norm": 1.660672903060913, + "learning_rate": 1.2917270011500233e-05, + "loss": 1.1966, + "step": 8074 + }, + { + "epoch": 0.44147998305154246, + "grad_norm": 1.4558382034301758, + "learning_rate": 1.2915522092586777e-05, + "loss": 1.7505, + "step": 8075 + }, + { + "epoch": 0.441534655495264, + "grad_norm": 1.5288527011871338, + "learning_rate": 1.2913774076317193e-05, + "loss": 1.4446, + "step": 8076 + }, + { + "epoch": 0.44158932793898553, + "grad_norm": 2.2732183933258057, + "learning_rate": 1.2912025962749856e-05, + "loss": 0.9899, + "step": 8077 + }, + { + "epoch": 0.4416440003827071, + "grad_norm": 1.2648788690567017, + "learning_rate": 1.2910277751943141e-05, + "loss": 1.3194, + "step": 8078 + }, + { + "epoch": 0.44169867282642866, + "grad_norm": 1.5940566062927246, + "learning_rate": 1.290852944395542e-05, + "loss": 1.4099, + "step": 8079 + }, + { + "epoch": 0.4417533452701502, + "grad_norm": 1.8022809028625488, + "learning_rate": 1.2906781038845076e-05, + "loss": 1.0907, + "step": 8080 + }, + { + "epoch": 0.4418080177138718, + "grad_norm": 1.545464277267456, + "learning_rate": 1.2905032536670492e-05, + "loss": 1.6111, + "step": 8081 + }, + { + "epoch": 0.44186269015759333, + "grad_norm": 1.3461095094680786, + "learning_rate": 1.2903283937490056e-05, + "loss": 1.5767, + "step": 8082 + }, + { + "epoch": 0.44191736260131487, + "grad_norm": 1.512786626815796, + "learning_rate": 1.2901535241362158e-05, + "loss": 1.3413, + "step": 8083 + }, + { + "epoch": 0.4419720350450364, + "grad_norm": 1.6175721883773804, + "learning_rate": 1.2899786448345186e-05, + "loss": 1.5605, + "step": 8084 + }, + { + "epoch": 0.442026707488758, + "grad_norm": 1.5888365507125854, + "learning_rate": 1.2898037558497542e-05, + "loss": 1.4335, + "step": 8085 + }, + { + "epoch": 0.44208137993247953, + "grad_norm": 1.923656940460205, + "learning_rate": 1.2896288571877623e-05, + "loss": 1.5939, + "step": 8086 + }, + { + "epoch": 0.44213605237620107, + "grad_norm": 1.5893315076828003, + "learning_rate": 1.2894539488543832e-05, + "loss": 1.4776, + "step": 8087 + }, + { + "epoch": 0.44219072481992266, + "grad_norm": 1.57053804397583, + "learning_rate": 1.2892790308554574e-05, + "loss": 1.6126, + "step": 8088 + }, + { + "epoch": 0.4422453972636442, + "grad_norm": 1.4348034858703613, + "learning_rate": 1.2891041031968261e-05, + "loss": 1.4407, + "step": 8089 + }, + { + "epoch": 0.44230006970736574, + "grad_norm": 1.6272783279418945, + "learning_rate": 1.2889291658843306e-05, + "loss": 1.5828, + "step": 8090 + }, + { + "epoch": 0.4423547421510873, + "grad_norm": 1.4956233501434326, + "learning_rate": 1.288754218923812e-05, + "loss": 1.2823, + "step": 8091 + }, + { + "epoch": 0.44240941459480887, + "grad_norm": 1.8216571807861328, + "learning_rate": 1.2885792623211124e-05, + "loss": 1.552, + "step": 8092 + }, + { + "epoch": 0.4424640870385304, + "grad_norm": 1.7263739109039307, + "learning_rate": 1.2884042960820742e-05, + "loss": 1.3268, + "step": 8093 + }, + { + "epoch": 0.44251875948225194, + "grad_norm": 1.2669081687927246, + "learning_rate": 1.2882293202125395e-05, + "loss": 1.2265, + "step": 8094 + }, + { + "epoch": 0.44257343192597354, + "grad_norm": 1.380018711090088, + "learning_rate": 1.2880543347183519e-05, + "loss": 1.4719, + "step": 8095 + }, + { + "epoch": 0.4426281043696951, + "grad_norm": 1.703335165977478, + "learning_rate": 1.287879339605354e-05, + "loss": 1.6282, + "step": 8096 + }, + { + "epoch": 0.4426827768134166, + "grad_norm": 1.5998271703720093, + "learning_rate": 1.2877043348793893e-05, + "loss": 1.3054, + "step": 8097 + }, + { + "epoch": 0.44273744925713815, + "grad_norm": 1.3018110990524292, + "learning_rate": 1.2875293205463018e-05, + "loss": 1.6516, + "step": 8098 + }, + { + "epoch": 0.44279212170085974, + "grad_norm": 1.4541958570480347, + "learning_rate": 1.2873542966119355e-05, + "loss": 1.4782, + "step": 8099 + }, + { + "epoch": 0.4428467941445813, + "grad_norm": 1.230223298072815, + "learning_rate": 1.2871792630821349e-05, + "loss": 1.2264, + "step": 8100 + }, + { + "epoch": 0.4429014665883028, + "grad_norm": 1.627626895904541, + "learning_rate": 1.2870042199627448e-05, + "loss": 1.6242, + "step": 8101 + }, + { + "epoch": 0.4429561390320244, + "grad_norm": 2.1716301441192627, + "learning_rate": 1.2868291672596104e-05, + "loss": 1.5429, + "step": 8102 + }, + { + "epoch": 0.44301081147574595, + "grad_norm": 1.6860781908035278, + "learning_rate": 1.2866541049785773e-05, + "loss": 1.4032, + "step": 8103 + }, + { + "epoch": 0.4430654839194675, + "grad_norm": 1.8203725814819336, + "learning_rate": 1.2864790331254906e-05, + "loss": 1.3744, + "step": 8104 + }, + { + "epoch": 0.443120156363189, + "grad_norm": 1.2841624021530151, + "learning_rate": 1.2863039517061968e-05, + "loss": 1.3045, + "step": 8105 + }, + { + "epoch": 0.4431748288069106, + "grad_norm": 1.7242660522460938, + "learning_rate": 1.2861288607265425e-05, + "loss": 1.3976, + "step": 8106 + }, + { + "epoch": 0.44322950125063215, + "grad_norm": 2.147538900375366, + "learning_rate": 1.2859537601923737e-05, + "loss": 1.4725, + "step": 8107 + }, + { + "epoch": 0.4432841736943537, + "grad_norm": 1.3776979446411133, + "learning_rate": 1.285778650109538e-05, + "loss": 1.2749, + "step": 8108 + }, + { + "epoch": 0.4433388461380753, + "grad_norm": 1.5127042531967163, + "learning_rate": 1.2856035304838827e-05, + "loss": 1.4003, + "step": 8109 + }, + { + "epoch": 0.4433935185817968, + "grad_norm": 1.4176710844039917, + "learning_rate": 1.2854284013212555e-05, + "loss": 1.5133, + "step": 8110 + }, + { + "epoch": 0.44344819102551836, + "grad_norm": 1.980574369430542, + "learning_rate": 1.2852532626275038e-05, + "loss": 1.5518, + "step": 8111 + }, + { + "epoch": 0.4435028634692399, + "grad_norm": 1.053544282913208, + "learning_rate": 1.2850781144084763e-05, + "loss": 1.5776, + "step": 8112 + }, + { + "epoch": 0.4435575359129615, + "grad_norm": 1.4116826057434082, + "learning_rate": 1.284902956670022e-05, + "loss": 1.284, + "step": 8113 + }, + { + "epoch": 0.443612208356683, + "grad_norm": 1.4348366260528564, + "learning_rate": 1.284727789417989e-05, + "loss": 1.5306, + "step": 8114 + }, + { + "epoch": 0.44366688080040456, + "grad_norm": 1.7165453433990479, + "learning_rate": 1.2845526126582273e-05, + "loss": 1.3887, + "step": 8115 + }, + { + "epoch": 0.44372155324412615, + "grad_norm": 1.9920040369033813, + "learning_rate": 1.2843774263965857e-05, + "loss": 1.2171, + "step": 8116 + }, + { + "epoch": 0.4437762256878477, + "grad_norm": 1.3562464714050293, + "learning_rate": 1.2842022306389153e-05, + "loss": 1.6755, + "step": 8117 + }, + { + "epoch": 0.44383089813156923, + "grad_norm": 1.5201573371887207, + "learning_rate": 1.2840270253910648e-05, + "loss": 1.4962, + "step": 8118 + }, + { + "epoch": 0.44388557057529077, + "grad_norm": 1.8644646406173706, + "learning_rate": 1.2838518106588856e-05, + "loss": 1.4145, + "step": 8119 + }, + { + "epoch": 0.44394024301901236, + "grad_norm": 1.526608943939209, + "learning_rate": 1.2836765864482286e-05, + "loss": 1.5605, + "step": 8120 + }, + { + "epoch": 0.4439949154627339, + "grad_norm": 1.2228026390075684, + "learning_rate": 1.2835013527649443e-05, + "loss": 1.3585, + "step": 8121 + }, + { + "epoch": 0.44404958790645543, + "grad_norm": 1.5350278615951538, + "learning_rate": 1.283326109614885e-05, + "loss": 1.6832, + "step": 8122 + }, + { + "epoch": 0.444104260350177, + "grad_norm": 1.6968077421188354, + "learning_rate": 1.2831508570039017e-05, + "loss": 1.3384, + "step": 8123 + }, + { + "epoch": 0.44415893279389856, + "grad_norm": 2.048135995864868, + "learning_rate": 1.282975594937847e-05, + "loss": 1.5626, + "step": 8124 + }, + { + "epoch": 0.4442136052376201, + "grad_norm": 2.1295413970947266, + "learning_rate": 1.2828003234225733e-05, + "loss": 1.442, + "step": 8125 + }, + { + "epoch": 0.44426827768134164, + "grad_norm": 2.217195510864258, + "learning_rate": 1.2826250424639329e-05, + "loss": 1.5138, + "step": 8126 + }, + { + "epoch": 0.44432295012506323, + "grad_norm": 1.1507654190063477, + "learning_rate": 1.2824497520677794e-05, + "loss": 1.3807, + "step": 8127 + }, + { + "epoch": 0.44437762256878477, + "grad_norm": 1.6774499416351318, + "learning_rate": 1.2822744522399658e-05, + "loss": 1.4265, + "step": 8128 + }, + { + "epoch": 0.4444322950125063, + "grad_norm": 1.3849742412567139, + "learning_rate": 1.282099142986346e-05, + "loss": 1.4167, + "step": 8129 + }, + { + "epoch": 0.4444869674562279, + "grad_norm": 1.4208265542984009, + "learning_rate": 1.2819238243127736e-05, + "loss": 1.4886, + "step": 8130 + }, + { + "epoch": 0.44454163989994944, + "grad_norm": 1.9465816020965576, + "learning_rate": 1.2817484962251033e-05, + "loss": 1.4557, + "step": 8131 + }, + { + "epoch": 0.444596312343671, + "grad_norm": 1.859169363975525, + "learning_rate": 1.2815731587291893e-05, + "loss": 1.4593, + "step": 8132 + }, + { + "epoch": 0.4446509847873925, + "grad_norm": 1.5738184452056885, + "learning_rate": 1.2813978118308872e-05, + "loss": 1.3435, + "step": 8133 + }, + { + "epoch": 0.4447056572311141, + "grad_norm": 1.5191147327423096, + "learning_rate": 1.2812224555360518e-05, + "loss": 1.3142, + "step": 8134 + }, + { + "epoch": 0.44476032967483564, + "grad_norm": 1.5100433826446533, + "learning_rate": 1.2810470898505384e-05, + "loss": 1.4418, + "step": 8135 + }, + { + "epoch": 0.4448150021185572, + "grad_norm": 1.4552125930786133, + "learning_rate": 1.2808717147802035e-05, + "loss": 1.455, + "step": 8136 + }, + { + "epoch": 0.44486967456227877, + "grad_norm": 1.3486210107803345, + "learning_rate": 1.2806963303309025e-05, + "loss": 1.6826, + "step": 8137 + }, + { + "epoch": 0.4449243470060003, + "grad_norm": 1.3706212043762207, + "learning_rate": 1.2805209365084928e-05, + "loss": 1.6017, + "step": 8138 + }, + { + "epoch": 0.44497901944972185, + "grad_norm": 1.5463474988937378, + "learning_rate": 1.2803455333188306e-05, + "loss": 1.4106, + "step": 8139 + }, + { + "epoch": 0.4450336918934434, + "grad_norm": 1.2022989988327026, + "learning_rate": 1.2801701207677731e-05, + "loss": 1.508, + "step": 8140 + }, + { + "epoch": 0.445088364337165, + "grad_norm": 1.6576238870620728, + "learning_rate": 1.279994698861178e-05, + "loss": 1.5384, + "step": 8141 + }, + { + "epoch": 0.4451430367808865, + "grad_norm": 1.2729111909866333, + "learning_rate": 1.279819267604903e-05, + "loss": 1.52, + "step": 8142 + }, + { + "epoch": 0.44519770922460805, + "grad_norm": 1.7380257844924927, + "learning_rate": 1.2796438270048057e-05, + "loss": 1.4048, + "step": 8143 + }, + { + "epoch": 0.44525238166832964, + "grad_norm": 1.7682032585144043, + "learning_rate": 1.2794683770667448e-05, + "loss": 1.5299, + "step": 8144 + }, + { + "epoch": 0.4453070541120512, + "grad_norm": 1.7684584856033325, + "learning_rate": 1.2792929177965793e-05, + "loss": 1.6556, + "step": 8145 + }, + { + "epoch": 0.4453617265557727, + "grad_norm": 1.3416972160339355, + "learning_rate": 1.2791174492001677e-05, + "loss": 1.4461, + "step": 8146 + }, + { + "epoch": 0.44541639899949426, + "grad_norm": 1.5746370553970337, + "learning_rate": 1.2789419712833698e-05, + "loss": 1.2701, + "step": 8147 + }, + { + "epoch": 0.44547107144321585, + "grad_norm": 1.3784524202346802, + "learning_rate": 1.2787664840520446e-05, + "loss": 1.3107, + "step": 8148 + }, + { + "epoch": 0.4455257438869374, + "grad_norm": 1.5533394813537598, + "learning_rate": 1.2785909875120523e-05, + "loss": 1.4663, + "step": 8149 + }, + { + "epoch": 0.4455804163306589, + "grad_norm": 1.4833149909973145, + "learning_rate": 1.2784154816692533e-05, + "loss": 1.5609, + "step": 8150 + }, + { + "epoch": 0.4456350887743805, + "grad_norm": 1.6536489725112915, + "learning_rate": 1.278239966529508e-05, + "loss": 1.4565, + "step": 8151 + }, + { + "epoch": 0.44568976121810205, + "grad_norm": 1.46404230594635, + "learning_rate": 1.2780644420986774e-05, + "loss": 1.462, + "step": 8152 + }, + { + "epoch": 0.4457444336618236, + "grad_norm": 1.260527491569519, + "learning_rate": 1.2778889083826225e-05, + "loss": 1.4005, + "step": 8153 + }, + { + "epoch": 0.44579910610554513, + "grad_norm": 1.9042547941207886, + "learning_rate": 1.277713365387205e-05, + "loss": 1.6291, + "step": 8154 + }, + { + "epoch": 0.4458537785492667, + "grad_norm": 1.2941627502441406, + "learning_rate": 1.2775378131182867e-05, + "loss": 1.4392, + "step": 8155 + }, + { + "epoch": 0.44590845099298826, + "grad_norm": 1.4428859949111938, + "learning_rate": 1.2773622515817292e-05, + "loss": 1.5246, + "step": 8156 + }, + { + "epoch": 0.4459631234367098, + "grad_norm": 1.596130609512329, + "learning_rate": 1.2771866807833952e-05, + "loss": 1.3273, + "step": 8157 + }, + { + "epoch": 0.4460177958804314, + "grad_norm": 1.599047064781189, + "learning_rate": 1.2770111007291476e-05, + "loss": 1.2327, + "step": 8158 + }, + { + "epoch": 0.4460724683241529, + "grad_norm": 1.415103793144226, + "learning_rate": 1.2768355114248493e-05, + "loss": 1.4243, + "step": 8159 + }, + { + "epoch": 0.44612714076787446, + "grad_norm": 1.4478442668914795, + "learning_rate": 1.276659912876364e-05, + "loss": 1.52, + "step": 8160 + }, + { + "epoch": 0.446181813211596, + "grad_norm": 1.5859559774398804, + "learning_rate": 1.2764843050895548e-05, + "loss": 1.5616, + "step": 8161 + }, + { + "epoch": 0.4462364856553176, + "grad_norm": 1.4891431331634521, + "learning_rate": 1.2763086880702859e-05, + "loss": 1.6772, + "step": 8162 + }, + { + "epoch": 0.44629115809903913, + "grad_norm": 1.5714459419250488, + "learning_rate": 1.2761330618244215e-05, + "loss": 1.3956, + "step": 8163 + }, + { + "epoch": 0.44634583054276067, + "grad_norm": 1.2723820209503174, + "learning_rate": 1.275957426357826e-05, + "loss": 1.4012, + "step": 8164 + }, + { + "epoch": 0.44640050298648226, + "grad_norm": 1.598341464996338, + "learning_rate": 1.2757817816763645e-05, + "loss": 1.4838, + "step": 8165 + }, + { + "epoch": 0.4464551754302038, + "grad_norm": 1.619235634803772, + "learning_rate": 1.2756061277859024e-05, + "loss": 1.4015, + "step": 8166 + }, + { + "epoch": 0.44650984787392534, + "grad_norm": 1.7259916067123413, + "learning_rate": 1.275430464692305e-05, + "loss": 1.3999, + "step": 8167 + }, + { + "epoch": 0.4465645203176469, + "grad_norm": 1.224819302558899, + "learning_rate": 1.2752547924014378e-05, + "loss": 1.5543, + "step": 8168 + }, + { + "epoch": 0.44661919276136847, + "grad_norm": 2.122762441635132, + "learning_rate": 1.2750791109191677e-05, + "loss": 1.63, + "step": 8169 + }, + { + "epoch": 0.44667386520509, + "grad_norm": 1.6128840446472168, + "learning_rate": 1.27490342025136e-05, + "loss": 1.4435, + "step": 8170 + }, + { + "epoch": 0.44672853764881154, + "grad_norm": 1.6642979383468628, + "learning_rate": 1.2747277204038818e-05, + "loss": 1.2569, + "step": 8171 + }, + { + "epoch": 0.44678321009253313, + "grad_norm": 1.5438079833984375, + "learning_rate": 1.2745520113826009e-05, + "loss": 1.301, + "step": 8172 + }, + { + "epoch": 0.44683788253625467, + "grad_norm": 1.4052797555923462, + "learning_rate": 1.274376293193384e-05, + "loss": 1.5006, + "step": 8173 + }, + { + "epoch": 0.4468925549799762, + "grad_norm": 1.6001962423324585, + "learning_rate": 1.2742005658420988e-05, + "loss": 1.4304, + "step": 8174 + }, + { + "epoch": 0.44694722742369775, + "grad_norm": 1.569378137588501, + "learning_rate": 1.2740248293346134e-05, + "loss": 1.7352, + "step": 8175 + }, + { + "epoch": 0.44700189986741934, + "grad_norm": 1.6218761205673218, + "learning_rate": 1.2738490836767958e-05, + "loss": 1.5575, + "step": 8176 + }, + { + "epoch": 0.4470565723111409, + "grad_norm": 1.6781501770019531, + "learning_rate": 1.2736733288745144e-05, + "loss": 1.6832, + "step": 8177 + }, + { + "epoch": 0.4471112447548624, + "grad_norm": 1.6539993286132812, + "learning_rate": 1.2734975649336385e-05, + "loss": 1.3605, + "step": 8178 + }, + { + "epoch": 0.447165917198584, + "grad_norm": 1.375654935836792, + "learning_rate": 1.2733217918600374e-05, + "loss": 1.6212, + "step": 8179 + }, + { + "epoch": 0.44722058964230554, + "grad_norm": 1.3831801414489746, + "learning_rate": 1.2731460096595802e-05, + "loss": 1.3459, + "step": 8180 + }, + { + "epoch": 0.4472752620860271, + "grad_norm": 1.6406824588775635, + "learning_rate": 1.272970218338137e-05, + "loss": 1.5953, + "step": 8181 + }, + { + "epoch": 0.4473299345297486, + "grad_norm": 1.7784641981124878, + "learning_rate": 1.2727944179015773e-05, + "loss": 1.1616, + "step": 8182 + }, + { + "epoch": 0.4473846069734702, + "grad_norm": 1.8152427673339844, + "learning_rate": 1.2726186083557719e-05, + "loss": 1.289, + "step": 8183 + }, + { + "epoch": 0.44743927941719175, + "grad_norm": 1.621306300163269, + "learning_rate": 1.2724427897065915e-05, + "loss": 1.3125, + "step": 8184 + }, + { + "epoch": 0.4474939518609133, + "grad_norm": 1.4791525602340698, + "learning_rate": 1.2722669619599068e-05, + "loss": 1.6682, + "step": 8185 + }, + { + "epoch": 0.4475486243046349, + "grad_norm": 1.7333461046218872, + "learning_rate": 1.2720911251215897e-05, + "loss": 1.3141, + "step": 8186 + }, + { + "epoch": 0.4476032967483564, + "grad_norm": 1.3498351573944092, + "learning_rate": 1.2719152791975113e-05, + "loss": 1.3845, + "step": 8187 + }, + { + "epoch": 0.44765796919207795, + "grad_norm": 1.4816136360168457, + "learning_rate": 1.2717394241935437e-05, + "loss": 1.2027, + "step": 8188 + }, + { + "epoch": 0.4477126416357995, + "grad_norm": 1.2127426862716675, + "learning_rate": 1.271563560115559e-05, + "loss": 1.465, + "step": 8189 + }, + { + "epoch": 0.4477673140795211, + "grad_norm": 1.5512415170669556, + "learning_rate": 1.2713876869694299e-05, + "loss": 1.3751, + "step": 8190 + }, + { + "epoch": 0.4478219865232426, + "grad_norm": 1.3821966648101807, + "learning_rate": 1.2712118047610291e-05, + "loss": 1.2678, + "step": 8191 + }, + { + "epoch": 0.44787665896696416, + "grad_norm": 2.309410810470581, + "learning_rate": 1.2710359134962295e-05, + "loss": 1.507, + "step": 8192 + }, + { + "epoch": 0.44793133141068575, + "grad_norm": 1.4607819318771362, + "learning_rate": 1.270860013180905e-05, + "loss": 1.2919, + "step": 8193 + }, + { + "epoch": 0.4479860038544073, + "grad_norm": 1.5119976997375488, + "learning_rate": 1.2706841038209293e-05, + "loss": 1.3834, + "step": 8194 + }, + { + "epoch": 0.4480406762981288, + "grad_norm": 1.772449254989624, + "learning_rate": 1.2705081854221758e-05, + "loss": 1.3522, + "step": 8195 + }, + { + "epoch": 0.44809534874185036, + "grad_norm": 1.4481263160705566, + "learning_rate": 1.2703322579905191e-05, + "loss": 1.409, + "step": 8196 + }, + { + "epoch": 0.44815002118557196, + "grad_norm": 1.5739508867263794, + "learning_rate": 1.2701563215318343e-05, + "loss": 1.3565, + "step": 8197 + }, + { + "epoch": 0.4482046936292935, + "grad_norm": 1.2894620895385742, + "learning_rate": 1.2699803760519955e-05, + "loss": 1.6649, + "step": 8198 + }, + { + "epoch": 0.44825936607301503, + "grad_norm": 1.581498384475708, + "learning_rate": 1.2698044215568787e-05, + "loss": 1.4402, + "step": 8199 + }, + { + "epoch": 0.4483140385167366, + "grad_norm": 1.2623968124389648, + "learning_rate": 1.2696284580523592e-05, + "loss": 1.6399, + "step": 8200 + }, + { + "epoch": 0.44836871096045816, + "grad_norm": 2.1075406074523926, + "learning_rate": 1.2694524855443131e-05, + "loss": 1.5184, + "step": 8201 + }, + { + "epoch": 0.4484233834041797, + "grad_norm": 1.7794867753982544, + "learning_rate": 1.2692765040386157e-05, + "loss": 1.4508, + "step": 8202 + }, + { + "epoch": 0.44847805584790124, + "grad_norm": 1.681531310081482, + "learning_rate": 1.269100513541144e-05, + "loss": 1.3669, + "step": 8203 + }, + { + "epoch": 0.44853272829162283, + "grad_norm": 1.5385578870773315, + "learning_rate": 1.268924514057775e-05, + "loss": 1.6002, + "step": 8204 + }, + { + "epoch": 0.44858740073534437, + "grad_norm": 1.544387936592102, + "learning_rate": 1.2687485055943852e-05, + "loss": 1.4342, + "step": 8205 + }, + { + "epoch": 0.4486420731790659, + "grad_norm": 1.5841013193130493, + "learning_rate": 1.2685724881568522e-05, + "loss": 1.3044, + "step": 8206 + }, + { + "epoch": 0.4486967456227875, + "grad_norm": 1.461909294128418, + "learning_rate": 1.2683964617510536e-05, + "loss": 1.319, + "step": 8207 + }, + { + "epoch": 0.44875141806650903, + "grad_norm": 1.4264962673187256, + "learning_rate": 1.2682204263828675e-05, + "loss": 1.5861, + "step": 8208 + }, + { + "epoch": 0.44880609051023057, + "grad_norm": 1.1260265111923218, + "learning_rate": 1.2680443820581717e-05, + "loss": 1.4226, + "step": 8209 + }, + { + "epoch": 0.4488607629539521, + "grad_norm": 1.2951370477676392, + "learning_rate": 1.2678683287828451e-05, + "loss": 1.3752, + "step": 8210 + }, + { + "epoch": 0.4489154353976737, + "grad_norm": 1.8305498361587524, + "learning_rate": 1.2676922665627664e-05, + "loss": 1.5361, + "step": 8211 + }, + { + "epoch": 0.44897010784139524, + "grad_norm": 1.517006516456604, + "learning_rate": 1.2675161954038147e-05, + "loss": 1.2312, + "step": 8212 + }, + { + "epoch": 0.4490247802851168, + "grad_norm": 2.0678322315216064, + "learning_rate": 1.2673401153118699e-05, + "loss": 1.3826, + "step": 8213 + }, + { + "epoch": 0.44907945272883837, + "grad_norm": 1.7050068378448486, + "learning_rate": 1.2671640262928109e-05, + "loss": 1.4954, + "step": 8214 + }, + { + "epoch": 0.4491341251725599, + "grad_norm": 1.7713968753814697, + "learning_rate": 1.2669879283525182e-05, + "loss": 1.5162, + "step": 8215 + }, + { + "epoch": 0.44918879761628144, + "grad_norm": 1.6035246849060059, + "learning_rate": 1.2668118214968721e-05, + "loss": 1.1989, + "step": 8216 + }, + { + "epoch": 0.449243470060003, + "grad_norm": 2.751030445098877, + "learning_rate": 1.266635705731753e-05, + "loss": 1.4042, + "step": 8217 + }, + { + "epoch": 0.4492981425037246, + "grad_norm": 1.2930971384048462, + "learning_rate": 1.2664595810630424e-05, + "loss": 1.4746, + "step": 8218 + }, + { + "epoch": 0.4493528149474461, + "grad_norm": 1.304875373840332, + "learning_rate": 1.2662834474966208e-05, + "loss": 1.7743, + "step": 8219 + }, + { + "epoch": 0.44940748739116765, + "grad_norm": 1.4794212579727173, + "learning_rate": 1.2661073050383701e-05, + "loss": 1.4226, + "step": 8220 + }, + { + "epoch": 0.44946215983488924, + "grad_norm": 1.6423336267471313, + "learning_rate": 1.2659311536941721e-05, + "loss": 1.4481, + "step": 8221 + }, + { + "epoch": 0.4495168322786108, + "grad_norm": 1.5234434604644775, + "learning_rate": 1.265754993469909e-05, + "loss": 1.53, + "step": 8222 + }, + { + "epoch": 0.4495715047223323, + "grad_norm": 1.6022552251815796, + "learning_rate": 1.2655788243714629e-05, + "loss": 1.392, + "step": 8223 + }, + { + "epoch": 0.44962617716605385, + "grad_norm": 1.805836796760559, + "learning_rate": 1.2654026464047165e-05, + "loss": 1.5416, + "step": 8224 + }, + { + "epoch": 0.44968084960977545, + "grad_norm": 1.6202785968780518, + "learning_rate": 1.2652264595755532e-05, + "loss": 1.4633, + "step": 8225 + }, + { + "epoch": 0.449735522053497, + "grad_norm": 1.8353651762008667, + "learning_rate": 1.265050263889856e-05, + "loss": 1.424, + "step": 8226 + }, + { + "epoch": 0.4497901944972185, + "grad_norm": 2.071650505065918, + "learning_rate": 1.2648740593535084e-05, + "loss": 1.5241, + "step": 8227 + }, + { + "epoch": 0.4498448669409401, + "grad_norm": 1.4357300996780396, + "learning_rate": 1.2646978459723945e-05, + "loss": 1.19, + "step": 8228 + }, + { + "epoch": 0.44989953938466165, + "grad_norm": 1.656162977218628, + "learning_rate": 1.2645216237523986e-05, + "loss": 1.7556, + "step": 8229 + }, + { + "epoch": 0.4499542118283832, + "grad_norm": 1.5217256546020508, + "learning_rate": 1.2643453926994045e-05, + "loss": 1.3288, + "step": 8230 + }, + { + "epoch": 0.4500088842721047, + "grad_norm": 1.5292726755142212, + "learning_rate": 1.2641691528192976e-05, + "loss": 1.3289, + "step": 8231 + }, + { + "epoch": 0.4500635567158263, + "grad_norm": 1.3378609418869019, + "learning_rate": 1.2639929041179628e-05, + "loss": 1.5199, + "step": 8232 + }, + { + "epoch": 0.45011822915954786, + "grad_norm": 1.7221710681915283, + "learning_rate": 1.2638166466012858e-05, + "loss": 1.4657, + "step": 8233 + }, + { + "epoch": 0.4501729016032694, + "grad_norm": 1.6749554872512817, + "learning_rate": 1.2636403802751516e-05, + "loss": 1.3539, + "step": 8234 + }, + { + "epoch": 0.450227574046991, + "grad_norm": 1.606581211090088, + "learning_rate": 1.2634641051454461e-05, + "loss": 1.5859, + "step": 8235 + }, + { + "epoch": 0.4502822464907125, + "grad_norm": 2.227525234222412, + "learning_rate": 1.2632878212180566e-05, + "loss": 1.4689, + "step": 8236 + }, + { + "epoch": 0.45033691893443406, + "grad_norm": 1.491117238998413, + "learning_rate": 1.2631115284988685e-05, + "loss": 1.4109, + "step": 8237 + }, + { + "epoch": 0.4503915913781556, + "grad_norm": 1.5650933980941772, + "learning_rate": 1.262935226993769e-05, + "loss": 1.3454, + "step": 8238 + }, + { + "epoch": 0.4504462638218772, + "grad_norm": 1.2373417615890503, + "learning_rate": 1.2627589167086455e-05, + "loss": 1.2217, + "step": 8239 + }, + { + "epoch": 0.45050093626559873, + "grad_norm": 1.736806035041809, + "learning_rate": 1.2625825976493853e-05, + "loss": 1.4097, + "step": 8240 + }, + { + "epoch": 0.45055560870932027, + "grad_norm": 1.5745073556900024, + "learning_rate": 1.2624062698218755e-05, + "loss": 1.4714, + "step": 8241 + }, + { + "epoch": 0.45061028115304186, + "grad_norm": 1.3235257863998413, + "learning_rate": 1.262229933232005e-05, + "loss": 1.5481, + "step": 8242 + }, + { + "epoch": 0.4506649535967634, + "grad_norm": 1.7171540260314941, + "learning_rate": 1.2620535878856617e-05, + "loss": 1.2939, + "step": 8243 + }, + { + "epoch": 0.45071962604048493, + "grad_norm": 1.5141083002090454, + "learning_rate": 1.261877233788734e-05, + "loss": 1.3062, + "step": 8244 + }, + { + "epoch": 0.45077429848420647, + "grad_norm": 1.5138897895812988, + "learning_rate": 1.261700870947111e-05, + "loss": 1.3562, + "step": 8245 + }, + { + "epoch": 0.45082897092792806, + "grad_norm": 1.2657006978988647, + "learning_rate": 1.261524499366682e-05, + "loss": 1.1567, + "step": 8246 + }, + { + "epoch": 0.4508836433716496, + "grad_norm": 1.3661819696426392, + "learning_rate": 1.2613481190533362e-05, + "loss": 1.2943, + "step": 8247 + }, + { + "epoch": 0.45093831581537114, + "grad_norm": 1.6761592626571655, + "learning_rate": 1.2611717300129631e-05, + "loss": 1.3673, + "step": 8248 + }, + { + "epoch": 0.45099298825909273, + "grad_norm": 1.2274795770645142, + "learning_rate": 1.2609953322514531e-05, + "loss": 1.4674, + "step": 8249 + }, + { + "epoch": 0.45104766070281427, + "grad_norm": 1.248929500579834, + "learning_rate": 1.260818925774697e-05, + "loss": 1.3502, + "step": 8250 + }, + { + "epoch": 0.4511023331465358, + "grad_norm": 1.5713574886322021, + "learning_rate": 1.2606425105885844e-05, + "loss": 1.3064, + "step": 8251 + }, + { + "epoch": 0.45115700559025734, + "grad_norm": 1.5409235954284668, + "learning_rate": 1.2604660866990072e-05, + "loss": 1.44, + "step": 8252 + }, + { + "epoch": 0.45121167803397894, + "grad_norm": 1.7207536697387695, + "learning_rate": 1.2602896541118562e-05, + "loss": 1.2719, + "step": 8253 + }, + { + "epoch": 0.4512663504777005, + "grad_norm": 1.605028748512268, + "learning_rate": 1.2601132128330224e-05, + "loss": 1.2985, + "step": 8254 + }, + { + "epoch": 0.451321022921422, + "grad_norm": 2.041771411895752, + "learning_rate": 1.2599367628683982e-05, + "loss": 1.6074, + "step": 8255 + }, + { + "epoch": 0.4513756953651436, + "grad_norm": 1.3780932426452637, + "learning_rate": 1.2597603042238756e-05, + "loss": 1.3875, + "step": 8256 + }, + { + "epoch": 0.45143036780886514, + "grad_norm": 1.5755771398544312, + "learning_rate": 1.2595838369053471e-05, + "loss": 1.532, + "step": 8257 + }, + { + "epoch": 0.4514850402525867, + "grad_norm": 1.4916585683822632, + "learning_rate": 1.2594073609187047e-05, + "loss": 1.2428, + "step": 8258 + }, + { + "epoch": 0.45153971269630827, + "grad_norm": 1.656479001045227, + "learning_rate": 1.2592308762698422e-05, + "loss": 1.9466, + "step": 8259 + }, + { + "epoch": 0.4515943851400298, + "grad_norm": 1.8219188451766968, + "learning_rate": 1.2590543829646524e-05, + "loss": 1.4594, + "step": 8260 + }, + { + "epoch": 0.45164905758375135, + "grad_norm": 1.0919530391693115, + "learning_rate": 1.2588778810090288e-05, + "loss": 1.3476, + "step": 8261 + }, + { + "epoch": 0.4517037300274729, + "grad_norm": 1.569446086883545, + "learning_rate": 1.258701370408865e-05, + "loss": 1.4318, + "step": 8262 + }, + { + "epoch": 0.4517584024711945, + "grad_norm": 1.781261920928955, + "learning_rate": 1.2585248511700556e-05, + "loss": 1.7273, + "step": 8263 + }, + { + "epoch": 0.451813074914916, + "grad_norm": 1.3887330293655396, + "learning_rate": 1.258348323298495e-05, + "loss": 1.4251, + "step": 8264 + }, + { + "epoch": 0.45186774735863755, + "grad_norm": 1.2643119096755981, + "learning_rate": 1.2581717868000775e-05, + "loss": 1.3498, + "step": 8265 + }, + { + "epoch": 0.45192241980235914, + "grad_norm": 1.3912986516952515, + "learning_rate": 1.257995241680698e-05, + "loss": 1.2953, + "step": 8266 + }, + { + "epoch": 0.4519770922460807, + "grad_norm": 1.6444356441497803, + "learning_rate": 1.2578186879462525e-05, + "loss": 1.163, + "step": 8267 + }, + { + "epoch": 0.4520317646898022, + "grad_norm": 1.3368147611618042, + "learning_rate": 1.2576421256026355e-05, + "loss": 1.543, + "step": 8268 + }, + { + "epoch": 0.45208643713352376, + "grad_norm": 1.9492864608764648, + "learning_rate": 1.2574655546557432e-05, + "loss": 1.3699, + "step": 8269 + }, + { + "epoch": 0.45214110957724535, + "grad_norm": 1.565609335899353, + "learning_rate": 1.257288975111472e-05, + "loss": 1.4704, + "step": 8270 + }, + { + "epoch": 0.4521957820209669, + "grad_norm": 1.4092614650726318, + "learning_rate": 1.2571123869757186e-05, + "loss": 1.7815, + "step": 8271 + }, + { + "epoch": 0.4522504544646884, + "grad_norm": 1.6454740762710571, + "learning_rate": 1.2569357902543793e-05, + "loss": 1.674, + "step": 8272 + }, + { + "epoch": 0.45230512690841, + "grad_norm": 1.7222377061843872, + "learning_rate": 1.2567591849533507e-05, + "loss": 1.3575, + "step": 8273 + }, + { + "epoch": 0.45235979935213155, + "grad_norm": 1.486137866973877, + "learning_rate": 1.2565825710785305e-05, + "loss": 1.2162, + "step": 8274 + }, + { + "epoch": 0.4524144717958531, + "grad_norm": 1.3720372915267944, + "learning_rate": 1.256405948635816e-05, + "loss": 1.5344, + "step": 8275 + }, + { + "epoch": 0.4524691442395746, + "grad_norm": 1.688270092010498, + "learning_rate": 1.2562293176311054e-05, + "loss": 1.4013, + "step": 8276 + }, + { + "epoch": 0.4525238166832962, + "grad_norm": 1.448646903038025, + "learning_rate": 1.2560526780702963e-05, + "loss": 1.3119, + "step": 8277 + }, + { + "epoch": 0.45257848912701776, + "grad_norm": 1.3954297304153442, + "learning_rate": 1.255876029959288e-05, + "loss": 1.5029, + "step": 8278 + }, + { + "epoch": 0.4526331615707393, + "grad_norm": 1.5546889305114746, + "learning_rate": 1.2556993733039785e-05, + "loss": 1.7311, + "step": 8279 + }, + { + "epoch": 0.4526878340144609, + "grad_norm": 1.519081473350525, + "learning_rate": 1.2555227081102663e-05, + "loss": 1.5059, + "step": 8280 + }, + { + "epoch": 0.4527425064581824, + "grad_norm": 1.3438341617584229, + "learning_rate": 1.255346034384052e-05, + "loss": 1.8253, + "step": 8281 + }, + { + "epoch": 0.45279717890190396, + "grad_norm": 1.89476478099823, + "learning_rate": 1.2551693521312341e-05, + "loss": 1.353, + "step": 8282 + }, + { + "epoch": 0.4528518513456255, + "grad_norm": 1.7865662574768066, + "learning_rate": 1.2549926613577126e-05, + "loss": 1.5819, + "step": 8283 + }, + { + "epoch": 0.4529065237893471, + "grad_norm": 1.4359074831008911, + "learning_rate": 1.2548159620693881e-05, + "loss": 1.3895, + "step": 8284 + }, + { + "epoch": 0.45296119623306863, + "grad_norm": 1.8334403038024902, + "learning_rate": 1.2546392542721606e-05, + "loss": 1.439, + "step": 8285 + }, + { + "epoch": 0.45301586867679017, + "grad_norm": 1.4337259531021118, + "learning_rate": 1.2544625379719305e-05, + "loss": 1.5026, + "step": 8286 + }, + { + "epoch": 0.45307054112051176, + "grad_norm": 1.5983964204788208, + "learning_rate": 1.2542858131745997e-05, + "loss": 1.6266, + "step": 8287 + }, + { + "epoch": 0.4531252135642333, + "grad_norm": 1.457296371459961, + "learning_rate": 1.2541090798860686e-05, + "loss": 1.3777, + "step": 8288 + }, + { + "epoch": 0.45317988600795484, + "grad_norm": 1.446030855178833, + "learning_rate": 1.253932338112239e-05, + "loss": 1.2202, + "step": 8289 + }, + { + "epoch": 0.4532345584516764, + "grad_norm": 1.732521653175354, + "learning_rate": 1.2537555878590126e-05, + "loss": 1.5185, + "step": 8290 + }, + { + "epoch": 0.45328923089539797, + "grad_norm": 1.7114779949188232, + "learning_rate": 1.2535788291322921e-05, + "loss": 1.3594, + "step": 8291 + }, + { + "epoch": 0.4533439033391195, + "grad_norm": 1.3434600830078125, + "learning_rate": 1.2534020619379794e-05, + "loss": 1.4534, + "step": 8292 + }, + { + "epoch": 0.45339857578284104, + "grad_norm": 1.4166096448898315, + "learning_rate": 1.2532252862819772e-05, + "loss": 1.3732, + "step": 8293 + }, + { + "epoch": 0.45345324822656263, + "grad_norm": 1.7464346885681152, + "learning_rate": 1.253048502170188e-05, + "loss": 1.4717, + "step": 8294 + }, + { + "epoch": 0.45350792067028417, + "grad_norm": 1.374466061592102, + "learning_rate": 1.2528717096085162e-05, + "loss": 1.4543, + "step": 8295 + }, + { + "epoch": 0.4535625931140057, + "grad_norm": 1.5717560052871704, + "learning_rate": 1.2526949086028641e-05, + "loss": 1.4058, + "step": 8296 + }, + { + "epoch": 0.45361726555772725, + "grad_norm": 1.7483829259872437, + "learning_rate": 1.2525180991591363e-05, + "loss": 1.4791, + "step": 8297 + }, + { + "epoch": 0.45367193800144884, + "grad_norm": 2.9513299465179443, + "learning_rate": 1.2523412812832368e-05, + "loss": 1.4419, + "step": 8298 + }, + { + "epoch": 0.4537266104451704, + "grad_norm": 1.323081374168396, + "learning_rate": 1.2521644549810695e-05, + "loss": 1.1831, + "step": 8299 + }, + { + "epoch": 0.4537812828888919, + "grad_norm": 1.607689380645752, + "learning_rate": 1.2519876202585393e-05, + "loss": 1.4806, + "step": 8300 + }, + { + "epoch": 0.4538359553326135, + "grad_norm": 1.5162721872329712, + "learning_rate": 1.2518107771215511e-05, + "loss": 1.4913, + "step": 8301 + }, + { + "epoch": 0.45389062777633504, + "grad_norm": 1.790871024131775, + "learning_rate": 1.2516339255760103e-05, + "loss": 1.5832, + "step": 8302 + }, + { + "epoch": 0.4539453002200566, + "grad_norm": 2.4843857288360596, + "learning_rate": 1.2514570656278222e-05, + "loss": 1.3947, + "step": 8303 + }, + { + "epoch": 0.4539999726637781, + "grad_norm": 1.8452470302581787, + "learning_rate": 1.2512801972828927e-05, + "loss": 1.4421, + "step": 8304 + }, + { + "epoch": 0.4540546451074997, + "grad_norm": 1.300421118736267, + "learning_rate": 1.2511033205471277e-05, + "loss": 1.4368, + "step": 8305 + }, + { + "epoch": 0.45410931755122125, + "grad_norm": 1.7986170053482056, + "learning_rate": 1.250926435426434e-05, + "loss": 1.8972, + "step": 8306 + }, + { + "epoch": 0.4541639899949428, + "grad_norm": 1.6761164665222168, + "learning_rate": 1.2507495419267173e-05, + "loss": 1.6137, + "step": 8307 + }, + { + "epoch": 0.4542186624386644, + "grad_norm": 1.754149079322815, + "learning_rate": 1.2505726400538849e-05, + "loss": 1.4785, + "step": 8308 + }, + { + "epoch": 0.4542733348823859, + "grad_norm": 1.9348891973495483, + "learning_rate": 1.2503957298138443e-05, + "loss": 1.2974, + "step": 8309 + }, + { + "epoch": 0.45432800732610745, + "grad_norm": 1.686022400856018, + "learning_rate": 1.2502188112125027e-05, + "loss": 1.6489, + "step": 8310 + }, + { + "epoch": 0.454382679769829, + "grad_norm": 1.3113348484039307, + "learning_rate": 1.2500418842557678e-05, + "loss": 1.6101, + "step": 8311 + }, + { + "epoch": 0.4544373522135506, + "grad_norm": 1.5728421211242676, + "learning_rate": 1.2498649489495476e-05, + "loss": 1.612, + "step": 8312 + }, + { + "epoch": 0.4544920246572721, + "grad_norm": 1.8114333152770996, + "learning_rate": 1.2496880052997507e-05, + "loss": 1.3295, + "step": 8313 + }, + { + "epoch": 0.45454669710099366, + "grad_norm": 1.2129781246185303, + "learning_rate": 1.249511053312285e-05, + "loss": 1.2514, + "step": 8314 + }, + { + "epoch": 0.45460136954471525, + "grad_norm": 1.6758756637573242, + "learning_rate": 1.24933409299306e-05, + "loss": 1.3391, + "step": 8315 + }, + { + "epoch": 0.4546560419884368, + "grad_norm": 1.4929262399673462, + "learning_rate": 1.2491571243479846e-05, + "loss": 1.3428, + "step": 8316 + }, + { + "epoch": 0.4547107144321583, + "grad_norm": 1.8459579944610596, + "learning_rate": 1.248980147382968e-05, + "loss": 1.3683, + "step": 8317 + }, + { + "epoch": 0.45476538687587986, + "grad_norm": 1.5676274299621582, + "learning_rate": 1.24880316210392e-05, + "loss": 1.5675, + "step": 8318 + }, + { + "epoch": 0.45482005931960146, + "grad_norm": 1.6144734621047974, + "learning_rate": 1.2486261685167507e-05, + "loss": 1.3831, + "step": 8319 + }, + { + "epoch": 0.454874731763323, + "grad_norm": 2.3927063941955566, + "learning_rate": 1.2484491666273701e-05, + "loss": 1.3431, + "step": 8320 + }, + { + "epoch": 0.45492940420704453, + "grad_norm": 1.5206681489944458, + "learning_rate": 1.2482721564416887e-05, + "loss": 1.358, + "step": 8321 + }, + { + "epoch": 0.4549840766507661, + "grad_norm": 1.7976808547973633, + "learning_rate": 1.2480951379656175e-05, + "loss": 1.2829, + "step": 8322 + }, + { + "epoch": 0.45503874909448766, + "grad_norm": 1.6753602027893066, + "learning_rate": 1.2479181112050677e-05, + "loss": 1.4123, + "step": 8323 + }, + { + "epoch": 0.4550934215382092, + "grad_norm": 1.1780768632888794, + "learning_rate": 1.2477410761659503e-05, + "loss": 1.5013, + "step": 8324 + }, + { + "epoch": 0.45514809398193073, + "grad_norm": 1.9033538103103638, + "learning_rate": 1.247564032854177e-05, + "loss": 1.2194, + "step": 8325 + }, + { + "epoch": 0.4552027664256523, + "grad_norm": 1.7789602279663086, + "learning_rate": 1.2473869812756598e-05, + "loss": 1.3629, + "step": 8326 + }, + { + "epoch": 0.45525743886937386, + "grad_norm": 1.891589879989624, + "learning_rate": 1.2472099214363105e-05, + "loss": 1.526, + "step": 8327 + }, + { + "epoch": 0.4553121113130954, + "grad_norm": 1.4563053846359253, + "learning_rate": 1.247032853342042e-05, + "loss": 1.2325, + "step": 8328 + }, + { + "epoch": 0.455366783756817, + "grad_norm": 1.7771772146224976, + "learning_rate": 1.2468557769987667e-05, + "loss": 1.0632, + "step": 8329 + }, + { + "epoch": 0.45542145620053853, + "grad_norm": 1.4046615362167358, + "learning_rate": 1.246678692412398e-05, + "loss": 1.5461, + "step": 8330 + }, + { + "epoch": 0.45547612864426007, + "grad_norm": 1.5650007724761963, + "learning_rate": 1.2465015995888489e-05, + "loss": 1.3957, + "step": 8331 + }, + { + "epoch": 0.4555308010879816, + "grad_norm": 1.580884575843811, + "learning_rate": 1.2463244985340329e-05, + "loss": 1.3932, + "step": 8332 + }, + { + "epoch": 0.4555854735317032, + "grad_norm": 1.495091199874878, + "learning_rate": 1.2461473892538637e-05, + "loss": 1.2681, + "step": 8333 + }, + { + "epoch": 0.45564014597542474, + "grad_norm": 1.541108250617981, + "learning_rate": 1.245970271754256e-05, + "loss": 1.4634, + "step": 8334 + }, + { + "epoch": 0.4556948184191463, + "grad_norm": 1.336387276649475, + "learning_rate": 1.2457931460411233e-05, + "loss": 1.4551, + "step": 8335 + }, + { + "epoch": 0.45574949086286787, + "grad_norm": 1.5656036138534546, + "learning_rate": 1.2456160121203808e-05, + "loss": 1.5999, + "step": 8336 + }, + { + "epoch": 0.4558041633065894, + "grad_norm": 1.6777198314666748, + "learning_rate": 1.2454388699979435e-05, + "loss": 1.241, + "step": 8337 + }, + { + "epoch": 0.45585883575031094, + "grad_norm": 1.3736441135406494, + "learning_rate": 1.2452617196797261e-05, + "loss": 1.4922, + "step": 8338 + }, + { + "epoch": 0.4559135081940325, + "grad_norm": 1.5638304948806763, + "learning_rate": 1.2450845611716441e-05, + "loss": 1.4428, + "step": 8339 + }, + { + "epoch": 0.4559681806377541, + "grad_norm": 1.4543014764785767, + "learning_rate": 1.2449073944796142e-05, + "loss": 1.4016, + "step": 8340 + }, + { + "epoch": 0.4560228530814756, + "grad_norm": 1.3712143898010254, + "learning_rate": 1.2447302196095512e-05, + "loss": 1.2344, + "step": 8341 + }, + { + "epoch": 0.45607752552519715, + "grad_norm": 1.5584003925323486, + "learning_rate": 1.2445530365673722e-05, + "loss": 1.3867, + "step": 8342 + }, + { + "epoch": 0.45613219796891874, + "grad_norm": 1.3425966501235962, + "learning_rate": 1.2443758453589934e-05, + "loss": 1.4975, + "step": 8343 + }, + { + "epoch": 0.4561868704126403, + "grad_norm": 1.8042923212051392, + "learning_rate": 1.2441986459903315e-05, + "loss": 1.4056, + "step": 8344 + }, + { + "epoch": 0.4562415428563618, + "grad_norm": 1.311920166015625, + "learning_rate": 1.244021438467304e-05, + "loss": 1.2611, + "step": 8345 + }, + { + "epoch": 0.45629621530008335, + "grad_norm": 1.2409799098968506, + "learning_rate": 1.2438442227958277e-05, + "loss": 1.3185, + "step": 8346 + }, + { + "epoch": 0.45635088774380494, + "grad_norm": 1.4528206586837769, + "learning_rate": 1.2436669989818209e-05, + "loss": 1.3424, + "step": 8347 + }, + { + "epoch": 0.4564055601875265, + "grad_norm": 1.219077229499817, + "learning_rate": 1.2434897670312012e-05, + "loss": 1.6147, + "step": 8348 + }, + { + "epoch": 0.456460232631248, + "grad_norm": 1.2857403755187988, + "learning_rate": 1.2433125269498865e-05, + "loss": 1.2847, + "step": 8349 + }, + { + "epoch": 0.4565149050749696, + "grad_norm": 1.1437569856643677, + "learning_rate": 1.243135278743796e-05, + "loss": 1.427, + "step": 8350 + }, + { + "epoch": 0.45656957751869115, + "grad_norm": 1.285189151763916, + "learning_rate": 1.242958022418848e-05, + "loss": 1.5206, + "step": 8351 + }, + { + "epoch": 0.4566242499624127, + "grad_norm": 1.6053428649902344, + "learning_rate": 1.2427807579809611e-05, + "loss": 1.5435, + "step": 8352 + }, + { + "epoch": 0.4566789224061342, + "grad_norm": 1.662338376045227, + "learning_rate": 1.2426034854360554e-05, + "loss": 1.5706, + "step": 8353 + }, + { + "epoch": 0.4567335948498558, + "grad_norm": 1.3191434144973755, + "learning_rate": 1.24242620479005e-05, + "loss": 1.2751, + "step": 8354 + }, + { + "epoch": 0.45678826729357735, + "grad_norm": 1.7367727756500244, + "learning_rate": 1.2422489160488644e-05, + "loss": 1.1898, + "step": 8355 + }, + { + "epoch": 0.4568429397372989, + "grad_norm": 1.9897300004959106, + "learning_rate": 1.2420716192184195e-05, + "loss": 1.5422, + "step": 8356 + }, + { + "epoch": 0.4568976121810205, + "grad_norm": 1.6305891275405884, + "learning_rate": 1.2418943143046346e-05, + "loss": 1.374, + "step": 8357 + }, + { + "epoch": 0.456952284624742, + "grad_norm": 1.8578996658325195, + "learning_rate": 1.2417170013134315e-05, + "loss": 1.4755, + "step": 8358 + }, + { + "epoch": 0.45700695706846356, + "grad_norm": 1.2320195436477661, + "learning_rate": 1.2415396802507302e-05, + "loss": 1.3902, + "step": 8359 + }, + { + "epoch": 0.4570616295121851, + "grad_norm": 1.6764277219772339, + "learning_rate": 1.2413623511224522e-05, + "loss": 1.3743, + "step": 8360 + }, + { + "epoch": 0.4571163019559067, + "grad_norm": 1.3351614475250244, + "learning_rate": 1.2411850139345192e-05, + "loss": 1.7356, + "step": 8361 + }, + { + "epoch": 0.4571709743996282, + "grad_norm": 2.0786612033843994, + "learning_rate": 1.2410076686928522e-05, + "loss": 1.5056, + "step": 8362 + }, + { + "epoch": 0.45722564684334976, + "grad_norm": 1.5572196245193481, + "learning_rate": 1.240830315403374e-05, + "loss": 1.5687, + "step": 8363 + }, + { + "epoch": 0.45728031928707136, + "grad_norm": 1.314110279083252, + "learning_rate": 1.2406529540720063e-05, + "loss": 1.7501, + "step": 8364 + }, + { + "epoch": 0.4573349917307929, + "grad_norm": 1.6105420589447021, + "learning_rate": 1.2404755847046717e-05, + "loss": 1.3473, + "step": 8365 + }, + { + "epoch": 0.45738966417451443, + "grad_norm": 1.2784370183944702, + "learning_rate": 1.2402982073072931e-05, + "loss": 1.6397, + "step": 8366 + }, + { + "epoch": 0.45744433661823597, + "grad_norm": 1.4153597354888916, + "learning_rate": 1.2401208218857932e-05, + "loss": 1.548, + "step": 8367 + }, + { + "epoch": 0.45749900906195756, + "grad_norm": 1.6088871955871582, + "learning_rate": 1.239943428446096e-05, + "loss": 1.6349, + "step": 8368 + }, + { + "epoch": 0.4575536815056791, + "grad_norm": 2.5364208221435547, + "learning_rate": 1.2397660269941246e-05, + "loss": 1.2238, + "step": 8369 + }, + { + "epoch": 0.45760835394940064, + "grad_norm": 1.3349566459655762, + "learning_rate": 1.2395886175358027e-05, + "loss": 1.5126, + "step": 8370 + }, + { + "epoch": 0.45766302639312223, + "grad_norm": 1.7469148635864258, + "learning_rate": 1.239411200077055e-05, + "loss": 1.4198, + "step": 8371 + }, + { + "epoch": 0.45771769883684377, + "grad_norm": 1.3258064985275269, + "learning_rate": 1.2392337746238052e-05, + "loss": 1.4459, + "step": 8372 + }, + { + "epoch": 0.4577723712805653, + "grad_norm": 1.6229679584503174, + "learning_rate": 1.2390563411819786e-05, + "loss": 1.3063, + "step": 8373 + }, + { + "epoch": 0.45782704372428684, + "grad_norm": 1.7157163619995117, + "learning_rate": 1.2388788997574994e-05, + "loss": 1.7369, + "step": 8374 + }, + { + "epoch": 0.45788171616800843, + "grad_norm": 1.5989396572113037, + "learning_rate": 1.2387014503562935e-05, + "loss": 1.6477, + "step": 8375 + }, + { + "epoch": 0.45793638861172997, + "grad_norm": 2.1151185035705566, + "learning_rate": 1.238523992984286e-05, + "loss": 1.3782, + "step": 8376 + }, + { + "epoch": 0.4579910610554515, + "grad_norm": 1.9971129894256592, + "learning_rate": 1.2383465276474024e-05, + "loss": 1.4396, + "step": 8377 + }, + { + "epoch": 0.4580457334991731, + "grad_norm": 1.4231928586959839, + "learning_rate": 1.2381690543515692e-05, + "loss": 1.4973, + "step": 8378 + }, + { + "epoch": 0.45810040594289464, + "grad_norm": 1.4247376918792725, + "learning_rate": 1.2379915731027126e-05, + "loss": 1.4031, + "step": 8379 + }, + { + "epoch": 0.4581550783866162, + "grad_norm": 1.3362243175506592, + "learning_rate": 1.2378140839067585e-05, + "loss": 1.6025, + "step": 8380 + }, + { + "epoch": 0.4582097508303377, + "grad_norm": 1.4144651889801025, + "learning_rate": 1.2376365867696341e-05, + "loss": 1.5837, + "step": 8381 + }, + { + "epoch": 0.4582644232740593, + "grad_norm": 1.4211645126342773, + "learning_rate": 1.2374590816972667e-05, + "loss": 1.3041, + "step": 8382 + }, + { + "epoch": 0.45831909571778084, + "grad_norm": 1.5745112895965576, + "learning_rate": 1.2372815686955835e-05, + "loss": 1.4476, + "step": 8383 + }, + { + "epoch": 0.4583737681615024, + "grad_norm": 1.432184100151062, + "learning_rate": 1.2371040477705113e-05, + "loss": 1.4202, + "step": 8384 + }, + { + "epoch": 0.458428440605224, + "grad_norm": 1.4704111814498901, + "learning_rate": 1.236926518927979e-05, + "loss": 1.4833, + "step": 8385 + }, + { + "epoch": 0.4584831130489455, + "grad_norm": 1.426781177520752, + "learning_rate": 1.2367489821739142e-05, + "loss": 1.5032, + "step": 8386 + }, + { + "epoch": 0.45853778549266705, + "grad_norm": 1.3691129684448242, + "learning_rate": 1.2365714375142452e-05, + "loss": 1.4765, + "step": 8387 + }, + { + "epoch": 0.4585924579363886, + "grad_norm": 1.532753348350525, + "learning_rate": 1.236393884954901e-05, + "loss": 1.507, + "step": 8388 + }, + { + "epoch": 0.4586471303801102, + "grad_norm": 1.5450760126113892, + "learning_rate": 1.2362163245018104e-05, + "loss": 1.6232, + "step": 8389 + }, + { + "epoch": 0.4587018028238317, + "grad_norm": 1.5565025806427002, + "learning_rate": 1.2360387561609021e-05, + "loss": 1.5948, + "step": 8390 + }, + { + "epoch": 0.45875647526755325, + "grad_norm": 1.5697392225265503, + "learning_rate": 1.2358611799381058e-05, + "loss": 1.4296, + "step": 8391 + }, + { + "epoch": 0.45881114771127485, + "grad_norm": 1.3369685411453247, + "learning_rate": 1.2356835958393513e-05, + "loss": 1.3123, + "step": 8392 + }, + { + "epoch": 0.4588658201549964, + "grad_norm": 2.5174367427825928, + "learning_rate": 1.2355060038705686e-05, + "loss": 1.4678, + "step": 8393 + }, + { + "epoch": 0.4589204925987179, + "grad_norm": 1.6191366910934448, + "learning_rate": 1.2353284040376878e-05, + "loss": 1.4057, + "step": 8394 + }, + { + "epoch": 0.45897516504243946, + "grad_norm": 1.585215449333191, + "learning_rate": 1.2351507963466394e-05, + "loss": 1.5887, + "step": 8395 + }, + { + "epoch": 0.45902983748616105, + "grad_norm": 1.3529409170150757, + "learning_rate": 1.2349731808033542e-05, + "loss": 1.5, + "step": 8396 + }, + { + "epoch": 0.4590845099298826, + "grad_norm": 1.3745185136795044, + "learning_rate": 1.2347955574137629e-05, + "loss": 1.2765, + "step": 8397 + }, + { + "epoch": 0.4591391823736041, + "grad_norm": 1.3768054246902466, + "learning_rate": 1.234617926183797e-05, + "loss": 1.5846, + "step": 8398 + }, + { + "epoch": 0.4591938548173257, + "grad_norm": 2.195627212524414, + "learning_rate": 1.2344402871193876e-05, + "loss": 1.2642, + "step": 8399 + }, + { + "epoch": 0.45924852726104726, + "grad_norm": 1.4450452327728271, + "learning_rate": 1.2342626402264677e-05, + "loss": 1.6837, + "step": 8400 + }, + { + "epoch": 0.4593031997047688, + "grad_norm": 1.736683964729309, + "learning_rate": 1.234084985510968e-05, + "loss": 1.4361, + "step": 8401 + }, + { + "epoch": 0.45935787214849033, + "grad_norm": 1.9091906547546387, + "learning_rate": 1.2339073229788214e-05, + "loss": 1.4157, + "step": 8402 + }, + { + "epoch": 0.4594125445922119, + "grad_norm": 1.7995449304580688, + "learning_rate": 1.2337296526359608e-05, + "loss": 1.4758, + "step": 8403 + }, + { + "epoch": 0.45946721703593346, + "grad_norm": 1.3615905046463013, + "learning_rate": 1.2335519744883182e-05, + "loss": 1.5004, + "step": 8404 + }, + { + "epoch": 0.459521889479655, + "grad_norm": 1.3790394067764282, + "learning_rate": 1.233374288541827e-05, + "loss": 1.4761, + "step": 8405 + }, + { + "epoch": 0.4595765619233766, + "grad_norm": 1.3601573705673218, + "learning_rate": 1.2331965948024209e-05, + "loss": 1.6403, + "step": 8406 + }, + { + "epoch": 0.45963123436709813, + "grad_norm": 1.3208519220352173, + "learning_rate": 1.2330188932760333e-05, + "loss": 1.2699, + "step": 8407 + }, + { + "epoch": 0.45968590681081967, + "grad_norm": 1.757303237915039, + "learning_rate": 1.2328411839685984e-05, + "loss": 1.4726, + "step": 8408 + }, + { + "epoch": 0.4597405792545412, + "grad_norm": 1.3133596181869507, + "learning_rate": 1.2326634668860493e-05, + "loss": 1.2948, + "step": 8409 + }, + { + "epoch": 0.4597952516982628, + "grad_norm": 1.41152024269104, + "learning_rate": 1.2324857420343217e-05, + "loss": 1.2788, + "step": 8410 + }, + { + "epoch": 0.45984992414198433, + "grad_norm": 1.371870756149292, + "learning_rate": 1.2323080094193492e-05, + "loss": 1.5087, + "step": 8411 + }, + { + "epoch": 0.45990459658570587, + "grad_norm": 1.3306373357772827, + "learning_rate": 1.2321302690470671e-05, + "loss": 1.5188, + "step": 8412 + }, + { + "epoch": 0.45995926902942746, + "grad_norm": 1.859194278717041, + "learning_rate": 1.2319525209234109e-05, + "loss": 1.4573, + "step": 8413 + }, + { + "epoch": 0.460013941473149, + "grad_norm": 1.3921136856079102, + "learning_rate": 1.2317747650543158e-05, + "loss": 1.5135, + "step": 8414 + }, + { + "epoch": 0.46006861391687054, + "grad_norm": 1.278654932975769, + "learning_rate": 1.2315970014457172e-05, + "loss": 1.5707, + "step": 8415 + }, + { + "epoch": 0.4601232863605921, + "grad_norm": 1.5016807317733765, + "learning_rate": 1.2314192301035512e-05, + "loss": 1.3894, + "step": 8416 + }, + { + "epoch": 0.46017795880431367, + "grad_norm": 1.9089281558990479, + "learning_rate": 1.2312414510337543e-05, + "loss": 1.3373, + "step": 8417 + }, + { + "epoch": 0.4602326312480352, + "grad_norm": 1.8028810024261475, + "learning_rate": 1.2310636642422624e-05, + "loss": 1.7458, + "step": 8418 + }, + { + "epoch": 0.46028730369175674, + "grad_norm": 1.8102794885635376, + "learning_rate": 1.2308858697350128e-05, + "loss": 1.4956, + "step": 8419 + }, + { + "epoch": 0.46034197613547834, + "grad_norm": 1.4674571752548218, + "learning_rate": 1.230708067517942e-05, + "loss": 1.3517, + "step": 8420 + }, + { + "epoch": 0.4603966485791999, + "grad_norm": 1.6378602981567383, + "learning_rate": 1.2305302575969878e-05, + "loss": 1.4027, + "step": 8421 + }, + { + "epoch": 0.4604513210229214, + "grad_norm": 1.354552149772644, + "learning_rate": 1.2303524399780873e-05, + "loss": 1.5439, + "step": 8422 + }, + { + "epoch": 0.46050599346664295, + "grad_norm": 1.786779522895813, + "learning_rate": 1.230174614667178e-05, + "loss": 1.4288, + "step": 8423 + }, + { + "epoch": 0.46056066591036454, + "grad_norm": 1.8894975185394287, + "learning_rate": 1.2299967816701984e-05, + "loss": 1.4162, + "step": 8424 + }, + { + "epoch": 0.4606153383540861, + "grad_norm": 1.8494594097137451, + "learning_rate": 1.2298189409930863e-05, + "loss": 1.347, + "step": 8425 + }, + { + "epoch": 0.4606700107978076, + "grad_norm": 1.362141728401184, + "learning_rate": 1.2296410926417806e-05, + "loss": 1.3695, + "step": 8426 + }, + { + "epoch": 0.4607246832415292, + "grad_norm": 1.724429965019226, + "learning_rate": 1.2294632366222201e-05, + "loss": 1.3086, + "step": 8427 + }, + { + "epoch": 0.46077935568525075, + "grad_norm": 1.5567829608917236, + "learning_rate": 1.2292853729403437e-05, + "loss": 1.3715, + "step": 8428 + }, + { + "epoch": 0.4608340281289723, + "grad_norm": 1.6302694082260132, + "learning_rate": 1.2291075016020906e-05, + "loss": 1.7798, + "step": 8429 + }, + { + "epoch": 0.4608887005726938, + "grad_norm": 1.3816276788711548, + "learning_rate": 1.2289296226134002e-05, + "loss": 1.4541, + "step": 8430 + }, + { + "epoch": 0.4609433730164154, + "grad_norm": 1.8436386585235596, + "learning_rate": 1.2287517359802129e-05, + "loss": 1.3888, + "step": 8431 + }, + { + "epoch": 0.46099804546013695, + "grad_norm": 2.101267099380493, + "learning_rate": 1.2285738417084679e-05, + "loss": 1.5828, + "step": 8432 + }, + { + "epoch": 0.4610527179038585, + "grad_norm": 1.1412386894226074, + "learning_rate": 1.228395939804106e-05, + "loss": 1.5736, + "step": 8433 + }, + { + "epoch": 0.4611073903475801, + "grad_norm": 1.333156704902649, + "learning_rate": 1.2282180302730683e-05, + "loss": 1.5209, + "step": 8434 + }, + { + "epoch": 0.4611620627913016, + "grad_norm": 1.5951861143112183, + "learning_rate": 1.2280401131212945e-05, + "loss": 1.2764, + "step": 8435 + }, + { + "epoch": 0.46121673523502316, + "grad_norm": 1.5952123403549194, + "learning_rate": 1.2278621883547264e-05, + "loss": 1.4965, + "step": 8436 + }, + { + "epoch": 0.4612714076787447, + "grad_norm": 1.4512799978256226, + "learning_rate": 1.2276842559793049e-05, + "loss": 1.5151, + "step": 8437 + }, + { + "epoch": 0.4613260801224663, + "grad_norm": 1.4107816219329834, + "learning_rate": 1.2275063160009722e-05, + "loss": 1.5326, + "step": 8438 + }, + { + "epoch": 0.4613807525661878, + "grad_norm": 1.2537791728973389, + "learning_rate": 1.2273283684256695e-05, + "loss": 1.4968, + "step": 8439 + }, + { + "epoch": 0.46143542500990936, + "grad_norm": 2.0138747692108154, + "learning_rate": 1.2271504132593388e-05, + "loss": 1.3945, + "step": 8440 + }, + { + "epoch": 0.46149009745363095, + "grad_norm": 1.710965633392334, + "learning_rate": 1.2269724505079234e-05, + "loss": 1.5383, + "step": 8441 + }, + { + "epoch": 0.4615447698973525, + "grad_norm": 1.388253092765808, + "learning_rate": 1.226794480177365e-05, + "loss": 1.4358, + "step": 8442 + }, + { + "epoch": 0.46159944234107403, + "grad_norm": 1.2032523155212402, + "learning_rate": 1.2266165022736067e-05, + "loss": 1.478, + "step": 8443 + }, + { + "epoch": 0.46165411478479557, + "grad_norm": 1.398294448852539, + "learning_rate": 1.2264385168025917e-05, + "loss": 1.593, + "step": 8444 + }, + { + "epoch": 0.46170878722851716, + "grad_norm": 1.4966191053390503, + "learning_rate": 1.2262605237702631e-05, + "loss": 1.4695, + "step": 8445 + }, + { + "epoch": 0.4617634596722387, + "grad_norm": 1.7003390789031982, + "learning_rate": 1.2260825231825648e-05, + "loss": 1.5345, + "step": 8446 + }, + { + "epoch": 0.46181813211596023, + "grad_norm": 1.2505022287368774, + "learning_rate": 1.2259045150454407e-05, + "loss": 1.5618, + "step": 8447 + }, + { + "epoch": 0.4618728045596818, + "grad_norm": 1.266871690750122, + "learning_rate": 1.2257264993648345e-05, + "loss": 1.5471, + "step": 8448 + }, + { + "epoch": 0.46192747700340336, + "grad_norm": 1.7848550081253052, + "learning_rate": 1.225548476146691e-05, + "loss": 1.1692, + "step": 8449 + }, + { + "epoch": 0.4619821494471249, + "grad_norm": 1.7106941938400269, + "learning_rate": 1.2253704453969544e-05, + "loss": 1.4323, + "step": 8450 + }, + { + "epoch": 0.46203682189084644, + "grad_norm": 1.562269687652588, + "learning_rate": 1.2251924071215697e-05, + "loss": 1.4423, + "step": 8451 + }, + { + "epoch": 0.46209149433456803, + "grad_norm": 1.446526050567627, + "learning_rate": 1.2250143613264824e-05, + "loss": 1.4106, + "step": 8452 + }, + { + "epoch": 0.46214616677828957, + "grad_norm": 1.7221862077713013, + "learning_rate": 1.2248363080176373e-05, + "loss": 1.4508, + "step": 8453 + }, + { + "epoch": 0.4622008392220111, + "grad_norm": 1.5172096490859985, + "learning_rate": 1.2246582472009804e-05, + "loss": 1.2871, + "step": 8454 + }, + { + "epoch": 0.4622555116657327, + "grad_norm": 1.4341880083084106, + "learning_rate": 1.2244801788824577e-05, + "loss": 1.1388, + "step": 8455 + }, + { + "epoch": 0.46231018410945424, + "grad_norm": 1.5839085578918457, + "learning_rate": 1.2243021030680149e-05, + "loss": 1.348, + "step": 8456 + }, + { + "epoch": 0.4623648565531758, + "grad_norm": 1.5569982528686523, + "learning_rate": 1.2241240197635982e-05, + "loss": 1.4259, + "step": 8457 + }, + { + "epoch": 0.46241952899689737, + "grad_norm": 1.6195898056030273, + "learning_rate": 1.2239459289751548e-05, + "loss": 1.4249, + "step": 8458 + }, + { + "epoch": 0.4624742014406189, + "grad_norm": 1.6526392698287964, + "learning_rate": 1.2237678307086314e-05, + "loss": 1.2295, + "step": 8459 + }, + { + "epoch": 0.46252887388434044, + "grad_norm": 1.6169054508209229, + "learning_rate": 1.2235897249699749e-05, + "loss": 1.5607, + "step": 8460 + }, + { + "epoch": 0.462583546328062, + "grad_norm": 1.3721240758895874, + "learning_rate": 1.223411611765133e-05, + "loss": 1.5765, + "step": 8461 + }, + { + "epoch": 0.46263821877178357, + "grad_norm": 1.9698259830474854, + "learning_rate": 1.223233491100053e-05, + "loss": 1.2762, + "step": 8462 + }, + { + "epoch": 0.4626928912155051, + "grad_norm": 1.8170890808105469, + "learning_rate": 1.2230553629806829e-05, + "loss": 1.383, + "step": 8463 + }, + { + "epoch": 0.46274756365922665, + "grad_norm": 1.74290132522583, + "learning_rate": 1.2228772274129708e-05, + "loss": 1.362, + "step": 8464 + }, + { + "epoch": 0.46280223610294824, + "grad_norm": 1.5925710201263428, + "learning_rate": 1.2226990844028653e-05, + "loss": 1.5879, + "step": 8465 + }, + { + "epoch": 0.4628569085466698, + "grad_norm": 1.4187201261520386, + "learning_rate": 1.2225209339563144e-05, + "loss": 1.439, + "step": 8466 + }, + { + "epoch": 0.4629115809903913, + "grad_norm": 1.1532909870147705, + "learning_rate": 1.2223427760792678e-05, + "loss": 1.3805, + "step": 8467 + }, + { + "epoch": 0.46296625343411285, + "grad_norm": 1.373427152633667, + "learning_rate": 1.2221646107776739e-05, + "loss": 1.5341, + "step": 8468 + }, + { + "epoch": 0.46302092587783444, + "grad_norm": 1.4125200510025024, + "learning_rate": 1.2219864380574822e-05, + "loss": 1.5877, + "step": 8469 + }, + { + "epoch": 0.463075598321556, + "grad_norm": 1.8601993322372437, + "learning_rate": 1.2218082579246429e-05, + "loss": 1.3976, + "step": 8470 + }, + { + "epoch": 0.4631302707652775, + "grad_norm": 1.5976228713989258, + "learning_rate": 1.2216300703851047e-05, + "loss": 1.6572, + "step": 8471 + }, + { + "epoch": 0.4631849432089991, + "grad_norm": 2.8352174758911133, + "learning_rate": 1.2214518754448188e-05, + "loss": 1.2174, + "step": 8472 + }, + { + "epoch": 0.46323961565272065, + "grad_norm": 1.5763362646102905, + "learning_rate": 1.2212736731097352e-05, + "loss": 1.5137, + "step": 8473 + }, + { + "epoch": 0.4632942880964422, + "grad_norm": 2.037902355194092, + "learning_rate": 1.2210954633858042e-05, + "loss": 1.3837, + "step": 8474 + }, + { + "epoch": 0.4633489605401637, + "grad_norm": 1.878515362739563, + "learning_rate": 1.220917246278977e-05, + "loss": 1.3027, + "step": 8475 + }, + { + "epoch": 0.4634036329838853, + "grad_norm": 1.635756492614746, + "learning_rate": 1.2207390217952044e-05, + "loss": 1.6089, + "step": 8476 + }, + { + "epoch": 0.46345830542760685, + "grad_norm": 1.631560206413269, + "learning_rate": 1.220560789940438e-05, + "loss": 1.366, + "step": 8477 + }, + { + "epoch": 0.4635129778713284, + "grad_norm": 1.3032101392745972, + "learning_rate": 1.2203825507206293e-05, + "loss": 1.4653, + "step": 8478 + }, + { + "epoch": 0.46356765031505, + "grad_norm": 1.4696985483169556, + "learning_rate": 1.2202043041417298e-05, + "loss": 1.3223, + "step": 8479 + }, + { + "epoch": 0.4636223227587715, + "grad_norm": 1.35646390914917, + "learning_rate": 1.2200260502096923e-05, + "loss": 1.4738, + "step": 8480 + }, + { + "epoch": 0.46367699520249306, + "grad_norm": 1.636791706085205, + "learning_rate": 1.2198477889304684e-05, + "loss": 1.8549, + "step": 8481 + }, + { + "epoch": 0.4637316676462146, + "grad_norm": 1.418640375137329, + "learning_rate": 1.2196695203100111e-05, + "loss": 1.5164, + "step": 8482 + }, + { + "epoch": 0.4637863400899362, + "grad_norm": 1.4284106492996216, + "learning_rate": 1.2194912443542728e-05, + "loss": 1.192, + "step": 8483 + }, + { + "epoch": 0.4638410125336577, + "grad_norm": 1.707294225692749, + "learning_rate": 1.2193129610692069e-05, + "loss": 1.3683, + "step": 8484 + }, + { + "epoch": 0.46389568497737926, + "grad_norm": 2.085512399673462, + "learning_rate": 1.2191346704607668e-05, + "loss": 1.2158, + "step": 8485 + }, + { + "epoch": 0.46395035742110086, + "grad_norm": 1.480603575706482, + "learning_rate": 1.2189563725349056e-05, + "loss": 1.322, + "step": 8486 + }, + { + "epoch": 0.4640050298648224, + "grad_norm": 1.7199032306671143, + "learning_rate": 1.2187780672975775e-05, + "loss": 1.1576, + "step": 8487 + }, + { + "epoch": 0.46405970230854393, + "grad_norm": 1.3150805234909058, + "learning_rate": 1.2185997547547364e-05, + "loss": 1.4533, + "step": 8488 + }, + { + "epoch": 0.46411437475226547, + "grad_norm": 1.3928585052490234, + "learning_rate": 1.2184214349123361e-05, + "loss": 1.5711, + "step": 8489 + }, + { + "epoch": 0.46416904719598706, + "grad_norm": 1.6578587293624878, + "learning_rate": 1.2182431077763317e-05, + "loss": 1.4596, + "step": 8490 + }, + { + "epoch": 0.4642237196397086, + "grad_norm": 1.7297641038894653, + "learning_rate": 1.218064773352678e-05, + "loss": 1.4981, + "step": 8491 + }, + { + "epoch": 0.46427839208343014, + "grad_norm": 1.4542466402053833, + "learning_rate": 1.2178864316473298e-05, + "loss": 1.4209, + "step": 8492 + }, + { + "epoch": 0.46433306452715173, + "grad_norm": 1.7451194524765015, + "learning_rate": 1.2177080826662424e-05, + "loss": 1.4747, + "step": 8493 + }, + { + "epoch": 0.46438773697087327, + "grad_norm": 1.8000479936599731, + "learning_rate": 1.2175297264153713e-05, + "loss": 1.4369, + "step": 8494 + }, + { + "epoch": 0.4644424094145948, + "grad_norm": 1.8244184255599976, + "learning_rate": 1.217351362900672e-05, + "loss": 1.5115, + "step": 8495 + }, + { + "epoch": 0.46449708185831634, + "grad_norm": 1.4947004318237305, + "learning_rate": 1.2171729921281006e-05, + "loss": 1.3388, + "step": 8496 + }, + { + "epoch": 0.46455175430203793, + "grad_norm": 1.4605921506881714, + "learning_rate": 1.2169946141036133e-05, + "loss": 1.4671, + "step": 8497 + }, + { + "epoch": 0.46460642674575947, + "grad_norm": 1.2789376974105835, + "learning_rate": 1.2168162288331671e-05, + "loss": 1.4895, + "step": 8498 + }, + { + "epoch": 0.464661099189481, + "grad_norm": 1.323716402053833, + "learning_rate": 1.2166378363227178e-05, + "loss": 1.5679, + "step": 8499 + }, + { + "epoch": 0.4647157716332026, + "grad_norm": 1.6457264423370361, + "learning_rate": 1.216459436578223e-05, + "loss": 1.4303, + "step": 8500 + }, + { + "epoch": 0.46477044407692414, + "grad_norm": 1.6031919717788696, + "learning_rate": 1.2162810296056398e-05, + "loss": 1.4699, + "step": 8501 + }, + { + "epoch": 0.4648251165206457, + "grad_norm": 1.2106616497039795, + "learning_rate": 1.2161026154109253e-05, + "loss": 1.2463, + "step": 8502 + }, + { + "epoch": 0.4648797889643672, + "grad_norm": 1.501794695854187, + "learning_rate": 1.2159241940000372e-05, + "loss": 1.2843, + "step": 8503 + }, + { + "epoch": 0.4649344614080888, + "grad_norm": 1.5942636728286743, + "learning_rate": 1.2157457653789337e-05, + "loss": 1.4221, + "step": 8504 + }, + { + "epoch": 0.46498913385181034, + "grad_norm": 1.4892606735229492, + "learning_rate": 1.215567329553573e-05, + "loss": 1.3292, + "step": 8505 + }, + { + "epoch": 0.4650438062955319, + "grad_norm": 1.24143648147583, + "learning_rate": 1.2153888865299134e-05, + "loss": 1.3848, + "step": 8506 + }, + { + "epoch": 0.4650984787392535, + "grad_norm": 1.4942654371261597, + "learning_rate": 1.2152104363139133e-05, + "loss": 1.3986, + "step": 8507 + }, + { + "epoch": 0.465153151182975, + "grad_norm": 1.2531559467315674, + "learning_rate": 1.215031978911532e-05, + "loss": 1.6002, + "step": 8508 + }, + { + "epoch": 0.46520782362669655, + "grad_norm": 1.7194031476974487, + "learning_rate": 1.214853514328728e-05, + "loss": 1.61, + "step": 8509 + }, + { + "epoch": 0.4652624960704181, + "grad_norm": 1.2975409030914307, + "learning_rate": 1.2146750425714609e-05, + "loss": 1.5597, + "step": 8510 + }, + { + "epoch": 0.4653171685141397, + "grad_norm": 2.8363451957702637, + "learning_rate": 1.2144965636456903e-05, + "loss": 1.4261, + "step": 8511 + }, + { + "epoch": 0.4653718409578612, + "grad_norm": 1.1310139894485474, + "learning_rate": 1.2143180775573764e-05, + "loss": 1.4696, + "step": 8512 + }, + { + "epoch": 0.46542651340158275, + "grad_norm": 1.4153728485107422, + "learning_rate": 1.214139584312479e-05, + "loss": 1.4928, + "step": 8513 + }, + { + "epoch": 0.46548118584530435, + "grad_norm": 1.4057570695877075, + "learning_rate": 1.2139610839169582e-05, + "loss": 1.3129, + "step": 8514 + }, + { + "epoch": 0.4655358582890259, + "grad_norm": 1.5557317733764648, + "learning_rate": 1.213782576376775e-05, + "loss": 1.3233, + "step": 8515 + }, + { + "epoch": 0.4655905307327474, + "grad_norm": 1.421180009841919, + "learning_rate": 1.2136040616978897e-05, + "loss": 1.2698, + "step": 8516 + }, + { + "epoch": 0.46564520317646896, + "grad_norm": 1.7328956127166748, + "learning_rate": 1.2134255398862633e-05, + "loss": 1.5603, + "step": 8517 + }, + { + "epoch": 0.46569987562019055, + "grad_norm": 2.118422746658325, + "learning_rate": 1.2132470109478577e-05, + "loss": 1.2815, + "step": 8518 + }, + { + "epoch": 0.4657545480639121, + "grad_norm": 1.5178425312042236, + "learning_rate": 1.213068474888634e-05, + "loss": 1.5708, + "step": 8519 + }, + { + "epoch": 0.4658092205076336, + "grad_norm": 1.4877053499221802, + "learning_rate": 1.212889931714554e-05, + "loss": 1.335, + "step": 8520 + }, + { + "epoch": 0.4658638929513552, + "grad_norm": 1.5012221336364746, + "learning_rate": 1.2127113814315791e-05, + "loss": 1.4718, + "step": 8521 + }, + { + "epoch": 0.46591856539507676, + "grad_norm": 1.484625220298767, + "learning_rate": 1.2125328240456727e-05, + "loss": 1.429, + "step": 8522 + }, + { + "epoch": 0.4659732378387983, + "grad_norm": 1.7992390394210815, + "learning_rate": 1.2123542595627961e-05, + "loss": 1.4513, + "step": 8523 + }, + { + "epoch": 0.46602791028251983, + "grad_norm": 1.6337566375732422, + "learning_rate": 1.2121756879889126e-05, + "loss": 1.5433, + "step": 8524 + }, + { + "epoch": 0.4660825827262414, + "grad_norm": 1.1938966512680054, + "learning_rate": 1.2119971093299852e-05, + "loss": 1.6009, + "step": 8525 + }, + { + "epoch": 0.46613725516996296, + "grad_norm": 1.2862237691879272, + "learning_rate": 1.2118185235919766e-05, + "loss": 1.4106, + "step": 8526 + }, + { + "epoch": 0.4661919276136845, + "grad_norm": 1.567558765411377, + "learning_rate": 1.2116399307808506e-05, + "loss": 1.3326, + "step": 8527 + }, + { + "epoch": 0.4662466000574061, + "grad_norm": 1.720485806465149, + "learning_rate": 1.2114613309025705e-05, + "loss": 1.4074, + "step": 8528 + }, + { + "epoch": 0.46630127250112763, + "grad_norm": 1.657091736793518, + "learning_rate": 1.2112827239631004e-05, + "loss": 1.3251, + "step": 8529 + }, + { + "epoch": 0.46635594494484917, + "grad_norm": 1.4069019556045532, + "learning_rate": 1.2111041099684045e-05, + "loss": 1.3594, + "step": 8530 + }, + { + "epoch": 0.4664106173885707, + "grad_norm": 1.5694952011108398, + "learning_rate": 1.2109254889244469e-05, + "loss": 1.3173, + "step": 8531 + }, + { + "epoch": 0.4664652898322923, + "grad_norm": 1.3925566673278809, + "learning_rate": 1.2107468608371924e-05, + "loss": 1.1918, + "step": 8532 + }, + { + "epoch": 0.46651996227601383, + "grad_norm": 1.7906670570373535, + "learning_rate": 1.2105682257126057e-05, + "loss": 1.4349, + "step": 8533 + }, + { + "epoch": 0.46657463471973537, + "grad_norm": 1.398276925086975, + "learning_rate": 1.2103895835566516e-05, + "loss": 1.3507, + "step": 8534 + }, + { + "epoch": 0.46662930716345696, + "grad_norm": 1.1672879457473755, + "learning_rate": 1.2102109343752955e-05, + "loss": 1.3143, + "step": 8535 + }, + { + "epoch": 0.4666839796071785, + "grad_norm": 1.506298303604126, + "learning_rate": 1.2100322781745034e-05, + "loss": 1.325, + "step": 8536 + }, + { + "epoch": 0.46673865205090004, + "grad_norm": 1.5086063146591187, + "learning_rate": 1.2098536149602405e-05, + "loss": 1.6022, + "step": 8537 + }, + { + "epoch": 0.4667933244946216, + "grad_norm": 1.8908929824829102, + "learning_rate": 1.209674944738473e-05, + "loss": 1.2168, + "step": 8538 + }, + { + "epoch": 0.46684799693834317, + "grad_norm": 1.5555418729782104, + "learning_rate": 1.209496267515167e-05, + "loss": 1.3448, + "step": 8539 + }, + { + "epoch": 0.4669026693820647, + "grad_norm": 1.6883853673934937, + "learning_rate": 1.2093175832962891e-05, + "loss": 1.378, + "step": 8540 + }, + { + "epoch": 0.46695734182578624, + "grad_norm": 1.581796407699585, + "learning_rate": 1.2091388920878059e-05, + "loss": 1.4219, + "step": 8541 + }, + { + "epoch": 0.46701201426950784, + "grad_norm": 1.4197590351104736, + "learning_rate": 1.2089601938956843e-05, + "loss": 1.4277, + "step": 8542 + }, + { + "epoch": 0.4670666867132294, + "grad_norm": 1.105907678604126, + "learning_rate": 1.2087814887258916e-05, + "loss": 1.4433, + "step": 8543 + }, + { + "epoch": 0.4671213591569509, + "grad_norm": 1.4039716720581055, + "learning_rate": 1.2086027765843948e-05, + "loss": 1.5946, + "step": 8544 + }, + { + "epoch": 0.46717603160067245, + "grad_norm": 1.5426522493362427, + "learning_rate": 1.2084240574771621e-05, + "loss": 1.3311, + "step": 8545 + }, + { + "epoch": 0.46723070404439404, + "grad_norm": 1.4055452346801758, + "learning_rate": 1.2082453314101607e-05, + "loss": 1.5707, + "step": 8546 + }, + { + "epoch": 0.4672853764881156, + "grad_norm": 1.6647100448608398, + "learning_rate": 1.2080665983893595e-05, + "loss": 1.3827, + "step": 8547 + }, + { + "epoch": 0.4673400489318371, + "grad_norm": 1.4991211891174316, + "learning_rate": 1.2078878584207259e-05, + "loss": 1.3611, + "step": 8548 + }, + { + "epoch": 0.4673947213755587, + "grad_norm": 1.5746846199035645, + "learning_rate": 1.2077091115102291e-05, + "loss": 1.376, + "step": 8549 + }, + { + "epoch": 0.46744939381928025, + "grad_norm": 1.3566943407058716, + "learning_rate": 1.2075303576638378e-05, + "loss": 1.4209, + "step": 8550 + }, + { + "epoch": 0.4675040662630018, + "grad_norm": 1.522946834564209, + "learning_rate": 1.2073515968875204e-05, + "loss": 1.3911, + "step": 8551 + }, + { + "epoch": 0.4675587387067233, + "grad_norm": 1.778225064277649, + "learning_rate": 1.2071728291872471e-05, + "loss": 1.2792, + "step": 8552 + }, + { + "epoch": 0.4676134111504449, + "grad_norm": 1.6784110069274902, + "learning_rate": 1.2069940545689867e-05, + "loss": 1.4563, + "step": 8553 + }, + { + "epoch": 0.46766808359416645, + "grad_norm": 2.030123710632324, + "learning_rate": 1.2068152730387091e-05, + "loss": 1.3724, + "step": 8554 + }, + { + "epoch": 0.467722756037888, + "grad_norm": 1.2905162572860718, + "learning_rate": 1.2066364846023841e-05, + "loss": 1.5279, + "step": 8555 + }, + { + "epoch": 0.4677774284816096, + "grad_norm": 1.4415065050125122, + "learning_rate": 1.2064576892659821e-05, + "loss": 1.6193, + "step": 8556 + }, + { + "epoch": 0.4678321009253311, + "grad_norm": 1.5797688961029053, + "learning_rate": 1.2062788870354734e-05, + "loss": 1.459, + "step": 8557 + }, + { + "epoch": 0.46788677336905266, + "grad_norm": 1.3688966035842896, + "learning_rate": 1.2061000779168288e-05, + "loss": 1.4527, + "step": 8558 + }, + { + "epoch": 0.4679414458127742, + "grad_norm": 1.3173481225967407, + "learning_rate": 1.2059212619160186e-05, + "loss": 1.4395, + "step": 8559 + }, + { + "epoch": 0.4679961182564958, + "grad_norm": 1.3735331296920776, + "learning_rate": 1.2057424390390141e-05, + "loss": 1.2767, + "step": 8560 + }, + { + "epoch": 0.4680507907002173, + "grad_norm": 1.2929073572158813, + "learning_rate": 1.2055636092917875e-05, + "loss": 1.5007, + "step": 8561 + }, + { + "epoch": 0.46810546314393886, + "grad_norm": 1.2632399797439575, + "learning_rate": 1.205384772680309e-05, + "loss": 1.3606, + "step": 8562 + }, + { + "epoch": 0.46816013558766045, + "grad_norm": 1.448999285697937, + "learning_rate": 1.2052059292105511e-05, + "loss": 1.4766, + "step": 8563 + }, + { + "epoch": 0.468214808031382, + "grad_norm": 1.4131712913513184, + "learning_rate": 1.2050270788884859e-05, + "loss": 1.0993, + "step": 8564 + }, + { + "epoch": 0.46826948047510353, + "grad_norm": 1.5180270671844482, + "learning_rate": 1.2048482217200854e-05, + "loss": 1.5974, + "step": 8565 + }, + { + "epoch": 0.46832415291882507, + "grad_norm": 1.8991644382476807, + "learning_rate": 1.204669357711322e-05, + "loss": 1.3323, + "step": 8566 + }, + { + "epoch": 0.46837882536254666, + "grad_norm": 1.4921644926071167, + "learning_rate": 1.2044904868681684e-05, + "loss": 1.2985, + "step": 8567 + }, + { + "epoch": 0.4684334978062682, + "grad_norm": 1.718471884727478, + "learning_rate": 1.204311609196598e-05, + "loss": 1.3623, + "step": 8568 + }, + { + "epoch": 0.46848817024998973, + "grad_norm": 1.8945780992507935, + "learning_rate": 1.2041327247025829e-05, + "loss": 1.6098, + "step": 8569 + }, + { + "epoch": 0.4685428426937113, + "grad_norm": 1.2850677967071533, + "learning_rate": 1.2039538333920972e-05, + "loss": 1.6645, + "step": 8570 + }, + { + "epoch": 0.46859751513743286, + "grad_norm": 1.5999782085418701, + "learning_rate": 1.2037749352711147e-05, + "loss": 1.4357, + "step": 8571 + }, + { + "epoch": 0.4686521875811544, + "grad_norm": 1.5136090517044067, + "learning_rate": 1.2035960303456091e-05, + "loss": 1.7186, + "step": 8572 + }, + { + "epoch": 0.46870686002487594, + "grad_norm": 1.3010035753250122, + "learning_rate": 1.203417118621554e-05, + "loss": 1.5852, + "step": 8573 + }, + { + "epoch": 0.46876153246859753, + "grad_norm": 1.3998674154281616, + "learning_rate": 1.203238200104924e-05, + "loss": 1.3793, + "step": 8574 + }, + { + "epoch": 0.46881620491231907, + "grad_norm": 1.5834285020828247, + "learning_rate": 1.2030592748016936e-05, + "loss": 1.4841, + "step": 8575 + }, + { + "epoch": 0.4688708773560406, + "grad_norm": 1.5012603998184204, + "learning_rate": 1.2028803427178376e-05, + "loss": 1.4308, + "step": 8576 + }, + { + "epoch": 0.4689255497997622, + "grad_norm": 1.2219582796096802, + "learning_rate": 1.2027014038593308e-05, + "loss": 1.4344, + "step": 8577 + }, + { + "epoch": 0.46898022224348374, + "grad_norm": 1.6266837120056152, + "learning_rate": 1.2025224582321486e-05, + "loss": 1.6779, + "step": 8578 + }, + { + "epoch": 0.4690348946872053, + "grad_norm": 1.6148515939712524, + "learning_rate": 1.202343505842266e-05, + "loss": 1.4715, + "step": 8579 + }, + { + "epoch": 0.4690895671309268, + "grad_norm": 1.6714844703674316, + "learning_rate": 1.202164546695659e-05, + "loss": 1.3664, + "step": 8580 + }, + { + "epoch": 0.4691442395746484, + "grad_norm": 1.5037442445755005, + "learning_rate": 1.2019855807983036e-05, + "loss": 1.6111, + "step": 8581 + }, + { + "epoch": 0.46919891201836994, + "grad_norm": 1.5250523090362549, + "learning_rate": 1.2018066081561756e-05, + "loss": 1.5624, + "step": 8582 + }, + { + "epoch": 0.4692535844620915, + "grad_norm": 1.6187289953231812, + "learning_rate": 1.2016276287752513e-05, + "loss": 1.3961, + "step": 8583 + }, + { + "epoch": 0.46930825690581307, + "grad_norm": 1.4049806594848633, + "learning_rate": 1.2014486426615076e-05, + "loss": 1.4795, + "step": 8584 + }, + { + "epoch": 0.4693629293495346, + "grad_norm": 1.4472273588180542, + "learning_rate": 1.201269649820921e-05, + "loss": 1.4122, + "step": 8585 + }, + { + "epoch": 0.46941760179325615, + "grad_norm": 1.4757542610168457, + "learning_rate": 1.2010906502594682e-05, + "loss": 1.5236, + "step": 8586 + }, + { + "epoch": 0.4694722742369777, + "grad_norm": 1.114015817642212, + "learning_rate": 1.2009116439831267e-05, + "loss": 1.4001, + "step": 8587 + }, + { + "epoch": 0.4695269466806993, + "grad_norm": 1.5668821334838867, + "learning_rate": 1.200732630997874e-05, + "loss": 1.7442, + "step": 8588 + }, + { + "epoch": 0.4695816191244208, + "grad_norm": 1.1502556800842285, + "learning_rate": 1.2005536113096878e-05, + "loss": 1.5923, + "step": 8589 + }, + { + "epoch": 0.46963629156814235, + "grad_norm": 1.4544557332992554, + "learning_rate": 1.2003745849245458e-05, + "loss": 1.3855, + "step": 8590 + }, + { + "epoch": 0.46969096401186394, + "grad_norm": 1.5223928689956665, + "learning_rate": 1.2001955518484266e-05, + "loss": 1.2551, + "step": 8591 + }, + { + "epoch": 0.4697456364555855, + "grad_norm": 1.4249215126037598, + "learning_rate": 1.2000165120873079e-05, + "loss": 1.4538, + "step": 8592 + }, + { + "epoch": 0.469800308899307, + "grad_norm": 1.3182694911956787, + "learning_rate": 1.1998374656471685e-05, + "loss": 1.3389, + "step": 8593 + }, + { + "epoch": 0.46985498134302855, + "grad_norm": 1.5061687231063843, + "learning_rate": 1.1996584125339869e-05, + "loss": 1.5588, + "step": 8594 + }, + { + "epoch": 0.46990965378675015, + "grad_norm": 1.4066271781921387, + "learning_rate": 1.1994793527537427e-05, + "loss": 1.4306, + "step": 8595 + }, + { + "epoch": 0.4699643262304717, + "grad_norm": 1.6347875595092773, + "learning_rate": 1.199300286312415e-05, + "loss": 1.4245, + "step": 8596 + }, + { + "epoch": 0.4700189986741932, + "grad_norm": 1.1904983520507812, + "learning_rate": 1.199121213215983e-05, + "loss": 1.5839, + "step": 8597 + }, + { + "epoch": 0.4700736711179148, + "grad_norm": 1.1608490943908691, + "learning_rate": 1.1989421334704261e-05, + "loss": 1.389, + "step": 8598 + }, + { + "epoch": 0.47012834356163635, + "grad_norm": 1.4216967821121216, + "learning_rate": 1.1987630470817249e-05, + "loss": 1.4491, + "step": 8599 + }, + { + "epoch": 0.4701830160053579, + "grad_norm": 1.3107600212097168, + "learning_rate": 1.1985839540558588e-05, + "loss": 1.343, + "step": 8600 + }, + { + "epoch": 0.4702376884490794, + "grad_norm": 1.2368823289871216, + "learning_rate": 1.1984048543988086e-05, + "loss": 1.741, + "step": 8601 + }, + { + "epoch": 0.470292360892801, + "grad_norm": 1.1530096530914307, + "learning_rate": 1.1982257481165547e-05, + "loss": 1.4005, + "step": 8602 + }, + { + "epoch": 0.47034703333652256, + "grad_norm": 1.4480770826339722, + "learning_rate": 1.1980466352150782e-05, + "loss": 1.4398, + "step": 8603 + }, + { + "epoch": 0.4704017057802441, + "grad_norm": 1.4505006074905396, + "learning_rate": 1.1978675157003597e-05, + "loss": 1.6052, + "step": 8604 + }, + { + "epoch": 0.4704563782239657, + "grad_norm": 1.5148996114730835, + "learning_rate": 1.1976883895783804e-05, + "loss": 1.2558, + "step": 8605 + }, + { + "epoch": 0.4705110506676872, + "grad_norm": 1.5609116554260254, + "learning_rate": 1.1975092568551221e-05, + "loss": 1.2124, + "step": 8606 + }, + { + "epoch": 0.47056572311140876, + "grad_norm": 1.614410638809204, + "learning_rate": 1.197330117536566e-05, + "loss": 1.2969, + "step": 8607 + }, + { + "epoch": 0.4706203955551303, + "grad_norm": 1.707215428352356, + "learning_rate": 1.1971509716286942e-05, + "loss": 1.4429, + "step": 8608 + }, + { + "epoch": 0.4706750679988519, + "grad_norm": 1.343666911125183, + "learning_rate": 1.1969718191374888e-05, + "loss": 1.6686, + "step": 8609 + }, + { + "epoch": 0.47072974044257343, + "grad_norm": 1.3591887950897217, + "learning_rate": 1.1967926600689324e-05, + "loss": 1.5245, + "step": 8610 + }, + { + "epoch": 0.47078441288629497, + "grad_norm": 1.3865611553192139, + "learning_rate": 1.1966134944290073e-05, + "loss": 1.49, + "step": 8611 + }, + { + "epoch": 0.47083908533001656, + "grad_norm": 1.4486163854599, + "learning_rate": 1.196434322223696e-05, + "loss": 1.532, + "step": 8612 + }, + { + "epoch": 0.4708937577737381, + "grad_norm": 1.4520207643508911, + "learning_rate": 1.1962551434589822e-05, + "loss": 1.7369, + "step": 8613 + }, + { + "epoch": 0.47094843021745963, + "grad_norm": 1.5380040407180786, + "learning_rate": 1.196075958140848e-05, + "loss": 1.6535, + "step": 8614 + }, + { + "epoch": 0.47100310266118117, + "grad_norm": 1.539148211479187, + "learning_rate": 1.1958967662752778e-05, + "loss": 1.3504, + "step": 8615 + }, + { + "epoch": 0.47105777510490277, + "grad_norm": 1.876572608947754, + "learning_rate": 1.1957175678682548e-05, + "loss": 1.4, + "step": 8616 + }, + { + "epoch": 0.4711124475486243, + "grad_norm": 1.7597739696502686, + "learning_rate": 1.1955383629257633e-05, + "loss": 1.3844, + "step": 8617 + }, + { + "epoch": 0.47116711999234584, + "grad_norm": 1.3040410280227661, + "learning_rate": 1.1953591514537866e-05, + "loss": 1.439, + "step": 8618 + }, + { + "epoch": 0.47122179243606743, + "grad_norm": 1.4236096143722534, + "learning_rate": 1.1951799334583093e-05, + "loss": 1.3644, + "step": 8619 + }, + { + "epoch": 0.47127646487978897, + "grad_norm": 1.778592586517334, + "learning_rate": 1.1950007089453166e-05, + "loss": 1.4675, + "step": 8620 + }, + { + "epoch": 0.4713311373235105, + "grad_norm": 1.7919387817382812, + "learning_rate": 1.1948214779207921e-05, + "loss": 1.1313, + "step": 8621 + }, + { + "epoch": 0.47138580976723204, + "grad_norm": 1.607901930809021, + "learning_rate": 1.1946422403907214e-05, + "loss": 1.385, + "step": 8622 + }, + { + "epoch": 0.47144048221095364, + "grad_norm": 1.5326838493347168, + "learning_rate": 1.1944629963610897e-05, + "loss": 1.4435, + "step": 8623 + }, + { + "epoch": 0.4714951546546752, + "grad_norm": 1.9815711975097656, + "learning_rate": 1.1942837458378821e-05, + "loss": 1.311, + "step": 8624 + }, + { + "epoch": 0.4715498270983967, + "grad_norm": 1.8172130584716797, + "learning_rate": 1.1941044888270845e-05, + "loss": 1.3778, + "step": 8625 + }, + { + "epoch": 0.4716044995421183, + "grad_norm": 1.5344493389129639, + "learning_rate": 1.1939252253346822e-05, + "loss": 1.4073, + "step": 8626 + }, + { + "epoch": 0.47165917198583984, + "grad_norm": 1.846177577972412, + "learning_rate": 1.193745955366662e-05, + "loss": 1.4083, + "step": 8627 + }, + { + "epoch": 0.4717138444295614, + "grad_norm": 1.7062491178512573, + "learning_rate": 1.1935666789290095e-05, + "loss": 1.5704, + "step": 8628 + }, + { + "epoch": 0.4717685168732829, + "grad_norm": 1.4251216650009155, + "learning_rate": 1.1933873960277114e-05, + "loss": 1.6038, + "step": 8629 + }, + { + "epoch": 0.4718231893170045, + "grad_norm": 2.2317018508911133, + "learning_rate": 1.1932081066687544e-05, + "loss": 1.4363, + "step": 8630 + }, + { + "epoch": 0.47187786176072605, + "grad_norm": 1.2371035814285278, + "learning_rate": 1.1930288108581252e-05, + "loss": 1.5067, + "step": 8631 + }, + { + "epoch": 0.4719325342044476, + "grad_norm": 1.5600013732910156, + "learning_rate": 1.192849508601811e-05, + "loss": 1.4948, + "step": 8632 + }, + { + "epoch": 0.4719872066481692, + "grad_norm": 1.5093045234680176, + "learning_rate": 1.1926701999057992e-05, + "loss": 1.6917, + "step": 8633 + }, + { + "epoch": 0.4720418790918907, + "grad_norm": 1.4632773399353027, + "learning_rate": 1.1924908847760774e-05, + "loss": 1.545, + "step": 8634 + }, + { + "epoch": 0.47209655153561225, + "grad_norm": 1.6688480377197266, + "learning_rate": 1.1923115632186332e-05, + "loss": 1.4073, + "step": 8635 + }, + { + "epoch": 0.4721512239793338, + "grad_norm": 1.5643891096115112, + "learning_rate": 1.1921322352394548e-05, + "loss": 1.6087, + "step": 8636 + }, + { + "epoch": 0.4722058964230554, + "grad_norm": 1.4177595376968384, + "learning_rate": 1.1919529008445302e-05, + "loss": 1.4226, + "step": 8637 + }, + { + "epoch": 0.4722605688667769, + "grad_norm": 1.3546470403671265, + "learning_rate": 1.1917735600398476e-05, + "loss": 1.3877, + "step": 8638 + }, + { + "epoch": 0.47231524131049846, + "grad_norm": 1.683671236038208, + "learning_rate": 1.191594212831396e-05, + "loss": 1.2612, + "step": 8639 + }, + { + "epoch": 0.47236991375422005, + "grad_norm": 1.403813362121582, + "learning_rate": 1.1914148592251638e-05, + "loss": 1.4147, + "step": 8640 + }, + { + "epoch": 0.4724245861979416, + "grad_norm": 2.3221511840820312, + "learning_rate": 1.1912354992271404e-05, + "loss": 1.376, + "step": 8641 + }, + { + "epoch": 0.4724792586416631, + "grad_norm": 1.3976012468338013, + "learning_rate": 1.191056132843315e-05, + "loss": 1.485, + "step": 8642 + }, + { + "epoch": 0.47253393108538466, + "grad_norm": 1.5177135467529297, + "learning_rate": 1.1908767600796771e-05, + "loss": 1.4472, + "step": 8643 + }, + { + "epoch": 0.47258860352910625, + "grad_norm": 1.3005257844924927, + "learning_rate": 1.1906973809422163e-05, + "loss": 1.6066, + "step": 8644 + }, + { + "epoch": 0.4726432759728278, + "grad_norm": 1.5584365129470825, + "learning_rate": 1.1905179954369222e-05, + "loss": 1.5105, + "step": 8645 + }, + { + "epoch": 0.47269794841654933, + "grad_norm": 1.5357855558395386, + "learning_rate": 1.1903386035697853e-05, + "loss": 1.5199, + "step": 8646 + }, + { + "epoch": 0.4727526208602709, + "grad_norm": 1.6658631563186646, + "learning_rate": 1.1901592053467956e-05, + "loss": 1.3798, + "step": 8647 + }, + { + "epoch": 0.47280729330399246, + "grad_norm": 1.233547568321228, + "learning_rate": 1.189979800773944e-05, + "loss": 1.3596, + "step": 8648 + }, + { + "epoch": 0.472861965747714, + "grad_norm": 1.3525015115737915, + "learning_rate": 1.1898003898572212e-05, + "loss": 1.4878, + "step": 8649 + }, + { + "epoch": 0.47291663819143553, + "grad_norm": 1.9781440496444702, + "learning_rate": 1.1896209726026177e-05, + "loss": 1.5144, + "step": 8650 + }, + { + "epoch": 0.4729713106351571, + "grad_norm": 1.4501919746398926, + "learning_rate": 1.1894415490161253e-05, + "loss": 1.4799, + "step": 8651 + }, + { + "epoch": 0.47302598307887866, + "grad_norm": 1.5875442028045654, + "learning_rate": 1.1892621191037347e-05, + "loss": 1.115, + "step": 8652 + }, + { + "epoch": 0.4730806555226002, + "grad_norm": 1.4981790781021118, + "learning_rate": 1.1890826828714378e-05, + "loss": 1.4165, + "step": 8653 + }, + { + "epoch": 0.4731353279663218, + "grad_norm": 1.6254749298095703, + "learning_rate": 1.1889032403252266e-05, + "loss": 1.3519, + "step": 8654 + }, + { + "epoch": 0.47319000041004333, + "grad_norm": 1.4599741697311401, + "learning_rate": 1.188723791471093e-05, + "loss": 1.6569, + "step": 8655 + }, + { + "epoch": 0.47324467285376487, + "grad_norm": 1.563244104385376, + "learning_rate": 1.1885443363150291e-05, + "loss": 1.7741, + "step": 8656 + }, + { + "epoch": 0.4732993452974864, + "grad_norm": 1.6816489696502686, + "learning_rate": 1.1883648748630274e-05, + "loss": 1.4838, + "step": 8657 + }, + { + "epoch": 0.473354017741208, + "grad_norm": 1.5729787349700928, + "learning_rate": 1.1881854071210805e-05, + "loss": 1.4513, + "step": 8658 + }, + { + "epoch": 0.47340869018492954, + "grad_norm": 1.4348301887512207, + "learning_rate": 1.1880059330951811e-05, + "loss": 1.4672, + "step": 8659 + }, + { + "epoch": 0.4734633626286511, + "grad_norm": 1.7118231058120728, + "learning_rate": 1.1878264527913226e-05, + "loss": 1.412, + "step": 8660 + }, + { + "epoch": 0.47351803507237267, + "grad_norm": 1.445320725440979, + "learning_rate": 1.187646966215498e-05, + "loss": 1.3945, + "step": 8661 + }, + { + "epoch": 0.4735727075160942, + "grad_norm": 1.580462098121643, + "learning_rate": 1.1874674733737009e-05, + "loss": 1.3978, + "step": 8662 + }, + { + "epoch": 0.47362737995981574, + "grad_norm": 1.4043865203857422, + "learning_rate": 1.187287974271925e-05, + "loss": 1.5866, + "step": 8663 + }, + { + "epoch": 0.47368205240353733, + "grad_norm": 1.3550126552581787, + "learning_rate": 1.187108468916164e-05, + "loss": 1.3573, + "step": 8664 + }, + { + "epoch": 0.47373672484725887, + "grad_norm": 1.2621688842773438, + "learning_rate": 1.1869289573124125e-05, + "loss": 1.6364, + "step": 8665 + }, + { + "epoch": 0.4737913972909804, + "grad_norm": 1.4063044786453247, + "learning_rate": 1.1867494394666641e-05, + "loss": 1.5159, + "step": 8666 + }, + { + "epoch": 0.47384606973470195, + "grad_norm": 1.7843369245529175, + "learning_rate": 1.1865699153849137e-05, + "loss": 1.5229, + "step": 8667 + }, + { + "epoch": 0.47390074217842354, + "grad_norm": 1.5084458589553833, + "learning_rate": 1.1863903850731564e-05, + "loss": 1.2888, + "step": 8668 + }, + { + "epoch": 0.4739554146221451, + "grad_norm": 1.3220387697219849, + "learning_rate": 1.1862108485373866e-05, + "loss": 1.377, + "step": 8669 + }, + { + "epoch": 0.4740100870658666, + "grad_norm": 1.5310269594192505, + "learning_rate": 1.1860313057835995e-05, + "loss": 1.5277, + "step": 8670 + }, + { + "epoch": 0.4740647595095882, + "grad_norm": 1.3728818893432617, + "learning_rate": 1.1858517568177905e-05, + "loss": 1.2806, + "step": 8671 + }, + { + "epoch": 0.47411943195330974, + "grad_norm": 1.5500167608261108, + "learning_rate": 1.1856722016459554e-05, + "loss": 1.605, + "step": 8672 + }, + { + "epoch": 0.4741741043970313, + "grad_norm": 1.8395328521728516, + "learning_rate": 1.1854926402740897e-05, + "loss": 1.2447, + "step": 8673 + }, + { + "epoch": 0.4742287768407528, + "grad_norm": 1.00868558883667, + "learning_rate": 1.1853130727081895e-05, + "loss": 1.5526, + "step": 8674 + }, + { + "epoch": 0.4742834492844744, + "grad_norm": 1.6607086658477783, + "learning_rate": 1.185133498954251e-05, + "loss": 1.4859, + "step": 8675 + }, + { + "epoch": 0.47433812172819595, + "grad_norm": 1.6508638858795166, + "learning_rate": 1.1849539190182706e-05, + "loss": 1.3358, + "step": 8676 + }, + { + "epoch": 0.4743927941719175, + "grad_norm": 1.7092031240463257, + "learning_rate": 1.1847743329062447e-05, + "loss": 1.3468, + "step": 8677 + }, + { + "epoch": 0.4744474666156391, + "grad_norm": 1.6146327257156372, + "learning_rate": 1.18459474062417e-05, + "loss": 1.4852, + "step": 8678 + }, + { + "epoch": 0.4745021390593606, + "grad_norm": 1.450448751449585, + "learning_rate": 1.1844151421780442e-05, + "loss": 1.4147, + "step": 8679 + }, + { + "epoch": 0.47455681150308215, + "grad_norm": 1.561669111251831, + "learning_rate": 1.1842355375738639e-05, + "loss": 1.4207, + "step": 8680 + }, + { + "epoch": 0.4746114839468037, + "grad_norm": 1.6547131538391113, + "learning_rate": 1.1840559268176263e-05, + "loss": 1.2141, + "step": 8681 + }, + { + "epoch": 0.4746661563905253, + "grad_norm": 1.6694449186325073, + "learning_rate": 1.18387630991533e-05, + "loss": 1.4169, + "step": 8682 + }, + { + "epoch": 0.4747208288342468, + "grad_norm": 1.6048786640167236, + "learning_rate": 1.1836966868729722e-05, + "loss": 1.488, + "step": 8683 + }, + { + "epoch": 0.47477550127796836, + "grad_norm": 1.8477245569229126, + "learning_rate": 1.1835170576965507e-05, + "loss": 1.5511, + "step": 8684 + }, + { + "epoch": 0.47483017372168995, + "grad_norm": 1.9045506715774536, + "learning_rate": 1.1833374223920639e-05, + "loss": 1.5414, + "step": 8685 + }, + { + "epoch": 0.4748848461654115, + "grad_norm": 1.5037139654159546, + "learning_rate": 1.1831577809655105e-05, + "loss": 1.4681, + "step": 8686 + }, + { + "epoch": 0.474939518609133, + "grad_norm": 1.4697266817092896, + "learning_rate": 1.1829781334228889e-05, + "loss": 1.325, + "step": 8687 + }, + { + "epoch": 0.47499419105285456, + "grad_norm": 1.303508996963501, + "learning_rate": 1.1827984797701983e-05, + "loss": 1.2673, + "step": 8688 + }, + { + "epoch": 0.47504886349657616, + "grad_norm": 1.6051008701324463, + "learning_rate": 1.1826188200134374e-05, + "loss": 1.3706, + "step": 8689 + }, + { + "epoch": 0.4751035359402977, + "grad_norm": 1.6010056734085083, + "learning_rate": 1.1824391541586055e-05, + "loss": 1.2913, + "step": 8690 + }, + { + "epoch": 0.47515820838401923, + "grad_norm": 1.3393514156341553, + "learning_rate": 1.1822594822117022e-05, + "loss": 1.46, + "step": 8691 + }, + { + "epoch": 0.4752128808277408, + "grad_norm": 1.275228500366211, + "learning_rate": 1.182079804178727e-05, + "loss": 1.4514, + "step": 8692 + }, + { + "epoch": 0.47526755327146236, + "grad_norm": 1.77156400680542, + "learning_rate": 1.18190012006568e-05, + "loss": 1.3891, + "step": 8693 + }, + { + "epoch": 0.4753222257151839, + "grad_norm": 1.5460505485534668, + "learning_rate": 1.1817204298785612e-05, + "loss": 1.5132, + "step": 8694 + }, + { + "epoch": 0.47537689815890544, + "grad_norm": 1.2868572473526, + "learning_rate": 1.1815407336233709e-05, + "loss": 1.4205, + "step": 8695 + }, + { + "epoch": 0.47543157060262703, + "grad_norm": 1.43682062625885, + "learning_rate": 1.1813610313061091e-05, + "loss": 1.4455, + "step": 8696 + }, + { + "epoch": 0.47548624304634857, + "grad_norm": 1.4164971113204956, + "learning_rate": 1.1811813229327774e-05, + "loss": 1.619, + "step": 8697 + }, + { + "epoch": 0.4755409154900701, + "grad_norm": 1.495773196220398, + "learning_rate": 1.1810016085093756e-05, + "loss": 1.4864, + "step": 8698 + }, + { + "epoch": 0.4755955879337917, + "grad_norm": 1.3103350400924683, + "learning_rate": 1.1808218880419055e-05, + "loss": 1.5514, + "step": 8699 + }, + { + "epoch": 0.47565026037751323, + "grad_norm": 1.748339295387268, + "learning_rate": 1.1806421615363685e-05, + "loss": 1.2582, + "step": 8700 + }, + { + "epoch": 0.47570493282123477, + "grad_norm": 1.8443493843078613, + "learning_rate": 1.1804624289987658e-05, + "loss": 1.4986, + "step": 8701 + }, + { + "epoch": 0.4757596052649563, + "grad_norm": 1.6466447114944458, + "learning_rate": 1.1802826904350992e-05, + "loss": 1.4439, + "step": 8702 + }, + { + "epoch": 0.4758142777086779, + "grad_norm": 1.3935989141464233, + "learning_rate": 1.1801029458513702e-05, + "loss": 1.3748, + "step": 8703 + }, + { + "epoch": 0.47586895015239944, + "grad_norm": 1.6930731534957886, + "learning_rate": 1.1799231952535817e-05, + "loss": 1.5731, + "step": 8704 + }, + { + "epoch": 0.475923622596121, + "grad_norm": 1.3469523191452026, + "learning_rate": 1.179743438647735e-05, + "loss": 1.3224, + "step": 8705 + }, + { + "epoch": 0.47597829503984257, + "grad_norm": 1.637420415878296, + "learning_rate": 1.1795636760398332e-05, + "loss": 1.6925, + "step": 8706 + }, + { + "epoch": 0.4760329674835641, + "grad_norm": 1.3714935779571533, + "learning_rate": 1.179383907435879e-05, + "loss": 1.3963, + "step": 8707 + }, + { + "epoch": 0.47608763992728564, + "grad_norm": 1.7857328653335571, + "learning_rate": 1.1792041328418755e-05, + "loss": 1.1787, + "step": 8708 + }, + { + "epoch": 0.4761423123710072, + "grad_norm": 2.588192939758301, + "learning_rate": 1.1790243522638252e-05, + "loss": 1.3454, + "step": 8709 + }, + { + "epoch": 0.4761969848147288, + "grad_norm": 1.5592504739761353, + "learning_rate": 1.1788445657077315e-05, + "loss": 1.4946, + "step": 8710 + }, + { + "epoch": 0.4762516572584503, + "grad_norm": 1.6349660158157349, + "learning_rate": 1.1786647731795984e-05, + "loss": 1.1725, + "step": 8711 + }, + { + "epoch": 0.47630632970217185, + "grad_norm": 1.2630891799926758, + "learning_rate": 1.178484974685429e-05, + "loss": 1.5048, + "step": 8712 + }, + { + "epoch": 0.47636100214589344, + "grad_norm": 1.2644522190093994, + "learning_rate": 1.1783051702312274e-05, + "loss": 1.4691, + "step": 8713 + }, + { + "epoch": 0.476415674589615, + "grad_norm": 1.5828802585601807, + "learning_rate": 1.1781253598229982e-05, + "loss": 1.4914, + "step": 8714 + }, + { + "epoch": 0.4764703470333365, + "grad_norm": 1.8538236618041992, + "learning_rate": 1.177945543466745e-05, + "loss": 1.4311, + "step": 8715 + }, + { + "epoch": 0.47652501947705805, + "grad_norm": 1.513542652130127, + "learning_rate": 1.1777657211684726e-05, + "loss": 1.2691, + "step": 8716 + }, + { + "epoch": 0.47657969192077965, + "grad_norm": 1.683903455734253, + "learning_rate": 1.1775858929341855e-05, + "loss": 1.5173, + "step": 8717 + }, + { + "epoch": 0.4766343643645012, + "grad_norm": 1.6774978637695312, + "learning_rate": 1.1774060587698888e-05, + "loss": 1.6814, + "step": 8718 + }, + { + "epoch": 0.4766890368082227, + "grad_norm": 1.7377607822418213, + "learning_rate": 1.1772262186815875e-05, + "loss": 1.5521, + "step": 8719 + }, + { + "epoch": 0.4767437092519443, + "grad_norm": 1.4474691152572632, + "learning_rate": 1.1770463726752868e-05, + "loss": 1.355, + "step": 8720 + }, + { + "epoch": 0.47679838169566585, + "grad_norm": 1.4950790405273438, + "learning_rate": 1.1768665207569922e-05, + "loss": 1.5587, + "step": 8721 + }, + { + "epoch": 0.4768530541393874, + "grad_norm": 1.3610433340072632, + "learning_rate": 1.1766866629327099e-05, + "loss": 1.6177, + "step": 8722 + }, + { + "epoch": 0.4769077265831089, + "grad_norm": 1.4195971488952637, + "learning_rate": 1.176506799208445e-05, + "loss": 1.6758, + "step": 8723 + }, + { + "epoch": 0.4769623990268305, + "grad_norm": 1.6561973094940186, + "learning_rate": 1.1763269295902036e-05, + "loss": 1.3846, + "step": 8724 + }, + { + "epoch": 0.47701707147055206, + "grad_norm": 1.7096986770629883, + "learning_rate": 1.1761470540839927e-05, + "loss": 1.4632, + "step": 8725 + }, + { + "epoch": 0.4770717439142736, + "grad_norm": 1.3893823623657227, + "learning_rate": 1.1759671726958181e-05, + "loss": 1.349, + "step": 8726 + }, + { + "epoch": 0.4771264163579952, + "grad_norm": 1.4886213541030884, + "learning_rate": 1.175787285431687e-05, + "loss": 1.7891, + "step": 8727 + }, + { + "epoch": 0.4771810888017167, + "grad_norm": 1.6610419750213623, + "learning_rate": 1.1756073922976056e-05, + "loss": 1.6211, + "step": 8728 + }, + { + "epoch": 0.47723576124543826, + "grad_norm": 1.6472607851028442, + "learning_rate": 1.1754274932995813e-05, + "loss": 1.4101, + "step": 8729 + }, + { + "epoch": 0.4772904336891598, + "grad_norm": 2.255136489868164, + "learning_rate": 1.1752475884436214e-05, + "loss": 1.3745, + "step": 8730 + }, + { + "epoch": 0.4773451061328814, + "grad_norm": 1.468288779258728, + "learning_rate": 1.1750676777357333e-05, + "loss": 1.3753, + "step": 8731 + }, + { + "epoch": 0.47739977857660293, + "grad_norm": 1.3960362672805786, + "learning_rate": 1.1748877611819247e-05, + "loss": 1.4472, + "step": 8732 + }, + { + "epoch": 0.47745445102032447, + "grad_norm": 1.5651803016662598, + "learning_rate": 1.1747078387882031e-05, + "loss": 1.5703, + "step": 8733 + }, + { + "epoch": 0.47750912346404606, + "grad_norm": 1.3641976118087769, + "learning_rate": 1.1745279105605774e-05, + "loss": 1.5958, + "step": 8734 + }, + { + "epoch": 0.4775637959077676, + "grad_norm": 1.6674115657806396, + "learning_rate": 1.1743479765050549e-05, + "loss": 1.5639, + "step": 8735 + }, + { + "epoch": 0.47761846835148913, + "grad_norm": 1.2764407396316528, + "learning_rate": 1.1741680366276442e-05, + "loss": 1.3629, + "step": 8736 + }, + { + "epoch": 0.47767314079521067, + "grad_norm": 1.3920365571975708, + "learning_rate": 1.173988090934354e-05, + "loss": 1.5309, + "step": 8737 + }, + { + "epoch": 0.47772781323893226, + "grad_norm": 1.5702521800994873, + "learning_rate": 1.1738081394311933e-05, + "loss": 1.3796, + "step": 8738 + }, + { + "epoch": 0.4777824856826538, + "grad_norm": 1.6601786613464355, + "learning_rate": 1.173628182124171e-05, + "loss": 1.4002, + "step": 8739 + }, + { + "epoch": 0.47783715812637534, + "grad_norm": 2.174722194671631, + "learning_rate": 1.1734482190192964e-05, + "loss": 1.4912, + "step": 8740 + }, + { + "epoch": 0.47789183057009693, + "grad_norm": 1.4734140634536743, + "learning_rate": 1.1732682501225785e-05, + "loss": 1.5534, + "step": 8741 + }, + { + "epoch": 0.47794650301381847, + "grad_norm": 1.678284764289856, + "learning_rate": 1.1730882754400274e-05, + "loss": 1.2961, + "step": 8742 + }, + { + "epoch": 0.47800117545754, + "grad_norm": 1.4224785566329956, + "learning_rate": 1.1729082949776524e-05, + "loss": 1.2974, + "step": 8743 + }, + { + "epoch": 0.47805584790126154, + "grad_norm": 1.3985365629196167, + "learning_rate": 1.1727283087414636e-05, + "loss": 1.4801, + "step": 8744 + }, + { + "epoch": 0.47811052034498314, + "grad_norm": 1.490596890449524, + "learning_rate": 1.1725483167374713e-05, + "loss": 1.3663, + "step": 8745 + }, + { + "epoch": 0.4781651927887047, + "grad_norm": 2.04182505607605, + "learning_rate": 1.1723683189716862e-05, + "loss": 1.3542, + "step": 8746 + }, + { + "epoch": 0.4782198652324262, + "grad_norm": 1.9577629566192627, + "learning_rate": 1.1721883154501184e-05, + "loss": 1.3628, + "step": 8747 + }, + { + "epoch": 0.4782745376761478, + "grad_norm": 1.4312087297439575, + "learning_rate": 1.1720083061787782e-05, + "loss": 1.329, + "step": 8748 + }, + { + "epoch": 0.47832921011986934, + "grad_norm": 2.2656707763671875, + "learning_rate": 1.1718282911636774e-05, + "loss": 1.3994, + "step": 8749 + }, + { + "epoch": 0.4783838825635909, + "grad_norm": 1.4151222705841064, + "learning_rate": 1.1716482704108265e-05, + "loss": 1.5026, + "step": 8750 + }, + { + "epoch": 0.4784385550073124, + "grad_norm": 1.445168375968933, + "learning_rate": 1.1714682439262373e-05, + "loss": 1.6602, + "step": 8751 + }, + { + "epoch": 0.478493227451034, + "grad_norm": 1.1595749855041504, + "learning_rate": 1.1712882117159207e-05, + "loss": 1.8933, + "step": 8752 + }, + { + "epoch": 0.47854789989475555, + "grad_norm": 1.3003252744674683, + "learning_rate": 1.1711081737858891e-05, + "loss": 1.6103, + "step": 8753 + }, + { + "epoch": 0.4786025723384771, + "grad_norm": 1.1748300790786743, + "learning_rate": 1.1709281301421543e-05, + "loss": 1.4545, + "step": 8754 + }, + { + "epoch": 0.4786572447821987, + "grad_norm": 1.2483184337615967, + "learning_rate": 1.1707480807907277e-05, + "loss": 1.6752, + "step": 8755 + }, + { + "epoch": 0.4787119172259202, + "grad_norm": 1.3616448640823364, + "learning_rate": 1.1705680257376224e-05, + "loss": 1.3471, + "step": 8756 + }, + { + "epoch": 0.47876658966964175, + "grad_norm": 1.4730181694030762, + "learning_rate": 1.1703879649888501e-05, + "loss": 1.4364, + "step": 8757 + }, + { + "epoch": 0.4788212621133633, + "grad_norm": 1.6339613199234009, + "learning_rate": 1.170207898550424e-05, + "loss": 1.5646, + "step": 8758 + }, + { + "epoch": 0.4788759345570849, + "grad_norm": 1.6471599340438843, + "learning_rate": 1.1700278264283567e-05, + "loss": 1.2693, + "step": 8759 + }, + { + "epoch": 0.4789306070008064, + "grad_norm": 1.802037000656128, + "learning_rate": 1.1698477486286615e-05, + "loss": 1.4389, + "step": 8760 + }, + { + "epoch": 0.47898527944452796, + "grad_norm": 1.1761802434921265, + "learning_rate": 1.169667665157351e-05, + "loss": 1.5637, + "step": 8761 + }, + { + "epoch": 0.47903995188824955, + "grad_norm": 1.4561609029769897, + "learning_rate": 1.1694875760204391e-05, + "loss": 1.422, + "step": 8762 + }, + { + "epoch": 0.4790946243319711, + "grad_norm": 1.3029766082763672, + "learning_rate": 1.1693074812239397e-05, + "loss": 1.6339, + "step": 8763 + }, + { + "epoch": 0.4791492967756926, + "grad_norm": 1.8288052082061768, + "learning_rate": 1.1691273807738659e-05, + "loss": 1.3452, + "step": 8764 + }, + { + "epoch": 0.47920396921941416, + "grad_norm": 1.5167632102966309, + "learning_rate": 1.1689472746762317e-05, + "loss": 1.5185, + "step": 8765 + }, + { + "epoch": 0.47925864166313575, + "grad_norm": 1.8151123523712158, + "learning_rate": 1.1687671629370522e-05, + "loss": 1.1442, + "step": 8766 + }, + { + "epoch": 0.4793133141068573, + "grad_norm": 1.6862984895706177, + "learning_rate": 1.1685870455623409e-05, + "loss": 1.3191, + "step": 8767 + }, + { + "epoch": 0.47936798655057883, + "grad_norm": 1.7543549537658691, + "learning_rate": 1.1684069225581126e-05, + "loss": 1.39, + "step": 8768 + }, + { + "epoch": 0.4794226589943004, + "grad_norm": 1.7186018228530884, + "learning_rate": 1.1682267939303815e-05, + "loss": 1.5565, + "step": 8769 + }, + { + "epoch": 0.47947733143802196, + "grad_norm": 1.7315620183944702, + "learning_rate": 1.1680466596851635e-05, + "loss": 1.6016, + "step": 8770 + }, + { + "epoch": 0.4795320038817435, + "grad_norm": 1.1471681594848633, + "learning_rate": 1.167866519828473e-05, + "loss": 1.3744, + "step": 8771 + }, + { + "epoch": 0.47958667632546503, + "grad_norm": 1.7287894487380981, + "learning_rate": 1.1676863743663254e-05, + "loss": 1.4022, + "step": 8772 + }, + { + "epoch": 0.4796413487691866, + "grad_norm": 1.5203373432159424, + "learning_rate": 1.1675062233047365e-05, + "loss": 1.4457, + "step": 8773 + }, + { + "epoch": 0.47969602121290816, + "grad_norm": 1.4519670009613037, + "learning_rate": 1.1673260666497218e-05, + "loss": 1.4358, + "step": 8774 + }, + { + "epoch": 0.4797506936566297, + "grad_norm": 1.5023316144943237, + "learning_rate": 1.1671459044072969e-05, + "loss": 1.391, + "step": 8775 + }, + { + "epoch": 0.4798053661003513, + "grad_norm": 1.7115163803100586, + "learning_rate": 1.1669657365834779e-05, + "loss": 1.3447, + "step": 8776 + }, + { + "epoch": 0.47986003854407283, + "grad_norm": 2.145456552505493, + "learning_rate": 1.1667855631842815e-05, + "loss": 1.5273, + "step": 8777 + }, + { + "epoch": 0.47991471098779437, + "grad_norm": 1.7604104280471802, + "learning_rate": 1.1666053842157234e-05, + "loss": 1.3717, + "step": 8778 + }, + { + "epoch": 0.4799693834315159, + "grad_norm": 1.368587613105774, + "learning_rate": 1.1664251996838209e-05, + "loss": 1.5488, + "step": 8779 + }, + { + "epoch": 0.4800240558752375, + "grad_norm": 1.5145524740219116, + "learning_rate": 1.1662450095945903e-05, + "loss": 1.4729, + "step": 8780 + }, + { + "epoch": 0.48007872831895904, + "grad_norm": 1.4347151517868042, + "learning_rate": 1.1660648139540487e-05, + "loss": 1.5766, + "step": 8781 + }, + { + "epoch": 0.4801334007626806, + "grad_norm": 1.4400122165679932, + "learning_rate": 1.1658846127682133e-05, + "loss": 1.3908, + "step": 8782 + }, + { + "epoch": 0.48018807320640217, + "grad_norm": 1.4946813583374023, + "learning_rate": 1.1657044060431012e-05, + "loss": 1.5582, + "step": 8783 + }, + { + "epoch": 0.4802427456501237, + "grad_norm": 1.298307180404663, + "learning_rate": 1.1655241937847305e-05, + "loss": 1.3791, + "step": 8784 + }, + { + "epoch": 0.48029741809384524, + "grad_norm": 1.65774405002594, + "learning_rate": 1.1653439759991185e-05, + "loss": 1.6717, + "step": 8785 + }, + { + "epoch": 0.4803520905375668, + "grad_norm": 1.2163894176483154, + "learning_rate": 1.1651637526922832e-05, + "loss": 1.3745, + "step": 8786 + }, + { + "epoch": 0.48040676298128837, + "grad_norm": 1.788888692855835, + "learning_rate": 1.1649835238702425e-05, + "loss": 1.5067, + "step": 8787 + }, + { + "epoch": 0.4804614354250099, + "grad_norm": 1.4490489959716797, + "learning_rate": 1.1648032895390148e-05, + "loss": 1.459, + "step": 8788 + }, + { + "epoch": 0.48051610786873145, + "grad_norm": 1.732333779335022, + "learning_rate": 1.1646230497046185e-05, + "loss": 1.203, + "step": 8789 + }, + { + "epoch": 0.48057078031245304, + "grad_norm": 1.6199595928192139, + "learning_rate": 1.1644428043730722e-05, + "loss": 1.6285, + "step": 8790 + }, + { + "epoch": 0.4806254527561746, + "grad_norm": 1.9809422492980957, + "learning_rate": 1.164262553550395e-05, + "loss": 1.5721, + "step": 8791 + }, + { + "epoch": 0.4806801251998961, + "grad_norm": 1.4196528196334839, + "learning_rate": 1.1640822972426055e-05, + "loss": 1.3911, + "step": 8792 + }, + { + "epoch": 0.48073479764361765, + "grad_norm": 1.3960943222045898, + "learning_rate": 1.1639020354557234e-05, + "loss": 1.4283, + "step": 8793 + }, + { + "epoch": 0.48078947008733924, + "grad_norm": 1.6180859804153442, + "learning_rate": 1.1637217681957673e-05, + "loss": 1.5001, + "step": 8794 + }, + { + "epoch": 0.4808441425310608, + "grad_norm": 1.8014740943908691, + "learning_rate": 1.1635414954687574e-05, + "loss": 1.3381, + "step": 8795 + }, + { + "epoch": 0.4808988149747823, + "grad_norm": 1.6007221937179565, + "learning_rate": 1.1633612172807131e-05, + "loss": 1.4264, + "step": 8796 + }, + { + "epoch": 0.4809534874185039, + "grad_norm": 1.4523342847824097, + "learning_rate": 1.1631809336376544e-05, + "loss": 1.2154, + "step": 8797 + }, + { + "epoch": 0.48100815986222545, + "grad_norm": 1.7276623249053955, + "learning_rate": 1.1630006445456015e-05, + "loss": 1.4773, + "step": 8798 + }, + { + "epoch": 0.481062832305947, + "grad_norm": 1.453268051147461, + "learning_rate": 1.1628203500105748e-05, + "loss": 1.467, + "step": 8799 + }, + { + "epoch": 0.4811175047496685, + "grad_norm": 1.7229136228561401, + "learning_rate": 1.1626400500385941e-05, + "loss": 1.4495, + "step": 8800 + }, + { + "epoch": 0.4811721771933901, + "grad_norm": 1.3912835121154785, + "learning_rate": 1.1624597446356807e-05, + "loss": 1.3956, + "step": 8801 + }, + { + "epoch": 0.48122684963711165, + "grad_norm": 1.4651403427124023, + "learning_rate": 1.1622794338078554e-05, + "loss": 1.47, + "step": 8802 + }, + { + "epoch": 0.4812815220808332, + "grad_norm": 1.6729516983032227, + "learning_rate": 1.1620991175611385e-05, + "loss": 1.3588, + "step": 8803 + }, + { + "epoch": 0.4813361945245548, + "grad_norm": 1.716783881187439, + "learning_rate": 1.1619187959015519e-05, + "loss": 1.5343, + "step": 8804 + }, + { + "epoch": 0.4813908669682763, + "grad_norm": 1.3072147369384766, + "learning_rate": 1.161738468835117e-05, + "loss": 1.3993, + "step": 8805 + }, + { + "epoch": 0.48144553941199786, + "grad_norm": 1.4443353414535522, + "learning_rate": 1.161558136367855e-05, + "loss": 1.2984, + "step": 8806 + }, + { + "epoch": 0.4815002118557194, + "grad_norm": 1.6660542488098145, + "learning_rate": 1.1613777985057877e-05, + "loss": 1.496, + "step": 8807 + }, + { + "epoch": 0.481554884299441, + "grad_norm": 1.6985665559768677, + "learning_rate": 1.1611974552549367e-05, + "loss": 1.4044, + "step": 8808 + }, + { + "epoch": 0.4816095567431625, + "grad_norm": 1.6173744201660156, + "learning_rate": 1.161017106621325e-05, + "loss": 1.3282, + "step": 8809 + }, + { + "epoch": 0.48166422918688406, + "grad_norm": 1.909162163734436, + "learning_rate": 1.1608367526109738e-05, + "loss": 1.4812, + "step": 8810 + }, + { + "epoch": 0.48171890163060566, + "grad_norm": 1.4029896259307861, + "learning_rate": 1.160656393229906e-05, + "loss": 1.463, + "step": 8811 + }, + { + "epoch": 0.4817735740743272, + "grad_norm": 1.2495567798614502, + "learning_rate": 1.1604760284841446e-05, + "loss": 1.3602, + "step": 8812 + }, + { + "epoch": 0.48182824651804873, + "grad_norm": 1.3515276908874512, + "learning_rate": 1.1602956583797118e-05, + "loss": 1.5374, + "step": 8813 + }, + { + "epoch": 0.48188291896177027, + "grad_norm": 2.262829542160034, + "learning_rate": 1.1601152829226308e-05, + "loss": 1.3155, + "step": 8814 + }, + { + "epoch": 0.48193759140549186, + "grad_norm": 1.4453094005584717, + "learning_rate": 1.1599349021189247e-05, + "loss": 1.5114, + "step": 8815 + }, + { + "epoch": 0.4819922638492134, + "grad_norm": 1.2466533184051514, + "learning_rate": 1.1597545159746174e-05, + "loss": 1.5343, + "step": 8816 + }, + { + "epoch": 0.48204693629293494, + "grad_norm": 1.698805570602417, + "learning_rate": 1.1595741244957312e-05, + "loss": 1.3651, + "step": 8817 + }, + { + "epoch": 0.48210160873665653, + "grad_norm": 1.7773606777191162, + "learning_rate": 1.1593937276882911e-05, + "loss": 1.5342, + "step": 8818 + }, + { + "epoch": 0.48215628118037807, + "grad_norm": 1.564233660697937, + "learning_rate": 1.1592133255583204e-05, + "loss": 1.5656, + "step": 8819 + }, + { + "epoch": 0.4822109536240996, + "grad_norm": 1.5003502368927002, + "learning_rate": 1.159032918111843e-05, + "loss": 1.5378, + "step": 8820 + }, + { + "epoch": 0.48226562606782114, + "grad_norm": 1.4162538051605225, + "learning_rate": 1.1588525053548831e-05, + "loss": 1.3787, + "step": 8821 + }, + { + "epoch": 0.48232029851154273, + "grad_norm": 1.504362940788269, + "learning_rate": 1.1586720872934654e-05, + "loss": 1.416, + "step": 8822 + }, + { + "epoch": 0.48237497095526427, + "grad_norm": 1.5976120233535767, + "learning_rate": 1.1584916639336146e-05, + "loss": 1.8197, + "step": 8823 + }, + { + "epoch": 0.4824296433989858, + "grad_norm": 1.711952567100525, + "learning_rate": 1.1583112352813548e-05, + "loss": 1.399, + "step": 8824 + }, + { + "epoch": 0.4824843158427074, + "grad_norm": 1.516013741493225, + "learning_rate": 1.158130801342712e-05, + "loss": 1.6366, + "step": 8825 + }, + { + "epoch": 0.48253898828642894, + "grad_norm": 1.4720593690872192, + "learning_rate": 1.1579503621237102e-05, + "loss": 1.4255, + "step": 8826 + }, + { + "epoch": 0.4825936607301505, + "grad_norm": 1.3824998140335083, + "learning_rate": 1.1577699176303752e-05, + "loss": 1.5099, + "step": 8827 + }, + { + "epoch": 0.482648333173872, + "grad_norm": 1.4063788652420044, + "learning_rate": 1.1575894678687325e-05, + "loss": 1.3395, + "step": 8828 + }, + { + "epoch": 0.4827030056175936, + "grad_norm": 1.628584384918213, + "learning_rate": 1.1574090128448075e-05, + "loss": 1.4882, + "step": 8829 + }, + { + "epoch": 0.48275767806131514, + "grad_norm": 2.017315626144409, + "learning_rate": 1.1572285525646265e-05, + "loss": 1.655, + "step": 8830 + }, + { + "epoch": 0.4828123505050367, + "grad_norm": 1.4360803365707397, + "learning_rate": 1.157048087034215e-05, + "loss": 1.294, + "step": 8831 + }, + { + "epoch": 0.4828670229487583, + "grad_norm": 2.223555326461792, + "learning_rate": 1.1568676162595995e-05, + "loss": 1.6034, + "step": 8832 + }, + { + "epoch": 0.4829216953924798, + "grad_norm": 1.4647692441940308, + "learning_rate": 1.156687140246806e-05, + "loss": 1.2747, + "step": 8833 + }, + { + "epoch": 0.48297636783620135, + "grad_norm": 1.3136236667633057, + "learning_rate": 1.1565066590018615e-05, + "loss": 1.2439, + "step": 8834 + }, + { + "epoch": 0.4830310402799229, + "grad_norm": 1.4956754446029663, + "learning_rate": 1.156326172530792e-05, + "loss": 1.4762, + "step": 8835 + }, + { + "epoch": 0.4830857127236445, + "grad_norm": 1.4079301357269287, + "learning_rate": 1.1561456808396248e-05, + "loss": 1.2239, + "step": 8836 + }, + { + "epoch": 0.483140385167366, + "grad_norm": 1.3078055381774902, + "learning_rate": 1.155965183934387e-05, + "loss": 1.351, + "step": 8837 + }, + { + "epoch": 0.48319505761108755, + "grad_norm": 1.2313836812973022, + "learning_rate": 1.1557846818211061e-05, + "loss": 1.5358, + "step": 8838 + }, + { + "epoch": 0.48324973005480915, + "grad_norm": 1.5134773254394531, + "learning_rate": 1.1556041745058086e-05, + "loss": 1.5295, + "step": 8839 + }, + { + "epoch": 0.4833044024985307, + "grad_norm": 1.0668028593063354, + "learning_rate": 1.1554236619945229e-05, + "loss": 1.5311, + "step": 8840 + }, + { + "epoch": 0.4833590749422522, + "grad_norm": 1.485655426979065, + "learning_rate": 1.1552431442932764e-05, + "loss": 1.3945, + "step": 8841 + }, + { + "epoch": 0.48341374738597376, + "grad_norm": 1.6131532192230225, + "learning_rate": 1.1550626214080967e-05, + "loss": 1.423, + "step": 8842 + }, + { + "epoch": 0.48346841982969535, + "grad_norm": 1.4199461936950684, + "learning_rate": 1.1548820933450125e-05, + "loss": 1.6223, + "step": 8843 + }, + { + "epoch": 0.4835230922734169, + "grad_norm": 1.8918038606643677, + "learning_rate": 1.1547015601100518e-05, + "loss": 1.3695, + "step": 8844 + }, + { + "epoch": 0.4835777647171384, + "grad_norm": 1.5935542583465576, + "learning_rate": 1.1545210217092428e-05, + "loss": 1.61, + "step": 8845 + }, + { + "epoch": 0.48363243716086, + "grad_norm": 1.432242512702942, + "learning_rate": 1.1543404781486142e-05, + "loss": 1.5258, + "step": 8846 + }, + { + "epoch": 0.48368710960458156, + "grad_norm": 1.1587163209915161, + "learning_rate": 1.1541599294341952e-05, + "loss": 1.3543, + "step": 8847 + }, + { + "epoch": 0.4837417820483031, + "grad_norm": 1.7838566303253174, + "learning_rate": 1.1539793755720141e-05, + "loss": 1.5709, + "step": 8848 + }, + { + "epoch": 0.48379645449202463, + "grad_norm": 1.3388149738311768, + "learning_rate": 1.1537988165681004e-05, + "loss": 1.3557, + "step": 8849 + }, + { + "epoch": 0.4838511269357462, + "grad_norm": 1.7754242420196533, + "learning_rate": 1.1536182524284833e-05, + "loss": 1.3285, + "step": 8850 + }, + { + "epoch": 0.48390579937946776, + "grad_norm": 1.5655518770217896, + "learning_rate": 1.1534376831591925e-05, + "loss": 1.3675, + "step": 8851 + }, + { + "epoch": 0.4839604718231893, + "grad_norm": 1.6410784721374512, + "learning_rate": 1.1532571087662575e-05, + "loss": 1.2582, + "step": 8852 + }, + { + "epoch": 0.4840151442669109, + "grad_norm": 1.7865839004516602, + "learning_rate": 1.1530765292557076e-05, + "loss": 1.4371, + "step": 8853 + }, + { + "epoch": 0.48406981671063243, + "grad_norm": 1.5040287971496582, + "learning_rate": 1.1528959446335735e-05, + "loss": 1.6031, + "step": 8854 + }, + { + "epoch": 0.48412448915435397, + "grad_norm": 1.4200316667556763, + "learning_rate": 1.1527153549058847e-05, + "loss": 1.496, + "step": 8855 + }, + { + "epoch": 0.4841791615980755, + "grad_norm": 1.242944598197937, + "learning_rate": 1.152534760078672e-05, + "loss": 1.421, + "step": 8856 + }, + { + "epoch": 0.4842338340417971, + "grad_norm": 1.4487078189849854, + "learning_rate": 1.1523541601579658e-05, + "loss": 1.4723, + "step": 8857 + }, + { + "epoch": 0.48428850648551863, + "grad_norm": 1.6601066589355469, + "learning_rate": 1.1521735551497967e-05, + "loss": 1.4874, + "step": 8858 + }, + { + "epoch": 0.48434317892924017, + "grad_norm": 1.9218058586120605, + "learning_rate": 1.1519929450601954e-05, + "loss": 1.4149, + "step": 8859 + }, + { + "epoch": 0.48439785137296176, + "grad_norm": 1.373572826385498, + "learning_rate": 1.1518123298951929e-05, + "loss": 1.5788, + "step": 8860 + }, + { + "epoch": 0.4844525238166833, + "grad_norm": 1.3707396984100342, + "learning_rate": 1.1516317096608207e-05, + "loss": 1.4806, + "step": 8861 + }, + { + "epoch": 0.48450719626040484, + "grad_norm": 1.4071695804595947, + "learning_rate": 1.1514510843631097e-05, + "loss": 1.5069, + "step": 8862 + }, + { + "epoch": 0.4845618687041264, + "grad_norm": 1.7301424741744995, + "learning_rate": 1.1512704540080916e-05, + "loss": 1.3554, + "step": 8863 + }, + { + "epoch": 0.48461654114784797, + "grad_norm": 1.4105530977249146, + "learning_rate": 1.1510898186017984e-05, + "loss": 1.4934, + "step": 8864 + }, + { + "epoch": 0.4846712135915695, + "grad_norm": 1.507028579711914, + "learning_rate": 1.1509091781502614e-05, + "loss": 1.406, + "step": 8865 + }, + { + "epoch": 0.48472588603529104, + "grad_norm": 1.5684077739715576, + "learning_rate": 1.1507285326595128e-05, + "loss": 1.4059, + "step": 8866 + }, + { + "epoch": 0.48478055847901264, + "grad_norm": 1.707497239112854, + "learning_rate": 1.1505478821355847e-05, + "loss": 1.6483, + "step": 8867 + }, + { + "epoch": 0.4848352309227342, + "grad_norm": 1.7206697463989258, + "learning_rate": 1.1503672265845098e-05, + "loss": 1.4644, + "step": 8868 + }, + { + "epoch": 0.4848899033664557, + "grad_norm": 1.9645471572875977, + "learning_rate": 1.1501865660123201e-05, + "loss": 1.3426, + "step": 8869 + }, + { + "epoch": 0.4849445758101773, + "grad_norm": 1.756959080696106, + "learning_rate": 1.1500059004250487e-05, + "loss": 1.4045, + "step": 8870 + }, + { + "epoch": 0.48499924825389884, + "grad_norm": 1.2096943855285645, + "learning_rate": 1.1498252298287282e-05, + "loss": 1.4607, + "step": 8871 + }, + { + "epoch": 0.4850539206976204, + "grad_norm": 1.8900563716888428, + "learning_rate": 1.1496445542293919e-05, + "loss": 1.3439, + "step": 8872 + }, + { + "epoch": 0.4851085931413419, + "grad_norm": 1.5553306341171265, + "learning_rate": 1.1494638736330726e-05, + "loss": 1.3081, + "step": 8873 + }, + { + "epoch": 0.4851632655850635, + "grad_norm": 1.6021537780761719, + "learning_rate": 1.1492831880458037e-05, + "loss": 1.3735, + "step": 8874 + }, + { + "epoch": 0.48521793802878505, + "grad_norm": 1.736459493637085, + "learning_rate": 1.1491024974736191e-05, + "loss": 1.4869, + "step": 8875 + }, + { + "epoch": 0.4852726104725066, + "grad_norm": 1.363729476928711, + "learning_rate": 1.1489218019225521e-05, + "loss": 1.2676, + "step": 8876 + }, + { + "epoch": 0.4853272829162282, + "grad_norm": 1.548830509185791, + "learning_rate": 1.1487411013986367e-05, + "loss": 1.1851, + "step": 8877 + }, + { + "epoch": 0.4853819553599497, + "grad_norm": 1.5451563596725464, + "learning_rate": 1.1485603959079067e-05, + "loss": 1.5529, + "step": 8878 + }, + { + "epoch": 0.48543662780367125, + "grad_norm": 1.4994852542877197, + "learning_rate": 1.1483796854563969e-05, + "loss": 1.4242, + "step": 8879 + }, + { + "epoch": 0.4854913002473928, + "grad_norm": 1.5163588523864746, + "learning_rate": 1.148198970050141e-05, + "loss": 1.3904, + "step": 8880 + }, + { + "epoch": 0.4855459726911144, + "grad_norm": 1.4671188592910767, + "learning_rate": 1.1480182496951735e-05, + "loss": 1.5973, + "step": 8881 + }, + { + "epoch": 0.4856006451348359, + "grad_norm": 1.488020420074463, + "learning_rate": 1.1478375243975298e-05, + "loss": 1.3528, + "step": 8882 + }, + { + "epoch": 0.48565531757855745, + "grad_norm": 1.4624968767166138, + "learning_rate": 1.1476567941632437e-05, + "loss": 1.4377, + "step": 8883 + }, + { + "epoch": 0.48570999002227905, + "grad_norm": 1.2467511892318726, + "learning_rate": 1.1474760589983513e-05, + "loss": 1.5122, + "step": 8884 + }, + { + "epoch": 0.4857646624660006, + "grad_norm": 1.4905014038085938, + "learning_rate": 1.1472953189088867e-05, + "loss": 1.5142, + "step": 8885 + }, + { + "epoch": 0.4858193349097221, + "grad_norm": 1.8440269231796265, + "learning_rate": 1.1471145739008863e-05, + "loss": 1.4935, + "step": 8886 + }, + { + "epoch": 0.48587400735344366, + "grad_norm": 1.2017462253570557, + "learning_rate": 1.1469338239803846e-05, + "loss": 1.5457, + "step": 8887 + }, + { + "epoch": 0.48592867979716525, + "grad_norm": 1.7384369373321533, + "learning_rate": 1.1467530691534178e-05, + "loss": 1.5058, + "step": 8888 + }, + { + "epoch": 0.4859833522408868, + "grad_norm": 1.2454719543457031, + "learning_rate": 1.1465723094260219e-05, + "loss": 1.4567, + "step": 8889 + }, + { + "epoch": 0.4860380246846083, + "grad_norm": 1.7439100742340088, + "learning_rate": 1.1463915448042326e-05, + "loss": 1.5207, + "step": 8890 + }, + { + "epoch": 0.4860926971283299, + "grad_norm": 1.30532968044281, + "learning_rate": 1.1462107752940859e-05, + "loss": 1.2385, + "step": 8891 + }, + { + "epoch": 0.48614736957205146, + "grad_norm": 1.3348971605300903, + "learning_rate": 1.1460300009016182e-05, + "loss": 1.5191, + "step": 8892 + }, + { + "epoch": 0.486202042015773, + "grad_norm": 1.6530629396438599, + "learning_rate": 1.1458492216328668e-05, + "loss": 1.3255, + "step": 8893 + }, + { + "epoch": 0.48625671445949453, + "grad_norm": 1.5082719326019287, + "learning_rate": 1.145668437493867e-05, + "loss": 1.295, + "step": 8894 + }, + { + "epoch": 0.4863113869032161, + "grad_norm": 1.4558569192886353, + "learning_rate": 1.1454876484906562e-05, + "loss": 1.5312, + "step": 8895 + }, + { + "epoch": 0.48636605934693766, + "grad_norm": 1.8394709825515747, + "learning_rate": 1.1453068546292718e-05, + "loss": 1.1423, + "step": 8896 + }, + { + "epoch": 0.4864207317906592, + "grad_norm": 1.4709019660949707, + "learning_rate": 1.1451260559157505e-05, + "loss": 1.5346, + "step": 8897 + }, + { + "epoch": 0.4864754042343808, + "grad_norm": 1.8390982151031494, + "learning_rate": 1.1449452523561295e-05, + "loss": 1.4671, + "step": 8898 + }, + { + "epoch": 0.48653007667810233, + "grad_norm": 1.6494065523147583, + "learning_rate": 1.1447644439564462e-05, + "loss": 1.6613, + "step": 8899 + }, + { + "epoch": 0.48658474912182387, + "grad_norm": 1.6146643161773682, + "learning_rate": 1.144583630722739e-05, + "loss": 1.6819, + "step": 8900 + }, + { + "epoch": 0.4866394215655454, + "grad_norm": 1.3306736946105957, + "learning_rate": 1.1444028126610445e-05, + "loss": 1.3345, + "step": 8901 + }, + { + "epoch": 0.486694094009267, + "grad_norm": 1.8556256294250488, + "learning_rate": 1.1442219897774014e-05, + "loss": 1.578, + "step": 8902 + }, + { + "epoch": 0.48674876645298853, + "grad_norm": 2.2298877239227295, + "learning_rate": 1.1440411620778478e-05, + "loss": 1.486, + "step": 8903 + }, + { + "epoch": 0.4868034388967101, + "grad_norm": 1.3818875551223755, + "learning_rate": 1.1438603295684219e-05, + "loss": 1.3711, + "step": 8904 + }, + { + "epoch": 0.48685811134043167, + "grad_norm": 1.689257025718689, + "learning_rate": 1.1436794922551617e-05, + "loss": 1.4412, + "step": 8905 + }, + { + "epoch": 0.4869127837841532, + "grad_norm": 1.53681480884552, + "learning_rate": 1.1434986501441062e-05, + "loss": 1.5481, + "step": 8906 + }, + { + "epoch": 0.48696745622787474, + "grad_norm": 1.4854025840759277, + "learning_rate": 1.1433178032412941e-05, + "loss": 1.4283, + "step": 8907 + }, + { + "epoch": 0.4870221286715963, + "grad_norm": 1.392552137374878, + "learning_rate": 1.1431369515527642e-05, + "loss": 1.2256, + "step": 8908 + }, + { + "epoch": 0.48707680111531787, + "grad_norm": 1.942867636680603, + "learning_rate": 1.1429560950845555e-05, + "loss": 1.4358, + "step": 8909 + }, + { + "epoch": 0.4871314735590394, + "grad_norm": 1.9163635969161987, + "learning_rate": 1.1427752338427075e-05, + "loss": 1.3277, + "step": 8910 + }, + { + "epoch": 0.48718614600276094, + "grad_norm": 1.5481315851211548, + "learning_rate": 1.1425943678332595e-05, + "loss": 1.3197, + "step": 8911 + }, + { + "epoch": 0.48724081844648254, + "grad_norm": 1.9906179904937744, + "learning_rate": 1.1424134970622507e-05, + "loss": 1.1985, + "step": 8912 + }, + { + "epoch": 0.4872954908902041, + "grad_norm": 1.5541249513626099, + "learning_rate": 1.1422326215357209e-05, + "loss": 1.4839, + "step": 8913 + }, + { + "epoch": 0.4873501633339256, + "grad_norm": 1.9275059700012207, + "learning_rate": 1.1420517412597106e-05, + "loss": 1.1677, + "step": 8914 + }, + { + "epoch": 0.48740483577764715, + "grad_norm": 1.5684564113616943, + "learning_rate": 1.1418708562402589e-05, + "loss": 1.4887, + "step": 8915 + }, + { + "epoch": 0.48745950822136874, + "grad_norm": 1.3681050539016724, + "learning_rate": 1.1416899664834066e-05, + "loss": 1.6804, + "step": 8916 + }, + { + "epoch": 0.4875141806650903, + "grad_norm": 1.455119013786316, + "learning_rate": 1.141509071995194e-05, + "loss": 1.5775, + "step": 8917 + }, + { + "epoch": 0.4875688531088118, + "grad_norm": 1.4337778091430664, + "learning_rate": 1.1413281727816612e-05, + "loss": 1.2949, + "step": 8918 + }, + { + "epoch": 0.4876235255525334, + "grad_norm": 1.677932620048523, + "learning_rate": 1.1411472688488489e-05, + "loss": 1.3485, + "step": 8919 + }, + { + "epoch": 0.48767819799625495, + "grad_norm": 1.9193453788757324, + "learning_rate": 1.1409663602027984e-05, + "loss": 1.3182, + "step": 8920 + }, + { + "epoch": 0.4877328704399765, + "grad_norm": 1.2908662557601929, + "learning_rate": 1.1407854468495502e-05, + "loss": 1.5484, + "step": 8921 + }, + { + "epoch": 0.487787542883698, + "grad_norm": 1.4009751081466675, + "learning_rate": 1.1406045287951458e-05, + "loss": 1.4718, + "step": 8922 + }, + { + "epoch": 0.4878422153274196, + "grad_norm": 1.5104337930679321, + "learning_rate": 1.140423606045626e-05, + "loss": 1.4506, + "step": 8923 + }, + { + "epoch": 0.48789688777114115, + "grad_norm": 1.648972988128662, + "learning_rate": 1.1402426786070326e-05, + "loss": 1.4033, + "step": 8924 + }, + { + "epoch": 0.4879515602148627, + "grad_norm": 1.5458825826644897, + "learning_rate": 1.1400617464854069e-05, + "loss": 1.257, + "step": 8925 + }, + { + "epoch": 0.4880062326585843, + "grad_norm": 1.3964635133743286, + "learning_rate": 1.1398808096867908e-05, + "loss": 1.2115, + "step": 8926 + }, + { + "epoch": 0.4880609051023058, + "grad_norm": 1.3749147653579712, + "learning_rate": 1.1396998682172264e-05, + "loss": 1.4354, + "step": 8927 + }, + { + "epoch": 0.48811557754602736, + "grad_norm": 1.2157167196273804, + "learning_rate": 1.1395189220827556e-05, + "loss": 1.548, + "step": 8928 + }, + { + "epoch": 0.4881702499897489, + "grad_norm": 1.7866437435150146, + "learning_rate": 1.1393379712894205e-05, + "loss": 1.4691, + "step": 8929 + }, + { + "epoch": 0.4882249224334705, + "grad_norm": 1.3993438482284546, + "learning_rate": 1.1391570158432636e-05, + "loss": 1.5419, + "step": 8930 + }, + { + "epoch": 0.488279594877192, + "grad_norm": 1.4912985563278198, + "learning_rate": 1.1389760557503275e-05, + "loss": 1.4484, + "step": 8931 + }, + { + "epoch": 0.48833426732091356, + "grad_norm": 1.5550270080566406, + "learning_rate": 1.1387950910166545e-05, + "loss": 1.4159, + "step": 8932 + }, + { + "epoch": 0.48838893976463515, + "grad_norm": 2.0218870639801025, + "learning_rate": 1.138614121648288e-05, + "loss": 1.4651, + "step": 8933 + }, + { + "epoch": 0.4884436122083567, + "grad_norm": 1.3284392356872559, + "learning_rate": 1.1384331476512706e-05, + "loss": 1.7359, + "step": 8934 + }, + { + "epoch": 0.48849828465207823, + "grad_norm": 1.5974559783935547, + "learning_rate": 1.1382521690316455e-05, + "loss": 1.3029, + "step": 8935 + }, + { + "epoch": 0.48855295709579977, + "grad_norm": 1.3371320962905884, + "learning_rate": 1.1380711857954562e-05, + "loss": 1.5291, + "step": 8936 + }, + { + "epoch": 0.48860762953952136, + "grad_norm": 1.1349551677703857, + "learning_rate": 1.137890197948746e-05, + "loss": 1.6098, + "step": 8937 + }, + { + "epoch": 0.4886623019832429, + "grad_norm": 1.5757699012756348, + "learning_rate": 1.1377092054975586e-05, + "loss": 1.3388, + "step": 8938 + }, + { + "epoch": 0.48871697442696443, + "grad_norm": 1.7344088554382324, + "learning_rate": 1.1375282084479373e-05, + "loss": 1.4829, + "step": 8939 + }, + { + "epoch": 0.488771646870686, + "grad_norm": 1.7285536527633667, + "learning_rate": 1.1373472068059266e-05, + "loss": 1.3912, + "step": 8940 + }, + { + "epoch": 0.48882631931440756, + "grad_norm": 2.59985613822937, + "learning_rate": 1.1371662005775705e-05, + "loss": 1.5883, + "step": 8941 + }, + { + "epoch": 0.4888809917581291, + "grad_norm": 1.7883342504501343, + "learning_rate": 1.1369851897689128e-05, + "loss": 1.2424, + "step": 8942 + }, + { + "epoch": 0.48893566420185064, + "grad_norm": 1.793935775756836, + "learning_rate": 1.1368041743859985e-05, + "loss": 1.4654, + "step": 8943 + }, + { + "epoch": 0.48899033664557223, + "grad_norm": 1.3650823831558228, + "learning_rate": 1.1366231544348716e-05, + "loss": 1.3618, + "step": 8944 + }, + { + "epoch": 0.48904500908929377, + "grad_norm": 1.4119360446929932, + "learning_rate": 1.1364421299215773e-05, + "loss": 1.4167, + "step": 8945 + }, + { + "epoch": 0.4890996815330153, + "grad_norm": 1.5211089849472046, + "learning_rate": 1.1362611008521597e-05, + "loss": 1.6089, + "step": 8946 + }, + { + "epoch": 0.4891543539767369, + "grad_norm": 1.4179450273513794, + "learning_rate": 1.1360800672326642e-05, + "loss": 1.6098, + "step": 8947 + }, + { + "epoch": 0.48920902642045844, + "grad_norm": 1.6606427431106567, + "learning_rate": 1.1358990290691364e-05, + "loss": 1.3854, + "step": 8948 + }, + { + "epoch": 0.48926369886418, + "grad_norm": 1.2770609855651855, + "learning_rate": 1.1357179863676207e-05, + "loss": 1.3402, + "step": 8949 + }, + { + "epoch": 0.4893183713079015, + "grad_norm": 2.221661329269409, + "learning_rate": 1.135536939134163e-05, + "loss": 1.5014, + "step": 8950 + }, + { + "epoch": 0.4893730437516231, + "grad_norm": 1.498981237411499, + "learning_rate": 1.135355887374809e-05, + "loss": 1.3561, + "step": 8951 + }, + { + "epoch": 0.48942771619534464, + "grad_norm": 1.4370530843734741, + "learning_rate": 1.135174831095604e-05, + "loss": 1.5664, + "step": 8952 + }, + { + "epoch": 0.4894823886390662, + "grad_norm": 2.364649534225464, + "learning_rate": 1.1349937703025944e-05, + "loss": 1.3261, + "step": 8953 + }, + { + "epoch": 0.48953706108278777, + "grad_norm": 1.4112101793289185, + "learning_rate": 1.134812705001826e-05, + "loss": 1.4146, + "step": 8954 + }, + { + "epoch": 0.4895917335265093, + "grad_norm": 1.5558239221572876, + "learning_rate": 1.1346316351993454e-05, + "loss": 1.5764, + "step": 8955 + }, + { + "epoch": 0.48964640597023085, + "grad_norm": 1.5874418020248413, + "learning_rate": 1.1344505609011983e-05, + "loss": 1.3936, + "step": 8956 + }, + { + "epoch": 0.4897010784139524, + "grad_norm": 1.4493896961212158, + "learning_rate": 1.1342694821134313e-05, + "loss": 1.2383, + "step": 8957 + }, + { + "epoch": 0.489755750857674, + "grad_norm": 1.888903021812439, + "learning_rate": 1.1340883988420912e-05, + "loss": 1.4692, + "step": 8958 + }, + { + "epoch": 0.4898104233013955, + "grad_norm": 1.6156282424926758, + "learning_rate": 1.133907311093225e-05, + "loss": 1.7379, + "step": 8959 + }, + { + "epoch": 0.48986509574511705, + "grad_norm": 1.7292752265930176, + "learning_rate": 1.1337262188728795e-05, + "loss": 1.7301, + "step": 8960 + }, + { + "epoch": 0.48991976818883864, + "grad_norm": 1.5319432020187378, + "learning_rate": 1.1335451221871015e-05, + "loss": 1.4781, + "step": 8961 + }, + { + "epoch": 0.4899744406325602, + "grad_norm": 2.092967987060547, + "learning_rate": 1.1333640210419388e-05, + "loss": 1.5447, + "step": 8962 + }, + { + "epoch": 0.4900291130762817, + "grad_norm": 1.5074514150619507, + "learning_rate": 1.1331829154434386e-05, + "loss": 1.555, + "step": 8963 + }, + { + "epoch": 0.49008378552000326, + "grad_norm": 1.271224021911621, + "learning_rate": 1.133001805397648e-05, + "loss": 1.5764, + "step": 8964 + }, + { + "epoch": 0.49013845796372485, + "grad_norm": 1.6949540376663208, + "learning_rate": 1.132820690910615e-05, + "loss": 1.4394, + "step": 8965 + }, + { + "epoch": 0.4901931304074464, + "grad_norm": 1.3375215530395508, + "learning_rate": 1.1326395719883876e-05, + "loss": 1.6955, + "step": 8966 + }, + { + "epoch": 0.4902478028511679, + "grad_norm": 1.4449561834335327, + "learning_rate": 1.1324584486370136e-05, + "loss": 1.6605, + "step": 8967 + }, + { + "epoch": 0.4903024752948895, + "grad_norm": 1.5400400161743164, + "learning_rate": 1.1322773208625413e-05, + "loss": 1.5486, + "step": 8968 + }, + { + "epoch": 0.49035714773861105, + "grad_norm": 1.2065459489822388, + "learning_rate": 1.1320961886710189e-05, + "loss": 1.4933, + "step": 8969 + }, + { + "epoch": 0.4904118201823326, + "grad_norm": 1.6123353242874146, + "learning_rate": 1.1319150520684946e-05, + "loss": 1.5001, + "step": 8970 + }, + { + "epoch": 0.49046649262605413, + "grad_norm": 1.696311354637146, + "learning_rate": 1.1317339110610171e-05, + "loss": 1.4277, + "step": 8971 + }, + { + "epoch": 0.4905211650697757, + "grad_norm": 1.67430579662323, + "learning_rate": 1.131552765654635e-05, + "loss": 1.4409, + "step": 8972 + }, + { + "epoch": 0.49057583751349726, + "grad_norm": 1.5750350952148438, + "learning_rate": 1.1313716158553978e-05, + "loss": 1.5569, + "step": 8973 + }, + { + "epoch": 0.4906305099572188, + "grad_norm": 1.867060661315918, + "learning_rate": 1.1311904616693539e-05, + "loss": 1.3336, + "step": 8974 + }, + { + "epoch": 0.4906851824009404, + "grad_norm": 1.5105445384979248, + "learning_rate": 1.1310093031025527e-05, + "loss": 1.4523, + "step": 8975 + }, + { + "epoch": 0.4907398548446619, + "grad_norm": 1.2466074228286743, + "learning_rate": 1.1308281401610434e-05, + "loss": 1.3542, + "step": 8976 + }, + { + "epoch": 0.49079452728838346, + "grad_norm": 1.6506707668304443, + "learning_rate": 1.1306469728508755e-05, + "loss": 1.3559, + "step": 8977 + }, + { + "epoch": 0.490849199732105, + "grad_norm": 1.37337327003479, + "learning_rate": 1.1304658011780985e-05, + "loss": 1.2202, + "step": 8978 + }, + { + "epoch": 0.4909038721758266, + "grad_norm": 1.37401282787323, + "learning_rate": 1.1302846251487623e-05, + "loss": 1.4734, + "step": 8979 + }, + { + "epoch": 0.49095854461954813, + "grad_norm": 1.2540513277053833, + "learning_rate": 1.130103444768917e-05, + "loss": 1.5525, + "step": 8980 + }, + { + "epoch": 0.49101321706326967, + "grad_norm": 1.1638590097427368, + "learning_rate": 1.1299222600446122e-05, + "loss": 1.5447, + "step": 8981 + }, + { + "epoch": 0.49106788950699126, + "grad_norm": 1.632697343826294, + "learning_rate": 1.129741070981898e-05, + "loss": 1.5584, + "step": 8982 + }, + { + "epoch": 0.4911225619507128, + "grad_norm": 1.207175612449646, + "learning_rate": 1.1295598775868255e-05, + "loss": 1.4719, + "step": 8983 + }, + { + "epoch": 0.49117723439443434, + "grad_norm": 1.6586004495620728, + "learning_rate": 1.1293786798654442e-05, + "loss": 1.4441, + "step": 8984 + }, + { + "epoch": 0.4912319068381559, + "grad_norm": 1.659017562866211, + "learning_rate": 1.1291974778238055e-05, + "loss": 1.7369, + "step": 8985 + }, + { + "epoch": 0.49128657928187747, + "grad_norm": 1.3988802433013916, + "learning_rate": 1.1290162714679596e-05, + "loss": 1.2193, + "step": 8986 + }, + { + "epoch": 0.491341251725599, + "grad_norm": 1.7546440362930298, + "learning_rate": 1.1288350608039577e-05, + "loss": 1.4661, + "step": 8987 + }, + { + "epoch": 0.49139592416932054, + "grad_norm": 1.6871289014816284, + "learning_rate": 1.1286538458378512e-05, + "loss": 1.4794, + "step": 8988 + }, + { + "epoch": 0.49145059661304213, + "grad_norm": 1.4938050508499146, + "learning_rate": 1.1284726265756904e-05, + "loss": 1.3912, + "step": 8989 + }, + { + "epoch": 0.49150526905676367, + "grad_norm": 1.8641606569290161, + "learning_rate": 1.1282914030235275e-05, + "loss": 1.4038, + "step": 8990 + }, + { + "epoch": 0.4915599415004852, + "grad_norm": 1.1470897197723389, + "learning_rate": 1.1281101751874132e-05, + "loss": 1.5685, + "step": 8991 + }, + { + "epoch": 0.49161461394420675, + "grad_norm": 1.3710230588912964, + "learning_rate": 1.1279289430733998e-05, + "loss": 1.7044, + "step": 8992 + }, + { + "epoch": 0.49166928638792834, + "grad_norm": 1.482952356338501, + "learning_rate": 1.1277477066875384e-05, + "loss": 1.3337, + "step": 8993 + }, + { + "epoch": 0.4917239588316499, + "grad_norm": 1.46403169631958, + "learning_rate": 1.1275664660358818e-05, + "loss": 1.563, + "step": 8994 + }, + { + "epoch": 0.4917786312753714, + "grad_norm": 1.8600584268569946, + "learning_rate": 1.1273852211244816e-05, + "loss": 1.3787, + "step": 8995 + }, + { + "epoch": 0.491833303719093, + "grad_norm": 1.272222876548767, + "learning_rate": 1.1272039719593898e-05, + "loss": 1.4321, + "step": 8996 + }, + { + "epoch": 0.49188797616281454, + "grad_norm": 1.6610180139541626, + "learning_rate": 1.1270227185466586e-05, + "loss": 1.3826, + "step": 8997 + }, + { + "epoch": 0.4919426486065361, + "grad_norm": 1.7027947902679443, + "learning_rate": 1.126841460892341e-05, + "loss": 1.4419, + "step": 8998 + }, + { + "epoch": 0.4919973210502576, + "grad_norm": 1.8076894283294678, + "learning_rate": 1.1266601990024893e-05, + "loss": 1.3111, + "step": 8999 + }, + { + "epoch": 0.4920519934939792, + "grad_norm": 1.6859153509140015, + "learning_rate": 1.1264789328831564e-05, + "loss": 1.2497, + "step": 9000 + }, + { + "epoch": 0.49210666593770075, + "grad_norm": 2.1182069778442383, + "learning_rate": 1.1262976625403954e-05, + "loss": 1.362, + "step": 9001 + }, + { + "epoch": 0.4921613383814223, + "grad_norm": 1.5709128379821777, + "learning_rate": 1.1261163879802587e-05, + "loss": 1.6316, + "step": 9002 + }, + { + "epoch": 0.4922160108251439, + "grad_norm": 1.233304738998413, + "learning_rate": 1.1259351092088e-05, + "loss": 1.5865, + "step": 9003 + }, + { + "epoch": 0.4922706832688654, + "grad_norm": 1.4332280158996582, + "learning_rate": 1.1257538262320724e-05, + "loss": 1.4468, + "step": 9004 + }, + { + "epoch": 0.49232535571258695, + "grad_norm": 1.5281051397323608, + "learning_rate": 1.1255725390561296e-05, + "loss": 1.4708, + "step": 9005 + }, + { + "epoch": 0.4923800281563085, + "grad_norm": 1.607839822769165, + "learning_rate": 1.1253912476870247e-05, + "loss": 1.5157, + "step": 9006 + }, + { + "epoch": 0.4924347006000301, + "grad_norm": 2.1112029552459717, + "learning_rate": 1.1252099521308124e-05, + "loss": 1.5246, + "step": 9007 + }, + { + "epoch": 0.4924893730437516, + "grad_norm": 1.7873423099517822, + "learning_rate": 1.1250286523935456e-05, + "loss": 1.4208, + "step": 9008 + }, + { + "epoch": 0.49254404548747316, + "grad_norm": 1.5200713872909546, + "learning_rate": 1.1248473484812787e-05, + "loss": 1.5005, + "step": 9009 + }, + { + "epoch": 0.49259871793119475, + "grad_norm": 1.6305299997329712, + "learning_rate": 1.124666040400066e-05, + "loss": 1.2805, + "step": 9010 + }, + { + "epoch": 0.4926533903749163, + "grad_norm": 1.3098591566085815, + "learning_rate": 1.1244847281559616e-05, + "loss": 1.393, + "step": 9011 + }, + { + "epoch": 0.4927080628186378, + "grad_norm": 1.4357250928878784, + "learning_rate": 1.12430341175502e-05, + "loss": 1.1656, + "step": 9012 + }, + { + "epoch": 0.49276273526235936, + "grad_norm": 1.4470914602279663, + "learning_rate": 1.1241220912032958e-05, + "loss": 1.4789, + "step": 9013 + }, + { + "epoch": 0.49281740770608096, + "grad_norm": 1.4877290725708008, + "learning_rate": 1.1239407665068437e-05, + "loss": 1.463, + "step": 9014 + }, + { + "epoch": 0.4928720801498025, + "grad_norm": 1.8302733898162842, + "learning_rate": 1.1237594376717188e-05, + "loss": 1.4525, + "step": 9015 + }, + { + "epoch": 0.49292675259352403, + "grad_norm": 1.542775273323059, + "learning_rate": 1.1235781047039756e-05, + "loss": 1.6095, + "step": 9016 + }, + { + "epoch": 0.4929814250372456, + "grad_norm": 1.65184485912323, + "learning_rate": 1.1233967676096693e-05, + "loss": 1.4789, + "step": 9017 + }, + { + "epoch": 0.49303609748096716, + "grad_norm": 1.4931288957595825, + "learning_rate": 1.1232154263948558e-05, + "loss": 1.6049, + "step": 9018 + }, + { + "epoch": 0.4930907699246887, + "grad_norm": 1.4774852991104126, + "learning_rate": 1.12303408106559e-05, + "loss": 1.3183, + "step": 9019 + }, + { + "epoch": 0.49314544236841024, + "grad_norm": 1.5324114561080933, + "learning_rate": 1.1228527316279273e-05, + "loss": 1.3246, + "step": 9020 + }, + { + "epoch": 0.49320011481213183, + "grad_norm": 1.4271003007888794, + "learning_rate": 1.1226713780879236e-05, + "loss": 1.3738, + "step": 9021 + }, + { + "epoch": 0.49325478725585337, + "grad_norm": 1.4373724460601807, + "learning_rate": 1.122490020451635e-05, + "loss": 1.2453, + "step": 9022 + }, + { + "epoch": 0.4933094596995749, + "grad_norm": 1.343727469444275, + "learning_rate": 1.122308658725117e-05, + "loss": 1.2727, + "step": 9023 + }, + { + "epoch": 0.4933641321432965, + "grad_norm": 1.423890471458435, + "learning_rate": 1.122127292914426e-05, + "loss": 1.5531, + "step": 9024 + }, + { + "epoch": 0.49341880458701803, + "grad_norm": 1.3803104162216187, + "learning_rate": 1.1219459230256182e-05, + "loss": 1.4246, + "step": 9025 + }, + { + "epoch": 0.49347347703073957, + "grad_norm": 1.5930869579315186, + "learning_rate": 1.1217645490647494e-05, + "loss": 1.464, + "step": 9026 + }, + { + "epoch": 0.4935281494744611, + "grad_norm": 1.9304375648498535, + "learning_rate": 1.1215831710378772e-05, + "loss": 1.4901, + "step": 9027 + }, + { + "epoch": 0.4935828219181827, + "grad_norm": 1.5317354202270508, + "learning_rate": 1.1214017889510573e-05, + "loss": 1.3164, + "step": 9028 + }, + { + "epoch": 0.49363749436190424, + "grad_norm": 1.6734964847564697, + "learning_rate": 1.1212204028103469e-05, + "loss": 1.2138, + "step": 9029 + }, + { + "epoch": 0.4936921668056258, + "grad_norm": 1.8185428380966187, + "learning_rate": 1.1210390126218024e-05, + "loss": 1.4202, + "step": 9030 + }, + { + "epoch": 0.49374683924934737, + "grad_norm": 1.4786337614059448, + "learning_rate": 1.1208576183914816e-05, + "loss": 1.313, + "step": 9031 + }, + { + "epoch": 0.4938015116930689, + "grad_norm": 1.3926070928573608, + "learning_rate": 1.1206762201254415e-05, + "loss": 1.4335, + "step": 9032 + }, + { + "epoch": 0.49385618413679044, + "grad_norm": 1.6531686782836914, + "learning_rate": 1.1204948178297387e-05, + "loss": 1.4299, + "step": 9033 + }, + { + "epoch": 0.493910856580512, + "grad_norm": 1.4320236444473267, + "learning_rate": 1.1203134115104318e-05, + "loss": 1.3978, + "step": 9034 + }, + { + "epoch": 0.4939655290242336, + "grad_norm": 1.4921057224273682, + "learning_rate": 1.1201320011735772e-05, + "loss": 1.5081, + "step": 9035 + }, + { + "epoch": 0.4940202014679551, + "grad_norm": 2.031930446624756, + "learning_rate": 1.1199505868252336e-05, + "loss": 1.5246, + "step": 9036 + }, + { + "epoch": 0.49407487391167665, + "grad_norm": 1.2902096509933472, + "learning_rate": 1.1197691684714582e-05, + "loss": 1.5414, + "step": 9037 + }, + { + "epoch": 0.49412954635539824, + "grad_norm": 1.4765598773956299, + "learning_rate": 1.1195877461183091e-05, + "loss": 1.2358, + "step": 9038 + }, + { + "epoch": 0.4941842187991198, + "grad_norm": 1.5124248266220093, + "learning_rate": 1.119406319771845e-05, + "loss": 1.5065, + "step": 9039 + }, + { + "epoch": 0.4942388912428413, + "grad_norm": 1.6825575828552246, + "learning_rate": 1.1192248894381234e-05, + "loss": 1.3907, + "step": 9040 + }, + { + "epoch": 0.49429356368656285, + "grad_norm": 1.5144683122634888, + "learning_rate": 1.119043455123203e-05, + "loss": 1.4499, + "step": 9041 + }, + { + "epoch": 0.49434823613028445, + "grad_norm": 2.4172732830047607, + "learning_rate": 1.1188620168331421e-05, + "loss": 1.2449, + "step": 9042 + }, + { + "epoch": 0.494402908574006, + "grad_norm": 1.3598355054855347, + "learning_rate": 1.1186805745739999e-05, + "loss": 1.2303, + "step": 9043 + }, + { + "epoch": 0.4944575810177275, + "grad_norm": 1.3184856176376343, + "learning_rate": 1.1184991283518346e-05, + "loss": 1.2681, + "step": 9044 + }, + { + "epoch": 0.4945122534614491, + "grad_norm": 1.8372132778167725, + "learning_rate": 1.1183176781727052e-05, + "loss": 1.4101, + "step": 9045 + }, + { + "epoch": 0.49456692590517065, + "grad_norm": 1.2629607915878296, + "learning_rate": 1.1181362240426711e-05, + "loss": 1.4246, + "step": 9046 + }, + { + "epoch": 0.4946215983488922, + "grad_norm": 1.7297643423080444, + "learning_rate": 1.1179547659677915e-05, + "loss": 1.5764, + "step": 9047 + }, + { + "epoch": 0.4946762707926137, + "grad_norm": 1.5158376693725586, + "learning_rate": 1.117773303954125e-05, + "loss": 1.3593, + "step": 9048 + }, + { + "epoch": 0.4947309432363353, + "grad_norm": 1.7994449138641357, + "learning_rate": 1.1175918380077316e-05, + "loss": 1.4505, + "step": 9049 + }, + { + "epoch": 0.49478561568005686, + "grad_norm": 1.454485535621643, + "learning_rate": 1.1174103681346711e-05, + "loss": 1.5186, + "step": 9050 + }, + { + "epoch": 0.4948402881237784, + "grad_norm": 2.179913282394409, + "learning_rate": 1.1172288943410025e-05, + "loss": 1.777, + "step": 9051 + }, + { + "epoch": 0.4948949605675, + "grad_norm": 1.7091996669769287, + "learning_rate": 1.117047416632786e-05, + "loss": 1.5747, + "step": 9052 + }, + { + "epoch": 0.4949496330112215, + "grad_norm": 1.5550251007080078, + "learning_rate": 1.116865935016082e-05, + "loss": 1.6117, + "step": 9053 + }, + { + "epoch": 0.49500430545494306, + "grad_norm": 1.3951228857040405, + "learning_rate": 1.1166844494969501e-05, + "loss": 1.4391, + "step": 9054 + }, + { + "epoch": 0.4950589778986646, + "grad_norm": 1.3702348470687866, + "learning_rate": 1.1165029600814505e-05, + "loss": 1.3818, + "step": 9055 + }, + { + "epoch": 0.4951136503423862, + "grad_norm": 1.4664639234542847, + "learning_rate": 1.1163214667756437e-05, + "loss": 1.3196, + "step": 9056 + }, + { + "epoch": 0.49516832278610773, + "grad_norm": 1.3326876163482666, + "learning_rate": 1.1161399695855903e-05, + "loss": 1.4801, + "step": 9057 + }, + { + "epoch": 0.49522299522982927, + "grad_norm": 1.3222215175628662, + "learning_rate": 1.1159584685173506e-05, + "loss": 1.3225, + "step": 9058 + }, + { + "epoch": 0.49527766767355086, + "grad_norm": 1.8077224493026733, + "learning_rate": 1.1157769635769857e-05, + "loss": 1.591, + "step": 9059 + }, + { + "epoch": 0.4953323401172724, + "grad_norm": 1.6321781873703003, + "learning_rate": 1.1155954547705563e-05, + "loss": 1.4966, + "step": 9060 + }, + { + "epoch": 0.49538701256099393, + "grad_norm": 1.534729242324829, + "learning_rate": 1.1154139421041232e-05, + "loss": 1.5049, + "step": 9061 + }, + { + "epoch": 0.49544168500471547, + "grad_norm": 1.31524658203125, + "learning_rate": 1.115232425583748e-05, + "loss": 1.5046, + "step": 9062 + }, + { + "epoch": 0.49549635744843706, + "grad_norm": 1.4904879331588745, + "learning_rate": 1.1150509052154913e-05, + "loss": 1.3651, + "step": 9063 + }, + { + "epoch": 0.4955510298921586, + "grad_norm": 1.7160848379135132, + "learning_rate": 1.1148693810054152e-05, + "loss": 1.526, + "step": 9064 + }, + { + "epoch": 0.49560570233588014, + "grad_norm": 1.9921705722808838, + "learning_rate": 1.1146878529595808e-05, + "loss": 1.6044, + "step": 9065 + }, + { + "epoch": 0.49566037477960173, + "grad_norm": 2.2198550701141357, + "learning_rate": 1.11450632108405e-05, + "loss": 1.6854, + "step": 9066 + }, + { + "epoch": 0.49571504722332327, + "grad_norm": 1.6037672758102417, + "learning_rate": 1.1143247853848846e-05, + "loss": 1.4873, + "step": 9067 + }, + { + "epoch": 0.4957697196670448, + "grad_norm": 1.6671561002731323, + "learning_rate": 1.114143245868146e-05, + "loss": 1.3329, + "step": 9068 + }, + { + "epoch": 0.4958243921107664, + "grad_norm": 1.5334960222244263, + "learning_rate": 1.1139617025398968e-05, + "loss": 1.3203, + "step": 9069 + }, + { + "epoch": 0.49587906455448794, + "grad_norm": 1.4010436534881592, + "learning_rate": 1.1137801554061987e-05, + "loss": 1.613, + "step": 9070 + }, + { + "epoch": 0.4959337369982095, + "grad_norm": 1.668505311012268, + "learning_rate": 1.1135986044731144e-05, + "loss": 1.4345, + "step": 9071 + }, + { + "epoch": 0.495988409441931, + "grad_norm": 1.4911911487579346, + "learning_rate": 1.113417049746706e-05, + "loss": 1.393, + "step": 9072 + }, + { + "epoch": 0.4960430818856526, + "grad_norm": 1.3597325086593628, + "learning_rate": 1.1132354912330366e-05, + "loss": 1.4464, + "step": 9073 + }, + { + "epoch": 0.49609775432937414, + "grad_norm": 1.6729313135147095, + "learning_rate": 1.113053928938168e-05, + "loss": 1.4241, + "step": 9074 + }, + { + "epoch": 0.4961524267730957, + "grad_norm": 1.6839046478271484, + "learning_rate": 1.1128723628681635e-05, + "loss": 1.6424, + "step": 9075 + }, + { + "epoch": 0.49620709921681727, + "grad_norm": 1.593008041381836, + "learning_rate": 1.1126907930290861e-05, + "loss": 1.6525, + "step": 9076 + }, + { + "epoch": 0.4962617716605388, + "grad_norm": 1.362743616104126, + "learning_rate": 1.1125092194269982e-05, + "loss": 1.4632, + "step": 9077 + }, + { + "epoch": 0.49631644410426035, + "grad_norm": 1.5301616191864014, + "learning_rate": 1.112327642067964e-05, + "loss": 1.5672, + "step": 9078 + }, + { + "epoch": 0.4963711165479819, + "grad_norm": 1.4728647470474243, + "learning_rate": 1.1121460609580461e-05, + "loss": 1.4847, + "step": 9079 + }, + { + "epoch": 0.4964257889917035, + "grad_norm": 1.6042811870574951, + "learning_rate": 1.1119644761033079e-05, + "loss": 1.5637, + "step": 9080 + }, + { + "epoch": 0.496480461435425, + "grad_norm": 1.1782232522964478, + "learning_rate": 1.1117828875098136e-05, + "loss": 1.4594, + "step": 9081 + }, + { + "epoch": 0.49653513387914655, + "grad_norm": 1.2432860136032104, + "learning_rate": 1.1116012951836257e-05, + "loss": 1.5082, + "step": 9082 + }, + { + "epoch": 0.49658980632286814, + "grad_norm": 1.7155601978302002, + "learning_rate": 1.1114196991308091e-05, + "loss": 1.5989, + "step": 9083 + }, + { + "epoch": 0.4966444787665897, + "grad_norm": 1.413665771484375, + "learning_rate": 1.111238099357427e-05, + "loss": 1.415, + "step": 9084 + }, + { + "epoch": 0.4966991512103112, + "grad_norm": 1.7274888753890991, + "learning_rate": 1.111056495869544e-05, + "loss": 1.5543, + "step": 9085 + }, + { + "epoch": 0.49675382365403276, + "grad_norm": 1.4389283657073975, + "learning_rate": 1.1108748886732239e-05, + "loss": 1.4364, + "step": 9086 + }, + { + "epoch": 0.49680849609775435, + "grad_norm": 1.7308944463729858, + "learning_rate": 1.110693277774531e-05, + "loss": 1.2666, + "step": 9087 + }, + { + "epoch": 0.4968631685414759, + "grad_norm": 1.3710806369781494, + "learning_rate": 1.1105116631795301e-05, + "loss": 1.4424, + "step": 9088 + }, + { + "epoch": 0.4969178409851974, + "grad_norm": 1.6910371780395508, + "learning_rate": 1.110330044894285e-05, + "loss": 1.5524, + "step": 9089 + }, + { + "epoch": 0.496972513428919, + "grad_norm": 1.4939593076705933, + "learning_rate": 1.1101484229248612e-05, + "loss": 1.4615, + "step": 9090 + }, + { + "epoch": 0.49702718587264055, + "grad_norm": 1.6777786016464233, + "learning_rate": 1.1099667972773228e-05, + "loss": 1.6064, + "step": 9091 + }, + { + "epoch": 0.4970818583163621, + "grad_norm": 1.696713924407959, + "learning_rate": 1.1097851679577351e-05, + "loss": 1.4693, + "step": 9092 + }, + { + "epoch": 0.49713653076008363, + "grad_norm": 1.5385689735412598, + "learning_rate": 1.1096035349721633e-05, + "loss": 1.4883, + "step": 9093 + }, + { + "epoch": 0.4971912032038052, + "grad_norm": 1.2810754776000977, + "learning_rate": 1.1094218983266718e-05, + "loss": 1.5046, + "step": 9094 + }, + { + "epoch": 0.49724587564752676, + "grad_norm": 1.3849866390228271, + "learning_rate": 1.1092402580273268e-05, + "loss": 1.5446, + "step": 9095 + }, + { + "epoch": 0.4973005480912483, + "grad_norm": 1.469712257385254, + "learning_rate": 1.109058614080193e-05, + "loss": 1.4392, + "step": 9096 + }, + { + "epoch": 0.4973552205349699, + "grad_norm": 1.3392822742462158, + "learning_rate": 1.1088769664913359e-05, + "loss": 1.5295, + "step": 9097 + }, + { + "epoch": 0.4974098929786914, + "grad_norm": 1.416450023651123, + "learning_rate": 1.1086953152668218e-05, + "loss": 1.2285, + "step": 9098 + }, + { + "epoch": 0.49746456542241296, + "grad_norm": 1.7780925035476685, + "learning_rate": 1.1085136604127161e-05, + "loss": 1.2641, + "step": 9099 + }, + { + "epoch": 0.4975192378661345, + "grad_norm": 1.222494125366211, + "learning_rate": 1.1083320019350845e-05, + "loss": 1.5802, + "step": 9100 + }, + { + "epoch": 0.4975739103098561, + "grad_norm": 1.8617514371871948, + "learning_rate": 1.108150339839993e-05, + "loss": 1.3947, + "step": 9101 + }, + { + "epoch": 0.49762858275357763, + "grad_norm": 1.6611082553863525, + "learning_rate": 1.107968674133508e-05, + "loss": 1.522, + "step": 9102 + }, + { + "epoch": 0.49768325519729917, + "grad_norm": 1.71011483669281, + "learning_rate": 1.1077870048216957e-05, + "loss": 1.3687, + "step": 9103 + }, + { + "epoch": 0.49773792764102076, + "grad_norm": 1.66810142993927, + "learning_rate": 1.107605331910622e-05, + "loss": 1.359, + "step": 9104 + }, + { + "epoch": 0.4977926000847423, + "grad_norm": 1.5113811492919922, + "learning_rate": 1.1074236554063543e-05, + "loss": 1.5404, + "step": 9105 + }, + { + "epoch": 0.49784727252846384, + "grad_norm": 1.36642587184906, + "learning_rate": 1.1072419753149585e-05, + "loss": 1.4316, + "step": 9106 + }, + { + "epoch": 0.4979019449721854, + "grad_norm": 1.5294109582901, + "learning_rate": 1.1070602916425013e-05, + "loss": 1.1739, + "step": 9107 + }, + { + "epoch": 0.49795661741590697, + "grad_norm": 1.646868109703064, + "learning_rate": 1.1068786043950496e-05, + "loss": 1.2835, + "step": 9108 + }, + { + "epoch": 0.4980112898596285, + "grad_norm": 1.3772653341293335, + "learning_rate": 1.1066969135786711e-05, + "loss": 1.5028, + "step": 9109 + }, + { + "epoch": 0.49806596230335004, + "grad_norm": 1.2030298709869385, + "learning_rate": 1.1065152191994318e-05, + "loss": 1.1744, + "step": 9110 + }, + { + "epoch": 0.49812063474707163, + "grad_norm": 1.486496090888977, + "learning_rate": 1.1063335212633996e-05, + "loss": 1.1857, + "step": 9111 + }, + { + "epoch": 0.49817530719079317, + "grad_norm": 2.0692009925842285, + "learning_rate": 1.1061518197766415e-05, + "loss": 1.2597, + "step": 9112 + }, + { + "epoch": 0.4982299796345147, + "grad_norm": 1.6497005224227905, + "learning_rate": 1.105970114745225e-05, + "loss": 1.1629, + "step": 9113 + }, + { + "epoch": 0.49828465207823625, + "grad_norm": 1.3772224187850952, + "learning_rate": 1.1057884061752177e-05, + "loss": 1.6137, + "step": 9114 + }, + { + "epoch": 0.49833932452195784, + "grad_norm": 1.429924488067627, + "learning_rate": 1.1056066940726874e-05, + "loss": 1.5155, + "step": 9115 + }, + { + "epoch": 0.4983939969656794, + "grad_norm": 1.391161322593689, + "learning_rate": 1.1054249784437018e-05, + "loss": 1.4137, + "step": 9116 + }, + { + "epoch": 0.4984486694094009, + "grad_norm": 1.4801071882247925, + "learning_rate": 1.1052432592943287e-05, + "loss": 1.1817, + "step": 9117 + }, + { + "epoch": 0.4985033418531225, + "grad_norm": 1.348170518875122, + "learning_rate": 1.1050615366306364e-05, + "loss": 1.6361, + "step": 9118 + }, + { + "epoch": 0.49855801429684404, + "grad_norm": 1.5469908714294434, + "learning_rate": 1.1048798104586923e-05, + "loss": 1.5002, + "step": 9119 + }, + { + "epoch": 0.4986126867405656, + "grad_norm": 1.6903156042099, + "learning_rate": 1.104698080784566e-05, + "loss": 1.4688, + "step": 9120 + }, + { + "epoch": 0.4986673591842871, + "grad_norm": 1.263737440109253, + "learning_rate": 1.1045163476143247e-05, + "loss": 1.5317, + "step": 9121 + }, + { + "epoch": 0.4987220316280087, + "grad_norm": 1.6398712396621704, + "learning_rate": 1.104334610954037e-05, + "loss": 1.4981, + "step": 9122 + }, + { + "epoch": 0.49877670407173025, + "grad_norm": 1.9135830402374268, + "learning_rate": 1.1041528708097722e-05, + "loss": 1.4283, + "step": 9123 + }, + { + "epoch": 0.4988313765154518, + "grad_norm": 1.5098321437835693, + "learning_rate": 1.1039711271875986e-05, + "loss": 1.5114, + "step": 9124 + }, + { + "epoch": 0.4988860489591734, + "grad_norm": 1.5892633199691772, + "learning_rate": 1.1037893800935851e-05, + "loss": 1.2423, + "step": 9125 + }, + { + "epoch": 0.4989407214028949, + "grad_norm": 1.814576268196106, + "learning_rate": 1.1036076295338005e-05, + "loss": 1.5519, + "step": 9126 + }, + { + "epoch": 0.49899539384661645, + "grad_norm": 1.3491311073303223, + "learning_rate": 1.1034258755143141e-05, + "loss": 1.3456, + "step": 9127 + }, + { + "epoch": 0.499050066290338, + "grad_norm": 1.409963607788086, + "learning_rate": 1.103244118041195e-05, + "loss": 1.6049, + "step": 9128 + }, + { + "epoch": 0.4991047387340596, + "grad_norm": 1.4327152967453003, + "learning_rate": 1.1030623571205125e-05, + "loss": 1.4183, + "step": 9129 + }, + { + "epoch": 0.4991594111777811, + "grad_norm": 1.348069190979004, + "learning_rate": 1.1028805927583362e-05, + "loss": 1.5365, + "step": 9130 + }, + { + "epoch": 0.49921408362150266, + "grad_norm": 1.3394603729248047, + "learning_rate": 1.1026988249607355e-05, + "loss": 1.5305, + "step": 9131 + }, + { + "epoch": 0.49926875606522425, + "grad_norm": 1.4635164737701416, + "learning_rate": 1.1025170537337799e-05, + "loss": 1.38, + "step": 9132 + }, + { + "epoch": 0.4993234285089458, + "grad_norm": 1.7237355709075928, + "learning_rate": 1.1023352790835393e-05, + "loss": 1.406, + "step": 9133 + }, + { + "epoch": 0.4993781009526673, + "grad_norm": 1.3601316213607788, + "learning_rate": 1.1021535010160838e-05, + "loss": 1.4193, + "step": 9134 + }, + { + "epoch": 0.49943277339638886, + "grad_norm": 1.578937292098999, + "learning_rate": 1.1019717195374828e-05, + "loss": 1.4365, + "step": 9135 + }, + { + "epoch": 0.49948744584011046, + "grad_norm": 1.5148382186889648, + "learning_rate": 1.1017899346538071e-05, + "loss": 1.611, + "step": 9136 + }, + { + "epoch": 0.499542118283832, + "grad_norm": 1.4015716314315796, + "learning_rate": 1.1016081463711266e-05, + "loss": 1.4773, + "step": 9137 + }, + { + "epoch": 0.49959679072755353, + "grad_norm": 1.6721147298812866, + "learning_rate": 1.1014263546955118e-05, + "loss": 1.5462, + "step": 9138 + }, + { + "epoch": 0.4996514631712751, + "grad_norm": 1.6913115978240967, + "learning_rate": 1.1012445596330327e-05, + "loss": 1.5385, + "step": 9139 + }, + { + "epoch": 0.49970613561499666, + "grad_norm": 1.29339599609375, + "learning_rate": 1.1010627611897602e-05, + "loss": 1.5467, + "step": 9140 + }, + { + "epoch": 0.4997608080587182, + "grad_norm": 1.3076832294464111, + "learning_rate": 1.1008809593717653e-05, + "loss": 1.722, + "step": 9141 + }, + { + "epoch": 0.49981548050243974, + "grad_norm": 1.844018816947937, + "learning_rate": 1.100699154185118e-05, + "loss": 1.3199, + "step": 9142 + }, + { + "epoch": 0.49987015294616133, + "grad_norm": 1.3650773763656616, + "learning_rate": 1.1005173456358898e-05, + "loss": 1.4179, + "step": 9143 + }, + { + "epoch": 0.49992482538988287, + "grad_norm": 1.8873788118362427, + "learning_rate": 1.1003355337301517e-05, + "loss": 1.228, + "step": 9144 + }, + { + "epoch": 0.4999794978336044, + "grad_norm": 1.5449836254119873, + "learning_rate": 1.1001537184739748e-05, + "loss": 1.5794, + "step": 9145 + }, + { + "epoch": 0.500034170277326, + "grad_norm": 1.782670259475708, + "learning_rate": 1.0999718998734298e-05, + "loss": 1.4638, + "step": 9146 + }, + { + "epoch": 0.5000888427210475, + "grad_norm": 1.4301159381866455, + "learning_rate": 1.0997900779345887e-05, + "loss": 1.511, + "step": 9147 + }, + { + "epoch": 0.5001435151647691, + "grad_norm": 1.4123033285140991, + "learning_rate": 1.0996082526635227e-05, + "loss": 1.4383, + "step": 9148 + }, + { + "epoch": 0.5001981876084907, + "grad_norm": 1.339206576347351, + "learning_rate": 1.0994264240663035e-05, + "loss": 1.4151, + "step": 9149 + }, + { + "epoch": 0.5002528600522121, + "grad_norm": 1.5700911283493042, + "learning_rate": 1.0992445921490026e-05, + "loss": 1.3295, + "step": 9150 + }, + { + "epoch": 0.5003075324959337, + "grad_norm": 1.29872727394104, + "learning_rate": 1.0990627569176921e-05, + "loss": 1.3759, + "step": 9151 + }, + { + "epoch": 0.5003622049396553, + "grad_norm": 1.5021088123321533, + "learning_rate": 1.0988809183784435e-05, + "loss": 1.4874, + "step": 9152 + }, + { + "epoch": 0.5004168773833768, + "grad_norm": 1.4564024209976196, + "learning_rate": 1.0986990765373289e-05, + "loss": 1.4637, + "step": 9153 + }, + { + "epoch": 0.5004715498270984, + "grad_norm": 1.4348844289779663, + "learning_rate": 1.0985172314004205e-05, + "loss": 1.6357, + "step": 9154 + }, + { + "epoch": 0.50052622227082, + "grad_norm": 1.2908122539520264, + "learning_rate": 1.0983353829737909e-05, + "loss": 1.4685, + "step": 9155 + }, + { + "epoch": 0.5005808947145415, + "grad_norm": 1.5261250734329224, + "learning_rate": 1.0981535312635116e-05, + "loss": 1.2542, + "step": 9156 + }, + { + "epoch": 0.5006355671582631, + "grad_norm": 1.5102252960205078, + "learning_rate": 1.097971676275656e-05, + "loss": 1.5578, + "step": 9157 + }, + { + "epoch": 0.5006902396019846, + "grad_norm": 1.752973198890686, + "learning_rate": 1.0977898180162962e-05, + "loss": 1.4971, + "step": 9158 + }, + { + "epoch": 0.5007449120457061, + "grad_norm": 1.712746262550354, + "learning_rate": 1.0976079564915047e-05, + "loss": 1.5652, + "step": 9159 + }, + { + "epoch": 0.5007995844894277, + "grad_norm": 1.5259897708892822, + "learning_rate": 1.0974260917073548e-05, + "loss": 1.4164, + "step": 9160 + }, + { + "epoch": 0.5008542569331492, + "grad_norm": 1.3510578870773315, + "learning_rate": 1.0972442236699186e-05, + "loss": 1.7061, + "step": 9161 + }, + { + "epoch": 0.5009089293768708, + "grad_norm": 1.7799452543258667, + "learning_rate": 1.0970623523852699e-05, + "loss": 1.3515, + "step": 9162 + }, + { + "epoch": 0.5009636018205924, + "grad_norm": 1.3815300464630127, + "learning_rate": 1.0968804778594815e-05, + "loss": 1.4775, + "step": 9163 + }, + { + "epoch": 0.5010182742643139, + "grad_norm": 1.5353935956954956, + "learning_rate": 1.0966986000986267e-05, + "loss": 1.3998, + "step": 9164 + }, + { + "epoch": 0.5010729467080355, + "grad_norm": 1.3972867727279663, + "learning_rate": 1.0965167191087786e-05, + "loss": 1.4665, + "step": 9165 + }, + { + "epoch": 0.5011276191517571, + "grad_norm": 1.278045892715454, + "learning_rate": 1.0963348348960109e-05, + "loss": 1.4402, + "step": 9166 + }, + { + "epoch": 0.5011822915954786, + "grad_norm": 1.7008315324783325, + "learning_rate": 1.0961529474663966e-05, + "loss": 1.6014, + "step": 9167 + }, + { + "epoch": 0.5012369640392002, + "grad_norm": 1.541485071182251, + "learning_rate": 1.0959710568260098e-05, + "loss": 1.3404, + "step": 9168 + }, + { + "epoch": 0.5012916364829217, + "grad_norm": 3.1491332054138184, + "learning_rate": 1.0957891629809248e-05, + "loss": 1.4062, + "step": 9169 + }, + { + "epoch": 0.5013463089266432, + "grad_norm": 1.6276860237121582, + "learning_rate": 1.0956072659372144e-05, + "loss": 1.4424, + "step": 9170 + }, + { + "epoch": 0.5014009813703648, + "grad_norm": 1.5884039402008057, + "learning_rate": 1.0954253657009531e-05, + "loss": 1.72, + "step": 9171 + }, + { + "epoch": 0.5014556538140864, + "grad_norm": 1.816009521484375, + "learning_rate": 1.095243462278215e-05, + "loss": 1.4289, + "step": 9172 + }, + { + "epoch": 0.5015103262578079, + "grad_norm": 1.8660259246826172, + "learning_rate": 1.095061555675074e-05, + "loss": 1.4842, + "step": 9173 + }, + { + "epoch": 0.5015649987015295, + "grad_norm": 2.108156204223633, + "learning_rate": 1.0948796458976048e-05, + "loss": 1.4363, + "step": 9174 + }, + { + "epoch": 0.501619671145251, + "grad_norm": 1.510603666305542, + "learning_rate": 1.0946977329518813e-05, + "loss": 1.4317, + "step": 9175 + }, + { + "epoch": 0.5016743435889726, + "grad_norm": 1.3761035203933716, + "learning_rate": 1.0945158168439786e-05, + "loss": 1.324, + "step": 9176 + }, + { + "epoch": 0.5017290160326942, + "grad_norm": 1.3786381483078003, + "learning_rate": 1.094333897579971e-05, + "loss": 1.4237, + "step": 9177 + }, + { + "epoch": 0.5017836884764156, + "grad_norm": 1.372814655303955, + "learning_rate": 1.094151975165933e-05, + "loss": 1.4011, + "step": 9178 + }, + { + "epoch": 0.5018383609201372, + "grad_norm": 1.5153676271438599, + "learning_rate": 1.0939700496079399e-05, + "loss": 1.5389, + "step": 9179 + }, + { + "epoch": 0.5018930333638588, + "grad_norm": 1.4911731481552124, + "learning_rate": 1.093788120912066e-05, + "loss": 1.2917, + "step": 9180 + }, + { + "epoch": 0.5019477058075803, + "grad_norm": 1.142669439315796, + "learning_rate": 1.0936061890843868e-05, + "loss": 1.4765, + "step": 9181 + }, + { + "epoch": 0.5020023782513019, + "grad_norm": 1.6140811443328857, + "learning_rate": 1.0934242541309772e-05, + "loss": 1.5117, + "step": 9182 + }, + { + "epoch": 0.5020570506950235, + "grad_norm": 1.3763329982757568, + "learning_rate": 1.0932423160579126e-05, + "loss": 1.2528, + "step": 9183 + }, + { + "epoch": 0.502111723138745, + "grad_norm": 1.7512495517730713, + "learning_rate": 1.0930603748712685e-05, + "loss": 1.3083, + "step": 9184 + }, + { + "epoch": 0.5021663955824666, + "grad_norm": 1.5251514911651611, + "learning_rate": 1.0928784305771199e-05, + "loss": 1.4801, + "step": 9185 + }, + { + "epoch": 0.5022210680261882, + "grad_norm": 1.4670257568359375, + "learning_rate": 1.0926964831815425e-05, + "loss": 1.5739, + "step": 9186 + }, + { + "epoch": 0.5022757404699096, + "grad_norm": 1.7784335613250732, + "learning_rate": 1.0925145326906121e-05, + "loss": 1.298, + "step": 9187 + }, + { + "epoch": 0.5023304129136312, + "grad_norm": 1.2992846965789795, + "learning_rate": 1.0923325791104042e-05, + "loss": 1.5651, + "step": 9188 + }, + { + "epoch": 0.5023850853573527, + "grad_norm": 1.4415441751480103, + "learning_rate": 1.0921506224469952e-05, + "loss": 1.4742, + "step": 9189 + }, + { + "epoch": 0.5024397578010743, + "grad_norm": 1.584374189376831, + "learning_rate": 1.0919686627064603e-05, + "loss": 1.3301, + "step": 9190 + }, + { + "epoch": 0.5024944302447959, + "grad_norm": 1.8366363048553467, + "learning_rate": 1.091786699894876e-05, + "loss": 1.5676, + "step": 9191 + }, + { + "epoch": 0.5025491026885174, + "grad_norm": 1.5206568241119385, + "learning_rate": 1.0916047340183184e-05, + "loss": 1.2812, + "step": 9192 + }, + { + "epoch": 0.502603775132239, + "grad_norm": 1.7546380758285522, + "learning_rate": 1.091422765082864e-05, + "loss": 1.4581, + "step": 9193 + }, + { + "epoch": 0.5026584475759606, + "grad_norm": 1.3465174436569214, + "learning_rate": 1.0912407930945887e-05, + "loss": 1.3923, + "step": 9194 + }, + { + "epoch": 0.502713120019682, + "grad_norm": 1.7990145683288574, + "learning_rate": 1.0910588180595692e-05, + "loss": 1.6218, + "step": 9195 + }, + { + "epoch": 0.5027677924634036, + "grad_norm": 1.5059623718261719, + "learning_rate": 1.0908768399838823e-05, + "loss": 1.3335, + "step": 9196 + }, + { + "epoch": 0.5028224649071252, + "grad_norm": 1.350106120109558, + "learning_rate": 1.0906948588736044e-05, + "loss": 1.5531, + "step": 9197 + }, + { + "epoch": 0.5028771373508467, + "grad_norm": 1.3566642999649048, + "learning_rate": 1.0905128747348121e-05, + "loss": 1.9385, + "step": 9198 + }, + { + "epoch": 0.5029318097945683, + "grad_norm": 1.6228435039520264, + "learning_rate": 1.0903308875735827e-05, + "loss": 1.1135, + "step": 9199 + }, + { + "epoch": 0.5029864822382899, + "grad_norm": 1.4706568717956543, + "learning_rate": 1.0901488973959933e-05, + "loss": 1.5584, + "step": 9200 + }, + { + "epoch": 0.5030411546820114, + "grad_norm": 1.5179049968719482, + "learning_rate": 1.0899669042081202e-05, + "loss": 1.5903, + "step": 9201 + }, + { + "epoch": 0.503095827125733, + "grad_norm": 2.105414867401123, + "learning_rate": 1.0897849080160411e-05, + "loss": 1.6963, + "step": 9202 + }, + { + "epoch": 0.5031504995694545, + "grad_norm": 1.5021567344665527, + "learning_rate": 1.0896029088258336e-05, + "loss": 1.8419, + "step": 9203 + }, + { + "epoch": 0.503205172013176, + "grad_norm": 1.659567952156067, + "learning_rate": 1.0894209066435746e-05, + "loss": 1.5926, + "step": 9204 + }, + { + "epoch": 0.5032598444568976, + "grad_norm": 1.2469764947891235, + "learning_rate": 1.0892389014753412e-05, + "loss": 1.3927, + "step": 9205 + }, + { + "epoch": 0.5033145169006191, + "grad_norm": 1.5501298904418945, + "learning_rate": 1.0890568933272119e-05, + "loss": 1.264, + "step": 9206 + }, + { + "epoch": 0.5033691893443407, + "grad_norm": 1.4172344207763672, + "learning_rate": 1.0888748822052642e-05, + "loss": 1.1933, + "step": 9207 + }, + { + "epoch": 0.5034238617880623, + "grad_norm": 1.696315884590149, + "learning_rate": 1.088692868115575e-05, + "loss": 1.5612, + "step": 9208 + }, + { + "epoch": 0.5034785342317838, + "grad_norm": 1.5954720973968506, + "learning_rate": 1.0885108510642232e-05, + "loss": 1.6284, + "step": 9209 + }, + { + "epoch": 0.5035332066755054, + "grad_norm": 1.5088168382644653, + "learning_rate": 1.0883288310572863e-05, + "loss": 1.2866, + "step": 9210 + }, + { + "epoch": 0.503587879119227, + "grad_norm": 1.6653579473495483, + "learning_rate": 1.0881468081008428e-05, + "loss": 1.2981, + "step": 9211 + }, + { + "epoch": 0.5036425515629485, + "grad_norm": 1.4560680389404297, + "learning_rate": 1.08796478220097e-05, + "loss": 1.2777, + "step": 9212 + }, + { + "epoch": 0.50369722400667, + "grad_norm": 1.9540679454803467, + "learning_rate": 1.0877827533637469e-05, + "loss": 1.4255, + "step": 9213 + }, + { + "epoch": 0.5037518964503916, + "grad_norm": 1.614829659461975, + "learning_rate": 1.0876007215952518e-05, + "loss": 1.37, + "step": 9214 + }, + { + "epoch": 0.5038065688941131, + "grad_norm": 1.5617393255233765, + "learning_rate": 1.0874186869015627e-05, + "loss": 1.6135, + "step": 9215 + }, + { + "epoch": 0.5038612413378347, + "grad_norm": 1.5166946649551392, + "learning_rate": 1.0872366492887591e-05, + "loss": 1.7307, + "step": 9216 + }, + { + "epoch": 0.5039159137815562, + "grad_norm": 1.2994204759597778, + "learning_rate": 1.0870546087629185e-05, + "loss": 1.3012, + "step": 9217 + }, + { + "epoch": 0.5039705862252778, + "grad_norm": 1.6283694505691528, + "learning_rate": 1.0868725653301206e-05, + "loss": 1.3122, + "step": 9218 + }, + { + "epoch": 0.5040252586689994, + "grad_norm": 1.7021564245224, + "learning_rate": 1.0866905189964438e-05, + "loss": 1.4768, + "step": 9219 + }, + { + "epoch": 0.5040799311127209, + "grad_norm": 1.982202410697937, + "learning_rate": 1.0865084697679671e-05, + "loss": 1.5364, + "step": 9220 + }, + { + "epoch": 0.5041346035564425, + "grad_norm": 1.782796859741211, + "learning_rate": 1.0863264176507695e-05, + "loss": 1.416, + "step": 9221 + }, + { + "epoch": 0.504189276000164, + "grad_norm": 1.5765010118484497, + "learning_rate": 1.0861443626509305e-05, + "loss": 1.3003, + "step": 9222 + }, + { + "epoch": 0.5042439484438855, + "grad_norm": 1.696351170539856, + "learning_rate": 1.0859623047745289e-05, + "loss": 1.3044, + "step": 9223 + }, + { + "epoch": 0.5042986208876071, + "grad_norm": 1.5950416326522827, + "learning_rate": 1.0857802440276442e-05, + "loss": 1.3656, + "step": 9224 + }, + { + "epoch": 0.5043532933313287, + "grad_norm": 1.4912539720535278, + "learning_rate": 1.085598180416356e-05, + "loss": 1.5904, + "step": 9225 + }, + { + "epoch": 0.5044079657750502, + "grad_norm": 1.6890002489089966, + "learning_rate": 1.0854161139467436e-05, + "loss": 1.3533, + "step": 9226 + }, + { + "epoch": 0.5044626382187718, + "grad_norm": 1.461419701576233, + "learning_rate": 1.0852340446248867e-05, + "loss": 1.5418, + "step": 9227 + }, + { + "epoch": 0.5045173106624934, + "grad_norm": 1.958309292793274, + "learning_rate": 1.0850519724568652e-05, + "loss": 1.4934, + "step": 9228 + }, + { + "epoch": 0.5045719831062149, + "grad_norm": 1.4591922760009766, + "learning_rate": 1.0848698974487585e-05, + "loss": 1.3431, + "step": 9229 + }, + { + "epoch": 0.5046266555499365, + "grad_norm": 1.521844506263733, + "learning_rate": 1.0846878196066468e-05, + "loss": 1.6203, + "step": 9230 + }, + { + "epoch": 0.504681327993658, + "grad_norm": 1.4106777906417847, + "learning_rate": 1.0845057389366102e-05, + "loss": 1.6506, + "step": 9231 + }, + { + "epoch": 0.5047360004373795, + "grad_norm": 1.7956745624542236, + "learning_rate": 1.0843236554447288e-05, + "loss": 1.6423, + "step": 9232 + }, + { + "epoch": 0.5047906728811011, + "grad_norm": 1.5890552997589111, + "learning_rate": 1.0841415691370825e-05, + "loss": 1.3674, + "step": 9233 + }, + { + "epoch": 0.5048453453248226, + "grad_norm": 1.5900684595108032, + "learning_rate": 1.0839594800197516e-05, + "loss": 1.258, + "step": 9234 + }, + { + "epoch": 0.5049000177685442, + "grad_norm": 1.4431349039077759, + "learning_rate": 1.083777388098817e-05, + "loss": 1.5783, + "step": 9235 + }, + { + "epoch": 0.5049546902122658, + "grad_norm": 1.4505817890167236, + "learning_rate": 1.0835952933803588e-05, + "loss": 1.2508, + "step": 9236 + }, + { + "epoch": 0.5050093626559873, + "grad_norm": 1.6787301301956177, + "learning_rate": 1.0834131958704575e-05, + "loss": 1.4432, + "step": 9237 + }, + { + "epoch": 0.5050640350997089, + "grad_norm": 1.4388656616210938, + "learning_rate": 1.0832310955751937e-05, + "loss": 1.4779, + "step": 9238 + }, + { + "epoch": 0.5051187075434305, + "grad_norm": 1.750784158706665, + "learning_rate": 1.0830489925006485e-05, + "loss": 1.4553, + "step": 9239 + }, + { + "epoch": 0.505173379987152, + "grad_norm": 1.9019891023635864, + "learning_rate": 1.0828668866529024e-05, + "loss": 1.4563, + "step": 9240 + }, + { + "epoch": 0.5052280524308735, + "grad_norm": 1.7420440912246704, + "learning_rate": 1.0826847780380365e-05, + "loss": 1.3649, + "step": 9241 + }, + { + "epoch": 0.5052827248745951, + "grad_norm": 1.2174547910690308, + "learning_rate": 1.0825026666621321e-05, + "loss": 1.5446, + "step": 9242 + }, + { + "epoch": 0.5053373973183166, + "grad_norm": 1.4679017066955566, + "learning_rate": 1.0823205525312699e-05, + "loss": 1.6498, + "step": 9243 + }, + { + "epoch": 0.5053920697620382, + "grad_norm": 1.550283432006836, + "learning_rate": 1.082138435651531e-05, + "loss": 1.4266, + "step": 9244 + }, + { + "epoch": 0.5054467422057597, + "grad_norm": 1.6354483366012573, + "learning_rate": 1.0819563160289975e-05, + "loss": 1.4655, + "step": 9245 + }, + { + "epoch": 0.5055014146494813, + "grad_norm": 2.1105849742889404, + "learning_rate": 1.0817741936697499e-05, + "loss": 1.4553, + "step": 9246 + }, + { + "epoch": 0.5055560870932029, + "grad_norm": 2.073723793029785, + "learning_rate": 1.0815920685798702e-05, + "loss": 1.3097, + "step": 9247 + }, + { + "epoch": 0.5056107595369244, + "grad_norm": 1.4843593835830688, + "learning_rate": 1.0814099407654399e-05, + "loss": 1.477, + "step": 9248 + }, + { + "epoch": 0.505665431980646, + "grad_norm": 1.3690578937530518, + "learning_rate": 1.0812278102325408e-05, + "loss": 1.4431, + "step": 9249 + }, + { + "epoch": 0.5057201044243675, + "grad_norm": 1.8132225275039673, + "learning_rate": 1.0810456769872544e-05, + "loss": 1.4096, + "step": 9250 + }, + { + "epoch": 0.505774776868089, + "grad_norm": 1.460686206817627, + "learning_rate": 1.0808635410356625e-05, + "loss": 1.3598, + "step": 9251 + }, + { + "epoch": 0.5058294493118106, + "grad_norm": 1.4334146976470947, + "learning_rate": 1.0806814023838473e-05, + "loss": 1.5086, + "step": 9252 + }, + { + "epoch": 0.5058841217555322, + "grad_norm": 1.1202763319015503, + "learning_rate": 1.0804992610378907e-05, + "loss": 1.2014, + "step": 9253 + }, + { + "epoch": 0.5059387941992537, + "grad_norm": 1.5398565530776978, + "learning_rate": 1.0803171170038748e-05, + "loss": 1.5775, + "step": 9254 + }, + { + "epoch": 0.5059934666429753, + "grad_norm": 1.3210844993591309, + "learning_rate": 1.0801349702878822e-05, + "loss": 1.2643, + "step": 9255 + }, + { + "epoch": 0.5060481390866969, + "grad_norm": 1.229733943939209, + "learning_rate": 1.0799528208959949e-05, + "loss": 1.653, + "step": 9256 + }, + { + "epoch": 0.5061028115304184, + "grad_norm": 1.2771296501159668, + "learning_rate": 1.079770668834295e-05, + "loss": 1.4005, + "step": 9257 + }, + { + "epoch": 0.50615748397414, + "grad_norm": 1.646770715713501, + "learning_rate": 1.0795885141088653e-05, + "loss": 1.4902, + "step": 9258 + }, + { + "epoch": 0.5062121564178614, + "grad_norm": 1.4556580781936646, + "learning_rate": 1.0794063567257886e-05, + "loss": 1.329, + "step": 9259 + }, + { + "epoch": 0.506266828861583, + "grad_norm": 1.5197876691818237, + "learning_rate": 1.0792241966911472e-05, + "loss": 1.4911, + "step": 9260 + }, + { + "epoch": 0.5063215013053046, + "grad_norm": 1.5873117446899414, + "learning_rate": 1.079042034011024e-05, + "loss": 1.4985, + "step": 9261 + }, + { + "epoch": 0.5063761737490261, + "grad_norm": 1.5465741157531738, + "learning_rate": 1.0788598686915018e-05, + "loss": 1.3916, + "step": 9262 + }, + { + "epoch": 0.5064308461927477, + "grad_norm": 3.4945476055145264, + "learning_rate": 1.0786777007386636e-05, + "loss": 1.2786, + "step": 9263 + }, + { + "epoch": 0.5064855186364693, + "grad_norm": 1.558374047279358, + "learning_rate": 1.078495530158592e-05, + "loss": 1.5846, + "step": 9264 + }, + { + "epoch": 0.5065401910801908, + "grad_norm": 1.6386371850967407, + "learning_rate": 1.0783133569573708e-05, + "loss": 1.3352, + "step": 9265 + }, + { + "epoch": 0.5065948635239124, + "grad_norm": 1.8682159185409546, + "learning_rate": 1.0781311811410826e-05, + "loss": 1.3098, + "step": 9266 + }, + { + "epoch": 0.506649535967634, + "grad_norm": 1.9315462112426758, + "learning_rate": 1.077949002715811e-05, + "loss": 1.3522, + "step": 9267 + }, + { + "epoch": 0.5067042084113554, + "grad_norm": 1.2397783994674683, + "learning_rate": 1.0777668216876395e-05, + "loss": 1.4263, + "step": 9268 + }, + { + "epoch": 0.506758880855077, + "grad_norm": 1.342834234237671, + "learning_rate": 1.0775846380626512e-05, + "loss": 1.0928, + "step": 9269 + }, + { + "epoch": 0.5068135532987986, + "grad_norm": 1.6582832336425781, + "learning_rate": 1.0774024518469297e-05, + "loss": 1.5043, + "step": 9270 + }, + { + "epoch": 0.5068682257425201, + "grad_norm": 1.5953493118286133, + "learning_rate": 1.0772202630465586e-05, + "loss": 1.3532, + "step": 9271 + }, + { + "epoch": 0.5069228981862417, + "grad_norm": 1.6426315307617188, + "learning_rate": 1.0770380716676218e-05, + "loss": 1.4139, + "step": 9272 + }, + { + "epoch": 0.5069775706299632, + "grad_norm": 1.7440816164016724, + "learning_rate": 1.0768558777162035e-05, + "loss": 1.7808, + "step": 9273 + }, + { + "epoch": 0.5070322430736848, + "grad_norm": 1.5387557744979858, + "learning_rate": 1.0766736811983864e-05, + "loss": 1.4191, + "step": 9274 + }, + { + "epoch": 0.5070869155174064, + "grad_norm": 1.5923559665679932, + "learning_rate": 1.0764914821202556e-05, + "loss": 1.5257, + "step": 9275 + }, + { + "epoch": 0.5071415879611278, + "grad_norm": 1.683571696281433, + "learning_rate": 1.0763092804878945e-05, + "loss": 1.5723, + "step": 9276 + }, + { + "epoch": 0.5071962604048494, + "grad_norm": 1.9575990438461304, + "learning_rate": 1.0761270763073877e-05, + "loss": 1.3596, + "step": 9277 + }, + { + "epoch": 0.507250932848571, + "grad_norm": 1.238315224647522, + "learning_rate": 1.075944869584819e-05, + "loss": 1.4451, + "step": 9278 + }, + { + "epoch": 0.5073056052922925, + "grad_norm": 1.8144471645355225, + "learning_rate": 1.0757626603262728e-05, + "loss": 1.5875, + "step": 9279 + }, + { + "epoch": 0.5073602777360141, + "grad_norm": 1.3705469369888306, + "learning_rate": 1.0755804485378336e-05, + "loss": 1.3591, + "step": 9280 + }, + { + "epoch": 0.5074149501797357, + "grad_norm": 1.5606905221939087, + "learning_rate": 1.0753982342255863e-05, + "loss": 1.4136, + "step": 9281 + }, + { + "epoch": 0.5074696226234572, + "grad_norm": 1.4108794927597046, + "learning_rate": 1.0752160173956145e-05, + "loss": 1.5005, + "step": 9282 + }, + { + "epoch": 0.5075242950671788, + "grad_norm": 1.3833115100860596, + "learning_rate": 1.0750337980540035e-05, + "loss": 1.4474, + "step": 9283 + }, + { + "epoch": 0.5075789675109004, + "grad_norm": 1.605517864227295, + "learning_rate": 1.0748515762068382e-05, + "loss": 1.4789, + "step": 9284 + }, + { + "epoch": 0.5076336399546219, + "grad_norm": 1.342501163482666, + "learning_rate": 1.0746693518602026e-05, + "loss": 1.4333, + "step": 9285 + }, + { + "epoch": 0.5076883123983434, + "grad_norm": 1.4025932550430298, + "learning_rate": 1.0744871250201825e-05, + "loss": 1.3833, + "step": 9286 + }, + { + "epoch": 0.5077429848420649, + "grad_norm": 1.2430046796798706, + "learning_rate": 1.0743048956928626e-05, + "loss": 1.6784, + "step": 9287 + }, + { + "epoch": 0.5077976572857865, + "grad_norm": 1.6819748878479004, + "learning_rate": 1.0741226638843276e-05, + "loss": 1.5319, + "step": 9288 + }, + { + "epoch": 0.5078523297295081, + "grad_norm": 1.5252074003219604, + "learning_rate": 1.0739404296006631e-05, + "loss": 1.6632, + "step": 9289 + }, + { + "epoch": 0.5079070021732296, + "grad_norm": 2.0276150703430176, + "learning_rate": 1.073758192847954e-05, + "loss": 1.3187, + "step": 9290 + }, + { + "epoch": 0.5079616746169512, + "grad_norm": 1.60667085647583, + "learning_rate": 1.0735759536322859e-05, + "loss": 1.4929, + "step": 9291 + }, + { + "epoch": 0.5080163470606728, + "grad_norm": 1.2206000089645386, + "learning_rate": 1.0733937119597439e-05, + "loss": 1.4021, + "step": 9292 + }, + { + "epoch": 0.5080710195043943, + "grad_norm": 1.391721487045288, + "learning_rate": 1.0732114678364135e-05, + "loss": 1.6491, + "step": 9293 + }, + { + "epoch": 0.5081256919481159, + "grad_norm": 1.7207105159759521, + "learning_rate": 1.0730292212683808e-05, + "loss": 1.4397, + "step": 9294 + }, + { + "epoch": 0.5081803643918374, + "grad_norm": 2.30070161819458, + "learning_rate": 1.072846972261731e-05, + "loss": 1.4144, + "step": 9295 + }, + { + "epoch": 0.5082350368355589, + "grad_norm": 1.8873172998428345, + "learning_rate": 1.0726647208225498e-05, + "loss": 1.4255, + "step": 9296 + }, + { + "epoch": 0.5082897092792805, + "grad_norm": 1.8566172122955322, + "learning_rate": 1.0724824669569226e-05, + "loss": 1.6892, + "step": 9297 + }, + { + "epoch": 0.5083443817230021, + "grad_norm": 1.369321346282959, + "learning_rate": 1.0723002106709364e-05, + "loss": 1.5843, + "step": 9298 + }, + { + "epoch": 0.5083990541667236, + "grad_norm": 1.620497703552246, + "learning_rate": 1.072117951970676e-05, + "loss": 1.5693, + "step": 9299 + }, + { + "epoch": 0.5084537266104452, + "grad_norm": 1.889739990234375, + "learning_rate": 1.0719356908622286e-05, + "loss": 1.4842, + "step": 9300 + }, + { + "epoch": 0.5085083990541667, + "grad_norm": 1.372536540031433, + "learning_rate": 1.0717534273516791e-05, + "loss": 1.4467, + "step": 9301 + }, + { + "epoch": 0.5085630714978883, + "grad_norm": 1.5946305990219116, + "learning_rate": 1.0715711614451146e-05, + "loss": 1.5959, + "step": 9302 + }, + { + "epoch": 0.5086177439416099, + "grad_norm": 1.527079463005066, + "learning_rate": 1.0713888931486208e-05, + "loss": 1.4813, + "step": 9303 + }, + { + "epoch": 0.5086724163853313, + "grad_norm": 1.1641160249710083, + "learning_rate": 1.0712066224682842e-05, + "loss": 1.5031, + "step": 9304 + }, + { + "epoch": 0.5087270888290529, + "grad_norm": 1.4994820356369019, + "learning_rate": 1.0710243494101917e-05, + "loss": 1.5026, + "step": 9305 + }, + { + "epoch": 0.5087817612727745, + "grad_norm": 1.5929375886917114, + "learning_rate": 1.0708420739804296e-05, + "loss": 1.2996, + "step": 9306 + }, + { + "epoch": 0.508836433716496, + "grad_norm": 1.5888992547988892, + "learning_rate": 1.0706597961850842e-05, + "loss": 1.3669, + "step": 9307 + }, + { + "epoch": 0.5088911061602176, + "grad_norm": 1.3530515432357788, + "learning_rate": 1.0704775160302425e-05, + "loss": 1.245, + "step": 9308 + }, + { + "epoch": 0.5089457786039392, + "grad_norm": 1.9583392143249512, + "learning_rate": 1.0702952335219912e-05, + "loss": 1.4229, + "step": 9309 + }, + { + "epoch": 0.5090004510476607, + "grad_norm": 1.4292815923690796, + "learning_rate": 1.0701129486664168e-05, + "loss": 1.4944, + "step": 9310 + }, + { + "epoch": 0.5090551234913823, + "grad_norm": 1.1059497594833374, + "learning_rate": 1.0699306614696068e-05, + "loss": 1.5718, + "step": 9311 + }, + { + "epoch": 0.5091097959351039, + "grad_norm": 1.2363685369491577, + "learning_rate": 1.0697483719376479e-05, + "loss": 1.5596, + "step": 9312 + }, + { + "epoch": 0.5091644683788253, + "grad_norm": 1.8388991355895996, + "learning_rate": 1.069566080076627e-05, + "loss": 1.3228, + "step": 9313 + }, + { + "epoch": 0.5092191408225469, + "grad_norm": 1.6278904676437378, + "learning_rate": 1.0693837858926317e-05, + "loss": 1.5338, + "step": 9314 + }, + { + "epoch": 0.5092738132662684, + "grad_norm": 1.2732250690460205, + "learning_rate": 1.069201489391749e-05, + "loss": 1.5917, + "step": 9315 + }, + { + "epoch": 0.50932848570999, + "grad_norm": 1.6478703022003174, + "learning_rate": 1.0690191905800659e-05, + "loss": 1.3935, + "step": 9316 + }, + { + "epoch": 0.5093831581537116, + "grad_norm": 1.5070806741714478, + "learning_rate": 1.0688368894636702e-05, + "loss": 1.5433, + "step": 9317 + }, + { + "epoch": 0.5094378305974331, + "grad_norm": 1.7521933317184448, + "learning_rate": 1.068654586048649e-05, + "loss": 1.4391, + "step": 9318 + }, + { + "epoch": 0.5094925030411547, + "grad_norm": 1.6729693412780762, + "learning_rate": 1.0684722803410904e-05, + "loss": 1.5067, + "step": 9319 + }, + { + "epoch": 0.5095471754848763, + "grad_norm": 1.4935166835784912, + "learning_rate": 1.0682899723470818e-05, + "loss": 1.3268, + "step": 9320 + }, + { + "epoch": 0.5096018479285978, + "grad_norm": 1.4114007949829102, + "learning_rate": 1.0681076620727104e-05, + "loss": 1.3559, + "step": 9321 + }, + { + "epoch": 0.5096565203723193, + "grad_norm": 1.6342016458511353, + "learning_rate": 1.0679253495240646e-05, + "loss": 1.1913, + "step": 9322 + }, + { + "epoch": 0.5097111928160409, + "grad_norm": 1.6529066562652588, + "learning_rate": 1.067743034707232e-05, + "loss": 1.3087, + "step": 9323 + }, + { + "epoch": 0.5097658652597624, + "grad_norm": 1.6025686264038086, + "learning_rate": 1.0675607176283002e-05, + "loss": 1.5699, + "step": 9324 + }, + { + "epoch": 0.509820537703484, + "grad_norm": 1.1676421165466309, + "learning_rate": 1.0673783982933576e-05, + "loss": 1.5566, + "step": 9325 + }, + { + "epoch": 0.5098752101472056, + "grad_norm": 1.4492204189300537, + "learning_rate": 1.0671960767084925e-05, + "loss": 1.369, + "step": 9326 + }, + { + "epoch": 0.5099298825909271, + "grad_norm": 1.413644552230835, + "learning_rate": 1.0670137528797927e-05, + "loss": 1.4022, + "step": 9327 + }, + { + "epoch": 0.5099845550346487, + "grad_norm": 1.1921741962432861, + "learning_rate": 1.0668314268133462e-05, + "loss": 1.3896, + "step": 9328 + }, + { + "epoch": 0.5100392274783702, + "grad_norm": 1.5494678020477295, + "learning_rate": 1.0666490985152416e-05, + "loss": 1.4985, + "step": 9329 + }, + { + "epoch": 0.5100938999220918, + "grad_norm": 1.9546122550964355, + "learning_rate": 1.066466767991567e-05, + "loss": 1.5729, + "step": 9330 + }, + { + "epoch": 0.5101485723658133, + "grad_norm": 1.6003806591033936, + "learning_rate": 1.0662844352484112e-05, + "loss": 1.6996, + "step": 9331 + }, + { + "epoch": 0.5102032448095348, + "grad_norm": 1.456516981124878, + "learning_rate": 1.0661021002918626e-05, + "loss": 1.4315, + "step": 9332 + }, + { + "epoch": 0.5102579172532564, + "grad_norm": 1.667342185974121, + "learning_rate": 1.0659197631280098e-05, + "loss": 1.4378, + "step": 9333 + }, + { + "epoch": 0.510312589696978, + "grad_norm": 1.4981180429458618, + "learning_rate": 1.0657374237629414e-05, + "loss": 1.4095, + "step": 9334 + }, + { + "epoch": 0.5103672621406995, + "grad_norm": 1.7602192163467407, + "learning_rate": 1.065555082202746e-05, + "loss": 1.7702, + "step": 9335 + }, + { + "epoch": 0.5104219345844211, + "grad_norm": 1.7119896411895752, + "learning_rate": 1.0653727384535131e-05, + "loss": 1.5101, + "step": 9336 + }, + { + "epoch": 0.5104766070281427, + "grad_norm": 1.667327880859375, + "learning_rate": 1.0651903925213304e-05, + "loss": 1.5425, + "step": 9337 + }, + { + "epoch": 0.5105312794718642, + "grad_norm": 1.3137320280075073, + "learning_rate": 1.0650080444122876e-05, + "loss": 1.433, + "step": 9338 + }, + { + "epoch": 0.5105859519155858, + "grad_norm": 1.3761698007583618, + "learning_rate": 1.0648256941324742e-05, + "loss": 1.4386, + "step": 9339 + }, + { + "epoch": 0.5106406243593073, + "grad_norm": 1.4943777322769165, + "learning_rate": 1.0646433416879785e-05, + "loss": 1.6853, + "step": 9340 + }, + { + "epoch": 0.5106952968030288, + "grad_norm": 1.3114054203033447, + "learning_rate": 1.0644609870848896e-05, + "loss": 1.3608, + "step": 9341 + }, + { + "epoch": 0.5107499692467504, + "grad_norm": 1.4403742551803589, + "learning_rate": 1.0642786303292973e-05, + "loss": 1.6557, + "step": 9342 + }, + { + "epoch": 0.5108046416904719, + "grad_norm": 1.7469782829284668, + "learning_rate": 1.064096271427291e-05, + "loss": 1.7745, + "step": 9343 + }, + { + "epoch": 0.5108593141341935, + "grad_norm": 1.741776704788208, + "learning_rate": 1.0639139103849591e-05, + "loss": 1.4234, + "step": 9344 + }, + { + "epoch": 0.5109139865779151, + "grad_norm": 1.5224868059158325, + "learning_rate": 1.0637315472083921e-05, + "loss": 1.3834, + "step": 9345 + }, + { + "epoch": 0.5109686590216366, + "grad_norm": 1.6106971502304077, + "learning_rate": 1.0635491819036794e-05, + "loss": 1.4107, + "step": 9346 + }, + { + "epoch": 0.5110233314653582, + "grad_norm": 1.5903189182281494, + "learning_rate": 1.0633668144769103e-05, + "loss": 1.4642, + "step": 9347 + }, + { + "epoch": 0.5110780039090798, + "grad_norm": 1.577085256576538, + "learning_rate": 1.0631844449341743e-05, + "loss": 1.1, + "step": 9348 + }, + { + "epoch": 0.5111326763528012, + "grad_norm": 1.4978688955307007, + "learning_rate": 1.0630020732815617e-05, + "loss": 1.5031, + "step": 9349 + }, + { + "epoch": 0.5111873487965228, + "grad_norm": 2.0728273391723633, + "learning_rate": 1.062819699525162e-05, + "loss": 1.7052, + "step": 9350 + }, + { + "epoch": 0.5112420212402444, + "grad_norm": 1.336037039756775, + "learning_rate": 1.062637323671065e-05, + "loss": 1.4353, + "step": 9351 + }, + { + "epoch": 0.5112966936839659, + "grad_norm": 1.9921146631240845, + "learning_rate": 1.062454945725361e-05, + "loss": 1.4074, + "step": 9352 + }, + { + "epoch": 0.5113513661276875, + "grad_norm": 1.6166173219680786, + "learning_rate": 1.0622725656941396e-05, + "loss": 1.5844, + "step": 9353 + }, + { + "epoch": 0.5114060385714091, + "grad_norm": 1.3771651983261108, + "learning_rate": 1.0620901835834914e-05, + "loss": 1.555, + "step": 9354 + }, + { + "epoch": 0.5114607110151306, + "grad_norm": 1.447946310043335, + "learning_rate": 1.0619077993995058e-05, + "loss": 1.2597, + "step": 9355 + }, + { + "epoch": 0.5115153834588522, + "grad_norm": 1.4731159210205078, + "learning_rate": 1.0617254131482737e-05, + "loss": 1.4519, + "step": 9356 + }, + { + "epoch": 0.5115700559025737, + "grad_norm": 1.5365208387374878, + "learning_rate": 1.0615430248358853e-05, + "loss": 1.2004, + "step": 9357 + }, + { + "epoch": 0.5116247283462952, + "grad_norm": 1.5491886138916016, + "learning_rate": 1.0613606344684309e-05, + "loss": 1.2931, + "step": 9358 + }, + { + "epoch": 0.5116794007900168, + "grad_norm": 1.4620373249053955, + "learning_rate": 1.061178242052001e-05, + "loss": 1.542, + "step": 9359 + }, + { + "epoch": 0.5117340732337383, + "grad_norm": 1.6268033981323242, + "learning_rate": 1.0609958475926856e-05, + "loss": 1.5359, + "step": 9360 + }, + { + "epoch": 0.5117887456774599, + "grad_norm": 1.2029671669006348, + "learning_rate": 1.0608134510965762e-05, + "loss": 1.6891, + "step": 9361 + }, + { + "epoch": 0.5118434181211815, + "grad_norm": 1.8990589380264282, + "learning_rate": 1.0606310525697627e-05, + "loss": 1.4755, + "step": 9362 + }, + { + "epoch": 0.511898090564903, + "grad_norm": 1.4619781970977783, + "learning_rate": 1.0604486520183362e-05, + "loss": 1.5642, + "step": 9363 + }, + { + "epoch": 0.5119527630086246, + "grad_norm": 1.786696434020996, + "learning_rate": 1.0602662494483872e-05, + "loss": 1.4749, + "step": 9364 + }, + { + "epoch": 0.5120074354523462, + "grad_norm": 1.8423632383346558, + "learning_rate": 1.060083844866007e-05, + "loss": 1.5698, + "step": 9365 + }, + { + "epoch": 0.5120621078960677, + "grad_norm": 1.6513081789016724, + "learning_rate": 1.059901438277286e-05, + "loss": 1.3354, + "step": 9366 + }, + { + "epoch": 0.5121167803397892, + "grad_norm": 1.5091482400894165, + "learning_rate": 1.0597190296883156e-05, + "loss": 1.3502, + "step": 9367 + }, + { + "epoch": 0.5121714527835108, + "grad_norm": 1.9373130798339844, + "learning_rate": 1.0595366191051866e-05, + "loss": 1.4194, + "step": 9368 + }, + { + "epoch": 0.5122261252272323, + "grad_norm": 1.4540321826934814, + "learning_rate": 1.0593542065339899e-05, + "loss": 1.4141, + "step": 9369 + }, + { + "epoch": 0.5122807976709539, + "grad_norm": 1.3404814004898071, + "learning_rate": 1.059171791980817e-05, + "loss": 1.5622, + "step": 9370 + }, + { + "epoch": 0.5123354701146754, + "grad_norm": 1.6811076402664185, + "learning_rate": 1.0589893754517597e-05, + "loss": 1.3309, + "step": 9371 + }, + { + "epoch": 0.512390142558397, + "grad_norm": 1.1697787046432495, + "learning_rate": 1.0588069569529085e-05, + "loss": 1.5044, + "step": 9372 + }, + { + "epoch": 0.5124448150021186, + "grad_norm": 1.7822208404541016, + "learning_rate": 1.0586245364903547e-05, + "loss": 1.454, + "step": 9373 + }, + { + "epoch": 0.5124994874458401, + "grad_norm": 2.010404348373413, + "learning_rate": 1.0584421140701902e-05, + "loss": 1.4748, + "step": 9374 + }, + { + "epoch": 0.5125541598895617, + "grad_norm": 1.882259488105774, + "learning_rate": 1.0582596896985065e-05, + "loss": 1.6027, + "step": 9375 + }, + { + "epoch": 0.5126088323332832, + "grad_norm": 1.2421263456344604, + "learning_rate": 1.0580772633813947e-05, + "loss": 1.2511, + "step": 9376 + }, + { + "epoch": 0.5126635047770047, + "grad_norm": 1.722833514213562, + "learning_rate": 1.057894835124947e-05, + "loss": 1.2967, + "step": 9377 + }, + { + "epoch": 0.5127181772207263, + "grad_norm": 1.6736561059951782, + "learning_rate": 1.0577124049352549e-05, + "loss": 1.5718, + "step": 9378 + }, + { + "epoch": 0.5127728496644479, + "grad_norm": 2.124129056930542, + "learning_rate": 1.0575299728184105e-05, + "loss": 1.4935, + "step": 9379 + }, + { + "epoch": 0.5128275221081694, + "grad_norm": 1.751848816871643, + "learning_rate": 1.0573475387805047e-05, + "loss": 1.3643, + "step": 9380 + }, + { + "epoch": 0.512882194551891, + "grad_norm": 1.5486632585525513, + "learning_rate": 1.0571651028276304e-05, + "loss": 1.3208, + "step": 9381 + }, + { + "epoch": 0.5129368669956126, + "grad_norm": 1.7378445863723755, + "learning_rate": 1.0569826649658791e-05, + "loss": 1.5899, + "step": 9382 + }, + { + "epoch": 0.5129915394393341, + "grad_norm": 2.0884106159210205, + "learning_rate": 1.056800225201343e-05, + "loss": 1.2473, + "step": 9383 + }, + { + "epoch": 0.5130462118830557, + "grad_norm": 1.5014115571975708, + "learning_rate": 1.0566177835401136e-05, + "loss": 1.4423, + "step": 9384 + }, + { + "epoch": 0.5131008843267773, + "grad_norm": 1.2770856618881226, + "learning_rate": 1.0564353399882843e-05, + "loss": 1.5652, + "step": 9385 + }, + { + "epoch": 0.5131555567704987, + "grad_norm": 1.32527756690979, + "learning_rate": 1.0562528945519463e-05, + "loss": 1.4793, + "step": 9386 + }, + { + "epoch": 0.5132102292142203, + "grad_norm": 1.6667119264602661, + "learning_rate": 1.0560704472371919e-05, + "loss": 1.4914, + "step": 9387 + }, + { + "epoch": 0.5132649016579418, + "grad_norm": 1.271610140800476, + "learning_rate": 1.0558879980501137e-05, + "loss": 1.5104, + "step": 9388 + }, + { + "epoch": 0.5133195741016634, + "grad_norm": 1.4345980882644653, + "learning_rate": 1.0557055469968045e-05, + "loss": 1.488, + "step": 9389 + }, + { + "epoch": 0.513374246545385, + "grad_norm": 1.6518100500106812, + "learning_rate": 1.0555230940833561e-05, + "loss": 1.5269, + "step": 9390 + }, + { + "epoch": 0.5134289189891065, + "grad_norm": 1.8548574447631836, + "learning_rate": 1.0553406393158614e-05, + "loss": 1.4047, + "step": 9391 + }, + { + "epoch": 0.5134835914328281, + "grad_norm": 1.4096871614456177, + "learning_rate": 1.0551581827004129e-05, + "loss": 1.4823, + "step": 9392 + }, + { + "epoch": 0.5135382638765497, + "grad_norm": 1.9663136005401611, + "learning_rate": 1.0549757242431032e-05, + "loss": 1.3496, + "step": 9393 + }, + { + "epoch": 0.5135929363202711, + "grad_norm": 1.5867482423782349, + "learning_rate": 1.0547932639500246e-05, + "loss": 1.3197, + "step": 9394 + }, + { + "epoch": 0.5136476087639927, + "grad_norm": 1.2931551933288574, + "learning_rate": 1.0546108018272707e-05, + "loss": 1.6268, + "step": 9395 + }, + { + "epoch": 0.5137022812077143, + "grad_norm": 1.67758047580719, + "learning_rate": 1.0544283378809343e-05, + "loss": 1.3634, + "step": 9396 + }, + { + "epoch": 0.5137569536514358, + "grad_norm": 1.6420689821243286, + "learning_rate": 1.0542458721171076e-05, + "loss": 1.5894, + "step": 9397 + }, + { + "epoch": 0.5138116260951574, + "grad_norm": 1.7578283548355103, + "learning_rate": 1.0540634045418843e-05, + "loss": 1.4217, + "step": 9398 + }, + { + "epoch": 0.513866298538879, + "grad_norm": 2.1036932468414307, + "learning_rate": 1.0538809351613567e-05, + "loss": 1.6228, + "step": 9399 + }, + { + "epoch": 0.5139209709826005, + "grad_norm": 1.1462204456329346, + "learning_rate": 1.0536984639816183e-05, + "loss": 1.4879, + "step": 9400 + }, + { + "epoch": 0.5139756434263221, + "grad_norm": 1.6525676250457764, + "learning_rate": 1.0535159910087623e-05, + "loss": 1.3821, + "step": 9401 + }, + { + "epoch": 0.5140303158700436, + "grad_norm": 1.538533329963684, + "learning_rate": 1.0533335162488816e-05, + "loss": 1.4297, + "step": 9402 + }, + { + "epoch": 0.5140849883137651, + "grad_norm": 1.6269137859344482, + "learning_rate": 1.0531510397080697e-05, + "loss": 1.4072, + "step": 9403 + }, + { + "epoch": 0.5141396607574867, + "grad_norm": 1.4014970064163208, + "learning_rate": 1.0529685613924199e-05, + "loss": 1.2893, + "step": 9404 + }, + { + "epoch": 0.5141943332012082, + "grad_norm": 1.9864768981933594, + "learning_rate": 1.0527860813080257e-05, + "loss": 1.4237, + "step": 9405 + }, + { + "epoch": 0.5142490056449298, + "grad_norm": 1.721970796585083, + "learning_rate": 1.0526035994609805e-05, + "loss": 1.435, + "step": 9406 + }, + { + "epoch": 0.5143036780886514, + "grad_norm": 1.2873598337173462, + "learning_rate": 1.0524211158573772e-05, + "loss": 1.4995, + "step": 9407 + }, + { + "epoch": 0.5143583505323729, + "grad_norm": 1.4932581186294556, + "learning_rate": 1.05223863050331e-05, + "loss": 1.5064, + "step": 9408 + }, + { + "epoch": 0.5144130229760945, + "grad_norm": 1.7482913732528687, + "learning_rate": 1.0520561434048724e-05, + "loss": 1.2665, + "step": 9409 + }, + { + "epoch": 0.5144676954198161, + "grad_norm": 1.79254949092865, + "learning_rate": 1.051873654568158e-05, + "loss": 1.476, + "step": 9410 + }, + { + "epoch": 0.5145223678635376, + "grad_norm": 2.0239973068237305, + "learning_rate": 1.0516911639992607e-05, + "loss": 1.2814, + "step": 9411 + }, + { + "epoch": 0.5145770403072591, + "grad_norm": 2.1276161670684814, + "learning_rate": 1.051508671704274e-05, + "loss": 1.4761, + "step": 9412 + }, + { + "epoch": 0.5146317127509807, + "grad_norm": 1.3996665477752686, + "learning_rate": 1.0513261776892918e-05, + "loss": 1.5907, + "step": 9413 + }, + { + "epoch": 0.5146863851947022, + "grad_norm": 1.8912689685821533, + "learning_rate": 1.0511436819604082e-05, + "loss": 1.4468, + "step": 9414 + }, + { + "epoch": 0.5147410576384238, + "grad_norm": 1.359326720237732, + "learning_rate": 1.0509611845237168e-05, + "loss": 1.8325, + "step": 9415 + }, + { + "epoch": 0.5147957300821453, + "grad_norm": 1.3522495031356812, + "learning_rate": 1.050778685385312e-05, + "loss": 1.4053, + "step": 9416 + }, + { + "epoch": 0.5148504025258669, + "grad_norm": 1.367983102798462, + "learning_rate": 1.050596184551288e-05, + "loss": 1.258, + "step": 9417 + }, + { + "epoch": 0.5149050749695885, + "grad_norm": 1.5202791690826416, + "learning_rate": 1.0504136820277386e-05, + "loss": 1.2914, + "step": 9418 + }, + { + "epoch": 0.51495974741331, + "grad_norm": 1.2903286218643188, + "learning_rate": 1.050231177820758e-05, + "loss": 1.6053, + "step": 9419 + }, + { + "epoch": 0.5150144198570316, + "grad_norm": 1.5602226257324219, + "learning_rate": 1.0500486719364405e-05, + "loss": 1.6041, + "step": 9420 + }, + { + "epoch": 0.5150690923007532, + "grad_norm": 1.3587782382965088, + "learning_rate": 1.0498661643808801e-05, + "loss": 1.4667, + "step": 9421 + }, + { + "epoch": 0.5151237647444746, + "grad_norm": 1.4150269031524658, + "learning_rate": 1.0496836551601717e-05, + "loss": 1.4954, + "step": 9422 + }, + { + "epoch": 0.5151784371881962, + "grad_norm": 1.5497169494628906, + "learning_rate": 1.0495011442804094e-05, + "loss": 1.3957, + "step": 9423 + }, + { + "epoch": 0.5152331096319178, + "grad_norm": 1.3092780113220215, + "learning_rate": 1.0493186317476878e-05, + "loss": 1.6042, + "step": 9424 + }, + { + "epoch": 0.5152877820756393, + "grad_norm": 1.5322531461715698, + "learning_rate": 1.0491361175681016e-05, + "loss": 1.4919, + "step": 9425 + }, + { + "epoch": 0.5153424545193609, + "grad_norm": 1.2410539388656616, + "learning_rate": 1.0489536017477448e-05, + "loss": 1.5906, + "step": 9426 + }, + { + "epoch": 0.5153971269630825, + "grad_norm": 1.8335739374160767, + "learning_rate": 1.0487710842927126e-05, + "loss": 1.5476, + "step": 9427 + }, + { + "epoch": 0.515451799406804, + "grad_norm": 1.504868984222412, + "learning_rate": 1.0485885652090992e-05, + "loss": 1.7079, + "step": 9428 + }, + { + "epoch": 0.5155064718505256, + "grad_norm": 1.3967205286026, + "learning_rate": 1.0484060445029995e-05, + "loss": 1.4122, + "step": 9429 + }, + { + "epoch": 0.515561144294247, + "grad_norm": 1.6984717845916748, + "learning_rate": 1.0482235221805088e-05, + "loss": 1.3473, + "step": 9430 + }, + { + "epoch": 0.5156158167379686, + "grad_norm": 1.4002372026443481, + "learning_rate": 1.0480409982477214e-05, + "loss": 1.4092, + "step": 9431 + }, + { + "epoch": 0.5156704891816902, + "grad_norm": 1.436145305633545, + "learning_rate": 1.0478584727107322e-05, + "loss": 1.2663, + "step": 9432 + }, + { + "epoch": 0.5157251616254117, + "grad_norm": 1.379577875137329, + "learning_rate": 1.0476759455756363e-05, + "loss": 1.2078, + "step": 9433 + }, + { + "epoch": 0.5157798340691333, + "grad_norm": 1.3732364177703857, + "learning_rate": 1.0474934168485289e-05, + "loss": 1.5061, + "step": 9434 + }, + { + "epoch": 0.5158345065128549, + "grad_norm": 1.8604520559310913, + "learning_rate": 1.0473108865355046e-05, + "loss": 1.5614, + "step": 9435 + }, + { + "epoch": 0.5158891789565764, + "grad_norm": 1.5080634355545044, + "learning_rate": 1.0471283546426586e-05, + "loss": 1.4393, + "step": 9436 + }, + { + "epoch": 0.515943851400298, + "grad_norm": 1.446916937828064, + "learning_rate": 1.0469458211760868e-05, + "loss": 1.5974, + "step": 9437 + }, + { + "epoch": 0.5159985238440196, + "grad_norm": 1.2564738988876343, + "learning_rate": 1.0467632861418837e-05, + "loss": 1.3725, + "step": 9438 + }, + { + "epoch": 0.516053196287741, + "grad_norm": 1.659566879272461, + "learning_rate": 1.0465807495461446e-05, + "loss": 1.3427, + "step": 9439 + }, + { + "epoch": 0.5161078687314626, + "grad_norm": 1.007693886756897, + "learning_rate": 1.046398211394965e-05, + "loss": 1.5626, + "step": 9440 + }, + { + "epoch": 0.5161625411751842, + "grad_norm": 1.6807849407196045, + "learning_rate": 1.0462156716944403e-05, + "loss": 1.4281, + "step": 9441 + }, + { + "epoch": 0.5162172136189057, + "grad_norm": 1.5520715713500977, + "learning_rate": 1.0460331304506658e-05, + "loss": 1.4078, + "step": 9442 + }, + { + "epoch": 0.5162718860626273, + "grad_norm": 1.2494021654129028, + "learning_rate": 1.045850587669737e-05, + "loss": 1.5246, + "step": 9443 + }, + { + "epoch": 0.5163265585063488, + "grad_norm": 1.3660398721694946, + "learning_rate": 1.0456680433577497e-05, + "loss": 1.2718, + "step": 9444 + }, + { + "epoch": 0.5163812309500704, + "grad_norm": 1.4657864570617676, + "learning_rate": 1.0454854975207993e-05, + "loss": 1.5717, + "step": 9445 + }, + { + "epoch": 0.516435903393792, + "grad_norm": 1.496975064277649, + "learning_rate": 1.0453029501649812e-05, + "loss": 1.3078, + "step": 9446 + }, + { + "epoch": 0.5164905758375135, + "grad_norm": 1.5092486143112183, + "learning_rate": 1.0451204012963912e-05, + "loss": 1.7152, + "step": 9447 + }, + { + "epoch": 0.516545248281235, + "grad_norm": 1.6136138439178467, + "learning_rate": 1.0449378509211254e-05, + "loss": 1.3967, + "step": 9448 + }, + { + "epoch": 0.5165999207249566, + "grad_norm": 1.0508068799972534, + "learning_rate": 1.044755299045279e-05, + "loss": 1.5444, + "step": 9449 + }, + { + "epoch": 0.5166545931686781, + "grad_norm": 1.2534986734390259, + "learning_rate": 1.0445727456749484e-05, + "loss": 1.2543, + "step": 9450 + }, + { + "epoch": 0.5167092656123997, + "grad_norm": 1.33258056640625, + "learning_rate": 1.0443901908162291e-05, + "loss": 1.4652, + "step": 9451 + }, + { + "epoch": 0.5167639380561213, + "grad_norm": 1.7028062343597412, + "learning_rate": 1.0442076344752173e-05, + "loss": 1.7389, + "step": 9452 + }, + { + "epoch": 0.5168186104998428, + "grad_norm": 1.3991129398345947, + "learning_rate": 1.0440250766580086e-05, + "loss": 1.4162, + "step": 9453 + }, + { + "epoch": 0.5168732829435644, + "grad_norm": 1.8030035495758057, + "learning_rate": 1.0438425173706994e-05, + "loss": 1.3696, + "step": 9454 + }, + { + "epoch": 0.516927955387286, + "grad_norm": 1.3603191375732422, + "learning_rate": 1.043659956619386e-05, + "loss": 1.4439, + "step": 9455 + }, + { + "epoch": 0.5169826278310075, + "grad_norm": 1.6502283811569214, + "learning_rate": 1.0434773944101637e-05, + "loss": 1.2276, + "step": 9456 + }, + { + "epoch": 0.517037300274729, + "grad_norm": 1.2613645792007446, + "learning_rate": 1.0432948307491296e-05, + "loss": 1.7318, + "step": 9457 + }, + { + "epoch": 0.5170919727184505, + "grad_norm": 1.7055522203445435, + "learning_rate": 1.0431122656423791e-05, + "loss": 1.4091, + "step": 9458 + }, + { + "epoch": 0.5171466451621721, + "grad_norm": 1.8832838535308838, + "learning_rate": 1.0429296990960092e-05, + "loss": 1.5602, + "step": 9459 + }, + { + "epoch": 0.5172013176058937, + "grad_norm": 1.4526329040527344, + "learning_rate": 1.0427471311161157e-05, + "loss": 1.4107, + "step": 9460 + }, + { + "epoch": 0.5172559900496152, + "grad_norm": 1.5156830549240112, + "learning_rate": 1.0425645617087951e-05, + "loss": 1.6191, + "step": 9461 + }, + { + "epoch": 0.5173106624933368, + "grad_norm": 1.5977174043655396, + "learning_rate": 1.0423819908801443e-05, + "loss": 1.3534, + "step": 9462 + }, + { + "epoch": 0.5173653349370584, + "grad_norm": 1.3816940784454346, + "learning_rate": 1.0421994186362591e-05, + "loss": 1.3807, + "step": 9463 + }, + { + "epoch": 0.5174200073807799, + "grad_norm": 1.9989253282546997, + "learning_rate": 1.0420168449832362e-05, + "loss": 1.4925, + "step": 9464 + }, + { + "epoch": 0.5174746798245015, + "grad_norm": 1.5782816410064697, + "learning_rate": 1.0418342699271724e-05, + "loss": 1.6397, + "step": 9465 + }, + { + "epoch": 0.517529352268223, + "grad_norm": 1.551824927330017, + "learning_rate": 1.0416516934741643e-05, + "loss": 1.2709, + "step": 9466 + }, + { + "epoch": 0.5175840247119445, + "grad_norm": 1.34273362159729, + "learning_rate": 1.041469115630308e-05, + "loss": 1.4482, + "step": 9467 + }, + { + "epoch": 0.5176386971556661, + "grad_norm": 1.560486078262329, + "learning_rate": 1.041286536401701e-05, + "loss": 1.454, + "step": 9468 + }, + { + "epoch": 0.5176933695993877, + "grad_norm": 1.52601158618927, + "learning_rate": 1.0411039557944396e-05, + "loss": 1.3147, + "step": 9469 + }, + { + "epoch": 0.5177480420431092, + "grad_norm": 2.0221779346466064, + "learning_rate": 1.0409213738146207e-05, + "loss": 1.322, + "step": 9470 + }, + { + "epoch": 0.5178027144868308, + "grad_norm": 1.5173531770706177, + "learning_rate": 1.0407387904683408e-05, + "loss": 1.3691, + "step": 9471 + }, + { + "epoch": 0.5178573869305523, + "grad_norm": 1.2523812055587769, + "learning_rate": 1.0405562057616972e-05, + "loss": 1.5591, + "step": 9472 + }, + { + "epoch": 0.5179120593742739, + "grad_norm": 1.5398751497268677, + "learning_rate": 1.040373619700787e-05, + "loss": 1.2419, + "step": 9473 + }, + { + "epoch": 0.5179667318179955, + "grad_norm": 1.6726717948913574, + "learning_rate": 1.0401910322917066e-05, + "loss": 1.3472, + "step": 9474 + }, + { + "epoch": 0.518021404261717, + "grad_norm": 1.5303915739059448, + "learning_rate": 1.0400084435405533e-05, + "loss": 1.4973, + "step": 9475 + }, + { + "epoch": 0.5180760767054385, + "grad_norm": 1.3579703569412231, + "learning_rate": 1.0398258534534245e-05, + "loss": 1.439, + "step": 9476 + }, + { + "epoch": 0.5181307491491601, + "grad_norm": 1.3903905153274536, + "learning_rate": 1.039643262036417e-05, + "loss": 1.408, + "step": 9477 + }, + { + "epoch": 0.5181854215928816, + "grad_norm": 1.3948336839675903, + "learning_rate": 1.0394606692956275e-05, + "loss": 1.3379, + "step": 9478 + }, + { + "epoch": 0.5182400940366032, + "grad_norm": 1.68294095993042, + "learning_rate": 1.0392780752371539e-05, + "loss": 1.3717, + "step": 9479 + }, + { + "epoch": 0.5182947664803248, + "grad_norm": 2.1559536457061768, + "learning_rate": 1.0390954798670934e-05, + "loss": 1.2089, + "step": 9480 + }, + { + "epoch": 0.5183494389240463, + "grad_norm": 1.389402151107788, + "learning_rate": 1.0389128831915427e-05, + "loss": 1.4885, + "step": 9481 + }, + { + "epoch": 0.5184041113677679, + "grad_norm": 1.9687488079071045, + "learning_rate": 1.0387302852166e-05, + "loss": 1.5883, + "step": 9482 + }, + { + "epoch": 0.5184587838114895, + "grad_norm": 1.254309058189392, + "learning_rate": 1.038547685948362e-05, + "loss": 1.3891, + "step": 9483 + }, + { + "epoch": 0.518513456255211, + "grad_norm": 1.7736709117889404, + "learning_rate": 1.0383650853929261e-05, + "loss": 1.4927, + "step": 9484 + }, + { + "epoch": 0.5185681286989325, + "grad_norm": 2.009521484375, + "learning_rate": 1.0381824835563901e-05, + "loss": 1.294, + "step": 9485 + }, + { + "epoch": 0.518622801142654, + "grad_norm": 1.2868850231170654, + "learning_rate": 1.0379998804448512e-05, + "loss": 1.4955, + "step": 9486 + }, + { + "epoch": 0.5186774735863756, + "grad_norm": 1.2905583381652832, + "learning_rate": 1.0378172760644074e-05, + "loss": 1.3632, + "step": 9487 + }, + { + "epoch": 0.5187321460300972, + "grad_norm": 1.727806806564331, + "learning_rate": 1.037634670421156e-05, + "loss": 1.5186, + "step": 9488 + }, + { + "epoch": 0.5187868184738187, + "grad_norm": 1.4191644191741943, + "learning_rate": 1.0374520635211947e-05, + "loss": 1.39, + "step": 9489 + }, + { + "epoch": 0.5188414909175403, + "grad_norm": 1.6281979084014893, + "learning_rate": 1.037269455370621e-05, + "loss": 1.4062, + "step": 9490 + }, + { + "epoch": 0.5188961633612619, + "grad_norm": 1.7567538022994995, + "learning_rate": 1.0370868459755325e-05, + "loss": 1.4515, + "step": 9491 + }, + { + "epoch": 0.5189508358049834, + "grad_norm": 1.5831151008605957, + "learning_rate": 1.0369042353420274e-05, + "loss": 1.453, + "step": 9492 + }, + { + "epoch": 0.519005508248705, + "grad_norm": 1.5298832654953003, + "learning_rate": 1.0367216234762032e-05, + "loss": 1.7031, + "step": 9493 + }, + { + "epoch": 0.5190601806924265, + "grad_norm": 1.4643945693969727, + "learning_rate": 1.0365390103841579e-05, + "loss": 1.1905, + "step": 9494 + }, + { + "epoch": 0.519114853136148, + "grad_norm": 2.1383633613586426, + "learning_rate": 1.0363563960719894e-05, + "loss": 1.4423, + "step": 9495 + }, + { + "epoch": 0.5191695255798696, + "grad_norm": 1.372326374053955, + "learning_rate": 1.0361737805457954e-05, + "loss": 1.4626, + "step": 9496 + }, + { + "epoch": 0.5192241980235912, + "grad_norm": 1.3635903596878052, + "learning_rate": 1.0359911638116742e-05, + "loss": 1.6916, + "step": 9497 + }, + { + "epoch": 0.5192788704673127, + "grad_norm": 1.5875846147537231, + "learning_rate": 1.0358085458757233e-05, + "loss": 1.5461, + "step": 9498 + }, + { + "epoch": 0.5193335429110343, + "grad_norm": 1.8604371547698975, + "learning_rate": 1.035625926744041e-05, + "loss": 1.5253, + "step": 9499 + }, + { + "epoch": 0.5193882153547558, + "grad_norm": 1.314601182937622, + "learning_rate": 1.0354433064227255e-05, + "loss": 1.5822, + "step": 9500 + }, + { + "epoch": 0.5194428877984774, + "grad_norm": 2.0022079944610596, + "learning_rate": 1.0352606849178747e-05, + "loss": 1.296, + "step": 9501 + }, + { + "epoch": 0.519497560242199, + "grad_norm": 1.623867392539978, + "learning_rate": 1.0350780622355874e-05, + "loss": 1.5996, + "step": 9502 + }, + { + "epoch": 0.5195522326859204, + "grad_norm": 1.5212960243225098, + "learning_rate": 1.0348954383819607e-05, + "loss": 1.308, + "step": 9503 + }, + { + "epoch": 0.519606905129642, + "grad_norm": 1.6847375631332397, + "learning_rate": 1.034712813363094e-05, + "loss": 1.5737, + "step": 9504 + }, + { + "epoch": 0.5196615775733636, + "grad_norm": 1.5652889013290405, + "learning_rate": 1.0345301871850843e-05, + "loss": 1.4886, + "step": 9505 + }, + { + "epoch": 0.5197162500170851, + "grad_norm": 1.3291691541671753, + "learning_rate": 1.034347559854031e-05, + "loss": 1.4179, + "step": 9506 + }, + { + "epoch": 0.5197709224608067, + "grad_norm": 1.8042397499084473, + "learning_rate": 1.034164931376032e-05, + "loss": 1.5426, + "step": 9507 + }, + { + "epoch": 0.5198255949045283, + "grad_norm": 1.8125863075256348, + "learning_rate": 1.033982301757186e-05, + "loss": 1.4004, + "step": 9508 + }, + { + "epoch": 0.5198802673482498, + "grad_norm": 1.4745984077453613, + "learning_rate": 1.0337996710035911e-05, + "loss": 1.3728, + "step": 9509 + }, + { + "epoch": 0.5199349397919714, + "grad_norm": 1.6046141386032104, + "learning_rate": 1.0336170391213457e-05, + "loss": 1.5674, + "step": 9510 + }, + { + "epoch": 0.519989612235693, + "grad_norm": 1.5319678783416748, + "learning_rate": 1.0334344061165486e-05, + "loss": 1.3489, + "step": 9511 + }, + { + "epoch": 0.5200442846794144, + "grad_norm": 1.3706871271133423, + "learning_rate": 1.0332517719952982e-05, + "loss": 1.3834, + "step": 9512 + }, + { + "epoch": 0.520098957123136, + "grad_norm": 1.637779712677002, + "learning_rate": 1.0330691367636932e-05, + "loss": 1.6397, + "step": 9513 + }, + { + "epoch": 0.5201536295668575, + "grad_norm": 1.3003534078598022, + "learning_rate": 1.0328865004278317e-05, + "loss": 1.7123, + "step": 9514 + }, + { + "epoch": 0.5202083020105791, + "grad_norm": 1.5285308361053467, + "learning_rate": 1.0327038629938134e-05, + "loss": 1.3977, + "step": 9515 + }, + { + "epoch": 0.5202629744543007, + "grad_norm": 1.3373881578445435, + "learning_rate": 1.0325212244677361e-05, + "loss": 1.4431, + "step": 9516 + }, + { + "epoch": 0.5203176468980222, + "grad_norm": 1.5273185968399048, + "learning_rate": 1.0323385848556989e-05, + "loss": 1.3534, + "step": 9517 + }, + { + "epoch": 0.5203723193417438, + "grad_norm": 1.271690011024475, + "learning_rate": 1.0321559441638002e-05, + "loss": 1.5617, + "step": 9518 + }, + { + "epoch": 0.5204269917854654, + "grad_norm": 1.3662053346633911, + "learning_rate": 1.0319733023981392e-05, + "loss": 1.4012, + "step": 9519 + }, + { + "epoch": 0.5204816642291868, + "grad_norm": 1.6710349321365356, + "learning_rate": 1.0317906595648146e-05, + "loss": 1.3973, + "step": 9520 + }, + { + "epoch": 0.5205363366729084, + "grad_norm": 1.340355396270752, + "learning_rate": 1.0316080156699253e-05, + "loss": 1.3865, + "step": 9521 + }, + { + "epoch": 0.52059100911663, + "grad_norm": 1.4597227573394775, + "learning_rate": 1.0314253707195706e-05, + "loss": 1.6124, + "step": 9522 + }, + { + "epoch": 0.5206456815603515, + "grad_norm": 1.4714012145996094, + "learning_rate": 1.0312427247198484e-05, + "loss": 1.7316, + "step": 9523 + }, + { + "epoch": 0.5207003540040731, + "grad_norm": 1.3162660598754883, + "learning_rate": 1.0310600776768585e-05, + "loss": 1.3624, + "step": 9524 + }, + { + "epoch": 0.5207550264477947, + "grad_norm": 1.2346307039260864, + "learning_rate": 1.0308774295966999e-05, + "loss": 1.6301, + "step": 9525 + }, + { + "epoch": 0.5208096988915162, + "grad_norm": 1.8855581283569336, + "learning_rate": 1.030694780485471e-05, + "loss": 1.2499, + "step": 9526 + }, + { + "epoch": 0.5208643713352378, + "grad_norm": 1.3357806205749512, + "learning_rate": 1.0305121303492718e-05, + "loss": 1.3353, + "step": 9527 + }, + { + "epoch": 0.5209190437789593, + "grad_norm": 1.2748231887817383, + "learning_rate": 1.030329479194201e-05, + "loss": 1.5081, + "step": 9528 + }, + { + "epoch": 0.5209737162226808, + "grad_norm": 2.646749496459961, + "learning_rate": 1.0301468270263575e-05, + "loss": 1.3482, + "step": 9529 + }, + { + "epoch": 0.5210283886664024, + "grad_norm": 1.1052207946777344, + "learning_rate": 1.0299641738518407e-05, + "loss": 1.5232, + "step": 9530 + }, + { + "epoch": 0.5210830611101239, + "grad_norm": 1.5764282941818237, + "learning_rate": 1.0297815196767498e-05, + "loss": 1.3985, + "step": 9531 + }, + { + "epoch": 0.5211377335538455, + "grad_norm": 1.4225050210952759, + "learning_rate": 1.0295988645071844e-05, + "loss": 1.5835, + "step": 9532 + }, + { + "epoch": 0.5211924059975671, + "grad_norm": 1.3879603147506714, + "learning_rate": 1.0294162083492429e-05, + "loss": 1.3799, + "step": 9533 + }, + { + "epoch": 0.5212470784412886, + "grad_norm": 1.8443543910980225, + "learning_rate": 1.0292335512090255e-05, + "loss": 1.307, + "step": 9534 + }, + { + "epoch": 0.5213017508850102, + "grad_norm": 1.478663444519043, + "learning_rate": 1.0290508930926314e-05, + "loss": 1.0638, + "step": 9535 + }, + { + "epoch": 0.5213564233287318, + "grad_norm": 1.8184336423873901, + "learning_rate": 1.0288682340061598e-05, + "loss": 1.3332, + "step": 9536 + }, + { + "epoch": 0.5214110957724533, + "grad_norm": 1.5316193103790283, + "learning_rate": 1.0286855739557097e-05, + "loss": 1.441, + "step": 9537 + }, + { + "epoch": 0.5214657682161749, + "grad_norm": 1.2921193838119507, + "learning_rate": 1.0285029129473814e-05, + "loss": 1.329, + "step": 9538 + }, + { + "epoch": 0.5215204406598964, + "grad_norm": 1.2495921850204468, + "learning_rate": 1.0283202509872738e-05, + "loss": 1.2849, + "step": 9539 + }, + { + "epoch": 0.5215751131036179, + "grad_norm": 1.5688903331756592, + "learning_rate": 1.0281375880814864e-05, + "loss": 1.4245, + "step": 9540 + }, + { + "epoch": 0.5216297855473395, + "grad_norm": 1.7210745811462402, + "learning_rate": 1.0279549242361193e-05, + "loss": 1.6672, + "step": 9541 + }, + { + "epoch": 0.521684457991061, + "grad_norm": 1.668135166168213, + "learning_rate": 1.0277722594572714e-05, + "loss": 1.4378, + "step": 9542 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 1.75582754611969, + "learning_rate": 1.0275895937510426e-05, + "loss": 1.594, + "step": 9543 + }, + { + "epoch": 0.5217938028785042, + "grad_norm": 1.7121915817260742, + "learning_rate": 1.0274069271235326e-05, + "loss": 1.3755, + "step": 9544 + }, + { + "epoch": 0.5218484753222257, + "grad_norm": 1.2993600368499756, + "learning_rate": 1.0272242595808406e-05, + "loss": 1.4691, + "step": 9545 + }, + { + "epoch": 0.5219031477659473, + "grad_norm": 2.930612564086914, + "learning_rate": 1.0270415911290673e-05, + "loss": 1.6179, + "step": 9546 + }, + { + "epoch": 0.5219578202096689, + "grad_norm": 1.4958484172821045, + "learning_rate": 1.0268589217743114e-05, + "loss": 1.3196, + "step": 9547 + }, + { + "epoch": 0.5220124926533903, + "grad_norm": 1.4111984968185425, + "learning_rate": 1.0266762515226734e-05, + "loss": 1.6797, + "step": 9548 + }, + { + "epoch": 0.5220671650971119, + "grad_norm": 1.2907785177230835, + "learning_rate": 1.0264935803802527e-05, + "loss": 1.6488, + "step": 9549 + }, + { + "epoch": 0.5221218375408335, + "grad_norm": 1.3722456693649292, + "learning_rate": 1.0263109083531489e-05, + "loss": 1.4184, + "step": 9550 + }, + { + "epoch": 0.522176509984555, + "grad_norm": 1.4416364431381226, + "learning_rate": 1.0261282354474622e-05, + "loss": 1.4285, + "step": 9551 + }, + { + "epoch": 0.5222311824282766, + "grad_norm": 1.3551143407821655, + "learning_rate": 1.0259455616692924e-05, + "loss": 1.487, + "step": 9552 + }, + { + "epoch": 0.5222858548719982, + "grad_norm": 1.4466331005096436, + "learning_rate": 1.0257628870247396e-05, + "loss": 1.4049, + "step": 9553 + }, + { + "epoch": 0.5223405273157197, + "grad_norm": 1.5072308778762817, + "learning_rate": 1.0255802115199034e-05, + "loss": 1.5103, + "step": 9554 + }, + { + "epoch": 0.5223951997594413, + "grad_norm": 1.5767408609390259, + "learning_rate": 1.0253975351608842e-05, + "loss": 1.4141, + "step": 9555 + }, + { + "epoch": 0.5224498722031627, + "grad_norm": 1.5630857944488525, + "learning_rate": 1.0252148579537816e-05, + "loss": 1.6867, + "step": 9556 + }, + { + "epoch": 0.5225045446468843, + "grad_norm": 1.4928158521652222, + "learning_rate": 1.0250321799046953e-05, + "loss": 1.671, + "step": 9557 + }, + { + "epoch": 0.5225592170906059, + "grad_norm": 1.390580177307129, + "learning_rate": 1.0248495010197262e-05, + "loss": 1.4781, + "step": 9558 + }, + { + "epoch": 0.5226138895343274, + "grad_norm": 1.4308449029922485, + "learning_rate": 1.0246668213049737e-05, + "loss": 1.3995, + "step": 9559 + }, + { + "epoch": 0.522668561978049, + "grad_norm": 1.2756041288375854, + "learning_rate": 1.0244841407665385e-05, + "loss": 1.6189, + "step": 9560 + }, + { + "epoch": 0.5227232344217706, + "grad_norm": 1.4992676973342896, + "learning_rate": 1.0243014594105201e-05, + "loss": 1.6578, + "step": 9561 + }, + { + "epoch": 0.5227779068654921, + "grad_norm": 1.6568677425384521, + "learning_rate": 1.024118777243019e-05, + "loss": 1.3396, + "step": 9562 + }, + { + "epoch": 0.5228325793092137, + "grad_norm": 1.5115883350372314, + "learning_rate": 1.0239360942701356e-05, + "loss": 1.3168, + "step": 9563 + }, + { + "epoch": 0.5228872517529353, + "grad_norm": 1.610040307044983, + "learning_rate": 1.0237534104979694e-05, + "loss": 1.6644, + "step": 9564 + }, + { + "epoch": 0.5229419241966567, + "grad_norm": 1.6417665481567383, + "learning_rate": 1.0235707259326211e-05, + "loss": 1.5365, + "step": 9565 + }, + { + "epoch": 0.5229965966403783, + "grad_norm": 1.3827612400054932, + "learning_rate": 1.023388040580191e-05, + "loss": 1.376, + "step": 9566 + }, + { + "epoch": 0.5230512690840999, + "grad_norm": 1.8111459016799927, + "learning_rate": 1.0232053544467798e-05, + "loss": 1.4631, + "step": 9567 + }, + { + "epoch": 0.5231059415278214, + "grad_norm": 1.5050125122070312, + "learning_rate": 1.0230226675384868e-05, + "loss": 1.2838, + "step": 9568 + }, + { + "epoch": 0.523160613971543, + "grad_norm": 1.607077717781067, + "learning_rate": 1.0228399798614132e-05, + "loss": 1.5915, + "step": 9569 + }, + { + "epoch": 0.5232152864152645, + "grad_norm": 1.4527966976165771, + "learning_rate": 1.0226572914216593e-05, + "loss": 1.5628, + "step": 9570 + }, + { + "epoch": 0.5232699588589861, + "grad_norm": 1.5377179384231567, + "learning_rate": 1.0224746022253248e-05, + "loss": 1.2521, + "step": 9571 + }, + { + "epoch": 0.5233246313027077, + "grad_norm": 1.4249900579452515, + "learning_rate": 1.0222919122785107e-05, + "loss": 1.4678, + "step": 9572 + }, + { + "epoch": 0.5233793037464292, + "grad_norm": 1.4100884199142456, + "learning_rate": 1.0221092215873171e-05, + "loss": 1.4423, + "step": 9573 + }, + { + "epoch": 0.5234339761901508, + "grad_norm": 1.9504755735397339, + "learning_rate": 1.0219265301578454e-05, + "loss": 1.4517, + "step": 9574 + }, + { + "epoch": 0.5234886486338723, + "grad_norm": 1.5943074226379395, + "learning_rate": 1.021743837996195e-05, + "loss": 1.3566, + "step": 9575 + }, + { + "epoch": 0.5235433210775938, + "grad_norm": 1.6218039989471436, + "learning_rate": 1.0215611451084668e-05, + "loss": 1.5432, + "step": 9576 + }, + { + "epoch": 0.5235979935213154, + "grad_norm": 1.5321696996688843, + "learning_rate": 1.0213784515007616e-05, + "loss": 1.5141, + "step": 9577 + }, + { + "epoch": 0.523652665965037, + "grad_norm": 1.3611992597579956, + "learning_rate": 1.0211957571791796e-05, + "loss": 1.407, + "step": 9578 + }, + { + "epoch": 0.5237073384087585, + "grad_norm": 1.6319527626037598, + "learning_rate": 1.0210130621498214e-05, + "loss": 1.8448, + "step": 9579 + }, + { + "epoch": 0.5237620108524801, + "grad_norm": 1.4232940673828125, + "learning_rate": 1.0208303664187877e-05, + "loss": 1.4847, + "step": 9580 + }, + { + "epoch": 0.5238166832962017, + "grad_norm": 1.455815076828003, + "learning_rate": 1.0206476699921795e-05, + "loss": 1.5561, + "step": 9581 + }, + { + "epoch": 0.5238713557399232, + "grad_norm": 1.7595431804656982, + "learning_rate": 1.0204649728760969e-05, + "loss": 1.4538, + "step": 9582 + }, + { + "epoch": 0.5239260281836448, + "grad_norm": 1.5439138412475586, + "learning_rate": 1.0202822750766408e-05, + "loss": 1.2576, + "step": 9583 + }, + { + "epoch": 0.5239807006273663, + "grad_norm": 1.39867103099823, + "learning_rate": 1.0200995765999122e-05, + "loss": 1.4945, + "step": 9584 + }, + { + "epoch": 0.5240353730710878, + "grad_norm": 1.394458293914795, + "learning_rate": 1.0199168774520115e-05, + "loss": 1.2022, + "step": 9585 + }, + { + "epoch": 0.5240900455148094, + "grad_norm": 1.5822076797485352, + "learning_rate": 1.0197341776390393e-05, + "loss": 1.2787, + "step": 9586 + }, + { + "epoch": 0.5241447179585309, + "grad_norm": 1.854570984840393, + "learning_rate": 1.0195514771670967e-05, + "loss": 1.521, + "step": 9587 + }, + { + "epoch": 0.5241993904022525, + "grad_norm": 1.8833394050598145, + "learning_rate": 1.0193687760422846e-05, + "loss": 1.5278, + "step": 9588 + }, + { + "epoch": 0.5242540628459741, + "grad_norm": 1.4794015884399414, + "learning_rate": 1.0191860742707034e-05, + "loss": 1.324, + "step": 9589 + }, + { + "epoch": 0.5243087352896956, + "grad_norm": 1.6917262077331543, + "learning_rate": 1.0190033718584542e-05, + "loss": 1.4285, + "step": 9590 + }, + { + "epoch": 0.5243634077334172, + "grad_norm": 1.8994368314743042, + "learning_rate": 1.018820668811638e-05, + "loss": 1.512, + "step": 9591 + }, + { + "epoch": 0.5244180801771388, + "grad_norm": 1.4566365480422974, + "learning_rate": 1.0186379651363551e-05, + "loss": 1.4208, + "step": 9592 + }, + { + "epoch": 0.5244727526208602, + "grad_norm": 1.5757614374160767, + "learning_rate": 1.0184552608387072e-05, + "loss": 1.5404, + "step": 9593 + }, + { + "epoch": 0.5245274250645818, + "grad_norm": 1.6264101266860962, + "learning_rate": 1.0182725559247945e-05, + "loss": 1.448, + "step": 9594 + }, + { + "epoch": 0.5245820975083034, + "grad_norm": 1.501527190208435, + "learning_rate": 1.0180898504007188e-05, + "loss": 1.4709, + "step": 9595 + }, + { + "epoch": 0.5246367699520249, + "grad_norm": 1.940700650215149, + "learning_rate": 1.0179071442725801e-05, + "loss": 1.0862, + "step": 9596 + }, + { + "epoch": 0.5246914423957465, + "grad_norm": 1.6064176559448242, + "learning_rate": 1.01772443754648e-05, + "loss": 1.6009, + "step": 9597 + }, + { + "epoch": 0.5247461148394681, + "grad_norm": 1.485619306564331, + "learning_rate": 1.0175417302285194e-05, + "loss": 1.5514, + "step": 9598 + }, + { + "epoch": 0.5248007872831896, + "grad_norm": 1.7575874328613281, + "learning_rate": 1.017359022324799e-05, + "loss": 1.2956, + "step": 9599 + }, + { + "epoch": 0.5248554597269112, + "grad_norm": 1.5844981670379639, + "learning_rate": 1.0171763138414203e-05, + "loss": 1.424, + "step": 9600 + }, + { + "epoch": 0.5249101321706326, + "grad_norm": 1.3777401447296143, + "learning_rate": 1.016993604784484e-05, + "loss": 1.3765, + "step": 9601 + }, + { + "epoch": 0.5249648046143542, + "grad_norm": 1.5816842317581177, + "learning_rate": 1.0168108951600917e-05, + "loss": 1.5319, + "step": 9602 + }, + { + "epoch": 0.5250194770580758, + "grad_norm": 1.3401025533676147, + "learning_rate": 1.0166281849743438e-05, + "loss": 1.5741, + "step": 9603 + }, + { + "epoch": 0.5250741495017973, + "grad_norm": 1.2884364128112793, + "learning_rate": 1.0164454742333419e-05, + "loss": 1.4343, + "step": 9604 + }, + { + "epoch": 0.5251288219455189, + "grad_norm": 1.5652759075164795, + "learning_rate": 1.016262762943187e-05, + "loss": 1.3908, + "step": 9605 + }, + { + "epoch": 0.5251834943892405, + "grad_norm": 1.5855352878570557, + "learning_rate": 1.0160800511099805e-05, + "loss": 1.4088, + "step": 9606 + }, + { + "epoch": 0.525238166832962, + "grad_norm": 1.4046884775161743, + "learning_rate": 1.0158973387398231e-05, + "loss": 1.4279, + "step": 9607 + }, + { + "epoch": 0.5252928392766836, + "grad_norm": 1.5906561613082886, + "learning_rate": 1.0157146258388163e-05, + "loss": 1.4024, + "step": 9608 + }, + { + "epoch": 0.5253475117204052, + "grad_norm": 2.1078853607177734, + "learning_rate": 1.0155319124130616e-05, + "loss": 1.3824, + "step": 9609 + }, + { + "epoch": 0.5254021841641267, + "grad_norm": 1.7148728370666504, + "learning_rate": 1.0153491984686595e-05, + "loss": 1.3634, + "step": 9610 + }, + { + "epoch": 0.5254568566078482, + "grad_norm": 1.449822187423706, + "learning_rate": 1.0151664840117118e-05, + "loss": 1.461, + "step": 9611 + }, + { + "epoch": 0.5255115290515698, + "grad_norm": 1.6124647855758667, + "learning_rate": 1.0149837690483195e-05, + "loss": 1.3594, + "step": 9612 + }, + { + "epoch": 0.5255662014952913, + "grad_norm": 1.516284704208374, + "learning_rate": 1.0148010535845842e-05, + "loss": 1.0809, + "step": 9613 + }, + { + "epoch": 0.5256208739390129, + "grad_norm": 1.718106985092163, + "learning_rate": 1.014618337626607e-05, + "loss": 1.5208, + "step": 9614 + }, + { + "epoch": 0.5256755463827344, + "grad_norm": 1.668699860572815, + "learning_rate": 1.0144356211804888e-05, + "loss": 1.3737, + "step": 9615 + }, + { + "epoch": 0.525730218826456, + "grad_norm": 1.3984092473983765, + "learning_rate": 1.014252904252332e-05, + "loss": 1.413, + "step": 9616 + }, + { + "epoch": 0.5257848912701776, + "grad_norm": 1.4565110206604004, + "learning_rate": 1.0140701868482365e-05, + "loss": 1.6205, + "step": 9617 + }, + { + "epoch": 0.5258395637138991, + "grad_norm": 1.7076020240783691, + "learning_rate": 1.0138874689743048e-05, + "loss": 1.6454, + "step": 9618 + }, + { + "epoch": 0.5258942361576207, + "grad_norm": 1.4167739152908325, + "learning_rate": 1.0137047506366382e-05, + "loss": 1.3319, + "step": 9619 + }, + { + "epoch": 0.5259489086013422, + "grad_norm": 1.4535170793533325, + "learning_rate": 1.0135220318413377e-05, + "loss": 1.5188, + "step": 9620 + }, + { + "epoch": 0.5260035810450637, + "grad_norm": 1.196488380432129, + "learning_rate": 1.0133393125945045e-05, + "loss": 1.597, + "step": 9621 + }, + { + "epoch": 0.5260582534887853, + "grad_norm": 1.238962173461914, + "learning_rate": 1.0131565929022405e-05, + "loss": 1.4822, + "step": 9622 + }, + { + "epoch": 0.5261129259325069, + "grad_norm": 1.4145848751068115, + "learning_rate": 1.012973872770647e-05, + "loss": 1.3025, + "step": 9623 + }, + { + "epoch": 0.5261675983762284, + "grad_norm": 1.3592298030853271, + "learning_rate": 1.0127911522058256e-05, + "loss": 1.6238, + "step": 9624 + }, + { + "epoch": 0.52622227081995, + "grad_norm": 1.5085647106170654, + "learning_rate": 1.0126084312138774e-05, + "loss": 1.4379, + "step": 9625 + }, + { + "epoch": 0.5262769432636716, + "grad_norm": 1.579502820968628, + "learning_rate": 1.0124257098009042e-05, + "loss": 1.5559, + "step": 9626 + }, + { + "epoch": 0.5263316157073931, + "grad_norm": 1.7552965879440308, + "learning_rate": 1.0122429879730075e-05, + "loss": 1.4593, + "step": 9627 + }, + { + "epoch": 0.5263862881511147, + "grad_norm": 2.219430446624756, + "learning_rate": 1.0120602657362885e-05, + "loss": 1.2933, + "step": 9628 + }, + { + "epoch": 0.5264409605948361, + "grad_norm": 1.6113308668136597, + "learning_rate": 1.011877543096849e-05, + "loss": 1.5287, + "step": 9629 + }, + { + "epoch": 0.5264956330385577, + "grad_norm": 1.3754653930664062, + "learning_rate": 1.0116948200607906e-05, + "loss": 1.7803, + "step": 9630 + }, + { + "epoch": 0.5265503054822793, + "grad_norm": 1.373289942741394, + "learning_rate": 1.0115120966342145e-05, + "loss": 1.357, + "step": 9631 + }, + { + "epoch": 0.5266049779260008, + "grad_norm": 1.5567547082901, + "learning_rate": 1.0113293728232227e-05, + "loss": 1.5245, + "step": 9632 + }, + { + "epoch": 0.5266596503697224, + "grad_norm": 1.4499560594558716, + "learning_rate": 1.0111466486339166e-05, + "loss": 1.5029, + "step": 9633 + }, + { + "epoch": 0.526714322813444, + "grad_norm": 1.2369823455810547, + "learning_rate": 1.0109639240723974e-05, + "loss": 1.5986, + "step": 9634 + }, + { + "epoch": 0.5267689952571655, + "grad_norm": 1.521814227104187, + "learning_rate": 1.0107811991447672e-05, + "loss": 1.509, + "step": 9635 + }, + { + "epoch": 0.5268236677008871, + "grad_norm": 1.674859642982483, + "learning_rate": 1.0105984738571274e-05, + "loss": 1.3508, + "step": 9636 + }, + { + "epoch": 0.5268783401446087, + "grad_norm": 1.42023503780365, + "learning_rate": 1.0104157482155798e-05, + "loss": 1.2525, + "step": 9637 + }, + { + "epoch": 0.5269330125883301, + "grad_norm": 2.0146121978759766, + "learning_rate": 1.0102330222262257e-05, + "loss": 1.4424, + "step": 9638 + }, + { + "epoch": 0.5269876850320517, + "grad_norm": 1.5610637664794922, + "learning_rate": 1.0100502958951675e-05, + "loss": 1.5675, + "step": 9639 + }, + { + "epoch": 0.5270423574757733, + "grad_norm": 1.6515778303146362, + "learning_rate": 1.0098675692285061e-05, + "loss": 1.6734, + "step": 9640 + }, + { + "epoch": 0.5270970299194948, + "grad_norm": 1.8954486846923828, + "learning_rate": 1.009684842232343e-05, + "loss": 1.5336, + "step": 9641 + }, + { + "epoch": 0.5271517023632164, + "grad_norm": 1.377197027206421, + "learning_rate": 1.0095021149127806e-05, + "loss": 1.6067, + "step": 9642 + }, + { + "epoch": 0.5272063748069379, + "grad_norm": 1.5501303672790527, + "learning_rate": 1.0093193872759204e-05, + "loss": 1.3687, + "step": 9643 + }, + { + "epoch": 0.5272610472506595, + "grad_norm": 1.5989367961883545, + "learning_rate": 1.0091366593278639e-05, + "loss": 1.5745, + "step": 9644 + }, + { + "epoch": 0.5273157196943811, + "grad_norm": 1.419538140296936, + "learning_rate": 1.0089539310747127e-05, + "loss": 1.4625, + "step": 9645 + }, + { + "epoch": 0.5273703921381026, + "grad_norm": 1.7754912376403809, + "learning_rate": 1.0087712025225691e-05, + "loss": 1.2113, + "step": 9646 + }, + { + "epoch": 0.5274250645818241, + "grad_norm": 1.552901029586792, + "learning_rate": 1.0085884736775345e-05, + "loss": 1.5381, + "step": 9647 + }, + { + "epoch": 0.5274797370255457, + "grad_norm": 1.5761405229568481, + "learning_rate": 1.0084057445457103e-05, + "loss": 1.4763, + "step": 9648 + }, + { + "epoch": 0.5275344094692672, + "grad_norm": 1.1897656917572021, + "learning_rate": 1.0082230151331984e-05, + "loss": 1.4469, + "step": 9649 + }, + { + "epoch": 0.5275890819129888, + "grad_norm": 1.876583456993103, + "learning_rate": 1.008040285446101e-05, + "loss": 1.4293, + "step": 9650 + }, + { + "epoch": 0.5276437543567104, + "grad_norm": 1.8344290256500244, + "learning_rate": 1.0078575554905197e-05, + "loss": 1.3961, + "step": 9651 + }, + { + "epoch": 0.5276984268004319, + "grad_norm": 1.6777477264404297, + "learning_rate": 1.007674825272556e-05, + "loss": 1.5253, + "step": 9652 + }, + { + "epoch": 0.5277530992441535, + "grad_norm": 1.9384022951126099, + "learning_rate": 1.007492094798312e-05, + "loss": 1.2687, + "step": 9653 + }, + { + "epoch": 0.5278077716878751, + "grad_norm": 1.6434110403060913, + "learning_rate": 1.0073093640738896e-05, + "loss": 1.4352, + "step": 9654 + }, + { + "epoch": 0.5278624441315966, + "grad_norm": 1.516599178314209, + "learning_rate": 1.00712663310539e-05, + "loss": 1.2651, + "step": 9655 + }, + { + "epoch": 0.5279171165753181, + "grad_norm": 1.658997654914856, + "learning_rate": 1.0069439018989153e-05, + "loss": 1.3106, + "step": 9656 + }, + { + "epoch": 0.5279717890190396, + "grad_norm": 2.0392682552337646, + "learning_rate": 1.0067611704605675e-05, + "loss": 1.5442, + "step": 9657 + }, + { + "epoch": 0.5280264614627612, + "grad_norm": 1.771472692489624, + "learning_rate": 1.0065784387964486e-05, + "loss": 1.5352, + "step": 9658 + }, + { + "epoch": 0.5280811339064828, + "grad_norm": 1.3939868211746216, + "learning_rate": 1.0063957069126602e-05, + "loss": 1.3303, + "step": 9659 + }, + { + "epoch": 0.5281358063502043, + "grad_norm": 1.8863402605056763, + "learning_rate": 1.006212974815304e-05, + "loss": 1.3871, + "step": 9660 + }, + { + "epoch": 0.5281904787939259, + "grad_norm": 1.417218804359436, + "learning_rate": 1.0060302425104823e-05, + "loss": 1.4375, + "step": 9661 + }, + { + "epoch": 0.5282451512376475, + "grad_norm": 1.395156979560852, + "learning_rate": 1.0058475100042962e-05, + "loss": 1.7956, + "step": 9662 + }, + { + "epoch": 0.528299823681369, + "grad_norm": 1.534226417541504, + "learning_rate": 1.0056647773028478e-05, + "loss": 1.717, + "step": 9663 + }, + { + "epoch": 0.5283544961250906, + "grad_norm": 1.4277228116989136, + "learning_rate": 1.0054820444122395e-05, + "loss": 1.5395, + "step": 9664 + }, + { + "epoch": 0.5284091685688121, + "grad_norm": 1.4189090728759766, + "learning_rate": 1.0052993113385732e-05, + "loss": 1.5733, + "step": 9665 + }, + { + "epoch": 0.5284638410125336, + "grad_norm": 1.7558588981628418, + "learning_rate": 1.0051165780879503e-05, + "loss": 1.3994, + "step": 9666 + }, + { + "epoch": 0.5285185134562552, + "grad_norm": 1.5597220659255981, + "learning_rate": 1.0049338446664726e-05, + "loss": 1.5596, + "step": 9667 + }, + { + "epoch": 0.5285731858999768, + "grad_norm": 1.379339575767517, + "learning_rate": 1.0047511110802426e-05, + "loss": 1.4754, + "step": 9668 + }, + { + "epoch": 0.5286278583436983, + "grad_norm": 1.6735131740570068, + "learning_rate": 1.0045683773353616e-05, + "loss": 1.4029, + "step": 9669 + }, + { + "epoch": 0.5286825307874199, + "grad_norm": 1.865676760673523, + "learning_rate": 1.0043856434379316e-05, + "loss": 1.588, + "step": 9670 + }, + { + "epoch": 0.5287372032311414, + "grad_norm": 1.9310566186904907, + "learning_rate": 1.004202909394055e-05, + "loss": 1.4263, + "step": 9671 + }, + { + "epoch": 0.528791875674863, + "grad_norm": 1.389291763305664, + "learning_rate": 1.0040201752098335e-05, + "loss": 1.5347, + "step": 9672 + }, + { + "epoch": 0.5288465481185846, + "grad_norm": 1.4676599502563477, + "learning_rate": 1.0038374408913684e-05, + "loss": 1.4509, + "step": 9673 + }, + { + "epoch": 0.528901220562306, + "grad_norm": 1.5221308469772339, + "learning_rate": 1.0036547064447623e-05, + "loss": 1.3538, + "step": 9674 + }, + { + "epoch": 0.5289558930060276, + "grad_norm": 1.6120556592941284, + "learning_rate": 1.0034719718761174e-05, + "loss": 1.5242, + "step": 9675 + }, + { + "epoch": 0.5290105654497492, + "grad_norm": 1.738568902015686, + "learning_rate": 1.0032892371915348e-05, + "loss": 1.5551, + "step": 9676 + }, + { + "epoch": 0.5290652378934707, + "grad_norm": 1.841606855392456, + "learning_rate": 1.003106502397117e-05, + "loss": 1.5587, + "step": 9677 + }, + { + "epoch": 0.5291199103371923, + "grad_norm": 1.4009428024291992, + "learning_rate": 1.0029237674989658e-05, + "loss": 1.474, + "step": 9678 + }, + { + "epoch": 0.5291745827809139, + "grad_norm": 1.2373546361923218, + "learning_rate": 1.0027410325031831e-05, + "loss": 1.6851, + "step": 9679 + }, + { + "epoch": 0.5292292552246354, + "grad_norm": 1.6543498039245605, + "learning_rate": 1.002558297415871e-05, + "loss": 1.4397, + "step": 9680 + }, + { + "epoch": 0.529283927668357, + "grad_norm": 1.6356220245361328, + "learning_rate": 1.002375562243131e-05, + "loss": 1.5484, + "step": 9681 + }, + { + "epoch": 0.5293386001120786, + "grad_norm": 1.2176340818405151, + "learning_rate": 1.0021928269910658e-05, + "loss": 1.3838, + "step": 9682 + }, + { + "epoch": 0.5293932725558, + "grad_norm": 1.6561460494995117, + "learning_rate": 1.0020100916657769e-05, + "loss": 1.5052, + "step": 9683 + }, + { + "epoch": 0.5294479449995216, + "grad_norm": 2.075605869293213, + "learning_rate": 1.001827356273366e-05, + "loss": 1.4106, + "step": 9684 + }, + { + "epoch": 0.5295026174432431, + "grad_norm": 1.6827220916748047, + "learning_rate": 1.0016446208199357e-05, + "loss": 1.44, + "step": 9685 + }, + { + "epoch": 0.5295572898869647, + "grad_norm": 1.7650104761123657, + "learning_rate": 1.0014618853115879e-05, + "loss": 1.3059, + "step": 9686 + }, + { + "epoch": 0.5296119623306863, + "grad_norm": 1.6277958154678345, + "learning_rate": 1.0012791497544238e-05, + "loss": 1.3736, + "step": 9687 + }, + { + "epoch": 0.5296666347744078, + "grad_norm": 1.359501838684082, + "learning_rate": 1.0010964141545461e-05, + "loss": 1.5007, + "step": 9688 + }, + { + "epoch": 0.5297213072181294, + "grad_norm": 1.7554681301116943, + "learning_rate": 1.0009136785180566e-05, + "loss": 1.4892, + "step": 9689 + }, + { + "epoch": 0.529775979661851, + "grad_norm": 1.4352338314056396, + "learning_rate": 1.000730942851057e-05, + "loss": 1.1708, + "step": 9690 + }, + { + "epoch": 0.5298306521055725, + "grad_norm": 1.4561716318130493, + "learning_rate": 1.0005482071596497e-05, + "loss": 1.4222, + "step": 9691 + }, + { + "epoch": 0.529885324549294, + "grad_norm": 1.4909560680389404, + "learning_rate": 1.0003654714499365e-05, + "loss": 1.4739, + "step": 9692 + }, + { + "epoch": 0.5299399969930156, + "grad_norm": 1.4229393005371094, + "learning_rate": 1.0001827357280193e-05, + "loss": 1.3745, + "step": 9693 + }, + { + "epoch": 0.5299946694367371, + "grad_norm": 1.4268066883087158, + "learning_rate": 1e-05, + "loss": 1.5697, + "step": 9694 + }, + { + "epoch": 0.5300493418804587, + "grad_norm": 1.4708361625671387, + "learning_rate": 9.998172642719812e-06, + "loss": 1.4826, + "step": 9695 + }, + { + "epoch": 0.5301040143241803, + "grad_norm": 1.8723571300506592, + "learning_rate": 9.996345285500636e-06, + "loss": 1.7576, + "step": 9696 + }, + { + "epoch": 0.5301586867679018, + "grad_norm": 1.285131812095642, + "learning_rate": 9.994517928403505e-06, + "loss": 1.5028, + "step": 9697 + }, + { + "epoch": 0.5302133592116234, + "grad_norm": 1.481791615486145, + "learning_rate": 9.992690571489432e-06, + "loss": 1.5853, + "step": 9698 + }, + { + "epoch": 0.5302680316553449, + "grad_norm": 1.5402089357376099, + "learning_rate": 9.990863214819438e-06, + "loss": 1.3554, + "step": 9699 + }, + { + "epoch": 0.5303227040990665, + "grad_norm": 1.5881931781768799, + "learning_rate": 9.989035858454544e-06, + "loss": 1.4046, + "step": 9700 + }, + { + "epoch": 0.530377376542788, + "grad_norm": 1.4138480424880981, + "learning_rate": 9.987208502455767e-06, + "loss": 1.2812, + "step": 9701 + }, + { + "epoch": 0.5304320489865095, + "grad_norm": 1.7812997102737427, + "learning_rate": 9.985381146884125e-06, + "loss": 1.57, + "step": 9702 + }, + { + "epoch": 0.5304867214302311, + "grad_norm": 1.4645150899887085, + "learning_rate": 9.983553791800645e-06, + "loss": 1.6604, + "step": 9703 + }, + { + "epoch": 0.5305413938739527, + "grad_norm": 1.5250800848007202, + "learning_rate": 9.98172643726634e-06, + "loss": 1.2792, + "step": 9704 + }, + { + "epoch": 0.5305960663176742, + "grad_norm": 1.4013124704360962, + "learning_rate": 9.979899083342234e-06, + "loss": 1.537, + "step": 9705 + }, + { + "epoch": 0.5306507387613958, + "grad_norm": 1.5692267417907715, + "learning_rate": 9.978071730089347e-06, + "loss": 1.4951, + "step": 9706 + }, + { + "epoch": 0.5307054112051174, + "grad_norm": 1.579099178314209, + "learning_rate": 9.976244377568688e-06, + "loss": 1.469, + "step": 9707 + }, + { + "epoch": 0.5307600836488389, + "grad_norm": 1.282965898513794, + "learning_rate": 9.974417025841293e-06, + "loss": 1.1877, + "step": 9708 + }, + { + "epoch": 0.5308147560925605, + "grad_norm": 1.3091561794281006, + "learning_rate": 9.972589674968174e-06, + "loss": 1.176, + "step": 9709 + }, + { + "epoch": 0.530869428536282, + "grad_norm": 1.380894422531128, + "learning_rate": 9.970762325010344e-06, + "loss": 1.5581, + "step": 9710 + }, + { + "epoch": 0.5309241009800035, + "grad_norm": 1.9219566583633423, + "learning_rate": 9.968934976028833e-06, + "loss": 1.4105, + "step": 9711 + }, + { + "epoch": 0.5309787734237251, + "grad_norm": 1.5886187553405762, + "learning_rate": 9.967107628084657e-06, + "loss": 1.5756, + "step": 9712 + }, + { + "epoch": 0.5310334458674466, + "grad_norm": 2.4521853923797607, + "learning_rate": 9.965280281238828e-06, + "loss": 1.5124, + "step": 9713 + }, + { + "epoch": 0.5310881183111682, + "grad_norm": 2.4794135093688965, + "learning_rate": 9.963452935552379e-06, + "loss": 1.4278, + "step": 9714 + }, + { + "epoch": 0.5311427907548898, + "grad_norm": 1.7092700004577637, + "learning_rate": 9.961625591086321e-06, + "loss": 1.4592, + "step": 9715 + }, + { + "epoch": 0.5311974631986113, + "grad_norm": 1.3105058670043945, + "learning_rate": 9.959798247901668e-06, + "loss": 1.3872, + "step": 9716 + }, + { + "epoch": 0.5312521356423329, + "grad_norm": 1.6706513166427612, + "learning_rate": 9.957970906059453e-06, + "loss": 1.4317, + "step": 9717 + }, + { + "epoch": 0.5313068080860545, + "grad_norm": 1.8862605094909668, + "learning_rate": 9.956143565620684e-06, + "loss": 1.3232, + "step": 9718 + }, + { + "epoch": 0.5313614805297759, + "grad_norm": 1.4304081201553345, + "learning_rate": 9.954316226646389e-06, + "loss": 1.4145, + "step": 9719 + }, + { + "epoch": 0.5314161529734975, + "grad_norm": 1.3729381561279297, + "learning_rate": 9.952488889197579e-06, + "loss": 1.7219, + "step": 9720 + }, + { + "epoch": 0.5314708254172191, + "grad_norm": 1.334545612335205, + "learning_rate": 9.950661553335275e-06, + "loss": 1.1346, + "step": 9721 + }, + { + "epoch": 0.5315254978609406, + "grad_norm": 1.5704989433288574, + "learning_rate": 9.9488342191205e-06, + "loss": 1.5096, + "step": 9722 + }, + { + "epoch": 0.5315801703046622, + "grad_norm": 1.4326685667037964, + "learning_rate": 9.947006886614271e-06, + "loss": 1.2768, + "step": 9723 + }, + { + "epoch": 0.5316348427483838, + "grad_norm": 1.2203893661499023, + "learning_rate": 9.945179555877605e-06, + "loss": 1.3761, + "step": 9724 + }, + { + "epoch": 0.5316895151921053, + "grad_norm": 1.585809588432312, + "learning_rate": 9.943352226971524e-06, + "loss": 1.4258, + "step": 9725 + }, + { + "epoch": 0.5317441876358269, + "grad_norm": 1.6387892961502075, + "learning_rate": 9.941524899957045e-06, + "loss": 1.5574, + "step": 9726 + }, + { + "epoch": 0.5317988600795484, + "grad_norm": 1.376819133758545, + "learning_rate": 9.939697574895182e-06, + "loss": 1.3002, + "step": 9727 + }, + { + "epoch": 0.53185353252327, + "grad_norm": 1.5140960216522217, + "learning_rate": 9.937870251846963e-06, + "loss": 1.4736, + "step": 9728 + }, + { + "epoch": 0.5319082049669915, + "grad_norm": 1.8039451837539673, + "learning_rate": 9.936042930873403e-06, + "loss": 1.5529, + "step": 9729 + }, + { + "epoch": 0.531962877410713, + "grad_norm": 1.5179868936538696, + "learning_rate": 9.934215612035516e-06, + "loss": 1.4901, + "step": 9730 + }, + { + "epoch": 0.5320175498544346, + "grad_norm": 1.7182279825210571, + "learning_rate": 9.932388295394328e-06, + "loss": 1.3628, + "step": 9731 + }, + { + "epoch": 0.5320722222981562, + "grad_norm": 1.6253386735916138, + "learning_rate": 9.930560981010847e-06, + "loss": 1.3478, + "step": 9732 + }, + { + "epoch": 0.5321268947418777, + "grad_norm": 1.2777899503707886, + "learning_rate": 9.928733668946104e-06, + "loss": 1.4248, + "step": 9733 + }, + { + "epoch": 0.5321815671855993, + "grad_norm": 1.5823900699615479, + "learning_rate": 9.926906359261111e-06, + "loss": 1.6897, + "step": 9734 + }, + { + "epoch": 0.5322362396293209, + "grad_norm": 1.933713674545288, + "learning_rate": 9.92507905201688e-06, + "loss": 1.3557, + "step": 9735 + }, + { + "epoch": 0.5322909120730424, + "grad_norm": 1.629975438117981, + "learning_rate": 9.923251747274441e-06, + "loss": 1.5459, + "step": 9736 + }, + { + "epoch": 0.532345584516764, + "grad_norm": 1.5453829765319824, + "learning_rate": 9.921424445094806e-06, + "loss": 1.2969, + "step": 9737 + }, + { + "epoch": 0.5324002569604855, + "grad_norm": 1.7058913707733154, + "learning_rate": 9.91959714553899e-06, + "loss": 1.356, + "step": 9738 + }, + { + "epoch": 0.532454929404207, + "grad_norm": 1.3435431718826294, + "learning_rate": 9.917769848668018e-06, + "loss": 1.4196, + "step": 9739 + }, + { + "epoch": 0.5325096018479286, + "grad_norm": 1.8451967239379883, + "learning_rate": 9.915942554542902e-06, + "loss": 1.6447, + "step": 9740 + }, + { + "epoch": 0.5325642742916501, + "grad_norm": 1.4054551124572754, + "learning_rate": 9.914115263224658e-06, + "loss": 1.47, + "step": 9741 + }, + { + "epoch": 0.5326189467353717, + "grad_norm": 2.0126283168792725, + "learning_rate": 9.912287974774312e-06, + "loss": 1.6024, + "step": 9742 + }, + { + "epoch": 0.5326736191790933, + "grad_norm": 1.9159296751022339, + "learning_rate": 9.910460689252876e-06, + "loss": 1.6041, + "step": 9743 + }, + { + "epoch": 0.5327282916228148, + "grad_norm": 1.273247241973877, + "learning_rate": 9.908633406721364e-06, + "loss": 1.5459, + "step": 9744 + }, + { + "epoch": 0.5327829640665364, + "grad_norm": 1.725633978843689, + "learning_rate": 9.9068061272408e-06, + "loss": 1.2139, + "step": 9745 + }, + { + "epoch": 0.532837636510258, + "grad_norm": 1.440406322479248, + "learning_rate": 9.904978850872193e-06, + "loss": 1.3827, + "step": 9746 + }, + { + "epoch": 0.5328923089539794, + "grad_norm": 1.7019323110580444, + "learning_rate": 9.903151577676571e-06, + "loss": 1.2037, + "step": 9747 + }, + { + "epoch": 0.532946981397701, + "grad_norm": 1.4050278663635254, + "learning_rate": 9.901324307714944e-06, + "loss": 1.3675, + "step": 9748 + }, + { + "epoch": 0.5330016538414226, + "grad_norm": 1.9306384325027466, + "learning_rate": 9.899497041048329e-06, + "loss": 1.414, + "step": 9749 + }, + { + "epoch": 0.5330563262851441, + "grad_norm": 1.5524760484695435, + "learning_rate": 9.897669777737745e-06, + "loss": 1.3888, + "step": 9750 + }, + { + "epoch": 0.5331109987288657, + "grad_norm": 1.3557411432266235, + "learning_rate": 9.895842517844208e-06, + "loss": 1.457, + "step": 9751 + }, + { + "epoch": 0.5331656711725873, + "grad_norm": 1.2916865348815918, + "learning_rate": 9.894015261428728e-06, + "loss": 1.6916, + "step": 9752 + }, + { + "epoch": 0.5332203436163088, + "grad_norm": 1.668249487876892, + "learning_rate": 9.892188008552331e-06, + "loss": 1.36, + "step": 9753 + }, + { + "epoch": 0.5332750160600304, + "grad_norm": 1.70171320438385, + "learning_rate": 9.890360759276031e-06, + "loss": 1.5141, + "step": 9754 + }, + { + "epoch": 0.5333296885037518, + "grad_norm": 1.6062912940979004, + "learning_rate": 9.888533513660838e-06, + "loss": 1.5411, + "step": 9755 + }, + { + "epoch": 0.5333843609474734, + "grad_norm": 1.7040424346923828, + "learning_rate": 9.886706271767776e-06, + "loss": 1.3431, + "step": 9756 + }, + { + "epoch": 0.533439033391195, + "grad_norm": 1.6112462282180786, + "learning_rate": 9.884879033657859e-06, + "loss": 1.3919, + "step": 9757 + }, + { + "epoch": 0.5334937058349165, + "grad_norm": 1.342348575592041, + "learning_rate": 9.883051799392097e-06, + "loss": 1.2997, + "step": 9758 + }, + { + "epoch": 0.5335483782786381, + "grad_norm": 1.7382326126098633, + "learning_rate": 9.881224569031513e-06, + "loss": 1.5051, + "step": 9759 + }, + { + "epoch": 0.5336030507223597, + "grad_norm": 1.9332096576690674, + "learning_rate": 9.879397342637115e-06, + "loss": 1.3701, + "step": 9760 + }, + { + "epoch": 0.5336577231660812, + "grad_norm": 1.8102669715881348, + "learning_rate": 9.877570120269927e-06, + "loss": 1.4995, + "step": 9761 + }, + { + "epoch": 0.5337123956098028, + "grad_norm": 1.8019328117370605, + "learning_rate": 9.87574290199096e-06, + "loss": 1.4502, + "step": 9762 + }, + { + "epoch": 0.5337670680535244, + "grad_norm": 1.3983930349349976, + "learning_rate": 9.873915687861228e-06, + "loss": 1.383, + "step": 9763 + }, + { + "epoch": 0.5338217404972458, + "grad_norm": 1.578428030014038, + "learning_rate": 9.872088477941748e-06, + "loss": 1.4576, + "step": 9764 + }, + { + "epoch": 0.5338764129409674, + "grad_norm": 1.941852331161499, + "learning_rate": 9.870261272293533e-06, + "loss": 1.5462, + "step": 9765 + }, + { + "epoch": 0.533931085384689, + "grad_norm": 1.553046703338623, + "learning_rate": 9.868434070977597e-06, + "loss": 1.668, + "step": 9766 + }, + { + "epoch": 0.5339857578284105, + "grad_norm": 1.6409636735916138, + "learning_rate": 9.866606874054958e-06, + "loss": 1.3573, + "step": 9767 + }, + { + "epoch": 0.5340404302721321, + "grad_norm": 1.552938461303711, + "learning_rate": 9.86477968158663e-06, + "loss": 1.4411, + "step": 9768 + }, + { + "epoch": 0.5340951027158536, + "grad_norm": 1.565943956375122, + "learning_rate": 9.862952493633621e-06, + "loss": 1.529, + "step": 9769 + }, + { + "epoch": 0.5341497751595752, + "grad_norm": 1.6240252256393433, + "learning_rate": 9.861125310256955e-06, + "loss": 1.2042, + "step": 9770 + }, + { + "epoch": 0.5342044476032968, + "grad_norm": 1.4654937982559204, + "learning_rate": 9.859298131517639e-06, + "loss": 1.4601, + "step": 9771 + }, + { + "epoch": 0.5342591200470183, + "grad_norm": 1.4528652429580688, + "learning_rate": 9.857470957476685e-06, + "loss": 1.5377, + "step": 9772 + }, + { + "epoch": 0.5343137924907398, + "grad_norm": 1.3965117931365967, + "learning_rate": 9.855643788195113e-06, + "loss": 1.2228, + "step": 9773 + }, + { + "epoch": 0.5343684649344614, + "grad_norm": 1.7054089307785034, + "learning_rate": 9.853816623733931e-06, + "loss": 1.4451, + "step": 9774 + }, + { + "epoch": 0.5344231373781829, + "grad_norm": 1.4338940382003784, + "learning_rate": 9.85198946415416e-06, + "loss": 1.4358, + "step": 9775 + }, + { + "epoch": 0.5344778098219045, + "grad_norm": 1.8338929414749146, + "learning_rate": 9.850162309516807e-06, + "loss": 1.5361, + "step": 9776 + }, + { + "epoch": 0.5345324822656261, + "grad_norm": 1.1978380680084229, + "learning_rate": 9.848335159882884e-06, + "loss": 1.6153, + "step": 9777 + }, + { + "epoch": 0.5345871547093476, + "grad_norm": 1.3756400346755981, + "learning_rate": 9.846508015313407e-06, + "loss": 1.4014, + "step": 9778 + }, + { + "epoch": 0.5346418271530692, + "grad_norm": 1.5521599054336548, + "learning_rate": 9.844680875869389e-06, + "loss": 1.23, + "step": 9779 + }, + { + "epoch": 0.5346964995967908, + "grad_norm": 1.9317569732666016, + "learning_rate": 9.842853741611837e-06, + "loss": 1.6178, + "step": 9780 + }, + { + "epoch": 0.5347511720405123, + "grad_norm": 1.995390772819519, + "learning_rate": 9.84102661260177e-06, + "loss": 1.707, + "step": 9781 + }, + { + "epoch": 0.5348058444842338, + "grad_norm": 1.6168527603149414, + "learning_rate": 9.839199488900198e-06, + "loss": 1.2168, + "step": 9782 + }, + { + "epoch": 0.5348605169279554, + "grad_norm": 1.4536081552505493, + "learning_rate": 9.83737237056813e-06, + "loss": 1.4724, + "step": 9783 + }, + { + "epoch": 0.5349151893716769, + "grad_norm": 1.17406165599823, + "learning_rate": 9.835545257666585e-06, + "loss": 1.5271, + "step": 9784 + }, + { + "epoch": 0.5349698618153985, + "grad_norm": 1.6622037887573242, + "learning_rate": 9.833718150256567e-06, + "loss": 1.239, + "step": 9785 + }, + { + "epoch": 0.53502453425912, + "grad_norm": 1.4512279033660889, + "learning_rate": 9.831891048399087e-06, + "loss": 1.3879, + "step": 9786 + }, + { + "epoch": 0.5350792067028416, + "grad_norm": 1.773640751838684, + "learning_rate": 9.830063952155162e-06, + "loss": 1.3096, + "step": 9787 + }, + { + "epoch": 0.5351338791465632, + "grad_norm": 1.389129400253296, + "learning_rate": 9.8282368615858e-06, + "loss": 1.7029, + "step": 9788 + }, + { + "epoch": 0.5351885515902847, + "grad_norm": 1.3640127182006836, + "learning_rate": 9.826409776752014e-06, + "loss": 1.9486, + "step": 9789 + }, + { + "epoch": 0.5352432240340063, + "grad_norm": 1.3893760442733765, + "learning_rate": 9.824582697714813e-06, + "loss": 1.3922, + "step": 9790 + }, + { + "epoch": 0.5352978964777279, + "grad_norm": 1.445032000541687, + "learning_rate": 9.822755624535202e-06, + "loss": 1.4521, + "step": 9791 + }, + { + "epoch": 0.5353525689214493, + "grad_norm": 1.4022061824798584, + "learning_rate": 9.820928557274202e-06, + "loss": 1.3761, + "step": 9792 + }, + { + "epoch": 0.5354072413651709, + "grad_norm": 1.4106051921844482, + "learning_rate": 9.819101495992817e-06, + "loss": 1.551, + "step": 9793 + }, + { + "epoch": 0.5354619138088925, + "grad_norm": 1.4971446990966797, + "learning_rate": 9.817274440752053e-06, + "loss": 1.5739, + "step": 9794 + }, + { + "epoch": 0.535516586252614, + "grad_norm": 1.6292014122009277, + "learning_rate": 9.81544739161293e-06, + "loss": 1.5348, + "step": 9795 + }, + { + "epoch": 0.5355712586963356, + "grad_norm": 1.6676769256591797, + "learning_rate": 9.81362034863645e-06, + "loss": 1.678, + "step": 9796 + }, + { + "epoch": 0.5356259311400572, + "grad_norm": 2.9741790294647217, + "learning_rate": 9.811793311883624e-06, + "loss": 1.3908, + "step": 9797 + }, + { + "epoch": 0.5356806035837787, + "grad_norm": 1.6820611953735352, + "learning_rate": 9.809966281415461e-06, + "loss": 1.3815, + "step": 9798 + }, + { + "epoch": 0.5357352760275003, + "grad_norm": 1.4550679922103882, + "learning_rate": 9.808139257292971e-06, + "loss": 1.2627, + "step": 9799 + }, + { + "epoch": 0.5357899484712217, + "grad_norm": 1.8045669794082642, + "learning_rate": 9.806312239577156e-06, + "loss": 1.4032, + "step": 9800 + }, + { + "epoch": 0.5358446209149433, + "grad_norm": 1.3779762983322144, + "learning_rate": 9.804485228329035e-06, + "loss": 1.7685, + "step": 9801 + }, + { + "epoch": 0.5358992933586649, + "grad_norm": 1.557210087776184, + "learning_rate": 9.802658223609609e-06, + "loss": 1.2975, + "step": 9802 + }, + { + "epoch": 0.5359539658023864, + "grad_norm": 1.6167939901351929, + "learning_rate": 9.80083122547989e-06, + "loss": 1.489, + "step": 9803 + }, + { + "epoch": 0.536008638246108, + "grad_norm": 1.2344692945480347, + "learning_rate": 9.799004234000883e-06, + "loss": 1.6794, + "step": 9804 + }, + { + "epoch": 0.5360633106898296, + "grad_norm": 1.6912918090820312, + "learning_rate": 9.797177249233592e-06, + "loss": 1.4785, + "step": 9805 + }, + { + "epoch": 0.5361179831335511, + "grad_norm": 1.4997776746749878, + "learning_rate": 9.795350271239034e-06, + "loss": 1.629, + "step": 9806 + }, + { + "epoch": 0.5361726555772727, + "grad_norm": 2.047759771347046, + "learning_rate": 9.79352330007821e-06, + "loss": 1.2726, + "step": 9807 + }, + { + "epoch": 0.5362273280209943, + "grad_norm": 1.7667380571365356, + "learning_rate": 9.791696335812125e-06, + "loss": 1.1429, + "step": 9808 + }, + { + "epoch": 0.5362820004647157, + "grad_norm": 1.6177853345870972, + "learning_rate": 9.789869378501791e-06, + "loss": 1.4802, + "step": 9809 + }, + { + "epoch": 0.5363366729084373, + "grad_norm": 1.4040106534957886, + "learning_rate": 9.788042428208211e-06, + "loss": 1.5253, + "step": 9810 + }, + { + "epoch": 0.5363913453521589, + "grad_norm": 1.4512923955917358, + "learning_rate": 9.786215484992387e-06, + "loss": 1.4737, + "step": 9811 + }, + { + "epoch": 0.5364460177958804, + "grad_norm": 1.4883909225463867, + "learning_rate": 9.784388548915334e-06, + "loss": 1.3167, + "step": 9812 + }, + { + "epoch": 0.536500690239602, + "grad_norm": 1.6002912521362305, + "learning_rate": 9.782561620038055e-06, + "loss": 1.4715, + "step": 9813 + }, + { + "epoch": 0.5365553626833235, + "grad_norm": 2.457350969314575, + "learning_rate": 9.78073469842155e-06, + "loss": 1.6192, + "step": 9814 + }, + { + "epoch": 0.5366100351270451, + "grad_norm": 1.7071988582611084, + "learning_rate": 9.77890778412683e-06, + "loss": 1.3098, + "step": 9815 + }, + { + "epoch": 0.5366647075707667, + "grad_norm": 1.3505527973175049, + "learning_rate": 9.777080877214895e-06, + "loss": 1.409, + "step": 9816 + }, + { + "epoch": 0.5367193800144882, + "grad_norm": 1.501955509185791, + "learning_rate": 9.775253977746756e-06, + "loss": 1.411, + "step": 9817 + }, + { + "epoch": 0.5367740524582097, + "grad_norm": 1.3537901639938354, + "learning_rate": 9.773427085783413e-06, + "loss": 1.4146, + "step": 9818 + }, + { + "epoch": 0.5368287249019313, + "grad_norm": 1.451174020767212, + "learning_rate": 9.771600201385868e-06, + "loss": 1.4267, + "step": 9819 + }, + { + "epoch": 0.5368833973456528, + "grad_norm": 1.649912714958191, + "learning_rate": 9.769773324615133e-06, + "loss": 1.3311, + "step": 9820 + }, + { + "epoch": 0.5369380697893744, + "grad_norm": 1.635528564453125, + "learning_rate": 9.767946455532207e-06, + "loss": 1.7544, + "step": 9821 + }, + { + "epoch": 0.536992742233096, + "grad_norm": 1.3572907447814941, + "learning_rate": 9.76611959419809e-06, + "loss": 1.3974, + "step": 9822 + }, + { + "epoch": 0.5370474146768175, + "grad_norm": 1.5298848152160645, + "learning_rate": 9.764292740673792e-06, + "loss": 1.2337, + "step": 9823 + }, + { + "epoch": 0.5371020871205391, + "grad_norm": 1.6244536638259888, + "learning_rate": 9.762465895020312e-06, + "loss": 1.3906, + "step": 9824 + }, + { + "epoch": 0.5371567595642607, + "grad_norm": 1.4448363780975342, + "learning_rate": 9.76063905729865e-06, + "loss": 1.5081, + "step": 9825 + }, + { + "epoch": 0.5372114320079822, + "grad_norm": 1.5286569595336914, + "learning_rate": 9.758812227569813e-06, + "loss": 1.3999, + "step": 9826 + }, + { + "epoch": 0.5372661044517038, + "grad_norm": 1.7866313457489014, + "learning_rate": 9.756985405894802e-06, + "loss": 1.3776, + "step": 9827 + }, + { + "epoch": 0.5373207768954252, + "grad_norm": 1.5226333141326904, + "learning_rate": 9.755158592334619e-06, + "loss": 1.5786, + "step": 9828 + }, + { + "epoch": 0.5373754493391468, + "grad_norm": 1.38648521900177, + "learning_rate": 9.753331786950266e-06, + "loss": 1.8595, + "step": 9829 + }, + { + "epoch": 0.5374301217828684, + "grad_norm": 1.1605315208435059, + "learning_rate": 9.75150498980274e-06, + "loss": 1.6846, + "step": 9830 + }, + { + "epoch": 0.5374847942265899, + "grad_norm": 1.2986468076705933, + "learning_rate": 9.749678200953048e-06, + "loss": 1.3805, + "step": 9831 + }, + { + "epoch": 0.5375394666703115, + "grad_norm": 1.3178458213806152, + "learning_rate": 9.74785142046219e-06, + "loss": 1.3996, + "step": 9832 + }, + { + "epoch": 0.5375941391140331, + "grad_norm": 1.439375638961792, + "learning_rate": 9.746024648391162e-06, + "loss": 1.6689, + "step": 9833 + }, + { + "epoch": 0.5376488115577546, + "grad_norm": 1.4416362047195435, + "learning_rate": 9.744197884800968e-06, + "loss": 1.5459, + "step": 9834 + }, + { + "epoch": 0.5377034840014762, + "grad_norm": 1.5491634607315063, + "learning_rate": 9.742371129752607e-06, + "loss": 1.1568, + "step": 9835 + }, + { + "epoch": 0.5377581564451978, + "grad_norm": 1.8389750719070435, + "learning_rate": 9.740544383307077e-06, + "loss": 1.4664, + "step": 9836 + }, + { + "epoch": 0.5378128288889192, + "grad_norm": 1.2200466394424438, + "learning_rate": 9.738717645525381e-06, + "loss": 1.361, + "step": 9837 + }, + { + "epoch": 0.5378675013326408, + "grad_norm": 1.3692991733551025, + "learning_rate": 9.736890916468515e-06, + "loss": 1.5163, + "step": 9838 + }, + { + "epoch": 0.5379221737763624, + "grad_norm": 1.3805961608886719, + "learning_rate": 9.735064196197477e-06, + "loss": 1.4664, + "step": 9839 + }, + { + "epoch": 0.5379768462200839, + "grad_norm": 1.5408774614334106, + "learning_rate": 9.73323748477327e-06, + "loss": 1.6138, + "step": 9840 + }, + { + "epoch": 0.5380315186638055, + "grad_norm": 1.5873916149139404, + "learning_rate": 9.731410782256889e-06, + "loss": 1.6282, + "step": 9841 + }, + { + "epoch": 0.538086191107527, + "grad_norm": 1.70631742477417, + "learning_rate": 9.72958408870933e-06, + "loss": 1.3238, + "step": 9842 + }, + { + "epoch": 0.5381408635512486, + "grad_norm": 1.4780610799789429, + "learning_rate": 9.727757404191596e-06, + "loss": 1.4733, + "step": 9843 + }, + { + "epoch": 0.5381955359949702, + "grad_norm": 2.0977420806884766, + "learning_rate": 9.725930728764676e-06, + "loss": 1.689, + "step": 9844 + }, + { + "epoch": 0.5382502084386916, + "grad_norm": 1.5663225650787354, + "learning_rate": 9.724104062489576e-06, + "loss": 1.5073, + "step": 9845 + }, + { + "epoch": 0.5383048808824132, + "grad_norm": 1.819116234779358, + "learning_rate": 9.722277405427291e-06, + "loss": 1.4567, + "step": 9846 + }, + { + "epoch": 0.5383595533261348, + "grad_norm": 1.3799827098846436, + "learning_rate": 9.72045075763881e-06, + "loss": 1.4937, + "step": 9847 + }, + { + "epoch": 0.5384142257698563, + "grad_norm": 1.342206597328186, + "learning_rate": 9.718624119185138e-06, + "loss": 1.4737, + "step": 9848 + }, + { + "epoch": 0.5384688982135779, + "grad_norm": 1.6461100578308105, + "learning_rate": 9.716797490127268e-06, + "loss": 1.2198, + "step": 9849 + }, + { + "epoch": 0.5385235706572995, + "grad_norm": 1.8502929210662842, + "learning_rate": 9.714970870526188e-06, + "loss": 1.5031, + "step": 9850 + }, + { + "epoch": 0.538578243101021, + "grad_norm": 1.4941898584365845, + "learning_rate": 9.713144260442904e-06, + "loss": 1.4834, + "step": 9851 + }, + { + "epoch": 0.5386329155447426, + "grad_norm": 1.3805534839630127, + "learning_rate": 9.711317659938407e-06, + "loss": 1.4, + "step": 9852 + }, + { + "epoch": 0.5386875879884642, + "grad_norm": 1.8017457723617554, + "learning_rate": 9.709491069073688e-06, + "loss": 1.4508, + "step": 9853 + }, + { + "epoch": 0.5387422604321856, + "grad_norm": 2.189244031906128, + "learning_rate": 9.707664487909746e-06, + "loss": 1.1876, + "step": 9854 + }, + { + "epoch": 0.5387969328759072, + "grad_norm": 1.541770339012146, + "learning_rate": 9.705837916507575e-06, + "loss": 1.5152, + "step": 9855 + }, + { + "epoch": 0.5388516053196287, + "grad_norm": 1.4767976999282837, + "learning_rate": 9.70401135492816e-06, + "loss": 1.6847, + "step": 9856 + }, + { + "epoch": 0.5389062777633503, + "grad_norm": 1.8698142766952515, + "learning_rate": 9.702184803232506e-06, + "loss": 1.5221, + "step": 9857 + }, + { + "epoch": 0.5389609502070719, + "grad_norm": 1.664602518081665, + "learning_rate": 9.700358261481593e-06, + "loss": 1.539, + "step": 9858 + }, + { + "epoch": 0.5390156226507934, + "grad_norm": 1.3324512243270874, + "learning_rate": 9.698531729736426e-06, + "loss": 1.5391, + "step": 9859 + }, + { + "epoch": 0.539070295094515, + "grad_norm": 1.56949782371521, + "learning_rate": 9.696705208057994e-06, + "loss": 1.623, + "step": 9860 + }, + { + "epoch": 0.5391249675382366, + "grad_norm": 1.9749751091003418, + "learning_rate": 9.694878696507284e-06, + "loss": 1.3467, + "step": 9861 + }, + { + "epoch": 0.5391796399819581, + "grad_norm": 1.5432151556015015, + "learning_rate": 9.693052195145292e-06, + "loss": 1.2814, + "step": 9862 + }, + { + "epoch": 0.5392343124256797, + "grad_norm": 1.1073328256607056, + "learning_rate": 9.691225704033008e-06, + "loss": 1.5291, + "step": 9863 + }, + { + "epoch": 0.5392889848694012, + "grad_norm": 1.3551539182662964, + "learning_rate": 9.689399223231416e-06, + "loss": 1.3052, + "step": 9864 + }, + { + "epoch": 0.5393436573131227, + "grad_norm": 1.6577683687210083, + "learning_rate": 9.68757275280152e-06, + "loss": 1.3564, + "step": 9865 + }, + { + "epoch": 0.5393983297568443, + "grad_norm": 1.3698673248291016, + "learning_rate": 9.685746292804301e-06, + "loss": 1.2544, + "step": 9866 + }, + { + "epoch": 0.5394530022005659, + "grad_norm": 1.2798980474472046, + "learning_rate": 9.683919843300748e-06, + "loss": 1.2865, + "step": 9867 + }, + { + "epoch": 0.5395076746442874, + "grad_norm": 1.3038330078125, + "learning_rate": 9.682093404351856e-06, + "loss": 1.4122, + "step": 9868 + }, + { + "epoch": 0.539562347088009, + "grad_norm": 1.5197937488555908, + "learning_rate": 9.680266976018613e-06, + "loss": 1.4177, + "step": 9869 + }, + { + "epoch": 0.5396170195317305, + "grad_norm": 1.2395403385162354, + "learning_rate": 9.678440558362e-06, + "loss": 1.474, + "step": 9870 + }, + { + "epoch": 0.5396716919754521, + "grad_norm": 1.3783224821090698, + "learning_rate": 9.676614151443016e-06, + "loss": 1.5297, + "step": 9871 + }, + { + "epoch": 0.5397263644191737, + "grad_norm": 1.7004809379577637, + "learning_rate": 9.67478775532264e-06, + "loss": 1.4154, + "step": 9872 + }, + { + "epoch": 0.5397810368628951, + "grad_norm": 1.4676778316497803, + "learning_rate": 9.67296137006187e-06, + "loss": 1.2483, + "step": 9873 + }, + { + "epoch": 0.5398357093066167, + "grad_norm": 2.0822126865386963, + "learning_rate": 9.671134995721684e-06, + "loss": 1.5185, + "step": 9874 + }, + { + "epoch": 0.5398903817503383, + "grad_norm": 1.436127781867981, + "learning_rate": 9.66930863236307e-06, + "loss": 1.358, + "step": 9875 + }, + { + "epoch": 0.5399450541940598, + "grad_norm": 1.4557610750198364, + "learning_rate": 9.66748228004702e-06, + "loss": 1.4658, + "step": 9876 + }, + { + "epoch": 0.5399997266377814, + "grad_norm": 1.3849409818649292, + "learning_rate": 9.665655938834519e-06, + "loss": 1.3824, + "step": 9877 + }, + { + "epoch": 0.540054399081503, + "grad_norm": 1.7513957023620605, + "learning_rate": 9.663829608786543e-06, + "loss": 1.2417, + "step": 9878 + }, + { + "epoch": 0.5401090715252245, + "grad_norm": 1.5945099592208862, + "learning_rate": 9.662003289964092e-06, + "loss": 1.5283, + "step": 9879 + }, + { + "epoch": 0.5401637439689461, + "grad_norm": 1.6448783874511719, + "learning_rate": 9.660176982428144e-06, + "loss": 1.3833, + "step": 9880 + }, + { + "epoch": 0.5402184164126677, + "grad_norm": 1.3998781442642212, + "learning_rate": 9.658350686239682e-06, + "loss": 1.4712, + "step": 9881 + }, + { + "epoch": 0.5402730888563891, + "grad_norm": 1.7601697444915771, + "learning_rate": 9.656524401459692e-06, + "loss": 1.4561, + "step": 9882 + }, + { + "epoch": 0.5403277613001107, + "grad_norm": 1.4048689603805542, + "learning_rate": 9.654698128149162e-06, + "loss": 1.5619, + "step": 9883 + }, + { + "epoch": 0.5403824337438322, + "grad_norm": 1.5123807191848755, + "learning_rate": 9.652871866369064e-06, + "loss": 1.4897, + "step": 9884 + }, + { + "epoch": 0.5404371061875538, + "grad_norm": 1.5667579174041748, + "learning_rate": 9.651045616180395e-06, + "loss": 1.3308, + "step": 9885 + }, + { + "epoch": 0.5404917786312754, + "grad_norm": 1.2586894035339355, + "learning_rate": 9.64921937764413e-06, + "loss": 1.4222, + "step": 9886 + }, + { + "epoch": 0.5405464510749969, + "grad_norm": 1.4084380865097046, + "learning_rate": 9.647393150821254e-06, + "loss": 1.4968, + "step": 9887 + }, + { + "epoch": 0.5406011235187185, + "grad_norm": 1.5223960876464844, + "learning_rate": 9.645566935772749e-06, + "loss": 1.432, + "step": 9888 + }, + { + "epoch": 0.5406557959624401, + "grad_norm": 1.5818912982940674, + "learning_rate": 9.64374073255959e-06, + "loss": 1.4851, + "step": 9889 + }, + { + "epoch": 0.5407104684061615, + "grad_norm": 1.6756627559661865, + "learning_rate": 9.64191454124277e-06, + "loss": 1.3771, + "step": 9890 + }, + { + "epoch": 0.5407651408498831, + "grad_norm": 1.350911021232605, + "learning_rate": 9.640088361883263e-06, + "loss": 1.29, + "step": 9891 + }, + { + "epoch": 0.5408198132936047, + "grad_norm": 1.4773664474487305, + "learning_rate": 9.638262194542048e-06, + "loss": 1.651, + "step": 9892 + }, + { + "epoch": 0.5408744857373262, + "grad_norm": 2.0600461959838867, + "learning_rate": 9.636436039280111e-06, + "loss": 1.4118, + "step": 9893 + }, + { + "epoch": 0.5409291581810478, + "grad_norm": 2.123889923095703, + "learning_rate": 9.634609896158426e-06, + "loss": 1.4895, + "step": 9894 + }, + { + "epoch": 0.5409838306247694, + "grad_norm": 2.3042104244232178, + "learning_rate": 9.632783765237968e-06, + "loss": 1.316, + "step": 9895 + }, + { + "epoch": 0.5410385030684909, + "grad_norm": 1.537593960762024, + "learning_rate": 9.63095764657973e-06, + "loss": 1.4256, + "step": 9896 + }, + { + "epoch": 0.5410931755122125, + "grad_norm": 1.2897764444351196, + "learning_rate": 9.62913154024468e-06, + "loss": 1.3371, + "step": 9897 + }, + { + "epoch": 0.541147847955934, + "grad_norm": 1.5337392091751099, + "learning_rate": 9.627305446293793e-06, + "loss": 1.5513, + "step": 9898 + }, + { + "epoch": 0.5412025203996556, + "grad_norm": 1.3247371912002563, + "learning_rate": 9.625479364788058e-06, + "loss": 1.4419, + "step": 9899 + }, + { + "epoch": 0.5412571928433771, + "grad_norm": 1.8230891227722168, + "learning_rate": 9.623653295788442e-06, + "loss": 1.7742, + "step": 9900 + }, + { + "epoch": 0.5413118652870986, + "grad_norm": 1.3569060564041138, + "learning_rate": 9.62182723935593e-06, + "loss": 1.5833, + "step": 9901 + }, + { + "epoch": 0.5413665377308202, + "grad_norm": 1.2318618297576904, + "learning_rate": 9.62000119555149e-06, + "loss": 1.6612, + "step": 9902 + }, + { + "epoch": 0.5414212101745418, + "grad_norm": 1.3654797077178955, + "learning_rate": 9.6181751644361e-06, + "loss": 1.4087, + "step": 9903 + }, + { + "epoch": 0.5414758826182633, + "grad_norm": 1.5194675922393799, + "learning_rate": 9.61634914607074e-06, + "loss": 1.3924, + "step": 9904 + }, + { + "epoch": 0.5415305550619849, + "grad_norm": 2.1169989109039307, + "learning_rate": 9.614523140516385e-06, + "loss": 1.2295, + "step": 9905 + }, + { + "epoch": 0.5415852275057065, + "grad_norm": 1.7702832221984863, + "learning_rate": 9.612697147834004e-06, + "loss": 1.535, + "step": 9906 + }, + { + "epoch": 0.541639899949428, + "grad_norm": 1.5973674058914185, + "learning_rate": 9.610871168084575e-06, + "loss": 1.4264, + "step": 9907 + }, + { + "epoch": 0.5416945723931496, + "grad_norm": 1.516692876815796, + "learning_rate": 9.609045201329071e-06, + "loss": 1.5048, + "step": 9908 + }, + { + "epoch": 0.5417492448368711, + "grad_norm": 1.6780316829681396, + "learning_rate": 9.607219247628461e-06, + "loss": 1.3531, + "step": 9909 + }, + { + "epoch": 0.5418039172805926, + "grad_norm": 1.518998146057129, + "learning_rate": 9.605393307043726e-06, + "loss": 1.5447, + "step": 9910 + }, + { + "epoch": 0.5418585897243142, + "grad_norm": 1.616408348083496, + "learning_rate": 9.603567379635836e-06, + "loss": 1.6258, + "step": 9911 + }, + { + "epoch": 0.5419132621680357, + "grad_norm": 1.733818769454956, + "learning_rate": 9.601741465465759e-06, + "loss": 1.4348, + "step": 9912 + }, + { + "epoch": 0.5419679346117573, + "grad_norm": 1.366363763809204, + "learning_rate": 9.59991556459447e-06, + "loss": 1.5398, + "step": 9913 + }, + { + "epoch": 0.5420226070554789, + "grad_norm": 2.138065814971924, + "learning_rate": 9.598089677082934e-06, + "loss": 1.3571, + "step": 9914 + }, + { + "epoch": 0.5420772794992004, + "grad_norm": 1.2912752628326416, + "learning_rate": 9.596263802992134e-06, + "loss": 1.5048, + "step": 9915 + }, + { + "epoch": 0.542131951942922, + "grad_norm": 1.5994317531585693, + "learning_rate": 9.59443794238303e-06, + "loss": 1.3993, + "step": 9916 + }, + { + "epoch": 0.5421866243866436, + "grad_norm": 1.4501099586486816, + "learning_rate": 9.592612095316592e-06, + "loss": 1.4591, + "step": 9917 + }, + { + "epoch": 0.542241296830365, + "grad_norm": 1.4525755643844604, + "learning_rate": 9.590786261853798e-06, + "loss": 1.4132, + "step": 9918 + }, + { + "epoch": 0.5422959692740866, + "grad_norm": 1.6120487451553345, + "learning_rate": 9.588960442055609e-06, + "loss": 1.5349, + "step": 9919 + }, + { + "epoch": 0.5423506417178082, + "grad_norm": 1.6479032039642334, + "learning_rate": 9.587134635982992e-06, + "loss": 1.218, + "step": 9920 + }, + { + "epoch": 0.5424053141615297, + "grad_norm": 1.6028378009796143, + "learning_rate": 9.585308843696923e-06, + "loss": 1.4059, + "step": 9921 + }, + { + "epoch": 0.5424599866052513, + "grad_norm": 2.257596969604492, + "learning_rate": 9.583483065258363e-06, + "loss": 1.3545, + "step": 9922 + }, + { + "epoch": 0.5425146590489729, + "grad_norm": 1.2863298654556274, + "learning_rate": 9.581657300728278e-06, + "loss": 1.6343, + "step": 9923 + }, + { + "epoch": 0.5425693314926944, + "grad_norm": 1.4083772897720337, + "learning_rate": 9.57983155016764e-06, + "loss": 1.2373, + "step": 9924 + }, + { + "epoch": 0.542624003936416, + "grad_norm": 1.3469699621200562, + "learning_rate": 9.578005813637414e-06, + "loss": 1.4533, + "step": 9925 + }, + { + "epoch": 0.5426786763801374, + "grad_norm": 1.6844896078109741, + "learning_rate": 9.576180091198562e-06, + "loss": 1.4036, + "step": 9926 + }, + { + "epoch": 0.542733348823859, + "grad_norm": 2.0637497901916504, + "learning_rate": 9.574354382912052e-06, + "loss": 1.3266, + "step": 9927 + }, + { + "epoch": 0.5427880212675806, + "grad_norm": 1.0774093866348267, + "learning_rate": 9.572528688838845e-06, + "loss": 1.6006, + "step": 9928 + }, + { + "epoch": 0.5428426937113021, + "grad_norm": 1.5183085203170776, + "learning_rate": 9.570703009039911e-06, + "loss": 1.1623, + "step": 9929 + }, + { + "epoch": 0.5428973661550237, + "grad_norm": 1.4152848720550537, + "learning_rate": 9.568877343576212e-06, + "loss": 1.2486, + "step": 9930 + }, + { + "epoch": 0.5429520385987453, + "grad_norm": 1.7228657007217407, + "learning_rate": 9.56705169250871e-06, + "loss": 1.4742, + "step": 9931 + }, + { + "epoch": 0.5430067110424668, + "grad_norm": 1.7053272724151611, + "learning_rate": 9.565226055898366e-06, + "loss": 1.5124, + "step": 9932 + }, + { + "epoch": 0.5430613834861884, + "grad_norm": 1.7557988166809082, + "learning_rate": 9.563400433806147e-06, + "loss": 1.2409, + "step": 9933 + }, + { + "epoch": 0.54311605592991, + "grad_norm": 1.8169937133789062, + "learning_rate": 9.561574826293006e-06, + "loss": 1.2112, + "step": 9934 + }, + { + "epoch": 0.5431707283736315, + "grad_norm": 1.861846923828125, + "learning_rate": 9.559749233419915e-06, + "loss": 1.4543, + "step": 9935 + }, + { + "epoch": 0.543225400817353, + "grad_norm": 1.4544098377227783, + "learning_rate": 9.557923655247832e-06, + "loss": 1.2753, + "step": 9936 + }, + { + "epoch": 0.5432800732610746, + "grad_norm": 1.365883231163025, + "learning_rate": 9.55609809183771e-06, + "loss": 1.2018, + "step": 9937 + }, + { + "epoch": 0.5433347457047961, + "grad_norm": 1.3079043626785278, + "learning_rate": 9.554272543250518e-06, + "loss": 1.4549, + "step": 9938 + }, + { + "epoch": 0.5433894181485177, + "grad_norm": 1.5627719163894653, + "learning_rate": 9.552447009547214e-06, + "loss": 1.5654, + "step": 9939 + }, + { + "epoch": 0.5434440905922392, + "grad_norm": 1.6057878732681274, + "learning_rate": 9.550621490788749e-06, + "loss": 1.2842, + "step": 9940 + }, + { + "epoch": 0.5434987630359608, + "grad_norm": 2.235243082046509, + "learning_rate": 9.548795987036091e-06, + "loss": 1.168, + "step": 9941 + }, + { + "epoch": 0.5435534354796824, + "grad_norm": 1.9071176052093506, + "learning_rate": 9.54697049835019e-06, + "loss": 1.4499, + "step": 9942 + }, + { + "epoch": 0.5436081079234039, + "grad_norm": 1.3748409748077393, + "learning_rate": 9.545145024792012e-06, + "loss": 1.5998, + "step": 9943 + }, + { + "epoch": 0.5436627803671255, + "grad_norm": 1.7194459438323975, + "learning_rate": 9.543319566422507e-06, + "loss": 1.1939, + "step": 9944 + }, + { + "epoch": 0.543717452810847, + "grad_norm": 1.42312490940094, + "learning_rate": 9.541494123302632e-06, + "loss": 1.4406, + "step": 9945 + }, + { + "epoch": 0.5437721252545685, + "grad_norm": 1.583838939666748, + "learning_rate": 9.539668695493344e-06, + "loss": 1.3388, + "step": 9946 + }, + { + "epoch": 0.5438267976982901, + "grad_norm": 1.7054904699325562, + "learning_rate": 9.537843283055602e-06, + "loss": 1.4397, + "step": 9947 + }, + { + "epoch": 0.5438814701420117, + "grad_norm": 1.6049840450286865, + "learning_rate": 9.536017886050352e-06, + "loss": 1.4608, + "step": 9948 + }, + { + "epoch": 0.5439361425857332, + "grad_norm": 1.519769310951233, + "learning_rate": 9.534192504538557e-06, + "loss": 1.4822, + "step": 9949 + }, + { + "epoch": 0.5439908150294548, + "grad_norm": 1.3170417547225952, + "learning_rate": 9.532367138581168e-06, + "loss": 1.3843, + "step": 9950 + }, + { + "epoch": 0.5440454874731764, + "grad_norm": 1.3392893075942993, + "learning_rate": 9.530541788239135e-06, + "loss": 1.6185, + "step": 9951 + }, + { + "epoch": 0.5441001599168979, + "grad_norm": 2.059736728668213, + "learning_rate": 9.528716453573415e-06, + "loss": 1.4526, + "step": 9952 + }, + { + "epoch": 0.5441548323606195, + "grad_norm": 1.6588749885559082, + "learning_rate": 9.52689113464496e-06, + "loss": 1.3301, + "step": 9953 + }, + { + "epoch": 0.5442095048043409, + "grad_norm": 2.1501340866088867, + "learning_rate": 9.525065831514715e-06, + "loss": 1.4691, + "step": 9954 + }, + { + "epoch": 0.5442641772480625, + "grad_norm": 1.7150287628173828, + "learning_rate": 9.52324054424364e-06, + "loss": 1.4585, + "step": 9955 + }, + { + "epoch": 0.5443188496917841, + "grad_norm": 1.738078236579895, + "learning_rate": 9.521415272892678e-06, + "loss": 1.4681, + "step": 9956 + }, + { + "epoch": 0.5443735221355056, + "grad_norm": 1.3943411111831665, + "learning_rate": 9.519590017522788e-06, + "loss": 1.4757, + "step": 9957 + }, + { + "epoch": 0.5444281945792272, + "grad_norm": 1.6687003374099731, + "learning_rate": 9.517764778194915e-06, + "loss": 1.333, + "step": 9958 + }, + { + "epoch": 0.5444828670229488, + "grad_norm": 1.168666958808899, + "learning_rate": 9.515939554970005e-06, + "loss": 1.3278, + "step": 9959 + }, + { + "epoch": 0.5445375394666703, + "grad_norm": 1.3053704500198364, + "learning_rate": 9.514114347909011e-06, + "loss": 1.2422, + "step": 9960 + }, + { + "epoch": 0.5445922119103919, + "grad_norm": 1.638789415359497, + "learning_rate": 9.512289157072879e-06, + "loss": 1.3211, + "step": 9961 + }, + { + "epoch": 0.5446468843541135, + "grad_norm": 1.324152946472168, + "learning_rate": 9.510463982522554e-06, + "loss": 1.55, + "step": 9962 + }, + { + "epoch": 0.5447015567978349, + "grad_norm": 1.5191373825073242, + "learning_rate": 9.508638824318988e-06, + "loss": 1.4078, + "step": 9963 + }, + { + "epoch": 0.5447562292415565, + "grad_norm": 1.4426108598709106, + "learning_rate": 9.506813682523124e-06, + "loss": 1.5057, + "step": 9964 + }, + { + "epoch": 0.5448109016852781, + "grad_norm": 1.5574780702590942, + "learning_rate": 9.504988557195906e-06, + "loss": 1.5813, + "step": 9965 + }, + { + "epoch": 0.5448655741289996, + "grad_norm": 1.5259788036346436, + "learning_rate": 9.503163448398286e-06, + "loss": 1.3561, + "step": 9966 + }, + { + "epoch": 0.5449202465727212, + "grad_norm": 1.4051456451416016, + "learning_rate": 9.501338356191204e-06, + "loss": 1.3039, + "step": 9967 + }, + { + "epoch": 0.5449749190164427, + "grad_norm": 1.5395984649658203, + "learning_rate": 9.499513280635598e-06, + "loss": 1.2435, + "step": 9968 + }, + { + "epoch": 0.5450295914601643, + "grad_norm": 1.492140531539917, + "learning_rate": 9.497688221792424e-06, + "loss": 1.6737, + "step": 9969 + }, + { + "epoch": 0.5450842639038859, + "grad_norm": 1.3422446250915527, + "learning_rate": 9.495863179722616e-06, + "loss": 1.4824, + "step": 9970 + }, + { + "epoch": 0.5451389363476074, + "grad_norm": 1.3023433685302734, + "learning_rate": 9.494038154487124e-06, + "loss": 1.5493, + "step": 9971 + }, + { + "epoch": 0.5451936087913289, + "grad_norm": 1.4466755390167236, + "learning_rate": 9.492213146146883e-06, + "loss": 1.4354, + "step": 9972 + }, + { + "epoch": 0.5452482812350505, + "grad_norm": 1.2619271278381348, + "learning_rate": 9.490388154762832e-06, + "loss": 1.2735, + "step": 9973 + }, + { + "epoch": 0.545302953678772, + "grad_norm": 1.5536260604858398, + "learning_rate": 9.488563180395922e-06, + "loss": 1.3136, + "step": 9974 + }, + { + "epoch": 0.5453576261224936, + "grad_norm": 1.2304331064224243, + "learning_rate": 9.486738223107087e-06, + "loss": 1.5072, + "step": 9975 + }, + { + "epoch": 0.5454122985662152, + "grad_norm": 1.644006609916687, + "learning_rate": 9.484913282957262e-06, + "loss": 1.184, + "step": 9976 + }, + { + "epoch": 0.5454669710099367, + "grad_norm": 1.4919102191925049, + "learning_rate": 9.483088360007396e-06, + "loss": 1.3109, + "step": 9977 + }, + { + "epoch": 0.5455216434536583, + "grad_norm": 1.2410218715667725, + "learning_rate": 9.481263454318423e-06, + "loss": 1.4664, + "step": 9978 + }, + { + "epoch": 0.5455763158973799, + "grad_norm": 1.779821753501892, + "learning_rate": 9.479438565951278e-06, + "loss": 1.3369, + "step": 9979 + }, + { + "epoch": 0.5456309883411014, + "grad_norm": 1.662366271018982, + "learning_rate": 9.477613694966902e-06, + "loss": 1.4137, + "step": 9980 + }, + { + "epoch": 0.545685660784823, + "grad_norm": 1.4031243324279785, + "learning_rate": 9.475788841426232e-06, + "loss": 1.4331, + "step": 9981 + }, + { + "epoch": 0.5457403332285444, + "grad_norm": 1.716838002204895, + "learning_rate": 9.473964005390198e-06, + "loss": 1.2865, + "step": 9982 + }, + { + "epoch": 0.545795005672266, + "grad_norm": 1.866847038269043, + "learning_rate": 9.472139186919745e-06, + "loss": 1.4942, + "step": 9983 + }, + { + "epoch": 0.5458496781159876, + "grad_norm": 1.760635256767273, + "learning_rate": 9.470314386075801e-06, + "loss": 1.3803, + "step": 9984 + }, + { + "epoch": 0.5459043505597091, + "grad_norm": 1.2243618965148926, + "learning_rate": 9.468489602919305e-06, + "loss": 1.6136, + "step": 9985 + }, + { + "epoch": 0.5459590230034307, + "grad_norm": 1.747759461402893, + "learning_rate": 9.466664837511188e-06, + "loss": 1.5537, + "step": 9986 + }, + { + "epoch": 0.5460136954471523, + "grad_norm": 1.6407601833343506, + "learning_rate": 9.464840089912379e-06, + "loss": 1.6333, + "step": 9987 + }, + { + "epoch": 0.5460683678908738, + "grad_norm": 1.4774335622787476, + "learning_rate": 9.463015360183819e-06, + "loss": 1.2518, + "step": 9988 + }, + { + "epoch": 0.5461230403345954, + "grad_norm": 1.4473100900650024, + "learning_rate": 9.461190648386436e-06, + "loss": 1.5936, + "step": 9989 + }, + { + "epoch": 0.546177712778317, + "grad_norm": 1.4302263259887695, + "learning_rate": 9.459365954581162e-06, + "loss": 1.5236, + "step": 9990 + }, + { + "epoch": 0.5462323852220384, + "grad_norm": 1.2944750785827637, + "learning_rate": 9.457541278828927e-06, + "loss": 1.5292, + "step": 9991 + }, + { + "epoch": 0.54628705766576, + "grad_norm": 1.423378825187683, + "learning_rate": 9.455716621190662e-06, + "loss": 1.6129, + "step": 9992 + }, + { + "epoch": 0.5463417301094816, + "grad_norm": 1.3946176767349243, + "learning_rate": 9.453891981727293e-06, + "loss": 1.5598, + "step": 9993 + }, + { + "epoch": 0.5463964025532031, + "grad_norm": 1.7160550355911255, + "learning_rate": 9.452067360499755e-06, + "loss": 1.199, + "step": 9994 + }, + { + "epoch": 0.5464510749969247, + "grad_norm": 1.7264131307601929, + "learning_rate": 9.450242757568975e-06, + "loss": 1.5048, + "step": 9995 + }, + { + "epoch": 0.5465057474406463, + "grad_norm": 1.554206371307373, + "learning_rate": 9.448418172995875e-06, + "loss": 1.3704, + "step": 9996 + }, + { + "epoch": 0.5465604198843678, + "grad_norm": 1.4094318151474, + "learning_rate": 9.44659360684139e-06, + "loss": 1.3893, + "step": 9997 + }, + { + "epoch": 0.5466150923280894, + "grad_norm": 1.4963099956512451, + "learning_rate": 9.44476905916644e-06, + "loss": 1.5493, + "step": 9998 + }, + { + "epoch": 0.5466697647718108, + "grad_norm": 1.3465471267700195, + "learning_rate": 9.442944530031957e-06, + "loss": 1.5265, + "step": 9999 + }, + { + "epoch": 0.5467244372155324, + "grad_norm": 1.6988276243209839, + "learning_rate": 9.441120019498864e-06, + "loss": 1.5066, + "step": 10000 + }, + { + "epoch": 0.546779109659254, + "grad_norm": 1.2496709823608398, + "learning_rate": 9.439295527628083e-06, + "loss": 1.4902, + "step": 10001 + }, + { + "epoch": 0.5468337821029755, + "grad_norm": 1.5289196968078613, + "learning_rate": 9.43747105448054e-06, + "loss": 1.5639, + "step": 10002 + }, + { + "epoch": 0.5468884545466971, + "grad_norm": 1.3632268905639648, + "learning_rate": 9.43564660011716e-06, + "loss": 1.4902, + "step": 10003 + }, + { + "epoch": 0.5469431269904187, + "grad_norm": 2.424450397491455, + "learning_rate": 9.433822164598862e-06, + "loss": 1.566, + "step": 10004 + }, + { + "epoch": 0.5469977994341402, + "grad_norm": 1.498538613319397, + "learning_rate": 9.431997747986575e-06, + "loss": 1.3287, + "step": 10005 + }, + { + "epoch": 0.5470524718778618, + "grad_norm": 2.1451761722564697, + "learning_rate": 9.430173350341214e-06, + "loss": 1.2901, + "step": 10006 + }, + { + "epoch": 0.5471071443215834, + "grad_norm": 1.3585021495819092, + "learning_rate": 9.428348971723697e-06, + "loss": 1.654, + "step": 10007 + }, + { + "epoch": 0.5471618167653048, + "grad_norm": 1.3656519651412964, + "learning_rate": 9.426524612194954e-06, + "loss": 1.2739, + "step": 10008 + }, + { + "epoch": 0.5472164892090264, + "grad_norm": 1.569416880607605, + "learning_rate": 9.424700271815901e-06, + "loss": 1.1565, + "step": 10009 + }, + { + "epoch": 0.547271161652748, + "grad_norm": 1.5444669723510742, + "learning_rate": 9.422875950647453e-06, + "loss": 1.5413, + "step": 10010 + }, + { + "epoch": 0.5473258340964695, + "grad_norm": 1.5612009763717651, + "learning_rate": 9.421051648750533e-06, + "loss": 1.3431, + "step": 10011 + }, + { + "epoch": 0.5473805065401911, + "grad_norm": 2.2729053497314453, + "learning_rate": 9.419227366186058e-06, + "loss": 1.4959, + "step": 10012 + }, + { + "epoch": 0.5474351789839126, + "grad_norm": 1.4924767017364502, + "learning_rate": 9.41740310301494e-06, + "loss": 1.0964, + "step": 10013 + }, + { + "epoch": 0.5474898514276342, + "grad_norm": 1.6046911478042603, + "learning_rate": 9.415578859298103e-06, + "loss": 1.4429, + "step": 10014 + }, + { + "epoch": 0.5475445238713558, + "grad_norm": 1.7328113317489624, + "learning_rate": 9.413754635096454e-06, + "loss": 1.2643, + "step": 10015 + }, + { + "epoch": 0.5475991963150773, + "grad_norm": 1.499720811843872, + "learning_rate": 9.41193043047092e-06, + "loss": 1.4552, + "step": 10016 + }, + { + "epoch": 0.5476538687587988, + "grad_norm": 1.2517907619476318, + "learning_rate": 9.410106245482406e-06, + "loss": 1.4123, + "step": 10017 + }, + { + "epoch": 0.5477085412025204, + "grad_norm": 1.6901730298995972, + "learning_rate": 9.408282080191828e-06, + "loss": 1.5782, + "step": 10018 + }, + { + "epoch": 0.5477632136462419, + "grad_norm": 1.4640432596206665, + "learning_rate": 9.406457934660103e-06, + "loss": 1.6807, + "step": 10019 + }, + { + "epoch": 0.5478178860899635, + "grad_norm": 1.6362425088882446, + "learning_rate": 9.404633808948139e-06, + "loss": 1.3957, + "step": 10020 + }, + { + "epoch": 0.5478725585336851, + "grad_norm": 1.9374054670333862, + "learning_rate": 9.402809703116846e-06, + "loss": 1.5285, + "step": 10021 + }, + { + "epoch": 0.5479272309774066, + "grad_norm": 1.6048859357833862, + "learning_rate": 9.400985617227141e-06, + "loss": 1.666, + "step": 10022 + }, + { + "epoch": 0.5479819034211282, + "grad_norm": 2.1010022163391113, + "learning_rate": 9.399161551339933e-06, + "loss": 1.3946, + "step": 10023 + }, + { + "epoch": 0.5480365758648498, + "grad_norm": 1.7181308269500732, + "learning_rate": 9.397337505516129e-06, + "loss": 1.6129, + "step": 10024 + }, + { + "epoch": 0.5480912483085713, + "grad_norm": 1.3532253503799438, + "learning_rate": 9.395513479816642e-06, + "loss": 1.6164, + "step": 10025 + }, + { + "epoch": 0.5481459207522928, + "grad_norm": 1.5088067054748535, + "learning_rate": 9.393689474302378e-06, + "loss": 1.5049, + "step": 10026 + }, + { + "epoch": 0.5482005931960143, + "grad_norm": 1.4285953044891357, + "learning_rate": 9.39186548903424e-06, + "loss": 1.3564, + "step": 10027 + }, + { + "epoch": 0.5482552656397359, + "grad_norm": 1.4352225065231323, + "learning_rate": 9.390041524073146e-06, + "loss": 1.3663, + "step": 10028 + }, + { + "epoch": 0.5483099380834575, + "grad_norm": 1.148505687713623, + "learning_rate": 9.388217579479994e-06, + "loss": 1.5016, + "step": 10029 + }, + { + "epoch": 0.548364610527179, + "grad_norm": 1.526169776916504, + "learning_rate": 9.386393655315696e-06, + "loss": 1.494, + "step": 10030 + }, + { + "epoch": 0.5484192829709006, + "grad_norm": 1.7490122318267822, + "learning_rate": 9.384569751641152e-06, + "loss": 1.3538, + "step": 10031 + }, + { + "epoch": 0.5484739554146222, + "grad_norm": 1.7083041667938232, + "learning_rate": 9.382745868517263e-06, + "loss": 1.4414, + "step": 10032 + }, + { + "epoch": 0.5485286278583437, + "grad_norm": 1.4718773365020752, + "learning_rate": 9.380922006004944e-06, + "loss": 1.8518, + "step": 10033 + }, + { + "epoch": 0.5485833003020653, + "grad_norm": 1.3072452545166016, + "learning_rate": 9.379098164165093e-06, + "loss": 1.5978, + "step": 10034 + }, + { + "epoch": 0.5486379727457869, + "grad_norm": 1.5105639696121216, + "learning_rate": 9.377274343058604e-06, + "loss": 1.4237, + "step": 10035 + }, + { + "epoch": 0.5486926451895083, + "grad_norm": 2.1749866008758545, + "learning_rate": 9.375450542746393e-06, + "loss": 1.4648, + "step": 10036 + }, + { + "epoch": 0.5487473176332299, + "grad_norm": 1.541966199874878, + "learning_rate": 9.373626763289352e-06, + "loss": 1.4249, + "step": 10037 + }, + { + "epoch": 0.5488019900769515, + "grad_norm": 1.2818036079406738, + "learning_rate": 9.371803004748383e-06, + "loss": 1.501, + "step": 10038 + }, + { + "epoch": 0.548856662520673, + "grad_norm": 1.8975107669830322, + "learning_rate": 9.369979267184386e-06, + "loss": 1.4235, + "step": 10039 + }, + { + "epoch": 0.5489113349643946, + "grad_norm": 1.4275468587875366, + "learning_rate": 9.36815555065826e-06, + "loss": 1.5683, + "step": 10040 + }, + { + "epoch": 0.5489660074081161, + "grad_norm": 1.5843795537948608, + "learning_rate": 9.366331855230898e-06, + "loss": 1.3679, + "step": 10041 + }, + { + "epoch": 0.5490206798518377, + "grad_norm": 1.8491766452789307, + "learning_rate": 9.364508180963209e-06, + "loss": 1.3855, + "step": 10042 + }, + { + "epoch": 0.5490753522955593, + "grad_norm": 1.7548617124557495, + "learning_rate": 9.36268452791608e-06, + "loss": 1.2208, + "step": 10043 + }, + { + "epoch": 0.5491300247392807, + "grad_norm": 1.733811855316162, + "learning_rate": 9.36086089615041e-06, + "loss": 1.4756, + "step": 10044 + }, + { + "epoch": 0.5491846971830023, + "grad_norm": 2.1260366439819336, + "learning_rate": 9.359037285727097e-06, + "loss": 1.4334, + "step": 10045 + }, + { + "epoch": 0.5492393696267239, + "grad_norm": 1.254266381263733, + "learning_rate": 9.357213696707028e-06, + "loss": 1.6417, + "step": 10046 + }, + { + "epoch": 0.5492940420704454, + "grad_norm": 1.3839796781539917, + "learning_rate": 9.355390129151106e-06, + "loss": 1.6008, + "step": 10047 + }, + { + "epoch": 0.549348714514167, + "grad_norm": 1.6633020639419556, + "learning_rate": 9.35356658312022e-06, + "loss": 1.4753, + "step": 10048 + }, + { + "epoch": 0.5494033869578886, + "grad_norm": 1.9708828926086426, + "learning_rate": 9.351743058675263e-06, + "loss": 1.3944, + "step": 10049 + }, + { + "epoch": 0.5494580594016101, + "grad_norm": 1.6931321620941162, + "learning_rate": 9.349919555877125e-06, + "loss": 1.4065, + "step": 10050 + }, + { + "epoch": 0.5495127318453317, + "grad_norm": 1.5086336135864258, + "learning_rate": 9.3480960747867e-06, + "loss": 1.5228, + "step": 10051 + }, + { + "epoch": 0.5495674042890533, + "grad_norm": 1.4714820384979248, + "learning_rate": 9.346272615464874e-06, + "loss": 1.3328, + "step": 10052 + }, + { + "epoch": 0.5496220767327747, + "grad_norm": 1.4338970184326172, + "learning_rate": 9.344449177972541e-06, + "loss": 1.4653, + "step": 10053 + }, + { + "epoch": 0.5496767491764963, + "grad_norm": 1.6290528774261475, + "learning_rate": 9.342625762370589e-06, + "loss": 1.2835, + "step": 10054 + }, + { + "epoch": 0.5497314216202178, + "grad_norm": 1.9533792734146118, + "learning_rate": 9.340802368719904e-06, + "loss": 1.3808, + "step": 10055 + }, + { + "epoch": 0.5497860940639394, + "grad_norm": 2.1422314643859863, + "learning_rate": 9.338978997081378e-06, + "loss": 1.4825, + "step": 10056 + }, + { + "epoch": 0.549840766507661, + "grad_norm": 1.4477638006210327, + "learning_rate": 9.337155647515888e-06, + "loss": 1.3868, + "step": 10057 + }, + { + "epoch": 0.5498954389513825, + "grad_norm": 1.5821337699890137, + "learning_rate": 9.335332320084331e-06, + "loss": 1.5771, + "step": 10058 + }, + { + "epoch": 0.5499501113951041, + "grad_norm": 1.3861593008041382, + "learning_rate": 9.333509014847589e-06, + "loss": 1.6284, + "step": 10059 + }, + { + "epoch": 0.5500047838388257, + "grad_norm": 1.3135277032852173, + "learning_rate": 9.33168573186654e-06, + "loss": 1.4388, + "step": 10060 + }, + { + "epoch": 0.5500594562825472, + "grad_norm": 1.5660452842712402, + "learning_rate": 9.329862471202075e-06, + "loss": 1.4384, + "step": 10061 + }, + { + "epoch": 0.5501141287262687, + "grad_norm": 1.6961630582809448, + "learning_rate": 9.328039232915076e-06, + "loss": 1.3513, + "step": 10062 + }, + { + "epoch": 0.5501688011699903, + "grad_norm": 1.8006824254989624, + "learning_rate": 9.326216017066422e-06, + "loss": 1.3096, + "step": 10063 + }, + { + "epoch": 0.5502234736137118, + "grad_norm": 1.469758152961731, + "learning_rate": 9.324392823717e-06, + "loss": 1.6689, + "step": 10064 + }, + { + "epoch": 0.5502781460574334, + "grad_norm": 1.7302383184432983, + "learning_rate": 9.322569652927685e-06, + "loss": 1.5938, + "step": 10065 + }, + { + "epoch": 0.550332818501155, + "grad_norm": 1.604556918144226, + "learning_rate": 9.320746504759355e-06, + "loss": 1.2413, + "step": 10066 + }, + { + "epoch": 0.5503874909448765, + "grad_norm": 1.5110808610916138, + "learning_rate": 9.318923379272898e-06, + "loss": 1.394, + "step": 10067 + }, + { + "epoch": 0.5504421633885981, + "grad_norm": 1.8510184288024902, + "learning_rate": 9.317100276529187e-06, + "loss": 1.5131, + "step": 10068 + }, + { + "epoch": 0.5504968358323196, + "grad_norm": 1.4563170671463013, + "learning_rate": 9.315277196589097e-06, + "loss": 1.4984, + "step": 10069 + }, + { + "epoch": 0.5505515082760412, + "grad_norm": 1.4118921756744385, + "learning_rate": 9.313454139513512e-06, + "loss": 1.4559, + "step": 10070 + }, + { + "epoch": 0.5506061807197628, + "grad_norm": 1.1666715145111084, + "learning_rate": 9.3116311053633e-06, + "loss": 1.4788, + "step": 10071 + }, + { + "epoch": 0.5506608531634842, + "grad_norm": 1.7629228830337524, + "learning_rate": 9.309808094199343e-06, + "loss": 1.4665, + "step": 10072 + }, + { + "epoch": 0.5507155256072058, + "grad_norm": 1.262775182723999, + "learning_rate": 9.307985106082515e-06, + "loss": 1.4469, + "step": 10073 + }, + { + "epoch": 0.5507701980509274, + "grad_norm": 1.6247221231460571, + "learning_rate": 9.306162141073687e-06, + "loss": 1.4769, + "step": 10074 + }, + { + "epoch": 0.5508248704946489, + "grad_norm": 2.1001856327056885, + "learning_rate": 9.304339199233733e-06, + "loss": 1.8269, + "step": 10075 + }, + { + "epoch": 0.5508795429383705, + "grad_norm": 1.7491881847381592, + "learning_rate": 9.302516280623526e-06, + "loss": 1.2922, + "step": 10076 + }, + { + "epoch": 0.5509342153820921, + "grad_norm": 1.5659741163253784, + "learning_rate": 9.300693385303934e-06, + "loss": 1.555, + "step": 10077 + }, + { + "epoch": 0.5509888878258136, + "grad_norm": 1.8172404766082764, + "learning_rate": 9.298870513335835e-06, + "loss": 1.5967, + "step": 10078 + }, + { + "epoch": 0.5510435602695352, + "grad_norm": 1.8433127403259277, + "learning_rate": 9.297047664780093e-06, + "loss": 1.6575, + "step": 10079 + }, + { + "epoch": 0.5510982327132568, + "grad_norm": 1.696885347366333, + "learning_rate": 9.295224839697577e-06, + "loss": 1.4391, + "step": 10080 + }, + { + "epoch": 0.5511529051569782, + "grad_norm": 1.622254490852356, + "learning_rate": 9.293402038149161e-06, + "loss": 1.5251, + "step": 10081 + }, + { + "epoch": 0.5512075776006998, + "grad_norm": 1.7143075466156006, + "learning_rate": 9.29157926019571e-06, + "loss": 1.5167, + "step": 10082 + }, + { + "epoch": 0.5512622500444213, + "grad_norm": 1.3580149412155151, + "learning_rate": 9.289756505898085e-06, + "loss": 1.576, + "step": 10083 + }, + { + "epoch": 0.5513169224881429, + "grad_norm": 1.2141075134277344, + "learning_rate": 9.287933775317161e-06, + "loss": 1.6853, + "step": 10084 + }, + { + "epoch": 0.5513715949318645, + "grad_norm": 1.7086660861968994, + "learning_rate": 9.286111068513794e-06, + "loss": 1.5545, + "step": 10085 + }, + { + "epoch": 0.551426267375586, + "grad_norm": 1.2870140075683594, + "learning_rate": 9.284288385548858e-06, + "loss": 1.6205, + "step": 10086 + }, + { + "epoch": 0.5514809398193076, + "grad_norm": 1.8283092975616455, + "learning_rate": 9.282465726483214e-06, + "loss": 1.3784, + "step": 10087 + }, + { + "epoch": 0.5515356122630292, + "grad_norm": 1.4171972274780273, + "learning_rate": 9.28064309137772e-06, + "loss": 1.7586, + "step": 10088 + }, + { + "epoch": 0.5515902847067506, + "grad_norm": 1.5254157781600952, + "learning_rate": 9.278820480293241e-06, + "loss": 1.6004, + "step": 10089 + }, + { + "epoch": 0.5516449571504722, + "grad_norm": 1.3566607236862183, + "learning_rate": 9.276997893290641e-06, + "loss": 1.5358, + "step": 10090 + }, + { + "epoch": 0.5516996295941938, + "grad_norm": 1.4981080293655396, + "learning_rate": 9.275175330430774e-06, + "loss": 1.2165, + "step": 10091 + }, + { + "epoch": 0.5517543020379153, + "grad_norm": 1.736518383026123, + "learning_rate": 9.273352791774505e-06, + "loss": 1.3703, + "step": 10092 + }, + { + "epoch": 0.5518089744816369, + "grad_norm": 1.4159729480743408, + "learning_rate": 9.271530277382695e-06, + "loss": 1.2722, + "step": 10093 + }, + { + "epoch": 0.5518636469253585, + "grad_norm": 1.5213534832000732, + "learning_rate": 9.269707787316194e-06, + "loss": 1.2558, + "step": 10094 + }, + { + "epoch": 0.55191831936908, + "grad_norm": 1.3945930004119873, + "learning_rate": 9.267885321635866e-06, + "loss": 1.3282, + "step": 10095 + }, + { + "epoch": 0.5519729918128016, + "grad_norm": 1.5008227825164795, + "learning_rate": 9.266062880402566e-06, + "loss": 1.4824, + "step": 10096 + }, + { + "epoch": 0.552027664256523, + "grad_norm": 1.6694302558898926, + "learning_rate": 9.264240463677143e-06, + "loss": 1.273, + "step": 10097 + }, + { + "epoch": 0.5520823367002446, + "grad_norm": 1.6703014373779297, + "learning_rate": 9.262418071520464e-06, + "loss": 1.5949, + "step": 10098 + }, + { + "epoch": 0.5521370091439662, + "grad_norm": 1.6473901271820068, + "learning_rate": 9.26059570399337e-06, + "loss": 1.3858, + "step": 10099 + }, + { + "epoch": 0.5521916815876877, + "grad_norm": 1.299487590789795, + "learning_rate": 9.258773361156725e-06, + "loss": 1.5879, + "step": 10100 + }, + { + "epoch": 0.5522463540314093, + "grad_norm": 1.6102956533432007, + "learning_rate": 9.256951043071379e-06, + "loss": 1.495, + "step": 10101 + }, + { + "epoch": 0.5523010264751309, + "grad_norm": 1.6750707626342773, + "learning_rate": 9.255128749798177e-06, + "loss": 1.1734, + "step": 10102 + }, + { + "epoch": 0.5523556989188524, + "grad_norm": 1.6748263835906982, + "learning_rate": 9.253306481397975e-06, + "loss": 1.2882, + "step": 10103 + }, + { + "epoch": 0.552410371362574, + "grad_norm": 1.8265266418457031, + "learning_rate": 9.251484237931625e-06, + "loss": 1.4213, + "step": 10104 + }, + { + "epoch": 0.5524650438062956, + "grad_norm": 1.6586992740631104, + "learning_rate": 9.249662019459967e-06, + "loss": 1.1961, + "step": 10105 + }, + { + "epoch": 0.5525197162500171, + "grad_norm": 1.3257126808166504, + "learning_rate": 9.247839826043859e-06, + "loss": 1.4004, + "step": 10106 + }, + { + "epoch": 0.5525743886937386, + "grad_norm": 1.8071459531784058, + "learning_rate": 9.246017657744142e-06, + "loss": 1.4089, + "step": 10107 + }, + { + "epoch": 0.5526290611374602, + "grad_norm": 1.4392261505126953, + "learning_rate": 9.244195514621665e-06, + "loss": 1.5143, + "step": 10108 + }, + { + "epoch": 0.5526837335811817, + "grad_norm": 1.5012614727020264, + "learning_rate": 9.242373396737277e-06, + "loss": 1.4898, + "step": 10109 + }, + { + "epoch": 0.5527384060249033, + "grad_norm": 1.7390161752700806, + "learning_rate": 9.240551304151817e-06, + "loss": 1.4738, + "step": 10110 + }, + { + "epoch": 0.5527930784686248, + "grad_norm": 1.36592435836792, + "learning_rate": 9.238729236926126e-06, + "loss": 1.3946, + "step": 10111 + }, + { + "epoch": 0.5528477509123464, + "grad_norm": 1.3664143085479736, + "learning_rate": 9.236907195121058e-06, + "loss": 1.3175, + "step": 10112 + }, + { + "epoch": 0.552902423356068, + "grad_norm": 1.483902931213379, + "learning_rate": 9.235085178797447e-06, + "loss": 1.426, + "step": 10113 + }, + { + "epoch": 0.5529570957997895, + "grad_norm": 1.8227137327194214, + "learning_rate": 9.233263188016138e-06, + "loss": 1.5531, + "step": 10114 + }, + { + "epoch": 0.5530117682435111, + "grad_norm": 1.467960238456726, + "learning_rate": 9.231441222837971e-06, + "loss": 1.5711, + "step": 10115 + }, + { + "epoch": 0.5530664406872327, + "grad_norm": 1.7566598653793335, + "learning_rate": 9.22961928332378e-06, + "loss": 1.7106, + "step": 10116 + }, + { + "epoch": 0.5531211131309541, + "grad_norm": 1.4416543245315552, + "learning_rate": 9.227797369534415e-06, + "loss": 1.3284, + "step": 10117 + }, + { + "epoch": 0.5531757855746757, + "grad_norm": 1.5243735313415527, + "learning_rate": 9.225975481530707e-06, + "loss": 1.4078, + "step": 10118 + }, + { + "epoch": 0.5532304580183973, + "grad_norm": 1.3733774423599243, + "learning_rate": 9.22415361937349e-06, + "loss": 1.3903, + "step": 10119 + }, + { + "epoch": 0.5532851304621188, + "grad_norm": 1.3522038459777832, + "learning_rate": 9.222331783123608e-06, + "loss": 1.53, + "step": 10120 + }, + { + "epoch": 0.5533398029058404, + "grad_norm": 1.5903977155685425, + "learning_rate": 9.220509972841893e-06, + "loss": 1.5351, + "step": 10121 + }, + { + "epoch": 0.553394475349562, + "grad_norm": 1.4774683713912964, + "learning_rate": 9.218688188589176e-06, + "loss": 1.2224, + "step": 10122 + }, + { + "epoch": 0.5534491477932835, + "grad_norm": 1.4310225248336792, + "learning_rate": 9.216866430426297e-06, + "loss": 1.4398, + "step": 10123 + }, + { + "epoch": 0.5535038202370051, + "grad_norm": 1.6724821329116821, + "learning_rate": 9.215044698414086e-06, + "loss": 1.4225, + "step": 10124 + }, + { + "epoch": 0.5535584926807265, + "grad_norm": 1.4693305492401123, + "learning_rate": 9.213222992613368e-06, + "loss": 1.552, + "step": 10125 + }, + { + "epoch": 0.5536131651244481, + "grad_norm": 1.5578328371047974, + "learning_rate": 9.211401313084986e-06, + "loss": 1.6251, + "step": 10126 + }, + { + "epoch": 0.5536678375681697, + "grad_norm": 1.5334774255752563, + "learning_rate": 9.209579659889762e-06, + "loss": 1.4937, + "step": 10127 + }, + { + "epoch": 0.5537225100118912, + "grad_norm": 1.632624626159668, + "learning_rate": 9.207758033088533e-06, + "loss": 1.4637, + "step": 10128 + }, + { + "epoch": 0.5537771824556128, + "grad_norm": 1.6130969524383545, + "learning_rate": 9.205936432742119e-06, + "loss": 1.5982, + "step": 10129 + }, + { + "epoch": 0.5538318548993344, + "grad_norm": 1.481136441230774, + "learning_rate": 9.204114858911346e-06, + "loss": 1.5213, + "step": 10130 + }, + { + "epoch": 0.5538865273430559, + "grad_norm": 1.5110691785812378, + "learning_rate": 9.202293311657053e-06, + "loss": 1.4758, + "step": 10131 + }, + { + "epoch": 0.5539411997867775, + "grad_norm": 1.441741943359375, + "learning_rate": 9.200471791040056e-06, + "loss": 1.4574, + "step": 10132 + }, + { + "epoch": 0.5539958722304991, + "grad_norm": 1.562153935432434, + "learning_rate": 9.198650297121181e-06, + "loss": 1.248, + "step": 10133 + }, + { + "epoch": 0.5540505446742205, + "grad_norm": 1.357948899269104, + "learning_rate": 9.196828829961254e-06, + "loss": 1.4006, + "step": 10134 + }, + { + "epoch": 0.5541052171179421, + "grad_norm": 1.6459581851959229, + "learning_rate": 9.195007389621098e-06, + "loss": 1.4127, + "step": 10135 + }, + { + "epoch": 0.5541598895616637, + "grad_norm": 1.5931932926177979, + "learning_rate": 9.193185976161529e-06, + "loss": 1.2422, + "step": 10136 + }, + { + "epoch": 0.5542145620053852, + "grad_norm": 1.766203761100769, + "learning_rate": 9.191364589643378e-06, + "loss": 1.4896, + "step": 10137 + }, + { + "epoch": 0.5542692344491068, + "grad_norm": 1.2344117164611816, + "learning_rate": 9.189543230127463e-06, + "loss": 1.5657, + "step": 10138 + }, + { + "epoch": 0.5543239068928283, + "grad_norm": 1.4459753036499023, + "learning_rate": 9.187721897674595e-06, + "loss": 1.4789, + "step": 10139 + }, + { + "epoch": 0.5543785793365499, + "grad_norm": 1.7285021543502808, + "learning_rate": 9.185900592345603e-06, + "loss": 1.3144, + "step": 10140 + }, + { + "epoch": 0.5544332517802715, + "grad_norm": 1.3063105344772339, + "learning_rate": 9.1840793142013e-06, + "loss": 1.706, + "step": 10141 + }, + { + "epoch": 0.554487924223993, + "grad_norm": 2.245776653289795, + "learning_rate": 9.182258063302504e-06, + "loss": 1.1214, + "step": 10142 + }, + { + "epoch": 0.5545425966677145, + "grad_norm": 1.4809696674346924, + "learning_rate": 9.18043683971003e-06, + "loss": 1.5369, + "step": 10143 + }, + { + "epoch": 0.5545972691114361, + "grad_norm": 1.3873473405838013, + "learning_rate": 9.178615643484689e-06, + "loss": 1.332, + "step": 10144 + }, + { + "epoch": 0.5546519415551576, + "grad_norm": 1.465641736984253, + "learning_rate": 9.176794474687305e-06, + "loss": 1.6674, + "step": 10145 + }, + { + "epoch": 0.5547066139988792, + "grad_norm": 1.2257887125015259, + "learning_rate": 9.174973333378684e-06, + "loss": 1.2942, + "step": 10146 + }, + { + "epoch": 0.5547612864426008, + "grad_norm": 1.8412367105484009, + "learning_rate": 9.173152219619637e-06, + "loss": 1.6225, + "step": 10147 + }, + { + "epoch": 0.5548159588863223, + "grad_norm": 1.946877121925354, + "learning_rate": 9.171331133470979e-06, + "loss": 1.4355, + "step": 10148 + }, + { + "epoch": 0.5548706313300439, + "grad_norm": 1.9020111560821533, + "learning_rate": 9.16951007499352e-06, + "loss": 1.4144, + "step": 10149 + }, + { + "epoch": 0.5549253037737655, + "grad_norm": 1.6809792518615723, + "learning_rate": 9.167689044248065e-06, + "loss": 1.5893, + "step": 10150 + }, + { + "epoch": 0.554979976217487, + "grad_norm": 1.3996598720550537, + "learning_rate": 9.16586804129543e-06, + "loss": 1.2533, + "step": 10151 + }, + { + "epoch": 0.5550346486612086, + "grad_norm": 1.674048900604248, + "learning_rate": 9.164047066196417e-06, + "loss": 1.4517, + "step": 10152 + }, + { + "epoch": 0.55508932110493, + "grad_norm": 1.4783666133880615, + "learning_rate": 9.162226119011832e-06, + "loss": 1.3744, + "step": 10153 + }, + { + "epoch": 0.5551439935486516, + "grad_norm": 1.1930376291275024, + "learning_rate": 9.160405199802487e-06, + "loss": 1.4142, + "step": 10154 + }, + { + "epoch": 0.5551986659923732, + "grad_norm": 1.3178918361663818, + "learning_rate": 9.158584308629175e-06, + "loss": 1.5869, + "step": 10155 + }, + { + "epoch": 0.5552533384360947, + "grad_norm": 1.3779114484786987, + "learning_rate": 9.156763445552714e-06, + "loss": 1.5153, + "step": 10156 + }, + { + "epoch": 0.5553080108798163, + "grad_norm": 1.521456241607666, + "learning_rate": 9.154942610633901e-06, + "loss": 1.3696, + "step": 10157 + }, + { + "epoch": 0.5553626833235379, + "grad_norm": 1.8563650846481323, + "learning_rate": 9.153121803933532e-06, + "loss": 1.411, + "step": 10158 + }, + { + "epoch": 0.5554173557672594, + "grad_norm": 1.4959083795547485, + "learning_rate": 9.151301025512417e-06, + "loss": 1.4282, + "step": 10159 + }, + { + "epoch": 0.555472028210981, + "grad_norm": 1.297958254814148, + "learning_rate": 9.149480275431353e-06, + "loss": 1.5031, + "step": 10160 + }, + { + "epoch": 0.5555267006547026, + "grad_norm": 1.427641749382019, + "learning_rate": 9.147659553751135e-06, + "loss": 1.6313, + "step": 10161 + }, + { + "epoch": 0.555581373098424, + "grad_norm": 1.481459140777588, + "learning_rate": 9.145838860532567e-06, + "loss": 1.2215, + "step": 10162 + }, + { + "epoch": 0.5556360455421456, + "grad_norm": 1.3112268447875977, + "learning_rate": 9.144018195836445e-06, + "loss": 1.3075, + "step": 10163 + }, + { + "epoch": 0.5556907179858672, + "grad_norm": 1.662165641784668, + "learning_rate": 9.14219755972356e-06, + "loss": 1.5026, + "step": 10164 + }, + { + "epoch": 0.5557453904295887, + "grad_norm": 1.2662723064422607, + "learning_rate": 9.140376952254713e-06, + "loss": 1.4356, + "step": 10165 + }, + { + "epoch": 0.5558000628733103, + "grad_norm": 1.3737465143203735, + "learning_rate": 9.1385563734907e-06, + "loss": 1.51, + "step": 10166 + }, + { + "epoch": 0.5558547353170318, + "grad_norm": 1.4933723211288452, + "learning_rate": 9.136735823492307e-06, + "loss": 1.4809, + "step": 10167 + }, + { + "epoch": 0.5559094077607534, + "grad_norm": 1.4525383710861206, + "learning_rate": 9.134915302320334e-06, + "loss": 1.3516, + "step": 10168 + }, + { + "epoch": 0.555964080204475, + "grad_norm": 1.784401535987854, + "learning_rate": 9.133094810035564e-06, + "loss": 1.4159, + "step": 10169 + }, + { + "epoch": 0.5560187526481964, + "grad_norm": 1.3461905717849731, + "learning_rate": 9.131274346698797e-06, + "loss": 1.5643, + "step": 10170 + }, + { + "epoch": 0.556073425091918, + "grad_norm": 1.405661940574646, + "learning_rate": 9.129453912370817e-06, + "loss": 1.6784, + "step": 10171 + }, + { + "epoch": 0.5561280975356396, + "grad_norm": 1.6683889627456665, + "learning_rate": 9.127633507112412e-06, + "loss": 1.4738, + "step": 10172 + }, + { + "epoch": 0.5561827699793611, + "grad_norm": 1.570032000541687, + "learning_rate": 9.125813130984374e-06, + "loss": 1.6813, + "step": 10173 + }, + { + "epoch": 0.5562374424230827, + "grad_norm": 1.9212450981140137, + "learning_rate": 9.123992784047487e-06, + "loss": 1.5654, + "step": 10174 + }, + { + "epoch": 0.5562921148668043, + "grad_norm": 1.644378662109375, + "learning_rate": 9.122172466362533e-06, + "loss": 1.6232, + "step": 10175 + }, + { + "epoch": 0.5563467873105258, + "grad_norm": 1.4984573125839233, + "learning_rate": 9.120352177990303e-06, + "loss": 1.5682, + "step": 10176 + }, + { + "epoch": 0.5564014597542474, + "grad_norm": 2.6691434383392334, + "learning_rate": 9.118531918991578e-06, + "loss": 1.4018, + "step": 10177 + }, + { + "epoch": 0.556456132197969, + "grad_norm": 1.46771240234375, + "learning_rate": 9.116711689427137e-06, + "loss": 1.4718, + "step": 10178 + }, + { + "epoch": 0.5565108046416904, + "grad_norm": 1.2590936422348022, + "learning_rate": 9.11489148935777e-06, + "loss": 1.3279, + "step": 10179 + }, + { + "epoch": 0.556565477085412, + "grad_norm": 1.3942526578903198, + "learning_rate": 9.113071318844252e-06, + "loss": 1.519, + "step": 10180 + }, + { + "epoch": 0.5566201495291335, + "grad_norm": 1.731919288635254, + "learning_rate": 9.111251177947363e-06, + "loss": 1.4794, + "step": 10181 + }, + { + "epoch": 0.5566748219728551, + "grad_norm": 1.7041666507720947, + "learning_rate": 9.109431066727883e-06, + "loss": 1.3097, + "step": 10182 + }, + { + "epoch": 0.5567294944165767, + "grad_norm": 1.8610193729400635, + "learning_rate": 9.107610985246586e-06, + "loss": 1.4669, + "step": 10183 + }, + { + "epoch": 0.5567841668602982, + "grad_norm": 1.3331753015518188, + "learning_rate": 9.105790933564259e-06, + "loss": 1.3833, + "step": 10184 + }, + { + "epoch": 0.5568388393040198, + "grad_norm": 1.3361923694610596, + "learning_rate": 9.103970911741668e-06, + "loss": 1.7216, + "step": 10185 + }, + { + "epoch": 0.5568935117477414, + "grad_norm": 1.4768145084381104, + "learning_rate": 9.10215091983959e-06, + "loss": 1.3139, + "step": 10186 + }, + { + "epoch": 0.5569481841914629, + "grad_norm": 1.6748071908950806, + "learning_rate": 9.100330957918802e-06, + "loss": 1.5646, + "step": 10187 + }, + { + "epoch": 0.5570028566351845, + "grad_norm": 1.6234813928604126, + "learning_rate": 9.098511026040072e-06, + "loss": 1.4813, + "step": 10188 + }, + { + "epoch": 0.557057529078906, + "grad_norm": 1.2172304391860962, + "learning_rate": 9.096691124264173e-06, + "loss": 1.6036, + "step": 10189 + }, + { + "epoch": 0.5571122015226275, + "grad_norm": 1.63057541847229, + "learning_rate": 9.09487125265188e-06, + "loss": 1.3272, + "step": 10190 + }, + { + "epoch": 0.5571668739663491, + "grad_norm": 1.5259549617767334, + "learning_rate": 9.09305141126396e-06, + "loss": 1.3926, + "step": 10191 + }, + { + "epoch": 0.5572215464100707, + "grad_norm": 1.5425910949707031, + "learning_rate": 9.09123160016118e-06, + "loss": 1.5447, + "step": 10192 + }, + { + "epoch": 0.5572762188537922, + "grad_norm": 1.5587137937545776, + "learning_rate": 9.089411819404311e-06, + "loss": 1.3769, + "step": 10193 + }, + { + "epoch": 0.5573308912975138, + "grad_norm": 1.6633864641189575, + "learning_rate": 9.08759206905412e-06, + "loss": 1.3631, + "step": 10194 + }, + { + "epoch": 0.5573855637412354, + "grad_norm": 1.7081516981124878, + "learning_rate": 9.085772349171364e-06, + "loss": 1.1967, + "step": 10195 + }, + { + "epoch": 0.5574402361849569, + "grad_norm": 2.005418062210083, + "learning_rate": 9.08395265981682e-06, + "loss": 1.5573, + "step": 10196 + }, + { + "epoch": 0.5574949086286785, + "grad_norm": 1.557687520980835, + "learning_rate": 9.08213300105124e-06, + "loss": 1.4936, + "step": 10197 + }, + { + "epoch": 0.5575495810723999, + "grad_norm": 1.3781661987304688, + "learning_rate": 9.080313372935399e-06, + "loss": 1.5778, + "step": 10198 + }, + { + "epoch": 0.5576042535161215, + "grad_norm": 1.4387539625167847, + "learning_rate": 9.078493775530053e-06, + "loss": 1.2748, + "step": 10199 + }, + { + "epoch": 0.5576589259598431, + "grad_norm": 1.4507845640182495, + "learning_rate": 9.07667420889596e-06, + "loss": 1.5694, + "step": 10200 + }, + { + "epoch": 0.5577135984035646, + "grad_norm": 1.3157683610916138, + "learning_rate": 9.074854673093882e-06, + "loss": 1.3454, + "step": 10201 + }, + { + "epoch": 0.5577682708472862, + "grad_norm": 1.473197102546692, + "learning_rate": 9.07303516818458e-06, + "loss": 1.289, + "step": 10202 + }, + { + "epoch": 0.5578229432910078, + "grad_norm": 1.5590132474899292, + "learning_rate": 9.071215694228801e-06, + "loss": 1.2, + "step": 10203 + }, + { + "epoch": 0.5578776157347293, + "grad_norm": 1.4849313497543335, + "learning_rate": 9.069396251287319e-06, + "loss": 1.4003, + "step": 10204 + }, + { + "epoch": 0.5579322881784509, + "grad_norm": 1.2459882497787476, + "learning_rate": 9.067576839420876e-06, + "loss": 1.3291, + "step": 10205 + }, + { + "epoch": 0.5579869606221725, + "grad_norm": 1.38018000125885, + "learning_rate": 9.065757458690228e-06, + "loss": 1.305, + "step": 10206 + }, + { + "epoch": 0.5580416330658939, + "grad_norm": 1.586799144744873, + "learning_rate": 9.063938109156135e-06, + "loss": 1.5198, + "step": 10207 + }, + { + "epoch": 0.5580963055096155, + "grad_norm": 1.3207650184631348, + "learning_rate": 9.062118790879344e-06, + "loss": 1.5941, + "step": 10208 + }, + { + "epoch": 0.5581509779533371, + "grad_norm": 1.1052292585372925, + "learning_rate": 9.060299503920603e-06, + "loss": 1.2737, + "step": 10209 + }, + { + "epoch": 0.5582056503970586, + "grad_norm": 1.7927641868591309, + "learning_rate": 9.058480248340672e-06, + "loss": 1.3048, + "step": 10210 + }, + { + "epoch": 0.5582603228407802, + "grad_norm": 1.5279219150543213, + "learning_rate": 9.056661024200291e-06, + "loss": 1.5116, + "step": 10211 + }, + { + "epoch": 0.5583149952845017, + "grad_norm": 1.9171512126922607, + "learning_rate": 9.054841831560216e-06, + "loss": 1.3249, + "step": 10212 + }, + { + "epoch": 0.5583696677282233, + "grad_norm": 1.3008676767349243, + "learning_rate": 9.053022670481189e-06, + "loss": 1.7358, + "step": 10213 + }, + { + "epoch": 0.5584243401719449, + "grad_norm": 1.5451395511627197, + "learning_rate": 9.051203541023952e-06, + "loss": 1.622, + "step": 10214 + }, + { + "epoch": 0.5584790126156663, + "grad_norm": 1.893733263015747, + "learning_rate": 9.049384443249261e-06, + "loss": 1.3081, + "step": 10215 + }, + { + "epoch": 0.5585336850593879, + "grad_norm": 1.849643588066101, + "learning_rate": 9.047565377217855e-06, + "loss": 1.6467, + "step": 10216 + }, + { + "epoch": 0.5585883575031095, + "grad_norm": 1.3595998287200928, + "learning_rate": 9.04574634299047e-06, + "loss": 1.6073, + "step": 10217 + }, + { + "epoch": 0.558643029946831, + "grad_norm": 1.5095010995864868, + "learning_rate": 9.043927340627858e-06, + "loss": 1.5011, + "step": 10218 + }, + { + "epoch": 0.5586977023905526, + "grad_norm": 1.3703862428665161, + "learning_rate": 9.042108370190757e-06, + "loss": 1.243, + "step": 10219 + }, + { + "epoch": 0.5587523748342742, + "grad_norm": 1.370633602142334, + "learning_rate": 9.040289431739902e-06, + "loss": 1.4613, + "step": 10220 + }, + { + "epoch": 0.5588070472779957, + "grad_norm": 1.554025411605835, + "learning_rate": 9.038470525336037e-06, + "loss": 1.2027, + "step": 10221 + }, + { + "epoch": 0.5588617197217173, + "grad_norm": 1.7383277416229248, + "learning_rate": 9.036651651039898e-06, + "loss": 1.5424, + "step": 10222 + }, + { + "epoch": 0.5589163921654389, + "grad_norm": 1.3007583618164062, + "learning_rate": 9.034832808912215e-06, + "loss": 1.2745, + "step": 10223 + }, + { + "epoch": 0.5589710646091604, + "grad_norm": 1.4203563928604126, + "learning_rate": 9.033013999013737e-06, + "loss": 1.4871, + "step": 10224 + }, + { + "epoch": 0.5590257370528819, + "grad_norm": 1.4764004945755005, + "learning_rate": 9.031195221405185e-06, + "loss": 1.5658, + "step": 10225 + }, + { + "epoch": 0.5590804094966034, + "grad_norm": 1.2899670600891113, + "learning_rate": 9.029376476147303e-06, + "loss": 1.3315, + "step": 10226 + }, + { + "epoch": 0.559135081940325, + "grad_norm": 1.870147943496704, + "learning_rate": 9.027557763300815e-06, + "loss": 1.4933, + "step": 10227 + }, + { + "epoch": 0.5591897543840466, + "grad_norm": 1.341207504272461, + "learning_rate": 9.025739082926454e-06, + "loss": 1.4293, + "step": 10228 + }, + { + "epoch": 0.5592444268277681, + "grad_norm": 1.660080909729004, + "learning_rate": 9.023920435084955e-06, + "loss": 1.3302, + "step": 10229 + }, + { + "epoch": 0.5592990992714897, + "grad_norm": 1.6433827877044678, + "learning_rate": 9.02210181983704e-06, + "loss": 1.5168, + "step": 10230 + }, + { + "epoch": 0.5593537717152113, + "grad_norm": 1.337721824645996, + "learning_rate": 9.020283237243441e-06, + "loss": 1.4881, + "step": 10231 + }, + { + "epoch": 0.5594084441589328, + "grad_norm": 1.5705543756484985, + "learning_rate": 9.018464687364885e-06, + "loss": 1.7329, + "step": 10232 + }, + { + "epoch": 0.5594631166026544, + "grad_norm": 1.5422495603561401, + "learning_rate": 9.016646170262096e-06, + "loss": 1.4779, + "step": 10233 + }, + { + "epoch": 0.559517789046376, + "grad_norm": 1.544002652168274, + "learning_rate": 9.014827685995795e-06, + "loss": 1.3436, + "step": 10234 + }, + { + "epoch": 0.5595724614900974, + "grad_norm": 1.3222613334655762, + "learning_rate": 9.013009234626715e-06, + "loss": 1.3494, + "step": 10235 + }, + { + "epoch": 0.559627133933819, + "grad_norm": 1.3737545013427734, + "learning_rate": 9.01119081621557e-06, + "loss": 1.365, + "step": 10236 + }, + { + "epoch": 0.5596818063775406, + "grad_norm": 1.486764907836914, + "learning_rate": 9.009372430823082e-06, + "loss": 1.4084, + "step": 10237 + }, + { + "epoch": 0.5597364788212621, + "grad_norm": 1.5915380716323853, + "learning_rate": 9.007554078509975e-06, + "loss": 1.7125, + "step": 10238 + }, + { + "epoch": 0.5597911512649837, + "grad_norm": 1.293891191482544, + "learning_rate": 9.005735759336965e-06, + "loss": 1.6527, + "step": 10239 + }, + { + "epoch": 0.5598458237087052, + "grad_norm": 1.198651671409607, + "learning_rate": 9.003917473364774e-06, + "loss": 1.2287, + "step": 10240 + }, + { + "epoch": 0.5599004961524268, + "grad_norm": 1.938342809677124, + "learning_rate": 9.002099220654116e-06, + "loss": 1.1129, + "step": 10241 + }, + { + "epoch": 0.5599551685961484, + "grad_norm": 1.3804389238357544, + "learning_rate": 9.000281001265702e-06, + "loss": 1.4791, + "step": 10242 + }, + { + "epoch": 0.5600098410398698, + "grad_norm": 1.0312987565994263, + "learning_rate": 8.998462815260255e-06, + "loss": 1.3989, + "step": 10243 + }, + { + "epoch": 0.5600645134835914, + "grad_norm": 1.4128315448760986, + "learning_rate": 8.996644662698485e-06, + "loss": 1.4361, + "step": 10244 + }, + { + "epoch": 0.560119185927313, + "grad_norm": 1.3663721084594727, + "learning_rate": 8.994826543641102e-06, + "loss": 1.555, + "step": 10245 + }, + { + "epoch": 0.5601738583710345, + "grad_norm": 1.4115639925003052, + "learning_rate": 8.993008458148822e-06, + "loss": 1.4325, + "step": 10246 + }, + { + "epoch": 0.5602285308147561, + "grad_norm": 1.5329011678695679, + "learning_rate": 8.991190406282352e-06, + "loss": 1.4147, + "step": 10247 + }, + { + "epoch": 0.5602832032584777, + "grad_norm": 1.5853819847106934, + "learning_rate": 8.989372388102398e-06, + "loss": 1.5047, + "step": 10248 + }, + { + "epoch": 0.5603378757021992, + "grad_norm": 2.0182039737701416, + "learning_rate": 8.987554403669676e-06, + "loss": 1.6921, + "step": 10249 + }, + { + "epoch": 0.5603925481459208, + "grad_norm": 1.5990456342697144, + "learning_rate": 8.985736453044887e-06, + "loss": 1.5891, + "step": 10250 + }, + { + "epoch": 0.5604472205896424, + "grad_norm": 1.2782037258148193, + "learning_rate": 8.983918536288736e-06, + "loss": 1.4859, + "step": 10251 + }, + { + "epoch": 0.5605018930333638, + "grad_norm": 1.4089504480361938, + "learning_rate": 8.982100653461932e-06, + "loss": 1.3273, + "step": 10252 + }, + { + "epoch": 0.5605565654770854, + "grad_norm": 2.0231823921203613, + "learning_rate": 8.980282804625172e-06, + "loss": 1.477, + "step": 10253 + }, + { + "epoch": 0.5606112379208069, + "grad_norm": 1.270861268043518, + "learning_rate": 8.978464989839165e-06, + "loss": 1.3167, + "step": 10254 + }, + { + "epoch": 0.5606659103645285, + "grad_norm": 1.5403999090194702, + "learning_rate": 8.97664720916461e-06, + "loss": 1.5157, + "step": 10255 + }, + { + "epoch": 0.5607205828082501, + "grad_norm": 1.6726713180541992, + "learning_rate": 8.974829462662201e-06, + "loss": 1.4096, + "step": 10256 + }, + { + "epoch": 0.5607752552519716, + "grad_norm": 1.585394263267517, + "learning_rate": 8.973011750392648e-06, + "loss": 1.4052, + "step": 10257 + }, + { + "epoch": 0.5608299276956932, + "grad_norm": 1.2343693971633911, + "learning_rate": 8.97119407241664e-06, + "loss": 1.5152, + "step": 10258 + }, + { + "epoch": 0.5608846001394148, + "grad_norm": 1.328338861465454, + "learning_rate": 8.969376428794877e-06, + "loss": 1.2595, + "step": 10259 + }, + { + "epoch": 0.5609392725831363, + "grad_norm": 2.179035186767578, + "learning_rate": 8.967558819588052e-06, + "loss": 1.4156, + "step": 10260 + }, + { + "epoch": 0.5609939450268578, + "grad_norm": 1.5327662229537964, + "learning_rate": 8.965741244856864e-06, + "loss": 1.1289, + "step": 10261 + }, + { + "epoch": 0.5610486174705794, + "grad_norm": 1.4193886518478394, + "learning_rate": 8.963923704661996e-06, + "loss": 1.4748, + "step": 10262 + }, + { + "epoch": 0.5611032899143009, + "grad_norm": 1.8628743886947632, + "learning_rate": 8.962106199064152e-06, + "loss": 1.3372, + "step": 10263 + }, + { + "epoch": 0.5611579623580225, + "grad_norm": 1.8918787240982056, + "learning_rate": 8.960288728124018e-06, + "loss": 1.477, + "step": 10264 + }, + { + "epoch": 0.5612126348017441, + "grad_norm": 1.8434865474700928, + "learning_rate": 8.95847129190228e-06, + "loss": 1.4174, + "step": 10265 + }, + { + "epoch": 0.5612673072454656, + "grad_norm": 1.3123571872711182, + "learning_rate": 8.956653890459632e-06, + "loss": 1.5166, + "step": 10266 + }, + { + "epoch": 0.5613219796891872, + "grad_norm": 1.8326478004455566, + "learning_rate": 8.954836523856755e-06, + "loss": 1.4611, + "step": 10267 + }, + { + "epoch": 0.5613766521329087, + "grad_norm": 1.8836714029312134, + "learning_rate": 8.953019192154344e-06, + "loss": 1.2088, + "step": 10268 + }, + { + "epoch": 0.5614313245766303, + "grad_norm": 1.598793387413025, + "learning_rate": 8.951201895413078e-06, + "loss": 1.3745, + "step": 10269 + }, + { + "epoch": 0.5614859970203518, + "grad_norm": 1.4439023733139038, + "learning_rate": 8.94938463369364e-06, + "loss": 1.3729, + "step": 10270 + }, + { + "epoch": 0.5615406694640733, + "grad_norm": 1.6794863939285278, + "learning_rate": 8.947567407056716e-06, + "loss": 1.4043, + "step": 10271 + }, + { + "epoch": 0.5615953419077949, + "grad_norm": 1.4436842203140259, + "learning_rate": 8.945750215562987e-06, + "loss": 1.3548, + "step": 10272 + }, + { + "epoch": 0.5616500143515165, + "grad_norm": 1.6795146465301514, + "learning_rate": 8.943933059273127e-06, + "loss": 1.3481, + "step": 10273 + }, + { + "epoch": 0.561704686795238, + "grad_norm": 1.42709481716156, + "learning_rate": 8.942115938247824e-06, + "loss": 1.6172, + "step": 10274 + }, + { + "epoch": 0.5617593592389596, + "grad_norm": 1.6244884729385376, + "learning_rate": 8.940298852547753e-06, + "loss": 1.3847, + "step": 10275 + }, + { + "epoch": 0.5618140316826812, + "grad_norm": 1.5989488363265991, + "learning_rate": 8.938481802233587e-06, + "loss": 1.3503, + "step": 10276 + }, + { + "epoch": 0.5618687041264027, + "grad_norm": 1.4253733158111572, + "learning_rate": 8.936664787366007e-06, + "loss": 1.5318, + "step": 10277 + }, + { + "epoch": 0.5619233765701243, + "grad_norm": 1.6134045124053955, + "learning_rate": 8.934847808005684e-06, + "loss": 1.3836, + "step": 10278 + }, + { + "epoch": 0.5619780490138458, + "grad_norm": 2.438713788986206, + "learning_rate": 8.933030864213292e-06, + "loss": 1.5192, + "step": 10279 + }, + { + "epoch": 0.5620327214575673, + "grad_norm": 1.5799243450164795, + "learning_rate": 8.931213956049505e-06, + "loss": 1.6513, + "step": 10280 + }, + { + "epoch": 0.5620873939012889, + "grad_norm": 1.384016990661621, + "learning_rate": 8.929397083574987e-06, + "loss": 1.5901, + "step": 10281 + }, + { + "epoch": 0.5621420663450104, + "grad_norm": 1.8315383195877075, + "learning_rate": 8.927580246850418e-06, + "loss": 1.4868, + "step": 10282 + }, + { + "epoch": 0.562196738788732, + "grad_norm": 1.5898361206054688, + "learning_rate": 8.92576344593646e-06, + "loss": 1.5428, + "step": 10283 + }, + { + "epoch": 0.5622514112324536, + "grad_norm": 1.7542829513549805, + "learning_rate": 8.923946680893781e-06, + "loss": 1.2137, + "step": 10284 + }, + { + "epoch": 0.5623060836761751, + "grad_norm": 2.519674062728882, + "learning_rate": 8.922129951783047e-06, + "loss": 1.5112, + "step": 10285 + }, + { + "epoch": 0.5623607561198967, + "grad_norm": 1.9285703897476196, + "learning_rate": 8.920313258664925e-06, + "loss": 1.2463, + "step": 10286 + }, + { + "epoch": 0.5624154285636183, + "grad_norm": 1.6752625703811646, + "learning_rate": 8.918496601600072e-06, + "loss": 1.4865, + "step": 10287 + }, + { + "epoch": 0.5624701010073397, + "grad_norm": 1.3955546617507935, + "learning_rate": 8.916679980649159e-06, + "loss": 1.3113, + "step": 10288 + }, + { + "epoch": 0.5625247734510613, + "grad_norm": 1.4988216161727905, + "learning_rate": 8.914863395872844e-06, + "loss": 1.4674, + "step": 10289 + }, + { + "epoch": 0.5625794458947829, + "grad_norm": 1.4746097326278687, + "learning_rate": 8.913046847331784e-06, + "loss": 1.4878, + "step": 10290 + }, + { + "epoch": 0.5626341183385044, + "grad_norm": 1.546018362045288, + "learning_rate": 8.911230335086643e-06, + "loss": 1.4689, + "step": 10291 + }, + { + "epoch": 0.562688790782226, + "grad_norm": 1.3645086288452148, + "learning_rate": 8.909413859198075e-06, + "loss": 1.3881, + "step": 10292 + }, + { + "epoch": 0.5627434632259476, + "grad_norm": 1.2721502780914307, + "learning_rate": 8.907597419726736e-06, + "loss": 1.6119, + "step": 10293 + }, + { + "epoch": 0.5627981356696691, + "grad_norm": 1.911490559577942, + "learning_rate": 8.905781016733285e-06, + "loss": 1.2715, + "step": 10294 + }, + { + "epoch": 0.5628528081133907, + "grad_norm": 1.6469234228134155, + "learning_rate": 8.90396465027837e-06, + "loss": 1.2573, + "step": 10295 + }, + { + "epoch": 0.5629074805571121, + "grad_norm": 1.8684797286987305, + "learning_rate": 8.90214832042265e-06, + "loss": 1.3913, + "step": 10296 + }, + { + "epoch": 0.5629621530008337, + "grad_norm": 1.5181204080581665, + "learning_rate": 8.900332027226776e-06, + "loss": 1.3159, + "step": 10297 + }, + { + "epoch": 0.5630168254445553, + "grad_norm": 1.354954719543457, + "learning_rate": 8.89851577075139e-06, + "loss": 1.5028, + "step": 10298 + }, + { + "epoch": 0.5630714978882768, + "grad_norm": 1.565889596939087, + "learning_rate": 8.896699551057151e-06, + "loss": 1.6079, + "step": 10299 + }, + { + "epoch": 0.5631261703319984, + "grad_norm": 1.2133406400680542, + "learning_rate": 8.894883368204704e-06, + "loss": 1.637, + "step": 10300 + }, + { + "epoch": 0.56318084277572, + "grad_norm": 2.3897716999053955, + "learning_rate": 8.89306722225469e-06, + "loss": 0.9798, + "step": 10301 + }, + { + "epoch": 0.5632355152194415, + "grad_norm": 1.7133827209472656, + "learning_rate": 8.891251113267763e-06, + "loss": 1.3141, + "step": 10302 + }, + { + "epoch": 0.5632901876631631, + "grad_norm": 1.3883681297302246, + "learning_rate": 8.889435041304565e-06, + "loss": 1.5653, + "step": 10303 + }, + { + "epoch": 0.5633448601068847, + "grad_norm": 1.2923295497894287, + "learning_rate": 8.887619006425732e-06, + "loss": 1.7152, + "step": 10304 + }, + { + "epoch": 0.5633995325506062, + "grad_norm": 1.4253833293914795, + "learning_rate": 8.885803008691914e-06, + "loss": 1.6121, + "step": 10305 + }, + { + "epoch": 0.5634542049943277, + "grad_norm": 1.4516148567199707, + "learning_rate": 8.883987048163746e-06, + "loss": 1.4233, + "step": 10306 + }, + { + "epoch": 0.5635088774380493, + "grad_norm": 1.4970896244049072, + "learning_rate": 8.882171124901867e-06, + "loss": 1.3555, + "step": 10307 + }, + { + "epoch": 0.5635635498817708, + "grad_norm": 2.173048496246338, + "learning_rate": 8.880355238966923e-06, + "loss": 1.2734, + "step": 10308 + }, + { + "epoch": 0.5636182223254924, + "grad_norm": 1.014696478843689, + "learning_rate": 8.878539390419542e-06, + "loss": 1.6986, + "step": 10309 + }, + { + "epoch": 0.5636728947692139, + "grad_norm": 1.2752653360366821, + "learning_rate": 8.876723579320363e-06, + "loss": 1.6725, + "step": 10310 + }, + { + "epoch": 0.5637275672129355, + "grad_norm": 1.2504407167434692, + "learning_rate": 8.87490780573002e-06, + "loss": 1.3592, + "step": 10311 + }, + { + "epoch": 0.5637822396566571, + "grad_norm": 1.6862390041351318, + "learning_rate": 8.87309206970914e-06, + "loss": 1.5168, + "step": 10312 + }, + { + "epoch": 0.5638369121003786, + "grad_norm": 1.2943376302719116, + "learning_rate": 8.871276371318367e-06, + "loss": 1.5848, + "step": 10313 + }, + { + "epoch": 0.5638915845441002, + "grad_norm": 1.6695984601974487, + "learning_rate": 8.869460710618324e-06, + "loss": 1.3874, + "step": 10314 + }, + { + "epoch": 0.5639462569878217, + "grad_norm": 1.033164143562317, + "learning_rate": 8.867645087669637e-06, + "loss": 1.5769, + "step": 10315 + }, + { + "epoch": 0.5640009294315432, + "grad_norm": 1.4077184200286865, + "learning_rate": 8.865829502532942e-06, + "loss": 1.485, + "step": 10316 + }, + { + "epoch": 0.5640556018752648, + "grad_norm": 1.6057729721069336, + "learning_rate": 8.86401395526886e-06, + "loss": 1.1981, + "step": 10317 + }, + { + "epoch": 0.5641102743189864, + "grad_norm": 1.567637324333191, + "learning_rate": 8.862198445938013e-06, + "loss": 1.2316, + "step": 10318 + }, + { + "epoch": 0.5641649467627079, + "grad_norm": 1.2630794048309326, + "learning_rate": 8.860382974601035e-06, + "loss": 1.5917, + "step": 10319 + }, + { + "epoch": 0.5642196192064295, + "grad_norm": 1.5671937465667725, + "learning_rate": 8.858567541318543e-06, + "loss": 1.3103, + "step": 10320 + }, + { + "epoch": 0.5642742916501511, + "grad_norm": 1.448656678199768, + "learning_rate": 8.856752146151156e-06, + "loss": 1.2644, + "step": 10321 + }, + { + "epoch": 0.5643289640938726, + "grad_norm": 1.4616169929504395, + "learning_rate": 8.854936789159501e-06, + "loss": 1.329, + "step": 10322 + }, + { + "epoch": 0.5643836365375942, + "grad_norm": 1.7858473062515259, + "learning_rate": 8.853121470404193e-06, + "loss": 1.5865, + "step": 10323 + }, + { + "epoch": 0.5644383089813156, + "grad_norm": 1.5777757167816162, + "learning_rate": 8.85130618994585e-06, + "loss": 1.4926, + "step": 10324 + }, + { + "epoch": 0.5644929814250372, + "grad_norm": 1.6036540269851685, + "learning_rate": 8.849490947845089e-06, + "loss": 1.3751, + "step": 10325 + }, + { + "epoch": 0.5645476538687588, + "grad_norm": 1.4384316205978394, + "learning_rate": 8.847675744162522e-06, + "loss": 1.6283, + "step": 10326 + }, + { + "epoch": 0.5646023263124803, + "grad_norm": 1.3233983516693115, + "learning_rate": 8.84586057895877e-06, + "loss": 1.4805, + "step": 10327 + }, + { + "epoch": 0.5646569987562019, + "grad_norm": 1.6310781240463257, + "learning_rate": 8.844045452294442e-06, + "loss": 1.4881, + "step": 10328 + }, + { + "epoch": 0.5647116711999235, + "grad_norm": 1.714933156967163, + "learning_rate": 8.842230364230146e-06, + "loss": 1.415, + "step": 10329 + }, + { + "epoch": 0.564766343643645, + "grad_norm": 2.0802502632141113, + "learning_rate": 8.840415314826497e-06, + "loss": 1.2652, + "step": 10330 + }, + { + "epoch": 0.5648210160873666, + "grad_norm": 1.5590109825134277, + "learning_rate": 8.838600304144102e-06, + "loss": 1.3996, + "step": 10331 + }, + { + "epoch": 0.5648756885310882, + "grad_norm": 1.1927506923675537, + "learning_rate": 8.836785332243563e-06, + "loss": 1.6084, + "step": 10332 + }, + { + "epoch": 0.5649303609748096, + "grad_norm": 1.8936845064163208, + "learning_rate": 8.834970399185497e-06, + "loss": 1.2515, + "step": 10333 + }, + { + "epoch": 0.5649850334185312, + "grad_norm": 1.5173100233078003, + "learning_rate": 8.833155505030504e-06, + "loss": 1.1464, + "step": 10334 + }, + { + "epoch": 0.5650397058622528, + "grad_norm": 1.234757900238037, + "learning_rate": 8.831340649839182e-06, + "loss": 1.4864, + "step": 10335 + }, + { + "epoch": 0.5650943783059743, + "grad_norm": 1.5236550569534302, + "learning_rate": 8.829525833672142e-06, + "loss": 1.3601, + "step": 10336 + }, + { + "epoch": 0.5651490507496959, + "grad_norm": 1.7549225091934204, + "learning_rate": 8.82771105658998e-06, + "loss": 1.6427, + "step": 10337 + }, + { + "epoch": 0.5652037231934174, + "grad_norm": 1.6682329177856445, + "learning_rate": 8.825896318653294e-06, + "loss": 1.3993, + "step": 10338 + }, + { + "epoch": 0.565258395637139, + "grad_norm": 1.9015952348709106, + "learning_rate": 8.824081619922688e-06, + "loss": 1.1138, + "step": 10339 + }, + { + "epoch": 0.5653130680808606, + "grad_norm": 1.7268428802490234, + "learning_rate": 8.82226696045875e-06, + "loss": 1.4259, + "step": 10340 + }, + { + "epoch": 0.565367740524582, + "grad_norm": 1.5872738361358643, + "learning_rate": 8.82045234032209e-06, + "loss": 1.3969, + "step": 10341 + }, + { + "epoch": 0.5654224129683036, + "grad_norm": 1.4462515115737915, + "learning_rate": 8.818637759573292e-06, + "loss": 1.4422, + "step": 10342 + }, + { + "epoch": 0.5654770854120252, + "grad_norm": 1.6166571378707886, + "learning_rate": 8.81682321827295e-06, + "loss": 1.4395, + "step": 10343 + }, + { + "epoch": 0.5655317578557467, + "grad_norm": 1.4133797883987427, + "learning_rate": 8.815008716481658e-06, + "loss": 1.32, + "step": 10344 + }, + { + "epoch": 0.5655864302994683, + "grad_norm": 1.5030955076217651, + "learning_rate": 8.813194254260006e-06, + "loss": 1.3469, + "step": 10345 + }, + { + "epoch": 0.5656411027431899, + "grad_norm": 1.1152033805847168, + "learning_rate": 8.81137983166858e-06, + "loss": 1.6752, + "step": 10346 + }, + { + "epoch": 0.5656957751869114, + "grad_norm": 1.8546446561813354, + "learning_rate": 8.809565448767975e-06, + "loss": 1.3661, + "step": 10347 + }, + { + "epoch": 0.565750447630633, + "grad_norm": 1.5001713037490845, + "learning_rate": 8.807751105618771e-06, + "loss": 1.5532, + "step": 10348 + }, + { + "epoch": 0.5658051200743546, + "grad_norm": 1.2568202018737793, + "learning_rate": 8.805936802281554e-06, + "loss": 1.4475, + "step": 10349 + }, + { + "epoch": 0.565859792518076, + "grad_norm": 1.3413821458816528, + "learning_rate": 8.80412253881691e-06, + "loss": 1.4725, + "step": 10350 + }, + { + "epoch": 0.5659144649617976, + "grad_norm": 1.3644055128097534, + "learning_rate": 8.802308315285423e-06, + "loss": 1.4883, + "step": 10351 + }, + { + "epoch": 0.5659691374055191, + "grad_norm": 1.3810830116271973, + "learning_rate": 8.800494131747667e-06, + "loss": 1.5196, + "step": 10352 + }, + { + "epoch": 0.5660238098492407, + "grad_norm": 1.8460825681686401, + "learning_rate": 8.79867998826423e-06, + "loss": 1.2594, + "step": 10353 + }, + { + "epoch": 0.5660784822929623, + "grad_norm": 1.4099119901657104, + "learning_rate": 8.796865884895686e-06, + "loss": 1.4596, + "step": 10354 + }, + { + "epoch": 0.5661331547366838, + "grad_norm": 1.5760995149612427, + "learning_rate": 8.795051821702614e-06, + "loss": 1.6649, + "step": 10355 + }, + { + "epoch": 0.5661878271804054, + "grad_norm": 1.2375359535217285, + "learning_rate": 8.793237798745591e-06, + "loss": 1.4583, + "step": 10356 + }, + { + "epoch": 0.566242499624127, + "grad_norm": 1.3595001697540283, + "learning_rate": 8.791423816085184e-06, + "loss": 1.5584, + "step": 10357 + }, + { + "epoch": 0.5662971720678485, + "grad_norm": 1.4933644533157349, + "learning_rate": 8.789609873781978e-06, + "loss": 1.5878, + "step": 10358 + }, + { + "epoch": 0.5663518445115701, + "grad_norm": 1.4185181856155396, + "learning_rate": 8.787795971896536e-06, + "loss": 1.4044, + "step": 10359 + }, + { + "epoch": 0.5664065169552917, + "grad_norm": 1.4546018838882446, + "learning_rate": 8.785982110489428e-06, + "loss": 1.2846, + "step": 10360 + }, + { + "epoch": 0.5664611893990131, + "grad_norm": 1.9815587997436523, + "learning_rate": 8.784168289621231e-06, + "loss": 1.2782, + "step": 10361 + }, + { + "epoch": 0.5665158618427347, + "grad_norm": 1.420289158821106, + "learning_rate": 8.782354509352507e-06, + "loss": 1.3765, + "step": 10362 + }, + { + "epoch": 0.5665705342864563, + "grad_norm": 1.2361475229263306, + "learning_rate": 8.780540769743821e-06, + "loss": 1.4029, + "step": 10363 + }, + { + "epoch": 0.5666252067301778, + "grad_norm": 1.4929717779159546, + "learning_rate": 8.778727070855743e-06, + "loss": 1.5041, + "step": 10364 + }, + { + "epoch": 0.5666798791738994, + "grad_norm": 1.573931336402893, + "learning_rate": 8.776913412748833e-06, + "loss": 1.4751, + "step": 10365 + }, + { + "epoch": 0.5667345516176209, + "grad_norm": 1.639646053314209, + "learning_rate": 8.775099795483651e-06, + "loss": 1.7612, + "step": 10366 + }, + { + "epoch": 0.5667892240613425, + "grad_norm": 1.9351643323898315, + "learning_rate": 8.773286219120765e-06, + "loss": 1.4478, + "step": 10367 + }, + { + "epoch": 0.5668438965050641, + "grad_norm": 1.508620023727417, + "learning_rate": 8.771472683720728e-06, + "loss": 1.3598, + "step": 10368 + }, + { + "epoch": 0.5668985689487855, + "grad_norm": 1.730095386505127, + "learning_rate": 8.769659189344105e-06, + "loss": 1.3844, + "step": 10369 + }, + { + "epoch": 0.5669532413925071, + "grad_norm": 1.5053081512451172, + "learning_rate": 8.767845736051447e-06, + "loss": 1.4039, + "step": 10370 + }, + { + "epoch": 0.5670079138362287, + "grad_norm": 2.064882755279541, + "learning_rate": 8.766032323903306e-06, + "loss": 1.7, + "step": 10371 + }, + { + "epoch": 0.5670625862799502, + "grad_norm": 1.3171453475952148, + "learning_rate": 8.764218952960247e-06, + "loss": 1.4534, + "step": 10372 + }, + { + "epoch": 0.5671172587236718, + "grad_norm": 1.5331734418869019, + "learning_rate": 8.762405623282817e-06, + "loss": 1.2044, + "step": 10373 + }, + { + "epoch": 0.5671719311673934, + "grad_norm": 1.4730910062789917, + "learning_rate": 8.760592334931566e-06, + "loss": 1.3552, + "step": 10374 + }, + { + "epoch": 0.5672266036111149, + "grad_norm": 1.1299803256988525, + "learning_rate": 8.758779087967047e-06, + "loss": 1.7734, + "step": 10375 + }, + { + "epoch": 0.5672812760548365, + "grad_norm": 1.966922402381897, + "learning_rate": 8.756965882449806e-06, + "loss": 1.2893, + "step": 10376 + }, + { + "epoch": 0.5673359484985581, + "grad_norm": 1.232215404510498, + "learning_rate": 8.755152718440387e-06, + "loss": 1.4416, + "step": 10377 + }, + { + "epoch": 0.5673906209422795, + "grad_norm": 1.293992519378662, + "learning_rate": 8.753339595999344e-06, + "loss": 1.4729, + "step": 10378 + }, + { + "epoch": 0.5674452933860011, + "grad_norm": 1.9450771808624268, + "learning_rate": 8.751526515187218e-06, + "loss": 1.4529, + "step": 10379 + }, + { + "epoch": 0.5674999658297226, + "grad_norm": 2.8365426063537598, + "learning_rate": 8.749713476064547e-06, + "loss": 1.3382, + "step": 10380 + }, + { + "epoch": 0.5675546382734442, + "grad_norm": 2.2396273612976074, + "learning_rate": 8.74790047869188e-06, + "loss": 1.6102, + "step": 10381 + }, + { + "epoch": 0.5676093107171658, + "grad_norm": 1.782280445098877, + "learning_rate": 8.746087523129752e-06, + "loss": 1.2222, + "step": 10382 + }, + { + "epoch": 0.5676639831608873, + "grad_norm": 1.3181449174880981, + "learning_rate": 8.744274609438707e-06, + "loss": 1.2986, + "step": 10383 + }, + { + "epoch": 0.5677186556046089, + "grad_norm": 1.6739068031311035, + "learning_rate": 8.742461737679279e-06, + "loss": 1.3784, + "step": 10384 + }, + { + "epoch": 0.5677733280483305, + "grad_norm": 1.9058218002319336, + "learning_rate": 8.740648907912002e-06, + "loss": 1.473, + "step": 10385 + }, + { + "epoch": 0.567828000492052, + "grad_norm": 1.439828872680664, + "learning_rate": 8.738836120197416e-06, + "loss": 1.5561, + "step": 10386 + }, + { + "epoch": 0.5678826729357735, + "grad_norm": 2.198859214782715, + "learning_rate": 8.737023374596051e-06, + "loss": 1.5299, + "step": 10387 + }, + { + "epoch": 0.5679373453794951, + "grad_norm": 1.5814965963363647, + "learning_rate": 8.735210671168438e-06, + "loss": 1.1922, + "step": 10388 + }, + { + "epoch": 0.5679920178232166, + "grad_norm": 1.5871245861053467, + "learning_rate": 8.733398009975109e-06, + "loss": 1.4099, + "step": 10389 + }, + { + "epoch": 0.5680466902669382, + "grad_norm": 1.1614696979522705, + "learning_rate": 8.731585391076594e-06, + "loss": 1.4786, + "step": 10390 + }, + { + "epoch": 0.5681013627106598, + "grad_norm": 2.1696372032165527, + "learning_rate": 8.729772814533415e-06, + "loss": 1.5381, + "step": 10391 + }, + { + "epoch": 0.5681560351543813, + "grad_norm": 1.509549856185913, + "learning_rate": 8.727960280406107e-06, + "loss": 1.3429, + "step": 10392 + }, + { + "epoch": 0.5682107075981029, + "grad_norm": 1.8239368200302124, + "learning_rate": 8.72614778875519e-06, + "loss": 1.531, + "step": 10393 + }, + { + "epoch": 0.5682653800418245, + "grad_norm": 1.5605061054229736, + "learning_rate": 8.724335339641185e-06, + "loss": 1.4179, + "step": 10394 + }, + { + "epoch": 0.568320052485546, + "grad_norm": 1.5712707042694092, + "learning_rate": 8.722522933124617e-06, + "loss": 1.4521, + "step": 10395 + }, + { + "epoch": 0.5683747249292675, + "grad_norm": 1.1878688335418701, + "learning_rate": 8.720710569266004e-06, + "loss": 1.4418, + "step": 10396 + }, + { + "epoch": 0.568429397372989, + "grad_norm": 1.172037959098816, + "learning_rate": 8.718898248125871e-06, + "loss": 1.393, + "step": 10397 + }, + { + "epoch": 0.5684840698167106, + "grad_norm": 1.213944911956787, + "learning_rate": 8.717085969764732e-06, + "loss": 1.3892, + "step": 10398 + }, + { + "epoch": 0.5685387422604322, + "grad_norm": 1.5259298086166382, + "learning_rate": 8.715273734243098e-06, + "loss": 1.3728, + "step": 10399 + }, + { + "epoch": 0.5685934147041537, + "grad_norm": 1.5140302181243896, + "learning_rate": 8.713461541621492e-06, + "loss": 1.4972, + "step": 10400 + }, + { + "epoch": 0.5686480871478753, + "grad_norm": 1.4324753284454346, + "learning_rate": 8.711649391960424e-06, + "loss": 1.7268, + "step": 10401 + }, + { + "epoch": 0.5687027595915969, + "grad_norm": 1.4479594230651855, + "learning_rate": 8.709837285320406e-06, + "loss": 1.4732, + "step": 10402 + }, + { + "epoch": 0.5687574320353184, + "grad_norm": 1.8083946704864502, + "learning_rate": 8.708025221761949e-06, + "loss": 1.3775, + "step": 10403 + }, + { + "epoch": 0.56881210447904, + "grad_norm": 1.614440679550171, + "learning_rate": 8.706213201345561e-06, + "loss": 1.6119, + "step": 10404 + }, + { + "epoch": 0.5688667769227616, + "grad_norm": 1.925727128982544, + "learning_rate": 8.704401224131747e-06, + "loss": 1.1653, + "step": 10405 + }, + { + "epoch": 0.568921449366483, + "grad_norm": 2.9165525436401367, + "learning_rate": 8.702589290181021e-06, + "loss": 1.4378, + "step": 10406 + }, + { + "epoch": 0.5689761218102046, + "grad_norm": 1.4862406253814697, + "learning_rate": 8.700777399553883e-06, + "loss": 1.4755, + "step": 10407 + }, + { + "epoch": 0.5690307942539262, + "grad_norm": 1.1494932174682617, + "learning_rate": 8.698965552310834e-06, + "loss": 1.5211, + "step": 10408 + }, + { + "epoch": 0.5690854666976477, + "grad_norm": 1.6081531047821045, + "learning_rate": 8.69715374851238e-06, + "loss": 1.4547, + "step": 10409 + }, + { + "epoch": 0.5691401391413693, + "grad_norm": 1.8362754583358765, + "learning_rate": 8.695341988219015e-06, + "loss": 1.5492, + "step": 10410 + }, + { + "epoch": 0.5691948115850908, + "grad_norm": 1.258095145225525, + "learning_rate": 8.693530271491249e-06, + "loss": 1.6083, + "step": 10411 + }, + { + "epoch": 0.5692494840288124, + "grad_norm": 1.490594744682312, + "learning_rate": 8.69171859838957e-06, + "loss": 1.5212, + "step": 10412 + }, + { + "epoch": 0.569304156472534, + "grad_norm": 2.02191424369812, + "learning_rate": 8.689906968974476e-06, + "loss": 1.2962, + "step": 10413 + }, + { + "epoch": 0.5693588289162554, + "grad_norm": 1.496113896369934, + "learning_rate": 8.688095383306465e-06, + "loss": 1.3097, + "step": 10414 + }, + { + "epoch": 0.569413501359977, + "grad_norm": 1.8629770278930664, + "learning_rate": 8.686283841446027e-06, + "loss": 1.418, + "step": 10415 + }, + { + "epoch": 0.5694681738036986, + "grad_norm": 1.6768964529037476, + "learning_rate": 8.68447234345365e-06, + "loss": 1.4159, + "step": 10416 + }, + { + "epoch": 0.5695228462474201, + "grad_norm": 1.4143307209014893, + "learning_rate": 8.682660889389834e-06, + "loss": 1.3733, + "step": 10417 + }, + { + "epoch": 0.5695775186911417, + "grad_norm": 1.6109908819198608, + "learning_rate": 8.680849479315061e-06, + "loss": 1.5068, + "step": 10418 + }, + { + "epoch": 0.5696321911348633, + "grad_norm": 2.4874720573425293, + "learning_rate": 8.679038113289815e-06, + "loss": 1.5032, + "step": 10419 + }, + { + "epoch": 0.5696868635785848, + "grad_norm": 1.3303136825561523, + "learning_rate": 8.67722679137459e-06, + "loss": 1.3914, + "step": 10420 + }, + { + "epoch": 0.5697415360223064, + "grad_norm": 1.726977825164795, + "learning_rate": 8.675415513629867e-06, + "loss": 1.7624, + "step": 10421 + }, + { + "epoch": 0.569796208466028, + "grad_norm": 1.347460389137268, + "learning_rate": 8.673604280116127e-06, + "loss": 1.5355, + "step": 10422 + }, + { + "epoch": 0.5698508809097494, + "grad_norm": 1.4135850667953491, + "learning_rate": 8.671793090893853e-06, + "loss": 1.3886, + "step": 10423 + }, + { + "epoch": 0.569905553353471, + "grad_norm": 1.5081000328063965, + "learning_rate": 8.66998194602352e-06, + "loss": 1.512, + "step": 10424 + }, + { + "epoch": 0.5699602257971925, + "grad_norm": 1.5268406867980957, + "learning_rate": 8.668170845565618e-06, + "loss": 1.3241, + "step": 10425 + }, + { + "epoch": 0.5700148982409141, + "grad_norm": 1.437973141670227, + "learning_rate": 8.666359789580613e-06, + "loss": 1.541, + "step": 10426 + }, + { + "epoch": 0.5700695706846357, + "grad_norm": 2.369924306869507, + "learning_rate": 8.664548778128985e-06, + "loss": 1.6705, + "step": 10427 + }, + { + "epoch": 0.5701242431283572, + "grad_norm": 1.3574661016464233, + "learning_rate": 8.662737811271208e-06, + "loss": 1.5947, + "step": 10428 + }, + { + "epoch": 0.5701789155720788, + "grad_norm": 1.3290749788284302, + "learning_rate": 8.660926889067753e-06, + "loss": 1.3214, + "step": 10429 + }, + { + "epoch": 0.5702335880158004, + "grad_norm": 1.8828868865966797, + "learning_rate": 8.659116011579088e-06, + "loss": 1.4233, + "step": 10430 + }, + { + "epoch": 0.5702882604595219, + "grad_norm": 1.2976245880126953, + "learning_rate": 8.65730517886569e-06, + "loss": 1.4222, + "step": 10431 + }, + { + "epoch": 0.5703429329032434, + "grad_norm": 1.4867140054702759, + "learning_rate": 8.655494390988022e-06, + "loss": 1.6889, + "step": 10432 + }, + { + "epoch": 0.570397605346965, + "grad_norm": 1.1634953022003174, + "learning_rate": 8.65368364800655e-06, + "loss": 1.4335, + "step": 10433 + }, + { + "epoch": 0.5704522777906865, + "grad_norm": 1.3412507772445679, + "learning_rate": 8.651872949981743e-06, + "loss": 1.2814, + "step": 10434 + }, + { + "epoch": 0.5705069502344081, + "grad_norm": 1.5536448955535889, + "learning_rate": 8.65006229697406e-06, + "loss": 1.3014, + "step": 10435 + }, + { + "epoch": 0.5705616226781297, + "grad_norm": 1.4378002882003784, + "learning_rate": 8.648251689043961e-06, + "loss": 1.5854, + "step": 10436 + }, + { + "epoch": 0.5706162951218512, + "grad_norm": 1.4904253482818604, + "learning_rate": 8.646441126251914e-06, + "loss": 1.1798, + "step": 10437 + }, + { + "epoch": 0.5706709675655728, + "grad_norm": 1.69159996509552, + "learning_rate": 8.644630608658371e-06, + "loss": 1.3427, + "step": 10438 + }, + { + "epoch": 0.5707256400092943, + "grad_norm": 1.769080638885498, + "learning_rate": 8.642820136323794e-06, + "loss": 1.4524, + "step": 10439 + }, + { + "epoch": 0.5707803124530159, + "grad_norm": 1.3219935894012451, + "learning_rate": 8.641009709308641e-06, + "loss": 1.3856, + "step": 10440 + }, + { + "epoch": 0.5708349848967375, + "grad_norm": 1.7249958515167236, + "learning_rate": 8.639199327673358e-06, + "loss": 1.4516, + "step": 10441 + }, + { + "epoch": 0.5708896573404589, + "grad_norm": 2.0865790843963623, + "learning_rate": 8.637388991478406e-06, + "loss": 1.2666, + "step": 10442 + }, + { + "epoch": 0.5709443297841805, + "grad_norm": 1.4930235147476196, + "learning_rate": 8.635578700784232e-06, + "loss": 1.2266, + "step": 10443 + }, + { + "epoch": 0.5709990022279021, + "grad_norm": 2.7937381267547607, + "learning_rate": 8.633768455651283e-06, + "loss": 1.3404, + "step": 10444 + }, + { + "epoch": 0.5710536746716236, + "grad_norm": 1.5799331665039062, + "learning_rate": 8.631958256140017e-06, + "loss": 1.4143, + "step": 10445 + }, + { + "epoch": 0.5711083471153452, + "grad_norm": 1.2254068851470947, + "learning_rate": 8.630148102310874e-06, + "loss": 1.4481, + "step": 10446 + }, + { + "epoch": 0.5711630195590668, + "grad_norm": 1.5706456899642944, + "learning_rate": 8.628337994224298e-06, + "loss": 1.5, + "step": 10447 + }, + { + "epoch": 0.5712176920027883, + "grad_norm": 1.84151029586792, + "learning_rate": 8.626527931940736e-06, + "loss": 1.5428, + "step": 10448 + }, + { + "epoch": 0.5712723644465099, + "grad_norm": 1.793065071105957, + "learning_rate": 8.624717915520632e-06, + "loss": 1.3121, + "step": 10449 + }, + { + "epoch": 0.5713270368902315, + "grad_norm": 1.425977349281311, + "learning_rate": 8.622907945024418e-06, + "loss": 1.4387, + "step": 10450 + }, + { + "epoch": 0.5713817093339529, + "grad_norm": 1.3530864715576172, + "learning_rate": 8.621098020512543e-06, + "loss": 1.4698, + "step": 10451 + }, + { + "epoch": 0.5714363817776745, + "grad_norm": 1.4802027940750122, + "learning_rate": 8.61928814204544e-06, + "loss": 1.5565, + "step": 10452 + }, + { + "epoch": 0.571491054221396, + "grad_norm": 1.3383272886276245, + "learning_rate": 8.617478309683548e-06, + "loss": 1.3014, + "step": 10453 + }, + { + "epoch": 0.5715457266651176, + "grad_norm": 1.6512396335601807, + "learning_rate": 8.615668523487299e-06, + "loss": 1.2326, + "step": 10454 + }, + { + "epoch": 0.5716003991088392, + "grad_norm": 1.5252540111541748, + "learning_rate": 8.613858783517122e-06, + "loss": 1.43, + "step": 10455 + }, + { + "epoch": 0.5716550715525607, + "grad_norm": 1.976579189300537, + "learning_rate": 8.612049089833457e-06, + "loss": 1.3324, + "step": 10456 + }, + { + "epoch": 0.5717097439962823, + "grad_norm": 1.1984440088272095, + "learning_rate": 8.61023944249673e-06, + "loss": 1.9386, + "step": 10457 + }, + { + "epoch": 0.5717644164400039, + "grad_norm": 1.5727453231811523, + "learning_rate": 8.608429841567365e-06, + "loss": 1.3524, + "step": 10458 + }, + { + "epoch": 0.5718190888837253, + "grad_norm": 1.9489191770553589, + "learning_rate": 8.606620287105796e-06, + "loss": 1.4356, + "step": 10459 + }, + { + "epoch": 0.5718737613274469, + "grad_norm": 1.605324387550354, + "learning_rate": 8.604810779172447e-06, + "loss": 1.5152, + "step": 10460 + }, + { + "epoch": 0.5719284337711685, + "grad_norm": 1.3871325254440308, + "learning_rate": 8.603001317827738e-06, + "loss": 1.6319, + "step": 10461 + }, + { + "epoch": 0.57198310621489, + "grad_norm": 1.4698615074157715, + "learning_rate": 8.601191903132094e-06, + "loss": 1.4057, + "step": 10462 + }, + { + "epoch": 0.5720377786586116, + "grad_norm": 1.3768298625946045, + "learning_rate": 8.599382535145936e-06, + "loss": 1.2698, + "step": 10463 + }, + { + "epoch": 0.5720924511023332, + "grad_norm": 1.771417260169983, + "learning_rate": 8.597573213929677e-06, + "loss": 1.5713, + "step": 10464 + }, + { + "epoch": 0.5721471235460547, + "grad_norm": 1.3656916618347168, + "learning_rate": 8.595763939543743e-06, + "loss": 1.2471, + "step": 10465 + }, + { + "epoch": 0.5722017959897763, + "grad_norm": 1.4576983451843262, + "learning_rate": 8.593954712048544e-06, + "loss": 1.0852, + "step": 10466 + }, + { + "epoch": 0.5722564684334978, + "grad_norm": 1.3937197923660278, + "learning_rate": 8.592145531504499e-06, + "loss": 1.3432, + "step": 10467 + }, + { + "epoch": 0.5723111408772193, + "grad_norm": 1.6330487728118896, + "learning_rate": 8.590336397972018e-06, + "loss": 1.6629, + "step": 10468 + }, + { + "epoch": 0.5723658133209409, + "grad_norm": 1.4842829704284668, + "learning_rate": 8.58852731151151e-06, + "loss": 1.2545, + "step": 10469 + }, + { + "epoch": 0.5724204857646624, + "grad_norm": 2.214244842529297, + "learning_rate": 8.586718272183392e-06, + "loss": 1.5387, + "step": 10470 + }, + { + "epoch": 0.572475158208384, + "grad_norm": 1.393944501876831, + "learning_rate": 8.584909280048064e-06, + "loss": 1.1815, + "step": 10471 + }, + { + "epoch": 0.5725298306521056, + "grad_norm": 1.810158371925354, + "learning_rate": 8.583100335165936e-06, + "loss": 1.7859, + "step": 10472 + }, + { + "epoch": 0.5725845030958271, + "grad_norm": 1.3671315908432007, + "learning_rate": 8.581291437597413e-06, + "loss": 1.3157, + "step": 10473 + }, + { + "epoch": 0.5726391755395487, + "grad_norm": 1.3444645404815674, + "learning_rate": 8.579482587402899e-06, + "loss": 1.4982, + "step": 10474 + }, + { + "epoch": 0.5726938479832703, + "grad_norm": 1.1539255380630493, + "learning_rate": 8.577673784642791e-06, + "loss": 1.336, + "step": 10475 + }, + { + "epoch": 0.5727485204269918, + "grad_norm": 1.6539078950881958, + "learning_rate": 8.575865029377498e-06, + "loss": 1.3283, + "step": 10476 + }, + { + "epoch": 0.5728031928707134, + "grad_norm": 1.3590692281723022, + "learning_rate": 8.57405632166741e-06, + "loss": 1.3322, + "step": 10477 + }, + { + "epoch": 0.5728578653144349, + "grad_norm": 1.622138261795044, + "learning_rate": 8.572247661572926e-06, + "loss": 1.4166, + "step": 10478 + }, + { + "epoch": 0.5729125377581564, + "grad_norm": 1.7461756467819214, + "learning_rate": 8.570439049154447e-06, + "loss": 1.5358, + "step": 10479 + }, + { + "epoch": 0.572967210201878, + "grad_norm": 1.6273075342178345, + "learning_rate": 8.56863048447236e-06, + "loss": 1.4698, + "step": 10480 + }, + { + "epoch": 0.5730218826455995, + "grad_norm": 1.5302492380142212, + "learning_rate": 8.566821967587062e-06, + "loss": 1.5642, + "step": 10481 + }, + { + "epoch": 0.5730765550893211, + "grad_norm": 1.688576102256775, + "learning_rate": 8.565013498558942e-06, + "loss": 1.4193, + "step": 10482 + }, + { + "epoch": 0.5731312275330427, + "grad_norm": 2.842768907546997, + "learning_rate": 8.563205077448385e-06, + "loss": 1.4434, + "step": 10483 + }, + { + "epoch": 0.5731858999767642, + "grad_norm": 1.6520286798477173, + "learning_rate": 8.561396704315785e-06, + "loss": 1.4853, + "step": 10484 + }, + { + "epoch": 0.5732405724204858, + "grad_norm": 1.4965335130691528, + "learning_rate": 8.559588379221525e-06, + "loss": 1.3829, + "step": 10485 + }, + { + "epoch": 0.5732952448642074, + "grad_norm": 2.4698386192321777, + "learning_rate": 8.557780102225987e-06, + "loss": 1.1906, + "step": 10486 + }, + { + "epoch": 0.5733499173079288, + "grad_norm": 1.8845807313919067, + "learning_rate": 8.555971873389558e-06, + "loss": 1.4925, + "step": 10487 + }, + { + "epoch": 0.5734045897516504, + "grad_norm": 1.2809711694717407, + "learning_rate": 8.554163692772617e-06, + "loss": 1.6013, + "step": 10488 + }, + { + "epoch": 0.573459262195372, + "grad_norm": 1.3844505548477173, + "learning_rate": 8.552355560435538e-06, + "loss": 1.6979, + "step": 10489 + }, + { + "epoch": 0.5735139346390935, + "grad_norm": 1.752468466758728, + "learning_rate": 8.550547476438708e-06, + "loss": 1.5804, + "step": 10490 + }, + { + "epoch": 0.5735686070828151, + "grad_norm": 1.530966877937317, + "learning_rate": 8.548739440842499e-06, + "loss": 1.4046, + "step": 10491 + }, + { + "epoch": 0.5736232795265367, + "grad_norm": 1.3034383058547974, + "learning_rate": 8.546931453707285e-06, + "loss": 1.4803, + "step": 10492 + }, + { + "epoch": 0.5736779519702582, + "grad_norm": 1.4091403484344482, + "learning_rate": 8.545123515093441e-06, + "loss": 1.3831, + "step": 10493 + }, + { + "epoch": 0.5737326244139798, + "grad_norm": 1.437299132347107, + "learning_rate": 8.543315625061332e-06, + "loss": 1.6552, + "step": 10494 + }, + { + "epoch": 0.5737872968577012, + "grad_norm": 1.5922319889068604, + "learning_rate": 8.541507783671337e-06, + "loss": 1.4205, + "step": 10495 + }, + { + "epoch": 0.5738419693014228, + "grad_norm": 1.8825876712799072, + "learning_rate": 8.53969999098382e-06, + "loss": 1.5802, + "step": 10496 + }, + { + "epoch": 0.5738966417451444, + "grad_norm": 1.6737689971923828, + "learning_rate": 8.537892247059141e-06, + "loss": 1.2513, + "step": 10497 + }, + { + "epoch": 0.5739513141888659, + "grad_norm": 1.2497791051864624, + "learning_rate": 8.536084551957676e-06, + "loss": 1.7132, + "step": 10498 + }, + { + "epoch": 0.5740059866325875, + "grad_norm": 1.6805732250213623, + "learning_rate": 8.534276905739783e-06, + "loss": 1.497, + "step": 10499 + }, + { + "epoch": 0.5740606590763091, + "grad_norm": 2.2008376121520996, + "learning_rate": 8.532469308465823e-06, + "loss": 1.3982, + "step": 10500 + }, + { + "epoch": 0.5741153315200306, + "grad_norm": 1.4705243110656738, + "learning_rate": 8.530661760196157e-06, + "loss": 1.4546, + "step": 10501 + }, + { + "epoch": 0.5741700039637522, + "grad_norm": 1.4550132751464844, + "learning_rate": 8.528854260991142e-06, + "loss": 1.3793, + "step": 10502 + }, + { + "epoch": 0.5742246764074738, + "grad_norm": 1.625576376914978, + "learning_rate": 8.527046810911133e-06, + "loss": 1.4779, + "step": 10503 + }, + { + "epoch": 0.5742793488511952, + "grad_norm": 1.5325534343719482, + "learning_rate": 8.52523941001649e-06, + "loss": 1.2913, + "step": 10504 + }, + { + "epoch": 0.5743340212949168, + "grad_norm": 1.310488224029541, + "learning_rate": 8.523432058367564e-06, + "loss": 1.3381, + "step": 10505 + }, + { + "epoch": 0.5743886937386384, + "grad_norm": 1.680794596672058, + "learning_rate": 8.521624756024706e-06, + "loss": 1.376, + "step": 10506 + }, + { + "epoch": 0.5744433661823599, + "grad_norm": 2.192509651184082, + "learning_rate": 8.519817503048267e-06, + "loss": 1.4269, + "step": 10507 + }, + { + "epoch": 0.5744980386260815, + "grad_norm": 1.5454708337783813, + "learning_rate": 8.518010299498591e-06, + "loss": 1.5167, + "step": 10508 + }, + { + "epoch": 0.574552711069803, + "grad_norm": 1.8288806676864624, + "learning_rate": 8.516203145436033e-06, + "loss": 1.4937, + "step": 10509 + }, + { + "epoch": 0.5746073835135246, + "grad_norm": 1.9484437704086304, + "learning_rate": 8.514396040920934e-06, + "loss": 1.3786, + "step": 10510 + }, + { + "epoch": 0.5746620559572462, + "grad_norm": 1.7403619289398193, + "learning_rate": 8.512588986013635e-06, + "loss": 1.5122, + "step": 10511 + }, + { + "epoch": 0.5747167284009677, + "grad_norm": 2.059267997741699, + "learning_rate": 8.510781980774482e-06, + "loss": 1.5246, + "step": 10512 + }, + { + "epoch": 0.5747714008446893, + "grad_norm": 1.3974121809005737, + "learning_rate": 8.508975025263814e-06, + "loss": 1.5561, + "step": 10513 + }, + { + "epoch": 0.5748260732884108, + "grad_norm": 1.6962071657180786, + "learning_rate": 8.507168119541964e-06, + "loss": 1.1808, + "step": 10514 + }, + { + "epoch": 0.5748807457321323, + "grad_norm": 1.565330147743225, + "learning_rate": 8.505361263669278e-06, + "loss": 1.4061, + "step": 10515 + }, + { + "epoch": 0.5749354181758539, + "grad_norm": 1.4518405199050903, + "learning_rate": 8.503554457706086e-06, + "loss": 1.6701, + "step": 10516 + }, + { + "epoch": 0.5749900906195755, + "grad_norm": 1.8860423564910889, + "learning_rate": 8.501747701712718e-06, + "loss": 1.448, + "step": 10517 + }, + { + "epoch": 0.575044763063297, + "grad_norm": 1.3975330591201782, + "learning_rate": 8.499940995749514e-06, + "loss": 1.4067, + "step": 10518 + }, + { + "epoch": 0.5750994355070186, + "grad_norm": 1.143104910850525, + "learning_rate": 8.498134339876802e-06, + "loss": 1.4213, + "step": 10519 + }, + { + "epoch": 0.5751541079507402, + "grad_norm": 1.1095048189163208, + "learning_rate": 8.496327734154905e-06, + "loss": 1.5327, + "step": 10520 + }, + { + "epoch": 0.5752087803944617, + "grad_norm": 1.9332021474838257, + "learning_rate": 8.494521178644155e-06, + "loss": 1.4221, + "step": 10521 + }, + { + "epoch": 0.5752634528381833, + "grad_norm": 1.3510782718658447, + "learning_rate": 8.492714673404873e-06, + "loss": 1.6633, + "step": 10522 + }, + { + "epoch": 0.5753181252819047, + "grad_norm": 1.2714266777038574, + "learning_rate": 8.490908218497387e-06, + "loss": 1.6891, + "step": 10523 + }, + { + "epoch": 0.5753727977256263, + "grad_norm": 1.5821634531021118, + "learning_rate": 8.489101813982019e-06, + "loss": 1.2468, + "step": 10524 + }, + { + "epoch": 0.5754274701693479, + "grad_norm": 1.5391145944595337, + "learning_rate": 8.487295459919084e-06, + "loss": 1.4028, + "step": 10525 + }, + { + "epoch": 0.5754821426130694, + "grad_norm": 1.1795787811279297, + "learning_rate": 8.485489156368904e-06, + "loss": 1.5024, + "step": 10526 + }, + { + "epoch": 0.575536815056791, + "grad_norm": 1.7236231565475464, + "learning_rate": 8.483682903391796e-06, + "loss": 1.3906, + "step": 10527 + }, + { + "epoch": 0.5755914875005126, + "grad_norm": 1.528617024421692, + "learning_rate": 8.481876701048071e-06, + "loss": 1.5921, + "step": 10528 + }, + { + "epoch": 0.5756461599442341, + "grad_norm": 1.4995516538619995, + "learning_rate": 8.480070549398048e-06, + "loss": 1.3235, + "step": 10529 + }, + { + "epoch": 0.5757008323879557, + "grad_norm": 1.9136950969696045, + "learning_rate": 8.478264448502038e-06, + "loss": 1.2789, + "step": 10530 + }, + { + "epoch": 0.5757555048316773, + "grad_norm": 1.3718270063400269, + "learning_rate": 8.476458398420344e-06, + "loss": 1.3571, + "step": 10531 + }, + { + "epoch": 0.5758101772753987, + "grad_norm": 1.6068586111068726, + "learning_rate": 8.474652399213283e-06, + "loss": 1.3562, + "step": 10532 + }, + { + "epoch": 0.5758648497191203, + "grad_norm": 1.2878186702728271, + "learning_rate": 8.472846450941158e-06, + "loss": 1.251, + "step": 10533 + }, + { + "epoch": 0.5759195221628419, + "grad_norm": 1.733447790145874, + "learning_rate": 8.471040553664269e-06, + "loss": 1.4103, + "step": 10534 + }, + { + "epoch": 0.5759741946065634, + "grad_norm": 1.8179497718811035, + "learning_rate": 8.469234707442927e-06, + "loss": 1.3346, + "step": 10535 + }, + { + "epoch": 0.576028867050285, + "grad_norm": 1.844386339187622, + "learning_rate": 8.467428912337429e-06, + "loss": 1.3879, + "step": 10536 + }, + { + "epoch": 0.5760835394940065, + "grad_norm": 1.4827951192855835, + "learning_rate": 8.465623168408077e-06, + "loss": 1.8379, + "step": 10537 + }, + { + "epoch": 0.5761382119377281, + "grad_norm": 1.610454797744751, + "learning_rate": 8.463817475715169e-06, + "loss": 1.4965, + "step": 10538 + }, + { + "epoch": 0.5761928843814497, + "grad_norm": 1.596195101737976, + "learning_rate": 8.462011834318996e-06, + "loss": 1.2478, + "step": 10539 + }, + { + "epoch": 0.5762475568251711, + "grad_norm": 1.5077624320983887, + "learning_rate": 8.46020624427986e-06, + "loss": 1.4724, + "step": 10540 + }, + { + "epoch": 0.5763022292688927, + "grad_norm": 1.8033241033554077, + "learning_rate": 8.458400705658051e-06, + "loss": 1.3504, + "step": 10541 + }, + { + "epoch": 0.5763569017126143, + "grad_norm": 1.4863988161087036, + "learning_rate": 8.456595218513857e-06, + "loss": 1.4046, + "step": 10542 + }, + { + "epoch": 0.5764115741563358, + "grad_norm": 1.4661568403244019, + "learning_rate": 8.454789782907575e-06, + "loss": 1.3662, + "step": 10543 + }, + { + "epoch": 0.5764662466000574, + "grad_norm": 1.6446881294250488, + "learning_rate": 8.452984398899487e-06, + "loss": 1.4614, + "step": 10544 + }, + { + "epoch": 0.576520919043779, + "grad_norm": 1.3878952264785767, + "learning_rate": 8.451179066549877e-06, + "loss": 1.7462, + "step": 10545 + }, + { + "epoch": 0.5765755914875005, + "grad_norm": 1.9112497568130493, + "learning_rate": 8.449373785919034e-06, + "loss": 1.3533, + "step": 10546 + }, + { + "epoch": 0.5766302639312221, + "grad_norm": 2.126946449279785, + "learning_rate": 8.447568557067241e-06, + "loss": 1.0827, + "step": 10547 + }, + { + "epoch": 0.5766849363749437, + "grad_norm": 1.287150502204895, + "learning_rate": 8.445763380054773e-06, + "loss": 1.3482, + "step": 10548 + }, + { + "epoch": 0.5767396088186652, + "grad_norm": 1.3865010738372803, + "learning_rate": 8.443958254941915e-06, + "loss": 1.1574, + "step": 10549 + }, + { + "epoch": 0.5767942812623867, + "grad_norm": 1.4757795333862305, + "learning_rate": 8.44215318178894e-06, + "loss": 1.3845, + "step": 10550 + }, + { + "epoch": 0.5768489537061082, + "grad_norm": 1.411795973777771, + "learning_rate": 8.440348160656132e-06, + "loss": 1.5174, + "step": 10551 + }, + { + "epoch": 0.5769036261498298, + "grad_norm": 1.4009531736373901, + "learning_rate": 8.438543191603755e-06, + "loss": 1.4779, + "step": 10552 + }, + { + "epoch": 0.5769582985935514, + "grad_norm": 1.53931725025177, + "learning_rate": 8.43673827469208e-06, + "loss": 1.3314, + "step": 10553 + }, + { + "epoch": 0.5770129710372729, + "grad_norm": 1.6093425750732422, + "learning_rate": 8.43493340998139e-06, + "loss": 1.2798, + "step": 10554 + }, + { + "epoch": 0.5770676434809945, + "grad_norm": 2.123504638671875, + "learning_rate": 8.433128597531943e-06, + "loss": 1.2946, + "step": 10555 + }, + { + "epoch": 0.5771223159247161, + "grad_norm": 1.2025134563446045, + "learning_rate": 8.431323837404008e-06, + "loss": 1.5496, + "step": 10556 + }, + { + "epoch": 0.5771769883684376, + "grad_norm": 1.3855730295181274, + "learning_rate": 8.429519129657854e-06, + "loss": 1.3059, + "step": 10557 + }, + { + "epoch": 0.5772316608121592, + "grad_norm": 1.4577244520187378, + "learning_rate": 8.42771447435374e-06, + "loss": 1.2909, + "step": 10558 + }, + { + "epoch": 0.5772863332558807, + "grad_norm": 1.9956306219100952, + "learning_rate": 8.425909871551925e-06, + "loss": 1.6348, + "step": 10559 + }, + { + "epoch": 0.5773410056996022, + "grad_norm": 1.2613561153411865, + "learning_rate": 8.424105321312678e-06, + "loss": 1.8258, + "step": 10560 + }, + { + "epoch": 0.5773956781433238, + "grad_norm": 1.368535041809082, + "learning_rate": 8.422300823696252e-06, + "loss": 1.6773, + "step": 10561 + }, + { + "epoch": 0.5774503505870454, + "grad_norm": 1.6931447982788086, + "learning_rate": 8.420496378762901e-06, + "loss": 1.3164, + "step": 10562 + }, + { + "epoch": 0.5775050230307669, + "grad_norm": 1.3111330270767212, + "learning_rate": 8.418691986572884e-06, + "loss": 1.5671, + "step": 10563 + }, + { + "epoch": 0.5775596954744885, + "grad_norm": 1.558997392654419, + "learning_rate": 8.416887647186452e-06, + "loss": 1.5968, + "step": 10564 + }, + { + "epoch": 0.57761436791821, + "grad_norm": 1.267675757408142, + "learning_rate": 8.415083360663858e-06, + "loss": 1.5097, + "step": 10565 + }, + { + "epoch": 0.5776690403619316, + "grad_norm": 1.3741403818130493, + "learning_rate": 8.41327912706535e-06, + "loss": 1.5823, + "step": 10566 + }, + { + "epoch": 0.5777237128056532, + "grad_norm": 1.64254891872406, + "learning_rate": 8.411474946451169e-06, + "loss": 1.383, + "step": 10567 + }, + { + "epoch": 0.5777783852493746, + "grad_norm": 1.724321722984314, + "learning_rate": 8.409670818881573e-06, + "loss": 1.2769, + "step": 10568 + }, + { + "epoch": 0.5778330576930962, + "grad_norm": 1.2996145486831665, + "learning_rate": 8.407866744416801e-06, + "loss": 1.6058, + "step": 10569 + }, + { + "epoch": 0.5778877301368178, + "grad_norm": 1.4335523843765259, + "learning_rate": 8.40606272311709e-06, + "loss": 1.4772, + "step": 10570 + }, + { + "epoch": 0.5779424025805393, + "grad_norm": 1.283555030822754, + "learning_rate": 8.40425875504269e-06, + "loss": 1.3608, + "step": 10571 + }, + { + "epoch": 0.5779970750242609, + "grad_norm": 1.7572894096374512, + "learning_rate": 8.402454840253831e-06, + "loss": 1.4433, + "step": 10572 + }, + { + "epoch": 0.5780517474679825, + "grad_norm": 1.6805278062820435, + "learning_rate": 8.400650978810753e-06, + "loss": 1.4571, + "step": 10573 + }, + { + "epoch": 0.578106419911704, + "grad_norm": 1.3325488567352295, + "learning_rate": 8.398847170773694e-06, + "loss": 1.4878, + "step": 10574 + }, + { + "epoch": 0.5781610923554256, + "grad_norm": 1.5678261518478394, + "learning_rate": 8.397043416202887e-06, + "loss": 1.5923, + "step": 10575 + }, + { + "epoch": 0.5782157647991472, + "grad_norm": 1.436856746673584, + "learning_rate": 8.395239715158558e-06, + "loss": 1.4196, + "step": 10576 + }, + { + "epoch": 0.5782704372428686, + "grad_norm": 1.6506743431091309, + "learning_rate": 8.393436067700943e-06, + "loss": 1.4153, + "step": 10577 + }, + { + "epoch": 0.5783251096865902, + "grad_norm": 1.9597595930099487, + "learning_rate": 8.391632473890264e-06, + "loss": 1.1448, + "step": 10578 + }, + { + "epoch": 0.5783797821303117, + "grad_norm": 1.622241735458374, + "learning_rate": 8.389828933786755e-06, + "loss": 1.3473, + "step": 10579 + }, + { + "epoch": 0.5784344545740333, + "grad_norm": 1.665687918663025, + "learning_rate": 8.388025447450635e-06, + "loss": 1.2568, + "step": 10580 + }, + { + "epoch": 0.5784891270177549, + "grad_norm": 1.6929231882095337, + "learning_rate": 8.386222014942125e-06, + "loss": 1.1361, + "step": 10581 + }, + { + "epoch": 0.5785437994614764, + "grad_norm": 1.7313778400421143, + "learning_rate": 8.384418636321452e-06, + "loss": 1.5641, + "step": 10582 + }, + { + "epoch": 0.578598471905198, + "grad_norm": 1.6613441705703735, + "learning_rate": 8.382615311648833e-06, + "loss": 1.3827, + "step": 10583 + }, + { + "epoch": 0.5786531443489196, + "grad_norm": 1.9054603576660156, + "learning_rate": 8.380812040984481e-06, + "loss": 1.5843, + "step": 10584 + }, + { + "epoch": 0.578707816792641, + "grad_norm": 1.6671613454818726, + "learning_rate": 8.379008824388617e-06, + "loss": 1.5813, + "step": 10585 + }, + { + "epoch": 0.5787624892363626, + "grad_norm": 1.376466989517212, + "learning_rate": 8.377205661921453e-06, + "loss": 1.4592, + "step": 10586 + }, + { + "epoch": 0.5788171616800842, + "grad_norm": 1.6124988794326782, + "learning_rate": 8.375402553643194e-06, + "loss": 1.6035, + "step": 10587 + }, + { + "epoch": 0.5788718341238057, + "grad_norm": 2.1662681102752686, + "learning_rate": 8.37359949961406e-06, + "loss": 1.3888, + "step": 10588 + }, + { + "epoch": 0.5789265065675273, + "grad_norm": 1.924889326095581, + "learning_rate": 8.371796499894259e-06, + "loss": 1.7134, + "step": 10589 + }, + { + "epoch": 0.5789811790112489, + "grad_norm": 1.6472434997558594, + "learning_rate": 8.369993554543987e-06, + "loss": 1.6283, + "step": 10590 + }, + { + "epoch": 0.5790358514549704, + "grad_norm": 1.7983827590942383, + "learning_rate": 8.368190663623458e-06, + "loss": 1.4953, + "step": 10591 + }, + { + "epoch": 0.579090523898692, + "grad_norm": 1.5660933256149292, + "learning_rate": 8.36638782719287e-06, + "loss": 1.3311, + "step": 10592 + }, + { + "epoch": 0.5791451963424135, + "grad_norm": 1.5771173238754272, + "learning_rate": 8.36458504531243e-06, + "loss": 1.3145, + "step": 10593 + }, + { + "epoch": 0.579199868786135, + "grad_norm": 1.8726747035980225, + "learning_rate": 8.36278231804233e-06, + "loss": 1.4871, + "step": 10594 + }, + { + "epoch": 0.5792545412298566, + "grad_norm": 1.36983060836792, + "learning_rate": 8.360979645442771e-06, + "loss": 1.4734, + "step": 10595 + }, + { + "epoch": 0.5793092136735781, + "grad_norm": 1.3641257286071777, + "learning_rate": 8.359177027573948e-06, + "loss": 1.4826, + "step": 10596 + }, + { + "epoch": 0.5793638861172997, + "grad_norm": 1.640260934829712, + "learning_rate": 8.357374464496056e-06, + "loss": 1.3177, + "step": 10597 + }, + { + "epoch": 0.5794185585610213, + "grad_norm": 1.4471399784088135, + "learning_rate": 8.355571956269278e-06, + "loss": 1.4732, + "step": 10598 + }, + { + "epoch": 0.5794732310047428, + "grad_norm": 1.2725626230239868, + "learning_rate": 8.353769502953818e-06, + "loss": 1.5034, + "step": 10599 + }, + { + "epoch": 0.5795279034484644, + "grad_norm": 1.4720672369003296, + "learning_rate": 8.351967104609857e-06, + "loss": 1.5199, + "step": 10600 + }, + { + "epoch": 0.579582575892186, + "grad_norm": 1.337529182434082, + "learning_rate": 8.350164761297577e-06, + "loss": 1.3448, + "step": 10601 + }, + { + "epoch": 0.5796372483359075, + "grad_norm": 1.549284815788269, + "learning_rate": 8.34836247307717e-06, + "loss": 1.5743, + "step": 10602 + }, + { + "epoch": 0.579691920779629, + "grad_norm": 1.4924795627593994, + "learning_rate": 8.346560240008818e-06, + "loss": 1.3535, + "step": 10603 + }, + { + "epoch": 0.5797465932233506, + "grad_norm": 1.596197485923767, + "learning_rate": 8.344758062152696e-06, + "loss": 1.4239, + "step": 10604 + }, + { + "epoch": 0.5798012656670721, + "grad_norm": 1.5057107210159302, + "learning_rate": 8.34295593956899e-06, + "loss": 1.5289, + "step": 10605 + }, + { + "epoch": 0.5798559381107937, + "grad_norm": 1.6550840139389038, + "learning_rate": 8.341153872317867e-06, + "loss": 1.6183, + "step": 10606 + }, + { + "epoch": 0.5799106105545153, + "grad_norm": 1.4045888185501099, + "learning_rate": 8.339351860459515e-06, + "loss": 1.5906, + "step": 10607 + }, + { + "epoch": 0.5799652829982368, + "grad_norm": 1.817040205001831, + "learning_rate": 8.3375499040541e-06, + "loss": 1.6116, + "step": 10608 + }, + { + "epoch": 0.5800199554419584, + "grad_norm": 1.3709490299224854, + "learning_rate": 8.335748003161793e-06, + "loss": 1.3993, + "step": 10609 + }, + { + "epoch": 0.5800746278856799, + "grad_norm": 1.8402073383331299, + "learning_rate": 8.333946157842768e-06, + "loss": 1.3797, + "step": 10610 + }, + { + "epoch": 0.5801293003294015, + "grad_norm": 1.6130218505859375, + "learning_rate": 8.332144368157192e-06, + "loss": 1.5949, + "step": 10611 + }, + { + "epoch": 0.5801839727731231, + "grad_norm": 1.407884120941162, + "learning_rate": 8.330342634165221e-06, + "loss": 1.3861, + "step": 10612 + }, + { + "epoch": 0.5802386452168445, + "grad_norm": 1.6030973196029663, + "learning_rate": 8.328540955927035e-06, + "loss": 1.3657, + "step": 10613 + }, + { + "epoch": 0.5802933176605661, + "grad_norm": 1.7967342138290405, + "learning_rate": 8.326739333502787e-06, + "loss": 1.6052, + "step": 10614 + }, + { + "epoch": 0.5803479901042877, + "grad_norm": 1.2897554636001587, + "learning_rate": 8.324937766952638e-06, + "loss": 1.8373, + "step": 10615 + }, + { + "epoch": 0.5804026625480092, + "grad_norm": 1.4459148645401, + "learning_rate": 8.323136256336747e-06, + "loss": 1.3332, + "step": 10616 + }, + { + "epoch": 0.5804573349917308, + "grad_norm": 1.4282467365264893, + "learning_rate": 8.321334801715276e-06, + "loss": 1.3263, + "step": 10617 + }, + { + "epoch": 0.5805120074354524, + "grad_norm": 1.293965458869934, + "learning_rate": 8.319533403148368e-06, + "loss": 1.4473, + "step": 10618 + }, + { + "epoch": 0.5805666798791739, + "grad_norm": 1.5811318159103394, + "learning_rate": 8.317732060696186e-06, + "loss": 1.4663, + "step": 10619 + }, + { + "epoch": 0.5806213523228955, + "grad_norm": 2.0088512897491455, + "learning_rate": 8.315930774418881e-06, + "loss": 1.2601, + "step": 10620 + }, + { + "epoch": 0.5806760247666171, + "grad_norm": 1.0087339878082275, + "learning_rate": 8.314129544376593e-06, + "loss": 1.6617, + "step": 10621 + }, + { + "epoch": 0.5807306972103385, + "grad_norm": 1.2833144664764404, + "learning_rate": 8.31232837062948e-06, + "loss": 1.3915, + "step": 10622 + }, + { + "epoch": 0.5807853696540601, + "grad_norm": 1.97212553024292, + "learning_rate": 8.31052725323768e-06, + "loss": 1.4311, + "step": 10623 + }, + { + "epoch": 0.5808400420977816, + "grad_norm": 1.8369389772415161, + "learning_rate": 8.308726192261344e-06, + "loss": 1.2241, + "step": 10624 + }, + { + "epoch": 0.5808947145415032, + "grad_norm": 1.2058219909667969, + "learning_rate": 8.306925187760608e-06, + "loss": 1.347, + "step": 10625 + }, + { + "epoch": 0.5809493869852248, + "grad_norm": 1.4267992973327637, + "learning_rate": 8.305124239795609e-06, + "loss": 1.4506, + "step": 10626 + }, + { + "epoch": 0.5810040594289463, + "grad_norm": 1.8567514419555664, + "learning_rate": 8.303323348426493e-06, + "loss": 1.5263, + "step": 10627 + }, + { + "epoch": 0.5810587318726679, + "grad_norm": 2.0389277935028076, + "learning_rate": 8.301522513713392e-06, + "loss": 1.1751, + "step": 10628 + }, + { + "epoch": 0.5811134043163895, + "grad_norm": 1.6436055898666382, + "learning_rate": 8.299721735716437e-06, + "loss": 1.5925, + "step": 10629 + }, + { + "epoch": 0.581168076760111, + "grad_norm": 1.8670129776000977, + "learning_rate": 8.297921014495764e-06, + "loss": 1.6049, + "step": 10630 + }, + { + "epoch": 0.5812227492038325, + "grad_norm": 1.9201843738555908, + "learning_rate": 8.296120350111504e-06, + "loss": 1.6087, + "step": 10631 + }, + { + "epoch": 0.5812774216475541, + "grad_norm": 1.7870179414749146, + "learning_rate": 8.29431974262378e-06, + "loss": 1.6725, + "step": 10632 + }, + { + "epoch": 0.5813320940912756, + "grad_norm": 1.9885388612747192, + "learning_rate": 8.292519192092725e-06, + "loss": 1.5568, + "step": 10633 + }, + { + "epoch": 0.5813867665349972, + "grad_norm": 1.3713691234588623, + "learning_rate": 8.29071869857846e-06, + "loss": 1.5531, + "step": 10634 + }, + { + "epoch": 0.5814414389787188, + "grad_norm": 1.3192039728164673, + "learning_rate": 8.28891826214111e-06, + "loss": 1.5743, + "step": 10635 + }, + { + "epoch": 0.5814961114224403, + "grad_norm": 1.4304187297821045, + "learning_rate": 8.287117882840795e-06, + "loss": 1.4726, + "step": 10636 + }, + { + "epoch": 0.5815507838661619, + "grad_norm": 1.4499926567077637, + "learning_rate": 8.285317560737629e-06, + "loss": 1.5969, + "step": 10637 + }, + { + "epoch": 0.5816054563098834, + "grad_norm": 1.820700764656067, + "learning_rate": 8.283517295891737e-06, + "loss": 1.2192, + "step": 10638 + }, + { + "epoch": 0.581660128753605, + "grad_norm": 1.5091296434402466, + "learning_rate": 8.28171708836323e-06, + "loss": 1.5627, + "step": 10639 + }, + { + "epoch": 0.5817148011973265, + "grad_norm": 1.5041414499282837, + "learning_rate": 8.279916938212218e-06, + "loss": 1.4502, + "step": 10640 + }, + { + "epoch": 0.581769473641048, + "grad_norm": 1.7373101711273193, + "learning_rate": 8.27811684549882e-06, + "loss": 1.5258, + "step": 10641 + }, + { + "epoch": 0.5818241460847696, + "grad_norm": 1.5108532905578613, + "learning_rate": 8.276316810283142e-06, + "loss": 1.7747, + "step": 10642 + }, + { + "epoch": 0.5818788185284912, + "grad_norm": 1.2328228950500488, + "learning_rate": 8.274516832625287e-06, + "loss": 1.3477, + "step": 10643 + }, + { + "epoch": 0.5819334909722127, + "grad_norm": 1.9249846935272217, + "learning_rate": 8.272716912585366e-06, + "loss": 1.2985, + "step": 10644 + }, + { + "epoch": 0.5819881634159343, + "grad_norm": 1.7566559314727783, + "learning_rate": 8.270917050223481e-06, + "loss": 1.3887, + "step": 10645 + }, + { + "epoch": 0.5820428358596559, + "grad_norm": 1.4497734308242798, + "learning_rate": 8.269117245599729e-06, + "loss": 1.4927, + "step": 10646 + }, + { + "epoch": 0.5820975083033774, + "grad_norm": 1.4615018367767334, + "learning_rate": 8.267317498774217e-06, + "loss": 1.3546, + "step": 10647 + }, + { + "epoch": 0.582152180747099, + "grad_norm": 1.4154471158981323, + "learning_rate": 8.26551780980704e-06, + "loss": 1.5338, + "step": 10648 + }, + { + "epoch": 0.5822068531908206, + "grad_norm": 1.4583016633987427, + "learning_rate": 8.263718178758292e-06, + "loss": 1.6818, + "step": 10649 + }, + { + "epoch": 0.582261525634542, + "grad_norm": 1.4813966751098633, + "learning_rate": 8.26191860568807e-06, + "loss": 1.4041, + "step": 10650 + }, + { + "epoch": 0.5823161980782636, + "grad_norm": 1.7735319137573242, + "learning_rate": 8.26011909065646e-06, + "loss": 1.4146, + "step": 10651 + }, + { + "epoch": 0.5823708705219851, + "grad_norm": 1.3216761350631714, + "learning_rate": 8.258319633723562e-06, + "loss": 1.5618, + "step": 10652 + }, + { + "epoch": 0.5824255429657067, + "grad_norm": 2.026484489440918, + "learning_rate": 8.256520234949456e-06, + "loss": 1.4374, + "step": 10653 + }, + { + "epoch": 0.5824802154094283, + "grad_norm": 1.669497013092041, + "learning_rate": 8.254720894394231e-06, + "loss": 1.2731, + "step": 10654 + }, + { + "epoch": 0.5825348878531498, + "grad_norm": 1.900208830833435, + "learning_rate": 8.25292161211797e-06, + "loss": 1.302, + "step": 10655 + }, + { + "epoch": 0.5825895602968714, + "grad_norm": 1.364593744277954, + "learning_rate": 8.251122388180758e-06, + "loss": 1.4285, + "step": 10656 + }, + { + "epoch": 0.582644232740593, + "grad_norm": 1.5694220066070557, + "learning_rate": 8.249323222642668e-06, + "loss": 1.5522, + "step": 10657 + }, + { + "epoch": 0.5826989051843144, + "grad_norm": 1.5989662408828735, + "learning_rate": 8.247524115563789e-06, + "loss": 1.5051, + "step": 10658 + }, + { + "epoch": 0.582753577628036, + "grad_norm": 1.8611423969268799, + "learning_rate": 8.24572506700419e-06, + "loss": 1.5495, + "step": 10659 + }, + { + "epoch": 0.5828082500717576, + "grad_norm": 1.2291251420974731, + "learning_rate": 8.243926077023945e-06, + "loss": 1.6578, + "step": 10660 + }, + { + "epoch": 0.5828629225154791, + "grad_norm": 1.7048126459121704, + "learning_rate": 8.242127145683134e-06, + "loss": 1.3808, + "step": 10661 + }, + { + "epoch": 0.5829175949592007, + "grad_norm": 1.609028935432434, + "learning_rate": 8.240328273041822e-06, + "loss": 1.2963, + "step": 10662 + }, + { + "epoch": 0.5829722674029223, + "grad_norm": 1.5912046432495117, + "learning_rate": 8.238529459160076e-06, + "loss": 1.0556, + "step": 10663 + }, + { + "epoch": 0.5830269398466438, + "grad_norm": 1.6023423671722412, + "learning_rate": 8.236730704097966e-06, + "loss": 1.5012, + "step": 10664 + }, + { + "epoch": 0.5830816122903654, + "grad_norm": 2.1311213970184326, + "learning_rate": 8.234932007915552e-06, + "loss": 1.4016, + "step": 10665 + }, + { + "epoch": 0.5831362847340869, + "grad_norm": 2.386735677719116, + "learning_rate": 8.233133370672905e-06, + "loss": 1.0215, + "step": 10666 + }, + { + "epoch": 0.5831909571778084, + "grad_norm": 1.3847736120224, + "learning_rate": 8.23133479243008e-06, + "loss": 1.3103, + "step": 10667 + }, + { + "epoch": 0.58324562962153, + "grad_norm": 2.038565158843994, + "learning_rate": 8.229536273247133e-06, + "loss": 1.5273, + "step": 10668 + }, + { + "epoch": 0.5833003020652515, + "grad_norm": 1.5521160364151, + "learning_rate": 8.227737813184129e-06, + "loss": 1.4917, + "step": 10669 + }, + { + "epoch": 0.5833549745089731, + "grad_norm": 1.3256607055664062, + "learning_rate": 8.225939412301117e-06, + "loss": 1.5939, + "step": 10670 + }, + { + "epoch": 0.5834096469526947, + "grad_norm": 1.8963589668273926, + "learning_rate": 8.224141070658147e-06, + "loss": 1.3011, + "step": 10671 + }, + { + "epoch": 0.5834643193964162, + "grad_norm": 1.3745269775390625, + "learning_rate": 8.222342788315277e-06, + "loss": 1.398, + "step": 10672 + }, + { + "epoch": 0.5835189918401378, + "grad_norm": 1.493634581565857, + "learning_rate": 8.220544565332555e-06, + "loss": 1.6028, + "step": 10673 + }, + { + "epoch": 0.5835736642838594, + "grad_norm": 1.7805545330047607, + "learning_rate": 8.218746401770021e-06, + "loss": 1.1394, + "step": 10674 + }, + { + "epoch": 0.5836283367275809, + "grad_norm": 1.528757929801941, + "learning_rate": 8.216948297687727e-06, + "loss": 1.4302, + "step": 10675 + }, + { + "epoch": 0.5836830091713024, + "grad_norm": 1.283892035484314, + "learning_rate": 8.215150253145715e-06, + "loss": 1.2359, + "step": 10676 + }, + { + "epoch": 0.583737681615024, + "grad_norm": 1.5463379621505737, + "learning_rate": 8.21335226820402e-06, + "loss": 1.484, + "step": 10677 + }, + { + "epoch": 0.5837923540587455, + "grad_norm": 1.5996712446212769, + "learning_rate": 8.211554342922688e-06, + "loss": 1.4817, + "step": 10678 + }, + { + "epoch": 0.5838470265024671, + "grad_norm": 1.4584110975265503, + "learning_rate": 8.20975647736175e-06, + "loss": 1.3611, + "step": 10679 + }, + { + "epoch": 0.5839016989461886, + "grad_norm": 1.1964526176452637, + "learning_rate": 8.207958671581248e-06, + "loss": 1.3448, + "step": 10680 + }, + { + "epoch": 0.5839563713899102, + "grad_norm": 1.6838490962982178, + "learning_rate": 8.206160925641211e-06, + "loss": 1.4725, + "step": 10681 + }, + { + "epoch": 0.5840110438336318, + "grad_norm": 1.268384337425232, + "learning_rate": 8.204363239601668e-06, + "loss": 1.2792, + "step": 10682 + }, + { + "epoch": 0.5840657162773533, + "grad_norm": 1.8891148567199707, + "learning_rate": 8.202565613522653e-06, + "loss": 1.2866, + "step": 10683 + }, + { + "epoch": 0.5841203887210749, + "grad_norm": 1.7674241065979004, + "learning_rate": 8.20076804746419e-06, + "loss": 1.4374, + "step": 10684 + }, + { + "epoch": 0.5841750611647964, + "grad_norm": 1.4214847087860107, + "learning_rate": 8.198970541486298e-06, + "loss": 1.5821, + "step": 10685 + }, + { + "epoch": 0.5842297336085179, + "grad_norm": 1.699438214302063, + "learning_rate": 8.197173095649011e-06, + "loss": 1.4416, + "step": 10686 + }, + { + "epoch": 0.5842844060522395, + "grad_norm": 1.8865227699279785, + "learning_rate": 8.195375710012345e-06, + "loss": 1.5163, + "step": 10687 + }, + { + "epoch": 0.5843390784959611, + "grad_norm": 1.467496395111084, + "learning_rate": 8.193578384636317e-06, + "loss": 1.7086, + "step": 10688 + }, + { + "epoch": 0.5843937509396826, + "grad_norm": 1.491985559463501, + "learning_rate": 8.191781119580947e-06, + "loss": 1.4794, + "step": 10689 + }, + { + "epoch": 0.5844484233834042, + "grad_norm": 2.0781030654907227, + "learning_rate": 8.18998391490625e-06, + "loss": 1.5175, + "step": 10690 + }, + { + "epoch": 0.5845030958271258, + "grad_norm": 1.758070468902588, + "learning_rate": 8.188186770672231e-06, + "loss": 1.4898, + "step": 10691 + }, + { + "epoch": 0.5845577682708473, + "grad_norm": 1.513717532157898, + "learning_rate": 8.18638968693891e-06, + "loss": 1.3951, + "step": 10692 + }, + { + "epoch": 0.5846124407145689, + "grad_norm": 1.205133080482483, + "learning_rate": 8.184592663766296e-06, + "loss": 1.5569, + "step": 10693 + }, + { + "epoch": 0.5846671131582903, + "grad_norm": 1.6801464557647705, + "learning_rate": 8.182795701214393e-06, + "loss": 1.6251, + "step": 10694 + }, + { + "epoch": 0.5847217856020119, + "grad_norm": 1.5240956544876099, + "learning_rate": 8.180998799343203e-06, + "loss": 1.426, + "step": 10695 + }, + { + "epoch": 0.5847764580457335, + "grad_norm": 1.7429919242858887, + "learning_rate": 8.17920195821273e-06, + "loss": 1.2325, + "step": 10696 + }, + { + "epoch": 0.584831130489455, + "grad_norm": 1.9210654497146606, + "learning_rate": 8.17740517788298e-06, + "loss": 1.2983, + "step": 10697 + }, + { + "epoch": 0.5848858029331766, + "grad_norm": 2.0466196537017822, + "learning_rate": 8.175608458413948e-06, + "loss": 1.6972, + "step": 10698 + }, + { + "epoch": 0.5849404753768982, + "grad_norm": 1.4377707242965698, + "learning_rate": 8.173811799865628e-06, + "loss": 1.3506, + "step": 10699 + }, + { + "epoch": 0.5849951478206197, + "grad_norm": 1.3193137645721436, + "learning_rate": 8.172015202298019e-06, + "loss": 1.5381, + "step": 10700 + }, + { + "epoch": 0.5850498202643413, + "grad_norm": 1.3633323907852173, + "learning_rate": 8.170218665771113e-06, + "loss": 1.3989, + "step": 10701 + }, + { + "epoch": 0.5851044927080629, + "grad_norm": 1.4788306951522827, + "learning_rate": 8.168422190344896e-06, + "loss": 1.3591, + "step": 10702 + }, + { + "epoch": 0.5851591651517843, + "grad_norm": 1.8508445024490356, + "learning_rate": 8.166625776079365e-06, + "loss": 1.3971, + "step": 10703 + }, + { + "epoch": 0.5852138375955059, + "grad_norm": 1.719185471534729, + "learning_rate": 8.1648294230345e-06, + "loss": 1.5955, + "step": 10704 + }, + { + "epoch": 0.5852685100392275, + "grad_norm": 1.6580291986465454, + "learning_rate": 8.163033131270281e-06, + "loss": 1.3514, + "step": 10705 + }, + { + "epoch": 0.585323182482949, + "grad_norm": 1.4491515159606934, + "learning_rate": 8.161236900846703e-06, + "loss": 1.175, + "step": 10706 + }, + { + "epoch": 0.5853778549266706, + "grad_norm": 1.8842741250991821, + "learning_rate": 8.159440731823735e-06, + "loss": 1.2975, + "step": 10707 + }, + { + "epoch": 0.5854325273703921, + "grad_norm": 1.5880357027053833, + "learning_rate": 8.157644624261364e-06, + "loss": 1.4714, + "step": 10708 + }, + { + "epoch": 0.5854871998141137, + "grad_norm": 1.7374727725982666, + "learning_rate": 8.155848578219563e-06, + "loss": 1.3818, + "step": 10709 + }, + { + "epoch": 0.5855418722578353, + "grad_norm": 1.4369553327560425, + "learning_rate": 8.1540525937583e-06, + "loss": 1.629, + "step": 10710 + }, + { + "epoch": 0.5855965447015568, + "grad_norm": 1.7829209566116333, + "learning_rate": 8.152256670937557e-06, + "loss": 1.6006, + "step": 10711 + }, + { + "epoch": 0.5856512171452783, + "grad_norm": 1.6468356847763062, + "learning_rate": 8.1504608098173e-06, + "loss": 1.5324, + "step": 10712 + }, + { + "epoch": 0.5857058895889999, + "grad_norm": 1.7031259536743164, + "learning_rate": 8.148665010457492e-06, + "loss": 1.4475, + "step": 10713 + }, + { + "epoch": 0.5857605620327214, + "grad_norm": 1.5058112144470215, + "learning_rate": 8.146869272918109e-06, + "loss": 1.5955, + "step": 10714 + }, + { + "epoch": 0.585815234476443, + "grad_norm": 1.5126396417617798, + "learning_rate": 8.145073597259108e-06, + "loss": 1.0889, + "step": 10715 + }, + { + "epoch": 0.5858699069201646, + "grad_norm": 2.096543788909912, + "learning_rate": 8.14327798354045e-06, + "loss": 1.4555, + "step": 10716 + }, + { + "epoch": 0.5859245793638861, + "grad_norm": 1.3854223489761353, + "learning_rate": 8.141482431822098e-06, + "loss": 1.5786, + "step": 10717 + }, + { + "epoch": 0.5859792518076077, + "grad_norm": 1.5739974975585938, + "learning_rate": 8.13968694216401e-06, + "loss": 1.507, + "step": 10718 + }, + { + "epoch": 0.5860339242513293, + "grad_norm": 1.3183354139328003, + "learning_rate": 8.137891514626137e-06, + "loss": 1.5398, + "step": 10719 + }, + { + "epoch": 0.5860885966950508, + "grad_norm": 1.1912413835525513, + "learning_rate": 8.13609614926844e-06, + "loss": 1.5644, + "step": 10720 + }, + { + "epoch": 0.5861432691387723, + "grad_norm": 1.978305697441101, + "learning_rate": 8.134300846150862e-06, + "loss": 1.1129, + "step": 10721 + }, + { + "epoch": 0.5861979415824938, + "grad_norm": 1.850679874420166, + "learning_rate": 8.132505605333362e-06, + "loss": 1.5182, + "step": 10722 + }, + { + "epoch": 0.5862526140262154, + "grad_norm": 1.790300726890564, + "learning_rate": 8.130710426875881e-06, + "loss": 1.4194, + "step": 10723 + }, + { + "epoch": 0.586307286469937, + "grad_norm": 1.635786533355713, + "learning_rate": 8.12891531083836e-06, + "loss": 1.4831, + "step": 10724 + }, + { + "epoch": 0.5863619589136585, + "grad_norm": 1.913500189781189, + "learning_rate": 8.127120257280752e-06, + "loss": 1.2513, + "step": 10725 + }, + { + "epoch": 0.5864166313573801, + "grad_norm": 1.5960781574249268, + "learning_rate": 8.125325266262994e-06, + "loss": 1.6753, + "step": 10726 + }, + { + "epoch": 0.5864713038011017, + "grad_norm": 2.072129249572754, + "learning_rate": 8.123530337845022e-06, + "loss": 1.3102, + "step": 10727 + }, + { + "epoch": 0.5865259762448232, + "grad_norm": 1.5498085021972656, + "learning_rate": 8.121735472086777e-06, + "loss": 1.3285, + "step": 10728 + }, + { + "epoch": 0.5865806486885448, + "grad_norm": 1.4342972040176392, + "learning_rate": 8.119940669048194e-06, + "loss": 1.3687, + "step": 10729 + }, + { + "epoch": 0.5866353211322664, + "grad_norm": 1.6818891763687134, + "learning_rate": 8.118145928789198e-06, + "loss": 1.4499, + "step": 10730 + }, + { + "epoch": 0.5866899935759878, + "grad_norm": 1.3917793035507202, + "learning_rate": 8.11635125136973e-06, + "loss": 1.5124, + "step": 10731 + }, + { + "epoch": 0.5867446660197094, + "grad_norm": 1.3129240274429321, + "learning_rate": 8.114556636849714e-06, + "loss": 1.5256, + "step": 10732 + }, + { + "epoch": 0.586799338463431, + "grad_norm": 1.5599833726882935, + "learning_rate": 8.112762085289073e-06, + "loss": 1.4791, + "step": 10733 + }, + { + "epoch": 0.5868540109071525, + "grad_norm": 1.4266204833984375, + "learning_rate": 8.110967596747738e-06, + "loss": 1.5061, + "step": 10734 + }, + { + "epoch": 0.5869086833508741, + "grad_norm": 1.4664324522018433, + "learning_rate": 8.109173171285623e-06, + "loss": 1.4802, + "step": 10735 + }, + { + "epoch": 0.5869633557945956, + "grad_norm": 1.3609063625335693, + "learning_rate": 8.107378808962656e-06, + "loss": 1.5162, + "step": 10736 + }, + { + "epoch": 0.5870180282383172, + "grad_norm": 2.0314011573791504, + "learning_rate": 8.105584509838754e-06, + "loss": 1.3611, + "step": 10737 + }, + { + "epoch": 0.5870727006820388, + "grad_norm": 1.2195240259170532, + "learning_rate": 8.103790273973824e-06, + "loss": 1.7115, + "step": 10738 + }, + { + "epoch": 0.5871273731257602, + "grad_norm": 1.5199434757232666, + "learning_rate": 8.101996101427791e-06, + "loss": 1.7217, + "step": 10739 + }, + { + "epoch": 0.5871820455694818, + "grad_norm": 1.3844233751296997, + "learning_rate": 8.100201992260563e-06, + "loss": 1.5112, + "step": 10740 + }, + { + "epoch": 0.5872367180132034, + "grad_norm": 1.2423999309539795, + "learning_rate": 8.098407946532045e-06, + "loss": 1.4473, + "step": 10741 + }, + { + "epoch": 0.5872913904569249, + "grad_norm": 1.5008318424224854, + "learning_rate": 8.096613964302152e-06, + "loss": 1.6205, + "step": 10742 + }, + { + "epoch": 0.5873460629006465, + "grad_norm": 1.8124198913574219, + "learning_rate": 8.094820045630783e-06, + "loss": 1.4091, + "step": 10743 + }, + { + "epoch": 0.5874007353443681, + "grad_norm": 1.4926228523254395, + "learning_rate": 8.093026190577839e-06, + "loss": 1.5351, + "step": 10744 + }, + { + "epoch": 0.5874554077880896, + "grad_norm": 1.4769986867904663, + "learning_rate": 8.091232399203232e-06, + "loss": 1.5073, + "step": 10745 + }, + { + "epoch": 0.5875100802318112, + "grad_norm": 1.5203900337219238, + "learning_rate": 8.089438671566853e-06, + "loss": 1.49, + "step": 10746 + }, + { + "epoch": 0.5875647526755328, + "grad_norm": 1.5054008960723877, + "learning_rate": 8.087645007728598e-06, + "loss": 1.2957, + "step": 10747 + }, + { + "epoch": 0.5876194251192542, + "grad_norm": 1.6095515489578247, + "learning_rate": 8.085851407748365e-06, + "loss": 1.2942, + "step": 10748 + }, + { + "epoch": 0.5876740975629758, + "grad_norm": 1.2878069877624512, + "learning_rate": 8.084057871686041e-06, + "loss": 1.3684, + "step": 10749 + }, + { + "epoch": 0.5877287700066973, + "grad_norm": 1.9863052368164062, + "learning_rate": 8.082264399601527e-06, + "loss": 1.1946, + "step": 10750 + }, + { + "epoch": 0.5877834424504189, + "grad_norm": 1.8561439514160156, + "learning_rate": 8.080470991554703e-06, + "loss": 1.3602, + "step": 10751 + }, + { + "epoch": 0.5878381148941405, + "grad_norm": 1.3590073585510254, + "learning_rate": 8.078677647605455e-06, + "loss": 1.3531, + "step": 10752 + }, + { + "epoch": 0.587892787337862, + "grad_norm": 1.5044424533843994, + "learning_rate": 8.076884367813671e-06, + "loss": 1.3669, + "step": 10753 + }, + { + "epoch": 0.5879474597815836, + "grad_norm": 1.5428136587142944, + "learning_rate": 8.075091152239231e-06, + "loss": 1.453, + "step": 10754 + }, + { + "epoch": 0.5880021322253052, + "grad_norm": 1.483672022819519, + "learning_rate": 8.07329800094201e-06, + "loss": 1.383, + "step": 10755 + }, + { + "epoch": 0.5880568046690267, + "grad_norm": 1.7107877731323242, + "learning_rate": 8.071504913981894e-06, + "loss": 1.5876, + "step": 10756 + }, + { + "epoch": 0.5881114771127482, + "grad_norm": 1.845550298690796, + "learning_rate": 8.069711891418753e-06, + "loss": 1.5878, + "step": 10757 + }, + { + "epoch": 0.5881661495564698, + "grad_norm": 1.2802441120147705, + "learning_rate": 8.067918933312459e-06, + "loss": 1.5851, + "step": 10758 + }, + { + "epoch": 0.5882208220001913, + "grad_norm": 1.44312584400177, + "learning_rate": 8.066126039722889e-06, + "loss": 1.3414, + "step": 10759 + }, + { + "epoch": 0.5882754944439129, + "grad_norm": 1.561798095703125, + "learning_rate": 8.064333210709908e-06, + "loss": 1.5554, + "step": 10760 + }, + { + "epoch": 0.5883301668876345, + "grad_norm": 1.6225910186767578, + "learning_rate": 8.062540446333384e-06, + "loss": 1.4703, + "step": 10761 + }, + { + "epoch": 0.588384839331356, + "grad_norm": 1.7948147058486938, + "learning_rate": 8.060747746653181e-06, + "loss": 1.5136, + "step": 10762 + }, + { + "epoch": 0.5884395117750776, + "grad_norm": 1.2972619533538818, + "learning_rate": 8.058955111729157e-06, + "loss": 1.4481, + "step": 10763 + }, + { + "epoch": 0.5884941842187991, + "grad_norm": 1.7315768003463745, + "learning_rate": 8.05716254162118e-06, + "loss": 1.341, + "step": 10764 + }, + { + "epoch": 0.5885488566625207, + "grad_norm": 1.334397792816162, + "learning_rate": 8.055370036389105e-06, + "loss": 1.3552, + "step": 10765 + }, + { + "epoch": 0.5886035291062423, + "grad_norm": 1.379871129989624, + "learning_rate": 8.053577596092788e-06, + "loss": 1.4889, + "step": 10766 + }, + { + "epoch": 0.5886582015499637, + "grad_norm": 1.5527758598327637, + "learning_rate": 8.051785220792082e-06, + "loss": 1.4275, + "step": 10767 + }, + { + "epoch": 0.5887128739936853, + "grad_norm": 1.4853343963623047, + "learning_rate": 8.04999291054684e-06, + "loss": 1.3566, + "step": 10768 + }, + { + "epoch": 0.5887675464374069, + "grad_norm": 1.4030039310455322, + "learning_rate": 8.048200665416907e-06, + "loss": 1.3602, + "step": 10769 + }, + { + "epoch": 0.5888222188811284, + "grad_norm": 1.6006083488464355, + "learning_rate": 8.046408485462138e-06, + "loss": 1.3624, + "step": 10770 + }, + { + "epoch": 0.58887689132485, + "grad_norm": 1.6444042921066284, + "learning_rate": 8.044616370742372e-06, + "loss": 1.4893, + "step": 10771 + }, + { + "epoch": 0.5889315637685716, + "grad_norm": 1.4621964693069458, + "learning_rate": 8.042824321317453e-06, + "loss": 1.6039, + "step": 10772 + }, + { + "epoch": 0.5889862362122931, + "grad_norm": 1.4295940399169922, + "learning_rate": 8.041032337247226e-06, + "loss": 1.3621, + "step": 10773 + }, + { + "epoch": 0.5890409086560147, + "grad_norm": 1.684395432472229, + "learning_rate": 8.039240418591525e-06, + "loss": 1.4074, + "step": 10774 + }, + { + "epoch": 0.5890955810997363, + "grad_norm": 1.4636069536209106, + "learning_rate": 8.037448565410183e-06, + "loss": 1.3266, + "step": 10775 + }, + { + "epoch": 0.5891502535434577, + "grad_norm": 1.5680545568466187, + "learning_rate": 8.035656777763041e-06, + "loss": 1.4083, + "step": 10776 + }, + { + "epoch": 0.5892049259871793, + "grad_norm": 1.394339919090271, + "learning_rate": 8.033865055709928e-06, + "loss": 1.4037, + "step": 10777 + }, + { + "epoch": 0.5892595984309008, + "grad_norm": 1.927581548690796, + "learning_rate": 8.032073399310678e-06, + "loss": 1.2586, + "step": 10778 + }, + { + "epoch": 0.5893142708746224, + "grad_norm": 2.000762462615967, + "learning_rate": 8.030281808625114e-06, + "loss": 1.3302, + "step": 10779 + }, + { + "epoch": 0.589368943318344, + "grad_norm": 1.7970069646835327, + "learning_rate": 8.02849028371306e-06, + "loss": 1.4067, + "step": 10780 + }, + { + "epoch": 0.5894236157620655, + "grad_norm": 1.4569507837295532, + "learning_rate": 8.026698824634344e-06, + "loss": 1.3365, + "step": 10781 + }, + { + "epoch": 0.5894782882057871, + "grad_norm": 1.4401856660842896, + "learning_rate": 8.024907431448786e-06, + "loss": 1.6732, + "step": 10782 + }, + { + "epoch": 0.5895329606495087, + "grad_norm": 1.5848900079727173, + "learning_rate": 8.023116104216198e-06, + "loss": 1.4369, + "step": 10783 + }, + { + "epoch": 0.5895876330932301, + "grad_norm": 1.6136201620101929, + "learning_rate": 8.021324842996405e-06, + "loss": 1.5588, + "step": 10784 + }, + { + "epoch": 0.5896423055369517, + "grad_norm": 1.4841818809509277, + "learning_rate": 8.019533647849221e-06, + "loss": 1.3618, + "step": 10785 + }, + { + "epoch": 0.5896969779806733, + "grad_norm": 1.4632118940353394, + "learning_rate": 8.017742518834454e-06, + "loss": 1.3948, + "step": 10786 + }, + { + "epoch": 0.5897516504243948, + "grad_norm": 1.4988726377487183, + "learning_rate": 8.015951456011917e-06, + "loss": 1.3058, + "step": 10787 + }, + { + "epoch": 0.5898063228681164, + "grad_norm": 1.587589979171753, + "learning_rate": 8.014160459441417e-06, + "loss": 1.3095, + "step": 10788 + }, + { + "epoch": 0.589860995311838, + "grad_norm": 1.9007751941680908, + "learning_rate": 8.012369529182755e-06, + "loss": 1.4833, + "step": 10789 + }, + { + "epoch": 0.5899156677555595, + "grad_norm": 1.9925010204315186, + "learning_rate": 8.010578665295742e-06, + "loss": 1.3846, + "step": 10790 + }, + { + "epoch": 0.5899703401992811, + "grad_norm": 1.5226666927337646, + "learning_rate": 8.008787867840172e-06, + "loss": 1.521, + "step": 10791 + }, + { + "epoch": 0.5900250126430026, + "grad_norm": 1.7883142232894897, + "learning_rate": 8.006997136875854e-06, + "loss": 1.624, + "step": 10792 + }, + { + "epoch": 0.5900796850867241, + "grad_norm": 1.7366045713424683, + "learning_rate": 8.005206472462576e-06, + "loss": 1.1657, + "step": 10793 + }, + { + "epoch": 0.5901343575304457, + "grad_norm": 1.6970256567001343, + "learning_rate": 8.00341587466013e-06, + "loss": 1.5437, + "step": 10794 + }, + { + "epoch": 0.5901890299741672, + "grad_norm": 1.489329218864441, + "learning_rate": 8.001625343528318e-06, + "loss": 1.4749, + "step": 10795 + }, + { + "epoch": 0.5902437024178888, + "grad_norm": 1.3334745168685913, + "learning_rate": 7.999834879126925e-06, + "loss": 1.6192, + "step": 10796 + }, + { + "epoch": 0.5902983748616104, + "grad_norm": 1.609399676322937, + "learning_rate": 7.998044481515736e-06, + "loss": 1.1356, + "step": 10797 + }, + { + "epoch": 0.5903530473053319, + "grad_norm": 1.6676679849624634, + "learning_rate": 7.996254150754544e-06, + "loss": 1.4378, + "step": 10798 + }, + { + "epoch": 0.5904077197490535, + "grad_norm": 1.903101921081543, + "learning_rate": 7.994463886903125e-06, + "loss": 1.4254, + "step": 10799 + }, + { + "epoch": 0.5904623921927751, + "grad_norm": 1.870683193206787, + "learning_rate": 7.99267369002126e-06, + "loss": 1.4865, + "step": 10800 + }, + { + "epoch": 0.5905170646364966, + "grad_norm": 1.6819185018539429, + "learning_rate": 7.990883560168736e-06, + "loss": 1.4735, + "step": 10801 + }, + { + "epoch": 0.5905717370802182, + "grad_norm": 1.6484551429748535, + "learning_rate": 7.989093497405323e-06, + "loss": 1.2546, + "step": 10802 + }, + { + "epoch": 0.5906264095239397, + "grad_norm": 1.4711103439331055, + "learning_rate": 7.987303501790794e-06, + "loss": 1.2191, + "step": 10803 + }, + { + "epoch": 0.5906810819676612, + "grad_norm": 1.5830950736999512, + "learning_rate": 7.985513573384927e-06, + "loss": 1.5973, + "step": 10804 + }, + { + "epoch": 0.5907357544113828, + "grad_norm": 1.3925831317901611, + "learning_rate": 7.983723712247487e-06, + "loss": 1.6451, + "step": 10805 + }, + { + "epoch": 0.5907904268551044, + "grad_norm": 1.400156021118164, + "learning_rate": 7.981933918438246e-06, + "loss": 1.5386, + "step": 10806 + }, + { + "epoch": 0.5908450992988259, + "grad_norm": 1.6422078609466553, + "learning_rate": 7.980144192016967e-06, + "loss": 1.4575, + "step": 10807 + }, + { + "epoch": 0.5908997717425475, + "grad_norm": 1.4706757068634033, + "learning_rate": 7.97835453304341e-06, + "loss": 1.1415, + "step": 10808 + }, + { + "epoch": 0.590954444186269, + "grad_norm": 1.3992599248886108, + "learning_rate": 7.97656494157734e-06, + "loss": 1.2223, + "step": 10809 + }, + { + "epoch": 0.5910091166299906, + "grad_norm": 1.5072317123413086, + "learning_rate": 7.974775417678518e-06, + "loss": 1.5496, + "step": 10810 + }, + { + "epoch": 0.5910637890737122, + "grad_norm": 1.6284531354904175, + "learning_rate": 7.972985961406693e-06, + "loss": 1.4003, + "step": 10811 + }, + { + "epoch": 0.5911184615174336, + "grad_norm": 1.5075427293777466, + "learning_rate": 7.971196572821628e-06, + "loss": 1.4769, + "step": 10812 + }, + { + "epoch": 0.5911731339611552, + "grad_norm": 1.9572232961654663, + "learning_rate": 7.969407251983069e-06, + "loss": 1.1934, + "step": 10813 + }, + { + "epoch": 0.5912278064048768, + "grad_norm": 1.078872799873352, + "learning_rate": 7.967617998950762e-06, + "loss": 1.6065, + "step": 10814 + }, + { + "epoch": 0.5912824788485983, + "grad_norm": 1.3403857946395874, + "learning_rate": 7.965828813784464e-06, + "loss": 1.4907, + "step": 10815 + }, + { + "epoch": 0.5913371512923199, + "grad_norm": 1.644282579421997, + "learning_rate": 7.964039696543914e-06, + "loss": 1.3166, + "step": 10816 + }, + { + "epoch": 0.5913918237360415, + "grad_norm": 1.8533525466918945, + "learning_rate": 7.962250647288855e-06, + "loss": 1.7649, + "step": 10817 + }, + { + "epoch": 0.591446496179763, + "grad_norm": 1.601078748703003, + "learning_rate": 7.96046166607903e-06, + "loss": 1.3784, + "step": 10818 + }, + { + "epoch": 0.5915011686234846, + "grad_norm": 1.6126552820205688, + "learning_rate": 7.958672752974173e-06, + "loss": 1.3385, + "step": 10819 + }, + { + "epoch": 0.5915558410672062, + "grad_norm": 1.9689072370529175, + "learning_rate": 7.956883908034026e-06, + "loss": 1.5302, + "step": 10820 + }, + { + "epoch": 0.5916105135109276, + "grad_norm": 1.2387028932571411, + "learning_rate": 7.955095131318319e-06, + "loss": 1.7428, + "step": 10821 + }, + { + "epoch": 0.5916651859546492, + "grad_norm": 1.32328462600708, + "learning_rate": 7.953306422886781e-06, + "loss": 1.4763, + "step": 10822 + }, + { + "epoch": 0.5917198583983707, + "grad_norm": 1.5718576908111572, + "learning_rate": 7.951517782799147e-06, + "loss": 1.4356, + "step": 10823 + }, + { + "epoch": 0.5917745308420923, + "grad_norm": 1.1593371629714966, + "learning_rate": 7.949729211115144e-06, + "loss": 1.6562, + "step": 10824 + }, + { + "epoch": 0.5918292032858139, + "grad_norm": 1.319743275642395, + "learning_rate": 7.947940707894489e-06, + "loss": 1.9155, + "step": 10825 + }, + { + "epoch": 0.5918838757295354, + "grad_norm": 1.7273070812225342, + "learning_rate": 7.946152273196912e-06, + "loss": 1.4043, + "step": 10826 + }, + { + "epoch": 0.591938548173257, + "grad_norm": 1.779268503189087, + "learning_rate": 7.94436390708213e-06, + "loss": 1.4442, + "step": 10827 + }, + { + "epoch": 0.5919932206169786, + "grad_norm": 1.544161081314087, + "learning_rate": 7.942575609609857e-06, + "loss": 1.5429, + "step": 10828 + }, + { + "epoch": 0.5920478930607, + "grad_norm": 1.5522984266281128, + "learning_rate": 7.940787380839818e-06, + "loss": 1.4699, + "step": 10829 + }, + { + "epoch": 0.5921025655044216, + "grad_norm": 1.5927002429962158, + "learning_rate": 7.938999220831718e-06, + "loss": 1.384, + "step": 10830 + }, + { + "epoch": 0.5921572379481432, + "grad_norm": 1.6847255229949951, + "learning_rate": 7.93721112964527e-06, + "loss": 1.7173, + "step": 10831 + }, + { + "epoch": 0.5922119103918647, + "grad_norm": 1.8168418407440186, + "learning_rate": 7.935423107340184e-06, + "loss": 1.3424, + "step": 10832 + }, + { + "epoch": 0.5922665828355863, + "grad_norm": 1.5157575607299805, + "learning_rate": 7.93363515397616e-06, + "loss": 1.3803, + "step": 10833 + }, + { + "epoch": 0.5923212552793079, + "grad_norm": 1.5303016901016235, + "learning_rate": 7.931847269612912e-06, + "loss": 1.3971, + "step": 10834 + }, + { + "epoch": 0.5923759277230294, + "grad_norm": 1.745447039604187, + "learning_rate": 7.930059454310138e-06, + "loss": 1.4509, + "step": 10835 + }, + { + "epoch": 0.592430600166751, + "grad_norm": 1.703621506690979, + "learning_rate": 7.928271708127532e-06, + "loss": 1.4349, + "step": 10836 + }, + { + "epoch": 0.5924852726104725, + "grad_norm": 1.166936993598938, + "learning_rate": 7.926484031124799e-06, + "loss": 1.8354, + "step": 10837 + }, + { + "epoch": 0.592539945054194, + "grad_norm": 1.2985107898712158, + "learning_rate": 7.924696423361629e-06, + "loss": 1.6153, + "step": 10838 + }, + { + "epoch": 0.5925946174979156, + "grad_norm": 1.7071471214294434, + "learning_rate": 7.92290888489771e-06, + "loss": 1.2079, + "step": 10839 + }, + { + "epoch": 0.5926492899416371, + "grad_norm": 1.7075939178466797, + "learning_rate": 7.921121415792743e-06, + "loss": 1.2964, + "step": 10840 + }, + { + "epoch": 0.5927039623853587, + "grad_norm": 1.7710031270980835, + "learning_rate": 7.91933401610641e-06, + "loss": 1.501, + "step": 10841 + }, + { + "epoch": 0.5927586348290803, + "grad_norm": 1.3807711601257324, + "learning_rate": 7.917546685898393e-06, + "loss": 1.3644, + "step": 10842 + }, + { + "epoch": 0.5928133072728018, + "grad_norm": 1.65874183177948, + "learning_rate": 7.915759425228382e-06, + "loss": 1.3312, + "step": 10843 + }, + { + "epoch": 0.5928679797165234, + "grad_norm": 1.223637342453003, + "learning_rate": 7.913972234156054e-06, + "loss": 1.6546, + "step": 10844 + }, + { + "epoch": 0.592922652160245, + "grad_norm": 1.659311294555664, + "learning_rate": 7.912185112741087e-06, + "loss": 1.5178, + "step": 10845 + }, + { + "epoch": 0.5929773246039665, + "grad_norm": 1.3020139932632446, + "learning_rate": 7.910398061043162e-06, + "loss": 1.4113, + "step": 10846 + }, + { + "epoch": 0.593031997047688, + "grad_norm": 1.2061678171157837, + "learning_rate": 7.908611079121941e-06, + "loss": 1.5549, + "step": 10847 + }, + { + "epoch": 0.5930866694914096, + "grad_norm": 1.3277738094329834, + "learning_rate": 7.906824167037112e-06, + "loss": 1.5308, + "step": 10848 + }, + { + "epoch": 0.5931413419351311, + "grad_norm": 1.805726170539856, + "learning_rate": 7.905037324848334e-06, + "loss": 1.545, + "step": 10849 + }, + { + "epoch": 0.5931960143788527, + "grad_norm": 1.4593572616577148, + "learning_rate": 7.903250552615273e-06, + "loss": 1.3592, + "step": 10850 + }, + { + "epoch": 0.5932506868225742, + "grad_norm": 1.7455999851226807, + "learning_rate": 7.901463850397599e-06, + "loss": 1.5108, + "step": 10851 + }, + { + "epoch": 0.5933053592662958, + "grad_norm": 1.187143325805664, + "learning_rate": 7.899677218254971e-06, + "loss": 1.5458, + "step": 10852 + }, + { + "epoch": 0.5933600317100174, + "grad_norm": 1.3502217531204224, + "learning_rate": 7.897890656247045e-06, + "loss": 1.4151, + "step": 10853 + }, + { + "epoch": 0.5934147041537389, + "grad_norm": 2.0211281776428223, + "learning_rate": 7.896104164433488e-06, + "loss": 1.1348, + "step": 10854 + }, + { + "epoch": 0.5934693765974605, + "grad_norm": 1.5145862102508545, + "learning_rate": 7.89431774287395e-06, + "loss": 1.5408, + "step": 10855 + }, + { + "epoch": 0.5935240490411821, + "grad_norm": 1.6441041231155396, + "learning_rate": 7.89253139162808e-06, + "loss": 1.6623, + "step": 10856 + }, + { + "epoch": 0.5935787214849035, + "grad_norm": 1.4570139646530151, + "learning_rate": 7.890745110755535e-06, + "loss": 1.4109, + "step": 10857 + }, + { + "epoch": 0.5936333939286251, + "grad_norm": 1.5527093410491943, + "learning_rate": 7.88895890031596e-06, + "loss": 1.5335, + "step": 10858 + }, + { + "epoch": 0.5936880663723467, + "grad_norm": 1.3415499925613403, + "learning_rate": 7.887172760368998e-06, + "loss": 1.3363, + "step": 10859 + }, + { + "epoch": 0.5937427388160682, + "grad_norm": 1.7371333837509155, + "learning_rate": 7.885386690974299e-06, + "loss": 1.3796, + "step": 10860 + }, + { + "epoch": 0.5937974112597898, + "grad_norm": 1.6778429746627808, + "learning_rate": 7.883600692191496e-06, + "loss": 1.5167, + "step": 10861 + }, + { + "epoch": 0.5938520837035114, + "grad_norm": 1.3616480827331543, + "learning_rate": 7.881814764080235e-06, + "loss": 1.4973, + "step": 10862 + }, + { + "epoch": 0.5939067561472329, + "grad_norm": 1.461184024810791, + "learning_rate": 7.880028906700153e-06, + "loss": 1.3914, + "step": 10863 + }, + { + "epoch": 0.5939614285909545, + "grad_norm": 1.193574070930481, + "learning_rate": 7.878243120110876e-06, + "loss": 1.8129, + "step": 10864 + }, + { + "epoch": 0.594016101034676, + "grad_norm": 1.3502825498580933, + "learning_rate": 7.876457404372042e-06, + "loss": 1.3301, + "step": 10865 + }, + { + "epoch": 0.5940707734783975, + "grad_norm": 1.2398067712783813, + "learning_rate": 7.874671759543278e-06, + "loss": 1.476, + "step": 10866 + }, + { + "epoch": 0.5941254459221191, + "grad_norm": 1.673651099205017, + "learning_rate": 7.872886185684207e-06, + "loss": 1.2123, + "step": 10867 + }, + { + "epoch": 0.5941801183658406, + "grad_norm": 1.4350945949554443, + "learning_rate": 7.871100682854465e-06, + "loss": 1.3012, + "step": 10868 + }, + { + "epoch": 0.5942347908095622, + "grad_norm": 1.3147310018539429, + "learning_rate": 7.869315251113663e-06, + "loss": 1.35, + "step": 10869 + }, + { + "epoch": 0.5942894632532838, + "grad_norm": 1.809916377067566, + "learning_rate": 7.867529890521424e-06, + "loss": 1.4489, + "step": 10870 + }, + { + "epoch": 0.5943441356970053, + "grad_norm": 1.411402940750122, + "learning_rate": 7.865744601137369e-06, + "loss": 1.4365, + "step": 10871 + }, + { + "epoch": 0.5943988081407269, + "grad_norm": 1.4199703931808472, + "learning_rate": 7.86395938302111e-06, + "loss": 1.5367, + "step": 10872 + }, + { + "epoch": 0.5944534805844485, + "grad_norm": 1.5711407661437988, + "learning_rate": 7.862174236232252e-06, + "loss": 1.1077, + "step": 10873 + }, + { + "epoch": 0.59450815302817, + "grad_norm": 1.3870561122894287, + "learning_rate": 7.86038916083042e-06, + "loss": 1.2919, + "step": 10874 + }, + { + "epoch": 0.5945628254718915, + "grad_norm": 1.2802752256393433, + "learning_rate": 7.858604156875212e-06, + "loss": 1.3758, + "step": 10875 + }, + { + "epoch": 0.5946174979156131, + "grad_norm": 1.7964448928833008, + "learning_rate": 7.856819224426239e-06, + "loss": 1.588, + "step": 10876 + }, + { + "epoch": 0.5946721703593346, + "grad_norm": 1.448065996170044, + "learning_rate": 7.8550343635431e-06, + "loss": 1.1853, + "step": 10877 + }, + { + "epoch": 0.5947268428030562, + "grad_norm": 1.4355754852294922, + "learning_rate": 7.853249574285393e-06, + "loss": 1.3291, + "step": 10878 + }, + { + "epoch": 0.5947815152467777, + "grad_norm": 1.4384658336639404, + "learning_rate": 7.851464856712725e-06, + "loss": 1.4784, + "step": 10879 + }, + { + "epoch": 0.5948361876904993, + "grad_norm": 1.2778652906417847, + "learning_rate": 7.849680210884687e-06, + "loss": 1.3556, + "step": 10880 + }, + { + "epoch": 0.5948908601342209, + "grad_norm": 1.3665447235107422, + "learning_rate": 7.847895636860867e-06, + "loss": 1.4154, + "step": 10881 + }, + { + "epoch": 0.5949455325779424, + "grad_norm": 2.62203311920166, + "learning_rate": 7.846111134700867e-06, + "loss": 1.3942, + "step": 10882 + }, + { + "epoch": 0.595000205021664, + "grad_norm": 1.4031325578689575, + "learning_rate": 7.844326704464271e-06, + "loss": 1.2628, + "step": 10883 + }, + { + "epoch": 0.5950548774653855, + "grad_norm": 1.6074002981185913, + "learning_rate": 7.842542346210663e-06, + "loss": 1.4347, + "step": 10884 + }, + { + "epoch": 0.595109549909107, + "grad_norm": 1.450743317604065, + "learning_rate": 7.840758059999631e-06, + "loss": 1.4043, + "step": 10885 + }, + { + "epoch": 0.5951642223528286, + "grad_norm": 1.5611084699630737, + "learning_rate": 7.838973845890752e-06, + "loss": 1.3657, + "step": 10886 + }, + { + "epoch": 0.5952188947965502, + "grad_norm": 1.283528208732605, + "learning_rate": 7.837189703943604e-06, + "loss": 1.4869, + "step": 10887 + }, + { + "epoch": 0.5952735672402717, + "grad_norm": 1.5842671394348145, + "learning_rate": 7.835405634217772e-06, + "loss": 1.4259, + "step": 10888 + }, + { + "epoch": 0.5953282396839933, + "grad_norm": 1.4797344207763672, + "learning_rate": 7.833621636772824e-06, + "loss": 1.3963, + "step": 10889 + }, + { + "epoch": 0.5953829121277149, + "grad_norm": 1.6418430805206299, + "learning_rate": 7.831837711668334e-06, + "loss": 1.4091, + "step": 10890 + }, + { + "epoch": 0.5954375845714364, + "grad_norm": 1.3402572870254517, + "learning_rate": 7.83005385896387e-06, + "loss": 1.7422, + "step": 10891 + }, + { + "epoch": 0.595492257015158, + "grad_norm": 1.4499706029891968, + "learning_rate": 7.828270078718994e-06, + "loss": 1.4562, + "step": 10892 + }, + { + "epoch": 0.5955469294588794, + "grad_norm": 1.2297908067703247, + "learning_rate": 7.826486370993284e-06, + "loss": 1.3563, + "step": 10893 + }, + { + "epoch": 0.595601601902601, + "grad_norm": 1.6014351844787598, + "learning_rate": 7.824702735846292e-06, + "loss": 1.4352, + "step": 10894 + }, + { + "epoch": 0.5956562743463226, + "grad_norm": 1.6557754278182983, + "learning_rate": 7.822919173337579e-06, + "loss": 1.2898, + "step": 10895 + }, + { + "epoch": 0.5957109467900441, + "grad_norm": 1.7747507095336914, + "learning_rate": 7.821135683526706e-06, + "loss": 1.299, + "step": 10896 + }, + { + "epoch": 0.5957656192337657, + "grad_norm": 1.247443675994873, + "learning_rate": 7.819352266473223e-06, + "loss": 1.4852, + "step": 10897 + }, + { + "epoch": 0.5958202916774873, + "grad_norm": 1.4422837495803833, + "learning_rate": 7.817568922236683e-06, + "loss": 1.2848, + "step": 10898 + }, + { + "epoch": 0.5958749641212088, + "grad_norm": 1.3467739820480347, + "learning_rate": 7.815785650876642e-06, + "loss": 1.5055, + "step": 10899 + }, + { + "epoch": 0.5959296365649304, + "grad_norm": 1.27732253074646, + "learning_rate": 7.814002452452643e-06, + "loss": 1.4462, + "step": 10900 + }, + { + "epoch": 0.595984309008652, + "grad_norm": 1.4221023321151733, + "learning_rate": 7.812219327024227e-06, + "loss": 1.3026, + "step": 10901 + }, + { + "epoch": 0.5960389814523734, + "grad_norm": 1.3189442157745361, + "learning_rate": 7.810436274650946e-06, + "loss": 1.4061, + "step": 10902 + }, + { + "epoch": 0.596093653896095, + "grad_norm": 1.6956136226654053, + "learning_rate": 7.808653295392334e-06, + "loss": 1.5062, + "step": 10903 + }, + { + "epoch": 0.5961483263398166, + "grad_norm": 1.5166137218475342, + "learning_rate": 7.806870389307933e-06, + "loss": 1.6819, + "step": 10904 + }, + { + "epoch": 0.5962029987835381, + "grad_norm": 1.4769737720489502, + "learning_rate": 7.805087556457275e-06, + "loss": 1.6875, + "step": 10905 + }, + { + "epoch": 0.5962576712272597, + "grad_norm": 1.84920334815979, + "learning_rate": 7.80330479689989e-06, + "loss": 1.5897, + "step": 10906 + }, + { + "epoch": 0.5963123436709812, + "grad_norm": 1.3282744884490967, + "learning_rate": 7.801522110695317e-06, + "loss": 1.4079, + "step": 10907 + }, + { + "epoch": 0.5963670161147028, + "grad_norm": 1.4671740531921387, + "learning_rate": 7.79973949790308e-06, + "loss": 1.4337, + "step": 10908 + }, + { + "epoch": 0.5964216885584244, + "grad_norm": 1.2406495809555054, + "learning_rate": 7.797956958582702e-06, + "loss": 1.6047, + "step": 10909 + }, + { + "epoch": 0.5964763610021458, + "grad_norm": 1.327101230621338, + "learning_rate": 7.796174492793712e-06, + "loss": 1.7467, + "step": 10910 + }, + { + "epoch": 0.5965310334458674, + "grad_norm": 1.6074591875076294, + "learning_rate": 7.794392100595624e-06, + "loss": 1.2144, + "step": 10911 + }, + { + "epoch": 0.596585705889589, + "grad_norm": 1.579944133758545, + "learning_rate": 7.792609782047958e-06, + "loss": 1.5747, + "step": 10912 + }, + { + "epoch": 0.5966403783333105, + "grad_norm": 1.3563379049301147, + "learning_rate": 7.790827537210232e-06, + "loss": 1.2902, + "step": 10913 + }, + { + "epoch": 0.5966950507770321, + "grad_norm": 1.338452935218811, + "learning_rate": 7.789045366141963e-06, + "loss": 1.2899, + "step": 10914 + }, + { + "epoch": 0.5967497232207537, + "grad_norm": 1.632738471031189, + "learning_rate": 7.787263268902652e-06, + "loss": 1.3309, + "step": 10915 + }, + { + "epoch": 0.5968043956644752, + "grad_norm": 1.5701974630355835, + "learning_rate": 7.785481245551816e-06, + "loss": 1.5969, + "step": 10916 + }, + { + "epoch": 0.5968590681081968, + "grad_norm": 1.6575422286987305, + "learning_rate": 7.783699296148953e-06, + "loss": 1.4079, + "step": 10917 + }, + { + "epoch": 0.5969137405519184, + "grad_norm": 1.6299535036087036, + "learning_rate": 7.781917420753575e-06, + "loss": 1.4917, + "step": 10918 + }, + { + "epoch": 0.5969684129956399, + "grad_norm": 1.3543249368667603, + "learning_rate": 7.78013561942518e-06, + "loss": 1.364, + "step": 10919 + }, + { + "epoch": 0.5970230854393614, + "grad_norm": 1.6165180206298828, + "learning_rate": 7.778353892223261e-06, + "loss": 1.1343, + "step": 10920 + }, + { + "epoch": 0.5970777578830829, + "grad_norm": 1.526485562324524, + "learning_rate": 7.776572239207323e-06, + "loss": 1.3951, + "step": 10921 + }, + { + "epoch": 0.5971324303268045, + "grad_norm": 1.6241540908813477, + "learning_rate": 7.774790660436857e-06, + "loss": 1.3155, + "step": 10922 + }, + { + "epoch": 0.5971871027705261, + "grad_norm": 1.5919559001922607, + "learning_rate": 7.773009155971349e-06, + "loss": 1.5545, + "step": 10923 + }, + { + "epoch": 0.5972417752142476, + "grad_norm": 1.6306331157684326, + "learning_rate": 7.771227725870293e-06, + "loss": 1.5634, + "step": 10924 + }, + { + "epoch": 0.5972964476579692, + "grad_norm": 1.5414798259735107, + "learning_rate": 7.769446370193174e-06, + "loss": 1.5221, + "step": 10925 + }, + { + "epoch": 0.5973511201016908, + "grad_norm": 1.904779314994812, + "learning_rate": 7.76766508899947e-06, + "loss": 1.4237, + "step": 10926 + }, + { + "epoch": 0.5974057925454123, + "grad_norm": 1.6626534461975098, + "learning_rate": 7.765883882348673e-06, + "loss": 1.2656, + "step": 10927 + }, + { + "epoch": 0.5974604649891339, + "grad_norm": 1.520181655883789, + "learning_rate": 7.764102750300253e-06, + "loss": 1.4047, + "step": 10928 + }, + { + "epoch": 0.5975151374328554, + "grad_norm": 1.5132851600646973, + "learning_rate": 7.762321692913687e-06, + "loss": 1.5317, + "step": 10929 + }, + { + "epoch": 0.5975698098765769, + "grad_norm": 1.734470009803772, + "learning_rate": 7.760540710248455e-06, + "loss": 1.8096, + "step": 10930 + }, + { + "epoch": 0.5976244823202985, + "grad_norm": 1.445913314819336, + "learning_rate": 7.758759802364022e-06, + "loss": 1.2216, + "step": 10931 + }, + { + "epoch": 0.5976791547640201, + "grad_norm": 1.5493680238723755, + "learning_rate": 7.756978969319855e-06, + "loss": 1.4892, + "step": 10932 + }, + { + "epoch": 0.5977338272077416, + "grad_norm": 1.647579312324524, + "learning_rate": 7.755198211175428e-06, + "loss": 1.2656, + "step": 10933 + }, + { + "epoch": 0.5977884996514632, + "grad_norm": 1.4481457471847534, + "learning_rate": 7.753417527990198e-06, + "loss": 1.4206, + "step": 10934 + }, + { + "epoch": 0.5978431720951847, + "grad_norm": 1.322401523590088, + "learning_rate": 7.751636919823629e-06, + "loss": 1.5547, + "step": 10935 + }, + { + "epoch": 0.5978978445389063, + "grad_norm": 1.5759435892105103, + "learning_rate": 7.74985638673518e-06, + "loss": 1.5155, + "step": 10936 + }, + { + "epoch": 0.5979525169826279, + "grad_norm": 1.6171082258224487, + "learning_rate": 7.748075928784303e-06, + "loss": 1.4993, + "step": 10937 + }, + { + "epoch": 0.5980071894263493, + "grad_norm": 1.548007607460022, + "learning_rate": 7.746295546030459e-06, + "loss": 1.4592, + "step": 10938 + }, + { + "epoch": 0.5980618618700709, + "grad_norm": 1.6409412622451782, + "learning_rate": 7.744515238533095e-06, + "loss": 1.4467, + "step": 10939 + }, + { + "epoch": 0.5981165343137925, + "grad_norm": 1.759367823600769, + "learning_rate": 7.742735006351656e-06, + "loss": 1.4737, + "step": 10940 + }, + { + "epoch": 0.598171206757514, + "grad_norm": 1.3581950664520264, + "learning_rate": 7.740954849545596e-06, + "loss": 1.3338, + "step": 10941 + }, + { + "epoch": 0.5982258792012356, + "grad_norm": 1.3895301818847656, + "learning_rate": 7.739174768174355e-06, + "loss": 1.6015, + "step": 10942 + }, + { + "epoch": 0.5982805516449572, + "grad_norm": 1.3990947008132935, + "learning_rate": 7.73739476229737e-06, + "loss": 1.3632, + "step": 10943 + }, + { + "epoch": 0.5983352240886787, + "grad_norm": 1.5800541639328003, + "learning_rate": 7.735614831974086e-06, + "loss": 1.4171, + "step": 10944 + }, + { + "epoch": 0.5983898965324003, + "grad_norm": 1.2996103763580322, + "learning_rate": 7.733834977263938e-06, + "loss": 1.3358, + "step": 10945 + }, + { + "epoch": 0.5984445689761219, + "grad_norm": 1.1734871864318848, + "learning_rate": 7.732055198226352e-06, + "loss": 1.5863, + "step": 10946 + }, + { + "epoch": 0.5984992414198433, + "grad_norm": 1.8181049823760986, + "learning_rate": 7.73027549492077e-06, + "loss": 1.2504, + "step": 10947 + }, + { + "epoch": 0.5985539138635649, + "grad_norm": 1.6572729349136353, + "learning_rate": 7.72849586740661e-06, + "loss": 1.6342, + "step": 10948 + }, + { + "epoch": 0.5986085863072864, + "grad_norm": 1.5502688884735107, + "learning_rate": 7.72671631574331e-06, + "loss": 1.5651, + "step": 10949 + }, + { + "epoch": 0.598663258751008, + "grad_norm": 1.6166455745697021, + "learning_rate": 7.724936839990285e-06, + "loss": 1.3834, + "step": 10950 + }, + { + "epoch": 0.5987179311947296, + "grad_norm": 1.6280817985534668, + "learning_rate": 7.723157440206953e-06, + "loss": 1.4983, + "step": 10951 + }, + { + "epoch": 0.5987726036384511, + "grad_norm": 1.6159167289733887, + "learning_rate": 7.721378116452741e-06, + "loss": 1.674, + "step": 10952 + }, + { + "epoch": 0.5988272760821727, + "grad_norm": 1.7073031663894653, + "learning_rate": 7.71959886878706e-06, + "loss": 1.3603, + "step": 10953 + }, + { + "epoch": 0.5988819485258943, + "grad_norm": 1.5231757164001465, + "learning_rate": 7.717819697269322e-06, + "loss": 1.4588, + "step": 10954 + }, + { + "epoch": 0.5989366209696158, + "grad_norm": 1.4368075132369995, + "learning_rate": 7.716040601958941e-06, + "loss": 1.4207, + "step": 10955 + }, + { + "epoch": 0.5989912934133373, + "grad_norm": 1.5416468381881714, + "learning_rate": 7.714261582915325e-06, + "loss": 1.4151, + "step": 10956 + }, + { + "epoch": 0.5990459658570589, + "grad_norm": 1.4522242546081543, + "learning_rate": 7.712482640197874e-06, + "loss": 1.5618, + "step": 10957 + }, + { + "epoch": 0.5991006383007804, + "grad_norm": 1.437385082244873, + "learning_rate": 7.710703773866001e-06, + "loss": 1.4803, + "step": 10958 + }, + { + "epoch": 0.599155310744502, + "grad_norm": 1.9285709857940674, + "learning_rate": 7.708924983979099e-06, + "loss": 1.2425, + "step": 10959 + }, + { + "epoch": 0.5992099831882236, + "grad_norm": 1.3956825733184814, + "learning_rate": 7.707146270596564e-06, + "loss": 1.3895, + "step": 10960 + }, + { + "epoch": 0.5992646556319451, + "grad_norm": 1.3082385063171387, + "learning_rate": 7.7053676337778e-06, + "loss": 1.4633, + "step": 10961 + }, + { + "epoch": 0.5993193280756667, + "grad_norm": 1.4429551362991333, + "learning_rate": 7.703589073582194e-06, + "loss": 1.3109, + "step": 10962 + }, + { + "epoch": 0.5993740005193882, + "grad_norm": 1.584421992301941, + "learning_rate": 7.701810590069138e-06, + "loss": 1.2798, + "step": 10963 + }, + { + "epoch": 0.5994286729631098, + "grad_norm": 1.6203651428222656, + "learning_rate": 7.700032183298021e-06, + "loss": 1.5053, + "step": 10964 + }, + { + "epoch": 0.5994833454068313, + "grad_norm": 1.0633774995803833, + "learning_rate": 7.698253853328222e-06, + "loss": 1.5712, + "step": 10965 + }, + { + "epoch": 0.5995380178505528, + "grad_norm": 1.6528856754302979, + "learning_rate": 7.69647560021913e-06, + "loss": 1.4362, + "step": 10966 + }, + { + "epoch": 0.5995926902942744, + "grad_norm": 1.4928967952728271, + "learning_rate": 7.694697424030126e-06, + "loss": 1.5388, + "step": 10967 + }, + { + "epoch": 0.599647362737996, + "grad_norm": 1.653083086013794, + "learning_rate": 7.69291932482058e-06, + "loss": 1.6158, + "step": 10968 + }, + { + "epoch": 0.5997020351817175, + "grad_norm": 1.7552485466003418, + "learning_rate": 7.691141302649877e-06, + "loss": 1.3898, + "step": 10969 + }, + { + "epoch": 0.5997567076254391, + "grad_norm": 1.5599679946899414, + "learning_rate": 7.68936335757738e-06, + "loss": 1.4795, + "step": 10970 + }, + { + "epoch": 0.5998113800691607, + "grad_norm": 1.5185010433197021, + "learning_rate": 7.68758548966246e-06, + "loss": 1.5097, + "step": 10971 + }, + { + "epoch": 0.5998660525128822, + "grad_norm": 1.2825757265090942, + "learning_rate": 7.685807698964491e-06, + "loss": 1.604, + "step": 10972 + }, + { + "epoch": 0.5999207249566038, + "grad_norm": 2.1865057945251465, + "learning_rate": 7.684029985542833e-06, + "loss": 1.2106, + "step": 10973 + }, + { + "epoch": 0.5999753974003253, + "grad_norm": 1.372549295425415, + "learning_rate": 7.682252349456847e-06, + "loss": 1.1774, + "step": 10974 + }, + { + "epoch": 0.6000300698440468, + "grad_norm": 1.5626882314682007, + "learning_rate": 7.680474790765895e-06, + "loss": 1.5892, + "step": 10975 + }, + { + "epoch": 0.6000847422877684, + "grad_norm": 1.6427392959594727, + "learning_rate": 7.678697309529329e-06, + "loss": 1.5302, + "step": 10976 + }, + { + "epoch": 0.6001394147314899, + "grad_norm": 1.4622809886932373, + "learning_rate": 7.676919905806512e-06, + "loss": 1.2287, + "step": 10977 + }, + { + "epoch": 0.6001940871752115, + "grad_norm": 1.4310435056686401, + "learning_rate": 7.67514257965679e-06, + "loss": 1.3765, + "step": 10978 + }, + { + "epoch": 0.6002487596189331, + "grad_norm": 1.6401445865631104, + "learning_rate": 7.673365331139507e-06, + "loss": 1.4502, + "step": 10979 + }, + { + "epoch": 0.6003034320626546, + "grad_norm": 1.477369785308838, + "learning_rate": 7.671588160314021e-06, + "loss": 1.2784, + "step": 10980 + }, + { + "epoch": 0.6003581045063762, + "grad_norm": 1.5429959297180176, + "learning_rate": 7.66981106723967e-06, + "loss": 1.4389, + "step": 10981 + }, + { + "epoch": 0.6004127769500978, + "grad_norm": 1.4818545579910278, + "learning_rate": 7.668034051975793e-06, + "loss": 1.4443, + "step": 10982 + }, + { + "epoch": 0.6004674493938192, + "grad_norm": 1.478763222694397, + "learning_rate": 7.666257114581732e-06, + "loss": 1.5812, + "step": 10983 + }, + { + "epoch": 0.6005221218375408, + "grad_norm": 1.7011405229568481, + "learning_rate": 7.664480255116825e-06, + "loss": 1.2759, + "step": 10984 + }, + { + "epoch": 0.6005767942812624, + "grad_norm": 1.4035594463348389, + "learning_rate": 7.662703473640396e-06, + "loss": 1.6138, + "step": 10985 + }, + { + "epoch": 0.6006314667249839, + "grad_norm": 1.055964708328247, + "learning_rate": 7.660926770211788e-06, + "loss": 1.5847, + "step": 10986 + }, + { + "epoch": 0.6006861391687055, + "grad_norm": 1.3505735397338867, + "learning_rate": 7.659150144890325e-06, + "loss": 1.3485, + "step": 10987 + }, + { + "epoch": 0.6007408116124271, + "grad_norm": 1.7761437892913818, + "learning_rate": 7.657373597735327e-06, + "loss": 1.5492, + "step": 10988 + }, + { + "epoch": 0.6007954840561486, + "grad_norm": 1.4005157947540283, + "learning_rate": 7.655597128806125e-06, + "loss": 1.3749, + "step": 10989 + }, + { + "epoch": 0.6008501564998702, + "grad_norm": 1.353377342224121, + "learning_rate": 7.653820738162031e-06, + "loss": 1.376, + "step": 10990 + }, + { + "epoch": 0.6009048289435917, + "grad_norm": 1.460557222366333, + "learning_rate": 7.652044425862375e-06, + "loss": 1.3233, + "step": 10991 + }, + { + "epoch": 0.6009595013873132, + "grad_norm": 1.7816877365112305, + "learning_rate": 7.650268191966463e-06, + "loss": 1.3293, + "step": 10992 + }, + { + "epoch": 0.6010141738310348, + "grad_norm": 1.6295167207717896, + "learning_rate": 7.64849203653361e-06, + "loss": 1.4476, + "step": 10993 + }, + { + "epoch": 0.6010688462747563, + "grad_norm": 1.73164963722229, + "learning_rate": 7.646715959623125e-06, + "loss": 1.5633, + "step": 10994 + }, + { + "epoch": 0.6011235187184779, + "grad_norm": 1.7372671365737915, + "learning_rate": 7.644939961294318e-06, + "loss": 1.4569, + "step": 10995 + }, + { + "epoch": 0.6011781911621995, + "grad_norm": 1.6149016618728638, + "learning_rate": 7.643164041606489e-06, + "loss": 1.4455, + "step": 10996 + }, + { + "epoch": 0.601232863605921, + "grad_norm": 1.859975814819336, + "learning_rate": 7.641388200618944e-06, + "loss": 1.3003, + "step": 10997 + }, + { + "epoch": 0.6012875360496426, + "grad_norm": 1.5090845823287964, + "learning_rate": 7.639612438390984e-06, + "loss": 1.6559, + "step": 10998 + }, + { + "epoch": 0.6013422084933642, + "grad_norm": 1.7064818143844604, + "learning_rate": 7.6378367549819e-06, + "loss": 1.4056, + "step": 10999 + }, + { + "epoch": 0.6013968809370857, + "grad_norm": 1.2413831949234009, + "learning_rate": 7.636061150450991e-06, + "loss": 1.4843, + "step": 11000 + }, + { + "epoch": 0.6014515533808072, + "grad_norm": 1.52357017993927, + "learning_rate": 7.63428562485755e-06, + "loss": 1.4557, + "step": 11001 + }, + { + "epoch": 0.6015062258245288, + "grad_norm": 1.7673585414886475, + "learning_rate": 7.63251017826086e-06, + "loss": 1.5367, + "step": 11002 + }, + { + "epoch": 0.6015608982682503, + "grad_norm": 1.5001001358032227, + "learning_rate": 7.630734810720212e-06, + "loss": 1.0835, + "step": 11003 + }, + { + "epoch": 0.6016155707119719, + "grad_norm": 1.3245189189910889, + "learning_rate": 7.6289595222948865e-06, + "loss": 1.4667, + "step": 11004 + }, + { + "epoch": 0.6016702431556934, + "grad_norm": 1.561692237854004, + "learning_rate": 7.627184313044169e-06, + "loss": 1.5457, + "step": 11005 + }, + { + "epoch": 0.601724915599415, + "grad_norm": 1.3722443580627441, + "learning_rate": 7.625409183027336e-06, + "loss": 1.5485, + "step": 11006 + }, + { + "epoch": 0.6017795880431366, + "grad_norm": 1.6265337467193604, + "learning_rate": 7.623634132303659e-06, + "loss": 1.6757, + "step": 11007 + }, + { + "epoch": 0.6018342604868581, + "grad_norm": 1.4312069416046143, + "learning_rate": 7.621859160932419e-06, + "loss": 1.32, + "step": 11008 + }, + { + "epoch": 0.6018889329305797, + "grad_norm": 1.6638760566711426, + "learning_rate": 7.62008426897288e-06, + "loss": 1.717, + "step": 11009 + }, + { + "epoch": 0.6019436053743012, + "grad_norm": 1.2928019762039185, + "learning_rate": 7.618309456484309e-06, + "loss": 1.3912, + "step": 11010 + }, + { + "epoch": 0.6019982778180227, + "grad_norm": 1.2199113368988037, + "learning_rate": 7.616534723525977e-06, + "loss": 1.4033, + "step": 11011 + }, + { + "epoch": 0.6020529502617443, + "grad_norm": 1.2107524871826172, + "learning_rate": 7.614760070157145e-06, + "loss": 1.4912, + "step": 11012 + }, + { + "epoch": 0.6021076227054659, + "grad_norm": 2.1430540084838867, + "learning_rate": 7.612985496437069e-06, + "loss": 1.4352, + "step": 11013 + }, + { + "epoch": 0.6021622951491874, + "grad_norm": 1.2684659957885742, + "learning_rate": 7.61121100242501e-06, + "loss": 1.4544, + "step": 11014 + }, + { + "epoch": 0.602216967592909, + "grad_norm": 1.1518479585647583, + "learning_rate": 7.609436588180221e-06, + "loss": 1.587, + "step": 11015 + }, + { + "epoch": 0.6022716400366306, + "grad_norm": 1.5425851345062256, + "learning_rate": 7.60766225376195e-06, + "loss": 1.5049, + "step": 11016 + }, + { + "epoch": 0.6023263124803521, + "grad_norm": 1.410603642463684, + "learning_rate": 7.605887999229454e-06, + "loss": 1.578, + "step": 11017 + }, + { + "epoch": 0.6023809849240737, + "grad_norm": 1.4614228010177612, + "learning_rate": 7.604113824641975e-06, + "loss": 1.5341, + "step": 11018 + }, + { + "epoch": 0.6024356573677953, + "grad_norm": 1.3565424680709839, + "learning_rate": 7.602339730058759e-06, + "loss": 1.422, + "step": 11019 + }, + { + "epoch": 0.6024903298115167, + "grad_norm": 1.3932825326919556, + "learning_rate": 7.600565715539044e-06, + "loss": 1.5312, + "step": 11020 + }, + { + "epoch": 0.6025450022552383, + "grad_norm": 1.3840268850326538, + "learning_rate": 7.5987917811420675e-06, + "loss": 1.4381, + "step": 11021 + }, + { + "epoch": 0.6025996746989598, + "grad_norm": 1.750273585319519, + "learning_rate": 7.597017926927073e-06, + "loss": 1.2891, + "step": 11022 + }, + { + "epoch": 0.6026543471426814, + "grad_norm": 2.3886115550994873, + "learning_rate": 7.595244152953287e-06, + "loss": 1.4685, + "step": 11023 + }, + { + "epoch": 0.602709019586403, + "grad_norm": 1.8308191299438477, + "learning_rate": 7.593470459279939e-06, + "loss": 1.377, + "step": 11024 + }, + { + "epoch": 0.6027636920301245, + "grad_norm": 1.911864995956421, + "learning_rate": 7.591696845966263e-06, + "loss": 1.3131, + "step": 11025 + }, + { + "epoch": 0.6028183644738461, + "grad_norm": 1.976510763168335, + "learning_rate": 7.58992331307148e-06, + "loss": 1.5361, + "step": 11026 + }, + { + "epoch": 0.6028730369175677, + "grad_norm": 1.5981663465499878, + "learning_rate": 7.588149860654812e-06, + "loss": 1.3439, + "step": 11027 + }, + { + "epoch": 0.6029277093612891, + "grad_norm": 1.5474389791488647, + "learning_rate": 7.586376488775481e-06, + "loss": 1.6069, + "step": 11028 + }, + { + "epoch": 0.6029823818050107, + "grad_norm": 1.432536005973816, + "learning_rate": 7.584603197492703e-06, + "loss": 1.4249, + "step": 11029 + }, + { + "epoch": 0.6030370542487323, + "grad_norm": 1.6407924890518188, + "learning_rate": 7.582829986865688e-06, + "loss": 1.8374, + "step": 11030 + }, + { + "epoch": 0.6030917266924538, + "grad_norm": 1.3073110580444336, + "learning_rate": 7.581056856953656e-06, + "loss": 1.6019, + "step": 11031 + }, + { + "epoch": 0.6031463991361754, + "grad_norm": 1.5180394649505615, + "learning_rate": 7.5792838078158094e-06, + "loss": 1.7285, + "step": 11032 + }, + { + "epoch": 0.603201071579897, + "grad_norm": 1.360543131828308, + "learning_rate": 7.57751083951136e-06, + "loss": 1.5012, + "step": 11033 + }, + { + "epoch": 0.6032557440236185, + "grad_norm": 1.9464805126190186, + "learning_rate": 7.575737952099507e-06, + "loss": 1.3826, + "step": 11034 + }, + { + "epoch": 0.6033104164673401, + "grad_norm": 1.7045711278915405, + "learning_rate": 7.573965145639448e-06, + "loss": 1.0949, + "step": 11035 + }, + { + "epoch": 0.6033650889110616, + "grad_norm": 2.3032474517822266, + "learning_rate": 7.57219242019039e-06, + "loss": 1.3345, + "step": 11036 + }, + { + "epoch": 0.6034197613547831, + "grad_norm": 1.3678017854690552, + "learning_rate": 7.570419775811526e-06, + "loss": 1.4693, + "step": 11037 + }, + { + "epoch": 0.6034744337985047, + "grad_norm": 1.6068015098571777, + "learning_rate": 7.568647212562043e-06, + "loss": 1.3679, + "step": 11038 + }, + { + "epoch": 0.6035291062422262, + "grad_norm": 1.490499496459961, + "learning_rate": 7.566874730501137e-06, + "loss": 1.2949, + "step": 11039 + }, + { + "epoch": 0.6035837786859478, + "grad_norm": 2.0778918266296387, + "learning_rate": 7.565102329687994e-06, + "loss": 1.3713, + "step": 11040 + }, + { + "epoch": 0.6036384511296694, + "grad_norm": 1.1666464805603027, + "learning_rate": 7.563330010181793e-06, + "loss": 1.5748, + "step": 11041 + }, + { + "epoch": 0.6036931235733909, + "grad_norm": 1.4085580110549927, + "learning_rate": 7.561557772041725e-06, + "loss": 1.4199, + "step": 11042 + }, + { + "epoch": 0.6037477960171125, + "grad_norm": 1.5069218873977661, + "learning_rate": 7.559785615326967e-06, + "loss": 1.4009, + "step": 11043 + }, + { + "epoch": 0.6038024684608341, + "grad_norm": 2.079209327697754, + "learning_rate": 7.558013540096687e-06, + "loss": 1.3511, + "step": 11044 + }, + { + "epoch": 0.6038571409045556, + "grad_norm": 1.4854398965835571, + "learning_rate": 7.55624154641007e-06, + "loss": 1.3879, + "step": 11045 + }, + { + "epoch": 0.6039118133482771, + "grad_norm": 1.9091904163360596, + "learning_rate": 7.5544696343262805e-06, + "loss": 1.479, + "step": 11046 + }, + { + "epoch": 0.6039664857919987, + "grad_norm": 1.4591927528381348, + "learning_rate": 7.552697803904489e-06, + "loss": 1.4246, + "step": 11047 + }, + { + "epoch": 0.6040211582357202, + "grad_norm": 1.4264100790023804, + "learning_rate": 7.550926055203863e-06, + "loss": 1.4799, + "step": 11048 + }, + { + "epoch": 0.6040758306794418, + "grad_norm": 1.8125386238098145, + "learning_rate": 7.549154388283557e-06, + "loss": 1.3556, + "step": 11049 + }, + { + "epoch": 0.6041305031231633, + "grad_norm": 1.6283231973648071, + "learning_rate": 7.547382803202743e-06, + "loss": 1.3837, + "step": 11050 + }, + { + "epoch": 0.6041851755668849, + "grad_norm": 1.6434208154678345, + "learning_rate": 7.545611300020571e-06, + "loss": 1.4857, + "step": 11051 + }, + { + "epoch": 0.6042398480106065, + "grad_norm": 2.0565717220306396, + "learning_rate": 7.543839878796195e-06, + "loss": 1.3172, + "step": 11052 + }, + { + "epoch": 0.604294520454328, + "grad_norm": 1.4113377332687378, + "learning_rate": 7.542068539588771e-06, + "loss": 1.496, + "step": 11053 + }, + { + "epoch": 0.6043491928980496, + "grad_norm": 1.444532871246338, + "learning_rate": 7.5402972824574475e-06, + "loss": 1.3673, + "step": 11054 + }, + { + "epoch": 0.6044038653417712, + "grad_norm": 1.4089601039886475, + "learning_rate": 7.5385261074613634e-06, + "loss": 1.3982, + "step": 11055 + }, + { + "epoch": 0.6044585377854926, + "grad_norm": 1.3592947721481323, + "learning_rate": 7.536755014659674e-06, + "loss": 1.283, + "step": 11056 + }, + { + "epoch": 0.6045132102292142, + "grad_norm": 1.3562544584274292, + "learning_rate": 7.534984004111515e-06, + "loss": 1.3769, + "step": 11057 + }, + { + "epoch": 0.6045678826729358, + "grad_norm": 1.5573874711990356, + "learning_rate": 7.533213075876022e-06, + "loss": 1.3628, + "step": 11058 + }, + { + "epoch": 0.6046225551166573, + "grad_norm": 1.1315970420837402, + "learning_rate": 7.531442230012336e-06, + "loss": 1.8953, + "step": 11059 + }, + { + "epoch": 0.6046772275603789, + "grad_norm": 1.6780718564987183, + "learning_rate": 7.529671466579581e-06, + "loss": 1.3036, + "step": 11060 + }, + { + "epoch": 0.6047319000041005, + "grad_norm": 1.9101290702819824, + "learning_rate": 7.527900785636897e-06, + "loss": 1.3302, + "step": 11061 + }, + { + "epoch": 0.604786572447822, + "grad_norm": 1.4465876817703247, + "learning_rate": 7.526130187243408e-06, + "loss": 1.4533, + "step": 11062 + }, + { + "epoch": 0.6048412448915436, + "grad_norm": 1.6255711317062378, + "learning_rate": 7.5243596714582315e-06, + "loss": 1.2602, + "step": 11063 + }, + { + "epoch": 0.604895917335265, + "grad_norm": 1.5364336967468262, + "learning_rate": 7.522589238340499e-06, + "loss": 1.5335, + "step": 11064 + }, + { + "epoch": 0.6049505897789866, + "grad_norm": 1.258915662765503, + "learning_rate": 7.520818887949326e-06, + "loss": 1.4458, + "step": 11065 + }, + { + "epoch": 0.6050052622227082, + "grad_norm": 1.7493261098861694, + "learning_rate": 7.519048620343825e-06, + "loss": 1.5492, + "step": 11066 + }, + { + "epoch": 0.6050599346664297, + "grad_norm": 1.3431733846664429, + "learning_rate": 7.517278435583115e-06, + "loss": 1.3308, + "step": 11067 + }, + { + "epoch": 0.6051146071101513, + "grad_norm": 1.7888821363449097, + "learning_rate": 7.515508333726304e-06, + "loss": 1.3505, + "step": 11068 + }, + { + "epoch": 0.6051692795538729, + "grad_norm": 1.8348675966262817, + "learning_rate": 7.513738314832496e-06, + "loss": 1.6456, + "step": 11069 + }, + { + "epoch": 0.6052239519975944, + "grad_norm": 1.5534336566925049, + "learning_rate": 7.5119683789608035e-06, + "loss": 1.4898, + "step": 11070 + }, + { + "epoch": 0.605278624441316, + "grad_norm": 1.9387998580932617, + "learning_rate": 7.510198526170324e-06, + "loss": 1.0468, + "step": 11071 + }, + { + "epoch": 0.6053332968850376, + "grad_norm": 1.6682476997375488, + "learning_rate": 7.508428756520158e-06, + "loss": 1.7138, + "step": 11072 + }, + { + "epoch": 0.605387969328759, + "grad_norm": 1.3217642307281494, + "learning_rate": 7.506659070069404e-06, + "loss": 1.2582, + "step": 11073 + }, + { + "epoch": 0.6054426417724806, + "grad_norm": 1.2861710786819458, + "learning_rate": 7.50488946687715e-06, + "loss": 1.6096, + "step": 11074 + }, + { + "epoch": 0.6054973142162022, + "grad_norm": 1.2495683431625366, + "learning_rate": 7.503119947002496e-06, + "loss": 1.5838, + "step": 11075 + }, + { + "epoch": 0.6055519866599237, + "grad_norm": 1.6792904138565063, + "learning_rate": 7.501350510504526e-06, + "loss": 1.2702, + "step": 11076 + }, + { + "epoch": 0.6056066591036453, + "grad_norm": 1.6627310514450073, + "learning_rate": 7.4995811574423235e-06, + "loss": 1.6085, + "step": 11077 + }, + { + "epoch": 0.6056613315473668, + "grad_norm": 1.3606035709381104, + "learning_rate": 7.497811887874976e-06, + "loss": 1.4143, + "step": 11078 + }, + { + "epoch": 0.6057160039910884, + "grad_norm": 1.34311842918396, + "learning_rate": 7.496042701861561e-06, + "loss": 1.2299, + "step": 11079 + }, + { + "epoch": 0.60577067643481, + "grad_norm": 2.1015231609344482, + "learning_rate": 7.494273599461153e-06, + "loss": 1.5285, + "step": 11080 + }, + { + "epoch": 0.6058253488785315, + "grad_norm": 1.5865628719329834, + "learning_rate": 7.492504580732831e-06, + "loss": 1.3174, + "step": 11081 + }, + { + "epoch": 0.605880021322253, + "grad_norm": 1.6666128635406494, + "learning_rate": 7.490735645735667e-06, + "loss": 1.3208, + "step": 11082 + }, + { + "epoch": 0.6059346937659746, + "grad_norm": 1.436099648475647, + "learning_rate": 7.4889667945287224e-06, + "loss": 1.4862, + "step": 11083 + }, + { + "epoch": 0.6059893662096961, + "grad_norm": 1.4711040258407593, + "learning_rate": 7.487198027171074e-06, + "loss": 1.3545, + "step": 11084 + }, + { + "epoch": 0.6060440386534177, + "grad_norm": 1.941236138343811, + "learning_rate": 7.485429343721779e-06, + "loss": 1.3884, + "step": 11085 + }, + { + "epoch": 0.6060987110971393, + "grad_norm": 1.4490551948547363, + "learning_rate": 7.4836607442398974e-06, + "loss": 1.5905, + "step": 11086 + }, + { + "epoch": 0.6061533835408608, + "grad_norm": 1.4118762016296387, + "learning_rate": 7.481892228784491e-06, + "loss": 1.3079, + "step": 11087 + }, + { + "epoch": 0.6062080559845824, + "grad_norm": 2.220043897628784, + "learning_rate": 7.480123797414608e-06, + "loss": 1.4058, + "step": 11088 + }, + { + "epoch": 0.606262728428304, + "grad_norm": 1.2765339612960815, + "learning_rate": 7.478355450189307e-06, + "loss": 1.3787, + "step": 11089 + }, + { + "epoch": 0.6063174008720255, + "grad_norm": 1.8675049543380737, + "learning_rate": 7.476587187167636e-06, + "loss": 1.3977, + "step": 11090 + }, + { + "epoch": 0.606372073315747, + "grad_norm": 1.0921106338500977, + "learning_rate": 7.474819008408638e-06, + "loss": 1.3743, + "step": 11091 + }, + { + "epoch": 0.6064267457594685, + "grad_norm": 1.4249523878097534, + "learning_rate": 7.47305091397136e-06, + "loss": 1.425, + "step": 11092 + }, + { + "epoch": 0.6064814182031901, + "grad_norm": 1.5421303510665894, + "learning_rate": 7.4712829039148425e-06, + "loss": 1.1917, + "step": 11093 + }, + { + "epoch": 0.6065360906469117, + "grad_norm": 1.620668888092041, + "learning_rate": 7.469514978298119e-06, + "loss": 1.2562, + "step": 11094 + }, + { + "epoch": 0.6065907630906332, + "grad_norm": 1.607236623764038, + "learning_rate": 7.467747137180232e-06, + "loss": 1.4671, + "step": 11095 + }, + { + "epoch": 0.6066454355343548, + "grad_norm": 1.6709033250808716, + "learning_rate": 7.46597938062021e-06, + "loss": 1.2952, + "step": 11096 + }, + { + "epoch": 0.6067001079780764, + "grad_norm": 1.4276975393295288, + "learning_rate": 7.46421170867708e-06, + "loss": 1.476, + "step": 11097 + }, + { + "epoch": 0.6067547804217979, + "grad_norm": 1.2602431774139404, + "learning_rate": 7.462444121409875e-06, + "loss": 1.6444, + "step": 11098 + }, + { + "epoch": 0.6068094528655195, + "grad_norm": 1.3126235008239746, + "learning_rate": 7.460676618877615e-06, + "loss": 1.5409, + "step": 11099 + }, + { + "epoch": 0.606864125309241, + "grad_norm": 1.4635127782821655, + "learning_rate": 7.458909201139317e-06, + "loss": 1.4782, + "step": 11100 + }, + { + "epoch": 0.6069187977529625, + "grad_norm": 1.4678797721862793, + "learning_rate": 7.457141868254007e-06, + "loss": 1.3014, + "step": 11101 + }, + { + "epoch": 0.6069734701966841, + "grad_norm": 1.3872941732406616, + "learning_rate": 7.455374620280693e-06, + "loss": 1.3478, + "step": 11102 + }, + { + "epoch": 0.6070281426404057, + "grad_norm": 1.3082164525985718, + "learning_rate": 7.453607457278398e-06, + "loss": 1.6412, + "step": 11103 + }, + { + "epoch": 0.6070828150841272, + "grad_norm": 1.2057136297225952, + "learning_rate": 7.451840379306123e-06, + "loss": 1.6989, + "step": 11104 + }, + { + "epoch": 0.6071374875278488, + "grad_norm": 1.9279099702835083, + "learning_rate": 7.450073386422876e-06, + "loss": 1.7046, + "step": 11105 + }, + { + "epoch": 0.6071921599715703, + "grad_norm": 1.6242313385009766, + "learning_rate": 7.448306478687663e-06, + "loss": 1.5573, + "step": 11106 + }, + { + "epoch": 0.6072468324152919, + "grad_norm": 1.5582224130630493, + "learning_rate": 7.446539656159486e-06, + "loss": 1.345, + "step": 11107 + }, + { + "epoch": 0.6073015048590135, + "grad_norm": 1.1482361555099487, + "learning_rate": 7.444772918897336e-06, + "loss": 1.4378, + "step": 11108 + }, + { + "epoch": 0.607356177302735, + "grad_norm": 1.0972882509231567, + "learning_rate": 7.44300626696022e-06, + "loss": 1.5209, + "step": 11109 + }, + { + "epoch": 0.6074108497464565, + "grad_norm": 1.3883986473083496, + "learning_rate": 7.441239700407124e-06, + "loss": 1.368, + "step": 11110 + }, + { + "epoch": 0.6074655221901781, + "grad_norm": 1.7706106901168823, + "learning_rate": 7.4394732192970375e-06, + "loss": 1.3397, + "step": 11111 + }, + { + "epoch": 0.6075201946338996, + "grad_norm": 1.4214259386062622, + "learning_rate": 7.43770682368895e-06, + "loss": 1.4785, + "step": 11112 + }, + { + "epoch": 0.6075748670776212, + "grad_norm": 1.178489327430725, + "learning_rate": 7.435940513641845e-06, + "loss": 1.3679, + "step": 11113 + }, + { + "epoch": 0.6076295395213428, + "grad_norm": 1.3425354957580566, + "learning_rate": 7.4341742892146976e-06, + "loss": 1.6071, + "step": 11114 + }, + { + "epoch": 0.6076842119650643, + "grad_norm": 2.1054136753082275, + "learning_rate": 7.432408150466497e-06, + "loss": 1.3002, + "step": 11115 + }, + { + "epoch": 0.6077388844087859, + "grad_norm": 1.3197323083877563, + "learning_rate": 7.430642097456211e-06, + "loss": 1.5929, + "step": 11116 + }, + { + "epoch": 0.6077935568525075, + "grad_norm": 1.5631767511367798, + "learning_rate": 7.428876130242816e-06, + "loss": 1.2975, + "step": 11117 + }, + { + "epoch": 0.607848229296229, + "grad_norm": 1.3502237796783447, + "learning_rate": 7.427110248885281e-06, + "loss": 1.4115, + "step": 11118 + }, + { + "epoch": 0.6079029017399505, + "grad_norm": 1.4887632131576538, + "learning_rate": 7.425344453442566e-06, + "loss": 1.4875, + "step": 11119 + }, + { + "epoch": 0.607957574183672, + "grad_norm": 1.8373732566833496, + "learning_rate": 7.423578743973649e-06, + "loss": 1.5237, + "step": 11120 + }, + { + "epoch": 0.6080122466273936, + "grad_norm": 1.8911124467849731, + "learning_rate": 7.421813120537482e-06, + "loss": 1.4244, + "step": 11121 + }, + { + "epoch": 0.6080669190711152, + "grad_norm": 1.5834707021713257, + "learning_rate": 7.42004758319302e-06, + "loss": 1.3461, + "step": 11122 + }, + { + "epoch": 0.6081215915148367, + "grad_norm": 1.5003236532211304, + "learning_rate": 7.418282131999228e-06, + "loss": 1.4655, + "step": 11123 + }, + { + "epoch": 0.6081762639585583, + "grad_norm": 1.6813890933990479, + "learning_rate": 7.416516767015054e-06, + "loss": 1.3506, + "step": 11124 + }, + { + "epoch": 0.6082309364022799, + "grad_norm": 1.5057485103607178, + "learning_rate": 7.414751488299444e-06, + "loss": 1.5855, + "step": 11125 + }, + { + "epoch": 0.6082856088460014, + "grad_norm": 1.7668269872665405, + "learning_rate": 7.4129862959113515e-06, + "loss": 1.2311, + "step": 11126 + }, + { + "epoch": 0.608340281289723, + "grad_norm": 1.5948389768600464, + "learning_rate": 7.411221189909718e-06, + "loss": 1.6831, + "step": 11127 + }, + { + "epoch": 0.6083949537334445, + "grad_norm": 1.8954893350601196, + "learning_rate": 7.409456170353477e-06, + "loss": 1.3689, + "step": 11128 + }, + { + "epoch": 0.608449626177166, + "grad_norm": 1.6664834022521973, + "learning_rate": 7.40769123730158e-06, + "loss": 1.521, + "step": 11129 + }, + { + "epoch": 0.6085042986208876, + "grad_norm": 1.263390064239502, + "learning_rate": 7.405926390812953e-06, + "loss": 1.4915, + "step": 11130 + }, + { + "epoch": 0.6085589710646092, + "grad_norm": 1.5928078889846802, + "learning_rate": 7.404161630946532e-06, + "loss": 1.4281, + "step": 11131 + }, + { + "epoch": 0.6086136435083307, + "grad_norm": 1.8289260864257812, + "learning_rate": 7.402396957761247e-06, + "loss": 1.2683, + "step": 11132 + }, + { + "epoch": 0.6086683159520523, + "grad_norm": 1.5874749422073364, + "learning_rate": 7.400632371316019e-06, + "loss": 1.1737, + "step": 11133 + }, + { + "epoch": 0.6087229883957738, + "grad_norm": 1.47548508644104, + "learning_rate": 7.398867871669778e-06, + "loss": 1.2843, + "step": 11134 + }, + { + "epoch": 0.6087776608394954, + "grad_norm": 1.3225919008255005, + "learning_rate": 7.397103458881444e-06, + "loss": 1.5595, + "step": 11135 + }, + { + "epoch": 0.608832333283217, + "grad_norm": 1.6486276388168335, + "learning_rate": 7.395339133009931e-06, + "loss": 1.4744, + "step": 11136 + }, + { + "epoch": 0.6088870057269384, + "grad_norm": 1.4729564189910889, + "learning_rate": 7.393574894114157e-06, + "loss": 1.4846, + "step": 11137 + }, + { + "epoch": 0.60894167817066, + "grad_norm": 1.4483561515808105, + "learning_rate": 7.391810742253036e-06, + "loss": 1.4643, + "step": 11138 + }, + { + "epoch": 0.6089963506143816, + "grad_norm": 1.7758631706237793, + "learning_rate": 7.3900466774854695e-06, + "loss": 1.6193, + "step": 11139 + }, + { + "epoch": 0.6090510230581031, + "grad_norm": 1.4262782335281372, + "learning_rate": 7.388282699870373e-06, + "loss": 1.415, + "step": 11140 + }, + { + "epoch": 0.6091056955018247, + "grad_norm": 1.5372954607009888, + "learning_rate": 7.386518809466645e-06, + "loss": 1.3683, + "step": 11141 + }, + { + "epoch": 0.6091603679455463, + "grad_norm": 1.4866589307785034, + "learning_rate": 7.384755006333183e-06, + "loss": 1.2942, + "step": 11142 + }, + { + "epoch": 0.6092150403892678, + "grad_norm": 1.2549121379852295, + "learning_rate": 7.382991290528892e-06, + "loss": 1.5371, + "step": 11143 + }, + { + "epoch": 0.6092697128329894, + "grad_norm": 1.598088264465332, + "learning_rate": 7.3812276621126625e-06, + "loss": 1.6162, + "step": 11144 + }, + { + "epoch": 0.609324385276711, + "grad_norm": 1.7987732887268066, + "learning_rate": 7.379464121143386e-06, + "loss": 1.2266, + "step": 11145 + }, + { + "epoch": 0.6093790577204324, + "grad_norm": 1.532764196395874, + "learning_rate": 7.377700667679954e-06, + "loss": 1.6932, + "step": 11146 + }, + { + "epoch": 0.609433730164154, + "grad_norm": 1.3096998929977417, + "learning_rate": 7.375937301781244e-06, + "loss": 1.3875, + "step": 11147 + }, + { + "epoch": 0.6094884026078755, + "grad_norm": 1.4480677843093872, + "learning_rate": 7.374174023506151e-06, + "loss": 1.5445, + "step": 11148 + }, + { + "epoch": 0.6095430750515971, + "grad_norm": 1.5466830730438232, + "learning_rate": 7.372410832913548e-06, + "loss": 1.3929, + "step": 11149 + }, + { + "epoch": 0.6095977474953187, + "grad_norm": 1.7849215269088745, + "learning_rate": 7.370647730062311e-06, + "loss": 1.4398, + "step": 11150 + }, + { + "epoch": 0.6096524199390402, + "grad_norm": 1.3861196041107178, + "learning_rate": 7.3688847150113185e-06, + "loss": 1.623, + "step": 11151 + }, + { + "epoch": 0.6097070923827618, + "grad_norm": 1.514285922050476, + "learning_rate": 7.36712178781944e-06, + "loss": 1.4443, + "step": 11152 + }, + { + "epoch": 0.6097617648264834, + "grad_norm": 1.6256756782531738, + "learning_rate": 7.365358948545538e-06, + "loss": 1.4058, + "step": 11153 + }, + { + "epoch": 0.6098164372702048, + "grad_norm": 1.445266604423523, + "learning_rate": 7.363596197248488e-06, + "loss": 1.5298, + "step": 11154 + }, + { + "epoch": 0.6098711097139264, + "grad_norm": 1.1353033781051636, + "learning_rate": 7.361833533987148e-06, + "loss": 1.4199, + "step": 11155 + }, + { + "epoch": 0.609925782157648, + "grad_norm": 1.389172077178955, + "learning_rate": 7.360070958820373e-06, + "loss": 1.3387, + "step": 11156 + }, + { + "epoch": 0.6099804546013695, + "grad_norm": 1.5793626308441162, + "learning_rate": 7.358308471807028e-06, + "loss": 1.3444, + "step": 11157 + }, + { + "epoch": 0.6100351270450911, + "grad_norm": 2.263399839401245, + "learning_rate": 7.356546073005957e-06, + "loss": 1.3604, + "step": 11158 + }, + { + "epoch": 0.6100897994888127, + "grad_norm": 1.389597773551941, + "learning_rate": 7.354783762476019e-06, + "loss": 1.4125, + "step": 11159 + }, + { + "epoch": 0.6101444719325342, + "grad_norm": 1.6715773344039917, + "learning_rate": 7.353021540276059e-06, + "loss": 1.3411, + "step": 11160 + }, + { + "epoch": 0.6101991443762558, + "grad_norm": 1.6017653942108154, + "learning_rate": 7.351259406464917e-06, + "loss": 1.4413, + "step": 11161 + }, + { + "epoch": 0.6102538168199773, + "grad_norm": 2.3017683029174805, + "learning_rate": 7.349497361101443e-06, + "loss": 1.4722, + "step": 11162 + }, + { + "epoch": 0.6103084892636989, + "grad_norm": 1.8545787334442139, + "learning_rate": 7.3477354042444716e-06, + "loss": 1.232, + "step": 11163 + }, + { + "epoch": 0.6103631617074204, + "grad_norm": 1.4358080625534058, + "learning_rate": 7.3459735359528366e-06, + "loss": 1.442, + "step": 11164 + }, + { + "epoch": 0.6104178341511419, + "grad_norm": 1.3598164319992065, + "learning_rate": 7.344211756285375e-06, + "loss": 1.5183, + "step": 11165 + }, + { + "epoch": 0.6104725065948635, + "grad_norm": 1.8799738883972168, + "learning_rate": 7.342450065300914e-06, + "loss": 1.5503, + "step": 11166 + }, + { + "epoch": 0.6105271790385851, + "grad_norm": 1.321899175643921, + "learning_rate": 7.34068846305828e-06, + "loss": 1.5029, + "step": 11167 + }, + { + "epoch": 0.6105818514823066, + "grad_norm": 1.3215192556381226, + "learning_rate": 7.338926949616301e-06, + "loss": 1.5464, + "step": 11168 + }, + { + "epoch": 0.6106365239260282, + "grad_norm": 1.4857409000396729, + "learning_rate": 7.337165525033796e-06, + "loss": 1.4743, + "step": 11169 + }, + { + "epoch": 0.6106911963697498, + "grad_norm": 2.027761697769165, + "learning_rate": 7.335404189369579e-06, + "loss": 1.4183, + "step": 11170 + }, + { + "epoch": 0.6107458688134713, + "grad_norm": 1.4762951135635376, + "learning_rate": 7.333642942682473e-06, + "loss": 1.5726, + "step": 11171 + }, + { + "epoch": 0.6108005412571929, + "grad_norm": 2.1094541549682617, + "learning_rate": 7.3318817850312804e-06, + "loss": 1.1623, + "step": 11172 + }, + { + "epoch": 0.6108552137009144, + "grad_norm": 1.6890355348587036, + "learning_rate": 7.33012071647482e-06, + "loss": 1.4149, + "step": 11173 + }, + { + "epoch": 0.6109098861446359, + "grad_norm": 1.351875901222229, + "learning_rate": 7.328359737071895e-06, + "loss": 1.052, + "step": 11174 + }, + { + "epoch": 0.6109645585883575, + "grad_norm": 1.4330294132232666, + "learning_rate": 7.326598846881305e-06, + "loss": 1.4975, + "step": 11175 + }, + { + "epoch": 0.611019231032079, + "grad_norm": 1.318999171257019, + "learning_rate": 7.3248380459618555e-06, + "loss": 1.4332, + "step": 11176 + }, + { + "epoch": 0.6110739034758006, + "grad_norm": 1.3124816417694092, + "learning_rate": 7.323077334372341e-06, + "loss": 1.4712, + "step": 11177 + }, + { + "epoch": 0.6111285759195222, + "grad_norm": 1.4059686660766602, + "learning_rate": 7.3213167121715514e-06, + "loss": 1.6028, + "step": 11178 + }, + { + "epoch": 0.6111832483632437, + "grad_norm": 1.4905949831008911, + "learning_rate": 7.319556179418286e-06, + "loss": 1.4234, + "step": 11179 + }, + { + "epoch": 0.6112379208069653, + "grad_norm": 1.542862057685852, + "learning_rate": 7.3177957361713305e-06, + "loss": 1.501, + "step": 11180 + }, + { + "epoch": 0.6112925932506869, + "grad_norm": 1.3841599225997925, + "learning_rate": 7.316035382489464e-06, + "loss": 1.4981, + "step": 11181 + }, + { + "epoch": 0.6113472656944083, + "grad_norm": 1.4091547727584839, + "learning_rate": 7.31427511843148e-06, + "loss": 1.3736, + "step": 11182 + }, + { + "epoch": 0.6114019381381299, + "grad_norm": 1.5819592475891113, + "learning_rate": 7.3125149440561505e-06, + "loss": 1.4359, + "step": 11183 + }, + { + "epoch": 0.6114566105818515, + "grad_norm": 2.097088575363159, + "learning_rate": 7.310754859422253e-06, + "loss": 1.6284, + "step": 11184 + }, + { + "epoch": 0.611511283025573, + "grad_norm": 1.4195376634597778, + "learning_rate": 7.308994864588562e-06, + "loss": 1.4069, + "step": 11185 + }, + { + "epoch": 0.6115659554692946, + "grad_norm": 1.3853001594543457, + "learning_rate": 7.3072349596138435e-06, + "loss": 1.4962, + "step": 11186 + }, + { + "epoch": 0.6116206279130162, + "grad_norm": 1.4776479005813599, + "learning_rate": 7.305475144556873e-06, + "loss": 1.4278, + "step": 11187 + }, + { + "epoch": 0.6116753003567377, + "grad_norm": 1.9758784770965576, + "learning_rate": 7.30371541947641e-06, + "loss": 1.367, + "step": 11188 + }, + { + "epoch": 0.6117299728004593, + "grad_norm": 1.5167649984359741, + "learning_rate": 7.301955784431214e-06, + "loss": 1.5834, + "step": 11189 + }, + { + "epoch": 0.6117846452441807, + "grad_norm": 2.0203418731689453, + "learning_rate": 7.300196239480046e-06, + "loss": 1.4275, + "step": 11190 + }, + { + "epoch": 0.6118393176879023, + "grad_norm": 2.2079076766967773, + "learning_rate": 7.298436784681664e-06, + "loss": 1.3994, + "step": 11191 + }, + { + "epoch": 0.6118939901316239, + "grad_norm": 1.607237458229065, + "learning_rate": 7.296677420094811e-06, + "loss": 1.5475, + "step": 11192 + }, + { + "epoch": 0.6119486625753454, + "grad_norm": 1.3555012941360474, + "learning_rate": 7.2949181457782466e-06, + "loss": 1.2783, + "step": 11193 + }, + { + "epoch": 0.612003335019067, + "grad_norm": 1.519151210784912, + "learning_rate": 7.293158961790714e-06, + "loss": 1.317, + "step": 11194 + }, + { + "epoch": 0.6120580074627886, + "grad_norm": 1.4711023569107056, + "learning_rate": 7.291399868190953e-06, + "loss": 1.5106, + "step": 11195 + }, + { + "epoch": 0.6121126799065101, + "grad_norm": 1.2379546165466309, + "learning_rate": 7.289640865037708e-06, + "loss": 1.4054, + "step": 11196 + }, + { + "epoch": 0.6121673523502317, + "grad_norm": 1.7432136535644531, + "learning_rate": 7.287881952389715e-06, + "loss": 1.3659, + "step": 11197 + }, + { + "epoch": 0.6122220247939533, + "grad_norm": 1.311533808708191, + "learning_rate": 7.286123130305702e-06, + "loss": 1.6328, + "step": 11198 + }, + { + "epoch": 0.6122766972376747, + "grad_norm": 1.5431138277053833, + "learning_rate": 7.284364398844412e-06, + "loss": 1.4225, + "step": 11199 + }, + { + "epoch": 0.6123313696813963, + "grad_norm": 1.8702104091644287, + "learning_rate": 7.282605758064563e-06, + "loss": 1.6844, + "step": 11200 + }, + { + "epoch": 0.6123860421251179, + "grad_norm": 1.640170693397522, + "learning_rate": 7.280847208024888e-06, + "loss": 1.603, + "step": 11201 + }, + { + "epoch": 0.6124407145688394, + "grad_norm": 1.2290890216827393, + "learning_rate": 7.279088748784106e-06, + "loss": 1.3326, + "step": 11202 + }, + { + "epoch": 0.612495387012561, + "grad_norm": 1.672586441040039, + "learning_rate": 7.277330380400933e-06, + "loss": 1.5035, + "step": 11203 + }, + { + "epoch": 0.6125500594562825, + "grad_norm": 1.4181894063949585, + "learning_rate": 7.275572102934089e-06, + "loss": 1.557, + "step": 11204 + }, + { + "epoch": 0.6126047319000041, + "grad_norm": 1.6114479303359985, + "learning_rate": 7.273813916442286e-06, + "loss": 1.1758, + "step": 11205 + }, + { + "epoch": 0.6126594043437257, + "grad_norm": 1.846899151802063, + "learning_rate": 7.27205582098423e-06, + "loss": 1.4594, + "step": 11206 + }, + { + "epoch": 0.6127140767874472, + "grad_norm": 1.472059965133667, + "learning_rate": 7.270297816618634e-06, + "loss": 1.4423, + "step": 11207 + }, + { + "epoch": 0.6127687492311688, + "grad_norm": 1.7187365293502808, + "learning_rate": 7.268539903404201e-06, + "loss": 1.2815, + "step": 11208 + }, + { + "epoch": 0.6128234216748903, + "grad_norm": 1.4212721586227417, + "learning_rate": 7.266782081399628e-06, + "loss": 1.5816, + "step": 11209 + }, + { + "epoch": 0.6128780941186118, + "grad_norm": 1.3829103708267212, + "learning_rate": 7.265024350663617e-06, + "loss": 1.412, + "step": 11210 + }, + { + "epoch": 0.6129327665623334, + "grad_norm": 1.7706668376922607, + "learning_rate": 7.263266711254859e-06, + "loss": 1.4766, + "step": 11211 + }, + { + "epoch": 0.612987439006055, + "grad_norm": 1.8651031255722046, + "learning_rate": 7.261509163232046e-06, + "loss": 1.4656, + "step": 11212 + }, + { + "epoch": 0.6130421114497765, + "grad_norm": 1.6619399785995483, + "learning_rate": 7.25975170665387e-06, + "loss": 1.2898, + "step": 11213 + }, + { + "epoch": 0.6130967838934981, + "grad_norm": 1.575763463973999, + "learning_rate": 7.2579943415790134e-06, + "loss": 1.2845, + "step": 11214 + }, + { + "epoch": 0.6131514563372197, + "grad_norm": 1.4241074323654175, + "learning_rate": 7.256237068066163e-06, + "loss": 1.5809, + "step": 11215 + }, + { + "epoch": 0.6132061287809412, + "grad_norm": 1.4265395402908325, + "learning_rate": 7.254479886173994e-06, + "loss": 1.377, + "step": 11216 + }, + { + "epoch": 0.6132608012246628, + "grad_norm": 1.9183828830718994, + "learning_rate": 7.25272279596118e-06, + "loss": 1.3146, + "step": 11217 + }, + { + "epoch": 0.6133154736683843, + "grad_norm": 2.364335775375366, + "learning_rate": 7.2509657974864045e-06, + "loss": 1.5444, + "step": 11218 + }, + { + "epoch": 0.6133701461121058, + "grad_norm": 2.3542497158050537, + "learning_rate": 7.24920889080833e-06, + "loss": 1.5015, + "step": 11219 + }, + { + "epoch": 0.6134248185558274, + "grad_norm": 1.3365931510925293, + "learning_rate": 7.247452075985622e-06, + "loss": 1.4967, + "step": 11220 + }, + { + "epoch": 0.6134794909995489, + "grad_norm": 2.0729901790618896, + "learning_rate": 7.245695353076953e-06, + "loss": 1.6361, + "step": 11221 + }, + { + "epoch": 0.6135341634432705, + "grad_norm": 1.550506830215454, + "learning_rate": 7.243938722140978e-06, + "loss": 1.4988, + "step": 11222 + }, + { + "epoch": 0.6135888358869921, + "grad_norm": 1.637242317199707, + "learning_rate": 7.242182183236356e-06, + "loss": 1.1836, + "step": 11223 + }, + { + "epoch": 0.6136435083307136, + "grad_norm": 1.763995885848999, + "learning_rate": 7.240425736421743e-06, + "loss": 1.5019, + "step": 11224 + }, + { + "epoch": 0.6136981807744352, + "grad_norm": 1.485520601272583, + "learning_rate": 7.238669381755791e-06, + "loss": 1.3717, + "step": 11225 + }, + { + "epoch": 0.6137528532181568, + "grad_norm": 1.7119613885879517, + "learning_rate": 7.236913119297144e-06, + "loss": 1.173, + "step": 11226 + }, + { + "epoch": 0.6138075256618782, + "grad_norm": 1.4839704036712646, + "learning_rate": 7.235156949104455e-06, + "loss": 1.4475, + "step": 11227 + }, + { + "epoch": 0.6138621981055998, + "grad_norm": 1.4733902215957642, + "learning_rate": 7.233400871236362e-06, + "loss": 1.7095, + "step": 11228 + }, + { + "epoch": 0.6139168705493214, + "grad_norm": 1.5378608703613281, + "learning_rate": 7.2316448857515076e-06, + "loss": 1.3675, + "step": 11229 + }, + { + "epoch": 0.6139715429930429, + "grad_norm": 1.427270770072937, + "learning_rate": 7.229888992708527e-06, + "loss": 1.5122, + "step": 11230 + }, + { + "epoch": 0.6140262154367645, + "grad_norm": 1.5339518785476685, + "learning_rate": 7.228133192166049e-06, + "loss": 1.4894, + "step": 11231 + }, + { + "epoch": 0.6140808878804861, + "grad_norm": 1.7719238996505737, + "learning_rate": 7.226377484182712e-06, + "loss": 1.4459, + "step": 11232 + }, + { + "epoch": 0.6141355603242076, + "grad_norm": 1.4762464761734009, + "learning_rate": 7.224621868817139e-06, + "loss": 1.3157, + "step": 11233 + }, + { + "epoch": 0.6141902327679292, + "grad_norm": 1.4526363611221313, + "learning_rate": 7.222866346127952e-06, + "loss": 1.6231, + "step": 11234 + }, + { + "epoch": 0.6142449052116506, + "grad_norm": 1.6684788465499878, + "learning_rate": 7.221110916173778e-06, + "loss": 1.2985, + "step": 11235 + }, + { + "epoch": 0.6142995776553722, + "grad_norm": 1.5004498958587646, + "learning_rate": 7.2193555790132296e-06, + "loss": 1.4911, + "step": 11236 + }, + { + "epoch": 0.6143542500990938, + "grad_norm": 1.6499230861663818, + "learning_rate": 7.217600334704922e-06, + "loss": 1.5313, + "step": 11237 + }, + { + "epoch": 0.6144089225428153, + "grad_norm": 1.5789332389831543, + "learning_rate": 7.21584518330747e-06, + "loss": 1.5533, + "step": 11238 + }, + { + "epoch": 0.6144635949865369, + "grad_norm": 1.5590800046920776, + "learning_rate": 7.214090124879481e-06, + "loss": 1.413, + "step": 11239 + }, + { + "epoch": 0.6145182674302585, + "grad_norm": 1.3136307001113892, + "learning_rate": 7.212335159479557e-06, + "loss": 1.6136, + "step": 11240 + }, + { + "epoch": 0.61457293987398, + "grad_norm": 1.2915900945663452, + "learning_rate": 7.210580287166307e-06, + "loss": 1.7388, + "step": 11241 + }, + { + "epoch": 0.6146276123177016, + "grad_norm": 1.7305477857589722, + "learning_rate": 7.208825507998326e-06, + "loss": 1.2802, + "step": 11242 + }, + { + "epoch": 0.6146822847614232, + "grad_norm": 1.3720003366470337, + "learning_rate": 7.20707082203421e-06, + "loss": 1.6848, + "step": 11243 + }, + { + "epoch": 0.6147369572051447, + "grad_norm": 1.4289815425872803, + "learning_rate": 7.205316229332555e-06, + "loss": 1.4266, + "step": 11244 + }, + { + "epoch": 0.6147916296488662, + "grad_norm": 1.6322166919708252, + "learning_rate": 7.203561729951944e-06, + "loss": 1.2452, + "step": 11245 + }, + { + "epoch": 0.6148463020925878, + "grad_norm": 1.5117839574813843, + "learning_rate": 7.201807323950973e-06, + "loss": 1.5374, + "step": 11246 + }, + { + "epoch": 0.6149009745363093, + "grad_norm": 1.4713413715362549, + "learning_rate": 7.200053011388223e-06, + "loss": 1.2965, + "step": 11247 + }, + { + "epoch": 0.6149556469800309, + "grad_norm": 1.3154767751693726, + "learning_rate": 7.198298792322271e-06, + "loss": 1.38, + "step": 11248 + }, + { + "epoch": 0.6150103194237524, + "grad_norm": 1.4814364910125732, + "learning_rate": 7.196544666811698e-06, + "loss": 1.4322, + "step": 11249 + }, + { + "epoch": 0.615064991867474, + "grad_norm": 1.4340389966964722, + "learning_rate": 7.194790634915077e-06, + "loss": 1.3911, + "step": 11250 + }, + { + "epoch": 0.6151196643111956, + "grad_norm": 1.4068740606307983, + "learning_rate": 7.193036696690976e-06, + "loss": 1.6361, + "step": 11251 + }, + { + "epoch": 0.6151743367549171, + "grad_norm": 1.629778265953064, + "learning_rate": 7.1912828521979695e-06, + "loss": 1.6221, + "step": 11252 + }, + { + "epoch": 0.6152290091986387, + "grad_norm": 1.8669359683990479, + "learning_rate": 7.18952910149462e-06, + "loss": 1.5469, + "step": 11253 + }, + { + "epoch": 0.6152836816423602, + "grad_norm": 1.3381799459457397, + "learning_rate": 7.1877754446394865e-06, + "loss": 1.5327, + "step": 11254 + }, + { + "epoch": 0.6153383540860817, + "grad_norm": 1.5031764507293701, + "learning_rate": 7.186021881691132e-06, + "loss": 1.416, + "step": 11255 + }, + { + "epoch": 0.6153930265298033, + "grad_norm": 1.6488858461380005, + "learning_rate": 7.184268412708111e-06, + "loss": 1.3829, + "step": 11256 + }, + { + "epoch": 0.6154476989735249, + "grad_norm": 1.5429956912994385, + "learning_rate": 7.182515037748969e-06, + "loss": 1.3517, + "step": 11257 + }, + { + "epoch": 0.6155023714172464, + "grad_norm": 2.15948224067688, + "learning_rate": 7.1807617568722674e-06, + "loss": 1.326, + "step": 11258 + }, + { + "epoch": 0.615557043860968, + "grad_norm": 1.6708383560180664, + "learning_rate": 7.179008570136543e-06, + "loss": 1.5271, + "step": 11259 + }, + { + "epoch": 0.6156117163046896, + "grad_norm": 1.581773042678833, + "learning_rate": 7.177255477600346e-06, + "loss": 1.4244, + "step": 11260 + }, + { + "epoch": 0.6156663887484111, + "grad_norm": 1.4308412075042725, + "learning_rate": 7.175502479322211e-06, + "loss": 1.4417, + "step": 11261 + }, + { + "epoch": 0.6157210611921327, + "grad_norm": 1.4269601106643677, + "learning_rate": 7.173749575360671e-06, + "loss": 1.5276, + "step": 11262 + }, + { + "epoch": 0.6157757336358541, + "grad_norm": 1.5103983879089355, + "learning_rate": 7.1719967657742696e-06, + "loss": 1.5337, + "step": 11263 + }, + { + "epoch": 0.6158304060795757, + "grad_norm": 1.4487314224243164, + "learning_rate": 7.170244050621533e-06, + "loss": 1.37, + "step": 11264 + }, + { + "epoch": 0.6158850785232973, + "grad_norm": 1.5742745399475098, + "learning_rate": 7.168491429960983e-06, + "loss": 1.4742, + "step": 11265 + }, + { + "epoch": 0.6159397509670188, + "grad_norm": 1.4424104690551758, + "learning_rate": 7.1667389038511535e-06, + "loss": 1.2978, + "step": 11266 + }, + { + "epoch": 0.6159944234107404, + "grad_norm": 1.312054991722107, + "learning_rate": 7.164986472350559e-06, + "loss": 1.4788, + "step": 11267 + }, + { + "epoch": 0.616049095854462, + "grad_norm": 1.549190878868103, + "learning_rate": 7.1632341355177185e-06, + "loss": 1.4678, + "step": 11268 + }, + { + "epoch": 0.6161037682981835, + "grad_norm": 1.5687532424926758, + "learning_rate": 7.1614818934111475e-06, + "loss": 1.2462, + "step": 11269 + }, + { + "epoch": 0.6161584407419051, + "grad_norm": 1.3738665580749512, + "learning_rate": 7.159729746089356e-06, + "loss": 1.3997, + "step": 11270 + }, + { + "epoch": 0.6162131131856267, + "grad_norm": 1.46299409866333, + "learning_rate": 7.1579776936108516e-06, + "loss": 1.4801, + "step": 11271 + }, + { + "epoch": 0.6162677856293481, + "grad_norm": 1.5765085220336914, + "learning_rate": 7.1562257360341434e-06, + "loss": 1.606, + "step": 11272 + }, + { + "epoch": 0.6163224580730697, + "grad_norm": 1.6900253295898438, + "learning_rate": 7.15447387341773e-06, + "loss": 1.3566, + "step": 11273 + }, + { + "epoch": 0.6163771305167913, + "grad_norm": 1.8336018323898315, + "learning_rate": 7.1527221058201136e-06, + "loss": 1.4898, + "step": 11274 + }, + { + "epoch": 0.6164318029605128, + "grad_norm": 1.529417872428894, + "learning_rate": 7.150970433299787e-06, + "loss": 1.2179, + "step": 11275 + }, + { + "epoch": 0.6164864754042344, + "grad_norm": 2.323982000350952, + "learning_rate": 7.1492188559152364e-06, + "loss": 1.4113, + "step": 11276 + }, + { + "epoch": 0.6165411478479559, + "grad_norm": 1.4577778577804565, + "learning_rate": 7.147467373724965e-06, + "loss": 1.5228, + "step": 11277 + }, + { + "epoch": 0.6165958202916775, + "grad_norm": 1.2919238805770874, + "learning_rate": 7.14571598678745e-06, + "loss": 1.4812, + "step": 11278 + }, + { + "epoch": 0.6166504927353991, + "grad_norm": 1.8857380151748657, + "learning_rate": 7.143964695161175e-06, + "loss": 1.3299, + "step": 11279 + }, + { + "epoch": 0.6167051651791206, + "grad_norm": 1.2176172733306885, + "learning_rate": 7.142213498904622e-06, + "loss": 1.4792, + "step": 11280 + }, + { + "epoch": 0.6167598376228421, + "grad_norm": 1.6774964332580566, + "learning_rate": 7.140462398076267e-06, + "loss": 1.6679, + "step": 11281 + }, + { + "epoch": 0.6168145100665637, + "grad_norm": 1.483738899230957, + "learning_rate": 7.138711392734579e-06, + "loss": 1.2712, + "step": 11282 + }, + { + "epoch": 0.6168691825102852, + "grad_norm": 1.6387643814086914, + "learning_rate": 7.136960482938035e-06, + "loss": 1.4216, + "step": 11283 + }, + { + "epoch": 0.6169238549540068, + "grad_norm": 1.4342089891433716, + "learning_rate": 7.135209668745097e-06, + "loss": 1.4631, + "step": 11284 + }, + { + "epoch": 0.6169785273977284, + "grad_norm": 1.8186475038528442, + "learning_rate": 7.133458950214229e-06, + "loss": 1.6161, + "step": 11285 + }, + { + "epoch": 0.6170331998414499, + "grad_norm": 1.6642022132873535, + "learning_rate": 7.131708327403897e-06, + "loss": 1.502, + "step": 11286 + }, + { + "epoch": 0.6170878722851715, + "grad_norm": 1.3822505474090576, + "learning_rate": 7.129957800372554e-06, + "loss": 1.4172, + "step": 11287 + }, + { + "epoch": 0.6171425447288931, + "grad_norm": 1.6530027389526367, + "learning_rate": 7.128207369178654e-06, + "loss": 1.6538, + "step": 11288 + }, + { + "epoch": 0.6171972171726146, + "grad_norm": 1.6328905820846558, + "learning_rate": 7.12645703388065e-06, + "loss": 1.6575, + "step": 11289 + }, + { + "epoch": 0.6172518896163361, + "grad_norm": 1.4530723094940186, + "learning_rate": 7.124706794536984e-06, + "loss": 1.2568, + "step": 11290 + }, + { + "epoch": 0.6173065620600576, + "grad_norm": 1.7239394187927246, + "learning_rate": 7.12295665120611e-06, + "loss": 1.1926, + "step": 11291 + }, + { + "epoch": 0.6173612345037792, + "grad_norm": 1.4491811990737915, + "learning_rate": 7.1212066039464645e-06, + "loss": 1.5379, + "step": 11292 + }, + { + "epoch": 0.6174159069475008, + "grad_norm": 1.4021509885787964, + "learning_rate": 7.119456652816483e-06, + "loss": 1.2938, + "step": 11293 + }, + { + "epoch": 0.6174705793912223, + "grad_norm": 1.6725143194198608, + "learning_rate": 7.117706797874606e-06, + "loss": 1.4654, + "step": 11294 + }, + { + "epoch": 0.6175252518349439, + "grad_norm": 1.8038337230682373, + "learning_rate": 7.115957039179263e-06, + "loss": 1.6268, + "step": 11295 + }, + { + "epoch": 0.6175799242786655, + "grad_norm": 1.15005362033844, + "learning_rate": 7.1142073767888774e-06, + "loss": 1.7423, + "step": 11296 + }, + { + "epoch": 0.617634596722387, + "grad_norm": 1.4514304399490356, + "learning_rate": 7.112457810761883e-06, + "loss": 1.5585, + "step": 11297 + }, + { + "epoch": 0.6176892691661086, + "grad_norm": 1.5547834634780884, + "learning_rate": 7.1107083411566994e-06, + "loss": 1.4718, + "step": 11298 + }, + { + "epoch": 0.6177439416098301, + "grad_norm": 1.6978785991668701, + "learning_rate": 7.10895896803174e-06, + "loss": 1.6465, + "step": 11299 + }, + { + "epoch": 0.6177986140535516, + "grad_norm": 1.4438306093215942, + "learning_rate": 7.107209691445429e-06, + "loss": 1.5043, + "step": 11300 + }, + { + "epoch": 0.6178532864972732, + "grad_norm": 1.678297996520996, + "learning_rate": 7.10546051145617e-06, + "loss": 1.3312, + "step": 11301 + }, + { + "epoch": 0.6179079589409948, + "grad_norm": 1.5490323305130005, + "learning_rate": 7.10371142812238e-06, + "loss": 1.304, + "step": 11302 + }, + { + "epoch": 0.6179626313847163, + "grad_norm": 1.4835631847381592, + "learning_rate": 7.101962441502462e-06, + "loss": 1.5729, + "step": 11303 + }, + { + "epoch": 0.6180173038284379, + "grad_norm": 2.2630739212036133, + "learning_rate": 7.100213551654816e-06, + "loss": 1.5423, + "step": 11304 + }, + { + "epoch": 0.6180719762721594, + "grad_norm": 1.3555306196212769, + "learning_rate": 7.098464758637846e-06, + "loss": 1.3749, + "step": 11305 + }, + { + "epoch": 0.618126648715881, + "grad_norm": 1.546351671218872, + "learning_rate": 7.096716062509947e-06, + "loss": 1.4488, + "step": 11306 + }, + { + "epoch": 0.6181813211596026, + "grad_norm": 1.4902551174163818, + "learning_rate": 7.0949674633295094e-06, + "loss": 1.3693, + "step": 11307 + }, + { + "epoch": 0.618235993603324, + "grad_norm": 1.3330464363098145, + "learning_rate": 7.093218961154926e-06, + "loss": 1.7224, + "step": 11308 + }, + { + "epoch": 0.6182906660470456, + "grad_norm": 2.044011116027832, + "learning_rate": 7.091470556044584e-06, + "loss": 1.5744, + "step": 11309 + }, + { + "epoch": 0.6183453384907672, + "grad_norm": 1.5248863697052002, + "learning_rate": 7.089722248056862e-06, + "loss": 1.4126, + "step": 11310 + }, + { + "epoch": 0.6184000109344887, + "grad_norm": 1.4210618734359741, + "learning_rate": 7.087974037250146e-06, + "loss": 1.3214, + "step": 11311 + }, + { + "epoch": 0.6184546833782103, + "grad_norm": 1.5331138372421265, + "learning_rate": 7.08622592368281e-06, + "loss": 1.3433, + "step": 11312 + }, + { + "epoch": 0.6185093558219319, + "grad_norm": 1.4332351684570312, + "learning_rate": 7.084477907413226e-06, + "loss": 1.4835, + "step": 11313 + }, + { + "epoch": 0.6185640282656534, + "grad_norm": 1.7265453338623047, + "learning_rate": 7.082729988499768e-06, + "loss": 1.507, + "step": 11314 + }, + { + "epoch": 0.618618700709375, + "grad_norm": 1.4977245330810547, + "learning_rate": 7.080982167000799e-06, + "loss": 1.5052, + "step": 11315 + }, + { + "epoch": 0.6186733731530966, + "grad_norm": 1.595024824142456, + "learning_rate": 7.079234442974688e-06, + "loss": 1.2623, + "step": 11316 + }, + { + "epoch": 0.618728045596818, + "grad_norm": 1.4534627199172974, + "learning_rate": 7.077486816479792e-06, + "loss": 1.3666, + "step": 11317 + }, + { + "epoch": 0.6187827180405396, + "grad_norm": 1.667556643486023, + "learning_rate": 7.075739287574467e-06, + "loss": 1.4971, + "step": 11318 + }, + { + "epoch": 0.6188373904842611, + "grad_norm": 1.518512487411499, + "learning_rate": 7.073991856317072e-06, + "loss": 1.3147, + "step": 11319 + }, + { + "epoch": 0.6188920629279827, + "grad_norm": 1.4207017421722412, + "learning_rate": 7.072244522765954e-06, + "loss": 1.6682, + "step": 11320 + }, + { + "epoch": 0.6189467353717043, + "grad_norm": 1.482910394668579, + "learning_rate": 7.070497286979459e-06, + "loss": 1.4135, + "step": 11321 + }, + { + "epoch": 0.6190014078154258, + "grad_norm": 1.6164419651031494, + "learning_rate": 7.068750149015937e-06, + "loss": 1.4089, + "step": 11322 + }, + { + "epoch": 0.6190560802591474, + "grad_norm": 2.005305290222168, + "learning_rate": 7.067003108933725e-06, + "loss": 1.4012, + "step": 11323 + }, + { + "epoch": 0.619110752702869, + "grad_norm": 1.5075533390045166, + "learning_rate": 7.0652561667911605e-06, + "loss": 1.5207, + "step": 11324 + }, + { + "epoch": 0.6191654251465905, + "grad_norm": 1.6045595407485962, + "learning_rate": 7.063509322646581e-06, + "loss": 1.4673, + "step": 11325 + }, + { + "epoch": 0.619220097590312, + "grad_norm": 1.3518534898757935, + "learning_rate": 7.061762576558316e-06, + "loss": 1.3634, + "step": 11326 + }, + { + "epoch": 0.6192747700340336, + "grad_norm": 1.8542567491531372, + "learning_rate": 7.060015928584691e-06, + "loss": 1.721, + "step": 11327 + }, + { + "epoch": 0.6193294424777551, + "grad_norm": 1.9349510669708252, + "learning_rate": 7.058269378784037e-06, + "loss": 1.3852, + "step": 11328 + }, + { + "epoch": 0.6193841149214767, + "grad_norm": 1.658003807067871, + "learning_rate": 7.056522927214666e-06, + "loss": 1.4123, + "step": 11329 + }, + { + "epoch": 0.6194387873651983, + "grad_norm": 1.5262209177017212, + "learning_rate": 7.054776573934906e-06, + "loss": 1.3126, + "step": 11330 + }, + { + "epoch": 0.6194934598089198, + "grad_norm": 1.340118169784546, + "learning_rate": 7.053030319003067e-06, + "loss": 1.3446, + "step": 11331 + }, + { + "epoch": 0.6195481322526414, + "grad_norm": 2.3311946392059326, + "learning_rate": 7.051284162477459e-06, + "loss": 1.3639, + "step": 11332 + }, + { + "epoch": 0.6196028046963629, + "grad_norm": 1.342735767364502, + "learning_rate": 7.049538104416395e-06, + "loss": 1.4848, + "step": 11333 + }, + { + "epoch": 0.6196574771400845, + "grad_norm": 1.2663503885269165, + "learning_rate": 7.047792144878176e-06, + "loss": 1.4071, + "step": 11334 + }, + { + "epoch": 0.619712149583806, + "grad_norm": 1.4455547332763672, + "learning_rate": 7.046046283921102e-06, + "loss": 1.6312, + "step": 11335 + }, + { + "epoch": 0.6197668220275275, + "grad_norm": 2.4003121852874756, + "learning_rate": 7.044300521603476e-06, + "loss": 1.228, + "step": 11336 + }, + { + "epoch": 0.6198214944712491, + "grad_norm": 1.3814276456832886, + "learning_rate": 7.042554857983594e-06, + "loss": 1.4066, + "step": 11337 + }, + { + "epoch": 0.6198761669149707, + "grad_norm": 1.973291277885437, + "learning_rate": 7.040809293119741e-06, + "loss": 1.4122, + "step": 11338 + }, + { + "epoch": 0.6199308393586922, + "grad_norm": 1.3901803493499756, + "learning_rate": 7.039063827070214e-06, + "loss": 1.6264, + "step": 11339 + }, + { + "epoch": 0.6199855118024138, + "grad_norm": 1.3091086149215698, + "learning_rate": 7.037318459893292e-06, + "loss": 1.6452, + "step": 11340 + }, + { + "epoch": 0.6200401842461354, + "grad_norm": 1.1027815341949463, + "learning_rate": 7.035573191647256e-06, + "loss": 1.5865, + "step": 11341 + }, + { + "epoch": 0.6200948566898569, + "grad_norm": 1.530860424041748, + "learning_rate": 7.0338280223903895e-06, + "loss": 1.3578, + "step": 11342 + }, + { + "epoch": 0.6201495291335785, + "grad_norm": 2.0049936771392822, + "learning_rate": 7.032082952180963e-06, + "loss": 1.4381, + "step": 11343 + }, + { + "epoch": 0.6202042015773, + "grad_norm": 1.324973225593567, + "learning_rate": 7.030337981077255e-06, + "loss": 1.377, + "step": 11344 + }, + { + "epoch": 0.6202588740210215, + "grad_norm": 1.423218011856079, + "learning_rate": 7.028593109137531e-06, + "loss": 1.5358, + "step": 11345 + }, + { + "epoch": 0.6203135464647431, + "grad_norm": 1.4772859811782837, + "learning_rate": 7.026848336420053e-06, + "loss": 1.5484, + "step": 11346 + }, + { + "epoch": 0.6203682189084646, + "grad_norm": 1.4506886005401611, + "learning_rate": 7.025103662983088e-06, + "loss": 1.5415, + "step": 11347 + }, + { + "epoch": 0.6204228913521862, + "grad_norm": 1.451756238937378, + "learning_rate": 7.023359088884892e-06, + "loss": 1.4864, + "step": 11348 + }, + { + "epoch": 0.6204775637959078, + "grad_norm": 1.40500807762146, + "learning_rate": 7.021614614183719e-06, + "loss": 1.5, + "step": 11349 + }, + { + "epoch": 0.6205322362396293, + "grad_norm": 1.6555454730987549, + "learning_rate": 7.019870238937825e-06, + "loss": 1.514, + "step": 11350 + }, + { + "epoch": 0.6205869086833509, + "grad_norm": 1.3474652767181396, + "learning_rate": 7.0181259632054555e-06, + "loss": 1.3119, + "step": 11351 + }, + { + "epoch": 0.6206415811270725, + "grad_norm": 1.3833473920822144, + "learning_rate": 7.016381787044857e-06, + "loss": 1.5569, + "step": 11352 + }, + { + "epoch": 0.6206962535707939, + "grad_norm": 1.4037421941757202, + "learning_rate": 7.014637710514274e-06, + "loss": 1.3781, + "step": 11353 + }, + { + "epoch": 0.6207509260145155, + "grad_norm": 1.6895873546600342, + "learning_rate": 7.012893733671944e-06, + "loss": 1.4735, + "step": 11354 + }, + { + "epoch": 0.6208055984582371, + "grad_norm": 1.8576700687408447, + "learning_rate": 7.011149856576096e-06, + "loss": 1.4669, + "step": 11355 + }, + { + "epoch": 0.6208602709019586, + "grad_norm": 1.39018714427948, + "learning_rate": 7.009406079284973e-06, + "loss": 1.3925, + "step": 11356 + }, + { + "epoch": 0.6209149433456802, + "grad_norm": 1.5546581745147705, + "learning_rate": 7.007662401856796e-06, + "loss": 1.1396, + "step": 11357 + }, + { + "epoch": 0.6209696157894018, + "grad_norm": 1.461875081062317, + "learning_rate": 7.005918824349796e-06, + "loss": 1.372, + "step": 11358 + }, + { + "epoch": 0.6210242882331233, + "grad_norm": 1.9631699323654175, + "learning_rate": 7.004175346822191e-06, + "loss": 1.3886, + "step": 11359 + }, + { + "epoch": 0.6210789606768449, + "grad_norm": 1.3004335165023804, + "learning_rate": 7.002431969332197e-06, + "loss": 1.471, + "step": 11360 + }, + { + "epoch": 0.6211336331205664, + "grad_norm": 1.4095059633255005, + "learning_rate": 7.00068869193804e-06, + "loss": 1.4931, + "step": 11361 + }, + { + "epoch": 0.621188305564288, + "grad_norm": 2.425631284713745, + "learning_rate": 6.9989455146979236e-06, + "loss": 1.4223, + "step": 11362 + }, + { + "epoch": 0.6212429780080095, + "grad_norm": 1.5658575296401978, + "learning_rate": 6.997202437670054e-06, + "loss": 1.701, + "step": 11363 + }, + { + "epoch": 0.621297650451731, + "grad_norm": 1.5437408685684204, + "learning_rate": 6.9954594609126484e-06, + "loss": 1.4451, + "step": 11364 + }, + { + "epoch": 0.6213523228954526, + "grad_norm": 1.9241944551467896, + "learning_rate": 6.9937165844838986e-06, + "loss": 1.4372, + "step": 11365 + }, + { + "epoch": 0.6214069953391742, + "grad_norm": 1.1357818841934204, + "learning_rate": 6.9919738084420055e-06, + "loss": 1.5098, + "step": 11366 + }, + { + "epoch": 0.6214616677828957, + "grad_norm": 1.5452214479446411, + "learning_rate": 6.990231132845169e-06, + "loss": 1.269, + "step": 11367 + }, + { + "epoch": 0.6215163402266173, + "grad_norm": 1.9173744916915894, + "learning_rate": 6.988488557751576e-06, + "loss": 1.5096, + "step": 11368 + }, + { + "epoch": 0.6215710126703389, + "grad_norm": 1.373375654220581, + "learning_rate": 6.986746083219412e-06, + "loss": 1.5766, + "step": 11369 + }, + { + "epoch": 0.6216256851140604, + "grad_norm": 1.5970295667648315, + "learning_rate": 6.985003709306872e-06, + "loss": 1.5004, + "step": 11370 + }, + { + "epoch": 0.621680357557782, + "grad_norm": 1.6172000169754028, + "learning_rate": 6.983261436072132e-06, + "loss": 1.174, + "step": 11371 + }, + { + "epoch": 0.6217350300015035, + "grad_norm": 1.5600913763046265, + "learning_rate": 6.981519263573373e-06, + "loss": 1.4454, + "step": 11372 + }, + { + "epoch": 0.621789702445225, + "grad_norm": 1.2849739789962769, + "learning_rate": 6.979777191868768e-06, + "loss": 1.3723, + "step": 11373 + }, + { + "epoch": 0.6218443748889466, + "grad_norm": 2.1352076530456543, + "learning_rate": 6.978035221016487e-06, + "loss": 1.2543, + "step": 11374 + }, + { + "epoch": 0.6218990473326681, + "grad_norm": 1.8882614374160767, + "learning_rate": 6.976293351074705e-06, + "loss": 1.4979, + "step": 11375 + }, + { + "epoch": 0.6219537197763897, + "grad_norm": 1.4393997192382812, + "learning_rate": 6.974551582101583e-06, + "loss": 1.4475, + "step": 11376 + }, + { + "epoch": 0.6220083922201113, + "grad_norm": 1.343148946762085, + "learning_rate": 6.97280991415528e-06, + "loss": 1.7464, + "step": 11377 + }, + { + "epoch": 0.6220630646638328, + "grad_norm": 1.446439266204834, + "learning_rate": 6.971068347293961e-06, + "loss": 1.5041, + "step": 11378 + }, + { + "epoch": 0.6221177371075544, + "grad_norm": 1.917500615119934, + "learning_rate": 6.969326881575777e-06, + "loss": 1.2708, + "step": 11379 + }, + { + "epoch": 0.622172409551276, + "grad_norm": 1.6582297086715698, + "learning_rate": 6.967585517058877e-06, + "loss": 1.4659, + "step": 11380 + }, + { + "epoch": 0.6222270819949974, + "grad_norm": 1.476306438446045, + "learning_rate": 6.965844253801416e-06, + "loss": 1.5015, + "step": 11381 + }, + { + "epoch": 0.622281754438719, + "grad_norm": 1.6985043287277222, + "learning_rate": 6.964103091861535e-06, + "loss": 1.4812, + "step": 11382 + }, + { + "epoch": 0.6223364268824406, + "grad_norm": 2.1637661457061768, + "learning_rate": 6.962362031297372e-06, + "loss": 1.3664, + "step": 11383 + }, + { + "epoch": 0.6223910993261621, + "grad_norm": 1.4181907176971436, + "learning_rate": 6.960621072167071e-06, + "loss": 1.4386, + "step": 11384 + }, + { + "epoch": 0.6224457717698837, + "grad_norm": 1.4279810190200806, + "learning_rate": 6.9588802145287645e-06, + "loss": 1.404, + "step": 11385 + }, + { + "epoch": 0.6225004442136053, + "grad_norm": 1.5797406435012817, + "learning_rate": 6.957139458440585e-06, + "loss": 1.3739, + "step": 11386 + }, + { + "epoch": 0.6225551166573268, + "grad_norm": 1.5002485513687134, + "learning_rate": 6.95539880396066e-06, + "loss": 1.2852, + "step": 11387 + }, + { + "epoch": 0.6226097891010484, + "grad_norm": 1.9034035205841064, + "learning_rate": 6.953658251147109e-06, + "loss": 1.3392, + "step": 11388 + }, + { + "epoch": 0.6226644615447698, + "grad_norm": 1.49906325340271, + "learning_rate": 6.951917800058061e-06, + "loss": 1.5309, + "step": 11389 + }, + { + "epoch": 0.6227191339884914, + "grad_norm": 1.5328285694122314, + "learning_rate": 6.950177450751631e-06, + "loss": 1.4366, + "step": 11390 + }, + { + "epoch": 0.622773806432213, + "grad_norm": 1.8956403732299805, + "learning_rate": 6.948437203285929e-06, + "loss": 1.3791, + "step": 11391 + }, + { + "epoch": 0.6228284788759345, + "grad_norm": 1.6971354484558105, + "learning_rate": 6.946697057719074e-06, + "loss": 1.4768, + "step": 11392 + }, + { + "epoch": 0.6228831513196561, + "grad_norm": 1.645932674407959, + "learning_rate": 6.944957014109167e-06, + "loss": 1.6253, + "step": 11393 + }, + { + "epoch": 0.6229378237633777, + "grad_norm": 1.6717156171798706, + "learning_rate": 6.943217072514311e-06, + "loss": 1.3719, + "step": 11394 + }, + { + "epoch": 0.6229924962070992, + "grad_norm": 1.3806374073028564, + "learning_rate": 6.941477232992614e-06, + "loss": 1.5828, + "step": 11395 + }, + { + "epoch": 0.6230471686508208, + "grad_norm": 1.6089264154434204, + "learning_rate": 6.939737495602169e-06, + "loss": 1.5641, + "step": 11396 + }, + { + "epoch": 0.6231018410945424, + "grad_norm": 1.596998929977417, + "learning_rate": 6.937997860401068e-06, + "loss": 1.2456, + "step": 11397 + }, + { + "epoch": 0.6231565135382638, + "grad_norm": 1.482506513595581, + "learning_rate": 6.936258327447406e-06, + "loss": 1.5428, + "step": 11398 + }, + { + "epoch": 0.6232111859819854, + "grad_norm": 2.011389970779419, + "learning_rate": 6.934518896799263e-06, + "loss": 1.0215, + "step": 11399 + }, + { + "epoch": 0.623265858425707, + "grad_norm": 1.567371129989624, + "learning_rate": 6.932779568514731e-06, + "loss": 1.4306, + "step": 11400 + }, + { + "epoch": 0.6233205308694285, + "grad_norm": 2.022134304046631, + "learning_rate": 6.9310403426518895e-06, + "loss": 1.2035, + "step": 11401 + }, + { + "epoch": 0.6233752033131501, + "grad_norm": 1.8272984027862549, + "learning_rate": 6.929301219268806e-06, + "loss": 1.4658, + "step": 11402 + }, + { + "epoch": 0.6234298757568716, + "grad_norm": 2.004667043685913, + "learning_rate": 6.9275621984235654e-06, + "loss": 1.4122, + "step": 11403 + }, + { + "epoch": 0.6234845482005932, + "grad_norm": 1.6536648273468018, + "learning_rate": 6.925823280174232e-06, + "loss": 1.449, + "step": 11404 + }, + { + "epoch": 0.6235392206443148, + "grad_norm": 1.5341815948486328, + "learning_rate": 6.924084464578871e-06, + "loss": 1.4339, + "step": 11405 + }, + { + "epoch": 0.6235938930880363, + "grad_norm": 1.7786390781402588, + "learning_rate": 6.92234575169555e-06, + "loss": 1.5283, + "step": 11406 + }, + { + "epoch": 0.6236485655317578, + "grad_norm": 1.6796973943710327, + "learning_rate": 6.920607141582327e-06, + "loss": 1.4984, + "step": 11407 + }, + { + "epoch": 0.6237032379754794, + "grad_norm": 1.4974743127822876, + "learning_rate": 6.918868634297252e-06, + "loss": 1.5637, + "step": 11408 + }, + { + "epoch": 0.6237579104192009, + "grad_norm": 1.1716686487197876, + "learning_rate": 6.917130229898387e-06, + "loss": 1.5114, + "step": 11409 + }, + { + "epoch": 0.6238125828629225, + "grad_norm": 1.1350476741790771, + "learning_rate": 6.9153919284437795e-06, + "loss": 1.514, + "step": 11410 + }, + { + "epoch": 0.6238672553066441, + "grad_norm": 1.5462225675582886, + "learning_rate": 6.913653729991472e-06, + "loss": 1.5897, + "step": 11411 + }, + { + "epoch": 0.6239219277503656, + "grad_norm": 1.5727322101593018, + "learning_rate": 6.911915634599511e-06, + "loss": 1.4474, + "step": 11412 + }, + { + "epoch": 0.6239766001940872, + "grad_norm": 1.1857000589370728, + "learning_rate": 6.9101776423259285e-06, + "loss": 1.4633, + "step": 11413 + }, + { + "epoch": 0.6240312726378088, + "grad_norm": 1.2992401123046875, + "learning_rate": 6.908439753228769e-06, + "loss": 1.2816, + "step": 11414 + }, + { + "epoch": 0.6240859450815303, + "grad_norm": 1.1707695722579956, + "learning_rate": 6.906701967366061e-06, + "loss": 1.4396, + "step": 11415 + }, + { + "epoch": 0.6241406175252519, + "grad_norm": 1.7150362730026245, + "learning_rate": 6.90496428479583e-06, + "loss": 1.4205, + "step": 11416 + }, + { + "epoch": 0.6241952899689734, + "grad_norm": 1.379136323928833, + "learning_rate": 6.903226705576107e-06, + "loss": 1.3817, + "step": 11417 + }, + { + "epoch": 0.6242499624126949, + "grad_norm": 1.4961756467819214, + "learning_rate": 6.90148922976491e-06, + "loss": 1.2982, + "step": 11418 + }, + { + "epoch": 0.6243046348564165, + "grad_norm": 1.3869283199310303, + "learning_rate": 6.899751857420256e-06, + "loss": 1.3589, + "step": 11419 + }, + { + "epoch": 0.624359307300138, + "grad_norm": 1.7831544876098633, + "learning_rate": 6.898014588600166e-06, + "loss": 1.7431, + "step": 11420 + }, + { + "epoch": 0.6244139797438596, + "grad_norm": 1.3355439901351929, + "learning_rate": 6.896277423362648e-06, + "loss": 1.7106, + "step": 11421 + }, + { + "epoch": 0.6244686521875812, + "grad_norm": 1.1942518949508667, + "learning_rate": 6.894540361765706e-06, + "loss": 1.5984, + "step": 11422 + }, + { + "epoch": 0.6245233246313027, + "grad_norm": 1.4455989599227905, + "learning_rate": 6.892803403867352e-06, + "loss": 1.5155, + "step": 11423 + }, + { + "epoch": 0.6245779970750243, + "grad_norm": 1.7018177509307861, + "learning_rate": 6.891066549725585e-06, + "loss": 1.4687, + "step": 11424 + }, + { + "epoch": 0.6246326695187459, + "grad_norm": 1.5803769826889038, + "learning_rate": 6.889329799398397e-06, + "loss": 1.5057, + "step": 11425 + }, + { + "epoch": 0.6246873419624673, + "grad_norm": 1.534232258796692, + "learning_rate": 6.88759315294379e-06, + "loss": 1.3454, + "step": 11426 + }, + { + "epoch": 0.6247420144061889, + "grad_norm": 1.760578989982605, + "learning_rate": 6.885856610419747e-06, + "loss": 1.2677, + "step": 11427 + }, + { + "epoch": 0.6247966868499105, + "grad_norm": 1.326806902885437, + "learning_rate": 6.884120171884263e-06, + "loss": 1.4677, + "step": 11428 + }, + { + "epoch": 0.624851359293632, + "grad_norm": 1.8789583444595337, + "learning_rate": 6.882383837395319e-06, + "loss": 1.1878, + "step": 11429 + }, + { + "epoch": 0.6249060317373536, + "grad_norm": 1.4343653917312622, + "learning_rate": 6.8806476070108905e-06, + "loss": 1.4197, + "step": 11430 + }, + { + "epoch": 0.6249607041810752, + "grad_norm": 1.6822841167449951, + "learning_rate": 6.878911480788961e-06, + "loss": 1.6023, + "step": 11431 + }, + { + "epoch": 0.6250153766247967, + "grad_norm": 1.4277268648147583, + "learning_rate": 6.8771754587875015e-06, + "loss": 1.2784, + "step": 11432 + }, + { + "epoch": 0.6250700490685183, + "grad_norm": 1.472919225692749, + "learning_rate": 6.875439541064477e-06, + "loss": 1.6666, + "step": 11433 + }, + { + "epoch": 0.6251247215122397, + "grad_norm": 1.4465314149856567, + "learning_rate": 6.873703727677863e-06, + "loss": 1.4003, + "step": 11434 + }, + { + "epoch": 0.6251793939559613, + "grad_norm": 1.2964444160461426, + "learning_rate": 6.8719680186856175e-06, + "loss": 1.431, + "step": 11435 + }, + { + "epoch": 0.6252340663996829, + "grad_norm": 1.7040340900421143, + "learning_rate": 6.870232414145696e-06, + "loss": 1.6015, + "step": 11436 + }, + { + "epoch": 0.6252887388434044, + "grad_norm": 1.2222744226455688, + "learning_rate": 6.868496914116063e-06, + "loss": 1.4945, + "step": 11437 + }, + { + "epoch": 0.625343411287126, + "grad_norm": 1.6460754871368408, + "learning_rate": 6.866761518654665e-06, + "loss": 1.5018, + "step": 11438 + }, + { + "epoch": 0.6253980837308476, + "grad_norm": 1.4240981340408325, + "learning_rate": 6.865026227819449e-06, + "loss": 1.4775, + "step": 11439 + }, + { + "epoch": 0.6254527561745691, + "grad_norm": 1.7670655250549316, + "learning_rate": 6.8632910416683674e-06, + "loss": 1.2674, + "step": 11440 + }, + { + "epoch": 0.6255074286182907, + "grad_norm": 1.688070297241211, + "learning_rate": 6.861555960259354e-06, + "loss": 1.5129, + "step": 11441 + }, + { + "epoch": 0.6255621010620123, + "grad_norm": 2.257840394973755, + "learning_rate": 6.859820983650356e-06, + "loss": 1.4722, + "step": 11442 + }, + { + "epoch": 0.6256167735057337, + "grad_norm": 1.5317450761795044, + "learning_rate": 6.858086111899304e-06, + "loss": 1.4732, + "step": 11443 + }, + { + "epoch": 0.6256714459494553, + "grad_norm": 1.4739024639129639, + "learning_rate": 6.856351345064127e-06, + "loss": 1.6115, + "step": 11444 + }, + { + "epoch": 0.6257261183931769, + "grad_norm": 1.278967022895813, + "learning_rate": 6.854616683202757e-06, + "loss": 1.4604, + "step": 11445 + }, + { + "epoch": 0.6257807908368984, + "grad_norm": 1.403859257698059, + "learning_rate": 6.852882126373118e-06, + "loss": 1.4666, + "step": 11446 + }, + { + "epoch": 0.62583546328062, + "grad_norm": 1.37687349319458, + "learning_rate": 6.851147674633125e-06, + "loss": 1.5844, + "step": 11447 + }, + { + "epoch": 0.6258901357243415, + "grad_norm": 1.6141566038131714, + "learning_rate": 6.849413328040705e-06, + "loss": 1.5255, + "step": 11448 + }, + { + "epoch": 0.6259448081680631, + "grad_norm": 1.6823092699050903, + "learning_rate": 6.8476790866537665e-06, + "loss": 1.5854, + "step": 11449 + }, + { + "epoch": 0.6259994806117847, + "grad_norm": 1.4342718124389648, + "learning_rate": 6.845944950530219e-06, + "loss": 1.4064, + "step": 11450 + }, + { + "epoch": 0.6260541530555062, + "grad_norm": 1.4616444110870361, + "learning_rate": 6.844210919727971e-06, + "loss": 1.528, + "step": 11451 + }, + { + "epoch": 0.6261088254992278, + "grad_norm": 1.4810489416122437, + "learning_rate": 6.842476994304929e-06, + "loss": 1.483, + "step": 11452 + }, + { + "epoch": 0.6261634979429493, + "grad_norm": 1.621923565864563, + "learning_rate": 6.840743174318982e-06, + "loss": 1.4488, + "step": 11453 + }, + { + "epoch": 0.6262181703866708, + "grad_norm": 1.5607221126556396, + "learning_rate": 6.839009459828041e-06, + "loss": 1.6127, + "step": 11454 + }, + { + "epoch": 0.6262728428303924, + "grad_norm": 1.5573385953903198, + "learning_rate": 6.837275850889987e-06, + "loss": 1.4813, + "step": 11455 + }, + { + "epoch": 0.626327515274114, + "grad_norm": 1.9763848781585693, + "learning_rate": 6.835542347562717e-06, + "loss": 1.3067, + "step": 11456 + }, + { + "epoch": 0.6263821877178355, + "grad_norm": 1.4663636684417725, + "learning_rate": 6.8338089499041135e-06, + "loss": 1.4515, + "step": 11457 + }, + { + "epoch": 0.6264368601615571, + "grad_norm": 1.671555519104004, + "learning_rate": 6.8320756579720545e-06, + "loss": 1.4267, + "step": 11458 + }, + { + "epoch": 0.6264915326052787, + "grad_norm": 1.5715205669403076, + "learning_rate": 6.830342471824428e-06, + "loss": 1.4242, + "step": 11459 + }, + { + "epoch": 0.6265462050490002, + "grad_norm": 1.4004842042922974, + "learning_rate": 6.828609391519103e-06, + "loss": 1.4904, + "step": 11460 + }, + { + "epoch": 0.6266008774927218, + "grad_norm": 1.708982229232788, + "learning_rate": 6.82687641711395e-06, + "loss": 1.4335, + "step": 11461 + }, + { + "epoch": 0.6266555499364432, + "grad_norm": 1.620648741722107, + "learning_rate": 6.825143548666841e-06, + "loss": 1.4716, + "step": 11462 + }, + { + "epoch": 0.6267102223801648, + "grad_norm": 1.5636001825332642, + "learning_rate": 6.823410786235643e-06, + "loss": 1.4184, + "step": 11463 + }, + { + "epoch": 0.6267648948238864, + "grad_norm": 1.7688616514205933, + "learning_rate": 6.821678129878206e-06, + "loss": 1.3861, + "step": 11464 + }, + { + "epoch": 0.6268195672676079, + "grad_norm": 1.9931888580322266, + "learning_rate": 6.819945579652401e-06, + "loss": 1.6231, + "step": 11465 + }, + { + "epoch": 0.6268742397113295, + "grad_norm": 1.4624019861221313, + "learning_rate": 6.818213135616072e-06, + "loss": 1.3598, + "step": 11466 + }, + { + "epoch": 0.6269289121550511, + "grad_norm": 1.648319959640503, + "learning_rate": 6.81648079782707e-06, + "loss": 1.4229, + "step": 11467 + }, + { + "epoch": 0.6269835845987726, + "grad_norm": 1.4410252571105957, + "learning_rate": 6.814748566343248e-06, + "loss": 1.2722, + "step": 11468 + }, + { + "epoch": 0.6270382570424942, + "grad_norm": 1.5139027833938599, + "learning_rate": 6.813016441222444e-06, + "loss": 1.2807, + "step": 11469 + }, + { + "epoch": 0.6270929294862158, + "grad_norm": 1.425146222114563, + "learning_rate": 6.8112844225225015e-06, + "loss": 1.4589, + "step": 11470 + }, + { + "epoch": 0.6271476019299372, + "grad_norm": 1.5392687320709229, + "learning_rate": 6.809552510301255e-06, + "loss": 1.3923, + "step": 11471 + }, + { + "epoch": 0.6272022743736588, + "grad_norm": 1.9288345575332642, + "learning_rate": 6.807820704616532e-06, + "loss": 1.6061, + "step": 11472 + }, + { + "epoch": 0.6272569468173804, + "grad_norm": 1.3916956186294556, + "learning_rate": 6.806089005526171e-06, + "loss": 1.2947, + "step": 11473 + }, + { + "epoch": 0.6273116192611019, + "grad_norm": 1.2916239500045776, + "learning_rate": 6.804357413087993e-06, + "loss": 1.2848, + "step": 11474 + }, + { + "epoch": 0.6273662917048235, + "grad_norm": 1.4189035892486572, + "learning_rate": 6.802625927359818e-06, + "loss": 1.3302, + "step": 11475 + }, + { + "epoch": 0.627420964148545, + "grad_norm": 1.5615055561065674, + "learning_rate": 6.800894548399467e-06, + "loss": 1.3651, + "step": 11476 + }, + { + "epoch": 0.6274756365922666, + "grad_norm": 1.420992136001587, + "learning_rate": 6.799163276264756e-06, + "loss": 1.5937, + "step": 11477 + }, + { + "epoch": 0.6275303090359882, + "grad_norm": 1.3725337982177734, + "learning_rate": 6.797432111013488e-06, + "loss": 1.6394, + "step": 11478 + }, + { + "epoch": 0.6275849814797096, + "grad_norm": 1.6630194187164307, + "learning_rate": 6.795701052703482e-06, + "loss": 1.5285, + "step": 11479 + }, + { + "epoch": 0.6276396539234312, + "grad_norm": 1.9384119510650635, + "learning_rate": 6.793970101392537e-06, + "loss": 1.5027, + "step": 11480 + }, + { + "epoch": 0.6276943263671528, + "grad_norm": 1.4688572883605957, + "learning_rate": 6.792239257138449e-06, + "loss": 1.5063, + "step": 11481 + }, + { + "epoch": 0.6277489988108743, + "grad_norm": 1.3088688850402832, + "learning_rate": 6.790508519999024e-06, + "loss": 1.629, + "step": 11482 + }, + { + "epoch": 0.6278036712545959, + "grad_norm": 1.6467381715774536, + "learning_rate": 6.788777890032048e-06, + "loss": 1.4127, + "step": 11483 + }, + { + "epoch": 0.6278583436983175, + "grad_norm": 1.5445975065231323, + "learning_rate": 6.787047367295316e-06, + "loss": 1.5423, + "step": 11484 + }, + { + "epoch": 0.627913016142039, + "grad_norm": 2.3864026069641113, + "learning_rate": 6.785316951846612e-06, + "loss": 1.4334, + "step": 11485 + }, + { + "epoch": 0.6279676885857606, + "grad_norm": 1.577447533607483, + "learning_rate": 6.783586643743714e-06, + "loss": 1.5113, + "step": 11486 + }, + { + "epoch": 0.6280223610294822, + "grad_norm": 1.431781530380249, + "learning_rate": 6.78185644304441e-06, + "loss": 1.6522, + "step": 11487 + }, + { + "epoch": 0.6280770334732036, + "grad_norm": 1.4022389650344849, + "learning_rate": 6.7801263498064705e-06, + "loss": 1.6314, + "step": 11488 + }, + { + "epoch": 0.6281317059169252, + "grad_norm": 1.1616288423538208, + "learning_rate": 6.778396364087667e-06, + "loss": 1.616, + "step": 11489 + }, + { + "epoch": 0.6281863783606467, + "grad_norm": 1.8592464923858643, + "learning_rate": 6.77666648594577e-06, + "loss": 1.4428, + "step": 11490 + }, + { + "epoch": 0.6282410508043683, + "grad_norm": 1.492042064666748, + "learning_rate": 6.7749367154385424e-06, + "loss": 1.4171, + "step": 11491 + }, + { + "epoch": 0.6282957232480899, + "grad_norm": 1.5614054203033447, + "learning_rate": 6.773207052623743e-06, + "loss": 1.374, + "step": 11492 + }, + { + "epoch": 0.6283503956918114, + "grad_norm": 1.4456466436386108, + "learning_rate": 6.7714774975591335e-06, + "loss": 1.5806, + "step": 11493 + }, + { + "epoch": 0.628405068135533, + "grad_norm": 1.2504156827926636, + "learning_rate": 6.769748050302469e-06, + "loss": 1.5508, + "step": 11494 + }, + { + "epoch": 0.6284597405792546, + "grad_norm": 2.3801283836364746, + "learning_rate": 6.7680187109114936e-06, + "loss": 1.3587, + "step": 11495 + }, + { + "epoch": 0.6285144130229761, + "grad_norm": 1.9999282360076904, + "learning_rate": 6.766289479443959e-06, + "loss": 1.5077, + "step": 11496 + }, + { + "epoch": 0.6285690854666977, + "grad_norm": 1.4093977212905884, + "learning_rate": 6.7645603559576045e-06, + "loss": 1.5352, + "step": 11497 + }, + { + "epoch": 0.6286237579104192, + "grad_norm": 1.351442575454712, + "learning_rate": 6.762831340510175e-06, + "loss": 1.5896, + "step": 11498 + }, + { + "epoch": 0.6286784303541407, + "grad_norm": 1.6834956407546997, + "learning_rate": 6.761102433159403e-06, + "loss": 1.3363, + "step": 11499 + }, + { + "epoch": 0.6287331027978623, + "grad_norm": 1.4698385000228882, + "learning_rate": 6.75937363396302e-06, + "loss": 1.4701, + "step": 11500 + }, + { + "epoch": 0.6287877752415839, + "grad_norm": 1.3453996181488037, + "learning_rate": 6.7576449429787585e-06, + "loss": 1.6032, + "step": 11501 + }, + { + "epoch": 0.6288424476853054, + "grad_norm": 1.4374028444290161, + "learning_rate": 6.755916360264339e-06, + "loss": 1.4887, + "step": 11502 + }, + { + "epoch": 0.628897120129027, + "grad_norm": 1.4292312860488892, + "learning_rate": 6.754187885877481e-06, + "loss": 1.5234, + "step": 11503 + }, + { + "epoch": 0.6289517925727485, + "grad_norm": 1.9008686542510986, + "learning_rate": 6.75245951987591e-06, + "loss": 1.3648, + "step": 11504 + }, + { + "epoch": 0.6290064650164701, + "grad_norm": 1.6823996305465698, + "learning_rate": 6.750731262317337e-06, + "loss": 1.4194, + "step": 11505 + }, + { + "epoch": 0.6290611374601917, + "grad_norm": 1.6349328756332397, + "learning_rate": 6.749003113259467e-06, + "loss": 1.3173, + "step": 11506 + }, + { + "epoch": 0.6291158099039131, + "grad_norm": 1.3939696550369263, + "learning_rate": 6.7472750727600155e-06, + "loss": 1.3407, + "step": 11507 + }, + { + "epoch": 0.6291704823476347, + "grad_norm": 1.397817611694336, + "learning_rate": 6.745547140876683e-06, + "loss": 1.6704, + "step": 11508 + }, + { + "epoch": 0.6292251547913563, + "grad_norm": 1.8879456520080566, + "learning_rate": 6.7438193176671666e-06, + "loss": 1.3822, + "step": 11509 + }, + { + "epoch": 0.6292798272350778, + "grad_norm": 1.2426265478134155, + "learning_rate": 6.742091603189165e-06, + "loss": 1.521, + "step": 11510 + }, + { + "epoch": 0.6293344996787994, + "grad_norm": 1.433889389038086, + "learning_rate": 6.740363997500366e-06, + "loss": 1.5443, + "step": 11511 + }, + { + "epoch": 0.629389172122521, + "grad_norm": 1.4350181818008423, + "learning_rate": 6.7386365006584665e-06, + "loss": 1.293, + "step": 11512 + }, + { + "epoch": 0.6294438445662425, + "grad_norm": 1.3099987506866455, + "learning_rate": 6.736909112721146e-06, + "loss": 1.6126, + "step": 11513 + }, + { + "epoch": 0.6294985170099641, + "grad_norm": 1.9066262245178223, + "learning_rate": 6.735181833746087e-06, + "loss": 1.5836, + "step": 11514 + }, + { + "epoch": 0.6295531894536857, + "grad_norm": 1.1116608381271362, + "learning_rate": 6.733454663790968e-06, + "loss": 1.535, + "step": 11515 + }, + { + "epoch": 0.6296078618974071, + "grad_norm": 1.5740950107574463, + "learning_rate": 6.731727602913465e-06, + "loss": 1.3538, + "step": 11516 + }, + { + "epoch": 0.6296625343411287, + "grad_norm": 1.4737958908081055, + "learning_rate": 6.730000651171241e-06, + "loss": 1.5067, + "step": 11517 + }, + { + "epoch": 0.6297172067848502, + "grad_norm": 2.350283145904541, + "learning_rate": 6.728273808621973e-06, + "loss": 1.5492, + "step": 11518 + }, + { + "epoch": 0.6297718792285718, + "grad_norm": 1.5980159044265747, + "learning_rate": 6.72654707532332e-06, + "loss": 1.396, + "step": 11519 + }, + { + "epoch": 0.6298265516722934, + "grad_norm": 1.4114511013031006, + "learning_rate": 6.72482045133294e-06, + "loss": 1.2873, + "step": 11520 + }, + { + "epoch": 0.6298812241160149, + "grad_norm": 1.5736632347106934, + "learning_rate": 6.7230939367084915e-06, + "loss": 1.6381, + "step": 11521 + }, + { + "epoch": 0.6299358965597365, + "grad_norm": 1.3933721780776978, + "learning_rate": 6.721367531507627e-06, + "loss": 1.537, + "step": 11522 + }, + { + "epoch": 0.6299905690034581, + "grad_norm": 1.3539806604385376, + "learning_rate": 6.7196412357879894e-06, + "loss": 1.3853, + "step": 11523 + }, + { + "epoch": 0.6300452414471795, + "grad_norm": 1.5725961923599243, + "learning_rate": 6.717915049607233e-06, + "loss": 1.4989, + "step": 11524 + }, + { + "epoch": 0.6300999138909011, + "grad_norm": 1.6752901077270508, + "learning_rate": 6.71618897302299e-06, + "loss": 1.5225, + "step": 11525 + }, + { + "epoch": 0.6301545863346227, + "grad_norm": 1.7806683778762817, + "learning_rate": 6.714463006092908e-06, + "loss": 1.3869, + "step": 11526 + }, + { + "epoch": 0.6302092587783442, + "grad_norm": 1.581506609916687, + "learning_rate": 6.7127371488746155e-06, + "loss": 1.5662, + "step": 11527 + }, + { + "epoch": 0.6302639312220658, + "grad_norm": 1.3954658508300781, + "learning_rate": 6.711011401425741e-06, + "loss": 1.5501, + "step": 11528 + }, + { + "epoch": 0.6303186036657874, + "grad_norm": 1.1049195528030396, + "learning_rate": 6.709285763803917e-06, + "loss": 1.7453, + "step": 11529 + }, + { + "epoch": 0.6303732761095089, + "grad_norm": 1.8291031122207642, + "learning_rate": 6.7075602360667616e-06, + "loss": 1.4361, + "step": 11530 + }, + { + "epoch": 0.6304279485532305, + "grad_norm": 2.3635952472686768, + "learning_rate": 6.705834818271893e-06, + "loss": 1.5137, + "step": 11531 + }, + { + "epoch": 0.630482620996952, + "grad_norm": 1.6655937433242798, + "learning_rate": 6.704109510476933e-06, + "loss": 1.5562, + "step": 11532 + }, + { + "epoch": 0.6305372934406736, + "grad_norm": 1.3483684062957764, + "learning_rate": 6.7023843127394905e-06, + "loss": 1.6089, + "step": 11533 + }, + { + "epoch": 0.6305919658843951, + "grad_norm": 1.500985860824585, + "learning_rate": 6.700659225117172e-06, + "loss": 1.6101, + "step": 11534 + }, + { + "epoch": 0.6306466383281166, + "grad_norm": 1.835961937904358, + "learning_rate": 6.698934247667587e-06, + "loss": 1.5596, + "step": 11535 + }, + { + "epoch": 0.6307013107718382, + "grad_norm": 1.8320657014846802, + "learning_rate": 6.697209380448333e-06, + "loss": 1.6403, + "step": 11536 + }, + { + "epoch": 0.6307559832155598, + "grad_norm": 1.2775404453277588, + "learning_rate": 6.695484623517004e-06, + "loss": 1.3756, + "step": 11537 + }, + { + "epoch": 0.6308106556592813, + "grad_norm": 1.5613425970077515, + "learning_rate": 6.693759976931201e-06, + "loss": 1.5826, + "step": 11538 + }, + { + "epoch": 0.6308653281030029, + "grad_norm": 1.5215767621994019, + "learning_rate": 6.692035440748512e-06, + "loss": 1.422, + "step": 11539 + }, + { + "epoch": 0.6309200005467245, + "grad_norm": 1.2402064800262451, + "learning_rate": 6.69031101502652e-06, + "loss": 1.6703, + "step": 11540 + }, + { + "epoch": 0.630974672990446, + "grad_norm": 1.5074859857559204, + "learning_rate": 6.68858669982281e-06, + "loss": 1.4427, + "step": 11541 + }, + { + "epoch": 0.6310293454341676, + "grad_norm": 1.6142116785049438, + "learning_rate": 6.686862495194958e-06, + "loss": 1.3095, + "step": 11542 + }, + { + "epoch": 0.6310840178778891, + "grad_norm": 1.4587757587432861, + "learning_rate": 6.685138401200546e-06, + "loss": 1.4962, + "step": 11543 + }, + { + "epoch": 0.6311386903216106, + "grad_norm": 1.3735133409500122, + "learning_rate": 6.68341441789714e-06, + "loss": 1.4773, + "step": 11544 + }, + { + "epoch": 0.6311933627653322, + "grad_norm": 1.7238502502441406, + "learning_rate": 6.681690545342305e-06, + "loss": 1.4529, + "step": 11545 + }, + { + "epoch": 0.6312480352090537, + "grad_norm": 1.5197951793670654, + "learning_rate": 6.679966783593616e-06, + "loss": 1.2981, + "step": 11546 + }, + { + "epoch": 0.6313027076527753, + "grad_norm": 1.6864111423492432, + "learning_rate": 6.678243132708625e-06, + "loss": 1.5892, + "step": 11547 + }, + { + "epoch": 0.6313573800964969, + "grad_norm": 1.8865197896957397, + "learning_rate": 6.676519592744888e-06, + "loss": 1.7707, + "step": 11548 + }, + { + "epoch": 0.6314120525402184, + "grad_norm": 1.8576083183288574, + "learning_rate": 6.6747961637599645e-06, + "loss": 1.2833, + "step": 11549 + }, + { + "epoch": 0.63146672498394, + "grad_norm": 1.5728137493133545, + "learning_rate": 6.673072845811398e-06, + "loss": 1.3227, + "step": 11550 + }, + { + "epoch": 0.6315213974276616, + "grad_norm": 1.5710755586624146, + "learning_rate": 6.671349638956732e-06, + "loss": 1.3118, + "step": 11551 + }, + { + "epoch": 0.631576069871383, + "grad_norm": 1.3205487728118896, + "learning_rate": 6.669626543253518e-06, + "loss": 1.4346, + "step": 11552 + }, + { + "epoch": 0.6316307423151046, + "grad_norm": 1.5156574249267578, + "learning_rate": 6.667903558759288e-06, + "loss": 1.2277, + "step": 11553 + }, + { + "epoch": 0.6316854147588262, + "grad_norm": 1.4843995571136475, + "learning_rate": 6.666180685531576e-06, + "loss": 1.1558, + "step": 11554 + }, + { + "epoch": 0.6317400872025477, + "grad_norm": 1.9583935737609863, + "learning_rate": 6.664457923627914e-06, + "loss": 1.449, + "step": 11555 + }, + { + "epoch": 0.6317947596462693, + "grad_norm": 1.6623420715332031, + "learning_rate": 6.662735273105827e-06, + "loss": 1.385, + "step": 11556 + }, + { + "epoch": 0.6318494320899909, + "grad_norm": 1.458417534828186, + "learning_rate": 6.661012734022843e-06, + "loss": 1.3357, + "step": 11557 + }, + { + "epoch": 0.6319041045337124, + "grad_norm": 1.4731401205062866, + "learning_rate": 6.659290306436479e-06, + "loss": 1.3381, + "step": 11558 + }, + { + "epoch": 0.631958776977434, + "grad_norm": 1.6989163160324097, + "learning_rate": 6.6575679904042504e-06, + "loss": 1.4877, + "step": 11559 + }, + { + "epoch": 0.6320134494211554, + "grad_norm": 1.7424527406692505, + "learning_rate": 6.65584578598367e-06, + "loss": 1.3114, + "step": 11560 + }, + { + "epoch": 0.632068121864877, + "grad_norm": 1.2865296602249146, + "learning_rate": 6.654123693232247e-06, + "loss": 1.5258, + "step": 11561 + }, + { + "epoch": 0.6321227943085986, + "grad_norm": 1.1858062744140625, + "learning_rate": 6.652401712207481e-06, + "loss": 1.3439, + "step": 11562 + }, + { + "epoch": 0.6321774667523201, + "grad_norm": 1.9379887580871582, + "learning_rate": 6.650679842966881e-06, + "loss": 1.4064, + "step": 11563 + }, + { + "epoch": 0.6322321391960417, + "grad_norm": 1.7952438592910767, + "learning_rate": 6.648958085567941e-06, + "loss": 1.4772, + "step": 11564 + }, + { + "epoch": 0.6322868116397633, + "grad_norm": 1.585354208946228, + "learning_rate": 6.64723644006815e-06, + "loss": 1.3852, + "step": 11565 + }, + { + "epoch": 0.6323414840834848, + "grad_norm": 1.538483738899231, + "learning_rate": 6.645514906525006e-06, + "loss": 1.4372, + "step": 11566 + }, + { + "epoch": 0.6323961565272064, + "grad_norm": 1.5449512004852295, + "learning_rate": 6.643793484995991e-06, + "loss": 1.4281, + "step": 11567 + }, + { + "epoch": 0.632450828970928, + "grad_norm": 1.5945043563842773, + "learning_rate": 6.642072175538583e-06, + "loss": 1.2856, + "step": 11568 + }, + { + "epoch": 0.6325055014146495, + "grad_norm": 1.6361916065216064, + "learning_rate": 6.640350978210269e-06, + "loss": 1.3207, + "step": 11569 + }, + { + "epoch": 0.632560173858371, + "grad_norm": 1.4169237613677979, + "learning_rate": 6.638629893068516e-06, + "loss": 1.379, + "step": 11570 + }, + { + "epoch": 0.6326148463020926, + "grad_norm": 1.5590037107467651, + "learning_rate": 6.6369089201708e-06, + "loss": 1.3045, + "step": 11571 + }, + { + "epoch": 0.6326695187458141, + "grad_norm": 1.373704433441162, + "learning_rate": 6.635188059574589e-06, + "loss": 1.4217, + "step": 11572 + }, + { + "epoch": 0.6327241911895357, + "grad_norm": 1.4424362182617188, + "learning_rate": 6.633467311337341e-06, + "loss": 1.3553, + "step": 11573 + }, + { + "epoch": 0.6327788636332572, + "grad_norm": 1.6737128496170044, + "learning_rate": 6.631746675516522e-06, + "loss": 1.3227, + "step": 11574 + }, + { + "epoch": 0.6328335360769788, + "grad_norm": 1.3701627254486084, + "learning_rate": 6.630026152169585e-06, + "loss": 1.4484, + "step": 11575 + }, + { + "epoch": 0.6328882085207004, + "grad_norm": 1.4308236837387085, + "learning_rate": 6.628305741353979e-06, + "loss": 1.6419, + "step": 11576 + }, + { + "epoch": 0.6329428809644219, + "grad_norm": 1.571258306503296, + "learning_rate": 6.62658544312716e-06, + "loss": 1.3791, + "step": 11577 + }, + { + "epoch": 0.6329975534081435, + "grad_norm": 1.2645540237426758, + "learning_rate": 6.6248652575465696e-06, + "loss": 1.5606, + "step": 11578 + }, + { + "epoch": 0.633052225851865, + "grad_norm": 1.8267362117767334, + "learning_rate": 6.623145184669646e-06, + "loss": 1.4993, + "step": 11579 + }, + { + "epoch": 0.6331068982955865, + "grad_norm": 1.5280346870422363, + "learning_rate": 6.62142522455383e-06, + "loss": 1.5553, + "step": 11580 + }, + { + "epoch": 0.6331615707393081, + "grad_norm": 1.4612579345703125, + "learning_rate": 6.619705377256556e-06, + "loss": 1.2713, + "step": 11581 + }, + { + "epoch": 0.6332162431830297, + "grad_norm": 1.4897491931915283, + "learning_rate": 6.617985642835245e-06, + "loss": 1.5298, + "step": 11582 + }, + { + "epoch": 0.6332709156267512, + "grad_norm": 1.406298279762268, + "learning_rate": 6.616266021347335e-06, + "loss": 1.6358, + "step": 11583 + }, + { + "epoch": 0.6333255880704728, + "grad_norm": 2.219057559967041, + "learning_rate": 6.614546512850237e-06, + "loss": 1.5946, + "step": 11584 + }, + { + "epoch": 0.6333802605141944, + "grad_norm": 1.8650463819503784, + "learning_rate": 6.612827117401381e-06, + "loss": 1.5238, + "step": 11585 + }, + { + "epoch": 0.6334349329579159, + "grad_norm": 1.524573564529419, + "learning_rate": 6.611107835058175e-06, + "loss": 1.5869, + "step": 11586 + }, + { + "epoch": 0.6334896054016375, + "grad_norm": 1.6413367986679077, + "learning_rate": 6.609388665878029e-06, + "loss": 1.2908, + "step": 11587 + }, + { + "epoch": 0.6335442778453589, + "grad_norm": 1.7831132411956787, + "learning_rate": 6.6076696099183544e-06, + "loss": 1.6116, + "step": 11588 + }, + { + "epoch": 0.6335989502890805, + "grad_norm": 1.2970010042190552, + "learning_rate": 6.6059506672365516e-06, + "loss": 1.2956, + "step": 11589 + }, + { + "epoch": 0.6336536227328021, + "grad_norm": 1.9638475179672241, + "learning_rate": 6.604231837890015e-06, + "loss": 1.6315, + "step": 11590 + }, + { + "epoch": 0.6337082951765236, + "grad_norm": 1.7075612545013428, + "learning_rate": 6.6025131219361505e-06, + "loss": 1.4428, + "step": 11591 + }, + { + "epoch": 0.6337629676202452, + "grad_norm": 1.8814479112625122, + "learning_rate": 6.600794519432346e-06, + "loss": 1.5086, + "step": 11592 + }, + { + "epoch": 0.6338176400639668, + "grad_norm": 1.480145812034607, + "learning_rate": 6.599076030435987e-06, + "loss": 1.3524, + "step": 11593 + }, + { + "epoch": 0.6338723125076883, + "grad_norm": 1.2061525583267212, + "learning_rate": 6.5973576550044604e-06, + "loss": 1.4733, + "step": 11594 + }, + { + "epoch": 0.6339269849514099, + "grad_norm": 1.688781499862671, + "learning_rate": 6.595639393195148e-06, + "loss": 1.2583, + "step": 11595 + }, + { + "epoch": 0.6339816573951315, + "grad_norm": 1.4929794073104858, + "learning_rate": 6.59392124506542e-06, + "loss": 1.5163, + "step": 11596 + }, + { + "epoch": 0.6340363298388529, + "grad_norm": 1.3000028133392334, + "learning_rate": 6.592203210672657e-06, + "loss": 1.3858, + "step": 11597 + }, + { + "epoch": 0.6340910022825745, + "grad_norm": 1.4264695644378662, + "learning_rate": 6.590485290074224e-06, + "loss": 1.7275, + "step": 11598 + }, + { + "epoch": 0.6341456747262961, + "grad_norm": 1.6587848663330078, + "learning_rate": 6.588767483327492e-06, + "loss": 1.4545, + "step": 11599 + }, + { + "epoch": 0.6342003471700176, + "grad_norm": 1.501935362815857, + "learning_rate": 6.5870497904898165e-06, + "loss": 1.4028, + "step": 11600 + }, + { + "epoch": 0.6342550196137392, + "grad_norm": 1.7768332958221436, + "learning_rate": 6.585332211618554e-06, + "loss": 1.3607, + "step": 11601 + }, + { + "epoch": 0.6343096920574607, + "grad_norm": 1.868147373199463, + "learning_rate": 6.583614746771065e-06, + "loss": 1.4712, + "step": 11602 + }, + { + "epoch": 0.6343643645011823, + "grad_norm": 1.7706880569458008, + "learning_rate": 6.5818973960046976e-06, + "loss": 1.5204, + "step": 11603 + }, + { + "epoch": 0.6344190369449039, + "grad_norm": 1.766364336013794, + "learning_rate": 6.580180159376792e-06, + "loss": 1.6599, + "step": 11604 + }, + { + "epoch": 0.6344737093886254, + "grad_norm": 1.5614172220230103, + "learning_rate": 6.5784630369447e-06, + "loss": 1.7265, + "step": 11605 + }, + { + "epoch": 0.6345283818323469, + "grad_norm": 1.712599515914917, + "learning_rate": 6.576746028765756e-06, + "loss": 1.4463, + "step": 11606 + }, + { + "epoch": 0.6345830542760685, + "grad_norm": 1.2350839376449585, + "learning_rate": 6.575029134897293e-06, + "loss": 1.5092, + "step": 11607 + }, + { + "epoch": 0.63463772671979, + "grad_norm": 1.5028717517852783, + "learning_rate": 6.573312355396646e-06, + "loss": 1.3994, + "step": 11608 + }, + { + "epoch": 0.6346923991635116, + "grad_norm": 1.5960298776626587, + "learning_rate": 6.571595690321141e-06, + "loss": 1.4614, + "step": 11609 + }, + { + "epoch": 0.6347470716072332, + "grad_norm": 1.9556784629821777, + "learning_rate": 6.569879139728097e-06, + "loss": 1.4246, + "step": 11610 + }, + { + "epoch": 0.6348017440509547, + "grad_norm": 1.2337396144866943, + "learning_rate": 6.5681627036748404e-06, + "loss": 1.488, + "step": 11611 + }, + { + "epoch": 0.6348564164946763, + "grad_norm": 1.2749134302139282, + "learning_rate": 6.566446382218683e-06, + "loss": 1.4811, + "step": 11612 + }, + { + "epoch": 0.6349110889383979, + "grad_norm": 1.3727465867996216, + "learning_rate": 6.56473017541694e-06, + "loss": 1.3947, + "step": 11613 + }, + { + "epoch": 0.6349657613821194, + "grad_norm": 1.5956339836120605, + "learning_rate": 6.5630140833269175e-06, + "loss": 1.3497, + "step": 11614 + }, + { + "epoch": 0.635020433825841, + "grad_norm": 1.4622186422348022, + "learning_rate": 6.5612981060059156e-06, + "loss": 1.3291, + "step": 11615 + }, + { + "epoch": 0.6350751062695624, + "grad_norm": 1.2747242450714111, + "learning_rate": 6.559582243511244e-06, + "loss": 1.4402, + "step": 11616 + }, + { + "epoch": 0.635129778713284, + "grad_norm": 1.6093382835388184, + "learning_rate": 6.557866495900194e-06, + "loss": 1.6596, + "step": 11617 + }, + { + "epoch": 0.6351844511570056, + "grad_norm": 1.8338911533355713, + "learning_rate": 6.556150863230055e-06, + "loss": 1.6754, + "step": 11618 + }, + { + "epoch": 0.6352391236007271, + "grad_norm": 1.2034639120101929, + "learning_rate": 6.5544353455581245e-06, + "loss": 1.3939, + "step": 11619 + }, + { + "epoch": 0.6352937960444487, + "grad_norm": 1.4128652811050415, + "learning_rate": 6.552719942941682e-06, + "loss": 1.3955, + "step": 11620 + }, + { + "epoch": 0.6353484684881703, + "grad_norm": 1.7695237398147583, + "learning_rate": 6.551004655438007e-06, + "loss": 1.4635, + "step": 11621 + }, + { + "epoch": 0.6354031409318918, + "grad_norm": 1.502527117729187, + "learning_rate": 6.549289483104382e-06, + "loss": 1.6261, + "step": 11622 + }, + { + "epoch": 0.6354578133756134, + "grad_norm": 1.420615553855896, + "learning_rate": 6.54757442599808e-06, + "loss": 1.6811, + "step": 11623 + }, + { + "epoch": 0.635512485819335, + "grad_norm": 1.712876319885254, + "learning_rate": 6.545859484176364e-06, + "loss": 1.3016, + "step": 11624 + }, + { + "epoch": 0.6355671582630564, + "grad_norm": 1.6029289960861206, + "learning_rate": 6.54414465769651e-06, + "loss": 1.5261, + "step": 11625 + }, + { + "epoch": 0.635621830706778, + "grad_norm": 1.8397408723831177, + "learning_rate": 6.542429946615774e-06, + "loss": 1.359, + "step": 11626 + }, + { + "epoch": 0.6356765031504996, + "grad_norm": 1.7891172170639038, + "learning_rate": 6.540715350991416e-06, + "loss": 1.5379, + "step": 11627 + }, + { + "epoch": 0.6357311755942211, + "grad_norm": 1.4152448177337646, + "learning_rate": 6.539000870880692e-06, + "loss": 1.4643, + "step": 11628 + }, + { + "epoch": 0.6357858480379427, + "grad_norm": 1.6745637655258179, + "learning_rate": 6.537286506340844e-06, + "loss": 1.3487, + "step": 11629 + }, + { + "epoch": 0.6358405204816643, + "grad_norm": 1.861994981765747, + "learning_rate": 6.53557225742913e-06, + "loss": 1.2593, + "step": 11630 + }, + { + "epoch": 0.6358951929253858, + "grad_norm": 1.535774827003479, + "learning_rate": 6.5338581242027885e-06, + "loss": 1.6414, + "step": 11631 + }, + { + "epoch": 0.6359498653691074, + "grad_norm": 1.235445499420166, + "learning_rate": 6.532144106719056e-06, + "loss": 1.5492, + "step": 11632 + }, + { + "epoch": 0.6360045378128288, + "grad_norm": 1.3220915794372559, + "learning_rate": 6.53043020503517e-06, + "loss": 1.3994, + "step": 11633 + }, + { + "epoch": 0.6360592102565504, + "grad_norm": 1.509080410003662, + "learning_rate": 6.528716419208362e-06, + "loss": 1.6443, + "step": 11634 + }, + { + "epoch": 0.636113882700272, + "grad_norm": 1.4578748941421509, + "learning_rate": 6.527002749295854e-06, + "loss": 1.3913, + "step": 11635 + }, + { + "epoch": 0.6361685551439935, + "grad_norm": 1.806537389755249, + "learning_rate": 6.525289195354878e-06, + "loss": 1.2347, + "step": 11636 + }, + { + "epoch": 0.6362232275877151, + "grad_norm": 1.7301199436187744, + "learning_rate": 6.52357575744265e-06, + "loss": 1.2123, + "step": 11637 + }, + { + "epoch": 0.6362779000314367, + "grad_norm": 1.3067240715026855, + "learning_rate": 6.521862435616382e-06, + "loss": 1.5647, + "step": 11638 + }, + { + "epoch": 0.6363325724751582, + "grad_norm": 2.359527826309204, + "learning_rate": 6.520149229933292e-06, + "loss": 1.3507, + "step": 11639 + }, + { + "epoch": 0.6363872449188798, + "grad_norm": 1.4373149871826172, + "learning_rate": 6.5184361404505795e-06, + "loss": 1.6718, + "step": 11640 + }, + { + "epoch": 0.6364419173626014, + "grad_norm": 1.3824639320373535, + "learning_rate": 6.5167231672254595e-06, + "loss": 1.5751, + "step": 11641 + }, + { + "epoch": 0.6364965898063228, + "grad_norm": 1.5826573371887207, + "learning_rate": 6.5150103103151265e-06, + "loss": 1.306, + "step": 11642 + }, + { + "epoch": 0.6365512622500444, + "grad_norm": 1.6405541896820068, + "learning_rate": 6.513297569776773e-06, + "loss": 1.3699, + "step": 11643 + }, + { + "epoch": 0.636605934693766, + "grad_norm": 1.65358304977417, + "learning_rate": 6.511584945667599e-06, + "loss": 1.5337, + "step": 11644 + }, + { + "epoch": 0.6366606071374875, + "grad_norm": 1.7470476627349854, + "learning_rate": 6.509872438044789e-06, + "loss": 1.1059, + "step": 11645 + }, + { + "epoch": 0.6367152795812091, + "grad_norm": 1.7422555685043335, + "learning_rate": 6.508160046965527e-06, + "loss": 1.4878, + "step": 11646 + }, + { + "epoch": 0.6367699520249306, + "grad_norm": 1.892081379890442, + "learning_rate": 6.506447772486997e-06, + "loss": 1.4226, + "step": 11647 + }, + { + "epoch": 0.6368246244686522, + "grad_norm": 1.587301254272461, + "learning_rate": 6.504735614666373e-06, + "loss": 1.3202, + "step": 11648 + }, + { + "epoch": 0.6368792969123738, + "grad_norm": 1.6137043237686157, + "learning_rate": 6.503023573560825e-06, + "loss": 1.3037, + "step": 11649 + }, + { + "epoch": 0.6369339693560953, + "grad_norm": 1.3372313976287842, + "learning_rate": 6.501311649227531e-06, + "loss": 1.5918, + "step": 11650 + }, + { + "epoch": 0.6369886417998168, + "grad_norm": 1.346779704093933, + "learning_rate": 6.499599841723649e-06, + "loss": 1.6113, + "step": 11651 + }, + { + "epoch": 0.6370433142435384, + "grad_norm": 1.7065263986587524, + "learning_rate": 6.4978881511063416e-06, + "loss": 1.5908, + "step": 11652 + }, + { + "epoch": 0.6370979866872599, + "grad_norm": 1.5186516046524048, + "learning_rate": 6.4961765774327676e-06, + "loss": 1.3722, + "step": 11653 + }, + { + "epoch": 0.6371526591309815, + "grad_norm": 1.5671864748001099, + "learning_rate": 6.4944651207600765e-06, + "loss": 1.4615, + "step": 11654 + }, + { + "epoch": 0.6372073315747031, + "grad_norm": 1.3823975324630737, + "learning_rate": 6.492753781145425e-06, + "loss": 1.4717, + "step": 11655 + }, + { + "epoch": 0.6372620040184246, + "grad_norm": 2.049268960952759, + "learning_rate": 6.491042558645955e-06, + "loss": 1.4618, + "step": 11656 + }, + { + "epoch": 0.6373166764621462, + "grad_norm": 1.4564814567565918, + "learning_rate": 6.489331453318806e-06, + "loss": 1.1907, + "step": 11657 + }, + { + "epoch": 0.6373713489058678, + "grad_norm": 1.5981204509735107, + "learning_rate": 6.487620465221118e-06, + "loss": 1.4519, + "step": 11658 + }, + { + "epoch": 0.6374260213495893, + "grad_norm": 1.9943963289260864, + "learning_rate": 6.485909594410027e-06, + "loss": 1.3411, + "step": 11659 + }, + { + "epoch": 0.6374806937933108, + "grad_norm": 1.2909355163574219, + "learning_rate": 6.484198840942656e-06, + "loss": 1.4983, + "step": 11660 + }, + { + "epoch": 0.6375353662370323, + "grad_norm": 1.689710259437561, + "learning_rate": 6.4824882048761406e-06, + "loss": 1.5523, + "step": 11661 + }, + { + "epoch": 0.6375900386807539, + "grad_norm": 1.3779510259628296, + "learning_rate": 6.480777686267597e-06, + "loss": 1.4902, + "step": 11662 + }, + { + "epoch": 0.6376447111244755, + "grad_norm": 1.3324673175811768, + "learning_rate": 6.479067285174141e-06, + "loss": 1.443, + "step": 11663 + }, + { + "epoch": 0.637699383568197, + "grad_norm": 1.8533728122711182, + "learning_rate": 6.477357001652893e-06, + "loss": 1.3319, + "step": 11664 + }, + { + "epoch": 0.6377540560119186, + "grad_norm": 1.8000129461288452, + "learning_rate": 6.475646835760963e-06, + "loss": 1.2844, + "step": 11665 + }, + { + "epoch": 0.6378087284556402, + "grad_norm": 1.6129993200302124, + "learning_rate": 6.4739367875554526e-06, + "loss": 1.413, + "step": 11666 + }, + { + "epoch": 0.6378634008993617, + "grad_norm": 1.9394419193267822, + "learning_rate": 6.472226857093468e-06, + "loss": 1.4289, + "step": 11667 + }, + { + "epoch": 0.6379180733430833, + "grad_norm": 1.6558988094329834, + "learning_rate": 6.470517044432104e-06, + "loss": 1.4748, + "step": 11668 + }, + { + "epoch": 0.6379727457868049, + "grad_norm": 1.6712301969528198, + "learning_rate": 6.468807349628462e-06, + "loss": 1.511, + "step": 11669 + }, + { + "epoch": 0.6380274182305263, + "grad_norm": 1.4324181079864502, + "learning_rate": 6.467097772739628e-06, + "loss": 1.4966, + "step": 11670 + }, + { + "epoch": 0.6380820906742479, + "grad_norm": 1.516103744506836, + "learning_rate": 6.4653883138226895e-06, + "loss": 1.8595, + "step": 11671 + }, + { + "epoch": 0.6381367631179695, + "grad_norm": 1.4401978254318237, + "learning_rate": 6.46367897293473e-06, + "loss": 1.569, + "step": 11672 + }, + { + "epoch": 0.638191435561691, + "grad_norm": 1.659578800201416, + "learning_rate": 6.461969750132827e-06, + "loss": 1.5305, + "step": 11673 + }, + { + "epoch": 0.6382461080054126, + "grad_norm": 1.6764543056488037, + "learning_rate": 6.460260645474054e-06, + "loss": 1.5637, + "step": 11674 + }, + { + "epoch": 0.6383007804491341, + "grad_norm": 1.6352280378341675, + "learning_rate": 6.458551659015486e-06, + "loss": 1.424, + "step": 11675 + }, + { + "epoch": 0.6383554528928557, + "grad_norm": 1.2879114151000977, + "learning_rate": 6.45684279081419e-06, + "loss": 1.2122, + "step": 11676 + }, + { + "epoch": 0.6384101253365773, + "grad_norm": 1.5016040802001953, + "learning_rate": 6.455134040927227e-06, + "loss": 1.474, + "step": 11677 + }, + { + "epoch": 0.6384647977802987, + "grad_norm": 1.729796051979065, + "learning_rate": 6.4534254094116555e-06, + "loss": 1.4121, + "step": 11678 + }, + { + "epoch": 0.6385194702240203, + "grad_norm": 1.2385809421539307, + "learning_rate": 6.451716896324534e-06, + "loss": 1.4494, + "step": 11679 + }, + { + "epoch": 0.6385741426677419, + "grad_norm": 1.3316991329193115, + "learning_rate": 6.4500085017229065e-06, + "loss": 1.6033, + "step": 11680 + }, + { + "epoch": 0.6386288151114634, + "grad_norm": 1.065735936164856, + "learning_rate": 6.448300225663831e-06, + "loss": 1.6153, + "step": 11681 + }, + { + "epoch": 0.638683487555185, + "grad_norm": 1.6832703351974487, + "learning_rate": 6.446592068204341e-06, + "loss": 1.5405, + "step": 11682 + }, + { + "epoch": 0.6387381599989066, + "grad_norm": 1.4257235527038574, + "learning_rate": 6.444884029401483e-06, + "loss": 1.4323, + "step": 11683 + }, + { + "epoch": 0.6387928324426281, + "grad_norm": 1.3732722997665405, + "learning_rate": 6.44317610931229e-06, + "loss": 1.2878, + "step": 11684 + }, + { + "epoch": 0.6388475048863497, + "grad_norm": 1.9671608209609985, + "learning_rate": 6.44146830799379e-06, + "loss": 1.5749, + "step": 11685 + }, + { + "epoch": 0.6389021773300713, + "grad_norm": 1.0605047941207886, + "learning_rate": 6.439760625503018e-06, + "loss": 1.3523, + "step": 11686 + }, + { + "epoch": 0.6389568497737927, + "grad_norm": 1.6668556928634644, + "learning_rate": 6.438053061896992e-06, + "loss": 1.6793, + "step": 11687 + }, + { + "epoch": 0.6390115222175143, + "grad_norm": 1.4748942852020264, + "learning_rate": 6.436345617232728e-06, + "loss": 1.6144, + "step": 11688 + }, + { + "epoch": 0.6390661946612358, + "grad_norm": 1.4822895526885986, + "learning_rate": 6.43463829156725e-06, + "loss": 1.5033, + "step": 11689 + }, + { + "epoch": 0.6391208671049574, + "grad_norm": 1.909277081489563, + "learning_rate": 6.432931084957567e-06, + "loss": 1.2989, + "step": 11690 + }, + { + "epoch": 0.639175539548679, + "grad_norm": 1.4599847793579102, + "learning_rate": 6.431223997460683e-06, + "loss": 1.55, + "step": 11691 + }, + { + "epoch": 0.6392302119924005, + "grad_norm": 1.8982784748077393, + "learning_rate": 6.429517029133605e-06, + "loss": 1.3984, + "step": 11692 + }, + { + "epoch": 0.6392848844361221, + "grad_norm": 1.3925707340240479, + "learning_rate": 6.427810180033334e-06, + "loss": 1.3967, + "step": 11693 + }, + { + "epoch": 0.6393395568798437, + "grad_norm": 1.3364949226379395, + "learning_rate": 6.426103450216857e-06, + "loss": 1.5179, + "step": 11694 + }, + { + "epoch": 0.6393942293235652, + "grad_norm": 2.153748035430908, + "learning_rate": 6.424396839741178e-06, + "loss": 1.1715, + "step": 11695 + }, + { + "epoch": 0.6394489017672867, + "grad_norm": 1.5977391004562378, + "learning_rate": 6.422690348663276e-06, + "loss": 1.3944, + "step": 11696 + }, + { + "epoch": 0.6395035742110083, + "grad_norm": 1.306795597076416, + "learning_rate": 6.420983977040141e-06, + "loss": 1.4092, + "step": 11697 + }, + { + "epoch": 0.6395582466547298, + "grad_norm": 1.5556102991104126, + "learning_rate": 6.419277724928748e-06, + "loss": 1.3036, + "step": 11698 + }, + { + "epoch": 0.6396129190984514, + "grad_norm": 1.5749375820159912, + "learning_rate": 6.417571592386071e-06, + "loss": 1.4322, + "step": 11699 + }, + { + "epoch": 0.639667591542173, + "grad_norm": 1.2411657571792603, + "learning_rate": 6.415865579469089e-06, + "loss": 1.3288, + "step": 11700 + }, + { + "epoch": 0.6397222639858945, + "grad_norm": 1.3587478399276733, + "learning_rate": 6.4141596862347645e-06, + "loss": 1.5614, + "step": 11701 + }, + { + "epoch": 0.6397769364296161, + "grad_norm": 1.469552755355835, + "learning_rate": 6.41245391274006e-06, + "loss": 1.4129, + "step": 11702 + }, + { + "epoch": 0.6398316088733376, + "grad_norm": 1.539306879043579, + "learning_rate": 6.410748259041941e-06, + "loss": 1.3837, + "step": 11703 + }, + { + "epoch": 0.6398862813170592, + "grad_norm": 1.4513635635375977, + "learning_rate": 6.409042725197361e-06, + "loss": 1.4675, + "step": 11704 + }, + { + "epoch": 0.6399409537607808, + "grad_norm": 1.6308094263076782, + "learning_rate": 6.407337311263269e-06, + "loss": 1.4494, + "step": 11705 + }, + { + "epoch": 0.6399956262045022, + "grad_norm": 1.2838035821914673, + "learning_rate": 6.4056320172966145e-06, + "loss": 1.4644, + "step": 11706 + }, + { + "epoch": 0.6400502986482238, + "grad_norm": 1.4127781391143799, + "learning_rate": 6.4039268433543425e-06, + "loss": 1.3885, + "step": 11707 + }, + { + "epoch": 0.6401049710919454, + "grad_norm": 1.6407103538513184, + "learning_rate": 6.402221789493388e-06, + "loss": 1.2084, + "step": 11708 + }, + { + "epoch": 0.6401596435356669, + "grad_norm": 1.6855931282043457, + "learning_rate": 6.400516855770694e-06, + "loss": 1.4598, + "step": 11709 + }, + { + "epoch": 0.6402143159793885, + "grad_norm": 1.2992444038391113, + "learning_rate": 6.398812042243187e-06, + "loss": 1.4822, + "step": 11710 + }, + { + "epoch": 0.6402689884231101, + "grad_norm": 1.435196042060852, + "learning_rate": 6.3971073489678e-06, + "loss": 1.3833, + "step": 11711 + }, + { + "epoch": 0.6403236608668316, + "grad_norm": 1.465233564376831, + "learning_rate": 6.395402776001449e-06, + "loss": 1.4378, + "step": 11712 + }, + { + "epoch": 0.6403783333105532, + "grad_norm": 1.463397741317749, + "learning_rate": 6.393698323401056e-06, + "loss": 1.4636, + "step": 11713 + }, + { + "epoch": 0.6404330057542748, + "grad_norm": 1.2229468822479248, + "learning_rate": 6.391993991223544e-06, + "loss": 1.4036, + "step": 11714 + }, + { + "epoch": 0.6404876781979962, + "grad_norm": 1.857665777206421, + "learning_rate": 6.390289779525818e-06, + "loss": 1.3444, + "step": 11715 + }, + { + "epoch": 0.6405423506417178, + "grad_norm": 1.3939261436462402, + "learning_rate": 6.388585688364783e-06, + "loss": 1.3624, + "step": 11716 + }, + { + "epoch": 0.6405970230854393, + "grad_norm": 1.5196033716201782, + "learning_rate": 6.3868817177973505e-06, + "loss": 1.3503, + "step": 11717 + }, + { + "epoch": 0.6406516955291609, + "grad_norm": 1.7087492942810059, + "learning_rate": 6.385177867880414e-06, + "loss": 1.4721, + "step": 11718 + }, + { + "epoch": 0.6407063679728825, + "grad_norm": 1.3884925842285156, + "learning_rate": 6.383474138670869e-06, + "loss": 1.756, + "step": 11719 + }, + { + "epoch": 0.640761040416604, + "grad_norm": 1.7590818405151367, + "learning_rate": 6.381770530225611e-06, + "loss": 1.3839, + "step": 11720 + }, + { + "epoch": 0.6408157128603256, + "grad_norm": 1.3982793092727661, + "learning_rate": 6.380067042601526e-06, + "loss": 1.4438, + "step": 11721 + }, + { + "epoch": 0.6408703853040472, + "grad_norm": 1.3438259363174438, + "learning_rate": 6.378363675855494e-06, + "loss": 1.4067, + "step": 11722 + }, + { + "epoch": 0.6409250577477686, + "grad_norm": 1.8473848104476929, + "learning_rate": 6.3766604300444e-06, + "loss": 1.5734, + "step": 11723 + }, + { + "epoch": 0.6409797301914902, + "grad_norm": 1.7302474975585938, + "learning_rate": 6.3749573052251155e-06, + "loss": 1.7015, + "step": 11724 + }, + { + "epoch": 0.6410344026352118, + "grad_norm": 1.564961552619934, + "learning_rate": 6.373254301454514e-06, + "loss": 1.3114, + "step": 11725 + }, + { + "epoch": 0.6410890750789333, + "grad_norm": 2.2492830753326416, + "learning_rate": 6.371551418789463e-06, + "loss": 1.3054, + "step": 11726 + }, + { + "epoch": 0.6411437475226549, + "grad_norm": 1.7157025337219238, + "learning_rate": 6.36984865728682e-06, + "loss": 1.444, + "step": 11727 + }, + { + "epoch": 0.6411984199663765, + "grad_norm": 1.2075313329696655, + "learning_rate": 6.368146017003454e-06, + "loss": 1.4082, + "step": 11728 + }, + { + "epoch": 0.641253092410098, + "grad_norm": 1.6063019037246704, + "learning_rate": 6.366443497996213e-06, + "loss": 1.8101, + "step": 11729 + }, + { + "epoch": 0.6413077648538196, + "grad_norm": 1.4695426225662231, + "learning_rate": 6.3647411003219486e-06, + "loss": 1.3672, + "step": 11730 + }, + { + "epoch": 0.641362437297541, + "grad_norm": 1.422749638557434, + "learning_rate": 6.363038824037511e-06, + "loss": 1.319, + "step": 11731 + }, + { + "epoch": 0.6414171097412626, + "grad_norm": 1.383553385734558, + "learning_rate": 6.3613366691997426e-06, + "loss": 1.7075, + "step": 11732 + }, + { + "epoch": 0.6414717821849842, + "grad_norm": 1.4432578086853027, + "learning_rate": 6.359634635865476e-06, + "loss": 1.4585, + "step": 11733 + }, + { + "epoch": 0.6415264546287057, + "grad_norm": 1.971980094909668, + "learning_rate": 6.357932724091555e-06, + "loss": 1.352, + "step": 11734 + }, + { + "epoch": 0.6415811270724273, + "grad_norm": 1.4903154373168945, + "learning_rate": 6.356230933934808e-06, + "loss": 1.5317, + "step": 11735 + }, + { + "epoch": 0.6416357995161489, + "grad_norm": 1.2522172927856445, + "learning_rate": 6.354529265452059e-06, + "loss": 1.406, + "step": 11736 + }, + { + "epoch": 0.6416904719598704, + "grad_norm": 1.527775526046753, + "learning_rate": 6.3528277187001315e-06, + "loss": 1.4531, + "step": 11737 + }, + { + "epoch": 0.641745144403592, + "grad_norm": 1.3518126010894775, + "learning_rate": 6.351126293735843e-06, + "loss": 1.4503, + "step": 11738 + }, + { + "epoch": 0.6417998168473136, + "grad_norm": 1.5120717287063599, + "learning_rate": 6.349424990616013e-06, + "loss": 1.4065, + "step": 11739 + }, + { + "epoch": 0.6418544892910351, + "grad_norm": 1.2281001806259155, + "learning_rate": 6.34772380939745e-06, + "loss": 1.3128, + "step": 11740 + }, + { + "epoch": 0.6419091617347567, + "grad_norm": 1.389732003211975, + "learning_rate": 6.346022750136956e-06, + "loss": 1.6413, + "step": 11741 + }, + { + "epoch": 0.6419638341784782, + "grad_norm": 1.2629338502883911, + "learning_rate": 6.34432181289134e-06, + "loss": 1.4252, + "step": 11742 + }, + { + "epoch": 0.6420185066221997, + "grad_norm": 1.3582525253295898, + "learning_rate": 6.342620997717397e-06, + "loss": 1.4579, + "step": 11743 + }, + { + "epoch": 0.6420731790659213, + "grad_norm": 1.1278085708618164, + "learning_rate": 6.340920304671916e-06, + "loss": 1.3576, + "step": 11744 + }, + { + "epoch": 0.6421278515096428, + "grad_norm": 1.3632965087890625, + "learning_rate": 6.339219733811697e-06, + "loss": 1.5145, + "step": 11745 + }, + { + "epoch": 0.6421825239533644, + "grad_norm": 1.4377549886703491, + "learning_rate": 6.337519285193521e-06, + "loss": 1.5045, + "step": 11746 + }, + { + "epoch": 0.642237196397086, + "grad_norm": 1.6141202449798584, + "learning_rate": 6.335818958874167e-06, + "loss": 1.4957, + "step": 11747 + }, + { + "epoch": 0.6422918688408075, + "grad_norm": 1.3351246118545532, + "learning_rate": 6.334118754910419e-06, + "loss": 1.3881, + "step": 11748 + }, + { + "epoch": 0.6423465412845291, + "grad_norm": 1.402549386024475, + "learning_rate": 6.332418673359049e-06, + "loss": 1.6049, + "step": 11749 + }, + { + "epoch": 0.6424012137282507, + "grad_norm": 1.305417537689209, + "learning_rate": 6.330718714276823e-06, + "loss": 1.4789, + "step": 11750 + }, + { + "epoch": 0.6424558861719721, + "grad_norm": 1.7569563388824463, + "learning_rate": 6.329018877720512e-06, + "loss": 1.3786, + "step": 11751 + }, + { + "epoch": 0.6425105586156937, + "grad_norm": 1.477946162223816, + "learning_rate": 6.327319163746871e-06, + "loss": 1.6171, + "step": 11752 + }, + { + "epoch": 0.6425652310594153, + "grad_norm": 1.8580653667449951, + "learning_rate": 6.325619572412665e-06, + "loss": 1.3963, + "step": 11753 + }, + { + "epoch": 0.6426199035031368, + "grad_norm": 1.3898319005966187, + "learning_rate": 6.323920103774644e-06, + "loss": 1.2556, + "step": 11754 + }, + { + "epoch": 0.6426745759468584, + "grad_norm": 1.652616024017334, + "learning_rate": 6.322220757889555e-06, + "loss": 1.432, + "step": 11755 + }, + { + "epoch": 0.64272924839058, + "grad_norm": 1.3930474519729614, + "learning_rate": 6.320521534814147e-06, + "loss": 1.4438, + "step": 11756 + }, + { + "epoch": 0.6427839208343015, + "grad_norm": 1.3443864583969116, + "learning_rate": 6.318822434605159e-06, + "loss": 1.43, + "step": 11757 + }, + { + "epoch": 0.6428385932780231, + "grad_norm": 1.5444141626358032, + "learning_rate": 6.317123457319323e-06, + "loss": 1.4283, + "step": 11758 + }, + { + "epoch": 0.6428932657217445, + "grad_norm": 1.4176337718963623, + "learning_rate": 6.315424603013382e-06, + "loss": 1.439, + "step": 11759 + }, + { + "epoch": 0.6429479381654661, + "grad_norm": 1.3020915985107422, + "learning_rate": 6.3137258717440606e-06, + "loss": 1.4114, + "step": 11760 + }, + { + "epoch": 0.6430026106091877, + "grad_norm": 1.2441754341125488, + "learning_rate": 6.312027263568079e-06, + "loss": 1.3627, + "step": 11761 + }, + { + "epoch": 0.6430572830529092, + "grad_norm": 1.3598090410232544, + "learning_rate": 6.310328778542163e-06, + "loss": 1.5065, + "step": 11762 + }, + { + "epoch": 0.6431119554966308, + "grad_norm": 1.302418828010559, + "learning_rate": 6.3086304167230284e-06, + "loss": 1.4379, + "step": 11763 + }, + { + "epoch": 0.6431666279403524, + "grad_norm": 1.7030537128448486, + "learning_rate": 6.306932178167382e-06, + "loss": 1.4478, + "step": 11764 + }, + { + "epoch": 0.6432213003840739, + "grad_norm": 1.851699948310852, + "learning_rate": 6.30523406293194e-06, + "loss": 1.4124, + "step": 11765 + }, + { + "epoch": 0.6432759728277955, + "grad_norm": 1.4806278944015503, + "learning_rate": 6.303536071073397e-06, + "loss": 1.598, + "step": 11766 + }, + { + "epoch": 0.6433306452715171, + "grad_norm": 1.5311168432235718, + "learning_rate": 6.3018382026484645e-06, + "loss": 1.5105, + "step": 11767 + }, + { + "epoch": 0.6433853177152385, + "grad_norm": 2.2747480869293213, + "learning_rate": 6.3001404577138325e-06, + "loss": 1.2782, + "step": 11768 + }, + { + "epoch": 0.6434399901589601, + "grad_norm": 1.9873276948928833, + "learning_rate": 6.29844283632619e-06, + "loss": 1.2039, + "step": 11769 + }, + { + "epoch": 0.6434946626026817, + "grad_norm": 1.7135177850723267, + "learning_rate": 6.296745338542229e-06, + "loss": 1.4132, + "step": 11770 + }, + { + "epoch": 0.6435493350464032, + "grad_norm": 1.7049334049224854, + "learning_rate": 6.295047964418632e-06, + "loss": 1.5834, + "step": 11771 + }, + { + "epoch": 0.6436040074901248, + "grad_norm": 1.5139455795288086, + "learning_rate": 6.293350714012073e-06, + "loss": 1.4611, + "step": 11772 + }, + { + "epoch": 0.6436586799338463, + "grad_norm": 1.424261450767517, + "learning_rate": 6.291653587379236e-06, + "loss": 1.5059, + "step": 11773 + }, + { + "epoch": 0.6437133523775679, + "grad_norm": 1.3184605836868286, + "learning_rate": 6.289956584576786e-06, + "loss": 1.7889, + "step": 11774 + }, + { + "epoch": 0.6437680248212895, + "grad_norm": 1.8268191814422607, + "learning_rate": 6.288259705661391e-06, + "loss": 1.4943, + "step": 11775 + }, + { + "epoch": 0.643822697265011, + "grad_norm": 1.4563827514648438, + "learning_rate": 6.286562950689717e-06, + "loss": 1.3461, + "step": 11776 + }, + { + "epoch": 0.6438773697087325, + "grad_norm": 1.3990938663482666, + "learning_rate": 6.284866319718418e-06, + "loss": 1.4488, + "step": 11777 + }, + { + "epoch": 0.6439320421524541, + "grad_norm": 1.5065290927886963, + "learning_rate": 6.283169812804146e-06, + "loss": 1.2175, + "step": 11778 + }, + { + "epoch": 0.6439867145961756, + "grad_norm": 1.7609573602676392, + "learning_rate": 6.281473430003562e-06, + "loss": 1.4179, + "step": 11779 + }, + { + "epoch": 0.6440413870398972, + "grad_norm": 1.2380675077438354, + "learning_rate": 6.2797771713733025e-06, + "loss": 1.2375, + "step": 11780 + }, + { + "epoch": 0.6440960594836188, + "grad_norm": 1.6767816543579102, + "learning_rate": 6.2780810369700165e-06, + "loss": 1.5481, + "step": 11781 + }, + { + "epoch": 0.6441507319273403, + "grad_norm": 1.5871422290802002, + "learning_rate": 6.276385026850337e-06, + "loss": 1.3214, + "step": 11782 + }, + { + "epoch": 0.6442054043710619, + "grad_norm": 1.7129288911819458, + "learning_rate": 6.2746891410708955e-06, + "loss": 1.2321, + "step": 11783 + }, + { + "epoch": 0.6442600768147835, + "grad_norm": 1.2028504610061646, + "learning_rate": 6.272993379688329e-06, + "loss": 1.4818, + "step": 11784 + }, + { + "epoch": 0.644314749258505, + "grad_norm": 1.2465722560882568, + "learning_rate": 6.271297742759259e-06, + "loss": 1.5182, + "step": 11785 + }, + { + "epoch": 0.6443694217022266, + "grad_norm": 1.8101578950881958, + "learning_rate": 6.269602230340305e-06, + "loss": 1.3443, + "step": 11786 + }, + { + "epoch": 0.644424094145948, + "grad_norm": 1.9692732095718384, + "learning_rate": 6.267906842488088e-06, + "loss": 1.3155, + "step": 11787 + }, + { + "epoch": 0.6444787665896696, + "grad_norm": 1.7964674234390259, + "learning_rate": 6.26621157925922e-06, + "loss": 1.3313, + "step": 11788 + }, + { + "epoch": 0.6445334390333912, + "grad_norm": 1.5752254724502563, + "learning_rate": 6.264516440710308e-06, + "loss": 1.7431, + "step": 11789 + }, + { + "epoch": 0.6445881114771127, + "grad_norm": 1.2968848943710327, + "learning_rate": 6.262821426897958e-06, + "loss": 1.4883, + "step": 11790 + }, + { + "epoch": 0.6446427839208343, + "grad_norm": 1.7111304998397827, + "learning_rate": 6.261126537878771e-06, + "loss": 1.5827, + "step": 11791 + }, + { + "epoch": 0.6446974563645559, + "grad_norm": 1.5907925367355347, + "learning_rate": 6.259431773709338e-06, + "loss": 1.4424, + "step": 11792 + }, + { + "epoch": 0.6447521288082774, + "grad_norm": 1.8831161260604858, + "learning_rate": 6.25773713444626e-06, + "loss": 1.4529, + "step": 11793 + }, + { + "epoch": 0.644806801251999, + "grad_norm": 1.3170510530471802, + "learning_rate": 6.256042620146119e-06, + "loss": 1.5024, + "step": 11794 + }, + { + "epoch": 0.6448614736957206, + "grad_norm": 1.8453760147094727, + "learning_rate": 6.254348230865501e-06, + "loss": 1.308, + "step": 11795 + }, + { + "epoch": 0.644916146139442, + "grad_norm": 1.5409632921218872, + "learning_rate": 6.252653966660987e-06, + "loss": 1.1712, + "step": 11796 + }, + { + "epoch": 0.6449708185831636, + "grad_norm": 1.4746639728546143, + "learning_rate": 6.2509598275891445e-06, + "loss": 1.4905, + "step": 11797 + }, + { + "epoch": 0.6450254910268852, + "grad_norm": 1.4092214107513428, + "learning_rate": 6.249265813706555e-06, + "loss": 1.438, + "step": 11798 + }, + { + "epoch": 0.6450801634706067, + "grad_norm": 1.6033051013946533, + "learning_rate": 6.247571925069782e-06, + "loss": 1.5903, + "step": 11799 + }, + { + "epoch": 0.6451348359143283, + "grad_norm": 1.4868483543395996, + "learning_rate": 6.245878161735386e-06, + "loss": 1.4535, + "step": 11800 + }, + { + "epoch": 0.6451895083580498, + "grad_norm": 2.0000252723693848, + "learning_rate": 6.2441845237599285e-06, + "loss": 1.3184, + "step": 11801 + }, + { + "epoch": 0.6452441808017714, + "grad_norm": 1.5167591571807861, + "learning_rate": 6.242491011199964e-06, + "loss": 1.5343, + "step": 11802 + }, + { + "epoch": 0.645298853245493, + "grad_norm": 1.5848240852355957, + "learning_rate": 6.240797624112037e-06, + "loss": 1.3946, + "step": 11803 + }, + { + "epoch": 0.6453535256892144, + "grad_norm": 1.9187959432601929, + "learning_rate": 6.239104362552704e-06, + "loss": 1.2062, + "step": 11804 + }, + { + "epoch": 0.645408198132936, + "grad_norm": 1.5412768125534058, + "learning_rate": 6.2374112265785e-06, + "loss": 1.6338, + "step": 11805 + }, + { + "epoch": 0.6454628705766576, + "grad_norm": 1.6692456007003784, + "learning_rate": 6.235718216245961e-06, + "loss": 1.7827, + "step": 11806 + }, + { + "epoch": 0.6455175430203791, + "grad_norm": 1.60446035861969, + "learning_rate": 6.2340253316116286e-06, + "loss": 1.3028, + "step": 11807 + }, + { + "epoch": 0.6455722154641007, + "grad_norm": 1.8864362239837646, + "learning_rate": 6.232332572732025e-06, + "loss": 1.5541, + "step": 11808 + }, + { + "epoch": 0.6456268879078223, + "grad_norm": 1.6991465091705322, + "learning_rate": 6.230639939663679e-06, + "loss": 1.5559, + "step": 11809 + }, + { + "epoch": 0.6456815603515438, + "grad_norm": 1.8259254693984985, + "learning_rate": 6.228947432463112e-06, + "loss": 1.3884, + "step": 11810 + }, + { + "epoch": 0.6457362327952654, + "grad_norm": 1.6202360391616821, + "learning_rate": 6.227255051186834e-06, + "loss": 1.3773, + "step": 11811 + }, + { + "epoch": 0.645790905238987, + "grad_norm": 1.7915804386138916, + "learning_rate": 6.2255627958913675e-06, + "loss": 1.5013, + "step": 11812 + }, + { + "epoch": 0.6458455776827084, + "grad_norm": 1.7081538438796997, + "learning_rate": 6.223870666633216e-06, + "loss": 1.3073, + "step": 11813 + }, + { + "epoch": 0.64590025012643, + "grad_norm": 1.311184287071228, + "learning_rate": 6.222178663468883e-06, + "loss": 1.3304, + "step": 11814 + }, + { + "epoch": 0.6459549225701515, + "grad_norm": 1.694230556488037, + "learning_rate": 6.22048678645487e-06, + "loss": 1.4386, + "step": 11815 + }, + { + "epoch": 0.6460095950138731, + "grad_norm": 1.488998293876648, + "learning_rate": 6.218795035647672e-06, + "loss": 1.3296, + "step": 11816 + }, + { + "epoch": 0.6460642674575947, + "grad_norm": 1.490230679512024, + "learning_rate": 6.2171034111037776e-06, + "loss": 1.6033, + "step": 11817 + }, + { + "epoch": 0.6461189399013162, + "grad_norm": 1.5138802528381348, + "learning_rate": 6.215411912879681e-06, + "loss": 1.4803, + "step": 11818 + }, + { + "epoch": 0.6461736123450378, + "grad_norm": 1.8085269927978516, + "learning_rate": 6.2137205410318605e-06, + "loss": 1.2373, + "step": 11819 + }, + { + "epoch": 0.6462282847887594, + "grad_norm": 1.4033396244049072, + "learning_rate": 6.212029295616795e-06, + "loss": 1.4404, + "step": 11820 + }, + { + "epoch": 0.6462829572324809, + "grad_norm": 1.5262203216552734, + "learning_rate": 6.210338176690962e-06, + "loss": 1.4294, + "step": 11821 + }, + { + "epoch": 0.6463376296762025, + "grad_norm": 1.1046056747436523, + "learning_rate": 6.208647184310826e-06, + "loss": 1.5916, + "step": 11822 + }, + { + "epoch": 0.646392302119924, + "grad_norm": 1.5915918350219727, + "learning_rate": 6.206956318532859e-06, + "loss": 1.3186, + "step": 11823 + }, + { + "epoch": 0.6464469745636455, + "grad_norm": 1.5391665697097778, + "learning_rate": 6.205265579413524e-06, + "loss": 1.3774, + "step": 11824 + }, + { + "epoch": 0.6465016470073671, + "grad_norm": 1.629164457321167, + "learning_rate": 6.203574967009271e-06, + "loss": 1.538, + "step": 11825 + }, + { + "epoch": 0.6465563194510887, + "grad_norm": 1.7089951038360596, + "learning_rate": 6.2018844813765635e-06, + "loss": 1.2753, + "step": 11826 + }, + { + "epoch": 0.6466109918948102, + "grad_norm": 1.5679205656051636, + "learning_rate": 6.200194122571843e-06, + "loss": 1.7369, + "step": 11827 + }, + { + "epoch": 0.6466656643385318, + "grad_norm": 1.5290595293045044, + "learning_rate": 6.198503890651557e-06, + "loss": 1.3864, + "step": 11828 + }, + { + "epoch": 0.6467203367822534, + "grad_norm": 1.3700499534606934, + "learning_rate": 6.196813785672149e-06, + "loss": 1.5475, + "step": 11829 + }, + { + "epoch": 0.6467750092259749, + "grad_norm": 1.535009741783142, + "learning_rate": 6.195123807690053e-06, + "loss": 1.5405, + "step": 11830 + }, + { + "epoch": 0.6468296816696965, + "grad_norm": 1.7119786739349365, + "learning_rate": 6.193433956761697e-06, + "loss": 1.3815, + "step": 11831 + }, + { + "epoch": 0.6468843541134179, + "grad_norm": 1.3934824466705322, + "learning_rate": 6.1917442329435175e-06, + "loss": 1.5185, + "step": 11832 + }, + { + "epoch": 0.6469390265571395, + "grad_norm": 1.6131072044372559, + "learning_rate": 6.190054636291935e-06, + "loss": 1.4978, + "step": 11833 + }, + { + "epoch": 0.6469936990008611, + "grad_norm": 2.2573766708374023, + "learning_rate": 6.188365166863366e-06, + "loss": 1.2233, + "step": 11834 + }, + { + "epoch": 0.6470483714445826, + "grad_norm": 1.4340916872024536, + "learning_rate": 6.18667582471423e-06, + "loss": 1.4636, + "step": 11835 + }, + { + "epoch": 0.6471030438883042, + "grad_norm": 1.8204002380371094, + "learning_rate": 6.184986609900934e-06, + "loss": 1.4755, + "step": 11836 + }, + { + "epoch": 0.6471577163320258, + "grad_norm": 1.5638879537582397, + "learning_rate": 6.18329752247989e-06, + "loss": 1.4068, + "step": 11837 + }, + { + "epoch": 0.6472123887757473, + "grad_norm": 1.582229733467102, + "learning_rate": 6.181608562507497e-06, + "loss": 1.4635, + "step": 11838 + }, + { + "epoch": 0.6472670612194689, + "grad_norm": 1.757487416267395, + "learning_rate": 6.179919730040154e-06, + "loss": 1.4812, + "step": 11839 + }, + { + "epoch": 0.6473217336631905, + "grad_norm": 1.4140468835830688, + "learning_rate": 6.178231025134256e-06, + "loss": 1.4084, + "step": 11840 + }, + { + "epoch": 0.6473764061069119, + "grad_norm": 1.0610584020614624, + "learning_rate": 6.176542447846193e-06, + "loss": 1.58, + "step": 11841 + }, + { + "epoch": 0.6474310785506335, + "grad_norm": 1.7835471630096436, + "learning_rate": 6.174853998232346e-06, + "loss": 1.3993, + "step": 11842 + }, + { + "epoch": 0.6474857509943551, + "grad_norm": 1.4636151790618896, + "learning_rate": 6.173165676349103e-06, + "loss": 1.3939, + "step": 11843 + }, + { + "epoch": 0.6475404234380766, + "grad_norm": 1.5566438436508179, + "learning_rate": 6.171477482252839e-06, + "loss": 1.3737, + "step": 11844 + }, + { + "epoch": 0.6475950958817982, + "grad_norm": 1.3988937139511108, + "learning_rate": 6.169789415999921e-06, + "loss": 1.5771, + "step": 11845 + }, + { + "epoch": 0.6476497683255197, + "grad_norm": 1.4119548797607422, + "learning_rate": 6.168101477646726e-06, + "loss": 1.3804, + "step": 11846 + }, + { + "epoch": 0.6477044407692413, + "grad_norm": 1.5648298263549805, + "learning_rate": 6.166413667249615e-06, + "loss": 1.52, + "step": 11847 + }, + { + "epoch": 0.6477591132129629, + "grad_norm": 1.5718477964401245, + "learning_rate": 6.164725984864947e-06, + "loss": 1.2831, + "step": 11848 + }, + { + "epoch": 0.6478137856566843, + "grad_norm": 1.68485689163208, + "learning_rate": 6.163038430549077e-06, + "loss": 1.34, + "step": 11849 + }, + { + "epoch": 0.6478684581004059, + "grad_norm": 1.2948856353759766, + "learning_rate": 6.16135100435836e-06, + "loss": 1.3608, + "step": 11850 + }, + { + "epoch": 0.6479231305441275, + "grad_norm": 1.3764832019805908, + "learning_rate": 6.159663706349136e-06, + "loss": 1.4576, + "step": 11851 + }, + { + "epoch": 0.647977802987849, + "grad_norm": 1.368189811706543, + "learning_rate": 6.157976536577757e-06, + "loss": 1.3725, + "step": 11852 + }, + { + "epoch": 0.6480324754315706, + "grad_norm": 1.589455246925354, + "learning_rate": 6.156289495100553e-06, + "loss": 1.7349, + "step": 11853 + }, + { + "epoch": 0.6480871478752922, + "grad_norm": 1.734025001525879, + "learning_rate": 6.154602581973865e-06, + "loss": 1.2437, + "step": 11854 + }, + { + "epoch": 0.6481418203190137, + "grad_norm": 1.4258651733398438, + "learning_rate": 6.152915797254022e-06, + "loss": 1.4674, + "step": 11855 + }, + { + "epoch": 0.6481964927627353, + "grad_norm": 1.8813084363937378, + "learning_rate": 6.151229140997343e-06, + "loss": 1.4137, + "step": 11856 + }, + { + "epoch": 0.6482511652064569, + "grad_norm": 1.4456318616867065, + "learning_rate": 6.149542613260157e-06, + "loss": 1.2939, + "step": 11857 + }, + { + "epoch": 0.6483058376501784, + "grad_norm": 1.6778693199157715, + "learning_rate": 6.147856214098781e-06, + "loss": 1.3798, + "step": 11858 + }, + { + "epoch": 0.6483605100938999, + "grad_norm": 1.5313090085983276, + "learning_rate": 6.146169943569522e-06, + "loss": 1.3706, + "step": 11859 + }, + { + "epoch": 0.6484151825376214, + "grad_norm": 1.4439817667007446, + "learning_rate": 6.144483801728693e-06, + "loss": 1.266, + "step": 11860 + }, + { + "epoch": 0.648469854981343, + "grad_norm": 1.6050727367401123, + "learning_rate": 6.1427977886326e-06, + "loss": 1.5888, + "step": 11861 + }, + { + "epoch": 0.6485245274250646, + "grad_norm": 1.3635189533233643, + "learning_rate": 6.141111904337534e-06, + "loss": 1.4494, + "step": 11862 + }, + { + "epoch": 0.6485791998687861, + "grad_norm": 1.2657456398010254, + "learning_rate": 6.139426148899801e-06, + "loss": 1.3641, + "step": 11863 + }, + { + "epoch": 0.6486338723125077, + "grad_norm": 1.4774863719940186, + "learning_rate": 6.137740522375687e-06, + "loss": 1.4044, + "step": 11864 + }, + { + "epoch": 0.6486885447562293, + "grad_norm": 1.7064207792282104, + "learning_rate": 6.136055024821477e-06, + "loss": 1.4864, + "step": 11865 + }, + { + "epoch": 0.6487432171999508, + "grad_norm": 1.3997992277145386, + "learning_rate": 6.134369656293461e-06, + "loss": 1.5551, + "step": 11866 + }, + { + "epoch": 0.6487978896436724, + "grad_norm": 1.8768726587295532, + "learning_rate": 6.1326844168479104e-06, + "loss": 1.3745, + "step": 11867 + }, + { + "epoch": 0.648852562087394, + "grad_norm": 1.495766043663025, + "learning_rate": 6.130999306541104e-06, + "loss": 1.4033, + "step": 11868 + }, + { + "epoch": 0.6489072345311154, + "grad_norm": 1.8925633430480957, + "learning_rate": 6.129314325429311e-06, + "loss": 1.1745, + "step": 11869 + }, + { + "epoch": 0.648961906974837, + "grad_norm": 2.4001994132995605, + "learning_rate": 6.12762947356879e-06, + "loss": 1.3611, + "step": 11870 + }, + { + "epoch": 0.6490165794185586, + "grad_norm": 1.6138944625854492, + "learning_rate": 6.1259447510158136e-06, + "loss": 1.2992, + "step": 11871 + }, + { + "epoch": 0.6490712518622801, + "grad_norm": 1.397487759590149, + "learning_rate": 6.124260157826631e-06, + "loss": 1.4901, + "step": 11872 + }, + { + "epoch": 0.6491259243060017, + "grad_norm": 1.4715980291366577, + "learning_rate": 6.122575694057495e-06, + "loss": 1.3493, + "step": 11873 + }, + { + "epoch": 0.6491805967497232, + "grad_norm": 1.3364479541778564, + "learning_rate": 6.120891359764655e-06, + "loss": 1.4698, + "step": 11874 + }, + { + "epoch": 0.6492352691934448, + "grad_norm": 1.3539382219314575, + "learning_rate": 6.1192071550043584e-06, + "loss": 1.6554, + "step": 11875 + }, + { + "epoch": 0.6492899416371664, + "grad_norm": 1.5423020124435425, + "learning_rate": 6.1175230798328365e-06, + "loss": 1.4741, + "step": 11876 + }, + { + "epoch": 0.6493446140808878, + "grad_norm": 1.4364763498306274, + "learning_rate": 6.1158391343063335e-06, + "loss": 1.4251, + "step": 11877 + }, + { + "epoch": 0.6493992865246094, + "grad_norm": 2.167586326599121, + "learning_rate": 6.114155318481076e-06, + "loss": 1.2385, + "step": 11878 + }, + { + "epoch": 0.649453958968331, + "grad_norm": 1.5778107643127441, + "learning_rate": 6.1124716324132885e-06, + "loss": 1.2974, + "step": 11879 + }, + { + "epoch": 0.6495086314120525, + "grad_norm": 1.7524707317352295, + "learning_rate": 6.110788076159198e-06, + "loss": 1.2838, + "step": 11880 + }, + { + "epoch": 0.6495633038557741, + "grad_norm": 1.5293537378311157, + "learning_rate": 6.109104649775016e-06, + "loss": 1.4289, + "step": 11881 + }, + { + "epoch": 0.6496179762994957, + "grad_norm": 1.8224025964736938, + "learning_rate": 6.107421353316965e-06, + "loss": 1.3324, + "step": 11882 + }, + { + "epoch": 0.6496726487432172, + "grad_norm": 1.2629220485687256, + "learning_rate": 6.105738186841248e-06, + "loss": 1.4758, + "step": 11883 + }, + { + "epoch": 0.6497273211869388, + "grad_norm": 2.2754430770874023, + "learning_rate": 6.104055150404067e-06, + "loss": 1.4267, + "step": 11884 + }, + { + "epoch": 0.6497819936306604, + "grad_norm": 1.8597588539123535, + "learning_rate": 6.102372244061631e-06, + "loss": 1.5789, + "step": 11885 + }, + { + "epoch": 0.6498366660743818, + "grad_norm": 1.487559199333191, + "learning_rate": 6.1006894678701314e-06, + "loss": 1.6087, + "step": 11886 + }, + { + "epoch": 0.6498913385181034, + "grad_norm": 1.198846459388733, + "learning_rate": 6.099006821885758e-06, + "loss": 1.6932, + "step": 11887 + }, + { + "epoch": 0.6499460109618249, + "grad_norm": 2.069308042526245, + "learning_rate": 6.097324306164705e-06, + "loss": 1.4802, + "step": 11888 + }, + { + "epoch": 0.6500006834055465, + "grad_norm": 1.896677017211914, + "learning_rate": 6.095641920763149e-06, + "loss": 1.5633, + "step": 11889 + }, + { + "epoch": 0.6500553558492681, + "grad_norm": 1.3736677169799805, + "learning_rate": 6.093959665737268e-06, + "loss": 1.4917, + "step": 11890 + }, + { + "epoch": 0.6501100282929896, + "grad_norm": 1.2629953622817993, + "learning_rate": 6.092277541143243e-06, + "loss": 1.5761, + "step": 11891 + }, + { + "epoch": 0.6501647007367112, + "grad_norm": 1.4661870002746582, + "learning_rate": 6.090595547037242e-06, + "loss": 1.386, + "step": 11892 + }, + { + "epoch": 0.6502193731804328, + "grad_norm": 1.6487503051757812, + "learning_rate": 6.088913683475427e-06, + "loss": 1.5134, + "step": 11893 + }, + { + "epoch": 0.6502740456241543, + "grad_norm": 1.4472191333770752, + "learning_rate": 6.0872319505139635e-06, + "loss": 1.3253, + "step": 11894 + }, + { + "epoch": 0.6503287180678758, + "grad_norm": 1.5426547527313232, + "learning_rate": 6.0855503482090025e-06, + "loss": 1.6336, + "step": 11895 + }, + { + "epoch": 0.6503833905115974, + "grad_norm": 1.643873929977417, + "learning_rate": 6.083868876616706e-06, + "loss": 1.3672, + "step": 11896 + }, + { + "epoch": 0.6504380629553189, + "grad_norm": 1.6917277574539185, + "learning_rate": 6.082187535793216e-06, + "loss": 1.4684, + "step": 11897 + }, + { + "epoch": 0.6504927353990405, + "grad_norm": 1.4820879697799683, + "learning_rate": 6.080506325794675e-06, + "loss": 1.4515, + "step": 11898 + }, + { + "epoch": 0.6505474078427621, + "grad_norm": 1.820566177368164, + "learning_rate": 6.078825246677229e-06, + "loss": 1.641, + "step": 11899 + }, + { + "epoch": 0.6506020802864836, + "grad_norm": 1.5773046016693115, + "learning_rate": 6.077144298497009e-06, + "loss": 1.4062, + "step": 11900 + }, + { + "epoch": 0.6506567527302052, + "grad_norm": 2.218179702758789, + "learning_rate": 6.075463481310141e-06, + "loss": 1.092, + "step": 11901 + }, + { + "epoch": 0.6507114251739267, + "grad_norm": 1.163008213043213, + "learning_rate": 6.073782795172761e-06, + "loss": 1.4669, + "step": 11902 + }, + { + "epoch": 0.6507660976176483, + "grad_norm": 1.8216114044189453, + "learning_rate": 6.0721022401409864e-06, + "loss": 1.1881, + "step": 11903 + }, + { + "epoch": 0.6508207700613698, + "grad_norm": 1.424275517463684, + "learning_rate": 6.070421816270933e-06, + "loss": 1.2029, + "step": 11904 + }, + { + "epoch": 0.6508754425050913, + "grad_norm": 1.5272160768508911, + "learning_rate": 6.068741523618718e-06, + "loss": 1.366, + "step": 11905 + }, + { + "epoch": 0.6509301149488129, + "grad_norm": 1.9092479944229126, + "learning_rate": 6.06706136224045e-06, + "loss": 1.4222, + "step": 11906 + }, + { + "epoch": 0.6509847873925345, + "grad_norm": 2.118665933609009, + "learning_rate": 6.065381332192228e-06, + "loss": 1.4778, + "step": 11907 + }, + { + "epoch": 0.651039459836256, + "grad_norm": 1.7498530149459839, + "learning_rate": 6.06370143353016e-06, + "loss": 1.4788, + "step": 11908 + }, + { + "epoch": 0.6510941322799776, + "grad_norm": 1.3087083101272583, + "learning_rate": 6.0620216663103336e-06, + "loss": 1.3804, + "step": 11909 + }, + { + "epoch": 0.6511488047236992, + "grad_norm": 1.5402467250823975, + "learning_rate": 6.0603420305888484e-06, + "loss": 1.3804, + "step": 11910 + }, + { + "epoch": 0.6512034771674207, + "grad_norm": 1.4716423749923706, + "learning_rate": 6.058662526421787e-06, + "loss": 1.5464, + "step": 11911 + }, + { + "epoch": 0.6512581496111423, + "grad_norm": 1.6683263778686523, + "learning_rate": 6.0569831538652306e-06, + "loss": 1.2983, + "step": 11912 + }, + { + "epoch": 0.6513128220548638, + "grad_norm": 1.919399619102478, + "learning_rate": 6.055303912975261e-06, + "loss": 1.3418, + "step": 11913 + }, + { + "epoch": 0.6513674944985853, + "grad_norm": 1.5865579843521118, + "learning_rate": 6.053624803807951e-06, + "loss": 1.1648, + "step": 11914 + }, + { + "epoch": 0.6514221669423069, + "grad_norm": 1.8459868431091309, + "learning_rate": 6.051945826419366e-06, + "loss": 1.3543, + "step": 11915 + }, + { + "epoch": 0.6514768393860284, + "grad_norm": 1.2856051921844482, + "learning_rate": 6.0502669808655774e-06, + "loss": 1.5932, + "step": 11916 + }, + { + "epoch": 0.65153151182975, + "grad_norm": 1.4652116298675537, + "learning_rate": 6.0485882672026415e-06, + "loss": 1.4667, + "step": 11917 + }, + { + "epoch": 0.6515861842734716, + "grad_norm": 1.782402753829956, + "learning_rate": 6.046909685486615e-06, + "loss": 1.4007, + "step": 11918 + }, + { + "epoch": 0.6516408567171931, + "grad_norm": 1.5955498218536377, + "learning_rate": 6.045231235773552e-06, + "loss": 1.4681, + "step": 11919 + }, + { + "epoch": 0.6516955291609147, + "grad_norm": 1.5662522315979004, + "learning_rate": 6.0435529181195e-06, + "loss": 1.458, + "step": 11920 + }, + { + "epoch": 0.6517502016046363, + "grad_norm": 1.4059232473373413, + "learning_rate": 6.041874732580493e-06, + "loss": 1.4962, + "step": 11921 + }, + { + "epoch": 0.6518048740483577, + "grad_norm": 1.3212960958480835, + "learning_rate": 6.040196679212582e-06, + "loss": 1.1754, + "step": 11922 + }, + { + "epoch": 0.6518595464920793, + "grad_norm": 1.4461475610733032, + "learning_rate": 6.0385187580717915e-06, + "loss": 1.1389, + "step": 11923 + }, + { + "epoch": 0.6519142189358009, + "grad_norm": 1.3887560367584229, + "learning_rate": 6.0368409692141615e-06, + "loss": 1.6046, + "step": 11924 + }, + { + "epoch": 0.6519688913795224, + "grad_norm": 1.58475923538208, + "learning_rate": 6.035163312695709e-06, + "loss": 1.2314, + "step": 11925 + }, + { + "epoch": 0.652023563823244, + "grad_norm": 1.8801347017288208, + "learning_rate": 6.0334857885724575e-06, + "loss": 1.2843, + "step": 11926 + }, + { + "epoch": 0.6520782362669656, + "grad_norm": 1.635109543800354, + "learning_rate": 6.031808396900422e-06, + "loss": 1.4458, + "step": 11927 + }, + { + "epoch": 0.6521329087106871, + "grad_norm": 1.7954210042953491, + "learning_rate": 6.030131137735618e-06, + "loss": 1.3799, + "step": 11928 + }, + { + "epoch": 0.6521875811544087, + "grad_norm": 1.6822597980499268, + "learning_rate": 6.028454011134047e-06, + "loss": 1.3631, + "step": 11929 + }, + { + "epoch": 0.6522422535981302, + "grad_norm": 1.9486403465270996, + "learning_rate": 6.026777017151719e-06, + "loss": 1.5044, + "step": 11930 + }, + { + "epoch": 0.6522969260418517, + "grad_norm": 1.4714323282241821, + "learning_rate": 6.025100155844632e-06, + "loss": 1.4836, + "step": 11931 + }, + { + "epoch": 0.6523515984855733, + "grad_norm": 1.5471583604812622, + "learning_rate": 6.023423427268774e-06, + "loss": 1.398, + "step": 11932 + }, + { + "epoch": 0.6524062709292948, + "grad_norm": 1.8092068433761597, + "learning_rate": 6.021746831480142e-06, + "loss": 1.7235, + "step": 11933 + }, + { + "epoch": 0.6524609433730164, + "grad_norm": 1.451186180114746, + "learning_rate": 6.020070368534719e-06, + "loss": 1.4081, + "step": 11934 + }, + { + "epoch": 0.652515615816738, + "grad_norm": 1.319523811340332, + "learning_rate": 6.01839403848848e-06, + "loss": 1.5775, + "step": 11935 + }, + { + "epoch": 0.6525702882604595, + "grad_norm": 1.6486692428588867, + "learning_rate": 6.016717841397413e-06, + "loss": 1.3488, + "step": 11936 + }, + { + "epoch": 0.6526249607041811, + "grad_norm": 1.6963770389556885, + "learning_rate": 6.015041777317481e-06, + "loss": 1.6011, + "step": 11937 + }, + { + "epoch": 0.6526796331479027, + "grad_norm": 1.962693452835083, + "learning_rate": 6.013365846304657e-06, + "loss": 1.5731, + "step": 11938 + }, + { + "epoch": 0.6527343055916242, + "grad_norm": 1.4702812433242798, + "learning_rate": 6.0116900484149046e-06, + "loss": 1.3855, + "step": 11939 + }, + { + "epoch": 0.6527889780353457, + "grad_norm": 1.473361611366272, + "learning_rate": 6.010014383704174e-06, + "loss": 1.2476, + "step": 11940 + }, + { + "epoch": 0.6528436504790673, + "grad_norm": 1.7360057830810547, + "learning_rate": 6.00833885222843e-06, + "loss": 1.644, + "step": 11941 + }, + { + "epoch": 0.6528983229227888, + "grad_norm": 1.7788697481155396, + "learning_rate": 6.00666345404362e-06, + "loss": 1.3418, + "step": 11942 + }, + { + "epoch": 0.6529529953665104, + "grad_norm": 1.5637296438217163, + "learning_rate": 6.004988189205683e-06, + "loss": 1.1139, + "step": 11943 + }, + { + "epoch": 0.6530076678102319, + "grad_norm": 1.5965642929077148, + "learning_rate": 6.003313057770568e-06, + "loss": 1.4841, + "step": 11944 + }, + { + "epoch": 0.6530623402539535, + "grad_norm": 1.4266948699951172, + "learning_rate": 6.001638059794211e-06, + "loss": 1.1772, + "step": 11945 + }, + { + "epoch": 0.6531170126976751, + "grad_norm": 1.496363878250122, + "learning_rate": 5.999963195332536e-06, + "loss": 1.6273, + "step": 11946 + }, + { + "epoch": 0.6531716851413966, + "grad_norm": 1.3837610483169556, + "learning_rate": 5.9982884644414815e-06, + "loss": 1.3625, + "step": 11947 + }, + { + "epoch": 0.6532263575851182, + "grad_norm": 1.7780522108078003, + "learning_rate": 5.996613867176964e-06, + "loss": 1.3902, + "step": 11948 + }, + { + "epoch": 0.6532810300288397, + "grad_norm": 1.787841558456421, + "learning_rate": 5.994939403594899e-06, + "loss": 1.3386, + "step": 11949 + }, + { + "epoch": 0.6533357024725612, + "grad_norm": 1.313425064086914, + "learning_rate": 5.993265073751211e-06, + "loss": 1.4771, + "step": 11950 + }, + { + "epoch": 0.6533903749162828, + "grad_norm": 1.6495310068130493, + "learning_rate": 5.9915908777018026e-06, + "loss": 1.4398, + "step": 11951 + }, + { + "epoch": 0.6534450473600044, + "grad_norm": 1.3157762289047241, + "learning_rate": 5.989916815502581e-06, + "loss": 1.4849, + "step": 11952 + }, + { + "epoch": 0.6534997198037259, + "grad_norm": 2.6838009357452393, + "learning_rate": 5.9882428872094475e-06, + "loss": 1.5553, + "step": 11953 + }, + { + "epoch": 0.6535543922474475, + "grad_norm": 1.7370972633361816, + "learning_rate": 5.986569092878296e-06, + "loss": 1.6017, + "step": 11954 + }, + { + "epoch": 0.6536090646911691, + "grad_norm": 1.2297358512878418, + "learning_rate": 5.984895432565022e-06, + "loss": 1.8412, + "step": 11955 + }, + { + "epoch": 0.6536637371348906, + "grad_norm": 1.34415602684021, + "learning_rate": 5.983221906325512e-06, + "loss": 1.5842, + "step": 11956 + }, + { + "epoch": 0.6537184095786122, + "grad_norm": 1.591476559638977, + "learning_rate": 5.981548514215646e-06, + "loss": 1.773, + "step": 11957 + }, + { + "epoch": 0.6537730820223336, + "grad_norm": 1.7197377681732178, + "learning_rate": 5.979875256291307e-06, + "loss": 1.484, + "step": 11958 + }, + { + "epoch": 0.6538277544660552, + "grad_norm": 1.4387826919555664, + "learning_rate": 5.9782021326083665e-06, + "loss": 1.4454, + "step": 11959 + }, + { + "epoch": 0.6538824269097768, + "grad_norm": 1.2666640281677246, + "learning_rate": 5.976529143222689e-06, + "loss": 1.5365, + "step": 11960 + }, + { + "epoch": 0.6539370993534983, + "grad_norm": 1.5090231895446777, + "learning_rate": 5.9748562881901504e-06, + "loss": 1.389, + "step": 11961 + }, + { + "epoch": 0.6539917717972199, + "grad_norm": 1.7347179651260376, + "learning_rate": 5.973183567566605e-06, + "loss": 1.5907, + "step": 11962 + }, + { + "epoch": 0.6540464442409415, + "grad_norm": 1.4318712949752808, + "learning_rate": 5.9715109814079085e-06, + "loss": 1.5849, + "step": 11963 + }, + { + "epoch": 0.654101116684663, + "grad_norm": 1.2237480878829956, + "learning_rate": 5.969838529769914e-06, + "loss": 1.5832, + "step": 11964 + }, + { + "epoch": 0.6541557891283846, + "grad_norm": 1.7276796102523804, + "learning_rate": 5.968166212708465e-06, + "loss": 1.351, + "step": 11965 + }, + { + "epoch": 0.6542104615721062, + "grad_norm": 1.584828495979309, + "learning_rate": 5.966494030279411e-06, + "loss": 1.4175, + "step": 11966 + }, + { + "epoch": 0.6542651340158276, + "grad_norm": 1.5189799070358276, + "learning_rate": 5.964821982538586e-06, + "loss": 1.3573, + "step": 11967 + }, + { + "epoch": 0.6543198064595492, + "grad_norm": 1.5695611238479614, + "learning_rate": 5.96315006954182e-06, + "loss": 1.313, + "step": 11968 + }, + { + "epoch": 0.6543744789032708, + "grad_norm": 1.2374932765960693, + "learning_rate": 5.96147829134495e-06, + "loss": 1.3523, + "step": 11969 + }, + { + "epoch": 0.6544291513469923, + "grad_norm": 1.5640524625778198, + "learning_rate": 5.959806648003796e-06, + "loss": 1.3318, + "step": 11970 + }, + { + "epoch": 0.6544838237907139, + "grad_norm": 1.4350824356079102, + "learning_rate": 5.958135139574177e-06, + "loss": 1.4818, + "step": 11971 + }, + { + "epoch": 0.6545384962344354, + "grad_norm": 1.3837084770202637, + "learning_rate": 5.956463766111913e-06, + "loss": 1.6113, + "step": 11972 + }, + { + "epoch": 0.654593168678157, + "grad_norm": 1.3003454208374023, + "learning_rate": 5.954792527672812e-06, + "loss": 1.3751, + "step": 11973 + }, + { + "epoch": 0.6546478411218786, + "grad_norm": 1.4933891296386719, + "learning_rate": 5.953121424312676e-06, + "loss": 1.4483, + "step": 11974 + }, + { + "epoch": 0.6547025135656, + "grad_norm": 1.5878294706344604, + "learning_rate": 5.951450456087317e-06, + "loss": 1.5386, + "step": 11975 + }, + { + "epoch": 0.6547571860093216, + "grad_norm": 1.6281371116638184, + "learning_rate": 5.949779623052526e-06, + "loss": 1.4599, + "step": 11976 + }, + { + "epoch": 0.6548118584530432, + "grad_norm": 1.7525817155838013, + "learning_rate": 5.948108925264096e-06, + "loss": 1.643, + "step": 11977 + }, + { + "epoch": 0.6548665308967647, + "grad_norm": 2.8235585689544678, + "learning_rate": 5.94643836277782e-06, + "loss": 1.5961, + "step": 11978 + }, + { + "epoch": 0.6549212033404863, + "grad_norm": 1.5169492959976196, + "learning_rate": 5.944767935649475e-06, + "loss": 1.4311, + "step": 11979 + }, + { + "epoch": 0.6549758757842079, + "grad_norm": 1.6786473989486694, + "learning_rate": 5.943097643934847e-06, + "loss": 1.5246, + "step": 11980 + }, + { + "epoch": 0.6550305482279294, + "grad_norm": 1.4681916236877441, + "learning_rate": 5.941427487689711e-06, + "loss": 1.2903, + "step": 11981 + }, + { + "epoch": 0.655085220671651, + "grad_norm": 1.4545563459396362, + "learning_rate": 5.939757466969831e-06, + "loss": 1.3448, + "step": 11982 + }, + { + "epoch": 0.6551398931153726, + "grad_norm": 1.6065605878829956, + "learning_rate": 5.9380875818309805e-06, + "loss": 1.3304, + "step": 11983 + }, + { + "epoch": 0.655194565559094, + "grad_norm": 1.5228513479232788, + "learning_rate": 5.9364178323289155e-06, + "loss": 1.4051, + "step": 11984 + }, + { + "epoch": 0.6552492380028156, + "grad_norm": 1.532241940498352, + "learning_rate": 5.934748218519391e-06, + "loss": 1.3693, + "step": 11985 + }, + { + "epoch": 0.6553039104465371, + "grad_norm": 1.524013876914978, + "learning_rate": 5.933078740458167e-06, + "loss": 1.6466, + "step": 11986 + }, + { + "epoch": 0.6553585828902587, + "grad_norm": 1.473665714263916, + "learning_rate": 5.931409398200987e-06, + "loss": 1.1944, + "step": 11987 + }, + { + "epoch": 0.6554132553339803, + "grad_norm": 1.5464293956756592, + "learning_rate": 5.92974019180359e-06, + "loss": 1.3827, + "step": 11988 + }, + { + "epoch": 0.6554679277777018, + "grad_norm": 2.264580249786377, + "learning_rate": 5.928071121321723e-06, + "loss": 1.4629, + "step": 11989 + }, + { + "epoch": 0.6555226002214234, + "grad_norm": 1.3264564275741577, + "learning_rate": 5.926402186811118e-06, + "loss": 1.3691, + "step": 11990 + }, + { + "epoch": 0.655577272665145, + "grad_norm": 1.5808309316635132, + "learning_rate": 5.924733388327501e-06, + "loss": 1.5199, + "step": 11991 + }, + { + "epoch": 0.6556319451088665, + "grad_norm": 1.2798891067504883, + "learning_rate": 5.9230647259266e-06, + "loss": 1.4139, + "step": 11992 + }, + { + "epoch": 0.6556866175525881, + "grad_norm": 1.3437153100967407, + "learning_rate": 5.9213961996641315e-06, + "loss": 1.4593, + "step": 11993 + }, + { + "epoch": 0.6557412899963097, + "grad_norm": 1.4049746990203857, + "learning_rate": 5.919727809595816e-06, + "loss": 1.4327, + "step": 11994 + }, + { + "epoch": 0.6557959624400311, + "grad_norm": 1.3596034049987793, + "learning_rate": 5.918059555777367e-06, + "loss": 1.5219, + "step": 11995 + }, + { + "epoch": 0.6558506348837527, + "grad_norm": 1.6194498538970947, + "learning_rate": 5.916391438264484e-06, + "loss": 1.3216, + "step": 11996 + }, + { + "epoch": 0.6559053073274743, + "grad_norm": 1.440741777420044, + "learning_rate": 5.914723457112877e-06, + "loss": 1.4381, + "step": 11997 + }, + { + "epoch": 0.6559599797711958, + "grad_norm": 2.3395297527313232, + "learning_rate": 5.913055612378238e-06, + "loss": 1.5589, + "step": 11998 + }, + { + "epoch": 0.6560146522149174, + "grad_norm": 1.2149291038513184, + "learning_rate": 5.9113879041162595e-06, + "loss": 1.553, + "step": 11999 + }, + { + "epoch": 0.6560693246586389, + "grad_norm": 1.9311703443527222, + "learning_rate": 5.909720332382638e-06, + "loss": 1.4556, + "step": 12000 + }, + { + "epoch": 0.6561239971023605, + "grad_norm": 1.468773365020752, + "learning_rate": 5.908052897233052e-06, + "loss": 1.5176, + "step": 12001 + }, + { + "epoch": 0.6561786695460821, + "grad_norm": 1.4942787885665894, + "learning_rate": 5.9063855987231785e-06, + "loss": 1.3692, + "step": 12002 + }, + { + "epoch": 0.6562333419898035, + "grad_norm": 1.5557525157928467, + "learning_rate": 5.9047184369086994e-06, + "loss": 1.4137, + "step": 12003 + }, + { + "epoch": 0.6562880144335251, + "grad_norm": 1.3405786752700806, + "learning_rate": 5.903051411845282e-06, + "loss": 1.4624, + "step": 12004 + }, + { + "epoch": 0.6563426868772467, + "grad_norm": 1.3440712690353394, + "learning_rate": 5.901384523588586e-06, + "loss": 1.4604, + "step": 12005 + }, + { + "epoch": 0.6563973593209682, + "grad_norm": 1.4463046789169312, + "learning_rate": 5.899717772194283e-06, + "loss": 1.5028, + "step": 12006 + }, + { + "epoch": 0.6564520317646898, + "grad_norm": 2.268357992172241, + "learning_rate": 5.898051157718022e-06, + "loss": 1.4406, + "step": 12007 + }, + { + "epoch": 0.6565067042084114, + "grad_norm": 1.345040202140808, + "learning_rate": 5.896384680215461e-06, + "loss": 1.5433, + "step": 12008 + }, + { + "epoch": 0.6565613766521329, + "grad_norm": 1.6160523891448975, + "learning_rate": 5.894718339742247e-06, + "loss": 1.1899, + "step": 12009 + }, + { + "epoch": 0.6566160490958545, + "grad_norm": 1.3251092433929443, + "learning_rate": 5.893052136354019e-06, + "loss": 1.6038, + "step": 12010 + }, + { + "epoch": 0.6566707215395761, + "grad_norm": 1.6299140453338623, + "learning_rate": 5.8913860701064175e-06, + "loss": 1.5234, + "step": 12011 + }, + { + "epoch": 0.6567253939832975, + "grad_norm": 1.4718691110610962, + "learning_rate": 5.889720141055077e-06, + "loss": 1.478, + "step": 12012 + }, + { + "epoch": 0.6567800664270191, + "grad_norm": 1.6108582019805908, + "learning_rate": 5.888054349255622e-06, + "loss": 1.3447, + "step": 12013 + }, + { + "epoch": 0.6568347388707406, + "grad_norm": 1.819890022277832, + "learning_rate": 5.886388694763685e-06, + "loss": 1.4894, + "step": 12014 + }, + { + "epoch": 0.6568894113144622, + "grad_norm": 1.6497435569763184, + "learning_rate": 5.884723177634884e-06, + "loss": 1.4738, + "step": 12015 + }, + { + "epoch": 0.6569440837581838, + "grad_norm": 1.5739381313323975, + "learning_rate": 5.883057797924829e-06, + "loss": 1.4239, + "step": 12016 + }, + { + "epoch": 0.6569987562019053, + "grad_norm": 1.3588629961013794, + "learning_rate": 5.881392555689137e-06, + "loss": 1.5357, + "step": 12017 + }, + { + "epoch": 0.6570534286456269, + "grad_norm": 1.666534423828125, + "learning_rate": 5.879727450983412e-06, + "loss": 1.4065, + "step": 12018 + }, + { + "epoch": 0.6571081010893485, + "grad_norm": 1.3885499238967896, + "learning_rate": 5.878062483863254e-06, + "loss": 1.3007, + "step": 12019 + }, + { + "epoch": 0.65716277353307, + "grad_norm": 1.5562400817871094, + "learning_rate": 5.876397654384265e-06, + "loss": 1.4113, + "step": 12020 + }, + { + "epoch": 0.6572174459767915, + "grad_norm": 1.4518065452575684, + "learning_rate": 5.874732962602032e-06, + "loss": 1.4346, + "step": 12021 + }, + { + "epoch": 0.6572721184205131, + "grad_norm": 1.3239150047302246, + "learning_rate": 5.873068408572148e-06, + "loss": 1.3399, + "step": 12022 + }, + { + "epoch": 0.6573267908642346, + "grad_norm": 1.5097545385360718, + "learning_rate": 5.871403992350194e-06, + "loss": 1.4202, + "step": 12023 + }, + { + "epoch": 0.6573814633079562, + "grad_norm": 1.4779644012451172, + "learning_rate": 5.8697397139917446e-06, + "loss": 1.6371, + "step": 12024 + }, + { + "epoch": 0.6574361357516778, + "grad_norm": 1.975235104560852, + "learning_rate": 5.868075573552383e-06, + "loss": 1.5617, + "step": 12025 + }, + { + "epoch": 0.6574908081953993, + "grad_norm": 1.4610528945922852, + "learning_rate": 5.866411571087672e-06, + "loss": 1.3915, + "step": 12026 + }, + { + "epoch": 0.6575454806391209, + "grad_norm": 1.662747859954834, + "learning_rate": 5.864747706653176e-06, + "loss": 1.3938, + "step": 12027 + }, + { + "epoch": 0.6576001530828425, + "grad_norm": 1.4090735912322998, + "learning_rate": 5.8630839803044615e-06, + "loss": 1.5234, + "step": 12028 + }, + { + "epoch": 0.657654825526564, + "grad_norm": 1.8170995712280273, + "learning_rate": 5.86142039209708e-06, + "loss": 1.4237, + "step": 12029 + }, + { + "epoch": 0.6577094979702856, + "grad_norm": 1.515059471130371, + "learning_rate": 5.85975694208658e-06, + "loss": 1.7293, + "step": 12030 + }, + { + "epoch": 0.657764170414007, + "grad_norm": 1.568267822265625, + "learning_rate": 5.8580936303285165e-06, + "loss": 1.6432, + "step": 12031 + }, + { + "epoch": 0.6578188428577286, + "grad_norm": 1.2302361726760864, + "learning_rate": 5.856430456878424e-06, + "loss": 1.4358, + "step": 12032 + }, + { + "epoch": 0.6578735153014502, + "grad_norm": 1.4395878314971924, + "learning_rate": 5.8547674217918374e-06, + "loss": 1.1559, + "step": 12033 + }, + { + "epoch": 0.6579281877451717, + "grad_norm": 1.6222041845321655, + "learning_rate": 5.853104525124298e-06, + "loss": 1.2264, + "step": 12034 + }, + { + "epoch": 0.6579828601888933, + "grad_norm": 1.6536369323730469, + "learning_rate": 5.851441766931328e-06, + "loss": 1.4022, + "step": 12035 + }, + { + "epoch": 0.6580375326326149, + "grad_norm": 1.4461650848388672, + "learning_rate": 5.849779147268453e-06, + "loss": 1.4272, + "step": 12036 + }, + { + "epoch": 0.6580922050763364, + "grad_norm": 1.5594865083694458, + "learning_rate": 5.8481166661911915e-06, + "loss": 1.2322, + "step": 12037 + }, + { + "epoch": 0.658146877520058, + "grad_norm": 1.4645211696624756, + "learning_rate": 5.846454323755053e-06, + "loss": 1.4141, + "step": 12038 + }, + { + "epoch": 0.6582015499637796, + "grad_norm": 1.437299370765686, + "learning_rate": 5.844792120015556e-06, + "loss": 1.3622, + "step": 12039 + }, + { + "epoch": 0.658256222407501, + "grad_norm": 1.478845238685608, + "learning_rate": 5.843130055028201e-06, + "loss": 1.5338, + "step": 12040 + }, + { + "epoch": 0.6583108948512226, + "grad_norm": 1.538744568824768, + "learning_rate": 5.841468128848484e-06, + "loss": 1.2451, + "step": 12041 + }, + { + "epoch": 0.6583655672949442, + "grad_norm": 1.5449538230895996, + "learning_rate": 5.839806341531908e-06, + "loss": 1.4056, + "step": 12042 + }, + { + "epoch": 0.6584202397386657, + "grad_norm": 1.2552454471588135, + "learning_rate": 5.838144693133958e-06, + "loss": 1.4616, + "step": 12043 + }, + { + "epoch": 0.6584749121823873, + "grad_norm": 1.5740492343902588, + "learning_rate": 5.836483183710122e-06, + "loss": 1.725, + "step": 12044 + }, + { + "epoch": 0.6585295846261088, + "grad_norm": 1.8703722953796387, + "learning_rate": 5.8348218133158855e-06, + "loss": 1.431, + "step": 12045 + }, + { + "epoch": 0.6585842570698304, + "grad_norm": 1.2737228870391846, + "learning_rate": 5.833160582006722e-06, + "loss": 1.6459, + "step": 12046 + }, + { + "epoch": 0.658638929513552, + "grad_norm": 1.4841116666793823, + "learning_rate": 5.831499489838105e-06, + "loss": 1.6007, + "step": 12047 + }, + { + "epoch": 0.6586936019572734, + "grad_norm": 1.4018696546554565, + "learning_rate": 5.829838536865502e-06, + "loss": 1.3476, + "step": 12048 + }, + { + "epoch": 0.658748274400995, + "grad_norm": 1.3637053966522217, + "learning_rate": 5.82817772314437e-06, + "loss": 1.3708, + "step": 12049 + }, + { + "epoch": 0.6588029468447166, + "grad_norm": 1.7237621545791626, + "learning_rate": 5.8265170487301806e-06, + "loss": 1.4188, + "step": 12050 + }, + { + "epoch": 0.6588576192884381, + "grad_norm": 1.71147882938385, + "learning_rate": 5.8248565136783786e-06, + "loss": 1.2912, + "step": 12051 + }, + { + "epoch": 0.6589122917321597, + "grad_norm": 1.5242787599563599, + "learning_rate": 5.82319611804441e-06, + "loss": 1.584, + "step": 12052 + }, + { + "epoch": 0.6589669641758813, + "grad_norm": 1.672202706336975, + "learning_rate": 5.821535861883729e-06, + "loss": 1.6253, + "step": 12053 + }, + { + "epoch": 0.6590216366196028, + "grad_norm": 2.2233169078826904, + "learning_rate": 5.819875745251771e-06, + "loss": 1.3802, + "step": 12054 + }, + { + "epoch": 0.6590763090633244, + "grad_norm": 1.3227274417877197, + "learning_rate": 5.8182157682039665e-06, + "loss": 1.4983, + "step": 12055 + }, + { + "epoch": 0.659130981507046, + "grad_norm": 1.4736576080322266, + "learning_rate": 5.816555930795754e-06, + "loss": 1.5489, + "step": 12056 + }, + { + "epoch": 0.6591856539507674, + "grad_norm": 1.5875499248504639, + "learning_rate": 5.814896233082556e-06, + "loss": 1.4746, + "step": 12057 + }, + { + "epoch": 0.659240326394489, + "grad_norm": 1.4480445384979248, + "learning_rate": 5.813236675119793e-06, + "loss": 1.4943, + "step": 12058 + }, + { + "epoch": 0.6592949988382105, + "grad_norm": 1.6355326175689697, + "learning_rate": 5.811577256962883e-06, + "loss": 1.4516, + "step": 12059 + }, + { + "epoch": 0.6593496712819321, + "grad_norm": 1.4163800477981567, + "learning_rate": 5.8099179786672365e-06, + "loss": 1.426, + "step": 12060 + }, + { + "epoch": 0.6594043437256537, + "grad_norm": 1.743087649345398, + "learning_rate": 5.808258840288257e-06, + "loss": 1.4038, + "step": 12061 + }, + { + "epoch": 0.6594590161693752, + "grad_norm": 1.5601674318313599, + "learning_rate": 5.806599841881355e-06, + "loss": 1.4142, + "step": 12062 + }, + { + "epoch": 0.6595136886130968, + "grad_norm": 1.9074304103851318, + "learning_rate": 5.8049409835019215e-06, + "loss": 1.4453, + "step": 12063 + }, + { + "epoch": 0.6595683610568184, + "grad_norm": 1.7125523090362549, + "learning_rate": 5.803282265205354e-06, + "loss": 1.4244, + "step": 12064 + }, + { + "epoch": 0.6596230335005399, + "grad_norm": 1.4645042419433594, + "learning_rate": 5.801623687047041e-06, + "loss": 1.5577, + "step": 12065 + }, + { + "epoch": 0.6596777059442614, + "grad_norm": 2.339202642440796, + "learning_rate": 5.799965249082361e-06, + "loss": 1.3495, + "step": 12066 + }, + { + "epoch": 0.659732378387983, + "grad_norm": 1.3127819299697876, + "learning_rate": 5.798306951366701e-06, + "loss": 1.3717, + "step": 12067 + }, + { + "epoch": 0.6597870508317045, + "grad_norm": 1.4277527332305908, + "learning_rate": 5.79664879395543e-06, + "loss": 1.4101, + "step": 12068 + }, + { + "epoch": 0.6598417232754261, + "grad_norm": 1.5177229642868042, + "learning_rate": 5.794990776903917e-06, + "loss": 1.5082, + "step": 12069 + }, + { + "epoch": 0.6598963957191477, + "grad_norm": 1.5130245685577393, + "learning_rate": 5.793332900267534e-06, + "loss": 1.1832, + "step": 12070 + }, + { + "epoch": 0.6599510681628692, + "grad_norm": 1.787922978401184, + "learning_rate": 5.7916751641016356e-06, + "loss": 1.5908, + "step": 12071 + }, + { + "epoch": 0.6600057406065908, + "grad_norm": 1.401417851448059, + "learning_rate": 5.7900175684615786e-06, + "loss": 1.4158, + "step": 12072 + }, + { + "epoch": 0.6600604130503123, + "grad_norm": 1.6794434785842896, + "learning_rate": 5.788360113402713e-06, + "loss": 1.4182, + "step": 12073 + }, + { + "epoch": 0.6601150854940339, + "grad_norm": 1.7519397735595703, + "learning_rate": 5.786702798980388e-06, + "loss": 1.5636, + "step": 12074 + }, + { + "epoch": 0.6601697579377555, + "grad_norm": 1.7444006204605103, + "learning_rate": 5.7850456252499386e-06, + "loss": 1.2992, + "step": 12075 + }, + { + "epoch": 0.6602244303814769, + "grad_norm": 1.7413535118103027, + "learning_rate": 5.78338859226671e-06, + "loss": 1.5973, + "step": 12076 + }, + { + "epoch": 0.6602791028251985, + "grad_norm": 1.726138710975647, + "learning_rate": 5.781731700086028e-06, + "loss": 1.3604, + "step": 12077 + }, + { + "epoch": 0.6603337752689201, + "grad_norm": 1.4258896112442017, + "learning_rate": 5.780074948763226e-06, + "loss": 1.4418, + "step": 12078 + }, + { + "epoch": 0.6603884477126416, + "grad_norm": 1.3650277853012085, + "learning_rate": 5.778418338353624e-06, + "loss": 1.4509, + "step": 12079 + }, + { + "epoch": 0.6604431201563632, + "grad_norm": 1.860967755317688, + "learning_rate": 5.776761868912537e-06, + "loss": 1.2086, + "step": 12080 + }, + { + "epoch": 0.6604977926000848, + "grad_norm": 1.3394920825958252, + "learning_rate": 5.775105540495284e-06, + "loss": 1.5353, + "step": 12081 + }, + { + "epoch": 0.6605524650438063, + "grad_norm": 1.5221244096755981, + "learning_rate": 5.773449353157172e-06, + "loss": 1.3303, + "step": 12082 + }, + { + "epoch": 0.6606071374875279, + "grad_norm": 2.0186517238616943, + "learning_rate": 5.771793306953504e-06, + "loss": 1.6783, + "step": 12083 + }, + { + "epoch": 0.6606618099312495, + "grad_norm": 1.6956044435501099, + "learning_rate": 5.770137401939577e-06, + "loss": 1.4591, + "step": 12084 + }, + { + "epoch": 0.6607164823749709, + "grad_norm": 1.8549796342849731, + "learning_rate": 5.768481638170691e-06, + "loss": 1.4413, + "step": 12085 + }, + { + "epoch": 0.6607711548186925, + "grad_norm": 1.4732567071914673, + "learning_rate": 5.766826015702127e-06, + "loss": 1.3999, + "step": 12086 + }, + { + "epoch": 0.660825827262414, + "grad_norm": 1.343719482421875, + "learning_rate": 5.7651705345891795e-06, + "loss": 1.3941, + "step": 12087 + }, + { + "epoch": 0.6608804997061356, + "grad_norm": 1.5637540817260742, + "learning_rate": 5.763515194887126e-06, + "loss": 1.5366, + "step": 12088 + }, + { + "epoch": 0.6609351721498572, + "grad_norm": 1.6097193956375122, + "learning_rate": 5.761859996651237e-06, + "loss": 1.5364, + "step": 12089 + }, + { + "epoch": 0.6609898445935787, + "grad_norm": 1.3531659841537476, + "learning_rate": 5.760204939936791e-06, + "loss": 1.5288, + "step": 12090 + }, + { + "epoch": 0.6610445170373003, + "grad_norm": 1.1623896360397339, + "learning_rate": 5.758550024799049e-06, + "loss": 1.5921, + "step": 12091 + }, + { + "epoch": 0.6610991894810219, + "grad_norm": 1.500412106513977, + "learning_rate": 5.756895251293277e-06, + "loss": 1.3199, + "step": 12092 + }, + { + "epoch": 0.6611538619247433, + "grad_norm": 1.4259966611862183, + "learning_rate": 5.75524061947473e-06, + "loss": 1.565, + "step": 12093 + }, + { + "epoch": 0.6612085343684649, + "grad_norm": 1.600795030593872, + "learning_rate": 5.753586129398655e-06, + "loss": 1.398, + "step": 12094 + }, + { + "epoch": 0.6612632068121865, + "grad_norm": 1.5244487524032593, + "learning_rate": 5.751931781120308e-06, + "loss": 1.5674, + "step": 12095 + }, + { + "epoch": 0.661317879255908, + "grad_norm": 1.3453314304351807, + "learning_rate": 5.750277574694927e-06, + "loss": 1.5546, + "step": 12096 + }, + { + "epoch": 0.6613725516996296, + "grad_norm": 1.7152047157287598, + "learning_rate": 5.748623510177751e-06, + "loss": 1.2655, + "step": 12097 + }, + { + "epoch": 0.6614272241433512, + "grad_norm": 1.4638670682907104, + "learning_rate": 5.746969587624011e-06, + "loss": 1.2498, + "step": 12098 + }, + { + "epoch": 0.6614818965870727, + "grad_norm": 1.490842580795288, + "learning_rate": 5.745315807088936e-06, + "loss": 1.4942, + "step": 12099 + }, + { + "epoch": 0.6615365690307943, + "grad_norm": 1.538072109222412, + "learning_rate": 5.743662168627747e-06, + "loss": 1.5559, + "step": 12100 + }, + { + "epoch": 0.6615912414745158, + "grad_norm": 1.4873422384262085, + "learning_rate": 5.74200867229567e-06, + "loss": 1.5382, + "step": 12101 + }, + { + "epoch": 0.6616459139182373, + "grad_norm": 1.2758268117904663, + "learning_rate": 5.740355318147916e-06, + "loss": 1.2572, + "step": 12102 + }, + { + "epoch": 0.6617005863619589, + "grad_norm": 1.3080989122390747, + "learning_rate": 5.738702106239689e-06, + "loss": 1.4396, + "step": 12103 + }, + { + "epoch": 0.6617552588056804, + "grad_norm": 1.5991170406341553, + "learning_rate": 5.737049036626201e-06, + "loss": 1.4407, + "step": 12104 + }, + { + "epoch": 0.661809931249402, + "grad_norm": 1.6547949314117432, + "learning_rate": 5.735396109362646e-06, + "loss": 1.4139, + "step": 12105 + }, + { + "epoch": 0.6618646036931236, + "grad_norm": 1.5609195232391357, + "learning_rate": 5.733743324504225e-06, + "loss": 1.5113, + "step": 12106 + }, + { + "epoch": 0.6619192761368451, + "grad_norm": 1.5883848667144775, + "learning_rate": 5.732090682106126e-06, + "loss": 1.4128, + "step": 12107 + }, + { + "epoch": 0.6619739485805667, + "grad_norm": 1.8181604146957397, + "learning_rate": 5.73043818222353e-06, + "loss": 1.3955, + "step": 12108 + }, + { + "epoch": 0.6620286210242883, + "grad_norm": 1.8883970975875854, + "learning_rate": 5.728785824911627e-06, + "loss": 1.5113, + "step": 12109 + }, + { + "epoch": 0.6620832934680098, + "grad_norm": 2.0105154514312744, + "learning_rate": 5.727133610225588e-06, + "loss": 1.6596, + "step": 12110 + }, + { + "epoch": 0.6621379659117314, + "grad_norm": 1.7659783363342285, + "learning_rate": 5.725481538220583e-06, + "loss": 1.2529, + "step": 12111 + }, + { + "epoch": 0.662192638355453, + "grad_norm": 1.6450750827789307, + "learning_rate": 5.723829608951781e-06, + "loss": 1.1905, + "step": 12112 + }, + { + "epoch": 0.6622473107991744, + "grad_norm": 1.6468333005905151, + "learning_rate": 5.722177822474342e-06, + "loss": 1.403, + "step": 12113 + }, + { + "epoch": 0.662301983242896, + "grad_norm": 1.5958598852157593, + "learning_rate": 5.72052617884342e-06, + "loss": 1.4559, + "step": 12114 + }, + { + "epoch": 0.6623566556866175, + "grad_norm": 1.7084625959396362, + "learning_rate": 5.718874678114174e-06, + "loss": 1.3483, + "step": 12115 + }, + { + "epoch": 0.6624113281303391, + "grad_norm": 1.5373917818069458, + "learning_rate": 5.717223320341751e-06, + "loss": 1.5807, + "step": 12116 + }, + { + "epoch": 0.6624660005740607, + "grad_norm": 2.4254698753356934, + "learning_rate": 5.7155721055812856e-06, + "loss": 1.3203, + "step": 12117 + }, + { + "epoch": 0.6625206730177822, + "grad_norm": 1.5433851480484009, + "learning_rate": 5.713921033887925e-06, + "loss": 1.3995, + "step": 12118 + }, + { + "epoch": 0.6625753454615038, + "grad_norm": 1.3907999992370605, + "learning_rate": 5.712270105316795e-06, + "loss": 1.5376, + "step": 12119 + }, + { + "epoch": 0.6626300179052254, + "grad_norm": 1.453791618347168, + "learning_rate": 5.710619319923032e-06, + "loss": 1.381, + "step": 12120 + }, + { + "epoch": 0.6626846903489468, + "grad_norm": 1.3836565017700195, + "learning_rate": 5.708968677761755e-06, + "loss": 1.538, + "step": 12121 + }, + { + "epoch": 0.6627393627926684, + "grad_norm": 1.6756647825241089, + "learning_rate": 5.707318178888083e-06, + "loss": 1.4736, + "step": 12122 + }, + { + "epoch": 0.66279403523639, + "grad_norm": 1.5466947555541992, + "learning_rate": 5.70566782335713e-06, + "loss": 1.5229, + "step": 12123 + }, + { + "epoch": 0.6628487076801115, + "grad_norm": 1.3315798044204712, + "learning_rate": 5.704017611224005e-06, + "loss": 1.4122, + "step": 12124 + }, + { + "epoch": 0.6629033801238331, + "grad_norm": 1.571380376815796, + "learning_rate": 5.7023675425438096e-06, + "loss": 1.4632, + "step": 12125 + }, + { + "epoch": 0.6629580525675547, + "grad_norm": 1.7986109256744385, + "learning_rate": 5.70071761737165e-06, + "loss": 1.6853, + "step": 12126 + }, + { + "epoch": 0.6630127250112762, + "grad_norm": 1.509354591369629, + "learning_rate": 5.699067835762618e-06, + "loss": 1.2967, + "step": 12127 + }, + { + "epoch": 0.6630673974549978, + "grad_norm": 1.8757177591323853, + "learning_rate": 5.6974181977718e-06, + "loss": 1.4171, + "step": 12128 + }, + { + "epoch": 0.6631220698987192, + "grad_norm": 1.3863805532455444, + "learning_rate": 5.695768703454288e-06, + "loss": 1.4717, + "step": 12129 + }, + { + "epoch": 0.6631767423424408, + "grad_norm": 1.8133536577224731, + "learning_rate": 5.69411935286516e-06, + "loss": 1.4481, + "step": 12130 + }, + { + "epoch": 0.6632314147861624, + "grad_norm": 1.410223364830017, + "learning_rate": 5.6924701460594855e-06, + "loss": 1.4595, + "step": 12131 + }, + { + "epoch": 0.6632860872298839, + "grad_norm": 1.5303014516830444, + "learning_rate": 5.690821083092346e-06, + "loss": 1.3503, + "step": 12132 + }, + { + "epoch": 0.6633407596736055, + "grad_norm": 1.6303281784057617, + "learning_rate": 5.689172164018797e-06, + "loss": 1.4888, + "step": 12133 + }, + { + "epoch": 0.6633954321173271, + "grad_norm": 1.3803743124008179, + "learning_rate": 5.687523388893911e-06, + "loss": 1.3645, + "step": 12134 + }, + { + "epoch": 0.6634501045610486, + "grad_norm": 1.868558645248413, + "learning_rate": 5.685874757772737e-06, + "loss": 1.4608, + "step": 12135 + }, + { + "epoch": 0.6635047770047702, + "grad_norm": 1.672637939453125, + "learning_rate": 5.684226270710329e-06, + "loss": 1.4243, + "step": 12136 + }, + { + "epoch": 0.6635594494484918, + "grad_norm": 1.4602006673812866, + "learning_rate": 5.682577927761732e-06, + "loss": 1.3166, + "step": 12137 + }, + { + "epoch": 0.6636141218922132, + "grad_norm": 2.1893770694732666, + "learning_rate": 5.680929728981991e-06, + "loss": 1.0759, + "step": 12138 + }, + { + "epoch": 0.6636687943359348, + "grad_norm": 1.5209428071975708, + "learning_rate": 5.679281674426136e-06, + "loss": 1.3873, + "step": 12139 + }, + { + "epoch": 0.6637234667796564, + "grad_norm": 1.3667482137680054, + "learning_rate": 5.67763376414921e-06, + "loss": 1.6047, + "step": 12140 + }, + { + "epoch": 0.6637781392233779, + "grad_norm": 2.084317684173584, + "learning_rate": 5.6759859982062345e-06, + "loss": 1.404, + "step": 12141 + }, + { + "epoch": 0.6638328116670995, + "grad_norm": 1.4982786178588867, + "learning_rate": 5.674338376652228e-06, + "loss": 1.4895, + "step": 12142 + }, + { + "epoch": 0.663887484110821, + "grad_norm": 2.426342487335205, + "learning_rate": 5.672690899542219e-06, + "loss": 1.3648, + "step": 12143 + }, + { + "epoch": 0.6639421565545426, + "grad_norm": 1.475606083869934, + "learning_rate": 5.671043566931216e-06, + "loss": 1.4875, + "step": 12144 + }, + { + "epoch": 0.6639968289982642, + "grad_norm": 1.5053048133850098, + "learning_rate": 5.6693963788742215e-06, + "loss": 1.44, + "step": 12145 + }, + { + "epoch": 0.6640515014419857, + "grad_norm": 1.4777195453643799, + "learning_rate": 5.667749335426247e-06, + "loss": 1.4111, + "step": 12146 + }, + { + "epoch": 0.6641061738857073, + "grad_norm": 1.766777753829956, + "learning_rate": 5.666102436642285e-06, + "loss": 1.2125, + "step": 12147 + }, + { + "epoch": 0.6641608463294288, + "grad_norm": 1.2133866548538208, + "learning_rate": 5.6644556825773365e-06, + "loss": 1.4974, + "step": 12148 + }, + { + "epoch": 0.6642155187731503, + "grad_norm": 1.4481922388076782, + "learning_rate": 5.662809073286387e-06, + "loss": 1.2635, + "step": 12149 + }, + { + "epoch": 0.6642701912168719, + "grad_norm": 1.8552095890045166, + "learning_rate": 5.66116260882442e-06, + "loss": 1.3738, + "step": 12150 + }, + { + "epoch": 0.6643248636605935, + "grad_norm": 1.7425163984298706, + "learning_rate": 5.659516289246414e-06, + "loss": 1.5604, + "step": 12151 + }, + { + "epoch": 0.664379536104315, + "grad_norm": 1.345739722251892, + "learning_rate": 5.657870114607346e-06, + "loss": 1.4511, + "step": 12152 + }, + { + "epoch": 0.6644342085480366, + "grad_norm": 1.2174546718597412, + "learning_rate": 5.6562240849621785e-06, + "loss": 1.4566, + "step": 12153 + }, + { + "epoch": 0.6644888809917582, + "grad_norm": 1.4578840732574463, + "learning_rate": 5.654578200365886e-06, + "loss": 1.5232, + "step": 12154 + }, + { + "epoch": 0.6645435534354797, + "grad_norm": 1.3800554275512695, + "learning_rate": 5.652932460873424e-06, + "loss": 1.5296, + "step": 12155 + }, + { + "epoch": 0.6645982258792013, + "grad_norm": 1.4602696895599365, + "learning_rate": 5.651286866539745e-06, + "loss": 1.4292, + "step": 12156 + }, + { + "epoch": 0.6646528983229227, + "grad_norm": 1.386459231376648, + "learning_rate": 5.649641417419805e-06, + "loss": 1.4936, + "step": 12157 + }, + { + "epoch": 0.6647075707666443, + "grad_norm": 1.1725199222564697, + "learning_rate": 5.647996113568547e-06, + "loss": 1.2444, + "step": 12158 + }, + { + "epoch": 0.6647622432103659, + "grad_norm": 1.546595573425293, + "learning_rate": 5.6463509550409065e-06, + "loss": 1.4489, + "step": 12159 + }, + { + "epoch": 0.6648169156540874, + "grad_norm": 1.2622255086898804, + "learning_rate": 5.644705941891828e-06, + "loss": 1.5784, + "step": 12160 + }, + { + "epoch": 0.664871588097809, + "grad_norm": 1.574855923652649, + "learning_rate": 5.643061074176239e-06, + "loss": 1.4752, + "step": 12161 + }, + { + "epoch": 0.6649262605415306, + "grad_norm": 1.4421827793121338, + "learning_rate": 5.641416351949063e-06, + "loss": 1.2867, + "step": 12162 + }, + { + "epoch": 0.6649809329852521, + "grad_norm": 1.929884672164917, + "learning_rate": 5.639771775265223e-06, + "loss": 1.5514, + "step": 12163 + }, + { + "epoch": 0.6650356054289737, + "grad_norm": 2.023993968963623, + "learning_rate": 5.638127344179632e-06, + "loss": 1.2097, + "step": 12164 + }, + { + "epoch": 0.6650902778726953, + "grad_norm": 1.2796655893325806, + "learning_rate": 5.636483058747209e-06, + "loss": 1.415, + "step": 12165 + }, + { + "epoch": 0.6651449503164167, + "grad_norm": 1.5390894412994385, + "learning_rate": 5.634838919022855e-06, + "loss": 1.5642, + "step": 12166 + }, + { + "epoch": 0.6651996227601383, + "grad_norm": 1.844823956489563, + "learning_rate": 5.633194925061469e-06, + "loss": 1.3501, + "step": 12167 + }, + { + "epoch": 0.6652542952038599, + "grad_norm": 1.4272526502609253, + "learning_rate": 5.631551076917955e-06, + "loss": 1.4809, + "step": 12168 + }, + { + "epoch": 0.6653089676475814, + "grad_norm": 1.4822533130645752, + "learning_rate": 5.629907374647201e-06, + "loss": 1.4622, + "step": 12169 + }, + { + "epoch": 0.665363640091303, + "grad_norm": 1.4875508546829224, + "learning_rate": 5.628263818304091e-06, + "loss": 1.3031, + "step": 12170 + }, + { + "epoch": 0.6654183125350245, + "grad_norm": 1.75388765335083, + "learning_rate": 5.626620407943515e-06, + "loss": 1.5741, + "step": 12171 + }, + { + "epoch": 0.6654729849787461, + "grad_norm": 1.543782353401184, + "learning_rate": 5.624977143620347e-06, + "loss": 1.3947, + "step": 12172 + }, + { + "epoch": 0.6655276574224677, + "grad_norm": 1.3283405303955078, + "learning_rate": 5.623334025389453e-06, + "loss": 1.3879, + "step": 12173 + }, + { + "epoch": 0.6655823298661891, + "grad_norm": 1.6949957609176636, + "learning_rate": 5.62169105330571e-06, + "loss": 1.6171, + "step": 12174 + }, + { + "epoch": 0.6656370023099107, + "grad_norm": 1.4266337156295776, + "learning_rate": 5.620048227423977e-06, + "loss": 1.48, + "step": 12175 + }, + { + "epoch": 0.6656916747536323, + "grad_norm": 1.1086604595184326, + "learning_rate": 5.618405547799113e-06, + "loss": 1.6041, + "step": 12176 + }, + { + "epoch": 0.6657463471973538, + "grad_norm": 1.3882771730422974, + "learning_rate": 5.616763014485967e-06, + "loss": 1.616, + "step": 12177 + }, + { + "epoch": 0.6658010196410754, + "grad_norm": 1.5948916673660278, + "learning_rate": 5.615120627539387e-06, + "loss": 1.649, + "step": 12178 + }, + { + "epoch": 0.665855692084797, + "grad_norm": 1.7762349843978882, + "learning_rate": 5.613478387014223e-06, + "loss": 1.2119, + "step": 12179 + }, + { + "epoch": 0.6659103645285185, + "grad_norm": 1.2482036352157593, + "learning_rate": 5.611836292965308e-06, + "loss": 1.6522, + "step": 12180 + }, + { + "epoch": 0.6659650369722401, + "grad_norm": 1.437142014503479, + "learning_rate": 5.6101943454474725e-06, + "loss": 1.4939, + "step": 12181 + }, + { + "epoch": 0.6660197094159617, + "grad_norm": 1.2740416526794434, + "learning_rate": 5.608552544515553e-06, + "loss": 1.31, + "step": 12182 + }, + { + "epoch": 0.6660743818596832, + "grad_norm": 1.9903019666671753, + "learning_rate": 5.60691089022437e-06, + "loss": 1.4505, + "step": 12183 + }, + { + "epoch": 0.6661290543034047, + "grad_norm": 1.1925016641616821, + "learning_rate": 5.605269382628735e-06, + "loss": 1.5433, + "step": 12184 + }, + { + "epoch": 0.6661837267471262, + "grad_norm": 1.654996633529663, + "learning_rate": 5.603628021783474e-06, + "loss": 1.4259, + "step": 12185 + }, + { + "epoch": 0.6662383991908478, + "grad_norm": 1.4840328693389893, + "learning_rate": 5.601986807743388e-06, + "loss": 1.479, + "step": 12186 + }, + { + "epoch": 0.6662930716345694, + "grad_norm": 1.721126675605774, + "learning_rate": 5.60034574056328e-06, + "loss": 1.6445, + "step": 12187 + }, + { + "epoch": 0.6663477440782909, + "grad_norm": 1.8059290647506714, + "learning_rate": 5.598704820297955e-06, + "loss": 1.5007, + "step": 12188 + }, + { + "epoch": 0.6664024165220125, + "grad_norm": 1.657744288444519, + "learning_rate": 5.597064047002204e-06, + "loss": 1.3749, + "step": 12189 + }, + { + "epoch": 0.6664570889657341, + "grad_norm": 2.0633790493011475, + "learning_rate": 5.595423420730816e-06, + "loss": 1.3803, + "step": 12190 + }, + { + "epoch": 0.6665117614094556, + "grad_norm": 1.4820798635482788, + "learning_rate": 5.593782941538577e-06, + "loss": 1.3049, + "step": 12191 + }, + { + "epoch": 0.6665664338531772, + "grad_norm": 1.552718997001648, + "learning_rate": 5.59214260948026e-06, + "loss": 1.3224, + "step": 12192 + }, + { + "epoch": 0.6666211062968987, + "grad_norm": 1.5847017765045166, + "learning_rate": 5.5905024246106485e-06, + "loss": 1.4759, + "step": 12193 + }, + { + "epoch": 0.6666757787406202, + "grad_norm": 1.3022193908691406, + "learning_rate": 5.5888623869845095e-06, + "loss": 1.2644, + "step": 12194 + }, + { + "epoch": 0.6667304511843418, + "grad_norm": 1.6226027011871338, + "learning_rate": 5.587222496656601e-06, + "loss": 1.3591, + "step": 12195 + }, + { + "epoch": 0.6667851236280634, + "grad_norm": 1.436389446258545, + "learning_rate": 5.585582753681693e-06, + "loss": 1.4214, + "step": 12196 + }, + { + "epoch": 0.6668397960717849, + "grad_norm": 1.5434503555297852, + "learning_rate": 5.583943158114535e-06, + "loss": 1.3939, + "step": 12197 + }, + { + "epoch": 0.6668944685155065, + "grad_norm": 1.6716411113739014, + "learning_rate": 5.5823037100098756e-06, + "loss": 1.4331, + "step": 12198 + }, + { + "epoch": 0.666949140959228, + "grad_norm": 1.2519069910049438, + "learning_rate": 5.580664409422464e-06, + "loss": 1.4867, + "step": 12199 + }, + { + "epoch": 0.6670038134029496, + "grad_norm": 1.572237253189087, + "learning_rate": 5.579025256407038e-06, + "loss": 1.298, + "step": 12200 + }, + { + "epoch": 0.6670584858466712, + "grad_norm": 1.4684803485870361, + "learning_rate": 5.577386251018333e-06, + "loss": 1.4966, + "step": 12201 + }, + { + "epoch": 0.6671131582903926, + "grad_norm": 1.3803117275238037, + "learning_rate": 5.57574739331108e-06, + "loss": 1.4916, + "step": 12202 + }, + { + "epoch": 0.6671678307341142, + "grad_norm": 1.5074208974838257, + "learning_rate": 5.5741086833400015e-06, + "loss": 1.5647, + "step": 12203 + }, + { + "epoch": 0.6672225031778358, + "grad_norm": 1.3021845817565918, + "learning_rate": 5.572470121159816e-06, + "loss": 1.5017, + "step": 12204 + }, + { + "epoch": 0.6672771756215573, + "grad_norm": 1.3521803617477417, + "learning_rate": 5.570831706825248e-06, + "loss": 1.3669, + "step": 12205 + }, + { + "epoch": 0.6673318480652789, + "grad_norm": 1.5732117891311646, + "learning_rate": 5.569193440390999e-06, + "loss": 1.669, + "step": 12206 + }, + { + "epoch": 0.6673865205090005, + "grad_norm": 1.9044480323791504, + "learning_rate": 5.567555321911782e-06, + "loss": 1.304, + "step": 12207 + }, + { + "epoch": 0.667441192952722, + "grad_norm": 1.397766351699829, + "learning_rate": 5.565917351442291e-06, + "loss": 1.4541, + "step": 12208 + }, + { + "epoch": 0.6674958653964436, + "grad_norm": 2.294172525405884, + "learning_rate": 5.5642795290372245e-06, + "loss": 1.3856, + "step": 12209 + }, + { + "epoch": 0.6675505378401652, + "grad_norm": 1.775654911994934, + "learning_rate": 5.562641854751275e-06, + "loss": 1.6126, + "step": 12210 + }, + { + "epoch": 0.6676052102838866, + "grad_norm": 1.4803513288497925, + "learning_rate": 5.561004328639128e-06, + "loss": 1.5567, + "step": 12211 + }, + { + "epoch": 0.6676598827276082, + "grad_norm": 1.5558598041534424, + "learning_rate": 5.559366950755457e-06, + "loss": 1.408, + "step": 12212 + }, + { + "epoch": 0.6677145551713297, + "grad_norm": 2.2779250144958496, + "learning_rate": 5.557729721154949e-06, + "loss": 1.2399, + "step": 12213 + }, + { + "epoch": 0.6677692276150513, + "grad_norm": 1.2815122604370117, + "learning_rate": 5.55609263989227e-06, + "loss": 1.4754, + "step": 12214 + }, + { + "epoch": 0.6678239000587729, + "grad_norm": 1.4088445901870728, + "learning_rate": 5.554455707022084e-06, + "loss": 1.3937, + "step": 12215 + }, + { + "epoch": 0.6678785725024944, + "grad_norm": 1.7520508766174316, + "learning_rate": 5.552818922599056e-06, + "loss": 1.4207, + "step": 12216 + }, + { + "epoch": 0.667933244946216, + "grad_norm": 1.0425022840499878, + "learning_rate": 5.551182286677839e-06, + "loss": 1.5949, + "step": 12217 + }, + { + "epoch": 0.6679879173899376, + "grad_norm": 1.461412787437439, + "learning_rate": 5.549545799313081e-06, + "loss": 1.3908, + "step": 12218 + }, + { + "epoch": 0.668042589833659, + "grad_norm": 1.4805569648742676, + "learning_rate": 5.5479094605594355e-06, + "loss": 1.6969, + "step": 12219 + }, + { + "epoch": 0.6680972622773806, + "grad_norm": 1.394388198852539, + "learning_rate": 5.546273270471536e-06, + "loss": 1.5654, + "step": 12220 + }, + { + "epoch": 0.6681519347211022, + "grad_norm": 1.3455147743225098, + "learning_rate": 5.544637229104027e-06, + "loss": 1.4486, + "step": 12221 + }, + { + "epoch": 0.6682066071648237, + "grad_norm": 1.560817003250122, + "learning_rate": 5.543001336511537e-06, + "loss": 1.3039, + "step": 12222 + }, + { + "epoch": 0.6682612796085453, + "grad_norm": 1.6124436855316162, + "learning_rate": 5.541365592748686e-06, + "loss": 1.6838, + "step": 12223 + }, + { + "epoch": 0.6683159520522669, + "grad_norm": 1.7208863496780396, + "learning_rate": 5.539729997870104e-06, + "loss": 1.332, + "step": 12224 + }, + { + "epoch": 0.6683706244959884, + "grad_norm": 1.5706968307495117, + "learning_rate": 5.538094551930405e-06, + "loss": 1.5232, + "step": 12225 + }, + { + "epoch": 0.66842529693971, + "grad_norm": 1.934644103050232, + "learning_rate": 5.536459254984195e-06, + "loss": 1.5697, + "step": 12226 + }, + { + "epoch": 0.6684799693834315, + "grad_norm": 1.1842858791351318, + "learning_rate": 5.534824107086088e-06, + "loss": 1.4207, + "step": 12227 + }, + { + "epoch": 0.668534641827153, + "grad_norm": 1.3600023984909058, + "learning_rate": 5.533189108290682e-06, + "loss": 1.6653, + "step": 12228 + }, + { + "epoch": 0.6685893142708746, + "grad_norm": 1.2209062576293945, + "learning_rate": 5.531554258652574e-06, + "loss": 1.4826, + "step": 12229 + }, + { + "epoch": 0.6686439867145961, + "grad_norm": 1.6088073253631592, + "learning_rate": 5.529919558226353e-06, + "loss": 1.5663, + "step": 12230 + }, + { + "epoch": 0.6686986591583177, + "grad_norm": 1.4805680513381958, + "learning_rate": 5.528285007066609e-06, + "loss": 1.4305, + "step": 12231 + }, + { + "epoch": 0.6687533316020393, + "grad_norm": 1.5092768669128418, + "learning_rate": 5.5266506052279165e-06, + "loss": 1.3225, + "step": 12232 + }, + { + "epoch": 0.6688080040457608, + "grad_norm": 1.7515885829925537, + "learning_rate": 5.525016352764863e-06, + "loss": 1.6235, + "step": 12233 + }, + { + "epoch": 0.6688626764894824, + "grad_norm": 1.3944107294082642, + "learning_rate": 5.52338224973201e-06, + "loss": 1.1647, + "step": 12234 + }, + { + "epoch": 0.668917348933204, + "grad_norm": 1.5474927425384521, + "learning_rate": 5.5217482961839305e-06, + "loss": 1.3939, + "step": 12235 + }, + { + "epoch": 0.6689720213769255, + "grad_norm": 1.1736775636672974, + "learning_rate": 5.520114492175186e-06, + "loss": 1.5875, + "step": 12236 + }, + { + "epoch": 0.6690266938206471, + "grad_norm": 1.7875049114227295, + "learning_rate": 5.518480837760327e-06, + "loss": 1.2913, + "step": 12237 + }, + { + "epoch": 0.6690813662643686, + "grad_norm": 1.6011027097702026, + "learning_rate": 5.516847332993912e-06, + "loss": 1.2574, + "step": 12238 + }, + { + "epoch": 0.6691360387080901, + "grad_norm": 1.758351445198059, + "learning_rate": 5.515213977930485e-06, + "loss": 1.6532, + "step": 12239 + }, + { + "epoch": 0.6691907111518117, + "grad_norm": 1.6037167310714722, + "learning_rate": 5.513580772624587e-06, + "loss": 1.4237, + "step": 12240 + }, + { + "epoch": 0.6692453835955333, + "grad_norm": 1.5019675493240356, + "learning_rate": 5.511947717130755e-06, + "loss": 1.4776, + "step": 12241 + }, + { + "epoch": 0.6693000560392548, + "grad_norm": 2.130033254623413, + "learning_rate": 5.51031481150352e-06, + "loss": 1.3, + "step": 12242 + }, + { + "epoch": 0.6693547284829764, + "grad_norm": 1.8396856784820557, + "learning_rate": 5.508682055797405e-06, + "loss": 1.6616, + "step": 12243 + }, + { + "epoch": 0.6694094009266979, + "grad_norm": 1.5393003225326538, + "learning_rate": 5.507049450066939e-06, + "loss": 1.3386, + "step": 12244 + }, + { + "epoch": 0.6694640733704195, + "grad_norm": 1.4322988986968994, + "learning_rate": 5.505416994366634e-06, + "loss": 1.5001, + "step": 12245 + }, + { + "epoch": 0.6695187458141411, + "grad_norm": 1.5654449462890625, + "learning_rate": 5.5037846887510015e-06, + "loss": 1.5243, + "step": 12246 + }, + { + "epoch": 0.6695734182578625, + "grad_norm": 1.589277744293213, + "learning_rate": 5.502152533274551e-06, + "loss": 1.4102, + "step": 12247 + }, + { + "epoch": 0.6696280907015841, + "grad_norm": 1.557766318321228, + "learning_rate": 5.500520527991777e-06, + "loss": 1.6195, + "step": 12248 + }, + { + "epoch": 0.6696827631453057, + "grad_norm": 1.3316432237625122, + "learning_rate": 5.498888672957187e-06, + "loss": 1.4814, + "step": 12249 + }, + { + "epoch": 0.6697374355890272, + "grad_norm": 1.1976549625396729, + "learning_rate": 5.497256968225264e-06, + "loss": 1.5525, + "step": 12250 + }, + { + "epoch": 0.6697921080327488, + "grad_norm": 1.5845413208007812, + "learning_rate": 5.495625413850495e-06, + "loss": 1.2008, + "step": 12251 + }, + { + "epoch": 0.6698467804764704, + "grad_norm": 1.2894335985183716, + "learning_rate": 5.493994009887367e-06, + "loss": 1.4035, + "step": 12252 + }, + { + "epoch": 0.6699014529201919, + "grad_norm": 1.5457427501678467, + "learning_rate": 5.492362756390353e-06, + "loss": 1.4202, + "step": 12253 + }, + { + "epoch": 0.6699561253639135, + "grad_norm": 1.5995274782180786, + "learning_rate": 5.490731653413922e-06, + "loss": 1.1284, + "step": 12254 + }, + { + "epoch": 0.6700107978076351, + "grad_norm": 1.42861008644104, + "learning_rate": 5.489100701012544e-06, + "loss": 1.4153, + "step": 12255 + }, + { + "epoch": 0.6700654702513565, + "grad_norm": 1.2866439819335938, + "learning_rate": 5.487469899240678e-06, + "loss": 1.4185, + "step": 12256 + }, + { + "epoch": 0.6701201426950781, + "grad_norm": 1.4733861684799194, + "learning_rate": 5.485839248152778e-06, + "loss": 1.3412, + "step": 12257 + }, + { + "epoch": 0.6701748151387996, + "grad_norm": 1.4220763444900513, + "learning_rate": 5.484208747803301e-06, + "loss": 1.5062, + "step": 12258 + }, + { + "epoch": 0.6702294875825212, + "grad_norm": 1.7282716035842896, + "learning_rate": 5.482578398246692e-06, + "loss": 1.5367, + "step": 12259 + }, + { + "epoch": 0.6702841600262428, + "grad_norm": 1.7237529754638672, + "learning_rate": 5.480948199537386e-06, + "loss": 1.5581, + "step": 12260 + }, + { + "epoch": 0.6703388324699643, + "grad_norm": 1.2583831548690796, + "learning_rate": 5.479318151729828e-06, + "loss": 1.751, + "step": 12261 + }, + { + "epoch": 0.6703935049136859, + "grad_norm": 1.3695803880691528, + "learning_rate": 5.47768825487844e-06, + "loss": 1.3629, + "step": 12262 + }, + { + "epoch": 0.6704481773574075, + "grad_norm": 1.5707759857177734, + "learning_rate": 5.476058509037658e-06, + "loss": 1.476, + "step": 12263 + }, + { + "epoch": 0.670502849801129, + "grad_norm": 1.2824716567993164, + "learning_rate": 5.474428914261899e-06, + "loss": 1.385, + "step": 12264 + }, + { + "epoch": 0.6705575222448505, + "grad_norm": 1.4603885412216187, + "learning_rate": 5.472799470605573e-06, + "loss": 1.4422, + "step": 12265 + }, + { + "epoch": 0.6706121946885721, + "grad_norm": 1.9065943956375122, + "learning_rate": 5.4711701781231e-06, + "loss": 1.1837, + "step": 12266 + }, + { + "epoch": 0.6706668671322936, + "grad_norm": 1.6403433084487915, + "learning_rate": 5.469541036868883e-06, + "loss": 1.4058, + "step": 12267 + }, + { + "epoch": 0.6707215395760152, + "grad_norm": 1.5022441148757935, + "learning_rate": 5.467912046897321e-06, + "loss": 1.2774, + "step": 12268 + }, + { + "epoch": 0.6707762120197368, + "grad_norm": 1.4657825231552124, + "learning_rate": 5.46628320826281e-06, + "loss": 1.4589, + "step": 12269 + }, + { + "epoch": 0.6708308844634583, + "grad_norm": 1.7733272314071655, + "learning_rate": 5.4646545210197435e-06, + "loss": 1.5728, + "step": 12270 + }, + { + "epoch": 0.6708855569071799, + "grad_norm": 1.4455355405807495, + "learning_rate": 5.4630259852225e-06, + "loss": 1.4153, + "step": 12271 + }, + { + "epoch": 0.6709402293509014, + "grad_norm": 1.383183479309082, + "learning_rate": 5.461397600925469e-06, + "loss": 1.6641, + "step": 12272 + }, + { + "epoch": 0.670994901794623, + "grad_norm": 1.3958512544631958, + "learning_rate": 5.459769368183022e-06, + "loss": 1.3852, + "step": 12273 + }, + { + "epoch": 0.6710495742383445, + "grad_norm": 1.5576096773147583, + "learning_rate": 5.458141287049525e-06, + "loss": 1.5357, + "step": 12274 + }, + { + "epoch": 0.671104246682066, + "grad_norm": 1.733100175857544, + "learning_rate": 5.456513357579354e-06, + "loss": 1.4083, + "step": 12275 + }, + { + "epoch": 0.6711589191257876, + "grad_norm": 2.0618932247161865, + "learning_rate": 5.4548855798268595e-06, + "loss": 1.5188, + "step": 12276 + }, + { + "epoch": 0.6712135915695092, + "grad_norm": 1.2713706493377686, + "learning_rate": 5.453257953846405e-06, + "loss": 1.5363, + "step": 12277 + }, + { + "epoch": 0.6712682640132307, + "grad_norm": 1.1571904420852661, + "learning_rate": 5.451630479692336e-06, + "loss": 1.4945, + "step": 12278 + }, + { + "epoch": 0.6713229364569523, + "grad_norm": 1.5684782266616821, + "learning_rate": 5.450003157418997e-06, + "loss": 1.4279, + "step": 12279 + }, + { + "epoch": 0.6713776089006739, + "grad_norm": 1.5875123739242554, + "learning_rate": 5.448375987080732e-06, + "loss": 1.4665, + "step": 12280 + }, + { + "epoch": 0.6714322813443954, + "grad_norm": 1.3871407508850098, + "learning_rate": 5.446748968731872e-06, + "loss": 1.439, + "step": 12281 + }, + { + "epoch": 0.671486953788117, + "grad_norm": 1.5077790021896362, + "learning_rate": 5.445122102426745e-06, + "loss": 1.4598, + "step": 12282 + }, + { + "epoch": 0.6715416262318386, + "grad_norm": 1.6069869995117188, + "learning_rate": 5.443495388219684e-06, + "loss": 1.4528, + "step": 12283 + }, + { + "epoch": 0.67159629867556, + "grad_norm": 1.2903465032577515, + "learning_rate": 5.441868826165002e-06, + "loss": 1.6441, + "step": 12284 + }, + { + "epoch": 0.6716509711192816, + "grad_norm": 1.6662055253982544, + "learning_rate": 5.4402424163170145e-06, + "loss": 1.3931, + "step": 12285 + }, + { + "epoch": 0.6717056435630031, + "grad_norm": 1.274679183959961, + "learning_rate": 5.438616158730034e-06, + "loss": 1.1555, + "step": 12286 + }, + { + "epoch": 0.6717603160067247, + "grad_norm": 1.4671368598937988, + "learning_rate": 5.436990053458365e-06, + "loss": 1.3033, + "step": 12287 + }, + { + "epoch": 0.6718149884504463, + "grad_norm": 1.393762230873108, + "learning_rate": 5.435364100556302e-06, + "loss": 1.4281, + "step": 12288 + }, + { + "epoch": 0.6718696608941678, + "grad_norm": 1.489008903503418, + "learning_rate": 5.433738300078146e-06, + "loss": 1.1779, + "step": 12289 + }, + { + "epoch": 0.6719243333378894, + "grad_norm": 1.4634507894515991, + "learning_rate": 5.43211265207818e-06, + "loss": 1.4985, + "step": 12290 + }, + { + "epoch": 0.671979005781611, + "grad_norm": 1.627068042755127, + "learning_rate": 5.430487156610695e-06, + "loss": 1.3582, + "step": 12291 + }, + { + "epoch": 0.6720336782253324, + "grad_norm": 1.4521101713180542, + "learning_rate": 5.428861813729966e-06, + "loss": 1.4526, + "step": 12292 + }, + { + "epoch": 0.672088350669054, + "grad_norm": 1.3412554264068604, + "learning_rate": 5.4272366234902685e-06, + "loss": 1.4238, + "step": 12293 + }, + { + "epoch": 0.6721430231127756, + "grad_norm": 1.5206526517868042, + "learning_rate": 5.42561158594587e-06, + "loss": 1.5071, + "step": 12294 + }, + { + "epoch": 0.6721976955564971, + "grad_norm": 1.8256851434707642, + "learning_rate": 5.423986701151035e-06, + "loss": 1.4403, + "step": 12295 + }, + { + "epoch": 0.6722523680002187, + "grad_norm": 1.3563714027404785, + "learning_rate": 5.4223619691600185e-06, + "loss": 1.6123, + "step": 12296 + }, + { + "epoch": 0.6723070404439403, + "grad_norm": 1.606490969657898, + "learning_rate": 5.42073739002708e-06, + "loss": 1.7899, + "step": 12297 + }, + { + "epoch": 0.6723617128876618, + "grad_norm": 1.3322287797927856, + "learning_rate": 5.419112963806468e-06, + "loss": 1.6089, + "step": 12298 + }, + { + "epoch": 0.6724163853313834, + "grad_norm": 1.5686360597610474, + "learning_rate": 5.417488690552417e-06, + "loss": 1.6678, + "step": 12299 + }, + { + "epoch": 0.6724710577751049, + "grad_norm": 1.7824188470840454, + "learning_rate": 5.415864570319177e-06, + "loss": 1.3656, + "step": 12300 + }, + { + "epoch": 0.6725257302188264, + "grad_norm": 1.8166792392730713, + "learning_rate": 5.414240603160976e-06, + "loss": 1.7022, + "step": 12301 + }, + { + "epoch": 0.672580402662548, + "grad_norm": 1.126756191253662, + "learning_rate": 5.412616789132038e-06, + "loss": 1.4214, + "step": 12302 + }, + { + "epoch": 0.6726350751062695, + "grad_norm": 1.8435667753219604, + "learning_rate": 5.4109931282865945e-06, + "loss": 1.3327, + "step": 12303 + }, + { + "epoch": 0.6726897475499911, + "grad_norm": 1.3795140981674194, + "learning_rate": 5.409369620678855e-06, + "loss": 1.496, + "step": 12304 + }, + { + "epoch": 0.6727444199937127, + "grad_norm": 1.3637200593948364, + "learning_rate": 5.407746266363039e-06, + "loss": 1.788, + "step": 12305 + }, + { + "epoch": 0.6727990924374342, + "grad_norm": 1.3588223457336426, + "learning_rate": 5.406123065393352e-06, + "loss": 1.5948, + "step": 12306 + }, + { + "epoch": 0.6728537648811558, + "grad_norm": 1.5084338188171387, + "learning_rate": 5.4045000178239945e-06, + "loss": 1.5086, + "step": 12307 + }, + { + "epoch": 0.6729084373248774, + "grad_norm": 1.6962131261825562, + "learning_rate": 5.402877123709167e-06, + "loss": 1.4153, + "step": 12308 + }, + { + "epoch": 0.6729631097685989, + "grad_norm": 1.2835849523544312, + "learning_rate": 5.401254383103058e-06, + "loss": 1.3669, + "step": 12309 + }, + { + "epoch": 0.6730177822123204, + "grad_norm": 1.3791741132736206, + "learning_rate": 5.399631796059854e-06, + "loss": 1.3293, + "step": 12310 + }, + { + "epoch": 0.673072454656042, + "grad_norm": 1.3557493686676025, + "learning_rate": 5.398009362633743e-06, + "loss": 1.5048, + "step": 12311 + }, + { + "epoch": 0.6731271270997635, + "grad_norm": 1.3714812994003296, + "learning_rate": 5.3963870828788975e-06, + "loss": 1.57, + "step": 12312 + }, + { + "epoch": 0.6731817995434851, + "grad_norm": 1.9932794570922852, + "learning_rate": 5.394764956849488e-06, + "loss": 1.4114, + "step": 12313 + }, + { + "epoch": 0.6732364719872066, + "grad_norm": 1.3562285900115967, + "learning_rate": 5.393142984599684e-06, + "loss": 1.3912, + "step": 12314 + }, + { + "epoch": 0.6732911444309282, + "grad_norm": 1.4944006204605103, + "learning_rate": 5.3915211661836485e-06, + "loss": 1.4179, + "step": 12315 + }, + { + "epoch": 0.6733458168746498, + "grad_norm": 1.686273455619812, + "learning_rate": 5.389899501655531e-06, + "loss": 1.2585, + "step": 12316 + }, + { + "epoch": 0.6734004893183713, + "grad_norm": 1.286990761756897, + "learning_rate": 5.388277991069491e-06, + "loss": 1.5859, + "step": 12317 + }, + { + "epoch": 0.6734551617620929, + "grad_norm": 1.3033316135406494, + "learning_rate": 5.38665663447967e-06, + "loss": 1.6064, + "step": 12318 + }, + { + "epoch": 0.6735098342058145, + "grad_norm": 1.526414155960083, + "learning_rate": 5.3850354319402095e-06, + "loss": 1.3959, + "step": 12319 + }, + { + "epoch": 0.6735645066495359, + "grad_norm": 1.8107295036315918, + "learning_rate": 5.383414383505245e-06, + "loss": 1.4205, + "step": 12320 + }, + { + "epoch": 0.6736191790932575, + "grad_norm": 1.2225075960159302, + "learning_rate": 5.381793489228906e-06, + "loss": 1.5437, + "step": 12321 + }, + { + "epoch": 0.6736738515369791, + "grad_norm": 1.3942145109176636, + "learning_rate": 5.380172749165321e-06, + "loss": 1.5995, + "step": 12322 + }, + { + "epoch": 0.6737285239807006, + "grad_norm": 1.7254900932312012, + "learning_rate": 5.378552163368609e-06, + "loss": 1.257, + "step": 12323 + }, + { + "epoch": 0.6737831964244222, + "grad_norm": 1.4133708477020264, + "learning_rate": 5.3769317318928805e-06, + "loss": 1.5414, + "step": 12324 + }, + { + "epoch": 0.6738378688681438, + "grad_norm": 1.846585988998413, + "learning_rate": 5.375311454792255e-06, + "loss": 1.4135, + "step": 12325 + }, + { + "epoch": 0.6738925413118653, + "grad_norm": 1.3269721269607544, + "learning_rate": 5.373691332120832e-06, + "loss": 1.5027, + "step": 12326 + }, + { + "epoch": 0.6739472137555869, + "grad_norm": 1.886288046836853, + "learning_rate": 5.372071363932706e-06, + "loss": 1.2983, + "step": 12327 + }, + { + "epoch": 0.6740018861993083, + "grad_norm": 1.3433475494384766, + "learning_rate": 5.370451550281982e-06, + "loss": 1.3276, + "step": 12328 + }, + { + "epoch": 0.6740565586430299, + "grad_norm": 1.2694685459136963, + "learning_rate": 5.368831891222744e-06, + "loss": 1.5631, + "step": 12329 + }, + { + "epoch": 0.6741112310867515, + "grad_norm": 1.330632209777832, + "learning_rate": 5.367212386809073e-06, + "loss": 1.3867, + "step": 12330 + }, + { + "epoch": 0.674165903530473, + "grad_norm": 1.4220527410507202, + "learning_rate": 5.365593037095055e-06, + "loss": 1.4064, + "step": 12331 + }, + { + "epoch": 0.6742205759741946, + "grad_norm": 2.0927441120147705, + "learning_rate": 5.363973842134761e-06, + "loss": 1.7347, + "step": 12332 + }, + { + "epoch": 0.6742752484179162, + "grad_norm": 1.4945731163024902, + "learning_rate": 5.362354801982259e-06, + "loss": 1.3003, + "step": 12333 + }, + { + "epoch": 0.6743299208616377, + "grad_norm": 1.6478601694107056, + "learning_rate": 5.360735916691613e-06, + "loss": 1.6393, + "step": 12334 + }, + { + "epoch": 0.6743845933053593, + "grad_norm": 1.2497292757034302, + "learning_rate": 5.359117186316875e-06, + "loss": 1.2657, + "step": 12335 + }, + { + "epoch": 0.6744392657490809, + "grad_norm": 1.8691195249557495, + "learning_rate": 5.357498610912111e-06, + "loss": 1.4329, + "step": 12336 + }, + { + "epoch": 0.6744939381928023, + "grad_norm": 1.5005789995193481, + "learning_rate": 5.355880190531362e-06, + "loss": 1.5447, + "step": 12337 + }, + { + "epoch": 0.6745486106365239, + "grad_norm": 1.749997854232788, + "learning_rate": 5.354261925228666e-06, + "loss": 1.1489, + "step": 12338 + }, + { + "epoch": 0.6746032830802455, + "grad_norm": 1.3944491147994995, + "learning_rate": 5.3526438150580705e-06, + "loss": 1.6429, + "step": 12339 + }, + { + "epoch": 0.674657955523967, + "grad_norm": 1.6292062997817993, + "learning_rate": 5.351025860073604e-06, + "loss": 1.5534, + "step": 12340 + }, + { + "epoch": 0.6747126279676886, + "grad_norm": 1.0675824880599976, + "learning_rate": 5.349408060329288e-06, + "loss": 1.6161, + "step": 12341 + }, + { + "epoch": 0.6747673004114101, + "grad_norm": 1.4464528560638428, + "learning_rate": 5.347790415879155e-06, + "loss": 1.3997, + "step": 12342 + }, + { + "epoch": 0.6748219728551317, + "grad_norm": 1.773097038269043, + "learning_rate": 5.346172926777215e-06, + "loss": 1.5372, + "step": 12343 + }, + { + "epoch": 0.6748766452988533, + "grad_norm": 1.5190612077713013, + "learning_rate": 5.344555593077483e-06, + "loss": 1.4797, + "step": 12344 + }, + { + "epoch": 0.6749313177425748, + "grad_norm": 1.5679421424865723, + "learning_rate": 5.342938414833965e-06, + "loss": 1.4238, + "step": 12345 + }, + { + "epoch": 0.6749859901862963, + "grad_norm": 1.4038479328155518, + "learning_rate": 5.341321392100656e-06, + "loss": 1.3581, + "step": 12346 + }, + { + "epoch": 0.6750406626300179, + "grad_norm": 1.545367956161499, + "learning_rate": 5.3397045249315615e-06, + "loss": 1.6487, + "step": 12347 + }, + { + "epoch": 0.6750953350737394, + "grad_norm": 1.1562834978103638, + "learning_rate": 5.338087813380669e-06, + "loss": 1.6062, + "step": 12348 + }, + { + "epoch": 0.675150007517461, + "grad_norm": 1.331600308418274, + "learning_rate": 5.336471257501961e-06, + "loss": 1.5116, + "step": 12349 + }, + { + "epoch": 0.6752046799611826, + "grad_norm": 1.6378180980682373, + "learning_rate": 5.334854857349423e-06, + "loss": 1.1221, + "step": 12350 + }, + { + "epoch": 0.6752593524049041, + "grad_norm": 1.5906853675842285, + "learning_rate": 5.3332386129770295e-06, + "loss": 1.4997, + "step": 12351 + }, + { + "epoch": 0.6753140248486257, + "grad_norm": 1.3639429807662964, + "learning_rate": 5.331622524438745e-06, + "loss": 1.3663, + "step": 12352 + }, + { + "epoch": 0.6753686972923473, + "grad_norm": 1.6702991724014282, + "learning_rate": 5.330006591788543e-06, + "loss": 1.458, + "step": 12353 + }, + { + "epoch": 0.6754233697360688, + "grad_norm": 1.752416968345642, + "learning_rate": 5.328390815080381e-06, + "loss": 1.4055, + "step": 12354 + }, + { + "epoch": 0.6754780421797904, + "grad_norm": 1.587344765663147, + "learning_rate": 5.326775194368208e-06, + "loss": 1.399, + "step": 12355 + }, + { + "epoch": 0.6755327146235118, + "grad_norm": 1.5720561742782593, + "learning_rate": 5.32515972970598e-06, + "loss": 1.3991, + "step": 12356 + }, + { + "epoch": 0.6755873870672334, + "grad_norm": 1.657043218612671, + "learning_rate": 5.323544421147637e-06, + "loss": 1.1553, + "step": 12357 + }, + { + "epoch": 0.675642059510955, + "grad_norm": 1.5524555444717407, + "learning_rate": 5.3219292687471226e-06, + "loss": 1.4506, + "step": 12358 + }, + { + "epoch": 0.6756967319546765, + "grad_norm": 1.2381391525268555, + "learning_rate": 5.320314272558366e-06, + "loss": 1.155, + "step": 12359 + }, + { + "epoch": 0.6757514043983981, + "grad_norm": 1.4602104425430298, + "learning_rate": 5.3186994326352926e-06, + "loss": 1.5486, + "step": 12360 + }, + { + "epoch": 0.6758060768421197, + "grad_norm": 1.326181173324585, + "learning_rate": 5.317084749031835e-06, + "loss": 1.5201, + "step": 12361 + }, + { + "epoch": 0.6758607492858412, + "grad_norm": 1.265602946281433, + "learning_rate": 5.315470221801906e-06, + "loss": 1.4683, + "step": 12362 + }, + { + "epoch": 0.6759154217295628, + "grad_norm": 1.2848204374313354, + "learning_rate": 5.313855850999414e-06, + "loss": 1.6237, + "step": 12363 + }, + { + "epoch": 0.6759700941732844, + "grad_norm": 1.7585488557815552, + "learning_rate": 5.312241636678277e-06, + "loss": 1.5759, + "step": 12364 + }, + { + "epoch": 0.6760247666170058, + "grad_norm": 1.6654938459396362, + "learning_rate": 5.310627578892391e-06, + "loss": 1.5198, + "step": 12365 + }, + { + "epoch": 0.6760794390607274, + "grad_norm": 1.5881915092468262, + "learning_rate": 5.309013677695651e-06, + "loss": 1.496, + "step": 12366 + }, + { + "epoch": 0.676134111504449, + "grad_norm": 1.498260259628296, + "learning_rate": 5.307399933141955e-06, + "loss": 1.2613, + "step": 12367 + }, + { + "epoch": 0.6761887839481705, + "grad_norm": 1.3910586833953857, + "learning_rate": 5.3057863452851875e-06, + "loss": 1.3453, + "step": 12368 + }, + { + "epoch": 0.6762434563918921, + "grad_norm": 1.3838413953781128, + "learning_rate": 5.304172914179224e-06, + "loss": 1.6446, + "step": 12369 + }, + { + "epoch": 0.6762981288356136, + "grad_norm": 1.7601958513259888, + "learning_rate": 5.302559639877952e-06, + "loss": 1.3598, + "step": 12370 + }, + { + "epoch": 0.6763528012793352, + "grad_norm": 1.4658172130584717, + "learning_rate": 5.3009465224352355e-06, + "loss": 1.5215, + "step": 12371 + }, + { + "epoch": 0.6764074737230568, + "grad_norm": 1.4815336465835571, + "learning_rate": 5.2993335619049415e-06, + "loss": 1.4682, + "step": 12372 + }, + { + "epoch": 0.6764621461667782, + "grad_norm": 1.4836162328720093, + "learning_rate": 5.297720758340929e-06, + "loss": 1.1419, + "step": 12373 + }, + { + "epoch": 0.6765168186104998, + "grad_norm": 1.5526832342147827, + "learning_rate": 5.296108111797052e-06, + "loss": 1.5758, + "step": 12374 + }, + { + "epoch": 0.6765714910542214, + "grad_norm": 1.4092905521392822, + "learning_rate": 5.294495622327167e-06, + "loss": 1.4831, + "step": 12375 + }, + { + "epoch": 0.6766261634979429, + "grad_norm": 1.4732728004455566, + "learning_rate": 5.292883289985116e-06, + "loss": 1.4308, + "step": 12376 + }, + { + "epoch": 0.6766808359416645, + "grad_norm": 1.4787538051605225, + "learning_rate": 5.291271114824732e-06, + "loss": 1.2742, + "step": 12377 + }, + { + "epoch": 0.6767355083853861, + "grad_norm": 1.4844633340835571, + "learning_rate": 5.289659096899859e-06, + "loss": 1.4587, + "step": 12378 + }, + { + "epoch": 0.6767901808291076, + "grad_norm": 2.3097753524780273, + "learning_rate": 5.288047236264322e-06, + "loss": 1.2675, + "step": 12379 + }, + { + "epoch": 0.6768448532728292, + "grad_norm": 2.1085140705108643, + "learning_rate": 5.286435532971941e-06, + "loss": 1.2824, + "step": 12380 + }, + { + "epoch": 0.6768995257165508, + "grad_norm": 1.4961272478103638, + "learning_rate": 5.284823987076542e-06, + "loss": 1.4844, + "step": 12381 + }, + { + "epoch": 0.6769541981602722, + "grad_norm": 1.723750352859497, + "learning_rate": 5.283212598631935e-06, + "loss": 1.3403, + "step": 12382 + }, + { + "epoch": 0.6770088706039938, + "grad_norm": 1.539919376373291, + "learning_rate": 5.281601367691928e-06, + "loss": 1.3733, + "step": 12383 + }, + { + "epoch": 0.6770635430477153, + "grad_norm": 1.2766709327697754, + "learning_rate": 5.279990294310322e-06, + "loss": 1.4633, + "step": 12384 + }, + { + "epoch": 0.6771182154914369, + "grad_norm": 2.2800793647766113, + "learning_rate": 5.278379378540917e-06, + "loss": 1.4054, + "step": 12385 + }, + { + "epoch": 0.6771728879351585, + "grad_norm": 1.7645319700241089, + "learning_rate": 5.2767686204375e-06, + "loss": 1.3217, + "step": 12386 + }, + { + "epoch": 0.67722756037888, + "grad_norm": 1.5672643184661865, + "learning_rate": 5.275158020053865e-06, + "loss": 1.5772, + "step": 12387 + }, + { + "epoch": 0.6772822328226016, + "grad_norm": 1.565707802772522, + "learning_rate": 5.273547577443789e-06, + "loss": 1.3766, + "step": 12388 + }, + { + "epoch": 0.6773369052663232, + "grad_norm": 1.6762653589248657, + "learning_rate": 5.271937292661054e-06, + "loss": 1.2659, + "step": 12389 + }, + { + "epoch": 0.6773915777100447, + "grad_norm": 1.383841633796692, + "learning_rate": 5.2703271657594255e-06, + "loss": 1.4275, + "step": 12390 + }, + { + "epoch": 0.6774462501537662, + "grad_norm": 1.5499647855758667, + "learning_rate": 5.26871719679267e-06, + "loss": 1.552, + "step": 12391 + }, + { + "epoch": 0.6775009225974878, + "grad_norm": 1.6683589220046997, + "learning_rate": 5.267107385814552e-06, + "loss": 1.5373, + "step": 12392 + }, + { + "epoch": 0.6775555950412093, + "grad_norm": 1.5329653024673462, + "learning_rate": 5.265497732878826e-06, + "loss": 1.2953, + "step": 12393 + }, + { + "epoch": 0.6776102674849309, + "grad_norm": 1.3801774978637695, + "learning_rate": 5.263888238039234e-06, + "loss": 1.3595, + "step": 12394 + }, + { + "epoch": 0.6776649399286525, + "grad_norm": 1.44304621219635, + "learning_rate": 5.262278901349533e-06, + "loss": 1.707, + "step": 12395 + }, + { + "epoch": 0.677719612372374, + "grad_norm": 1.7026489973068237, + "learning_rate": 5.260669722863457e-06, + "loss": 1.4659, + "step": 12396 + }, + { + "epoch": 0.6777742848160956, + "grad_norm": 1.3937227725982666, + "learning_rate": 5.2590607026347395e-06, + "loss": 1.3531, + "step": 12397 + }, + { + "epoch": 0.6778289572598171, + "grad_norm": 1.354643702507019, + "learning_rate": 5.2574518407171115e-06, + "loss": 1.4452, + "step": 12398 + }, + { + "epoch": 0.6778836297035387, + "grad_norm": 1.4252697229385376, + "learning_rate": 5.255843137164294e-06, + "loss": 1.3079, + "step": 12399 + }, + { + "epoch": 0.6779383021472603, + "grad_norm": 1.4304449558258057, + "learning_rate": 5.254234592030003e-06, + "loss": 1.4602, + "step": 12400 + }, + { + "epoch": 0.6779929745909817, + "grad_norm": 1.6710540056228638, + "learning_rate": 5.252626205367959e-06, + "loss": 1.6841, + "step": 12401 + }, + { + "epoch": 0.6780476470347033, + "grad_norm": 1.5374289751052856, + "learning_rate": 5.251017977231862e-06, + "loss": 1.4713, + "step": 12402 + }, + { + "epoch": 0.6781023194784249, + "grad_norm": 1.6110749244689941, + "learning_rate": 5.249409907675422e-06, + "loss": 1.555, + "step": 12403 + }, + { + "epoch": 0.6781569919221464, + "grad_norm": 1.6323379278182983, + "learning_rate": 5.2478019967523355e-06, + "loss": 1.9057, + "step": 12404 + }, + { + "epoch": 0.678211664365868, + "grad_norm": 1.484281063079834, + "learning_rate": 5.246194244516285e-06, + "loss": 1.5235, + "step": 12405 + }, + { + "epoch": 0.6782663368095896, + "grad_norm": 1.4851187467575073, + "learning_rate": 5.244586651020969e-06, + "loss": 1.4587, + "step": 12406 + }, + { + "epoch": 0.6783210092533111, + "grad_norm": 1.4863862991333008, + "learning_rate": 5.242979216320063e-06, + "loss": 1.4014, + "step": 12407 + }, + { + "epoch": 0.6783756816970327, + "grad_norm": 1.366889238357544, + "learning_rate": 5.241371940467239e-06, + "loss": 1.4011, + "step": 12408 + }, + { + "epoch": 0.6784303541407543, + "grad_norm": 1.7324947118759155, + "learning_rate": 5.239764823516178e-06, + "loss": 1.3091, + "step": 12409 + }, + { + "epoch": 0.6784850265844757, + "grad_norm": 1.961371898651123, + "learning_rate": 5.238157865520539e-06, + "loss": 1.6039, + "step": 12410 + }, + { + "epoch": 0.6785396990281973, + "grad_norm": 1.5839203596115112, + "learning_rate": 5.236551066533983e-06, + "loss": 1.3405, + "step": 12411 + }, + { + "epoch": 0.6785943714719188, + "grad_norm": 1.8610717058181763, + "learning_rate": 5.234944426610165e-06, + "loss": 1.2825, + "step": 12412 + }, + { + "epoch": 0.6786490439156404, + "grad_norm": 1.641562581062317, + "learning_rate": 5.233337945802734e-06, + "loss": 1.1925, + "step": 12413 + }, + { + "epoch": 0.678703716359362, + "grad_norm": 1.3224174976348877, + "learning_rate": 5.2317316241653304e-06, + "loss": 1.4419, + "step": 12414 + }, + { + "epoch": 0.6787583888030835, + "grad_norm": 1.2287346124649048, + "learning_rate": 5.2301254617516e-06, + "loss": 1.4421, + "step": 12415 + }, + { + "epoch": 0.6788130612468051, + "grad_norm": 1.7743953466415405, + "learning_rate": 5.228519458615171e-06, + "loss": 1.4016, + "step": 12416 + }, + { + "epoch": 0.6788677336905267, + "grad_norm": 2.9409141540527344, + "learning_rate": 5.226913614809677e-06, + "loss": 1.3711, + "step": 12417 + }, + { + "epoch": 0.6789224061342481, + "grad_norm": 1.2869877815246582, + "learning_rate": 5.225307930388737e-06, + "loss": 1.465, + "step": 12418 + }, + { + "epoch": 0.6789770785779697, + "grad_norm": 1.5891112089157104, + "learning_rate": 5.223702405405966e-06, + "loss": 1.5266, + "step": 12419 + }, + { + "epoch": 0.6790317510216913, + "grad_norm": 1.8229992389678955, + "learning_rate": 5.222097039914984e-06, + "loss": 1.5692, + "step": 12420 + }, + { + "epoch": 0.6790864234654128, + "grad_norm": 1.4893438816070557, + "learning_rate": 5.2204918339693925e-06, + "loss": 1.5071, + "step": 12421 + }, + { + "epoch": 0.6791410959091344, + "grad_norm": 1.7903590202331543, + "learning_rate": 5.218886787622794e-06, + "loss": 1.5377, + "step": 12422 + }, + { + "epoch": 0.679195768352856, + "grad_norm": 1.3568629026412964, + "learning_rate": 5.217281900928787e-06, + "loss": 1.4746, + "step": 12423 + }, + { + "epoch": 0.6792504407965775, + "grad_norm": 1.39116370677948, + "learning_rate": 5.215677173940959e-06, + "loss": 1.6075, + "step": 12424 + }, + { + "epoch": 0.6793051132402991, + "grad_norm": 1.325741171836853, + "learning_rate": 5.214072606712893e-06, + "loss": 1.576, + "step": 12425 + }, + { + "epoch": 0.6793597856840206, + "grad_norm": 1.7358121871948242, + "learning_rate": 5.212468199298178e-06, + "loss": 1.2086, + "step": 12426 + }, + { + "epoch": 0.6794144581277421, + "grad_norm": 1.6012881994247437, + "learning_rate": 5.210863951750382e-06, + "loss": 1.3018, + "step": 12427 + }, + { + "epoch": 0.6794691305714637, + "grad_norm": 1.5510579347610474, + "learning_rate": 5.209259864123075e-06, + "loss": 1.2048, + "step": 12428 + }, + { + "epoch": 0.6795238030151852, + "grad_norm": 1.4419649839401245, + "learning_rate": 5.207655936469825e-06, + "loss": 1.3604, + "step": 12429 + }, + { + "epoch": 0.6795784754589068, + "grad_norm": 1.6929972171783447, + "learning_rate": 5.206052168844187e-06, + "loss": 1.541, + "step": 12430 + }, + { + "epoch": 0.6796331479026284, + "grad_norm": 1.3805854320526123, + "learning_rate": 5.204448561299718e-06, + "loss": 1.5062, + "step": 12431 + }, + { + "epoch": 0.6796878203463499, + "grad_norm": 1.6222658157348633, + "learning_rate": 5.202845113889967e-06, + "loss": 1.3731, + "step": 12432 + }, + { + "epoch": 0.6797424927900715, + "grad_norm": 1.2870349884033203, + "learning_rate": 5.201241826668469e-06, + "loss": 1.541, + "step": 12433 + }, + { + "epoch": 0.6797971652337931, + "grad_norm": 1.627910852432251, + "learning_rate": 5.199638699688772e-06, + "loss": 1.5744, + "step": 12434 + }, + { + "epoch": 0.6798518376775146, + "grad_norm": 1.6100683212280273, + "learning_rate": 5.198035733004403e-06, + "loss": 1.5185, + "step": 12435 + }, + { + "epoch": 0.6799065101212362, + "grad_norm": 1.5297871828079224, + "learning_rate": 5.1964329266688885e-06, + "loss": 1.3155, + "step": 12436 + }, + { + "epoch": 0.6799611825649577, + "grad_norm": 1.6605254411697388, + "learning_rate": 5.19483028073575e-06, + "loss": 1.7317, + "step": 12437 + }, + { + "epoch": 0.6800158550086792, + "grad_norm": 1.5776350498199463, + "learning_rate": 5.193227795258505e-06, + "loss": 1.3469, + "step": 12438 + }, + { + "epoch": 0.6800705274524008, + "grad_norm": 1.7001054286956787, + "learning_rate": 5.19162547029066e-06, + "loss": 1.4377, + "step": 12439 + }, + { + "epoch": 0.6801251998961224, + "grad_norm": 1.612255334854126, + "learning_rate": 5.190023305885727e-06, + "loss": 1.1588, + "step": 12440 + }, + { + "epoch": 0.6801798723398439, + "grad_norm": 1.7338486909866333, + "learning_rate": 5.188421302097202e-06, + "loss": 1.5439, + "step": 12441 + }, + { + "epoch": 0.6802345447835655, + "grad_norm": 1.7775100469589233, + "learning_rate": 5.186819458978578e-06, + "loss": 1.4075, + "step": 12442 + }, + { + "epoch": 0.680289217227287, + "grad_norm": 1.515901803970337, + "learning_rate": 5.185217776583349e-06, + "loss": 1.5289, + "step": 12443 + }, + { + "epoch": 0.6803438896710086, + "grad_norm": 1.280522108078003, + "learning_rate": 5.183616254964994e-06, + "loss": 1.6824, + "step": 12444 + }, + { + "epoch": 0.6803985621147302, + "grad_norm": 1.8817343711853027, + "learning_rate": 5.182014894176999e-06, + "loss": 1.4983, + "step": 12445 + }, + { + "epoch": 0.6804532345584516, + "grad_norm": 1.4617153406143188, + "learning_rate": 5.18041369427283e-06, + "loss": 1.5612, + "step": 12446 + }, + { + "epoch": 0.6805079070021732, + "grad_norm": 1.9088164567947388, + "learning_rate": 5.178812655305954e-06, + "loss": 1.4401, + "step": 12447 + }, + { + "epoch": 0.6805625794458948, + "grad_norm": 1.4115591049194336, + "learning_rate": 5.177211777329842e-06, + "loss": 1.3318, + "step": 12448 + }, + { + "epoch": 0.6806172518896163, + "grad_norm": 1.621325969696045, + "learning_rate": 5.1756110603979445e-06, + "loss": 1.2787, + "step": 12449 + }, + { + "epoch": 0.6806719243333379, + "grad_norm": 1.3679847717285156, + "learning_rate": 5.174010504563716e-06, + "loss": 1.5407, + "step": 12450 + }, + { + "epoch": 0.6807265967770595, + "grad_norm": 1.392717719078064, + "learning_rate": 5.1724101098806e-06, + "loss": 1.2404, + "step": 12451 + }, + { + "epoch": 0.680781269220781, + "grad_norm": 1.394232988357544, + "learning_rate": 5.170809876402039e-06, + "loss": 1.4938, + "step": 12452 + }, + { + "epoch": 0.6808359416645026, + "grad_norm": 1.477671504020691, + "learning_rate": 5.169209804181465e-06, + "loss": 1.2146, + "step": 12453 + }, + { + "epoch": 0.6808906141082242, + "grad_norm": 1.5795997381210327, + "learning_rate": 5.167609893272314e-06, + "loss": 1.5488, + "step": 12454 + }, + { + "epoch": 0.6809452865519456, + "grad_norm": 1.7127333879470825, + "learning_rate": 5.166010143728009e-06, + "loss": 1.479, + "step": 12455 + }, + { + "epoch": 0.6809999589956672, + "grad_norm": 1.6003550291061401, + "learning_rate": 5.1644105556019655e-06, + "loss": 1.471, + "step": 12456 + }, + { + "epoch": 0.6810546314393887, + "grad_norm": 2.047464609146118, + "learning_rate": 5.1628111289476025e-06, + "loss": 1.4133, + "step": 12457 + }, + { + "epoch": 0.6811093038831103, + "grad_norm": 1.343511939048767, + "learning_rate": 5.161211863818328e-06, + "loss": 1.494, + "step": 12458 + }, + { + "epoch": 0.6811639763268319, + "grad_norm": 1.2719762325286865, + "learning_rate": 5.159612760267541e-06, + "loss": 1.518, + "step": 12459 + }, + { + "epoch": 0.6812186487705534, + "grad_norm": 1.3825567960739136, + "learning_rate": 5.158013818348645e-06, + "loss": 1.4557, + "step": 12460 + }, + { + "epoch": 0.681273321214275, + "grad_norm": 1.3313732147216797, + "learning_rate": 5.1564150381150305e-06, + "loss": 1.4479, + "step": 12461 + }, + { + "epoch": 0.6813279936579966, + "grad_norm": 1.5096811056137085, + "learning_rate": 5.154816419620082e-06, + "loss": 1.6397, + "step": 12462 + }, + { + "epoch": 0.681382666101718, + "grad_norm": 1.5179132223129272, + "learning_rate": 5.153217962917184e-06, + "loss": 1.5737, + "step": 12463 + }, + { + "epoch": 0.6814373385454396, + "grad_norm": 1.5771880149841309, + "learning_rate": 5.151619668059707e-06, + "loss": 1.416, + "step": 12464 + }, + { + "epoch": 0.6814920109891612, + "grad_norm": 1.3987302780151367, + "learning_rate": 5.15002153510103e-06, + "loss": 1.558, + "step": 12465 + }, + { + "epoch": 0.6815466834328827, + "grad_norm": 1.863715648651123, + "learning_rate": 5.148423564094517e-06, + "loss": 1.4535, + "step": 12466 + }, + { + "epoch": 0.6816013558766043, + "grad_norm": 1.662830114364624, + "learning_rate": 5.146825755093521e-06, + "loss": 1.592, + "step": 12467 + }, + { + "epoch": 0.6816560283203259, + "grad_norm": 1.6012489795684814, + "learning_rate": 5.145228108151403e-06, + "loss": 1.6392, + "step": 12468 + }, + { + "epoch": 0.6817107007640474, + "grad_norm": 1.7645292282104492, + "learning_rate": 5.143630623321514e-06, + "loss": 1.4931, + "step": 12469 + }, + { + "epoch": 0.681765373207769, + "grad_norm": 2.1190836429595947, + "learning_rate": 5.142033300657188e-06, + "loss": 1.5115, + "step": 12470 + }, + { + "epoch": 0.6818200456514905, + "grad_norm": 1.9683433771133423, + "learning_rate": 5.1404361402117745e-06, + "loss": 1.2877, + "step": 12471 + }, + { + "epoch": 0.681874718095212, + "grad_norm": 1.5109446048736572, + "learning_rate": 5.138839142038601e-06, + "loss": 1.325, + "step": 12472 + }, + { + "epoch": 0.6819293905389336, + "grad_norm": 1.7681643962860107, + "learning_rate": 5.137242306190991e-06, + "loss": 1.3106, + "step": 12473 + }, + { + "epoch": 0.6819840629826551, + "grad_norm": 1.83448326587677, + "learning_rate": 5.135645632722277e-06, + "loss": 1.7713, + "step": 12474 + }, + { + "epoch": 0.6820387354263767, + "grad_norm": 1.3186908960342407, + "learning_rate": 5.134049121685769e-06, + "loss": 1.5208, + "step": 12475 + }, + { + "epoch": 0.6820934078700983, + "grad_norm": 1.3429698944091797, + "learning_rate": 5.132452773134779e-06, + "loss": 1.4199, + "step": 12476 + }, + { + "epoch": 0.6821480803138198, + "grad_norm": 1.4479411840438843, + "learning_rate": 5.130856587122613e-06, + "loss": 1.4627, + "step": 12477 + }, + { + "epoch": 0.6822027527575414, + "grad_norm": 1.683605670928955, + "learning_rate": 5.129260563702568e-06, + "loss": 1.575, + "step": 12478 + }, + { + "epoch": 0.682257425201263, + "grad_norm": 1.6906863451004028, + "learning_rate": 5.127664702927946e-06, + "loss": 1.3251, + "step": 12479 + }, + { + "epoch": 0.6823120976449845, + "grad_norm": 1.3781688213348389, + "learning_rate": 5.126069004852033e-06, + "loss": 1.6342, + "step": 12480 + }, + { + "epoch": 0.682366770088706, + "grad_norm": 1.6135149002075195, + "learning_rate": 5.124473469528108e-06, + "loss": 1.5497, + "step": 12481 + }, + { + "epoch": 0.6824214425324276, + "grad_norm": 1.3771623373031616, + "learning_rate": 5.122878097009459e-06, + "loss": 1.6542, + "step": 12482 + }, + { + "epoch": 0.6824761149761491, + "grad_norm": 1.3896931409835815, + "learning_rate": 5.121282887349354e-06, + "loss": 1.4289, + "step": 12483 + }, + { + "epoch": 0.6825307874198707, + "grad_norm": 1.2603496313095093, + "learning_rate": 5.11968784060106e-06, + "loss": 1.1592, + "step": 12484 + }, + { + "epoch": 0.6825854598635922, + "grad_norm": 1.6747040748596191, + "learning_rate": 5.118092956817844e-06, + "loss": 1.6947, + "step": 12485 + }, + { + "epoch": 0.6826401323073138, + "grad_norm": 1.4085217714309692, + "learning_rate": 5.11649823605296e-06, + "loss": 1.3476, + "step": 12486 + }, + { + "epoch": 0.6826948047510354, + "grad_norm": 1.4604566097259521, + "learning_rate": 5.114903678359655e-06, + "loss": 1.3516, + "step": 12487 + }, + { + "epoch": 0.6827494771947569, + "grad_norm": 1.9860285520553589, + "learning_rate": 5.1133092837911835e-06, + "loss": 1.4991, + "step": 12488 + }, + { + "epoch": 0.6828041496384785, + "grad_norm": 1.6994997262954712, + "learning_rate": 5.111715052400783e-06, + "loss": 1.4772, + "step": 12489 + }, + { + "epoch": 0.6828588220822001, + "grad_norm": 1.1908025741577148, + "learning_rate": 5.110120984241687e-06, + "loss": 1.5207, + "step": 12490 + }, + { + "epoch": 0.6829134945259215, + "grad_norm": 1.396571159362793, + "learning_rate": 5.108527079367125e-06, + "loss": 1.3287, + "step": 12491 + }, + { + "epoch": 0.6829681669696431, + "grad_norm": 1.9184917211532593, + "learning_rate": 5.106933337830318e-06, + "loss": 1.5788, + "step": 12492 + }, + { + "epoch": 0.6830228394133647, + "grad_norm": 1.7304675579071045, + "learning_rate": 5.105339759684493e-06, + "loss": 1.3829, + "step": 12493 + }, + { + "epoch": 0.6830775118570862, + "grad_norm": 1.751920223236084, + "learning_rate": 5.103746344982859e-06, + "loss": 1.4396, + "step": 12494 + }, + { + "epoch": 0.6831321843008078, + "grad_norm": 1.481177806854248, + "learning_rate": 5.1021530937786215e-06, + "loss": 1.3871, + "step": 12495 + }, + { + "epoch": 0.6831868567445294, + "grad_norm": 1.59971022605896, + "learning_rate": 5.100560006124988e-06, + "loss": 1.3829, + "step": 12496 + }, + { + "epoch": 0.6832415291882509, + "grad_norm": 1.7516541481018066, + "learning_rate": 5.098967082075153e-06, + "loss": 1.4453, + "step": 12497 + }, + { + "epoch": 0.6832962016319725, + "grad_norm": 1.233312726020813, + "learning_rate": 5.097374321682304e-06, + "loss": 1.6074, + "step": 12498 + }, + { + "epoch": 0.683350874075694, + "grad_norm": 1.3511598110198975, + "learning_rate": 5.095781724999633e-06, + "loss": 1.4248, + "step": 12499 + }, + { + "epoch": 0.6834055465194155, + "grad_norm": 1.5124800205230713, + "learning_rate": 5.094189292080321e-06, + "loss": 1.2879, + "step": 12500 + }, + { + "epoch": 0.6834602189631371, + "grad_norm": 1.4779280424118042, + "learning_rate": 5.092597022977539e-06, + "loss": 1.2853, + "step": 12501 + }, + { + "epoch": 0.6835148914068586, + "grad_norm": 1.5137221813201904, + "learning_rate": 5.091004917744457e-06, + "loss": 1.2665, + "step": 12502 + }, + { + "epoch": 0.6835695638505802, + "grad_norm": 1.8350263833999634, + "learning_rate": 5.089412976434238e-06, + "loss": 1.5744, + "step": 12503 + }, + { + "epoch": 0.6836242362943018, + "grad_norm": 1.235116958618164, + "learning_rate": 5.087821199100047e-06, + "loss": 1.6695, + "step": 12504 + }, + { + "epoch": 0.6836789087380233, + "grad_norm": 2.0813395977020264, + "learning_rate": 5.086229585795032e-06, + "loss": 1.2444, + "step": 12505 + }, + { + "epoch": 0.6837335811817449, + "grad_norm": 1.371982455253601, + "learning_rate": 5.0846381365723375e-06, + "loss": 1.3562, + "step": 12506 + }, + { + "epoch": 0.6837882536254665, + "grad_norm": 1.4439888000488281, + "learning_rate": 5.083046851485114e-06, + "loss": 1.2857, + "step": 12507 + }, + { + "epoch": 0.683842926069188, + "grad_norm": 1.2317943572998047, + "learning_rate": 5.081455730586495e-06, + "loss": 1.2245, + "step": 12508 + }, + { + "epoch": 0.6838975985129095, + "grad_norm": 1.2828292846679688, + "learning_rate": 5.079864773929606e-06, + "loss": 1.5051, + "step": 12509 + }, + { + "epoch": 0.6839522709566311, + "grad_norm": 1.3647130727767944, + "learning_rate": 5.07827398156758e-06, + "loss": 1.4634, + "step": 12510 + }, + { + "epoch": 0.6840069434003526, + "grad_norm": 1.5501266717910767, + "learning_rate": 5.076683353553538e-06, + "loss": 1.3524, + "step": 12511 + }, + { + "epoch": 0.6840616158440742, + "grad_norm": 1.9850798845291138, + "learning_rate": 5.075092889940587e-06, + "loss": 1.333, + "step": 12512 + }, + { + "epoch": 0.6841162882877957, + "grad_norm": 1.3417630195617676, + "learning_rate": 5.073502590781844e-06, + "loss": 1.658, + "step": 12513 + }, + { + "epoch": 0.6841709607315173, + "grad_norm": 1.3307703733444214, + "learning_rate": 5.07191245613041e-06, + "loss": 1.4242, + "step": 12514 + }, + { + "epoch": 0.6842256331752389, + "grad_norm": 1.5780013799667358, + "learning_rate": 5.070322486039383e-06, + "loss": 1.5749, + "step": 12515 + }, + { + "epoch": 0.6842803056189604, + "grad_norm": 1.4522053003311157, + "learning_rate": 5.0687326805618575e-06, + "loss": 1.4228, + "step": 12516 + }, + { + "epoch": 0.684334978062682, + "grad_norm": 1.6075923442840576, + "learning_rate": 5.067143039750914e-06, + "loss": 1.5231, + "step": 12517 + }, + { + "epoch": 0.6843896505064035, + "grad_norm": 1.3165645599365234, + "learning_rate": 5.065553563659644e-06, + "loss": 1.6413, + "step": 12518 + }, + { + "epoch": 0.684444322950125, + "grad_norm": 1.369328260421753, + "learning_rate": 5.06396425234112e-06, + "loss": 1.5911, + "step": 12519 + }, + { + "epoch": 0.6844989953938466, + "grad_norm": 1.6025198698043823, + "learning_rate": 5.062375105848409e-06, + "loss": 1.478, + "step": 12520 + }, + { + "epoch": 0.6845536678375682, + "grad_norm": 2.123237371444702, + "learning_rate": 5.060786124234582e-06, + "loss": 1.7428, + "step": 12521 + }, + { + "epoch": 0.6846083402812897, + "grad_norm": 1.4375016689300537, + "learning_rate": 5.059197307552698e-06, + "loss": 1.7147, + "step": 12522 + }, + { + "epoch": 0.6846630127250113, + "grad_norm": 1.6798253059387207, + "learning_rate": 5.057608655855806e-06, + "loss": 1.5828, + "step": 12523 + }, + { + "epoch": 0.6847176851687329, + "grad_norm": 1.5346336364746094, + "learning_rate": 5.056020169196962e-06, + "loss": 1.5554, + "step": 12524 + }, + { + "epoch": 0.6847723576124544, + "grad_norm": 1.6162291765213013, + "learning_rate": 5.054431847629204e-06, + "loss": 1.2723, + "step": 12525 + }, + { + "epoch": 0.684827030056176, + "grad_norm": 1.4296791553497314, + "learning_rate": 5.052843691205571e-06, + "loss": 1.4007, + "step": 12526 + }, + { + "epoch": 0.6848817024998974, + "grad_norm": 1.4384645223617554, + "learning_rate": 5.051255699979099e-06, + "loss": 1.395, + "step": 12527 + }, + { + "epoch": 0.684936374943619, + "grad_norm": 1.752401351928711, + "learning_rate": 5.049667874002811e-06, + "loss": 1.5068, + "step": 12528 + }, + { + "epoch": 0.6849910473873406, + "grad_norm": 1.4342199563980103, + "learning_rate": 5.048080213329729e-06, + "loss": 1.2452, + "step": 12529 + }, + { + "epoch": 0.6850457198310621, + "grad_norm": 1.7452828884124756, + "learning_rate": 5.04649271801287e-06, + "loss": 1.5256, + "step": 12530 + }, + { + "epoch": 0.6851003922747837, + "grad_norm": 1.491473913192749, + "learning_rate": 5.0449053881052365e-06, + "loss": 1.4543, + "step": 12531 + }, + { + "epoch": 0.6851550647185053, + "grad_norm": 1.4356282949447632, + "learning_rate": 5.043318223659846e-06, + "loss": 1.546, + "step": 12532 + }, + { + "epoch": 0.6852097371622268, + "grad_norm": 1.4971927404403687, + "learning_rate": 5.04173122472969e-06, + "loss": 1.4804, + "step": 12533 + }, + { + "epoch": 0.6852644096059484, + "grad_norm": 2.151974678039551, + "learning_rate": 5.0401443913677605e-06, + "loss": 1.5454, + "step": 12534 + }, + { + "epoch": 0.68531908204967, + "grad_norm": 1.7423121929168701, + "learning_rate": 5.038557723627051e-06, + "loss": 1.4287, + "step": 12535 + }, + { + "epoch": 0.6853737544933914, + "grad_norm": 1.5379940271377563, + "learning_rate": 5.036971221560543e-06, + "loss": 1.3928, + "step": 12536 + }, + { + "epoch": 0.685428426937113, + "grad_norm": 2.058598518371582, + "learning_rate": 5.035384885221206e-06, + "loss": 1.5691, + "step": 12537 + }, + { + "epoch": 0.6854830993808346, + "grad_norm": 1.922872543334961, + "learning_rate": 5.033798714662023e-06, + "loss": 1.4552, + "step": 12538 + }, + { + "epoch": 0.6855377718245561, + "grad_norm": 1.3434414863586426, + "learning_rate": 5.0322127099359554e-06, + "loss": 1.3582, + "step": 12539 + }, + { + "epoch": 0.6855924442682777, + "grad_norm": 1.3792630434036255, + "learning_rate": 5.030626871095961e-06, + "loss": 1.2917, + "step": 12540 + }, + { + "epoch": 0.6856471167119992, + "grad_norm": 1.6389957666397095, + "learning_rate": 5.029041198194997e-06, + "loss": 1.364, + "step": 12541 + }, + { + "epoch": 0.6857017891557208, + "grad_norm": 1.7659049034118652, + "learning_rate": 5.027455691286012e-06, + "loss": 1.1903, + "step": 12542 + }, + { + "epoch": 0.6857564615994424, + "grad_norm": 1.6619969606399536, + "learning_rate": 5.025870350421945e-06, + "loss": 1.3817, + "step": 12543 + }, + { + "epoch": 0.6858111340431639, + "grad_norm": 1.349259853363037, + "learning_rate": 5.0242851756557446e-06, + "loss": 1.5062, + "step": 12544 + }, + { + "epoch": 0.6858658064868854, + "grad_norm": 1.3891241550445557, + "learning_rate": 5.022700167040333e-06, + "loss": 1.4466, + "step": 12545 + }, + { + "epoch": 0.685920478930607, + "grad_norm": 1.6266038417816162, + "learning_rate": 5.021115324628647e-06, + "loss": 1.6733, + "step": 12546 + }, + { + "epoch": 0.6859751513743285, + "grad_norm": 1.4239237308502197, + "learning_rate": 5.019530648473604e-06, + "loss": 1.544, + "step": 12547 + }, + { + "epoch": 0.6860298238180501, + "grad_norm": 1.4296778440475464, + "learning_rate": 5.017946138628116e-06, + "loss": 1.6888, + "step": 12548 + }, + { + "epoch": 0.6860844962617717, + "grad_norm": 1.5101970434188843, + "learning_rate": 5.0163617951451e-06, + "loss": 1.4794, + "step": 12549 + }, + { + "epoch": 0.6861391687054932, + "grad_norm": 1.24241304397583, + "learning_rate": 5.0147776180774575e-06, + "loss": 1.4236, + "step": 12550 + }, + { + "epoch": 0.6861938411492148, + "grad_norm": 1.6260336637496948, + "learning_rate": 5.0131936074780865e-06, + "loss": 1.4642, + "step": 12551 + }, + { + "epoch": 0.6862485135929364, + "grad_norm": 1.3253679275512695, + "learning_rate": 5.011609763399885e-06, + "loss": 1.1321, + "step": 12552 + }, + { + "epoch": 0.6863031860366579, + "grad_norm": 1.8984185457229614, + "learning_rate": 5.010026085895741e-06, + "loss": 1.4882, + "step": 12553 + }, + { + "epoch": 0.6863578584803794, + "grad_norm": 2.0978622436523438, + "learning_rate": 5.008442575018534e-06, + "loss": 1.5114, + "step": 12554 + }, + { + "epoch": 0.6864125309241009, + "grad_norm": 1.4173107147216797, + "learning_rate": 5.0068592308211425e-06, + "loss": 1.5326, + "step": 12555 + }, + { + "epoch": 0.6864672033678225, + "grad_norm": 1.7480010986328125, + "learning_rate": 5.005276053356438e-06, + "loss": 1.6051, + "step": 12556 + }, + { + "epoch": 0.6865218758115441, + "grad_norm": 1.8378980159759521, + "learning_rate": 5.003693042677283e-06, + "loss": 1.5903, + "step": 12557 + }, + { + "epoch": 0.6865765482552656, + "grad_norm": 1.3662244081497192, + "learning_rate": 5.0021101988365465e-06, + "loss": 1.4176, + "step": 12558 + }, + { + "epoch": 0.6866312206989872, + "grad_norm": 1.2233442068099976, + "learning_rate": 5.000527521887073e-06, + "loss": 1.4869, + "step": 12559 + }, + { + "epoch": 0.6866858931427088, + "grad_norm": 1.7302109003067017, + "learning_rate": 4.998945011881722e-06, + "loss": 1.2039, + "step": 12560 + }, + { + "epoch": 0.6867405655864303, + "grad_norm": 1.291810154914856, + "learning_rate": 4.997362668873331e-06, + "loss": 1.596, + "step": 12561 + }, + { + "epoch": 0.6867952380301519, + "grad_norm": 1.5335478782653809, + "learning_rate": 4.995780492914737e-06, + "loss": 1.4598, + "step": 12562 + }, + { + "epoch": 0.6868499104738734, + "grad_norm": 1.6202136278152466, + "learning_rate": 4.9941984840587786e-06, + "loss": 1.6461, + "step": 12563 + }, + { + "epoch": 0.6869045829175949, + "grad_norm": 1.5797544717788696, + "learning_rate": 4.992616642358279e-06, + "loss": 1.5196, + "step": 12564 + }, + { + "epoch": 0.6869592553613165, + "grad_norm": 1.7833354473114014, + "learning_rate": 4.991034967866061e-06, + "loss": 1.3027, + "step": 12565 + }, + { + "epoch": 0.6870139278050381, + "grad_norm": 1.4136189222335815, + "learning_rate": 4.989453460634938e-06, + "loss": 1.3211, + "step": 12566 + }, + { + "epoch": 0.6870686002487596, + "grad_norm": 1.6635026931762695, + "learning_rate": 4.987872120717721e-06, + "loss": 1.3127, + "step": 12567 + }, + { + "epoch": 0.6871232726924812, + "grad_norm": 1.5452133417129517, + "learning_rate": 4.9862909481672126e-06, + "loss": 1.6195, + "step": 12568 + }, + { + "epoch": 0.6871779451362027, + "grad_norm": 1.4224159717559814, + "learning_rate": 4.984709943036219e-06, + "loss": 1.5926, + "step": 12569 + }, + { + "epoch": 0.6872326175799243, + "grad_norm": 1.5578787326812744, + "learning_rate": 4.9831291053775275e-06, + "loss": 1.3777, + "step": 12570 + }, + { + "epoch": 0.6872872900236459, + "grad_norm": 1.2083828449249268, + "learning_rate": 4.9815484352439244e-06, + "loss": 1.459, + "step": 12571 + }, + { + "epoch": 0.6873419624673673, + "grad_norm": 1.5252037048339844, + "learning_rate": 4.9799679326882e-06, + "loss": 1.239, + "step": 12572 + }, + { + "epoch": 0.6873966349110889, + "grad_norm": 1.4217582941055298, + "learning_rate": 4.978387597763121e-06, + "loss": 1.4022, + "step": 12573 + }, + { + "epoch": 0.6874513073548105, + "grad_norm": 1.5625585317611694, + "learning_rate": 4.976807430521469e-06, + "loss": 1.3879, + "step": 12574 + }, + { + "epoch": 0.687505979798532, + "grad_norm": 1.5524837970733643, + "learning_rate": 4.975227431016003e-06, + "loss": 1.4336, + "step": 12575 + }, + { + "epoch": 0.6875606522422536, + "grad_norm": 1.3555171489715576, + "learning_rate": 4.97364759929948e-06, + "loss": 1.3309, + "step": 12576 + }, + { + "epoch": 0.6876153246859752, + "grad_norm": 1.768763542175293, + "learning_rate": 4.972067935424664e-06, + "loss": 1.5119, + "step": 12577 + }, + { + "epoch": 0.6876699971296967, + "grad_norm": 2.188183546066284, + "learning_rate": 4.970488439444296e-06, + "loss": 1.3792, + "step": 12578 + }, + { + "epoch": 0.6877246695734183, + "grad_norm": 1.539833426475525, + "learning_rate": 4.968909111411122e-06, + "loss": 1.1913, + "step": 12579 + }, + { + "epoch": 0.6877793420171399, + "grad_norm": 1.7019153833389282, + "learning_rate": 4.967329951377878e-06, + "loss": 1.2929, + "step": 12580 + }, + { + "epoch": 0.6878340144608613, + "grad_norm": 1.889857292175293, + "learning_rate": 4.965750959397297e-06, + "loss": 1.3691, + "step": 12581 + }, + { + "epoch": 0.6878886869045829, + "grad_norm": 1.371424913406372, + "learning_rate": 4.964172135522102e-06, + "loss": 1.613, + "step": 12582 + }, + { + "epoch": 0.6879433593483044, + "grad_norm": 1.300324559211731, + "learning_rate": 4.962593479805018e-06, + "loss": 1.628, + "step": 12583 + }, + { + "epoch": 0.687998031792026, + "grad_norm": 1.6447099447250366, + "learning_rate": 4.961014992298759e-06, + "loss": 1.1569, + "step": 12584 + }, + { + "epoch": 0.6880527042357476, + "grad_norm": 1.6062737703323364, + "learning_rate": 4.95943667305603e-06, + "loss": 1.4292, + "step": 12585 + }, + { + "epoch": 0.6881073766794691, + "grad_norm": 1.199884057044983, + "learning_rate": 4.957858522129544e-06, + "loss": 1.4867, + "step": 12586 + }, + { + "epoch": 0.6881620491231907, + "grad_norm": 1.739594578742981, + "learning_rate": 4.956280539571988e-06, + "loss": 1.556, + "step": 12587 + }, + { + "epoch": 0.6882167215669123, + "grad_norm": 1.7304290533065796, + "learning_rate": 4.954702725436065e-06, + "loss": 1.3773, + "step": 12588 + }, + { + "epoch": 0.6882713940106338, + "grad_norm": 1.8788316249847412, + "learning_rate": 4.953125079774457e-06, + "loss": 1.3803, + "step": 12589 + }, + { + "epoch": 0.6883260664543553, + "grad_norm": 1.353029489517212, + "learning_rate": 4.9515476026398415e-06, + "loss": 1.3534, + "step": 12590 + }, + { + "epoch": 0.6883807388980769, + "grad_norm": 1.392526626586914, + "learning_rate": 4.9499702940849016e-06, + "loss": 1.5074, + "step": 12591 + }, + { + "epoch": 0.6884354113417984, + "grad_norm": 1.5475367307662964, + "learning_rate": 4.948393154162303e-06, + "loss": 1.2069, + "step": 12592 + }, + { + "epoch": 0.68849008378552, + "grad_norm": 1.5546257495880127, + "learning_rate": 4.946816182924713e-06, + "loss": 1.3755, + "step": 12593 + }, + { + "epoch": 0.6885447562292416, + "grad_norm": 1.4121778011322021, + "learning_rate": 4.945239380424787e-06, + "loss": 1.547, + "step": 12594 + }, + { + "epoch": 0.6885994286729631, + "grad_norm": 1.8223012685775757, + "learning_rate": 4.9436627467151795e-06, + "loss": 1.3829, + "step": 12595 + }, + { + "epoch": 0.6886541011166847, + "grad_norm": 1.2909690141677856, + "learning_rate": 4.942086281848535e-06, + "loss": 1.4213, + "step": 12596 + }, + { + "epoch": 0.6887087735604062, + "grad_norm": 1.4040502309799194, + "learning_rate": 4.9405099858775e-06, + "loss": 1.5751, + "step": 12597 + }, + { + "epoch": 0.6887634460041278, + "grad_norm": 1.4123307466506958, + "learning_rate": 4.938933858854712e-06, + "loss": 1.5158, + "step": 12598 + }, + { + "epoch": 0.6888181184478493, + "grad_norm": 1.1677260398864746, + "learning_rate": 4.937357900832793e-06, + "loss": 1.7037, + "step": 12599 + }, + { + "epoch": 0.6888727908915708, + "grad_norm": 1.5351438522338867, + "learning_rate": 4.935782111864378e-06, + "loss": 1.6274, + "step": 12600 + }, + { + "epoch": 0.6889274633352924, + "grad_norm": 1.4713488817214966, + "learning_rate": 4.934206492002077e-06, + "loss": 1.4185, + "step": 12601 + }, + { + "epoch": 0.688982135779014, + "grad_norm": 1.4844412803649902, + "learning_rate": 4.932631041298513e-06, + "loss": 1.4785, + "step": 12602 + }, + { + "epoch": 0.6890368082227355, + "grad_norm": 1.841965675354004, + "learning_rate": 4.93105575980629e-06, + "loss": 1.493, + "step": 12603 + }, + { + "epoch": 0.6890914806664571, + "grad_norm": 1.3710013628005981, + "learning_rate": 4.92948064757801e-06, + "loss": 1.3814, + "step": 12604 + }, + { + "epoch": 0.6891461531101787, + "grad_norm": 1.5177048444747925, + "learning_rate": 4.927905704666268e-06, + "loss": 1.5524, + "step": 12605 + }, + { + "epoch": 0.6892008255539002, + "grad_norm": 2.126216411590576, + "learning_rate": 4.926330931123659e-06, + "loss": 1.2568, + "step": 12606 + }, + { + "epoch": 0.6892554979976218, + "grad_norm": 1.1634511947631836, + "learning_rate": 4.92475632700276e-06, + "loss": 1.4732, + "step": 12607 + }, + { + "epoch": 0.6893101704413434, + "grad_norm": 1.434659481048584, + "learning_rate": 4.92318189235616e-06, + "loss": 1.5507, + "step": 12608 + }, + { + "epoch": 0.6893648428850648, + "grad_norm": 1.2993474006652832, + "learning_rate": 4.921607627236431e-06, + "loss": 1.4118, + "step": 12609 + }, + { + "epoch": 0.6894195153287864, + "grad_norm": 1.5277929306030273, + "learning_rate": 4.920033531696137e-06, + "loss": 1.442, + "step": 12610 + }, + { + "epoch": 0.6894741877725079, + "grad_norm": 1.3764774799346924, + "learning_rate": 4.918459605787847e-06, + "loss": 1.4726, + "step": 12611 + }, + { + "epoch": 0.6895288602162295, + "grad_norm": 1.3899260759353638, + "learning_rate": 4.916885849564115e-06, + "loss": 1.5742, + "step": 12612 + }, + { + "epoch": 0.6895835326599511, + "grad_norm": 1.4881709814071655, + "learning_rate": 4.915312263077488e-06, + "loss": 1.6722, + "step": 12613 + }, + { + "epoch": 0.6896382051036726, + "grad_norm": 1.573333501815796, + "learning_rate": 4.91373884638052e-06, + "loss": 1.4115, + "step": 12614 + }, + { + "epoch": 0.6896928775473942, + "grad_norm": 1.3995153903961182, + "learning_rate": 4.9121655995257445e-06, + "loss": 1.6356, + "step": 12615 + }, + { + "epoch": 0.6897475499911158, + "grad_norm": 1.693527102470398, + "learning_rate": 4.910592522565702e-06, + "loss": 1.2526, + "step": 12616 + }, + { + "epoch": 0.6898022224348372, + "grad_norm": 1.5314923524856567, + "learning_rate": 4.9090196155529165e-06, + "loss": 1.4828, + "step": 12617 + }, + { + "epoch": 0.6898568948785588, + "grad_norm": 1.8125971555709839, + "learning_rate": 4.907446878539913e-06, + "loss": 1.3868, + "step": 12618 + }, + { + "epoch": 0.6899115673222804, + "grad_norm": 1.2792389392852783, + "learning_rate": 4.905874311579209e-06, + "loss": 1.5711, + "step": 12619 + }, + { + "epoch": 0.6899662397660019, + "grad_norm": 1.5471869707107544, + "learning_rate": 4.904301914723315e-06, + "loss": 1.5574, + "step": 12620 + }, + { + "epoch": 0.6900209122097235, + "grad_norm": 1.7636319398880005, + "learning_rate": 4.902729688024734e-06, + "loss": 1.2987, + "step": 12621 + }, + { + "epoch": 0.6900755846534451, + "grad_norm": 1.14493727684021, + "learning_rate": 4.9011576315359736e-06, + "loss": 1.5493, + "step": 12622 + }, + { + "epoch": 0.6901302570971666, + "grad_norm": 1.3865689039230347, + "learning_rate": 4.899585745309525e-06, + "loss": 1.4175, + "step": 12623 + }, + { + "epoch": 0.6901849295408882, + "grad_norm": 1.6405283212661743, + "learning_rate": 4.898014029397873e-06, + "loss": 1.256, + "step": 12624 + }, + { + "epoch": 0.6902396019846097, + "grad_norm": 1.8468223810195923, + "learning_rate": 4.896442483853507e-06, + "loss": 1.5727, + "step": 12625 + }, + { + "epoch": 0.6902942744283312, + "grad_norm": 1.7900587320327759, + "learning_rate": 4.894871108728903e-06, + "loss": 1.2804, + "step": 12626 + }, + { + "epoch": 0.6903489468720528, + "grad_norm": 2.193723678588867, + "learning_rate": 4.89329990407653e-06, + "loss": 1.4048, + "step": 12627 + }, + { + "epoch": 0.6904036193157743, + "grad_norm": 1.6427571773529053, + "learning_rate": 4.8917288699488596e-06, + "loss": 1.3707, + "step": 12628 + }, + { + "epoch": 0.6904582917594959, + "grad_norm": 1.7458499670028687, + "learning_rate": 4.890158006398345e-06, + "loss": 1.375, + "step": 12629 + }, + { + "epoch": 0.6905129642032175, + "grad_norm": 1.331009864807129, + "learning_rate": 4.888587313477449e-06, + "loss": 1.5148, + "step": 12630 + }, + { + "epoch": 0.690567636646939, + "grad_norm": 1.2243924140930176, + "learning_rate": 4.887016791238617e-06, + "loss": 1.5375, + "step": 12631 + }, + { + "epoch": 0.6906223090906606, + "grad_norm": 1.8473342657089233, + "learning_rate": 4.8854464397342914e-06, + "loss": 1.4601, + "step": 12632 + }, + { + "epoch": 0.6906769815343822, + "grad_norm": 1.4399538040161133, + "learning_rate": 4.883876259016912e-06, + "loss": 1.4356, + "step": 12633 + }, + { + "epoch": 0.6907316539781037, + "grad_norm": 1.9733492136001587, + "learning_rate": 4.882306249138909e-06, + "loss": 1.6619, + "step": 12634 + }, + { + "epoch": 0.6907863264218252, + "grad_norm": 1.2281891107559204, + "learning_rate": 4.880736410152707e-06, + "loss": 1.4699, + "step": 12635 + }, + { + "epoch": 0.6908409988655468, + "grad_norm": 1.6101410388946533, + "learning_rate": 4.879166742110731e-06, + "loss": 1.5281, + "step": 12636 + }, + { + "epoch": 0.6908956713092683, + "grad_norm": 1.5665374994277954, + "learning_rate": 4.877597245065394e-06, + "loss": 1.4385, + "step": 12637 + }, + { + "epoch": 0.6909503437529899, + "grad_norm": 1.41428542137146, + "learning_rate": 4.876027919069103e-06, + "loss": 1.2557, + "step": 12638 + }, + { + "epoch": 0.6910050161967115, + "grad_norm": 1.736632227897644, + "learning_rate": 4.874458764174266e-06, + "loss": 1.4047, + "step": 12639 + }, + { + "epoch": 0.691059688640433, + "grad_norm": 1.594208002090454, + "learning_rate": 4.872889780433279e-06, + "loss": 1.4209, + "step": 12640 + }, + { + "epoch": 0.6911143610841546, + "grad_norm": 1.4146908521652222, + "learning_rate": 4.871320967898528e-06, + "loss": 1.1819, + "step": 12641 + }, + { + "epoch": 0.6911690335278761, + "grad_norm": 2.048612117767334, + "learning_rate": 4.86975232662241e-06, + "loss": 1.5932, + "step": 12642 + }, + { + "epoch": 0.6912237059715977, + "grad_norm": 1.037929892539978, + "learning_rate": 4.8681838566573e-06, + "loss": 1.5901, + "step": 12643 + }, + { + "epoch": 0.6912783784153193, + "grad_norm": 2.110140085220337, + "learning_rate": 4.866615558055573e-06, + "loss": 1.7381, + "step": 12644 + }, + { + "epoch": 0.6913330508590407, + "grad_norm": 1.2667720317840576, + "learning_rate": 4.865047430869598e-06, + "loss": 1.3552, + "step": 12645 + }, + { + "epoch": 0.6913877233027623, + "grad_norm": 1.517743468284607, + "learning_rate": 4.863479475151737e-06, + "loss": 1.3383, + "step": 12646 + }, + { + "epoch": 0.6914423957464839, + "grad_norm": 1.5148379802703857, + "learning_rate": 4.861911690954351e-06, + "loss": 1.4768, + "step": 12647 + }, + { + "epoch": 0.6914970681902054, + "grad_norm": 1.4871065616607666, + "learning_rate": 4.860344078329791e-06, + "loss": 1.3466, + "step": 12648 + }, + { + "epoch": 0.691551740633927, + "grad_norm": 1.4514607191085815, + "learning_rate": 4.8587766373304e-06, + "loss": 1.4461, + "step": 12649 + }, + { + "epoch": 0.6916064130776486, + "grad_norm": 1.3476753234863281, + "learning_rate": 4.8572093680085245e-06, + "loss": 1.4788, + "step": 12650 + }, + { + "epoch": 0.6916610855213701, + "grad_norm": 1.5992449522018433, + "learning_rate": 4.855642270416496e-06, + "loss": 1.4247, + "step": 12651 + }, + { + "epoch": 0.6917157579650917, + "grad_norm": 1.7964733839035034, + "learning_rate": 4.85407534460664e-06, + "loss": 1.5337, + "step": 12652 + }, + { + "epoch": 0.6917704304088133, + "grad_norm": 1.9599559307098389, + "learning_rate": 4.852508590631288e-06, + "loss": 1.64, + "step": 12653 + }, + { + "epoch": 0.6918251028525347, + "grad_norm": 1.1227549314498901, + "learning_rate": 4.850942008542753e-06, + "loss": 1.6253, + "step": 12654 + }, + { + "epoch": 0.6918797752962563, + "grad_norm": 1.49053156375885, + "learning_rate": 4.849375598393342e-06, + "loss": 1.3045, + "step": 12655 + }, + { + "epoch": 0.6919344477399778, + "grad_norm": 1.6941853761672974, + "learning_rate": 4.8478093602353715e-06, + "loss": 1.5349, + "step": 12656 + }, + { + "epoch": 0.6919891201836994, + "grad_norm": 1.7267807722091675, + "learning_rate": 4.846243294121136e-06, + "loss": 1.6074, + "step": 12657 + }, + { + "epoch": 0.692043792627421, + "grad_norm": 1.5677582025527954, + "learning_rate": 4.8446774001029295e-06, + "loss": 1.4583, + "step": 12658 + }, + { + "epoch": 0.6920984650711425, + "grad_norm": 1.4251391887664795, + "learning_rate": 4.843111678233042e-06, + "loss": 1.495, + "step": 12659 + }, + { + "epoch": 0.6921531375148641, + "grad_norm": 1.321510672569275, + "learning_rate": 4.841546128563754e-06, + "loss": 1.519, + "step": 12660 + }, + { + "epoch": 0.6922078099585857, + "grad_norm": 1.579886794090271, + "learning_rate": 4.8399807511473486e-06, + "loss": 1.5718, + "step": 12661 + }, + { + "epoch": 0.6922624824023071, + "grad_norm": 2.006162405014038, + "learning_rate": 4.838415546036095e-06, + "loss": 1.5482, + "step": 12662 + }, + { + "epoch": 0.6923171548460287, + "grad_norm": 1.6120859384536743, + "learning_rate": 4.836850513282253e-06, + "loss": 1.0462, + "step": 12663 + }, + { + "epoch": 0.6923718272897503, + "grad_norm": 1.924094557762146, + "learning_rate": 4.835285652938093e-06, + "loss": 1.2706, + "step": 12664 + }, + { + "epoch": 0.6924264997334718, + "grad_norm": 1.61594820022583, + "learning_rate": 4.833720965055865e-06, + "loss": 1.4422, + "step": 12665 + }, + { + "epoch": 0.6924811721771934, + "grad_norm": 1.3249280452728271, + "learning_rate": 4.832156449687812e-06, + "loss": 1.3145, + "step": 12666 + }, + { + "epoch": 0.692535844620915, + "grad_norm": 1.4477096796035767, + "learning_rate": 4.830592106886186e-06, + "loss": 1.5834, + "step": 12667 + }, + { + "epoch": 0.6925905170646365, + "grad_norm": 1.6219396591186523, + "learning_rate": 4.82902793670322e-06, + "loss": 1.5247, + "step": 12668 + }, + { + "epoch": 0.6926451895083581, + "grad_norm": 1.1024829149246216, + "learning_rate": 4.827463939191141e-06, + "loss": 1.4468, + "step": 12669 + }, + { + "epoch": 0.6926998619520796, + "grad_norm": 1.5392849445343018, + "learning_rate": 4.825900114402185e-06, + "loss": 1.5443, + "step": 12670 + }, + { + "epoch": 0.6927545343958011, + "grad_norm": 1.4061228036880493, + "learning_rate": 4.824336462388563e-06, + "loss": 1.3008, + "step": 12671 + }, + { + "epoch": 0.6928092068395227, + "grad_norm": 1.5420535802841187, + "learning_rate": 4.8227729832024914e-06, + "loss": 1.4563, + "step": 12672 + }, + { + "epoch": 0.6928638792832442, + "grad_norm": 1.4997011423110962, + "learning_rate": 4.82120967689618e-06, + "loss": 1.4427, + "step": 12673 + }, + { + "epoch": 0.6929185517269658, + "grad_norm": 1.7505507469177246, + "learning_rate": 4.819646543521825e-06, + "loss": 1.4787, + "step": 12674 + }, + { + "epoch": 0.6929732241706874, + "grad_norm": 1.889017939567566, + "learning_rate": 4.818083583131633e-06, + "loss": 1.6443, + "step": 12675 + }, + { + "epoch": 0.6930278966144089, + "grad_norm": 1.5882377624511719, + "learning_rate": 4.816520795777789e-06, + "loss": 1.5857, + "step": 12676 + }, + { + "epoch": 0.6930825690581305, + "grad_norm": 1.623258352279663, + "learning_rate": 4.8149581815124756e-06, + "loss": 1.5211, + "step": 12677 + }, + { + "epoch": 0.6931372415018521, + "grad_norm": 1.5722674131393433, + "learning_rate": 4.81339574038788e-06, + "loss": 1.6855, + "step": 12678 + }, + { + "epoch": 0.6931919139455736, + "grad_norm": 1.6019083261489868, + "learning_rate": 4.811833472456171e-06, + "loss": 1.4597, + "step": 12679 + }, + { + "epoch": 0.6932465863892951, + "grad_norm": 1.403531789779663, + "learning_rate": 4.810271377769512e-06, + "loss": 1.3048, + "step": 12680 + }, + { + "epoch": 0.6933012588330167, + "grad_norm": 1.6559206247329712, + "learning_rate": 4.808709456380075e-06, + "loss": 1.2427, + "step": 12681 + }, + { + "epoch": 0.6933559312767382, + "grad_norm": 1.6106420755386353, + "learning_rate": 4.80714770834001e-06, + "loss": 1.436, + "step": 12682 + }, + { + "epoch": 0.6934106037204598, + "grad_norm": 1.8350330591201782, + "learning_rate": 4.805586133701468e-06, + "loss": 1.3721, + "step": 12683 + }, + { + "epoch": 0.6934652761641813, + "grad_norm": 1.3318266868591309, + "learning_rate": 4.804024732516596e-06, + "loss": 1.5021, + "step": 12684 + }, + { + "epoch": 0.6935199486079029, + "grad_norm": 1.647231101989746, + "learning_rate": 4.802463504837526e-06, + "loss": 1.3784, + "step": 12685 + }, + { + "epoch": 0.6935746210516245, + "grad_norm": 1.7344506978988647, + "learning_rate": 4.8009024507163996e-06, + "loss": 1.5225, + "step": 12686 + }, + { + "epoch": 0.693629293495346, + "grad_norm": 1.3496705293655396, + "learning_rate": 4.79934157020534e-06, + "loss": 1.4096, + "step": 12687 + }, + { + "epoch": 0.6936839659390676, + "grad_norm": 1.4421358108520508, + "learning_rate": 4.797780863356466e-06, + "loss": 1.4043, + "step": 12688 + }, + { + "epoch": 0.6937386383827892, + "grad_norm": 1.3286625146865845, + "learning_rate": 4.7962203302219e-06, + "loss": 1.2246, + "step": 12689 + }, + { + "epoch": 0.6937933108265106, + "grad_norm": 1.4480130672454834, + "learning_rate": 4.7946599708537485e-06, + "loss": 1.2644, + "step": 12690 + }, + { + "epoch": 0.6938479832702322, + "grad_norm": 1.5842177867889404, + "learning_rate": 4.793099785304111e-06, + "loss": 1.5265, + "step": 12691 + }, + { + "epoch": 0.6939026557139538, + "grad_norm": 1.5651576519012451, + "learning_rate": 4.791539773625094e-06, + "loss": 1.364, + "step": 12692 + }, + { + "epoch": 0.6939573281576753, + "grad_norm": 1.6641672849655151, + "learning_rate": 4.789979935868786e-06, + "loss": 1.5688, + "step": 12693 + }, + { + "epoch": 0.6940120006013969, + "grad_norm": 1.6433922052383423, + "learning_rate": 4.788420272087271e-06, + "loss": 1.5955, + "step": 12694 + }, + { + "epoch": 0.6940666730451185, + "grad_norm": 1.385828971862793, + "learning_rate": 4.786860782332634e-06, + "loss": 1.5602, + "step": 12695 + }, + { + "epoch": 0.69412134548884, + "grad_norm": 1.465132474899292, + "learning_rate": 4.78530146665695e-06, + "loss": 1.4008, + "step": 12696 + }, + { + "epoch": 0.6941760179325616, + "grad_norm": 1.469182014465332, + "learning_rate": 4.783742325112286e-06, + "loss": 1.28, + "step": 12697 + }, + { + "epoch": 0.694230690376283, + "grad_norm": 1.3773282766342163, + "learning_rate": 4.782183357750707e-06, + "loss": 1.2505, + "step": 12698 + }, + { + "epoch": 0.6942853628200046, + "grad_norm": 1.5422922372817993, + "learning_rate": 4.780624564624265e-06, + "loss": 1.4508, + "step": 12699 + }, + { + "epoch": 0.6943400352637262, + "grad_norm": 1.4191745519638062, + "learning_rate": 4.779065945785021e-06, + "loss": 1.4418, + "step": 12700 + }, + { + "epoch": 0.6943947077074477, + "grad_norm": 1.6282323598861694, + "learning_rate": 4.777507501285016e-06, + "loss": 1.2877, + "step": 12701 + }, + { + "epoch": 0.6944493801511693, + "grad_norm": 1.8007875680923462, + "learning_rate": 4.775949231176287e-06, + "loss": 1.4858, + "step": 12702 + }, + { + "epoch": 0.6945040525948909, + "grad_norm": 1.5981236696243286, + "learning_rate": 4.7743911355108745e-06, + "loss": 1.4832, + "step": 12703 + }, + { + "epoch": 0.6945587250386124, + "grad_norm": 1.6222060918807983, + "learning_rate": 4.772833214340805e-06, + "loss": 1.4558, + "step": 12704 + }, + { + "epoch": 0.694613397482334, + "grad_norm": 1.4976918697357178, + "learning_rate": 4.771275467718096e-06, + "loss": 1.457, + "step": 12705 + }, + { + "epoch": 0.6946680699260556, + "grad_norm": 1.5352579355239868, + "learning_rate": 4.769717895694774e-06, + "loss": 1.2636, + "step": 12706 + }, + { + "epoch": 0.694722742369777, + "grad_norm": 1.4842028617858887, + "learning_rate": 4.768160498322843e-06, + "loss": 1.4025, + "step": 12707 + }, + { + "epoch": 0.6947774148134986, + "grad_norm": 1.1934443712234497, + "learning_rate": 4.766603275654308e-06, + "loss": 1.564, + "step": 12708 + }, + { + "epoch": 0.6948320872572202, + "grad_norm": 1.4067209959030151, + "learning_rate": 4.765046227741173e-06, + "loss": 1.4231, + "step": 12709 + }, + { + "epoch": 0.6948867597009417, + "grad_norm": 1.3533306121826172, + "learning_rate": 4.7634893546354275e-06, + "loss": 1.5021, + "step": 12710 + }, + { + "epoch": 0.6949414321446633, + "grad_norm": 1.452576756477356, + "learning_rate": 4.761932656389061e-06, + "loss": 1.5978, + "step": 12711 + }, + { + "epoch": 0.6949961045883848, + "grad_norm": 1.7409418821334839, + "learning_rate": 4.760376133054056e-06, + "loss": 1.3771, + "step": 12712 + }, + { + "epoch": 0.6950507770321064, + "grad_norm": 1.864794135093689, + "learning_rate": 4.7588197846823814e-06, + "loss": 1.5627, + "step": 12713 + }, + { + "epoch": 0.695105449475828, + "grad_norm": 1.2294936180114746, + "learning_rate": 4.757263611326018e-06, + "loss": 1.5782, + "step": 12714 + }, + { + "epoch": 0.6951601219195495, + "grad_norm": 1.7710022926330566, + "learning_rate": 4.755707613036925e-06, + "loss": 1.5248, + "step": 12715 + }, + { + "epoch": 0.695214794363271, + "grad_norm": 1.4016880989074707, + "learning_rate": 4.7541517898670565e-06, + "loss": 1.6573, + "step": 12716 + }, + { + "epoch": 0.6952694668069926, + "grad_norm": 1.4852845668792725, + "learning_rate": 4.7525961418683734e-06, + "loss": 1.4241, + "step": 12717 + }, + { + "epoch": 0.6953241392507141, + "grad_norm": 1.5146164894104004, + "learning_rate": 4.751040669092819e-06, + "loss": 1.3582, + "step": 12718 + }, + { + "epoch": 0.6953788116944357, + "grad_norm": 1.4575072526931763, + "learning_rate": 4.74948537159233e-06, + "loss": 1.2785, + "step": 12719 + }, + { + "epoch": 0.6954334841381573, + "grad_norm": 1.2742780447006226, + "learning_rate": 4.747930249418848e-06, + "loss": 1.5702, + "step": 12720 + }, + { + "epoch": 0.6954881565818788, + "grad_norm": 1.506352424621582, + "learning_rate": 4.7463753026243e-06, + "loss": 1.4382, + "step": 12721 + }, + { + "epoch": 0.6955428290256004, + "grad_norm": 2.258129596710205, + "learning_rate": 4.744820531260609e-06, + "loss": 1.4104, + "step": 12722 + }, + { + "epoch": 0.695597501469322, + "grad_norm": 1.4175885915756226, + "learning_rate": 4.743265935379692e-06, + "loss": 1.329, + "step": 12723 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 1.2584092617034912, + "learning_rate": 4.7417115150334606e-06, + "loss": 1.6057, + "step": 12724 + }, + { + "epoch": 0.695706846356765, + "grad_norm": 1.7065621614456177, + "learning_rate": 4.740157270273816e-06, + "loss": 1.7797, + "step": 12725 + }, + { + "epoch": 0.6957615188004865, + "grad_norm": 1.3784452676773071, + "learning_rate": 4.7386032011526674e-06, + "loss": 1.4114, + "step": 12726 + }, + { + "epoch": 0.6958161912442081, + "grad_norm": 1.6299738883972168, + "learning_rate": 4.737049307721901e-06, + "loss": 1.3975, + "step": 12727 + }, + { + "epoch": 0.6958708636879297, + "grad_norm": 1.568264365196228, + "learning_rate": 4.735495590033411e-06, + "loss": 1.3064, + "step": 12728 + }, + { + "epoch": 0.6959255361316512, + "grad_norm": 1.9341517686843872, + "learning_rate": 4.733942048139077e-06, + "loss": 1.4186, + "step": 12729 + }, + { + "epoch": 0.6959802085753728, + "grad_norm": 1.6576203107833862, + "learning_rate": 4.7323886820907715e-06, + "loss": 1.2214, + "step": 12730 + }, + { + "epoch": 0.6960348810190944, + "grad_norm": 1.375853180885315, + "learning_rate": 4.730835491940372e-06, + "loss": 1.3405, + "step": 12731 + }, + { + "epoch": 0.6960895534628159, + "grad_norm": 1.9814516305923462, + "learning_rate": 4.729282477739741e-06, + "loss": 1.6065, + "step": 12732 + }, + { + "epoch": 0.6961442259065375, + "grad_norm": 1.395311951637268, + "learning_rate": 4.7277296395407316e-06, + "loss": 1.3969, + "step": 12733 + }, + { + "epoch": 0.696198898350259, + "grad_norm": 1.6271382570266724, + "learning_rate": 4.726176977395205e-06, + "loss": 1.5192, + "step": 12734 + }, + { + "epoch": 0.6962535707939805, + "grad_norm": 1.420960783958435, + "learning_rate": 4.724624491355005e-06, + "loss": 1.7151, + "step": 12735 + }, + { + "epoch": 0.6963082432377021, + "grad_norm": 1.4915984869003296, + "learning_rate": 4.723072181471971e-06, + "loss": 1.5257, + "step": 12736 + }, + { + "epoch": 0.6963629156814237, + "grad_norm": 1.658263921737671, + "learning_rate": 4.7215200477979415e-06, + "loss": 1.3873, + "step": 12737 + }, + { + "epoch": 0.6964175881251452, + "grad_norm": 1.172123908996582, + "learning_rate": 4.719968090384743e-06, + "loss": 1.348, + "step": 12738 + }, + { + "epoch": 0.6964722605688668, + "grad_norm": 1.2196485996246338, + "learning_rate": 4.718416309284196e-06, + "loss": 1.4275, + "step": 12739 + }, + { + "epoch": 0.6965269330125883, + "grad_norm": 1.7756707668304443, + "learning_rate": 4.7168647045481264e-06, + "loss": 1.3961, + "step": 12740 + }, + { + "epoch": 0.6965816054563099, + "grad_norm": 1.1941505670547485, + "learning_rate": 4.715313276228337e-06, + "loss": 1.4812, + "step": 12741 + }, + { + "epoch": 0.6966362779000315, + "grad_norm": 1.9657379388809204, + "learning_rate": 4.7137620243766425e-06, + "loss": 1.4571, + "step": 12742 + }, + { + "epoch": 0.696690950343753, + "grad_norm": 1.551171898841858, + "learning_rate": 4.712210949044839e-06, + "loss": 1.4744, + "step": 12743 + }, + { + "epoch": 0.6967456227874745, + "grad_norm": 1.3117817640304565, + "learning_rate": 4.710660050284716e-06, + "loss": 1.2828, + "step": 12744 + }, + { + "epoch": 0.6968002952311961, + "grad_norm": 1.3347671031951904, + "learning_rate": 4.709109328148069e-06, + "loss": 1.5819, + "step": 12745 + }, + { + "epoch": 0.6968549676749176, + "grad_norm": 2.0066497325897217, + "learning_rate": 4.707558782686677e-06, + "loss": 1.3399, + "step": 12746 + }, + { + "epoch": 0.6969096401186392, + "grad_norm": 1.8572089672088623, + "learning_rate": 4.7060084139523135e-06, + "loss": 1.4434, + "step": 12747 + }, + { + "epoch": 0.6969643125623608, + "grad_norm": 1.6588354110717773, + "learning_rate": 4.704458221996755e-06, + "loss": 1.3763, + "step": 12748 + }, + { + "epoch": 0.6970189850060823, + "grad_norm": 1.5079933404922485, + "learning_rate": 4.702908206871763e-06, + "loss": 1.5742, + "step": 12749 + }, + { + "epoch": 0.6970736574498039, + "grad_norm": 1.5456907749176025, + "learning_rate": 4.701358368629095e-06, + "loss": 1.2344, + "step": 12750 + }, + { + "epoch": 0.6971283298935255, + "grad_norm": 1.6749314069747925, + "learning_rate": 4.699808707320506e-06, + "loss": 1.3455, + "step": 12751 + }, + { + "epoch": 0.697183002337247, + "grad_norm": 1.4537371397018433, + "learning_rate": 4.6982592229977405e-06, + "loss": 1.5496, + "step": 12752 + }, + { + "epoch": 0.6972376747809685, + "grad_norm": 1.4397616386413574, + "learning_rate": 4.6967099157125384e-06, + "loss": 1.4129, + "step": 12753 + }, + { + "epoch": 0.69729234722469, + "grad_norm": 1.315933108329773, + "learning_rate": 4.695160785516639e-06, + "loss": 1.5584, + "step": 12754 + }, + { + "epoch": 0.6973470196684116, + "grad_norm": 1.4992375373840332, + "learning_rate": 4.693611832461766e-06, + "loss": 1.5619, + "step": 12755 + }, + { + "epoch": 0.6974016921121332, + "grad_norm": 1.5331978797912598, + "learning_rate": 4.6920630565996495e-06, + "loss": 1.3237, + "step": 12756 + }, + { + "epoch": 0.6974563645558547, + "grad_norm": 1.606038212776184, + "learning_rate": 4.690514457982003e-06, + "loss": 1.6084, + "step": 12757 + }, + { + "epoch": 0.6975110369995763, + "grad_norm": 1.4854283332824707, + "learning_rate": 4.688966036660534e-06, + "loss": 1.3185, + "step": 12758 + }, + { + "epoch": 0.6975657094432979, + "grad_norm": 1.8012793064117432, + "learning_rate": 4.687417792686954e-06, + "loss": 1.4333, + "step": 12759 + }, + { + "epoch": 0.6976203818870194, + "grad_norm": 1.6739463806152344, + "learning_rate": 4.685869726112963e-06, + "loss": 1.3819, + "step": 12760 + }, + { + "epoch": 0.697675054330741, + "grad_norm": 1.5769010782241821, + "learning_rate": 4.684321836990251e-06, + "loss": 1.5143, + "step": 12761 + }, + { + "epoch": 0.6977297267744625, + "grad_norm": 1.2654963731765747, + "learning_rate": 4.682774125370506e-06, + "loss": 1.5276, + "step": 12762 + }, + { + "epoch": 0.697784399218184, + "grad_norm": 1.7288378477096558, + "learning_rate": 4.68122659130541e-06, + "loss": 1.45, + "step": 12763 + }, + { + "epoch": 0.6978390716619056, + "grad_norm": 1.9219619035720825, + "learning_rate": 4.679679234846636e-06, + "loss": 1.6246, + "step": 12764 + }, + { + "epoch": 0.6978937441056272, + "grad_norm": 1.585184931755066, + "learning_rate": 4.67813205604586e-06, + "loss": 1.6446, + "step": 12765 + }, + { + "epoch": 0.6979484165493487, + "grad_norm": 1.289135456085205, + "learning_rate": 4.676585054954743e-06, + "loss": 1.4671, + "step": 12766 + }, + { + "epoch": 0.6980030889930703, + "grad_norm": 1.6477755308151245, + "learning_rate": 4.675038231624939e-06, + "loss": 1.5437, + "step": 12767 + }, + { + "epoch": 0.6980577614367918, + "grad_norm": 1.4778488874435425, + "learning_rate": 4.673491586108108e-06, + "loss": 1.3995, + "step": 12768 + }, + { + "epoch": 0.6981124338805134, + "grad_norm": 1.351542592048645, + "learning_rate": 4.671945118455891e-06, + "loss": 1.5261, + "step": 12769 + }, + { + "epoch": 0.698167106324235, + "grad_norm": 1.6234935522079468, + "learning_rate": 4.670398828719926e-06, + "loss": 1.342, + "step": 12770 + }, + { + "epoch": 0.6982217787679564, + "grad_norm": 1.6295742988586426, + "learning_rate": 4.668852716951854e-06, + "loss": 1.5437, + "step": 12771 + }, + { + "epoch": 0.698276451211678, + "grad_norm": 1.6802703142166138, + "learning_rate": 4.667306783203296e-06, + "loss": 1.5198, + "step": 12772 + }, + { + "epoch": 0.6983311236553996, + "grad_norm": 1.6314786672592163, + "learning_rate": 4.6657610275258826e-06, + "loss": 1.6006, + "step": 12773 + }, + { + "epoch": 0.6983857960991211, + "grad_norm": 1.641888976097107, + "learning_rate": 4.664215449971225e-06, + "loss": 1.3319, + "step": 12774 + }, + { + "epoch": 0.6984404685428427, + "grad_norm": 1.767582654953003, + "learning_rate": 4.662670050590934e-06, + "loss": 1.2838, + "step": 12775 + }, + { + "epoch": 0.6984951409865643, + "grad_norm": 1.3774205446243286, + "learning_rate": 4.661124829436615e-06, + "loss": 1.3776, + "step": 12776 + }, + { + "epoch": 0.6985498134302858, + "grad_norm": 1.5055075883865356, + "learning_rate": 4.6595797865598655e-06, + "loss": 1.5225, + "step": 12777 + }, + { + "epoch": 0.6986044858740074, + "grad_norm": 1.8088098764419556, + "learning_rate": 4.658034922012276e-06, + "loss": 1.423, + "step": 12778 + }, + { + "epoch": 0.698659158317729, + "grad_norm": 1.4998942613601685, + "learning_rate": 4.656490235845438e-06, + "loss": 1.2316, + "step": 12779 + }, + { + "epoch": 0.6987138307614504, + "grad_norm": 1.454359769821167, + "learning_rate": 4.654945728110931e-06, + "loss": 1.3824, + "step": 12780 + }, + { + "epoch": 0.698768503205172, + "grad_norm": 1.8272379636764526, + "learning_rate": 4.653401398860324e-06, + "loss": 1.4143, + "step": 12781 + }, + { + "epoch": 0.6988231756488935, + "grad_norm": 1.649082899093628, + "learning_rate": 4.651857248145195e-06, + "loss": 1.3863, + "step": 12782 + }, + { + "epoch": 0.6988778480926151, + "grad_norm": 1.65567147731781, + "learning_rate": 4.650313276017102e-06, + "loss": 1.5582, + "step": 12783 + }, + { + "epoch": 0.6989325205363367, + "grad_norm": 1.4615367650985718, + "learning_rate": 4.6487694825275985e-06, + "loss": 1.4188, + "step": 12784 + }, + { + "epoch": 0.6989871929800582, + "grad_norm": 1.7950690984725952, + "learning_rate": 4.6472258677282434e-06, + "loss": 1.4954, + "step": 12785 + }, + { + "epoch": 0.6990418654237798, + "grad_norm": 1.7068135738372803, + "learning_rate": 4.645682431670573e-06, + "loss": 1.2895, + "step": 12786 + }, + { + "epoch": 0.6990965378675014, + "grad_norm": 1.708926796913147, + "learning_rate": 4.644139174406134e-06, + "loss": 1.4127, + "step": 12787 + }, + { + "epoch": 0.6991512103112228, + "grad_norm": 1.4926165342330933, + "learning_rate": 4.6425960959864556e-06, + "loss": 1.4467, + "step": 12788 + }, + { + "epoch": 0.6992058827549444, + "grad_norm": 1.4822641611099243, + "learning_rate": 4.6410531964630665e-06, + "loss": 1.5523, + "step": 12789 + }, + { + "epoch": 0.699260555198666, + "grad_norm": 1.7070993185043335, + "learning_rate": 4.639510475887486e-06, + "loss": 1.4044, + "step": 12790 + }, + { + "epoch": 0.6993152276423875, + "grad_norm": 1.6028661727905273, + "learning_rate": 4.637967934311228e-06, + "loss": 1.2605, + "step": 12791 + }, + { + "epoch": 0.6993699000861091, + "grad_norm": 1.8172000646591187, + "learning_rate": 4.636425571785801e-06, + "loss": 1.3256, + "step": 12792 + }, + { + "epoch": 0.6994245725298307, + "grad_norm": 1.8331480026245117, + "learning_rate": 4.634883388362712e-06, + "loss": 1.4137, + "step": 12793 + }, + { + "epoch": 0.6994792449735522, + "grad_norm": 2.1707701683044434, + "learning_rate": 4.633341384093459e-06, + "loss": 1.2992, + "step": 12794 + }, + { + "epoch": 0.6995339174172738, + "grad_norm": 1.342433214187622, + "learning_rate": 4.631799559029524e-06, + "loss": 1.3677, + "step": 12795 + }, + { + "epoch": 0.6995885898609953, + "grad_norm": 3.2339601516723633, + "learning_rate": 4.630257913222403e-06, + "loss": 1.2887, + "step": 12796 + }, + { + "epoch": 0.6996432623047169, + "grad_norm": 1.730796456336975, + "learning_rate": 4.628716446723572e-06, + "loss": 1.6009, + "step": 12797 + }, + { + "epoch": 0.6996979347484384, + "grad_norm": 1.4840508699417114, + "learning_rate": 4.627175159584498e-06, + "loss": 1.5279, + "step": 12798 + }, + { + "epoch": 0.6997526071921599, + "grad_norm": 2.1150944232940674, + "learning_rate": 4.625634051856658e-06, + "loss": 1.4924, + "step": 12799 + }, + { + "epoch": 0.6998072796358815, + "grad_norm": 1.6726810932159424, + "learning_rate": 4.624093123591508e-06, + "loss": 1.3928, + "step": 12800 + }, + { + "epoch": 0.6998619520796031, + "grad_norm": 1.077768087387085, + "learning_rate": 4.622552374840503e-06, + "loss": 1.563, + "step": 12801 + }, + { + "epoch": 0.6999166245233246, + "grad_norm": 1.6539678573608398, + "learning_rate": 4.621011805655093e-06, + "loss": 1.4737, + "step": 12802 + }, + { + "epoch": 0.6999712969670462, + "grad_norm": 1.5098016262054443, + "learning_rate": 4.619471416086717e-06, + "loss": 1.5711, + "step": 12803 + }, + { + "epoch": 0.7000259694107678, + "grad_norm": 1.732584834098816, + "learning_rate": 4.617931206186821e-06, + "loss": 1.4279, + "step": 12804 + }, + { + "epoch": 0.7000806418544893, + "grad_norm": 1.5985156297683716, + "learning_rate": 4.61639117600683e-06, + "loss": 1.4263, + "step": 12805 + }, + { + "epoch": 0.7001353142982109, + "grad_norm": 1.4623563289642334, + "learning_rate": 4.614851325598168e-06, + "loss": 1.313, + "step": 12806 + }, + { + "epoch": 0.7001899867419324, + "grad_norm": 1.753533959388733, + "learning_rate": 4.61331165501226e-06, + "loss": 1.3977, + "step": 12807 + }, + { + "epoch": 0.7002446591856539, + "grad_norm": 1.5061750411987305, + "learning_rate": 4.611772164300516e-06, + "loss": 1.5461, + "step": 12808 + }, + { + "epoch": 0.7002993316293755, + "grad_norm": 1.5512281656265259, + "learning_rate": 4.61023285351434e-06, + "loss": 1.5021, + "step": 12809 + }, + { + "epoch": 0.700354004073097, + "grad_norm": 1.7730941772460938, + "learning_rate": 4.608693722705141e-06, + "loss": 1.6244, + "step": 12810 + }, + { + "epoch": 0.7004086765168186, + "grad_norm": 1.0849899053573608, + "learning_rate": 4.6071547719243095e-06, + "loss": 1.4303, + "step": 12811 + }, + { + "epoch": 0.7004633489605402, + "grad_norm": 1.5399394035339355, + "learning_rate": 4.60561600122323e-06, + "loss": 1.2976, + "step": 12812 + }, + { + "epoch": 0.7005180214042617, + "grad_norm": 1.5059103965759277, + "learning_rate": 4.604077410653295e-06, + "loss": 1.2769, + "step": 12813 + }, + { + "epoch": 0.7005726938479833, + "grad_norm": 1.3924773931503296, + "learning_rate": 4.6025390002658764e-06, + "loss": 1.2956, + "step": 12814 + }, + { + "epoch": 0.7006273662917049, + "grad_norm": 1.233941674232483, + "learning_rate": 4.601000770112347e-06, + "loss": 1.3797, + "step": 12815 + }, + { + "epoch": 0.7006820387354263, + "grad_norm": 1.9337557554244995, + "learning_rate": 4.599462720244071e-06, + "loss": 1.4692, + "step": 12816 + }, + { + "epoch": 0.7007367111791479, + "grad_norm": 1.424131155014038, + "learning_rate": 4.597924850712403e-06, + "loss": 1.3655, + "step": 12817 + }, + { + "epoch": 0.7007913836228695, + "grad_norm": 1.9045718908309937, + "learning_rate": 4.596387161568705e-06, + "loss": 1.2588, + "step": 12818 + }, + { + "epoch": 0.700846056066591, + "grad_norm": 1.6521188020706177, + "learning_rate": 4.59484965286432e-06, + "loss": 1.4394, + "step": 12819 + }, + { + "epoch": 0.7009007285103126, + "grad_norm": 1.657516598701477, + "learning_rate": 4.593312324650584e-06, + "loss": 1.3154, + "step": 12820 + }, + { + "epoch": 0.7009554009540342, + "grad_norm": 1.2962887287139893, + "learning_rate": 4.591775176978841e-06, + "loss": 1.2626, + "step": 12821 + }, + { + "epoch": 0.7010100733977557, + "grad_norm": 2.1760671138763428, + "learning_rate": 4.590238209900416e-06, + "loss": 1.7366, + "step": 12822 + }, + { + "epoch": 0.7010647458414773, + "grad_norm": 1.454704999923706, + "learning_rate": 4.5887014234666275e-06, + "loss": 1.7344, + "step": 12823 + }, + { + "epoch": 0.7011194182851987, + "grad_norm": 1.2842885255813599, + "learning_rate": 4.5871648177287995e-06, + "loss": 1.3887, + "step": 12824 + }, + { + "epoch": 0.7011740907289203, + "grad_norm": 1.588799238204956, + "learning_rate": 4.58562839273824e-06, + "loss": 1.3404, + "step": 12825 + }, + { + "epoch": 0.7012287631726419, + "grad_norm": 1.727037787437439, + "learning_rate": 4.584092148546254e-06, + "loss": 1.1925, + "step": 12826 + }, + { + "epoch": 0.7012834356163634, + "grad_norm": 1.562083125114441, + "learning_rate": 4.582556085204141e-06, + "loss": 1.4054, + "step": 12827 + }, + { + "epoch": 0.701338108060085, + "grad_norm": 1.4136327505111694, + "learning_rate": 4.581020202763188e-06, + "loss": 1.4373, + "step": 12828 + }, + { + "epoch": 0.7013927805038066, + "grad_norm": 1.7450313568115234, + "learning_rate": 4.579484501274691e-06, + "loss": 0.9877, + "step": 12829 + }, + { + "epoch": 0.7014474529475281, + "grad_norm": 2.212801694869995, + "learning_rate": 4.577948980789924e-06, + "loss": 1.656, + "step": 12830 + }, + { + "epoch": 0.7015021253912497, + "grad_norm": 1.5656367540359497, + "learning_rate": 4.576413641360162e-06, + "loss": 1.6637, + "step": 12831 + }, + { + "epoch": 0.7015567978349713, + "grad_norm": 1.3380321264266968, + "learning_rate": 4.574878483036679e-06, + "loss": 1.6063, + "step": 12832 + }, + { + "epoch": 0.7016114702786928, + "grad_norm": 1.9652694463729858, + "learning_rate": 4.573343505870733e-06, + "loss": 1.6181, + "step": 12833 + }, + { + "epoch": 0.7016661427224143, + "grad_norm": 1.6711831092834473, + "learning_rate": 4.571808709913578e-06, + "loss": 1.4471, + "step": 12834 + }, + { + "epoch": 0.7017208151661359, + "grad_norm": 1.284828782081604, + "learning_rate": 4.570274095216472e-06, + "loss": 1.5319, + "step": 12835 + }, + { + "epoch": 0.7017754876098574, + "grad_norm": 2.136709451675415, + "learning_rate": 4.5687396618306545e-06, + "loss": 1.4624, + "step": 12836 + }, + { + "epoch": 0.701830160053579, + "grad_norm": 1.5037399530410767, + "learning_rate": 4.567205409807362e-06, + "loss": 1.4903, + "step": 12837 + }, + { + "epoch": 0.7018848324973005, + "grad_norm": 1.4261701107025146, + "learning_rate": 4.565671339197831e-06, + "loss": 1.4635, + "step": 12838 + }, + { + "epoch": 0.7019395049410221, + "grad_norm": 1.6739546060562134, + "learning_rate": 4.564137450053288e-06, + "loss": 1.416, + "step": 12839 + }, + { + "epoch": 0.7019941773847437, + "grad_norm": 1.5129213333129883, + "learning_rate": 4.56260374242495e-06, + "loss": 1.4269, + "step": 12840 + }, + { + "epoch": 0.7020488498284652, + "grad_norm": 1.5201034545898438, + "learning_rate": 4.561070216364033e-06, + "loss": 1.2954, + "step": 12841 + }, + { + "epoch": 0.7021035222721868, + "grad_norm": 1.7477734088897705, + "learning_rate": 4.55953687192174e-06, + "loss": 1.6035, + "step": 12842 + }, + { + "epoch": 0.7021581947159083, + "grad_norm": 1.1896088123321533, + "learning_rate": 4.558003709149282e-06, + "loss": 1.4234, + "step": 12843 + }, + { + "epoch": 0.7022128671596298, + "grad_norm": 1.4253374338150024, + "learning_rate": 4.556470728097849e-06, + "loss": 1.5226, + "step": 12844 + }, + { + "epoch": 0.7022675396033514, + "grad_norm": 2.075150966644287, + "learning_rate": 4.55493792881863e-06, + "loss": 1.223, + "step": 12845 + }, + { + "epoch": 0.702322212047073, + "grad_norm": 1.497116208076477, + "learning_rate": 4.553405311362813e-06, + "loss": 1.3263, + "step": 12846 + }, + { + "epoch": 0.7023768844907945, + "grad_norm": 1.7001134157180786, + "learning_rate": 4.551872875781575e-06, + "loss": 1.707, + "step": 12847 + }, + { + "epoch": 0.7024315569345161, + "grad_norm": 1.6696988344192505, + "learning_rate": 4.5503406221260805e-06, + "loss": 1.3047, + "step": 12848 + }, + { + "epoch": 0.7024862293782377, + "grad_norm": 1.752549409866333, + "learning_rate": 4.548808550447505e-06, + "loss": 1.425, + "step": 12849 + }, + { + "epoch": 0.7025409018219592, + "grad_norm": 1.4377905130386353, + "learning_rate": 4.547276660797003e-06, + "loss": 1.3418, + "step": 12850 + }, + { + "epoch": 0.7025955742656808, + "grad_norm": 1.4900670051574707, + "learning_rate": 4.545744953225726e-06, + "loss": 1.4721, + "step": 12851 + }, + { + "epoch": 0.7026502467094023, + "grad_norm": 1.4956704378128052, + "learning_rate": 4.544213427784827e-06, + "loss": 1.505, + "step": 12852 + }, + { + "epoch": 0.7027049191531238, + "grad_norm": 1.771844744682312, + "learning_rate": 4.5426820845254446e-06, + "loss": 1.4721, + "step": 12853 + }, + { + "epoch": 0.7027595915968454, + "grad_norm": 1.5225837230682373, + "learning_rate": 4.5411509234987124e-06, + "loss": 1.4675, + "step": 12854 + }, + { + "epoch": 0.7028142640405669, + "grad_norm": 1.4871007204055786, + "learning_rate": 4.539619944755762e-06, + "loss": 1.5926, + "step": 12855 + }, + { + "epoch": 0.7028689364842885, + "grad_norm": 1.4185019731521606, + "learning_rate": 4.538089148347709e-06, + "loss": 1.3819, + "step": 12856 + }, + { + "epoch": 0.7029236089280101, + "grad_norm": 1.6115297079086304, + "learning_rate": 4.536558534325681e-06, + "loss": 1.4283, + "step": 12857 + }, + { + "epoch": 0.7029782813717316, + "grad_norm": 1.6240023374557495, + "learning_rate": 4.535028102740785e-06, + "loss": 1.3762, + "step": 12858 + }, + { + "epoch": 0.7030329538154532, + "grad_norm": 1.6770113706588745, + "learning_rate": 4.533497853644119e-06, + "loss": 1.294, + "step": 12859 + }, + { + "epoch": 0.7030876262591748, + "grad_norm": 1.4888219833374023, + "learning_rate": 4.531967787086791e-06, + "loss": 1.5811, + "step": 12860 + }, + { + "epoch": 0.7031422987028962, + "grad_norm": 2.031970500946045, + "learning_rate": 4.5304379031198906e-06, + "loss": 1.6176, + "step": 12861 + }, + { + "epoch": 0.7031969711466178, + "grad_norm": 1.9096643924713135, + "learning_rate": 4.5289082017944995e-06, + "loss": 1.5419, + "step": 12862 + }, + { + "epoch": 0.7032516435903394, + "grad_norm": 1.6132360696792603, + "learning_rate": 4.527378683161706e-06, + "loss": 1.3068, + "step": 12863 + }, + { + "epoch": 0.7033063160340609, + "grad_norm": 1.7716622352600098, + "learning_rate": 4.52584934727258e-06, + "loss": 1.5259, + "step": 12864 + }, + { + "epoch": 0.7033609884777825, + "grad_norm": 2.019867420196533, + "learning_rate": 4.524320194178189e-06, + "loss": 1.2179, + "step": 12865 + }, + { + "epoch": 0.7034156609215041, + "grad_norm": 1.4101358652114868, + "learning_rate": 4.522791223929597e-06, + "loss": 1.4353, + "step": 12866 + }, + { + "epoch": 0.7034703333652256, + "grad_norm": 1.7506592273712158, + "learning_rate": 4.521262436577858e-06, + "loss": 1.5059, + "step": 12867 + }, + { + "epoch": 0.7035250058089472, + "grad_norm": 1.567944049835205, + "learning_rate": 4.519733832174018e-06, + "loss": 1.6784, + "step": 12868 + }, + { + "epoch": 0.7035796782526686, + "grad_norm": 1.4948999881744385, + "learning_rate": 4.51820541076913e-06, + "loss": 1.2926, + "step": 12869 + }, + { + "epoch": 0.7036343506963902, + "grad_norm": 1.26522696018219, + "learning_rate": 4.516677172414223e-06, + "loss": 1.3788, + "step": 12870 + }, + { + "epoch": 0.7036890231401118, + "grad_norm": 1.5112308263778687, + "learning_rate": 4.515149117160335e-06, + "loss": 1.4248, + "step": 12871 + }, + { + "epoch": 0.7037436955838333, + "grad_norm": 1.5024608373641968, + "learning_rate": 4.5136212450584895e-06, + "loss": 1.3577, + "step": 12872 + }, + { + "epoch": 0.7037983680275549, + "grad_norm": 1.9768320322036743, + "learning_rate": 4.512093556159702e-06, + "loss": 1.3961, + "step": 12873 + }, + { + "epoch": 0.7038530404712765, + "grad_norm": 1.4511011838912964, + "learning_rate": 4.510566050514991e-06, + "loss": 1.5148, + "step": 12874 + }, + { + "epoch": 0.703907712914998, + "grad_norm": 1.7314435243606567, + "learning_rate": 4.50903872817536e-06, + "loss": 1.1869, + "step": 12875 + }, + { + "epoch": 0.7039623853587196, + "grad_norm": 1.4318537712097168, + "learning_rate": 4.507511589191809e-06, + "loss": 1.3446, + "step": 12876 + }, + { + "epoch": 0.7040170578024412, + "grad_norm": 1.3669557571411133, + "learning_rate": 4.505984633615337e-06, + "loss": 1.569, + "step": 12877 + }, + { + "epoch": 0.7040717302461627, + "grad_norm": 1.5526646375656128, + "learning_rate": 4.504457861496931e-06, + "loss": 1.4148, + "step": 12878 + }, + { + "epoch": 0.7041264026898842, + "grad_norm": 1.5454978942871094, + "learning_rate": 4.502931272887572e-06, + "loss": 1.4147, + "step": 12879 + }, + { + "epoch": 0.7041810751336058, + "grad_norm": 1.210624098777771, + "learning_rate": 4.501404867838237e-06, + "loss": 1.4969, + "step": 12880 + }, + { + "epoch": 0.7042357475773273, + "grad_norm": 1.4563344717025757, + "learning_rate": 4.499878646399897e-06, + "loss": 1.5362, + "step": 12881 + }, + { + "epoch": 0.7042904200210489, + "grad_norm": 1.6843472719192505, + "learning_rate": 4.498352608623511e-06, + "loss": 1.269, + "step": 12882 + }, + { + "epoch": 0.7043450924647704, + "grad_norm": 1.5844159126281738, + "learning_rate": 4.496826754560046e-06, + "loss": 1.3668, + "step": 12883 + }, + { + "epoch": 0.704399764908492, + "grad_norm": 1.394875407218933, + "learning_rate": 4.495301084260444e-06, + "loss": 1.603, + "step": 12884 + }, + { + "epoch": 0.7044544373522136, + "grad_norm": 1.328480839729309, + "learning_rate": 4.493775597775661e-06, + "loss": 1.4412, + "step": 12885 + }, + { + "epoch": 0.7045091097959351, + "grad_norm": 1.6264392137527466, + "learning_rate": 4.492250295156632e-06, + "loss": 1.5075, + "step": 12886 + }, + { + "epoch": 0.7045637822396567, + "grad_norm": 1.5264647006988525, + "learning_rate": 4.490725176454285e-06, + "loss": 1.3958, + "step": 12887 + }, + { + "epoch": 0.7046184546833782, + "grad_norm": 2.2871804237365723, + "learning_rate": 4.489200241719556e-06, + "loss": 1.3908, + "step": 12888 + }, + { + "epoch": 0.7046731271270997, + "grad_norm": 1.358971357345581, + "learning_rate": 4.487675491003362e-06, + "loss": 1.4242, + "step": 12889 + }, + { + "epoch": 0.7047277995708213, + "grad_norm": 1.3740888833999634, + "learning_rate": 4.486150924356617e-06, + "loss": 1.4887, + "step": 12890 + }, + { + "epoch": 0.7047824720145429, + "grad_norm": 1.5941041707992554, + "learning_rate": 4.484626541830234e-06, + "loss": 1.4285, + "step": 12891 + }, + { + "epoch": 0.7048371444582644, + "grad_norm": 1.6498936414718628, + "learning_rate": 4.483102343475112e-06, + "loss": 1.446, + "step": 12892 + }, + { + "epoch": 0.704891816901986, + "grad_norm": 1.5961031913757324, + "learning_rate": 4.481578329342149e-06, + "loss": 1.3832, + "step": 12893 + }, + { + "epoch": 0.7049464893457076, + "grad_norm": 1.7931760549545288, + "learning_rate": 4.480054499482236e-06, + "loss": 1.5031, + "step": 12894 + }, + { + "epoch": 0.7050011617894291, + "grad_norm": 1.9497168064117432, + "learning_rate": 4.478530853946255e-06, + "loss": 1.539, + "step": 12895 + }, + { + "epoch": 0.7050558342331507, + "grad_norm": 1.6695899963378906, + "learning_rate": 4.477007392785082e-06, + "loss": 1.3259, + "step": 12896 + }, + { + "epoch": 0.7051105066768721, + "grad_norm": 1.5557962656021118, + "learning_rate": 4.475484116049596e-06, + "loss": 1.4895, + "step": 12897 + }, + { + "epoch": 0.7051651791205937, + "grad_norm": 1.307645559310913, + "learning_rate": 4.473961023790655e-06, + "loss": 1.5545, + "step": 12898 + }, + { + "epoch": 0.7052198515643153, + "grad_norm": 1.2790794372558594, + "learning_rate": 4.472438116059127e-06, + "loss": 1.6614, + "step": 12899 + }, + { + "epoch": 0.7052745240080368, + "grad_norm": 1.5140466690063477, + "learning_rate": 4.470915392905862e-06, + "loss": 1.4553, + "step": 12900 + }, + { + "epoch": 0.7053291964517584, + "grad_norm": 2.4920172691345215, + "learning_rate": 4.4693928543817e-06, + "loss": 1.2214, + "step": 12901 + }, + { + "epoch": 0.70538386889548, + "grad_norm": 1.4254724979400635, + "learning_rate": 4.467870500537494e-06, + "loss": 1.3573, + "step": 12902 + }, + { + "epoch": 0.7054385413392015, + "grad_norm": 1.7793240547180176, + "learning_rate": 4.466348331424074e-06, + "loss": 1.4403, + "step": 12903 + }, + { + "epoch": 0.7054932137829231, + "grad_norm": 1.6970627307891846, + "learning_rate": 4.464826347092267e-06, + "loss": 1.6322, + "step": 12904 + }, + { + "epoch": 0.7055478862266447, + "grad_norm": 1.4493855237960815, + "learning_rate": 4.463304547592898e-06, + "loss": 1.5033, + "step": 12905 + }, + { + "epoch": 0.7056025586703661, + "grad_norm": 1.5087233781814575, + "learning_rate": 4.461782932976783e-06, + "loss": 1.3802, + "step": 12906 + }, + { + "epoch": 0.7056572311140877, + "grad_norm": 1.3987061977386475, + "learning_rate": 4.460261503294726e-06, + "loss": 1.6405, + "step": 12907 + }, + { + "epoch": 0.7057119035578093, + "grad_norm": 1.4084324836730957, + "learning_rate": 4.458740258597541e-06, + "loss": 1.3791, + "step": 12908 + }, + { + "epoch": 0.7057665760015308, + "grad_norm": 1.4280177354812622, + "learning_rate": 4.457219198936024e-06, + "loss": 1.4358, + "step": 12909 + }, + { + "epoch": 0.7058212484452524, + "grad_norm": 1.687828540802002, + "learning_rate": 4.455698324360959e-06, + "loss": 1.189, + "step": 12910 + }, + { + "epoch": 0.7058759208889739, + "grad_norm": 1.5117398500442505, + "learning_rate": 4.45417763492314e-06, + "loss": 1.5711, + "step": 12911 + }, + { + "epoch": 0.7059305933326955, + "grad_norm": 1.2836486101150513, + "learning_rate": 4.452657130673341e-06, + "loss": 1.3474, + "step": 12912 + }, + { + "epoch": 0.7059852657764171, + "grad_norm": 1.5888032913208008, + "learning_rate": 4.451136811662341e-06, + "loss": 1.4407, + "step": 12913 + }, + { + "epoch": 0.7060399382201386, + "grad_norm": 1.3698831796646118, + "learning_rate": 4.449616677940904e-06, + "loss": 1.5501, + "step": 12914 + }, + { + "epoch": 0.7060946106638601, + "grad_norm": 1.5493707656860352, + "learning_rate": 4.4480967295597856e-06, + "loss": 1.2854, + "step": 12915 + }, + { + "epoch": 0.7061492831075817, + "grad_norm": 1.5142940282821655, + "learning_rate": 4.44657696656975e-06, + "loss": 1.8118, + "step": 12916 + }, + { + "epoch": 0.7062039555513032, + "grad_norm": 1.591781735420227, + "learning_rate": 4.445057389021541e-06, + "loss": 1.4314, + "step": 12917 + }, + { + "epoch": 0.7062586279950248, + "grad_norm": 1.2015568017959595, + "learning_rate": 4.4435379969659005e-06, + "loss": 1.4533, + "step": 12918 + }, + { + "epoch": 0.7063133004387464, + "grad_norm": 2.006171226501465, + "learning_rate": 4.442018790453566e-06, + "loss": 1.2839, + "step": 12919 + }, + { + "epoch": 0.7063679728824679, + "grad_norm": 1.5386333465576172, + "learning_rate": 4.440499769535265e-06, + "loss": 1.2772, + "step": 12920 + }, + { + "epoch": 0.7064226453261895, + "grad_norm": 1.3312956094741821, + "learning_rate": 4.4389809342617195e-06, + "loss": 1.5938, + "step": 12921 + }, + { + "epoch": 0.7064773177699111, + "grad_norm": 1.3359707593917847, + "learning_rate": 4.437462284683653e-06, + "loss": 1.572, + "step": 12922 + }, + { + "epoch": 0.7065319902136326, + "grad_norm": 1.6615196466445923, + "learning_rate": 4.435943820851775e-06, + "loss": 1.4716, + "step": 12923 + }, + { + "epoch": 0.7065866626573541, + "grad_norm": 1.474050760269165, + "learning_rate": 4.4344255428167845e-06, + "loss": 1.6239, + "step": 12924 + }, + { + "epoch": 0.7066413351010756, + "grad_norm": 1.5492665767669678, + "learning_rate": 4.432907450629389e-06, + "loss": 1.2243, + "step": 12925 + }, + { + "epoch": 0.7066960075447972, + "grad_norm": 1.1581937074661255, + "learning_rate": 4.431389544340273e-06, + "loss": 1.5001, + "step": 12926 + }, + { + "epoch": 0.7067506799885188, + "grad_norm": 1.4539544582366943, + "learning_rate": 4.429871824000133e-06, + "loss": 1.4203, + "step": 12927 + }, + { + "epoch": 0.7068053524322403, + "grad_norm": 1.6264464855194092, + "learning_rate": 4.428354289659641e-06, + "loss": 1.6997, + "step": 12928 + }, + { + "epoch": 0.7068600248759619, + "grad_norm": 1.7715842723846436, + "learning_rate": 4.426836941369471e-06, + "loss": 1.6156, + "step": 12929 + }, + { + "epoch": 0.7069146973196835, + "grad_norm": 1.6552990674972534, + "learning_rate": 4.425319779180297e-06, + "loss": 1.3817, + "step": 12930 + }, + { + "epoch": 0.706969369763405, + "grad_norm": 1.5315821170806885, + "learning_rate": 4.423802803142777e-06, + "loss": 1.5269, + "step": 12931 + }, + { + "epoch": 0.7070240422071266, + "grad_norm": 1.7853665351867676, + "learning_rate": 4.422286013307567e-06, + "loss": 1.5456, + "step": 12932 + }, + { + "epoch": 0.7070787146508482, + "grad_norm": 1.6901237964630127, + "learning_rate": 4.420769409725315e-06, + "loss": 1.5735, + "step": 12933 + }, + { + "epoch": 0.7071333870945696, + "grad_norm": 1.6945050954818726, + "learning_rate": 4.419252992446664e-06, + "loss": 1.5001, + "step": 12934 + }, + { + "epoch": 0.7071880595382912, + "grad_norm": 1.837979793548584, + "learning_rate": 4.417736761522249e-06, + "loss": 1.35, + "step": 12935 + }, + { + "epoch": 0.7072427319820128, + "grad_norm": 1.7616536617279053, + "learning_rate": 4.416220717002705e-06, + "loss": 1.6013, + "step": 12936 + }, + { + "epoch": 0.7072974044257343, + "grad_norm": 1.9417585134506226, + "learning_rate": 4.414704858938655e-06, + "loss": 1.3122, + "step": 12937 + }, + { + "epoch": 0.7073520768694559, + "grad_norm": 1.948093295097351, + "learning_rate": 4.413189187380711e-06, + "loss": 1.5553, + "step": 12938 + }, + { + "epoch": 0.7074067493131774, + "grad_norm": 1.567394495010376, + "learning_rate": 4.411673702379495e-06, + "loss": 1.4214, + "step": 12939 + }, + { + "epoch": 0.707461421756899, + "grad_norm": 1.6060680150985718, + "learning_rate": 4.410158403985603e-06, + "loss": 1.4983, + "step": 12940 + }, + { + "epoch": 0.7075160942006206, + "grad_norm": 1.6097133159637451, + "learning_rate": 4.408643292249642e-06, + "loss": 1.4121, + "step": 12941 + }, + { + "epoch": 0.707570766644342, + "grad_norm": 1.6217098236083984, + "learning_rate": 4.407128367222203e-06, + "loss": 1.3148, + "step": 12942 + }, + { + "epoch": 0.7076254390880636, + "grad_norm": 1.5170092582702637, + "learning_rate": 4.405613628953871e-06, + "loss": 1.2789, + "step": 12943 + }, + { + "epoch": 0.7076801115317852, + "grad_norm": 1.710203766822815, + "learning_rate": 4.404099077495229e-06, + "loss": 1.4072, + "step": 12944 + }, + { + "epoch": 0.7077347839755067, + "grad_norm": 1.5601989030838013, + "learning_rate": 4.402584712896849e-06, + "loss": 1.6031, + "step": 12945 + }, + { + "epoch": 0.7077894564192283, + "grad_norm": 1.6413261890411377, + "learning_rate": 4.401070535209296e-06, + "loss": 1.2978, + "step": 12946 + }, + { + "epoch": 0.7078441288629499, + "grad_norm": 1.5429537296295166, + "learning_rate": 4.399556544483141e-06, + "loss": 1.5515, + "step": 12947 + }, + { + "epoch": 0.7078988013066714, + "grad_norm": 1.2166706323623657, + "learning_rate": 4.3980427407689345e-06, + "loss": 1.4919, + "step": 12948 + }, + { + "epoch": 0.707953473750393, + "grad_norm": 1.1911661624908447, + "learning_rate": 4.396529124117223e-06, + "loss": 1.4643, + "step": 12949 + }, + { + "epoch": 0.7080081461941146, + "grad_norm": 1.4363203048706055, + "learning_rate": 4.395015694578555e-06, + "loss": 1.4776, + "step": 12950 + }, + { + "epoch": 0.708062818637836, + "grad_norm": 1.5858259201049805, + "learning_rate": 4.393502452203466e-06, + "loss": 1.6602, + "step": 12951 + }, + { + "epoch": 0.7081174910815576, + "grad_norm": 1.7620230913162231, + "learning_rate": 4.391989397042485e-06, + "loss": 1.5646, + "step": 12952 + }, + { + "epoch": 0.7081721635252791, + "grad_norm": 1.9839156866073608, + "learning_rate": 4.390476529146138e-06, + "loss": 1.5583, + "step": 12953 + }, + { + "epoch": 0.7082268359690007, + "grad_norm": 1.1476612091064453, + "learning_rate": 4.388963848564941e-06, + "loss": 1.7006, + "step": 12954 + }, + { + "epoch": 0.7082815084127223, + "grad_norm": 1.3882067203521729, + "learning_rate": 4.38745135534941e-06, + "loss": 1.4257, + "step": 12955 + }, + { + "epoch": 0.7083361808564438, + "grad_norm": 1.4029903411865234, + "learning_rate": 4.38593904955005e-06, + "loss": 1.4477, + "step": 12956 + }, + { + "epoch": 0.7083908533001654, + "grad_norm": 1.3943058252334595, + "learning_rate": 4.384426931217359e-06, + "loss": 1.4215, + "step": 12957 + }, + { + "epoch": 0.708445525743887, + "grad_norm": 1.3697532415390015, + "learning_rate": 4.382915000401829e-06, + "loss": 1.533, + "step": 12958 + }, + { + "epoch": 0.7085001981876085, + "grad_norm": 1.8219515085220337, + "learning_rate": 4.381403257153949e-06, + "loss": 1.5053, + "step": 12959 + }, + { + "epoch": 0.70855487063133, + "grad_norm": 1.4486627578735352, + "learning_rate": 4.3798917015241944e-06, + "loss": 1.3893, + "step": 12960 + }, + { + "epoch": 0.7086095430750516, + "grad_norm": 1.5712175369262695, + "learning_rate": 4.378380333563048e-06, + "loss": 1.5869, + "step": 12961 + }, + { + "epoch": 0.7086642155187731, + "grad_norm": 1.958151936531067, + "learning_rate": 4.376869153320974e-06, + "loss": 1.0916, + "step": 12962 + }, + { + "epoch": 0.7087188879624947, + "grad_norm": 1.775438666343689, + "learning_rate": 4.37535816084843e-06, + "loss": 1.4808, + "step": 12963 + }, + { + "epoch": 0.7087735604062163, + "grad_norm": 1.341709852218628, + "learning_rate": 4.3738473561958795e-06, + "loss": 1.5355, + "step": 12964 + }, + { + "epoch": 0.7088282328499378, + "grad_norm": 1.6148803234100342, + "learning_rate": 4.372336739413767e-06, + "loss": 1.4175, + "step": 12965 + }, + { + "epoch": 0.7088829052936594, + "grad_norm": 1.4778910875320435, + "learning_rate": 4.370826310552534e-06, + "loss": 1.2662, + "step": 12966 + }, + { + "epoch": 0.7089375777373809, + "grad_norm": 1.3950328826904297, + "learning_rate": 4.369316069662623e-06, + "loss": 1.4103, + "step": 12967 + }, + { + "epoch": 0.7089922501811025, + "grad_norm": 1.6018787622451782, + "learning_rate": 4.367806016794458e-06, + "loss": 1.34, + "step": 12968 + }, + { + "epoch": 0.709046922624824, + "grad_norm": 1.3516024351119995, + "learning_rate": 4.366296151998469e-06, + "loss": 1.598, + "step": 12969 + }, + { + "epoch": 0.7091015950685455, + "grad_norm": 1.3927080631256104, + "learning_rate": 4.364786475325072e-06, + "loss": 1.3026, + "step": 12970 + }, + { + "epoch": 0.7091562675122671, + "grad_norm": 1.6653802394866943, + "learning_rate": 4.3632769868246784e-06, + "loss": 1.2656, + "step": 12971 + }, + { + "epoch": 0.7092109399559887, + "grad_norm": 1.5981247425079346, + "learning_rate": 4.361767686547693e-06, + "loss": 1.3667, + "step": 12972 + }, + { + "epoch": 0.7092656123997102, + "grad_norm": 1.752989411354065, + "learning_rate": 4.360258574544516e-06, + "loss": 1.6259, + "step": 12973 + }, + { + "epoch": 0.7093202848434318, + "grad_norm": 1.4216065406799316, + "learning_rate": 4.358749650865534e-06, + "loss": 1.3367, + "step": 12974 + }, + { + "epoch": 0.7093749572871534, + "grad_norm": 1.7686960697174072, + "learning_rate": 4.357240915561143e-06, + "loss": 1.1997, + "step": 12975 + }, + { + "epoch": 0.7094296297308749, + "grad_norm": 1.4834049940109253, + "learning_rate": 4.3557323686817185e-06, + "loss": 1.3898, + "step": 12976 + }, + { + "epoch": 0.7094843021745965, + "grad_norm": 1.5059834718704224, + "learning_rate": 4.354224010277632e-06, + "loss": 1.3822, + "step": 12977 + }, + { + "epoch": 0.709538974618318, + "grad_norm": 1.3274904489517212, + "learning_rate": 4.352715840399257e-06, + "loss": 1.4731, + "step": 12978 + }, + { + "epoch": 0.7095936470620395, + "grad_norm": 1.4497863054275513, + "learning_rate": 4.351207859096953e-06, + "loss": 1.471, + "step": 12979 + }, + { + "epoch": 0.7096483195057611, + "grad_norm": 1.4287337064743042, + "learning_rate": 4.3497000664210695e-06, + "loss": 1.39, + "step": 12980 + }, + { + "epoch": 0.7097029919494826, + "grad_norm": 2.1960341930389404, + "learning_rate": 4.348192462421963e-06, + "loss": 1.3542, + "step": 12981 + }, + { + "epoch": 0.7097576643932042, + "grad_norm": 1.4774131774902344, + "learning_rate": 4.346685047149973e-06, + "loss": 1.4414, + "step": 12982 + }, + { + "epoch": 0.7098123368369258, + "grad_norm": 1.5984281301498413, + "learning_rate": 4.345177820655435e-06, + "loss": 1.5017, + "step": 12983 + }, + { + "epoch": 0.7098670092806473, + "grad_norm": 1.5812193155288696, + "learning_rate": 4.343670782988679e-06, + "loss": 1.4541, + "step": 12984 + }, + { + "epoch": 0.7099216817243689, + "grad_norm": 1.7226642370224, + "learning_rate": 4.3421639342000255e-06, + "loss": 1.5432, + "step": 12985 + }, + { + "epoch": 0.7099763541680905, + "grad_norm": 1.5129210948944092, + "learning_rate": 4.340657274339798e-06, + "loss": 1.4057, + "step": 12986 + }, + { + "epoch": 0.7100310266118119, + "grad_norm": 1.1849305629730225, + "learning_rate": 4.339150803458304e-06, + "loss": 1.3943, + "step": 12987 + }, + { + "epoch": 0.7100856990555335, + "grad_norm": 3.243652105331421, + "learning_rate": 4.337644521605845e-06, + "loss": 1.5629, + "step": 12988 + }, + { + "epoch": 0.7101403714992551, + "grad_norm": 2.1412746906280518, + "learning_rate": 4.336138428832726e-06, + "loss": 1.563, + "step": 12989 + }, + { + "epoch": 0.7101950439429766, + "grad_norm": 1.859395146369934, + "learning_rate": 4.334632525189235e-06, + "loss": 1.5317, + "step": 12990 + }, + { + "epoch": 0.7102497163866982, + "grad_norm": 1.3824663162231445, + "learning_rate": 4.333126810725655e-06, + "loss": 1.4385, + "step": 12991 + }, + { + "epoch": 0.7103043888304198, + "grad_norm": 1.3918031454086304, + "learning_rate": 4.331621285492272e-06, + "loss": 1.4142, + "step": 12992 + }, + { + "epoch": 0.7103590612741413, + "grad_norm": 1.7692904472351074, + "learning_rate": 4.330115949539356e-06, + "loss": 1.5128, + "step": 12993 + }, + { + "epoch": 0.7104137337178629, + "grad_norm": 1.557433843612671, + "learning_rate": 4.328610802917169e-06, + "loss": 1.5472, + "step": 12994 + }, + { + "epoch": 0.7104684061615844, + "grad_norm": 1.9421236515045166, + "learning_rate": 4.327105845675979e-06, + "loss": 1.469, + "step": 12995 + }, + { + "epoch": 0.710523078605306, + "grad_norm": 1.3019322156906128, + "learning_rate": 4.325601077866039e-06, + "loss": 1.5296, + "step": 12996 + }, + { + "epoch": 0.7105777510490275, + "grad_norm": 1.3424067497253418, + "learning_rate": 4.324096499537592e-06, + "loss": 1.3654, + "step": 12997 + }, + { + "epoch": 0.710632423492749, + "grad_norm": 1.5225361585617065, + "learning_rate": 4.322592110740882e-06, + "loss": 1.4944, + "step": 12998 + }, + { + "epoch": 0.7106870959364706, + "grad_norm": 1.7345889806747437, + "learning_rate": 4.32108791152614e-06, + "loss": 1.6405, + "step": 12999 + }, + { + "epoch": 0.7107417683801922, + "grad_norm": 1.6142139434814453, + "learning_rate": 4.319583901943604e-06, + "loss": 1.5287, + "step": 13000 + }, + { + "epoch": 0.7107964408239137, + "grad_norm": 1.4534788131713867, + "learning_rate": 4.31808008204349e-06, + "loss": 1.5763, + "step": 13001 + }, + { + "epoch": 0.7108511132676353, + "grad_norm": 1.49476158618927, + "learning_rate": 4.316576451876011e-06, + "loss": 1.4028, + "step": 13002 + }, + { + "epoch": 0.7109057857113569, + "grad_norm": 1.6443703174591064, + "learning_rate": 4.315073011491385e-06, + "loss": 1.3914, + "step": 13003 + }, + { + "epoch": 0.7109604581550784, + "grad_norm": 2.0742807388305664, + "learning_rate": 4.313569760939811e-06, + "loss": 1.3154, + "step": 13004 + }, + { + "epoch": 0.7110151305988, + "grad_norm": 1.4675012826919556, + "learning_rate": 4.312066700271483e-06, + "loss": 1.4094, + "step": 13005 + }, + { + "epoch": 0.7110698030425215, + "grad_norm": 1.589579463005066, + "learning_rate": 4.310563829536598e-06, + "loss": 1.5082, + "step": 13006 + }, + { + "epoch": 0.711124475486243, + "grad_norm": 1.4249401092529297, + "learning_rate": 4.3090611487853385e-06, + "loss": 1.7036, + "step": 13007 + }, + { + "epoch": 0.7111791479299646, + "grad_norm": 1.4608813524246216, + "learning_rate": 4.307558658067878e-06, + "loss": 1.6861, + "step": 13008 + }, + { + "epoch": 0.7112338203736861, + "grad_norm": 1.315579891204834, + "learning_rate": 4.306056357434394e-06, + "loss": 1.4381, + "step": 13009 + }, + { + "epoch": 0.7112884928174077, + "grad_norm": 1.489118218421936, + "learning_rate": 4.3045542469350495e-06, + "loss": 1.4507, + "step": 13010 + }, + { + "epoch": 0.7113431652611293, + "grad_norm": 1.102532982826233, + "learning_rate": 4.303052326620004e-06, + "loss": 1.7267, + "step": 13011 + }, + { + "epoch": 0.7113978377048508, + "grad_norm": 1.408752202987671, + "learning_rate": 4.30155059653941e-06, + "loss": 1.3833, + "step": 13012 + }, + { + "epoch": 0.7114525101485724, + "grad_norm": 1.5758739709854126, + "learning_rate": 4.300049056743409e-06, + "loss": 1.271, + "step": 13013 + }, + { + "epoch": 0.711507182592294, + "grad_norm": 1.3237169981002808, + "learning_rate": 4.298547707282149e-06, + "loss": 1.5551, + "step": 13014 + }, + { + "epoch": 0.7115618550360154, + "grad_norm": 1.6325396299362183, + "learning_rate": 4.297046548205761e-06, + "loss": 1.4191, + "step": 13015 + }, + { + "epoch": 0.711616527479737, + "grad_norm": 1.9627538919448853, + "learning_rate": 4.295545579564366e-06, + "loss": 1.6491, + "step": 13016 + }, + { + "epoch": 0.7116711999234586, + "grad_norm": 1.4996213912963867, + "learning_rate": 4.294044801408095e-06, + "loss": 1.3433, + "step": 13017 + }, + { + "epoch": 0.7117258723671801, + "grad_norm": 1.5440014600753784, + "learning_rate": 4.292544213787056e-06, + "loss": 1.4118, + "step": 13018 + }, + { + "epoch": 0.7117805448109017, + "grad_norm": 1.4390836954116821, + "learning_rate": 4.291043816751357e-06, + "loss": 1.4518, + "step": 13019 + }, + { + "epoch": 0.7118352172546233, + "grad_norm": 1.4466055631637573, + "learning_rate": 4.289543610351104e-06, + "loss": 1.1866, + "step": 13020 + }, + { + "epoch": 0.7118898896983448, + "grad_norm": 1.5917415618896484, + "learning_rate": 4.288043594636389e-06, + "loss": 1.4942, + "step": 13021 + }, + { + "epoch": 0.7119445621420664, + "grad_norm": 1.876228928565979, + "learning_rate": 4.2865437696573046e-06, + "loss": 1.5864, + "step": 13022 + }, + { + "epoch": 0.7119992345857878, + "grad_norm": 1.545379400253296, + "learning_rate": 4.285044135463929e-06, + "loss": 1.5407, + "step": 13023 + }, + { + "epoch": 0.7120539070295094, + "grad_norm": 1.2899113893508911, + "learning_rate": 4.283544692106336e-06, + "loss": 1.3438, + "step": 13024 + }, + { + "epoch": 0.712108579473231, + "grad_norm": 1.5105626583099365, + "learning_rate": 4.282045439634605e-06, + "loss": 1.485, + "step": 13025 + }, + { + "epoch": 0.7121632519169525, + "grad_norm": 1.4659075736999512, + "learning_rate": 4.280546378098792e-06, + "loss": 1.3811, + "step": 13026 + }, + { + "epoch": 0.7122179243606741, + "grad_norm": 1.5974985361099243, + "learning_rate": 4.279047507548955e-06, + "loss": 1.4553, + "step": 13027 + }, + { + "epoch": 0.7122725968043957, + "grad_norm": 2.0769615173339844, + "learning_rate": 4.27754882803515e-06, + "loss": 1.3137, + "step": 13028 + }, + { + "epoch": 0.7123272692481172, + "grad_norm": 1.6061255931854248, + "learning_rate": 4.276050339607416e-06, + "loss": 1.4725, + "step": 13029 + }, + { + "epoch": 0.7123819416918388, + "grad_norm": 1.4967142343521118, + "learning_rate": 4.274552042315791e-06, + "loss": 1.6389, + "step": 13030 + }, + { + "epoch": 0.7124366141355604, + "grad_norm": 1.4650856256484985, + "learning_rate": 4.273053936210312e-06, + "loss": 1.6382, + "step": 13031 + }, + { + "epoch": 0.7124912865792818, + "grad_norm": 1.6861554384231567, + "learning_rate": 4.271556021341e-06, + "loss": 1.5172, + "step": 13032 + }, + { + "epoch": 0.7125459590230034, + "grad_norm": 1.6943179368972778, + "learning_rate": 4.270058297757871e-06, + "loss": 1.3947, + "step": 13033 + }, + { + "epoch": 0.712600631466725, + "grad_norm": 1.929425597190857, + "learning_rate": 4.2685607655109455e-06, + "loss": 1.547, + "step": 13034 + }, + { + "epoch": 0.7126553039104465, + "grad_norm": 1.5724260807037354, + "learning_rate": 4.267063424650224e-06, + "loss": 1.534, + "step": 13035 + }, + { + "epoch": 0.7127099763541681, + "grad_norm": 1.678425908088684, + "learning_rate": 4.265566275225709e-06, + "loss": 1.4188, + "step": 13036 + }, + { + "epoch": 0.7127646487978896, + "grad_norm": 1.6603552103042603, + "learning_rate": 4.2640693172873914e-06, + "loss": 1.55, + "step": 13037 + }, + { + "epoch": 0.7128193212416112, + "grad_norm": 1.4018374681472778, + "learning_rate": 4.2625725508852555e-06, + "loss": 1.362, + "step": 13038 + }, + { + "epoch": 0.7128739936853328, + "grad_norm": 1.8132802248001099, + "learning_rate": 4.26107597606929e-06, + "loss": 1.469, + "step": 13039 + }, + { + "epoch": 0.7129286661290543, + "grad_norm": 1.7160059213638306, + "learning_rate": 4.259579592889464e-06, + "loss": 1.3298, + "step": 13040 + }, + { + "epoch": 0.7129833385727758, + "grad_norm": 1.4792706966400146, + "learning_rate": 4.258083401395742e-06, + "loss": 1.4773, + "step": 13041 + }, + { + "epoch": 0.7130380110164974, + "grad_norm": 1.2185546159744263, + "learning_rate": 4.2565874016380915e-06, + "loss": 1.3766, + "step": 13042 + }, + { + "epoch": 0.7130926834602189, + "grad_norm": 1.37496817111969, + "learning_rate": 4.255091593666466e-06, + "loss": 1.3321, + "step": 13043 + }, + { + "epoch": 0.7131473559039405, + "grad_norm": 1.4514094591140747, + "learning_rate": 4.25359597753081e-06, + "loss": 1.6343, + "step": 13044 + }, + { + "epoch": 0.7132020283476621, + "grad_norm": 1.7179484367370605, + "learning_rate": 4.252100553281072e-06, + "loss": 1.2974, + "step": 13045 + }, + { + "epoch": 0.7132567007913836, + "grad_norm": 1.4490561485290527, + "learning_rate": 4.250605320967184e-06, + "loss": 1.5397, + "step": 13046 + }, + { + "epoch": 0.7133113732351052, + "grad_norm": 3.0056540966033936, + "learning_rate": 4.249110280639076e-06, + "loss": 1.6483, + "step": 13047 + }, + { + "epoch": 0.7133660456788268, + "grad_norm": 1.7238417863845825, + "learning_rate": 4.24761543234667e-06, + "loss": 1.2261, + "step": 13048 + }, + { + "epoch": 0.7134207181225483, + "grad_norm": 1.529535174369812, + "learning_rate": 4.246120776139884e-06, + "loss": 1.22, + "step": 13049 + }, + { + "epoch": 0.7134753905662699, + "grad_norm": 1.3964096307754517, + "learning_rate": 4.244626312068623e-06, + "loss": 1.4133, + "step": 13050 + }, + { + "epoch": 0.7135300630099914, + "grad_norm": 1.5999500751495361, + "learning_rate": 4.243132040182798e-06, + "loss": 1.4877, + "step": 13051 + }, + { + "epoch": 0.7135847354537129, + "grad_norm": 1.6183964014053345, + "learning_rate": 4.241637960532301e-06, + "loss": 1.4682, + "step": 13052 + }, + { + "epoch": 0.7136394078974345, + "grad_norm": 1.9143503904342651, + "learning_rate": 4.240144073167028e-06, + "loss": 1.5208, + "step": 13053 + }, + { + "epoch": 0.713694080341156, + "grad_norm": 1.1866936683654785, + "learning_rate": 4.238650378136859e-06, + "loss": 1.4826, + "step": 13054 + }, + { + "epoch": 0.7137487527848776, + "grad_norm": 2.0880203247070312, + "learning_rate": 4.237156875491671e-06, + "loss": 1.3093, + "step": 13055 + }, + { + "epoch": 0.7138034252285992, + "grad_norm": 1.3927162885665894, + "learning_rate": 4.23566356528134e-06, + "loss": 1.3926, + "step": 13056 + }, + { + "epoch": 0.7138580976723207, + "grad_norm": 1.6762981414794922, + "learning_rate": 4.234170447555731e-06, + "loss": 1.4404, + "step": 13057 + }, + { + "epoch": 0.7139127701160423, + "grad_norm": 1.8635603189468384, + "learning_rate": 4.2326775223646965e-06, + "loss": 1.4568, + "step": 13058 + }, + { + "epoch": 0.7139674425597639, + "grad_norm": 1.7445259094238281, + "learning_rate": 4.231184789758096e-06, + "loss": 1.5339, + "step": 13059 + }, + { + "epoch": 0.7140221150034853, + "grad_norm": 1.4401590824127197, + "learning_rate": 4.229692249785773e-06, + "loss": 1.5346, + "step": 13060 + }, + { + "epoch": 0.7140767874472069, + "grad_norm": 1.4365683794021606, + "learning_rate": 4.228199902497565e-06, + "loss": 1.6659, + "step": 13061 + }, + { + "epoch": 0.7141314598909285, + "grad_norm": 1.3038069009780884, + "learning_rate": 4.2267077479433075e-06, + "loss": 1.7407, + "step": 13062 + }, + { + "epoch": 0.71418613233465, + "grad_norm": 1.461060643196106, + "learning_rate": 4.225215786172825e-06, + "loss": 1.425, + "step": 13063 + }, + { + "epoch": 0.7142408047783716, + "grad_norm": 1.7550910711288452, + "learning_rate": 4.223724017235935e-06, + "loss": 1.3383, + "step": 13064 + }, + { + "epoch": 0.7142954772220932, + "grad_norm": 1.5997791290283203, + "learning_rate": 4.222232441182459e-06, + "loss": 1.3826, + "step": 13065 + }, + { + "epoch": 0.7143501496658147, + "grad_norm": 1.4727035760879517, + "learning_rate": 4.220741058062194e-06, + "loss": 1.5289, + "step": 13066 + }, + { + "epoch": 0.7144048221095363, + "grad_norm": 1.2310770750045776, + "learning_rate": 4.219249867924953e-06, + "loss": 1.3153, + "step": 13067 + }, + { + "epoch": 0.7144594945532577, + "grad_norm": 1.7610292434692383, + "learning_rate": 4.217758870820522e-06, + "loss": 1.7336, + "step": 13068 + }, + { + "epoch": 0.7145141669969793, + "grad_norm": 1.6870814561843872, + "learning_rate": 4.216268066798687e-06, + "loss": 1.4579, + "step": 13069 + }, + { + "epoch": 0.7145688394407009, + "grad_norm": 1.1007031202316284, + "learning_rate": 4.214777455909237e-06, + "loss": 1.5156, + "step": 13070 + }, + { + "epoch": 0.7146235118844224, + "grad_norm": 1.6139649152755737, + "learning_rate": 4.213287038201943e-06, + "loss": 1.3507, + "step": 13071 + }, + { + "epoch": 0.714678184328144, + "grad_norm": 1.924335241317749, + "learning_rate": 4.211796813726571e-06, + "loss": 1.2454, + "step": 13072 + }, + { + "epoch": 0.7147328567718656, + "grad_norm": 2.2974863052368164, + "learning_rate": 4.210306782532889e-06, + "loss": 1.3927, + "step": 13073 + }, + { + "epoch": 0.7147875292155871, + "grad_norm": 1.3498876094818115, + "learning_rate": 4.208816944670649e-06, + "loss": 1.3222, + "step": 13074 + }, + { + "epoch": 0.7148422016593087, + "grad_norm": 1.6662369966506958, + "learning_rate": 4.207327300189602e-06, + "loss": 1.4494, + "step": 13075 + }, + { + "epoch": 0.7148968741030303, + "grad_norm": 1.6602647304534912, + "learning_rate": 4.205837849139488e-06, + "loss": 1.411, + "step": 13076 + }, + { + "epoch": 0.7149515465467517, + "grad_norm": 1.912451982498169, + "learning_rate": 4.204348591570046e-06, + "loss": 1.3972, + "step": 13077 + }, + { + "epoch": 0.7150062189904733, + "grad_norm": 1.3320748805999756, + "learning_rate": 4.202859527530999e-06, + "loss": 1.1371, + "step": 13078 + }, + { + "epoch": 0.7150608914341949, + "grad_norm": 1.513242244720459, + "learning_rate": 4.2013706570720815e-06, + "loss": 1.5572, + "step": 13079 + }, + { + "epoch": 0.7151155638779164, + "grad_norm": 1.5667763948440552, + "learning_rate": 4.199881980243003e-06, + "loss": 1.6004, + "step": 13080 + }, + { + "epoch": 0.715170236321638, + "grad_norm": 1.8339768648147583, + "learning_rate": 4.1983934970934725e-06, + "loss": 1.3377, + "step": 13081 + }, + { + "epoch": 0.7152249087653595, + "grad_norm": 1.9315241575241089, + "learning_rate": 4.196905207673201e-06, + "loss": 1.4224, + "step": 13082 + }, + { + "epoch": 0.7152795812090811, + "grad_norm": 1.5973113775253296, + "learning_rate": 4.195417112031878e-06, + "loss": 1.6878, + "step": 13083 + }, + { + "epoch": 0.7153342536528027, + "grad_norm": 1.9064050912857056, + "learning_rate": 4.193929210219202e-06, + "loss": 1.3187, + "step": 13084 + }, + { + "epoch": 0.7153889260965242, + "grad_norm": 1.714694857597351, + "learning_rate": 4.1924415022848545e-06, + "loss": 1.2212, + "step": 13085 + }, + { + "epoch": 0.7154435985402458, + "grad_norm": 1.4599565267562866, + "learning_rate": 4.1909539882785135e-06, + "loss": 1.4249, + "step": 13086 + }, + { + "epoch": 0.7154982709839673, + "grad_norm": 1.9110506772994995, + "learning_rate": 4.1894666682498485e-06, + "loss": 1.1638, + "step": 13087 + }, + { + "epoch": 0.7155529434276888, + "grad_norm": 1.7113866806030273, + "learning_rate": 4.187979542248528e-06, + "loss": 1.4142, + "step": 13088 + }, + { + "epoch": 0.7156076158714104, + "grad_norm": 1.6330751180648804, + "learning_rate": 4.186492610324204e-06, + "loss": 1.7798, + "step": 13089 + }, + { + "epoch": 0.715662288315132, + "grad_norm": 1.1201181411743164, + "learning_rate": 4.185005872526538e-06, + "loss": 1.561, + "step": 13090 + }, + { + "epoch": 0.7157169607588535, + "grad_norm": 1.870288372039795, + "learning_rate": 4.183519328905171e-06, + "loss": 1.5308, + "step": 13091 + }, + { + "epoch": 0.7157716332025751, + "grad_norm": 1.7754582166671753, + "learning_rate": 4.18203297950974e-06, + "loss": 1.5424, + "step": 13092 + }, + { + "epoch": 0.7158263056462967, + "grad_norm": 1.4883053302764893, + "learning_rate": 4.180546824389881e-06, + "loss": 1.6687, + "step": 13093 + }, + { + "epoch": 0.7158809780900182, + "grad_norm": 1.1054941415786743, + "learning_rate": 4.1790608635952214e-06, + "loss": 1.4493, + "step": 13094 + }, + { + "epoch": 0.7159356505337398, + "grad_norm": 1.667412519454956, + "learning_rate": 4.177575097175375e-06, + "loss": 1.5433, + "step": 13095 + }, + { + "epoch": 0.7159903229774612, + "grad_norm": 1.6451380252838135, + "learning_rate": 4.176089525179961e-06, + "loss": 1.4826, + "step": 13096 + }, + { + "epoch": 0.7160449954211828, + "grad_norm": 1.4433881044387817, + "learning_rate": 4.174604147658582e-06, + "loss": 1.5097, + "step": 13097 + }, + { + "epoch": 0.7160996678649044, + "grad_norm": 1.337777853012085, + "learning_rate": 4.173118964660844e-06, + "loss": 1.4312, + "step": 13098 + }, + { + "epoch": 0.7161543403086259, + "grad_norm": 2.0405704975128174, + "learning_rate": 4.171633976236335e-06, + "loss": 1.3909, + "step": 13099 + }, + { + "epoch": 0.7162090127523475, + "grad_norm": 1.3896489143371582, + "learning_rate": 4.1701491824346465e-06, + "loss": 1.4901, + "step": 13100 + }, + { + "epoch": 0.7162636851960691, + "grad_norm": 1.6193374395370483, + "learning_rate": 4.168664583305357e-06, + "loss": 1.2222, + "step": 13101 + }, + { + "epoch": 0.7163183576397906, + "grad_norm": 1.6393359899520874, + "learning_rate": 4.167180178898039e-06, + "loss": 1.302, + "step": 13102 + }, + { + "epoch": 0.7163730300835122, + "grad_norm": 1.602333664894104, + "learning_rate": 4.165695969262259e-06, + "loss": 1.4264, + "step": 13103 + }, + { + "epoch": 0.7164277025272338, + "grad_norm": 1.3197523355484009, + "learning_rate": 4.164211954447585e-06, + "loss": 1.4589, + "step": 13104 + }, + { + "epoch": 0.7164823749709552, + "grad_norm": 1.4450639486312866, + "learning_rate": 4.162728134503568e-06, + "loss": 1.4682, + "step": 13105 + }, + { + "epoch": 0.7165370474146768, + "grad_norm": 1.2030797004699707, + "learning_rate": 4.1612445094797515e-06, + "loss": 1.4686, + "step": 13106 + }, + { + "epoch": 0.7165917198583984, + "grad_norm": 1.5628876686096191, + "learning_rate": 4.159761079425687e-06, + "loss": 1.5558, + "step": 13107 + }, + { + "epoch": 0.7166463923021199, + "grad_norm": 1.5043610334396362, + "learning_rate": 4.158277844390904e-06, + "loss": 1.3977, + "step": 13108 + }, + { + "epoch": 0.7167010647458415, + "grad_norm": 1.262266755104065, + "learning_rate": 4.1567948044249284e-06, + "loss": 1.3544, + "step": 13109 + }, + { + "epoch": 0.716755737189563, + "grad_norm": 1.434258222579956, + "learning_rate": 4.155311959577289e-06, + "loss": 1.4884, + "step": 13110 + }, + { + "epoch": 0.7168104096332846, + "grad_norm": 1.4378879070281982, + "learning_rate": 4.153829309897494e-06, + "loss": 1.4369, + "step": 13111 + }, + { + "epoch": 0.7168650820770062, + "grad_norm": 1.5145779848098755, + "learning_rate": 4.152346855435062e-06, + "loss": 1.5987, + "step": 13112 + }, + { + "epoch": 0.7169197545207276, + "grad_norm": 1.7571951150894165, + "learning_rate": 4.15086459623949e-06, + "loss": 1.3418, + "step": 13113 + }, + { + "epoch": 0.7169744269644492, + "grad_norm": 1.7899706363677979, + "learning_rate": 4.149382532360275e-06, + "loss": 1.6822, + "step": 13114 + }, + { + "epoch": 0.7170290994081708, + "grad_norm": 1.5070120096206665, + "learning_rate": 4.147900663846904e-06, + "loss": 1.3489, + "step": 13115 + }, + { + "epoch": 0.7170837718518923, + "grad_norm": 2.1732118129730225, + "learning_rate": 4.146418990748865e-06, + "loss": 1.1033, + "step": 13116 + }, + { + "epoch": 0.7171384442956139, + "grad_norm": 1.7124661207199097, + "learning_rate": 4.144937513115627e-06, + "loss": 1.1865, + "step": 13117 + }, + { + "epoch": 0.7171931167393355, + "grad_norm": 2.7753336429595947, + "learning_rate": 4.143456230996667e-06, + "loss": 1.466, + "step": 13118 + }, + { + "epoch": 0.717247789183057, + "grad_norm": 1.2773839235305786, + "learning_rate": 4.141975144441448e-06, + "loss": 1.4271, + "step": 13119 + }, + { + "epoch": 0.7173024616267786, + "grad_norm": 1.6531552076339722, + "learning_rate": 4.140494253499421e-06, + "loss": 1.2704, + "step": 13120 + }, + { + "epoch": 0.7173571340705002, + "grad_norm": 1.4663357734680176, + "learning_rate": 4.139013558220044e-06, + "loss": 1.2905, + "step": 13121 + }, + { + "epoch": 0.7174118065142217, + "grad_norm": 1.5838379859924316, + "learning_rate": 4.137533058652758e-06, + "loss": 1.4307, + "step": 13122 + }, + { + "epoch": 0.7174664789579432, + "grad_norm": 1.4241300821304321, + "learning_rate": 4.136052754846996e-06, + "loss": 1.4895, + "step": 13123 + }, + { + "epoch": 0.7175211514016647, + "grad_norm": 1.7616939544677734, + "learning_rate": 4.134572646852196e-06, + "loss": 1.4661, + "step": 13124 + }, + { + "epoch": 0.7175758238453863, + "grad_norm": 1.4416898488998413, + "learning_rate": 4.1330927347177795e-06, + "loss": 1.4658, + "step": 13125 + }, + { + "epoch": 0.7176304962891079, + "grad_norm": 1.418319821357727, + "learning_rate": 4.1316130184931646e-06, + "loss": 1.2812, + "step": 13126 + }, + { + "epoch": 0.7176851687328294, + "grad_norm": 1.4623030424118042, + "learning_rate": 4.130133498227761e-06, + "loss": 1.3458, + "step": 13127 + }, + { + "epoch": 0.717739841176551, + "grad_norm": 1.4032970666885376, + "learning_rate": 4.12865417397097e-06, + "loss": 1.2949, + "step": 13128 + }, + { + "epoch": 0.7177945136202726, + "grad_norm": 1.8411355018615723, + "learning_rate": 4.127175045772196e-06, + "loss": 1.4792, + "step": 13129 + }, + { + "epoch": 0.7178491860639941, + "grad_norm": 1.7222555875778198, + "learning_rate": 4.125696113680831e-06, + "loss": 1.4567, + "step": 13130 + }, + { + "epoch": 0.7179038585077157, + "grad_norm": 1.5724513530731201, + "learning_rate": 4.124217377746251e-06, + "loss": 1.2335, + "step": 13131 + }, + { + "epoch": 0.7179585309514372, + "grad_norm": 1.7806034088134766, + "learning_rate": 4.122738838017845e-06, + "loss": 1.0974, + "step": 13132 + }, + { + "epoch": 0.7180132033951587, + "grad_norm": 1.5325777530670166, + "learning_rate": 4.121260494544982e-06, + "loss": 1.3988, + "step": 13133 + }, + { + "epoch": 0.7180678758388803, + "grad_norm": 1.3121658563613892, + "learning_rate": 4.1197823473770215e-06, + "loss": 1.3709, + "step": 13134 + }, + { + "epoch": 0.7181225482826019, + "grad_norm": 1.4970754384994507, + "learning_rate": 4.11830439656333e-06, + "loss": 1.581, + "step": 13135 + }, + { + "epoch": 0.7181772207263234, + "grad_norm": 1.7712581157684326, + "learning_rate": 4.116826642153256e-06, + "loss": 1.4305, + "step": 13136 + }, + { + "epoch": 0.718231893170045, + "grad_norm": 1.4211275577545166, + "learning_rate": 4.115349084196144e-06, + "loss": 1.3796, + "step": 13137 + }, + { + "epoch": 0.7182865656137665, + "grad_norm": 1.473523497581482, + "learning_rate": 4.113871722741337e-06, + "loss": 1.4355, + "step": 13138 + }, + { + "epoch": 0.7183412380574881, + "grad_norm": 1.2352180480957031, + "learning_rate": 4.1123945578381665e-06, + "loss": 1.5504, + "step": 13139 + }, + { + "epoch": 0.7183959105012097, + "grad_norm": 1.8377498388290405, + "learning_rate": 4.110917589535957e-06, + "loss": 1.3972, + "step": 13140 + }, + { + "epoch": 0.7184505829449311, + "grad_norm": 1.5324265956878662, + "learning_rate": 4.109440817884027e-06, + "loss": 1.4907, + "step": 13141 + }, + { + "epoch": 0.7185052553886527, + "grad_norm": 1.7203919887542725, + "learning_rate": 4.107964242931689e-06, + "loss": 1.2756, + "step": 13142 + }, + { + "epoch": 0.7185599278323743, + "grad_norm": 1.5754934549331665, + "learning_rate": 4.106487864728254e-06, + "loss": 1.4357, + "step": 13143 + }, + { + "epoch": 0.7186146002760958, + "grad_norm": 1.5342180728912354, + "learning_rate": 4.105011683323018e-06, + "loss": 1.2642, + "step": 13144 + }, + { + "epoch": 0.7186692727198174, + "grad_norm": 1.9565460681915283, + "learning_rate": 4.103535698765272e-06, + "loss": 1.4581, + "step": 13145 + }, + { + "epoch": 0.718723945163539, + "grad_norm": 1.547796607017517, + "learning_rate": 4.102059911104309e-06, + "loss": 1.4911, + "step": 13146 + }, + { + "epoch": 0.7187786176072605, + "grad_norm": 1.2368885278701782, + "learning_rate": 4.100584320389406e-06, + "loss": 1.6825, + "step": 13147 + }, + { + "epoch": 0.7188332900509821, + "grad_norm": 1.775264024734497, + "learning_rate": 4.099108926669832e-06, + "loss": 1.4627, + "step": 13148 + }, + { + "epoch": 0.7188879624947037, + "grad_norm": 1.3172073364257812, + "learning_rate": 4.097633729994861e-06, + "loss": 1.4331, + "step": 13149 + }, + { + "epoch": 0.7189426349384251, + "grad_norm": 1.5584205389022827, + "learning_rate": 4.096158730413751e-06, + "loss": 1.2431, + "step": 13150 + }, + { + "epoch": 0.7189973073821467, + "grad_norm": 1.4352573156356812, + "learning_rate": 4.09468392797575e-06, + "loss": 1.4676, + "step": 13151 + }, + { + "epoch": 0.7190519798258682, + "grad_norm": 1.6535203456878662, + "learning_rate": 4.093209322730114e-06, + "loss": 1.4079, + "step": 13152 + }, + { + "epoch": 0.7191066522695898, + "grad_norm": 1.7340953350067139, + "learning_rate": 4.0917349147260796e-06, + "loss": 1.3326, + "step": 13153 + }, + { + "epoch": 0.7191613247133114, + "grad_norm": 1.2692103385925293, + "learning_rate": 4.09026070401288e-06, + "loss": 1.513, + "step": 13154 + }, + { + "epoch": 0.7192159971570329, + "grad_norm": 1.62180495262146, + "learning_rate": 4.088786690639744e-06, + "loss": 1.2929, + "step": 13155 + }, + { + "epoch": 0.7192706696007545, + "grad_norm": 1.7836757898330688, + "learning_rate": 4.087312874655888e-06, + "loss": 1.4605, + "step": 13156 + }, + { + "epoch": 0.7193253420444761, + "grad_norm": 1.4645204544067383, + "learning_rate": 4.085839256110533e-06, + "loss": 1.4633, + "step": 13157 + }, + { + "epoch": 0.7193800144881976, + "grad_norm": 1.6769639253616333, + "learning_rate": 4.084365835052883e-06, + "loss": 1.2888, + "step": 13158 + }, + { + "epoch": 0.7194346869319191, + "grad_norm": 1.358956217765808, + "learning_rate": 4.082892611532136e-06, + "loss": 1.1613, + "step": 13159 + }, + { + "epoch": 0.7194893593756407, + "grad_norm": 1.634675145149231, + "learning_rate": 4.081419585597493e-06, + "loss": 1.3731, + "step": 13160 + }, + { + "epoch": 0.7195440318193622, + "grad_norm": 1.2544163465499878, + "learning_rate": 4.079946757298138e-06, + "loss": 1.5164, + "step": 13161 + }, + { + "epoch": 0.7195987042630838, + "grad_norm": 1.3161097764968872, + "learning_rate": 4.078474126683249e-06, + "loss": 1.6165, + "step": 13162 + }, + { + "epoch": 0.7196533767068054, + "grad_norm": 1.732683777809143, + "learning_rate": 4.077001693802008e-06, + "loss": 1.528, + "step": 13163 + }, + { + "epoch": 0.7197080491505269, + "grad_norm": 1.377756118774414, + "learning_rate": 4.0755294587035796e-06, + "loss": 1.7455, + "step": 13164 + }, + { + "epoch": 0.7197627215942485, + "grad_norm": 1.2353211641311646, + "learning_rate": 4.074057421437124e-06, + "loss": 1.4026, + "step": 13165 + }, + { + "epoch": 0.71981739403797, + "grad_norm": 1.6983377933502197, + "learning_rate": 4.072585582051798e-06, + "loss": 1.2575, + "step": 13166 + }, + { + "epoch": 0.7198720664816916, + "grad_norm": 1.7242224216461182, + "learning_rate": 4.071113940596744e-06, + "loss": 1.4982, + "step": 13167 + }, + { + "epoch": 0.7199267389254131, + "grad_norm": 1.6007931232452393, + "learning_rate": 4.0696424971211124e-06, + "loss": 1.2987, + "step": 13168 + }, + { + "epoch": 0.7199814113691346, + "grad_norm": 1.6487853527069092, + "learning_rate": 4.068171251674033e-06, + "loss": 1.444, + "step": 13169 + }, + { + "epoch": 0.7200360838128562, + "grad_norm": 1.3656079769134521, + "learning_rate": 4.066700204304631e-06, + "loss": 1.3822, + "step": 13170 + }, + { + "epoch": 0.7200907562565778, + "grad_norm": 1.5105732679367065, + "learning_rate": 4.065229355062037e-06, + "loss": 1.4315, + "step": 13171 + }, + { + "epoch": 0.7201454287002993, + "grad_norm": 1.5570576190948486, + "learning_rate": 4.063758703995361e-06, + "loss": 1.5912, + "step": 13172 + }, + { + "epoch": 0.7202001011440209, + "grad_norm": 1.369338035583496, + "learning_rate": 4.0622882511537076e-06, + "loss": 1.5845, + "step": 13173 + }, + { + "epoch": 0.7202547735877425, + "grad_norm": 1.497483730316162, + "learning_rate": 4.060817996586186e-06, + "loss": 1.4202, + "step": 13174 + }, + { + "epoch": 0.720309446031464, + "grad_norm": 1.589689016342163, + "learning_rate": 4.059347940341889e-06, + "loss": 1.4669, + "step": 13175 + }, + { + "epoch": 0.7203641184751856, + "grad_norm": 1.5141345262527466, + "learning_rate": 4.0578780824699005e-06, + "loss": 1.3892, + "step": 13176 + }, + { + "epoch": 0.7204187909189071, + "grad_norm": 1.4961721897125244, + "learning_rate": 4.05640842301931e-06, + "loss": 1.4829, + "step": 13177 + }, + { + "epoch": 0.7204734633626286, + "grad_norm": 1.8889962434768677, + "learning_rate": 4.05493896203919e-06, + "loss": 1.5334, + "step": 13178 + }, + { + "epoch": 0.7205281358063502, + "grad_norm": 1.7908579111099243, + "learning_rate": 4.053469699578608e-06, + "loss": 1.2685, + "step": 13179 + }, + { + "epoch": 0.7205828082500717, + "grad_norm": 1.701077938079834, + "learning_rate": 4.052000635686627e-06, + "loss": 1.5105, + "step": 13180 + }, + { + "epoch": 0.7206374806937933, + "grad_norm": 1.491957664489746, + "learning_rate": 4.050531770412298e-06, + "loss": 1.5799, + "step": 13181 + }, + { + "epoch": 0.7206921531375149, + "grad_norm": 1.5207282304763794, + "learning_rate": 4.049063103804678e-06, + "loss": 1.5397, + "step": 13182 + }, + { + "epoch": 0.7207468255812364, + "grad_norm": 1.5553051233291626, + "learning_rate": 4.047594635912805e-06, + "loss": 1.3587, + "step": 13183 + }, + { + "epoch": 0.720801498024958, + "grad_norm": 1.5758029222488403, + "learning_rate": 4.046126366785712e-06, + "loss": 1.588, + "step": 13184 + }, + { + "epoch": 0.7208561704686796, + "grad_norm": 1.563637614250183, + "learning_rate": 4.044658296472433e-06, + "loss": 1.2143, + "step": 13185 + }, + { + "epoch": 0.720910842912401, + "grad_norm": 1.6326706409454346, + "learning_rate": 4.04319042502199e-06, + "loss": 1.3736, + "step": 13186 + }, + { + "epoch": 0.7209655153561226, + "grad_norm": 1.631207823753357, + "learning_rate": 4.0417227524833925e-06, + "loss": 1.2328, + "step": 13187 + }, + { + "epoch": 0.7210201877998442, + "grad_norm": 1.5135536193847656, + "learning_rate": 4.040255278905657e-06, + "loss": 1.4598, + "step": 13188 + }, + { + "epoch": 0.7210748602435657, + "grad_norm": 1.466257095336914, + "learning_rate": 4.038788004337783e-06, + "loss": 1.7472, + "step": 13189 + }, + { + "epoch": 0.7211295326872873, + "grad_norm": 1.254534363746643, + "learning_rate": 4.0373209288287616e-06, + "loss": 1.5377, + "step": 13190 + }, + { + "epoch": 0.7211842051310089, + "grad_norm": 1.4372141361236572, + "learning_rate": 4.035854052427591e-06, + "loss": 1.4504, + "step": 13191 + }, + { + "epoch": 0.7212388775747304, + "grad_norm": 1.2069599628448486, + "learning_rate": 4.034387375183248e-06, + "loss": 1.4292, + "step": 13192 + }, + { + "epoch": 0.721293550018452, + "grad_norm": 1.7290300130844116, + "learning_rate": 4.0329208971447095e-06, + "loss": 1.4532, + "step": 13193 + }, + { + "epoch": 0.7213482224621734, + "grad_norm": 1.5022491216659546, + "learning_rate": 4.031454618360945e-06, + "loss": 1.3534, + "step": 13194 + }, + { + "epoch": 0.721402894905895, + "grad_norm": 1.6861615180969238, + "learning_rate": 4.029988538880913e-06, + "loss": 1.3244, + "step": 13195 + }, + { + "epoch": 0.7214575673496166, + "grad_norm": 2.160780191421509, + "learning_rate": 4.028522658753575e-06, + "loss": 1.133, + "step": 13196 + }, + { + "epoch": 0.7215122397933381, + "grad_norm": 1.2841484546661377, + "learning_rate": 4.027056978027879e-06, + "loss": 1.4209, + "step": 13197 + }, + { + "epoch": 0.7215669122370597, + "grad_norm": 1.6649326086044312, + "learning_rate": 4.025591496752763e-06, + "loss": 1.4348, + "step": 13198 + }, + { + "epoch": 0.7216215846807813, + "grad_norm": 1.6449629068374634, + "learning_rate": 4.024126214977169e-06, + "loss": 1.4851, + "step": 13199 + }, + { + "epoch": 0.7216762571245028, + "grad_norm": 1.5922504663467407, + "learning_rate": 4.0226611327500245e-06, + "loss": 1.5879, + "step": 13200 + }, + { + "epoch": 0.7217309295682244, + "grad_norm": 1.607558012008667, + "learning_rate": 4.021196250120248e-06, + "loss": 1.4818, + "step": 13201 + }, + { + "epoch": 0.721785602011946, + "grad_norm": 1.695650577545166, + "learning_rate": 4.01973156713676e-06, + "loss": 1.6343, + "step": 13202 + }, + { + "epoch": 0.7218402744556675, + "grad_norm": 1.5108736753463745, + "learning_rate": 4.018267083848468e-06, + "loss": 1.3971, + "step": 13203 + }, + { + "epoch": 0.721894946899389, + "grad_norm": 1.5815892219543457, + "learning_rate": 4.016802800304277e-06, + "loss": 1.2031, + "step": 13204 + }, + { + "epoch": 0.7219496193431106, + "grad_norm": 1.8956847190856934, + "learning_rate": 4.015338716553079e-06, + "loss": 1.4153, + "step": 13205 + }, + { + "epoch": 0.7220042917868321, + "grad_norm": 1.5043123960494995, + "learning_rate": 4.0138748326437645e-06, + "loss": 1.5309, + "step": 13206 + }, + { + "epoch": 0.7220589642305537, + "grad_norm": 1.804561734199524, + "learning_rate": 4.012411148625214e-06, + "loss": 1.425, + "step": 13207 + }, + { + "epoch": 0.7221136366742752, + "grad_norm": 1.3492985963821411, + "learning_rate": 4.0109476645463076e-06, + "loss": 1.3431, + "step": 13208 + }, + { + "epoch": 0.7221683091179968, + "grad_norm": 1.6286214590072632, + "learning_rate": 4.0094843804559095e-06, + "loss": 1.6574, + "step": 13209 + }, + { + "epoch": 0.7222229815617184, + "grad_norm": 1.3531147241592407, + "learning_rate": 4.0080212964028884e-06, + "loss": 1.2839, + "step": 13210 + }, + { + "epoch": 0.7222776540054399, + "grad_norm": 1.724473237991333, + "learning_rate": 4.006558412436098e-06, + "loss": 1.1338, + "step": 13211 + }, + { + "epoch": 0.7223323264491615, + "grad_norm": 1.5505741834640503, + "learning_rate": 4.0050957286043815e-06, + "loss": 1.354, + "step": 13212 + }, + { + "epoch": 0.722386998892883, + "grad_norm": 1.4027820825576782, + "learning_rate": 4.00363324495659e-06, + "loss": 1.4669, + "step": 13213 + }, + { + "epoch": 0.7224416713366045, + "grad_norm": 1.6405876874923706, + "learning_rate": 4.002170961541555e-06, + "loss": 1.5737, + "step": 13214 + }, + { + "epoch": 0.7224963437803261, + "grad_norm": 1.7624998092651367, + "learning_rate": 4.000708878408103e-06, + "loss": 1.2826, + "step": 13215 + }, + { + "epoch": 0.7225510162240477, + "grad_norm": 1.6795177459716797, + "learning_rate": 3.999246995605063e-06, + "loss": 1.3346, + "step": 13216 + }, + { + "epoch": 0.7226056886677692, + "grad_norm": 1.6271684169769287, + "learning_rate": 3.997785313181246e-06, + "loss": 1.5201, + "step": 13217 + }, + { + "epoch": 0.7226603611114908, + "grad_norm": 1.4044464826583862, + "learning_rate": 3.996323831185462e-06, + "loss": 1.3397, + "step": 13218 + }, + { + "epoch": 0.7227150335552124, + "grad_norm": 1.5254098176956177, + "learning_rate": 3.994862549666515e-06, + "loss": 1.4977, + "step": 13219 + }, + { + "epoch": 0.7227697059989339, + "grad_norm": 1.3587467670440674, + "learning_rate": 3.9934014686731985e-06, + "loss": 1.4762, + "step": 13220 + }, + { + "epoch": 0.7228243784426555, + "grad_norm": 1.5278100967407227, + "learning_rate": 3.991940588254297e-06, + "loss": 1.4584, + "step": 13221 + }, + { + "epoch": 0.7228790508863769, + "grad_norm": 1.7369399070739746, + "learning_rate": 3.990479908458602e-06, + "loss": 1.3148, + "step": 13222 + }, + { + "epoch": 0.7229337233300985, + "grad_norm": 1.2591503858566284, + "learning_rate": 3.989019429334881e-06, + "loss": 1.4476, + "step": 13223 + }, + { + "epoch": 0.7229883957738201, + "grad_norm": 1.6355817317962646, + "learning_rate": 3.98755915093191e-06, + "loss": 1.4084, + "step": 13224 + }, + { + "epoch": 0.7230430682175416, + "grad_norm": 1.4381383657455444, + "learning_rate": 3.986099073298447e-06, + "loss": 1.4608, + "step": 13225 + }, + { + "epoch": 0.7230977406612632, + "grad_norm": 1.3325529098510742, + "learning_rate": 3.984639196483245e-06, + "loss": 1.702, + "step": 13226 + }, + { + "epoch": 0.7231524131049848, + "grad_norm": 1.4100496768951416, + "learning_rate": 3.98317952053506e-06, + "loss": 1.4121, + "step": 13227 + }, + { + "epoch": 0.7232070855487063, + "grad_norm": 1.57686448097229, + "learning_rate": 3.9817200455026295e-06, + "loss": 1.4459, + "step": 13228 + }, + { + "epoch": 0.7232617579924279, + "grad_norm": 1.238291621208191, + "learning_rate": 3.980260771434685e-06, + "loss": 1.4257, + "step": 13229 + }, + { + "epoch": 0.7233164304361495, + "grad_norm": 1.3632584810256958, + "learning_rate": 3.978801698379963e-06, + "loss": 1.5828, + "step": 13230 + }, + { + "epoch": 0.7233711028798709, + "grad_norm": 1.6205557584762573, + "learning_rate": 3.977342826387181e-06, + "loss": 1.461, + "step": 13231 + }, + { + "epoch": 0.7234257753235925, + "grad_norm": 1.4810141324996948, + "learning_rate": 3.975884155505054e-06, + "loss": 1.6028, + "step": 13232 + }, + { + "epoch": 0.7234804477673141, + "grad_norm": 1.6033811569213867, + "learning_rate": 3.974425685782293e-06, + "loss": 1.4187, + "step": 13233 + }, + { + "epoch": 0.7235351202110356, + "grad_norm": 1.2976518869400024, + "learning_rate": 3.972967417267596e-06, + "loss": 1.3094, + "step": 13234 + }, + { + "epoch": 0.7235897926547572, + "grad_norm": 1.2179251909255981, + "learning_rate": 3.971509350009657e-06, + "loss": 1.3426, + "step": 13235 + }, + { + "epoch": 0.7236444650984787, + "grad_norm": 1.2430006265640259, + "learning_rate": 3.970051484057171e-06, + "loss": 1.6174, + "step": 13236 + }, + { + "epoch": 0.7236991375422003, + "grad_norm": 1.466780185699463, + "learning_rate": 3.968593819458812e-06, + "loss": 1.3331, + "step": 13237 + }, + { + "epoch": 0.7237538099859219, + "grad_norm": 1.8210694789886475, + "learning_rate": 3.967136356263261e-06, + "loss": 1.5597, + "step": 13238 + }, + { + "epoch": 0.7238084824296434, + "grad_norm": 1.6749870777130127, + "learning_rate": 3.965679094519184e-06, + "loss": 1.3736, + "step": 13239 + }, + { + "epoch": 0.723863154873365, + "grad_norm": 1.5878028869628906, + "learning_rate": 3.964222034275239e-06, + "loss": 1.2956, + "step": 13240 + }, + { + "epoch": 0.7239178273170865, + "grad_norm": 1.381609559059143, + "learning_rate": 3.962765175580088e-06, + "loss": 1.491, + "step": 13241 + }, + { + "epoch": 0.723972499760808, + "grad_norm": 2.3088037967681885, + "learning_rate": 3.961308518482373e-06, + "loss": 1.3673, + "step": 13242 + }, + { + "epoch": 0.7240271722045296, + "grad_norm": 1.4180841445922852, + "learning_rate": 3.959852063030738e-06, + "loss": 1.3694, + "step": 13243 + }, + { + "epoch": 0.7240818446482512, + "grad_norm": 1.703084945678711, + "learning_rate": 3.958395809273815e-06, + "loss": 1.393, + "step": 13244 + }, + { + "epoch": 0.7241365170919727, + "grad_norm": 1.4072864055633545, + "learning_rate": 3.956939757260234e-06, + "loss": 1.5372, + "step": 13245 + }, + { + "epoch": 0.7241911895356943, + "grad_norm": 1.6140722036361694, + "learning_rate": 3.955483907038612e-06, + "loss": 1.4835, + "step": 13246 + }, + { + "epoch": 0.7242458619794159, + "grad_norm": 1.4482288360595703, + "learning_rate": 3.954028258657568e-06, + "loss": 1.4636, + "step": 13247 + }, + { + "epoch": 0.7243005344231374, + "grad_norm": 1.4765735864639282, + "learning_rate": 3.952572812165709e-06, + "loss": 1.3166, + "step": 13248 + }, + { + "epoch": 0.724355206866859, + "grad_norm": 1.6076658964157104, + "learning_rate": 3.951117567611631e-06, + "loss": 1.5768, + "step": 13249 + }, + { + "epoch": 0.7244098793105805, + "grad_norm": 1.6140544414520264, + "learning_rate": 3.949662525043935e-06, + "loss": 1.4175, + "step": 13250 + }, + { + "epoch": 0.724464551754302, + "grad_norm": 1.6033962965011597, + "learning_rate": 3.9482076845112006e-06, + "loss": 1.7085, + "step": 13251 + }, + { + "epoch": 0.7245192241980236, + "grad_norm": 1.3474370241165161, + "learning_rate": 3.946753046062017e-06, + "loss": 1.516, + "step": 13252 + }, + { + "epoch": 0.7245738966417451, + "grad_norm": 1.32924222946167, + "learning_rate": 3.945298609744953e-06, + "loss": 1.6215, + "step": 13253 + }, + { + "epoch": 0.7246285690854667, + "grad_norm": 1.504380702972412, + "learning_rate": 3.943844375608573e-06, + "loss": 1.294, + "step": 13254 + }, + { + "epoch": 0.7246832415291883, + "grad_norm": 1.366795301437378, + "learning_rate": 3.942390343701444e-06, + "loss": 1.3049, + "step": 13255 + }, + { + "epoch": 0.7247379139729098, + "grad_norm": 1.4492844343185425, + "learning_rate": 3.940936514072117e-06, + "loss": 1.4366, + "step": 13256 + }, + { + "epoch": 0.7247925864166314, + "grad_norm": 1.4854103326797485, + "learning_rate": 3.939482886769136e-06, + "loss": 1.3737, + "step": 13257 + }, + { + "epoch": 0.724847258860353, + "grad_norm": 1.458869457244873, + "learning_rate": 3.938029461841044e-06, + "loss": 1.4158, + "step": 13258 + }, + { + "epoch": 0.7249019313040744, + "grad_norm": 1.2121959924697876, + "learning_rate": 3.9365762393363725e-06, + "loss": 1.6136, + "step": 13259 + }, + { + "epoch": 0.724956603747796, + "grad_norm": 1.2260644435882568, + "learning_rate": 3.935123219303646e-06, + "loss": 1.5928, + "step": 13260 + }, + { + "epoch": 0.7250112761915176, + "grad_norm": 1.4305371046066284, + "learning_rate": 3.9336704017913895e-06, + "loss": 1.4185, + "step": 13261 + }, + { + "epoch": 0.7250659486352391, + "grad_norm": 1.506477952003479, + "learning_rate": 3.932217786848114e-06, + "loss": 1.5049, + "step": 13262 + }, + { + "epoch": 0.7251206210789607, + "grad_norm": 1.8011261224746704, + "learning_rate": 3.930765374522322e-06, + "loss": 1.5066, + "step": 13263 + }, + { + "epoch": 0.7251752935226823, + "grad_norm": 1.2552787065505981, + "learning_rate": 3.929313164862518e-06, + "loss": 1.3009, + "step": 13264 + }, + { + "epoch": 0.7252299659664038, + "grad_norm": 1.3815323114395142, + "learning_rate": 3.92786115791719e-06, + "loss": 1.3147, + "step": 13265 + }, + { + "epoch": 0.7252846384101254, + "grad_norm": 1.8768582344055176, + "learning_rate": 3.9264093537348305e-06, + "loss": 1.6322, + "step": 13266 + }, + { + "epoch": 0.7253393108538468, + "grad_norm": 1.5525083541870117, + "learning_rate": 3.924957752363915e-06, + "loss": 1.5728, + "step": 13267 + }, + { + "epoch": 0.7253939832975684, + "grad_norm": 1.8624274730682373, + "learning_rate": 3.923506353852912e-06, + "loss": 1.4482, + "step": 13268 + }, + { + "epoch": 0.72544865574129, + "grad_norm": 1.3540011644363403, + "learning_rate": 3.9220551582502934e-06, + "loss": 1.4971, + "step": 13269 + }, + { + "epoch": 0.7255033281850115, + "grad_norm": 1.8456929922103882, + "learning_rate": 3.9206041656045155e-06, + "loss": 1.5032, + "step": 13270 + }, + { + "epoch": 0.7255580006287331, + "grad_norm": 1.593967080116272, + "learning_rate": 3.919153375964032e-06, + "loss": 1.4004, + "step": 13271 + }, + { + "epoch": 0.7256126730724547, + "grad_norm": 1.6739734411239624, + "learning_rate": 3.917702789377284e-06, + "loss": 1.6562, + "step": 13272 + }, + { + "epoch": 0.7256673455161762, + "grad_norm": 1.3126674890518188, + "learning_rate": 3.916252405892714e-06, + "loss": 1.5092, + "step": 13273 + }, + { + "epoch": 0.7257220179598978, + "grad_norm": 1.5940309762954712, + "learning_rate": 3.91480222555875e-06, + "loss": 1.4011, + "step": 13274 + }, + { + "epoch": 0.7257766904036194, + "grad_norm": 1.7119898796081543, + "learning_rate": 3.91335224842382e-06, + "loss": 1.4906, + "step": 13275 + }, + { + "epoch": 0.7258313628473408, + "grad_norm": 1.7001351118087769, + "learning_rate": 3.911902474536342e-06, + "loss": 1.3604, + "step": 13276 + }, + { + "epoch": 0.7258860352910624, + "grad_norm": 1.555800437927246, + "learning_rate": 3.910452903944722e-06, + "loss": 1.2079, + "step": 13277 + }, + { + "epoch": 0.725940707734784, + "grad_norm": 1.2195218801498413, + "learning_rate": 3.909003536697374e-06, + "loss": 1.3393, + "step": 13278 + }, + { + "epoch": 0.7259953801785055, + "grad_norm": 1.656873106956482, + "learning_rate": 3.907554372842688e-06, + "loss": 1.6177, + "step": 13279 + }, + { + "epoch": 0.7260500526222271, + "grad_norm": 1.4471007585525513, + "learning_rate": 3.90610541242906e-06, + "loss": 1.4336, + "step": 13280 + }, + { + "epoch": 0.7261047250659486, + "grad_norm": 1.3505171537399292, + "learning_rate": 3.904656655504872e-06, + "loss": 1.2856, + "step": 13281 + }, + { + "epoch": 0.7261593975096702, + "grad_norm": 1.7851006984710693, + "learning_rate": 3.903208102118503e-06, + "loss": 1.109, + "step": 13282 + }, + { + "epoch": 0.7262140699533918, + "grad_norm": 1.5471391677856445, + "learning_rate": 3.90175975231832e-06, + "loss": 1.4812, + "step": 13283 + }, + { + "epoch": 0.7262687423971133, + "grad_norm": 1.3231555223464966, + "learning_rate": 3.90031160615269e-06, + "loss": 1.4868, + "step": 13284 + }, + { + "epoch": 0.7263234148408348, + "grad_norm": 1.9832241535186768, + "learning_rate": 3.898863663669965e-06, + "loss": 1.411, + "step": 13285 + }, + { + "epoch": 0.7263780872845564, + "grad_norm": 1.1309517621994019, + "learning_rate": 3.897415924918503e-06, + "loss": 1.5332, + "step": 13286 + }, + { + "epoch": 0.7264327597282779, + "grad_norm": 1.680532693862915, + "learning_rate": 3.895968389946644e-06, + "loss": 1.4403, + "step": 13287 + }, + { + "epoch": 0.7264874321719995, + "grad_norm": 1.8059581518173218, + "learning_rate": 3.894521058802719e-06, + "loss": 1.3406, + "step": 13288 + }, + { + "epoch": 0.7265421046157211, + "grad_norm": 1.5780386924743652, + "learning_rate": 3.893073931535068e-06, + "loss": 1.5349, + "step": 13289 + }, + { + "epoch": 0.7265967770594426, + "grad_norm": 1.8339895009994507, + "learning_rate": 3.891627008192007e-06, + "loss": 1.5751, + "step": 13290 + }, + { + "epoch": 0.7266514495031642, + "grad_norm": 1.638952374458313, + "learning_rate": 3.890180288821851e-06, + "loss": 1.31, + "step": 13291 + }, + { + "epoch": 0.7267061219468858, + "grad_norm": 1.5233728885650635, + "learning_rate": 3.888733773472916e-06, + "loss": 1.4991, + "step": 13292 + }, + { + "epoch": 0.7267607943906073, + "grad_norm": 1.398949384689331, + "learning_rate": 3.8872874621934976e-06, + "loss": 1.0724, + "step": 13293 + }, + { + "epoch": 0.7268154668343288, + "grad_norm": 1.5108290910720825, + "learning_rate": 3.885841355031897e-06, + "loss": 1.4886, + "step": 13294 + }, + { + "epoch": 0.7268701392780503, + "grad_norm": 1.773985505104065, + "learning_rate": 3.8843954520364026e-06, + "loss": 1.3387, + "step": 13295 + }, + { + "epoch": 0.7269248117217719, + "grad_norm": 1.6302847862243652, + "learning_rate": 3.882949753255294e-06, + "loss": 1.4085, + "step": 13296 + }, + { + "epoch": 0.7269794841654935, + "grad_norm": 1.5621613264083862, + "learning_rate": 3.881504258736847e-06, + "loss": 1.6049, + "step": 13297 + }, + { + "epoch": 0.727034156609215, + "grad_norm": 1.4438364505767822, + "learning_rate": 3.88005896852933e-06, + "loss": 1.4881, + "step": 13298 + }, + { + "epoch": 0.7270888290529366, + "grad_norm": 1.4121193885803223, + "learning_rate": 3.878613882681002e-06, + "loss": 1.5636, + "step": 13299 + }, + { + "epoch": 0.7271435014966582, + "grad_norm": 1.4593404531478882, + "learning_rate": 3.877169001240124e-06, + "loss": 1.5251, + "step": 13300 + }, + { + "epoch": 0.7271981739403797, + "grad_norm": 1.4935710430145264, + "learning_rate": 3.875724324254941e-06, + "loss": 1.552, + "step": 13301 + }, + { + "epoch": 0.7272528463841013, + "grad_norm": 1.399801254272461, + "learning_rate": 3.874279851773691e-06, + "loss": 1.4087, + "step": 13302 + }, + { + "epoch": 0.7273075188278229, + "grad_norm": 1.2299259901046753, + "learning_rate": 3.872835583844614e-06, + "loss": 1.6436, + "step": 13303 + }, + { + "epoch": 0.7273621912715443, + "grad_norm": 1.5724565982818604, + "learning_rate": 3.871391520515935e-06, + "loss": 1.4658, + "step": 13304 + }, + { + "epoch": 0.7274168637152659, + "grad_norm": 1.5872364044189453, + "learning_rate": 3.86994766183587e-06, + "loss": 1.4833, + "step": 13305 + }, + { + "epoch": 0.7274715361589875, + "grad_norm": 2.1237127780914307, + "learning_rate": 3.868504007852641e-06, + "loss": 1.289, + "step": 13306 + }, + { + "epoch": 0.727526208602709, + "grad_norm": 1.482327938079834, + "learning_rate": 3.867060558614451e-06, + "loss": 1.5908, + "step": 13307 + }, + { + "epoch": 0.7275808810464306, + "grad_norm": 1.8886573314666748, + "learning_rate": 3.865617314169502e-06, + "loss": 1.5673, + "step": 13308 + }, + { + "epoch": 0.7276355534901521, + "grad_norm": 1.6192015409469604, + "learning_rate": 3.864174274565984e-06, + "loss": 1.3354, + "step": 13309 + }, + { + "epoch": 0.7276902259338737, + "grad_norm": 1.2231295108795166, + "learning_rate": 3.862731439852082e-06, + "loss": 1.3774, + "step": 13310 + }, + { + "epoch": 0.7277448983775953, + "grad_norm": 1.7621214389801025, + "learning_rate": 3.861288810075983e-06, + "loss": 1.3842, + "step": 13311 + }, + { + "epoch": 0.7277995708213167, + "grad_norm": 1.6765997409820557, + "learning_rate": 3.859846385285855e-06, + "loss": 1.152, + "step": 13312 + }, + { + "epoch": 0.7278542432650383, + "grad_norm": 1.6104133129119873, + "learning_rate": 3.8584041655298606e-06, + "loss": 1.4529, + "step": 13313 + }, + { + "epoch": 0.7279089157087599, + "grad_norm": 1.8772867918014526, + "learning_rate": 3.856962150856167e-06, + "loss": 1.4315, + "step": 13314 + }, + { + "epoch": 0.7279635881524814, + "grad_norm": 2.1236491203308105, + "learning_rate": 3.855520341312922e-06, + "loss": 1.2874, + "step": 13315 + }, + { + "epoch": 0.728018260596203, + "grad_norm": 1.767043113708496, + "learning_rate": 3.854078736948268e-06, + "loss": 1.4277, + "step": 13316 + }, + { + "epoch": 0.7280729330399246, + "grad_norm": 1.72379732131958, + "learning_rate": 3.85263733781035e-06, + "loss": 1.2941, + "step": 13317 + }, + { + "epoch": 0.7281276054836461, + "grad_norm": 2.4358315467834473, + "learning_rate": 3.851196143947296e-06, + "loss": 1.2256, + "step": 13318 + }, + { + "epoch": 0.7281822779273677, + "grad_norm": 1.437806487083435, + "learning_rate": 3.849755155407229e-06, + "loss": 1.4857, + "step": 13319 + }, + { + "epoch": 0.7282369503710893, + "grad_norm": 1.8795044422149658, + "learning_rate": 3.848314372238272e-06, + "loss": 1.4064, + "step": 13320 + }, + { + "epoch": 0.7282916228148107, + "grad_norm": 1.5820749998092651, + "learning_rate": 3.846873794488534e-06, + "loss": 1.5493, + "step": 13321 + }, + { + "epoch": 0.7283462952585323, + "grad_norm": 1.7500860691070557, + "learning_rate": 3.845433422206119e-06, + "loss": 1.2216, + "step": 13322 + }, + { + "epoch": 0.7284009677022538, + "grad_norm": 1.6226259469985962, + "learning_rate": 3.843993255439124e-06, + "loss": 1.4843, + "step": 13323 + }, + { + "epoch": 0.7284556401459754, + "grad_norm": 1.4640028476715088, + "learning_rate": 3.842553294235635e-06, + "loss": 1.5208, + "step": 13324 + }, + { + "epoch": 0.728510312589697, + "grad_norm": 1.5230238437652588, + "learning_rate": 3.841113538643745e-06, + "loss": 1.5823, + "step": 13325 + }, + { + "epoch": 0.7285649850334185, + "grad_norm": 1.3446518182754517, + "learning_rate": 3.839673988711526e-06, + "loss": 1.3054, + "step": 13326 + }, + { + "epoch": 0.7286196574771401, + "grad_norm": 2.5348308086395264, + "learning_rate": 3.838234644487045e-06, + "loss": 1.6129, + "step": 13327 + }, + { + "epoch": 0.7286743299208617, + "grad_norm": 1.5821940898895264, + "learning_rate": 3.836795506018371e-06, + "loss": 1.4604, + "step": 13328 + }, + { + "epoch": 0.7287290023645832, + "grad_norm": 1.9905047416687012, + "learning_rate": 3.835356573353558e-06, + "loss": 1.3828, + "step": 13329 + }, + { + "epoch": 0.7287836748083047, + "grad_norm": 1.4087319374084473, + "learning_rate": 3.833917846540651e-06, + "loss": 1.5222, + "step": 13330 + }, + { + "epoch": 0.7288383472520263, + "grad_norm": 1.859775185585022, + "learning_rate": 3.8324793256277e-06, + "loss": 1.3628, + "step": 13331 + }, + { + "epoch": 0.7288930196957478, + "grad_norm": 2.250486373901367, + "learning_rate": 3.831041010662737e-06, + "loss": 1.3514, + "step": 13332 + }, + { + "epoch": 0.7289476921394694, + "grad_norm": 2.0011510848999023, + "learning_rate": 3.829602901693788e-06, + "loss": 1.3354, + "step": 13333 + }, + { + "epoch": 0.729002364583191, + "grad_norm": 1.6097612380981445, + "learning_rate": 3.828164998768879e-06, + "loss": 1.6236, + "step": 13334 + }, + { + "epoch": 0.7290570370269125, + "grad_norm": 1.331938624382019, + "learning_rate": 3.826727301936025e-06, + "loss": 1.7243, + "step": 13335 + }, + { + "epoch": 0.7291117094706341, + "grad_norm": 1.4311782121658325, + "learning_rate": 3.8252898112432315e-06, + "loss": 1.6403, + "step": 13336 + }, + { + "epoch": 0.7291663819143556, + "grad_norm": 1.3544336557388306, + "learning_rate": 3.823852526738501e-06, + "loss": 1.3139, + "step": 13337 + }, + { + "epoch": 0.7292210543580772, + "grad_norm": 1.800646185874939, + "learning_rate": 3.822415448469824e-06, + "loss": 1.5619, + "step": 13338 + }, + { + "epoch": 0.7292757268017988, + "grad_norm": 1.5592379570007324, + "learning_rate": 3.820978576485194e-06, + "loss": 1.3414, + "step": 13339 + }, + { + "epoch": 0.7293303992455202, + "grad_norm": 1.2399264574050903, + "learning_rate": 3.8195419108325896e-06, + "loss": 1.5188, + "step": 13340 + }, + { + "epoch": 0.7293850716892418, + "grad_norm": 1.2310720682144165, + "learning_rate": 3.8181054515599806e-06, + "loss": 1.6523, + "step": 13341 + }, + { + "epoch": 0.7294397441329634, + "grad_norm": 1.3470350503921509, + "learning_rate": 3.816669198715339e-06, + "loss": 1.2851, + "step": 13342 + }, + { + "epoch": 0.7294944165766849, + "grad_norm": 1.6672903299331665, + "learning_rate": 3.815233152346623e-06, + "loss": 1.509, + "step": 13343 + }, + { + "epoch": 0.7295490890204065, + "grad_norm": 1.4953453540802002, + "learning_rate": 3.8137973125017825e-06, + "loss": 1.4215, + "step": 13344 + }, + { + "epoch": 0.7296037614641281, + "grad_norm": 1.5753779411315918, + "learning_rate": 3.812361679228769e-06, + "loss": 1.656, + "step": 13345 + }, + { + "epoch": 0.7296584339078496, + "grad_norm": 1.5551292896270752, + "learning_rate": 3.810926252575519e-06, + "loss": 1.5296, + "step": 13346 + }, + { + "epoch": 0.7297131063515712, + "grad_norm": 1.482080101966858, + "learning_rate": 3.8094910325899637e-06, + "loss": 1.4436, + "step": 13347 + }, + { + "epoch": 0.7297677787952928, + "grad_norm": 1.5198686122894287, + "learning_rate": 3.8080560193200288e-06, + "loss": 1.5291, + "step": 13348 + }, + { + "epoch": 0.7298224512390142, + "grad_norm": 1.3106558322906494, + "learning_rate": 3.8066212128136315e-06, + "loss": 1.6005, + "step": 13349 + }, + { + "epoch": 0.7298771236827358, + "grad_norm": 1.4008965492248535, + "learning_rate": 3.8051866131186876e-06, + "loss": 1.4821, + "step": 13350 + }, + { + "epoch": 0.7299317961264573, + "grad_norm": 1.9557528495788574, + "learning_rate": 3.8037522202831002e-06, + "loss": 1.2952, + "step": 13351 + }, + { + "epoch": 0.7299864685701789, + "grad_norm": 1.127260684967041, + "learning_rate": 3.8023180343547615e-06, + "loss": 1.4946, + "step": 13352 + }, + { + "epoch": 0.7300411410139005, + "grad_norm": 1.4749301671981812, + "learning_rate": 3.8008840553815707e-06, + "loss": 1.4104, + "step": 13353 + }, + { + "epoch": 0.730095813457622, + "grad_norm": 1.5706344842910767, + "learning_rate": 3.799450283411409e-06, + "loss": 1.237, + "step": 13354 + }, + { + "epoch": 0.7301504859013436, + "grad_norm": 2.0808985233306885, + "learning_rate": 3.798016718492148e-06, + "loss": 1.3015, + "step": 13355 + }, + { + "epoch": 0.7302051583450652, + "grad_norm": 1.5069395303726196, + "learning_rate": 3.796583360671665e-06, + "loss": 1.4928, + "step": 13356 + }, + { + "epoch": 0.7302598307887866, + "grad_norm": 1.218415379524231, + "learning_rate": 3.795150209997822e-06, + "loss": 1.6865, + "step": 13357 + }, + { + "epoch": 0.7303145032325082, + "grad_norm": 1.5947023630142212, + "learning_rate": 3.7937172665184684e-06, + "loss": 1.4281, + "step": 13358 + }, + { + "epoch": 0.7303691756762298, + "grad_norm": 1.5453779697418213, + "learning_rate": 3.7922845302814636e-06, + "loss": 1.3786, + "step": 13359 + }, + { + "epoch": 0.7304238481199513, + "grad_norm": 1.8705229759216309, + "learning_rate": 3.790852001334645e-06, + "loss": 1.3498, + "step": 13360 + }, + { + "epoch": 0.7304785205636729, + "grad_norm": 1.50159752368927, + "learning_rate": 3.7894196797258475e-06, + "loss": 1.2209, + "step": 13361 + }, + { + "epoch": 0.7305331930073945, + "grad_norm": 1.4355019330978394, + "learning_rate": 3.7879875655029018e-06, + "loss": 1.2913, + "step": 13362 + }, + { + "epoch": 0.730587865451116, + "grad_norm": 1.6755009889602661, + "learning_rate": 3.7865556587136233e-06, + "loss": 1.4049, + "step": 13363 + }, + { + "epoch": 0.7306425378948376, + "grad_norm": 1.7467372417449951, + "learning_rate": 3.7851239594058365e-06, + "loss": 1.2167, + "step": 13364 + }, + { + "epoch": 0.7306972103385591, + "grad_norm": 1.4835891723632812, + "learning_rate": 3.7836924676273433e-06, + "loss": 1.5495, + "step": 13365 + }, + { + "epoch": 0.7307518827822806, + "grad_norm": 1.8519418239593506, + "learning_rate": 3.7822611834259425e-06, + "loss": 1.2944, + "step": 13366 + }, + { + "epoch": 0.7308065552260022, + "grad_norm": 1.3229511976242065, + "learning_rate": 3.7808301068494347e-06, + "loss": 1.3752, + "step": 13367 + }, + { + "epoch": 0.7308612276697237, + "grad_norm": 1.4655908346176147, + "learning_rate": 3.7793992379456033e-06, + "loss": 1.3358, + "step": 13368 + }, + { + "epoch": 0.7309159001134453, + "grad_norm": 1.1009256839752197, + "learning_rate": 3.7779685767622255e-06, + "loss": 1.5847, + "step": 13369 + }, + { + "epoch": 0.7309705725571669, + "grad_norm": 1.6311774253845215, + "learning_rate": 3.77653812334708e-06, + "loss": 1.4868, + "step": 13370 + }, + { + "epoch": 0.7310252450008884, + "grad_norm": 1.5384341478347778, + "learning_rate": 3.77510787774793e-06, + "loss": 1.431, + "step": 13371 + }, + { + "epoch": 0.73107991744461, + "grad_norm": 1.8240870237350464, + "learning_rate": 3.7736778400125328e-06, + "loss": 1.2362, + "step": 13372 + }, + { + "epoch": 0.7311345898883316, + "grad_norm": 1.1110038757324219, + "learning_rate": 3.772248010188646e-06, + "loss": 1.6721, + "step": 13373 + }, + { + "epoch": 0.7311892623320531, + "grad_norm": 1.420612096786499, + "learning_rate": 3.7708183883240123e-06, + "loss": 1.5135, + "step": 13374 + }, + { + "epoch": 0.7312439347757747, + "grad_norm": 1.5103168487548828, + "learning_rate": 3.769388974466369e-06, + "loss": 1.2529, + "step": 13375 + }, + { + "epoch": 0.7312986072194962, + "grad_norm": 1.7370504140853882, + "learning_rate": 3.7679597686634495e-06, + "loss": 1.7345, + "step": 13376 + }, + { + "epoch": 0.7313532796632177, + "grad_norm": 1.6185237169265747, + "learning_rate": 3.766530770962974e-06, + "loss": 1.5414, + "step": 13377 + }, + { + "epoch": 0.7314079521069393, + "grad_norm": 2.0172908306121826, + "learning_rate": 3.7651019814126656e-06, + "loss": 1.4812, + "step": 13378 + }, + { + "epoch": 0.7314626245506608, + "grad_norm": 1.7700647115707397, + "learning_rate": 3.763673400060234e-06, + "loss": 1.2873, + "step": 13379 + }, + { + "epoch": 0.7315172969943824, + "grad_norm": 1.515883207321167, + "learning_rate": 3.7622450269533773e-06, + "loss": 1.4148, + "step": 13380 + }, + { + "epoch": 0.731571969438104, + "grad_norm": 1.73356032371521, + "learning_rate": 3.7608168621398e-06, + "loss": 1.4516, + "step": 13381 + }, + { + "epoch": 0.7316266418818255, + "grad_norm": 1.776974081993103, + "learning_rate": 3.759388905667188e-06, + "loss": 1.5008, + "step": 13382 + }, + { + "epoch": 0.7316813143255471, + "grad_norm": 1.3918668031692505, + "learning_rate": 3.757961157583221e-06, + "loss": 1.5177, + "step": 13383 + }, + { + "epoch": 0.7317359867692687, + "grad_norm": 1.5641367435455322, + "learning_rate": 3.756533617935583e-06, + "loss": 1.423, + "step": 13384 + }, + { + "epoch": 0.7317906592129901, + "grad_norm": 1.3920977115631104, + "learning_rate": 3.7551062867719367e-06, + "loss": 1.4804, + "step": 13385 + }, + { + "epoch": 0.7318453316567117, + "grad_norm": 1.6896178722381592, + "learning_rate": 3.753679164139947e-06, + "loss": 1.0599, + "step": 13386 + }, + { + "epoch": 0.7319000041004333, + "grad_norm": 1.5532642602920532, + "learning_rate": 3.752252250087267e-06, + "loss": 1.5761, + "step": 13387 + }, + { + "epoch": 0.7319546765441548, + "grad_norm": 1.3672583103179932, + "learning_rate": 3.750825544661545e-06, + "loss": 1.4353, + "step": 13388 + }, + { + "epoch": 0.7320093489878764, + "grad_norm": 1.6590806245803833, + "learning_rate": 3.749399047910418e-06, + "loss": 1.2118, + "step": 13389 + }, + { + "epoch": 0.732064021431598, + "grad_norm": 1.3703737258911133, + "learning_rate": 3.7479727598815287e-06, + "loss": 1.2742, + "step": 13390 + }, + { + "epoch": 0.7321186938753195, + "grad_norm": 1.4921975135803223, + "learning_rate": 3.7465466806225006e-06, + "loss": 1.4123, + "step": 13391 + }, + { + "epoch": 0.7321733663190411, + "grad_norm": 1.4451005458831787, + "learning_rate": 3.7451208101809477e-06, + "loss": 1.6033, + "step": 13392 + }, + { + "epoch": 0.7322280387627625, + "grad_norm": 1.6899950504302979, + "learning_rate": 3.7436951486044927e-06, + "loss": 1.4029, + "step": 13393 + }, + { + "epoch": 0.7322827112064841, + "grad_norm": 1.5812088251113892, + "learning_rate": 3.7422696959407347e-06, + "loss": 1.4982, + "step": 13394 + }, + { + "epoch": 0.7323373836502057, + "grad_norm": 1.4376424551010132, + "learning_rate": 3.740844452237279e-06, + "loss": 1.7144, + "step": 13395 + }, + { + "epoch": 0.7323920560939272, + "grad_norm": 1.940476417541504, + "learning_rate": 3.739419417541714e-06, + "loss": 1.4307, + "step": 13396 + }, + { + "epoch": 0.7324467285376488, + "grad_norm": 1.647002100944519, + "learning_rate": 3.7379945919016225e-06, + "loss": 1.6192, + "step": 13397 + }, + { + "epoch": 0.7325014009813704, + "grad_norm": 1.4097458124160767, + "learning_rate": 3.7365699753645888e-06, + "loss": 1.3685, + "step": 13398 + }, + { + "epoch": 0.7325560734250919, + "grad_norm": 1.3426270484924316, + "learning_rate": 3.7351455679781823e-06, + "loss": 1.2416, + "step": 13399 + }, + { + "epoch": 0.7326107458688135, + "grad_norm": 1.4601081609725952, + "learning_rate": 3.7337213697899656e-06, + "loss": 1.1943, + "step": 13400 + }, + { + "epoch": 0.7326654183125351, + "grad_norm": 1.475833773612976, + "learning_rate": 3.732297380847496e-06, + "loss": 1.6068, + "step": 13401 + }, + { + "epoch": 0.7327200907562565, + "grad_norm": 1.5628817081451416, + "learning_rate": 3.730873601198326e-06, + "loss": 1.4112, + "step": 13402 + }, + { + "epoch": 0.7327747631999781, + "grad_norm": 1.8037608861923218, + "learning_rate": 3.729450030889993e-06, + "loss": 1.5588, + "step": 13403 + }, + { + "epoch": 0.7328294356436997, + "grad_norm": 1.5412400960922241, + "learning_rate": 3.7280266699700406e-06, + "loss": 1.5274, + "step": 13404 + }, + { + "epoch": 0.7328841080874212, + "grad_norm": 1.416869878768921, + "learning_rate": 3.726603518485996e-06, + "loss": 1.296, + "step": 13405 + }, + { + "epoch": 0.7329387805311428, + "grad_norm": 2.439817428588867, + "learning_rate": 3.7251805764853776e-06, + "loss": 1.3748, + "step": 13406 + }, + { + "epoch": 0.7329934529748643, + "grad_norm": 1.3935067653656006, + "learning_rate": 3.7237578440157076e-06, + "loss": 1.615, + "step": 13407 + }, + { + "epoch": 0.7330481254185859, + "grad_norm": 1.7191493511199951, + "learning_rate": 3.722335321124487e-06, + "loss": 1.3438, + "step": 13408 + }, + { + "epoch": 0.7331027978623075, + "grad_norm": 1.9211534261703491, + "learning_rate": 3.720913007859225e-06, + "loss": 1.3107, + "step": 13409 + }, + { + "epoch": 0.733157470306029, + "grad_norm": 1.7295295000076294, + "learning_rate": 3.7194909042674123e-06, + "loss": 1.2475, + "step": 13410 + }, + { + "epoch": 0.7332121427497506, + "grad_norm": 1.5845015048980713, + "learning_rate": 3.7180690103965313e-06, + "loss": 1.2202, + "step": 13411 + }, + { + "epoch": 0.7332668151934721, + "grad_norm": 1.787498116493225, + "learning_rate": 3.7166473262940717e-06, + "loss": 1.6107, + "step": 13412 + }, + { + "epoch": 0.7333214876371936, + "grad_norm": 1.4365795850753784, + "learning_rate": 3.715225852007501e-06, + "loss": 1.2415, + "step": 13413 + }, + { + "epoch": 0.7333761600809152, + "grad_norm": 1.4242818355560303, + "learning_rate": 3.7138045875842877e-06, + "loss": 1.6006, + "step": 13414 + }, + { + "epoch": 0.7334308325246368, + "grad_norm": 1.6511727571487427, + "learning_rate": 3.7123835330718903e-06, + "loss": 1.3654, + "step": 13415 + }, + { + "epoch": 0.7334855049683583, + "grad_norm": 1.3396072387695312, + "learning_rate": 3.7109626885177606e-06, + "loss": 1.6676, + "step": 13416 + }, + { + "epoch": 0.7335401774120799, + "grad_norm": 1.448935866355896, + "learning_rate": 3.7095420539693417e-06, + "loss": 1.1643, + "step": 13417 + }, + { + "epoch": 0.7335948498558015, + "grad_norm": 1.4246575832366943, + "learning_rate": 3.7081216294740773e-06, + "loss": 1.4579, + "step": 13418 + }, + { + "epoch": 0.733649522299523, + "grad_norm": 1.866485834121704, + "learning_rate": 3.7067014150793955e-06, + "loss": 1.4372, + "step": 13419 + }, + { + "epoch": 0.7337041947432446, + "grad_norm": 1.6957005262374878, + "learning_rate": 3.7052814108327194e-06, + "loss": 1.3485, + "step": 13420 + }, + { + "epoch": 0.733758867186966, + "grad_norm": 1.6059281826019287, + "learning_rate": 3.70386161678147e-06, + "loss": 1.3834, + "step": 13421 + }, + { + "epoch": 0.7338135396306876, + "grad_norm": 1.5246071815490723, + "learning_rate": 3.7024420329730527e-06, + "loss": 1.3548, + "step": 13422 + }, + { + "epoch": 0.7338682120744092, + "grad_norm": 1.373416781425476, + "learning_rate": 3.701022659454877e-06, + "loss": 1.3583, + "step": 13423 + }, + { + "epoch": 0.7339228845181307, + "grad_norm": 4.120674133300781, + "learning_rate": 3.6996034962743354e-06, + "loss": 1.8005, + "step": 13424 + }, + { + "epoch": 0.7339775569618523, + "grad_norm": 1.590395212173462, + "learning_rate": 3.6981845434788188e-06, + "loss": 1.6435, + "step": 13425 + }, + { + "epoch": 0.7340322294055739, + "grad_norm": 1.5016955137252808, + "learning_rate": 3.696765801115706e-06, + "loss": 1.7373, + "step": 13426 + }, + { + "epoch": 0.7340869018492954, + "grad_norm": 1.151505947113037, + "learning_rate": 3.6953472692323757e-06, + "loss": 1.6891, + "step": 13427 + }, + { + "epoch": 0.734141574293017, + "grad_norm": 1.6324352025985718, + "learning_rate": 3.69392894787619e-06, + "loss": 1.5423, + "step": 13428 + }, + { + "epoch": 0.7341962467367386, + "grad_norm": 1.458947777748108, + "learning_rate": 3.6925108370945183e-06, + "loss": 1.4521, + "step": 13429 + }, + { + "epoch": 0.73425091918046, + "grad_norm": 1.479331135749817, + "learning_rate": 3.6910929369347105e-06, + "loss": 1.625, + "step": 13430 + }, + { + "epoch": 0.7343055916241816, + "grad_norm": 1.5854912996292114, + "learning_rate": 3.68967524744411e-06, + "loss": 1.1362, + "step": 13431 + }, + { + "epoch": 0.7343602640679032, + "grad_norm": 1.3506340980529785, + "learning_rate": 3.688257768670065e-06, + "loss": 1.5693, + "step": 13432 + }, + { + "epoch": 0.7344149365116247, + "grad_norm": 1.4448660612106323, + "learning_rate": 3.686840500659904e-06, + "loss": 1.2857, + "step": 13433 + }, + { + "epoch": 0.7344696089553463, + "grad_norm": 1.679440975189209, + "learning_rate": 3.685423443460948e-06, + "loss": 1.2744, + "step": 13434 + }, + { + "epoch": 0.7345242813990678, + "grad_norm": 1.3563364744186401, + "learning_rate": 3.6840065971205263e-06, + "loss": 1.4166, + "step": 13435 + }, + { + "epoch": 0.7345789538427894, + "grad_norm": 1.444667100906372, + "learning_rate": 3.6825899616859404e-06, + "loss": 1.3748, + "step": 13436 + }, + { + "epoch": 0.734633626286511, + "grad_norm": 1.5162273645401, + "learning_rate": 3.6811735372045043e-06, + "loss": 1.5431, + "step": 13437 + }, + { + "epoch": 0.7346882987302324, + "grad_norm": 1.7949000597000122, + "learning_rate": 3.6797573237235108e-06, + "loss": 1.1926, + "step": 13438 + }, + { + "epoch": 0.734742971173954, + "grad_norm": 1.449021816253662, + "learning_rate": 3.678341321290252e-06, + "loss": 1.4572, + "step": 13439 + }, + { + "epoch": 0.7347976436176756, + "grad_norm": 1.5321393013000488, + "learning_rate": 3.676925529952009e-06, + "loss": 1.4326, + "step": 13440 + }, + { + "epoch": 0.7348523160613971, + "grad_norm": 2.0915346145629883, + "learning_rate": 3.675509949756062e-06, + "loss": 1.4616, + "step": 13441 + }, + { + "epoch": 0.7349069885051187, + "grad_norm": 1.6372709274291992, + "learning_rate": 3.674094580749674e-06, + "loss": 1.5377, + "step": 13442 + }, + { + "epoch": 0.7349616609488403, + "grad_norm": 1.380859136581421, + "learning_rate": 3.6726794229801168e-06, + "loss": 1.337, + "step": 13443 + }, + { + "epoch": 0.7350163333925618, + "grad_norm": 1.5480071306228638, + "learning_rate": 3.671264476494639e-06, + "loss": 1.4178, + "step": 13444 + }, + { + "epoch": 0.7350710058362834, + "grad_norm": 1.3003185987472534, + "learning_rate": 3.66984974134049e-06, + "loss": 1.5593, + "step": 13445 + }, + { + "epoch": 0.735125678280005, + "grad_norm": 1.1428664922714233, + "learning_rate": 3.668435217564915e-06, + "loss": 1.3795, + "step": 13446 + }, + { + "epoch": 0.7351803507237265, + "grad_norm": 1.6277978420257568, + "learning_rate": 3.6670209052151452e-06, + "loss": 1.5332, + "step": 13447 + }, + { + "epoch": 0.735235023167448, + "grad_norm": 1.5290156602859497, + "learning_rate": 3.665606804338405e-06, + "loss": 1.2911, + "step": 13448 + }, + { + "epoch": 0.7352896956111695, + "grad_norm": 1.8016371726989746, + "learning_rate": 3.664192914981921e-06, + "loss": 1.5448, + "step": 13449 + }, + { + "epoch": 0.7353443680548911, + "grad_norm": 1.6449944972991943, + "learning_rate": 3.6627792371928993e-06, + "loss": 1.2531, + "step": 13450 + }, + { + "epoch": 0.7353990404986127, + "grad_norm": 1.6651976108551025, + "learning_rate": 3.6613657710185537e-06, + "loss": 1.3654, + "step": 13451 + }, + { + "epoch": 0.7354537129423342, + "grad_norm": 1.9149733781814575, + "learning_rate": 3.659952516506079e-06, + "loss": 1.6981, + "step": 13452 + }, + { + "epoch": 0.7355083853860558, + "grad_norm": 1.4982494115829468, + "learning_rate": 3.658539473702667e-06, + "loss": 1.6464, + "step": 13453 + }, + { + "epoch": 0.7355630578297774, + "grad_norm": 1.947449803352356, + "learning_rate": 3.657126642655503e-06, + "loss": 1.1529, + "step": 13454 + }, + { + "epoch": 0.7356177302734989, + "grad_norm": 1.4240318536758423, + "learning_rate": 3.655714023411764e-06, + "loss": 1.3072, + "step": 13455 + }, + { + "epoch": 0.7356724027172205, + "grad_norm": 1.6242623329162598, + "learning_rate": 3.654301616018617e-06, + "loss": 1.471, + "step": 13456 + }, + { + "epoch": 0.735727075160942, + "grad_norm": 1.4453785419464111, + "learning_rate": 3.6528894205232346e-06, + "loss": 1.4064, + "step": 13457 + }, + { + "epoch": 0.7357817476046635, + "grad_norm": 1.4998019933700562, + "learning_rate": 3.6514774369727678e-06, + "loss": 1.4316, + "step": 13458 + }, + { + "epoch": 0.7358364200483851, + "grad_norm": 1.7329906225204468, + "learning_rate": 3.650065665414363e-06, + "loss": 1.5457, + "step": 13459 + }, + { + "epoch": 0.7358910924921067, + "grad_norm": 1.302865982055664, + "learning_rate": 3.6486541058951696e-06, + "loss": 1.5972, + "step": 13460 + }, + { + "epoch": 0.7359457649358282, + "grad_norm": 1.6930270195007324, + "learning_rate": 3.6472427584623194e-06, + "loss": 1.1863, + "step": 13461 + }, + { + "epoch": 0.7360004373795498, + "grad_norm": 1.474751591682434, + "learning_rate": 3.6458316231629377e-06, + "loss": 1.547, + "step": 13462 + }, + { + "epoch": 0.7360551098232714, + "grad_norm": 1.5525152683258057, + "learning_rate": 3.6444207000441524e-06, + "loss": 1.563, + "step": 13463 + }, + { + "epoch": 0.7361097822669929, + "grad_norm": 1.3034594058990479, + "learning_rate": 3.6430099891530735e-06, + "loss": 1.4644, + "step": 13464 + }, + { + "epoch": 0.7361644547107145, + "grad_norm": 1.6057593822479248, + "learning_rate": 3.641599490536808e-06, + "loss": 1.2527, + "step": 13465 + }, + { + "epoch": 0.7362191271544359, + "grad_norm": 1.7634025812149048, + "learning_rate": 3.6401892042424557e-06, + "loss": 1.4792, + "step": 13466 + }, + { + "epoch": 0.7362737995981575, + "grad_norm": 1.2429136037826538, + "learning_rate": 3.638779130317106e-06, + "loss": 1.4571, + "step": 13467 + }, + { + "epoch": 0.7363284720418791, + "grad_norm": 1.6971371173858643, + "learning_rate": 3.637369268807852e-06, + "loss": 1.3704, + "step": 13468 + }, + { + "epoch": 0.7363831444856006, + "grad_norm": 1.929516077041626, + "learning_rate": 3.6359596197617687e-06, + "loss": 1.5514, + "step": 13469 + }, + { + "epoch": 0.7364378169293222, + "grad_norm": 1.8308826684951782, + "learning_rate": 3.6345501832259233e-06, + "loss": 1.3507, + "step": 13470 + }, + { + "epoch": 0.7364924893730438, + "grad_norm": 1.4731597900390625, + "learning_rate": 3.6331409592473887e-06, + "loss": 1.3117, + "step": 13471 + }, + { + "epoch": 0.7365471618167653, + "grad_norm": 1.3628991842269897, + "learning_rate": 3.631731947873217e-06, + "loss": 1.5022, + "step": 13472 + }, + { + "epoch": 0.7366018342604869, + "grad_norm": 1.4617990255355835, + "learning_rate": 3.6303231491504566e-06, + "loss": 1.1006, + "step": 13473 + }, + { + "epoch": 0.7366565067042085, + "grad_norm": 1.5195577144622803, + "learning_rate": 3.628914563126156e-06, + "loss": 1.3269, + "step": 13474 + }, + { + "epoch": 0.7367111791479299, + "grad_norm": 1.3787691593170166, + "learning_rate": 3.6275061898473484e-06, + "loss": 1.7148, + "step": 13475 + }, + { + "epoch": 0.7367658515916515, + "grad_norm": 1.5985052585601807, + "learning_rate": 3.626098029361059e-06, + "loss": 1.4943, + "step": 13476 + }, + { + "epoch": 0.7368205240353731, + "grad_norm": 1.3416112661361694, + "learning_rate": 3.624690081714317e-06, + "loss": 1.5003, + "step": 13477 + }, + { + "epoch": 0.7368751964790946, + "grad_norm": 1.5234931707382202, + "learning_rate": 3.6232823469541333e-06, + "loss": 1.4644, + "step": 13478 + }, + { + "epoch": 0.7369298689228162, + "grad_norm": 1.6523114442825317, + "learning_rate": 3.621874825127515e-06, + "loss": 1.5092, + "step": 13479 + }, + { + "epoch": 0.7369845413665377, + "grad_norm": 1.5806870460510254, + "learning_rate": 3.620467516281464e-06, + "loss": 1.4802, + "step": 13480 + }, + { + "epoch": 0.7370392138102593, + "grad_norm": 1.8456709384918213, + "learning_rate": 3.6190604204629685e-06, + "loss": 1.4176, + "step": 13481 + }, + { + "epoch": 0.7370938862539809, + "grad_norm": 1.8368992805480957, + "learning_rate": 3.6176535377190226e-06, + "loss": 1.3702, + "step": 13482 + }, + { + "epoch": 0.7371485586977023, + "grad_norm": 1.376426339149475, + "learning_rate": 3.616246868096601e-06, + "loss": 1.5097, + "step": 13483 + }, + { + "epoch": 0.7372032311414239, + "grad_norm": 1.4757936000823975, + "learning_rate": 3.614840411642674e-06, + "loss": 1.1301, + "step": 13484 + }, + { + "epoch": 0.7372579035851455, + "grad_norm": 1.3476494550704956, + "learning_rate": 3.6134341684042116e-06, + "loss": 1.5272, + "step": 13485 + }, + { + "epoch": 0.737312576028867, + "grad_norm": 1.6518714427947998, + "learning_rate": 3.6120281384281685e-06, + "loss": 1.4216, + "step": 13486 + }, + { + "epoch": 0.7373672484725886, + "grad_norm": 1.3580073118209839, + "learning_rate": 3.6106223217614934e-06, + "loss": 1.5854, + "step": 13487 + }, + { + "epoch": 0.7374219209163102, + "grad_norm": 1.2053204774856567, + "learning_rate": 3.6092167184511352e-06, + "loss": 1.593, + "step": 13488 + }, + { + "epoch": 0.7374765933600317, + "grad_norm": 1.5405246019363403, + "learning_rate": 3.6078113285440277e-06, + "loss": 1.3569, + "step": 13489 + }, + { + "epoch": 0.7375312658037533, + "grad_norm": 1.6679699420928955, + "learning_rate": 3.606406152087095e-06, + "loss": 1.4497, + "step": 13490 + }, + { + "epoch": 0.7375859382474749, + "grad_norm": 1.539584755897522, + "learning_rate": 3.6050011891272686e-06, + "loss": 1.6273, + "step": 13491 + }, + { + "epoch": 0.7376406106911964, + "grad_norm": 1.9106731414794922, + "learning_rate": 3.6035964397114577e-06, + "loss": 1.408, + "step": 13492 + }, + { + "epoch": 0.737695283134918, + "grad_norm": 1.47463858127594, + "learning_rate": 3.6021919038865716e-06, + "loss": 1.139, + "step": 13493 + }, + { + "epoch": 0.7377499555786394, + "grad_norm": 1.762960433959961, + "learning_rate": 3.6007875816995108e-06, + "loss": 1.5902, + "step": 13494 + }, + { + "epoch": 0.737804628022361, + "grad_norm": 1.6736692190170288, + "learning_rate": 3.5993834731971654e-06, + "loss": 1.434, + "step": 13495 + }, + { + "epoch": 0.7378593004660826, + "grad_norm": 1.6401875019073486, + "learning_rate": 3.5979795784264294e-06, + "loss": 1.2882, + "step": 13496 + }, + { + "epoch": 0.7379139729098041, + "grad_norm": 1.6848772764205933, + "learning_rate": 3.596575897434178e-06, + "loss": 1.5213, + "step": 13497 + }, + { + "epoch": 0.7379686453535257, + "grad_norm": 1.5916833877563477, + "learning_rate": 3.5951724302672796e-06, + "loss": 1.5203, + "step": 13498 + }, + { + "epoch": 0.7380233177972473, + "grad_norm": 1.4501186609268188, + "learning_rate": 3.593769176972607e-06, + "loss": 1.3378, + "step": 13499 + }, + { + "epoch": 0.7380779902409688, + "grad_norm": 1.4621495008468628, + "learning_rate": 3.5923661375970142e-06, + "loss": 1.4979, + "step": 13500 + }, + { + "epoch": 0.7381326626846904, + "grad_norm": 1.7560874223709106, + "learning_rate": 3.590963312187348e-06, + "loss": 1.3873, + "step": 13501 + }, + { + "epoch": 0.738187335128412, + "grad_norm": 1.5127984285354614, + "learning_rate": 3.5895607007904597e-06, + "loss": 1.4126, + "step": 13502 + }, + { + "epoch": 0.7382420075721334, + "grad_norm": 1.889890193939209, + "learning_rate": 3.5881583034531832e-06, + "loss": 1.5121, + "step": 13503 + }, + { + "epoch": 0.738296680015855, + "grad_norm": 1.3473280668258667, + "learning_rate": 3.5867561202223455e-06, + "loss": 1.5666, + "step": 13504 + }, + { + "epoch": 0.7383513524595766, + "grad_norm": 1.275011420249939, + "learning_rate": 3.585354151144771e-06, + "loss": 1.5359, + "step": 13505 + }, + { + "epoch": 0.7384060249032981, + "grad_norm": 2.07246470451355, + "learning_rate": 3.5839523962672694e-06, + "loss": 1.3936, + "step": 13506 + }, + { + "epoch": 0.7384606973470197, + "grad_norm": 1.1872385740280151, + "learning_rate": 3.5825508556366574e-06, + "loss": 1.3998, + "step": 13507 + }, + { + "epoch": 0.7385153697907412, + "grad_norm": 1.4372309446334839, + "learning_rate": 3.581149529299731e-06, + "loss": 1.6131, + "step": 13508 + }, + { + "epoch": 0.7385700422344628, + "grad_norm": 1.7831836938858032, + "learning_rate": 3.5797484173032806e-06, + "loss": 1.5304, + "step": 13509 + }, + { + "epoch": 0.7386247146781844, + "grad_norm": 1.4620749950408936, + "learning_rate": 3.5783475196940997e-06, + "loss": 1.342, + "step": 13510 + }, + { + "epoch": 0.7386793871219058, + "grad_norm": 1.4250094890594482, + "learning_rate": 3.576946836518964e-06, + "loss": 1.3202, + "step": 13511 + }, + { + "epoch": 0.7387340595656274, + "grad_norm": 1.370801329612732, + "learning_rate": 3.5755463678246417e-06, + "loss": 1.4112, + "step": 13512 + }, + { + "epoch": 0.738788732009349, + "grad_norm": 1.6409895420074463, + "learning_rate": 3.574146113657906e-06, + "loss": 1.3425, + "step": 13513 + }, + { + "epoch": 0.7388434044530705, + "grad_norm": 1.4162205457687378, + "learning_rate": 3.5727460740655097e-06, + "loss": 1.4649, + "step": 13514 + }, + { + "epoch": 0.7388980768967921, + "grad_norm": 1.4709242582321167, + "learning_rate": 3.5713462490942006e-06, + "loss": 1.5013, + "step": 13515 + }, + { + "epoch": 0.7389527493405137, + "grad_norm": 2.0246942043304443, + "learning_rate": 3.569946638790729e-06, + "loss": 1.4807, + "step": 13516 + }, + { + "epoch": 0.7390074217842352, + "grad_norm": 1.8153367042541504, + "learning_rate": 3.5685472432018274e-06, + "loss": 1.5695, + "step": 13517 + }, + { + "epoch": 0.7390620942279568, + "grad_norm": 1.341246485710144, + "learning_rate": 3.567148062374226e-06, + "loss": 1.6878, + "step": 13518 + }, + { + "epoch": 0.7391167666716784, + "grad_norm": 1.6259280443191528, + "learning_rate": 3.565749096354645e-06, + "loss": 1.2624, + "step": 13519 + }, + { + "epoch": 0.7391714391153998, + "grad_norm": 1.4196679592132568, + "learning_rate": 3.5643503451897975e-06, + "loss": 1.371, + "step": 13520 + }, + { + "epoch": 0.7392261115591214, + "grad_norm": 2.685206890106201, + "learning_rate": 3.562951808926397e-06, + "loss": 1.6684, + "step": 13521 + }, + { + "epoch": 0.7392807840028429, + "grad_norm": 1.4687410593032837, + "learning_rate": 3.56155348761114e-06, + "loss": 1.6524, + "step": 13522 + }, + { + "epoch": 0.7393354564465645, + "grad_norm": 1.4882986545562744, + "learning_rate": 3.5601553812907174e-06, + "loss": 1.399, + "step": 13523 + }, + { + "epoch": 0.7393901288902861, + "grad_norm": 1.5915703773498535, + "learning_rate": 3.5587574900118215e-06, + "loss": 1.4003, + "step": 13524 + }, + { + "epoch": 0.7394448013340076, + "grad_norm": 1.660284161567688, + "learning_rate": 3.5573598138211284e-06, + "loss": 1.3778, + "step": 13525 + }, + { + "epoch": 0.7394994737777292, + "grad_norm": 1.556099534034729, + "learning_rate": 3.5559623527653054e-06, + "loss": 1.2757, + "step": 13526 + }, + { + "epoch": 0.7395541462214508, + "grad_norm": 1.534103512763977, + "learning_rate": 3.5545651068910245e-06, + "loss": 1.45, + "step": 13527 + }, + { + "epoch": 0.7396088186651723, + "grad_norm": 1.2167022228240967, + "learning_rate": 3.553168076244938e-06, + "loss": 1.5636, + "step": 13528 + }, + { + "epoch": 0.7396634911088938, + "grad_norm": 1.7684038877487183, + "learning_rate": 3.5517712608737e-06, + "loss": 1.3609, + "step": 13529 + }, + { + "epoch": 0.7397181635526154, + "grad_norm": 1.3611801862716675, + "learning_rate": 3.5503746608239487e-06, + "loss": 1.6665, + "step": 13530 + }, + { + "epoch": 0.7397728359963369, + "grad_norm": 1.6211836338043213, + "learning_rate": 3.548978276142323e-06, + "loss": 1.4681, + "step": 13531 + }, + { + "epoch": 0.7398275084400585, + "grad_norm": 1.3403797149658203, + "learning_rate": 3.547582106875447e-06, + "loss": 1.4821, + "step": 13532 + }, + { + "epoch": 0.7398821808837801, + "grad_norm": 1.360535740852356, + "learning_rate": 3.546186153069948e-06, + "loss": 1.3856, + "step": 13533 + }, + { + "epoch": 0.7399368533275016, + "grad_norm": 1.7772529125213623, + "learning_rate": 3.5447904147724344e-06, + "loss": 1.3501, + "step": 13534 + }, + { + "epoch": 0.7399915257712232, + "grad_norm": 1.411773920059204, + "learning_rate": 3.5433948920295216e-06, + "loss": 1.2141, + "step": 13535 + }, + { + "epoch": 0.7400461982149447, + "grad_norm": 1.556365966796875, + "learning_rate": 3.541999584887802e-06, + "loss": 1.4561, + "step": 13536 + }, + { + "epoch": 0.7401008706586663, + "grad_norm": 1.4803986549377441, + "learning_rate": 3.5406044933938688e-06, + "loss": 1.2446, + "step": 13537 + }, + { + "epoch": 0.7401555431023878, + "grad_norm": 1.4750055074691772, + "learning_rate": 3.5392096175943113e-06, + "loss": 1.4979, + "step": 13538 + }, + { + "epoch": 0.7402102155461093, + "grad_norm": 1.6214041709899902, + "learning_rate": 3.5378149575357058e-06, + "loss": 1.3023, + "step": 13539 + }, + { + "epoch": 0.7402648879898309, + "grad_norm": 1.593661904335022, + "learning_rate": 3.536420513264619e-06, + "loss": 1.4531, + "step": 13540 + }, + { + "epoch": 0.7403195604335525, + "grad_norm": 1.7279343605041504, + "learning_rate": 3.535026284827623e-06, + "loss": 1.3411, + "step": 13541 + }, + { + "epoch": 0.740374232877274, + "grad_norm": 1.673140048980713, + "learning_rate": 3.533632272271269e-06, + "loss": 1.1679, + "step": 13542 + }, + { + "epoch": 0.7404289053209956, + "grad_norm": 1.7157189846038818, + "learning_rate": 3.532238475642108e-06, + "loss": 1.4485, + "step": 13543 + }, + { + "epoch": 0.7404835777647172, + "grad_norm": 1.6784745454788208, + "learning_rate": 3.5308448949866805e-06, + "loss": 1.4228, + "step": 13544 + }, + { + "epoch": 0.7405382502084387, + "grad_norm": 1.4127827882766724, + "learning_rate": 3.529451530351522e-06, + "loss": 1.3481, + "step": 13545 + }, + { + "epoch": 0.7405929226521603, + "grad_norm": 1.8730837106704712, + "learning_rate": 3.528058381783158e-06, + "loss": 1.5311, + "step": 13546 + }, + { + "epoch": 0.7406475950958818, + "grad_norm": 1.4150112867355347, + "learning_rate": 3.526665449328115e-06, + "loss": 1.6932, + "step": 13547 + }, + { + "epoch": 0.7407022675396033, + "grad_norm": 1.7303844690322876, + "learning_rate": 3.5252727330328996e-06, + "loss": 1.4739, + "step": 13548 + }, + { + "epoch": 0.7407569399833249, + "grad_norm": 1.513749122619629, + "learning_rate": 3.5238802329440234e-06, + "loss": 1.4067, + "step": 13549 + }, + { + "epoch": 0.7408116124270464, + "grad_norm": 1.5933562517166138, + "learning_rate": 3.522487949107983e-06, + "loss": 1.7831, + "step": 13550 + }, + { + "epoch": 0.740866284870768, + "grad_norm": 1.478091835975647, + "learning_rate": 3.5210958815712672e-06, + "loss": 1.496, + "step": 13551 + }, + { + "epoch": 0.7409209573144896, + "grad_norm": 1.4986194372177124, + "learning_rate": 3.5197040303803665e-06, + "loss": 1.5044, + "step": 13552 + }, + { + "epoch": 0.7409756297582111, + "grad_norm": 1.558963418006897, + "learning_rate": 3.5183123955817545e-06, + "loss": 1.4575, + "step": 13553 + }, + { + "epoch": 0.7410303022019327, + "grad_norm": 1.6033846139907837, + "learning_rate": 3.516920977221898e-06, + "loss": 1.2444, + "step": 13554 + }, + { + "epoch": 0.7410849746456543, + "grad_norm": 1.2315057516098022, + "learning_rate": 3.515529775347267e-06, + "loss": 1.54, + "step": 13555 + }, + { + "epoch": 0.7411396470893757, + "grad_norm": 1.6968098878860474, + "learning_rate": 3.514138790004312e-06, + "loss": 1.2919, + "step": 13556 + }, + { + "epoch": 0.7411943195330973, + "grad_norm": 1.8097432851791382, + "learning_rate": 3.5127480212394836e-06, + "loss": 1.3162, + "step": 13557 + }, + { + "epoch": 0.7412489919768189, + "grad_norm": 2.0429158210754395, + "learning_rate": 3.5113574690992203e-06, + "loss": 1.4559, + "step": 13558 + }, + { + "epoch": 0.7413036644205404, + "grad_norm": 1.5949770212173462, + "learning_rate": 3.509967133629958e-06, + "loss": 1.2839, + "step": 13559 + }, + { + "epoch": 0.741358336864262, + "grad_norm": 1.4280574321746826, + "learning_rate": 3.5085770148781195e-06, + "loss": 1.4416, + "step": 13560 + }, + { + "epoch": 0.7414130093079836, + "grad_norm": 1.3137682676315308, + "learning_rate": 3.507187112890129e-06, + "loss": 1.202, + "step": 13561 + }, + { + "epoch": 0.7414676817517051, + "grad_norm": 1.843167781829834, + "learning_rate": 3.505797427712394e-06, + "loss": 1.5121, + "step": 13562 + }, + { + "epoch": 0.7415223541954267, + "grad_norm": 2.5765559673309326, + "learning_rate": 3.504407959391326e-06, + "loss": 1.1946, + "step": 13563 + }, + { + "epoch": 0.7415770266391482, + "grad_norm": 1.622389316558838, + "learning_rate": 3.503018707973318e-06, + "loss": 1.3932, + "step": 13564 + }, + { + "epoch": 0.7416316990828697, + "grad_norm": 1.471714973449707, + "learning_rate": 3.5016296735047584e-06, + "loss": 1.4797, + "step": 13565 + }, + { + "epoch": 0.7416863715265913, + "grad_norm": 1.2496395111083984, + "learning_rate": 3.5002408560320356e-06, + "loss": 1.5243, + "step": 13566 + }, + { + "epoch": 0.7417410439703128, + "grad_norm": 1.7971158027648926, + "learning_rate": 3.4988522556015223e-06, + "loss": 1.3459, + "step": 13567 + }, + { + "epoch": 0.7417957164140344, + "grad_norm": 1.5449950695037842, + "learning_rate": 3.4974638722595887e-06, + "loss": 1.511, + "step": 13568 + }, + { + "epoch": 0.741850388857756, + "grad_norm": 1.457847237586975, + "learning_rate": 3.496075706052594e-06, + "loss": 1.2815, + "step": 13569 + }, + { + "epoch": 0.7419050613014775, + "grad_norm": 1.4136158227920532, + "learning_rate": 3.4946877570268943e-06, + "loss": 1.6193, + "step": 13570 + }, + { + "epoch": 0.7419597337451991, + "grad_norm": 1.7531529664993286, + "learning_rate": 3.493300025228832e-06, + "loss": 1.2692, + "step": 13571 + }, + { + "epoch": 0.7420144061889207, + "grad_norm": 1.7885193824768066, + "learning_rate": 3.4919125107047537e-06, + "loss": 1.5521, + "step": 13572 + }, + { + "epoch": 0.7420690786326422, + "grad_norm": 1.4070873260498047, + "learning_rate": 3.490525213500987e-06, + "loss": 1.4412, + "step": 13573 + }, + { + "epoch": 0.7421237510763637, + "grad_norm": 1.2939636707305908, + "learning_rate": 3.4891381336638565e-06, + "loss": 1.4478, + "step": 13574 + }, + { + "epoch": 0.7421784235200853, + "grad_norm": 1.7086808681488037, + "learning_rate": 3.4877512712396856e-06, + "loss": 1.5068, + "step": 13575 + }, + { + "epoch": 0.7422330959638068, + "grad_norm": 1.395593285560608, + "learning_rate": 3.486364626274776e-06, + "loss": 1.2415, + "step": 13576 + }, + { + "epoch": 0.7422877684075284, + "grad_norm": 1.3202831745147705, + "learning_rate": 3.484978198815442e-06, + "loss": 1.552, + "step": 13577 + }, + { + "epoch": 0.7423424408512499, + "grad_norm": 1.3686121702194214, + "learning_rate": 3.483591988907973e-06, + "loss": 1.4706, + "step": 13578 + }, + { + "epoch": 0.7423971132949715, + "grad_norm": 1.4906132221221924, + "learning_rate": 3.482205996598654e-06, + "loss": 1.4652, + "step": 13579 + }, + { + "epoch": 0.7424517857386931, + "grad_norm": 1.3575156927108765, + "learning_rate": 3.480820221933776e-06, + "loss": 1.4994, + "step": 13580 + }, + { + "epoch": 0.7425064581824146, + "grad_norm": 1.384548544883728, + "learning_rate": 3.4794346649596088e-06, + "loss": 1.5532, + "step": 13581 + }, + { + "epoch": 0.7425611306261362, + "grad_norm": 1.6566728353500366, + "learning_rate": 3.4780493257224192e-06, + "loss": 1.3924, + "step": 13582 + }, + { + "epoch": 0.7426158030698577, + "grad_norm": 1.4460450410842896, + "learning_rate": 3.4766642042684652e-06, + "loss": 1.4371, + "step": 13583 + }, + { + "epoch": 0.7426704755135792, + "grad_norm": 1.5390597581863403, + "learning_rate": 3.4752793006440024e-06, + "loss": 1.5546, + "step": 13584 + }, + { + "epoch": 0.7427251479573008, + "grad_norm": 1.6762757301330566, + "learning_rate": 3.4738946148952703e-06, + "loss": 1.5645, + "step": 13585 + }, + { + "epoch": 0.7427798204010224, + "grad_norm": 1.291329026222229, + "learning_rate": 3.472510147068515e-06, + "loss": 1.4156, + "step": 13586 + }, + { + "epoch": 0.7428344928447439, + "grad_norm": 1.1191927194595337, + "learning_rate": 3.4711258972099624e-06, + "loss": 1.6274, + "step": 13587 + }, + { + "epoch": 0.7428891652884655, + "grad_norm": 1.366170883178711, + "learning_rate": 3.4697418653658345e-06, + "loss": 1.2909, + "step": 13588 + }, + { + "epoch": 0.7429438377321871, + "grad_norm": 1.7759438753128052, + "learning_rate": 3.468358051582352e-06, + "loss": 1.6357, + "step": 13589 + }, + { + "epoch": 0.7429985101759086, + "grad_norm": 1.767216682434082, + "learning_rate": 3.4669744559057173e-06, + "loss": 1.3812, + "step": 13590 + }, + { + "epoch": 0.7430531826196302, + "grad_norm": 1.8095852136611938, + "learning_rate": 3.46559107838214e-06, + "loss": 1.3918, + "step": 13591 + }, + { + "epoch": 0.7431078550633516, + "grad_norm": 1.364656925201416, + "learning_rate": 3.4642079190578094e-06, + "loss": 1.4518, + "step": 13592 + }, + { + "epoch": 0.7431625275070732, + "grad_norm": 1.6571563482284546, + "learning_rate": 3.4628249779789105e-06, + "loss": 1.2164, + "step": 13593 + }, + { + "epoch": 0.7432171999507948, + "grad_norm": 1.5770747661590576, + "learning_rate": 3.461442255191628e-06, + "loss": 1.2758, + "step": 13594 + }, + { + "epoch": 0.7432718723945163, + "grad_norm": 1.7263402938842773, + "learning_rate": 3.4600597507421317e-06, + "loss": 1.4251, + "step": 13595 + }, + { + "epoch": 0.7433265448382379, + "grad_norm": 1.7208935022354126, + "learning_rate": 3.4586774646765875e-06, + "loss": 1.6994, + "step": 13596 + }, + { + "epoch": 0.7433812172819595, + "grad_norm": 1.6678797006607056, + "learning_rate": 3.4572953970411527e-06, + "loss": 1.1775, + "step": 13597 + }, + { + "epoch": 0.743435889725681, + "grad_norm": 2.2417829036712646, + "learning_rate": 3.4559135478819772e-06, + "loss": 1.589, + "step": 13598 + }, + { + "epoch": 0.7434905621694026, + "grad_norm": 1.3859736919403076, + "learning_rate": 3.4545319172452005e-06, + "loss": 1.3141, + "step": 13599 + }, + { + "epoch": 0.7435452346131242, + "grad_norm": 1.7452473640441895, + "learning_rate": 3.4531505051769665e-06, + "loss": 1.3413, + "step": 13600 + }, + { + "epoch": 0.7435999070568456, + "grad_norm": 1.459730863571167, + "learning_rate": 3.4517693117233995e-06, + "loss": 1.7035, + "step": 13601 + }, + { + "epoch": 0.7436545795005672, + "grad_norm": 1.1599042415618896, + "learning_rate": 3.450388336930618e-06, + "loss": 1.4553, + "step": 13602 + }, + { + "epoch": 0.7437092519442888, + "grad_norm": 2.408548593521118, + "learning_rate": 3.449007580844742e-06, + "loss": 1.1566, + "step": 13603 + }, + { + "epoch": 0.7437639243880103, + "grad_norm": 1.5350817441940308, + "learning_rate": 3.447627043511872e-06, + "loss": 1.3727, + "step": 13604 + }, + { + "epoch": 0.7438185968317319, + "grad_norm": 1.291683554649353, + "learning_rate": 3.446246724978115e-06, + "loss": 1.6246, + "step": 13605 + }, + { + "epoch": 0.7438732692754534, + "grad_norm": 1.8683141469955444, + "learning_rate": 3.444866625289558e-06, + "loss": 1.2222, + "step": 13606 + }, + { + "epoch": 0.743927941719175, + "grad_norm": 1.26542329788208, + "learning_rate": 3.4434867444922857e-06, + "loss": 1.6815, + "step": 13607 + }, + { + "epoch": 0.7439826141628966, + "grad_norm": 1.4425073862075806, + "learning_rate": 3.4421070826323775e-06, + "loss": 1.4694, + "step": 13608 + }, + { + "epoch": 0.744037286606618, + "grad_norm": 1.3886685371398926, + "learning_rate": 3.440727639755902e-06, + "loss": 1.637, + "step": 13609 + }, + { + "epoch": 0.7440919590503396, + "grad_norm": 1.8198752403259277, + "learning_rate": 3.4393484159089187e-06, + "loss": 1.3196, + "step": 13610 + }, + { + "epoch": 0.7441466314940612, + "grad_norm": 1.735657811164856, + "learning_rate": 3.4379694111374904e-06, + "loss": 1.4708, + "step": 13611 + }, + { + "epoch": 0.7442013039377827, + "grad_norm": 1.95780611038208, + "learning_rate": 3.4365906254876623e-06, + "loss": 1.3104, + "step": 13612 + }, + { + "epoch": 0.7442559763815043, + "grad_norm": 1.7772870063781738, + "learning_rate": 3.4352120590054705e-06, + "loss": 1.2261, + "step": 13613 + }, + { + "epoch": 0.7443106488252259, + "grad_norm": 1.187445044517517, + "learning_rate": 3.433833711736957e-06, + "loss": 1.5037, + "step": 13614 + }, + { + "epoch": 0.7443653212689474, + "grad_norm": 1.3714295625686646, + "learning_rate": 3.4324555837281435e-06, + "loss": 1.4709, + "step": 13615 + }, + { + "epoch": 0.744419993712669, + "grad_norm": 1.541676640510559, + "learning_rate": 3.431077675025045e-06, + "loss": 1.4939, + "step": 13616 + }, + { + "epoch": 0.7444746661563906, + "grad_norm": 1.1428449153900146, + "learning_rate": 3.4296999856736824e-06, + "loss": 1.5412, + "step": 13617 + }, + { + "epoch": 0.7445293386001121, + "grad_norm": 1.5325076580047607, + "learning_rate": 3.4283225157200507e-06, + "loss": 1.551, + "step": 13618 + }, + { + "epoch": 0.7445840110438336, + "grad_norm": 1.4853572845458984, + "learning_rate": 3.4269452652101543e-06, + "loss": 1.4729, + "step": 13619 + }, + { + "epoch": 0.7446386834875551, + "grad_norm": 1.6316351890563965, + "learning_rate": 3.42556823418998e-06, + "loss": 1.3682, + "step": 13620 + }, + { + "epoch": 0.7446933559312767, + "grad_norm": 1.3634811639785767, + "learning_rate": 3.4241914227055096e-06, + "loss": 1.3715, + "step": 13621 + }, + { + "epoch": 0.7447480283749983, + "grad_norm": 1.8535641431808472, + "learning_rate": 3.4228148308027186e-06, + "loss": 1.7449, + "step": 13622 + }, + { + "epoch": 0.7448027008187198, + "grad_norm": 1.497046947479248, + "learning_rate": 3.421438458527574e-06, + "loss": 1.1785, + "step": 13623 + }, + { + "epoch": 0.7448573732624414, + "grad_norm": 1.57645583152771, + "learning_rate": 3.4200623059260328e-06, + "loss": 1.54, + "step": 13624 + }, + { + "epoch": 0.744912045706163, + "grad_norm": 3.386538505554199, + "learning_rate": 3.4186863730440554e-06, + "loss": 1.3654, + "step": 13625 + }, + { + "epoch": 0.7449667181498845, + "grad_norm": 1.3944125175476074, + "learning_rate": 3.417310659927583e-06, + "loss": 1.3377, + "step": 13626 + }, + { + "epoch": 0.7450213905936061, + "grad_norm": 1.7473695278167725, + "learning_rate": 3.4159351666225515e-06, + "loss": 1.6927, + "step": 13627 + }, + { + "epoch": 0.7450760630373277, + "grad_norm": 1.4959642887115479, + "learning_rate": 3.414559893174898e-06, + "loss": 1.7, + "step": 13628 + }, + { + "epoch": 0.7451307354810491, + "grad_norm": 1.85264253616333, + "learning_rate": 3.4131848396305423e-06, + "loss": 1.2242, + "step": 13629 + }, + { + "epoch": 0.7451854079247707, + "grad_norm": 2.0092873573303223, + "learning_rate": 3.4118100060353985e-06, + "loss": 1.3375, + "step": 13630 + }, + { + "epoch": 0.7452400803684923, + "grad_norm": 1.6851097345352173, + "learning_rate": 3.4104353924353818e-06, + "loss": 1.5285, + "step": 13631 + }, + { + "epoch": 0.7452947528122138, + "grad_norm": 1.527086615562439, + "learning_rate": 3.4090609988763867e-06, + "loss": 1.5088, + "step": 13632 + }, + { + "epoch": 0.7453494252559354, + "grad_norm": 1.2942906618118286, + "learning_rate": 3.4076868254043138e-06, + "loss": 1.7792, + "step": 13633 + }, + { + "epoch": 0.7454040976996569, + "grad_norm": 1.9732340574264526, + "learning_rate": 3.4063128720650475e-06, + "loss": 1.6541, + "step": 13634 + }, + { + "epoch": 0.7454587701433785, + "grad_norm": 1.2819864749908447, + "learning_rate": 3.4049391389044674e-06, + "loss": 1.3954, + "step": 13635 + }, + { + "epoch": 0.7455134425871001, + "grad_norm": 1.3501678705215454, + "learning_rate": 3.4035656259684446e-06, + "loss": 1.2791, + "step": 13636 + }, + { + "epoch": 0.7455681150308215, + "grad_norm": 1.6023873090744019, + "learning_rate": 3.402192333302845e-06, + "loss": 1.5856, + "step": 13637 + }, + { + "epoch": 0.7456227874745431, + "grad_norm": 1.6761362552642822, + "learning_rate": 3.4008192609535216e-06, + "loss": 1.7443, + "step": 13638 + }, + { + "epoch": 0.7456774599182647, + "grad_norm": 1.1320700645446777, + "learning_rate": 3.3994464089663327e-06, + "loss": 1.637, + "step": 13639 + }, + { + "epoch": 0.7457321323619862, + "grad_norm": 1.7192710638046265, + "learning_rate": 3.3980737773871163e-06, + "loss": 1.4967, + "step": 13640 + }, + { + "epoch": 0.7457868048057078, + "grad_norm": 1.3434323072433472, + "learning_rate": 3.3967013662617053e-06, + "loss": 1.3709, + "step": 13641 + }, + { + "epoch": 0.7458414772494294, + "grad_norm": 1.5678414106369019, + "learning_rate": 3.3953291756359354e-06, + "loss": 1.5461, + "step": 13642 + }, + { + "epoch": 0.7458961496931509, + "grad_norm": 2.193091630935669, + "learning_rate": 3.3939572055556203e-06, + "loss": 1.3274, + "step": 13643 + }, + { + "epoch": 0.7459508221368725, + "grad_norm": 1.574170708656311, + "learning_rate": 3.392585456066574e-06, + "loss": 1.452, + "step": 13644 + }, + { + "epoch": 0.7460054945805941, + "grad_norm": 1.997931718826294, + "learning_rate": 3.3912139272146073e-06, + "loss": 1.4722, + "step": 13645 + }, + { + "epoch": 0.7460601670243155, + "grad_norm": 1.2781893014907837, + "learning_rate": 3.3898426190455147e-06, + "loss": 1.3101, + "step": 13646 + }, + { + "epoch": 0.7461148394680371, + "grad_norm": 1.6194285154342651, + "learning_rate": 3.3884715316050886e-06, + "loss": 1.5126, + "step": 13647 + }, + { + "epoch": 0.7461695119117586, + "grad_norm": 1.1969635486602783, + "learning_rate": 3.3871006649391126e-06, + "loss": 1.6301, + "step": 13648 + }, + { + "epoch": 0.7462241843554802, + "grad_norm": 1.2830499410629272, + "learning_rate": 3.3857300190933606e-06, + "loss": 1.3547, + "step": 13649 + }, + { + "epoch": 0.7462788567992018, + "grad_norm": 1.3131215572357178, + "learning_rate": 3.3843595941136065e-06, + "loss": 1.4738, + "step": 13650 + }, + { + "epoch": 0.7463335292429233, + "grad_norm": 1.3927396535873413, + "learning_rate": 3.382989390045609e-06, + "loss": 1.5242, + "step": 13651 + }, + { + "epoch": 0.7463882016866449, + "grad_norm": 1.961129069328308, + "learning_rate": 3.3816194069351204e-06, + "loss": 1.3198, + "step": 13652 + }, + { + "epoch": 0.7464428741303665, + "grad_norm": 1.5151537656784058, + "learning_rate": 3.380249644827894e-06, + "loss": 1.3141, + "step": 13653 + }, + { + "epoch": 0.746497546574088, + "grad_norm": 1.4576524496078491, + "learning_rate": 3.378880103769666e-06, + "loss": 1.5145, + "step": 13654 + }, + { + "epoch": 0.7465522190178095, + "grad_norm": 1.761535406112671, + "learning_rate": 3.3775107838061637e-06, + "loss": 1.6741, + "step": 13655 + }, + { + "epoch": 0.7466068914615311, + "grad_norm": 1.646407961845398, + "learning_rate": 3.376141684983121e-06, + "loss": 1.2258, + "step": 13656 + }, + { + "epoch": 0.7466615639052526, + "grad_norm": 1.7374271154403687, + "learning_rate": 3.3747728073462506e-06, + "loss": 1.3839, + "step": 13657 + }, + { + "epoch": 0.7467162363489742, + "grad_norm": 1.235771894454956, + "learning_rate": 3.3734041509412584e-06, + "loss": 1.3613, + "step": 13658 + }, + { + "epoch": 0.7467709087926958, + "grad_norm": 1.2942501306533813, + "learning_rate": 3.372035715813856e-06, + "loss": 1.4168, + "step": 13659 + }, + { + "epoch": 0.7468255812364173, + "grad_norm": 1.1484177112579346, + "learning_rate": 3.3706675020097335e-06, + "loss": 1.6572, + "step": 13660 + }, + { + "epoch": 0.7468802536801389, + "grad_norm": 2.14180064201355, + "learning_rate": 3.3692995095745796e-06, + "loss": 1.35, + "step": 13661 + }, + { + "epoch": 0.7469349261238605, + "grad_norm": 1.5339125394821167, + "learning_rate": 3.3679317385540744e-06, + "loss": 1.2432, + "step": 13662 + }, + { + "epoch": 0.746989598567582, + "grad_norm": 1.5252187252044678, + "learning_rate": 3.366564188993887e-06, + "loss": 1.367, + "step": 13663 + }, + { + "epoch": 0.7470442710113036, + "grad_norm": 1.3280411958694458, + "learning_rate": 3.365196860939691e-06, + "loss": 1.5632, + "step": 13664 + }, + { + "epoch": 0.747098943455025, + "grad_norm": 2.8206229209899902, + "learning_rate": 3.363829754437141e-06, + "loss": 1.2435, + "step": 13665 + }, + { + "epoch": 0.7471536158987466, + "grad_norm": 1.476351022720337, + "learning_rate": 3.362462869531885e-06, + "loss": 1.4782, + "step": 13666 + }, + { + "epoch": 0.7472082883424682, + "grad_norm": 1.0538511276245117, + "learning_rate": 3.361096206269572e-06, + "loss": 1.5716, + "step": 13667 + }, + { + "epoch": 0.7472629607861897, + "grad_norm": 1.7012938261032104, + "learning_rate": 3.3597297646958348e-06, + "loss": 1.4521, + "step": 13668 + }, + { + "epoch": 0.7473176332299113, + "grad_norm": 1.4927456378936768, + "learning_rate": 3.3583635448563e-06, + "loss": 1.1202, + "step": 13669 + }, + { + "epoch": 0.7473723056736329, + "grad_norm": 1.7021373510360718, + "learning_rate": 3.3569975467965955e-06, + "loss": 1.2051, + "step": 13670 + }, + { + "epoch": 0.7474269781173544, + "grad_norm": 1.819669246673584, + "learning_rate": 3.35563177056233e-06, + "loss": 1.2995, + "step": 13671 + }, + { + "epoch": 0.747481650561076, + "grad_norm": 1.4334443807601929, + "learning_rate": 3.354266216199108e-06, + "loss": 1.4517, + "step": 13672 + }, + { + "epoch": 0.7475363230047976, + "grad_norm": 1.385165810585022, + "learning_rate": 3.3529008837525355e-06, + "loss": 1.4082, + "step": 13673 + }, + { + "epoch": 0.747590995448519, + "grad_norm": 1.396604299545288, + "learning_rate": 3.3515357732682008e-06, + "loss": 1.4305, + "step": 13674 + }, + { + "epoch": 0.7476456678922406, + "grad_norm": 1.6285032033920288, + "learning_rate": 3.350170884791687e-06, + "loss": 1.3715, + "step": 13675 + }, + { + "epoch": 0.7477003403359622, + "grad_norm": 1.6103137731552124, + "learning_rate": 3.348806218368571e-06, + "loss": 1.4037, + "step": 13676 + }, + { + "epoch": 0.7477550127796837, + "grad_norm": 1.5528069734573364, + "learning_rate": 3.347441774044421e-06, + "loss": 1.3531, + "step": 13677 + }, + { + "epoch": 0.7478096852234053, + "grad_norm": 1.402382254600525, + "learning_rate": 3.3460775518648037e-06, + "loss": 1.6135, + "step": 13678 + }, + { + "epoch": 0.7478643576671268, + "grad_norm": 1.829032063484192, + "learning_rate": 3.3447135518752705e-06, + "loss": 1.1625, + "step": 13679 + }, + { + "epoch": 0.7479190301108484, + "grad_norm": 1.3646185398101807, + "learning_rate": 3.343349774121366e-06, + "loss": 1.5387, + "step": 13680 + }, + { + "epoch": 0.74797370255457, + "grad_norm": 1.5214585065841675, + "learning_rate": 3.3419862186486364e-06, + "loss": 1.3638, + "step": 13681 + }, + { + "epoch": 0.7480283749982914, + "grad_norm": 1.9226253032684326, + "learning_rate": 3.34062288550261e-06, + "loss": 1.3767, + "step": 13682 + }, + { + "epoch": 0.748083047442013, + "grad_norm": 1.6127724647521973, + "learning_rate": 3.339259774728809e-06, + "loss": 1.5024, + "step": 13683 + }, + { + "epoch": 0.7481377198857346, + "grad_norm": 1.5918904542922974, + "learning_rate": 3.337896886372757e-06, + "loss": 1.5652, + "step": 13684 + }, + { + "epoch": 0.7481923923294561, + "grad_norm": 1.7276006937026978, + "learning_rate": 3.3365342204799613e-06, + "loss": 1.6869, + "step": 13685 + }, + { + "epoch": 0.7482470647731777, + "grad_norm": 1.6417653560638428, + "learning_rate": 3.3351717770959246e-06, + "loss": 1.458, + "step": 13686 + }, + { + "epoch": 0.7483017372168993, + "grad_norm": 1.7460325956344604, + "learning_rate": 3.333809556266142e-06, + "loss": 1.3455, + "step": 13687 + }, + { + "epoch": 0.7483564096606208, + "grad_norm": 1.355178713798523, + "learning_rate": 3.3324475580361005e-06, + "loss": 1.3681, + "step": 13688 + }, + { + "epoch": 0.7484110821043424, + "grad_norm": 1.7671818733215332, + "learning_rate": 3.3310857824512776e-06, + "loss": 1.3282, + "step": 13689 + }, + { + "epoch": 0.748465754548064, + "grad_norm": 1.234691858291626, + "learning_rate": 3.329724229557153e-06, + "loss": 1.5494, + "step": 13690 + }, + { + "epoch": 0.7485204269917854, + "grad_norm": 1.5650933980941772, + "learning_rate": 3.3283628993991846e-06, + "loss": 1.4219, + "step": 13691 + }, + { + "epoch": 0.748575099435507, + "grad_norm": 1.5814632177352905, + "learning_rate": 3.327001792022839e-06, + "loss": 1.3259, + "step": 13692 + }, + { + "epoch": 0.7486297718792285, + "grad_norm": 1.2690531015396118, + "learning_rate": 3.325640907473562e-06, + "loss": 1.6041, + "step": 13693 + }, + { + "epoch": 0.7486844443229501, + "grad_norm": 1.2260929346084595, + "learning_rate": 3.3242802457967928e-06, + "loss": 1.4368, + "step": 13694 + }, + { + "epoch": 0.7487391167666717, + "grad_norm": 1.3191190958023071, + "learning_rate": 3.3229198070379754e-06, + "loss": 1.6618, + "step": 13695 + }, + { + "epoch": 0.7487937892103932, + "grad_norm": 1.7168822288513184, + "learning_rate": 3.3215595912425336e-06, + "loss": 1.5248, + "step": 13696 + }, + { + "epoch": 0.7488484616541148, + "grad_norm": 1.7196176052093506, + "learning_rate": 3.3201995984558854e-06, + "loss": 1.496, + "step": 13697 + }, + { + "epoch": 0.7489031340978364, + "grad_norm": 1.3926562070846558, + "learning_rate": 3.3188398287234504e-06, + "loss": 1.4343, + "step": 13698 + }, + { + "epoch": 0.7489578065415579, + "grad_norm": 1.3024166822433472, + "learning_rate": 3.3174802820906315e-06, + "loss": 1.5449, + "step": 13699 + }, + { + "epoch": 0.7490124789852795, + "grad_norm": 1.9939589500427246, + "learning_rate": 3.3161209586028265e-06, + "loss": 1.4303, + "step": 13700 + }, + { + "epoch": 0.749067151429001, + "grad_norm": 1.899216651916504, + "learning_rate": 3.3147618583054277e-06, + "loss": 1.4724, + "step": 13701 + }, + { + "epoch": 0.7491218238727225, + "grad_norm": 1.6589308977127075, + "learning_rate": 3.313402981243817e-06, + "loss": 1.3696, + "step": 13702 + }, + { + "epoch": 0.7491764963164441, + "grad_norm": 1.5675162076950073, + "learning_rate": 3.3120443274633683e-06, + "loss": 1.4775, + "step": 13703 + }, + { + "epoch": 0.7492311687601657, + "grad_norm": 1.7592430114746094, + "learning_rate": 3.310685897009457e-06, + "loss": 1.6602, + "step": 13704 + }, + { + "epoch": 0.7492858412038872, + "grad_norm": 1.3979049921035767, + "learning_rate": 3.3093276899274373e-06, + "loss": 1.3496, + "step": 13705 + }, + { + "epoch": 0.7493405136476088, + "grad_norm": 1.5998073816299438, + "learning_rate": 3.307969706262669e-06, + "loss": 1.5295, + "step": 13706 + }, + { + "epoch": 0.7493951860913303, + "grad_norm": 1.4756927490234375, + "learning_rate": 3.306611946060496e-06, + "loss": 1.4904, + "step": 13707 + }, + { + "epoch": 0.7494498585350519, + "grad_norm": 1.8269339799880981, + "learning_rate": 3.3052544093662533e-06, + "loss": 1.5728, + "step": 13708 + }, + { + "epoch": 0.7495045309787735, + "grad_norm": 2.1966745853424072, + "learning_rate": 3.3038970962252793e-06, + "loss": 1.2706, + "step": 13709 + }, + { + "epoch": 0.7495592034224949, + "grad_norm": 1.7675530910491943, + "learning_rate": 3.3025400066828926e-06, + "loss": 1.591, + "step": 13710 + }, + { + "epoch": 0.7496138758662165, + "grad_norm": 1.3343864679336548, + "learning_rate": 3.3011831407844085e-06, + "loss": 1.3255, + "step": 13711 + }, + { + "epoch": 0.7496685483099381, + "grad_norm": 2.0085909366607666, + "learning_rate": 3.2998264985751425e-06, + "loss": 1.6899, + "step": 13712 + }, + { + "epoch": 0.7497232207536596, + "grad_norm": 1.6812665462493896, + "learning_rate": 3.298470080100392e-06, + "loss": 1.4573, + "step": 13713 + }, + { + "epoch": 0.7497778931973812, + "grad_norm": 1.2535933256149292, + "learning_rate": 3.2971138854054506e-06, + "loss": 1.4245, + "step": 13714 + }, + { + "epoch": 0.7498325656411028, + "grad_norm": 1.3732502460479736, + "learning_rate": 3.2957579145356067e-06, + "loss": 1.5936, + "step": 13715 + }, + { + "epoch": 0.7498872380848243, + "grad_norm": 2.2018799781799316, + "learning_rate": 3.2944021675361372e-06, + "loss": 1.2699, + "step": 13716 + }, + { + "epoch": 0.7499419105285459, + "grad_norm": 1.4990369081497192, + "learning_rate": 3.2930466444523112e-06, + "loss": 1.4583, + "step": 13717 + }, + { + "epoch": 0.7499965829722675, + "grad_norm": 1.9040018320083618, + "learning_rate": 3.2916913453293984e-06, + "loss": 1.4728, + "step": 13718 + }, + { + "epoch": 0.7500512554159889, + "grad_norm": 1.4726508855819702, + "learning_rate": 3.2903362702126516e-06, + "loss": 1.2777, + "step": 13719 + }, + { + "epoch": 0.7501059278597105, + "grad_norm": 1.206283688545227, + "learning_rate": 3.2889814191473234e-06, + "loss": 1.7409, + "step": 13720 + }, + { + "epoch": 0.750160600303432, + "grad_norm": 1.5264368057250977, + "learning_rate": 3.2876267921786544e-06, + "loss": 1.4976, + "step": 13721 + }, + { + "epoch": 0.7502152727471536, + "grad_norm": 1.2914289236068726, + "learning_rate": 3.2862723893518743e-06, + "loss": 1.2484, + "step": 13722 + }, + { + "epoch": 0.7502699451908752, + "grad_norm": 1.6488667726516724, + "learning_rate": 3.284918210712217e-06, + "loss": 1.1508, + "step": 13723 + }, + { + "epoch": 0.7503246176345967, + "grad_norm": 1.5970045328140259, + "learning_rate": 3.2835642563048977e-06, + "loss": 1.6299, + "step": 13724 + }, + { + "epoch": 0.7503792900783183, + "grad_norm": 1.654013752937317, + "learning_rate": 3.282210526175128e-06, + "loss": 1.4068, + "step": 13725 + }, + { + "epoch": 0.7504339625220399, + "grad_norm": 1.4912865161895752, + "learning_rate": 3.2808570203681135e-06, + "loss": 1.56, + "step": 13726 + }, + { + "epoch": 0.7504886349657613, + "grad_norm": 1.5789504051208496, + "learning_rate": 3.2795037389290498e-06, + "loss": 1.4057, + "step": 13727 + }, + { + "epoch": 0.7505433074094829, + "grad_norm": 1.6031739711761475, + "learning_rate": 3.278150681903123e-06, + "loss": 1.5156, + "step": 13728 + }, + { + "epoch": 0.7505979798532045, + "grad_norm": 1.3931468725204468, + "learning_rate": 3.2767978493355214e-06, + "loss": 1.4362, + "step": 13729 + }, + { + "epoch": 0.750652652296926, + "grad_norm": 1.6879457235336304, + "learning_rate": 3.2754452412714153e-06, + "loss": 1.4015, + "step": 13730 + }, + { + "epoch": 0.7507073247406476, + "grad_norm": 1.5252375602722168, + "learning_rate": 3.2740928577559705e-06, + "loss": 1.4377, + "step": 13731 + }, + { + "epoch": 0.7507619971843692, + "grad_norm": 1.4807918071746826, + "learning_rate": 3.2727406988343504e-06, + "loss": 1.7432, + "step": 13732 + }, + { + "epoch": 0.7508166696280907, + "grad_norm": 2.3281877040863037, + "learning_rate": 3.271388764551702e-06, + "loss": 1.3561, + "step": 13733 + }, + { + "epoch": 0.7508713420718123, + "grad_norm": 1.2486114501953125, + "learning_rate": 3.2700370549531734e-06, + "loss": 1.5221, + "step": 13734 + }, + { + "epoch": 0.7509260145155338, + "grad_norm": 1.6408443450927734, + "learning_rate": 3.2686855700839017e-06, + "loss": 1.6222, + "step": 13735 + }, + { + "epoch": 0.7509806869592554, + "grad_norm": 1.419606328010559, + "learning_rate": 3.26733430998901e-06, + "loss": 1.5186, + "step": 13736 + }, + { + "epoch": 0.7510353594029769, + "grad_norm": 1.502598524093628, + "learning_rate": 3.2659832747136276e-06, + "loss": 1.4, + "step": 13737 + }, + { + "epoch": 0.7510900318466984, + "grad_norm": 2.1110353469848633, + "learning_rate": 3.264632464302867e-06, + "loss": 1.6418, + "step": 13738 + }, + { + "epoch": 0.75114470429042, + "grad_norm": 1.5786893367767334, + "learning_rate": 3.2632818788018317e-06, + "loss": 1.3849, + "step": 13739 + }, + { + "epoch": 0.7511993767341416, + "grad_norm": 1.8422304391860962, + "learning_rate": 3.2619315182556234e-06, + "loss": 1.2709, + "step": 13740 + }, + { + "epoch": 0.7512540491778631, + "grad_norm": 1.268247127532959, + "learning_rate": 3.2605813827093335e-06, + "loss": 1.6232, + "step": 13741 + }, + { + "epoch": 0.7513087216215847, + "grad_norm": 1.4594395160675049, + "learning_rate": 3.259231472208042e-06, + "loss": 1.7215, + "step": 13742 + }, + { + "epoch": 0.7513633940653063, + "grad_norm": 1.6207175254821777, + "learning_rate": 3.2578817867968327e-06, + "loss": 1.3369, + "step": 13743 + }, + { + "epoch": 0.7514180665090278, + "grad_norm": 1.6728192567825317, + "learning_rate": 3.2565323265207718e-06, + "loss": 1.5893, + "step": 13744 + }, + { + "epoch": 0.7514727389527494, + "grad_norm": 1.4734357595443726, + "learning_rate": 3.255183091424916e-06, + "loss": 1.5943, + "step": 13745 + }, + { + "epoch": 0.751527411396471, + "grad_norm": 1.3786202669143677, + "learning_rate": 3.253834081554329e-06, + "loss": 1.2862, + "step": 13746 + }, + { + "epoch": 0.7515820838401924, + "grad_norm": 1.5796432495117188, + "learning_rate": 3.2524852969540477e-06, + "loss": 1.4559, + "step": 13747 + }, + { + "epoch": 0.751636756283914, + "grad_norm": 1.6729947328567505, + "learning_rate": 3.2511367376691194e-06, + "loss": 1.3719, + "step": 13748 + }, + { + "epoch": 0.7516914287276355, + "grad_norm": 1.527845025062561, + "learning_rate": 3.2497884037445726e-06, + "loss": 1.4617, + "step": 13749 + }, + { + "epoch": 0.7517461011713571, + "grad_norm": 1.4721699953079224, + "learning_rate": 3.248440295225428e-06, + "loss": 1.5423, + "step": 13750 + }, + { + "epoch": 0.7518007736150787, + "grad_norm": 1.5290217399597168, + "learning_rate": 3.2470924121567072e-06, + "loss": 1.2785, + "step": 13751 + }, + { + "epoch": 0.7518554460588002, + "grad_norm": 1.685876727104187, + "learning_rate": 3.2457447545834177e-06, + "loss": 1.527, + "step": 13752 + }, + { + "epoch": 0.7519101185025218, + "grad_norm": 1.5738472938537598, + "learning_rate": 3.24439732255056e-06, + "loss": 1.4782, + "step": 13753 + }, + { + "epoch": 0.7519647909462434, + "grad_norm": 1.833475947380066, + "learning_rate": 3.243050116103128e-06, + "loss": 1.611, + "step": 13754 + }, + { + "epoch": 0.7520194633899648, + "grad_norm": 1.2568439245224, + "learning_rate": 3.2417031352861085e-06, + "loss": 1.336, + "step": 13755 + }, + { + "epoch": 0.7520741358336864, + "grad_norm": 1.6815712451934814, + "learning_rate": 3.2403563801444772e-06, + "loss": 1.6401, + "step": 13756 + }, + { + "epoch": 0.752128808277408, + "grad_norm": 1.6363037824630737, + "learning_rate": 3.2390098507232113e-06, + "loss": 1.4352, + "step": 13757 + }, + { + "epoch": 0.7521834807211295, + "grad_norm": 1.4164048433303833, + "learning_rate": 3.2376635470672713e-06, + "loss": 1.5025, + "step": 13758 + }, + { + "epoch": 0.7522381531648511, + "grad_norm": 2.2671334743499756, + "learning_rate": 3.2363174692216113e-06, + "loss": 1.4768, + "step": 13759 + }, + { + "epoch": 0.7522928256085727, + "grad_norm": 1.6834499835968018, + "learning_rate": 3.234971617231185e-06, + "loss": 1.4255, + "step": 13760 + }, + { + "epoch": 0.7523474980522942, + "grad_norm": 1.9459843635559082, + "learning_rate": 3.2336259911409283e-06, + "loss": 1.5053, + "step": 13761 + }, + { + "epoch": 0.7524021704960158, + "grad_norm": 1.446134328842163, + "learning_rate": 3.23228059099578e-06, + "loss": 1.3836, + "step": 13762 + }, + { + "epoch": 0.7524568429397372, + "grad_norm": 1.409437656402588, + "learning_rate": 3.230935416840665e-06, + "loss": 1.3703, + "step": 13763 + }, + { + "epoch": 0.7525115153834588, + "grad_norm": 1.4942805767059326, + "learning_rate": 3.2295904687204995e-06, + "loss": 1.2599, + "step": 13764 + }, + { + "epoch": 0.7525661878271804, + "grad_norm": 1.3110815286636353, + "learning_rate": 3.2282457466801962e-06, + "loss": 1.2033, + "step": 13765 + }, + { + "epoch": 0.7526208602709019, + "grad_norm": 1.2701975107192993, + "learning_rate": 3.226901250764657e-06, + "loss": 1.5445, + "step": 13766 + }, + { + "epoch": 0.7526755327146235, + "grad_norm": 1.4524223804473877, + "learning_rate": 3.225556981018776e-06, + "loss": 1.2369, + "step": 13767 + }, + { + "epoch": 0.7527302051583451, + "grad_norm": 1.452161431312561, + "learning_rate": 3.2242129374874478e-06, + "loss": 1.3819, + "step": 13768 + }, + { + "epoch": 0.7527848776020666, + "grad_norm": 1.7388790845870972, + "learning_rate": 3.222869120215548e-06, + "loss": 1.6158, + "step": 13769 + }, + { + "epoch": 0.7528395500457882, + "grad_norm": 1.7691928148269653, + "learning_rate": 3.2215255292479496e-06, + "loss": 1.6767, + "step": 13770 + }, + { + "epoch": 0.7528942224895098, + "grad_norm": 1.4256139993667603, + "learning_rate": 3.2201821646295227e-06, + "loss": 1.4086, + "step": 13771 + }, + { + "epoch": 0.7529488949332312, + "grad_norm": 1.8768035173416138, + "learning_rate": 3.2188390264051226e-06, + "loss": 1.4935, + "step": 13772 + }, + { + "epoch": 0.7530035673769528, + "grad_norm": 1.6871310472488403, + "learning_rate": 3.217496114619596e-06, + "loss": 1.3078, + "step": 13773 + }, + { + "epoch": 0.7530582398206744, + "grad_norm": 1.7498747110366821, + "learning_rate": 3.2161534293177942e-06, + "loss": 1.4713, + "step": 13774 + }, + { + "epoch": 0.7531129122643959, + "grad_norm": 1.8307667970657349, + "learning_rate": 3.2148109705445442e-06, + "loss": 1.256, + "step": 13775 + }, + { + "epoch": 0.7531675847081175, + "grad_norm": 1.5121124982833862, + "learning_rate": 3.2134687383446815e-06, + "loss": 1.4458, + "step": 13776 + }, + { + "epoch": 0.753222257151839, + "grad_norm": 1.4437909126281738, + "learning_rate": 3.2121267327630222e-06, + "loss": 1.5215, + "step": 13777 + }, + { + "epoch": 0.7532769295955606, + "grad_norm": 1.7730122804641724, + "learning_rate": 3.2107849538443802e-06, + "loss": 1.4282, + "step": 13778 + }, + { + "epoch": 0.7533316020392822, + "grad_norm": 1.4758071899414062, + "learning_rate": 3.20944340163356e-06, + "loss": 1.4502, + "step": 13779 + }, + { + "epoch": 0.7533862744830037, + "grad_norm": 1.6585609912872314, + "learning_rate": 3.208102076175358e-06, + "loss": 1.4304, + "step": 13780 + }, + { + "epoch": 0.7534409469267253, + "grad_norm": 1.3931689262390137, + "learning_rate": 3.2067609775145625e-06, + "loss": 1.4832, + "step": 13781 + }, + { + "epoch": 0.7534956193704468, + "grad_norm": 1.5114768743515015, + "learning_rate": 3.205420105695963e-06, + "loss": 1.4316, + "step": 13782 + }, + { + "epoch": 0.7535502918141683, + "grad_norm": 1.315360188484192, + "learning_rate": 3.20407946076433e-06, + "loss": 1.3574, + "step": 13783 + }, + { + "epoch": 0.7536049642578899, + "grad_norm": 1.402938961982727, + "learning_rate": 3.2027390427644267e-06, + "loss": 1.3018, + "step": 13784 + }, + { + "epoch": 0.7536596367016115, + "grad_norm": 1.7090506553649902, + "learning_rate": 3.201398851741021e-06, + "loss": 1.2935, + "step": 13785 + }, + { + "epoch": 0.753714309145333, + "grad_norm": 2.2083752155303955, + "learning_rate": 3.2000588877388606e-06, + "loss": 1.4353, + "step": 13786 + }, + { + "epoch": 0.7537689815890546, + "grad_norm": 1.4826505184173584, + "learning_rate": 3.1987191508026884e-06, + "loss": 1.2892, + "step": 13787 + }, + { + "epoch": 0.7538236540327762, + "grad_norm": 1.3954570293426514, + "learning_rate": 3.197379640977245e-06, + "loss": 1.2807, + "step": 13788 + }, + { + "epoch": 0.7538783264764977, + "grad_norm": 1.5274578332901, + "learning_rate": 3.1960403583072596e-06, + "loss": 1.1917, + "step": 13789 + }, + { + "epoch": 0.7539329989202193, + "grad_norm": 1.9412970542907715, + "learning_rate": 3.1947013028374517e-06, + "loss": 1.385, + "step": 13790 + }, + { + "epoch": 0.7539876713639407, + "grad_norm": 1.5983210802078247, + "learning_rate": 3.1933624746125368e-06, + "loss": 1.4127, + "step": 13791 + }, + { + "epoch": 0.7540423438076623, + "grad_norm": 1.403420090675354, + "learning_rate": 3.192023873677218e-06, + "loss": 1.358, + "step": 13792 + }, + { + "epoch": 0.7540970162513839, + "grad_norm": 1.698994517326355, + "learning_rate": 3.1906855000762005e-06, + "loss": 1.3544, + "step": 13793 + }, + { + "epoch": 0.7541516886951054, + "grad_norm": 1.4994620084762573, + "learning_rate": 3.189347353854173e-06, + "loss": 1.6214, + "step": 13794 + }, + { + "epoch": 0.754206361138827, + "grad_norm": 1.6866140365600586, + "learning_rate": 3.1880094350558155e-06, + "loss": 1.4264, + "step": 13795 + }, + { + "epoch": 0.7542610335825486, + "grad_norm": 1.6210483312606812, + "learning_rate": 3.186671743725812e-06, + "loss": 1.4057, + "step": 13796 + }, + { + "epoch": 0.7543157060262701, + "grad_norm": 1.3638100624084473, + "learning_rate": 3.185334279908826e-06, + "loss": 1.5148, + "step": 13797 + }, + { + "epoch": 0.7543703784699917, + "grad_norm": 1.6505661010742188, + "learning_rate": 3.183997043649516e-06, + "loss": 1.5524, + "step": 13798 + }, + { + "epoch": 0.7544250509137133, + "grad_norm": 1.4122202396392822, + "learning_rate": 3.1826600349925427e-06, + "loss": 1.471, + "step": 13799 + }, + { + "epoch": 0.7544797233574347, + "grad_norm": 1.2393238544464111, + "learning_rate": 3.181323253982549e-06, + "loss": 1.1994, + "step": 13800 + }, + { + "epoch": 0.7545343958011563, + "grad_norm": 1.475633144378662, + "learning_rate": 3.1799867006641684e-06, + "loss": 1.5633, + "step": 13801 + }, + { + "epoch": 0.7545890682448779, + "grad_norm": 1.5848616361618042, + "learning_rate": 3.1786503750820384e-06, + "loss": 1.5859, + "step": 13802 + }, + { + "epoch": 0.7546437406885994, + "grad_norm": 1.6849051713943481, + "learning_rate": 3.1773142772807796e-06, + "loss": 1.4045, + "step": 13803 + }, + { + "epoch": 0.754698413132321, + "grad_norm": 1.3719087839126587, + "learning_rate": 3.175978407305006e-06, + "loss": 1.5855, + "step": 13804 + }, + { + "epoch": 0.7547530855760425, + "grad_norm": 1.6042982339859009, + "learning_rate": 3.1746427651993273e-06, + "loss": 1.4684, + "step": 13805 + }, + { + "epoch": 0.7548077580197641, + "grad_norm": 1.5238745212554932, + "learning_rate": 3.17330735100834e-06, + "loss": 1.3938, + "step": 13806 + }, + { + "epoch": 0.7548624304634857, + "grad_norm": 1.5369534492492676, + "learning_rate": 3.171972164776642e-06, + "loss": 1.5089, + "step": 13807 + }, + { + "epoch": 0.7549171029072071, + "grad_norm": 1.2445958852767944, + "learning_rate": 3.1706372065488166e-06, + "loss": 1.5521, + "step": 13808 + }, + { + "epoch": 0.7549717753509287, + "grad_norm": 2.0192644596099854, + "learning_rate": 3.1693024763694368e-06, + "loss": 1.7084, + "step": 13809 + }, + { + "epoch": 0.7550264477946503, + "grad_norm": 1.4164246320724487, + "learning_rate": 3.1679679742830806e-06, + "loss": 1.5318, + "step": 13810 + }, + { + "epoch": 0.7550811202383718, + "grad_norm": 1.7272487878799438, + "learning_rate": 3.166633700334304e-06, + "loss": 1.3768, + "step": 13811 + }, + { + "epoch": 0.7551357926820934, + "grad_norm": 1.2693110704421997, + "learning_rate": 3.1652996545676605e-06, + "loss": 1.4889, + "step": 13812 + }, + { + "epoch": 0.755190465125815, + "grad_norm": 1.3068172931671143, + "learning_rate": 3.163965837027703e-06, + "loss": 1.4996, + "step": 13813 + }, + { + "epoch": 0.7552451375695365, + "grad_norm": 1.5255358219146729, + "learning_rate": 3.1626322477589667e-06, + "loss": 1.5198, + "step": 13814 + }, + { + "epoch": 0.7552998100132581, + "grad_norm": 1.6290950775146484, + "learning_rate": 3.161298886805981e-06, + "loss": 1.4018, + "step": 13815 + }, + { + "epoch": 0.7553544824569797, + "grad_norm": 1.761328935623169, + "learning_rate": 3.159965754213277e-06, + "loss": 1.3592, + "step": 13816 + }, + { + "epoch": 0.7554091549007012, + "grad_norm": 1.4742006063461304, + "learning_rate": 3.158632850025367e-06, + "loss": 1.5721, + "step": 13817 + }, + { + "epoch": 0.7554638273444227, + "grad_norm": 1.2055779695510864, + "learning_rate": 3.1573001742867594e-06, + "loss": 1.3047, + "step": 13818 + }, + { + "epoch": 0.7555184997881442, + "grad_norm": 1.1265838146209717, + "learning_rate": 3.1559677270419564e-06, + "loss": 1.5611, + "step": 13819 + }, + { + "epoch": 0.7555731722318658, + "grad_norm": 2.006861448287964, + "learning_rate": 3.1546355083354474e-06, + "loss": 1.2449, + "step": 13820 + }, + { + "epoch": 0.7556278446755874, + "grad_norm": 1.3482569456100464, + "learning_rate": 3.1533035182117254e-06, + "loss": 1.4246, + "step": 13821 + }, + { + "epoch": 0.7556825171193089, + "grad_norm": 2.9220173358917236, + "learning_rate": 3.151971756715264e-06, + "loss": 1.0182, + "step": 13822 + }, + { + "epoch": 0.7557371895630305, + "grad_norm": 1.5153067111968994, + "learning_rate": 3.150640223890533e-06, + "loss": 1.4866, + "step": 13823 + }, + { + "epoch": 0.7557918620067521, + "grad_norm": 1.3645185232162476, + "learning_rate": 3.1493089197820015e-06, + "loss": 1.4782, + "step": 13824 + }, + { + "epoch": 0.7558465344504736, + "grad_norm": 1.6098827123641968, + "learning_rate": 3.147977844434119e-06, + "loss": 1.3582, + "step": 13825 + }, + { + "epoch": 0.7559012068941952, + "grad_norm": 2.432001829147339, + "learning_rate": 3.146646997891333e-06, + "loss": 1.176, + "step": 13826 + }, + { + "epoch": 0.7559558793379167, + "grad_norm": 1.7174124717712402, + "learning_rate": 3.145316380198088e-06, + "loss": 1.4772, + "step": 13827 + }, + { + "epoch": 0.7560105517816382, + "grad_norm": 1.3273307085037231, + "learning_rate": 3.143985991398815e-06, + "loss": 1.4128, + "step": 13828 + }, + { + "epoch": 0.7560652242253598, + "grad_norm": 1.4461727142333984, + "learning_rate": 3.1426558315379375e-06, + "loss": 1.6314, + "step": 13829 + }, + { + "epoch": 0.7561198966690814, + "grad_norm": 1.592842698097229, + "learning_rate": 3.141325900659873e-06, + "loss": 1.506, + "step": 13830 + }, + { + "epoch": 0.7561745691128029, + "grad_norm": 1.6689445972442627, + "learning_rate": 3.139996198809028e-06, + "loss": 1.4572, + "step": 13831 + }, + { + "epoch": 0.7562292415565245, + "grad_norm": 1.634441614151001, + "learning_rate": 3.138666726029811e-06, + "loss": 1.3316, + "step": 13832 + }, + { + "epoch": 0.756283914000246, + "grad_norm": 1.353011131286621, + "learning_rate": 3.1373374823666113e-06, + "loss": 1.4483, + "step": 13833 + }, + { + "epoch": 0.7563385864439676, + "grad_norm": 1.340558648109436, + "learning_rate": 3.136008467863815e-06, + "loss": 1.3979, + "step": 13834 + }, + { + "epoch": 0.7563932588876892, + "grad_norm": 1.4118571281433105, + "learning_rate": 3.1346796825658053e-06, + "loss": 1.5848, + "step": 13835 + }, + { + "epoch": 0.7564479313314106, + "grad_norm": 1.497338056564331, + "learning_rate": 3.1333511265169513e-06, + "loss": 1.409, + "step": 13836 + }, + { + "epoch": 0.7565026037751322, + "grad_norm": 1.5941236019134521, + "learning_rate": 3.1320227997616127e-06, + "loss": 1.3941, + "step": 13837 + }, + { + "epoch": 0.7565572762188538, + "grad_norm": 1.3671475648880005, + "learning_rate": 3.1306947023441524e-06, + "loss": 1.4516, + "step": 13838 + }, + { + "epoch": 0.7566119486625753, + "grad_norm": 1.6913697719573975, + "learning_rate": 3.1293668343089157e-06, + "loss": 1.6521, + "step": 13839 + }, + { + "epoch": 0.7566666211062969, + "grad_norm": 1.4521812200546265, + "learning_rate": 3.1280391957002387e-06, + "loss": 1.4665, + "step": 13840 + }, + { + "epoch": 0.7567212935500185, + "grad_norm": 1.2937155961990356, + "learning_rate": 3.126711786562463e-06, + "loss": 1.5767, + "step": 13841 + }, + { + "epoch": 0.75677596599374, + "grad_norm": 2.384974479675293, + "learning_rate": 3.1253846069399084e-06, + "loss": 1.4768, + "step": 13842 + }, + { + "epoch": 0.7568306384374616, + "grad_norm": 2.478214979171753, + "learning_rate": 3.1240576568768943e-06, + "loss": 1.5036, + "step": 13843 + }, + { + "epoch": 0.7568853108811832, + "grad_norm": 1.5897148847579956, + "learning_rate": 3.1227309364177293e-06, + "loss": 1.5164, + "step": 13844 + }, + { + "epoch": 0.7569399833249046, + "grad_norm": 1.6690144538879395, + "learning_rate": 3.121404445606714e-06, + "loss": 1.2307, + "step": 13845 + }, + { + "epoch": 0.7569946557686262, + "grad_norm": 1.3259879350662231, + "learning_rate": 3.1200781844881477e-06, + "loss": 1.5666, + "step": 13846 + }, + { + "epoch": 0.7570493282123477, + "grad_norm": 1.5057367086410522, + "learning_rate": 3.1187521531063146e-06, + "loss": 1.4768, + "step": 13847 + }, + { + "epoch": 0.7571040006560693, + "grad_norm": 1.6032737493515015, + "learning_rate": 3.1174263515054927e-06, + "loss": 1.6923, + "step": 13848 + }, + { + "epoch": 0.7571586730997909, + "grad_norm": 1.8480195999145508, + "learning_rate": 3.1161007797299583e-06, + "loss": 1.7667, + "step": 13849 + }, + { + "epoch": 0.7572133455435124, + "grad_norm": 1.5041537284851074, + "learning_rate": 3.1147754378239716e-06, + "loss": 1.1998, + "step": 13850 + }, + { + "epoch": 0.757268017987234, + "grad_norm": 1.5330853462219238, + "learning_rate": 3.1134503258317872e-06, + "loss": 1.4307, + "step": 13851 + }, + { + "epoch": 0.7573226904309556, + "grad_norm": 1.7638115882873535, + "learning_rate": 3.112125443797659e-06, + "loss": 1.2282, + "step": 13852 + }, + { + "epoch": 0.757377362874677, + "grad_norm": 2.446063995361328, + "learning_rate": 3.1108007917658257e-06, + "loss": 1.2263, + "step": 13853 + }, + { + "epoch": 0.7574320353183986, + "grad_norm": 1.3036936521530151, + "learning_rate": 3.1094763697805165e-06, + "loss": 1.4232, + "step": 13854 + }, + { + "epoch": 0.7574867077621202, + "grad_norm": 1.5761054754257202, + "learning_rate": 3.1081521778859624e-06, + "loss": 1.3237, + "step": 13855 + }, + { + "epoch": 0.7575413802058417, + "grad_norm": 1.6654702425003052, + "learning_rate": 3.1068282161263806e-06, + "loss": 1.2478, + "step": 13856 + }, + { + "epoch": 0.7575960526495633, + "grad_norm": 1.3495709896087646, + "learning_rate": 3.1055044845459804e-06, + "loss": 1.4268, + "step": 13857 + }, + { + "epoch": 0.7576507250932849, + "grad_norm": 1.5039818286895752, + "learning_rate": 3.1041809831889637e-06, + "loss": 1.4213, + "step": 13858 + }, + { + "epoch": 0.7577053975370064, + "grad_norm": 1.4437397718429565, + "learning_rate": 3.1028577120995216e-06, + "loss": 1.6467, + "step": 13859 + }, + { + "epoch": 0.757760069980728, + "grad_norm": 1.4541741609573364, + "learning_rate": 3.1015346713218488e-06, + "loss": 1.3638, + "step": 13860 + }, + { + "epoch": 0.7578147424244496, + "grad_norm": 1.1298953294754028, + "learning_rate": 3.100211860900121e-06, + "loss": 1.6575, + "step": 13861 + }, + { + "epoch": 0.757869414868171, + "grad_norm": 2.721461057662964, + "learning_rate": 3.0988892808785063e-06, + "loss": 1.3249, + "step": 13862 + }, + { + "epoch": 0.7579240873118926, + "grad_norm": 1.4361495971679688, + "learning_rate": 3.0975669313011768e-06, + "loss": 1.538, + "step": 13863 + }, + { + "epoch": 0.7579787597556141, + "grad_norm": 1.4515877962112427, + "learning_rate": 3.0962448122122834e-06, + "loss": 1.3847, + "step": 13864 + }, + { + "epoch": 0.7580334321993357, + "grad_norm": 1.8045876026153564, + "learning_rate": 3.094922923655973e-06, + "loss": 1.7191, + "step": 13865 + }, + { + "epoch": 0.7580881046430573, + "grad_norm": 1.6638317108154297, + "learning_rate": 3.0936012656763937e-06, + "loss": 1.5407, + "step": 13866 + }, + { + "epoch": 0.7581427770867788, + "grad_norm": 1.3571507930755615, + "learning_rate": 3.0922798383176733e-06, + "loss": 1.5931, + "step": 13867 + }, + { + "epoch": 0.7581974495305004, + "grad_norm": 1.733904242515564, + "learning_rate": 3.090958641623939e-06, + "loss": 1.4794, + "step": 13868 + }, + { + "epoch": 0.758252121974222, + "grad_norm": 1.4381468296051025, + "learning_rate": 3.0896376756393074e-06, + "loss": 1.5004, + "step": 13869 + }, + { + "epoch": 0.7583067944179435, + "grad_norm": 1.4440490007400513, + "learning_rate": 3.0883169404078906e-06, + "loss": 1.3034, + "step": 13870 + }, + { + "epoch": 0.7583614668616651, + "grad_norm": 2.1219329833984375, + "learning_rate": 3.086996435973787e-06, + "loss": 1.5207, + "step": 13871 + }, + { + "epoch": 0.7584161393053866, + "grad_norm": 2.4121036529541016, + "learning_rate": 3.085676162381096e-06, + "loss": 1.3985, + "step": 13872 + }, + { + "epoch": 0.7584708117491081, + "grad_norm": 1.5815109014511108, + "learning_rate": 3.0843561196739013e-06, + "loss": 1.4096, + "step": 13873 + }, + { + "epoch": 0.7585254841928297, + "grad_norm": 1.5768221616744995, + "learning_rate": 3.0830363078962854e-06, + "loss": 1.4037, + "step": 13874 + }, + { + "epoch": 0.7585801566365513, + "grad_norm": 1.278786540031433, + "learning_rate": 3.0817167270923197e-06, + "loss": 1.3116, + "step": 13875 + }, + { + "epoch": 0.7586348290802728, + "grad_norm": 1.505985975265503, + "learning_rate": 3.0803973773060634e-06, + "loss": 1.466, + "step": 13876 + }, + { + "epoch": 0.7586895015239944, + "grad_norm": 3.866219997406006, + "learning_rate": 3.079078258581579e-06, + "loss": 1.3589, + "step": 13877 + }, + { + "epoch": 0.7587441739677159, + "grad_norm": 1.3951345682144165, + "learning_rate": 3.0777593709629115e-06, + "loss": 1.34, + "step": 13878 + }, + { + "epoch": 0.7587988464114375, + "grad_norm": 1.4521915912628174, + "learning_rate": 3.0764407144941e-06, + "loss": 1.5557, + "step": 13879 + }, + { + "epoch": 0.7588535188551591, + "grad_norm": 1.4109137058258057, + "learning_rate": 3.075122289219181e-06, + "loss": 1.4033, + "step": 13880 + }, + { + "epoch": 0.7589081912988805, + "grad_norm": 1.7371002435684204, + "learning_rate": 3.07380409518218e-06, + "loss": 1.3547, + "step": 13881 + }, + { + "epoch": 0.7589628637426021, + "grad_norm": 1.5855282545089722, + "learning_rate": 3.0724861324271137e-06, + "loss": 1.3288, + "step": 13882 + }, + { + "epoch": 0.7590175361863237, + "grad_norm": 1.3935136795043945, + "learning_rate": 3.0711684009979904e-06, + "loss": 1.3557, + "step": 13883 + }, + { + "epoch": 0.7590722086300452, + "grad_norm": 2.0231800079345703, + "learning_rate": 3.0698509009388134e-06, + "loss": 1.4848, + "step": 13884 + }, + { + "epoch": 0.7591268810737668, + "grad_norm": 1.4165295362472534, + "learning_rate": 3.068533632293573e-06, + "loss": 1.3911, + "step": 13885 + }, + { + "epoch": 0.7591815535174884, + "grad_norm": 1.5445468425750732, + "learning_rate": 3.067216595106264e-06, + "loss": 1.2183, + "step": 13886 + }, + { + "epoch": 0.7592362259612099, + "grad_norm": 1.7226091623306274, + "learning_rate": 3.0658997894208573e-06, + "loss": 1.4322, + "step": 13887 + }, + { + "epoch": 0.7592908984049315, + "grad_norm": 1.3004040718078613, + "learning_rate": 3.0645832152813315e-06, + "loss": 1.825, + "step": 13888 + }, + { + "epoch": 0.7593455708486531, + "grad_norm": 1.681675910949707, + "learning_rate": 3.063266872731646e-06, + "loss": 1.3109, + "step": 13889 + }, + { + "epoch": 0.7594002432923745, + "grad_norm": 1.6060079336166382, + "learning_rate": 3.061950761815755e-06, + "loss": 1.3723, + "step": 13890 + }, + { + "epoch": 0.7594549157360961, + "grad_norm": 1.6317354440689087, + "learning_rate": 3.060634882577612e-06, + "loss": 1.4841, + "step": 13891 + }, + { + "epoch": 0.7595095881798176, + "grad_norm": 1.38942551612854, + "learning_rate": 3.0593192350611533e-06, + "loss": 1.5668, + "step": 13892 + }, + { + "epoch": 0.7595642606235392, + "grad_norm": 1.5392974615097046, + "learning_rate": 3.058003819310309e-06, + "loss": 1.4641, + "step": 13893 + }, + { + "epoch": 0.7596189330672608, + "grad_norm": 1.59529447555542, + "learning_rate": 3.0566886353690106e-06, + "loss": 1.1684, + "step": 13894 + }, + { + "epoch": 0.7596736055109823, + "grad_norm": 1.3719213008880615, + "learning_rate": 3.055373683281171e-06, + "loss": 1.3357, + "step": 13895 + }, + { + "epoch": 0.7597282779547039, + "grad_norm": 1.7510367631912231, + "learning_rate": 3.0540589630907016e-06, + "loss": 1.647, + "step": 13896 + }, + { + "epoch": 0.7597829503984255, + "grad_norm": 1.360022783279419, + "learning_rate": 3.0527444748415016e-06, + "loss": 1.598, + "step": 13897 + }, + { + "epoch": 0.759837622842147, + "grad_norm": 1.562962532043457, + "learning_rate": 3.051430218577466e-06, + "loss": 1.5379, + "step": 13898 + }, + { + "epoch": 0.7598922952858685, + "grad_norm": 1.807184100151062, + "learning_rate": 3.050116194342476e-06, + "loss": 1.5342, + "step": 13899 + }, + { + "epoch": 0.7599469677295901, + "grad_norm": 1.4118572473526, + "learning_rate": 3.0488024021804197e-06, + "loss": 1.3856, + "step": 13900 + }, + { + "epoch": 0.7600016401733116, + "grad_norm": 1.83966064453125, + "learning_rate": 3.047488842135159e-06, + "loss": 1.8337, + "step": 13901 + }, + { + "epoch": 0.7600563126170332, + "grad_norm": 1.3903752565383911, + "learning_rate": 3.0461755142505643e-06, + "loss": 1.688, + "step": 13902 + }, + { + "epoch": 0.7601109850607548, + "grad_norm": 1.3361437320709229, + "learning_rate": 3.0448624185704857e-06, + "loss": 1.3332, + "step": 13903 + }, + { + "epoch": 0.7601656575044763, + "grad_norm": 1.5693669319152832, + "learning_rate": 3.0435495551387694e-06, + "loss": 1.2999, + "step": 13904 + }, + { + "epoch": 0.7602203299481979, + "grad_norm": 2.0239925384521484, + "learning_rate": 3.04223692399926e-06, + "loss": 1.2384, + "step": 13905 + }, + { + "epoch": 0.7602750023919194, + "grad_norm": 1.7126797437667847, + "learning_rate": 3.0409245251957865e-06, + "loss": 1.4516, + "step": 13906 + }, + { + "epoch": 0.760329674835641, + "grad_norm": 1.2617155313491821, + "learning_rate": 3.0396123587721737e-06, + "loss": 1.4321, + "step": 13907 + }, + { + "epoch": 0.7603843472793625, + "grad_norm": 1.077457308769226, + "learning_rate": 3.038300424772237e-06, + "loss": 1.6069, + "step": 13908 + }, + { + "epoch": 0.760439019723084, + "grad_norm": 1.5115987062454224, + "learning_rate": 3.0369887232397855e-06, + "loss": 1.2947, + "step": 13909 + }, + { + "epoch": 0.7604936921668056, + "grad_norm": 1.5603383779525757, + "learning_rate": 3.0356772542186165e-06, + "loss": 1.4808, + "step": 13910 + }, + { + "epoch": 0.7605483646105272, + "grad_norm": 1.836501955986023, + "learning_rate": 3.034366017752528e-06, + "loss": 1.4704, + "step": 13911 + }, + { + "epoch": 0.7606030370542487, + "grad_norm": 1.690948486328125, + "learning_rate": 3.0330550138853053e-06, + "loss": 1.6619, + "step": 13912 + }, + { + "epoch": 0.7606577094979703, + "grad_norm": 2.309194803237915, + "learning_rate": 3.0317442426607203e-06, + "loss": 1.8963, + "step": 13913 + }, + { + "epoch": 0.7607123819416919, + "grad_norm": 1.4829795360565186, + "learning_rate": 3.0304337041225497e-06, + "loss": 1.5675, + "step": 13914 + }, + { + "epoch": 0.7607670543854134, + "grad_norm": 1.3355284929275513, + "learning_rate": 3.0291233983145494e-06, + "loss": 1.4925, + "step": 13915 + }, + { + "epoch": 0.760821726829135, + "grad_norm": 1.410265564918518, + "learning_rate": 3.0278133252804797e-06, + "loss": 1.515, + "step": 13916 + }, + { + "epoch": 0.7608763992728566, + "grad_norm": 2.1939234733581543, + "learning_rate": 3.026503485064084e-06, + "loss": 1.3819, + "step": 13917 + }, + { + "epoch": 0.760931071716578, + "grad_norm": 1.6036851406097412, + "learning_rate": 3.0251938777090974e-06, + "loss": 1.5129, + "step": 13918 + }, + { + "epoch": 0.7609857441602996, + "grad_norm": 2.0876176357269287, + "learning_rate": 3.0238845032592566e-06, + "loss": 1.826, + "step": 13919 + }, + { + "epoch": 0.7610404166040211, + "grad_norm": 1.5756641626358032, + "learning_rate": 3.0225753617582833e-06, + "loss": 1.3398, + "step": 13920 + }, + { + "epoch": 0.7610950890477427, + "grad_norm": 1.6459877490997314, + "learning_rate": 3.0212664532498903e-06, + "loss": 1.5128, + "step": 13921 + }, + { + "epoch": 0.7611497614914643, + "grad_norm": 1.5158278942108154, + "learning_rate": 3.019957777777788e-06, + "loss": 1.3361, + "step": 13922 + }, + { + "epoch": 0.7612044339351858, + "grad_norm": 1.7263239622116089, + "learning_rate": 3.0186493353856737e-06, + "loss": 1.3543, + "step": 13923 + }, + { + "epoch": 0.7612591063789074, + "grad_norm": 1.4852546453475952, + "learning_rate": 3.017341126117238e-06, + "loss": 1.4816, + "step": 13924 + }, + { + "epoch": 0.761313778822629, + "grad_norm": 1.5450403690338135, + "learning_rate": 3.01603315001617e-06, + "loss": 1.3019, + "step": 13925 + }, + { + "epoch": 0.7613684512663504, + "grad_norm": 1.3745667934417725, + "learning_rate": 3.014725407126143e-06, + "loss": 1.4238, + "step": 13926 + }, + { + "epoch": 0.761423123710072, + "grad_norm": 1.6837936639785767, + "learning_rate": 3.0134178974908237e-06, + "loss": 1.4691, + "step": 13927 + }, + { + "epoch": 0.7614777961537936, + "grad_norm": 1.5043232440948486, + "learning_rate": 3.0121106211538786e-06, + "loss": 1.6681, + "step": 13928 + }, + { + "epoch": 0.7615324685975151, + "grad_norm": 1.7486834526062012, + "learning_rate": 3.010803578158954e-06, + "loss": 1.3099, + "step": 13929 + }, + { + "epoch": 0.7615871410412367, + "grad_norm": 1.6202419996261597, + "learning_rate": 3.0094967685497022e-06, + "loss": 1.6008, + "step": 13930 + }, + { + "epoch": 0.7616418134849583, + "grad_norm": 1.4644346237182617, + "learning_rate": 3.0081901923697564e-06, + "loss": 1.2983, + "step": 13931 + }, + { + "epoch": 0.7616964859286798, + "grad_norm": 1.5098605155944824, + "learning_rate": 3.006883849662744e-06, + "loss": 1.4612, + "step": 13932 + }, + { + "epoch": 0.7617511583724014, + "grad_norm": 1.5230536460876465, + "learning_rate": 3.005577740472293e-06, + "loss": 1.2464, + "step": 13933 + }, + { + "epoch": 0.7618058308161229, + "grad_norm": 2.0066287517547607, + "learning_rate": 3.0042718648420145e-06, + "loss": 1.3676, + "step": 13934 + }, + { + "epoch": 0.7618605032598444, + "grad_norm": 2.0744099617004395, + "learning_rate": 3.002966222815513e-06, + "loss": 1.1465, + "step": 13935 + }, + { + "epoch": 0.761915175703566, + "grad_norm": 1.3954017162322998, + "learning_rate": 3.00166081443639e-06, + "loss": 1.4392, + "step": 13936 + }, + { + "epoch": 0.7619698481472875, + "grad_norm": 1.630635380744934, + "learning_rate": 3.0003556397482336e-06, + "loss": 1.4103, + "step": 13937 + }, + { + "epoch": 0.7620245205910091, + "grad_norm": 1.1660072803497314, + "learning_rate": 2.9990506987946244e-06, + "loss": 1.7387, + "step": 13938 + }, + { + "epoch": 0.7620791930347307, + "grad_norm": 1.5529155731201172, + "learning_rate": 2.9977459916191444e-06, + "loss": 1.5805, + "step": 13939 + }, + { + "epoch": 0.7621338654784522, + "grad_norm": 1.4549944400787354, + "learning_rate": 2.9964415182653562e-06, + "loss": 1.5987, + "step": 13940 + }, + { + "epoch": 0.7621885379221738, + "grad_norm": 1.2403486967086792, + "learning_rate": 2.9951372787768176e-06, + "loss": 1.8192, + "step": 13941 + }, + { + "epoch": 0.7622432103658954, + "grad_norm": 1.4751604795455933, + "learning_rate": 2.9938332731970854e-06, + "loss": 1.5804, + "step": 13942 + }, + { + "epoch": 0.7622978828096169, + "grad_norm": 1.920624852180481, + "learning_rate": 2.9925295015696978e-06, + "loss": 1.3226, + "step": 13943 + }, + { + "epoch": 0.7623525552533384, + "grad_norm": 1.3055704832077026, + "learning_rate": 2.9912259639381967e-06, + "loss": 1.4923, + "step": 13944 + }, + { + "epoch": 0.76240722769706, + "grad_norm": 1.5368270874023438, + "learning_rate": 2.9899226603461074e-06, + "loss": 1.3815, + "step": 13945 + }, + { + "epoch": 0.7624619001407815, + "grad_norm": 1.2704488039016724, + "learning_rate": 2.988619590836951e-06, + "loss": 1.4007, + "step": 13946 + }, + { + "epoch": 0.7625165725845031, + "grad_norm": 1.463873028755188, + "learning_rate": 2.987316755454238e-06, + "loss": 1.3708, + "step": 13947 + }, + { + "epoch": 0.7625712450282246, + "grad_norm": 1.5201218128204346, + "learning_rate": 2.9860141542414745e-06, + "loss": 1.5496, + "step": 13948 + }, + { + "epoch": 0.7626259174719462, + "grad_norm": 1.1238152980804443, + "learning_rate": 2.9847117872421537e-06, + "loss": 1.6005, + "step": 13949 + }, + { + "epoch": 0.7626805899156678, + "grad_norm": 1.1551586389541626, + "learning_rate": 2.9834096544997725e-06, + "loss": 1.2746, + "step": 13950 + }, + { + "epoch": 0.7627352623593893, + "grad_norm": 1.4979428052902222, + "learning_rate": 2.982107756057807e-06, + "loss": 1.6101, + "step": 13951 + }, + { + "epoch": 0.7627899348031109, + "grad_norm": 1.3116236925125122, + "learning_rate": 2.9808060919597282e-06, + "loss": 1.1634, + "step": 13952 + }, + { + "epoch": 0.7628446072468325, + "grad_norm": 1.361920714378357, + "learning_rate": 2.979504662249009e-06, + "loss": 1.5239, + "step": 13953 + }, + { + "epoch": 0.7628992796905539, + "grad_norm": 1.4658557176589966, + "learning_rate": 2.978203466969103e-06, + "loss": 1.2813, + "step": 13954 + }, + { + "epoch": 0.7629539521342755, + "grad_norm": 1.5585511922836304, + "learning_rate": 2.9769025061634573e-06, + "loss": 1.5069, + "step": 13955 + }, + { + "epoch": 0.7630086245779971, + "grad_norm": 1.406988501548767, + "learning_rate": 2.97560177987552e-06, + "loss": 1.5545, + "step": 13956 + }, + { + "epoch": 0.7630632970217186, + "grad_norm": 1.5294435024261475, + "learning_rate": 2.9743012881487187e-06, + "loss": 1.5624, + "step": 13957 + }, + { + "epoch": 0.7631179694654402, + "grad_norm": 1.5578279495239258, + "learning_rate": 2.9730010310264878e-06, + "loss": 1.4906, + "step": 13958 + }, + { + "epoch": 0.7631726419091618, + "grad_norm": 1.899823784828186, + "learning_rate": 2.9717010085522415e-06, + "loss": 1.5818, + "step": 13959 + }, + { + "epoch": 0.7632273143528833, + "grad_norm": 1.5026687383651733, + "learning_rate": 2.970401220769391e-06, + "loss": 1.7183, + "step": 13960 + }, + { + "epoch": 0.7632819867966049, + "grad_norm": 1.711876630783081, + "learning_rate": 2.969101667721339e-06, + "loss": 1.6891, + "step": 13961 + }, + { + "epoch": 0.7633366592403263, + "grad_norm": 1.402243971824646, + "learning_rate": 2.967802349451482e-06, + "loss": 1.5152, + "step": 13962 + }, + { + "epoch": 0.7633913316840479, + "grad_norm": 1.4910321235656738, + "learning_rate": 2.966503266003201e-06, + "loss": 1.1911, + "step": 13963 + }, + { + "epoch": 0.7634460041277695, + "grad_norm": 1.6515634059906006, + "learning_rate": 2.965204417419886e-06, + "loss": 1.632, + "step": 13964 + }, + { + "epoch": 0.763500676571491, + "grad_norm": 1.5987690687179565, + "learning_rate": 2.9639058037449008e-06, + "loss": 1.4125, + "step": 13965 + }, + { + "epoch": 0.7635553490152126, + "grad_norm": 1.4631487131118774, + "learning_rate": 2.96260742502161e-06, + "loss": 1.3863, + "step": 13966 + }, + { + "epoch": 0.7636100214589342, + "grad_norm": 1.5500410795211792, + "learning_rate": 2.961309281293374e-06, + "loss": 1.4681, + "step": 13967 + }, + { + "epoch": 0.7636646939026557, + "grad_norm": 1.7850024700164795, + "learning_rate": 2.9600113726035374e-06, + "loss": 1.3244, + "step": 13968 + }, + { + "epoch": 0.7637193663463773, + "grad_norm": 1.3822330236434937, + "learning_rate": 2.958713698995438e-06, + "loss": 1.5291, + "step": 13969 + }, + { + "epoch": 0.7637740387900989, + "grad_norm": 1.3546044826507568, + "learning_rate": 2.9574162605124147e-06, + "loss": 1.6088, + "step": 13970 + }, + { + "epoch": 0.7638287112338203, + "grad_norm": 1.5508899688720703, + "learning_rate": 2.956119057197785e-06, + "loss": 1.6303, + "step": 13971 + }, + { + "epoch": 0.7638833836775419, + "grad_norm": 1.8406469821929932, + "learning_rate": 2.9548220890948707e-06, + "loss": 1.4243, + "step": 13972 + }, + { + "epoch": 0.7639380561212635, + "grad_norm": 1.6599045991897583, + "learning_rate": 2.953525356246981e-06, + "loss": 1.6592, + "step": 13973 + }, + { + "epoch": 0.763992728564985, + "grad_norm": 1.4392155408859253, + "learning_rate": 2.9522288586974136e-06, + "loss": 1.4568, + "step": 13974 + }, + { + "epoch": 0.7640474010087066, + "grad_norm": 1.6816349029541016, + "learning_rate": 2.950932596489463e-06, + "loss": 1.3381, + "step": 13975 + }, + { + "epoch": 0.7641020734524281, + "grad_norm": 1.8596757650375366, + "learning_rate": 2.9496365696664143e-06, + "loss": 1.4671, + "step": 13976 + }, + { + "epoch": 0.7641567458961497, + "grad_norm": 1.145060658454895, + "learning_rate": 2.948340778271541e-06, + "loss": 1.5734, + "step": 13977 + }, + { + "epoch": 0.7642114183398713, + "grad_norm": 1.5951380729675293, + "learning_rate": 2.9470452223481206e-06, + "loss": 1.1882, + "step": 13978 + }, + { + "epoch": 0.7642660907835928, + "grad_norm": 1.6832295656204224, + "learning_rate": 2.9457499019394088e-06, + "loss": 1.5915, + "step": 13979 + }, + { + "epoch": 0.7643207632273143, + "grad_norm": 1.3649967908859253, + "learning_rate": 2.9444548170886588e-06, + "loss": 1.5834, + "step": 13980 + }, + { + "epoch": 0.7643754356710359, + "grad_norm": 1.93113112449646, + "learning_rate": 2.943159967839122e-06, + "loss": 1.566, + "step": 13981 + }, + { + "epoch": 0.7644301081147574, + "grad_norm": 1.5903898477554321, + "learning_rate": 2.9418653542340336e-06, + "loss": 1.6919, + "step": 13982 + }, + { + "epoch": 0.764484780558479, + "grad_norm": 1.4275856018066406, + "learning_rate": 2.94057097631662e-06, + "loss": 1.3193, + "step": 13983 + }, + { + "epoch": 0.7645394530022006, + "grad_norm": 1.6240051984786987, + "learning_rate": 2.93927683413011e-06, + "loss": 1.6744, + "step": 13984 + }, + { + "epoch": 0.7645941254459221, + "grad_norm": 1.438822627067566, + "learning_rate": 2.9379829277177152e-06, + "loss": 1.3915, + "step": 13985 + }, + { + "epoch": 0.7646487978896437, + "grad_norm": 1.6467041969299316, + "learning_rate": 2.9366892571226424e-06, + "loss": 1.6644, + "step": 13986 + }, + { + "epoch": 0.7647034703333653, + "grad_norm": 1.2482960224151611, + "learning_rate": 2.9353958223880895e-06, + "loss": 1.4868, + "step": 13987 + }, + { + "epoch": 0.7647581427770868, + "grad_norm": 1.4215744733810425, + "learning_rate": 2.9341026235572446e-06, + "loss": 1.4861, + "step": 13988 + }, + { + "epoch": 0.7648128152208084, + "grad_norm": 1.3378881216049194, + "learning_rate": 2.932809660673297e-06, + "loss": 1.5336, + "step": 13989 + }, + { + "epoch": 0.7648674876645298, + "grad_norm": 1.3681652545928955, + "learning_rate": 2.9315169337794183e-06, + "loss": 1.4379, + "step": 13990 + }, + { + "epoch": 0.7649221601082514, + "grad_norm": 1.5052759647369385, + "learning_rate": 2.9302244429187723e-06, + "loss": 1.3859, + "step": 13991 + }, + { + "epoch": 0.764976832551973, + "grad_norm": 1.6274683475494385, + "learning_rate": 2.9289321881345257e-06, + "loss": 1.5153, + "step": 13992 + }, + { + "epoch": 0.7650315049956945, + "grad_norm": 1.625515103340149, + "learning_rate": 2.9276401694698255e-06, + "loss": 1.5637, + "step": 13993 + }, + { + "epoch": 0.7650861774394161, + "grad_norm": 1.3387984037399292, + "learning_rate": 2.9263483869678133e-06, + "loss": 1.5343, + "step": 13994 + }, + { + "epoch": 0.7651408498831377, + "grad_norm": 1.535007357597351, + "learning_rate": 2.9250568406716305e-06, + "loss": 1.3368, + "step": 13995 + }, + { + "epoch": 0.7651955223268592, + "grad_norm": 1.9167176485061646, + "learning_rate": 2.9237655306244017e-06, + "loss": 1.5237, + "step": 13996 + }, + { + "epoch": 0.7652501947705808, + "grad_norm": 1.4591655731201172, + "learning_rate": 2.922474456869243e-06, + "loss": 1.4401, + "step": 13997 + }, + { + "epoch": 0.7653048672143024, + "grad_norm": 1.645362138748169, + "learning_rate": 2.921183619449274e-06, + "loss": 1.4208, + "step": 13998 + }, + { + "epoch": 0.7653595396580238, + "grad_norm": 1.529868483543396, + "learning_rate": 2.9198930184075944e-06, + "loss": 1.5776, + "step": 13999 + }, + { + "epoch": 0.7654142121017454, + "grad_norm": 1.507032871246338, + "learning_rate": 2.9186026537873003e-06, + "loss": 1.3592, + "step": 14000 + }, + { + "epoch": 0.765468884545467, + "grad_norm": 1.140234112739563, + "learning_rate": 2.9173125256314817e-06, + "loss": 1.5999, + "step": 14001 + }, + { + "epoch": 0.7655235569891885, + "grad_norm": 1.355281114578247, + "learning_rate": 2.916022633983214e-06, + "loss": 1.6927, + "step": 14002 + }, + { + "epoch": 0.7655782294329101, + "grad_norm": 1.7435945272445679, + "learning_rate": 2.914732978885577e-06, + "loss": 1.387, + "step": 14003 + }, + { + "epoch": 0.7656329018766316, + "grad_norm": 1.3883333206176758, + "learning_rate": 2.9134435603816324e-06, + "loss": 1.6094, + "step": 14004 + }, + { + "epoch": 0.7656875743203532, + "grad_norm": 1.883209228515625, + "learning_rate": 2.9121543785144333e-06, + "loss": 1.3503, + "step": 14005 + }, + { + "epoch": 0.7657422467640748, + "grad_norm": 1.7926222085952759, + "learning_rate": 2.9108654333270346e-06, + "loss": 1.1654, + "step": 14006 + }, + { + "epoch": 0.7657969192077962, + "grad_norm": 1.5412347316741943, + "learning_rate": 2.909576724862474e-06, + "loss": 1.4662, + "step": 14007 + }, + { + "epoch": 0.7658515916515178, + "grad_norm": 1.5413860082626343, + "learning_rate": 2.9082882531637813e-06, + "loss": 1.4455, + "step": 14008 + }, + { + "epoch": 0.7659062640952394, + "grad_norm": 1.835166335105896, + "learning_rate": 2.9070000182739886e-06, + "loss": 1.4624, + "step": 14009 + }, + { + "epoch": 0.7659609365389609, + "grad_norm": 1.7166924476623535, + "learning_rate": 2.90571202023611e-06, + "loss": 1.269, + "step": 14010 + }, + { + "epoch": 0.7660156089826825, + "grad_norm": 1.3822345733642578, + "learning_rate": 2.904424259093154e-06, + "loss": 1.4825, + "step": 14011 + }, + { + "epoch": 0.7660702814264041, + "grad_norm": 1.6049633026123047, + "learning_rate": 2.9031367348881224e-06, + "loss": 1.3893, + "step": 14012 + }, + { + "epoch": 0.7661249538701256, + "grad_norm": 1.764151692390442, + "learning_rate": 2.901849447664008e-06, + "loss": 1.3189, + "step": 14013 + }, + { + "epoch": 0.7661796263138472, + "grad_norm": 1.4828264713287354, + "learning_rate": 2.9005623974637943e-06, + "loss": 1.461, + "step": 14014 + }, + { + "epoch": 0.7662342987575688, + "grad_norm": 1.8436338901519775, + "learning_rate": 2.8992755843304643e-06, + "loss": 1.4649, + "step": 14015 + }, + { + "epoch": 0.7662889712012902, + "grad_norm": 1.229210376739502, + "learning_rate": 2.8979890083069817e-06, + "loss": 1.4866, + "step": 14016 + }, + { + "epoch": 0.7663436436450118, + "grad_norm": 1.656754732131958, + "learning_rate": 2.8967026694363156e-06, + "loss": 1.5293, + "step": 14017 + }, + { + "epoch": 0.7663983160887333, + "grad_norm": 1.3624013662338257, + "learning_rate": 2.8954165677614143e-06, + "loss": 1.3603, + "step": 14018 + }, + { + "epoch": 0.7664529885324549, + "grad_norm": 1.7162065505981445, + "learning_rate": 2.894130703325223e-06, + "loss": 1.4193, + "step": 14019 + }, + { + "epoch": 0.7665076609761765, + "grad_norm": 1.2604731321334839, + "learning_rate": 2.892845076170685e-06, + "loss": 1.5639, + "step": 14020 + }, + { + "epoch": 0.766562333419898, + "grad_norm": 1.319812297821045, + "learning_rate": 2.891559686340727e-06, + "loss": 1.5938, + "step": 14021 + }, + { + "epoch": 0.7666170058636196, + "grad_norm": 1.4555829763412476, + "learning_rate": 2.890274533878269e-06, + "loss": 1.4069, + "step": 14022 + }, + { + "epoch": 0.7666716783073412, + "grad_norm": 1.5928524732589722, + "learning_rate": 2.8889896188262303e-06, + "loss": 1.5219, + "step": 14023 + }, + { + "epoch": 0.7667263507510627, + "grad_norm": 1.5875270366668701, + "learning_rate": 2.8877049412275147e-06, + "loss": 1.4451, + "step": 14024 + }, + { + "epoch": 0.7667810231947843, + "grad_norm": 1.5995378494262695, + "learning_rate": 2.8864205011250214e-06, + "loss": 1.412, + "step": 14025 + }, + { + "epoch": 0.7668356956385058, + "grad_norm": 1.2429012060165405, + "learning_rate": 2.8851362985616395e-06, + "loss": 1.5635, + "step": 14026 + }, + { + "epoch": 0.7668903680822273, + "grad_norm": 1.3326646089553833, + "learning_rate": 2.8838523335802525e-06, + "loss": 1.4205, + "step": 14027 + }, + { + "epoch": 0.7669450405259489, + "grad_norm": 2.011099100112915, + "learning_rate": 2.8825686062237315e-06, + "loss": 1.1546, + "step": 14028 + }, + { + "epoch": 0.7669997129696705, + "grad_norm": 1.5545626878738403, + "learning_rate": 2.881285116534949e-06, + "loss": 1.2879, + "step": 14029 + }, + { + "epoch": 0.767054385413392, + "grad_norm": 1.4599378108978271, + "learning_rate": 2.8800018645567572e-06, + "loss": 1.3092, + "step": 14030 + }, + { + "epoch": 0.7671090578571136, + "grad_norm": 1.6688913106918335, + "learning_rate": 2.878718850332015e-06, + "loss": 1.3368, + "step": 14031 + }, + { + "epoch": 0.7671637303008351, + "grad_norm": 1.4414557218551636, + "learning_rate": 2.877436073903561e-06, + "loss": 1.3485, + "step": 14032 + }, + { + "epoch": 0.7672184027445567, + "grad_norm": 1.4926490783691406, + "learning_rate": 2.8761535353142266e-06, + "loss": 1.5842, + "step": 14033 + }, + { + "epoch": 0.7672730751882783, + "grad_norm": 1.4727674722671509, + "learning_rate": 2.8748712346068464e-06, + "loss": 1.3356, + "step": 14034 + }, + { + "epoch": 0.7673277476319997, + "grad_norm": 1.5088752508163452, + "learning_rate": 2.8735891718242347e-06, + "loss": 1.3761, + "step": 14035 + }, + { + "epoch": 0.7673824200757213, + "grad_norm": 1.5450243949890137, + "learning_rate": 2.8723073470092e-06, + "loss": 1.3808, + "step": 14036 + }, + { + "epoch": 0.7674370925194429, + "grad_norm": 1.7663321495056152, + "learning_rate": 2.8710257602045512e-06, + "loss": 1.521, + "step": 14037 + }, + { + "epoch": 0.7674917649631644, + "grad_norm": 1.415956735610962, + "learning_rate": 2.8697444114530814e-06, + "loss": 1.4285, + "step": 14038 + }, + { + "epoch": 0.767546437406886, + "grad_norm": 1.429731011390686, + "learning_rate": 2.8684633007975772e-06, + "loss": 1.6906, + "step": 14039 + }, + { + "epoch": 0.7676011098506076, + "grad_norm": 1.4080922603607178, + "learning_rate": 2.867182428280818e-06, + "loss": 1.2938, + "step": 14040 + }, + { + "epoch": 0.7676557822943291, + "grad_norm": 1.6947565078735352, + "learning_rate": 2.865901793945576e-06, + "loss": 1.2916, + "step": 14041 + }, + { + "epoch": 0.7677104547380507, + "grad_norm": 1.472896695137024, + "learning_rate": 2.8646213978346104e-06, + "loss": 1.3986, + "step": 14042 + }, + { + "epoch": 0.7677651271817723, + "grad_norm": 1.9646273851394653, + "learning_rate": 2.8633412399906825e-06, + "loss": 1.4935, + "step": 14043 + }, + { + "epoch": 0.7678197996254937, + "grad_norm": 1.7237299680709839, + "learning_rate": 2.862061320456535e-06, + "loss": 1.316, + "step": 14044 + }, + { + "epoch": 0.7678744720692153, + "grad_norm": 1.211099624633789, + "learning_rate": 2.860781639274912e-06, + "loss": 1.4959, + "step": 14045 + }, + { + "epoch": 0.7679291445129368, + "grad_norm": 1.280359148979187, + "learning_rate": 2.8595021964885426e-06, + "loss": 1.3521, + "step": 14046 + }, + { + "epoch": 0.7679838169566584, + "grad_norm": 1.3136223554611206, + "learning_rate": 2.8582229921401484e-06, + "loss": 1.49, + "step": 14047 + }, + { + "epoch": 0.76803848940038, + "grad_norm": 2.0081536769866943, + "learning_rate": 2.8569440262724502e-06, + "loss": 1.6676, + "step": 14048 + }, + { + "epoch": 0.7680931618441015, + "grad_norm": 1.539832353591919, + "learning_rate": 2.8556652989281517e-06, + "loss": 1.2543, + "step": 14049 + }, + { + "epoch": 0.7681478342878231, + "grad_norm": 2.195254325866699, + "learning_rate": 2.854386810149955e-06, + "loss": 1.6066, + "step": 14050 + }, + { + "epoch": 0.7682025067315447, + "grad_norm": 1.5397229194641113, + "learning_rate": 2.8531085599805496e-06, + "loss": 1.6276, + "step": 14051 + }, + { + "epoch": 0.7682571791752661, + "grad_norm": 1.2132012844085693, + "learning_rate": 2.8518305484626196e-06, + "loss": 1.7271, + "step": 14052 + }, + { + "epoch": 0.7683118516189877, + "grad_norm": 1.4178749322891235, + "learning_rate": 2.8505527756388384e-06, + "loss": 1.482, + "step": 14053 + }, + { + "epoch": 0.7683665240627093, + "grad_norm": 1.8541114330291748, + "learning_rate": 2.8492752415518808e-06, + "loss": 1.1217, + "step": 14054 + }, + { + "epoch": 0.7684211965064308, + "grad_norm": 1.5728645324707031, + "learning_rate": 2.8479979462444017e-06, + "loss": 1.668, + "step": 14055 + }, + { + "epoch": 0.7684758689501524, + "grad_norm": 1.4893378019332886, + "learning_rate": 2.8467208897590513e-06, + "loss": 1.3209, + "step": 14056 + }, + { + "epoch": 0.768530541393874, + "grad_norm": 1.6991358995437622, + "learning_rate": 2.845444072138479e-06, + "loss": 1.3748, + "step": 14057 + }, + { + "epoch": 0.7685852138375955, + "grad_norm": 1.5759470462799072, + "learning_rate": 2.844167493425314e-06, + "loss": 1.4003, + "step": 14058 + }, + { + "epoch": 0.7686398862813171, + "grad_norm": 1.8217427730560303, + "learning_rate": 2.8428911536621916e-06, + "loss": 1.5665, + "step": 14059 + }, + { + "epoch": 0.7686945587250386, + "grad_norm": 1.531442403793335, + "learning_rate": 2.8416150528917288e-06, + "loss": 1.5422, + "step": 14060 + }, + { + "epoch": 0.7687492311687601, + "grad_norm": 1.6846026182174683, + "learning_rate": 2.8403391911565335e-06, + "loss": 1.6317, + "step": 14061 + }, + { + "epoch": 0.7688039036124817, + "grad_norm": 1.783185601234436, + "learning_rate": 2.8390635684992163e-06, + "loss": 1.2145, + "step": 14062 + }, + { + "epoch": 0.7688585760562032, + "grad_norm": 1.8567270040512085, + "learning_rate": 2.8377881849623714e-06, + "loss": 1.3249, + "step": 14063 + }, + { + "epoch": 0.7689132484999248, + "grad_norm": 1.6269043684005737, + "learning_rate": 2.8365130405885843e-06, + "loss": 1.5027, + "step": 14064 + }, + { + "epoch": 0.7689679209436464, + "grad_norm": 1.6648952960968018, + "learning_rate": 2.835238135420436e-06, + "loss": 1.4928, + "step": 14065 + }, + { + "epoch": 0.7690225933873679, + "grad_norm": 1.9091717004776, + "learning_rate": 2.8339634695005e-06, + "loss": 1.2498, + "step": 14066 + }, + { + "epoch": 0.7690772658310895, + "grad_norm": 1.67924964427948, + "learning_rate": 2.832689042871336e-06, + "loss": 1.4861, + "step": 14067 + }, + { + "epoch": 0.7691319382748111, + "grad_norm": 1.367179036140442, + "learning_rate": 2.831414855575507e-06, + "loss": 1.6324, + "step": 14068 + }, + { + "epoch": 0.7691866107185326, + "grad_norm": 1.4485325813293457, + "learning_rate": 2.8301409076555574e-06, + "loss": 1.5661, + "step": 14069 + }, + { + "epoch": 0.7692412831622542, + "grad_norm": 1.406787633895874, + "learning_rate": 2.828867199154024e-06, + "loss": 1.3407, + "step": 14070 + }, + { + "epoch": 0.7692959556059757, + "grad_norm": 1.5548192262649536, + "learning_rate": 2.827593730113446e-06, + "loss": 1.2833, + "step": 14071 + }, + { + "epoch": 0.7693506280496972, + "grad_norm": 1.347841501235962, + "learning_rate": 2.8263205005763405e-06, + "loss": 1.5309, + "step": 14072 + }, + { + "epoch": 0.7694053004934188, + "grad_norm": 1.394758701324463, + "learning_rate": 2.8250475105852306e-06, + "loss": 1.534, + "step": 14073 + }, + { + "epoch": 0.7694599729371404, + "grad_norm": 1.451887845993042, + "learning_rate": 2.8237747601826193e-06, + "loss": 1.4593, + "step": 14074 + }, + { + "epoch": 0.7695146453808619, + "grad_norm": 1.2621417045593262, + "learning_rate": 2.8225022494110067e-06, + "loss": 1.4582, + "step": 14075 + }, + { + "epoch": 0.7695693178245835, + "grad_norm": 1.8681329488754272, + "learning_rate": 2.821229978312889e-06, + "loss": 1.3765, + "step": 14076 + }, + { + "epoch": 0.769623990268305, + "grad_norm": 1.460067868232727, + "learning_rate": 2.819957946930748e-06, + "loss": 1.4894, + "step": 14077 + }, + { + "epoch": 0.7696786627120266, + "grad_norm": 2.025966167449951, + "learning_rate": 2.81868615530706e-06, + "loss": 1.511, + "step": 14078 + }, + { + "epoch": 0.7697333351557482, + "grad_norm": 1.5485817193984985, + "learning_rate": 2.8174146034842933e-06, + "loss": 1.4537, + "step": 14079 + }, + { + "epoch": 0.7697880075994696, + "grad_norm": 2.0529770851135254, + "learning_rate": 2.816143291504906e-06, + "loss": 1.428, + "step": 14080 + }, + { + "epoch": 0.7698426800431912, + "grad_norm": 1.577582597732544, + "learning_rate": 2.8148722194113498e-06, + "loss": 1.3855, + "step": 14081 + }, + { + "epoch": 0.7698973524869128, + "grad_norm": 1.8226935863494873, + "learning_rate": 2.8136013872460733e-06, + "loss": 1.7767, + "step": 14082 + }, + { + "epoch": 0.7699520249306343, + "grad_norm": 1.4196747541427612, + "learning_rate": 2.8123307950515087e-06, + "loss": 1.6157, + "step": 14083 + }, + { + "epoch": 0.7700066973743559, + "grad_norm": 1.3003710508346558, + "learning_rate": 2.811060442870084e-06, + "loss": 1.4619, + "step": 14084 + }, + { + "epoch": 0.7700613698180775, + "grad_norm": 1.5308042764663696, + "learning_rate": 2.809790330744222e-06, + "loss": 1.6288, + "step": 14085 + }, + { + "epoch": 0.770116042261799, + "grad_norm": 1.7019391059875488, + "learning_rate": 2.8085204587163317e-06, + "loss": 1.5134, + "step": 14086 + }, + { + "epoch": 0.7701707147055206, + "grad_norm": 1.6642463207244873, + "learning_rate": 2.8072508268288212e-06, + "loss": 1.5212, + "step": 14087 + }, + { + "epoch": 0.7702253871492422, + "grad_norm": 1.2566310167312622, + "learning_rate": 2.805981435124083e-06, + "loss": 1.3953, + "step": 14088 + }, + { + "epoch": 0.7702800595929636, + "grad_norm": 1.7448887825012207, + "learning_rate": 2.8047122836445063e-06, + "loss": 1.3797, + "step": 14089 + }, + { + "epoch": 0.7703347320366852, + "grad_norm": 1.487109899520874, + "learning_rate": 2.8034433724324716e-06, + "loss": 1.5235, + "step": 14090 + }, + { + "epoch": 0.7703894044804067, + "grad_norm": 1.2847232818603516, + "learning_rate": 2.802174701530349e-06, + "loss": 1.4865, + "step": 14091 + }, + { + "epoch": 0.7704440769241283, + "grad_norm": 1.2159254550933838, + "learning_rate": 2.8009062709805014e-06, + "loss": 1.6809, + "step": 14092 + }, + { + "epoch": 0.7704987493678499, + "grad_norm": 2.0370097160339355, + "learning_rate": 2.7996380808252887e-06, + "loss": 1.5494, + "step": 14093 + }, + { + "epoch": 0.7705534218115714, + "grad_norm": 1.647266149520874, + "learning_rate": 2.7983701311070564e-06, + "loss": 1.3455, + "step": 14094 + }, + { + "epoch": 0.770608094255293, + "grad_norm": 1.3878387212753296, + "learning_rate": 2.797102421868142e-06, + "loss": 1.3203, + "step": 14095 + }, + { + "epoch": 0.7706627666990146, + "grad_norm": 1.3336461782455444, + "learning_rate": 2.7958349531508833e-06, + "loss": 1.8436, + "step": 14096 + }, + { + "epoch": 0.770717439142736, + "grad_norm": 1.9952116012573242, + "learning_rate": 2.7945677249976e-06, + "loss": 1.6252, + "step": 14097 + }, + { + "epoch": 0.7707721115864576, + "grad_norm": 1.295304536819458, + "learning_rate": 2.793300737450605e-06, + "loss": 1.6697, + "step": 14098 + }, + { + "epoch": 0.7708267840301792, + "grad_norm": 1.5784611701965332, + "learning_rate": 2.792033990552213e-06, + "loss": 1.4047, + "step": 14099 + }, + { + "epoch": 0.7708814564739007, + "grad_norm": 1.3991384506225586, + "learning_rate": 2.7907674843447172e-06, + "loss": 1.5839, + "step": 14100 + }, + { + "epoch": 0.7709361289176223, + "grad_norm": 1.7804200649261475, + "learning_rate": 2.789501218870415e-06, + "loss": 1.4183, + "step": 14101 + }, + { + "epoch": 0.7709908013613439, + "grad_norm": 1.4742212295532227, + "learning_rate": 2.788235194171588e-06, + "loss": 1.6261, + "step": 14102 + }, + { + "epoch": 0.7710454738050654, + "grad_norm": 1.5752485990524292, + "learning_rate": 2.78696941029051e-06, + "loss": 1.5409, + "step": 14103 + }, + { + "epoch": 0.771100146248787, + "grad_norm": 1.5690600872039795, + "learning_rate": 2.7857038672694492e-06, + "loss": 1.4261, + "step": 14104 + }, + { + "epoch": 0.7711548186925085, + "grad_norm": 1.997661828994751, + "learning_rate": 2.7844385651506643e-06, + "loss": 1.4792, + "step": 14105 + }, + { + "epoch": 0.77120949113623, + "grad_norm": 1.662987470626831, + "learning_rate": 2.7831735039764054e-06, + "loss": 1.3725, + "step": 14106 + }, + { + "epoch": 0.7712641635799516, + "grad_norm": 1.6703593730926514, + "learning_rate": 2.781908683788921e-06, + "loss": 1.2822, + "step": 14107 + }, + { + "epoch": 0.7713188360236731, + "grad_norm": 1.5372097492218018, + "learning_rate": 2.7806441046304432e-06, + "loss": 1.371, + "step": 14108 + }, + { + "epoch": 0.7713735084673947, + "grad_norm": 1.5078065395355225, + "learning_rate": 2.7793797665431977e-06, + "loss": 1.5499, + "step": 14109 + }, + { + "epoch": 0.7714281809111163, + "grad_norm": 1.3509594202041626, + "learning_rate": 2.7781156695694066e-06, + "loss": 1.6264, + "step": 14110 + }, + { + "epoch": 0.7714828533548378, + "grad_norm": 1.7024766206741333, + "learning_rate": 2.776851813751281e-06, + "loss": 1.699, + "step": 14111 + }, + { + "epoch": 0.7715375257985594, + "grad_norm": 1.7178435325622559, + "learning_rate": 2.7755881991310206e-06, + "loss": 1.2149, + "step": 14112 + }, + { + "epoch": 0.771592198242281, + "grad_norm": 1.9584107398986816, + "learning_rate": 2.774324825750825e-06, + "loss": 1.4751, + "step": 14113 + }, + { + "epoch": 0.7716468706860025, + "grad_norm": 1.9414016008377075, + "learning_rate": 2.7730616936528765e-06, + "loss": 1.3866, + "step": 14114 + }, + { + "epoch": 0.771701543129724, + "grad_norm": 1.9309865236282349, + "learning_rate": 2.7717988028793587e-06, + "loss": 1.3202, + "step": 14115 + }, + { + "epoch": 0.7717562155734456, + "grad_norm": 1.4953821897506714, + "learning_rate": 2.770536153472441e-06, + "loss": 1.2816, + "step": 14116 + }, + { + "epoch": 0.7718108880171671, + "grad_norm": 1.3872349262237549, + "learning_rate": 2.7692737454742858e-06, + "loss": 1.5299, + "step": 14117 + }, + { + "epoch": 0.7718655604608887, + "grad_norm": 1.3418511152267456, + "learning_rate": 2.7680115789270478e-06, + "loss": 1.4907, + "step": 14118 + }, + { + "epoch": 0.7719202329046102, + "grad_norm": 1.6646438837051392, + "learning_rate": 2.766749653872873e-06, + "loss": 1.245, + "step": 14119 + }, + { + "epoch": 0.7719749053483318, + "grad_norm": 1.5428940057754517, + "learning_rate": 2.7654879703538974e-06, + "loss": 1.3211, + "step": 14120 + }, + { + "epoch": 0.7720295777920534, + "grad_norm": 1.2640444040298462, + "learning_rate": 2.7642265284122584e-06, + "loss": 1.3162, + "step": 14121 + }, + { + "epoch": 0.7720842502357749, + "grad_norm": 1.9621771574020386, + "learning_rate": 2.762965328090075e-06, + "loss": 1.6961, + "step": 14122 + }, + { + "epoch": 0.7721389226794965, + "grad_norm": 1.7092771530151367, + "learning_rate": 2.761704369429458e-06, + "loss": 1.6557, + "step": 14123 + }, + { + "epoch": 0.7721935951232181, + "grad_norm": 1.8169379234313965, + "learning_rate": 2.760443652472521e-06, + "loss": 1.633, + "step": 14124 + }, + { + "epoch": 0.7722482675669395, + "grad_norm": 1.7026926279067993, + "learning_rate": 2.7591831772613576e-06, + "loss": 1.4519, + "step": 14125 + }, + { + "epoch": 0.7723029400106611, + "grad_norm": 2.0611324310302734, + "learning_rate": 2.7579229438380563e-06, + "loss": 1.3758, + "step": 14126 + }, + { + "epoch": 0.7723576124543827, + "grad_norm": 1.4599642753601074, + "learning_rate": 2.7566629522447054e-06, + "loss": 1.2772, + "step": 14127 + }, + { + "epoch": 0.7724122848981042, + "grad_norm": 1.372324824333191, + "learning_rate": 2.7554032025233756e-06, + "loss": 1.3535, + "step": 14128 + }, + { + "epoch": 0.7724669573418258, + "grad_norm": 1.6140635013580322, + "learning_rate": 2.754143694716133e-06, + "loss": 1.3009, + "step": 14129 + }, + { + "epoch": 0.7725216297855474, + "grad_norm": 2.1224255561828613, + "learning_rate": 2.7528844288650347e-06, + "loss": 1.1343, + "step": 14130 + }, + { + "epoch": 0.7725763022292689, + "grad_norm": 1.4548702239990234, + "learning_rate": 2.7516254050121284e-06, + "loss": 1.3342, + "step": 14131 + }, + { + "epoch": 0.7726309746729905, + "grad_norm": 1.4250537157058716, + "learning_rate": 2.750366623199462e-06, + "loss": 1.2279, + "step": 14132 + }, + { + "epoch": 0.772685647116712, + "grad_norm": 2.106951951980591, + "learning_rate": 2.7491080834690655e-06, + "loss": 1.7148, + "step": 14133 + }, + { + "epoch": 0.7727403195604335, + "grad_norm": 1.4735031127929688, + "learning_rate": 2.7478497858629617e-06, + "loss": 1.3632, + "step": 14134 + }, + { + "epoch": 0.7727949920041551, + "grad_norm": 1.1912410259246826, + "learning_rate": 2.7465917304231747e-06, + "loss": 1.3299, + "step": 14135 + }, + { + "epoch": 0.7728496644478766, + "grad_norm": 1.6289407014846802, + "learning_rate": 2.7453339171917106e-06, + "loss": 1.5243, + "step": 14136 + }, + { + "epoch": 0.7729043368915982, + "grad_norm": 1.4198561906814575, + "learning_rate": 2.744076346210567e-06, + "loss": 1.2347, + "step": 14137 + }, + { + "epoch": 0.7729590093353198, + "grad_norm": 1.2263566255569458, + "learning_rate": 2.742819017521744e-06, + "loss": 1.5791, + "step": 14138 + }, + { + "epoch": 0.7730136817790413, + "grad_norm": 1.232948660850525, + "learning_rate": 2.7415619311672236e-06, + "loss": 1.7595, + "step": 14139 + }, + { + "epoch": 0.7730683542227629, + "grad_norm": 1.445865511894226, + "learning_rate": 2.740305087188979e-06, + "loss": 1.3033, + "step": 14140 + }, + { + "epoch": 0.7731230266664845, + "grad_norm": 1.4120136499404907, + "learning_rate": 2.7390484856289867e-06, + "loss": 1.3782, + "step": 14141 + }, + { + "epoch": 0.773177699110206, + "grad_norm": 1.6254312992095947, + "learning_rate": 2.737792126529204e-06, + "loss": 1.6462, + "step": 14142 + }, + { + "epoch": 0.7732323715539275, + "grad_norm": 2.11126708984375, + "learning_rate": 2.736536009931583e-06, + "loss": 1.3527, + "step": 14143 + }, + { + "epoch": 0.7732870439976491, + "grad_norm": 1.4208980798721313, + "learning_rate": 2.73528013587807e-06, + "loss": 1.1793, + "step": 14144 + }, + { + "epoch": 0.7733417164413706, + "grad_norm": 1.6566405296325684, + "learning_rate": 2.734024504410596e-06, + "loss": 1.5643, + "step": 14145 + }, + { + "epoch": 0.7733963888850922, + "grad_norm": 1.8241461515426636, + "learning_rate": 2.7327691155710978e-06, + "loss": 1.2477, + "step": 14146 + }, + { + "epoch": 0.7734510613288137, + "grad_norm": 1.5776242017745972, + "learning_rate": 2.7315139694014913e-06, + "loss": 1.3562, + "step": 14147 + }, + { + "epoch": 0.7735057337725353, + "grad_norm": 1.315737009048462, + "learning_rate": 2.730259065943688e-06, + "loss": 1.5041, + "step": 14148 + }, + { + "epoch": 0.7735604062162569, + "grad_norm": 1.4487311840057373, + "learning_rate": 2.729004405239595e-06, + "loss": 1.4917, + "step": 14149 + }, + { + "epoch": 0.7736150786599784, + "grad_norm": 1.7510719299316406, + "learning_rate": 2.7277499873311064e-06, + "loss": 1.6029, + "step": 14150 + }, + { + "epoch": 0.7736697511037, + "grad_norm": 1.7172112464904785, + "learning_rate": 2.7264958122601083e-06, + "loss": 1.5821, + "step": 14151 + }, + { + "epoch": 0.7737244235474215, + "grad_norm": 1.6585781574249268, + "learning_rate": 2.7252418800684865e-06, + "loss": 1.4521, + "step": 14152 + }, + { + "epoch": 0.773779095991143, + "grad_norm": 1.6602096557617188, + "learning_rate": 2.723988190798108e-06, + "loss": 1.1152, + "step": 14153 + }, + { + "epoch": 0.7738337684348646, + "grad_norm": 1.6594297885894775, + "learning_rate": 2.722734744490835e-06, + "loss": 1.4622, + "step": 14154 + }, + { + "epoch": 0.7738884408785862, + "grad_norm": 1.3054540157318115, + "learning_rate": 2.7214815411885287e-06, + "loss": 1.6456, + "step": 14155 + }, + { + "epoch": 0.7739431133223077, + "grad_norm": 1.3949321508407593, + "learning_rate": 2.720228580933033e-06, + "loss": 1.0507, + "step": 14156 + }, + { + "epoch": 0.7739977857660293, + "grad_norm": 1.3151555061340332, + "learning_rate": 2.718975863766188e-06, + "loss": 1.4632, + "step": 14157 + }, + { + "epoch": 0.7740524582097509, + "grad_norm": 1.8243459463119507, + "learning_rate": 2.717723389729823e-06, + "loss": 1.358, + "step": 14158 + }, + { + "epoch": 0.7741071306534724, + "grad_norm": 1.5697461366653442, + "learning_rate": 2.716471158865761e-06, + "loss": 1.1451, + "step": 14159 + }, + { + "epoch": 0.774161803097194, + "grad_norm": 1.241958737373352, + "learning_rate": 2.7152191712158207e-06, + "loss": 1.8046, + "step": 14160 + }, + { + "epoch": 0.7742164755409154, + "grad_norm": 1.6819895505905151, + "learning_rate": 2.713967426821806e-06, + "loss": 1.4406, + "step": 14161 + }, + { + "epoch": 0.774271147984637, + "grad_norm": 1.6489365100860596, + "learning_rate": 2.712715925725514e-06, + "loss": 1.5042, + "step": 14162 + }, + { + "epoch": 0.7743258204283586, + "grad_norm": 1.680222511291504, + "learning_rate": 2.7114646679687393e-06, + "loss": 1.427, + "step": 14163 + }, + { + "epoch": 0.7743804928720801, + "grad_norm": 1.344531536102295, + "learning_rate": 2.7102136535932633e-06, + "loss": 1.0815, + "step": 14164 + }, + { + "epoch": 0.7744351653158017, + "grad_norm": 1.4906010627746582, + "learning_rate": 2.7089628826408563e-06, + "loss": 1.4929, + "step": 14165 + }, + { + "epoch": 0.7744898377595233, + "grad_norm": 1.5317845344543457, + "learning_rate": 2.7077123551532913e-06, + "loss": 1.3913, + "step": 14166 + }, + { + "epoch": 0.7745445102032448, + "grad_norm": 1.6062500476837158, + "learning_rate": 2.706462071172322e-06, + "loss": 1.5827, + "step": 14167 + }, + { + "epoch": 0.7745991826469664, + "grad_norm": 1.2022099494934082, + "learning_rate": 2.705212030739699e-06, + "loss": 1.5943, + "step": 14168 + }, + { + "epoch": 0.774653855090688, + "grad_norm": 1.5608433485031128, + "learning_rate": 2.7039622338971637e-06, + "loss": 1.32, + "step": 14169 + }, + { + "epoch": 0.7747085275344094, + "grad_norm": 1.610740065574646, + "learning_rate": 2.7027126806864467e-06, + "loss": 1.3974, + "step": 14170 + }, + { + "epoch": 0.774763199978131, + "grad_norm": 1.3773918151855469, + "learning_rate": 2.701463371149281e-06, + "loss": 1.4806, + "step": 14171 + }, + { + "epoch": 0.7748178724218526, + "grad_norm": 1.3260045051574707, + "learning_rate": 2.70021430532738e-06, + "loss": 1.7702, + "step": 14172 + }, + { + "epoch": 0.7748725448655741, + "grad_norm": 1.2450661659240723, + "learning_rate": 2.698965483262449e-06, + "loss": 1.379, + "step": 14173 + }, + { + "epoch": 0.7749272173092957, + "grad_norm": 1.7136908769607544, + "learning_rate": 2.697716904996196e-06, + "loss": 1.2774, + "step": 14174 + }, + { + "epoch": 0.7749818897530172, + "grad_norm": 2.055285930633545, + "learning_rate": 2.6964685705703107e-06, + "loss": 1.6779, + "step": 14175 + }, + { + "epoch": 0.7750365621967388, + "grad_norm": 1.8404279947280884, + "learning_rate": 2.695220480026476e-06, + "loss": 1.3935, + "step": 14176 + }, + { + "epoch": 0.7750912346404604, + "grad_norm": 1.7030717134475708, + "learning_rate": 2.693972633406373e-06, + "loss": 1.2709, + "step": 14177 + }, + { + "epoch": 0.7751459070841819, + "grad_norm": 1.5008068084716797, + "learning_rate": 2.6927250307516685e-06, + "loss": 1.4365, + "step": 14178 + }, + { + "epoch": 0.7752005795279034, + "grad_norm": 1.8662309646606445, + "learning_rate": 2.691477672104018e-06, + "loss": 1.6033, + "step": 14179 + }, + { + "epoch": 0.775255251971625, + "grad_norm": 1.408001184463501, + "learning_rate": 2.6902305575050822e-06, + "loss": 1.2554, + "step": 14180 + }, + { + "epoch": 0.7753099244153465, + "grad_norm": 1.0640653371810913, + "learning_rate": 2.6889836869965016e-06, + "loss": 1.7029, + "step": 14181 + }, + { + "epoch": 0.7753645968590681, + "grad_norm": 1.271036982536316, + "learning_rate": 2.6877370606199094e-06, + "loss": 1.7026, + "step": 14182 + }, + { + "epoch": 0.7754192693027897, + "grad_norm": 1.5436939001083374, + "learning_rate": 2.6864906784169375e-06, + "loss": 1.5965, + "step": 14183 + }, + { + "epoch": 0.7754739417465112, + "grad_norm": 1.9946794509887695, + "learning_rate": 2.685244540429199e-06, + "loss": 1.5219, + "step": 14184 + }, + { + "epoch": 0.7755286141902328, + "grad_norm": 1.5691750049591064, + "learning_rate": 2.683998646698314e-06, + "loss": 1.2141, + "step": 14185 + }, + { + "epoch": 0.7755832866339544, + "grad_norm": 2.029876232147217, + "learning_rate": 2.6827529972658816e-06, + "loss": 1.11, + "step": 14186 + }, + { + "epoch": 0.7756379590776759, + "grad_norm": 1.4458938837051392, + "learning_rate": 2.6815075921734936e-06, + "loss": 1.3291, + "step": 14187 + }, + { + "epoch": 0.7756926315213974, + "grad_norm": 1.4756639003753662, + "learning_rate": 2.6802624314627436e-06, + "loss": 1.4488, + "step": 14188 + }, + { + "epoch": 0.7757473039651189, + "grad_norm": 1.3933480978012085, + "learning_rate": 2.6790175151752073e-06, + "loss": 1.5839, + "step": 14189 + }, + { + "epoch": 0.7758019764088405, + "grad_norm": 1.8388651609420776, + "learning_rate": 2.677772843352453e-06, + "loss": 1.2484, + "step": 14190 + }, + { + "epoch": 0.7758566488525621, + "grad_norm": 1.798224925994873, + "learning_rate": 2.6765284160360483e-06, + "loss": 1.4468, + "step": 14191 + }, + { + "epoch": 0.7759113212962836, + "grad_norm": 1.4458802938461304, + "learning_rate": 2.6752842332675446e-06, + "loss": 1.5528, + "step": 14192 + }, + { + "epoch": 0.7759659937400052, + "grad_norm": 1.8074973821640015, + "learning_rate": 2.6740402950884848e-06, + "loss": 1.6438, + "step": 14193 + }, + { + "epoch": 0.7760206661837268, + "grad_norm": 1.6418020725250244, + "learning_rate": 2.672796601540415e-06, + "loss": 1.5463, + "step": 14194 + }, + { + "epoch": 0.7760753386274483, + "grad_norm": 2.0060298442840576, + "learning_rate": 2.6715531526648585e-06, + "loss": 1.3737, + "step": 14195 + }, + { + "epoch": 0.7761300110711699, + "grad_norm": 1.9101276397705078, + "learning_rate": 2.67030994850334e-06, + "loss": 1.3242, + "step": 14196 + }, + { + "epoch": 0.7761846835148914, + "grad_norm": 1.5429433584213257, + "learning_rate": 2.669066989097373e-06, + "loss": 1.4651, + "step": 14197 + }, + { + "epoch": 0.7762393559586129, + "grad_norm": 1.9105491638183594, + "learning_rate": 2.6678242744884575e-06, + "loss": 1.4496, + "step": 14198 + }, + { + "epoch": 0.7762940284023345, + "grad_norm": 1.4079279899597168, + "learning_rate": 2.666581804718098e-06, + "loss": 1.4433, + "step": 14199 + }, + { + "epoch": 0.7763487008460561, + "grad_norm": 2.930981159210205, + "learning_rate": 2.6653395798277814e-06, + "loss": 1.5601, + "step": 14200 + }, + { + "epoch": 0.7764033732897776, + "grad_norm": 2.040600061416626, + "learning_rate": 2.6640975998589836e-06, + "loss": 1.4606, + "step": 14201 + }, + { + "epoch": 0.7764580457334992, + "grad_norm": 1.6797168254852295, + "learning_rate": 2.6628558648531845e-06, + "loss": 1.3705, + "step": 14202 + }, + { + "epoch": 0.7765127181772207, + "grad_norm": 2.7981019020080566, + "learning_rate": 2.6616143748518453e-06, + "loss": 1.4451, + "step": 14203 + }, + { + "epoch": 0.7765673906209423, + "grad_norm": 1.557023048400879, + "learning_rate": 2.6603731298964186e-06, + "loss": 1.5458, + "step": 14204 + }, + { + "epoch": 0.7766220630646639, + "grad_norm": 1.5375206470489502, + "learning_rate": 2.6591321300283603e-06, + "loss": 1.3871, + "step": 14205 + }, + { + "epoch": 0.7766767355083853, + "grad_norm": 1.9944332838058472, + "learning_rate": 2.6578913752891044e-06, + "loss": 1.5959, + "step": 14206 + }, + { + "epoch": 0.7767314079521069, + "grad_norm": 1.4293930530548096, + "learning_rate": 2.656650865720085e-06, + "loss": 1.4147, + "step": 14207 + }, + { + "epoch": 0.7767860803958285, + "grad_norm": 1.5555340051651, + "learning_rate": 2.6554106013627256e-06, + "loss": 1.33, + "step": 14208 + }, + { + "epoch": 0.77684075283955, + "grad_norm": 1.3687727451324463, + "learning_rate": 2.654170582258441e-06, + "loss": 1.5412, + "step": 14209 + }, + { + "epoch": 0.7768954252832716, + "grad_norm": 1.6494728326797485, + "learning_rate": 2.652930808448634e-06, + "loss": 1.2317, + "step": 14210 + }, + { + "epoch": 0.7769500977269932, + "grad_norm": 1.7909237146377563, + "learning_rate": 2.6516912799747106e-06, + "loss": 1.3849, + "step": 14211 + }, + { + "epoch": 0.7770047701707147, + "grad_norm": 1.1536338329315186, + "learning_rate": 2.650451996878056e-06, + "loss": 1.3778, + "step": 14212 + }, + { + "epoch": 0.7770594426144363, + "grad_norm": 1.742174506187439, + "learning_rate": 2.649212959200059e-06, + "loss": 1.422, + "step": 14213 + }, + { + "epoch": 0.7771141150581579, + "grad_norm": 1.3485702276229858, + "learning_rate": 2.6479741669820915e-06, + "loss": 1.5296, + "step": 14214 + }, + { + "epoch": 0.7771687875018793, + "grad_norm": 1.4499458074569702, + "learning_rate": 2.6467356202655135e-06, + "loss": 1.2728, + "step": 14215 + }, + { + "epoch": 0.7772234599456009, + "grad_norm": 1.7260738611221313, + "learning_rate": 2.645497319091692e-06, + "loss": 1.2324, + "step": 14216 + }, + { + "epoch": 0.7772781323893224, + "grad_norm": 1.234911322593689, + "learning_rate": 2.6442592635019724e-06, + "loss": 1.6401, + "step": 14217 + }, + { + "epoch": 0.777332804833044, + "grad_norm": 1.9636770486831665, + "learning_rate": 2.6430214535376954e-06, + "loss": 1.4026, + "step": 14218 + }, + { + "epoch": 0.7773874772767656, + "grad_norm": 1.3773987293243408, + "learning_rate": 2.641783889240197e-06, + "loss": 1.2907, + "step": 14219 + }, + { + "epoch": 0.7774421497204871, + "grad_norm": 1.7613635063171387, + "learning_rate": 2.6405465706508014e-06, + "loss": 1.2328, + "step": 14220 + }, + { + "epoch": 0.7774968221642087, + "grad_norm": 1.8536523580551147, + "learning_rate": 2.639309497810827e-06, + "loss": 1.5252, + "step": 14221 + }, + { + "epoch": 0.7775514946079303, + "grad_norm": 1.099886178970337, + "learning_rate": 2.6380726707615787e-06, + "loss": 1.5718, + "step": 14222 + }, + { + "epoch": 0.7776061670516518, + "grad_norm": 1.4693173170089722, + "learning_rate": 2.63683608954436e-06, + "loss": 1.3638, + "step": 14223 + }, + { + "epoch": 0.7776608394953733, + "grad_norm": 1.4470025300979614, + "learning_rate": 2.6355997542004596e-06, + "loss": 1.3809, + "step": 14224 + }, + { + "epoch": 0.7777155119390949, + "grad_norm": 1.3074127435684204, + "learning_rate": 2.634363664771168e-06, + "loss": 1.6234, + "step": 14225 + }, + { + "epoch": 0.7777701843828164, + "grad_norm": 1.732064962387085, + "learning_rate": 2.633127821297754e-06, + "loss": 1.3356, + "step": 14226 + }, + { + "epoch": 0.777824856826538, + "grad_norm": 1.660919427871704, + "learning_rate": 2.6318922238214915e-06, + "loss": 1.2667, + "step": 14227 + }, + { + "epoch": 0.7778795292702596, + "grad_norm": 1.8991683721542358, + "learning_rate": 2.630656872383639e-06, + "loss": 1.6955, + "step": 14228 + }, + { + "epoch": 0.7779342017139811, + "grad_norm": 1.5161988735198975, + "learning_rate": 2.629421767025442e-06, + "loss": 1.3193, + "step": 14229 + }, + { + "epoch": 0.7779888741577027, + "grad_norm": 1.6814483404159546, + "learning_rate": 2.6281869077881507e-06, + "loss": 1.4032, + "step": 14230 + }, + { + "epoch": 0.7780435466014242, + "grad_norm": 1.3842289447784424, + "learning_rate": 2.6269522947129976e-06, + "loss": 1.2544, + "step": 14231 + }, + { + "epoch": 0.7780982190451458, + "grad_norm": 1.4555561542510986, + "learning_rate": 2.6257179278412084e-06, + "loss": 1.4899, + "step": 14232 + }, + { + "epoch": 0.7781528914888673, + "grad_norm": 1.4471362829208374, + "learning_rate": 2.6244838072140023e-06, + "loss": 1.5531, + "step": 14233 + }, + { + "epoch": 0.7782075639325888, + "grad_norm": 1.2990065813064575, + "learning_rate": 2.623249932872589e-06, + "loss": 1.2957, + "step": 14234 + }, + { + "epoch": 0.7782622363763104, + "grad_norm": 1.50108003616333, + "learning_rate": 2.622016304858167e-06, + "loss": 1.4219, + "step": 14235 + }, + { + "epoch": 0.778316908820032, + "grad_norm": 1.4331631660461426, + "learning_rate": 2.6207829232119373e-06, + "loss": 1.4362, + "step": 14236 + }, + { + "epoch": 0.7783715812637535, + "grad_norm": 2.109337329864502, + "learning_rate": 2.619549787975081e-06, + "loss": 1.3463, + "step": 14237 + }, + { + "epoch": 0.7784262537074751, + "grad_norm": 1.310140609741211, + "learning_rate": 2.618316899188773e-06, + "loss": 1.5961, + "step": 14238 + }, + { + "epoch": 0.7784809261511967, + "grad_norm": 1.186828374862671, + "learning_rate": 2.617084256894189e-06, + "loss": 1.2855, + "step": 14239 + }, + { + "epoch": 0.7785355985949182, + "grad_norm": 1.676672339439392, + "learning_rate": 2.6158518611324836e-06, + "loss": 1.179, + "step": 14240 + }, + { + "epoch": 0.7785902710386398, + "grad_norm": 1.3161557912826538, + "learning_rate": 2.6146197119448135e-06, + "loss": 1.4416, + "step": 14241 + }, + { + "epoch": 0.7786449434823614, + "grad_norm": 1.423352599143982, + "learning_rate": 2.613387809372323e-06, + "loss": 1.636, + "step": 14242 + }, + { + "epoch": 0.7786996159260828, + "grad_norm": 1.6296470165252686, + "learning_rate": 2.6121561534561423e-06, + "loss": 1.6407, + "step": 14243 + }, + { + "epoch": 0.7787542883698044, + "grad_norm": 1.479924201965332, + "learning_rate": 2.6109247442374088e-06, + "loss": 1.5198, + "step": 14244 + }, + { + "epoch": 0.7788089608135259, + "grad_norm": 1.5805081129074097, + "learning_rate": 2.6096935817572357e-06, + "loss": 1.4851, + "step": 14245 + }, + { + "epoch": 0.7788636332572475, + "grad_norm": 1.6211683750152588, + "learning_rate": 2.608462666056736e-06, + "loss": 1.2334, + "step": 14246 + }, + { + "epoch": 0.7789183057009691, + "grad_norm": 1.611861228942871, + "learning_rate": 2.6072319971770122e-06, + "loss": 1.1822, + "step": 14247 + }, + { + "epoch": 0.7789729781446906, + "grad_norm": 1.546858787536621, + "learning_rate": 2.6060015751591605e-06, + "loss": 1.3044, + "step": 14248 + }, + { + "epoch": 0.7790276505884122, + "grad_norm": 1.4102219343185425, + "learning_rate": 2.6047714000442634e-06, + "loss": 1.6147, + "step": 14249 + }, + { + "epoch": 0.7790823230321338, + "grad_norm": 1.6689602136611938, + "learning_rate": 2.6035414718734052e-06, + "loss": 1.406, + "step": 14250 + }, + { + "epoch": 0.7791369954758552, + "grad_norm": 1.6748448610305786, + "learning_rate": 2.602311790687655e-06, + "loss": 1.4099, + "step": 14251 + }, + { + "epoch": 0.7791916679195768, + "grad_norm": 1.547628402709961, + "learning_rate": 2.6010823565280695e-06, + "loss": 1.4516, + "step": 14252 + }, + { + "epoch": 0.7792463403632984, + "grad_norm": 1.3930796384811401, + "learning_rate": 2.599853169435709e-06, + "loss": 1.4693, + "step": 14253 + }, + { + "epoch": 0.7793010128070199, + "grad_norm": 1.4554479122161865, + "learning_rate": 2.5986242294516127e-06, + "loss": 1.302, + "step": 14254 + }, + { + "epoch": 0.7793556852507415, + "grad_norm": 1.8361413478851318, + "learning_rate": 2.5973955366168257e-06, + "loss": 1.4258, + "step": 14255 + }, + { + "epoch": 0.7794103576944631, + "grad_norm": 1.6107947826385498, + "learning_rate": 2.5961670909723714e-06, + "loss": 1.1762, + "step": 14256 + }, + { + "epoch": 0.7794650301381846, + "grad_norm": 1.3126766681671143, + "learning_rate": 2.5949388925592687e-06, + "loss": 1.3951, + "step": 14257 + }, + { + "epoch": 0.7795197025819062, + "grad_norm": 1.5268610715866089, + "learning_rate": 2.593710941418537e-06, + "loss": 1.448, + "step": 14258 + }, + { + "epoch": 0.7795743750256277, + "grad_norm": 1.4758129119873047, + "learning_rate": 2.5924832375911746e-06, + "loss": 1.457, + "step": 14259 + }, + { + "epoch": 0.7796290474693492, + "grad_norm": 1.7244157791137695, + "learning_rate": 2.5912557811181802e-06, + "loss": 1.4018, + "step": 14260 + }, + { + "epoch": 0.7796837199130708, + "grad_norm": 1.6106388568878174, + "learning_rate": 2.5900285720405403e-06, + "loss": 1.3498, + "step": 14261 + }, + { + "epoch": 0.7797383923567923, + "grad_norm": 1.598981499671936, + "learning_rate": 2.588801610399234e-06, + "loss": 1.314, + "step": 14262 + }, + { + "epoch": 0.7797930648005139, + "grad_norm": 1.6275688409805298, + "learning_rate": 2.587574896235231e-06, + "loss": 1.3165, + "step": 14263 + }, + { + "epoch": 0.7798477372442355, + "grad_norm": 1.5480107069015503, + "learning_rate": 2.586348429589498e-06, + "loss": 1.3741, + "step": 14264 + }, + { + "epoch": 0.779902409687957, + "grad_norm": 1.248457670211792, + "learning_rate": 2.585122210502987e-06, + "loss": 1.4964, + "step": 14265 + }, + { + "epoch": 0.7799570821316786, + "grad_norm": 1.4006723165512085, + "learning_rate": 2.5838962390166433e-06, + "loss": 1.623, + "step": 14266 + }, + { + "epoch": 0.7800117545754002, + "grad_norm": 1.3941457271575928, + "learning_rate": 2.582670515171409e-06, + "loss": 1.4305, + "step": 14267 + }, + { + "epoch": 0.7800664270191217, + "grad_norm": 1.3857593536376953, + "learning_rate": 2.581445039008209e-06, + "loss": 1.4672, + "step": 14268 + }, + { + "epoch": 0.7801210994628432, + "grad_norm": 1.7167423963546753, + "learning_rate": 2.58021981056797e-06, + "loss": 1.5121, + "step": 14269 + }, + { + "epoch": 0.7801757719065648, + "grad_norm": 1.319146752357483, + "learning_rate": 2.5789948298916025e-06, + "loss": 1.5429, + "step": 14270 + }, + { + "epoch": 0.7802304443502863, + "grad_norm": 1.363847255706787, + "learning_rate": 2.5777700970200115e-06, + "loss": 1.4673, + "step": 14271 + }, + { + "epoch": 0.7802851167940079, + "grad_norm": 1.1053822040557861, + "learning_rate": 2.5765456119940933e-06, + "loss": 1.8574, + "step": 14272 + }, + { + "epoch": 0.7803397892377295, + "grad_norm": 1.6468946933746338, + "learning_rate": 2.575321374854738e-06, + "loss": 1.2161, + "step": 14273 + }, + { + "epoch": 0.780394461681451, + "grad_norm": 1.9084222316741943, + "learning_rate": 2.5740973856428207e-06, + "loss": 1.7153, + "step": 14274 + }, + { + "epoch": 0.7804491341251726, + "grad_norm": 1.3311913013458252, + "learning_rate": 2.57287364439922e-06, + "loss": 1.4826, + "step": 14275 + }, + { + "epoch": 0.7805038065688941, + "grad_norm": 1.1617385149002075, + "learning_rate": 2.5716501511647975e-06, + "loss": 1.5785, + "step": 14276 + }, + { + "epoch": 0.7805584790126157, + "grad_norm": 1.491043210029602, + "learning_rate": 2.5704269059804034e-06, + "loss": 1.5585, + "step": 14277 + }, + { + "epoch": 0.7806131514563373, + "grad_norm": 1.4784976243972778, + "learning_rate": 2.5692039088868927e-06, + "loss": 1.1967, + "step": 14278 + }, + { + "epoch": 0.7806678239000587, + "grad_norm": 1.3237282037734985, + "learning_rate": 2.5679811599251003e-06, + "loss": 1.4856, + "step": 14279 + }, + { + "epoch": 0.7807224963437803, + "grad_norm": 1.798587441444397, + "learning_rate": 2.566758659135854e-06, + "loss": 1.5035, + "step": 14280 + }, + { + "epoch": 0.7807771687875019, + "grad_norm": 1.415136456489563, + "learning_rate": 2.565536406559982e-06, + "loss": 1.4978, + "step": 14281 + }, + { + "epoch": 0.7808318412312234, + "grad_norm": 1.5234410762786865, + "learning_rate": 2.5643144022382904e-06, + "loss": 1.4798, + "step": 14282 + }, + { + "epoch": 0.780886513674945, + "grad_norm": 1.6785008907318115, + "learning_rate": 2.5630926462115934e-06, + "loss": 1.4108, + "step": 14283 + }, + { + "epoch": 0.7809411861186666, + "grad_norm": 1.7309815883636475, + "learning_rate": 2.561871138520684e-06, + "loss": 1.3865, + "step": 14284 + }, + { + "epoch": 0.7809958585623881, + "grad_norm": 1.6592447757720947, + "learning_rate": 2.5606498792063515e-06, + "loss": 1.4851, + "step": 14285 + }, + { + "epoch": 0.7810505310061097, + "grad_norm": 1.3520710468292236, + "learning_rate": 2.559428868309377e-06, + "loss": 1.3743, + "step": 14286 + }, + { + "epoch": 0.7811052034498313, + "grad_norm": 1.4368244409561157, + "learning_rate": 2.558208105870531e-06, + "loss": 1.4304, + "step": 14287 + }, + { + "epoch": 0.7811598758935527, + "grad_norm": 1.5361812114715576, + "learning_rate": 2.5569875919305777e-06, + "loss": 1.3991, + "step": 14288 + }, + { + "epoch": 0.7812145483372743, + "grad_norm": 1.6304285526275635, + "learning_rate": 2.555767326530276e-06, + "loss": 1.3419, + "step": 14289 + }, + { + "epoch": 0.7812692207809958, + "grad_norm": 1.5658904314041138, + "learning_rate": 2.5545473097103725e-06, + "loss": 1.4989, + "step": 14290 + }, + { + "epoch": 0.7813238932247174, + "grad_norm": 1.422631025314331, + "learning_rate": 2.553327541511602e-06, + "loss": 1.34, + "step": 14291 + }, + { + "epoch": 0.781378565668439, + "grad_norm": 1.3951416015625, + "learning_rate": 2.552108021974703e-06, + "loss": 1.5131, + "step": 14292 + }, + { + "epoch": 0.7814332381121605, + "grad_norm": 1.5352082252502441, + "learning_rate": 2.5508887511403936e-06, + "loss": 1.347, + "step": 14293 + }, + { + "epoch": 0.7814879105558821, + "grad_norm": 1.6321532726287842, + "learning_rate": 2.5496697290493855e-06, + "loss": 1.3771, + "step": 14294 + }, + { + "epoch": 0.7815425829996037, + "grad_norm": 2.0241763591766357, + "learning_rate": 2.548450955742391e-06, + "loss": 1.6857, + "step": 14295 + }, + { + "epoch": 0.7815972554433251, + "grad_norm": 1.2658840417861938, + "learning_rate": 2.5472324312601017e-06, + "loss": 1.5262, + "step": 14296 + }, + { + "epoch": 0.7816519278870467, + "grad_norm": 1.5892237424850464, + "learning_rate": 2.5460141556432127e-06, + "loss": 1.3137, + "step": 14297 + }, + { + "epoch": 0.7817066003307683, + "grad_norm": 2.1685333251953125, + "learning_rate": 2.544796128932403e-06, + "loss": 1.5683, + "step": 14298 + }, + { + "epoch": 0.7817612727744898, + "grad_norm": 1.4377710819244385, + "learning_rate": 2.5435783511683444e-06, + "loss": 1.3965, + "step": 14299 + }, + { + "epoch": 0.7818159452182114, + "grad_norm": 1.3815628290176392, + "learning_rate": 2.542360822391702e-06, + "loss": 1.3075, + "step": 14300 + }, + { + "epoch": 0.781870617661933, + "grad_norm": 1.7348352670669556, + "learning_rate": 2.541143542643132e-06, + "loss": 1.2604, + "step": 14301 + }, + { + "epoch": 0.7819252901056545, + "grad_norm": 2.038252830505371, + "learning_rate": 2.539926511963278e-06, + "loss": 1.4769, + "step": 14302 + }, + { + "epoch": 0.7819799625493761, + "grad_norm": 1.3636771440505981, + "learning_rate": 2.5387097303927864e-06, + "loss": 1.6192, + "step": 14303 + }, + { + "epoch": 0.7820346349930976, + "grad_norm": 1.9290884733200073, + "learning_rate": 2.5374931979722863e-06, + "loss": 1.2819, + "step": 14304 + }, + { + "epoch": 0.7820893074368191, + "grad_norm": 1.4026663303375244, + "learning_rate": 2.536276914742395e-06, + "loss": 1.2048, + "step": 14305 + }, + { + "epoch": 0.7821439798805407, + "grad_norm": 1.6098709106445312, + "learning_rate": 2.535060880743736e-06, + "loss": 1.4143, + "step": 14306 + }, + { + "epoch": 0.7821986523242622, + "grad_norm": 1.515063762664795, + "learning_rate": 2.5338450960169105e-06, + "loss": 1.5708, + "step": 14307 + }, + { + "epoch": 0.7822533247679838, + "grad_norm": 1.7104042768478394, + "learning_rate": 2.532629560602514e-06, + "loss": 1.4364, + "step": 14308 + }, + { + "epoch": 0.7823079972117054, + "grad_norm": 1.7540218830108643, + "learning_rate": 2.531414274541143e-06, + "loss": 1.6674, + "step": 14309 + }, + { + "epoch": 0.7823626696554269, + "grad_norm": 1.7348886728286743, + "learning_rate": 2.5301992378733753e-06, + "loss": 1.5667, + "step": 14310 + }, + { + "epoch": 0.7824173420991485, + "grad_norm": 1.4763593673706055, + "learning_rate": 2.528984450639782e-06, + "loss": 1.5602, + "step": 14311 + }, + { + "epoch": 0.7824720145428701, + "grad_norm": 1.6409677267074585, + "learning_rate": 2.5277699128809307e-06, + "loss": 1.4465, + "step": 14312 + }, + { + "epoch": 0.7825266869865916, + "grad_norm": 1.5644893646240234, + "learning_rate": 2.5265556246373724e-06, + "loss": 1.3766, + "step": 14313 + }, + { + "epoch": 0.7825813594303132, + "grad_norm": 1.3073087930679321, + "learning_rate": 2.525341585949662e-06, + "loss": 1.4939, + "step": 14314 + }, + { + "epoch": 0.7826360318740347, + "grad_norm": 1.3948911428451538, + "learning_rate": 2.5241277968583355e-06, + "loss": 1.4645, + "step": 14315 + }, + { + "epoch": 0.7826907043177562, + "grad_norm": 1.6509993076324463, + "learning_rate": 2.5229142574039224e-06, + "loss": 1.4859, + "step": 14316 + }, + { + "epoch": 0.7827453767614778, + "grad_norm": 1.9366954565048218, + "learning_rate": 2.52170096762695e-06, + "loss": 1.6673, + "step": 14317 + }, + { + "epoch": 0.7828000492051993, + "grad_norm": 1.2882128953933716, + "learning_rate": 2.5204879275679307e-06, + "loss": 1.5283, + "step": 14318 + }, + { + "epoch": 0.7828547216489209, + "grad_norm": 1.4621672630310059, + "learning_rate": 2.5192751372673673e-06, + "loss": 1.3229, + "step": 14319 + }, + { + "epoch": 0.7829093940926425, + "grad_norm": 1.566204309463501, + "learning_rate": 2.5180625967657647e-06, + "loss": 1.4425, + "step": 14320 + }, + { + "epoch": 0.782964066536364, + "grad_norm": 1.4227681159973145, + "learning_rate": 2.5168503061036086e-06, + "loss": 1.5013, + "step": 14321 + }, + { + "epoch": 0.7830187389800856, + "grad_norm": 1.1792857646942139, + "learning_rate": 2.5156382653213786e-06, + "loss": 1.4636, + "step": 14322 + }, + { + "epoch": 0.7830734114238072, + "grad_norm": 1.758719563484192, + "learning_rate": 2.5144264744595515e-06, + "loss": 1.2436, + "step": 14323 + }, + { + "epoch": 0.7831280838675286, + "grad_norm": 1.4030474424362183, + "learning_rate": 2.5132149335585896e-06, + "loss": 1.3919, + "step": 14324 + }, + { + "epoch": 0.7831827563112502, + "grad_norm": 1.2984639406204224, + "learning_rate": 2.51200364265895e-06, + "loss": 1.4145, + "step": 14325 + }, + { + "epoch": 0.7832374287549718, + "grad_norm": 1.4770599603652954, + "learning_rate": 2.5107926018010796e-06, + "loss": 1.3057, + "step": 14326 + }, + { + "epoch": 0.7832921011986933, + "grad_norm": 1.4853346347808838, + "learning_rate": 2.5095818110254155e-06, + "loss": 1.3339, + "step": 14327 + }, + { + "epoch": 0.7833467736424149, + "grad_norm": 1.2923216819763184, + "learning_rate": 2.5083712703723952e-06, + "loss": 1.5899, + "step": 14328 + }, + { + "epoch": 0.7834014460861365, + "grad_norm": 1.8460681438446045, + "learning_rate": 2.507160979882436e-06, + "loss": 1.3239, + "step": 14329 + }, + { + "epoch": 0.783456118529858, + "grad_norm": 1.428098201751709, + "learning_rate": 2.5059509395959523e-06, + "loss": 1.3127, + "step": 14330 + }, + { + "epoch": 0.7835107909735796, + "grad_norm": 1.5464529991149902, + "learning_rate": 2.5047411495533556e-06, + "loss": 1.3341, + "step": 14331 + }, + { + "epoch": 0.783565463417301, + "grad_norm": 1.745168924331665, + "learning_rate": 2.503531609795039e-06, + "loss": 1.4851, + "step": 14332 + }, + { + "epoch": 0.7836201358610226, + "grad_norm": 1.2309902906417847, + "learning_rate": 2.502322320361391e-06, + "loss": 1.5586, + "step": 14333 + }, + { + "epoch": 0.7836748083047442, + "grad_norm": 1.3461885452270508, + "learning_rate": 2.5011132812927963e-06, + "loss": 1.2954, + "step": 14334 + }, + { + "epoch": 0.7837294807484657, + "grad_norm": 1.4653867483139038, + "learning_rate": 2.499904492629627e-06, + "loss": 1.5719, + "step": 14335 + }, + { + "epoch": 0.7837841531921873, + "grad_norm": 1.280697226524353, + "learning_rate": 2.4986959544122423e-06, + "loss": 1.4251, + "step": 14336 + }, + { + "epoch": 0.7838388256359089, + "grad_norm": 1.4517152309417725, + "learning_rate": 2.4974876666810053e-06, + "loss": 1.6813, + "step": 14337 + }, + { + "epoch": 0.7838934980796304, + "grad_norm": 1.5088536739349365, + "learning_rate": 2.4962796294762615e-06, + "loss": 1.3731, + "step": 14338 + }, + { + "epoch": 0.783948170523352, + "grad_norm": 1.6255329847335815, + "learning_rate": 2.495071842838348e-06, + "loss": 1.236, + "step": 14339 + }, + { + "epoch": 0.7840028429670736, + "grad_norm": 1.629616141319275, + "learning_rate": 2.4938643068075962e-06, + "loss": 1.2539, + "step": 14340 + }, + { + "epoch": 0.784057515410795, + "grad_norm": 1.5005751848220825, + "learning_rate": 2.4926570214243264e-06, + "loss": 1.4075, + "step": 14341 + }, + { + "epoch": 0.7841121878545166, + "grad_norm": 1.6149094104766846, + "learning_rate": 2.4914499867288577e-06, + "loss": 1.4746, + "step": 14342 + }, + { + "epoch": 0.7841668602982382, + "grad_norm": 1.713252305984497, + "learning_rate": 2.4902432027614933e-06, + "loss": 1.6564, + "step": 14343 + }, + { + "epoch": 0.7842215327419597, + "grad_norm": 1.7946090698242188, + "learning_rate": 2.489036669562528e-06, + "loss": 1.3811, + "step": 14344 + }, + { + "epoch": 0.7842762051856813, + "grad_norm": 1.3561960458755493, + "learning_rate": 2.4878303871722564e-06, + "loss": 1.5914, + "step": 14345 + }, + { + "epoch": 0.7843308776294028, + "grad_norm": 2.0434248447418213, + "learning_rate": 2.4866243556309557e-06, + "loss": 1.3703, + "step": 14346 + }, + { + "epoch": 0.7843855500731244, + "grad_norm": 1.4691760540008545, + "learning_rate": 2.485418574978895e-06, + "loss": 1.6101, + "step": 14347 + }, + { + "epoch": 0.784440222516846, + "grad_norm": 1.388358473777771, + "learning_rate": 2.4842130452563453e-06, + "loss": 1.479, + "step": 14348 + }, + { + "epoch": 0.7844948949605675, + "grad_norm": 1.6181459426879883, + "learning_rate": 2.483007766503558e-06, + "loss": 1.1863, + "step": 14349 + }, + { + "epoch": 0.784549567404289, + "grad_norm": 1.531059741973877, + "learning_rate": 2.4818027387607814e-06, + "loss": 1.4038, + "step": 14350 + }, + { + "epoch": 0.7846042398480106, + "grad_norm": 1.5185797214508057, + "learning_rate": 2.480597962068252e-06, + "loss": 1.6318, + "step": 14351 + }, + { + "epoch": 0.7846589122917321, + "grad_norm": 1.675533652305603, + "learning_rate": 2.479393436466202e-06, + "loss": 1.1983, + "step": 14352 + }, + { + "epoch": 0.7847135847354537, + "grad_norm": 1.312811017036438, + "learning_rate": 2.4781891619948506e-06, + "loss": 1.494, + "step": 14353 + }, + { + "epoch": 0.7847682571791753, + "grad_norm": 1.4626144170761108, + "learning_rate": 2.4769851386944157e-06, + "loss": 1.5154, + "step": 14354 + }, + { + "epoch": 0.7848229296228968, + "grad_norm": 1.9437845945358276, + "learning_rate": 2.475781366605098e-06, + "loss": 1.2863, + "step": 14355 + }, + { + "epoch": 0.7848776020666184, + "grad_norm": 1.6418845653533936, + "learning_rate": 2.474577845767099e-06, + "loss": 1.5861, + "step": 14356 + }, + { + "epoch": 0.78493227451034, + "grad_norm": 1.3487660884857178, + "learning_rate": 2.4733745762206042e-06, + "loss": 1.55, + "step": 14357 + }, + { + "epoch": 0.7849869469540615, + "grad_norm": 1.3643385171890259, + "learning_rate": 2.4721715580057926e-06, + "loss": 1.4976, + "step": 14358 + }, + { + "epoch": 0.785041619397783, + "grad_norm": 1.4878618717193604, + "learning_rate": 2.470968791162839e-06, + "loss": 1.6508, + "step": 14359 + }, + { + "epoch": 0.7850962918415045, + "grad_norm": 1.695469617843628, + "learning_rate": 2.4697662757319053e-06, + "loss": 1.4013, + "step": 14360 + }, + { + "epoch": 0.7851509642852261, + "grad_norm": 1.8112854957580566, + "learning_rate": 2.4685640117531427e-06, + "loss": 1.286, + "step": 14361 + }, + { + "epoch": 0.7852056367289477, + "grad_norm": 1.7059433460235596, + "learning_rate": 2.467361999266704e-06, + "loss": 1.3882, + "step": 14362 + }, + { + "epoch": 0.7852603091726692, + "grad_norm": 1.6693317890167236, + "learning_rate": 2.4661602383127235e-06, + "loss": 1.2901, + "step": 14363 + }, + { + "epoch": 0.7853149816163908, + "grad_norm": 1.5278366804122925, + "learning_rate": 2.4649587289313325e-06, + "loss": 1.6976, + "step": 14364 + }, + { + "epoch": 0.7853696540601124, + "grad_norm": 1.7223888635635376, + "learning_rate": 2.46375747116265e-06, + "loss": 1.4544, + "step": 14365 + }, + { + "epoch": 0.7854243265038339, + "grad_norm": 1.3451480865478516, + "learning_rate": 2.4625564650467904e-06, + "loss": 1.1939, + "step": 14366 + }, + { + "epoch": 0.7854789989475555, + "grad_norm": 2.1788718700408936, + "learning_rate": 2.461355710623855e-06, + "loss": 1.2103, + "step": 14367 + }, + { + "epoch": 0.785533671391277, + "grad_norm": 2.255262613296509, + "learning_rate": 2.4601552079339453e-06, + "loss": 1.2582, + "step": 14368 + }, + { + "epoch": 0.7855883438349985, + "grad_norm": 1.3367602825164795, + "learning_rate": 2.4589549570171423e-06, + "loss": 1.2922, + "step": 14369 + }, + { + "epoch": 0.7856430162787201, + "grad_norm": 1.6969228982925415, + "learning_rate": 2.4577549579135318e-06, + "loss": 1.3246, + "step": 14370 + }, + { + "epoch": 0.7856976887224417, + "grad_norm": 1.4519011974334717, + "learning_rate": 2.456555210663183e-06, + "loss": 1.5056, + "step": 14371 + }, + { + "epoch": 0.7857523611661632, + "grad_norm": 1.7552586793899536, + "learning_rate": 2.4553557153061527e-06, + "loss": 1.4649, + "step": 14372 + }, + { + "epoch": 0.7858070336098848, + "grad_norm": 2.101107358932495, + "learning_rate": 2.4541564718825028e-06, + "loss": 1.4317, + "step": 14373 + }, + { + "epoch": 0.7858617060536063, + "grad_norm": 1.6832479238510132, + "learning_rate": 2.4529574804322744e-06, + "loss": 1.4499, + "step": 14374 + }, + { + "epoch": 0.7859163784973279, + "grad_norm": 1.5848370790481567, + "learning_rate": 2.4517587409955036e-06, + "loss": 1.3881, + "step": 14375 + }, + { + "epoch": 0.7859710509410495, + "grad_norm": 1.3268680572509766, + "learning_rate": 2.450560253612223e-06, + "loss": 1.5846, + "step": 14376 + }, + { + "epoch": 0.786025723384771, + "grad_norm": 1.518947720527649, + "learning_rate": 2.449362018322451e-06, + "loss": 1.3591, + "step": 14377 + }, + { + "epoch": 0.7860803958284925, + "grad_norm": 1.3798993825912476, + "learning_rate": 2.4481640351661995e-06, + "loss": 1.2676, + "step": 14378 + }, + { + "epoch": 0.7861350682722141, + "grad_norm": 1.5061508417129517, + "learning_rate": 2.4469663041834713e-06, + "loss": 1.5429, + "step": 14379 + }, + { + "epoch": 0.7861897407159356, + "grad_norm": 1.5626555681228638, + "learning_rate": 2.445768825414263e-06, + "loss": 1.6164, + "step": 14380 + }, + { + "epoch": 0.7862444131596572, + "grad_norm": 1.7188619375228882, + "learning_rate": 2.4445715988985562e-06, + "loss": 1.6795, + "step": 14381 + }, + { + "epoch": 0.7862990856033788, + "grad_norm": 1.5478538274765015, + "learning_rate": 2.443374624676337e-06, + "loss": 1.3722, + "step": 14382 + }, + { + "epoch": 0.7863537580471003, + "grad_norm": 1.3346607685089111, + "learning_rate": 2.4421779027875668e-06, + "loss": 1.5856, + "step": 14383 + }, + { + "epoch": 0.7864084304908219, + "grad_norm": 1.779372215270996, + "learning_rate": 2.440981433272216e-06, + "loss": 1.6858, + "step": 14384 + }, + { + "epoch": 0.7864631029345435, + "grad_norm": 1.6200426816940308, + "learning_rate": 2.4397852161702317e-06, + "loss": 1.4835, + "step": 14385 + }, + { + "epoch": 0.786517775378265, + "grad_norm": 1.3008993864059448, + "learning_rate": 2.4385892515215583e-06, + "loss": 1.305, + "step": 14386 + }, + { + "epoch": 0.7865724478219865, + "grad_norm": 1.5916388034820557, + "learning_rate": 2.437393539366134e-06, + "loss": 1.6096, + "step": 14387 + }, + { + "epoch": 0.786627120265708, + "grad_norm": 1.8753947019577026, + "learning_rate": 2.4361980797438868e-06, + "loss": 1.6152, + "step": 14388 + }, + { + "epoch": 0.7866817927094296, + "grad_norm": 1.1222666501998901, + "learning_rate": 2.435002872694735e-06, + "loss": 1.596, + "step": 14389 + }, + { + "epoch": 0.7867364651531512, + "grad_norm": 1.9784830808639526, + "learning_rate": 2.433807918258588e-06, + "loss": 1.3628, + "step": 14390 + }, + { + "epoch": 0.7867911375968727, + "grad_norm": 1.5538685321807861, + "learning_rate": 2.43261321647535e-06, + "loss": 1.3818, + "step": 14391 + }, + { + "epoch": 0.7868458100405943, + "grad_norm": 1.208833932876587, + "learning_rate": 2.4314187673849122e-06, + "loss": 1.5297, + "step": 14392 + }, + { + "epoch": 0.7869004824843159, + "grad_norm": 1.0856209993362427, + "learning_rate": 2.4302245710271634e-06, + "loss": 1.4798, + "step": 14393 + }, + { + "epoch": 0.7869551549280374, + "grad_norm": 1.6069411039352417, + "learning_rate": 2.4290306274419794e-06, + "loss": 1.3557, + "step": 14394 + }, + { + "epoch": 0.787009827371759, + "grad_norm": 1.5107091665267944, + "learning_rate": 2.4278369366692268e-06, + "loss": 1.4872, + "step": 14395 + }, + { + "epoch": 0.7870644998154805, + "grad_norm": 1.3292614221572876, + "learning_rate": 2.4266434987487697e-06, + "loss": 1.3796, + "step": 14396 + }, + { + "epoch": 0.787119172259202, + "grad_norm": 1.5862829685211182, + "learning_rate": 2.4254503137204544e-06, + "loss": 1.4091, + "step": 14397 + }, + { + "epoch": 0.7871738447029236, + "grad_norm": 1.9341202974319458, + "learning_rate": 2.42425738162413e-06, + "loss": 1.5558, + "step": 14398 + }, + { + "epoch": 0.7872285171466452, + "grad_norm": 1.9436280727386475, + "learning_rate": 2.423064702499629e-06, + "loss": 1.2441, + "step": 14399 + }, + { + "epoch": 0.7872831895903667, + "grad_norm": 1.5445520877838135, + "learning_rate": 2.4218722763867754e-06, + "loss": 1.3432, + "step": 14400 + }, + { + "epoch": 0.7873378620340883, + "grad_norm": 1.3247497081756592, + "learning_rate": 2.4206801033253914e-06, + "loss": 1.4459, + "step": 14401 + }, + { + "epoch": 0.7873925344778098, + "grad_norm": 2.2654566764831543, + "learning_rate": 2.419488183355284e-06, + "loss": 1.4412, + "step": 14402 + }, + { + "epoch": 0.7874472069215314, + "grad_norm": 1.7739455699920654, + "learning_rate": 2.418296516516254e-06, + "loss": 1.4875, + "step": 14403 + }, + { + "epoch": 0.787501879365253, + "grad_norm": 1.7175852060317993, + "learning_rate": 2.4171051028480953e-06, + "loss": 1.4915, + "step": 14404 + }, + { + "epoch": 0.7875565518089744, + "grad_norm": 1.2337063550949097, + "learning_rate": 2.4159139423905898e-06, + "loss": 1.4765, + "step": 14405 + }, + { + "epoch": 0.787611224252696, + "grad_norm": 1.4896595478057861, + "learning_rate": 2.414723035183513e-06, + "loss": 1.3664, + "step": 14406 + }, + { + "epoch": 0.7876658966964176, + "grad_norm": 1.4543994665145874, + "learning_rate": 2.4135323812666357e-06, + "loss": 1.4075, + "step": 14407 + }, + { + "epoch": 0.7877205691401391, + "grad_norm": 1.6233506202697754, + "learning_rate": 2.4123419806797143e-06, + "loss": 1.3154, + "step": 14408 + }, + { + "epoch": 0.7877752415838607, + "grad_norm": 1.5723638534545898, + "learning_rate": 2.411151833462496e-06, + "loss": 1.4849, + "step": 14409 + }, + { + "epoch": 0.7878299140275823, + "grad_norm": 1.4413849115371704, + "learning_rate": 2.4099619396547293e-06, + "loss": 1.7048, + "step": 14410 + }, + { + "epoch": 0.7878845864713038, + "grad_norm": 1.9078985452651978, + "learning_rate": 2.4087722992961406e-06, + "loss": 1.4863, + "step": 14411 + }, + { + "epoch": 0.7879392589150254, + "grad_norm": 1.8315770626068115, + "learning_rate": 2.4075829124264606e-06, + "loss": 1.416, + "step": 14412 + }, + { + "epoch": 0.787993931358747, + "grad_norm": 1.5140044689178467, + "learning_rate": 2.406393779085404e-06, + "loss": 1.4324, + "step": 14413 + }, + { + "epoch": 0.7880486038024684, + "grad_norm": 1.780116081237793, + "learning_rate": 2.4052048993126754e-06, + "loss": 1.3735, + "step": 14414 + }, + { + "epoch": 0.78810327624619, + "grad_norm": 2.0495848655700684, + "learning_rate": 2.4040162731479786e-06, + "loss": 1.2712, + "step": 14415 + }, + { + "epoch": 0.7881579486899115, + "grad_norm": 1.6656116247177124, + "learning_rate": 2.4028279006310053e-06, + "loss": 1.2027, + "step": 14416 + }, + { + "epoch": 0.7882126211336331, + "grad_norm": 1.325385570526123, + "learning_rate": 2.4016397818014336e-06, + "loss": 1.4951, + "step": 14417 + }, + { + "epoch": 0.7882672935773547, + "grad_norm": 1.7076665163040161, + "learning_rate": 2.4004519166989405e-06, + "loss": 1.536, + "step": 14418 + }, + { + "epoch": 0.7883219660210762, + "grad_norm": 1.2854070663452148, + "learning_rate": 2.3992643053631904e-06, + "loss": 1.3643, + "step": 14419 + }, + { + "epoch": 0.7883766384647978, + "grad_norm": 1.2422319650650024, + "learning_rate": 2.398076947833838e-06, + "loss": 1.4576, + "step": 14420 + }, + { + "epoch": 0.7884313109085194, + "grad_norm": 1.5670723915100098, + "learning_rate": 2.3968898441505384e-06, + "loss": 1.5216, + "step": 14421 + }, + { + "epoch": 0.7884859833522408, + "grad_norm": 1.372245192527771, + "learning_rate": 2.3957029943529276e-06, + "loss": 1.3873, + "step": 14422 + }, + { + "epoch": 0.7885406557959624, + "grad_norm": 1.6749138832092285, + "learning_rate": 2.3945163984806354e-06, + "loss": 1.4288, + "step": 14423 + }, + { + "epoch": 0.788595328239684, + "grad_norm": 1.5174524784088135, + "learning_rate": 2.39333005657329e-06, + "loss": 1.3881, + "step": 14424 + }, + { + "epoch": 0.7886500006834055, + "grad_norm": 1.5589817762374878, + "learning_rate": 2.3921439686705005e-06, + "loss": 1.5044, + "step": 14425 + }, + { + "epoch": 0.7887046731271271, + "grad_norm": 1.5247950553894043, + "learning_rate": 2.3909581348118803e-06, + "loss": 1.3436, + "step": 14426 + }, + { + "epoch": 0.7887593455708487, + "grad_norm": 1.4544569253921509, + "learning_rate": 2.389772555037022e-06, + "loss": 1.3699, + "step": 14427 + }, + { + "epoch": 0.7888140180145702, + "grad_norm": 1.3395084142684937, + "learning_rate": 2.388587229385516e-06, + "loss": 1.3366, + "step": 14428 + }, + { + "epoch": 0.7888686904582918, + "grad_norm": 1.1375761032104492, + "learning_rate": 2.3874021578969443e-06, + "loss": 1.5691, + "step": 14429 + }, + { + "epoch": 0.7889233629020133, + "grad_norm": 1.387787938117981, + "learning_rate": 2.3862173406108767e-06, + "loss": 1.4463, + "step": 14430 + }, + { + "epoch": 0.7889780353457349, + "grad_norm": 1.4638004302978516, + "learning_rate": 2.3850327775668758e-06, + "loss": 1.5395, + "step": 14431 + }, + { + "epoch": 0.7890327077894564, + "grad_norm": 2.0956997871398926, + "learning_rate": 2.3838484688045026e-06, + "loss": 1.544, + "step": 14432 + }, + { + "epoch": 0.7890873802331779, + "grad_norm": 1.3191921710968018, + "learning_rate": 2.3826644143633017e-06, + "loss": 1.5381, + "step": 14433 + }, + { + "epoch": 0.7891420526768995, + "grad_norm": 1.4766961336135864, + "learning_rate": 2.381480614282807e-06, + "loss": 1.367, + "step": 14434 + }, + { + "epoch": 0.7891967251206211, + "grad_norm": 1.2398595809936523, + "learning_rate": 2.380297068602555e-06, + "loss": 1.3826, + "step": 14435 + }, + { + "epoch": 0.7892513975643426, + "grad_norm": 1.6176499128341675, + "learning_rate": 2.3791137773620644e-06, + "loss": 1.4279, + "step": 14436 + }, + { + "epoch": 0.7893060700080642, + "grad_norm": 1.816237211227417, + "learning_rate": 2.3779307406008444e-06, + "loss": 1.5404, + "step": 14437 + }, + { + "epoch": 0.7893607424517858, + "grad_norm": 1.5604627132415771, + "learning_rate": 2.376747958358405e-06, + "loss": 1.2691, + "step": 14438 + }, + { + "epoch": 0.7894154148955073, + "grad_norm": 1.7489418983459473, + "learning_rate": 2.375565430674238e-06, + "loss": 1.246, + "step": 14439 + }, + { + "epoch": 0.7894700873392289, + "grad_norm": 1.6757258176803589, + "learning_rate": 2.3743831575878352e-06, + "loss": 1.3832, + "step": 14440 + }, + { + "epoch": 0.7895247597829504, + "grad_norm": 1.47445809841156, + "learning_rate": 2.3732011391386724e-06, + "loss": 1.619, + "step": 14441 + }, + { + "epoch": 0.7895794322266719, + "grad_norm": 1.4877444505691528, + "learning_rate": 2.37201937536622e-06, + "loss": 1.3211, + "step": 14442 + }, + { + "epoch": 0.7896341046703935, + "grad_norm": 1.7206645011901855, + "learning_rate": 2.370837866309942e-06, + "loss": 1.3587, + "step": 14443 + }, + { + "epoch": 0.789688777114115, + "grad_norm": 1.2889115810394287, + "learning_rate": 2.369656612009289e-06, + "loss": 1.5436, + "step": 14444 + }, + { + "epoch": 0.7897434495578366, + "grad_norm": 1.5126663446426392, + "learning_rate": 2.3684756125037033e-06, + "loss": 1.5215, + "step": 14445 + }, + { + "epoch": 0.7897981220015582, + "grad_norm": 1.8356332778930664, + "learning_rate": 2.367294867832629e-06, + "loss": 1.4159, + "step": 14446 + }, + { + "epoch": 0.7898527944452797, + "grad_norm": 1.592169165611267, + "learning_rate": 2.366114378035489e-06, + "loss": 1.6814, + "step": 14447 + }, + { + "epoch": 0.7899074668890013, + "grad_norm": 1.9134012460708618, + "learning_rate": 2.3649341431517005e-06, + "loss": 1.427, + "step": 14448 + }, + { + "epoch": 0.7899621393327229, + "grad_norm": 1.4237844944000244, + "learning_rate": 2.3637541632206804e-06, + "loss": 1.4567, + "step": 14449 + }, + { + "epoch": 0.7900168117764443, + "grad_norm": 1.6640832424163818, + "learning_rate": 2.362574438281827e-06, + "loss": 1.3906, + "step": 14450 + }, + { + "epoch": 0.7900714842201659, + "grad_norm": 1.6512951850891113, + "learning_rate": 2.361394968374533e-06, + "loss": 1.5522, + "step": 14451 + }, + { + "epoch": 0.7901261566638875, + "grad_norm": 1.4499409198760986, + "learning_rate": 2.360215753538189e-06, + "loss": 1.3377, + "step": 14452 + }, + { + "epoch": 0.790180829107609, + "grad_norm": 1.5052440166473389, + "learning_rate": 2.3590367938121637e-06, + "loss": 1.8837, + "step": 14453 + }, + { + "epoch": 0.7902355015513306, + "grad_norm": 1.601043462753296, + "learning_rate": 2.3578580892358337e-06, + "loss": 1.4513, + "step": 14454 + }, + { + "epoch": 0.7902901739950522, + "grad_norm": 1.6465603113174438, + "learning_rate": 2.356679639848555e-06, + "loss": 1.2923, + "step": 14455 + }, + { + "epoch": 0.7903448464387737, + "grad_norm": 1.6585180759429932, + "learning_rate": 2.3555014456896786e-06, + "loss": 1.3686, + "step": 14456 + }, + { + "epoch": 0.7903995188824953, + "grad_norm": 1.6359367370605469, + "learning_rate": 2.354323506798547e-06, + "loss": 1.3538, + "step": 14457 + }, + { + "epoch": 0.7904541913262167, + "grad_norm": 2.156196117401123, + "learning_rate": 2.3531458232144953e-06, + "loss": 1.3732, + "step": 14458 + }, + { + "epoch": 0.7905088637699383, + "grad_norm": 1.8864504098892212, + "learning_rate": 2.351968394976846e-06, + "loss": 1.5192, + "step": 14459 + }, + { + "epoch": 0.7905635362136599, + "grad_norm": 1.8819674253463745, + "learning_rate": 2.3507912221249206e-06, + "loss": 1.5676, + "step": 14460 + }, + { + "epoch": 0.7906182086573814, + "grad_norm": 1.3207480907440186, + "learning_rate": 2.3496143046980256e-06, + "loss": 1.5281, + "step": 14461 + }, + { + "epoch": 0.790672881101103, + "grad_norm": 3.187866687774658, + "learning_rate": 2.348437642735458e-06, + "loss": 1.201, + "step": 14462 + }, + { + "epoch": 0.7907275535448246, + "grad_norm": 1.5967044830322266, + "learning_rate": 2.347261236276517e-06, + "loss": 1.7406, + "step": 14463 + }, + { + "epoch": 0.7907822259885461, + "grad_norm": 1.4827394485473633, + "learning_rate": 2.34608508536048e-06, + "loss": 1.4443, + "step": 14464 + }, + { + "epoch": 0.7908368984322677, + "grad_norm": 1.2540159225463867, + "learning_rate": 2.3449091900266196e-06, + "loss": 1.3561, + "step": 14465 + }, + { + "epoch": 0.7908915708759893, + "grad_norm": 1.6883188486099243, + "learning_rate": 2.3437335503142065e-06, + "loss": 1.4933, + "step": 14466 + }, + { + "epoch": 0.7909462433197108, + "grad_norm": 1.604469895362854, + "learning_rate": 2.3425581662624975e-06, + "loss": 1.2832, + "step": 14467 + }, + { + "epoch": 0.7910009157634323, + "grad_norm": 1.2734403610229492, + "learning_rate": 2.3413830379107395e-06, + "loss": 1.5407, + "step": 14468 + }, + { + "epoch": 0.7910555882071539, + "grad_norm": 1.568411946296692, + "learning_rate": 2.3402081652981733e-06, + "loss": 1.675, + "step": 14469 + }, + { + "epoch": 0.7911102606508754, + "grad_norm": 1.5267534255981445, + "learning_rate": 2.339033548464028e-06, + "loss": 1.296, + "step": 14470 + }, + { + "epoch": 0.791164933094597, + "grad_norm": 1.3552486896514893, + "learning_rate": 2.337859187447533e-06, + "loss": 1.5703, + "step": 14471 + }, + { + "epoch": 0.7912196055383186, + "grad_norm": 1.9379087686538696, + "learning_rate": 2.3366850822878996e-06, + "loss": 1.2117, + "step": 14472 + }, + { + "epoch": 0.7912742779820401, + "grad_norm": 1.1835062503814697, + "learning_rate": 2.33551123302433e-06, + "loss": 1.5097, + "step": 14473 + }, + { + "epoch": 0.7913289504257617, + "grad_norm": 1.8589622974395752, + "learning_rate": 2.3343376396960282e-06, + "loss": 1.4621, + "step": 14474 + }, + { + "epoch": 0.7913836228694832, + "grad_norm": 1.4965612888336182, + "learning_rate": 2.3331643023421813e-06, + "loss": 1.4584, + "step": 14475 + }, + { + "epoch": 0.7914382953132048, + "grad_norm": 1.4116405248641968, + "learning_rate": 2.331991221001967e-06, + "loss": 1.5004, + "step": 14476 + }, + { + "epoch": 0.7914929677569263, + "grad_norm": 1.2071834802627563, + "learning_rate": 2.3308183957145613e-06, + "loss": 1.1234, + "step": 14477 + }, + { + "epoch": 0.7915476402006478, + "grad_norm": 1.9255378246307373, + "learning_rate": 2.329645826519126e-06, + "loss": 1.5272, + "step": 14478 + }, + { + "epoch": 0.7916023126443694, + "grad_norm": 1.706763505935669, + "learning_rate": 2.328473513454812e-06, + "loss": 1.2843, + "step": 14479 + }, + { + "epoch": 0.791656985088091, + "grad_norm": 1.1750999689102173, + "learning_rate": 2.3273014565607734e-06, + "loss": 1.6459, + "step": 14480 + }, + { + "epoch": 0.7917116575318125, + "grad_norm": 1.953334093093872, + "learning_rate": 2.3261296558761427e-06, + "loss": 1.3245, + "step": 14481 + }, + { + "epoch": 0.7917663299755341, + "grad_norm": 1.5943889617919922, + "learning_rate": 2.324958111440051e-06, + "loss": 1.3389, + "step": 14482 + }, + { + "epoch": 0.7918210024192557, + "grad_norm": 1.6500366926193237, + "learning_rate": 2.323786823291617e-06, + "loss": 1.1585, + "step": 14483 + }, + { + "epoch": 0.7918756748629772, + "grad_norm": 2.0927226543426514, + "learning_rate": 2.322615791469951e-06, + "loss": 1.423, + "step": 14484 + }, + { + "epoch": 0.7919303473066988, + "grad_norm": 1.6986427307128906, + "learning_rate": 2.321445016014162e-06, + "loss": 1.2893, + "step": 14485 + }, + { + "epoch": 0.7919850197504203, + "grad_norm": 1.7709304094314575, + "learning_rate": 2.3202744969633427e-06, + "loss": 1.3691, + "step": 14486 + }, + { + "epoch": 0.7920396921941418, + "grad_norm": 1.3380181789398193, + "learning_rate": 2.319104234356576e-06, + "loss": 1.5004, + "step": 14487 + }, + { + "epoch": 0.7920943646378634, + "grad_norm": 2.499570369720459, + "learning_rate": 2.3179342282329463e-06, + "loss": 1.6069, + "step": 14488 + }, + { + "epoch": 0.7921490370815849, + "grad_norm": 1.8304643630981445, + "learning_rate": 2.316764478631518e-06, + "loss": 1.3227, + "step": 14489 + }, + { + "epoch": 0.7922037095253065, + "grad_norm": 1.3537713289260864, + "learning_rate": 2.3155949855913516e-06, + "loss": 1.3737, + "step": 14490 + }, + { + "epoch": 0.7922583819690281, + "grad_norm": 1.9599629640579224, + "learning_rate": 2.314425749151502e-06, + "loss": 1.3901, + "step": 14491 + }, + { + "epoch": 0.7923130544127496, + "grad_norm": 1.5483474731445312, + "learning_rate": 2.3132567693510123e-06, + "loss": 1.4863, + "step": 14492 + }, + { + "epoch": 0.7923677268564712, + "grad_norm": 1.530969500541687, + "learning_rate": 2.3120880462289165e-06, + "loss": 1.341, + "step": 14493 + }, + { + "epoch": 0.7924223993001928, + "grad_norm": 1.882502794265747, + "learning_rate": 2.310919579824241e-06, + "loss": 1.4284, + "step": 14494 + }, + { + "epoch": 0.7924770717439142, + "grad_norm": 1.4714221954345703, + "learning_rate": 2.309751370176001e-06, + "loss": 1.2469, + "step": 14495 + }, + { + "epoch": 0.7925317441876358, + "grad_norm": 1.4821842908859253, + "learning_rate": 2.30858341732321e-06, + "loss": 1.3725, + "step": 14496 + }, + { + "epoch": 0.7925864166313574, + "grad_norm": 1.3326038122177124, + "learning_rate": 2.3074157213048686e-06, + "loss": 1.3908, + "step": 14497 + }, + { + "epoch": 0.7926410890750789, + "grad_norm": 1.296804428100586, + "learning_rate": 2.306248282159965e-06, + "loss": 1.4095, + "step": 14498 + }, + { + "epoch": 0.7926957615188005, + "grad_norm": 1.4793214797973633, + "learning_rate": 2.3050810999274874e-06, + "loss": 1.2665, + "step": 14499 + }, + { + "epoch": 0.7927504339625221, + "grad_norm": 1.8302454948425293, + "learning_rate": 2.303914174646409e-06, + "loss": 1.4726, + "step": 14500 + }, + { + "epoch": 0.7928051064062436, + "grad_norm": 1.4244428873062134, + "learning_rate": 2.3027475063556913e-06, + "loss": 1.4654, + "step": 14501 + }, + { + "epoch": 0.7928597788499652, + "grad_norm": 1.5015819072723389, + "learning_rate": 2.301581095094301e-06, + "loss": 1.5293, + "step": 14502 + }, + { + "epoch": 0.7929144512936867, + "grad_norm": 1.441278338432312, + "learning_rate": 2.300414940901182e-06, + "loss": 1.5148, + "step": 14503 + }, + { + "epoch": 0.7929691237374082, + "grad_norm": 1.4088557958602905, + "learning_rate": 2.2992490438152735e-06, + "loss": 1.4442, + "step": 14504 + }, + { + "epoch": 0.7930237961811298, + "grad_norm": 1.4852278232574463, + "learning_rate": 2.298083403875513e-06, + "loss": 1.5653, + "step": 14505 + }, + { + "epoch": 0.7930784686248513, + "grad_norm": 1.4858134984970093, + "learning_rate": 2.2969180211208195e-06, + "loss": 1.4077, + "step": 14506 + }, + { + "epoch": 0.7931331410685729, + "grad_norm": 1.5290508270263672, + "learning_rate": 2.2957528955901097e-06, + "loss": 1.4387, + "step": 14507 + }, + { + "epoch": 0.7931878135122945, + "grad_norm": 1.6099227666854858, + "learning_rate": 2.29458802732229e-06, + "loss": 1.5286, + "step": 14508 + }, + { + "epoch": 0.793242485956016, + "grad_norm": 1.9231338500976562, + "learning_rate": 2.293423416356254e-06, + "loss": 1.6719, + "step": 14509 + }, + { + "epoch": 0.7932971583997376, + "grad_norm": 1.4611005783081055, + "learning_rate": 2.292259062730897e-06, + "loss": 1.592, + "step": 14510 + }, + { + "epoch": 0.7933518308434592, + "grad_norm": 1.6053261756896973, + "learning_rate": 2.2910949664850967e-06, + "loss": 1.3457, + "step": 14511 + }, + { + "epoch": 0.7934065032871807, + "grad_norm": 1.397532343864441, + "learning_rate": 2.2899311276577217e-06, + "loss": 1.4065, + "step": 14512 + }, + { + "epoch": 0.7934611757309022, + "grad_norm": 1.599623441696167, + "learning_rate": 2.2887675462876425e-06, + "loss": 1.4263, + "step": 14513 + }, + { + "epoch": 0.7935158481746238, + "grad_norm": 1.5766555070877075, + "learning_rate": 2.2876042224137085e-06, + "loss": 1.3792, + "step": 14514 + }, + { + "epoch": 0.7935705206183453, + "grad_norm": 2.2641947269439697, + "learning_rate": 2.2864411560747655e-06, + "loss": 1.4006, + "step": 14515 + }, + { + "epoch": 0.7936251930620669, + "grad_norm": 1.3368321657180786, + "learning_rate": 2.285278347309655e-06, + "loss": 1.4428, + "step": 14516 + }, + { + "epoch": 0.7936798655057884, + "grad_norm": 1.5556613206863403, + "learning_rate": 2.2841157961572034e-06, + "loss": 1.4251, + "step": 14517 + }, + { + "epoch": 0.79373453794951, + "grad_norm": 1.95380699634552, + "learning_rate": 2.2829535026562287e-06, + "loss": 1.3191, + "step": 14518 + }, + { + "epoch": 0.7937892103932316, + "grad_norm": 1.346893072128296, + "learning_rate": 2.2817914668455486e-06, + "loss": 1.5484, + "step": 14519 + }, + { + "epoch": 0.7938438828369531, + "grad_norm": 1.503824234008789, + "learning_rate": 2.2806296887639622e-06, + "loss": 1.5099, + "step": 14520 + }, + { + "epoch": 0.7938985552806747, + "grad_norm": 1.1790980100631714, + "learning_rate": 2.279468168450265e-06, + "loss": 1.8058, + "step": 14521 + }, + { + "epoch": 0.7939532277243962, + "grad_norm": 1.8813143968582153, + "learning_rate": 2.2783069059432417e-06, + "loss": 1.4647, + "step": 14522 + }, + { + "epoch": 0.7940079001681177, + "grad_norm": 1.565643072128296, + "learning_rate": 2.277145901281668e-06, + "loss": 1.4105, + "step": 14523 + }, + { + "epoch": 0.7940625726118393, + "grad_norm": 1.336492657661438, + "learning_rate": 2.2759851545043175e-06, + "loss": 1.452, + "step": 14524 + }, + { + "epoch": 0.7941172450555609, + "grad_norm": 1.6519548892974854, + "learning_rate": 2.2748246656499485e-06, + "loss": 1.3333, + "step": 14525 + }, + { + "epoch": 0.7941719174992824, + "grad_norm": 1.4250222444534302, + "learning_rate": 2.273664434757308e-06, + "loss": 1.409, + "step": 14526 + }, + { + "epoch": 0.794226589943004, + "grad_norm": 1.370194911956787, + "learning_rate": 2.272504461865145e-06, + "loss": 1.3266, + "step": 14527 + }, + { + "epoch": 0.7942812623867256, + "grad_norm": 1.5600061416625977, + "learning_rate": 2.2713447470121917e-06, + "loss": 1.4385, + "step": 14528 + }, + { + "epoch": 0.7943359348304471, + "grad_norm": 1.3339128494262695, + "learning_rate": 2.27018529023717e-06, + "loss": 1.4795, + "step": 14529 + }, + { + "epoch": 0.7943906072741687, + "grad_norm": 1.543634295463562, + "learning_rate": 2.269026091578803e-06, + "loss": 1.4412, + "step": 14530 + }, + { + "epoch": 0.7944452797178901, + "grad_norm": 1.6066975593566895, + "learning_rate": 2.2678671510757953e-06, + "loss": 1.4642, + "step": 14531 + }, + { + "epoch": 0.7944999521616117, + "grad_norm": 1.3256860971450806, + "learning_rate": 2.266708468766848e-06, + "loss": 1.5176, + "step": 14532 + }, + { + "epoch": 0.7945546246053333, + "grad_norm": 2.3872087001800537, + "learning_rate": 2.265550044690653e-06, + "loss": 1.2659, + "step": 14533 + }, + { + "epoch": 0.7946092970490548, + "grad_norm": 1.6923408508300781, + "learning_rate": 2.26439187888589e-06, + "loss": 1.4344, + "step": 14534 + }, + { + "epoch": 0.7946639694927764, + "grad_norm": 1.8918720483779907, + "learning_rate": 2.263233971391232e-06, + "loss": 1.3097, + "step": 14535 + }, + { + "epoch": 0.794718641936498, + "grad_norm": 2.3563501834869385, + "learning_rate": 2.262076322245349e-06, + "loss": 1.2298, + "step": 14536 + }, + { + "epoch": 0.7947733143802195, + "grad_norm": 3.0788025856018066, + "learning_rate": 2.2609189314868927e-06, + "loss": 1.184, + "step": 14537 + }, + { + "epoch": 0.7948279868239411, + "grad_norm": 1.688546061515808, + "learning_rate": 2.259761799154516e-06, + "loss": 1.254, + "step": 14538 + }, + { + "epoch": 0.7948826592676627, + "grad_norm": 1.8143705129623413, + "learning_rate": 2.258604925286857e-06, + "loss": 1.6073, + "step": 14539 + }, + { + "epoch": 0.7949373317113841, + "grad_norm": 1.8822286128997803, + "learning_rate": 2.257448309922542e-06, + "loss": 1.574, + "step": 14540 + }, + { + "epoch": 0.7949920041551057, + "grad_norm": 1.8619203567504883, + "learning_rate": 2.2562919531001983e-06, + "loss": 1.5048, + "step": 14541 + }, + { + "epoch": 0.7950466765988273, + "grad_norm": 1.8972464799880981, + "learning_rate": 2.255135854858438e-06, + "loss": 1.4157, + "step": 14542 + }, + { + "epoch": 0.7951013490425488, + "grad_norm": 1.5350632667541504, + "learning_rate": 2.2539800152358626e-06, + "loss": 1.6032, + "step": 14543 + }, + { + "epoch": 0.7951560214862704, + "grad_norm": 1.464329481124878, + "learning_rate": 2.252824434271075e-06, + "loss": 1.4241, + "step": 14544 + }, + { + "epoch": 0.7952106939299919, + "grad_norm": 1.3077536821365356, + "learning_rate": 2.251669112002657e-06, + "loss": 1.3564, + "step": 14545 + }, + { + "epoch": 0.7952653663737135, + "grad_norm": 1.5537770986557007, + "learning_rate": 2.25051404846919e-06, + "loss": 1.2167, + "step": 14546 + }, + { + "epoch": 0.7953200388174351, + "grad_norm": 1.3362051248550415, + "learning_rate": 2.249359243709245e-06, + "loss": 1.4072, + "step": 14547 + }, + { + "epoch": 0.7953747112611566, + "grad_norm": 1.6049913167953491, + "learning_rate": 2.2482046977613805e-06, + "loss": 1.3637, + "step": 14548 + }, + { + "epoch": 0.7954293837048781, + "grad_norm": 1.3457090854644775, + "learning_rate": 2.2470504106641487e-06, + "loss": 1.3739, + "step": 14549 + }, + { + "epoch": 0.7954840561485997, + "grad_norm": 1.689260482788086, + "learning_rate": 2.2458963824561007e-06, + "loss": 1.6165, + "step": 14550 + }, + { + "epoch": 0.7955387285923212, + "grad_norm": 1.508406639099121, + "learning_rate": 2.244742613175764e-06, + "loss": 1.5696, + "step": 14551 + }, + { + "epoch": 0.7955934010360428, + "grad_norm": 1.4590675830841064, + "learning_rate": 2.243589102861673e-06, + "loss": 1.7135, + "step": 14552 + }, + { + "epoch": 0.7956480734797644, + "grad_norm": 1.5473581552505493, + "learning_rate": 2.2424358515523426e-06, + "loss": 1.2528, + "step": 14553 + }, + { + "epoch": 0.7957027459234859, + "grad_norm": 1.4609479904174805, + "learning_rate": 2.24128285928628e-06, + "loss": 1.4955, + "step": 14554 + }, + { + "epoch": 0.7957574183672075, + "grad_norm": 1.3443589210510254, + "learning_rate": 2.2401301261019927e-06, + "loss": 1.3609, + "step": 14555 + }, + { + "epoch": 0.7958120908109291, + "grad_norm": 1.5376452207565308, + "learning_rate": 2.238977652037969e-06, + "loss": 1.572, + "step": 14556 + }, + { + "epoch": 0.7958667632546506, + "grad_norm": 1.4251503944396973, + "learning_rate": 2.23782543713269e-06, + "loss": 1.6355, + "step": 14557 + }, + { + "epoch": 0.7959214356983721, + "grad_norm": 1.5756210088729858, + "learning_rate": 2.2366734814246383e-06, + "loss": 1.2853, + "step": 14558 + }, + { + "epoch": 0.7959761081420936, + "grad_norm": 1.583609700202942, + "learning_rate": 2.235521784952275e-06, + "loss": 1.6311, + "step": 14559 + }, + { + "epoch": 0.7960307805858152, + "grad_norm": 1.7703198194503784, + "learning_rate": 2.2343703477540603e-06, + "loss": 1.5048, + "step": 14560 + }, + { + "epoch": 0.7960854530295368, + "grad_norm": 1.7976715564727783, + "learning_rate": 2.2332191698684413e-06, + "loss": 1.2108, + "step": 14561 + }, + { + "epoch": 0.7961401254732583, + "grad_norm": 1.4825549125671387, + "learning_rate": 2.23206825133386e-06, + "loss": 1.3983, + "step": 14562 + }, + { + "epoch": 0.7961947979169799, + "grad_norm": 1.6302437782287598, + "learning_rate": 2.2309175921887447e-06, + "loss": 1.4109, + "step": 14563 + }, + { + "epoch": 0.7962494703607015, + "grad_norm": 1.5359843969345093, + "learning_rate": 2.229767192471525e-06, + "loss": 1.4749, + "step": 14564 + }, + { + "epoch": 0.796304142804423, + "grad_norm": 1.7639868259429932, + "learning_rate": 2.2286170522206086e-06, + "loss": 1.4301, + "step": 14565 + }, + { + "epoch": 0.7963588152481446, + "grad_norm": 1.4816884994506836, + "learning_rate": 2.227467171474409e-06, + "loss": 1.6304, + "step": 14566 + }, + { + "epoch": 0.7964134876918662, + "grad_norm": 1.7846155166625977, + "learning_rate": 2.2263175502713187e-06, + "loss": 1.6092, + "step": 14567 + }, + { + "epoch": 0.7964681601355876, + "grad_norm": 1.6801717281341553, + "learning_rate": 2.2251681886497235e-06, + "loss": 1.4541, + "step": 14568 + }, + { + "epoch": 0.7965228325793092, + "grad_norm": 1.1612640619277954, + "learning_rate": 2.2240190866480105e-06, + "loss": 1.4263, + "step": 14569 + }, + { + "epoch": 0.7965775050230308, + "grad_norm": 1.748874306678772, + "learning_rate": 2.2228702443045456e-06, + "loss": 1.3174, + "step": 14570 + }, + { + "epoch": 0.7966321774667523, + "grad_norm": 1.4552723169326782, + "learning_rate": 2.2217216616576944e-06, + "loss": 1.636, + "step": 14571 + }, + { + "epoch": 0.7966868499104739, + "grad_norm": 1.6822978258132935, + "learning_rate": 2.2205733387458083e-06, + "loss": 1.5498, + "step": 14572 + }, + { + "epoch": 0.7967415223541954, + "grad_norm": 1.9382377862930298, + "learning_rate": 2.2194252756072343e-06, + "loss": 1.5838, + "step": 14573 + }, + { + "epoch": 0.796796194797917, + "grad_norm": 1.4530731439590454, + "learning_rate": 2.218277472280305e-06, + "loss": 1.4079, + "step": 14574 + }, + { + "epoch": 0.7968508672416386, + "grad_norm": 1.2215512990951538, + "learning_rate": 2.217129928803353e-06, + "loss": 1.5762, + "step": 14575 + }, + { + "epoch": 0.79690553968536, + "grad_norm": 1.306292176246643, + "learning_rate": 2.215982645214697e-06, + "loss": 1.3992, + "step": 14576 + }, + { + "epoch": 0.7969602121290816, + "grad_norm": 1.2946151494979858, + "learning_rate": 2.2148356215526436e-06, + "loss": 1.3624, + "step": 14577 + }, + { + "epoch": 0.7970148845728032, + "grad_norm": 1.2227082252502441, + "learning_rate": 2.2136888578554993e-06, + "loss": 1.6085, + "step": 14578 + }, + { + "epoch": 0.7970695570165247, + "grad_norm": 1.3271664381027222, + "learning_rate": 2.212542354161552e-06, + "loss": 1.5109, + "step": 14579 + }, + { + "epoch": 0.7971242294602463, + "grad_norm": 1.8549103736877441, + "learning_rate": 2.2113961105090933e-06, + "loss": 1.2813, + "step": 14580 + }, + { + "epoch": 0.7971789019039679, + "grad_norm": 1.4540774822235107, + "learning_rate": 2.210250126936394e-06, + "loss": 1.4323, + "step": 14581 + }, + { + "epoch": 0.7972335743476894, + "grad_norm": 1.3734867572784424, + "learning_rate": 2.20910440348172e-06, + "loss": 1.3895, + "step": 14582 + }, + { + "epoch": 0.797288246791411, + "grad_norm": 1.5316451787948608, + "learning_rate": 2.2079589401833348e-06, + "loss": 1.3671, + "step": 14583 + }, + { + "epoch": 0.7973429192351326, + "grad_norm": 1.585562825202942, + "learning_rate": 2.206813737079485e-06, + "loss": 1.643, + "step": 14584 + }, + { + "epoch": 0.797397591678854, + "grad_norm": 1.5643062591552734, + "learning_rate": 2.2056687942084108e-06, + "loss": 1.4782, + "step": 14585 + }, + { + "epoch": 0.7974522641225756, + "grad_norm": 1.786818265914917, + "learning_rate": 2.2045241116083472e-06, + "loss": 1.3357, + "step": 14586 + }, + { + "epoch": 0.7975069365662971, + "grad_norm": 1.4284957647323608, + "learning_rate": 2.2033796893175152e-06, + "loss": 1.6035, + "step": 14587 + }, + { + "epoch": 0.7975616090100187, + "grad_norm": 1.6227288246154785, + "learning_rate": 2.202235527374128e-06, + "loss": 1.5874, + "step": 14588 + }, + { + "epoch": 0.7976162814537403, + "grad_norm": 1.8765097856521606, + "learning_rate": 2.201091625816397e-06, + "loss": 1.5703, + "step": 14589 + }, + { + "epoch": 0.7976709538974618, + "grad_norm": 1.464707374572754, + "learning_rate": 2.199947984682518e-06, + "loss": 1.5366, + "step": 14590 + }, + { + "epoch": 0.7977256263411834, + "grad_norm": 1.6029530763626099, + "learning_rate": 2.198804604010677e-06, + "loss": 1.6934, + "step": 14591 + }, + { + "epoch": 0.797780298784905, + "grad_norm": 1.841819405555725, + "learning_rate": 2.1976614838390576e-06, + "loss": 1.4114, + "step": 14592 + }, + { + "epoch": 0.7978349712286265, + "grad_norm": 1.5517480373382568, + "learning_rate": 2.196518624205828e-06, + "loss": 1.3962, + "step": 14593 + }, + { + "epoch": 0.797889643672348, + "grad_norm": 1.2589613199234009, + "learning_rate": 2.195376025149156e-06, + "loss": 1.3351, + "step": 14594 + }, + { + "epoch": 0.7979443161160696, + "grad_norm": 1.5722463130950928, + "learning_rate": 2.194233686707192e-06, + "loss": 1.4083, + "step": 14595 + }, + { + "epoch": 0.7979989885597911, + "grad_norm": 2.2012112140655518, + "learning_rate": 2.19309160891808e-06, + "loss": 1.2449, + "step": 14596 + }, + { + "epoch": 0.7980536610035127, + "grad_norm": 1.5733342170715332, + "learning_rate": 2.1919497918199605e-06, + "loss": 1.3482, + "step": 14597 + }, + { + "epoch": 0.7981083334472343, + "grad_norm": 1.3707325458526611, + "learning_rate": 2.190808235450961e-06, + "loss": 1.409, + "step": 14598 + }, + { + "epoch": 0.7981630058909558, + "grad_norm": 1.2167701721191406, + "learning_rate": 2.1896669398491975e-06, + "loss": 1.414, + "step": 14599 + }, + { + "epoch": 0.7982176783346774, + "grad_norm": 1.9548372030258179, + "learning_rate": 2.188525905052784e-06, + "loss": 1.5154, + "step": 14600 + }, + { + "epoch": 0.7982723507783989, + "grad_norm": 1.6260753870010376, + "learning_rate": 2.1873851310998194e-06, + "loss": 1.3684, + "step": 14601 + }, + { + "epoch": 0.7983270232221205, + "grad_norm": 1.4427279233932495, + "learning_rate": 2.186244618028397e-06, + "loss": 1.3282, + "step": 14602 + }, + { + "epoch": 0.798381695665842, + "grad_norm": 1.9027312994003296, + "learning_rate": 2.1851043658766034e-06, + "loss": 1.4792, + "step": 14603 + }, + { + "epoch": 0.7984363681095635, + "grad_norm": 1.5111244916915894, + "learning_rate": 2.1839643746825145e-06, + "loss": 1.167, + "step": 14604 + }, + { + "epoch": 0.7984910405532851, + "grad_norm": 1.5899031162261963, + "learning_rate": 2.1828246444841925e-06, + "loss": 1.6279, + "step": 14605 + }, + { + "epoch": 0.7985457129970067, + "grad_norm": 1.3123772144317627, + "learning_rate": 2.1816851753197023e-06, + "loss": 1.2676, + "step": 14606 + }, + { + "epoch": 0.7986003854407282, + "grad_norm": 1.5111140012741089, + "learning_rate": 2.1805459672270913e-06, + "loss": 1.3893, + "step": 14607 + }, + { + "epoch": 0.7986550578844498, + "grad_norm": 1.3920893669128418, + "learning_rate": 2.179407020244395e-06, + "loss": 1.5933, + "step": 14608 + }, + { + "epoch": 0.7987097303281714, + "grad_norm": 1.8171604871749878, + "learning_rate": 2.178268334409653e-06, + "loss": 1.3027, + "step": 14609 + }, + { + "epoch": 0.7987644027718929, + "grad_norm": 1.253626823425293, + "learning_rate": 2.1771299097608866e-06, + "loss": 1.5069, + "step": 14610 + }, + { + "epoch": 0.7988190752156145, + "grad_norm": 1.4367777109146118, + "learning_rate": 2.175991746336108e-06, + "loss": 1.2411, + "step": 14611 + }, + { + "epoch": 0.798873747659336, + "grad_norm": 1.5135390758514404, + "learning_rate": 2.174853844173326e-06, + "loss": 1.7401, + "step": 14612 + }, + { + "epoch": 0.7989284201030575, + "grad_norm": 1.3544294834136963, + "learning_rate": 2.173716203310533e-06, + "loss": 1.5605, + "step": 14613 + }, + { + "epoch": 0.7989830925467791, + "grad_norm": 1.4349534511566162, + "learning_rate": 2.1725788237857235e-06, + "loss": 1.5764, + "step": 14614 + }, + { + "epoch": 0.7990377649905006, + "grad_norm": 1.5062912702560425, + "learning_rate": 2.1714417056368752e-06, + "loss": 1.5423, + "step": 14615 + }, + { + "epoch": 0.7990924374342222, + "grad_norm": 3.4410619735717773, + "learning_rate": 2.170304848901955e-06, + "loss": 1.0681, + "step": 14616 + }, + { + "epoch": 0.7991471098779438, + "grad_norm": 1.7529175281524658, + "learning_rate": 2.1691682536189327e-06, + "loss": 1.4925, + "step": 14617 + }, + { + "epoch": 0.7992017823216653, + "grad_norm": 1.3267600536346436, + "learning_rate": 2.1680319198257573e-06, + "loss": 1.5026, + "step": 14618 + }, + { + "epoch": 0.7992564547653869, + "grad_norm": 1.6753586530685425, + "learning_rate": 2.166895847560372e-06, + "loss": 1.7297, + "step": 14619 + }, + { + "epoch": 0.7993111272091085, + "grad_norm": 1.2791287899017334, + "learning_rate": 2.165760036860718e-06, + "loss": 1.4081, + "step": 14620 + }, + { + "epoch": 0.79936579965283, + "grad_norm": 2.0163164138793945, + "learning_rate": 2.1646244877647195e-06, + "loss": 1.533, + "step": 14621 + }, + { + "epoch": 0.7994204720965515, + "grad_norm": 1.618145227432251, + "learning_rate": 2.1634892003102935e-06, + "loss": 1.3122, + "step": 14622 + }, + { + "epoch": 0.7994751445402731, + "grad_norm": 1.4672385454177856, + "learning_rate": 2.1623541745353547e-06, + "loss": 1.3955, + "step": 14623 + }, + { + "epoch": 0.7995298169839946, + "grad_norm": 1.3052942752838135, + "learning_rate": 2.1612194104778016e-06, + "loss": 1.2935, + "step": 14624 + }, + { + "epoch": 0.7995844894277162, + "grad_norm": 1.82358980178833, + "learning_rate": 2.160084908175526e-06, + "loss": 1.6711, + "step": 14625 + }, + { + "epoch": 0.7996391618714378, + "grad_norm": 1.7008845806121826, + "learning_rate": 2.1589506676664128e-06, + "loss": 1.4209, + "step": 14626 + }, + { + "epoch": 0.7996938343151593, + "grad_norm": 1.938933253288269, + "learning_rate": 2.1578166889883336e-06, + "loss": 1.3632, + "step": 14627 + }, + { + "epoch": 0.7997485067588809, + "grad_norm": 1.6236555576324463, + "learning_rate": 2.1566829721791603e-06, + "loss": 1.3306, + "step": 14628 + }, + { + "epoch": 0.7998031792026024, + "grad_norm": 1.4557706117630005, + "learning_rate": 2.155549517276747e-06, + "loss": 1.4931, + "step": 14629 + }, + { + "epoch": 0.799857851646324, + "grad_norm": 1.589518427848816, + "learning_rate": 2.154416324318941e-06, + "loss": 1.6312, + "step": 14630 + }, + { + "epoch": 0.7999125240900455, + "grad_norm": 1.8833000659942627, + "learning_rate": 2.153283393343587e-06, + "loss": 1.4143, + "step": 14631 + }, + { + "epoch": 0.799967196533767, + "grad_norm": 1.5863124132156372, + "learning_rate": 2.152150724388512e-06, + "loss": 1.5342, + "step": 14632 + }, + { + "epoch": 0.8000218689774886, + "grad_norm": 1.7859272956848145, + "learning_rate": 2.151018317491539e-06, + "loss": 1.4571, + "step": 14633 + }, + { + "epoch": 0.8000765414212102, + "grad_norm": 1.414076328277588, + "learning_rate": 2.149886172690484e-06, + "loss": 1.7285, + "step": 14634 + }, + { + "epoch": 0.8001312138649317, + "grad_norm": 1.56626558303833, + "learning_rate": 2.1487542900231508e-06, + "loss": 1.2746, + "step": 14635 + }, + { + "epoch": 0.8001858863086533, + "grad_norm": 1.4125926494598389, + "learning_rate": 2.1476226695273326e-06, + "loss": 1.5252, + "step": 14636 + }, + { + "epoch": 0.8002405587523749, + "grad_norm": 1.3317153453826904, + "learning_rate": 2.1464913112408225e-06, + "loss": 1.486, + "step": 14637 + }, + { + "epoch": 0.8002952311960964, + "grad_norm": 1.5899991989135742, + "learning_rate": 2.1453602152013965e-06, + "loss": 1.4503, + "step": 14638 + }, + { + "epoch": 0.800349903639818, + "grad_norm": 1.5606801509857178, + "learning_rate": 2.144229381446824e-06, + "loss": 1.6494, + "step": 14639 + }, + { + "epoch": 0.8004045760835395, + "grad_norm": 1.849753975868225, + "learning_rate": 2.1430988100148663e-06, + "loss": 1.3887, + "step": 14640 + }, + { + "epoch": 0.800459248527261, + "grad_norm": 1.4732006788253784, + "learning_rate": 2.1419685009432746e-06, + "loss": 1.4611, + "step": 14641 + }, + { + "epoch": 0.8005139209709826, + "grad_norm": 1.5734330415725708, + "learning_rate": 2.140838454269796e-06, + "loss": 1.1641, + "step": 14642 + }, + { + "epoch": 0.8005685934147041, + "grad_norm": 1.6565862894058228, + "learning_rate": 2.1397086700321635e-06, + "loss": 1.4218, + "step": 14643 + }, + { + "epoch": 0.8006232658584257, + "grad_norm": 1.2019398212432861, + "learning_rate": 2.1385791482681007e-06, + "loss": 1.7095, + "step": 14644 + }, + { + "epoch": 0.8006779383021473, + "grad_norm": 1.61814284324646, + "learning_rate": 2.1374498890153305e-06, + "loss": 1.652, + "step": 14645 + }, + { + "epoch": 0.8007326107458688, + "grad_norm": 1.842839241027832, + "learning_rate": 2.136320892311559e-06, + "loss": 1.4839, + "step": 14646 + }, + { + "epoch": 0.8007872831895904, + "grad_norm": 1.815873384475708, + "learning_rate": 2.1351921581944813e-06, + "loss": 1.2729, + "step": 14647 + }, + { + "epoch": 0.800841955633312, + "grad_norm": 1.482305645942688, + "learning_rate": 2.134063686701797e-06, + "loss": 1.283, + "step": 14648 + }, + { + "epoch": 0.8008966280770334, + "grad_norm": 1.5884935855865479, + "learning_rate": 2.132935477871183e-06, + "loss": 1.4366, + "step": 14649 + }, + { + "epoch": 0.800951300520755, + "grad_norm": 1.1547092199325562, + "learning_rate": 2.1318075317403152e-06, + "loss": 1.4673, + "step": 14650 + }, + { + "epoch": 0.8010059729644766, + "grad_norm": 1.4329280853271484, + "learning_rate": 2.130679848346857e-06, + "loss": 1.3427, + "step": 14651 + }, + { + "epoch": 0.8010606454081981, + "grad_norm": 1.4551334381103516, + "learning_rate": 2.1295524277284617e-06, + "loss": 1.3412, + "step": 14652 + }, + { + "epoch": 0.8011153178519197, + "grad_norm": 1.167224407196045, + "learning_rate": 2.1284252699227813e-06, + "loss": 1.4099, + "step": 14653 + }, + { + "epoch": 0.8011699902956413, + "grad_norm": 1.7073677778244019, + "learning_rate": 2.1272983749674537e-06, + "loss": 1.2608, + "step": 14654 + }, + { + "epoch": 0.8012246627393628, + "grad_norm": 1.403167724609375, + "learning_rate": 2.126171742900104e-06, + "loss": 1.327, + "step": 14655 + }, + { + "epoch": 0.8012793351830844, + "grad_norm": 1.4124186038970947, + "learning_rate": 2.125045373758359e-06, + "loss": 1.5919, + "step": 14656 + }, + { + "epoch": 0.8013340076268058, + "grad_norm": 1.5971479415893555, + "learning_rate": 2.123919267579828e-06, + "loss": 1.1644, + "step": 14657 + }, + { + "epoch": 0.8013886800705274, + "grad_norm": 1.6655865907669067, + "learning_rate": 2.1227934244021108e-06, + "loss": 1.4255, + "step": 14658 + }, + { + "epoch": 0.801443352514249, + "grad_norm": 1.4290708303451538, + "learning_rate": 2.1216678442628104e-06, + "loss": 1.309, + "step": 14659 + }, + { + "epoch": 0.8014980249579705, + "grad_norm": 1.9012044668197632, + "learning_rate": 2.120542527199506e-06, + "loss": 1.3355, + "step": 14660 + }, + { + "epoch": 0.8015526974016921, + "grad_norm": 1.521527886390686, + "learning_rate": 2.119417473249774e-06, + "loss": 1.7196, + "step": 14661 + }, + { + "epoch": 0.8016073698454137, + "grad_norm": 1.7497444152832031, + "learning_rate": 2.1182926824511887e-06, + "loss": 1.5249, + "step": 14662 + }, + { + "epoch": 0.8016620422891352, + "grad_norm": 1.5087971687316895, + "learning_rate": 2.1171681548413046e-06, + "loss": 1.4121, + "step": 14663 + }, + { + "epoch": 0.8017167147328568, + "grad_norm": 1.3496172428131104, + "learning_rate": 2.1160438904576743e-06, + "loss": 1.5871, + "step": 14664 + }, + { + "epoch": 0.8017713871765784, + "grad_norm": 1.0322487354278564, + "learning_rate": 2.1149198893378377e-06, + "loss": 1.5739, + "step": 14665 + }, + { + "epoch": 0.8018260596202998, + "grad_norm": 1.2851864099502563, + "learning_rate": 2.1137961515193274e-06, + "loss": 1.309, + "step": 14666 + }, + { + "epoch": 0.8018807320640214, + "grad_norm": 1.7687263488769531, + "learning_rate": 2.1126726770396712e-06, + "loss": 1.3317, + "step": 14667 + }, + { + "epoch": 0.801935404507743, + "grad_norm": 1.7790933847427368, + "learning_rate": 2.1115494659363824e-06, + "loss": 1.4707, + "step": 14668 + }, + { + "epoch": 0.8019900769514645, + "grad_norm": 1.3901547193527222, + "learning_rate": 2.110426518246965e-06, + "loss": 1.0719, + "step": 14669 + }, + { + "epoch": 0.8020447493951861, + "grad_norm": 1.1719516515731812, + "learning_rate": 2.1093038340089235e-06, + "loss": 1.5895, + "step": 14670 + }, + { + "epoch": 0.8020994218389076, + "grad_norm": 1.5479637384414673, + "learning_rate": 2.108181413259741e-06, + "loss": 1.6599, + "step": 14671 + }, + { + "epoch": 0.8021540942826292, + "grad_norm": 1.8312546014785767, + "learning_rate": 2.1070592560368986e-06, + "loss": 1.3171, + "step": 14672 + }, + { + "epoch": 0.8022087667263508, + "grad_norm": 1.4773128032684326, + "learning_rate": 2.1059373623778722e-06, + "loss": 1.4171, + "step": 14673 + }, + { + "epoch": 0.8022634391700723, + "grad_norm": 1.733896017074585, + "learning_rate": 2.10481573232012e-06, + "loss": 1.3293, + "step": 14674 + }, + { + "epoch": 0.8023181116137938, + "grad_norm": 1.2855806350708008, + "learning_rate": 2.1036943659010945e-06, + "loss": 1.6706, + "step": 14675 + }, + { + "epoch": 0.8023727840575154, + "grad_norm": 2.067307710647583, + "learning_rate": 2.1025732631582475e-06, + "loss": 1.4056, + "step": 14676 + }, + { + "epoch": 0.8024274565012369, + "grad_norm": 1.712925910949707, + "learning_rate": 2.101452424129009e-06, + "loss": 1.2505, + "step": 14677 + }, + { + "epoch": 0.8024821289449585, + "grad_norm": 1.5620685815811157, + "learning_rate": 2.1003318488508107e-06, + "loss": 1.1396, + "step": 14678 + }, + { + "epoch": 0.8025368013886801, + "grad_norm": 1.3779197931289673, + "learning_rate": 2.0992115373610677e-06, + "loss": 1.4149, + "step": 14679 + }, + { + "epoch": 0.8025914738324016, + "grad_norm": 1.7109427452087402, + "learning_rate": 2.0980914896971892e-06, + "loss": 1.4632, + "step": 14680 + }, + { + "epoch": 0.8026461462761232, + "grad_norm": 1.3249694108963013, + "learning_rate": 2.096971705896581e-06, + "loss": 1.5397, + "step": 14681 + }, + { + "epoch": 0.8027008187198448, + "grad_norm": 1.6187083721160889, + "learning_rate": 2.095852185996632e-06, + "loss": 1.5636, + "step": 14682 + }, + { + "epoch": 0.8027554911635663, + "grad_norm": 1.310439944267273, + "learning_rate": 2.094732930034724e-06, + "loss": 1.4402, + "step": 14683 + }, + { + "epoch": 0.8028101636072879, + "grad_norm": 1.2214603424072266, + "learning_rate": 2.0936139380482357e-06, + "loss": 1.5822, + "step": 14684 + }, + { + "epoch": 0.8028648360510094, + "grad_norm": 1.929632306098938, + "learning_rate": 2.092495210074532e-06, + "loss": 1.5323, + "step": 14685 + }, + { + "epoch": 0.8029195084947309, + "grad_norm": 1.8543928861618042, + "learning_rate": 2.0913767461509647e-06, + "loss": 1.3927, + "step": 14686 + }, + { + "epoch": 0.8029741809384525, + "grad_norm": 1.5864498615264893, + "learning_rate": 2.0902585463148907e-06, + "loss": 1.5184, + "step": 14687 + }, + { + "epoch": 0.803028853382174, + "grad_norm": 1.6034363508224487, + "learning_rate": 2.089140610603643e-06, + "loss": 1.403, + "step": 14688 + }, + { + "epoch": 0.8030835258258956, + "grad_norm": 1.4318511486053467, + "learning_rate": 2.0880229390545536e-06, + "loss": 1.4781, + "step": 14689 + }, + { + "epoch": 0.8031381982696172, + "grad_norm": 1.8771024942398071, + "learning_rate": 2.086905531704946e-06, + "loss": 1.5301, + "step": 14690 + }, + { + "epoch": 0.8031928707133387, + "grad_norm": 1.56476628780365, + "learning_rate": 2.085788388592129e-06, + "loss": 1.2349, + "step": 14691 + }, + { + "epoch": 0.8032475431570603, + "grad_norm": 1.263702392578125, + "learning_rate": 2.0846715097534087e-06, + "loss": 1.5192, + "step": 14692 + }, + { + "epoch": 0.8033022156007819, + "grad_norm": 1.4840439558029175, + "learning_rate": 2.0835548952260822e-06, + "loss": 1.5783, + "step": 14693 + }, + { + "epoch": 0.8033568880445033, + "grad_norm": 1.5410162210464478, + "learning_rate": 2.0824385450474314e-06, + "loss": 1.3601, + "step": 14694 + }, + { + "epoch": 0.8034115604882249, + "grad_norm": 1.3495638370513916, + "learning_rate": 2.081322459254739e-06, + "loss": 1.3176, + "step": 14695 + }, + { + "epoch": 0.8034662329319465, + "grad_norm": 1.3737480640411377, + "learning_rate": 2.0802066378852725e-06, + "loss": 1.4959, + "step": 14696 + }, + { + "epoch": 0.803520905375668, + "grad_norm": 1.4971224069595337, + "learning_rate": 2.0790910809762876e-06, + "loss": 1.5853, + "step": 14697 + }, + { + "epoch": 0.8035755778193896, + "grad_norm": 1.733214259147644, + "learning_rate": 2.077975788565041e-06, + "loss": 1.2088, + "step": 14698 + }, + { + "epoch": 0.8036302502631112, + "grad_norm": 1.2256743907928467, + "learning_rate": 2.0768607606887724e-06, + "loss": 1.5839, + "step": 14699 + }, + { + "epoch": 0.8036849227068327, + "grad_norm": 2.049471855163574, + "learning_rate": 2.075745997384713e-06, + "loss": 1.2219, + "step": 14700 + }, + { + "epoch": 0.8037395951505543, + "grad_norm": 1.6931610107421875, + "learning_rate": 2.074631498690092e-06, + "loss": 1.4386, + "step": 14701 + }, + { + "epoch": 0.8037942675942757, + "grad_norm": 1.594809889793396, + "learning_rate": 2.073517264642122e-06, + "loss": 1.6502, + "step": 14702 + }, + { + "epoch": 0.8038489400379973, + "grad_norm": 1.4206972122192383, + "learning_rate": 2.0724032952780115e-06, + "loss": 1.5898, + "step": 14703 + }, + { + "epoch": 0.8039036124817189, + "grad_norm": 1.577603816986084, + "learning_rate": 2.071289590634957e-06, + "loss": 1.6078, + "step": 14704 + }, + { + "epoch": 0.8039582849254404, + "grad_norm": 1.3248294591903687, + "learning_rate": 2.0701761507501495e-06, + "loss": 1.3716, + "step": 14705 + }, + { + "epoch": 0.804012957369162, + "grad_norm": 1.6994305849075317, + "learning_rate": 2.069062975660765e-06, + "loss": 1.4027, + "step": 14706 + }, + { + "epoch": 0.8040676298128836, + "grad_norm": 1.5218430757522583, + "learning_rate": 2.067950065403981e-06, + "loss": 1.4621, + "step": 14707 + }, + { + "epoch": 0.8041223022566051, + "grad_norm": 1.6283504962921143, + "learning_rate": 2.066837420016954e-06, + "loss": 1.3475, + "step": 14708 + }, + { + "epoch": 0.8041769747003267, + "grad_norm": 1.7144776582717896, + "learning_rate": 2.0657250395368443e-06, + "loss": 1.603, + "step": 14709 + }, + { + "epoch": 0.8042316471440483, + "grad_norm": 1.615228295326233, + "learning_rate": 2.064612924000795e-06, + "loss": 1.5738, + "step": 14710 + }, + { + "epoch": 0.8042863195877697, + "grad_norm": 1.713348388671875, + "learning_rate": 2.0635010734459372e-06, + "loss": 1.2277, + "step": 14711 + }, + { + "epoch": 0.8043409920314913, + "grad_norm": 1.516652226448059, + "learning_rate": 2.062389487909405e-06, + "loss": 1.6128, + "step": 14712 + }, + { + "epoch": 0.8043956644752129, + "grad_norm": 1.9164875745773315, + "learning_rate": 2.0612781674283142e-06, + "loss": 1.3755, + "step": 14713 + }, + { + "epoch": 0.8044503369189344, + "grad_norm": 1.3098201751708984, + "learning_rate": 2.060167112039775e-06, + "loss": 1.5952, + "step": 14714 + }, + { + "epoch": 0.804505009362656, + "grad_norm": 1.864863395690918, + "learning_rate": 2.0590563217808858e-06, + "loss": 1.5219, + "step": 14715 + }, + { + "epoch": 0.8045596818063775, + "grad_norm": 1.7777953147888184, + "learning_rate": 2.0579457966887406e-06, + "loss": 1.5434, + "step": 14716 + }, + { + "epoch": 0.8046143542500991, + "grad_norm": 1.7415262460708618, + "learning_rate": 2.05683553680042e-06, + "loss": 1.3917, + "step": 14717 + }, + { + "epoch": 0.8046690266938207, + "grad_norm": 1.512485384941101, + "learning_rate": 2.055725542153002e-06, + "loss": 1.3631, + "step": 14718 + }, + { + "epoch": 0.8047236991375422, + "grad_norm": 1.4346505403518677, + "learning_rate": 2.0546158127835503e-06, + "loss": 1.27, + "step": 14719 + }, + { + "epoch": 0.8047783715812638, + "grad_norm": 1.5802278518676758, + "learning_rate": 2.0535063487291176e-06, + "loss": 1.6277, + "step": 14720 + }, + { + "epoch": 0.8048330440249853, + "grad_norm": 1.4277746677398682, + "learning_rate": 2.0523971500267583e-06, + "loss": 1.7667, + "step": 14721 + }, + { + "epoch": 0.8048877164687068, + "grad_norm": 1.6843091249465942, + "learning_rate": 2.051288216713505e-06, + "loss": 1.5433, + "step": 14722 + }, + { + "epoch": 0.8049423889124284, + "grad_norm": 2.300072193145752, + "learning_rate": 2.050179548826393e-06, + "loss": 1.7363, + "step": 14723 + }, + { + "epoch": 0.80499706135615, + "grad_norm": 1.2679879665374756, + "learning_rate": 2.0490711464024403e-06, + "loss": 1.3613, + "step": 14724 + }, + { + "epoch": 0.8050517337998715, + "grad_norm": 1.4014396667480469, + "learning_rate": 2.047963009478657e-06, + "loss": 1.3194, + "step": 14725 + }, + { + "epoch": 0.8051064062435931, + "grad_norm": 1.5194919109344482, + "learning_rate": 2.046855138092052e-06, + "loss": 1.3241, + "step": 14726 + }, + { + "epoch": 0.8051610786873147, + "grad_norm": 1.319414734840393, + "learning_rate": 2.045747532279616e-06, + "loss": 1.3983, + "step": 14727 + }, + { + "epoch": 0.8052157511310362, + "grad_norm": 1.7807738780975342, + "learning_rate": 2.0446401920783353e-06, + "loss": 1.5304, + "step": 14728 + }, + { + "epoch": 0.8052704235747578, + "grad_norm": 1.2734477519989014, + "learning_rate": 2.0435331175251873e-06, + "loss": 1.5821, + "step": 14729 + }, + { + "epoch": 0.8053250960184792, + "grad_norm": 1.4489728212356567, + "learning_rate": 2.042426308657138e-06, + "loss": 1.2804, + "step": 14730 + }, + { + "epoch": 0.8053797684622008, + "grad_norm": 1.5660247802734375, + "learning_rate": 2.041319765511145e-06, + "loss": 1.4346, + "step": 14731 + }, + { + "epoch": 0.8054344409059224, + "grad_norm": 1.6812965869903564, + "learning_rate": 2.040213488124163e-06, + "loss": 1.4457, + "step": 14732 + }, + { + "epoch": 0.8054891133496439, + "grad_norm": 2.023022174835205, + "learning_rate": 2.0391074765331307e-06, + "loss": 1.4129, + "step": 14733 + }, + { + "epoch": 0.8055437857933655, + "grad_norm": 2.297863245010376, + "learning_rate": 2.038001730774978e-06, + "loss": 1.4177, + "step": 14734 + }, + { + "epoch": 0.8055984582370871, + "grad_norm": 1.7420549392700195, + "learning_rate": 2.036896250886634e-06, + "loss": 1.3634, + "step": 14735 + }, + { + "epoch": 0.8056531306808086, + "grad_norm": 1.5201103687286377, + "learning_rate": 2.035791036905007e-06, + "loss": 1.5544, + "step": 14736 + }, + { + "epoch": 0.8057078031245302, + "grad_norm": 1.236574411392212, + "learning_rate": 2.0346860888670095e-06, + "loss": 1.3547, + "step": 14737 + }, + { + "epoch": 0.8057624755682518, + "grad_norm": 1.8688974380493164, + "learning_rate": 2.033581406809534e-06, + "loss": 1.2893, + "step": 14738 + }, + { + "epoch": 0.8058171480119732, + "grad_norm": 1.4066005945205688, + "learning_rate": 2.0324769907694665e-06, + "loss": 1.5275, + "step": 14739 + }, + { + "epoch": 0.8058718204556948, + "grad_norm": 1.2622520923614502, + "learning_rate": 2.031372840783691e-06, + "loss": 1.8344, + "step": 14740 + }, + { + "epoch": 0.8059264928994164, + "grad_norm": 1.682801365852356, + "learning_rate": 2.0302689568890753e-06, + "loss": 1.3212, + "step": 14741 + }, + { + "epoch": 0.8059811653431379, + "grad_norm": 1.646494746208191, + "learning_rate": 2.029165339122482e-06, + "loss": 1.4722, + "step": 14742 + }, + { + "epoch": 0.8060358377868595, + "grad_norm": 1.5578171014785767, + "learning_rate": 2.028061987520761e-06, + "loss": 1.3889, + "step": 14743 + }, + { + "epoch": 0.806090510230581, + "grad_norm": 1.3122299909591675, + "learning_rate": 2.026958902120757e-06, + "loss": 1.7556, + "step": 14744 + }, + { + "epoch": 0.8061451826743026, + "grad_norm": 1.3675554990768433, + "learning_rate": 2.025856082959302e-06, + "loss": 1.5885, + "step": 14745 + }, + { + "epoch": 0.8061998551180242, + "grad_norm": 1.3066471815109253, + "learning_rate": 2.0247535300732267e-06, + "loss": 1.5837, + "step": 14746 + }, + { + "epoch": 0.8062545275617456, + "grad_norm": 1.6844761371612549, + "learning_rate": 2.023651243499346e-06, + "loss": 1.1499, + "step": 14747 + }, + { + "epoch": 0.8063092000054672, + "grad_norm": 1.51031494140625, + "learning_rate": 2.022549223274465e-06, + "loss": 1.3976, + "step": 14748 + }, + { + "epoch": 0.8063638724491888, + "grad_norm": 1.9499250650405884, + "learning_rate": 2.0214474694353868e-06, + "loss": 1.2912, + "step": 14749 + }, + { + "epoch": 0.8064185448929103, + "grad_norm": 1.493260383605957, + "learning_rate": 2.0203459820188974e-06, + "loss": 1.4734, + "step": 14750 + }, + { + "epoch": 0.8064732173366319, + "grad_norm": 1.4917099475860596, + "learning_rate": 2.019244761061784e-06, + "loss": 1.2231, + "step": 14751 + }, + { + "epoch": 0.8065278897803535, + "grad_norm": 1.6357980966567993, + "learning_rate": 2.0181438066008154e-06, + "loss": 1.2727, + "step": 14752 + }, + { + "epoch": 0.806582562224075, + "grad_norm": 1.3623387813568115, + "learning_rate": 2.0170431186727545e-06, + "loss": 1.6232, + "step": 14753 + }, + { + "epoch": 0.8066372346677966, + "grad_norm": 1.4862476587295532, + "learning_rate": 2.015942697314357e-06, + "loss": 1.1971, + "step": 14754 + }, + { + "epoch": 0.8066919071115182, + "grad_norm": 1.540602445602417, + "learning_rate": 2.0148425425623673e-06, + "loss": 1.4669, + "step": 14755 + }, + { + "epoch": 0.8067465795552397, + "grad_norm": 1.3478630781173706, + "learning_rate": 2.013742654453521e-06, + "loss": 1.3152, + "step": 14756 + }, + { + "epoch": 0.8068012519989612, + "grad_norm": 1.4170846939086914, + "learning_rate": 2.0126430330245493e-06, + "loss": 1.4997, + "step": 14757 + }, + { + "epoch": 0.8068559244426827, + "grad_norm": 1.885705590248108, + "learning_rate": 2.011543678312171e-06, + "loss": 1.1071, + "step": 14758 + }, + { + "epoch": 0.8069105968864043, + "grad_norm": 1.8950800895690918, + "learning_rate": 2.0104445903530912e-06, + "loss": 1.3938, + "step": 14759 + }, + { + "epoch": 0.8069652693301259, + "grad_norm": 1.7655961513519287, + "learning_rate": 2.0093457691840178e-06, + "loss": 1.4383, + "step": 14760 + }, + { + "epoch": 0.8070199417738474, + "grad_norm": 1.7256014347076416, + "learning_rate": 2.0082472148416387e-06, + "loss": 1.6516, + "step": 14761 + }, + { + "epoch": 0.807074614217569, + "grad_norm": 1.265839695930481, + "learning_rate": 2.0071489273626376e-06, + "loss": 1.4296, + "step": 14762 + }, + { + "epoch": 0.8071292866612906, + "grad_norm": 1.358214020729065, + "learning_rate": 2.0060509067836907e-06, + "loss": 1.2649, + "step": 14763 + }, + { + "epoch": 0.8071839591050121, + "grad_norm": 1.6090295314788818, + "learning_rate": 2.00495315314146e-06, + "loss": 1.2039, + "step": 14764 + }, + { + "epoch": 0.8072386315487337, + "grad_norm": 1.6824467182159424, + "learning_rate": 2.0038556664726083e-06, + "loss": 1.3165, + "step": 14765 + }, + { + "epoch": 0.8072933039924552, + "grad_norm": 1.500442385673523, + "learning_rate": 2.0027584468137784e-06, + "loss": 1.4044, + "step": 14766 + }, + { + "epoch": 0.8073479764361767, + "grad_norm": 1.370886206626892, + "learning_rate": 2.00166149420161e-06, + "loss": 1.4752, + "step": 14767 + }, + { + "epoch": 0.8074026488798983, + "grad_norm": 1.694844126701355, + "learning_rate": 2.0005648086727337e-06, + "loss": 1.4622, + "step": 14768 + }, + { + "epoch": 0.8074573213236199, + "grad_norm": 1.5287550687789917, + "learning_rate": 1.999468390263769e-06, + "loss": 1.366, + "step": 14769 + }, + { + "epoch": 0.8075119937673414, + "grad_norm": 1.790235161781311, + "learning_rate": 1.9983722390113257e-06, + "loss": 1.5878, + "step": 14770 + }, + { + "epoch": 0.807566666211063, + "grad_norm": 1.7241857051849365, + "learning_rate": 1.9972763549520137e-06, + "loss": 1.4287, + "step": 14771 + }, + { + "epoch": 0.8076213386547845, + "grad_norm": 1.787111759185791, + "learning_rate": 1.996180738122422e-06, + "loss": 1.6498, + "step": 14772 + }, + { + "epoch": 0.8076760110985061, + "grad_norm": 2.4260611534118652, + "learning_rate": 1.9950853885591346e-06, + "loss": 1.5184, + "step": 14773 + }, + { + "epoch": 0.8077306835422277, + "grad_norm": 1.7358510494232178, + "learning_rate": 1.993990306298733e-06, + "loss": 1.5342, + "step": 14774 + }, + { + "epoch": 0.8077853559859491, + "grad_norm": 1.4463231563568115, + "learning_rate": 1.992895491377782e-06, + "loss": 1.3724, + "step": 14775 + }, + { + "epoch": 0.8078400284296707, + "grad_norm": 1.5175296068191528, + "learning_rate": 1.9918009438328365e-06, + "loss": 1.3505, + "step": 14776 + }, + { + "epoch": 0.8078947008733923, + "grad_norm": 1.473353385925293, + "learning_rate": 1.9907066637004526e-06, + "loss": 1.6219, + "step": 14777 + }, + { + "epoch": 0.8079493733171138, + "grad_norm": 1.6541094779968262, + "learning_rate": 1.989612651017164e-06, + "loss": 1.3245, + "step": 14778 + }, + { + "epoch": 0.8080040457608354, + "grad_norm": 1.6062190532684326, + "learning_rate": 1.98851890581951e-06, + "loss": 1.213, + "step": 14779 + }, + { + "epoch": 0.808058718204557, + "grad_norm": 1.5088560581207275, + "learning_rate": 1.9874254281440085e-06, + "loss": 1.4944, + "step": 14780 + }, + { + "epoch": 0.8081133906482785, + "grad_norm": 2.2063510417938232, + "learning_rate": 1.986332218027174e-06, + "loss": 1.5382, + "step": 14781 + }, + { + "epoch": 0.8081680630920001, + "grad_norm": 2.396085262298584, + "learning_rate": 1.9852392755055117e-06, + "loss": 1.0853, + "step": 14782 + }, + { + "epoch": 0.8082227355357217, + "grad_norm": 1.5240826606750488, + "learning_rate": 1.9841466006155162e-06, + "loss": 1.6194, + "step": 14783 + }, + { + "epoch": 0.8082774079794431, + "grad_norm": 1.8832273483276367, + "learning_rate": 1.983054193393674e-06, + "loss": 1.4617, + "step": 14784 + }, + { + "epoch": 0.8083320804231647, + "grad_norm": 1.819169282913208, + "learning_rate": 1.981962053876467e-06, + "loss": 1.3558, + "step": 14785 + }, + { + "epoch": 0.8083867528668862, + "grad_norm": 1.7908872365951538, + "learning_rate": 1.9808701821003615e-06, + "loss": 1.466, + "step": 14786 + }, + { + "epoch": 0.8084414253106078, + "grad_norm": 1.3500350713729858, + "learning_rate": 1.9797785781018164e-06, + "loss": 1.4511, + "step": 14787 + }, + { + "epoch": 0.8084960977543294, + "grad_norm": 1.8784079551696777, + "learning_rate": 1.9786872419172863e-06, + "loss": 1.3071, + "step": 14788 + }, + { + "epoch": 0.8085507701980509, + "grad_norm": 1.5868171453475952, + "learning_rate": 1.9775961735832126e-06, + "loss": 1.5407, + "step": 14789 + }, + { + "epoch": 0.8086054426417725, + "grad_norm": 1.7928160429000854, + "learning_rate": 1.976505373136025e-06, + "loss": 1.6522, + "step": 14790 + }, + { + "epoch": 0.8086601150854941, + "grad_norm": 1.3454349040985107, + "learning_rate": 1.975414840612153e-06, + "loss": 1.5113, + "step": 14791 + }, + { + "epoch": 0.8087147875292156, + "grad_norm": 1.4740540981292725, + "learning_rate": 1.97432457604801e-06, + "loss": 1.7198, + "step": 14792 + }, + { + "epoch": 0.8087694599729371, + "grad_norm": 1.8336089849472046, + "learning_rate": 1.973234579480001e-06, + "loss": 1.4588, + "step": 14793 + }, + { + "epoch": 0.8088241324166587, + "grad_norm": 1.7611503601074219, + "learning_rate": 1.9721448509445264e-06, + "loss": 1.6425, + "step": 14794 + }, + { + "epoch": 0.8088788048603802, + "grad_norm": 1.2132302522659302, + "learning_rate": 1.9710553904779708e-06, + "loss": 1.638, + "step": 14795 + }, + { + "epoch": 0.8089334773041018, + "grad_norm": 1.3259973526000977, + "learning_rate": 1.969966198116717e-06, + "loss": 1.5176, + "step": 14796 + }, + { + "epoch": 0.8089881497478234, + "grad_norm": 1.4497294425964355, + "learning_rate": 1.968877273897136e-06, + "loss": 1.6238, + "step": 14797 + }, + { + "epoch": 0.8090428221915449, + "grad_norm": 1.9943360090255737, + "learning_rate": 1.967788617855586e-06, + "loss": 1.262, + "step": 14798 + }, + { + "epoch": 0.8090974946352665, + "grad_norm": 1.9519826173782349, + "learning_rate": 1.9667002300284255e-06, + "loss": 1.4288, + "step": 14799 + }, + { + "epoch": 0.809152167078988, + "grad_norm": 1.328566312789917, + "learning_rate": 1.965612110451994e-06, + "loss": 1.7262, + "step": 14800 + }, + { + "epoch": 0.8092068395227096, + "grad_norm": 1.5861284732818604, + "learning_rate": 1.9645242591626244e-06, + "loss": 1.5155, + "step": 14801 + }, + { + "epoch": 0.8092615119664311, + "grad_norm": 1.1531128883361816, + "learning_rate": 1.9634366761966495e-06, + "loss": 1.2016, + "step": 14802 + }, + { + "epoch": 0.8093161844101526, + "grad_norm": 1.6553099155426025, + "learning_rate": 1.962349361590381e-06, + "loss": 1.4944, + "step": 14803 + }, + { + "epoch": 0.8093708568538742, + "grad_norm": 1.4027382135391235, + "learning_rate": 1.9612623153801267e-06, + "loss": 1.3106, + "step": 14804 + }, + { + "epoch": 0.8094255292975958, + "grad_norm": 4.55228853225708, + "learning_rate": 1.960175537602189e-06, + "loss": 1.7294, + "step": 14805 + }, + { + "epoch": 0.8094802017413173, + "grad_norm": 1.420040249824524, + "learning_rate": 1.9590890282928574e-06, + "loss": 1.5103, + "step": 14806 + }, + { + "epoch": 0.8095348741850389, + "grad_norm": 1.3901995420455933, + "learning_rate": 1.9580027874884112e-06, + "loss": 1.446, + "step": 14807 + }, + { + "epoch": 0.8095895466287605, + "grad_norm": 1.6134833097457886, + "learning_rate": 1.956916815225122e-06, + "loss": 1.4057, + "step": 14808 + }, + { + "epoch": 0.809644219072482, + "grad_norm": 1.449926495552063, + "learning_rate": 1.9558311115392524e-06, + "loss": 1.2667, + "step": 14809 + }, + { + "epoch": 0.8096988915162036, + "grad_norm": 1.4492236375808716, + "learning_rate": 1.95474567646706e-06, + "loss": 1.4818, + "step": 14810 + }, + { + "epoch": 0.8097535639599251, + "grad_norm": 1.6854777336120605, + "learning_rate": 1.953660510044789e-06, + "loss": 1.4244, + "step": 14811 + }, + { + "epoch": 0.8098082364036466, + "grad_norm": 1.8026498556137085, + "learning_rate": 1.9525756123086726e-06, + "loss": 1.5402, + "step": 14812 + }, + { + "epoch": 0.8098629088473682, + "grad_norm": 1.620513677597046, + "learning_rate": 1.9514909832949427e-06, + "loss": 1.3292, + "step": 14813 + }, + { + "epoch": 0.8099175812910897, + "grad_norm": 1.5884132385253906, + "learning_rate": 1.9504066230398156e-06, + "loss": 1.4843, + "step": 14814 + }, + { + "epoch": 0.8099722537348113, + "grad_norm": 1.5167783498764038, + "learning_rate": 1.949322531579496e-06, + "loss": 1.5289, + "step": 14815 + }, + { + "epoch": 0.8100269261785329, + "grad_norm": 2.2188961505889893, + "learning_rate": 1.948238708950193e-06, + "loss": 1.4641, + "step": 14816 + }, + { + "epoch": 0.8100815986222544, + "grad_norm": 1.4153815507888794, + "learning_rate": 1.947155155188093e-06, + "loss": 1.1627, + "step": 14817 + }, + { + "epoch": 0.810136271065976, + "grad_norm": 2.8525924682617188, + "learning_rate": 1.946071870329377e-06, + "loss": 1.3818, + "step": 14818 + }, + { + "epoch": 0.8101909435096976, + "grad_norm": 1.535531997680664, + "learning_rate": 1.9449888544102215e-06, + "loss": 1.2776, + "step": 14819 + }, + { + "epoch": 0.810245615953419, + "grad_norm": 1.393115758895874, + "learning_rate": 1.943906107466791e-06, + "loss": 1.24, + "step": 14820 + }, + { + "epoch": 0.8103002883971406, + "grad_norm": 1.5744116306304932, + "learning_rate": 1.9428236295352388e-06, + "loss": 1.6693, + "step": 14821 + }, + { + "epoch": 0.8103549608408622, + "grad_norm": 1.748048186302185, + "learning_rate": 1.941741420651714e-06, + "loss": 1.4532, + "step": 14822 + }, + { + "epoch": 0.8104096332845837, + "grad_norm": 1.4577728509902954, + "learning_rate": 1.9406594808523484e-06, + "loss": 1.5405, + "step": 14823 + }, + { + "epoch": 0.8104643057283053, + "grad_norm": 1.4748413562774658, + "learning_rate": 1.9395778101732777e-06, + "loss": 1.509, + "step": 14824 + }, + { + "epoch": 0.8105189781720269, + "grad_norm": 1.5435773134231567, + "learning_rate": 1.9384964086506185e-06, + "loss": 1.2911, + "step": 14825 + }, + { + "epoch": 0.8105736506157484, + "grad_norm": 1.3664610385894775, + "learning_rate": 1.9374152763204777e-06, + "loss": 1.4582, + "step": 14826 + }, + { + "epoch": 0.81062832305947, + "grad_norm": 1.3808932304382324, + "learning_rate": 1.9363344132189633e-06, + "loss": 1.4267, + "step": 14827 + }, + { + "epoch": 0.8106829955031915, + "grad_norm": 1.4309005737304688, + "learning_rate": 1.9352538193821645e-06, + "loss": 1.4647, + "step": 14828 + }, + { + "epoch": 0.810737667946913, + "grad_norm": 1.462998628616333, + "learning_rate": 1.9341734948461633e-06, + "loss": 1.33, + "step": 14829 + }, + { + "epoch": 0.8107923403906346, + "grad_norm": 1.715887427330017, + "learning_rate": 1.933093439647039e-06, + "loss": 1.3736, + "step": 14830 + }, + { + "epoch": 0.8108470128343561, + "grad_norm": 1.4862000942230225, + "learning_rate": 1.9320136538208535e-06, + "loss": 1.2706, + "step": 14831 + }, + { + "epoch": 0.8109016852780777, + "grad_norm": 1.5471876859664917, + "learning_rate": 1.930934137403665e-06, + "loss": 1.489, + "step": 14832 + }, + { + "epoch": 0.8109563577217993, + "grad_norm": 1.4979883432388306, + "learning_rate": 1.9298548904315197e-06, + "loss": 1.3482, + "step": 14833 + }, + { + "epoch": 0.8110110301655208, + "grad_norm": 1.3112635612487793, + "learning_rate": 1.928775912940454e-06, + "loss": 1.3629, + "step": 14834 + }, + { + "epoch": 0.8110657026092424, + "grad_norm": 1.6058286428451538, + "learning_rate": 1.9276972049665033e-06, + "loss": 1.2674, + "step": 14835 + }, + { + "epoch": 0.811120375052964, + "grad_norm": 1.3181592226028442, + "learning_rate": 1.9266187665456857e-06, + "loss": 1.3391, + "step": 14836 + }, + { + "epoch": 0.8111750474966855, + "grad_norm": 1.693713903427124, + "learning_rate": 1.9255405977140083e-06, + "loss": 1.6709, + "step": 14837 + }, + { + "epoch": 0.811229719940407, + "grad_norm": 1.6783288717269897, + "learning_rate": 1.924462698507481e-06, + "loss": 1.1174, + "step": 14838 + }, + { + "epoch": 0.8112843923841286, + "grad_norm": 1.5233843326568604, + "learning_rate": 1.923385068962095e-06, + "loss": 1.2378, + "step": 14839 + }, + { + "epoch": 0.8113390648278501, + "grad_norm": 1.2504976987838745, + "learning_rate": 1.92230770911383e-06, + "loss": 1.4537, + "step": 14840 + }, + { + "epoch": 0.8113937372715717, + "grad_norm": 2.176748514175415, + "learning_rate": 1.9212306189986686e-06, + "loss": 1.4879, + "step": 14841 + }, + { + "epoch": 0.8114484097152932, + "grad_norm": 1.5499348640441895, + "learning_rate": 1.9201537986525743e-06, + "loss": 1.4534, + "step": 14842 + }, + { + "epoch": 0.8115030821590148, + "grad_norm": 2.140711784362793, + "learning_rate": 1.9190772481115017e-06, + "loss": 1.5822, + "step": 14843 + }, + { + "epoch": 0.8115577546027364, + "grad_norm": 1.6644176244735718, + "learning_rate": 1.9180009674114055e-06, + "loss": 1.2437, + "step": 14844 + }, + { + "epoch": 0.8116124270464579, + "grad_norm": 1.603711485862732, + "learning_rate": 1.916924956588221e-06, + "loss": 1.5419, + "step": 14845 + }, + { + "epoch": 0.8116670994901795, + "grad_norm": 1.659252643585205, + "learning_rate": 1.9158492156778807e-06, + "loss": 1.5197, + "step": 14846 + }, + { + "epoch": 0.811721771933901, + "grad_norm": 1.6260889768600464, + "learning_rate": 1.914773744716304e-06, + "loss": 1.4344, + "step": 14847 + }, + { + "epoch": 0.8117764443776225, + "grad_norm": 1.4771060943603516, + "learning_rate": 1.913698543739403e-06, + "loss": 1.3518, + "step": 14848 + }, + { + "epoch": 0.8118311168213441, + "grad_norm": 1.7075403928756714, + "learning_rate": 1.9126236127830843e-06, + "loss": 1.6077, + "step": 14849 + }, + { + "epoch": 0.8118857892650657, + "grad_norm": 1.396510362625122, + "learning_rate": 1.911548951883242e-06, + "loss": 1.4993, + "step": 14850 + }, + { + "epoch": 0.8119404617087872, + "grad_norm": 1.4991297721862793, + "learning_rate": 1.910474561075757e-06, + "loss": 1.4659, + "step": 14851 + }, + { + "epoch": 0.8119951341525088, + "grad_norm": 1.5821970701217651, + "learning_rate": 1.9094004403965116e-06, + "loss": 1.3626, + "step": 14852 + }, + { + "epoch": 0.8120498065962304, + "grad_norm": 1.7508436441421509, + "learning_rate": 1.908326589881372e-06, + "loss": 1.3032, + "step": 14853 + }, + { + "epoch": 0.8121044790399519, + "grad_norm": 1.7766469717025757, + "learning_rate": 1.9072530095661912e-06, + "loss": 1.3404, + "step": 14854 + }, + { + "epoch": 0.8121591514836735, + "grad_norm": 1.4153742790222168, + "learning_rate": 1.9061796994868254e-06, + "loss": 1.5019, + "step": 14855 + }, + { + "epoch": 0.8122138239273949, + "grad_norm": 1.3200327157974243, + "learning_rate": 1.9051066596791124e-06, + "loss": 1.5732, + "step": 14856 + }, + { + "epoch": 0.8122684963711165, + "grad_norm": 1.6578590869903564, + "learning_rate": 1.9040338901788813e-06, + "loss": 1.4451, + "step": 14857 + }, + { + "epoch": 0.8123231688148381, + "grad_norm": 1.8566950559616089, + "learning_rate": 1.902961391021958e-06, + "loss": 1.5857, + "step": 14858 + }, + { + "epoch": 0.8123778412585596, + "grad_norm": 1.481663703918457, + "learning_rate": 1.901889162244155e-06, + "loss": 1.4737, + "step": 14859 + }, + { + "epoch": 0.8124325137022812, + "grad_norm": 1.9084057807922363, + "learning_rate": 1.9008172038812744e-06, + "loss": 1.2494, + "step": 14860 + }, + { + "epoch": 0.8124871861460028, + "grad_norm": 1.9403209686279297, + "learning_rate": 1.8997455159691135e-06, + "loss": 1.4239, + "step": 14861 + }, + { + "epoch": 0.8125418585897243, + "grad_norm": 1.5411535501480103, + "learning_rate": 1.898674098543456e-06, + "loss": 1.3228, + "step": 14862 + }, + { + "epoch": 0.8125965310334459, + "grad_norm": 1.4952911138534546, + "learning_rate": 1.897602951640082e-06, + "loss": 1.6624, + "step": 14863 + }, + { + "epoch": 0.8126512034771675, + "grad_norm": 1.3688000440597534, + "learning_rate": 1.8965320752947592e-06, + "loss": 1.3157, + "step": 14864 + }, + { + "epoch": 0.8127058759208889, + "grad_norm": 1.5847769975662231, + "learning_rate": 1.8954614695432427e-06, + "loss": 1.4402, + "step": 14865 + }, + { + "epoch": 0.8127605483646105, + "grad_norm": 1.400248408317566, + "learning_rate": 1.8943911344212873e-06, + "loss": 1.4079, + "step": 14866 + }, + { + "epoch": 0.8128152208083321, + "grad_norm": 1.484707236289978, + "learning_rate": 1.8933210699646342e-06, + "loss": 1.1412, + "step": 14867 + }, + { + "epoch": 0.8128698932520536, + "grad_norm": 1.1882175207138062, + "learning_rate": 1.8922512762090096e-06, + "loss": 1.4683, + "step": 14868 + }, + { + "epoch": 0.8129245656957752, + "grad_norm": 1.3789278268814087, + "learning_rate": 1.8911817531901432e-06, + "loss": 1.4475, + "step": 14869 + }, + { + "epoch": 0.8129792381394967, + "grad_norm": 1.337510347366333, + "learning_rate": 1.890112500943746e-06, + "loss": 1.6405, + "step": 14870 + }, + { + "epoch": 0.8130339105832183, + "grad_norm": 1.6855748891830444, + "learning_rate": 1.8890435195055235e-06, + "loss": 1.5684, + "step": 14871 + }, + { + "epoch": 0.8130885830269399, + "grad_norm": 2.2362966537475586, + "learning_rate": 1.8879748089111693e-06, + "loss": 1.6567, + "step": 14872 + }, + { + "epoch": 0.8131432554706614, + "grad_norm": 1.4096804857254028, + "learning_rate": 1.886906369196373e-06, + "loss": 1.5337, + "step": 14873 + }, + { + "epoch": 0.813197927914383, + "grad_norm": 1.4677345752716064, + "learning_rate": 1.885838200396808e-06, + "loss": 1.4061, + "step": 14874 + }, + { + "epoch": 0.8132526003581045, + "grad_norm": 2.1687331199645996, + "learning_rate": 1.8847703025481489e-06, + "loss": 1.469, + "step": 14875 + }, + { + "epoch": 0.813307272801826, + "grad_norm": 1.752183437347412, + "learning_rate": 1.88370267568605e-06, + "loss": 1.4313, + "step": 14876 + }, + { + "epoch": 0.8133619452455476, + "grad_norm": 1.2661186456680298, + "learning_rate": 1.8826353198461655e-06, + "loss": 1.4101, + "step": 14877 + }, + { + "epoch": 0.8134166176892692, + "grad_norm": 1.6960121393203735, + "learning_rate": 1.8815682350641373e-06, + "loss": 1.2598, + "step": 14878 + }, + { + "epoch": 0.8134712901329907, + "grad_norm": 1.703798770904541, + "learning_rate": 1.8805014213755924e-06, + "loss": 1.4586, + "step": 14879 + }, + { + "epoch": 0.8135259625767123, + "grad_norm": 1.4421982765197754, + "learning_rate": 1.879434878816161e-06, + "loss": 1.5257, + "step": 14880 + }, + { + "epoch": 0.8135806350204339, + "grad_norm": 1.239929437637329, + "learning_rate": 1.8783686074214546e-06, + "loss": 1.4639, + "step": 14881 + }, + { + "epoch": 0.8136353074641554, + "grad_norm": 1.5039606094360352, + "learning_rate": 1.8773026072270762e-06, + "loss": 1.3555, + "step": 14882 + }, + { + "epoch": 0.813689979907877, + "grad_norm": 1.5948264598846436, + "learning_rate": 1.8762368782686258e-06, + "loss": 1.4929, + "step": 14883 + }, + { + "epoch": 0.8137446523515985, + "grad_norm": 1.4472063779830933, + "learning_rate": 1.8751714205816897e-06, + "loss": 1.4294, + "step": 14884 + }, + { + "epoch": 0.81379932479532, + "grad_norm": 1.4948339462280273, + "learning_rate": 1.8741062342018458e-06, + "loss": 1.4381, + "step": 14885 + }, + { + "epoch": 0.8138539972390416, + "grad_norm": 1.2865959405899048, + "learning_rate": 1.8730413191646623e-06, + "loss": 1.7063, + "step": 14886 + }, + { + "epoch": 0.8139086696827631, + "grad_norm": 1.4509607553482056, + "learning_rate": 1.871976675505699e-06, + "loss": 1.5075, + "step": 14887 + }, + { + "epoch": 0.8139633421264847, + "grad_norm": 1.3669260740280151, + "learning_rate": 1.8709123032605058e-06, + "loss": 1.5259, + "step": 14888 + }, + { + "epoch": 0.8140180145702063, + "grad_norm": 1.3765734434127808, + "learning_rate": 1.8698482024646291e-06, + "loss": 1.5756, + "step": 14889 + }, + { + "epoch": 0.8140726870139278, + "grad_norm": 1.4637283086776733, + "learning_rate": 1.868784373153596e-06, + "loss": 1.4111, + "step": 14890 + }, + { + "epoch": 0.8141273594576494, + "grad_norm": 1.4787832498550415, + "learning_rate": 1.8677208153629356e-06, + "loss": 1.5639, + "step": 14891 + }, + { + "epoch": 0.814182031901371, + "grad_norm": 1.3530712127685547, + "learning_rate": 1.8666575291281597e-06, + "loss": 1.5452, + "step": 14892 + }, + { + "epoch": 0.8142367043450924, + "grad_norm": 1.844223976135254, + "learning_rate": 1.865594514484772e-06, + "loss": 1.4435, + "step": 14893 + }, + { + "epoch": 0.814291376788814, + "grad_norm": 1.638002634048462, + "learning_rate": 1.8645317714682742e-06, + "loss": 1.4218, + "step": 14894 + }, + { + "epoch": 0.8143460492325356, + "grad_norm": 1.3891727924346924, + "learning_rate": 1.8634693001141513e-06, + "loss": 1.5125, + "step": 14895 + }, + { + "epoch": 0.8144007216762571, + "grad_norm": 1.4134293794631958, + "learning_rate": 1.8624071004578792e-06, + "loss": 1.6384, + "step": 14896 + }, + { + "epoch": 0.8144553941199787, + "grad_norm": 1.5863056182861328, + "learning_rate": 1.8613451725349318e-06, + "loss": 1.5012, + "step": 14897 + }, + { + "epoch": 0.8145100665637003, + "grad_norm": 1.891811728477478, + "learning_rate": 1.8602835163807664e-06, + "loss": 1.2235, + "step": 14898 + }, + { + "epoch": 0.8145647390074218, + "grad_norm": 2.0743567943573, + "learning_rate": 1.8592221320308358e-06, + "loss": 1.285, + "step": 14899 + }, + { + "epoch": 0.8146194114511434, + "grad_norm": 1.2700718641281128, + "learning_rate": 1.85816101952058e-06, + "loss": 1.4913, + "step": 14900 + }, + { + "epoch": 0.8146740838948648, + "grad_norm": 1.7533018589019775, + "learning_rate": 1.8571001788854338e-06, + "loss": 1.4667, + "step": 14901 + }, + { + "epoch": 0.8147287563385864, + "grad_norm": 2.0537643432617188, + "learning_rate": 1.856039610160818e-06, + "loss": 1.3198, + "step": 14902 + }, + { + "epoch": 0.814783428782308, + "grad_norm": 2.1444954872131348, + "learning_rate": 1.8549793133821525e-06, + "loss": 1.3142, + "step": 14903 + }, + { + "epoch": 0.8148381012260295, + "grad_norm": 1.9233683347702026, + "learning_rate": 1.8539192885848377e-06, + "loss": 1.1147, + "step": 14904 + }, + { + "epoch": 0.8148927736697511, + "grad_norm": 1.5850403308868408, + "learning_rate": 1.8528595358042768e-06, + "loss": 1.6357, + "step": 14905 + }, + { + "epoch": 0.8149474461134727, + "grad_norm": 1.5370665788650513, + "learning_rate": 1.851800055075853e-06, + "loss": 1.408, + "step": 14906 + }, + { + "epoch": 0.8150021185571942, + "grad_norm": 1.2958905696868896, + "learning_rate": 1.850740846434943e-06, + "loss": 1.4947, + "step": 14907 + }, + { + "epoch": 0.8150567910009158, + "grad_norm": 1.733358383178711, + "learning_rate": 1.8496819099169227e-06, + "loss": 1.4588, + "step": 14908 + }, + { + "epoch": 0.8151114634446374, + "grad_norm": 1.263715147972107, + "learning_rate": 1.8486232455571473e-06, + "loss": 1.598, + "step": 14909 + }, + { + "epoch": 0.8151661358883588, + "grad_norm": 1.5781769752502441, + "learning_rate": 1.8475648533909707e-06, + "loss": 1.6301, + "step": 14910 + }, + { + "epoch": 0.8152208083320804, + "grad_norm": 1.405505657196045, + "learning_rate": 1.8465067334537335e-06, + "loss": 1.4503, + "step": 14911 + }, + { + "epoch": 0.815275480775802, + "grad_norm": 1.651855707168579, + "learning_rate": 1.8454488857807684e-06, + "loss": 1.5674, + "step": 14912 + }, + { + "epoch": 0.8153301532195235, + "grad_norm": 1.3929287195205688, + "learning_rate": 1.8443913104073984e-06, + "loss": 1.3276, + "step": 14913 + }, + { + "epoch": 0.8153848256632451, + "grad_norm": 1.9671276807785034, + "learning_rate": 1.8433340073689432e-06, + "loss": 1.5543, + "step": 14914 + }, + { + "epoch": 0.8154394981069666, + "grad_norm": 1.1767364740371704, + "learning_rate": 1.8422769767007053e-06, + "loss": 1.4271, + "step": 14915 + }, + { + "epoch": 0.8154941705506882, + "grad_norm": 1.311510682106018, + "learning_rate": 1.8412202184379801e-06, + "loss": 1.2706, + "step": 14916 + }, + { + "epoch": 0.8155488429944098, + "grad_norm": 1.2808879613876343, + "learning_rate": 1.8401637326160582e-06, + "loss": 1.5766, + "step": 14917 + }, + { + "epoch": 0.8156035154381313, + "grad_norm": 1.6161999702453613, + "learning_rate": 1.8391075192702179e-06, + "loss": 1.4715, + "step": 14918 + }, + { + "epoch": 0.8156581878818528, + "grad_norm": 1.3669912815093994, + "learning_rate": 1.8380515784357245e-06, + "loss": 1.2876, + "step": 14919 + }, + { + "epoch": 0.8157128603255744, + "grad_norm": 1.3909623622894287, + "learning_rate": 1.836995910147845e-06, + "loss": 1.591, + "step": 14920 + }, + { + "epoch": 0.8157675327692959, + "grad_norm": 1.5126266479492188, + "learning_rate": 1.8359405144418241e-06, + "loss": 1.3965, + "step": 14921 + }, + { + "epoch": 0.8158222052130175, + "grad_norm": 1.4426062107086182, + "learning_rate": 1.8348853913529085e-06, + "loss": 1.6101, + "step": 14922 + }, + { + "epoch": 0.8158768776567391, + "grad_norm": 1.6338809728622437, + "learning_rate": 1.8338305409163314e-06, + "loss": 1.6151, + "step": 14923 + }, + { + "epoch": 0.8159315501004606, + "grad_norm": 1.4108476638793945, + "learning_rate": 1.8327759631673136e-06, + "loss": 1.5734, + "step": 14924 + }, + { + "epoch": 0.8159862225441822, + "grad_norm": 1.6156251430511475, + "learning_rate": 1.8317216581410725e-06, + "loss": 1.3595, + "step": 14925 + }, + { + "epoch": 0.8160408949879038, + "grad_norm": 1.401513934135437, + "learning_rate": 1.8306676258728118e-06, + "loss": 1.2762, + "step": 14926 + }, + { + "epoch": 0.8160955674316253, + "grad_norm": 1.4957208633422852, + "learning_rate": 1.829613866397727e-06, + "loss": 1.4069, + "step": 14927 + }, + { + "epoch": 0.8161502398753469, + "grad_norm": 1.2582827806472778, + "learning_rate": 1.8285603797510098e-06, + "loss": 1.503, + "step": 14928 + }, + { + "epoch": 0.8162049123190683, + "grad_norm": 1.3899800777435303, + "learning_rate": 1.8275071659678367e-06, + "loss": 1.4684, + "step": 14929 + }, + { + "epoch": 0.8162595847627899, + "grad_norm": 1.6901047229766846, + "learning_rate": 1.826454225083375e-06, + "loss": 1.4039, + "step": 14930 + }, + { + "epoch": 0.8163142572065115, + "grad_norm": 2.153722047805786, + "learning_rate": 1.8254015571327876e-06, + "loss": 1.4012, + "step": 14931 + }, + { + "epoch": 0.816368929650233, + "grad_norm": 1.4017605781555176, + "learning_rate": 1.8243491621512255e-06, + "loss": 1.7419, + "step": 14932 + }, + { + "epoch": 0.8164236020939546, + "grad_norm": 1.4832305908203125, + "learning_rate": 1.823297040173826e-06, + "loss": 1.5659, + "step": 14933 + }, + { + "epoch": 0.8164782745376762, + "grad_norm": 1.637264609336853, + "learning_rate": 1.8222451912357287e-06, + "loss": 1.2787, + "step": 14934 + }, + { + "epoch": 0.8165329469813977, + "grad_norm": 1.767854928970337, + "learning_rate": 1.8211936153720523e-06, + "loss": 1.424, + "step": 14935 + }, + { + "epoch": 0.8165876194251193, + "grad_norm": 1.4839484691619873, + "learning_rate": 1.820142312617915e-06, + "loss": 1.4793, + "step": 14936 + }, + { + "epoch": 0.8166422918688409, + "grad_norm": 1.6034228801727295, + "learning_rate": 1.8190912830084207e-06, + "loss": 1.36, + "step": 14937 + }, + { + "epoch": 0.8166969643125623, + "grad_norm": 1.5376768112182617, + "learning_rate": 1.818040526578666e-06, + "loss": 1.1948, + "step": 14938 + }, + { + "epoch": 0.8167516367562839, + "grad_norm": 2.454139232635498, + "learning_rate": 1.8169900433637366e-06, + "loss": 1.3705, + "step": 14939 + }, + { + "epoch": 0.8168063092000055, + "grad_norm": 1.5371779203414917, + "learning_rate": 1.8159398333987133e-06, + "loss": 1.5238, + "step": 14940 + }, + { + "epoch": 0.816860981643727, + "grad_norm": 1.4970488548278809, + "learning_rate": 1.814889896718659e-06, + "loss": 1.5795, + "step": 14941 + }, + { + "epoch": 0.8169156540874486, + "grad_norm": 1.9222910404205322, + "learning_rate": 1.8138402333586425e-06, + "loss": 1.4235, + "step": 14942 + }, + { + "epoch": 0.8169703265311701, + "grad_norm": 1.4383305311203003, + "learning_rate": 1.8127908433537088e-06, + "loss": 1.6096, + "step": 14943 + }, + { + "epoch": 0.8170249989748917, + "grad_norm": 1.8163883686065674, + "learning_rate": 1.811741726738898e-06, + "loss": 1.2533, + "step": 14944 + }, + { + "epoch": 0.8170796714186133, + "grad_norm": 1.4531534910202026, + "learning_rate": 1.8106928835492488e-06, + "loss": 1.6074, + "step": 14945 + }, + { + "epoch": 0.8171343438623347, + "grad_norm": 1.6666465997695923, + "learning_rate": 1.8096443138197806e-06, + "loss": 1.4539, + "step": 14946 + }, + { + "epoch": 0.8171890163060563, + "grad_norm": 1.2629318237304688, + "learning_rate": 1.8085960175855056e-06, + "loss": 1.4655, + "step": 14947 + }, + { + "epoch": 0.8172436887497779, + "grad_norm": 1.4260685443878174, + "learning_rate": 1.8075479948814334e-06, + "loss": 1.4156, + "step": 14948 + }, + { + "epoch": 0.8172983611934994, + "grad_norm": 1.7671223878860474, + "learning_rate": 1.8065002457425583e-06, + "loss": 1.3702, + "step": 14949 + }, + { + "epoch": 0.817353033637221, + "grad_norm": 1.3956269025802612, + "learning_rate": 1.805452770203866e-06, + "loss": 1.6109, + "step": 14950 + }, + { + "epoch": 0.8174077060809426, + "grad_norm": 1.4386523962020874, + "learning_rate": 1.8044055683003358e-06, + "loss": 1.4213, + "step": 14951 + }, + { + "epoch": 0.8174623785246641, + "grad_norm": 1.6557530164718628, + "learning_rate": 1.8033586400669322e-06, + "loss": 1.3623, + "step": 14952 + }, + { + "epoch": 0.8175170509683857, + "grad_norm": 1.5958807468414307, + "learning_rate": 1.8023119855386196e-06, + "loss": 1.4372, + "step": 14953 + }, + { + "epoch": 0.8175717234121073, + "grad_norm": 1.688464641571045, + "learning_rate": 1.8012656047503475e-06, + "loss": 1.2144, + "step": 14954 + }, + { + "epoch": 0.8176263958558287, + "grad_norm": 1.3967620134353638, + "learning_rate": 1.8002194977370523e-06, + "loss": 1.6581, + "step": 14955 + }, + { + "epoch": 0.8176810682995503, + "grad_norm": 1.588027000427246, + "learning_rate": 1.799173664533672e-06, + "loss": 1.5222, + "step": 14956 + }, + { + "epoch": 0.8177357407432718, + "grad_norm": 1.3951326608657837, + "learning_rate": 1.7981281051751276e-06, + "loss": 1.4884, + "step": 14957 + }, + { + "epoch": 0.8177904131869934, + "grad_norm": 1.3718794584274292, + "learning_rate": 1.7970828196963286e-06, + "loss": 1.3352, + "step": 14958 + }, + { + "epoch": 0.817845085630715, + "grad_norm": 1.6117708683013916, + "learning_rate": 1.796037808132186e-06, + "loss": 1.8257, + "step": 14959 + }, + { + "epoch": 0.8178997580744365, + "grad_norm": 1.2851858139038086, + "learning_rate": 1.7949930705175922e-06, + "loss": 1.5465, + "step": 14960 + }, + { + "epoch": 0.8179544305181581, + "grad_norm": 2.2964301109313965, + "learning_rate": 1.7939486068874311e-06, + "loss": 1.3436, + "step": 14961 + }, + { + "epoch": 0.8180091029618797, + "grad_norm": 1.3156810998916626, + "learning_rate": 1.792904417276584e-06, + "loss": 1.7881, + "step": 14962 + }, + { + "epoch": 0.8180637754056012, + "grad_norm": 1.3587710857391357, + "learning_rate": 1.7918605017199176e-06, + "loss": 1.5136, + "step": 14963 + }, + { + "epoch": 0.8181184478493227, + "grad_norm": 2.0334367752075195, + "learning_rate": 1.7908168602522903e-06, + "loss": 1.5397, + "step": 14964 + }, + { + "epoch": 0.8181731202930443, + "grad_norm": 1.8754525184631348, + "learning_rate": 1.7897734929085508e-06, + "loss": 1.3882, + "step": 14965 + }, + { + "epoch": 0.8182277927367658, + "grad_norm": 2.1579184532165527, + "learning_rate": 1.7887303997235372e-06, + "loss": 1.4602, + "step": 14966 + }, + { + "epoch": 0.8182824651804874, + "grad_norm": 1.5244665145874023, + "learning_rate": 1.7876875807320881e-06, + "loss": 1.2767, + "step": 14967 + }, + { + "epoch": 0.818337137624209, + "grad_norm": 1.191864013671875, + "learning_rate": 1.7866450359690203e-06, + "loss": 1.7067, + "step": 14968 + }, + { + "epoch": 0.8183918100679305, + "grad_norm": 1.5768241882324219, + "learning_rate": 1.7856027654691454e-06, + "loss": 1.5221, + "step": 14969 + }, + { + "epoch": 0.8184464825116521, + "grad_norm": 1.7873774766921997, + "learning_rate": 1.784560769267273e-06, + "loss": 1.511, + "step": 14970 + }, + { + "epoch": 0.8185011549553736, + "grad_norm": 1.4477415084838867, + "learning_rate": 1.7835190473981945e-06, + "loss": 1.5427, + "step": 14971 + }, + { + "epoch": 0.8185558273990952, + "grad_norm": 1.3503988981246948, + "learning_rate": 1.7824775998966926e-06, + "loss": 1.4298, + "step": 14972 + }, + { + "epoch": 0.8186104998428168, + "grad_norm": 1.420925498008728, + "learning_rate": 1.7814364267975493e-06, + "loss": 1.4589, + "step": 14973 + }, + { + "epoch": 0.8186651722865382, + "grad_norm": 1.6991519927978516, + "learning_rate": 1.7803955281355302e-06, + "loss": 1.6858, + "step": 14974 + }, + { + "epoch": 0.8187198447302598, + "grad_norm": 1.5587884187698364, + "learning_rate": 1.7793549039453905e-06, + "loss": 1.2407, + "step": 14975 + }, + { + "epoch": 0.8187745171739814, + "grad_norm": 1.4330123662948608, + "learning_rate": 1.7783145542618819e-06, + "loss": 1.4691, + "step": 14976 + }, + { + "epoch": 0.8188291896177029, + "grad_norm": 1.5837899446487427, + "learning_rate": 1.7772744791197406e-06, + "loss": 1.5783, + "step": 14977 + }, + { + "epoch": 0.8188838620614245, + "grad_norm": 1.6989749670028687, + "learning_rate": 1.776234678553702e-06, + "loss": 1.4973, + "step": 14978 + }, + { + "epoch": 0.8189385345051461, + "grad_norm": 1.7644912004470825, + "learning_rate": 1.7751951525984857e-06, + "loss": 1.6219, + "step": 14979 + }, + { + "epoch": 0.8189932069488676, + "grad_norm": 1.3200255632400513, + "learning_rate": 1.774155901288801e-06, + "loss": 1.3197, + "step": 14980 + }, + { + "epoch": 0.8190478793925892, + "grad_norm": 2.0506606101989746, + "learning_rate": 1.773116924659355e-06, + "loss": 1.3618, + "step": 14981 + }, + { + "epoch": 0.8191025518363108, + "grad_norm": 1.3060678243637085, + "learning_rate": 1.7720782227448407e-06, + "loss": 1.6038, + "step": 14982 + }, + { + "epoch": 0.8191572242800322, + "grad_norm": 1.6500107049942017, + "learning_rate": 1.7710397955799386e-06, + "loss": 1.5705, + "step": 14983 + }, + { + "epoch": 0.8192118967237538, + "grad_norm": 1.9441509246826172, + "learning_rate": 1.7700016431993305e-06, + "loss": 1.4955, + "step": 14984 + }, + { + "epoch": 0.8192665691674753, + "grad_norm": 2.482980489730835, + "learning_rate": 1.768963765637679e-06, + "loss": 1.4211, + "step": 14985 + }, + { + "epoch": 0.8193212416111969, + "grad_norm": 1.4656533002853394, + "learning_rate": 1.7679261629296408e-06, + "loss": 1.266, + "step": 14986 + }, + { + "epoch": 0.8193759140549185, + "grad_norm": 1.625335454940796, + "learning_rate": 1.7668888351098678e-06, + "loss": 1.5274, + "step": 14987 + }, + { + "epoch": 0.81943058649864, + "grad_norm": 1.3468676805496216, + "learning_rate": 1.765851782212995e-06, + "loss": 1.7323, + "step": 14988 + }, + { + "epoch": 0.8194852589423616, + "grad_norm": 1.4821140766143799, + "learning_rate": 1.7648150042736546e-06, + "loss": 1.3332, + "step": 14989 + }, + { + "epoch": 0.8195399313860832, + "grad_norm": 1.5503103733062744, + "learning_rate": 1.763778501326464e-06, + "loss": 1.5367, + "step": 14990 + }, + { + "epoch": 0.8195946038298046, + "grad_norm": 2.185934066772461, + "learning_rate": 1.7627422734060352e-06, + "loss": 1.3938, + "step": 14991 + }, + { + "epoch": 0.8196492762735262, + "grad_norm": 1.3796586990356445, + "learning_rate": 1.761706320546973e-06, + "loss": 1.4411, + "step": 14992 + }, + { + "epoch": 0.8197039487172478, + "grad_norm": 1.4562413692474365, + "learning_rate": 1.7606706427838682e-06, + "loss": 1.1327, + "step": 14993 + }, + { + "epoch": 0.8197586211609693, + "grad_norm": 1.5777980089187622, + "learning_rate": 1.7596352401513027e-06, + "loss": 1.6178, + "step": 14994 + }, + { + "epoch": 0.8198132936046909, + "grad_norm": 1.7685209512710571, + "learning_rate": 1.7586001126838558e-06, + "loss": 1.1665, + "step": 14995 + }, + { + "epoch": 0.8198679660484125, + "grad_norm": 1.4281007051467896, + "learning_rate": 1.7575652604160898e-06, + "loss": 1.4553, + "step": 14996 + }, + { + "epoch": 0.819922638492134, + "grad_norm": 1.6576273441314697, + "learning_rate": 1.756530683382559e-06, + "loss": 1.3776, + "step": 14997 + }, + { + "epoch": 0.8199773109358556, + "grad_norm": 1.553926944732666, + "learning_rate": 1.7554963816178162e-06, + "loss": 1.3344, + "step": 14998 + }, + { + "epoch": 0.8200319833795771, + "grad_norm": 1.6427263021469116, + "learning_rate": 1.7544623551563932e-06, + "loss": 1.3885, + "step": 14999 + }, + { + "epoch": 0.8200866558232986, + "grad_norm": 1.6545939445495605, + "learning_rate": 1.7534286040328208e-06, + "loss": 1.4817, + "step": 15000 + }, + { + "epoch": 0.8201413282670202, + "grad_norm": 1.666954755783081, + "learning_rate": 1.7523951282816199e-06, + "loss": 1.4005, + "step": 15001 + }, + { + "epoch": 0.8201960007107417, + "grad_norm": 1.472603678703308, + "learning_rate": 1.7513619279372984e-06, + "loss": 1.5585, + "step": 15002 + }, + { + "epoch": 0.8202506731544633, + "grad_norm": 1.8497672080993652, + "learning_rate": 1.750329003034359e-06, + "loss": 1.2602, + "step": 15003 + }, + { + "epoch": 0.8203053455981849, + "grad_norm": 1.7530689239501953, + "learning_rate": 1.7492963536072927e-06, + "loss": 1.2649, + "step": 15004 + }, + { + "epoch": 0.8203600180419064, + "grad_norm": 1.6576119661331177, + "learning_rate": 1.7482639796905798e-06, + "loss": 1.3854, + "step": 15005 + }, + { + "epoch": 0.820414690485628, + "grad_norm": 1.952384114265442, + "learning_rate": 1.7472318813186984e-06, + "loss": 1.6186, + "step": 15006 + }, + { + "epoch": 0.8204693629293496, + "grad_norm": 1.3687975406646729, + "learning_rate": 1.7462000585261096e-06, + "loss": 1.2345, + "step": 15007 + }, + { + "epoch": 0.8205240353730711, + "grad_norm": 1.4713184833526611, + "learning_rate": 1.7451685113472673e-06, + "loss": 1.5866, + "step": 15008 + }, + { + "epoch": 0.8205787078167927, + "grad_norm": 1.6158273220062256, + "learning_rate": 1.74413723981662e-06, + "loss": 1.4424, + "step": 15009 + }, + { + "epoch": 0.8206333802605142, + "grad_norm": 1.4343396425247192, + "learning_rate": 1.7431062439686052e-06, + "loss": 1.4116, + "step": 15010 + }, + { + "epoch": 0.8206880527042357, + "grad_norm": 1.2778005599975586, + "learning_rate": 1.742075523837644e-06, + "loss": 1.4897, + "step": 15011 + }, + { + "epoch": 0.8207427251479573, + "grad_norm": 1.5765212774276733, + "learning_rate": 1.7410450794581623e-06, + "loss": 1.4711, + "step": 15012 + }, + { + "epoch": 0.8207973975916788, + "grad_norm": 1.6128196716308594, + "learning_rate": 1.7400149108645658e-06, + "loss": 1.3781, + "step": 15013 + }, + { + "epoch": 0.8208520700354004, + "grad_norm": 1.3596508502960205, + "learning_rate": 1.7389850180912537e-06, + "loss": 1.3516, + "step": 15014 + }, + { + "epoch": 0.820906742479122, + "grad_norm": 1.9212201833724976, + "learning_rate": 1.7379554011726175e-06, + "loss": 1.6097, + "step": 15015 + }, + { + "epoch": 0.8209614149228435, + "grad_norm": 1.5837825536727905, + "learning_rate": 1.7369260601430371e-06, + "loss": 1.4448, + "step": 15016 + }, + { + "epoch": 0.8210160873665651, + "grad_norm": 1.5344399213790894, + "learning_rate": 1.7358969950368842e-06, + "loss": 1.6522, + "step": 15017 + }, + { + "epoch": 0.8210707598102867, + "grad_norm": 1.3583518266677856, + "learning_rate": 1.7348682058885247e-06, + "loss": 1.5221, + "step": 15018 + }, + { + "epoch": 0.8211254322540081, + "grad_norm": 1.6693482398986816, + "learning_rate": 1.7338396927323076e-06, + "loss": 1.654, + "step": 15019 + }, + { + "epoch": 0.8211801046977297, + "grad_norm": 1.6029410362243652, + "learning_rate": 1.7328114556025832e-06, + "loss": 1.3553, + "step": 15020 + }, + { + "epoch": 0.8212347771414513, + "grad_norm": 2.0497870445251465, + "learning_rate": 1.7317834945336843e-06, + "loss": 1.1627, + "step": 15021 + }, + { + "epoch": 0.8212894495851728, + "grad_norm": 1.604388952255249, + "learning_rate": 1.7307558095599332e-06, + "loss": 1.3797, + "step": 15022 + }, + { + "epoch": 0.8213441220288944, + "grad_norm": 1.574680209159851, + "learning_rate": 1.7297284007156533e-06, + "loss": 1.2236, + "step": 15023 + }, + { + "epoch": 0.821398794472616, + "grad_norm": 1.225468397140503, + "learning_rate": 1.7287012680351479e-06, + "loss": 1.5826, + "step": 15024 + }, + { + "epoch": 0.8214534669163375, + "grad_norm": 1.985381007194519, + "learning_rate": 1.7276744115527144e-06, + "loss": 1.5134, + "step": 15025 + }, + { + "epoch": 0.8215081393600591, + "grad_norm": 1.5024782419204712, + "learning_rate": 1.7266478313026469e-06, + "loss": 1.5905, + "step": 15026 + }, + { + "epoch": 0.8215628118037805, + "grad_norm": 1.0593116283416748, + "learning_rate": 1.7256215273192223e-06, + "loss": 1.5026, + "step": 15027 + }, + { + "epoch": 0.8216174842475021, + "grad_norm": 1.5069220066070557, + "learning_rate": 1.724595499636711e-06, + "loss": 1.3171, + "step": 15028 + }, + { + "epoch": 0.8216721566912237, + "grad_norm": 1.4398752450942993, + "learning_rate": 1.7235697482893743e-06, + "loss": 1.4879, + "step": 15029 + }, + { + "epoch": 0.8217268291349452, + "grad_norm": 1.478062391281128, + "learning_rate": 1.722544273311465e-06, + "loss": 1.4171, + "step": 15030 + }, + { + "epoch": 0.8217815015786668, + "grad_norm": 1.3547717332839966, + "learning_rate": 1.7215190747372246e-06, + "loss": 1.4783, + "step": 15031 + }, + { + "epoch": 0.8218361740223884, + "grad_norm": 1.596684217453003, + "learning_rate": 1.72049415260089e-06, + "loss": 1.3235, + "step": 15032 + }, + { + "epoch": 0.8218908464661099, + "grad_norm": 1.1074355840682983, + "learning_rate": 1.7194695069366818e-06, + "loss": 1.6616, + "step": 15033 + }, + { + "epoch": 0.8219455189098315, + "grad_norm": 1.3699047565460205, + "learning_rate": 1.7184451377788202e-06, + "loss": 1.4033, + "step": 15034 + }, + { + "epoch": 0.8220001913535531, + "grad_norm": 1.424763798713684, + "learning_rate": 1.7174210451615091e-06, + "loss": 1.3802, + "step": 15035 + }, + { + "epoch": 0.8220548637972745, + "grad_norm": 1.722934603691101, + "learning_rate": 1.7163972291189423e-06, + "loss": 1.3625, + "step": 15036 + }, + { + "epoch": 0.8221095362409961, + "grad_norm": 1.2115930318832397, + "learning_rate": 1.7153736896853124e-06, + "loss": 1.372, + "step": 15037 + }, + { + "epoch": 0.8221642086847177, + "grad_norm": 2.0737040042877197, + "learning_rate": 1.7143504268947952e-06, + "loss": 1.5767, + "step": 15038 + }, + { + "epoch": 0.8222188811284392, + "grad_norm": 1.304153561592102, + "learning_rate": 1.7133274407815581e-06, + "loss": 1.4256, + "step": 15039 + }, + { + "epoch": 0.8222735535721608, + "grad_norm": 1.6840028762817383, + "learning_rate": 1.7123047313797657e-06, + "loss": 1.6234, + "step": 15040 + }, + { + "epoch": 0.8223282260158823, + "grad_norm": 1.1728224754333496, + "learning_rate": 1.7112822987235656e-06, + "loss": 1.5657, + "step": 15041 + }, + { + "epoch": 0.8223828984596039, + "grad_norm": 1.364615797996521, + "learning_rate": 1.7102601428470988e-06, + "loss": 1.3033, + "step": 15042 + }, + { + "epoch": 0.8224375709033255, + "grad_norm": 1.096937656402588, + "learning_rate": 1.7092382637844995e-06, + "loss": 1.4846, + "step": 15043 + }, + { + "epoch": 0.822492243347047, + "grad_norm": 1.9561816453933716, + "learning_rate": 1.7082166615698893e-06, + "loss": 1.4196, + "step": 15044 + }, + { + "epoch": 0.8225469157907686, + "grad_norm": 1.532332181930542, + "learning_rate": 1.7071953362373795e-06, + "loss": 1.7275, + "step": 15045 + }, + { + "epoch": 0.8226015882344901, + "grad_norm": 1.90166437625885, + "learning_rate": 1.7061742878210797e-06, + "loss": 1.5511, + "step": 15046 + }, + { + "epoch": 0.8226562606782116, + "grad_norm": 1.5502734184265137, + "learning_rate": 1.7051535163550804e-06, + "loss": 1.4184, + "step": 15047 + }, + { + "epoch": 0.8227109331219332, + "grad_norm": 1.5092580318450928, + "learning_rate": 1.704133021873471e-06, + "loss": 1.6233, + "step": 15048 + }, + { + "epoch": 0.8227656055656548, + "grad_norm": 1.6270256042480469, + "learning_rate": 1.7031128044103272e-06, + "loss": 1.378, + "step": 15049 + }, + { + "epoch": 0.8228202780093763, + "grad_norm": 1.5725321769714355, + "learning_rate": 1.7020928639997136e-06, + "loss": 1.3903, + "step": 15050 + }, + { + "epoch": 0.8228749504530979, + "grad_norm": 1.358703851699829, + "learning_rate": 1.7010732006756948e-06, + "loss": 1.6083, + "step": 15051 + }, + { + "epoch": 0.8229296228968195, + "grad_norm": 1.4409472942352295, + "learning_rate": 1.7000538144723145e-06, + "loss": 1.4102, + "step": 15052 + }, + { + "epoch": 0.822984295340541, + "grad_norm": 1.3404240608215332, + "learning_rate": 1.6990347054236134e-06, + "loss": 1.6575, + "step": 15053 + }, + { + "epoch": 0.8230389677842626, + "grad_norm": 1.54950749874115, + "learning_rate": 1.698015873563623e-06, + "loss": 1.427, + "step": 15054 + }, + { + "epoch": 0.823093640227984, + "grad_norm": 1.2794222831726074, + "learning_rate": 1.6969973189263644e-06, + "loss": 1.3726, + "step": 15055 + }, + { + "epoch": 0.8231483126717056, + "grad_norm": 1.6183502674102783, + "learning_rate": 1.6959790415458454e-06, + "loss": 1.3349, + "step": 15056 + }, + { + "epoch": 0.8232029851154272, + "grad_norm": 1.4915906190872192, + "learning_rate": 1.6949610414560746e-06, + "loss": 1.5444, + "step": 15057 + }, + { + "epoch": 0.8232576575591487, + "grad_norm": 1.7242448329925537, + "learning_rate": 1.6939433186910436e-06, + "loss": 1.2168, + "step": 15058 + }, + { + "epoch": 0.8233123300028703, + "grad_norm": 1.524200439453125, + "learning_rate": 1.6929258732847332e-06, + "loss": 1.4136, + "step": 15059 + }, + { + "epoch": 0.8233670024465919, + "grad_norm": 1.44352388381958, + "learning_rate": 1.6919087052711236e-06, + "loss": 1.6969, + "step": 15060 + }, + { + "epoch": 0.8234216748903134, + "grad_norm": 1.7758636474609375, + "learning_rate": 1.6908918146841758e-06, + "loss": 1.6051, + "step": 15061 + }, + { + "epoch": 0.823476347334035, + "grad_norm": 1.5373601913452148, + "learning_rate": 1.68987520155785e-06, + "loss": 1.3091, + "step": 15062 + }, + { + "epoch": 0.8235310197777566, + "grad_norm": 1.6653192043304443, + "learning_rate": 1.6888588659260929e-06, + "loss": 1.4981, + "step": 15063 + }, + { + "epoch": 0.823585692221478, + "grad_norm": 1.556901216506958, + "learning_rate": 1.687842807822837e-06, + "loss": 1.5791, + "step": 15064 + }, + { + "epoch": 0.8236403646651996, + "grad_norm": 2.898974895477295, + "learning_rate": 1.6868270272820175e-06, + "loss": 1.3932, + "step": 15065 + }, + { + "epoch": 0.8236950371089212, + "grad_norm": 1.4072041511535645, + "learning_rate": 1.6858115243375516e-06, + "loss": 1.4188, + "step": 15066 + }, + { + "epoch": 0.8237497095526427, + "grad_norm": 1.879135012626648, + "learning_rate": 1.684796299023349e-06, + "loss": 1.4807, + "step": 15067 + }, + { + "epoch": 0.8238043819963643, + "grad_norm": 1.5402413606643677, + "learning_rate": 1.6837813513733093e-06, + "loss": 1.3362, + "step": 15068 + }, + { + "epoch": 0.8238590544400858, + "grad_norm": 1.4612133502960205, + "learning_rate": 1.682766681421325e-06, + "loss": 1.4926, + "step": 15069 + }, + { + "epoch": 0.8239137268838074, + "grad_norm": 2.0162882804870605, + "learning_rate": 1.6817522892012762e-06, + "loss": 1.2614, + "step": 15070 + }, + { + "epoch": 0.823968399327529, + "grad_norm": 1.8009920120239258, + "learning_rate": 1.6807381747470408e-06, + "loss": 1.2503, + "step": 15071 + }, + { + "epoch": 0.8240230717712504, + "grad_norm": 2.0138072967529297, + "learning_rate": 1.6797243380924788e-06, + "loss": 1.6017, + "step": 15072 + }, + { + "epoch": 0.824077744214972, + "grad_norm": 1.7518682479858398, + "learning_rate": 1.678710779271443e-06, + "loss": 1.2271, + "step": 15073 + }, + { + "epoch": 0.8241324166586936, + "grad_norm": 1.837259292602539, + "learning_rate": 1.677697498317783e-06, + "loss": 1.5894, + "step": 15074 + }, + { + "epoch": 0.8241870891024151, + "grad_norm": 1.5254226922988892, + "learning_rate": 1.6766844952653294e-06, + "loss": 1.3738, + "step": 15075 + }, + { + "epoch": 0.8242417615461367, + "grad_norm": 1.7779521942138672, + "learning_rate": 1.6756717701479152e-06, + "loss": 1.6363, + "step": 15076 + }, + { + "epoch": 0.8242964339898583, + "grad_norm": 1.4382596015930176, + "learning_rate": 1.6746593229993545e-06, + "loss": 1.4153, + "step": 15077 + }, + { + "epoch": 0.8243511064335798, + "grad_norm": 1.5558595657348633, + "learning_rate": 1.6736471538534516e-06, + "loss": 1.4544, + "step": 15078 + }, + { + "epoch": 0.8244057788773014, + "grad_norm": 1.523675799369812, + "learning_rate": 1.6726352627440122e-06, + "loss": 1.4628, + "step": 15079 + }, + { + "epoch": 0.824460451321023, + "grad_norm": 1.19990074634552, + "learning_rate": 1.6716236497048211e-06, + "loss": 1.5034, + "step": 15080 + }, + { + "epoch": 0.8245151237647445, + "grad_norm": 1.4014968872070312, + "learning_rate": 1.6706123147696596e-06, + "loss": 1.2795, + "step": 15081 + }, + { + "epoch": 0.824569796208466, + "grad_norm": 1.417985200881958, + "learning_rate": 1.6696012579722986e-06, + "loss": 1.3181, + "step": 15082 + }, + { + "epoch": 0.8246244686521876, + "grad_norm": 1.6193586587905884, + "learning_rate": 1.6685904793465003e-06, + "loss": 1.4623, + "step": 15083 + }, + { + "epoch": 0.8246791410959091, + "grad_norm": 1.5300675630569458, + "learning_rate": 1.6675799789260128e-06, + "loss": 1.5338, + "step": 15084 + }, + { + "epoch": 0.8247338135396307, + "grad_norm": 1.2384124994277954, + "learning_rate": 1.6665697567445848e-06, + "loss": 1.7277, + "step": 15085 + }, + { + "epoch": 0.8247884859833522, + "grad_norm": 1.2928540706634521, + "learning_rate": 1.6655598128359486e-06, + "loss": 1.4635, + "step": 15086 + }, + { + "epoch": 0.8248431584270738, + "grad_norm": 1.372496247291565, + "learning_rate": 1.6645501472338243e-06, + "loss": 1.6412, + "step": 15087 + }, + { + "epoch": 0.8248978308707954, + "grad_norm": 1.4207994937896729, + "learning_rate": 1.6635407599719332e-06, + "loss": 1.4547, + "step": 15088 + }, + { + "epoch": 0.8249525033145169, + "grad_norm": 1.5852903127670288, + "learning_rate": 1.6625316510839752e-06, + "loss": 1.5017, + "step": 15089 + }, + { + "epoch": 0.8250071757582385, + "grad_norm": 2.102163076400757, + "learning_rate": 1.6615228206036527e-06, + "loss": 1.5139, + "step": 15090 + }, + { + "epoch": 0.82506184820196, + "grad_norm": 1.722412347793579, + "learning_rate": 1.6605142685646503e-06, + "loss": 1.4781, + "step": 15091 + }, + { + "epoch": 0.8251165206456815, + "grad_norm": 1.7572355270385742, + "learning_rate": 1.6595059950006454e-06, + "loss": 1.4766, + "step": 15092 + }, + { + "epoch": 0.8251711930894031, + "grad_norm": 1.5004583597183228, + "learning_rate": 1.6584979999453065e-06, + "loss": 1.5039, + "step": 15093 + }, + { + "epoch": 0.8252258655331247, + "grad_norm": 1.3172662258148193, + "learning_rate": 1.6574902834322937e-06, + "loss": 1.6141, + "step": 15094 + }, + { + "epoch": 0.8252805379768462, + "grad_norm": 1.3827158212661743, + "learning_rate": 1.656482845495254e-06, + "loss": 1.4164, + "step": 15095 + }, + { + "epoch": 0.8253352104205678, + "grad_norm": 1.1544384956359863, + "learning_rate": 1.6554756861678345e-06, + "loss": 1.4916, + "step": 15096 + }, + { + "epoch": 0.8253898828642894, + "grad_norm": 1.719738483428955, + "learning_rate": 1.6544688054836611e-06, + "loss": 1.4504, + "step": 15097 + }, + { + "epoch": 0.8254445553080109, + "grad_norm": 1.3334306478500366, + "learning_rate": 1.6534622034763558e-06, + "loss": 1.3628, + "step": 15098 + }, + { + "epoch": 0.8254992277517325, + "grad_norm": 1.5732516050338745, + "learning_rate": 1.6524558801795366e-06, + "loss": 1.412, + "step": 15099 + }, + { + "epoch": 0.8255539001954539, + "grad_norm": 1.691916584968567, + "learning_rate": 1.6514498356268027e-06, + "loss": 1.2333, + "step": 15100 + }, + { + "epoch": 0.8256085726391755, + "grad_norm": 1.5475393533706665, + "learning_rate": 1.6504440698517477e-06, + "loss": 1.6308, + "step": 15101 + }, + { + "epoch": 0.8256632450828971, + "grad_norm": 1.6001776456832886, + "learning_rate": 1.64943858288796e-06, + "loss": 1.5177, + "step": 15102 + }, + { + "epoch": 0.8257179175266186, + "grad_norm": 1.994655966758728, + "learning_rate": 1.6484333747690107e-06, + "loss": 1.4855, + "step": 15103 + }, + { + "epoch": 0.8257725899703402, + "grad_norm": 1.2868374586105347, + "learning_rate": 1.6474284455284707e-06, + "loss": 1.3255, + "step": 15104 + }, + { + "epoch": 0.8258272624140618, + "grad_norm": 1.3368052244186401, + "learning_rate": 1.6464237951998952e-06, + "loss": 1.7724, + "step": 15105 + }, + { + "epoch": 0.8258819348577833, + "grad_norm": 1.5681849718093872, + "learning_rate": 1.645419423816832e-06, + "loss": 1.4045, + "step": 15106 + }, + { + "epoch": 0.8259366073015049, + "grad_norm": 2.2455716133117676, + "learning_rate": 1.6444153314128175e-06, + "loss": 1.2883, + "step": 15107 + }, + { + "epoch": 0.8259912797452265, + "grad_norm": 1.4231147766113281, + "learning_rate": 1.6434115180213828e-06, + "loss": 1.5164, + "step": 15108 + }, + { + "epoch": 0.8260459521889479, + "grad_norm": 1.5819520950317383, + "learning_rate": 1.6424079836760454e-06, + "loss": 1.3571, + "step": 15109 + }, + { + "epoch": 0.8261006246326695, + "grad_norm": 1.8478585481643677, + "learning_rate": 1.6414047284103185e-06, + "loss": 1.3429, + "step": 15110 + }, + { + "epoch": 0.8261552970763911, + "grad_norm": 1.9801080226898193, + "learning_rate": 1.640401752257702e-06, + "loss": 1.4452, + "step": 15111 + }, + { + "epoch": 0.8262099695201126, + "grad_norm": 2.0409584045410156, + "learning_rate": 1.6393990552516848e-06, + "loss": 1.4057, + "step": 15112 + }, + { + "epoch": 0.8262646419638342, + "grad_norm": 1.9307997226715088, + "learning_rate": 1.6383966374257544e-06, + "loss": 1.4729, + "step": 15113 + }, + { + "epoch": 0.8263193144075557, + "grad_norm": 1.5169754028320312, + "learning_rate": 1.6373944988133817e-06, + "loss": 1.4957, + "step": 15114 + }, + { + "epoch": 0.8263739868512773, + "grad_norm": 1.401835322380066, + "learning_rate": 1.636392639448028e-06, + "loss": 1.2404, + "step": 15115 + }, + { + "epoch": 0.8264286592949989, + "grad_norm": 1.5378307104110718, + "learning_rate": 1.6353910593631507e-06, + "loss": 1.4183, + "step": 15116 + }, + { + "epoch": 0.8264833317387204, + "grad_norm": 1.4007539749145508, + "learning_rate": 1.634389758592193e-06, + "loss": 1.6277, + "step": 15117 + }, + { + "epoch": 0.8265380041824419, + "grad_norm": 1.6733381748199463, + "learning_rate": 1.633388737168594e-06, + "loss": 1.1487, + "step": 15118 + }, + { + "epoch": 0.8265926766261635, + "grad_norm": 1.3937832117080688, + "learning_rate": 1.6323879951257783e-06, + "loss": 1.5971, + "step": 15119 + }, + { + "epoch": 0.826647349069885, + "grad_norm": 1.6297836303710938, + "learning_rate": 1.6313875324971618e-06, + "loss": 1.3835, + "step": 15120 + }, + { + "epoch": 0.8267020215136066, + "grad_norm": 1.5510107278823853, + "learning_rate": 1.6303873493161538e-06, + "loss": 1.528, + "step": 15121 + }, + { + "epoch": 0.8267566939573282, + "grad_norm": 1.4622814655303955, + "learning_rate": 1.6293874456161518e-06, + "loss": 1.4867, + "step": 15122 + }, + { + "epoch": 0.8268113664010497, + "grad_norm": 1.1891182661056519, + "learning_rate": 1.6283878214305438e-06, + "loss": 1.5553, + "step": 15123 + }, + { + "epoch": 0.8268660388447713, + "grad_norm": 1.4738547801971436, + "learning_rate": 1.6273884767927117e-06, + "loss": 1.5351, + "step": 15124 + }, + { + "epoch": 0.8269207112884929, + "grad_norm": 1.5717527866363525, + "learning_rate": 1.6263894117360268e-06, + "loss": 1.5907, + "step": 15125 + }, + { + "epoch": 0.8269753837322144, + "grad_norm": 1.7478713989257812, + "learning_rate": 1.6253906262938457e-06, + "loss": 1.3052, + "step": 15126 + }, + { + "epoch": 0.827030056175936, + "grad_norm": 1.6165930032730103, + "learning_rate": 1.624392120499526e-06, + "loss": 1.3763, + "step": 15127 + }, + { + "epoch": 0.8270847286196574, + "grad_norm": 1.4149399995803833, + "learning_rate": 1.623393894386407e-06, + "loss": 1.5289, + "step": 15128 + }, + { + "epoch": 0.827139401063379, + "grad_norm": 1.4187591075897217, + "learning_rate": 1.6223959479878193e-06, + "loss": 1.6441, + "step": 15129 + }, + { + "epoch": 0.8271940735071006, + "grad_norm": 1.5341849327087402, + "learning_rate": 1.6213982813370931e-06, + "loss": 1.2751, + "step": 15130 + }, + { + "epoch": 0.8272487459508221, + "grad_norm": 1.4277207851409912, + "learning_rate": 1.6204008944675387e-06, + "loss": 1.3011, + "step": 15131 + }, + { + "epoch": 0.8273034183945437, + "grad_norm": 1.4323214292526245, + "learning_rate": 1.6194037874124612e-06, + "loss": 1.583, + "step": 15132 + }, + { + "epoch": 0.8273580908382653, + "grad_norm": 1.554487943649292, + "learning_rate": 1.6184069602051578e-06, + "loss": 1.1878, + "step": 15133 + }, + { + "epoch": 0.8274127632819868, + "grad_norm": 1.8393681049346924, + "learning_rate": 1.6174104128789115e-06, + "loss": 1.6712, + "step": 15134 + }, + { + "epoch": 0.8274674357257084, + "grad_norm": 1.559419870376587, + "learning_rate": 1.6164141454670034e-06, + "loss": 1.668, + "step": 15135 + }, + { + "epoch": 0.82752210816943, + "grad_norm": 1.4065420627593994, + "learning_rate": 1.6154181580027006e-06, + "loss": 1.5518, + "step": 15136 + }, + { + "epoch": 0.8275767806131514, + "grad_norm": 1.395597219467163, + "learning_rate": 1.6144224505192586e-06, + "loss": 1.4846, + "step": 15137 + }, + { + "epoch": 0.827631453056873, + "grad_norm": 1.9893749952316284, + "learning_rate": 1.6134270230499294e-06, + "loss": 1.4902, + "step": 15138 + }, + { + "epoch": 0.8276861255005946, + "grad_norm": 1.226369857788086, + "learning_rate": 1.6124318756279533e-06, + "loss": 1.5075, + "step": 15139 + }, + { + "epoch": 0.8277407979443161, + "grad_norm": 1.687976360321045, + "learning_rate": 1.611437008286555e-06, + "loss": 1.3823, + "step": 15140 + }, + { + "epoch": 0.8277954703880377, + "grad_norm": 1.7226999998092651, + "learning_rate": 1.610442421058962e-06, + "loss": 1.6005, + "step": 15141 + }, + { + "epoch": 0.8278501428317592, + "grad_norm": 1.511767029762268, + "learning_rate": 1.6094481139783836e-06, + "loss": 1.3366, + "step": 15142 + }, + { + "epoch": 0.8279048152754808, + "grad_norm": 1.491126298904419, + "learning_rate": 1.6084540870780197e-06, + "loss": 1.6014, + "step": 15143 + }, + { + "epoch": 0.8279594877192024, + "grad_norm": 1.6630573272705078, + "learning_rate": 1.607460340391067e-06, + "loss": 1.294, + "step": 15144 + }, + { + "epoch": 0.8280141601629238, + "grad_norm": 1.6079093217849731, + "learning_rate": 1.6064668739507072e-06, + "loss": 1.4612, + "step": 15145 + }, + { + "epoch": 0.8280688326066454, + "grad_norm": 1.5502369403839111, + "learning_rate": 1.6054736877901156e-06, + "loss": 1.0923, + "step": 15146 + }, + { + "epoch": 0.828123505050367, + "grad_norm": 1.449607253074646, + "learning_rate": 1.6044807819424545e-06, + "loss": 1.3976, + "step": 15147 + }, + { + "epoch": 0.8281781774940885, + "grad_norm": 1.3180161714553833, + "learning_rate": 1.603488156440879e-06, + "loss": 1.5004, + "step": 15148 + }, + { + "epoch": 0.8282328499378101, + "grad_norm": 1.2802692651748657, + "learning_rate": 1.6024958113185395e-06, + "loss": 1.4467, + "step": 15149 + }, + { + "epoch": 0.8282875223815317, + "grad_norm": 1.3391045331954956, + "learning_rate": 1.6015037466085704e-06, + "loss": 1.572, + "step": 15150 + }, + { + "epoch": 0.8283421948252532, + "grad_norm": 1.7337597608566284, + "learning_rate": 1.6005119623440956e-06, + "loss": 1.2341, + "step": 15151 + }, + { + "epoch": 0.8283968672689748, + "grad_norm": 1.696588397026062, + "learning_rate": 1.5995204585582392e-06, + "loss": 1.5546, + "step": 15152 + }, + { + "epoch": 0.8284515397126964, + "grad_norm": 1.4999788999557495, + "learning_rate": 1.5985292352841074e-06, + "loss": 1.372, + "step": 15153 + }, + { + "epoch": 0.8285062121564178, + "grad_norm": 2.2079789638519287, + "learning_rate": 1.5975382925547966e-06, + "loss": 1.5851, + "step": 15154 + }, + { + "epoch": 0.8285608846001394, + "grad_norm": 1.3467016220092773, + "learning_rate": 1.5965476304034023e-06, + "loss": 1.3314, + "step": 15155 + }, + { + "epoch": 0.8286155570438609, + "grad_norm": 1.568280577659607, + "learning_rate": 1.5955572488630012e-06, + "loss": 1.4983, + "step": 15156 + }, + { + "epoch": 0.8286702294875825, + "grad_norm": 1.6478571891784668, + "learning_rate": 1.5945671479666625e-06, + "loss": 1.4104, + "step": 15157 + }, + { + "epoch": 0.8287249019313041, + "grad_norm": 1.5934844017028809, + "learning_rate": 1.593577327747453e-06, + "loss": 1.419, + "step": 15158 + }, + { + "epoch": 0.8287795743750256, + "grad_norm": 1.6795027256011963, + "learning_rate": 1.5925877882384232e-06, + "loss": 1.3375, + "step": 15159 + }, + { + "epoch": 0.8288342468187472, + "grad_norm": 1.5035834312438965, + "learning_rate": 1.5915985294726156e-06, + "loss": 1.6332, + "step": 15160 + }, + { + "epoch": 0.8288889192624688, + "grad_norm": 2.080901622772217, + "learning_rate": 1.5906095514830645e-06, + "loss": 1.4027, + "step": 15161 + }, + { + "epoch": 0.8289435917061903, + "grad_norm": 1.6823252439498901, + "learning_rate": 1.5896208543027912e-06, + "loss": 1.5327, + "step": 15162 + }, + { + "epoch": 0.8289982641499118, + "grad_norm": 1.6349217891693115, + "learning_rate": 1.5886324379648156e-06, + "loss": 1.4185, + "step": 15163 + }, + { + "epoch": 0.8290529365936334, + "grad_norm": 1.6393370628356934, + "learning_rate": 1.5876443025021404e-06, + "loss": 1.4519, + "step": 15164 + }, + { + "epoch": 0.8291076090373549, + "grad_norm": 1.4633597135543823, + "learning_rate": 1.5866564479477599e-06, + "loss": 1.6652, + "step": 15165 + }, + { + "epoch": 0.8291622814810765, + "grad_norm": 1.2953208684921265, + "learning_rate": 1.585668874334665e-06, + "loss": 1.3423, + "step": 15166 + }, + { + "epoch": 0.8292169539247981, + "grad_norm": 1.493628978729248, + "learning_rate": 1.5846815816958317e-06, + "loss": 1.3376, + "step": 15167 + }, + { + "epoch": 0.8292716263685196, + "grad_norm": 1.2976291179656982, + "learning_rate": 1.5836945700642248e-06, + "loss": 1.4136, + "step": 15168 + }, + { + "epoch": 0.8293262988122412, + "grad_norm": 1.5587435960769653, + "learning_rate": 1.582707839472809e-06, + "loss": 1.5893, + "step": 15169 + }, + { + "epoch": 0.8293809712559627, + "grad_norm": 1.3175954818725586, + "learning_rate": 1.5817213899545293e-06, + "loss": 1.6317, + "step": 15170 + }, + { + "epoch": 0.8294356436996843, + "grad_norm": 1.4370871782302856, + "learning_rate": 1.5807352215423278e-06, + "loss": 1.3574, + "step": 15171 + }, + { + "epoch": 0.8294903161434058, + "grad_norm": 1.3357270956039429, + "learning_rate": 1.5797493342691328e-06, + "loss": 1.4925, + "step": 15172 + }, + { + "epoch": 0.8295449885871273, + "grad_norm": 1.7810310125350952, + "learning_rate": 1.5787637281678637e-06, + "loss": 1.3629, + "step": 15173 + }, + { + "epoch": 0.8295996610308489, + "grad_norm": 2.1608033180236816, + "learning_rate": 1.577778403271437e-06, + "loss": 1.7435, + "step": 15174 + }, + { + "epoch": 0.8296543334745705, + "grad_norm": 1.5578080415725708, + "learning_rate": 1.5767933596127528e-06, + "loss": 1.5387, + "step": 15175 + }, + { + "epoch": 0.829709005918292, + "grad_norm": 1.3713562488555908, + "learning_rate": 1.5758085972247017e-06, + "loss": 1.298, + "step": 15176 + }, + { + "epoch": 0.8297636783620136, + "grad_norm": 1.5248374938964844, + "learning_rate": 1.5748241161401723e-06, + "loss": 1.4603, + "step": 15177 + }, + { + "epoch": 0.8298183508057352, + "grad_norm": 1.6701381206512451, + "learning_rate": 1.5738399163920359e-06, + "loss": 1.4462, + "step": 15178 + }, + { + "epoch": 0.8298730232494567, + "grad_norm": 1.4089720249176025, + "learning_rate": 1.5728559980131553e-06, + "loss": 1.3761, + "step": 15179 + }, + { + "epoch": 0.8299276956931783, + "grad_norm": 1.9865269660949707, + "learning_rate": 1.5718723610363895e-06, + "loss": 1.398, + "step": 15180 + }, + { + "epoch": 0.8299823681368999, + "grad_norm": 1.8689017295837402, + "learning_rate": 1.5708890054945824e-06, + "loss": 1.2091, + "step": 15181 + }, + { + "epoch": 0.8300370405806213, + "grad_norm": 1.6482857465744019, + "learning_rate": 1.56990593142057e-06, + "loss": 1.7091, + "step": 15182 + }, + { + "epoch": 0.8300917130243429, + "grad_norm": 1.1843464374542236, + "learning_rate": 1.5689231388471816e-06, + "loss": 1.6965, + "step": 15183 + }, + { + "epoch": 0.8301463854680644, + "grad_norm": 1.5305602550506592, + "learning_rate": 1.567940627807234e-06, + "loss": 1.3851, + "step": 15184 + }, + { + "epoch": 0.830201057911786, + "grad_norm": 2.4809398651123047, + "learning_rate": 1.5669583983335356e-06, + "loss": 1.2291, + "step": 15185 + }, + { + "epoch": 0.8302557303555076, + "grad_norm": 1.5717768669128418, + "learning_rate": 1.5659764504588848e-06, + "loss": 1.366, + "step": 15186 + }, + { + "epoch": 0.8303104027992291, + "grad_norm": 1.46058988571167, + "learning_rate": 1.5649947842160683e-06, + "loss": 1.2665, + "step": 15187 + }, + { + "epoch": 0.8303650752429507, + "grad_norm": 2.177565336227417, + "learning_rate": 1.5640133996378725e-06, + "loss": 1.3458, + "step": 15188 + }, + { + "epoch": 0.8304197476866723, + "grad_norm": 1.6354999542236328, + "learning_rate": 1.5630322967570655e-06, + "loss": 1.2551, + "step": 15189 + }, + { + "epoch": 0.8304744201303937, + "grad_norm": 1.6595960855484009, + "learning_rate": 1.5620514756064043e-06, + "loss": 1.3839, + "step": 15190 + }, + { + "epoch": 0.8305290925741153, + "grad_norm": 1.3167881965637207, + "learning_rate": 1.5610709362186482e-06, + "loss": 1.7404, + "step": 15191 + }, + { + "epoch": 0.8305837650178369, + "grad_norm": 2.860194683074951, + "learning_rate": 1.5600906786265358e-06, + "loss": 1.1534, + "step": 15192 + }, + { + "epoch": 0.8306384374615584, + "grad_norm": 1.1713967323303223, + "learning_rate": 1.5591107028627972e-06, + "loss": 1.5857, + "step": 15193 + }, + { + "epoch": 0.83069310990528, + "grad_norm": 1.3966593742370605, + "learning_rate": 1.558131008960163e-06, + "loss": 1.7441, + "step": 15194 + }, + { + "epoch": 0.8307477823490016, + "grad_norm": 1.4335949420928955, + "learning_rate": 1.5571515969513428e-06, + "loss": 1.5127, + "step": 15195 + }, + { + "epoch": 0.8308024547927231, + "grad_norm": 1.3831952810287476, + "learning_rate": 1.5561724668690436e-06, + "loss": 1.4703, + "step": 15196 + }, + { + "epoch": 0.8308571272364447, + "grad_norm": 2.220785140991211, + "learning_rate": 1.5551936187459594e-06, + "loss": 1.6629, + "step": 15197 + }, + { + "epoch": 0.8309117996801662, + "grad_norm": 1.5343972444534302, + "learning_rate": 1.554215052614776e-06, + "loss": 1.409, + "step": 15198 + }, + { + "epoch": 0.8309664721238877, + "grad_norm": 1.4969348907470703, + "learning_rate": 1.5532367685081685e-06, + "loss": 1.5576, + "step": 15199 + }, + { + "epoch": 0.8310211445676093, + "grad_norm": 1.3525309562683105, + "learning_rate": 1.5522587664588097e-06, + "loss": 1.4785, + "step": 15200 + }, + { + "epoch": 0.8310758170113308, + "grad_norm": 1.380537986755371, + "learning_rate": 1.55128104649935e-06, + "loss": 1.6146, + "step": 15201 + }, + { + "epoch": 0.8311304894550524, + "grad_norm": 1.6148724555969238, + "learning_rate": 1.5503036086624456e-06, + "loss": 1.4199, + "step": 15202 + }, + { + "epoch": 0.831185161898774, + "grad_norm": 1.4784647226333618, + "learning_rate": 1.5493264529807305e-06, + "loss": 1.4846, + "step": 15203 + }, + { + "epoch": 0.8312398343424955, + "grad_norm": 1.6161892414093018, + "learning_rate": 1.548349579486833e-06, + "loss": 1.4423, + "step": 15204 + }, + { + "epoch": 0.8312945067862171, + "grad_norm": 1.418710708618164, + "learning_rate": 1.547372988213378e-06, + "loss": 1.5168, + "step": 15205 + }, + { + "epoch": 0.8313491792299387, + "grad_norm": 1.707288384437561, + "learning_rate": 1.546396679192974e-06, + "loss": 1.4055, + "step": 15206 + }, + { + "epoch": 0.8314038516736602, + "grad_norm": 1.3763620853424072, + "learning_rate": 1.5454206524582194e-06, + "loss": 1.5806, + "step": 15207 + }, + { + "epoch": 0.8314585241173817, + "grad_norm": 1.7886693477630615, + "learning_rate": 1.54444490804171e-06, + "loss": 1.6705, + "step": 15208 + }, + { + "epoch": 0.8315131965611033, + "grad_norm": 1.4710328578948975, + "learning_rate": 1.543469445976028e-06, + "loss": 1.6174, + "step": 15209 + }, + { + "epoch": 0.8315678690048248, + "grad_norm": 1.4798424243927002, + "learning_rate": 1.5424942662937436e-06, + "loss": 1.5581, + "step": 15210 + }, + { + "epoch": 0.8316225414485464, + "grad_norm": 1.6751515865325928, + "learning_rate": 1.5415193690274234e-06, + "loss": 1.6025, + "step": 15211 + }, + { + "epoch": 0.8316772138922679, + "grad_norm": 1.5955686569213867, + "learning_rate": 1.5405447542096187e-06, + "loss": 1.3083, + "step": 15212 + }, + { + "epoch": 0.8317318863359895, + "grad_norm": 1.5081983804702759, + "learning_rate": 1.5395704218728736e-06, + "loss": 1.2223, + "step": 15213 + }, + { + "epoch": 0.8317865587797111, + "grad_norm": 1.4008420705795288, + "learning_rate": 1.5385963720497278e-06, + "loss": 1.7046, + "step": 15214 + }, + { + "epoch": 0.8318412312234326, + "grad_norm": 1.4035598039627075, + "learning_rate": 1.5376226047727005e-06, + "loss": 1.3631, + "step": 15215 + }, + { + "epoch": 0.8318959036671542, + "grad_norm": 1.674720287322998, + "learning_rate": 1.536649120074316e-06, + "loss": 1.2312, + "step": 15216 + }, + { + "epoch": 0.8319505761108758, + "grad_norm": 1.5501911640167236, + "learning_rate": 1.5356759179870762e-06, + "loss": 1.2828, + "step": 15217 + }, + { + "epoch": 0.8320052485545972, + "grad_norm": 1.3776376247406006, + "learning_rate": 1.5347029985434781e-06, + "loss": 1.2145, + "step": 15218 + }, + { + "epoch": 0.8320599209983188, + "grad_norm": 2.5089948177337646, + "learning_rate": 1.5337303617760136e-06, + "loss": 1.5329, + "step": 15219 + }, + { + "epoch": 0.8321145934420404, + "grad_norm": 1.6784144639968872, + "learning_rate": 1.5327580077171589e-06, + "loss": 1.3654, + "step": 15220 + }, + { + "epoch": 0.8321692658857619, + "grad_norm": 1.3105652332305908, + "learning_rate": 1.5317859363993814e-06, + "loss": 1.5401, + "step": 15221 + }, + { + "epoch": 0.8322239383294835, + "grad_norm": 1.503839373588562, + "learning_rate": 1.5308141478551441e-06, + "loss": 1.2781, + "step": 15222 + }, + { + "epoch": 0.8322786107732051, + "grad_norm": 1.2809348106384277, + "learning_rate": 1.5298426421168965e-06, + "loss": 1.4808, + "step": 15223 + }, + { + "epoch": 0.8323332832169266, + "grad_norm": 1.518485426902771, + "learning_rate": 1.5288714192170796e-06, + "loss": 1.4905, + "step": 15224 + }, + { + "epoch": 0.8323879556606482, + "grad_norm": 1.508816123008728, + "learning_rate": 1.5279004791881236e-06, + "loss": 1.5647, + "step": 15225 + }, + { + "epoch": 0.8324426281043696, + "grad_norm": 1.797987937927246, + "learning_rate": 1.5269298220624506e-06, + "loss": 1.38, + "step": 15226 + }, + { + "epoch": 0.8324973005480912, + "grad_norm": 1.6315001249313354, + "learning_rate": 1.5259594478724715e-06, + "loss": 1.3595, + "step": 15227 + }, + { + "epoch": 0.8325519729918128, + "grad_norm": 1.7838982343673706, + "learning_rate": 1.5249893566505935e-06, + "loss": 1.2308, + "step": 15228 + }, + { + "epoch": 0.8326066454355343, + "grad_norm": 1.3538094758987427, + "learning_rate": 1.5240195484292087e-06, + "loss": 1.4227, + "step": 15229 + }, + { + "epoch": 0.8326613178792559, + "grad_norm": 1.5498855113983154, + "learning_rate": 1.5230500232406975e-06, + "loss": 1.3968, + "step": 15230 + }, + { + "epoch": 0.8327159903229775, + "grad_norm": 1.3196355104446411, + "learning_rate": 1.522080781117441e-06, + "loss": 1.481, + "step": 15231 + }, + { + "epoch": 0.832770662766699, + "grad_norm": 1.7223845720291138, + "learning_rate": 1.5211118220917987e-06, + "loss": 1.3309, + "step": 15232 + }, + { + "epoch": 0.8328253352104206, + "grad_norm": 1.6443246603012085, + "learning_rate": 1.5201431461961314e-06, + "loss": 1.5952, + "step": 15233 + }, + { + "epoch": 0.8328800076541422, + "grad_norm": 1.3171746730804443, + "learning_rate": 1.5191747534627822e-06, + "loss": 1.5618, + "step": 15234 + }, + { + "epoch": 0.8329346800978636, + "grad_norm": 1.4003300666809082, + "learning_rate": 1.5182066439240894e-06, + "loss": 1.3025, + "step": 15235 + }, + { + "epoch": 0.8329893525415852, + "grad_norm": 1.4477730989456177, + "learning_rate": 1.5172388176123808e-06, + "loss": 1.2234, + "step": 15236 + }, + { + "epoch": 0.8330440249853068, + "grad_norm": 1.6001160144805908, + "learning_rate": 1.5162712745599728e-06, + "loss": 1.3228, + "step": 15237 + }, + { + "epoch": 0.8330986974290283, + "grad_norm": 1.839746117591858, + "learning_rate": 1.5153040147991716e-06, + "loss": 1.4113, + "step": 15238 + }, + { + "epoch": 0.8331533698727499, + "grad_norm": 1.518980860710144, + "learning_rate": 1.5143370383622825e-06, + "loss": 1.7337, + "step": 15239 + }, + { + "epoch": 0.8332080423164714, + "grad_norm": 1.6592191457748413, + "learning_rate": 1.5133703452815917e-06, + "loss": 1.5291, + "step": 15240 + }, + { + "epoch": 0.833262714760193, + "grad_norm": 1.9222633838653564, + "learning_rate": 1.512403935589377e-06, + "loss": 1.3472, + "step": 15241 + }, + { + "epoch": 0.8333173872039146, + "grad_norm": 1.9910755157470703, + "learning_rate": 1.5114378093179148e-06, + "loss": 1.2644, + "step": 15242 + }, + { + "epoch": 0.833372059647636, + "grad_norm": 1.6203563213348389, + "learning_rate": 1.5104719664994626e-06, + "loss": 1.3937, + "step": 15243 + }, + { + "epoch": 0.8334267320913576, + "grad_norm": 1.8750330209732056, + "learning_rate": 1.5095064071662702e-06, + "loss": 1.5275, + "step": 15244 + }, + { + "epoch": 0.8334814045350792, + "grad_norm": 2.146573066711426, + "learning_rate": 1.5085411313505849e-06, + "loss": 1.5492, + "step": 15245 + }, + { + "epoch": 0.8335360769788007, + "grad_norm": 1.8317762613296509, + "learning_rate": 1.5075761390846344e-06, + "loss": 1.5876, + "step": 15246 + }, + { + "epoch": 0.8335907494225223, + "grad_norm": 1.853904128074646, + "learning_rate": 1.5066114304006473e-06, + "loss": 1.4483, + "step": 15247 + }, + { + "epoch": 0.8336454218662439, + "grad_norm": 1.2407617568969727, + "learning_rate": 1.5056470053308358e-06, + "loss": 1.4809, + "step": 15248 + }, + { + "epoch": 0.8337000943099654, + "grad_norm": 1.5922017097473145, + "learning_rate": 1.5046828639074028e-06, + "loss": 1.6813, + "step": 15249 + }, + { + "epoch": 0.833754766753687, + "grad_norm": 1.352382779121399, + "learning_rate": 1.5037190061625429e-06, + "loss": 1.3278, + "step": 15250 + }, + { + "epoch": 0.8338094391974086, + "grad_norm": 1.881347417831421, + "learning_rate": 1.5027554321284442e-06, + "loss": 1.476, + "step": 15251 + }, + { + "epoch": 0.8338641116411301, + "grad_norm": 1.4178638458251953, + "learning_rate": 1.5017921418372772e-06, + "loss": 1.5386, + "step": 15252 + }, + { + "epoch": 0.8339187840848516, + "grad_norm": 1.5391473770141602, + "learning_rate": 1.5008291353212157e-06, + "loss": 1.4453, + "step": 15253 + }, + { + "epoch": 0.8339734565285731, + "grad_norm": 1.5325191020965576, + "learning_rate": 1.499866412612413e-06, + "loss": 1.4623, + "step": 15254 + }, + { + "epoch": 0.8340281289722947, + "grad_norm": 1.6219072341918945, + "learning_rate": 1.4989039737430144e-06, + "loss": 1.6806, + "step": 15255 + }, + { + "epoch": 0.8340828014160163, + "grad_norm": 1.4444637298583984, + "learning_rate": 1.4979418187451632e-06, + "loss": 1.2448, + "step": 15256 + }, + { + "epoch": 0.8341374738597378, + "grad_norm": 2.11356782913208, + "learning_rate": 1.496979947650985e-06, + "loss": 1.4754, + "step": 15257 + }, + { + "epoch": 0.8341921463034594, + "grad_norm": 1.4754608869552612, + "learning_rate": 1.4960183604925972e-06, + "loss": 1.3613, + "step": 15258 + }, + { + "epoch": 0.834246818747181, + "grad_norm": 1.6424099206924438, + "learning_rate": 1.4950570573021138e-06, + "loss": 1.3016, + "step": 15259 + }, + { + "epoch": 0.8343014911909025, + "grad_norm": 1.5245327949523926, + "learning_rate": 1.4940960381116299e-06, + "loss": 1.2719, + "step": 15260 + }, + { + "epoch": 0.8343561636346241, + "grad_norm": 1.5008918046951294, + "learning_rate": 1.4931353029532425e-06, + "loss": 1.7796, + "step": 15261 + }, + { + "epoch": 0.8344108360783457, + "grad_norm": 1.5313152074813843, + "learning_rate": 1.4921748518590284e-06, + "loss": 1.8102, + "step": 15262 + }, + { + "epoch": 0.8344655085220671, + "grad_norm": 1.6329272985458374, + "learning_rate": 1.491214684861061e-06, + "loss": 1.2695, + "step": 15263 + }, + { + "epoch": 0.8345201809657887, + "grad_norm": 1.7582141160964966, + "learning_rate": 1.490254801991401e-06, + "loss": 1.4069, + "step": 15264 + }, + { + "epoch": 0.8345748534095103, + "grad_norm": 1.71860671043396, + "learning_rate": 1.4892952032821017e-06, + "loss": 1.5504, + "step": 15265 + }, + { + "epoch": 0.8346295258532318, + "grad_norm": 1.5052199363708496, + "learning_rate": 1.4883358887652044e-06, + "loss": 1.3004, + "step": 15266 + }, + { + "epoch": 0.8346841982969534, + "grad_norm": 1.3758307695388794, + "learning_rate": 1.4873768584727478e-06, + "loss": 1.402, + "step": 15267 + }, + { + "epoch": 0.8347388707406749, + "grad_norm": 1.6471195220947266, + "learning_rate": 1.4864181124367538e-06, + "loss": 1.5161, + "step": 15268 + }, + { + "epoch": 0.8347935431843965, + "grad_norm": 1.5332832336425781, + "learning_rate": 1.4854596506892338e-06, + "loss": 1.2345, + "step": 15269 + }, + { + "epoch": 0.8348482156281181, + "grad_norm": 2.79559063911438, + "learning_rate": 1.4845014732621987e-06, + "loss": 1.4934, + "step": 15270 + }, + { + "epoch": 0.8349028880718395, + "grad_norm": 1.2980937957763672, + "learning_rate": 1.4835435801876409e-06, + "loss": 1.2415, + "step": 15271 + }, + { + "epoch": 0.8349575605155611, + "grad_norm": 1.6046054363250732, + "learning_rate": 1.4825859714975455e-06, + "loss": 1.562, + "step": 15272 + }, + { + "epoch": 0.8350122329592827, + "grad_norm": 1.2096692323684692, + "learning_rate": 1.4816286472238939e-06, + "loss": 1.6126, + "step": 15273 + }, + { + "epoch": 0.8350669054030042, + "grad_norm": 1.6109263896942139, + "learning_rate": 1.4806716073986504e-06, + "loss": 1.5967, + "step": 15274 + }, + { + "epoch": 0.8351215778467258, + "grad_norm": 1.4868780374526978, + "learning_rate": 1.479714852053774e-06, + "loss": 1.3888, + "step": 15275 + }, + { + "epoch": 0.8351762502904474, + "grad_norm": 2.0433390140533447, + "learning_rate": 1.4787583812212114e-06, + "loss": 1.0543, + "step": 15276 + }, + { + "epoch": 0.8352309227341689, + "grad_norm": 1.6034839153289795, + "learning_rate": 1.4778021949329003e-06, + "loss": 1.3848, + "step": 15277 + }, + { + "epoch": 0.8352855951778905, + "grad_norm": 1.5705119371414185, + "learning_rate": 1.4768462932207727e-06, + "loss": 1.322, + "step": 15278 + }, + { + "epoch": 0.8353402676216121, + "grad_norm": 1.39812433719635, + "learning_rate": 1.475890676116749e-06, + "loss": 1.4479, + "step": 15279 + }, + { + "epoch": 0.8353949400653335, + "grad_norm": 1.4927654266357422, + "learning_rate": 1.474935343652736e-06, + "loss": 1.2787, + "step": 15280 + }, + { + "epoch": 0.8354496125090551, + "grad_norm": 1.4489890336990356, + "learning_rate": 1.4739802958606386e-06, + "loss": 1.5229, + "step": 15281 + }, + { + "epoch": 0.8355042849527766, + "grad_norm": 1.5225435495376587, + "learning_rate": 1.4730255327723452e-06, + "loss": 1.4839, + "step": 15282 + }, + { + "epoch": 0.8355589573964982, + "grad_norm": 1.3128398656845093, + "learning_rate": 1.4720710544197369e-06, + "loss": 1.2533, + "step": 15283 + }, + { + "epoch": 0.8356136298402198, + "grad_norm": 1.358522891998291, + "learning_rate": 1.4711168608346893e-06, + "loss": 1.4119, + "step": 15284 + }, + { + "epoch": 0.8356683022839413, + "grad_norm": 1.8173608779907227, + "learning_rate": 1.4701629520490646e-06, + "loss": 1.5759, + "step": 15285 + }, + { + "epoch": 0.8357229747276629, + "grad_norm": 1.6098852157592773, + "learning_rate": 1.4692093280947106e-06, + "loss": 1.4373, + "step": 15286 + }, + { + "epoch": 0.8357776471713845, + "grad_norm": 1.5028436183929443, + "learning_rate": 1.4682559890034787e-06, + "loss": 1.625, + "step": 15287 + }, + { + "epoch": 0.835832319615106, + "grad_norm": 1.3215696811676025, + "learning_rate": 1.4673029348072e-06, + "loss": 1.2847, + "step": 15288 + }, + { + "epoch": 0.8358869920588275, + "grad_norm": 1.6949408054351807, + "learning_rate": 1.4663501655376989e-06, + "loss": 1.442, + "step": 15289 + }, + { + "epoch": 0.8359416645025491, + "grad_norm": 1.7722795009613037, + "learning_rate": 1.46539768122679e-06, + "loss": 1.3067, + "step": 15290 + }, + { + "epoch": 0.8359963369462706, + "grad_norm": 2.5917739868164062, + "learning_rate": 1.464445481906277e-06, + "loss": 1.6056, + "step": 15291 + }, + { + "epoch": 0.8360510093899922, + "grad_norm": 1.4673038721084595, + "learning_rate": 1.463493567607962e-06, + "loss": 1.6365, + "step": 15292 + }, + { + "epoch": 0.8361056818337138, + "grad_norm": 1.4120635986328125, + "learning_rate": 1.4625419383636275e-06, + "loss": 1.4289, + "step": 15293 + }, + { + "epoch": 0.8361603542774353, + "grad_norm": 1.6833745241165161, + "learning_rate": 1.461590594205049e-06, + "loss": 1.4413, + "step": 15294 + }, + { + "epoch": 0.8362150267211569, + "grad_norm": 1.4674286842346191, + "learning_rate": 1.4606395351640002e-06, + "loss": 1.3399, + "step": 15295 + }, + { + "epoch": 0.8362696991648785, + "grad_norm": 1.7212592363357544, + "learning_rate": 1.4596887612722345e-06, + "loss": 1.5699, + "step": 15296 + }, + { + "epoch": 0.8363243716086, + "grad_norm": 1.3657089471817017, + "learning_rate": 1.4587382725614997e-06, + "loss": 1.1585, + "step": 15297 + }, + { + "epoch": 0.8363790440523216, + "grad_norm": 1.446578025817871, + "learning_rate": 1.4577880690635381e-06, + "loss": 1.3173, + "step": 15298 + }, + { + "epoch": 0.836433716496043, + "grad_norm": 1.3094632625579834, + "learning_rate": 1.4568381508100782e-06, + "loss": 1.3417, + "step": 15299 + }, + { + "epoch": 0.8364883889397646, + "grad_norm": 1.5206876993179321, + "learning_rate": 1.4558885178328374e-06, + "loss": 1.681, + "step": 15300 + }, + { + "epoch": 0.8365430613834862, + "grad_norm": 1.786439299583435, + "learning_rate": 1.4549391701635308e-06, + "loss": 1.5843, + "step": 15301 + }, + { + "epoch": 0.8365977338272077, + "grad_norm": 1.5977973937988281, + "learning_rate": 1.453990107833857e-06, + "loss": 1.4759, + "step": 15302 + }, + { + "epoch": 0.8366524062709293, + "grad_norm": 1.7191731929779053, + "learning_rate": 1.4530413308755075e-06, + "loss": 1.4722, + "step": 15303 + }, + { + "epoch": 0.8367070787146509, + "grad_norm": 3.1742048263549805, + "learning_rate": 1.4520928393201638e-06, + "loss": 1.3831, + "step": 15304 + }, + { + "epoch": 0.8367617511583724, + "grad_norm": 1.5665639638900757, + "learning_rate": 1.4511446331994961e-06, + "loss": 1.2149, + "step": 15305 + }, + { + "epoch": 0.836816423602094, + "grad_norm": 1.3328962326049805, + "learning_rate": 1.450196712545172e-06, + "loss": 1.4511, + "step": 15306 + }, + { + "epoch": 0.8368710960458156, + "grad_norm": 1.5500342845916748, + "learning_rate": 1.4492490773888424e-06, + "loss": 1.5227, + "step": 15307 + }, + { + "epoch": 0.836925768489537, + "grad_norm": 1.5542974472045898, + "learning_rate": 1.4483017277621482e-06, + "loss": 1.3495, + "step": 15308 + }, + { + "epoch": 0.8369804409332586, + "grad_norm": 1.3989355564117432, + "learning_rate": 1.4473546636967296e-06, + "loss": 1.3994, + "step": 15309 + }, + { + "epoch": 0.8370351133769802, + "grad_norm": 1.3670185804367065, + "learning_rate": 1.446407885224208e-06, + "loss": 1.4713, + "step": 15310 + }, + { + "epoch": 0.8370897858207017, + "grad_norm": 1.5787461996078491, + "learning_rate": 1.4454613923761962e-06, + "loss": 1.2512, + "step": 15311 + }, + { + "epoch": 0.8371444582644233, + "grad_norm": 1.1310006380081177, + "learning_rate": 1.4445151851843042e-06, + "loss": 1.5449, + "step": 15312 + }, + { + "epoch": 0.8371991307081448, + "grad_norm": 2.056694269180298, + "learning_rate": 1.4435692636801268e-06, + "loss": 1.4534, + "step": 15313 + }, + { + "epoch": 0.8372538031518664, + "grad_norm": 1.2563364505767822, + "learning_rate": 1.442623627895251e-06, + "loss": 1.3906, + "step": 15314 + }, + { + "epoch": 0.837308475595588, + "grad_norm": 1.6740074157714844, + "learning_rate": 1.4416782778612514e-06, + "loss": 1.504, + "step": 15315 + }, + { + "epoch": 0.8373631480393094, + "grad_norm": 1.8180325031280518, + "learning_rate": 1.4407332136096953e-06, + "loss": 1.4458, + "step": 15316 + }, + { + "epoch": 0.837417820483031, + "grad_norm": 1.8913617134094238, + "learning_rate": 1.4397884351721436e-06, + "loss": 1.3817, + "step": 15317 + }, + { + "epoch": 0.8374724929267526, + "grad_norm": 1.8220903873443604, + "learning_rate": 1.4388439425801437e-06, + "loss": 1.5619, + "step": 15318 + }, + { + "epoch": 0.8375271653704741, + "grad_norm": 1.4344850778579712, + "learning_rate": 1.4378997358652313e-06, + "loss": 1.5278, + "step": 15319 + }, + { + "epoch": 0.8375818378141957, + "grad_norm": 1.3694813251495361, + "learning_rate": 1.4369558150589413e-06, + "loss": 1.5148, + "step": 15320 + }, + { + "epoch": 0.8376365102579173, + "grad_norm": 1.47625732421875, + "learning_rate": 1.4360121801927907e-06, + "loss": 1.3067, + "step": 15321 + }, + { + "epoch": 0.8376911827016388, + "grad_norm": 1.4772130250930786, + "learning_rate": 1.4350688312982864e-06, + "loss": 1.4405, + "step": 15322 + }, + { + "epoch": 0.8377458551453604, + "grad_norm": 1.2570945024490356, + "learning_rate": 1.4341257684069344e-06, + "loss": 1.5492, + "step": 15323 + }, + { + "epoch": 0.837800527589082, + "grad_norm": 1.7834081649780273, + "learning_rate": 1.4331829915502226e-06, + "loss": 0.9911, + "step": 15324 + }, + { + "epoch": 0.8378552000328034, + "grad_norm": 1.98884117603302, + "learning_rate": 1.4322405007596329e-06, + "loss": 1.5223, + "step": 15325 + }, + { + "epoch": 0.837909872476525, + "grad_norm": 1.6320793628692627, + "learning_rate": 1.4312982960666388e-06, + "loss": 1.3141, + "step": 15326 + }, + { + "epoch": 0.8379645449202465, + "grad_norm": 1.7525986433029175, + "learning_rate": 1.430356377502702e-06, + "loss": 1.4731, + "step": 15327 + }, + { + "epoch": 0.8380192173639681, + "grad_norm": 2.0500288009643555, + "learning_rate": 1.4294147450992757e-06, + "loss": 1.0432, + "step": 15328 + }, + { + "epoch": 0.8380738898076897, + "grad_norm": 1.3399150371551514, + "learning_rate": 1.428473398887802e-06, + "loss": 1.5662, + "step": 15329 + }, + { + "epoch": 0.8381285622514112, + "grad_norm": 1.7097599506378174, + "learning_rate": 1.427532338899712e-06, + "loss": 1.2736, + "step": 15330 + }, + { + "epoch": 0.8381832346951328, + "grad_norm": 1.4258004426956177, + "learning_rate": 1.4265915651664363e-06, + "loss": 1.5483, + "step": 15331 + }, + { + "epoch": 0.8382379071388544, + "grad_norm": 1.6467112302780151, + "learning_rate": 1.4256510777193866e-06, + "loss": 1.5633, + "step": 15332 + }, + { + "epoch": 0.8382925795825759, + "grad_norm": 1.6814625263214111, + "learning_rate": 1.4247108765899654e-06, + "loss": 1.3171, + "step": 15333 + }, + { + "epoch": 0.8383472520262975, + "grad_norm": 1.9230290651321411, + "learning_rate": 1.423770961809573e-06, + "loss": 1.5313, + "step": 15334 + }, + { + "epoch": 0.838401924470019, + "grad_norm": 1.5330348014831543, + "learning_rate": 1.4228313334095923e-06, + "loss": 1.5285, + "step": 15335 + }, + { + "epoch": 0.8384565969137405, + "grad_norm": 1.6748852729797363, + "learning_rate": 1.421891991421399e-06, + "loss": 1.584, + "step": 15336 + }, + { + "epoch": 0.8385112693574621, + "grad_norm": 1.2957983016967773, + "learning_rate": 1.420952935876363e-06, + "loss": 1.5792, + "step": 15337 + }, + { + "epoch": 0.8385659418011837, + "grad_norm": 1.6778615713119507, + "learning_rate": 1.4200141668058397e-06, + "loss": 1.5302, + "step": 15338 + }, + { + "epoch": 0.8386206142449052, + "grad_norm": 1.7509264945983887, + "learning_rate": 1.4190756842411746e-06, + "loss": 1.5118, + "step": 15339 + }, + { + "epoch": 0.8386752866886268, + "grad_norm": 1.3470388650894165, + "learning_rate": 1.41813748821371e-06, + "loss": 1.3754, + "step": 15340 + }, + { + "epoch": 0.8387299591323483, + "grad_norm": 1.797145128250122, + "learning_rate": 1.4171995787547732e-06, + "loss": 1.6292, + "step": 15341 + }, + { + "epoch": 0.8387846315760699, + "grad_norm": 2.2062766551971436, + "learning_rate": 1.4162619558956836e-06, + "loss": 1.2156, + "step": 15342 + }, + { + "epoch": 0.8388393040197915, + "grad_norm": 1.5717780590057373, + "learning_rate": 1.4153246196677483e-06, + "loss": 1.4496, + "step": 15343 + }, + { + "epoch": 0.8388939764635129, + "grad_norm": 1.3842365741729736, + "learning_rate": 1.414387570102267e-06, + "loss": 1.495, + "step": 15344 + }, + { + "epoch": 0.8389486489072345, + "grad_norm": 1.3070834875106812, + "learning_rate": 1.413450807230533e-06, + "loss": 1.1546, + "step": 15345 + }, + { + "epoch": 0.8390033213509561, + "grad_norm": 1.3830565214157104, + "learning_rate": 1.4125143310838262e-06, + "loss": 1.4549, + "step": 15346 + }, + { + "epoch": 0.8390579937946776, + "grad_norm": 1.4538743495941162, + "learning_rate": 1.4115781416934148e-06, + "loss": 1.3913, + "step": 15347 + }, + { + "epoch": 0.8391126662383992, + "grad_norm": 1.6406923532485962, + "learning_rate": 1.4106422390905649e-06, + "loss": 1.2645, + "step": 15348 + }, + { + "epoch": 0.8391673386821208, + "grad_norm": 1.4740879535675049, + "learning_rate": 1.409706623306526e-06, + "loss": 1.4409, + "step": 15349 + }, + { + "epoch": 0.8392220111258423, + "grad_norm": 1.3470404148101807, + "learning_rate": 1.4087712943725384e-06, + "loss": 1.1578, + "step": 15350 + }, + { + "epoch": 0.8392766835695639, + "grad_norm": 1.3127374649047852, + "learning_rate": 1.4078362523198385e-06, + "loss": 1.6121, + "step": 15351 + }, + { + "epoch": 0.8393313560132855, + "grad_norm": 1.241742491722107, + "learning_rate": 1.4069014971796502e-06, + "loss": 1.502, + "step": 15352 + }, + { + "epoch": 0.8393860284570069, + "grad_norm": 1.3264566659927368, + "learning_rate": 1.405967028983184e-06, + "loss": 1.6283, + "step": 15353 + }, + { + "epoch": 0.8394407009007285, + "grad_norm": 1.7325495481491089, + "learning_rate": 1.405032847761646e-06, + "loss": 1.5451, + "step": 15354 + }, + { + "epoch": 0.83949537334445, + "grad_norm": 1.4662142992019653, + "learning_rate": 1.404098953546229e-06, + "loss": 1.3224, + "step": 15355 + }, + { + "epoch": 0.8395500457881716, + "grad_norm": 1.3656649589538574, + "learning_rate": 1.4031653463681172e-06, + "loss": 1.4909, + "step": 15356 + }, + { + "epoch": 0.8396047182318932, + "grad_norm": 1.4850263595581055, + "learning_rate": 1.40223202625849e-06, + "loss": 1.4034, + "step": 15357 + }, + { + "epoch": 0.8396593906756147, + "grad_norm": 1.5672276020050049, + "learning_rate": 1.4012989932485077e-06, + "loss": 1.5259, + "step": 15358 + }, + { + "epoch": 0.8397140631193363, + "grad_norm": 1.4867464303970337, + "learning_rate": 1.4003662473693324e-06, + "loss": 1.6364, + "step": 15359 + }, + { + "epoch": 0.8397687355630579, + "grad_norm": 1.205033540725708, + "learning_rate": 1.399433788652107e-06, + "loss": 1.3776, + "step": 15360 + }, + { + "epoch": 0.8398234080067793, + "grad_norm": 1.7294020652770996, + "learning_rate": 1.3985016171279675e-06, + "loss": 1.2438, + "step": 15361 + }, + { + "epoch": 0.8398780804505009, + "grad_norm": 1.3582994937896729, + "learning_rate": 1.3975697328280457e-06, + "loss": 1.3502, + "step": 15362 + }, + { + "epoch": 0.8399327528942225, + "grad_norm": 1.5263947248458862, + "learning_rate": 1.3966381357834568e-06, + "loss": 1.1486, + "step": 15363 + }, + { + "epoch": 0.839987425337944, + "grad_norm": 1.8474940061569214, + "learning_rate": 1.395706826025306e-06, + "loss": 1.3325, + "step": 15364 + }, + { + "epoch": 0.8400420977816656, + "grad_norm": 1.4725948572158813, + "learning_rate": 1.3947758035846981e-06, + "loss": 1.4445, + "step": 15365 + }, + { + "epoch": 0.8400967702253872, + "grad_norm": 1.3063514232635498, + "learning_rate": 1.3938450684927185e-06, + "loss": 1.478, + "step": 15366 + }, + { + "epoch": 0.8401514426691087, + "grad_norm": 1.5640417337417603, + "learning_rate": 1.3929146207804468e-06, + "loss": 1.7445, + "step": 15367 + }, + { + "epoch": 0.8402061151128303, + "grad_norm": 1.2904154062271118, + "learning_rate": 1.3919844604789534e-06, + "loss": 1.5712, + "step": 15368 + }, + { + "epoch": 0.8402607875565518, + "grad_norm": 2.132664203643799, + "learning_rate": 1.3910545876192971e-06, + "loss": 1.5125, + "step": 15369 + }, + { + "epoch": 0.8403154600002734, + "grad_norm": 2.0422017574310303, + "learning_rate": 1.3901250022325286e-06, + "loss": 1.5171, + "step": 15370 + }, + { + "epoch": 0.8403701324439949, + "grad_norm": 1.2375859022140503, + "learning_rate": 1.3891957043496917e-06, + "loss": 1.4222, + "step": 15371 + }, + { + "epoch": 0.8404248048877164, + "grad_norm": 1.5179221630096436, + "learning_rate": 1.3882666940018141e-06, + "loss": 1.3842, + "step": 15372 + }, + { + "epoch": 0.840479477331438, + "grad_norm": 1.5926783084869385, + "learning_rate": 1.387337971219922e-06, + "loss": 1.5889, + "step": 15373 + }, + { + "epoch": 0.8405341497751596, + "grad_norm": 1.5692052841186523, + "learning_rate": 1.3864095360350249e-06, + "loss": 1.671, + "step": 15374 + }, + { + "epoch": 0.8405888222188811, + "grad_norm": 2.332733154296875, + "learning_rate": 1.3854813884781238e-06, + "loss": 1.3429, + "step": 15375 + }, + { + "epoch": 0.8406434946626027, + "grad_norm": 1.4104783535003662, + "learning_rate": 1.384553528580216e-06, + "loss": 1.8308, + "step": 15376 + }, + { + "epoch": 0.8406981671063243, + "grad_norm": 1.3714333772659302, + "learning_rate": 1.3836259563722832e-06, + "loss": 1.5377, + "step": 15377 + }, + { + "epoch": 0.8407528395500458, + "grad_norm": 1.4598166942596436, + "learning_rate": 1.3826986718852952e-06, + "loss": 1.5495, + "step": 15378 + }, + { + "epoch": 0.8408075119937674, + "grad_norm": 1.4965797662734985, + "learning_rate": 1.381771675150223e-06, + "loss": 1.3203, + "step": 15379 + }, + { + "epoch": 0.840862184437489, + "grad_norm": 1.441786766052246, + "learning_rate": 1.3808449661980173e-06, + "loss": 1.3215, + "step": 15380 + }, + { + "epoch": 0.8409168568812104, + "grad_norm": 1.5729334354400635, + "learning_rate": 1.3799185450596243e-06, + "loss": 1.5432, + "step": 15381 + }, + { + "epoch": 0.840971529324932, + "grad_norm": 1.8801285028457642, + "learning_rate": 1.3789924117659782e-06, + "loss": 1.7001, + "step": 15382 + }, + { + "epoch": 0.8410262017686535, + "grad_norm": 1.744778037071228, + "learning_rate": 1.3780665663480052e-06, + "loss": 1.2338, + "step": 15383 + }, + { + "epoch": 0.8410808742123751, + "grad_norm": 1.9751205444335938, + "learning_rate": 1.377141008836619e-06, + "loss": 1.5989, + "step": 15384 + }, + { + "epoch": 0.8411355466560967, + "grad_norm": 1.4108933210372925, + "learning_rate": 1.3762157392627317e-06, + "loss": 1.5166, + "step": 15385 + }, + { + "epoch": 0.8411902190998182, + "grad_norm": 1.4362472295761108, + "learning_rate": 1.375290757657235e-06, + "loss": 1.4612, + "step": 15386 + }, + { + "epoch": 0.8412448915435398, + "grad_norm": 1.4190301895141602, + "learning_rate": 1.3743660640510205e-06, + "loss": 1.5372, + "step": 15387 + }, + { + "epoch": 0.8412995639872614, + "grad_norm": 1.3244074583053589, + "learning_rate": 1.3734416584749633e-06, + "loss": 1.2182, + "step": 15388 + }, + { + "epoch": 0.8413542364309828, + "grad_norm": 1.5000720024108887, + "learning_rate": 1.37251754095993e-06, + "loss": 1.4049, + "step": 15389 + }, + { + "epoch": 0.8414089088747044, + "grad_norm": 2.0340538024902344, + "learning_rate": 1.3715937115367829e-06, + "loss": 1.4445, + "step": 15390 + }, + { + "epoch": 0.841463581318426, + "grad_norm": 1.6905357837677002, + "learning_rate": 1.3706701702363701e-06, + "loss": 1.3706, + "step": 15391 + }, + { + "epoch": 0.8415182537621475, + "grad_norm": 1.513433575630188, + "learning_rate": 1.3697469170895282e-06, + "loss": 1.2257, + "step": 15392 + }, + { + "epoch": 0.8415729262058691, + "grad_norm": 1.606351613998413, + "learning_rate": 1.3688239521270897e-06, + "loss": 1.6504, + "step": 15393 + }, + { + "epoch": 0.8416275986495907, + "grad_norm": 1.5846961736679077, + "learning_rate": 1.3679012753798726e-06, + "loss": 1.3861, + "step": 15394 + }, + { + "epoch": 0.8416822710933122, + "grad_norm": 1.2967584133148193, + "learning_rate": 1.366978886878685e-06, + "loss": 1.4375, + "step": 15395 + }, + { + "epoch": 0.8417369435370338, + "grad_norm": 1.4562664031982422, + "learning_rate": 1.3660567866543328e-06, + "loss": 1.3241, + "step": 15396 + }, + { + "epoch": 0.8417916159807552, + "grad_norm": 1.8297616243362427, + "learning_rate": 1.3651349747376053e-06, + "loss": 1.3544, + "step": 15397 + }, + { + "epoch": 0.8418462884244768, + "grad_norm": 1.512551188468933, + "learning_rate": 1.364213451159281e-06, + "loss": 1.3662, + "step": 15398 + }, + { + "epoch": 0.8419009608681984, + "grad_norm": 1.577147126197815, + "learning_rate": 1.363292215950135e-06, + "loss": 1.475, + "step": 15399 + }, + { + "epoch": 0.8419556333119199, + "grad_norm": 1.543370246887207, + "learning_rate": 1.3623712691409274e-06, + "loss": 1.3866, + "step": 15400 + }, + { + "epoch": 0.8420103057556415, + "grad_norm": 1.4960063695907593, + "learning_rate": 1.3614506107624148e-06, + "loss": 1.6163, + "step": 15401 + }, + { + "epoch": 0.8420649781993631, + "grad_norm": 1.5556985139846802, + "learning_rate": 1.3605302408453359e-06, + "loss": 1.5045, + "step": 15402 + }, + { + "epoch": 0.8421196506430846, + "grad_norm": 1.4662050008773804, + "learning_rate": 1.3596101594204248e-06, + "loss": 1.4776, + "step": 15403 + }, + { + "epoch": 0.8421743230868062, + "grad_norm": 1.8018734455108643, + "learning_rate": 1.358690366518407e-06, + "loss": 1.6842, + "step": 15404 + }, + { + "epoch": 0.8422289955305278, + "grad_norm": 1.550472378730774, + "learning_rate": 1.3577708621699948e-06, + "loss": 1.4471, + "step": 15405 + }, + { + "epoch": 0.8422836679742493, + "grad_norm": 1.5063472986221313, + "learning_rate": 1.3568516464058946e-06, + "loss": 1.4926, + "step": 15406 + }, + { + "epoch": 0.8423383404179708, + "grad_norm": 1.3714293241500854, + "learning_rate": 1.3559327192567984e-06, + "loss": 1.4917, + "step": 15407 + }, + { + "epoch": 0.8423930128616924, + "grad_norm": 1.869659185409546, + "learning_rate": 1.355014080753393e-06, + "loss": 1.2584, + "step": 15408 + }, + { + "epoch": 0.8424476853054139, + "grad_norm": 1.7315988540649414, + "learning_rate": 1.3540957309263513e-06, + "loss": 1.444, + "step": 15409 + }, + { + "epoch": 0.8425023577491355, + "grad_norm": 1.7309540510177612, + "learning_rate": 1.3531776698063436e-06, + "loss": 1.3558, + "step": 15410 + }, + { + "epoch": 0.842557030192857, + "grad_norm": 1.4161094427108765, + "learning_rate": 1.3522598974240241e-06, + "loss": 1.465, + "step": 15411 + }, + { + "epoch": 0.8426117026365786, + "grad_norm": 1.6915781497955322, + "learning_rate": 1.3513424138100372e-06, + "loss": 1.3235, + "step": 15412 + }, + { + "epoch": 0.8426663750803002, + "grad_norm": 1.617344617843628, + "learning_rate": 1.350425218995024e-06, + "loss": 1.5815, + "step": 15413 + }, + { + "epoch": 0.8427210475240217, + "grad_norm": 1.6991477012634277, + "learning_rate": 1.3495083130096066e-06, + "loss": 1.178, + "step": 15414 + }, + { + "epoch": 0.8427757199677433, + "grad_norm": 1.6984498500823975, + "learning_rate": 1.3485916958844093e-06, + "loss": 1.3889, + "step": 15415 + }, + { + "epoch": 0.8428303924114648, + "grad_norm": 1.4264616966247559, + "learning_rate": 1.3476753676500355e-06, + "loss": 1.4013, + "step": 15416 + }, + { + "epoch": 0.8428850648551863, + "grad_norm": 2.083688259124756, + "learning_rate": 1.3467593283370817e-06, + "loss": 1.4727, + "step": 15417 + }, + { + "epoch": 0.8429397372989079, + "grad_norm": 1.362945795059204, + "learning_rate": 1.3458435779761425e-06, + "loss": 1.4339, + "step": 15418 + }, + { + "epoch": 0.8429944097426295, + "grad_norm": 1.2509227991104126, + "learning_rate": 1.3449281165977935e-06, + "loss": 1.5038, + "step": 15419 + }, + { + "epoch": 0.843049082186351, + "grad_norm": 1.6720625162124634, + "learning_rate": 1.3440129442326045e-06, + "loss": 1.4994, + "step": 15420 + }, + { + "epoch": 0.8431037546300726, + "grad_norm": 1.233627200126648, + "learning_rate": 1.3430980609111354e-06, + "loss": 1.5306, + "step": 15421 + }, + { + "epoch": 0.8431584270737942, + "grad_norm": 1.7021801471710205, + "learning_rate": 1.3421834666639355e-06, + "loss": 1.563, + "step": 15422 + }, + { + "epoch": 0.8432130995175157, + "grad_norm": 1.53375244140625, + "learning_rate": 1.3412691615215445e-06, + "loss": 1.3418, + "step": 15423 + }, + { + "epoch": 0.8432677719612373, + "grad_norm": 1.482475757598877, + "learning_rate": 1.3403551455144958e-06, + "loss": 1.5792, + "step": 15424 + }, + { + "epoch": 0.8433224444049587, + "grad_norm": 1.5231995582580566, + "learning_rate": 1.3394414186733096e-06, + "loss": 1.5684, + "step": 15425 + }, + { + "epoch": 0.8433771168486803, + "grad_norm": 1.436405897140503, + "learning_rate": 1.3385279810284956e-06, + "loss": 1.4654, + "step": 15426 + }, + { + "epoch": 0.8434317892924019, + "grad_norm": 1.5038273334503174, + "learning_rate": 1.3376148326105586e-06, + "loss": 1.3542, + "step": 15427 + }, + { + "epoch": 0.8434864617361234, + "grad_norm": 1.5080376863479614, + "learning_rate": 1.3367019734499876e-06, + "loss": 1.3842, + "step": 15428 + }, + { + "epoch": 0.843541134179845, + "grad_norm": 1.534467339515686, + "learning_rate": 1.3357894035772678e-06, + "loss": 1.4025, + "step": 15429 + }, + { + "epoch": 0.8435958066235666, + "grad_norm": 1.3132355213165283, + "learning_rate": 1.3348771230228718e-06, + "loss": 1.7698, + "step": 15430 + }, + { + "epoch": 0.8436504790672881, + "grad_norm": 1.5655488967895508, + "learning_rate": 1.333965131817263e-06, + "loss": 1.2451, + "step": 15431 + }, + { + "epoch": 0.8437051515110097, + "grad_norm": 1.3893215656280518, + "learning_rate": 1.3330534299908925e-06, + "loss": 1.5506, + "step": 15432 + }, + { + "epoch": 0.8437598239547313, + "grad_norm": 1.5054311752319336, + "learning_rate": 1.3321420175742061e-06, + "loss": 1.5054, + "step": 15433 + }, + { + "epoch": 0.8438144963984527, + "grad_norm": 1.610314965248108, + "learning_rate": 1.3312308945976348e-06, + "loss": 1.2796, + "step": 15434 + }, + { + "epoch": 0.8438691688421743, + "grad_norm": 1.677923321723938, + "learning_rate": 1.330320061091609e-06, + "loss": 1.3066, + "step": 15435 + }, + { + "epoch": 0.8439238412858959, + "grad_norm": 1.4105974435806274, + "learning_rate": 1.3294095170865395e-06, + "loss": 1.3584, + "step": 15436 + }, + { + "epoch": 0.8439785137296174, + "grad_norm": 1.6382321119308472, + "learning_rate": 1.3284992626128312e-06, + "loss": 1.4639, + "step": 15437 + }, + { + "epoch": 0.844033186173339, + "grad_norm": 1.850294828414917, + "learning_rate": 1.327589297700882e-06, + "loss": 1.4612, + "step": 15438 + }, + { + "epoch": 0.8440878586170605, + "grad_norm": 1.5828649997711182, + "learning_rate": 1.3266796223810773e-06, + "loss": 1.7063, + "step": 15439 + }, + { + "epoch": 0.8441425310607821, + "grad_norm": 1.6126737594604492, + "learning_rate": 1.32577023668379e-06, + "loss": 1.4453, + "step": 15440 + }, + { + "epoch": 0.8441972035045037, + "grad_norm": 1.6148384809494019, + "learning_rate": 1.3248611406393918e-06, + "loss": 1.2195, + "step": 15441 + }, + { + "epoch": 0.8442518759482252, + "grad_norm": 1.6396235227584839, + "learning_rate": 1.3239523342782345e-06, + "loss": 1.3955, + "step": 15442 + }, + { + "epoch": 0.8443065483919467, + "grad_norm": 1.6060599088668823, + "learning_rate": 1.3230438176306693e-06, + "loss": 1.225, + "step": 15443 + }, + { + "epoch": 0.8443612208356683, + "grad_norm": 1.5057380199432373, + "learning_rate": 1.3221355907270329e-06, + "loss": 1.6147, + "step": 15444 + }, + { + "epoch": 0.8444158932793898, + "grad_norm": 1.5260467529296875, + "learning_rate": 1.321227653597653e-06, + "loss": 1.4556, + "step": 15445 + }, + { + "epoch": 0.8444705657231114, + "grad_norm": 1.6803529262542725, + "learning_rate": 1.320320006272846e-06, + "loss": 1.3609, + "step": 15446 + }, + { + "epoch": 0.844525238166833, + "grad_norm": 1.596247911453247, + "learning_rate": 1.3194126487829218e-06, + "loss": 1.4672, + "step": 15447 + }, + { + "epoch": 0.8445799106105545, + "grad_norm": 1.8026820421218872, + "learning_rate": 1.318505581158177e-06, + "loss": 1.0869, + "step": 15448 + }, + { + "epoch": 0.8446345830542761, + "grad_norm": 1.3918644189834595, + "learning_rate": 1.3175988034289045e-06, + "loss": 1.5849, + "step": 15449 + }, + { + "epoch": 0.8446892554979977, + "grad_norm": 1.508741021156311, + "learning_rate": 1.316692315625382e-06, + "loss": 1.3515, + "step": 15450 + }, + { + "epoch": 0.8447439279417192, + "grad_norm": 1.7175041437149048, + "learning_rate": 1.3157861177778764e-06, + "loss": 1.4963, + "step": 15451 + }, + { + "epoch": 0.8447986003854407, + "grad_norm": 1.8791074752807617, + "learning_rate": 1.3148802099166535e-06, + "loss": 1.3531, + "step": 15452 + }, + { + "epoch": 0.8448532728291622, + "grad_norm": 1.3503291606903076, + "learning_rate": 1.3139745920719605e-06, + "loss": 1.6212, + "step": 15453 + }, + { + "epoch": 0.8449079452728838, + "grad_norm": 1.5793201923370361, + "learning_rate": 1.313069264274035e-06, + "loss": 1.2859, + "step": 15454 + }, + { + "epoch": 0.8449626177166054, + "grad_norm": 1.6177775859832764, + "learning_rate": 1.3121642265531154e-06, + "loss": 1.4911, + "step": 15455 + }, + { + "epoch": 0.8450172901603269, + "grad_norm": 1.7582530975341797, + "learning_rate": 1.3112594789394174e-06, + "loss": 1.4735, + "step": 15456 + }, + { + "epoch": 0.8450719626040485, + "grad_norm": 1.5270520448684692, + "learning_rate": 1.3103550214631544e-06, + "loss": 1.4248, + "step": 15457 + }, + { + "epoch": 0.8451266350477701, + "grad_norm": 1.5727989673614502, + "learning_rate": 1.3094508541545282e-06, + "loss": 1.3069, + "step": 15458 + }, + { + "epoch": 0.8451813074914916, + "grad_norm": 1.7430434226989746, + "learning_rate": 1.3085469770437286e-06, + "loss": 1.4164, + "step": 15459 + }, + { + "epoch": 0.8452359799352132, + "grad_norm": 1.4259239435195923, + "learning_rate": 1.307643390160943e-06, + "loss": 1.5881, + "step": 15460 + }, + { + "epoch": 0.8452906523789347, + "grad_norm": 1.4007580280303955, + "learning_rate": 1.3067400935363427e-06, + "loss": 1.5049, + "step": 15461 + }, + { + "epoch": 0.8453453248226562, + "grad_norm": 1.3614609241485596, + "learning_rate": 1.305837087200087e-06, + "loss": 1.3946, + "step": 15462 + }, + { + "epoch": 0.8453999972663778, + "grad_norm": 1.4846724271774292, + "learning_rate": 1.3049343711823347e-06, + "loss": 1.5184, + "step": 15463 + }, + { + "epoch": 0.8454546697100994, + "grad_norm": 1.5079641342163086, + "learning_rate": 1.3040319455132277e-06, + "loss": 1.3858, + "step": 15464 + }, + { + "epoch": 0.8455093421538209, + "grad_norm": 1.2874866724014282, + "learning_rate": 1.3031298102228974e-06, + "loss": 1.4052, + "step": 15465 + }, + { + "epoch": 0.8455640145975425, + "grad_norm": 1.4234482049942017, + "learning_rate": 1.3022279653414728e-06, + "loss": 1.6367, + "step": 15466 + }, + { + "epoch": 0.845618687041264, + "grad_norm": 1.640803575515747, + "learning_rate": 1.3013264108990654e-06, + "loss": 1.4228, + "step": 15467 + }, + { + "epoch": 0.8456733594849856, + "grad_norm": 1.9093936681747437, + "learning_rate": 1.30042514692578e-06, + "loss": 1.5081, + "step": 15468 + }, + { + "epoch": 0.8457280319287072, + "grad_norm": 1.161138892173767, + "learning_rate": 1.299524173451715e-06, + "loss": 1.681, + "step": 15469 + }, + { + "epoch": 0.8457827043724286, + "grad_norm": 1.6647077798843384, + "learning_rate": 1.298623490506955e-06, + "loss": 1.5143, + "step": 15470 + }, + { + "epoch": 0.8458373768161502, + "grad_norm": 1.5629665851593018, + "learning_rate": 1.2977230981215738e-06, + "loss": 1.4002, + "step": 15471 + }, + { + "epoch": 0.8458920492598718, + "grad_norm": 1.5682553052902222, + "learning_rate": 1.2968229963256395e-06, + "loss": 1.4639, + "step": 15472 + }, + { + "epoch": 0.8459467217035933, + "grad_norm": 1.680696725845337, + "learning_rate": 1.295923185149206e-06, + "loss": 1.3998, + "step": 15473 + }, + { + "epoch": 0.8460013941473149, + "grad_norm": 1.2437089681625366, + "learning_rate": 1.2950236646223246e-06, + "loss": 1.617, + "step": 15474 + }, + { + "epoch": 0.8460560665910365, + "grad_norm": 1.3186825513839722, + "learning_rate": 1.2941244347750292e-06, + "loss": 1.3756, + "step": 15475 + }, + { + "epoch": 0.846110739034758, + "grad_norm": 1.409908652305603, + "learning_rate": 1.2932254956373457e-06, + "loss": 1.2997, + "step": 15476 + }, + { + "epoch": 0.8461654114784796, + "grad_norm": 1.4978301525115967, + "learning_rate": 1.292326847239297e-06, + "loss": 1.5459, + "step": 15477 + }, + { + "epoch": 0.8462200839222012, + "grad_norm": 1.6093852519989014, + "learning_rate": 1.2914284896108875e-06, + "loss": 1.409, + "step": 15478 + }, + { + "epoch": 0.8462747563659226, + "grad_norm": 1.3467849493026733, + "learning_rate": 1.2905304227821136e-06, + "loss": 1.3688, + "step": 15479 + }, + { + "epoch": 0.8463294288096442, + "grad_norm": 1.6958322525024414, + "learning_rate": 1.289632646782969e-06, + "loss": 1.4878, + "step": 15480 + }, + { + "epoch": 0.8463841012533657, + "grad_norm": 1.9809236526489258, + "learning_rate": 1.2887351616434285e-06, + "loss": 1.3712, + "step": 15481 + }, + { + "epoch": 0.8464387736970873, + "grad_norm": 1.2799909114837646, + "learning_rate": 1.2878379673934616e-06, + "loss": 1.7038, + "step": 15482 + }, + { + "epoch": 0.8464934461408089, + "grad_norm": 2.0325944423675537, + "learning_rate": 1.286941064063031e-06, + "loss": 1.3453, + "step": 15483 + }, + { + "epoch": 0.8465481185845304, + "grad_norm": 1.5556371212005615, + "learning_rate": 1.2860444516820835e-06, + "loss": 1.2943, + "step": 15484 + }, + { + "epoch": 0.846602791028252, + "grad_norm": 1.3062514066696167, + "learning_rate": 1.2851481302805603e-06, + "loss": 1.3052, + "step": 15485 + }, + { + "epoch": 0.8466574634719736, + "grad_norm": 2.057631254196167, + "learning_rate": 1.2842520998883912e-06, + "loss": 1.577, + "step": 15486 + }, + { + "epoch": 0.846712135915695, + "grad_norm": 1.2267210483551025, + "learning_rate": 1.2833563605354938e-06, + "loss": 1.4033, + "step": 15487 + }, + { + "epoch": 0.8467668083594166, + "grad_norm": 1.5008405447006226, + "learning_rate": 1.282460912251784e-06, + "loss": 1.1615, + "step": 15488 + }, + { + "epoch": 0.8468214808031382, + "grad_norm": 1.6336886882781982, + "learning_rate": 1.2815657550671612e-06, + "loss": 1.5358, + "step": 15489 + }, + { + "epoch": 0.8468761532468597, + "grad_norm": 1.3280837535858154, + "learning_rate": 1.2806708890115138e-06, + "loss": 1.5527, + "step": 15490 + }, + { + "epoch": 0.8469308256905813, + "grad_norm": 1.4625627994537354, + "learning_rate": 1.279776314114728e-06, + "loss": 1.4462, + "step": 15491 + }, + { + "epoch": 0.8469854981343029, + "grad_norm": 1.332884430885315, + "learning_rate": 1.278882030406674e-06, + "loss": 1.6362, + "step": 15492 + }, + { + "epoch": 0.8470401705780244, + "grad_norm": 2.0592854022979736, + "learning_rate": 1.2779880379172105e-06, + "loss": 1.4092, + "step": 15493 + }, + { + "epoch": 0.847094843021746, + "grad_norm": 1.5086426734924316, + "learning_rate": 1.2770943366761968e-06, + "loss": 1.4324, + "step": 15494 + }, + { + "epoch": 0.8471495154654676, + "grad_norm": 1.5858250856399536, + "learning_rate": 1.2762009267134712e-06, + "loss": 1.4901, + "step": 15495 + }, + { + "epoch": 0.847204187909189, + "grad_norm": 1.4031720161437988, + "learning_rate": 1.2753078080588688e-06, + "loss": 1.3591, + "step": 15496 + }, + { + "epoch": 0.8472588603529106, + "grad_norm": 1.5981481075286865, + "learning_rate": 1.2744149807422113e-06, + "loss": 1.529, + "step": 15497 + }, + { + "epoch": 0.8473135327966321, + "grad_norm": 1.7114653587341309, + "learning_rate": 1.2735224447933104e-06, + "loss": 1.4497, + "step": 15498 + }, + { + "epoch": 0.8473682052403537, + "grad_norm": 1.3193483352661133, + "learning_rate": 1.2726302002419744e-06, + "loss": 1.5109, + "step": 15499 + }, + { + "epoch": 0.8474228776840753, + "grad_norm": 1.6866532564163208, + "learning_rate": 1.2717382471179961e-06, + "loss": 1.3944, + "step": 15500 + }, + { + "epoch": 0.8474775501277968, + "grad_norm": 1.3692790269851685, + "learning_rate": 1.2708465854511565e-06, + "loss": 1.5193, + "step": 15501 + }, + { + "epoch": 0.8475322225715184, + "grad_norm": 1.6035032272338867, + "learning_rate": 1.2699552152712348e-06, + "loss": 1.7265, + "step": 15502 + }, + { + "epoch": 0.84758689501524, + "grad_norm": 1.8852899074554443, + "learning_rate": 1.2690641366079947e-06, + "loss": 1.4046, + "step": 15503 + }, + { + "epoch": 0.8476415674589615, + "grad_norm": 1.461191177368164, + "learning_rate": 1.2681733494911897e-06, + "loss": 1.5954, + "step": 15504 + }, + { + "epoch": 0.8476962399026831, + "grad_norm": 1.3110108375549316, + "learning_rate": 1.2672828539505665e-06, + "loss": 1.629, + "step": 15505 + }, + { + "epoch": 0.8477509123464047, + "grad_norm": 1.6378823518753052, + "learning_rate": 1.2663926500158618e-06, + "loss": 1.2121, + "step": 15506 + }, + { + "epoch": 0.8478055847901261, + "grad_norm": 1.4855636358261108, + "learning_rate": 1.2655027377167994e-06, + "loss": 1.3976, + "step": 15507 + }, + { + "epoch": 0.8478602572338477, + "grad_norm": 1.5641279220581055, + "learning_rate": 1.2646131170830977e-06, + "loss": 1.4595, + "step": 15508 + }, + { + "epoch": 0.8479149296775693, + "grad_norm": 1.4391876459121704, + "learning_rate": 1.2637237881444619e-06, + "loss": 1.2911, + "step": 15509 + }, + { + "epoch": 0.8479696021212908, + "grad_norm": 1.3578263521194458, + "learning_rate": 1.2628347509305905e-06, + "loss": 1.3406, + "step": 15510 + }, + { + "epoch": 0.8480242745650124, + "grad_norm": 1.8352646827697754, + "learning_rate": 1.2619460054711685e-06, + "loss": 1.3164, + "step": 15511 + }, + { + "epoch": 0.8480789470087339, + "grad_norm": 1.6550980806350708, + "learning_rate": 1.2610575517958713e-06, + "loss": 1.5826, + "step": 15512 + }, + { + "epoch": 0.8481336194524555, + "grad_norm": 1.2215900421142578, + "learning_rate": 1.2601693899343714e-06, + "loss": 1.5302, + "step": 15513 + }, + { + "epoch": 0.8481882918961771, + "grad_norm": 1.4901865720748901, + "learning_rate": 1.2592815199163244e-06, + "loss": 1.289, + "step": 15514 + }, + { + "epoch": 0.8482429643398985, + "grad_norm": 1.4502055644989014, + "learning_rate": 1.2583939417713764e-06, + "loss": 1.3662, + "step": 15515 + }, + { + "epoch": 0.8482976367836201, + "grad_norm": 1.6860284805297852, + "learning_rate": 1.257506655529168e-06, + "loss": 1.4863, + "step": 15516 + }, + { + "epoch": 0.8483523092273417, + "grad_norm": 1.7113852500915527, + "learning_rate": 1.2566196612193293e-06, + "loss": 1.2796, + "step": 15517 + }, + { + "epoch": 0.8484069816710632, + "grad_norm": 1.6432462930679321, + "learning_rate": 1.2557329588714739e-06, + "loss": 1.3097, + "step": 15518 + }, + { + "epoch": 0.8484616541147848, + "grad_norm": 1.9729468822479248, + "learning_rate": 1.2548465485152162e-06, + "loss": 1.3794, + "step": 15519 + }, + { + "epoch": 0.8485163265585064, + "grad_norm": 1.7551523447036743, + "learning_rate": 1.2539604301801544e-06, + "loss": 1.3741, + "step": 15520 + }, + { + "epoch": 0.8485709990022279, + "grad_norm": 1.7696764469146729, + "learning_rate": 1.2530746038958741e-06, + "loss": 1.4002, + "step": 15521 + }, + { + "epoch": 0.8486256714459495, + "grad_norm": 1.5503292083740234, + "learning_rate": 1.2521890696919604e-06, + "loss": 1.674, + "step": 15522 + }, + { + "epoch": 0.8486803438896711, + "grad_norm": 1.6390571594238281, + "learning_rate": 1.2513038275979805e-06, + "loss": 1.4867, + "step": 15523 + }, + { + "epoch": 0.8487350163333925, + "grad_norm": 1.0455760955810547, + "learning_rate": 1.2504188776434955e-06, + "loss": 1.6454, + "step": 15524 + }, + { + "epoch": 0.8487896887771141, + "grad_norm": 1.4293532371520996, + "learning_rate": 1.2495342198580562e-06, + "loss": 1.5598, + "step": 15525 + }, + { + "epoch": 0.8488443612208356, + "grad_norm": 1.9301872253417969, + "learning_rate": 1.2486498542712e-06, + "loss": 1.2814, + "step": 15526 + }, + { + "epoch": 0.8488990336645572, + "grad_norm": 1.6516512632369995, + "learning_rate": 1.2477657809124632e-06, + "loss": 1.4159, + "step": 15527 + }, + { + "epoch": 0.8489537061082788, + "grad_norm": 1.8449617624282837, + "learning_rate": 1.2468819998113658e-06, + "loss": 1.4872, + "step": 15528 + }, + { + "epoch": 0.8490083785520003, + "grad_norm": 1.5889009237289429, + "learning_rate": 1.2459985109974149e-06, + "loss": 1.3614, + "step": 15529 + }, + { + "epoch": 0.8490630509957219, + "grad_norm": 1.1361124515533447, + "learning_rate": 1.2451153145001183e-06, + "loss": 1.5594, + "step": 15530 + }, + { + "epoch": 0.8491177234394435, + "grad_norm": 1.5982187986373901, + "learning_rate": 1.2442324103489656e-06, + "loss": 1.4406, + "step": 15531 + }, + { + "epoch": 0.849172395883165, + "grad_norm": 1.4753446578979492, + "learning_rate": 1.2433497985734356e-06, + "loss": 1.1664, + "step": 15532 + }, + { + "epoch": 0.8492270683268865, + "grad_norm": 1.4070429801940918, + "learning_rate": 1.2424674792030067e-06, + "loss": 1.7637, + "step": 15533 + }, + { + "epoch": 0.8492817407706081, + "grad_norm": 1.2470479011535645, + "learning_rate": 1.2415854522671388e-06, + "loss": 1.6053, + "step": 15534 + }, + { + "epoch": 0.8493364132143296, + "grad_norm": 1.5688121318817139, + "learning_rate": 1.2407037177952852e-06, + "loss": 1.3858, + "step": 15535 + }, + { + "epoch": 0.8493910856580512, + "grad_norm": 1.287258505821228, + "learning_rate": 1.2398222758168887e-06, + "loss": 1.3636, + "step": 15536 + }, + { + "epoch": 0.8494457581017728, + "grad_norm": 1.6267545223236084, + "learning_rate": 1.2389411263613826e-06, + "loss": 1.4443, + "step": 15537 + }, + { + "epoch": 0.8495004305454943, + "grad_norm": 1.4654632806777954, + "learning_rate": 1.238060269458189e-06, + "loss": 1.5109, + "step": 15538 + }, + { + "epoch": 0.8495551029892159, + "grad_norm": 1.3782835006713867, + "learning_rate": 1.237179705136725e-06, + "loss": 1.4928, + "step": 15539 + }, + { + "epoch": 0.8496097754329374, + "grad_norm": 1.330258846282959, + "learning_rate": 1.2362994334263933e-06, + "loss": 1.5614, + "step": 15540 + }, + { + "epoch": 0.849664447876659, + "grad_norm": 1.6154375076293945, + "learning_rate": 1.2354194543565868e-06, + "loss": 1.5188, + "step": 15541 + }, + { + "epoch": 0.8497191203203805, + "grad_norm": 1.5429192781448364, + "learning_rate": 1.2345397679566927e-06, + "loss": 1.3977, + "step": 15542 + }, + { + "epoch": 0.849773792764102, + "grad_norm": 2.7397677898406982, + "learning_rate": 1.2336603742560826e-06, + "loss": 1.4554, + "step": 15543 + }, + { + "epoch": 0.8498284652078236, + "grad_norm": 1.7459954023361206, + "learning_rate": 1.232781273284126e-06, + "loss": 1.5483, + "step": 15544 + }, + { + "epoch": 0.8498831376515452, + "grad_norm": 1.9475924968719482, + "learning_rate": 1.231902465070176e-06, + "loss": 1.3396, + "step": 15545 + }, + { + "epoch": 0.8499378100952667, + "grad_norm": 1.7566919326782227, + "learning_rate": 1.2310239496435749e-06, + "loss": 1.5057, + "step": 15546 + }, + { + "epoch": 0.8499924825389883, + "grad_norm": 1.6270328760147095, + "learning_rate": 1.2301457270336637e-06, + "loss": 1.4096, + "step": 15547 + }, + { + "epoch": 0.8500471549827099, + "grad_norm": 1.5066838264465332, + "learning_rate": 1.2292677972697654e-06, + "loss": 1.3701, + "step": 15548 + }, + { + "epoch": 0.8501018274264314, + "grad_norm": 1.7160356044769287, + "learning_rate": 1.2283901603811964e-06, + "loss": 1.5685, + "step": 15549 + }, + { + "epoch": 0.850156499870153, + "grad_norm": 1.7110792398452759, + "learning_rate": 1.2275128163972638e-06, + "loss": 1.3845, + "step": 15550 + }, + { + "epoch": 0.8502111723138746, + "grad_norm": 1.363652229309082, + "learning_rate": 1.2266357653472626e-06, + "loss": 1.3912, + "step": 15551 + }, + { + "epoch": 0.850265844757596, + "grad_norm": 1.878206491470337, + "learning_rate": 1.2257590072604796e-06, + "loss": 1.5341, + "step": 15552 + }, + { + "epoch": 0.8503205172013176, + "grad_norm": 1.6748249530792236, + "learning_rate": 1.2248825421661937e-06, + "loss": 1.1958, + "step": 15553 + }, + { + "epoch": 0.8503751896450391, + "grad_norm": 1.515552043914795, + "learning_rate": 1.2240063700936722e-06, + "loss": 1.6111, + "step": 15554 + }, + { + "epoch": 0.8504298620887607, + "grad_norm": 1.5946969985961914, + "learning_rate": 1.2231304910721686e-06, + "loss": 1.2759, + "step": 15555 + }, + { + "epoch": 0.8504845345324823, + "grad_norm": 1.3948734998703003, + "learning_rate": 1.2222549051309362e-06, + "loss": 1.3525, + "step": 15556 + }, + { + "epoch": 0.8505392069762038, + "grad_norm": 1.9807153940200806, + "learning_rate": 1.2213796122992072e-06, + "loss": 1.4573, + "step": 15557 + }, + { + "epoch": 0.8505938794199254, + "grad_norm": 1.8674345016479492, + "learning_rate": 1.220504612606216e-06, + "loss": 1.4717, + "step": 15558 + }, + { + "epoch": 0.850648551863647, + "grad_norm": 1.7856640815734863, + "learning_rate": 1.2196299060811756e-06, + "loss": 1.2825, + "step": 15559 + }, + { + "epoch": 0.8507032243073684, + "grad_norm": 1.3262324333190918, + "learning_rate": 1.2187554927532963e-06, + "loss": 1.5987, + "step": 15560 + }, + { + "epoch": 0.85075789675109, + "grad_norm": 1.6202155351638794, + "learning_rate": 1.2178813726517779e-06, + "loss": 1.1703, + "step": 15561 + }, + { + "epoch": 0.8508125691948116, + "grad_norm": 1.7561519145965576, + "learning_rate": 1.2170075458058084e-06, + "loss": 1.3679, + "step": 15562 + }, + { + "epoch": 0.8508672416385331, + "grad_norm": 1.391623854637146, + "learning_rate": 1.2161340122445674e-06, + "loss": 1.3325, + "step": 15563 + }, + { + "epoch": 0.8509219140822547, + "grad_norm": 1.4573017358779907, + "learning_rate": 1.215260771997223e-06, + "loss": 1.6066, + "step": 15564 + }, + { + "epoch": 0.8509765865259763, + "grad_norm": 1.5626294612884521, + "learning_rate": 1.2143878250929364e-06, + "loss": 1.4996, + "step": 15565 + }, + { + "epoch": 0.8510312589696978, + "grad_norm": 1.4512755870819092, + "learning_rate": 1.213515171560854e-06, + "loss": 1.6133, + "step": 15566 + }, + { + "epoch": 0.8510859314134194, + "grad_norm": 1.6452040672302246, + "learning_rate": 1.2126428114301204e-06, + "loss": 1.4263, + "step": 15567 + }, + { + "epoch": 0.8511406038571409, + "grad_norm": 1.3643004894256592, + "learning_rate": 1.2117707447298633e-06, + "loss": 1.3961, + "step": 15568 + }, + { + "epoch": 0.8511952763008624, + "grad_norm": 1.25151789188385, + "learning_rate": 1.2108989714892006e-06, + "loss": 1.273, + "step": 15569 + }, + { + "epoch": 0.851249948744584, + "grad_norm": 1.4647589921951294, + "learning_rate": 1.2100274917372479e-06, + "loss": 1.3644, + "step": 15570 + }, + { + "epoch": 0.8513046211883055, + "grad_norm": 1.68436598777771, + "learning_rate": 1.209156305503102e-06, + "loss": 1.4064, + "step": 15571 + }, + { + "epoch": 0.8513592936320271, + "grad_norm": 1.5149974822998047, + "learning_rate": 1.2082854128158572e-06, + "loss": 1.3297, + "step": 15572 + }, + { + "epoch": 0.8514139660757487, + "grad_norm": 1.6744437217712402, + "learning_rate": 1.2074148137045926e-06, + "loss": 1.4775, + "step": 15573 + }, + { + "epoch": 0.8514686385194702, + "grad_norm": 1.6548837423324585, + "learning_rate": 1.2065445081983795e-06, + "loss": 1.4896, + "step": 15574 + }, + { + "epoch": 0.8515233109631918, + "grad_norm": 1.4815783500671387, + "learning_rate": 1.2056744963262813e-06, + "loss": 1.2456, + "step": 15575 + }, + { + "epoch": 0.8515779834069134, + "grad_norm": 1.4224542379379272, + "learning_rate": 1.2048047781173467e-06, + "loss": 1.5917, + "step": 15576 + }, + { + "epoch": 0.8516326558506349, + "grad_norm": 1.5210312604904175, + "learning_rate": 1.2039353536006171e-06, + "loss": 1.5584, + "step": 15577 + }, + { + "epoch": 0.8516873282943564, + "grad_norm": 1.3238548040390015, + "learning_rate": 1.2030662228051292e-06, + "loss": 1.3416, + "step": 15578 + }, + { + "epoch": 0.851742000738078, + "grad_norm": 1.4408174753189087, + "learning_rate": 1.2021973857599034e-06, + "loss": 1.4481, + "step": 15579 + }, + { + "epoch": 0.8517966731817995, + "grad_norm": 1.517709493637085, + "learning_rate": 1.2013288424939484e-06, + "loss": 1.569, + "step": 15580 + }, + { + "epoch": 0.8518513456255211, + "grad_norm": 1.3484822511672974, + "learning_rate": 1.2004605930362724e-06, + "loss": 1.4671, + "step": 15581 + }, + { + "epoch": 0.8519060180692426, + "grad_norm": 1.5594186782836914, + "learning_rate": 1.1995926374158663e-06, + "loss": 1.1848, + "step": 15582 + }, + { + "epoch": 0.8519606905129642, + "grad_norm": 1.4993265867233276, + "learning_rate": 1.1987249756617103e-06, + "loss": 1.2541, + "step": 15583 + }, + { + "epoch": 0.8520153629566858, + "grad_norm": 2.4193472862243652, + "learning_rate": 1.1978576078027825e-06, + "loss": 1.3703, + "step": 15584 + }, + { + "epoch": 0.8520700354004073, + "grad_norm": 1.834639310836792, + "learning_rate": 1.1969905338680421e-06, + "loss": 1.6354, + "step": 15585 + }, + { + "epoch": 0.8521247078441289, + "grad_norm": 1.5389461517333984, + "learning_rate": 1.1961237538864468e-06, + "loss": 1.4607, + "step": 15586 + }, + { + "epoch": 0.8521793802878505, + "grad_norm": 1.5715289115905762, + "learning_rate": 1.195257267886939e-06, + "loss": 1.2781, + "step": 15587 + }, + { + "epoch": 0.8522340527315719, + "grad_norm": 1.3242756128311157, + "learning_rate": 1.194391075898451e-06, + "loss": 1.3567, + "step": 15588 + }, + { + "epoch": 0.8522887251752935, + "grad_norm": 1.5552918910980225, + "learning_rate": 1.193525177949908e-06, + "loss": 1.392, + "step": 15589 + }, + { + "epoch": 0.8523433976190151, + "grad_norm": 2.378056287765503, + "learning_rate": 1.192659574070225e-06, + "loss": 1.2583, + "step": 15590 + }, + { + "epoch": 0.8523980700627366, + "grad_norm": 1.2726598978042603, + "learning_rate": 1.1917942642883028e-06, + "loss": 1.4863, + "step": 15591 + }, + { + "epoch": 0.8524527425064582, + "grad_norm": 2.248893976211548, + "learning_rate": 1.1909292486330427e-06, + "loss": 1.1895, + "step": 15592 + }, + { + "epoch": 0.8525074149501798, + "grad_norm": 1.5007624626159668, + "learning_rate": 1.1900645271333245e-06, + "loss": 1.2281, + "step": 15593 + }, + { + "epoch": 0.8525620873939013, + "grad_norm": 1.668338656425476, + "learning_rate": 1.1892000998180242e-06, + "loss": 1.639, + "step": 15594 + }, + { + "epoch": 0.8526167598376229, + "grad_norm": 1.4052176475524902, + "learning_rate": 1.1883359667160087e-06, + "loss": 1.3388, + "step": 15595 + }, + { + "epoch": 0.8526714322813443, + "grad_norm": 1.5884780883789062, + "learning_rate": 1.187472127856133e-06, + "loss": 1.5096, + "step": 15596 + }, + { + "epoch": 0.8527261047250659, + "grad_norm": 1.4651433229446411, + "learning_rate": 1.1866085832672403e-06, + "loss": 1.3089, + "step": 15597 + }, + { + "epoch": 0.8527807771687875, + "grad_norm": 1.4209623336791992, + "learning_rate": 1.185745332978171e-06, + "loss": 1.7167, + "step": 15598 + }, + { + "epoch": 0.852835449612509, + "grad_norm": 1.496969223022461, + "learning_rate": 1.1848823770177453e-06, + "loss": 1.6668, + "step": 15599 + }, + { + "epoch": 0.8528901220562306, + "grad_norm": 1.3757256269454956, + "learning_rate": 1.1840197154147836e-06, + "loss": 1.6571, + "step": 15600 + }, + { + "epoch": 0.8529447944999522, + "grad_norm": 1.2402927875518799, + "learning_rate": 1.1831573481980929e-06, + "loss": 1.4969, + "step": 15601 + }, + { + "epoch": 0.8529994669436737, + "grad_norm": 1.6102722883224487, + "learning_rate": 1.1822952753964667e-06, + "loss": 1.4231, + "step": 15602 + }, + { + "epoch": 0.8530541393873953, + "grad_norm": 1.6467891931533813, + "learning_rate": 1.1814334970386921e-06, + "loss": 1.4918, + "step": 15603 + }, + { + "epoch": 0.8531088118311169, + "grad_norm": 1.3382041454315186, + "learning_rate": 1.180572013153547e-06, + "loss": 1.579, + "step": 15604 + }, + { + "epoch": 0.8531634842748383, + "grad_norm": 1.9625906944274902, + "learning_rate": 1.1797108237697963e-06, + "loss": 1.443, + "step": 15605 + }, + { + "epoch": 0.8532181567185599, + "grad_norm": 1.4563803672790527, + "learning_rate": 1.1788499289162003e-06, + "loss": 1.599, + "step": 15606 + }, + { + "epoch": 0.8532728291622815, + "grad_norm": 1.0997357368469238, + "learning_rate": 1.1779893286215039e-06, + "loss": 1.7417, + "step": 15607 + }, + { + "epoch": 0.853327501606003, + "grad_norm": 1.688785433769226, + "learning_rate": 1.1771290229144438e-06, + "loss": 1.4018, + "step": 15608 + }, + { + "epoch": 0.8533821740497246, + "grad_norm": 1.5004847049713135, + "learning_rate": 1.1762690118237518e-06, + "loss": 1.5815, + "step": 15609 + }, + { + "epoch": 0.8534368464934461, + "grad_norm": 1.7042306661605835, + "learning_rate": 1.1754092953781426e-06, + "loss": 1.4031, + "step": 15610 + }, + { + "epoch": 0.8534915189371677, + "grad_norm": 1.5355440378189087, + "learning_rate": 1.1745498736063222e-06, + "loss": 1.301, + "step": 15611 + }, + { + "epoch": 0.8535461913808893, + "grad_norm": 1.3681422472000122, + "learning_rate": 1.173690746536994e-06, + "loss": 1.5371, + "step": 15612 + }, + { + "epoch": 0.8536008638246108, + "grad_norm": 1.4290268421173096, + "learning_rate": 1.172831914198842e-06, + "loss": 1.3761, + "step": 15613 + }, + { + "epoch": 0.8536555362683323, + "grad_norm": 1.3651286363601685, + "learning_rate": 1.1719733766205477e-06, + "loss": 1.1473, + "step": 15614 + }, + { + "epoch": 0.8537102087120539, + "grad_norm": 1.372665286064148, + "learning_rate": 1.171115133830777e-06, + "loss": 1.3715, + "step": 15615 + }, + { + "epoch": 0.8537648811557754, + "grad_norm": 1.4510542154312134, + "learning_rate": 1.1702571858581867e-06, + "loss": 1.4959, + "step": 15616 + }, + { + "epoch": 0.853819553599497, + "grad_norm": 1.6165471076965332, + "learning_rate": 1.1693995327314322e-06, + "loss": 1.4548, + "step": 15617 + }, + { + "epoch": 0.8538742260432186, + "grad_norm": 1.5863845348358154, + "learning_rate": 1.1685421744791481e-06, + "loss": 1.5487, + "step": 15618 + }, + { + "epoch": 0.8539288984869401, + "grad_norm": 1.515718698501587, + "learning_rate": 1.1676851111299625e-06, + "loss": 1.4707, + "step": 15619 + }, + { + "epoch": 0.8539835709306617, + "grad_norm": 1.342453956604004, + "learning_rate": 1.1668283427124994e-06, + "loss": 1.3899, + "step": 15620 + }, + { + "epoch": 0.8540382433743833, + "grad_norm": 1.644506573677063, + "learning_rate": 1.1659718692553646e-06, + "loss": 1.3503, + "step": 15621 + }, + { + "epoch": 0.8540929158181048, + "grad_norm": 1.472367286682129, + "learning_rate": 1.1651156907871575e-06, + "loss": 1.4088, + "step": 15622 + }, + { + "epoch": 0.8541475882618264, + "grad_norm": 1.1310111284255981, + "learning_rate": 1.1642598073364707e-06, + "loss": 1.4413, + "step": 15623 + }, + { + "epoch": 0.8542022607055478, + "grad_norm": 1.6639398336410522, + "learning_rate": 1.1634042189318828e-06, + "loss": 1.6003, + "step": 15624 + }, + { + "epoch": 0.8542569331492694, + "grad_norm": 1.438018798828125, + "learning_rate": 1.1625489256019618e-06, + "loss": 1.5171, + "step": 15625 + }, + { + "epoch": 0.854311605592991, + "grad_norm": 1.497096300125122, + "learning_rate": 1.1616939273752715e-06, + "loss": 1.5101, + "step": 15626 + }, + { + "epoch": 0.8543662780367125, + "grad_norm": 1.5536056756973267, + "learning_rate": 1.1608392242803613e-06, + "loss": 1.476, + "step": 15627 + }, + { + "epoch": 0.8544209504804341, + "grad_norm": 1.293907642364502, + "learning_rate": 1.1599848163457716e-06, + "loss": 1.4068, + "step": 15628 + }, + { + "epoch": 0.8544756229241557, + "grad_norm": 1.6417983770370483, + "learning_rate": 1.1591307036000321e-06, + "loss": 1.3874, + "step": 15629 + }, + { + "epoch": 0.8545302953678772, + "grad_norm": 1.69977867603302, + "learning_rate": 1.158276886071662e-06, + "loss": 1.4313, + "step": 15630 + }, + { + "epoch": 0.8545849678115988, + "grad_norm": 1.6308362483978271, + "learning_rate": 1.1574233637891762e-06, + "loss": 1.3821, + "step": 15631 + }, + { + "epoch": 0.8546396402553204, + "grad_norm": 1.675219178199768, + "learning_rate": 1.1565701367810744e-06, + "loss": 1.4582, + "step": 15632 + }, + { + "epoch": 0.8546943126990418, + "grad_norm": 1.5341150760650635, + "learning_rate": 1.1557172050758447e-06, + "loss": 1.4796, + "step": 15633 + }, + { + "epoch": 0.8547489851427634, + "grad_norm": 1.6458287239074707, + "learning_rate": 1.1548645687019745e-06, + "loss": 1.2952, + "step": 15634 + }, + { + "epoch": 0.854803657586485, + "grad_norm": 1.485860824584961, + "learning_rate": 1.1540122276879317e-06, + "loss": 1.6072, + "step": 15635 + }, + { + "epoch": 0.8548583300302065, + "grad_norm": 1.6960827112197876, + "learning_rate": 1.1531601820621763e-06, + "loss": 1.3292, + "step": 15636 + }, + { + "epoch": 0.8549130024739281, + "grad_norm": 1.3170599937438965, + "learning_rate": 1.1523084318531641e-06, + "loss": 1.4113, + "step": 15637 + }, + { + "epoch": 0.8549676749176496, + "grad_norm": 1.7960423231124878, + "learning_rate": 1.1514569770893347e-06, + "loss": 1.4414, + "step": 15638 + }, + { + "epoch": 0.8550223473613712, + "grad_norm": 1.4363477230072021, + "learning_rate": 1.1506058177991198e-06, + "loss": 1.2727, + "step": 15639 + }, + { + "epoch": 0.8550770198050928, + "grad_norm": 1.7628778219223022, + "learning_rate": 1.149754954010943e-06, + "loss": 1.3193, + "step": 15640 + }, + { + "epoch": 0.8551316922488142, + "grad_norm": 1.4527682065963745, + "learning_rate": 1.1489043857532167e-06, + "loss": 1.5082, + "step": 15641 + }, + { + "epoch": 0.8551863646925358, + "grad_norm": 1.5111173391342163, + "learning_rate": 1.1480541130543433e-06, + "loss": 1.4259, + "step": 15642 + }, + { + "epoch": 0.8552410371362574, + "grad_norm": 1.8166877031326294, + "learning_rate": 1.1472041359427145e-06, + "loss": 1.452, + "step": 15643 + }, + { + "epoch": 0.8552957095799789, + "grad_norm": 1.9304900169372559, + "learning_rate": 1.1463544544467109e-06, + "loss": 1.2721, + "step": 15644 + }, + { + "epoch": 0.8553503820237005, + "grad_norm": 1.9958536624908447, + "learning_rate": 1.14550506859471e-06, + "loss": 1.401, + "step": 15645 + }, + { + "epoch": 0.8554050544674221, + "grad_norm": 1.4282475709915161, + "learning_rate": 1.1446559784150724e-06, + "loss": 1.4267, + "step": 15646 + }, + { + "epoch": 0.8554597269111436, + "grad_norm": 1.7729977369308472, + "learning_rate": 1.1438071839361498e-06, + "loss": 1.3556, + "step": 15647 + }, + { + "epoch": 0.8555143993548652, + "grad_norm": 1.3450839519500732, + "learning_rate": 1.1429586851862884e-06, + "loss": 1.8054, + "step": 15648 + }, + { + "epoch": 0.8555690717985868, + "grad_norm": 1.6138776540756226, + "learning_rate": 1.14211048219382e-06, + "loss": 1.2658, + "step": 15649 + }, + { + "epoch": 0.8556237442423082, + "grad_norm": 1.7574822902679443, + "learning_rate": 1.1412625749870676e-06, + "loss": 1.5814, + "step": 15650 + }, + { + "epoch": 0.8556784166860298, + "grad_norm": 2.1208231449127197, + "learning_rate": 1.1404149635943462e-06, + "loss": 1.2827, + "step": 15651 + }, + { + "epoch": 0.8557330891297513, + "grad_norm": 1.4270210266113281, + "learning_rate": 1.13956764804396e-06, + "loss": 1.1316, + "step": 15652 + }, + { + "epoch": 0.8557877615734729, + "grad_norm": 1.3057130575180054, + "learning_rate": 1.1387206283642005e-06, + "loss": 1.4553, + "step": 15653 + }, + { + "epoch": 0.8558424340171945, + "grad_norm": 1.577943205833435, + "learning_rate": 1.1378739045833543e-06, + "loss": 1.5232, + "step": 15654 + }, + { + "epoch": 0.855897106460916, + "grad_norm": 1.6292786598205566, + "learning_rate": 1.1370274767296907e-06, + "loss": 1.3513, + "step": 15655 + }, + { + "epoch": 0.8559517789046376, + "grad_norm": 1.3182681798934937, + "learning_rate": 1.1361813448314796e-06, + "loss": 1.2399, + "step": 15656 + }, + { + "epoch": 0.8560064513483592, + "grad_norm": 2.074552059173584, + "learning_rate": 1.1353355089169737e-06, + "loss": 1.3465, + "step": 15657 + }, + { + "epoch": 0.8560611237920807, + "grad_norm": 1.6697914600372314, + "learning_rate": 1.134489969014414e-06, + "loss": 1.5276, + "step": 15658 + }, + { + "epoch": 0.8561157962358023, + "grad_norm": 1.381954550743103, + "learning_rate": 1.1336447251520398e-06, + "loss": 1.3871, + "step": 15659 + }, + { + "epoch": 0.8561704686795238, + "grad_norm": 1.8592467308044434, + "learning_rate": 1.1327997773580733e-06, + "loss": 1.43, + "step": 15660 + }, + { + "epoch": 0.8562251411232453, + "grad_norm": 1.5287166833877563, + "learning_rate": 1.1319551256607286e-06, + "loss": 1.409, + "step": 15661 + }, + { + "epoch": 0.8562798135669669, + "grad_norm": 1.6199748516082764, + "learning_rate": 1.131110770088214e-06, + "loss": 1.2364, + "step": 15662 + }, + { + "epoch": 0.8563344860106885, + "grad_norm": 1.5720365047454834, + "learning_rate": 1.1302667106687227e-06, + "loss": 1.4073, + "step": 15663 + }, + { + "epoch": 0.85638915845441, + "grad_norm": 1.5285899639129639, + "learning_rate": 1.1294229474304364e-06, + "loss": 1.5102, + "step": 15664 + }, + { + "epoch": 0.8564438308981316, + "grad_norm": 1.4254170656204224, + "learning_rate": 1.1285794804015349e-06, + "loss": 1.1859, + "step": 15665 + }, + { + "epoch": 0.8564985033418531, + "grad_norm": 1.3611830472946167, + "learning_rate": 1.1277363096101834e-06, + "loss": 1.4394, + "step": 15666 + }, + { + "epoch": 0.8565531757855747, + "grad_norm": 1.5301506519317627, + "learning_rate": 1.1268934350845351e-06, + "loss": 1.522, + "step": 15667 + }, + { + "epoch": 0.8566078482292963, + "grad_norm": 1.4789875745773315, + "learning_rate": 1.1260508568527362e-06, + "loss": 1.7173, + "step": 15668 + }, + { + "epoch": 0.8566625206730177, + "grad_norm": 1.616151213645935, + "learning_rate": 1.125208574942921e-06, + "loss": 1.4675, + "step": 15669 + }, + { + "epoch": 0.8567171931167393, + "grad_norm": 1.901903510093689, + "learning_rate": 1.124366589383219e-06, + "loss": 1.4573, + "step": 15670 + }, + { + "epoch": 0.8567718655604609, + "grad_norm": 1.4063893556594849, + "learning_rate": 1.1235249002017434e-06, + "loss": 1.2308, + "step": 15671 + }, + { + "epoch": 0.8568265380041824, + "grad_norm": 1.4308310747146606, + "learning_rate": 1.1226835074265985e-06, + "loss": 1.5453, + "step": 15672 + }, + { + "epoch": 0.856881210447904, + "grad_norm": 1.5111974477767944, + "learning_rate": 1.121842411085885e-06, + "loss": 1.5999, + "step": 15673 + }, + { + "epoch": 0.8569358828916256, + "grad_norm": 1.8557946681976318, + "learning_rate": 1.1210016112076871e-06, + "loss": 1.2408, + "step": 15674 + }, + { + "epoch": 0.8569905553353471, + "grad_norm": 1.4387643337249756, + "learning_rate": 1.120161107820078e-06, + "loss": 1.5054, + "step": 15675 + }, + { + "epoch": 0.8570452277790687, + "grad_norm": 1.7655446529388428, + "learning_rate": 1.1193209009511285e-06, + "loss": 1.4653, + "step": 15676 + }, + { + "epoch": 0.8570999002227903, + "grad_norm": 1.7493265867233276, + "learning_rate": 1.118480990628894e-06, + "loss": 1.5027, + "step": 15677 + }, + { + "epoch": 0.8571545726665117, + "grad_norm": 1.8541638851165771, + "learning_rate": 1.1176413768814198e-06, + "loss": 1.3694, + "step": 15678 + }, + { + "epoch": 0.8572092451102333, + "grad_norm": 1.5962584018707275, + "learning_rate": 1.1168020597367435e-06, + "loss": 1.3919, + "step": 15679 + }, + { + "epoch": 0.8572639175539548, + "grad_norm": 1.1366713047027588, + "learning_rate": 1.1159630392228904e-06, + "loss": 1.6187, + "step": 15680 + }, + { + "epoch": 0.8573185899976764, + "grad_norm": 2.39506196975708, + "learning_rate": 1.1151243153678782e-06, + "loss": 1.1817, + "step": 15681 + }, + { + "epoch": 0.857373262441398, + "grad_norm": 1.1334789991378784, + "learning_rate": 1.1142858881997155e-06, + "loss": 1.6629, + "step": 15682 + }, + { + "epoch": 0.8574279348851195, + "grad_norm": 1.3336299657821655, + "learning_rate": 1.1134477577463954e-06, + "loss": 1.3996, + "step": 15683 + }, + { + "epoch": 0.8574826073288411, + "grad_norm": 1.8646801710128784, + "learning_rate": 1.1126099240359101e-06, + "loss": 1.558, + "step": 15684 + }, + { + "epoch": 0.8575372797725627, + "grad_norm": 1.6249463558197021, + "learning_rate": 1.1117723870962337e-06, + "loss": 1.1306, + "step": 15685 + }, + { + "epoch": 0.8575919522162841, + "grad_norm": 1.6334398984909058, + "learning_rate": 1.1109351469553331e-06, + "loss": 1.3944, + "step": 15686 + }, + { + "epoch": 0.8576466246600057, + "grad_norm": 1.416678547859192, + "learning_rate": 1.1100982036411679e-06, + "loss": 1.5177, + "step": 15687 + }, + { + "epoch": 0.8577012971037273, + "grad_norm": 1.5232017040252686, + "learning_rate": 1.1092615571816855e-06, + "loss": 1.4541, + "step": 15688 + }, + { + "epoch": 0.8577559695474488, + "grad_norm": 1.268938660621643, + "learning_rate": 1.1084252076048195e-06, + "loss": 1.4485, + "step": 15689 + }, + { + "epoch": 0.8578106419911704, + "grad_norm": 1.270403504371643, + "learning_rate": 1.1075891549385032e-06, + "loss": 1.3627, + "step": 15690 + }, + { + "epoch": 0.857865314434892, + "grad_norm": 1.4292056560516357, + "learning_rate": 1.1067533992106516e-06, + "loss": 1.5209, + "step": 15691 + }, + { + "epoch": 0.8579199868786135, + "grad_norm": 1.7241824865341187, + "learning_rate": 1.1059179404491726e-06, + "loss": 1.387, + "step": 15692 + }, + { + "epoch": 0.8579746593223351, + "grad_norm": 1.7691088914871216, + "learning_rate": 1.105082778681964e-06, + "loss": 1.4841, + "step": 15693 + }, + { + "epoch": 0.8580293317660567, + "grad_norm": 1.3105238676071167, + "learning_rate": 1.1042479139369144e-06, + "loss": 1.5026, + "step": 15694 + }, + { + "epoch": 0.8580840042097782, + "grad_norm": 1.5034250020980835, + "learning_rate": 1.1034133462418995e-06, + "loss": 1.2457, + "step": 15695 + }, + { + "epoch": 0.8581386766534997, + "grad_norm": 1.362968921661377, + "learning_rate": 1.1025790756247912e-06, + "loss": 1.4836, + "step": 15696 + }, + { + "epoch": 0.8581933490972212, + "grad_norm": 3.36417555809021, + "learning_rate": 1.101745102113444e-06, + "loss": 1.2939, + "step": 15697 + }, + { + "epoch": 0.8582480215409428, + "grad_norm": 1.2676500082015991, + "learning_rate": 1.1009114257357101e-06, + "loss": 1.5049, + "step": 15698 + }, + { + "epoch": 0.8583026939846644, + "grad_norm": 1.474260687828064, + "learning_rate": 1.100078046519426e-06, + "loss": 1.4584, + "step": 15699 + }, + { + "epoch": 0.8583573664283859, + "grad_norm": 2.0739853382110596, + "learning_rate": 1.0992449644924186e-06, + "loss": 1.4563, + "step": 15700 + }, + { + "epoch": 0.8584120388721075, + "grad_norm": 1.6042486429214478, + "learning_rate": 1.0984121796825099e-06, + "loss": 1.2935, + "step": 15701 + }, + { + "epoch": 0.8584667113158291, + "grad_norm": 1.482509732246399, + "learning_rate": 1.0975796921175065e-06, + "loss": 1.334, + "step": 15702 + }, + { + "epoch": 0.8585213837595506, + "grad_norm": 1.5220478773117065, + "learning_rate": 1.096747501825206e-06, + "loss": 1.3804, + "step": 15703 + }, + { + "epoch": 0.8585760562032722, + "grad_norm": 1.5356076955795288, + "learning_rate": 1.0959156088334e-06, + "loss": 1.5206, + "step": 15704 + }, + { + "epoch": 0.8586307286469937, + "grad_norm": 1.467358946800232, + "learning_rate": 1.095084013169867e-06, + "loss": 1.5982, + "step": 15705 + }, + { + "epoch": 0.8586854010907152, + "grad_norm": 1.7103527784347534, + "learning_rate": 1.0942527148623738e-06, + "loss": 1.5511, + "step": 15706 + }, + { + "epoch": 0.8587400735344368, + "grad_norm": 1.5327386856079102, + "learning_rate": 1.0934217139386805e-06, + "loss": 1.3905, + "step": 15707 + }, + { + "epoch": 0.8587947459781584, + "grad_norm": 1.1448520421981812, + "learning_rate": 1.0925910104265359e-06, + "loss": 1.592, + "step": 15708 + }, + { + "epoch": 0.8588494184218799, + "grad_norm": 1.82705819606781, + "learning_rate": 1.0917606043536777e-06, + "loss": 1.2726, + "step": 15709 + }, + { + "epoch": 0.8589040908656015, + "grad_norm": 1.8711150884628296, + "learning_rate": 1.0909304957478394e-06, + "loss": 1.4519, + "step": 15710 + }, + { + "epoch": 0.858958763309323, + "grad_norm": 1.2651337385177612, + "learning_rate": 1.0901006846367347e-06, + "loss": 1.4803, + "step": 15711 + }, + { + "epoch": 0.8590134357530446, + "grad_norm": 1.6725102663040161, + "learning_rate": 1.0892711710480785e-06, + "loss": 1.5671, + "step": 15712 + }, + { + "epoch": 0.8590681081967662, + "grad_norm": 1.7963672876358032, + "learning_rate": 1.088441955009567e-06, + "loss": 1.3064, + "step": 15713 + }, + { + "epoch": 0.8591227806404876, + "grad_norm": 1.4343512058258057, + "learning_rate": 1.087613036548888e-06, + "loss": 1.345, + "step": 15714 + }, + { + "epoch": 0.8591774530842092, + "grad_norm": 1.4551281929016113, + "learning_rate": 1.0867844156937257e-06, + "loss": 1.3585, + "step": 15715 + }, + { + "epoch": 0.8592321255279308, + "grad_norm": 1.4990991353988647, + "learning_rate": 1.0859560924717483e-06, + "loss": 1.4054, + "step": 15716 + }, + { + "epoch": 0.8592867979716523, + "grad_norm": 1.4028111696243286, + "learning_rate": 1.0851280669106135e-06, + "loss": 1.443, + "step": 15717 + }, + { + "epoch": 0.8593414704153739, + "grad_norm": 1.486256718635559, + "learning_rate": 1.0843003390379714e-06, + "loss": 1.1581, + "step": 15718 + }, + { + "epoch": 0.8593961428590955, + "grad_norm": 1.3853139877319336, + "learning_rate": 1.0834729088814632e-06, + "loss": 1.6219, + "step": 15719 + }, + { + "epoch": 0.859450815302817, + "grad_norm": 1.687893271446228, + "learning_rate": 1.0826457764687148e-06, + "loss": 1.4952, + "step": 15720 + }, + { + "epoch": 0.8595054877465386, + "grad_norm": 1.5398674011230469, + "learning_rate": 1.0818189418273527e-06, + "loss": 1.4519, + "step": 15721 + }, + { + "epoch": 0.8595601601902602, + "grad_norm": 1.3233249187469482, + "learning_rate": 1.0809924049849819e-06, + "loss": 1.348, + "step": 15722 + }, + { + "epoch": 0.8596148326339816, + "grad_norm": 1.5012767314910889, + "learning_rate": 1.080166165969202e-06, + "loss": 1.6608, + "step": 15723 + }, + { + "epoch": 0.8596695050777032, + "grad_norm": 1.4424939155578613, + "learning_rate": 1.079340224807608e-06, + "loss": 1.4851, + "step": 15724 + }, + { + "epoch": 0.8597241775214247, + "grad_norm": 1.7853182554244995, + "learning_rate": 1.0785145815277741e-06, + "loss": 1.7292, + "step": 15725 + }, + { + "epoch": 0.8597788499651463, + "grad_norm": 1.6601850986480713, + "learning_rate": 1.0776892361572756e-06, + "loss": 1.2272, + "step": 15726 + }, + { + "epoch": 0.8598335224088679, + "grad_norm": 1.4374920129776, + "learning_rate": 1.0768641887236697e-06, + "loss": 1.3753, + "step": 15727 + }, + { + "epoch": 0.8598881948525894, + "grad_norm": 1.424635648727417, + "learning_rate": 1.0760394392545058e-06, + "loss": 1.4449, + "step": 15728 + }, + { + "epoch": 0.859942867296311, + "grad_norm": 1.4781116247177124, + "learning_rate": 1.0752149877773278e-06, + "loss": 1.414, + "step": 15729 + }, + { + "epoch": 0.8599975397400326, + "grad_norm": 1.364914894104004, + "learning_rate": 1.0743908343196629e-06, + "loss": 1.5086, + "step": 15730 + }, + { + "epoch": 0.860052212183754, + "grad_norm": 1.4908788204193115, + "learning_rate": 1.0735669789090341e-06, + "loss": 1.162, + "step": 15731 + }, + { + "epoch": 0.8601068846274756, + "grad_norm": 1.446491003036499, + "learning_rate": 1.0727434215729494e-06, + "loss": 1.3551, + "step": 15732 + }, + { + "epoch": 0.8601615570711972, + "grad_norm": 1.4046735763549805, + "learning_rate": 1.0719201623389107e-06, + "loss": 1.3715, + "step": 15733 + }, + { + "epoch": 0.8602162295149187, + "grad_norm": 1.114652395248413, + "learning_rate": 1.0710972012344067e-06, + "loss": 1.3528, + "step": 15734 + }, + { + "epoch": 0.8602709019586403, + "grad_norm": 1.2632797956466675, + "learning_rate": 1.0702745382869207e-06, + "loss": 1.6755, + "step": 15735 + }, + { + "epoch": 0.8603255744023619, + "grad_norm": 1.6641038656234741, + "learning_rate": 1.0694521735239226e-06, + "loss": 1.7537, + "step": 15736 + }, + { + "epoch": 0.8603802468460834, + "grad_norm": 1.2875763177871704, + "learning_rate": 1.068630106972871e-06, + "loss": 1.5352, + "step": 15737 + }, + { + "epoch": 0.860434919289805, + "grad_norm": 1.6867876052856445, + "learning_rate": 1.0678083386612193e-06, + "loss": 1.1971, + "step": 15738 + }, + { + "epoch": 0.8604895917335265, + "grad_norm": 1.78334641456604, + "learning_rate": 1.0669868686164053e-06, + "loss": 1.3201, + "step": 15739 + }, + { + "epoch": 0.860544264177248, + "grad_norm": 1.1907151937484741, + "learning_rate": 1.0661656968658641e-06, + "loss": 1.4945, + "step": 15740 + }, + { + "epoch": 0.8605989366209696, + "grad_norm": 1.4928948879241943, + "learning_rate": 1.0653448234370144e-06, + "loss": 1.555, + "step": 15741 + }, + { + "epoch": 0.8606536090646911, + "grad_norm": 1.2618262767791748, + "learning_rate": 1.064524248357265e-06, + "loss": 1.5466, + "step": 15742 + }, + { + "epoch": 0.8607082815084127, + "grad_norm": 1.4249032735824585, + "learning_rate": 1.0637039716540197e-06, + "loss": 1.431, + "step": 15743 + }, + { + "epoch": 0.8607629539521343, + "grad_norm": 1.487084984779358, + "learning_rate": 1.0628839933546697e-06, + "loss": 1.5617, + "step": 15744 + }, + { + "epoch": 0.8608176263958558, + "grad_norm": 1.2335537672042847, + "learning_rate": 1.0620643134865937e-06, + "loss": 1.4563, + "step": 15745 + }, + { + "epoch": 0.8608722988395774, + "grad_norm": 1.636313557624817, + "learning_rate": 1.0612449320771645e-06, + "loss": 1.2309, + "step": 15746 + }, + { + "epoch": 0.860926971283299, + "grad_norm": 1.4299263954162598, + "learning_rate": 1.0604258491537423e-06, + "loss": 1.3998, + "step": 15747 + }, + { + "epoch": 0.8609816437270205, + "grad_norm": 1.60265052318573, + "learning_rate": 1.0596070647436763e-06, + "loss": 1.4487, + "step": 15748 + }, + { + "epoch": 0.861036316170742, + "grad_norm": 1.7204716205596924, + "learning_rate": 1.0587885788743112e-06, + "loss": 1.3077, + "step": 15749 + }, + { + "epoch": 0.8610909886144636, + "grad_norm": 1.7573869228363037, + "learning_rate": 1.0579703915729777e-06, + "loss": 1.3503, + "step": 15750 + }, + { + "epoch": 0.8611456610581851, + "grad_norm": 2.0624372959136963, + "learning_rate": 1.0571525028669927e-06, + "loss": 1.6696, + "step": 15751 + }, + { + "epoch": 0.8612003335019067, + "grad_norm": 1.617044448852539, + "learning_rate": 1.0563349127836731e-06, + "loss": 1.458, + "step": 15752 + }, + { + "epoch": 0.8612550059456282, + "grad_norm": 1.6972997188568115, + "learning_rate": 1.055517621350316e-06, + "loss": 1.3642, + "step": 15753 + }, + { + "epoch": 0.8613096783893498, + "grad_norm": 1.3043183088302612, + "learning_rate": 1.0547006285942163e-06, + "loss": 1.3032, + "step": 15754 + }, + { + "epoch": 0.8613643508330714, + "grad_norm": 1.437731146812439, + "learning_rate": 1.0538839345426543e-06, + "loss": 1.361, + "step": 15755 + }, + { + "epoch": 0.8614190232767929, + "grad_norm": 1.7229710817337036, + "learning_rate": 1.0530675392228995e-06, + "loss": 1.4311, + "step": 15756 + }, + { + "epoch": 0.8614736957205145, + "grad_norm": 1.4971998929977417, + "learning_rate": 1.0522514426622143e-06, + "loss": 1.4995, + "step": 15757 + }, + { + "epoch": 0.8615283681642361, + "grad_norm": 1.356444239616394, + "learning_rate": 1.0514356448878505e-06, + "loss": 1.6445, + "step": 15758 + }, + { + "epoch": 0.8615830406079575, + "grad_norm": 1.5938100814819336, + "learning_rate": 1.050620145927046e-06, + "loss": 1.6463, + "step": 15759 + }, + { + "epoch": 0.8616377130516791, + "grad_norm": 1.5244104862213135, + "learning_rate": 1.049804945807038e-06, + "loss": 1.4411, + "step": 15760 + }, + { + "epoch": 0.8616923854954007, + "grad_norm": 1.8486499786376953, + "learning_rate": 1.048990044555045e-06, + "loss": 1.4768, + "step": 15761 + }, + { + "epoch": 0.8617470579391222, + "grad_norm": 1.726745843887329, + "learning_rate": 1.0481754421982758e-06, + "loss": 1.5604, + "step": 15762 + }, + { + "epoch": 0.8618017303828438, + "grad_norm": 1.3715864419937134, + "learning_rate": 1.047361138763937e-06, + "loss": 1.4711, + "step": 15763 + }, + { + "epoch": 0.8618564028265654, + "grad_norm": 1.7699878215789795, + "learning_rate": 1.0465471342792188e-06, + "loss": 1.301, + "step": 15764 + }, + { + "epoch": 0.8619110752702869, + "grad_norm": 1.5820122957229614, + "learning_rate": 1.0457334287712994e-06, + "loss": 1.2912, + "step": 15765 + }, + { + "epoch": 0.8619657477140085, + "grad_norm": 1.973320484161377, + "learning_rate": 1.0449200222673538e-06, + "loss": 1.4701, + "step": 15766 + }, + { + "epoch": 0.86202042015773, + "grad_norm": 1.365687608718872, + "learning_rate": 1.0441069147945414e-06, + "loss": 1.6925, + "step": 15767 + }, + { + "epoch": 0.8620750926014515, + "grad_norm": 1.7444894313812256, + "learning_rate": 1.043294106380016e-06, + "loss": 1.2195, + "step": 15768 + }, + { + "epoch": 0.8621297650451731, + "grad_norm": 1.4628565311431885, + "learning_rate": 1.0424815970509184e-06, + "loss": 1.3413, + "step": 15769 + }, + { + "epoch": 0.8621844374888946, + "grad_norm": 1.9160022735595703, + "learning_rate": 1.0416693868343796e-06, + "loss": 1.3197, + "step": 15770 + }, + { + "epoch": 0.8622391099326162, + "grad_norm": 1.3621846437454224, + "learning_rate": 1.0408574757575218e-06, + "loss": 1.4515, + "step": 15771 + }, + { + "epoch": 0.8622937823763378, + "grad_norm": 1.4363181591033936, + "learning_rate": 1.0400458638474552e-06, + "loss": 1.3925, + "step": 15772 + }, + { + "epoch": 0.8623484548200593, + "grad_norm": 1.546670913696289, + "learning_rate": 1.0392345511312806e-06, + "loss": 1.5321, + "step": 15773 + }, + { + "epoch": 0.8624031272637809, + "grad_norm": 1.7792054414749146, + "learning_rate": 1.038423537636094e-06, + "loss": 1.439, + "step": 15774 + }, + { + "epoch": 0.8624577997075025, + "grad_norm": 1.479426383972168, + "learning_rate": 1.0376128233889747e-06, + "loss": 1.6887, + "step": 15775 + }, + { + "epoch": 0.862512472151224, + "grad_norm": 1.5805630683898926, + "learning_rate": 1.0368024084169914e-06, + "loss": 1.511, + "step": 15776 + }, + { + "epoch": 0.8625671445949455, + "grad_norm": 1.5708431005477905, + "learning_rate": 1.035992292747211e-06, + "loss": 1.5562, + "step": 15777 + }, + { + "epoch": 0.8626218170386671, + "grad_norm": 1.3734947443008423, + "learning_rate": 1.035182476406682e-06, + "loss": 1.2406, + "step": 15778 + }, + { + "epoch": 0.8626764894823886, + "grad_norm": 1.6467784643173218, + "learning_rate": 1.034372959422445e-06, + "loss": 1.3858, + "step": 15779 + }, + { + "epoch": 0.8627311619261102, + "grad_norm": 1.4018973112106323, + "learning_rate": 1.033563741821536e-06, + "loss": 1.2451, + "step": 15780 + }, + { + "epoch": 0.8627858343698317, + "grad_norm": 1.8020074367523193, + "learning_rate": 1.0327548236309714e-06, + "loss": 1.6687, + "step": 15781 + }, + { + "epoch": 0.8628405068135533, + "grad_norm": 1.3744115829467773, + "learning_rate": 1.0319462048777685e-06, + "loss": 1.3766, + "step": 15782 + }, + { + "epoch": 0.8628951792572749, + "grad_norm": 1.210669755935669, + "learning_rate": 1.0311378855889243e-06, + "loss": 1.4403, + "step": 15783 + }, + { + "epoch": 0.8629498517009964, + "grad_norm": 1.30763578414917, + "learning_rate": 1.030329865791434e-06, + "loss": 1.4471, + "step": 15784 + }, + { + "epoch": 0.863004524144718, + "grad_norm": 1.555216670036316, + "learning_rate": 1.0295221455122762e-06, + "loss": 1.4657, + "step": 15785 + }, + { + "epoch": 0.8630591965884395, + "grad_norm": 1.4478809833526611, + "learning_rate": 1.0287147247784246e-06, + "loss": 1.5191, + "step": 15786 + }, + { + "epoch": 0.863113869032161, + "grad_norm": 2.041663885116577, + "learning_rate": 1.0279076036168379e-06, + "loss": 1.4402, + "step": 15787 + }, + { + "epoch": 0.8631685414758826, + "grad_norm": 1.8035916090011597, + "learning_rate": 1.027100782054471e-06, + "loss": 1.5555, + "step": 15788 + }, + { + "epoch": 0.8632232139196042, + "grad_norm": 1.8049547672271729, + "learning_rate": 1.0262942601182657e-06, + "loss": 1.3674, + "step": 15789 + }, + { + "epoch": 0.8632778863633257, + "grad_norm": 1.710492491722107, + "learning_rate": 1.0254880378351506e-06, + "loss": 1.4859, + "step": 15790 + }, + { + "epoch": 0.8633325588070473, + "grad_norm": 1.2597357034683228, + "learning_rate": 1.0246821152320507e-06, + "loss": 1.4542, + "step": 15791 + }, + { + "epoch": 0.8633872312507689, + "grad_norm": 1.4326156377792358, + "learning_rate": 1.0238764923358768e-06, + "loss": 1.5863, + "step": 15792 + }, + { + "epoch": 0.8634419036944904, + "grad_norm": 1.5642423629760742, + "learning_rate": 1.0230711691735273e-06, + "loss": 1.5933, + "step": 15793 + }, + { + "epoch": 0.863496576138212, + "grad_norm": 1.5932190418243408, + "learning_rate": 1.0222661457718986e-06, + "loss": 1.5215, + "step": 15794 + }, + { + "epoch": 0.8635512485819334, + "grad_norm": 1.4216209650039673, + "learning_rate": 1.0214614221578701e-06, + "loss": 1.6643, + "step": 15795 + }, + { + "epoch": 0.863605921025655, + "grad_norm": 1.102648138999939, + "learning_rate": 1.0206569983583137e-06, + "loss": 1.6083, + "step": 15796 + }, + { + "epoch": 0.8636605934693766, + "grad_norm": 1.6216754913330078, + "learning_rate": 1.0198528744000913e-06, + "loss": 1.2476, + "step": 15797 + }, + { + "epoch": 0.8637152659130981, + "grad_norm": 1.496405005455017, + "learning_rate": 1.0190490503100515e-06, + "loss": 1.5494, + "step": 15798 + }, + { + "epoch": 0.8637699383568197, + "grad_norm": 1.9485639333724976, + "learning_rate": 1.0182455261150404e-06, + "loss": 1.4618, + "step": 15799 + }, + { + "epoch": 0.8638246108005413, + "grad_norm": 1.596828818321228, + "learning_rate": 1.0174423018418877e-06, + "loss": 1.477, + "step": 15800 + }, + { + "epoch": 0.8638792832442628, + "grad_norm": 1.3330178260803223, + "learning_rate": 1.0166393775174121e-06, + "loss": 1.6986, + "step": 15801 + }, + { + "epoch": 0.8639339556879844, + "grad_norm": 1.757030725479126, + "learning_rate": 1.015836753168431e-06, + "loss": 1.3298, + "step": 15802 + }, + { + "epoch": 0.863988628131706, + "grad_norm": 1.6997394561767578, + "learning_rate": 1.0150344288217418e-06, + "loss": 1.5555, + "step": 15803 + }, + { + "epoch": 0.8640433005754274, + "grad_norm": 1.3913476467132568, + "learning_rate": 1.0142324045041351e-06, + "loss": 1.2811, + "step": 15804 + }, + { + "epoch": 0.864097973019149, + "grad_norm": 1.519248604774475, + "learning_rate": 1.0134306802423965e-06, + "loss": 1.3631, + "step": 15805 + }, + { + "epoch": 0.8641526454628706, + "grad_norm": 1.3381214141845703, + "learning_rate": 1.0126292560632944e-06, + "loss": 1.5813, + "step": 15806 + }, + { + "epoch": 0.8642073179065921, + "grad_norm": 1.373335838317871, + "learning_rate": 1.0118281319935896e-06, + "loss": 1.4864, + "step": 15807 + }, + { + "epoch": 0.8642619903503137, + "grad_norm": 1.3656039237976074, + "learning_rate": 1.0110273080600374e-06, + "loss": 1.4868, + "step": 15808 + }, + { + "epoch": 0.8643166627940352, + "grad_norm": 1.4266048669815063, + "learning_rate": 1.0102267842893753e-06, + "loss": 1.4901, + "step": 15809 + }, + { + "epoch": 0.8643713352377568, + "grad_norm": 2.5003998279571533, + "learning_rate": 1.0094265607083375e-06, + "loss": 1.5879, + "step": 15810 + }, + { + "epoch": 0.8644260076814784, + "grad_norm": 1.3975666761398315, + "learning_rate": 1.0086266373436427e-06, + "loss": 1.5696, + "step": 15811 + }, + { + "epoch": 0.8644806801251999, + "grad_norm": 1.4360883235931396, + "learning_rate": 1.0078270142220015e-06, + "loss": 1.3935, + "step": 15812 + }, + { + "epoch": 0.8645353525689214, + "grad_norm": 3.508620023727417, + "learning_rate": 1.0070276913701193e-06, + "loss": 1.7251, + "step": 15813 + }, + { + "epoch": 0.864590025012643, + "grad_norm": 1.5168577432632446, + "learning_rate": 1.006228668814686e-06, + "loss": 1.4859, + "step": 15814 + }, + { + "epoch": 0.8646446974563645, + "grad_norm": 1.6405705213546753, + "learning_rate": 1.0054299465823791e-06, + "loss": 1.5902, + "step": 15815 + }, + { + "epoch": 0.8646993699000861, + "grad_norm": 1.4611111879348755, + "learning_rate": 1.004631524699875e-06, + "loss": 1.3785, + "step": 15816 + }, + { + "epoch": 0.8647540423438077, + "grad_norm": 1.5070855617523193, + "learning_rate": 1.0038334031938324e-06, + "loss": 1.4223, + "step": 15817 + }, + { + "epoch": 0.8648087147875292, + "grad_norm": 1.6120078563690186, + "learning_rate": 1.0030355820908998e-06, + "loss": 1.3986, + "step": 15818 + }, + { + "epoch": 0.8648633872312508, + "grad_norm": 1.618787407875061, + "learning_rate": 1.0022380614177251e-06, + "loss": 1.3626, + "step": 15819 + }, + { + "epoch": 0.8649180596749724, + "grad_norm": 3.3306288719177246, + "learning_rate": 1.0014408412009335e-06, + "loss": 1.2602, + "step": 15820 + }, + { + "epoch": 0.8649727321186939, + "grad_norm": 1.3828184604644775, + "learning_rate": 1.0006439214671471e-06, + "loss": 1.616, + "step": 15821 + }, + { + "epoch": 0.8650274045624154, + "grad_norm": 1.4552415609359741, + "learning_rate": 9.9984730224298e-07, + "loss": 1.6528, + "step": 15822 + }, + { + "epoch": 0.8650820770061369, + "grad_norm": 1.3052749633789062, + "learning_rate": 9.9905098355503e-07, + "loss": 1.3416, + "step": 15823 + }, + { + "epoch": 0.8651367494498585, + "grad_norm": 1.4762684106826782, + "learning_rate": 9.9825496542989e-07, + "loss": 1.3956, + "step": 15824 + }, + { + "epoch": 0.8651914218935801, + "grad_norm": 1.612430214881897, + "learning_rate": 9.974592478941393e-07, + "loss": 1.205, + "step": 15825 + }, + { + "epoch": 0.8652460943373016, + "grad_norm": 1.3733628988265991, + "learning_rate": 9.966638309743481e-07, + "loss": 1.6137, + "step": 15826 + }, + { + "epoch": 0.8653007667810232, + "grad_norm": 1.9826126098632812, + "learning_rate": 9.958687146970802e-07, + "loss": 1.3358, + "step": 15827 + }, + { + "epoch": 0.8653554392247448, + "grad_norm": 1.5168840885162354, + "learning_rate": 9.950738990888841e-07, + "loss": 1.5603, + "step": 15828 + }, + { + "epoch": 0.8654101116684663, + "grad_norm": 1.1899856328964233, + "learning_rate": 9.942793841762999e-07, + "loss": 1.431, + "step": 15829 + }, + { + "epoch": 0.8654647841121879, + "grad_norm": 1.5052008628845215, + "learning_rate": 9.934851699858616e-07, + "loss": 1.4618, + "step": 15830 + }, + { + "epoch": 0.8655194565559094, + "grad_norm": 2.2736754417419434, + "learning_rate": 9.926912565440883e-07, + "loss": 1.3181, + "step": 15831 + }, + { + "epoch": 0.8655741289996309, + "grad_norm": 1.3303117752075195, + "learning_rate": 9.918976438774884e-07, + "loss": 1.3788, + "step": 15832 + }, + { + "epoch": 0.8656288014433525, + "grad_norm": 1.9188127517700195, + "learning_rate": 9.911043320125657e-07, + "loss": 1.1119, + "step": 15833 + }, + { + "epoch": 0.8656834738870741, + "grad_norm": 1.2507448196411133, + "learning_rate": 9.903113209758098e-07, + "loss": 1.5129, + "step": 15834 + }, + { + "epoch": 0.8657381463307956, + "grad_norm": 1.5456688404083252, + "learning_rate": 9.895186107937005e-07, + "loss": 1.3854, + "step": 15835 + }, + { + "epoch": 0.8657928187745172, + "grad_norm": 1.796198844909668, + "learning_rate": 9.887262014927079e-07, + "loss": 1.4476, + "step": 15836 + }, + { + "epoch": 0.8658474912182387, + "grad_norm": 1.341819405555725, + "learning_rate": 9.879340930992943e-07, + "loss": 1.8089, + "step": 15837 + }, + { + "epoch": 0.8659021636619603, + "grad_norm": 1.7134082317352295, + "learning_rate": 9.87142285639906e-07, + "loss": 1.5031, + "step": 15838 + }, + { + "epoch": 0.8659568361056819, + "grad_norm": 2.4695663452148438, + "learning_rate": 9.863507791409876e-07, + "loss": 1.2475, + "step": 15839 + }, + { + "epoch": 0.8660115085494033, + "grad_norm": 1.4404182434082031, + "learning_rate": 9.85559573628967e-07, + "loss": 1.2518, + "step": 15840 + }, + { + "epoch": 0.8660661809931249, + "grad_norm": 1.222210168838501, + "learning_rate": 9.847686691302671e-07, + "loss": 1.5534, + "step": 15841 + }, + { + "epoch": 0.8661208534368465, + "grad_norm": 2.267523765563965, + "learning_rate": 9.83978065671296e-07, + "loss": 1.4869, + "step": 15842 + }, + { + "epoch": 0.866175525880568, + "grad_norm": 1.4322874546051025, + "learning_rate": 9.831877632784525e-07, + "loss": 1.3353, + "step": 15843 + }, + { + "epoch": 0.8662301983242896, + "grad_norm": 1.6390347480773926, + "learning_rate": 9.823977619781288e-07, + "loss": 1.2084, + "step": 15844 + }, + { + "epoch": 0.8662848707680112, + "grad_norm": 1.6182746887207031, + "learning_rate": 9.81608061796706e-07, + "loss": 1.3527, + "step": 15845 + }, + { + "epoch": 0.8663395432117327, + "grad_norm": 1.5745079517364502, + "learning_rate": 9.808186627605498e-07, + "loss": 1.4181, + "step": 15846 + }, + { + "epoch": 0.8663942156554543, + "grad_norm": 1.3068345785140991, + "learning_rate": 9.800295648960245e-07, + "loss": 1.497, + "step": 15847 + }, + { + "epoch": 0.8664488880991759, + "grad_norm": 1.2664932012557983, + "learning_rate": 9.79240768229478e-07, + "loss": 1.5821, + "step": 15848 + }, + { + "epoch": 0.8665035605428973, + "grad_norm": 1.2047264575958252, + "learning_rate": 9.784522727872493e-07, + "loss": 1.3882, + "step": 15849 + }, + { + "epoch": 0.8665582329866189, + "grad_norm": 1.6127508878707886, + "learning_rate": 9.776640785956703e-07, + "loss": 1.5558, + "step": 15850 + }, + { + "epoch": 0.8666129054303404, + "grad_norm": 1.36369788646698, + "learning_rate": 9.768761856810581e-07, + "loss": 1.5716, + "step": 15851 + }, + { + "epoch": 0.866667577874062, + "grad_norm": 1.8323938846588135, + "learning_rate": 9.760885940697229e-07, + "loss": 1.6674, + "step": 15852 + }, + { + "epoch": 0.8667222503177836, + "grad_norm": 1.6934500932693481, + "learning_rate": 9.753013037879655e-07, + "loss": 1.1979, + "step": 15853 + }, + { + "epoch": 0.8667769227615051, + "grad_norm": 1.3673537969589233, + "learning_rate": 9.74514314862074e-07, + "loss": 1.3746, + "step": 15854 + }, + { + "epoch": 0.8668315952052267, + "grad_norm": 2.15612530708313, + "learning_rate": 9.737276273183294e-07, + "loss": 1.3022, + "step": 15855 + }, + { + "epoch": 0.8668862676489483, + "grad_norm": 1.215067744255066, + "learning_rate": 9.729412411829998e-07, + "loss": 1.445, + "step": 15856 + }, + { + "epoch": 0.8669409400926698, + "grad_norm": 2.1148183345794678, + "learning_rate": 9.72155156482344e-07, + "loss": 1.4523, + "step": 15857 + }, + { + "epoch": 0.8669956125363913, + "grad_norm": 1.449779748916626, + "learning_rate": 9.713693732426132e-07, + "loss": 1.4188, + "step": 15858 + }, + { + "epoch": 0.8670502849801129, + "grad_norm": 1.760446548461914, + "learning_rate": 9.705838914900456e-07, + "loss": 1.5211, + "step": 15859 + }, + { + "epoch": 0.8671049574238344, + "grad_norm": 1.7692457437515259, + "learning_rate": 9.697987112508688e-07, + "loss": 1.2657, + "step": 15860 + }, + { + "epoch": 0.867159629867556, + "grad_norm": 2.1355741024017334, + "learning_rate": 9.690138325513043e-07, + "loss": 1.3883, + "step": 15861 + }, + { + "epoch": 0.8672143023112776, + "grad_norm": 1.4895884990692139, + "learning_rate": 9.6822925541756e-07, + "loss": 1.4195, + "step": 15862 + }, + { + "epoch": 0.8672689747549991, + "grad_norm": 1.5118870735168457, + "learning_rate": 9.674449798758334e-07, + "loss": 1.3247, + "step": 15863 + }, + { + "epoch": 0.8673236471987207, + "grad_norm": 1.6385586261749268, + "learning_rate": 9.666610059523163e-07, + "loss": 1.5421, + "step": 15864 + }, + { + "epoch": 0.8673783196424422, + "grad_norm": 1.807206630706787, + "learning_rate": 9.65877333673184e-07, + "loss": 1.4592, + "step": 15865 + }, + { + "epoch": 0.8674329920861638, + "grad_norm": 1.5616321563720703, + "learning_rate": 9.65093963064606e-07, + "loss": 1.1859, + "step": 15866 + }, + { + "epoch": 0.8674876645298853, + "grad_norm": 1.2021100521087646, + "learning_rate": 9.643108941527435e-07, + "loss": 1.4659, + "step": 15867 + }, + { + "epoch": 0.8675423369736068, + "grad_norm": 1.434768557548523, + "learning_rate": 9.635281269637409e-07, + "loss": 1.5473, + "step": 15868 + }, + { + "epoch": 0.8675970094173284, + "grad_norm": 1.5872777700424194, + "learning_rate": 9.627456615237406e-07, + "loss": 1.5109, + "step": 15869 + }, + { + "epoch": 0.86765168186105, + "grad_norm": 1.4689174890518188, + "learning_rate": 9.6196349785887e-07, + "loss": 1.5308, + "step": 15870 + }, + { + "epoch": 0.8677063543047715, + "grad_norm": 1.845636010169983, + "learning_rate": 9.611816359952442e-07, + "loss": 1.4821, + "step": 15871 + }, + { + "epoch": 0.8677610267484931, + "grad_norm": 1.3157819509506226, + "learning_rate": 9.604000759589748e-07, + "loss": 1.7106, + "step": 15872 + }, + { + "epoch": 0.8678156991922147, + "grad_norm": 1.432026982307434, + "learning_rate": 9.5961881777616e-07, + "loss": 1.5594, + "step": 15873 + }, + { + "epoch": 0.8678703716359362, + "grad_norm": 1.3242857456207275, + "learning_rate": 9.588378614728867e-07, + "loss": 1.5168, + "step": 15874 + }, + { + "epoch": 0.8679250440796578, + "grad_norm": 1.879165768623352, + "learning_rate": 9.580572070752335e-07, + "loss": 1.4064, + "step": 15875 + }, + { + "epoch": 0.8679797165233794, + "grad_norm": 1.5216864347457886, + "learning_rate": 9.57276854609267e-07, + "loss": 1.2133, + "step": 15876 + }, + { + "epoch": 0.8680343889671008, + "grad_norm": 1.395585298538208, + "learning_rate": 9.564968041010435e-07, + "loss": 1.409, + "step": 15877 + }, + { + "epoch": 0.8680890614108224, + "grad_norm": 1.3993861675262451, + "learning_rate": 9.55717055576616e-07, + "loss": 1.6045, + "step": 15878 + }, + { + "epoch": 0.8681437338545439, + "grad_norm": 1.712119698524475, + "learning_rate": 9.549376090620188e-07, + "loss": 1.3621, + "step": 15879 + }, + { + "epoch": 0.8681984062982655, + "grad_norm": 1.5162476301193237, + "learning_rate": 9.541584645832768e-07, + "loss": 1.4954, + "step": 15880 + }, + { + "epoch": 0.8682530787419871, + "grad_norm": 1.7888926267623901, + "learning_rate": 9.533796221664137e-07, + "loss": 1.4811, + "step": 15881 + }, + { + "epoch": 0.8683077511857086, + "grad_norm": 2.028014898300171, + "learning_rate": 9.52601081837431e-07, + "loss": 1.4182, + "step": 15882 + }, + { + "epoch": 0.8683624236294302, + "grad_norm": 1.3967888355255127, + "learning_rate": 9.518228436223298e-07, + "loss": 1.5535, + "step": 15883 + }, + { + "epoch": 0.8684170960731518, + "grad_norm": 1.2292672395706177, + "learning_rate": 9.510449075470973e-07, + "loss": 1.4417, + "step": 15884 + }, + { + "epoch": 0.8684717685168732, + "grad_norm": 1.3328567743301392, + "learning_rate": 9.502672736377061e-07, + "loss": 1.5145, + "step": 15885 + }, + { + "epoch": 0.8685264409605948, + "grad_norm": 1.9282193183898926, + "learning_rate": 9.494899419201298e-07, + "loss": 1.1337, + "step": 15886 + }, + { + "epoch": 0.8685811134043164, + "grad_norm": 1.922210454940796, + "learning_rate": 9.487129124203209e-07, + "loss": 1.4653, + "step": 15887 + }, + { + "epoch": 0.8686357858480379, + "grad_norm": 1.754929542541504, + "learning_rate": 9.479361851642277e-07, + "loss": 1.3924, + "step": 15888 + }, + { + "epoch": 0.8686904582917595, + "grad_norm": 1.1197253465652466, + "learning_rate": 9.471597601777871e-07, + "loss": 1.4652, + "step": 15889 + }, + { + "epoch": 0.8687451307354811, + "grad_norm": 1.607455849647522, + "learning_rate": 9.46383637486925e-07, + "loss": 1.3577, + "step": 15890 + }, + { + "epoch": 0.8687998031792026, + "grad_norm": 1.4426013231277466, + "learning_rate": 9.456078171175564e-07, + "loss": 1.1719, + "step": 15891 + }, + { + "epoch": 0.8688544756229242, + "grad_norm": 1.4413155317306519, + "learning_rate": 9.448322990955916e-07, + "loss": 1.4347, + "step": 15892 + }, + { + "epoch": 0.8689091480666457, + "grad_norm": 1.6609954833984375, + "learning_rate": 9.440570834469243e-07, + "loss": 1.3882, + "step": 15893 + }, + { + "epoch": 0.8689638205103672, + "grad_norm": 2.0795834064483643, + "learning_rate": 9.432821701974393e-07, + "loss": 1.6112, + "step": 15894 + }, + { + "epoch": 0.8690184929540888, + "grad_norm": 1.4944950342178345, + "learning_rate": 9.425075593730181e-07, + "loss": 1.4944, + "step": 15895 + }, + { + "epoch": 0.8690731653978103, + "grad_norm": 1.7512000799179077, + "learning_rate": 9.417332509995203e-07, + "loss": 1.4214, + "step": 15896 + }, + { + "epoch": 0.8691278378415319, + "grad_norm": 1.4067714214324951, + "learning_rate": 9.409592451028082e-07, + "loss": 1.4528, + "step": 15897 + }, + { + "epoch": 0.8691825102852535, + "grad_norm": 1.8053385019302368, + "learning_rate": 9.401855417087236e-07, + "loss": 1.5599, + "step": 15898 + }, + { + "epoch": 0.869237182728975, + "grad_norm": 2.2137434482574463, + "learning_rate": 9.394121408431022e-07, + "loss": 1.4441, + "step": 15899 + }, + { + "epoch": 0.8692918551726966, + "grad_norm": 1.667281985282898, + "learning_rate": 9.386390425317726e-07, + "loss": 1.2361, + "step": 15900 + }, + { + "epoch": 0.8693465276164182, + "grad_norm": 1.8450067043304443, + "learning_rate": 9.378662468005484e-07, + "loss": 1.6393, + "step": 15901 + }, + { + "epoch": 0.8694012000601397, + "grad_norm": 1.4336055517196655, + "learning_rate": 9.370937536752344e-07, + "loss": 1.4971, + "step": 15902 + }, + { + "epoch": 0.8694558725038612, + "grad_norm": 1.6595700979232788, + "learning_rate": 9.36321563181628e-07, + "loss": 1.4007, + "step": 15903 + }, + { + "epoch": 0.8695105449475828, + "grad_norm": 1.9954487085342407, + "learning_rate": 9.355496753455118e-07, + "loss": 1.2979, + "step": 15904 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 1.9562537670135498, + "learning_rate": 9.347780901926617e-07, + "loss": 1.4484, + "step": 15905 + }, + { + "epoch": 0.8696198898350259, + "grad_norm": 1.64585280418396, + "learning_rate": 9.340068077488451e-07, + "loss": 1.5499, + "step": 15906 + }, + { + "epoch": 0.8696745622787475, + "grad_norm": 1.7439874410629272, + "learning_rate": 9.332358280398146e-07, + "loss": 1.2675, + "step": 15907 + }, + { + "epoch": 0.869729234722469, + "grad_norm": 1.3976104259490967, + "learning_rate": 9.32465151091314e-07, + "loss": 1.5448, + "step": 15908 + }, + { + "epoch": 0.8697839071661906, + "grad_norm": 1.5987998247146606, + "learning_rate": 9.316947769290819e-07, + "loss": 1.3761, + "step": 15909 + }, + { + "epoch": 0.8698385796099121, + "grad_norm": 1.2655813694000244, + "learning_rate": 9.309247055788384e-07, + "loss": 1.4537, + "step": 15910 + }, + { + "epoch": 0.8698932520536337, + "grad_norm": 1.7607282400131226, + "learning_rate": 9.301549370663022e-07, + "loss": 1.5733, + "step": 15911 + }, + { + "epoch": 0.8699479244973553, + "grad_norm": 1.5254734754562378, + "learning_rate": 9.293854714171758e-07, + "loss": 1.4284, + "step": 15912 + }, + { + "epoch": 0.8700025969410767, + "grad_norm": 1.3985049724578857, + "learning_rate": 9.286163086571531e-07, + "loss": 1.4149, + "step": 15913 + }, + { + "epoch": 0.8700572693847983, + "grad_norm": 1.3940941095352173, + "learning_rate": 9.278474488119182e-07, + "loss": 1.8198, + "step": 15914 + }, + { + "epoch": 0.8701119418285199, + "grad_norm": 1.394802451133728, + "learning_rate": 9.270788919071461e-07, + "loss": 1.4092, + "step": 15915 + }, + { + "epoch": 0.8701666142722414, + "grad_norm": 1.3482877016067505, + "learning_rate": 9.263106379684972e-07, + "loss": 1.4103, + "step": 15916 + }, + { + "epoch": 0.870221286715963, + "grad_norm": 1.3253318071365356, + "learning_rate": 9.255426870216311e-07, + "loss": 1.6809, + "step": 15917 + }, + { + "epoch": 0.8702759591596846, + "grad_norm": 1.4208226203918457, + "learning_rate": 9.247750390921883e-07, + "loss": 1.4833, + "step": 15918 + }, + { + "epoch": 0.8703306316034061, + "grad_norm": 1.3976548910140991, + "learning_rate": 9.240076942058008e-07, + "loss": 1.4737, + "step": 15919 + }, + { + "epoch": 0.8703853040471277, + "grad_norm": 1.4039132595062256, + "learning_rate": 9.232406523880954e-07, + "loss": 1.5019, + "step": 15920 + }, + { + "epoch": 0.8704399764908493, + "grad_norm": 1.4604939222335815, + "learning_rate": 9.224739136646843e-07, + "loss": 1.4029, + "step": 15921 + }, + { + "epoch": 0.8704946489345707, + "grad_norm": 1.7398484945297241, + "learning_rate": 9.21707478061169e-07, + "loss": 1.5631, + "step": 15922 + }, + { + "epoch": 0.8705493213782923, + "grad_norm": 1.288702130317688, + "learning_rate": 9.209413456031446e-07, + "loss": 1.6628, + "step": 15923 + }, + { + "epoch": 0.8706039938220138, + "grad_norm": 1.7985565662384033, + "learning_rate": 9.201755163161918e-07, + "loss": 1.5469, + "step": 15924 + }, + { + "epoch": 0.8706586662657354, + "grad_norm": 1.3585171699523926, + "learning_rate": 9.19409990225888e-07, + "loss": 1.5741, + "step": 15925 + }, + { + "epoch": 0.870713338709457, + "grad_norm": 1.6784369945526123, + "learning_rate": 9.186447673577914e-07, + "loss": 1.542, + "step": 15926 + }, + { + "epoch": 0.8707680111531785, + "grad_norm": 1.7347761392593384, + "learning_rate": 9.178798477374562e-07, + "loss": 1.3494, + "step": 15927 + }, + { + "epoch": 0.8708226835969001, + "grad_norm": 1.384800672531128, + "learning_rate": 9.171152313904253e-07, + "loss": 1.5082, + "step": 15928 + }, + { + "epoch": 0.8708773560406217, + "grad_norm": 1.9467254877090454, + "learning_rate": 9.163509183422303e-07, + "loss": 1.1824, + "step": 15929 + }, + { + "epoch": 0.8709320284843431, + "grad_norm": 1.6595468521118164, + "learning_rate": 9.155869086183922e-07, + "loss": 1.131, + "step": 15930 + }, + { + "epoch": 0.8709867009280647, + "grad_norm": 1.728055477142334, + "learning_rate": 9.148232022444259e-07, + "loss": 1.3819, + "step": 15931 + }, + { + "epoch": 0.8710413733717863, + "grad_norm": 1.5539379119873047, + "learning_rate": 9.140597992458322e-07, + "loss": 1.3663, + "step": 15932 + }, + { + "epoch": 0.8710960458155078, + "grad_norm": 1.3065425157546997, + "learning_rate": 9.132966996480996e-07, + "loss": 1.51, + "step": 15933 + }, + { + "epoch": 0.8711507182592294, + "grad_norm": 1.4452505111694336, + "learning_rate": 9.125339034767155e-07, + "loss": 1.3813, + "step": 15934 + }, + { + "epoch": 0.871205390702951, + "grad_norm": 1.618558645248413, + "learning_rate": 9.117714107571496e-07, + "loss": 1.2985, + "step": 15935 + }, + { + "epoch": 0.8712600631466725, + "grad_norm": 1.3445574045181274, + "learning_rate": 9.110092215148592e-07, + "loss": 1.2632, + "step": 15936 + }, + { + "epoch": 0.8713147355903941, + "grad_norm": 1.602541446685791, + "learning_rate": 9.102473357753017e-07, + "loss": 1.3232, + "step": 15937 + }, + { + "epoch": 0.8713694080341156, + "grad_norm": 1.5534727573394775, + "learning_rate": 9.094857535639157e-07, + "loss": 1.4143, + "step": 15938 + }, + { + "epoch": 0.8714240804778371, + "grad_norm": 1.6353477239608765, + "learning_rate": 9.087244749061308e-07, + "loss": 1.3943, + "step": 15939 + }, + { + "epoch": 0.8714787529215587, + "grad_norm": 1.7482571601867676, + "learning_rate": 9.079634998273701e-07, + "loss": 1.1912, + "step": 15940 + }, + { + "epoch": 0.8715334253652802, + "grad_norm": 1.2983410358428955, + "learning_rate": 9.072028283530399e-07, + "loss": 1.5675, + "step": 15941 + }, + { + "epoch": 0.8715880978090018, + "grad_norm": 1.5550142526626587, + "learning_rate": 9.064424605085476e-07, + "loss": 1.3876, + "step": 15942 + }, + { + "epoch": 0.8716427702527234, + "grad_norm": 1.3584955930709839, + "learning_rate": 9.056823963192796e-07, + "loss": 1.584, + "step": 15943 + }, + { + "epoch": 0.8716974426964449, + "grad_norm": 1.5485048294067383, + "learning_rate": 9.049226358106156e-07, + "loss": 1.477, + "step": 15944 + }, + { + "epoch": 0.8717521151401665, + "grad_norm": 1.680645227432251, + "learning_rate": 9.041631790079275e-07, + "loss": 1.3315, + "step": 15945 + }, + { + "epoch": 0.8718067875838881, + "grad_norm": 1.5371710062026978, + "learning_rate": 9.034040259365762e-07, + "loss": 1.4439, + "step": 15946 + }, + { + "epoch": 0.8718614600276096, + "grad_norm": 1.2943028211593628, + "learning_rate": 9.02645176621908e-07, + "loss": 1.4547, + "step": 15947 + }, + { + "epoch": 0.8719161324713312, + "grad_norm": 1.5528528690338135, + "learning_rate": 9.018866310892671e-07, + "loss": 1.4657, + "step": 15948 + }, + { + "epoch": 0.8719708049150527, + "grad_norm": 1.834403395652771, + "learning_rate": 9.011283893639811e-07, + "loss": 1.4533, + "step": 15949 + }, + { + "epoch": 0.8720254773587742, + "grad_norm": 1.6604795455932617, + "learning_rate": 9.003704514713663e-07, + "loss": 1.2668, + "step": 15950 + }, + { + "epoch": 0.8720801498024958, + "grad_norm": 1.505867600440979, + "learning_rate": 8.99612817436738e-07, + "loss": 1.4925, + "step": 15951 + }, + { + "epoch": 0.8721348222462173, + "grad_norm": 1.8758596181869507, + "learning_rate": 8.988554872853927e-07, + "loss": 1.4851, + "step": 15952 + }, + { + "epoch": 0.8721894946899389, + "grad_norm": 1.6681348085403442, + "learning_rate": 8.98098461042618e-07, + "loss": 1.4324, + "step": 15953 + }, + { + "epoch": 0.8722441671336605, + "grad_norm": 1.364090919494629, + "learning_rate": 8.973417387336947e-07, + "loss": 1.5637, + "step": 15954 + }, + { + "epoch": 0.872298839577382, + "grad_norm": 1.2935945987701416, + "learning_rate": 8.965853203838892e-07, + "loss": 1.2061, + "step": 15955 + }, + { + "epoch": 0.8723535120211036, + "grad_norm": 1.3314217329025269, + "learning_rate": 8.958292060184637e-07, + "loss": 1.567, + "step": 15956 + }, + { + "epoch": 0.8724081844648252, + "grad_norm": 1.414312720298767, + "learning_rate": 8.950733956626634e-07, + "loss": 1.4783, + "step": 15957 + }, + { + "epoch": 0.8724628569085466, + "grad_norm": 1.759376883506775, + "learning_rate": 8.94317889341727e-07, + "loss": 1.4447, + "step": 15958 + }, + { + "epoch": 0.8725175293522682, + "grad_norm": 1.518141508102417, + "learning_rate": 8.935626870808856e-07, + "loss": 1.618, + "step": 15959 + }, + { + "epoch": 0.8725722017959898, + "grad_norm": 1.6630570888519287, + "learning_rate": 8.928077889053544e-07, + "loss": 1.5911, + "step": 15960 + }, + { + "epoch": 0.8726268742397113, + "grad_norm": 1.7427366971969604, + "learning_rate": 8.9205319484034e-07, + "loss": 1.467, + "step": 15961 + }, + { + "epoch": 0.8726815466834329, + "grad_norm": 1.9943956136703491, + "learning_rate": 8.912989049110432e-07, + "loss": 1.4513, + "step": 15962 + }, + { + "epoch": 0.8727362191271545, + "grad_norm": 1.6585698127746582, + "learning_rate": 8.905449191426507e-07, + "loss": 1.5485, + "step": 15963 + }, + { + "epoch": 0.872790891570876, + "grad_norm": 1.646083116531372, + "learning_rate": 8.897912375603379e-07, + "loss": 1.3555, + "step": 15964 + }, + { + "epoch": 0.8728455640145976, + "grad_norm": 1.6817591190338135, + "learning_rate": 8.890378601892746e-07, + "loss": 1.4026, + "step": 15965 + }, + { + "epoch": 0.872900236458319, + "grad_norm": 1.6499860286712646, + "learning_rate": 8.882847870546174e-07, + "loss": 1.4833, + "step": 15966 + }, + { + "epoch": 0.8729549089020406, + "grad_norm": 1.6542960405349731, + "learning_rate": 8.875320181815117e-07, + "loss": 1.3521, + "step": 15967 + }, + { + "epoch": 0.8730095813457622, + "grad_norm": 1.2773617506027222, + "learning_rate": 8.867795535950951e-07, + "loss": 1.6793, + "step": 15968 + }, + { + "epoch": 0.8730642537894837, + "grad_norm": 1.4987937211990356, + "learning_rate": 8.860273933204933e-07, + "loss": 1.428, + "step": 15969 + }, + { + "epoch": 0.8731189262332053, + "grad_norm": 1.5433430671691895, + "learning_rate": 8.852755373828236e-07, + "loss": 1.4833, + "step": 15970 + }, + { + "epoch": 0.8731735986769269, + "grad_norm": 1.4926338195800781, + "learning_rate": 8.84523985807193e-07, + "loss": 1.447, + "step": 15971 + }, + { + "epoch": 0.8732282711206484, + "grad_norm": 1.5391240119934082, + "learning_rate": 8.837727386186956e-07, + "loss": 1.4655, + "step": 15972 + }, + { + "epoch": 0.87328294356437, + "grad_norm": 1.4420499801635742, + "learning_rate": 8.830217958424192e-07, + "loss": 1.6074, + "step": 15973 + }, + { + "epoch": 0.8733376160080916, + "grad_norm": 1.5130635499954224, + "learning_rate": 8.822711575034381e-07, + "loss": 1.6842, + "step": 15974 + }, + { + "epoch": 0.873392288451813, + "grad_norm": 1.4943126440048218, + "learning_rate": 8.81520823626818e-07, + "loss": 1.5033, + "step": 15975 + }, + { + "epoch": 0.8734469608955346, + "grad_norm": 1.8796930313110352, + "learning_rate": 8.807707942376165e-07, + "loss": 1.5022, + "step": 15976 + }, + { + "epoch": 0.8735016333392562, + "grad_norm": 1.3347622156143188, + "learning_rate": 8.800210693608758e-07, + "loss": 1.5191, + "step": 15977 + }, + { + "epoch": 0.8735563057829777, + "grad_norm": 1.312559962272644, + "learning_rate": 8.792716490216335e-07, + "loss": 1.2335, + "step": 15978 + }, + { + "epoch": 0.8736109782266993, + "grad_norm": 1.682990312576294, + "learning_rate": 8.785225332449133e-07, + "loss": 1.4111, + "step": 15979 + }, + { + "epoch": 0.8736656506704208, + "grad_norm": 1.258252739906311, + "learning_rate": 8.77773722055727e-07, + "loss": 1.4148, + "step": 15980 + }, + { + "epoch": 0.8737203231141424, + "grad_norm": 1.562368631362915, + "learning_rate": 8.770252154790848e-07, + "loss": 1.4116, + "step": 15981 + }, + { + "epoch": 0.873774995557864, + "grad_norm": 1.2453317642211914, + "learning_rate": 8.762770135399778e-07, + "loss": 1.3261, + "step": 15982 + }, + { + "epoch": 0.8738296680015855, + "grad_norm": 1.44491708278656, + "learning_rate": 8.755291162633894e-07, + "loss": 1.4582, + "step": 15983 + }, + { + "epoch": 0.873884340445307, + "grad_norm": 1.402620792388916, + "learning_rate": 8.747815236742974e-07, + "loss": 1.3139, + "step": 15984 + }, + { + "epoch": 0.8739390128890286, + "grad_norm": 1.5786889791488647, + "learning_rate": 8.740342357976628e-07, + "loss": 1.3844, + "step": 15985 + }, + { + "epoch": 0.8739936853327501, + "grad_norm": 1.6037628650665283, + "learning_rate": 8.73287252658438e-07, + "loss": 1.4629, + "step": 15986 + }, + { + "epoch": 0.8740483577764717, + "grad_norm": 1.6800559759140015, + "learning_rate": 8.725405742815695e-07, + "loss": 1.2568, + "step": 15987 + }, + { + "epoch": 0.8741030302201933, + "grad_norm": 1.2251733541488647, + "learning_rate": 8.717942006919911e-07, + "loss": 1.4907, + "step": 15988 + }, + { + "epoch": 0.8741577026639148, + "grad_norm": 3.230215072631836, + "learning_rate": 8.710481319146213e-07, + "loss": 1.7308, + "step": 15989 + }, + { + "epoch": 0.8742123751076364, + "grad_norm": 1.3616560697555542, + "learning_rate": 8.703023679743783e-07, + "loss": 1.3155, + "step": 15990 + }, + { + "epoch": 0.874267047551358, + "grad_norm": 1.74410879611969, + "learning_rate": 8.69556908896162e-07, + "loss": 1.4065, + "step": 15991 + }, + { + "epoch": 0.8743217199950795, + "grad_norm": 1.7412397861480713, + "learning_rate": 8.688117547048669e-07, + "loss": 1.4335, + "step": 15992 + }, + { + "epoch": 0.874376392438801, + "grad_norm": 1.5375391244888306, + "learning_rate": 8.680669054253732e-07, + "loss": 1.5504, + "step": 15993 + }, + { + "epoch": 0.8744310648825225, + "grad_norm": 1.9762904644012451, + "learning_rate": 8.673223610825532e-07, + "loss": 1.6141, + "step": 15994 + }, + { + "epoch": 0.8744857373262441, + "grad_norm": 1.1835752725601196, + "learning_rate": 8.665781217012725e-07, + "loss": 1.6453, + "step": 15995 + }, + { + "epoch": 0.8745404097699657, + "grad_norm": 1.7151106595993042, + "learning_rate": 8.658341873063792e-07, + "loss": 1.3423, + "step": 15996 + }, + { + "epoch": 0.8745950822136872, + "grad_norm": 1.4874560832977295, + "learning_rate": 8.650905579227154e-07, + "loss": 1.5569, + "step": 15997 + }, + { + "epoch": 0.8746497546574088, + "grad_norm": 1.7583951950073242, + "learning_rate": 8.643472335751157e-07, + "loss": 1.4069, + "step": 15998 + }, + { + "epoch": 0.8747044271011304, + "grad_norm": 1.319462537765503, + "learning_rate": 8.636042142883982e-07, + "loss": 1.4982, + "step": 15999 + }, + { + "epoch": 0.8747590995448519, + "grad_norm": 1.6380635499954224, + "learning_rate": 8.628615000873741e-07, + "loss": 1.5493, + "step": 16000 + }, + { + "epoch": 0.8748137719885735, + "grad_norm": 1.6000686883926392, + "learning_rate": 8.62119090996848e-07, + "loss": 1.5239, + "step": 16001 + }, + { + "epoch": 0.8748684444322951, + "grad_norm": 1.2563904523849487, + "learning_rate": 8.613769870416067e-07, + "loss": 1.4693, + "step": 16002 + }, + { + "epoch": 0.8749231168760165, + "grad_norm": 1.77444589138031, + "learning_rate": 8.606351882464314e-07, + "loss": 1.6364, + "step": 16003 + }, + { + "epoch": 0.8749777893197381, + "grad_norm": 1.6741127967834473, + "learning_rate": 8.598936946360948e-07, + "loss": 1.4849, + "step": 16004 + }, + { + "epoch": 0.8750324617634597, + "grad_norm": 1.2786240577697754, + "learning_rate": 8.591525062353557e-07, + "loss": 1.4501, + "step": 16005 + }, + { + "epoch": 0.8750871342071812, + "grad_norm": 1.965118646621704, + "learning_rate": 8.584116230689643e-07, + "loss": 1.4191, + "step": 16006 + }, + { + "epoch": 0.8751418066509028, + "grad_norm": 2.0879976749420166, + "learning_rate": 8.576710451616599e-07, + "loss": 1.3739, + "step": 16007 + }, + { + "epoch": 0.8751964790946243, + "grad_norm": 1.6026558876037598, + "learning_rate": 8.569307725381715e-07, + "loss": 1.3434, + "step": 16008 + }, + { + "epoch": 0.8752511515383459, + "grad_norm": 1.4405925273895264, + "learning_rate": 8.561908052232204e-07, + "loss": 1.4653, + "step": 16009 + }, + { + "epoch": 0.8753058239820675, + "grad_norm": 1.470206379890442, + "learning_rate": 8.554511432415147e-07, + "loss": 1.4984, + "step": 16010 + }, + { + "epoch": 0.875360496425789, + "grad_norm": 2.2085278034210205, + "learning_rate": 8.547117866177524e-07, + "loss": 1.3697, + "step": 16011 + }, + { + "epoch": 0.8754151688695105, + "grad_norm": 1.4349862337112427, + "learning_rate": 8.539727353766259e-07, + "loss": 1.2831, + "step": 16012 + }, + { + "epoch": 0.8754698413132321, + "grad_norm": 1.7236895561218262, + "learning_rate": 8.53233989542811e-07, + "loss": 1.4129, + "step": 16013 + }, + { + "epoch": 0.8755245137569536, + "grad_norm": 1.2736467123031616, + "learning_rate": 8.524955491409748e-07, + "loss": 1.3533, + "step": 16014 + }, + { + "epoch": 0.8755791862006752, + "grad_norm": 1.8352490663528442, + "learning_rate": 8.517574141957796e-07, + "loss": 1.1542, + "step": 16015 + }, + { + "epoch": 0.8756338586443968, + "grad_norm": 1.3133420944213867, + "learning_rate": 8.510195847318714e-07, + "loss": 1.5498, + "step": 16016 + }, + { + "epoch": 0.8756885310881183, + "grad_norm": 1.3707342147827148, + "learning_rate": 8.502820607738871e-07, + "loss": 1.489, + "step": 16017 + }, + { + "epoch": 0.8757432035318399, + "grad_norm": 1.4064487218856812, + "learning_rate": 8.495448423464569e-07, + "loss": 1.4513, + "step": 16018 + }, + { + "epoch": 0.8757978759755615, + "grad_norm": 1.3970305919647217, + "learning_rate": 8.488079294741957e-07, + "loss": 1.4412, + "step": 16019 + }, + { + "epoch": 0.875852548419283, + "grad_norm": 1.5748993158340454, + "learning_rate": 8.480713221817095e-07, + "loss": 1.4815, + "step": 16020 + }, + { + "epoch": 0.8759072208630045, + "grad_norm": 2.0941076278686523, + "learning_rate": 8.473350204936004e-07, + "loss": 1.4562, + "step": 16021 + }, + { + "epoch": 0.875961893306726, + "grad_norm": 1.3463491201400757, + "learning_rate": 8.46599024434449e-07, + "loss": 1.5276, + "step": 16022 + }, + { + "epoch": 0.8760165657504476, + "grad_norm": 1.663298487663269, + "learning_rate": 8.458633340288391e-07, + "loss": 1.4314, + "step": 16023 + }, + { + "epoch": 0.8760712381941692, + "grad_norm": 1.506986141204834, + "learning_rate": 8.45127949301332e-07, + "loss": 1.3009, + "step": 16024 + }, + { + "epoch": 0.8761259106378907, + "grad_norm": 2.9557759761810303, + "learning_rate": 8.443928702764836e-07, + "loss": 1.2702, + "step": 16025 + }, + { + "epoch": 0.8761805830816123, + "grad_norm": 1.888043999671936, + "learning_rate": 8.436580969788432e-07, + "loss": 1.4732, + "step": 16026 + }, + { + "epoch": 0.8762352555253339, + "grad_norm": 2.1315906047821045, + "learning_rate": 8.429236294329457e-07, + "loss": 1.6178, + "step": 16027 + }, + { + "epoch": 0.8762899279690554, + "grad_norm": 1.6216647624969482, + "learning_rate": 8.421894676633136e-07, + "loss": 1.6113, + "step": 16028 + }, + { + "epoch": 0.876344600412777, + "grad_norm": 1.4716839790344238, + "learning_rate": 8.414556116944672e-07, + "loss": 1.2124, + "step": 16029 + }, + { + "epoch": 0.8763992728564985, + "grad_norm": 1.5605404376983643, + "learning_rate": 8.407220615509081e-07, + "loss": 1.2141, + "step": 16030 + }, + { + "epoch": 0.87645394530022, + "grad_norm": 1.4561200141906738, + "learning_rate": 8.399888172571324e-07, + "loss": 1.5987, + "step": 16031 + }, + { + "epoch": 0.8765086177439416, + "grad_norm": 1.7076056003570557, + "learning_rate": 8.392558788376248e-07, + "loss": 1.4879, + "step": 16032 + }, + { + "epoch": 0.8765632901876632, + "grad_norm": 1.6201198101043701, + "learning_rate": 8.385232463168602e-07, + "loss": 1.4339, + "step": 16033 + }, + { + "epoch": 0.8766179626313847, + "grad_norm": 1.330774188041687, + "learning_rate": 8.377909197193013e-07, + "loss": 1.5752, + "step": 16034 + }, + { + "epoch": 0.8766726350751063, + "grad_norm": 1.4079890251159668, + "learning_rate": 8.37058899069404e-07, + "loss": 1.4367, + "step": 16035 + }, + { + "epoch": 0.8767273075188278, + "grad_norm": 1.4987562894821167, + "learning_rate": 8.363271843916099e-07, + "loss": 1.4993, + "step": 16036 + }, + { + "epoch": 0.8767819799625494, + "grad_norm": 1.4594393968582153, + "learning_rate": 8.355957757103562e-07, + "loss": 1.4868, + "step": 16037 + }, + { + "epoch": 0.876836652406271, + "grad_norm": 1.772728443145752, + "learning_rate": 8.348646730500654e-07, + "loss": 1.4138, + "step": 16038 + }, + { + "epoch": 0.8768913248499924, + "grad_norm": 1.836403489112854, + "learning_rate": 8.34133876435147e-07, + "loss": 1.3733, + "step": 16039 + }, + { + "epoch": 0.876945997293714, + "grad_norm": 1.484626293182373, + "learning_rate": 8.334033858900092e-07, + "loss": 1.1888, + "step": 16040 + }, + { + "epoch": 0.8770006697374356, + "grad_norm": 1.4657666683197021, + "learning_rate": 8.326732014390415e-07, + "loss": 1.4819, + "step": 16041 + }, + { + "epoch": 0.8770553421811571, + "grad_norm": 1.4089624881744385, + "learning_rate": 8.319433231066265e-07, + "loss": 1.4667, + "step": 16042 + }, + { + "epoch": 0.8771100146248787, + "grad_norm": 1.3672086000442505, + "learning_rate": 8.312137509171392e-07, + "loss": 1.511, + "step": 16043 + }, + { + "epoch": 0.8771646870686003, + "grad_norm": 1.5325390100479126, + "learning_rate": 8.3048448489494e-07, + "loss": 1.3998, + "step": 16044 + }, + { + "epoch": 0.8772193595123218, + "grad_norm": 1.8028852939605713, + "learning_rate": 8.297555250643808e-07, + "loss": 1.1168, + "step": 16045 + }, + { + "epoch": 0.8772740319560434, + "grad_norm": 1.6057188510894775, + "learning_rate": 8.290268714498029e-07, + "loss": 1.0749, + "step": 16046 + }, + { + "epoch": 0.877328704399765, + "grad_norm": 1.5125987529754639, + "learning_rate": 8.282985240755381e-07, + "loss": 1.5838, + "step": 16047 + }, + { + "epoch": 0.8773833768434864, + "grad_norm": 1.5337587594985962, + "learning_rate": 8.27570482965906e-07, + "loss": 1.5507, + "step": 16048 + }, + { + "epoch": 0.877438049287208, + "grad_norm": 1.2844865322113037, + "learning_rate": 8.268427481452213e-07, + "loss": 1.3868, + "step": 16049 + }, + { + "epoch": 0.8774927217309295, + "grad_norm": 1.5763673782348633, + "learning_rate": 8.261153196377814e-07, + "loss": 1.3639, + "step": 16050 + }, + { + "epoch": 0.8775473941746511, + "grad_norm": 1.4014307260513306, + "learning_rate": 8.25388197467879e-07, + "loss": 1.668, + "step": 16051 + }, + { + "epoch": 0.8776020666183727, + "grad_norm": 1.5024771690368652, + "learning_rate": 8.246613816597937e-07, + "loss": 1.5116, + "step": 16052 + }, + { + "epoch": 0.8776567390620942, + "grad_norm": 1.643006682395935, + "learning_rate": 8.239348722377937e-07, + "loss": 1.3661, + "step": 16053 + }, + { + "epoch": 0.8777114115058158, + "grad_norm": 1.3888405561447144, + "learning_rate": 8.232086692261432e-07, + "loss": 1.5607, + "step": 16054 + }, + { + "epoch": 0.8777660839495374, + "grad_norm": 1.4810909032821655, + "learning_rate": 8.224827726490891e-07, + "loss": 1.5676, + "step": 16055 + }, + { + "epoch": 0.8778207563932588, + "grad_norm": 1.7177482843399048, + "learning_rate": 8.217571825308701e-07, + "loss": 1.4519, + "step": 16056 + }, + { + "epoch": 0.8778754288369804, + "grad_norm": 1.3847895860671997, + "learning_rate": 8.210318988957166e-07, + "loss": 1.4071, + "step": 16057 + }, + { + "epoch": 0.877930101280702, + "grad_norm": 1.3477550745010376, + "learning_rate": 8.203069217678472e-07, + "loss": 1.4612, + "step": 16058 + }, + { + "epoch": 0.8779847737244235, + "grad_norm": 1.5095834732055664, + "learning_rate": 8.195822511714691e-07, + "loss": 1.5926, + "step": 16059 + }, + { + "epoch": 0.8780394461681451, + "grad_norm": 1.4099665880203247, + "learning_rate": 8.188578871307829e-07, + "loss": 1.192, + "step": 16060 + }, + { + "epoch": 0.8780941186118667, + "grad_norm": 1.3604835271835327, + "learning_rate": 8.18133829669977e-07, + "loss": 1.4487, + "step": 16061 + }, + { + "epoch": 0.8781487910555882, + "grad_norm": 1.9038654565811157, + "learning_rate": 8.174100788132266e-07, + "loss": 1.6426, + "step": 16062 + }, + { + "epoch": 0.8782034634993098, + "grad_norm": 1.5335320234298706, + "learning_rate": 8.166866345847025e-07, + "loss": 1.3716, + "step": 16063 + }, + { + "epoch": 0.8782581359430313, + "grad_norm": 1.3719209432601929, + "learning_rate": 8.159634970085595e-07, + "loss": 1.5423, + "step": 16064 + }, + { + "epoch": 0.8783128083867529, + "grad_norm": 1.6097590923309326, + "learning_rate": 8.152406661089485e-07, + "loss": 1.4619, + "step": 16065 + }, + { + "epoch": 0.8783674808304744, + "grad_norm": 1.5807056427001953, + "learning_rate": 8.145181419100034e-07, + "loss": 1.2283, + "step": 16066 + }, + { + "epoch": 0.8784221532741959, + "grad_norm": 1.6065664291381836, + "learning_rate": 8.137959244358506e-07, + "loss": 1.4073, + "step": 16067 + }, + { + "epoch": 0.8784768257179175, + "grad_norm": 1.400306224822998, + "learning_rate": 8.130740137106108e-07, + "loss": 1.375, + "step": 16068 + }, + { + "epoch": 0.8785314981616391, + "grad_norm": 1.6583279371261597, + "learning_rate": 8.123524097583857e-07, + "loss": 1.5663, + "step": 16069 + }, + { + "epoch": 0.8785861706053606, + "grad_norm": 1.3509024381637573, + "learning_rate": 8.11631112603275e-07, + "loss": 1.4444, + "step": 16070 + }, + { + "epoch": 0.8786408430490822, + "grad_norm": 1.6210654973983765, + "learning_rate": 8.109101222693616e-07, + "loss": 1.399, + "step": 16071 + }, + { + "epoch": 0.8786955154928038, + "grad_norm": 1.7957913875579834, + "learning_rate": 8.101894387807219e-07, + "loss": 1.3601, + "step": 16072 + }, + { + "epoch": 0.8787501879365253, + "grad_norm": 1.315197467803955, + "learning_rate": 8.094690621614199e-07, + "loss": 1.6211, + "step": 16073 + }, + { + "epoch": 0.8788048603802469, + "grad_norm": 1.6957296133041382, + "learning_rate": 8.087489924355141e-07, + "loss": 1.3276, + "step": 16074 + }, + { + "epoch": 0.8788595328239684, + "grad_norm": 1.4458869695663452, + "learning_rate": 8.080292296270476e-07, + "loss": 1.4698, + "step": 16075 + }, + { + "epoch": 0.8789142052676899, + "grad_norm": 1.4307690858840942, + "learning_rate": 8.073097737600522e-07, + "loss": 1.3623, + "step": 16076 + }, + { + "epoch": 0.8789688777114115, + "grad_norm": 1.6400444507598877, + "learning_rate": 8.065906248585564e-07, + "loss": 1.3346, + "step": 16077 + }, + { + "epoch": 0.879023550155133, + "grad_norm": 1.2671154737472534, + "learning_rate": 8.058717829465723e-07, + "loss": 1.4772, + "step": 16078 + }, + { + "epoch": 0.8790782225988546, + "grad_norm": 1.6779452562332153, + "learning_rate": 8.05153248048105e-07, + "loss": 1.4246, + "step": 16079 + }, + { + "epoch": 0.8791328950425762, + "grad_norm": 1.5641565322875977, + "learning_rate": 8.044350201871465e-07, + "loss": 1.5932, + "step": 16080 + }, + { + "epoch": 0.8791875674862977, + "grad_norm": 1.4838712215423584, + "learning_rate": 8.037170993876797e-07, + "loss": 1.1412, + "step": 16081 + }, + { + "epoch": 0.8792422399300193, + "grad_norm": 1.9049186706542969, + "learning_rate": 8.029994856736811e-07, + "loss": 1.4881, + "step": 16082 + }, + { + "epoch": 0.8792969123737409, + "grad_norm": 1.7907872200012207, + "learning_rate": 8.022821790691104e-07, + "loss": 1.4398, + "step": 16083 + }, + { + "epoch": 0.8793515848174623, + "grad_norm": 1.405060887336731, + "learning_rate": 8.015651795979206e-07, + "loss": 1.3466, + "step": 16084 + }, + { + "epoch": 0.8794062572611839, + "grad_norm": 1.575493574142456, + "learning_rate": 8.008484872840538e-07, + "loss": 1.5721, + "step": 16085 + }, + { + "epoch": 0.8794609297049055, + "grad_norm": 1.62589430809021, + "learning_rate": 8.001321021514442e-07, + "loss": 1.4256, + "step": 16086 + }, + { + "epoch": 0.879515602148627, + "grad_norm": 1.7155073881149292, + "learning_rate": 7.99416024224009e-07, + "loss": 1.3554, + "step": 16087 + }, + { + "epoch": 0.8795702745923486, + "grad_norm": 1.607006311416626, + "learning_rate": 7.987002535256638e-07, + "loss": 1.3641, + "step": 16088 + }, + { + "epoch": 0.8796249470360702, + "grad_norm": 1.711856484413147, + "learning_rate": 7.979847900803095e-07, + "loss": 1.3221, + "step": 16089 + }, + { + "epoch": 0.8796796194797917, + "grad_norm": 1.9898695945739746, + "learning_rate": 7.972696339118346e-07, + "loss": 1.394, + "step": 16090 + }, + { + "epoch": 0.8797342919235133, + "grad_norm": 1.202308177947998, + "learning_rate": 7.965547850441224e-07, + "loss": 1.6056, + "step": 16091 + }, + { + "epoch": 0.8797889643672347, + "grad_norm": 1.6932731866836548, + "learning_rate": 7.958402435010415e-07, + "loss": 1.5985, + "step": 16092 + }, + { + "epoch": 0.8798436368109563, + "grad_norm": 1.4805246591567993, + "learning_rate": 7.95126009306455e-07, + "loss": 1.6955, + "step": 16093 + }, + { + "epoch": 0.8798983092546779, + "grad_norm": 1.5670909881591797, + "learning_rate": 7.944120824842106e-07, + "loss": 1.5524, + "step": 16094 + }, + { + "epoch": 0.8799529816983994, + "grad_norm": 1.7051963806152344, + "learning_rate": 7.93698463058149e-07, + "loss": 1.1267, + "step": 16095 + }, + { + "epoch": 0.880007654142121, + "grad_norm": 1.6986079216003418, + "learning_rate": 7.92985151052098e-07, + "loss": 1.3629, + "step": 16096 + }, + { + "epoch": 0.8800623265858426, + "grad_norm": 1.1968636512756348, + "learning_rate": 7.922721464898786e-07, + "loss": 1.4711, + "step": 16097 + }, + { + "epoch": 0.8801169990295641, + "grad_norm": 1.5292435884475708, + "learning_rate": 7.91559449395296e-07, + "loss": 1.381, + "step": 16098 + }, + { + "epoch": 0.8801716714732857, + "grad_norm": 1.4381299018859863, + "learning_rate": 7.908470597921547e-07, + "loss": 1.5012, + "step": 16099 + }, + { + "epoch": 0.8802263439170073, + "grad_norm": 1.4131665229797363, + "learning_rate": 7.901349777042389e-07, + "loss": 1.4144, + "step": 16100 + }, + { + "epoch": 0.8802810163607288, + "grad_norm": 1.3330981731414795, + "learning_rate": 7.894232031553262e-07, + "loss": 1.491, + "step": 16101 + }, + { + "epoch": 0.8803356888044503, + "grad_norm": 1.4215092658996582, + "learning_rate": 7.887117361691888e-07, + "loss": 1.4239, + "step": 16102 + }, + { + "epoch": 0.8803903612481719, + "grad_norm": 1.4831678867340088, + "learning_rate": 7.880005767695809e-07, + "loss": 1.3404, + "step": 16103 + }, + { + "epoch": 0.8804450336918934, + "grad_norm": 1.1943080425262451, + "learning_rate": 7.87289724980248e-07, + "loss": 1.7753, + "step": 16104 + }, + { + "epoch": 0.880499706135615, + "grad_norm": 1.6269383430480957, + "learning_rate": 7.865791808249324e-07, + "loss": 1.534, + "step": 16105 + }, + { + "epoch": 0.8805543785793366, + "grad_norm": 1.506001591682434, + "learning_rate": 7.858689443273548e-07, + "loss": 1.2817, + "step": 16106 + }, + { + "epoch": 0.8806090510230581, + "grad_norm": 1.7798289060592651, + "learning_rate": 7.851590155112376e-07, + "loss": 1.2272, + "step": 16107 + }, + { + "epoch": 0.8806637234667797, + "grad_norm": 1.1651500463485718, + "learning_rate": 7.84449394400284e-07, + "loss": 1.5486, + "step": 16108 + }, + { + "epoch": 0.8807183959105012, + "grad_norm": 1.795766830444336, + "learning_rate": 7.837400810181894e-07, + "loss": 1.2184, + "step": 16109 + }, + { + "epoch": 0.8807730683542228, + "grad_norm": 1.4554331302642822, + "learning_rate": 7.830310753886406e-07, + "loss": 1.5108, + "step": 16110 + }, + { + "epoch": 0.8808277407979443, + "grad_norm": 1.533933162689209, + "learning_rate": 7.823223775353128e-07, + "loss": 1.5295, + "step": 16111 + }, + { + "epoch": 0.8808824132416658, + "grad_norm": 1.3401343822479248, + "learning_rate": 7.816139874818696e-07, + "loss": 1.2417, + "step": 16112 + }, + { + "epoch": 0.8809370856853874, + "grad_norm": 1.6562711000442505, + "learning_rate": 7.809059052519674e-07, + "loss": 1.3588, + "step": 16113 + }, + { + "epoch": 0.880991758129109, + "grad_norm": 1.4064934253692627, + "learning_rate": 7.801981308692508e-07, + "loss": 1.6327, + "step": 16114 + }, + { + "epoch": 0.8810464305728305, + "grad_norm": 1.1545524597167969, + "learning_rate": 7.794906643573519e-07, + "loss": 1.6364, + "step": 16115 + }, + { + "epoch": 0.8811011030165521, + "grad_norm": 1.5328762531280518, + "learning_rate": 7.787835057398985e-07, + "loss": 1.2027, + "step": 16116 + }, + { + "epoch": 0.8811557754602737, + "grad_norm": 1.498975157737732, + "learning_rate": 7.780766550405006e-07, + "loss": 1.5008, + "step": 16117 + }, + { + "epoch": 0.8812104479039952, + "grad_norm": 1.4528025388717651, + "learning_rate": 7.773701122827626e-07, + "loss": 1.3099, + "step": 16118 + }, + { + "epoch": 0.8812651203477168, + "grad_norm": 1.3956632614135742, + "learning_rate": 7.766638774902802e-07, + "loss": 1.5513, + "step": 16119 + }, + { + "epoch": 0.8813197927914384, + "grad_norm": 1.8047189712524414, + "learning_rate": 7.759579506866311e-07, + "loss": 1.5041, + "step": 16120 + }, + { + "epoch": 0.8813744652351598, + "grad_norm": 1.223536491394043, + "learning_rate": 7.752523318953942e-07, + "loss": 1.564, + "step": 16121 + }, + { + "epoch": 0.8814291376788814, + "grad_norm": 1.4544697999954224, + "learning_rate": 7.745470211401274e-07, + "loss": 1.3604, + "step": 16122 + }, + { + "epoch": 0.8814838101226029, + "grad_norm": 1.7329591512680054, + "learning_rate": 7.73842018444384e-07, + "loss": 1.4025, + "step": 16123 + }, + { + "epoch": 0.8815384825663245, + "grad_norm": 1.8902589082717896, + "learning_rate": 7.731373238317053e-07, + "loss": 1.2012, + "step": 16124 + }, + { + "epoch": 0.8815931550100461, + "grad_norm": 1.6467444896697998, + "learning_rate": 7.724329373256234e-07, + "loss": 1.5186, + "step": 16125 + }, + { + "epoch": 0.8816478274537676, + "grad_norm": 1.7743788957595825, + "learning_rate": 7.717288589496563e-07, + "loss": 1.5245, + "step": 16126 + }, + { + "epoch": 0.8817024998974892, + "grad_norm": 1.5787336826324463, + "learning_rate": 7.710250887273196e-07, + "loss": 1.5432, + "step": 16127 + }, + { + "epoch": 0.8817571723412108, + "grad_norm": 2.313491106033325, + "learning_rate": 7.703216266821123e-07, + "loss": 1.4243, + "step": 16128 + }, + { + "epoch": 0.8818118447849322, + "grad_norm": 1.1952685117721558, + "learning_rate": 7.696184728375222e-07, + "loss": 1.5541, + "step": 16129 + }, + { + "epoch": 0.8818665172286538, + "grad_norm": 1.5374417304992676, + "learning_rate": 7.689156272170318e-07, + "loss": 1.5306, + "step": 16130 + }, + { + "epoch": 0.8819211896723754, + "grad_norm": 1.6578631401062012, + "learning_rate": 7.68213089844111e-07, + "loss": 1.5516, + "step": 16131 + }, + { + "epoch": 0.8819758621160969, + "grad_norm": 1.8581764698028564, + "learning_rate": 7.675108607422154e-07, + "loss": 1.544, + "step": 16132 + }, + { + "epoch": 0.8820305345598185, + "grad_norm": 1.306150197982788, + "learning_rate": 7.668089399348e-07, + "loss": 1.3498, + "step": 16133 + }, + { + "epoch": 0.8820852070035401, + "grad_norm": 1.2417603731155396, + "learning_rate": 7.66107327445299e-07, + "loss": 1.3941, + "step": 16134 + }, + { + "epoch": 0.8821398794472616, + "grad_norm": 1.3290441036224365, + "learning_rate": 7.654060232971427e-07, + "loss": 1.2991, + "step": 16135 + }, + { + "epoch": 0.8821945518909832, + "grad_norm": 1.647229552268982, + "learning_rate": 7.647050275137502e-07, + "loss": 1.7227, + "step": 16136 + }, + { + "epoch": 0.8822492243347047, + "grad_norm": 1.340150237083435, + "learning_rate": 7.640043401185249e-07, + "loss": 1.562, + "step": 16137 + }, + { + "epoch": 0.8823038967784262, + "grad_norm": 1.5008018016815186, + "learning_rate": 7.633039611348702e-07, + "loss": 1.4133, + "step": 16138 + }, + { + "epoch": 0.8823585692221478, + "grad_norm": 1.6037236452102661, + "learning_rate": 7.626038905861699e-07, + "loss": 1.3645, + "step": 16139 + }, + { + "epoch": 0.8824132416658693, + "grad_norm": 1.3291033506393433, + "learning_rate": 7.619041284958017e-07, + "loss": 1.4905, + "step": 16140 + }, + { + "epoch": 0.8824679141095909, + "grad_norm": 1.52708101272583, + "learning_rate": 7.612046748871327e-07, + "loss": 1.5815, + "step": 16141 + }, + { + "epoch": 0.8825225865533125, + "grad_norm": 1.9906162023544312, + "learning_rate": 7.605055297835196e-07, + "loss": 1.3297, + "step": 16142 + }, + { + "epoch": 0.882577258997034, + "grad_norm": 2.0253543853759766, + "learning_rate": 7.59806693208307e-07, + "loss": 1.618, + "step": 16143 + }, + { + "epoch": 0.8826319314407556, + "grad_norm": 1.514363169670105, + "learning_rate": 7.591081651848331e-07, + "loss": 1.4805, + "step": 16144 + }, + { + "epoch": 0.8826866038844772, + "grad_norm": 1.3684769868850708, + "learning_rate": 7.584099457364213e-07, + "loss": 1.4943, + "step": 16145 + }, + { + "epoch": 0.8827412763281987, + "grad_norm": 2.347153902053833, + "learning_rate": 7.577120348863864e-07, + "loss": 1.3027, + "step": 16146 + }, + { + "epoch": 0.8827959487719202, + "grad_norm": 1.7636569738388062, + "learning_rate": 7.570144326580365e-07, + "loss": 1.4709, + "step": 16147 + }, + { + "epoch": 0.8828506212156418, + "grad_norm": 1.6653884649276733, + "learning_rate": 7.563171390746627e-07, + "loss": 1.4413, + "step": 16148 + }, + { + "epoch": 0.8829052936593633, + "grad_norm": 1.1895136833190918, + "learning_rate": 7.556201541595521e-07, + "loss": 1.6381, + "step": 16149 + }, + { + "epoch": 0.8829599661030849, + "grad_norm": 1.690147876739502, + "learning_rate": 7.54923477935976e-07, + "loss": 1.6147, + "step": 16150 + }, + { + "epoch": 0.8830146385468064, + "grad_norm": 1.3391026258468628, + "learning_rate": 7.542271104271981e-07, + "loss": 1.3219, + "step": 16151 + }, + { + "epoch": 0.883069310990528, + "grad_norm": 1.6000969409942627, + "learning_rate": 7.535310516564742e-07, + "loss": 1.3742, + "step": 16152 + }, + { + "epoch": 0.8831239834342496, + "grad_norm": 1.7270468473434448, + "learning_rate": 7.528353016470468e-07, + "loss": 1.5347, + "step": 16153 + }, + { + "epoch": 0.8831786558779711, + "grad_norm": 1.4437875747680664, + "learning_rate": 7.521398604221453e-07, + "loss": 1.5439, + "step": 16154 + }, + { + "epoch": 0.8832333283216927, + "grad_norm": 1.627203106880188, + "learning_rate": 7.514447280049964e-07, + "loss": 1.4814, + "step": 16155 + }, + { + "epoch": 0.8832880007654142, + "grad_norm": 1.5825532674789429, + "learning_rate": 7.507499044188105e-07, + "loss": 1.4888, + "step": 16156 + }, + { + "epoch": 0.8833426732091357, + "grad_norm": 1.9349178075790405, + "learning_rate": 7.50055389686788e-07, + "loss": 1.464, + "step": 16157 + }, + { + "epoch": 0.8833973456528573, + "grad_norm": 1.4445362091064453, + "learning_rate": 7.493611838321236e-07, + "loss": 1.3597, + "step": 16158 + }, + { + "epoch": 0.8834520180965789, + "grad_norm": 1.3215562105178833, + "learning_rate": 7.486672868779954e-07, + "loss": 1.3505, + "step": 16159 + }, + { + "epoch": 0.8835066905403004, + "grad_norm": 1.6912813186645508, + "learning_rate": 7.479736988475772e-07, + "loss": 1.3116, + "step": 16160 + }, + { + "epoch": 0.883561362984022, + "grad_norm": 1.3188703060150146, + "learning_rate": 7.47280419764026e-07, + "loss": 1.5235, + "step": 16161 + }, + { + "epoch": 0.8836160354277436, + "grad_norm": 1.4011815786361694, + "learning_rate": 7.465874496504944e-07, + "loss": 1.4406, + "step": 16162 + }, + { + "epoch": 0.8836707078714651, + "grad_norm": 1.7680919170379639, + "learning_rate": 7.458947885301204e-07, + "loss": 1.224, + "step": 16163 + }, + { + "epoch": 0.8837253803151867, + "grad_norm": 1.5202611684799194, + "learning_rate": 7.452024364260368e-07, + "loss": 1.396, + "step": 16164 + }, + { + "epoch": 0.8837800527589081, + "grad_norm": 1.263998031616211, + "learning_rate": 7.445103933613585e-07, + "loss": 1.5006, + "step": 16165 + }, + { + "epoch": 0.8838347252026297, + "grad_norm": 1.4411492347717285, + "learning_rate": 7.438186593591989e-07, + "loss": 1.4742, + "step": 16166 + }, + { + "epoch": 0.8838893976463513, + "grad_norm": 1.704664707183838, + "learning_rate": 7.431272344426544e-07, + "loss": 1.2645, + "step": 16167 + }, + { + "epoch": 0.8839440700900728, + "grad_norm": 1.396483063697815, + "learning_rate": 7.424361186348117e-07, + "loss": 1.5767, + "step": 16168 + }, + { + "epoch": 0.8839987425337944, + "grad_norm": 1.571336269378662, + "learning_rate": 7.417453119587525e-07, + "loss": 1.5288, + "step": 16169 + }, + { + "epoch": 0.884053414977516, + "grad_norm": 1.4863368272781372, + "learning_rate": 7.410548144375418e-07, + "loss": 1.2658, + "step": 16170 + }, + { + "epoch": 0.8841080874212375, + "grad_norm": 1.4480098485946655, + "learning_rate": 7.403646260942366e-07, + "loss": 1.7655, + "step": 16171 + }, + { + "epoch": 0.8841627598649591, + "grad_norm": 1.714497685432434, + "learning_rate": 7.396747469518862e-07, + "loss": 1.4772, + "step": 16172 + }, + { + "epoch": 0.8842174323086807, + "grad_norm": 1.6831525564193726, + "learning_rate": 7.389851770335266e-07, + "loss": 1.2276, + "step": 16173 + }, + { + "epoch": 0.8842721047524021, + "grad_norm": 1.6202799081802368, + "learning_rate": 7.382959163621828e-07, + "loss": 1.4538, + "step": 16174 + }, + { + "epoch": 0.8843267771961237, + "grad_norm": 1.672637701034546, + "learning_rate": 7.37606964960873e-07, + "loss": 1.4672, + "step": 16175 + }, + { + "epoch": 0.8843814496398453, + "grad_norm": 1.425581932067871, + "learning_rate": 7.369183228526e-07, + "loss": 1.3849, + "step": 16176 + }, + { + "epoch": 0.8844361220835668, + "grad_norm": 1.6004337072372437, + "learning_rate": 7.362299900603598e-07, + "loss": 1.4472, + "step": 16177 + }, + { + "epoch": 0.8844907945272884, + "grad_norm": 1.2702544927597046, + "learning_rate": 7.355419666071406e-07, + "loss": 1.5134, + "step": 16178 + }, + { + "epoch": 0.8845454669710099, + "grad_norm": 1.441880226135254, + "learning_rate": 7.348542525159119e-07, + "loss": 1.3351, + "step": 16179 + }, + { + "epoch": 0.8846001394147315, + "grad_norm": 1.4188605546951294, + "learning_rate": 7.341668478096431e-07, + "loss": 1.5288, + "step": 16180 + }, + { + "epoch": 0.8846548118584531, + "grad_norm": 1.5820331573486328, + "learning_rate": 7.334797525112868e-07, + "loss": 1.3311, + "step": 16181 + }, + { + "epoch": 0.8847094843021746, + "grad_norm": 1.5675349235534668, + "learning_rate": 7.327929666437839e-07, + "loss": 1.3397, + "step": 16182 + }, + { + "epoch": 0.8847641567458961, + "grad_norm": 1.4614617824554443, + "learning_rate": 7.321064902300723e-07, + "loss": 1.3445, + "step": 16183 + }, + { + "epoch": 0.8848188291896177, + "grad_norm": 1.3097970485687256, + "learning_rate": 7.314203232930728e-07, + "loss": 1.3251, + "step": 16184 + }, + { + "epoch": 0.8848735016333392, + "grad_norm": 1.3800480365753174, + "learning_rate": 7.307344658556959e-07, + "loss": 1.3138, + "step": 16185 + }, + { + "epoch": 0.8849281740770608, + "grad_norm": 1.7238335609436035, + "learning_rate": 7.300489179408477e-07, + "loss": 1.3652, + "step": 16186 + }, + { + "epoch": 0.8849828465207824, + "grad_norm": 1.952590823173523, + "learning_rate": 7.293636795714187e-07, + "loss": 1.4321, + "step": 16187 + }, + { + "epoch": 0.8850375189645039, + "grad_norm": 1.55171537399292, + "learning_rate": 7.28678750770292e-07, + "loss": 1.3522, + "step": 16188 + }, + { + "epoch": 0.8850921914082255, + "grad_norm": 1.5849099159240723, + "learning_rate": 7.279941315603356e-07, + "loss": 1.3144, + "step": 16189 + }, + { + "epoch": 0.8851468638519471, + "grad_norm": 2.6582844257354736, + "learning_rate": 7.273098219644137e-07, + "loss": 1.3214, + "step": 16190 + }, + { + "epoch": 0.8852015362956686, + "grad_norm": 1.3907747268676758, + "learning_rate": 7.266258220053746e-07, + "loss": 1.3491, + "step": 16191 + }, + { + "epoch": 0.8852562087393901, + "grad_norm": 1.4716633558273315, + "learning_rate": 7.259421317060611e-07, + "loss": 1.5317, + "step": 16192 + }, + { + "epoch": 0.8853108811831116, + "grad_norm": 1.6591637134552002, + "learning_rate": 7.252587510893005e-07, + "loss": 1.739, + "step": 16193 + }, + { + "epoch": 0.8853655536268332, + "grad_norm": 1.3203784227371216, + "learning_rate": 7.245756801779158e-07, + "loss": 1.4235, + "step": 16194 + }, + { + "epoch": 0.8854202260705548, + "grad_norm": 1.364305019378662, + "learning_rate": 7.238929189947153e-07, + "loss": 1.5098, + "step": 16195 + }, + { + "epoch": 0.8854748985142763, + "grad_norm": 1.5827797651290894, + "learning_rate": 7.23210467562494e-07, + "loss": 1.6228, + "step": 16196 + }, + { + "epoch": 0.8855295709579979, + "grad_norm": 1.924926996231079, + "learning_rate": 7.225283259040472e-07, + "loss": 1.4693, + "step": 16197 + }, + { + "epoch": 0.8855842434017195, + "grad_norm": 1.234319806098938, + "learning_rate": 7.218464940421488e-07, + "loss": 1.4817, + "step": 16198 + }, + { + "epoch": 0.885638915845441, + "grad_norm": 1.901957392692566, + "learning_rate": 7.211649719995684e-07, + "loss": 1.7583, + "step": 16199 + }, + { + "epoch": 0.8856935882891626, + "grad_norm": 1.4457422494888306, + "learning_rate": 7.204837597990622e-07, + "loss": 1.5173, + "step": 16200 + }, + { + "epoch": 0.8857482607328842, + "grad_norm": 1.4677520990371704, + "learning_rate": 7.198028574633787e-07, + "loss": 1.1652, + "step": 16201 + }, + { + "epoch": 0.8858029331766056, + "grad_norm": 1.6817771196365356, + "learning_rate": 7.19122265015253e-07, + "loss": 1.1098, + "step": 16202 + }, + { + "epoch": 0.8858576056203272, + "grad_norm": 1.6180412769317627, + "learning_rate": 7.184419824774147e-07, + "loss": 1.877, + "step": 16203 + }, + { + "epoch": 0.8859122780640488, + "grad_norm": 1.6049630641937256, + "learning_rate": 7.17762009872579e-07, + "loss": 1.2634, + "step": 16204 + }, + { + "epoch": 0.8859669505077703, + "grad_norm": 1.6611207723617554, + "learning_rate": 7.17082347223449e-07, + "loss": 1.5158, + "step": 16205 + }, + { + "epoch": 0.8860216229514919, + "grad_norm": 1.3945965766906738, + "learning_rate": 7.16402994552724e-07, + "loss": 1.2995, + "step": 16206 + }, + { + "epoch": 0.8860762953952134, + "grad_norm": 1.4773622751235962, + "learning_rate": 7.157239518830872e-07, + "loss": 1.3356, + "step": 16207 + }, + { + "epoch": 0.886130967838935, + "grad_norm": 1.7411839962005615, + "learning_rate": 7.150452192372138e-07, + "loss": 1.5036, + "step": 16208 + }, + { + "epoch": 0.8861856402826566, + "grad_norm": 1.394511342048645, + "learning_rate": 7.143667966377699e-07, + "loss": 1.4838, + "step": 16209 + }, + { + "epoch": 0.886240312726378, + "grad_norm": 1.8068311214447021, + "learning_rate": 7.136886841074053e-07, + "loss": 1.6596, + "step": 16210 + }, + { + "epoch": 0.8862949851700996, + "grad_norm": 1.8414915800094604, + "learning_rate": 7.130108816687687e-07, + "loss": 1.4659, + "step": 16211 + }, + { + "epoch": 0.8863496576138212, + "grad_norm": 1.2755542993545532, + "learning_rate": 7.123333893444906e-07, + "loss": 1.4659, + "step": 16212 + }, + { + "epoch": 0.8864043300575427, + "grad_norm": 1.567720890045166, + "learning_rate": 7.116562071571964e-07, + "loss": 1.4768, + "step": 16213 + }, + { + "epoch": 0.8864590025012643, + "grad_norm": 1.7304368019104004, + "learning_rate": 7.109793351294958e-07, + "loss": 1.3765, + "step": 16214 + }, + { + "epoch": 0.8865136749449859, + "grad_norm": 1.6111994981765747, + "learning_rate": 7.103027732839929e-07, + "loss": 1.3756, + "step": 16215 + }, + { + "epoch": 0.8865683473887074, + "grad_norm": 1.4621714353561401, + "learning_rate": 7.096265216432786e-07, + "loss": 1.2123, + "step": 16216 + }, + { + "epoch": 0.886623019832429, + "grad_norm": 1.3937894105911255, + "learning_rate": 7.089505802299357e-07, + "loss": 1.407, + "step": 16217 + }, + { + "epoch": 0.8866776922761506, + "grad_norm": 2.082335948944092, + "learning_rate": 7.082749490665353e-07, + "loss": 1.1931, + "step": 16218 + }, + { + "epoch": 0.886732364719872, + "grad_norm": 1.468451738357544, + "learning_rate": 7.07599628175637e-07, + "loss": 1.5637, + "step": 16219 + }, + { + "epoch": 0.8867870371635936, + "grad_norm": 1.6394555568695068, + "learning_rate": 7.069246175797939e-07, + "loss": 1.5043, + "step": 16220 + }, + { + "epoch": 0.8868417096073151, + "grad_norm": 1.3531404733657837, + "learning_rate": 7.062499173015425e-07, + "loss": 1.4732, + "step": 16221 + }, + { + "epoch": 0.8868963820510367, + "grad_norm": 1.6120429039001465, + "learning_rate": 7.055755273634169e-07, + "loss": 1.4847, + "step": 16222 + }, + { + "epoch": 0.8869510544947583, + "grad_norm": 1.3786835670471191, + "learning_rate": 7.049014477879346e-07, + "loss": 1.4948, + "step": 16223 + }, + { + "epoch": 0.8870057269384798, + "grad_norm": 1.746566891670227, + "learning_rate": 7.042276785976032e-07, + "loss": 1.5838, + "step": 16224 + }, + { + "epoch": 0.8870603993822014, + "grad_norm": 1.669145107269287, + "learning_rate": 7.035542198149237e-07, + "loss": 1.5823, + "step": 16225 + }, + { + "epoch": 0.887115071825923, + "grad_norm": 1.612711787223816, + "learning_rate": 7.028810714623846e-07, + "loss": 1.8534, + "step": 16226 + }, + { + "epoch": 0.8871697442696445, + "grad_norm": 1.6937808990478516, + "learning_rate": 7.022082335624614e-07, + "loss": 1.4889, + "step": 16227 + }, + { + "epoch": 0.887224416713366, + "grad_norm": 1.4877644777297974, + "learning_rate": 7.015357061376249e-07, + "loss": 1.395, + "step": 16228 + }, + { + "epoch": 0.8872790891570876, + "grad_norm": 1.5139209032058716, + "learning_rate": 7.008634892103294e-07, + "loss": 1.6105, + "step": 16229 + }, + { + "epoch": 0.8873337616008091, + "grad_norm": 1.3463317155838013, + "learning_rate": 7.001915828030225e-07, + "loss": 1.5621, + "step": 16230 + }, + { + "epoch": 0.8873884340445307, + "grad_norm": 1.6254148483276367, + "learning_rate": 6.995199869381419e-07, + "loss": 1.5199, + "step": 16231 + }, + { + "epoch": 0.8874431064882523, + "grad_norm": 1.7667056322097778, + "learning_rate": 6.98848701638114e-07, + "loss": 1.4225, + "step": 16232 + }, + { + "epoch": 0.8874977789319738, + "grad_norm": 1.7593142986297607, + "learning_rate": 6.98177726925352e-07, + "loss": 1.517, + "step": 16233 + }, + { + "epoch": 0.8875524513756954, + "grad_norm": 2.2635743618011475, + "learning_rate": 6.975070628222646e-07, + "loss": 1.4688, + "step": 16234 + }, + { + "epoch": 0.8876071238194169, + "grad_norm": 2.0219414234161377, + "learning_rate": 6.96836709351244e-07, + "loss": 1.4895, + "step": 16235 + }, + { + "epoch": 0.8876617962631385, + "grad_norm": 1.4635961055755615, + "learning_rate": 6.961666665346767e-07, + "loss": 1.2722, + "step": 16236 + }, + { + "epoch": 0.88771646870686, + "grad_norm": 1.560375452041626, + "learning_rate": 6.954969343949381e-07, + "loss": 1.2876, + "step": 16237 + }, + { + "epoch": 0.8877711411505815, + "grad_norm": 1.4190006256103516, + "learning_rate": 6.948275129543902e-07, + "loss": 1.3941, + "step": 16238 + }, + { + "epoch": 0.8878258135943031, + "grad_norm": 1.402791976928711, + "learning_rate": 6.941584022353865e-07, + "loss": 1.7144, + "step": 16239 + }, + { + "epoch": 0.8878804860380247, + "grad_norm": 1.463727355003357, + "learning_rate": 6.934896022602699e-07, + "loss": 1.3601, + "step": 16240 + }, + { + "epoch": 0.8879351584817462, + "grad_norm": 1.9044917821884155, + "learning_rate": 6.92821113051374e-07, + "loss": 1.2215, + "step": 16241 + }, + { + "epoch": 0.8879898309254678, + "grad_norm": 1.6780641078948975, + "learning_rate": 6.921529346310218e-07, + "loss": 1.6182, + "step": 16242 + }, + { + "epoch": 0.8880445033691894, + "grad_norm": 1.3747764825820923, + "learning_rate": 6.914850670215245e-07, + "loss": 1.4501, + "step": 16243 + }, + { + "epoch": 0.8880991758129109, + "grad_norm": 1.6279698610305786, + "learning_rate": 6.90817510245182e-07, + "loss": 1.5138, + "step": 16244 + }, + { + "epoch": 0.8881538482566325, + "grad_norm": 1.6858596801757812, + "learning_rate": 6.901502643242897e-07, + "loss": 1.5063, + "step": 16245 + }, + { + "epoch": 0.888208520700354, + "grad_norm": 1.618234395980835, + "learning_rate": 6.894833292811265e-07, + "loss": 1.4238, + "step": 16246 + }, + { + "epoch": 0.8882631931440755, + "grad_norm": 1.6296985149383545, + "learning_rate": 6.888167051379602e-07, + "loss": 1.6915, + "step": 16247 + }, + { + "epoch": 0.8883178655877971, + "grad_norm": 1.5889835357666016, + "learning_rate": 6.881503919170563e-07, + "loss": 1.3204, + "step": 16248 + }, + { + "epoch": 0.8883725380315186, + "grad_norm": 1.8347625732421875, + "learning_rate": 6.874843896406591e-07, + "loss": 1.5894, + "step": 16249 + }, + { + "epoch": 0.8884272104752402, + "grad_norm": 1.7253443002700806, + "learning_rate": 6.868186983310133e-07, + "loss": 1.4229, + "step": 16250 + }, + { + "epoch": 0.8884818829189618, + "grad_norm": 2.531568765640259, + "learning_rate": 6.861533180103441e-07, + "loss": 1.2961, + "step": 16251 + }, + { + "epoch": 0.8885365553626833, + "grad_norm": 1.8378591537475586, + "learning_rate": 6.854882487008718e-07, + "loss": 1.4088, + "step": 16252 + }, + { + "epoch": 0.8885912278064049, + "grad_norm": 1.5279104709625244, + "learning_rate": 6.848234904248041e-07, + "loss": 1.4203, + "step": 16253 + }, + { + "epoch": 0.8886459002501265, + "grad_norm": 1.9117575883865356, + "learning_rate": 6.841590432043388e-07, + "loss": 1.4295, + "step": 16254 + }, + { + "epoch": 0.888700572693848, + "grad_norm": 1.57663893699646, + "learning_rate": 6.834949070616626e-07, + "loss": 1.358, + "step": 16255 + }, + { + "epoch": 0.8887552451375695, + "grad_norm": 1.523841142654419, + "learning_rate": 6.828310820189533e-07, + "loss": 1.2856, + "step": 16256 + }, + { + "epoch": 0.8888099175812911, + "grad_norm": 1.782097339630127, + "learning_rate": 6.821675680983786e-07, + "loss": 1.4287, + "step": 16257 + }, + { + "epoch": 0.8888645900250126, + "grad_norm": 1.3224366903305054, + "learning_rate": 6.815043653220921e-07, + "loss": 1.4807, + "step": 16258 + }, + { + "epoch": 0.8889192624687342, + "grad_norm": 2.0455121994018555, + "learning_rate": 6.808414737122426e-07, + "loss": 1.2556, + "step": 16259 + }, + { + "epoch": 0.8889739349124558, + "grad_norm": 1.2198853492736816, + "learning_rate": 6.801788932909648e-07, + "loss": 1.4414, + "step": 16260 + }, + { + "epoch": 0.8890286073561773, + "grad_norm": 1.6084768772125244, + "learning_rate": 6.795166240803819e-07, + "loss": 1.3695, + "step": 16261 + }, + { + "epoch": 0.8890832797998989, + "grad_norm": 1.3343405723571777, + "learning_rate": 6.788546661026108e-07, + "loss": 1.4171, + "step": 16262 + }, + { + "epoch": 0.8891379522436204, + "grad_norm": 1.5750327110290527, + "learning_rate": 6.781930193797548e-07, + "loss": 1.1301, + "step": 16263 + }, + { + "epoch": 0.889192624687342, + "grad_norm": 1.6731823682785034, + "learning_rate": 6.775316839339086e-07, + "loss": 1.3608, + "step": 16264 + }, + { + "epoch": 0.8892472971310635, + "grad_norm": 2.141545057296753, + "learning_rate": 6.768706597871555e-07, + "loss": 1.4315, + "step": 16265 + }, + { + "epoch": 0.889301969574785, + "grad_norm": 1.4035377502441406, + "learning_rate": 6.76209946961569e-07, + "loss": 1.5738, + "step": 16266 + }, + { + "epoch": 0.8893566420185066, + "grad_norm": 1.4626829624176025, + "learning_rate": 6.755495454792116e-07, + "loss": 1.3025, + "step": 16267 + }, + { + "epoch": 0.8894113144622282, + "grad_norm": 1.309512734413147, + "learning_rate": 6.748894553621344e-07, + "loss": 1.3787, + "step": 16268 + }, + { + "epoch": 0.8894659869059497, + "grad_norm": 1.511082649230957, + "learning_rate": 6.742296766323797e-07, + "loss": 1.3268, + "step": 16269 + }, + { + "epoch": 0.8895206593496713, + "grad_norm": 1.3639287948608398, + "learning_rate": 6.735702093119811e-07, + "loss": 1.4469, + "step": 16270 + }, + { + "epoch": 0.8895753317933929, + "grad_norm": 1.5305231809616089, + "learning_rate": 6.729110534229577e-07, + "loss": 1.2999, + "step": 16271 + }, + { + "epoch": 0.8896300042371144, + "grad_norm": 1.556709885597229, + "learning_rate": 6.722522089873196e-07, + "loss": 1.1256, + "step": 16272 + }, + { + "epoch": 0.889684676680836, + "grad_norm": 1.6136205196380615, + "learning_rate": 6.715936760270703e-07, + "loss": 1.3472, + "step": 16273 + }, + { + "epoch": 0.8897393491245575, + "grad_norm": 2.1640498638153076, + "learning_rate": 6.709354545641989e-07, + "loss": 1.4771, + "step": 16274 + }, + { + "epoch": 0.889794021568279, + "grad_norm": 1.4434908628463745, + "learning_rate": 6.702775446206811e-07, + "loss": 1.4368, + "step": 16275 + }, + { + "epoch": 0.8898486940120006, + "grad_norm": 1.261137843132019, + "learning_rate": 6.696199462184905e-07, + "loss": 1.4133, + "step": 16276 + }, + { + "epoch": 0.8899033664557221, + "grad_norm": 1.6143419742584229, + "learning_rate": 6.68962659379585e-07, + "loss": 1.2799, + "step": 16277 + }, + { + "epoch": 0.8899580388994437, + "grad_norm": 1.4003461599349976, + "learning_rate": 6.683056841259117e-07, + "loss": 1.4855, + "step": 16278 + }, + { + "epoch": 0.8900127113431653, + "grad_norm": 1.9462662935256958, + "learning_rate": 6.676490204794094e-07, + "loss": 1.3612, + "step": 16279 + }, + { + "epoch": 0.8900673837868868, + "grad_norm": 1.504008173942566, + "learning_rate": 6.669926684620043e-07, + "loss": 1.3345, + "step": 16280 + }, + { + "epoch": 0.8901220562306084, + "grad_norm": 1.996196985244751, + "learning_rate": 6.663366280956152e-07, + "loss": 1.7108, + "step": 16281 + }, + { + "epoch": 0.89017672867433, + "grad_norm": 1.4169095754623413, + "learning_rate": 6.656808994021491e-07, + "loss": 1.4078, + "step": 16282 + }, + { + "epoch": 0.8902314011180514, + "grad_norm": 1.3769458532333374, + "learning_rate": 6.650254824034996e-07, + "loss": 1.4734, + "step": 16283 + }, + { + "epoch": 0.890286073561773, + "grad_norm": 1.571679711341858, + "learning_rate": 6.643703771215548e-07, + "loss": 1.3395, + "step": 16284 + }, + { + "epoch": 0.8903407460054946, + "grad_norm": 1.5923980474472046, + "learning_rate": 6.637155835781917e-07, + "loss": 1.2408, + "step": 16285 + }, + { + "epoch": 0.8903954184492161, + "grad_norm": 2.2958319187164307, + "learning_rate": 6.630611017952704e-07, + "loss": 1.3217, + "step": 16286 + }, + { + "epoch": 0.8904500908929377, + "grad_norm": 1.2849080562591553, + "learning_rate": 6.624069317946513e-07, + "loss": 1.6203, + "step": 16287 + }, + { + "epoch": 0.8905047633366593, + "grad_norm": 1.833269715309143, + "learning_rate": 6.617530735981758e-07, + "loss": 1.6161, + "step": 16288 + }, + { + "epoch": 0.8905594357803808, + "grad_norm": 2.492702007293701, + "learning_rate": 6.610995272276765e-07, + "loss": 1.6132, + "step": 16289 + }, + { + "epoch": 0.8906141082241024, + "grad_norm": 2.1002299785614014, + "learning_rate": 6.604462927049804e-07, + "loss": 1.3917, + "step": 16290 + }, + { + "epoch": 0.8906687806678238, + "grad_norm": 1.4871819019317627, + "learning_rate": 6.597933700518977e-07, + "loss": 1.3615, + "step": 16291 + }, + { + "epoch": 0.8907234531115454, + "grad_norm": 1.3712905645370483, + "learning_rate": 6.591407592902321e-07, + "loss": 1.3534, + "step": 16292 + }, + { + "epoch": 0.890778125555267, + "grad_norm": 1.2419013977050781, + "learning_rate": 6.584884604417763e-07, + "loss": 1.4995, + "step": 16293 + }, + { + "epoch": 0.8908327979989885, + "grad_norm": 1.5630340576171875, + "learning_rate": 6.578364735283094e-07, + "loss": 1.4312, + "step": 16294 + }, + { + "epoch": 0.8908874704427101, + "grad_norm": 1.992966651916504, + "learning_rate": 6.571847985716063e-07, + "loss": 1.4329, + "step": 16295 + }, + { + "epoch": 0.8909421428864317, + "grad_norm": 1.6572189331054688, + "learning_rate": 6.565334355934261e-07, + "loss": 1.6144, + "step": 16296 + }, + { + "epoch": 0.8909968153301532, + "grad_norm": 1.7381490468978882, + "learning_rate": 6.558823846155182e-07, + "loss": 1.2749, + "step": 16297 + }, + { + "epoch": 0.8910514877738748, + "grad_norm": 1.5652183294296265, + "learning_rate": 6.552316456596252e-07, + "loss": 1.4678, + "step": 16298 + }, + { + "epoch": 0.8911061602175964, + "grad_norm": 1.0000970363616943, + "learning_rate": 6.545812187474765e-07, + "loss": 1.7346, + "step": 16299 + }, + { + "epoch": 0.8911608326613178, + "grad_norm": 1.3290985822677612, + "learning_rate": 6.53931103900789e-07, + "loss": 1.4874, + "step": 16300 + }, + { + "epoch": 0.8912155051050394, + "grad_norm": 1.5429435968399048, + "learning_rate": 6.532813011412742e-07, + "loss": 1.426, + "step": 16301 + }, + { + "epoch": 0.891270177548761, + "grad_norm": 1.2618542909622192, + "learning_rate": 6.526318104906293e-07, + "loss": 1.5342, + "step": 16302 + }, + { + "epoch": 0.8913248499924825, + "grad_norm": 1.7524856328964233, + "learning_rate": 6.519826319705413e-07, + "loss": 1.4055, + "step": 16303 + }, + { + "epoch": 0.8913795224362041, + "grad_norm": 1.456858515739441, + "learning_rate": 6.513337656026908e-07, + "loss": 1.5884, + "step": 16304 + }, + { + "epoch": 0.8914341948799257, + "grad_norm": 1.9734545946121216, + "learning_rate": 6.506852114087436e-07, + "loss": 1.3111, + "step": 16305 + }, + { + "epoch": 0.8914888673236472, + "grad_norm": 2.1810390949249268, + "learning_rate": 6.500369694103559e-07, + "loss": 1.5727, + "step": 16306 + }, + { + "epoch": 0.8915435397673688, + "grad_norm": 1.5721874237060547, + "learning_rate": 6.493890396291736e-07, + "loss": 1.3094, + "step": 16307 + }, + { + "epoch": 0.8915982122110903, + "grad_norm": 1.6220192909240723, + "learning_rate": 6.487414220868315e-07, + "loss": 1.3601, + "step": 16308 + }, + { + "epoch": 0.8916528846548119, + "grad_norm": 1.7227509021759033, + "learning_rate": 6.480941168049593e-07, + "loss": 1.6622, + "step": 16309 + }, + { + "epoch": 0.8917075570985334, + "grad_norm": 1.7037099599838257, + "learning_rate": 6.474471238051683e-07, + "loss": 1.5734, + "step": 16310 + }, + { + "epoch": 0.8917622295422549, + "grad_norm": 1.4680036306381226, + "learning_rate": 6.468004431090636e-07, + "loss": 1.5276, + "step": 16311 + }, + { + "epoch": 0.8918169019859765, + "grad_norm": 1.5928291082382202, + "learning_rate": 6.461540747382411e-07, + "loss": 1.3146, + "step": 16312 + }, + { + "epoch": 0.8918715744296981, + "grad_norm": 1.4784451723098755, + "learning_rate": 6.455080187142837e-07, + "loss": 1.708, + "step": 16313 + }, + { + "epoch": 0.8919262468734196, + "grad_norm": 1.22265625, + "learning_rate": 6.44862275058763e-07, + "loss": 1.5349, + "step": 16314 + }, + { + "epoch": 0.8919809193171412, + "grad_norm": 1.6878917217254639, + "learning_rate": 6.44216843793245e-07, + "loss": 1.5092, + "step": 16315 + }, + { + "epoch": 0.8920355917608628, + "grad_norm": 1.4665656089782715, + "learning_rate": 6.435717249392803e-07, + "loss": 1.4979, + "step": 16316 + }, + { + "epoch": 0.8920902642045843, + "grad_norm": 2.0804247856140137, + "learning_rate": 6.429269185184117e-07, + "loss": 1.3737, + "step": 16317 + }, + { + "epoch": 0.8921449366483059, + "grad_norm": 1.4049384593963623, + "learning_rate": 6.422824245521708e-07, + "loss": 1.4347, + "step": 16318 + }, + { + "epoch": 0.8921996090920274, + "grad_norm": 1.4159696102142334, + "learning_rate": 6.41638243062076e-07, + "loss": 1.4395, + "step": 16319 + }, + { + "epoch": 0.8922542815357489, + "grad_norm": 1.2772754430770874, + "learning_rate": 6.409943740696423e-07, + "loss": 1.2799, + "step": 16320 + }, + { + "epoch": 0.8923089539794705, + "grad_norm": 1.598325252532959, + "learning_rate": 6.40350817596368e-07, + "loss": 1.465, + "step": 16321 + }, + { + "epoch": 0.892363626423192, + "grad_norm": 1.028804898262024, + "learning_rate": 6.397075736637404e-07, + "loss": 1.5581, + "step": 16322 + }, + { + "epoch": 0.8924182988669136, + "grad_norm": 1.8658499717712402, + "learning_rate": 6.390646422932445e-07, + "loss": 1.3927, + "step": 16323 + }, + { + "epoch": 0.8924729713106352, + "grad_norm": 1.2257307767868042, + "learning_rate": 6.384220235063454e-07, + "loss": 1.711, + "step": 16324 + }, + { + "epoch": 0.8925276437543567, + "grad_norm": 2.0454494953155518, + "learning_rate": 6.377797173245015e-07, + "loss": 1.5034, + "step": 16325 + }, + { + "epoch": 0.8925823161980783, + "grad_norm": 1.7533131837844849, + "learning_rate": 6.371377237691634e-07, + "loss": 1.375, + "step": 16326 + }, + { + "epoch": 0.8926369886417999, + "grad_norm": 1.700047254562378, + "learning_rate": 6.364960428617673e-07, + "loss": 1.2886, + "step": 16327 + }, + { + "epoch": 0.8926916610855213, + "grad_norm": 1.408478021621704, + "learning_rate": 6.358546746237393e-07, + "loss": 1.5367, + "step": 16328 + }, + { + "epoch": 0.8927463335292429, + "grad_norm": 1.2518887519836426, + "learning_rate": 6.352136190764991e-07, + "loss": 1.5432, + "step": 16329 + }, + { + "epoch": 0.8928010059729645, + "grad_norm": 1.9185985326766968, + "learning_rate": 6.345728762414504e-07, + "loss": 1.4342, + "step": 16330 + }, + { + "epoch": 0.892855678416686, + "grad_norm": 1.474828839302063, + "learning_rate": 6.33932446139991e-07, + "loss": 1.6393, + "step": 16331 + }, + { + "epoch": 0.8929103508604076, + "grad_norm": 1.7505958080291748, + "learning_rate": 6.332923287935044e-07, + "loss": 1.6218, + "step": 16332 + }, + { + "epoch": 0.8929650233041292, + "grad_norm": 1.300061583518982, + "learning_rate": 6.326525242233661e-07, + "loss": 1.4008, + "step": 16333 + }, + { + "epoch": 0.8930196957478507, + "grad_norm": 1.6646206378936768, + "learning_rate": 6.320130324509421e-07, + "loss": 1.5747, + "step": 16334 + }, + { + "epoch": 0.8930743681915723, + "grad_norm": 2.4105403423309326, + "learning_rate": 6.313738534975855e-07, + "loss": 1.3151, + "step": 16335 + }, + { + "epoch": 0.8931290406352937, + "grad_norm": 2.1407508850097656, + "learning_rate": 6.307349873846392e-07, + "loss": 1.2849, + "step": 16336 + }, + { + "epoch": 0.8931837130790153, + "grad_norm": 1.4353725910186768, + "learning_rate": 6.300964341334382e-07, + "loss": 1.5281, + "step": 16337 + }, + { + "epoch": 0.8932383855227369, + "grad_norm": 1.4627832174301147, + "learning_rate": 6.294581937653044e-07, + "loss": 1.4115, + "step": 16338 + }, + { + "epoch": 0.8932930579664584, + "grad_norm": 1.6367300748825073, + "learning_rate": 6.288202663015485e-07, + "loss": 1.5125, + "step": 16339 + }, + { + "epoch": 0.89334773041018, + "grad_norm": 1.2527384757995605, + "learning_rate": 6.281826517634759e-07, + "loss": 1.526, + "step": 16340 + }, + { + "epoch": 0.8934024028539016, + "grad_norm": 1.3699613809585571, + "learning_rate": 6.275453501723771e-07, + "loss": 1.5021, + "step": 16341 + }, + { + "epoch": 0.8934570752976231, + "grad_norm": 1.6477123498916626, + "learning_rate": 6.269083615495297e-07, + "loss": 1.4175, + "step": 16342 + }, + { + "epoch": 0.8935117477413447, + "grad_norm": 1.5026229619979858, + "learning_rate": 6.262716859162088e-07, + "loss": 1.4438, + "step": 16343 + }, + { + "epoch": 0.8935664201850663, + "grad_norm": 1.449314832687378, + "learning_rate": 6.256353232936718e-07, + "loss": 1.5195, + "step": 16344 + }, + { + "epoch": 0.8936210926287877, + "grad_norm": 1.2895219326019287, + "learning_rate": 6.249992737031695e-07, + "loss": 1.42, + "step": 16345 + }, + { + "epoch": 0.8936757650725093, + "grad_norm": 1.4974108934402466, + "learning_rate": 6.243635371659396e-07, + "loss": 1.2652, + "step": 16346 + }, + { + "epoch": 0.8937304375162309, + "grad_norm": 1.7641850709915161, + "learning_rate": 6.237281137032114e-07, + "loss": 1.4025, + "step": 16347 + }, + { + "epoch": 0.8937851099599524, + "grad_norm": 1.632384181022644, + "learning_rate": 6.230930033362048e-07, + "loss": 1.4602, + "step": 16348 + }, + { + "epoch": 0.893839782403674, + "grad_norm": 2.0180277824401855, + "learning_rate": 6.224582060861262e-07, + "loss": 1.5742, + "step": 16349 + }, + { + "epoch": 0.8938944548473955, + "grad_norm": 1.6356474161148071, + "learning_rate": 6.218237219741718e-07, + "loss": 1.6467, + "step": 16350 + }, + { + "epoch": 0.8939491272911171, + "grad_norm": 1.8865729570388794, + "learning_rate": 6.211895510215316e-07, + "loss": 1.3976, + "step": 16351 + }, + { + "epoch": 0.8940037997348387, + "grad_norm": 1.398714303970337, + "learning_rate": 6.205556932493806e-07, + "loss": 1.4047, + "step": 16352 + }, + { + "epoch": 0.8940584721785602, + "grad_norm": 1.2581223249435425, + "learning_rate": 6.199221486788831e-07, + "loss": 1.5979, + "step": 16353 + }, + { + "epoch": 0.8941131446222818, + "grad_norm": 1.3108662366867065, + "learning_rate": 6.192889173311966e-07, + "loss": 1.7485, + "step": 16354 + }, + { + "epoch": 0.8941678170660033, + "grad_norm": 1.3644722700119019, + "learning_rate": 6.186559992274665e-07, + "loss": 1.4137, + "step": 16355 + }, + { + "epoch": 0.8942224895097248, + "grad_norm": 2.2849225997924805, + "learning_rate": 6.18023394388827e-07, + "loss": 1.3678, + "step": 16356 + }, + { + "epoch": 0.8942771619534464, + "grad_norm": 1.6400959491729736, + "learning_rate": 6.17391102836401e-07, + "loss": 1.4004, + "step": 16357 + }, + { + "epoch": 0.894331834397168, + "grad_norm": 1.359726071357727, + "learning_rate": 6.167591245913029e-07, + "loss": 1.3411, + "step": 16358 + }, + { + "epoch": 0.8943865068408895, + "grad_norm": 1.611759901046753, + "learning_rate": 6.161274596746347e-07, + "loss": 1.5201, + "step": 16359 + }, + { + "epoch": 0.8944411792846111, + "grad_norm": 1.382888913154602, + "learning_rate": 6.154961081074929e-07, + "loss": 1.348, + "step": 16360 + }, + { + "epoch": 0.8944958517283327, + "grad_norm": 1.8610843420028687, + "learning_rate": 6.148650699109559e-07, + "loss": 1.4374, + "step": 16361 + }, + { + "epoch": 0.8945505241720542, + "grad_norm": 1.2697159051895142, + "learning_rate": 6.142343451060973e-07, + "loss": 1.4465, + "step": 16362 + }, + { + "epoch": 0.8946051966157758, + "grad_norm": 1.8442211151123047, + "learning_rate": 6.136039337139799e-07, + "loss": 1.5427, + "step": 16363 + }, + { + "epoch": 0.8946598690594972, + "grad_norm": 1.5751535892486572, + "learning_rate": 6.129738357556503e-07, + "loss": 1.3189, + "step": 16364 + }, + { + "epoch": 0.8947145415032188, + "grad_norm": 1.460377812385559, + "learning_rate": 6.123440512521539e-07, + "loss": 1.5867, + "step": 16365 + }, + { + "epoch": 0.8947692139469404, + "grad_norm": 1.3830008506774902, + "learning_rate": 6.117145802245183e-07, + "loss": 1.3579, + "step": 16366 + }, + { + "epoch": 0.8948238863906619, + "grad_norm": 1.5765777826309204, + "learning_rate": 6.110854226937613e-07, + "loss": 1.3733, + "step": 16367 + }, + { + "epoch": 0.8948785588343835, + "grad_norm": 1.4387264251708984, + "learning_rate": 6.104565786808959e-07, + "loss": 1.2263, + "step": 16368 + }, + { + "epoch": 0.8949332312781051, + "grad_norm": 2.0377538204193115, + "learning_rate": 6.098280482069186e-07, + "loss": 1.5728, + "step": 16369 + }, + { + "epoch": 0.8949879037218266, + "grad_norm": 1.682052493095398, + "learning_rate": 6.091998312928171e-07, + "loss": 1.427, + "step": 16370 + }, + { + "epoch": 0.8950425761655482, + "grad_norm": 1.810163140296936, + "learning_rate": 6.085719279595692e-07, + "loss": 1.475, + "step": 16371 + }, + { + "epoch": 0.8950972486092698, + "grad_norm": 1.3718125820159912, + "learning_rate": 6.079443382281424e-07, + "loss": 1.5375, + "step": 16372 + }, + { + "epoch": 0.8951519210529912, + "grad_norm": 1.7774442434310913, + "learning_rate": 6.073170621194924e-07, + "loss": 1.6174, + "step": 16373 + }, + { + "epoch": 0.8952065934967128, + "grad_norm": 1.6569042205810547, + "learning_rate": 6.066900996545677e-07, + "loss": 1.4387, + "step": 16374 + }, + { + "epoch": 0.8952612659404344, + "grad_norm": 1.460545539855957, + "learning_rate": 6.060634508543006e-07, + "loss": 1.5529, + "step": 16375 + }, + { + "epoch": 0.8953159383841559, + "grad_norm": 1.508429765701294, + "learning_rate": 6.0543711573962e-07, + "loss": 1.4041, + "step": 16376 + }, + { + "epoch": 0.8953706108278775, + "grad_norm": 1.4399138689041138, + "learning_rate": 6.0481109433144e-07, + "loss": 1.2894, + "step": 16377 + }, + { + "epoch": 0.895425283271599, + "grad_norm": 1.8724061250686646, + "learning_rate": 6.041853866506619e-07, + "loss": 1.6832, + "step": 16378 + }, + { + "epoch": 0.8954799557153206, + "grad_norm": 2.056140661239624, + "learning_rate": 6.035599927181834e-07, + "loss": 1.2145, + "step": 16379 + }, + { + "epoch": 0.8955346281590422, + "grad_norm": 1.4392671585083008, + "learning_rate": 6.029349125548856e-07, + "loss": 1.4631, + "step": 16380 + }, + { + "epoch": 0.8955893006027636, + "grad_norm": 1.331281065940857, + "learning_rate": 6.023101461816427e-07, + "loss": 1.3785, + "step": 16381 + }, + { + "epoch": 0.8956439730464852, + "grad_norm": 1.5210435390472412, + "learning_rate": 6.016856936193161e-07, + "loss": 1.5164, + "step": 16382 + }, + { + "epoch": 0.8956986454902068, + "grad_norm": 1.4044522047042847, + "learning_rate": 6.010615548887577e-07, + "loss": 1.3856, + "step": 16383 + }, + { + "epoch": 0.8957533179339283, + "grad_norm": 1.8237578868865967, + "learning_rate": 6.004377300108078e-07, + "loss": 1.2821, + "step": 16384 + }, + { + "epoch": 0.8958079903776499, + "grad_norm": 1.9428246021270752, + "learning_rate": 5.998142190063017e-07, + "loss": 1.3493, + "step": 16385 + }, + { + "epoch": 0.8958626628213715, + "grad_norm": 1.4541285037994385, + "learning_rate": 5.99191021896055e-07, + "loss": 1.5022, + "step": 16386 + }, + { + "epoch": 0.895917335265093, + "grad_norm": 2.053745746612549, + "learning_rate": 5.985681387008802e-07, + "loss": 1.2231, + "step": 16387 + }, + { + "epoch": 0.8959720077088146, + "grad_norm": 1.9002445936203003, + "learning_rate": 5.97945569441577e-07, + "loss": 1.3856, + "step": 16388 + }, + { + "epoch": 0.8960266801525362, + "grad_norm": 1.1901005506515503, + "learning_rate": 5.973233141389323e-07, + "loss": 1.5038, + "step": 16389 + }, + { + "epoch": 0.8960813525962577, + "grad_norm": 1.7422959804534912, + "learning_rate": 5.967013728137272e-07, + "loss": 1.5705, + "step": 16390 + }, + { + "epoch": 0.8961360250399792, + "grad_norm": 1.8953853845596313, + "learning_rate": 5.960797454867295e-07, + "loss": 1.2964, + "step": 16391 + }, + { + "epoch": 0.8961906974837007, + "grad_norm": 1.2097662687301636, + "learning_rate": 5.954584321786949e-07, + "loss": 1.6288, + "step": 16392 + }, + { + "epoch": 0.8962453699274223, + "grad_norm": 1.378791093826294, + "learning_rate": 5.948374329103723e-07, + "loss": 1.3607, + "step": 16393 + }, + { + "epoch": 0.8963000423711439, + "grad_norm": 1.3885524272918701, + "learning_rate": 5.942167477024985e-07, + "loss": 1.3899, + "step": 16394 + }, + { + "epoch": 0.8963547148148654, + "grad_norm": 1.2510277032852173, + "learning_rate": 5.93596376575798e-07, + "loss": 1.3985, + "step": 16395 + }, + { + "epoch": 0.896409387258587, + "grad_norm": 1.2371684312820435, + "learning_rate": 5.929763195509875e-07, + "loss": 1.5333, + "step": 16396 + }, + { + "epoch": 0.8964640597023086, + "grad_norm": 1.7045353651046753, + "learning_rate": 5.923565766487727e-07, + "loss": 1.5561, + "step": 16397 + }, + { + "epoch": 0.8965187321460301, + "grad_norm": 1.1407864093780518, + "learning_rate": 5.91737147889846e-07, + "loss": 1.5898, + "step": 16398 + }, + { + "epoch": 0.8965734045897517, + "grad_norm": 1.4329947233200073, + "learning_rate": 5.91118033294894e-07, + "loss": 1.3291, + "step": 16399 + }, + { + "epoch": 0.8966280770334732, + "grad_norm": 1.354667067527771, + "learning_rate": 5.904992328845893e-07, + "loss": 1.5846, + "step": 16400 + }, + { + "epoch": 0.8966827494771947, + "grad_norm": 1.3923072814941406, + "learning_rate": 5.898807466795941e-07, + "loss": 1.2846, + "step": 16401 + }, + { + "epoch": 0.8967374219209163, + "grad_norm": 1.539507508277893, + "learning_rate": 5.892625747005642e-07, + "loss": 1.5549, + "step": 16402 + }, + { + "epoch": 0.8967920943646379, + "grad_norm": 1.68113112449646, + "learning_rate": 5.886447169681386e-07, + "loss": 1.2718, + "step": 16403 + }, + { + "epoch": 0.8968467668083594, + "grad_norm": 1.57086980342865, + "learning_rate": 5.880271735029508e-07, + "loss": 1.5551, + "step": 16404 + }, + { + "epoch": 0.896901439252081, + "grad_norm": 1.5584360361099243, + "learning_rate": 5.874099443256221e-07, + "loss": 1.4958, + "step": 16405 + }, + { + "epoch": 0.8969561116958025, + "grad_norm": 1.4040591716766357, + "learning_rate": 5.867930294567614e-07, + "loss": 1.7447, + "step": 16406 + }, + { + "epoch": 0.8970107841395241, + "grad_norm": 1.4765347242355347, + "learning_rate": 5.861764289169713e-07, + "loss": 1.4925, + "step": 16407 + }, + { + "epoch": 0.8970654565832457, + "grad_norm": 1.682780146598816, + "learning_rate": 5.855601427268409e-07, + "loss": 1.554, + "step": 16408 + }, + { + "epoch": 0.8971201290269671, + "grad_norm": 1.404530644416809, + "learning_rate": 5.849441709069492e-07, + "loss": 1.4221, + "step": 16409 + }, + { + "epoch": 0.8971748014706887, + "grad_norm": 1.5721793174743652, + "learning_rate": 5.843285134778643e-07, + "loss": 1.2885, + "step": 16410 + }, + { + "epoch": 0.8972294739144103, + "grad_norm": 1.2410693168640137, + "learning_rate": 5.837131704601451e-07, + "loss": 1.4569, + "step": 16411 + }, + { + "epoch": 0.8972841463581318, + "grad_norm": 1.5287619829177856, + "learning_rate": 5.830981418743376e-07, + "loss": 1.3117, + "step": 16412 + }, + { + "epoch": 0.8973388188018534, + "grad_norm": 1.4339113235473633, + "learning_rate": 5.824834277409818e-07, + "loss": 1.3224, + "step": 16413 + }, + { + "epoch": 0.897393491245575, + "grad_norm": 1.788517713546753, + "learning_rate": 5.818690280806038e-07, + "loss": 1.5918, + "step": 16414 + }, + { + "epoch": 0.8974481636892965, + "grad_norm": 1.5545014142990112, + "learning_rate": 5.812549429137171e-07, + "loss": 1.6202, + "step": 16415 + }, + { + "epoch": 0.8975028361330181, + "grad_norm": 1.2533899545669556, + "learning_rate": 5.806411722608318e-07, + "loss": 1.393, + "step": 16416 + }, + { + "epoch": 0.8975575085767397, + "grad_norm": 1.937424898147583, + "learning_rate": 5.800277161424383e-07, + "loss": 1.44, + "step": 16417 + }, + { + "epoch": 0.8976121810204611, + "grad_norm": 1.6129916906356812, + "learning_rate": 5.794145745790269e-07, + "loss": 1.2534, + "step": 16418 + }, + { + "epoch": 0.8976668534641827, + "grad_norm": 1.9496221542358398, + "learning_rate": 5.788017475910679e-07, + "loss": 1.2972, + "step": 16419 + }, + { + "epoch": 0.8977215259079042, + "grad_norm": 1.4612849950790405, + "learning_rate": 5.781892351990271e-07, + "loss": 1.5493, + "step": 16420 + }, + { + "epoch": 0.8977761983516258, + "grad_norm": 1.2626419067382812, + "learning_rate": 5.775770374233558e-07, + "loss": 1.3972, + "step": 16421 + }, + { + "epoch": 0.8978308707953474, + "grad_norm": 1.9802899360656738, + "learning_rate": 5.769651542844989e-07, + "loss": 1.5413, + "step": 16422 + }, + { + "epoch": 0.8978855432390689, + "grad_norm": 1.560896873474121, + "learning_rate": 5.763535858028846e-07, + "loss": 1.4262, + "step": 16423 + }, + { + "epoch": 0.8979402156827905, + "grad_norm": 1.7408965826034546, + "learning_rate": 5.757423319989397e-07, + "loss": 1.5031, + "step": 16424 + }, + { + "epoch": 0.8979948881265121, + "grad_norm": 1.2723824977874756, + "learning_rate": 5.751313928930724e-07, + "loss": 1.4958, + "step": 16425 + }, + { + "epoch": 0.8980495605702336, + "grad_norm": 1.5613481998443604, + "learning_rate": 5.74520768505682e-07, + "loss": 1.2072, + "step": 16426 + }, + { + "epoch": 0.8981042330139551, + "grad_norm": 1.3168240785598755, + "learning_rate": 5.739104588571631e-07, + "loss": 1.289, + "step": 16427 + }, + { + "epoch": 0.8981589054576767, + "grad_norm": 1.4944043159484863, + "learning_rate": 5.73300463967893e-07, + "loss": 1.5486, + "step": 16428 + }, + { + "epoch": 0.8982135779013982, + "grad_norm": 2.105769395828247, + "learning_rate": 5.726907838582396e-07, + "loss": 1.2723, + "step": 16429 + }, + { + "epoch": 0.8982682503451198, + "grad_norm": 1.5351310968399048, + "learning_rate": 5.720814185485635e-07, + "loss": 1.316, + "step": 16430 + }, + { + "epoch": 0.8983229227888414, + "grad_norm": 1.4714826345443726, + "learning_rate": 5.714723680592116e-07, + "loss": 1.4388, + "step": 16431 + }, + { + "epoch": 0.8983775952325629, + "grad_norm": 1.300494909286499, + "learning_rate": 5.708636324105222e-07, + "loss": 1.5752, + "step": 16432 + }, + { + "epoch": 0.8984322676762845, + "grad_norm": 1.5001496076583862, + "learning_rate": 5.702552116228232e-07, + "loss": 1.3811, + "step": 16433 + }, + { + "epoch": 0.898486940120006, + "grad_norm": 1.5629558563232422, + "learning_rate": 5.696471057164299e-07, + "loss": 1.4416, + "step": 16434 + }, + { + "epoch": 0.8985416125637276, + "grad_norm": 1.4790536165237427, + "learning_rate": 5.690393147116491e-07, + "loss": 1.4233, + "step": 16435 + }, + { + "epoch": 0.8985962850074491, + "grad_norm": 1.5601170063018799, + "learning_rate": 5.684318386287758e-07, + "loss": 1.5312, + "step": 16436 + }, + { + "epoch": 0.8986509574511706, + "grad_norm": 1.6746326684951782, + "learning_rate": 5.678246774880946e-07, + "loss": 1.4973, + "step": 16437 + }, + { + "epoch": 0.8987056298948922, + "grad_norm": 1.657077670097351, + "learning_rate": 5.672178313098808e-07, + "loss": 1.5597, + "step": 16438 + }, + { + "epoch": 0.8987603023386138, + "grad_norm": 1.426019310951233, + "learning_rate": 5.666113001143991e-07, + "loss": 1.4829, + "step": 16439 + }, + { + "epoch": 0.8988149747823353, + "grad_norm": 1.5444587469100952, + "learning_rate": 5.660050839219011e-07, + "loss": 1.1148, + "step": 16440 + }, + { + "epoch": 0.8988696472260569, + "grad_norm": 1.296566367149353, + "learning_rate": 5.653991827526318e-07, + "loss": 1.5829, + "step": 16441 + }, + { + "epoch": 0.8989243196697785, + "grad_norm": 1.8365974426269531, + "learning_rate": 5.647935966268225e-07, + "loss": 1.5238, + "step": 16442 + }, + { + "epoch": 0.8989789921135, + "grad_norm": 1.5599714517593384, + "learning_rate": 5.641883255646952e-07, + "loss": 1.3425, + "step": 16443 + }, + { + "epoch": 0.8990336645572216, + "grad_norm": 1.3554898500442505, + "learning_rate": 5.635833695864623e-07, + "loss": 1.2307, + "step": 16444 + }, + { + "epoch": 0.8990883370009431, + "grad_norm": 1.365712285041809, + "learning_rate": 5.629787287123223e-07, + "loss": 1.538, + "step": 16445 + }, + { + "epoch": 0.8991430094446646, + "grad_norm": 1.7286334037780762, + "learning_rate": 5.623744029624678e-07, + "loss": 1.5955, + "step": 16446 + }, + { + "epoch": 0.8991976818883862, + "grad_norm": 1.5507980585098267, + "learning_rate": 5.617703923570795e-07, + "loss": 1.3544, + "step": 16447 + }, + { + "epoch": 0.8992523543321077, + "grad_norm": 1.485810399055481, + "learning_rate": 5.611666969163243e-07, + "loss": 1.6431, + "step": 16448 + }, + { + "epoch": 0.8993070267758293, + "grad_norm": 1.4816007614135742, + "learning_rate": 5.60563316660363e-07, + "loss": 1.6578, + "step": 16449 + }, + { + "epoch": 0.8993616992195509, + "grad_norm": 1.4396716356277466, + "learning_rate": 5.599602516093427e-07, + "loss": 1.463, + "step": 16450 + }, + { + "epoch": 0.8994163716632724, + "grad_norm": 1.7276084423065186, + "learning_rate": 5.593575017833997e-07, + "loss": 1.3684, + "step": 16451 + }, + { + "epoch": 0.899471044106994, + "grad_norm": 1.600358486175537, + "learning_rate": 5.587550672026643e-07, + "loss": 1.5742, + "step": 16452 + }, + { + "epoch": 0.8995257165507156, + "grad_norm": 1.3113394975662231, + "learning_rate": 5.581529478872516e-07, + "loss": 1.5268, + "step": 16453 + }, + { + "epoch": 0.899580388994437, + "grad_norm": 1.6178568601608276, + "learning_rate": 5.575511438572656e-07, + "loss": 1.2015, + "step": 16454 + }, + { + "epoch": 0.8996350614381586, + "grad_norm": 1.7005659341812134, + "learning_rate": 5.569496551328069e-07, + "loss": 1.3343, + "step": 16455 + }, + { + "epoch": 0.8996897338818802, + "grad_norm": 1.4634919166564941, + "learning_rate": 5.563484817339581e-07, + "loss": 1.5883, + "step": 16456 + }, + { + "epoch": 0.8997444063256017, + "grad_norm": 1.5824960470199585, + "learning_rate": 5.557476236807923e-07, + "loss": 1.2664, + "step": 16457 + }, + { + "epoch": 0.8997990787693233, + "grad_norm": 1.3173954486846924, + "learning_rate": 5.551470809933757e-07, + "loss": 1.5327, + "step": 16458 + }, + { + "epoch": 0.8998537512130449, + "grad_norm": 1.3273133039474487, + "learning_rate": 5.545468536917619e-07, + "loss": 1.6323, + "step": 16459 + }, + { + "epoch": 0.8999084236567664, + "grad_norm": 1.4669185876846313, + "learning_rate": 5.539469417959931e-07, + "loss": 1.3563, + "step": 16460 + }, + { + "epoch": 0.899963096100488, + "grad_norm": 1.6891868114471436, + "learning_rate": 5.533473453261007e-07, + "loss": 1.3657, + "step": 16461 + }, + { + "epoch": 0.9000177685442095, + "grad_norm": 1.5314797163009644, + "learning_rate": 5.527480643021077e-07, + "loss": 1.3548, + "step": 16462 + }, + { + "epoch": 0.900072440987931, + "grad_norm": 1.4374117851257324, + "learning_rate": 5.521490987440259e-07, + "loss": 1.3429, + "step": 16463 + }, + { + "epoch": 0.9001271134316526, + "grad_norm": 1.9214942455291748, + "learning_rate": 5.515504486718559e-07, + "loss": 1.1304, + "step": 16464 + }, + { + "epoch": 0.9001817858753741, + "grad_norm": 1.3469059467315674, + "learning_rate": 5.509521141055874e-07, + "loss": 1.5669, + "step": 16465 + }, + { + "epoch": 0.9002364583190957, + "grad_norm": 1.486772060394287, + "learning_rate": 5.503540950652009e-07, + "loss": 1.3688, + "step": 16466 + }, + { + "epoch": 0.9002911307628173, + "grad_norm": 1.6873743534088135, + "learning_rate": 5.497563915706661e-07, + "loss": 1.3959, + "step": 16467 + }, + { + "epoch": 0.9003458032065388, + "grad_norm": 3.1863372325897217, + "learning_rate": 5.491590036419392e-07, + "loss": 1.9604, + "step": 16468 + }, + { + "epoch": 0.9004004756502604, + "grad_norm": 2.2286667823791504, + "learning_rate": 5.48561931298972e-07, + "loss": 1.7066, + "step": 16469 + }, + { + "epoch": 0.900455148093982, + "grad_norm": 1.4216094017028809, + "learning_rate": 5.47965174561701e-07, + "loss": 1.2935, + "step": 16470 + }, + { + "epoch": 0.9005098205377035, + "grad_norm": 1.644869327545166, + "learning_rate": 5.473687334500499e-07, + "loss": 1.3323, + "step": 16471 + }, + { + "epoch": 0.900564492981425, + "grad_norm": 1.4454847574234009, + "learning_rate": 5.467726079839408e-07, + "loss": 1.4641, + "step": 16472 + }, + { + "epoch": 0.9006191654251466, + "grad_norm": 1.158736228942871, + "learning_rate": 5.461767981832766e-07, + "loss": 1.3572, + "step": 16473 + }, + { + "epoch": 0.9006738378688681, + "grad_norm": 1.4989540576934814, + "learning_rate": 5.455813040679536e-07, + "loss": 1.3723, + "step": 16474 + }, + { + "epoch": 0.9007285103125897, + "grad_norm": 1.2428607940673828, + "learning_rate": 5.449861256578559e-07, + "loss": 1.5407, + "step": 16475 + }, + { + "epoch": 0.9007831827563112, + "grad_norm": 1.6920877695083618, + "learning_rate": 5.443912629728565e-07, + "loss": 1.7043, + "step": 16476 + }, + { + "epoch": 0.9008378552000328, + "grad_norm": 1.5828906297683716, + "learning_rate": 5.437967160328228e-07, + "loss": 1.0163, + "step": 16477 + }, + { + "epoch": 0.9008925276437544, + "grad_norm": 1.4411680698394775, + "learning_rate": 5.432024848576067e-07, + "loss": 1.3717, + "step": 16478 + }, + { + "epoch": 0.9009472000874759, + "grad_norm": 1.6234725713729858, + "learning_rate": 5.426085694670491e-07, + "loss": 1.5794, + "step": 16479 + }, + { + "epoch": 0.9010018725311975, + "grad_norm": 1.699508786201477, + "learning_rate": 5.420149698809851e-07, + "loss": 1.1958, + "step": 16480 + }, + { + "epoch": 0.901056544974919, + "grad_norm": 1.6019742488861084, + "learning_rate": 5.414216861192356e-07, + "loss": 1.4226, + "step": 16481 + }, + { + "epoch": 0.9011112174186405, + "grad_norm": 1.4500170946121216, + "learning_rate": 5.408287182016092e-07, + "loss": 1.3666, + "step": 16482 + }, + { + "epoch": 0.9011658898623621, + "grad_norm": 1.4123446941375732, + "learning_rate": 5.402360661479101e-07, + "loss": 1.5596, + "step": 16483 + }, + { + "epoch": 0.9012205623060837, + "grad_norm": 1.4430288076400757, + "learning_rate": 5.396437299779278e-07, + "loss": 1.4664, + "step": 16484 + }, + { + "epoch": 0.9012752347498052, + "grad_norm": 1.522377610206604, + "learning_rate": 5.390517097114378e-07, + "loss": 1.3337, + "step": 16485 + }, + { + "epoch": 0.9013299071935268, + "grad_norm": 1.8474875688552856, + "learning_rate": 5.384600053682143e-07, + "loss": 1.2697, + "step": 16486 + }, + { + "epoch": 0.9013845796372484, + "grad_norm": 1.4016155004501343, + "learning_rate": 5.378686169680137e-07, + "loss": 1.2514, + "step": 16487 + }, + { + "epoch": 0.9014392520809699, + "grad_norm": 1.345678448677063, + "learning_rate": 5.372775445305833e-07, + "loss": 1.6336, + "step": 16488 + }, + { + "epoch": 0.9014939245246915, + "grad_norm": 1.5813905000686646, + "learning_rate": 5.366867880756599e-07, + "loss": 1.6354, + "step": 16489 + }, + { + "epoch": 0.9015485969684129, + "grad_norm": 1.240536093711853, + "learning_rate": 5.360963476229708e-07, + "loss": 1.4226, + "step": 16490 + }, + { + "epoch": 0.9016032694121345, + "grad_norm": 1.2796398401260376, + "learning_rate": 5.355062231922326e-07, + "loss": 1.3564, + "step": 16491 + }, + { + "epoch": 0.9016579418558561, + "grad_norm": 2.171245574951172, + "learning_rate": 5.349164148031515e-07, + "loss": 1.1685, + "step": 16492 + }, + { + "epoch": 0.9017126142995776, + "grad_norm": 1.6000397205352783, + "learning_rate": 5.3432692247542e-07, + "loss": 1.4085, + "step": 16493 + }, + { + "epoch": 0.9017672867432992, + "grad_norm": 1.4584095478057861, + "learning_rate": 5.337377462287263e-07, + "loss": 1.6049, + "step": 16494 + }, + { + "epoch": 0.9018219591870208, + "grad_norm": 1.6192913055419922, + "learning_rate": 5.331488860827427e-07, + "loss": 1.4399, + "step": 16495 + }, + { + "epoch": 0.9018766316307423, + "grad_norm": 1.9474595785140991, + "learning_rate": 5.325603420571302e-07, + "loss": 1.5746, + "step": 16496 + }, + { + "epoch": 0.9019313040744639, + "grad_norm": 2.0060369968414307, + "learning_rate": 5.319721141715461e-07, + "loss": 1.5103, + "step": 16497 + }, + { + "epoch": 0.9019859765181855, + "grad_norm": 1.1009674072265625, + "learning_rate": 5.313842024456306e-07, + "loss": 1.5111, + "step": 16498 + }, + { + "epoch": 0.9020406489619069, + "grad_norm": 1.5925041437149048, + "learning_rate": 5.307966068990144e-07, + "loss": 1.5067, + "step": 16499 + }, + { + "epoch": 0.9020953214056285, + "grad_norm": 1.5514806509017944, + "learning_rate": 5.302093275513209e-07, + "loss": 1.6389, + "step": 16500 + }, + { + "epoch": 0.9021499938493501, + "grad_norm": 1.474603533744812, + "learning_rate": 5.296223644221588e-07, + "loss": 1.3437, + "step": 16501 + }, + { + "epoch": 0.9022046662930716, + "grad_norm": 1.6320736408233643, + "learning_rate": 5.290357175311278e-07, + "loss": 1.4421, + "step": 16502 + }, + { + "epoch": 0.9022593387367932, + "grad_norm": 1.5670907497406006, + "learning_rate": 5.284493868978191e-07, + "loss": 1.9227, + "step": 16503 + }, + { + "epoch": 0.9023140111805147, + "grad_norm": 1.7005856037139893, + "learning_rate": 5.278633725418103e-07, + "loss": 1.4084, + "step": 16504 + }, + { + "epoch": 0.9023686836242363, + "grad_norm": 1.5313396453857422, + "learning_rate": 5.272776744826724e-07, + "loss": 1.2551, + "step": 16505 + }, + { + "epoch": 0.9024233560679579, + "grad_norm": 2.1080801486968994, + "learning_rate": 5.26692292739961e-07, + "loss": 1.5326, + "step": 16506 + }, + { + "epoch": 0.9024780285116794, + "grad_norm": 1.5658650398254395, + "learning_rate": 5.261072273332224e-07, + "loss": 1.5151, + "step": 16507 + }, + { + "epoch": 0.902532700955401, + "grad_norm": 1.363915205001831, + "learning_rate": 5.255224782819957e-07, + "loss": 1.4941, + "step": 16508 + }, + { + "epoch": 0.9025873733991225, + "grad_norm": 1.4209532737731934, + "learning_rate": 5.249380456058062e-07, + "loss": 1.4924, + "step": 16509 + }, + { + "epoch": 0.902642045842844, + "grad_norm": 1.8449751138687134, + "learning_rate": 5.243539293241684e-07, + "loss": 1.5485, + "step": 16510 + }, + { + "epoch": 0.9026967182865656, + "grad_norm": 1.5074023008346558, + "learning_rate": 5.237701294565889e-07, + "loss": 1.3682, + "step": 16511 + }, + { + "epoch": 0.9027513907302872, + "grad_norm": 1.4025393724441528, + "learning_rate": 5.231866460225621e-07, + "loss": 1.3543, + "step": 16512 + }, + { + "epoch": 0.9028060631740087, + "grad_norm": 2.100919246673584, + "learning_rate": 5.226034790415702e-07, + "loss": 1.4159, + "step": 16513 + }, + { + "epoch": 0.9028607356177303, + "grad_norm": 2.0277230739593506, + "learning_rate": 5.220206285330887e-07, + "loss": 1.3565, + "step": 16514 + }, + { + "epoch": 0.9029154080614519, + "grad_norm": 1.5094881057739258, + "learning_rate": 5.214380945165787e-07, + "loss": 1.2128, + "step": 16515 + }, + { + "epoch": 0.9029700805051734, + "grad_norm": 1.6341674327850342, + "learning_rate": 5.208558770114913e-07, + "loss": 1.3574, + "step": 16516 + }, + { + "epoch": 0.903024752948895, + "grad_norm": 1.5307223796844482, + "learning_rate": 5.20273976037271e-07, + "loss": 1.1487, + "step": 16517 + }, + { + "epoch": 0.9030794253926165, + "grad_norm": 1.4953702688217163, + "learning_rate": 5.196923916133467e-07, + "loss": 1.6871, + "step": 16518 + }, + { + "epoch": 0.903134097836338, + "grad_norm": 1.3156406879425049, + "learning_rate": 5.191111237591406e-07, + "loss": 1.4766, + "step": 16519 + }, + { + "epoch": 0.9031887702800596, + "grad_norm": 1.2519465684890747, + "learning_rate": 5.185301724940617e-07, + "loss": 1.5927, + "step": 16520 + }, + { + "epoch": 0.9032434427237811, + "grad_norm": 1.5635899305343628, + "learning_rate": 5.179495378375077e-07, + "loss": 1.5682, + "step": 16521 + }, + { + "epoch": 0.9032981151675027, + "grad_norm": 1.4165804386138916, + "learning_rate": 5.173692198088709e-07, + "loss": 1.2375, + "step": 16522 + }, + { + "epoch": 0.9033527876112243, + "grad_norm": 1.6663925647735596, + "learning_rate": 5.167892184275269e-07, + "loss": 1.4841, + "step": 16523 + }, + { + "epoch": 0.9034074600549458, + "grad_norm": 1.3449209928512573, + "learning_rate": 5.162095337128426e-07, + "loss": 1.4756, + "step": 16524 + }, + { + "epoch": 0.9034621324986674, + "grad_norm": 1.7490160465240479, + "learning_rate": 5.156301656841789e-07, + "loss": 1.5974, + "step": 16525 + }, + { + "epoch": 0.903516804942389, + "grad_norm": 1.9760361909866333, + "learning_rate": 5.150511143608782e-07, + "loss": 1.3813, + "step": 16526 + }, + { + "epoch": 0.9035714773861104, + "grad_norm": 1.5445208549499512, + "learning_rate": 5.144723797622786e-07, + "loss": 1.6275, + "step": 16527 + }, + { + "epoch": 0.903626149829832, + "grad_norm": 1.4433343410491943, + "learning_rate": 5.138939619077055e-07, + "loss": 1.3087, + "step": 16528 + }, + { + "epoch": 0.9036808222735536, + "grad_norm": 1.4394246339797974, + "learning_rate": 5.133158608164724e-07, + "loss": 1.5475, + "step": 16529 + }, + { + "epoch": 0.9037354947172751, + "grad_norm": 1.7230180501937866, + "learning_rate": 5.127380765078815e-07, + "loss": 1.4852, + "step": 16530 + }, + { + "epoch": 0.9037901671609967, + "grad_norm": 1.4409379959106445, + "learning_rate": 5.121606090012309e-07, + "loss": 1.4601, + "step": 16531 + }, + { + "epoch": 0.9038448396047183, + "grad_norm": 1.5077805519104004, + "learning_rate": 5.115834583158008e-07, + "loss": 1.4498, + "step": 16532 + }, + { + "epoch": 0.9038995120484398, + "grad_norm": 1.616725206375122, + "learning_rate": 5.110066244708645e-07, + "loss": 1.4441, + "step": 16533 + }, + { + "epoch": 0.9039541844921614, + "grad_norm": 1.4912543296813965, + "learning_rate": 5.104301074856843e-07, + "loss": 1.5774, + "step": 16534 + }, + { + "epoch": 0.9040088569358828, + "grad_norm": 1.3936572074890137, + "learning_rate": 5.098539073795095e-07, + "loss": 1.6002, + "step": 16535 + }, + { + "epoch": 0.9040635293796044, + "grad_norm": 1.7135405540466309, + "learning_rate": 5.092780241715833e-07, + "loss": 1.4276, + "step": 16536 + }, + { + "epoch": 0.904118201823326, + "grad_norm": 1.6234632730484009, + "learning_rate": 5.08702457881134e-07, + "loss": 1.5801, + "step": 16537 + }, + { + "epoch": 0.9041728742670475, + "grad_norm": 1.5794366598129272, + "learning_rate": 5.081272085273825e-07, + "loss": 1.5184, + "step": 16538 + }, + { + "epoch": 0.9042275467107691, + "grad_norm": 1.8673683404922485, + "learning_rate": 5.075522761295359e-07, + "loss": 1.581, + "step": 16539 + }, + { + "epoch": 0.9042822191544907, + "grad_norm": 1.684181571006775, + "learning_rate": 5.069776607067944e-07, + "loss": 1.5251, + "step": 16540 + }, + { + "epoch": 0.9043368915982122, + "grad_norm": 1.5512717962265015, + "learning_rate": 5.064033622783426e-07, + "loss": 1.27, + "step": 16541 + }, + { + "epoch": 0.9043915640419338, + "grad_norm": 1.8484680652618408, + "learning_rate": 5.058293808633629e-07, + "loss": 1.4312, + "step": 16542 + }, + { + "epoch": 0.9044462364856554, + "grad_norm": 1.603475570678711, + "learning_rate": 5.052557164810179e-07, + "loss": 1.5574, + "step": 16543 + }, + { + "epoch": 0.9045009089293768, + "grad_norm": 1.4037622213363647, + "learning_rate": 5.046823691504632e-07, + "loss": 1.8022, + "step": 16544 + }, + { + "epoch": 0.9045555813730984, + "grad_norm": 1.526639699935913, + "learning_rate": 5.041093388908469e-07, + "loss": 1.4731, + "step": 16545 + }, + { + "epoch": 0.90461025381682, + "grad_norm": 1.2657562494277954, + "learning_rate": 5.035366257213015e-07, + "loss": 1.4007, + "step": 16546 + }, + { + "epoch": 0.9046649262605415, + "grad_norm": 1.9500476121902466, + "learning_rate": 5.029642296609538e-07, + "loss": 1.671, + "step": 16547 + }, + { + "epoch": 0.9047195987042631, + "grad_norm": 2.175229549407959, + "learning_rate": 5.023921507289153e-07, + "loss": 1.5633, + "step": 16548 + }, + { + "epoch": 0.9047742711479846, + "grad_norm": 1.5587905645370483, + "learning_rate": 5.018203889442896e-07, + "loss": 1.411, + "step": 16549 + }, + { + "epoch": 0.9048289435917062, + "grad_norm": 1.9414373636245728, + "learning_rate": 5.012489443261693e-07, + "loss": 1.4219, + "step": 16550 + }, + { + "epoch": 0.9048836160354278, + "grad_norm": 1.4197851419448853, + "learning_rate": 5.006778168936377e-07, + "loss": 1.3515, + "step": 16551 + }, + { + "epoch": 0.9049382884791493, + "grad_norm": 1.3401503562927246, + "learning_rate": 5.001070066657631e-07, + "loss": 1.4446, + "step": 16552 + }, + { + "epoch": 0.9049929609228708, + "grad_norm": 1.6304343938827515, + "learning_rate": 4.995365136616092e-07, + "loss": 1.5023, + "step": 16553 + }, + { + "epoch": 0.9050476333665924, + "grad_norm": 1.9572242498397827, + "learning_rate": 4.98966337900224e-07, + "loss": 1.5812, + "step": 16554 + }, + { + "epoch": 0.9051023058103139, + "grad_norm": 1.2456179857254028, + "learning_rate": 4.983964794006457e-07, + "loss": 1.3452, + "step": 16555 + }, + { + "epoch": 0.9051569782540355, + "grad_norm": 1.8783615827560425, + "learning_rate": 4.978269381819068e-07, + "loss": 1.3928, + "step": 16556 + }, + { + "epoch": 0.9052116506977571, + "grad_norm": 1.911392092704773, + "learning_rate": 4.972577142630242e-07, + "loss": 1.6299, + "step": 16557 + }, + { + "epoch": 0.9052663231414786, + "grad_norm": 1.3099197149276733, + "learning_rate": 4.96688807663005e-07, + "loss": 1.6015, + "step": 16558 + }, + { + "epoch": 0.9053209955852002, + "grad_norm": 1.4748399257659912, + "learning_rate": 4.961202184008462e-07, + "loss": 1.5591, + "step": 16559 + }, + { + "epoch": 0.9053756680289218, + "grad_norm": 1.3178858757019043, + "learning_rate": 4.95551946495535e-07, + "loss": 1.6053, + "step": 16560 + }, + { + "epoch": 0.9054303404726433, + "grad_norm": 1.8868361711502075, + "learning_rate": 4.949839919660481e-07, + "loss": 1.3569, + "step": 16561 + }, + { + "epoch": 0.9054850129163649, + "grad_norm": 1.6211555004119873, + "learning_rate": 4.944163548313496e-07, + "loss": 1.225, + "step": 16562 + }, + { + "epoch": 0.9055396853600863, + "grad_norm": 1.3733115196228027, + "learning_rate": 4.938490351103931e-07, + "loss": 1.2283, + "step": 16563 + }, + { + "epoch": 0.9055943578038079, + "grad_norm": 1.9238301515579224, + "learning_rate": 4.932820328221266e-07, + "loss": 1.2887, + "step": 16564 + }, + { + "epoch": 0.9056490302475295, + "grad_norm": 1.5086592435836792, + "learning_rate": 4.927153479854807e-07, + "loss": 1.1103, + "step": 16565 + }, + { + "epoch": 0.905703702691251, + "grad_norm": 1.7131986618041992, + "learning_rate": 4.921489806193779e-07, + "loss": 1.5399, + "step": 16566 + }, + { + "epoch": 0.9057583751349726, + "grad_norm": 1.4551976919174194, + "learning_rate": 4.915829307427333e-07, + "loss": 1.3102, + "step": 16567 + }, + { + "epoch": 0.9058130475786942, + "grad_norm": 1.5155781507492065, + "learning_rate": 4.91017198374445e-07, + "loss": 1.4701, + "step": 16568 + }, + { + "epoch": 0.9058677200224157, + "grad_norm": 1.666883945465088, + "learning_rate": 4.904517835334055e-07, + "loss": 1.1197, + "step": 16569 + }, + { + "epoch": 0.9059223924661373, + "grad_norm": 1.6012890338897705, + "learning_rate": 4.898866862384976e-07, + "loss": 1.2718, + "step": 16570 + }, + { + "epoch": 0.9059770649098589, + "grad_norm": 1.7079696655273438, + "learning_rate": 4.893219065085886e-07, + "loss": 1.5184, + "step": 16571 + }, + { + "epoch": 0.9060317373535803, + "grad_norm": 1.4390469789505005, + "learning_rate": 4.887574443625376e-07, + "loss": 1.3853, + "step": 16572 + }, + { + "epoch": 0.9060864097973019, + "grad_norm": 2.6773557662963867, + "learning_rate": 4.881932998191963e-07, + "loss": 1.382, + "step": 16573 + }, + { + "epoch": 0.9061410822410235, + "grad_norm": 1.3780665397644043, + "learning_rate": 4.876294728973984e-07, + "loss": 1.6865, + "step": 16574 + }, + { + "epoch": 0.906195754684745, + "grad_norm": 1.3174928426742554, + "learning_rate": 4.870659636159758e-07, + "loss": 1.3409, + "step": 16575 + }, + { + "epoch": 0.9062504271284666, + "grad_norm": 1.4403079748153687, + "learning_rate": 4.865027719937443e-07, + "loss": 1.2878, + "step": 16576 + }, + { + "epoch": 0.9063050995721881, + "grad_norm": 1.360338568687439, + "learning_rate": 4.859398980495078e-07, + "loss": 1.4323, + "step": 16577 + }, + { + "epoch": 0.9063597720159097, + "grad_norm": 1.516973853111267, + "learning_rate": 4.853773418020646e-07, + "loss": 1.5799, + "step": 16578 + }, + { + "epoch": 0.9064144444596313, + "grad_norm": 1.6476037502288818, + "learning_rate": 4.848151032701987e-07, + "loss": 1.272, + "step": 16579 + }, + { + "epoch": 0.9064691169033527, + "grad_norm": 1.9817909002304077, + "learning_rate": 4.842531824726826e-07, + "loss": 1.5371, + "step": 16580 + }, + { + "epoch": 0.9065237893470743, + "grad_norm": 1.9213200807571411, + "learning_rate": 4.836915794282838e-07, + "loss": 1.4107, + "step": 16581 + }, + { + "epoch": 0.9065784617907959, + "grad_norm": 1.7167314291000366, + "learning_rate": 4.831302941557537e-07, + "loss": 1.3114, + "step": 16582 + }, + { + "epoch": 0.9066331342345174, + "grad_norm": 1.431925892829895, + "learning_rate": 4.82569326673834e-07, + "loss": 1.6466, + "step": 16583 + }, + { + "epoch": 0.906687806678239, + "grad_norm": 1.5814262628555298, + "learning_rate": 4.8200867700126e-07, + "loss": 1.5123, + "step": 16584 + }, + { + "epoch": 0.9067424791219606, + "grad_norm": 1.327040433883667, + "learning_rate": 4.814483451567498e-07, + "loss": 1.5285, + "step": 16585 + }, + { + "epoch": 0.9067971515656821, + "grad_norm": 1.689491629600525, + "learning_rate": 4.80888331159014e-07, + "loss": 1.4825, + "step": 16586 + }, + { + "epoch": 0.9068518240094037, + "grad_norm": 1.8286093473434448, + "learning_rate": 4.803286350267555e-07, + "loss": 1.541, + "step": 16587 + }, + { + "epoch": 0.9069064964531253, + "grad_norm": 1.5227874517440796, + "learning_rate": 4.797692567786616e-07, + "loss": 1.4647, + "step": 16588 + }, + { + "epoch": 0.9069611688968467, + "grad_norm": 1.5197539329528809, + "learning_rate": 4.79210196433414e-07, + "loss": 1.8504, + "step": 16589 + }, + { + "epoch": 0.9070158413405683, + "grad_norm": 1.567513108253479, + "learning_rate": 4.786514540096776e-07, + "loss": 1.6139, + "step": 16590 + }, + { + "epoch": 0.9070705137842898, + "grad_norm": 1.362748622894287, + "learning_rate": 4.780930295261133e-07, + "loss": 1.6824, + "step": 16591 + }, + { + "epoch": 0.9071251862280114, + "grad_norm": 1.6526172161102295, + "learning_rate": 4.77534923001366e-07, + "loss": 1.4684, + "step": 16592 + }, + { + "epoch": 0.907179858671733, + "grad_norm": 1.2453995943069458, + "learning_rate": 4.769771344540719e-07, + "loss": 1.5806, + "step": 16593 + }, + { + "epoch": 0.9072345311154545, + "grad_norm": 2.1283414363861084, + "learning_rate": 4.764196639028573e-07, + "loss": 1.4897, + "step": 16594 + }, + { + "epoch": 0.9072892035591761, + "grad_norm": 1.7564514875411987, + "learning_rate": 4.7586251136633956e-07, + "loss": 1.2405, + "step": 16595 + }, + { + "epoch": 0.9073438760028977, + "grad_norm": 1.6273683309555054, + "learning_rate": 4.7530567686312035e-07, + "loss": 1.7468, + "step": 16596 + }, + { + "epoch": 0.9073985484466192, + "grad_norm": 1.3795156478881836, + "learning_rate": 4.7474916041179487e-07, + "loss": 1.2886, + "step": 16597 + }, + { + "epoch": 0.9074532208903408, + "grad_norm": 1.605350136756897, + "learning_rate": 4.7419296203094713e-07, + "loss": 1.5796, + "step": 16598 + }, + { + "epoch": 0.9075078933340623, + "grad_norm": 1.4682596921920776, + "learning_rate": 4.7363708173915e-07, + "loss": 1.4446, + "step": 16599 + }, + { + "epoch": 0.9075625657777838, + "grad_norm": 1.4082486629486084, + "learning_rate": 4.730815195549643e-07, + "loss": 1.3918, + "step": 16600 + }, + { + "epoch": 0.9076172382215054, + "grad_norm": 1.344869613647461, + "learning_rate": 4.7252627549694285e-07, + "loss": 1.453, + "step": 16601 + }, + { + "epoch": 0.907671910665227, + "grad_norm": 2.029784917831421, + "learning_rate": 4.719713495836242e-07, + "loss": 1.4495, + "step": 16602 + }, + { + "epoch": 0.9077265831089485, + "grad_norm": 1.3657629489898682, + "learning_rate": 4.7141674183354247e-07, + "loss": 1.4855, + "step": 16603 + }, + { + "epoch": 0.9077812555526701, + "grad_norm": 1.3628010749816895, + "learning_rate": 4.70862452265215e-07, + "loss": 1.7583, + "step": 16604 + }, + { + "epoch": 0.9078359279963916, + "grad_norm": 1.4808510541915894, + "learning_rate": 4.703084808971503e-07, + "loss": 1.3724, + "step": 16605 + }, + { + "epoch": 0.9078906004401132, + "grad_norm": 1.5172638893127441, + "learning_rate": 4.697548277478481e-07, + "loss": 1.6871, + "step": 16606 + }, + { + "epoch": 0.9079452728838348, + "grad_norm": 1.3598082065582275, + "learning_rate": 4.6920149283579574e-07, + "loss": 1.2174, + "step": 16607 + }, + { + "epoch": 0.9079999453275562, + "grad_norm": 2.187749147415161, + "learning_rate": 4.6864847617946964e-07, + "loss": 1.6094, + "step": 16608 + }, + { + "epoch": 0.9080546177712778, + "grad_norm": 1.3432947397232056, + "learning_rate": 4.6809577779733715e-07, + "loss": 1.5383, + "step": 16609 + }, + { + "epoch": 0.9081092902149994, + "grad_norm": 1.5464075803756714, + "learning_rate": 4.6754339770785474e-07, + "loss": 1.3921, + "step": 16610 + }, + { + "epoch": 0.9081639626587209, + "grad_norm": 1.5525521039962769, + "learning_rate": 4.6699133592946535e-07, + "loss": 1.2858, + "step": 16611 + }, + { + "epoch": 0.9082186351024425, + "grad_norm": 1.3084759712219238, + "learning_rate": 4.6643959248060643e-07, + "loss": 1.4346, + "step": 16612 + }, + { + "epoch": 0.9082733075461641, + "grad_norm": 1.5102957487106323, + "learning_rate": 4.658881673797e-07, + "loss": 1.3544, + "step": 16613 + }, + { + "epoch": 0.9083279799898856, + "grad_norm": 1.6294901371002197, + "learning_rate": 4.6533706064516015e-07, + "loss": 1.5692, + "step": 16614 + }, + { + "epoch": 0.9083826524336072, + "grad_norm": 1.4189276695251465, + "learning_rate": 4.6478627229539e-07, + "loss": 1.4987, + "step": 16615 + }, + { + "epoch": 0.9084373248773288, + "grad_norm": 1.457108974456787, + "learning_rate": 4.6423580234878143e-07, + "loss": 1.4116, + "step": 16616 + }, + { + "epoch": 0.9084919973210502, + "grad_norm": 1.3066251277923584, + "learning_rate": 4.636856508237164e-07, + "loss": 1.4778, + "step": 16617 + }, + { + "epoch": 0.9085466697647718, + "grad_norm": 1.390095829963684, + "learning_rate": 4.6313581773856474e-07, + "loss": 1.5428, + "step": 16618 + }, + { + "epoch": 0.9086013422084933, + "grad_norm": 1.7729719877243042, + "learning_rate": 4.6258630311168505e-07, + "loss": 1.1238, + "step": 16619 + }, + { + "epoch": 0.9086560146522149, + "grad_norm": 1.7937465906143188, + "learning_rate": 4.6203710696143153e-07, + "loss": 1.4198, + "step": 16620 + }, + { + "epoch": 0.9087106870959365, + "grad_norm": 1.7091108560562134, + "learning_rate": 4.614882293061396e-07, + "loss": 1.4374, + "step": 16621 + }, + { + "epoch": 0.908765359539658, + "grad_norm": 1.3614387512207031, + "learning_rate": 4.6093967016413774e-07, + "loss": 1.3376, + "step": 16622 + }, + { + "epoch": 0.9088200319833796, + "grad_norm": 1.3629167079925537, + "learning_rate": 4.6039142955374483e-07, + "loss": 1.4316, + "step": 16623 + }, + { + "epoch": 0.9088747044271012, + "grad_norm": 1.5960475206375122, + "learning_rate": 4.5984350749326835e-07, + "loss": 1.3584, + "step": 16624 + }, + { + "epoch": 0.9089293768708226, + "grad_norm": 1.4380074739456177, + "learning_rate": 4.592959040010025e-07, + "loss": 1.4985, + "step": 16625 + }, + { + "epoch": 0.9089840493145442, + "grad_norm": 1.8394908905029297, + "learning_rate": 4.5874861909523506e-07, + "loss": 1.3788, + "step": 16626 + }, + { + "epoch": 0.9090387217582658, + "grad_norm": 1.7009764909744263, + "learning_rate": 4.582016527942412e-07, + "loss": 1.3975, + "step": 16627 + }, + { + "epoch": 0.9090933942019873, + "grad_norm": 1.7550947666168213, + "learning_rate": 4.5765500511628314e-07, + "loss": 1.5332, + "step": 16628 + }, + { + "epoch": 0.9091480666457089, + "grad_norm": 1.4883133172988892, + "learning_rate": 4.571086760796173e-07, + "loss": 1.4311, + "step": 16629 + }, + { + "epoch": 0.9092027390894305, + "grad_norm": 1.3270790576934814, + "learning_rate": 4.5656266570248687e-07, + "loss": 1.5133, + "step": 16630 + }, + { + "epoch": 0.909257411533152, + "grad_norm": 1.944191575050354, + "learning_rate": 4.5601697400312175e-07, + "loss": 1.3577, + "step": 16631 + }, + { + "epoch": 0.9093120839768736, + "grad_norm": 1.4677447080612183, + "learning_rate": 4.554716009997473e-07, + "loss": 1.6681, + "step": 16632 + }, + { + "epoch": 0.9093667564205951, + "grad_norm": 1.5414713621139526, + "learning_rate": 4.5492654671057014e-07, + "loss": 1.4207, + "step": 16633 + }, + { + "epoch": 0.9094214288643166, + "grad_norm": 1.8434052467346191, + "learning_rate": 4.5438181115379564e-07, + "loss": 1.6942, + "step": 16634 + }, + { + "epoch": 0.9094761013080382, + "grad_norm": 1.7853291034698486, + "learning_rate": 4.5383739434761265e-07, + "loss": 1.2701, + "step": 16635 + }, + { + "epoch": 0.9095307737517597, + "grad_norm": 1.5073788166046143, + "learning_rate": 4.532932963101977e-07, + "loss": 1.5243, + "step": 16636 + }, + { + "epoch": 0.9095854461954813, + "grad_norm": 1.7075164318084717, + "learning_rate": 4.5274951705972294e-07, + "loss": 1.5329, + "step": 16637 + }, + { + "epoch": 0.9096401186392029, + "grad_norm": 1.4355227947235107, + "learning_rate": 4.5220605661434605e-07, + "loss": 1.6033, + "step": 16638 + }, + { + "epoch": 0.9096947910829244, + "grad_norm": 1.4837616682052612, + "learning_rate": 4.5166291499221137e-07, + "loss": 1.6092, + "step": 16639 + }, + { + "epoch": 0.909749463526646, + "grad_norm": 1.3926763534545898, + "learning_rate": 4.511200922114589e-07, + "loss": 1.2788, + "step": 16640 + }, + { + "epoch": 0.9098041359703676, + "grad_norm": 1.2640084028244019, + "learning_rate": 4.505775882902141e-07, + "loss": 1.6904, + "step": 16641 + }, + { + "epoch": 0.9098588084140891, + "grad_norm": 1.4422311782836914, + "learning_rate": 4.5003540324659255e-07, + "loss": 1.2653, + "step": 16642 + }, + { + "epoch": 0.9099134808578107, + "grad_norm": 1.982452630996704, + "learning_rate": 4.494935370986986e-07, + "loss": 1.2899, + "step": 16643 + }, + { + "epoch": 0.9099681533015322, + "grad_norm": 1.2946665287017822, + "learning_rate": 4.489519898646244e-07, + "loss": 1.4471, + "step": 16644 + }, + { + "epoch": 0.9100228257452537, + "grad_norm": 1.5372899770736694, + "learning_rate": 4.4841076156245665e-07, + "loss": 1.5809, + "step": 16645 + }, + { + "epoch": 0.9100774981889753, + "grad_norm": 1.982246994972229, + "learning_rate": 4.4786985221026756e-07, + "loss": 1.4388, + "step": 16646 + }, + { + "epoch": 0.9101321706326968, + "grad_norm": 1.6235270500183105, + "learning_rate": 4.4732926182611826e-07, + "loss": 1.4114, + "step": 16647 + }, + { + "epoch": 0.9101868430764184, + "grad_norm": 1.3775150775909424, + "learning_rate": 4.467889904280609e-07, + "loss": 1.4433, + "step": 16648 + }, + { + "epoch": 0.91024151552014, + "grad_norm": 1.5604674816131592, + "learning_rate": 4.4624903803413667e-07, + "loss": 1.5103, + "step": 16649 + }, + { + "epoch": 0.9102961879638615, + "grad_norm": 1.1554011106491089, + "learning_rate": 4.457094046623756e-07, + "loss": 1.3723, + "step": 16650 + }, + { + "epoch": 0.9103508604075831, + "grad_norm": 1.4234286546707153, + "learning_rate": 4.451700903307976e-07, + "loss": 1.771, + "step": 16651 + }, + { + "epoch": 0.9104055328513047, + "grad_norm": 1.723787546157837, + "learning_rate": 4.4463109505741177e-07, + "loss": 1.2329, + "step": 16652 + }, + { + "epoch": 0.9104602052950261, + "grad_norm": 1.7492060661315918, + "learning_rate": 4.440924188602136e-07, + "loss": 1.485, + "step": 16653 + }, + { + "epoch": 0.9105148777387477, + "grad_norm": 1.647519588470459, + "learning_rate": 4.435540617571965e-07, + "loss": 1.2502, + "step": 16654 + }, + { + "epoch": 0.9105695501824693, + "grad_norm": 1.333848476409912, + "learning_rate": 4.430160237663328e-07, + "loss": 1.4295, + "step": 16655 + }, + { + "epoch": 0.9106242226261908, + "grad_norm": 1.4446117877960205, + "learning_rate": 4.424783049055903e-07, + "loss": 1.5551, + "step": 16656 + }, + { + "epoch": 0.9106788950699124, + "grad_norm": 1.4632972478866577, + "learning_rate": 4.4194090519292467e-07, + "loss": 1.2832, + "step": 16657 + }, + { + "epoch": 0.910733567513634, + "grad_norm": 1.6879723072052002, + "learning_rate": 4.414038246462804e-07, + "loss": 1.5722, + "step": 16658 + }, + { + "epoch": 0.9107882399573555, + "grad_norm": 1.5231037139892578, + "learning_rate": 4.408670632835932e-07, + "loss": 1.6228, + "step": 16659 + }, + { + "epoch": 0.9108429124010771, + "grad_norm": 1.322140097618103, + "learning_rate": 4.4033062112278537e-07, + "loss": 1.3353, + "step": 16660 + }, + { + "epoch": 0.9108975848447985, + "grad_norm": 1.417049527168274, + "learning_rate": 4.397944981817703e-07, + "loss": 1.512, + "step": 16661 + }, + { + "epoch": 0.9109522572885201, + "grad_norm": 1.3932801485061646, + "learning_rate": 4.392586944784505e-07, + "loss": 1.5752, + "step": 16662 + }, + { + "epoch": 0.9110069297322417, + "grad_norm": 1.6002942323684692, + "learning_rate": 4.387232100307193e-07, + "loss": 1.4472, + "step": 16663 + }, + { + "epoch": 0.9110616021759632, + "grad_norm": 1.553659200668335, + "learning_rate": 4.3818804485645463e-07, + "loss": 1.6242, + "step": 16664 + }, + { + "epoch": 0.9111162746196848, + "grad_norm": 1.805490255355835, + "learning_rate": 4.3765319897352997e-07, + "loss": 1.6124, + "step": 16665 + }, + { + "epoch": 0.9111709470634064, + "grad_norm": 1.5832812786102295, + "learning_rate": 4.3711867239980335e-07, + "loss": 1.652, + "step": 16666 + }, + { + "epoch": 0.9112256195071279, + "grad_norm": 1.6879050731658936, + "learning_rate": 4.365844651531237e-07, + "loss": 1.4694, + "step": 16667 + }, + { + "epoch": 0.9112802919508495, + "grad_norm": 1.4239535331726074, + "learning_rate": 4.3605057725133015e-07, + "loss": 1.4545, + "step": 16668 + }, + { + "epoch": 0.9113349643945711, + "grad_norm": 1.5067708492279053, + "learning_rate": 4.3551700871225177e-07, + "loss": 1.4389, + "step": 16669 + }, + { + "epoch": 0.9113896368382925, + "grad_norm": 1.7362186908721924, + "learning_rate": 4.349837595537032e-07, + "loss": 1.5638, + "step": 16670 + }, + { + "epoch": 0.9114443092820141, + "grad_norm": 1.4819859266281128, + "learning_rate": 4.344508297934924e-07, + "loss": 1.5922, + "step": 16671 + }, + { + "epoch": 0.9114989817257357, + "grad_norm": 1.485520362854004, + "learning_rate": 4.339182194494129e-07, + "loss": 1.4538, + "step": 16672 + }, + { + "epoch": 0.9115536541694572, + "grad_norm": 1.2558636665344238, + "learning_rate": 4.3338592853925277e-07, + "loss": 1.3448, + "step": 16673 + }, + { + "epoch": 0.9116083266131788, + "grad_norm": 1.405309796333313, + "learning_rate": 4.3285395708078547e-07, + "loss": 1.3085, + "step": 16674 + }, + { + "epoch": 0.9116629990569003, + "grad_norm": 2.2397844791412354, + "learning_rate": 4.323223050917735e-07, + "loss": 1.3278, + "step": 16675 + }, + { + "epoch": 0.9117176715006219, + "grad_norm": 1.6534284353256226, + "learning_rate": 4.317909725899727e-07, + "loss": 1.3257, + "step": 16676 + }, + { + "epoch": 0.9117723439443435, + "grad_norm": 1.9553894996643066, + "learning_rate": 4.312599595931233e-07, + "loss": 1.1154, + "step": 16677 + }, + { + "epoch": 0.911827016388065, + "grad_norm": 1.10032320022583, + "learning_rate": 4.307292661189566e-07, + "loss": 1.467, + "step": 16678 + }, + { + "epoch": 0.9118816888317866, + "grad_norm": 1.7185001373291016, + "learning_rate": 4.3019889218519516e-07, + "loss": 1.4179, + "step": 16679 + }, + { + "epoch": 0.9119363612755081, + "grad_norm": 1.3887134790420532, + "learning_rate": 4.296688378095493e-07, + "loss": 1.321, + "step": 16680 + }, + { + "epoch": 0.9119910337192296, + "grad_norm": 1.9237791299819946, + "learning_rate": 4.291391030097192e-07, + "loss": 1.5053, + "step": 16681 + }, + { + "epoch": 0.9120457061629512, + "grad_norm": 1.5915606021881104, + "learning_rate": 4.2860968780339296e-07, + "loss": 1.4677, + "step": 16682 + }, + { + "epoch": 0.9121003786066728, + "grad_norm": 1.7541253566741943, + "learning_rate": 4.280805922082487e-07, + "loss": 1.4933, + "step": 16683 + }, + { + "epoch": 0.9121550510503943, + "grad_norm": 1.53954017162323, + "learning_rate": 4.2755181624195344e-07, + "loss": 1.3969, + "step": 16684 + }, + { + "epoch": 0.9122097234941159, + "grad_norm": 1.5204848051071167, + "learning_rate": 4.270233599221674e-07, + "loss": 1.379, + "step": 16685 + }, + { + "epoch": 0.9122643959378375, + "grad_norm": 1.772190809249878, + "learning_rate": 4.2649522326653315e-07, + "loss": 1.4052, + "step": 16686 + }, + { + "epoch": 0.912319068381559, + "grad_norm": 1.633531928062439, + "learning_rate": 4.2596740629268997e-07, + "loss": 1.3784, + "step": 16687 + }, + { + "epoch": 0.9123737408252806, + "grad_norm": 1.5029513835906982, + "learning_rate": 4.2543990901826035e-07, + "loss": 1.372, + "step": 16688 + }, + { + "epoch": 0.912428413269002, + "grad_norm": 1.683058261871338, + "learning_rate": 4.249127314608592e-07, + "loss": 1.3444, + "step": 16689 + }, + { + "epoch": 0.9124830857127236, + "grad_norm": 1.2892299890518188, + "learning_rate": 4.2438587363809127e-07, + "loss": 1.8789, + "step": 16690 + }, + { + "epoch": 0.9125377581564452, + "grad_norm": 1.5025215148925781, + "learning_rate": 4.238593355675502e-07, + "loss": 1.5539, + "step": 16691 + }, + { + "epoch": 0.9125924306001667, + "grad_norm": 1.7768996953964233, + "learning_rate": 4.2333311726681426e-07, + "loss": 1.3893, + "step": 16692 + }, + { + "epoch": 0.9126471030438883, + "grad_norm": 1.7117761373519897, + "learning_rate": 4.228072187534604e-07, + "loss": 1.171, + "step": 16693 + }, + { + "epoch": 0.9127017754876099, + "grad_norm": 1.4276357889175415, + "learning_rate": 4.222816400450458e-07, + "loss": 1.4923, + "step": 16694 + }, + { + "epoch": 0.9127564479313314, + "grad_norm": 1.3710061311721802, + "learning_rate": 4.2175638115912296e-07, + "loss": 1.6558, + "step": 16695 + }, + { + "epoch": 0.912811120375053, + "grad_norm": 1.4296504259109497, + "learning_rate": 4.212314421132302e-07, + "loss": 1.3463, + "step": 16696 + }, + { + "epoch": 0.9128657928187746, + "grad_norm": 1.6508716344833374, + "learning_rate": 4.2070682292489674e-07, + "loss": 1.4529, + "step": 16697 + }, + { + "epoch": 0.912920465262496, + "grad_norm": 2.418224334716797, + "learning_rate": 4.201825236116408e-07, + "loss": 1.4346, + "step": 16698 + }, + { + "epoch": 0.9129751377062176, + "grad_norm": 1.594344139099121, + "learning_rate": 4.196585441909706e-07, + "loss": 1.3559, + "step": 16699 + }, + { + "epoch": 0.9130298101499392, + "grad_norm": 1.6097208261489868, + "learning_rate": 4.191348846803811e-07, + "loss": 1.6466, + "step": 16700 + }, + { + "epoch": 0.9130844825936607, + "grad_norm": 1.6014610528945923, + "learning_rate": 4.186115450973616e-07, + "loss": 1.0773, + "step": 16701 + }, + { + "epoch": 0.9131391550373823, + "grad_norm": 1.6409519910812378, + "learning_rate": 4.1808852545938583e-07, + "loss": 1.3883, + "step": 16702 + }, + { + "epoch": 0.9131938274811038, + "grad_norm": 2.0416979789733887, + "learning_rate": 4.175658257839177e-07, + "loss": 1.1789, + "step": 16703 + }, + { + "epoch": 0.9132484999248254, + "grad_norm": 1.5143452882766724, + "learning_rate": 4.170434460884132e-07, + "loss": 1.6261, + "step": 16704 + }, + { + "epoch": 0.913303172368547, + "grad_norm": 1.4217872619628906, + "learning_rate": 4.1652138639031614e-07, + "loss": 1.3985, + "step": 16705 + }, + { + "epoch": 0.9133578448122684, + "grad_norm": 1.5267361402511597, + "learning_rate": 4.1599964670705705e-07, + "loss": 1.3953, + "step": 16706 + }, + { + "epoch": 0.91341251725599, + "grad_norm": 1.5297183990478516, + "learning_rate": 4.154782270560598e-07, + "loss": 1.6568, + "step": 16707 + }, + { + "epoch": 0.9134671896997116, + "grad_norm": 1.4313313961029053, + "learning_rate": 4.1495712745473595e-07, + "loss": 1.4553, + "step": 16708 + }, + { + "epoch": 0.9135218621434331, + "grad_norm": 1.8714346885681152, + "learning_rate": 4.144363479204849e-07, + "loss": 1.6222, + "step": 16709 + }, + { + "epoch": 0.9135765345871547, + "grad_norm": 1.577540397644043, + "learning_rate": 4.1391588847069844e-07, + "loss": 1.2262, + "step": 16710 + }, + { + "epoch": 0.9136312070308763, + "grad_norm": 1.2806649208068848, + "learning_rate": 4.133957491227547e-07, + "loss": 1.4062, + "step": 16711 + }, + { + "epoch": 0.9136858794745978, + "grad_norm": 1.9488893747329712, + "learning_rate": 4.12875929894021e-07, + "loss": 1.5749, + "step": 16712 + }, + { + "epoch": 0.9137405519183194, + "grad_norm": 2.1432135105133057, + "learning_rate": 4.1235643080185797e-07, + "loss": 1.2829, + "step": 16713 + }, + { + "epoch": 0.913795224362041, + "grad_norm": 1.4150711297988892, + "learning_rate": 4.1183725186361044e-07, + "loss": 1.3773, + "step": 16714 + }, + { + "epoch": 0.9138498968057625, + "grad_norm": 1.895113229751587, + "learning_rate": 4.1131839309661803e-07, + "loss": 1.3171, + "step": 16715 + }, + { + "epoch": 0.913904569249484, + "grad_norm": 1.6188870668411255, + "learning_rate": 4.107998545182046e-07, + "loss": 1.4757, + "step": 16716 + }, + { + "epoch": 0.9139592416932056, + "grad_norm": 1.8441134691238403, + "learning_rate": 4.1028163614568516e-07, + "loss": 1.4988, + "step": 16717 + }, + { + "epoch": 0.9140139141369271, + "grad_norm": 1.4319075345993042, + "learning_rate": 4.097637379963659e-07, + "loss": 1.423, + "step": 16718 + }, + { + "epoch": 0.9140685865806487, + "grad_norm": 1.4399973154067993, + "learning_rate": 4.092461600875397e-07, + "loss": 1.3715, + "step": 16719 + }, + { + "epoch": 0.9141232590243702, + "grad_norm": 1.7619843482971191, + "learning_rate": 4.0872890243648933e-07, + "loss": 1.249, + "step": 16720 + }, + { + "epoch": 0.9141779314680918, + "grad_norm": 1.4997916221618652, + "learning_rate": 4.0821196506048764e-07, + "loss": 1.4778, + "step": 16721 + }, + { + "epoch": 0.9142326039118134, + "grad_norm": 1.5830663442611694, + "learning_rate": 4.076953479767964e-07, + "loss": 1.4636, + "step": 16722 + }, + { + "epoch": 0.9142872763555349, + "grad_norm": 1.2814377546310425, + "learning_rate": 4.071790512026652e-07, + "loss": 1.4624, + "step": 16723 + }, + { + "epoch": 0.9143419487992565, + "grad_norm": 1.898269772529602, + "learning_rate": 4.0666307475533686e-07, + "loss": 1.5704, + "step": 16724 + }, + { + "epoch": 0.914396621242978, + "grad_norm": 1.5947519540786743, + "learning_rate": 4.061474186520409e-07, + "loss": 1.4114, + "step": 16725 + }, + { + "epoch": 0.9144512936866995, + "grad_norm": 1.671090841293335, + "learning_rate": 4.056320829099925e-07, + "loss": 1.4355, + "step": 16726 + }, + { + "epoch": 0.9145059661304211, + "grad_norm": 1.451131820678711, + "learning_rate": 4.0511706754640557e-07, + "loss": 1.4186, + "step": 16727 + }, + { + "epoch": 0.9145606385741427, + "grad_norm": 1.2736256122589111, + "learning_rate": 4.0460237257847203e-07, + "loss": 1.4281, + "step": 16728 + }, + { + "epoch": 0.9146153110178642, + "grad_norm": 1.7558070421218872, + "learning_rate": 4.040879980233836e-07, + "loss": 1.5428, + "step": 16729 + }, + { + "epoch": 0.9146699834615858, + "grad_norm": 1.3169188499450684, + "learning_rate": 4.035739438983144e-07, + "loss": 1.5159, + "step": 16730 + }, + { + "epoch": 0.9147246559053074, + "grad_norm": 1.3064603805541992, + "learning_rate": 4.030602102204284e-07, + "loss": 1.5063, + "step": 16731 + }, + { + "epoch": 0.9147793283490289, + "grad_norm": 1.8852030038833618, + "learning_rate": 4.0254679700688414e-07, + "loss": 1.1658, + "step": 16732 + }, + { + "epoch": 0.9148340007927505, + "grad_norm": 1.4533766508102417, + "learning_rate": 4.020337042748224e-07, + "loss": 1.5473, + "step": 16733 + }, + { + "epoch": 0.9148886732364719, + "grad_norm": 1.264182448387146, + "learning_rate": 4.015209320413782e-07, + "loss": 1.4458, + "step": 16734 + }, + { + "epoch": 0.9149433456801935, + "grad_norm": 1.4921587705612183, + "learning_rate": 4.010084803236736e-07, + "loss": 1.4514, + "step": 16735 + }, + { + "epoch": 0.9149980181239151, + "grad_norm": 1.8243563175201416, + "learning_rate": 4.004963491388203e-07, + "loss": 1.4122, + "step": 16736 + }, + { + "epoch": 0.9150526905676366, + "grad_norm": 2.0852584838867188, + "learning_rate": 3.9998453850391807e-07, + "loss": 1.5683, + "step": 16737 + }, + { + "epoch": 0.9151073630113582, + "grad_norm": 1.3109947443008423, + "learning_rate": 3.9947304843606093e-07, + "loss": 1.3575, + "step": 16738 + }, + { + "epoch": 0.9151620354550798, + "grad_norm": 1.5609478950500488, + "learning_rate": 3.989618789523275e-07, + "loss": 1.5306, + "step": 16739 + }, + { + "epoch": 0.9152167078988013, + "grad_norm": 2.8394410610198975, + "learning_rate": 3.9845103006978525e-07, + "loss": 1.2608, + "step": 16740 + }, + { + "epoch": 0.9152713803425229, + "grad_norm": 1.6909856796264648, + "learning_rate": 3.979405018054949e-07, + "loss": 1.4359, + "step": 16741 + }, + { + "epoch": 0.9153260527862445, + "grad_norm": 1.5791195631027222, + "learning_rate": 3.974302941765007e-07, + "loss": 1.4374, + "step": 16742 + }, + { + "epoch": 0.9153807252299659, + "grad_norm": 1.401731014251709, + "learning_rate": 3.969204071998445e-07, + "loss": 1.6507, + "step": 16743 + }, + { + "epoch": 0.9154353976736875, + "grad_norm": 1.5988861322402954, + "learning_rate": 3.9641084089255045e-07, + "loss": 1.3821, + "step": 16744 + }, + { + "epoch": 0.9154900701174091, + "grad_norm": 1.428257942199707, + "learning_rate": 3.959015952716327e-07, + "loss": 1.3939, + "step": 16745 + }, + { + "epoch": 0.9155447425611306, + "grad_norm": 1.6304278373718262, + "learning_rate": 3.953926703540978e-07, + "loss": 1.4767, + "step": 16746 + }, + { + "epoch": 0.9155994150048522, + "grad_norm": 1.394682765007019, + "learning_rate": 3.9488406615694084e-07, + "loss": 2.0537, + "step": 16747 + }, + { + "epoch": 0.9156540874485737, + "grad_norm": 1.4296917915344238, + "learning_rate": 3.943757826971428e-07, + "loss": 1.4344, + "step": 16748 + }, + { + "epoch": 0.9157087598922953, + "grad_norm": 2.041576862335205, + "learning_rate": 3.938678199916779e-07, + "loss": 1.5446, + "step": 16749 + }, + { + "epoch": 0.9157634323360169, + "grad_norm": 1.35573410987854, + "learning_rate": 3.93360178057508e-07, + "loss": 1.3258, + "step": 16750 + }, + { + "epoch": 0.9158181047797384, + "grad_norm": 1.3690046072006226, + "learning_rate": 3.928528569115841e-07, + "loss": 1.4087, + "step": 16751 + }, + { + "epoch": 0.9158727772234599, + "grad_norm": 1.8144675493240356, + "learning_rate": 3.923458565708471e-07, + "loss": 1.444, + "step": 16752 + }, + { + "epoch": 0.9159274496671815, + "grad_norm": 1.2792036533355713, + "learning_rate": 3.9183917705222787e-07, + "loss": 1.5319, + "step": 16753 + }, + { + "epoch": 0.915982122110903, + "grad_norm": 1.7381612062454224, + "learning_rate": 3.9133281837264394e-07, + "loss": 1.2214, + "step": 16754 + }, + { + "epoch": 0.9160367945546246, + "grad_norm": 1.2690083980560303, + "learning_rate": 3.908267805490051e-07, + "loss": 1.505, + "step": 16755 + }, + { + "epoch": 0.9160914669983462, + "grad_norm": 2.2218923568725586, + "learning_rate": 3.903210635982091e-07, + "loss": 1.6191, + "step": 16756 + }, + { + "epoch": 0.9161461394420677, + "grad_norm": 1.4542206525802612, + "learning_rate": 3.8981566753714116e-07, + "loss": 1.547, + "step": 16757 + }, + { + "epoch": 0.9162008118857893, + "grad_norm": 1.777184009552002, + "learning_rate": 3.893105923826801e-07, + "loss": 1.4211, + "step": 16758 + }, + { + "epoch": 0.9162554843295109, + "grad_norm": 1.5646406412124634, + "learning_rate": 3.8880583815169014e-07, + "loss": 1.205, + "step": 16759 + }, + { + "epoch": 0.9163101567732324, + "grad_norm": 1.3803353309631348, + "learning_rate": 3.8830140486102785e-07, + "loss": 1.5687, + "step": 16760 + }, + { + "epoch": 0.916364829216954, + "grad_norm": 1.5778447389602661, + "learning_rate": 3.877972925275353e-07, + "loss": 1.6201, + "step": 16761 + }, + { + "epoch": 0.9164195016606754, + "grad_norm": 1.432194471359253, + "learning_rate": 3.8729350116804564e-07, + "loss": 1.4706, + "step": 16762 + }, + { + "epoch": 0.916474174104397, + "grad_norm": 1.4099758863449097, + "learning_rate": 3.8679003079938437e-07, + "loss": 1.3282, + "step": 16763 + }, + { + "epoch": 0.9165288465481186, + "grad_norm": 1.4907689094543457, + "learning_rate": 3.8628688143836244e-07, + "loss": 1.4779, + "step": 16764 + }, + { + "epoch": 0.9165835189918401, + "grad_norm": 1.8548225164413452, + "learning_rate": 3.857840531017798e-07, + "loss": 1.6607, + "step": 16765 + }, + { + "epoch": 0.9166381914355617, + "grad_norm": 1.6317218542099, + "learning_rate": 3.8528154580642853e-07, + "loss": 1.5885, + "step": 16766 + }, + { + "epoch": 0.9166928638792833, + "grad_norm": 1.4017646312713623, + "learning_rate": 3.847793595690885e-07, + "loss": 1.3316, + "step": 16767 + }, + { + "epoch": 0.9167475363230048, + "grad_norm": 1.4209764003753662, + "learning_rate": 3.842774944065264e-07, + "loss": 1.4079, + "step": 16768 + }, + { + "epoch": 0.9168022087667264, + "grad_norm": 1.6483854055404663, + "learning_rate": 3.837759503355054e-07, + "loss": 1.5902, + "step": 16769 + }, + { + "epoch": 0.916856881210448, + "grad_norm": 1.335326075553894, + "learning_rate": 3.8327472737276995e-07, + "loss": 1.4118, + "step": 16770 + }, + { + "epoch": 0.9169115536541694, + "grad_norm": 1.9019168615341187, + "learning_rate": 3.827738255350555e-07, + "loss": 1.3505, + "step": 16771 + }, + { + "epoch": 0.916966226097891, + "grad_norm": 1.3336979150772095, + "learning_rate": 3.8227324483909313e-07, + "loss": 1.5679, + "step": 16772 + }, + { + "epoch": 0.9170208985416126, + "grad_norm": 1.7064236402511597, + "learning_rate": 3.81772985301595e-07, + "loss": 1.4977, + "step": 16773 + }, + { + "epoch": 0.9170755709853341, + "grad_norm": 1.4884997606277466, + "learning_rate": 3.812730469392678e-07, + "loss": 1.4165, + "step": 16774 + }, + { + "epoch": 0.9171302434290557, + "grad_norm": 1.4201632738113403, + "learning_rate": 3.807734297688037e-07, + "loss": 1.466, + "step": 16775 + }, + { + "epoch": 0.9171849158727772, + "grad_norm": 1.4802933931350708, + "learning_rate": 3.8027413380688603e-07, + "loss": 1.3247, + "step": 16776 + }, + { + "epoch": 0.9172395883164988, + "grad_norm": 1.6511659622192383, + "learning_rate": 3.7977515907018927e-07, + "loss": 1.489, + "step": 16777 + }, + { + "epoch": 0.9172942607602204, + "grad_norm": 1.2870557308197021, + "learning_rate": 3.792765055753755e-07, + "loss": 1.3491, + "step": 16778 + }, + { + "epoch": 0.9173489332039418, + "grad_norm": 1.863667607307434, + "learning_rate": 3.7877817333909275e-07, + "loss": 1.249, + "step": 16779 + }, + { + "epoch": 0.9174036056476634, + "grad_norm": 1.6126481294631958, + "learning_rate": 3.7828016237798525e-07, + "loss": 1.5266, + "step": 16780 + }, + { + "epoch": 0.917458278091385, + "grad_norm": 1.5703575611114502, + "learning_rate": 3.77782472708681e-07, + "loss": 1.4349, + "step": 16781 + }, + { + "epoch": 0.9175129505351065, + "grad_norm": 1.6108314990997314, + "learning_rate": 3.7728510434779876e-07, + "loss": 1.3925, + "step": 16782 + }, + { + "epoch": 0.9175676229788281, + "grad_norm": 1.3490811586380005, + "learning_rate": 3.7678805731194754e-07, + "loss": 1.5247, + "step": 16783 + }, + { + "epoch": 0.9176222954225497, + "grad_norm": 1.4925694465637207, + "learning_rate": 3.7629133161772525e-07, + "loss": 1.3979, + "step": 16784 + }, + { + "epoch": 0.9176769678662712, + "grad_norm": 1.4292471408843994, + "learning_rate": 3.757949272817174e-07, + "loss": 1.5653, + "step": 16785 + }, + { + "epoch": 0.9177316403099928, + "grad_norm": 1.6609392166137695, + "learning_rate": 3.7529884432050077e-07, + "loss": 1.4908, + "step": 16786 + }, + { + "epoch": 0.9177863127537144, + "grad_norm": 1.2221325635910034, + "learning_rate": 3.7480308275064214e-07, + "loss": 1.5751, + "step": 16787 + }, + { + "epoch": 0.9178409851974358, + "grad_norm": 1.4240378141403198, + "learning_rate": 3.7430764258869377e-07, + "loss": 1.2863, + "step": 16788 + }, + { + "epoch": 0.9178956576411574, + "grad_norm": 1.3944435119628906, + "learning_rate": 3.738125238512014e-07, + "loss": 1.4458, + "step": 16789 + }, + { + "epoch": 0.9179503300848789, + "grad_norm": 1.4858760833740234, + "learning_rate": 3.73317726554695e-07, + "loss": 1.3731, + "step": 16790 + }, + { + "epoch": 0.9180050025286005, + "grad_norm": 1.5921862125396729, + "learning_rate": 3.7282325071570145e-07, + "loss": 1.5791, + "step": 16791 + }, + { + "epoch": 0.9180596749723221, + "grad_norm": 1.5797120332717896, + "learning_rate": 3.723290963507309e-07, + "loss": 1.3806, + "step": 16792 + }, + { + "epoch": 0.9181143474160436, + "grad_norm": 1.193418025970459, + "learning_rate": 3.718352634762823e-07, + "loss": 1.5832, + "step": 16793 + }, + { + "epoch": 0.9181690198597652, + "grad_norm": 1.6130263805389404, + "learning_rate": 3.7134175210884807e-07, + "loss": 1.3605, + "step": 16794 + }, + { + "epoch": 0.9182236923034868, + "grad_norm": 1.6500877141952515, + "learning_rate": 3.7084856226490716e-07, + "loss": 1.1791, + "step": 16795 + }, + { + "epoch": 0.9182783647472083, + "grad_norm": 1.7737687826156616, + "learning_rate": 3.7035569396092763e-07, + "loss": 1.5823, + "step": 16796 + }, + { + "epoch": 0.9183330371909298, + "grad_norm": 1.6367738246917725, + "learning_rate": 3.698631472133696e-07, + "loss": 1.2242, + "step": 16797 + }, + { + "epoch": 0.9183877096346514, + "grad_norm": 1.868175983428955, + "learning_rate": 3.6937092203867874e-07, + "loss": 1.3792, + "step": 16798 + }, + { + "epoch": 0.9184423820783729, + "grad_norm": 1.2670596837997437, + "learning_rate": 3.688790184532909e-07, + "loss": 1.4919, + "step": 16799 + }, + { + "epoch": 0.9184970545220945, + "grad_norm": 3.2836475372314453, + "learning_rate": 3.68387436473634e-07, + "loss": 1.4827, + "step": 16800 + }, + { + "epoch": 0.9185517269658161, + "grad_norm": 1.2674771547317505, + "learning_rate": 3.678961761161193e-07, + "loss": 1.4091, + "step": 16801 + }, + { + "epoch": 0.9186063994095376, + "grad_norm": 1.5907269716262817, + "learning_rate": 3.674052373971559e-07, + "loss": 1.5469, + "step": 16802 + }, + { + "epoch": 0.9186610718532592, + "grad_norm": 1.4816323518753052, + "learning_rate": 3.669146203331353e-07, + "loss": 1.627, + "step": 16803 + }, + { + "epoch": 0.9187157442969807, + "grad_norm": 1.24106764793396, + "learning_rate": 3.664243249404387e-07, + "loss": 1.6373, + "step": 16804 + }, + { + "epoch": 0.9187704167407023, + "grad_norm": 1.4600627422332764, + "learning_rate": 3.6593435123544075e-07, + "loss": 1.4479, + "step": 16805 + }, + { + "epoch": 0.9188250891844238, + "grad_norm": 1.55588960647583, + "learning_rate": 3.654446992345018e-07, + "loss": 1.3413, + "step": 16806 + }, + { + "epoch": 0.9188797616281453, + "grad_norm": 1.2421202659606934, + "learning_rate": 3.64955368953972e-07, + "loss": 1.5256, + "step": 16807 + }, + { + "epoch": 0.9189344340718669, + "grad_norm": 1.2407904863357544, + "learning_rate": 3.6446636041019276e-07, + "loss": 1.305, + "step": 16808 + }, + { + "epoch": 0.9189891065155885, + "grad_norm": 1.2928333282470703, + "learning_rate": 3.639776736194922e-07, + "loss": 1.474, + "step": 16809 + }, + { + "epoch": 0.91904377895931, + "grad_norm": 1.2203346490859985, + "learning_rate": 3.634893085981872e-07, + "loss": 1.453, + "step": 16810 + }, + { + "epoch": 0.9190984514030316, + "grad_norm": 1.6581530570983887, + "learning_rate": 3.6300126536258806e-07, + "loss": 1.2449, + "step": 16811 + }, + { + "epoch": 0.9191531238467532, + "grad_norm": 1.6409776210784912, + "learning_rate": 3.625135439289917e-07, + "loss": 1.4368, + "step": 16812 + }, + { + "epoch": 0.9192077962904747, + "grad_norm": 1.2183226346969604, + "learning_rate": 3.620261443136819e-07, + "loss": 1.5724, + "step": 16813 + }, + { + "epoch": 0.9192624687341963, + "grad_norm": 1.5033588409423828, + "learning_rate": 3.6153906653293544e-07, + "loss": 1.2831, + "step": 16814 + }, + { + "epoch": 0.9193171411779179, + "grad_norm": 1.3876140117645264, + "learning_rate": 3.6105231060301613e-07, + "loss": 1.2151, + "step": 16815 + }, + { + "epoch": 0.9193718136216393, + "grad_norm": 1.4092730283737183, + "learning_rate": 3.6056587654018094e-07, + "loss": 1.3068, + "step": 16816 + }, + { + "epoch": 0.9194264860653609, + "grad_norm": 1.7054954767227173, + "learning_rate": 3.6007976436066903e-07, + "loss": 1.6633, + "step": 16817 + }, + { + "epoch": 0.9194811585090824, + "grad_norm": 1.5235466957092285, + "learning_rate": 3.5959397408071416e-07, + "loss": 1.4161, + "step": 16818 + }, + { + "epoch": 0.919535830952804, + "grad_norm": 1.3630917072296143, + "learning_rate": 3.5910850571653997e-07, + "loss": 1.4688, + "step": 16819 + }, + { + "epoch": 0.9195905033965256, + "grad_norm": 2.4967119693756104, + "learning_rate": 3.5862335928435465e-07, + "loss": 1.5543, + "step": 16820 + }, + { + "epoch": 0.9196451758402471, + "grad_norm": 1.2192444801330566, + "learning_rate": 3.5813853480035966e-07, + "loss": 1.3427, + "step": 16821 + }, + { + "epoch": 0.9196998482839687, + "grad_norm": 1.1719715595245361, + "learning_rate": 3.5765403228074424e-07, + "loss": 1.4201, + "step": 16822 + }, + { + "epoch": 0.9197545207276903, + "grad_norm": 1.443971037864685, + "learning_rate": 3.5716985174168884e-07, + "loss": 1.36, + "step": 16823 + }, + { + "epoch": 0.9198091931714117, + "grad_norm": 1.4658907651901245, + "learning_rate": 3.5668599319935717e-07, + "loss": 1.3427, + "step": 16824 + }, + { + "epoch": 0.9198638656151333, + "grad_norm": 1.4253259897232056, + "learning_rate": 3.562024566699107e-07, + "loss": 1.491, + "step": 16825 + }, + { + "epoch": 0.9199185380588549, + "grad_norm": 1.6591644287109375, + "learning_rate": 3.5571924216949327e-07, + "loss": 1.1323, + "step": 16826 + }, + { + "epoch": 0.9199732105025764, + "grad_norm": 1.762393832206726, + "learning_rate": 3.5523634971424194e-07, + "loss": 1.7643, + "step": 16827 + }, + { + "epoch": 0.920027882946298, + "grad_norm": 1.360395908355713, + "learning_rate": 3.5475377932028155e-07, + "loss": 1.5796, + "step": 16828 + }, + { + "epoch": 0.9200825553900196, + "grad_norm": 1.5402040481567383, + "learning_rate": 3.5427153100372367e-07, + "loss": 1.3287, + "step": 16829 + }, + { + "epoch": 0.9201372278337411, + "grad_norm": 1.4299274682998657, + "learning_rate": 3.5378960478067547e-07, + "loss": 1.5882, + "step": 16830 + }, + { + "epoch": 0.9201919002774627, + "grad_norm": 1.5980931520462036, + "learning_rate": 3.533080006672285e-07, + "loss": 1.3266, + "step": 16831 + }, + { + "epoch": 0.9202465727211842, + "grad_norm": 1.3111587762832642, + "learning_rate": 3.5282671867946206e-07, + "loss": 1.2872, + "step": 16832 + }, + { + "epoch": 0.9203012451649057, + "grad_norm": 1.6082369089126587, + "learning_rate": 3.5234575883345e-07, + "loss": 1.3931, + "step": 16833 + }, + { + "epoch": 0.9203559176086273, + "grad_norm": 1.5649505853652954, + "learning_rate": 3.5186512114525283e-07, + "loss": 1.7683, + "step": 16834 + }, + { + "epoch": 0.9204105900523488, + "grad_norm": 1.239952564239502, + "learning_rate": 3.513848056309177e-07, + "loss": 1.3893, + "step": 16835 + }, + { + "epoch": 0.9204652624960704, + "grad_norm": 1.7384511232376099, + "learning_rate": 3.509048123064862e-07, + "loss": 1.5407, + "step": 16836 + }, + { + "epoch": 0.920519934939792, + "grad_norm": 1.298910140991211, + "learning_rate": 3.504251411879855e-07, + "loss": 1.5116, + "step": 16837 + }, + { + "epoch": 0.9205746073835135, + "grad_norm": 1.7189445495605469, + "learning_rate": 3.4994579229143287e-07, + "loss": 1.4535, + "step": 16838 + }, + { + "epoch": 0.9206292798272351, + "grad_norm": 1.4368585348129272, + "learning_rate": 3.4946676563283545e-07, + "loss": 1.4462, + "step": 16839 + }, + { + "epoch": 0.9206839522709567, + "grad_norm": 1.6208654642105103, + "learning_rate": 3.489880612281871e-07, + "loss": 1.5027, + "step": 16840 + }, + { + "epoch": 0.9207386247146782, + "grad_norm": 2.2305822372436523, + "learning_rate": 3.485096790934739e-07, + "loss": 1.3561, + "step": 16841 + }, + { + "epoch": 0.9207932971583997, + "grad_norm": 1.6972764730453491, + "learning_rate": 3.48031619244672e-07, + "loss": 1.377, + "step": 16842 + }, + { + "epoch": 0.9208479696021213, + "grad_norm": 1.856968879699707, + "learning_rate": 3.4755388169774086e-07, + "loss": 1.363, + "step": 16843 + }, + { + "epoch": 0.9209026420458428, + "grad_norm": 1.5836931467056274, + "learning_rate": 3.470764664686377e-07, + "loss": 1.7008, + "step": 16844 + }, + { + "epoch": 0.9209573144895644, + "grad_norm": 1.629787564277649, + "learning_rate": 3.465993735733031e-07, + "loss": 1.4269, + "step": 16845 + }, + { + "epoch": 0.9210119869332859, + "grad_norm": 1.1519168615341187, + "learning_rate": 3.4612260302766653e-07, + "loss": 1.4569, + "step": 16846 + }, + { + "epoch": 0.9210666593770075, + "grad_norm": 1.5774714946746826, + "learning_rate": 3.4564615484764975e-07, + "loss": 1.1421, + "step": 16847 + }, + { + "epoch": 0.9211213318207291, + "grad_norm": 2.019690990447998, + "learning_rate": 3.451700290491633e-07, + "loss": 1.4571, + "step": 16848 + }, + { + "epoch": 0.9211760042644506, + "grad_norm": 1.5495072603225708, + "learning_rate": 3.4469422564810453e-07, + "loss": 1.316, + "step": 16849 + }, + { + "epoch": 0.9212306767081722, + "grad_norm": 1.8138834238052368, + "learning_rate": 3.4421874466036286e-07, + "loss": 1.347, + "step": 16850 + }, + { + "epoch": 0.9212853491518938, + "grad_norm": 1.3287594318389893, + "learning_rate": 3.437435861018168e-07, + "loss": 1.6123, + "step": 16851 + }, + { + "epoch": 0.9213400215956152, + "grad_norm": 1.4223082065582275, + "learning_rate": 3.4326874998833026e-07, + "loss": 1.4411, + "step": 16852 + }, + { + "epoch": 0.9213946940393368, + "grad_norm": 1.5007736682891846, + "learning_rate": 3.427942363357606e-07, + "loss": 1.5261, + "step": 16853 + }, + { + "epoch": 0.9214493664830584, + "grad_norm": 1.4517258405685425, + "learning_rate": 3.4232004515995287e-07, + "loss": 1.1893, + "step": 16854 + }, + { + "epoch": 0.9215040389267799, + "grad_norm": 1.2418522834777832, + "learning_rate": 3.4184617647674e-07, + "loss": 1.5352, + "step": 16855 + }, + { + "epoch": 0.9215587113705015, + "grad_norm": 1.4025421142578125, + "learning_rate": 3.4137263030194713e-07, + "loss": 1.4067, + "step": 16856 + }, + { + "epoch": 0.9216133838142231, + "grad_norm": 1.7259633541107178, + "learning_rate": 3.408994066513871e-07, + "loss": 1.3713, + "step": 16857 + }, + { + "epoch": 0.9216680562579446, + "grad_norm": 1.5131893157958984, + "learning_rate": 3.404265055408618e-07, + "loss": 1.5049, + "step": 16858 + }, + { + "epoch": 0.9217227287016662, + "grad_norm": 1.3880363702774048, + "learning_rate": 3.399539269861629e-07, + "loss": 1.3677, + "step": 16859 + }, + { + "epoch": 0.9217774011453876, + "grad_norm": 1.6100960969924927, + "learning_rate": 3.3948167100306906e-07, + "loss": 1.3896, + "step": 16860 + }, + { + "epoch": 0.9218320735891092, + "grad_norm": 1.4295310974121094, + "learning_rate": 3.3900973760735313e-07, + "loss": 1.4034, + "step": 16861 + }, + { + "epoch": 0.9218867460328308, + "grad_norm": 1.2388334274291992, + "learning_rate": 3.3853812681477136e-07, + "loss": 1.6109, + "step": 16862 + }, + { + "epoch": 0.9219414184765523, + "grad_norm": 1.5485912561416626, + "learning_rate": 3.3806683864107347e-07, + "loss": 1.3288, + "step": 16863 + }, + { + "epoch": 0.9219960909202739, + "grad_norm": 1.3169902563095093, + "learning_rate": 3.375958731019957e-07, + "loss": 1.2751, + "step": 16864 + }, + { + "epoch": 0.9220507633639955, + "grad_norm": 1.6322494745254517, + "learning_rate": 3.371252302132666e-07, + "loss": 1.5276, + "step": 16865 + }, + { + "epoch": 0.922105435807717, + "grad_norm": 1.8641103506088257, + "learning_rate": 3.36654909990598e-07, + "loss": 1.3045, + "step": 16866 + }, + { + "epoch": 0.9221601082514386, + "grad_norm": 1.7471816539764404, + "learning_rate": 3.3618491244969965e-07, + "loss": 1.502, + "step": 16867 + }, + { + "epoch": 0.9222147806951602, + "grad_norm": 1.4767214059829712, + "learning_rate": 3.3571523760626333e-07, + "loss": 1.5348, + "step": 16868 + }, + { + "epoch": 0.9222694531388816, + "grad_norm": 1.6618545055389404, + "learning_rate": 3.3524588547597327e-07, + "loss": 1.2819, + "step": 16869 + }, + { + "epoch": 0.9223241255826032, + "grad_norm": 1.4414398670196533, + "learning_rate": 3.347768560745024e-07, + "loss": 1.2883, + "step": 16870 + }, + { + "epoch": 0.9223787980263248, + "grad_norm": 2.3157997131347656, + "learning_rate": 3.3430814941751153e-07, + "loss": 1.4864, + "step": 16871 + }, + { + "epoch": 0.9224334704700463, + "grad_norm": 1.5870137214660645, + "learning_rate": 3.3383976552065376e-07, + "loss": 1.5079, + "step": 16872 + }, + { + "epoch": 0.9224881429137679, + "grad_norm": 1.676842212677002, + "learning_rate": 3.333717043995677e-07, + "loss": 1.2396, + "step": 16873 + }, + { + "epoch": 0.9225428153574894, + "grad_norm": 1.6628812551498413, + "learning_rate": 3.3290396606988404e-07, + "loss": 1.4204, + "step": 16874 + }, + { + "epoch": 0.922597487801211, + "grad_norm": 1.2939088344573975, + "learning_rate": 3.324365505472227e-07, + "loss": 1.4133, + "step": 16875 + }, + { + "epoch": 0.9226521602449326, + "grad_norm": 1.2401857376098633, + "learning_rate": 3.3196945784718993e-07, + "loss": 1.7078, + "step": 16876 + }, + { + "epoch": 0.922706832688654, + "grad_norm": 1.7852650880813599, + "learning_rate": 3.315026879853833e-07, + "loss": 1.6257, + "step": 16877 + }, + { + "epoch": 0.9227615051323756, + "grad_norm": 1.9177730083465576, + "learning_rate": 3.310362409773904e-07, + "loss": 1.2786, + "step": 16878 + }, + { + "epoch": 0.9228161775760972, + "grad_norm": 1.5375410318374634, + "learning_rate": 3.3057011683878647e-07, + "loss": 1.4823, + "step": 16879 + }, + { + "epoch": 0.9228708500198187, + "grad_norm": 1.479912281036377, + "learning_rate": 3.3010431558513577e-07, + "loss": 1.4112, + "step": 16880 + }, + { + "epoch": 0.9229255224635403, + "grad_norm": 1.5322134494781494, + "learning_rate": 3.2963883723199364e-07, + "loss": 1.3053, + "step": 16881 + }, + { + "epoch": 0.9229801949072619, + "grad_norm": 1.5531281232833862, + "learning_rate": 3.291736817949021e-07, + "loss": 1.4433, + "step": 16882 + }, + { + "epoch": 0.9230348673509834, + "grad_norm": 1.4427415132522583, + "learning_rate": 3.287088492893942e-07, + "loss": 1.3783, + "step": 16883 + }, + { + "epoch": 0.923089539794705, + "grad_norm": 1.5730677843093872, + "learning_rate": 3.2824433973099425e-07, + "loss": 1.5512, + "step": 16884 + }, + { + "epoch": 0.9231442122384266, + "grad_norm": 1.6506497859954834, + "learning_rate": 3.277801531352087e-07, + "loss": 1.6305, + "step": 16885 + }, + { + "epoch": 0.9231988846821481, + "grad_norm": 1.7756743431091309, + "learning_rate": 3.2731628951754193e-07, + "loss": 1.4851, + "step": 16886 + }, + { + "epoch": 0.9232535571258697, + "grad_norm": 1.4558011293411255, + "learning_rate": 3.268527488934825e-07, + "loss": 1.3665, + "step": 16887 + }, + { + "epoch": 0.9233082295695911, + "grad_norm": 1.4764918088912964, + "learning_rate": 3.26389531278507e-07, + "loss": 1.4482, + "step": 16888 + }, + { + "epoch": 0.9233629020133127, + "grad_norm": 1.6353025436401367, + "learning_rate": 3.2592663668808645e-07, + "loss": 1.1902, + "step": 16889 + }, + { + "epoch": 0.9234175744570343, + "grad_norm": 1.5743356943130493, + "learning_rate": 3.25464065137675e-07, + "loss": 1.1381, + "step": 16890 + }, + { + "epoch": 0.9234722469007558, + "grad_norm": 1.3716325759887695, + "learning_rate": 3.250018166427216e-07, + "loss": 1.3605, + "step": 16891 + }, + { + "epoch": 0.9235269193444774, + "grad_norm": 1.4221311807632446, + "learning_rate": 3.245398912186604e-07, + "loss": 1.3969, + "step": 16892 + }, + { + "epoch": 0.923581591788199, + "grad_norm": 1.2969112396240234, + "learning_rate": 3.2407828888091687e-07, + "loss": 1.5531, + "step": 16893 + }, + { + "epoch": 0.9236362642319205, + "grad_norm": 1.945243000984192, + "learning_rate": 3.236170096449032e-07, + "loss": 1.0628, + "step": 16894 + }, + { + "epoch": 0.9236909366756421, + "grad_norm": 1.8813098669052124, + "learning_rate": 3.2315605352602474e-07, + "loss": 1.3948, + "step": 16895 + }, + { + "epoch": 0.9237456091193637, + "grad_norm": 1.6693196296691895, + "learning_rate": 3.226954205396737e-07, + "loss": 1.4707, + "step": 16896 + }, + { + "epoch": 0.9238002815630851, + "grad_norm": 1.4023622274398804, + "learning_rate": 3.2223511070122893e-07, + "loss": 1.6082, + "step": 16897 + }, + { + "epoch": 0.9238549540068067, + "grad_norm": 1.3683574199676514, + "learning_rate": 3.217751240260647e-07, + "loss": 1.4936, + "step": 16898 + }, + { + "epoch": 0.9239096264505283, + "grad_norm": 1.2845016717910767, + "learning_rate": 3.2131546052953987e-07, + "loss": 1.3143, + "step": 16899 + }, + { + "epoch": 0.9239642988942498, + "grad_norm": 1.3685990571975708, + "learning_rate": 3.208561202270033e-07, + "loss": 1.5143, + "step": 16900 + }, + { + "epoch": 0.9240189713379714, + "grad_norm": 1.5850645303726196, + "learning_rate": 3.203971031337938e-07, + "loss": 1.4036, + "step": 16901 + }, + { + "epoch": 0.9240736437816929, + "grad_norm": 1.731519341468811, + "learning_rate": 3.199384092652402e-07, + "loss": 1.3716, + "step": 16902 + }, + { + "epoch": 0.9241283162254145, + "grad_norm": 1.8897937536239624, + "learning_rate": 3.19480038636657e-07, + "loss": 1.4013, + "step": 16903 + }, + { + "epoch": 0.9241829886691361, + "grad_norm": 1.4821916818618774, + "learning_rate": 3.190219912633519e-07, + "loss": 1.3715, + "step": 16904 + }, + { + "epoch": 0.9242376611128575, + "grad_norm": 1.2623196840286255, + "learning_rate": 3.185642671606182e-07, + "loss": 1.4339, + "step": 16905 + }, + { + "epoch": 0.9242923335565791, + "grad_norm": 1.452492117881775, + "learning_rate": 3.1810686634374253e-07, + "loss": 1.4824, + "step": 16906 + }, + { + "epoch": 0.9243470060003007, + "grad_norm": 2.138434886932373, + "learning_rate": 3.1764978882799833e-07, + "loss": 1.4208, + "step": 16907 + }, + { + "epoch": 0.9244016784440222, + "grad_norm": 1.7525525093078613, + "learning_rate": 3.171930346286467e-07, + "loss": 1.6493, + "step": 16908 + }, + { + "epoch": 0.9244563508877438, + "grad_norm": 1.3552923202514648, + "learning_rate": 3.167366037609421e-07, + "loss": 1.5522, + "step": 16909 + }, + { + "epoch": 0.9245110233314654, + "grad_norm": 1.8425058126449585, + "learning_rate": 3.162804962401256e-07, + "loss": 1.4402, + "step": 16910 + }, + { + "epoch": 0.9245656957751869, + "grad_norm": 2.1180503368377686, + "learning_rate": 3.158247120814251e-07, + "loss": 1.205, + "step": 16911 + }, + { + "epoch": 0.9246203682189085, + "grad_norm": 1.3876522779464722, + "learning_rate": 3.153692513000628e-07, + "loss": 1.0941, + "step": 16912 + }, + { + "epoch": 0.9246750406626301, + "grad_norm": 1.5045876502990723, + "learning_rate": 3.149141139112466e-07, + "loss": 1.1339, + "step": 16913 + }, + { + "epoch": 0.9247297131063515, + "grad_norm": 1.824338436126709, + "learning_rate": 3.1445929993017545e-07, + "loss": 1.5201, + "step": 16914 + }, + { + "epoch": 0.9247843855500731, + "grad_norm": 1.4158191680908203, + "learning_rate": 3.1400480937203604e-07, + "loss": 1.2853, + "step": 16915 + }, + { + "epoch": 0.9248390579937947, + "grad_norm": 1.515967607498169, + "learning_rate": 3.1355064225200516e-07, + "loss": 1.4409, + "step": 16916 + }, + { + "epoch": 0.9248937304375162, + "grad_norm": 1.5189731121063232, + "learning_rate": 3.1309679858524846e-07, + "loss": 1.3954, + "step": 16917 + }, + { + "epoch": 0.9249484028812378, + "grad_norm": 1.8563488721847534, + "learning_rate": 3.1264327838692153e-07, + "loss": 1.4032, + "step": 16918 + }, + { + "epoch": 0.9250030753249593, + "grad_norm": 1.3512338399887085, + "learning_rate": 3.121900816721646e-07, + "loss": 1.5688, + "step": 16919 + }, + { + "epoch": 0.9250577477686809, + "grad_norm": 1.7606158256530762, + "learning_rate": 3.1173720845611654e-07, + "loss": 1.5066, + "step": 16920 + }, + { + "epoch": 0.9251124202124025, + "grad_norm": 1.5514196157455444, + "learning_rate": 3.1128465875389646e-07, + "loss": 1.5465, + "step": 16921 + }, + { + "epoch": 0.925167092656124, + "grad_norm": 1.4954135417938232, + "learning_rate": 3.108324325806167e-07, + "loss": 1.328, + "step": 16922 + }, + { + "epoch": 0.9252217650998456, + "grad_norm": 1.5485285520553589, + "learning_rate": 3.103805299513796e-07, + "loss": 1.5275, + "step": 16923 + }, + { + "epoch": 0.9252764375435671, + "grad_norm": 1.4909323453903198, + "learning_rate": 3.0992895088127306e-07, + "loss": 1.2061, + "step": 16924 + }, + { + "epoch": 0.9253311099872886, + "grad_norm": 1.431506633758545, + "learning_rate": 3.094776953853762e-07, + "loss": 1.327, + "step": 16925 + }, + { + "epoch": 0.9253857824310102, + "grad_norm": 1.2947112321853638, + "learning_rate": 3.0902676347876025e-07, + "loss": 1.4493, + "step": 16926 + }, + { + "epoch": 0.9254404548747318, + "grad_norm": 2.1138155460357666, + "learning_rate": 3.085761551764799e-07, + "loss": 1.5783, + "step": 16927 + }, + { + "epoch": 0.9254951273184533, + "grad_norm": 1.5532892942428589, + "learning_rate": 3.081258704935841e-07, + "loss": 1.2929, + "step": 16928 + }, + { + "epoch": 0.9255497997621749, + "grad_norm": 1.6831188201904297, + "learning_rate": 3.076759094451087e-07, + "loss": 1.4629, + "step": 16929 + }, + { + "epoch": 0.9256044722058965, + "grad_norm": 2.15350341796875, + "learning_rate": 3.0722627204607834e-07, + "loss": 1.419, + "step": 16930 + }, + { + "epoch": 0.925659144649618, + "grad_norm": 1.451022744178772, + "learning_rate": 3.0677695831150767e-07, + "loss": 1.3813, + "step": 16931 + }, + { + "epoch": 0.9257138170933396, + "grad_norm": 1.76641845703125, + "learning_rate": 3.063279682564002e-07, + "loss": 1.3793, + "step": 16932 + }, + { + "epoch": 0.925768489537061, + "grad_norm": 1.4633642435073853, + "learning_rate": 3.0587930189574734e-07, + "loss": 1.6498, + "step": 16933 + }, + { + "epoch": 0.9258231619807826, + "grad_norm": 1.2836095094680786, + "learning_rate": 3.054309592445348e-07, + "loss": 1.3696, + "step": 16934 + }, + { + "epoch": 0.9258778344245042, + "grad_norm": 1.4917762279510498, + "learning_rate": 3.049829403177307e-07, + "loss": 1.43, + "step": 16935 + }, + { + "epoch": 0.9259325068682257, + "grad_norm": 1.7362558841705322, + "learning_rate": 3.045352451302952e-07, + "loss": 1.5336, + "step": 16936 + }, + { + "epoch": 0.9259871793119473, + "grad_norm": 1.3153612613677979, + "learning_rate": 3.040878736971797e-07, + "loss": 1.7936, + "step": 16937 + }, + { + "epoch": 0.9260418517556689, + "grad_norm": 1.3344241380691528, + "learning_rate": 3.0364082603332235e-07, + "loss": 1.5522, + "step": 16938 + }, + { + "epoch": 0.9260965241993904, + "grad_norm": 1.4524743556976318, + "learning_rate": 3.0319410215365e-07, + "loss": 1.5102, + "step": 16939 + }, + { + "epoch": 0.926151196643112, + "grad_norm": 1.9896135330200195, + "learning_rate": 3.027477020730829e-07, + "loss": 1.2259, + "step": 16940 + }, + { + "epoch": 0.9262058690868336, + "grad_norm": 1.6648898124694824, + "learning_rate": 3.0230162580652367e-07, + "loss": 1.35, + "step": 16941 + }, + { + "epoch": 0.926260541530555, + "grad_norm": 1.513750672340393, + "learning_rate": 3.0185587336887034e-07, + "loss": 1.6281, + "step": 16942 + }, + { + "epoch": 0.9263152139742766, + "grad_norm": 1.4839876890182495, + "learning_rate": 3.014104447750077e-07, + "loss": 1.6448, + "step": 16943 + }, + { + "epoch": 0.9263698864179982, + "grad_norm": 1.5622336864471436, + "learning_rate": 3.0096534003980606e-07, + "loss": 1.4032, + "step": 16944 + }, + { + "epoch": 0.9264245588617197, + "grad_norm": 1.6183264255523682, + "learning_rate": 3.005205591781335e-07, + "loss": 1.3422, + "step": 16945 + }, + { + "epoch": 0.9264792313054413, + "grad_norm": 1.3458882570266724, + "learning_rate": 3.0007610220483927e-07, + "loss": 1.342, + "step": 16946 + }, + { + "epoch": 0.9265339037491628, + "grad_norm": 1.233234167098999, + "learning_rate": 2.996319691347649e-07, + "loss": 1.3494, + "step": 16947 + }, + { + "epoch": 0.9265885761928844, + "grad_norm": 1.7667019367218018, + "learning_rate": 2.991881599827429e-07, + "loss": 1.4282, + "step": 16948 + }, + { + "epoch": 0.926643248636606, + "grad_norm": 1.392136573791504, + "learning_rate": 2.987446747635925e-07, + "loss": 1.3548, + "step": 16949 + }, + { + "epoch": 0.9266979210803274, + "grad_norm": 1.4962559938430786, + "learning_rate": 2.983015134921197e-07, + "loss": 1.491, + "step": 16950 + }, + { + "epoch": 0.926752593524049, + "grad_norm": 1.9149059057235718, + "learning_rate": 2.9785867618312705e-07, + "loss": 1.5657, + "step": 16951 + }, + { + "epoch": 0.9268072659677706, + "grad_norm": 1.350430965423584, + "learning_rate": 2.9741616285139943e-07, + "loss": 1.3913, + "step": 16952 + }, + { + "epoch": 0.9268619384114921, + "grad_norm": 1.462496280670166, + "learning_rate": 2.969739735117128e-07, + "loss": 1.4163, + "step": 16953 + }, + { + "epoch": 0.9269166108552137, + "grad_norm": 1.4617794752120972, + "learning_rate": 2.965321081788364e-07, + "loss": 1.3584, + "step": 16954 + }, + { + "epoch": 0.9269712832989353, + "grad_norm": 2.0882091522216797, + "learning_rate": 2.960905668675218e-07, + "loss": 1.3343, + "step": 16955 + }, + { + "epoch": 0.9270259557426568, + "grad_norm": 1.5136853456497192, + "learning_rate": 2.956493495925139e-07, + "loss": 1.4781, + "step": 16956 + }, + { + "epoch": 0.9270806281863784, + "grad_norm": 1.7331809997558594, + "learning_rate": 2.9520845636854644e-07, + "loss": 1.4062, + "step": 16957 + }, + { + "epoch": 0.9271353006301, + "grad_norm": 1.620888113975525, + "learning_rate": 2.94767887210341e-07, + "loss": 1.5692, + "step": 16958 + }, + { + "epoch": 0.9271899730738214, + "grad_norm": 1.3692079782485962, + "learning_rate": 2.9432764213261025e-07, + "loss": 1.4088, + "step": 16959 + }, + { + "epoch": 0.927244645517543, + "grad_norm": 1.314358115196228, + "learning_rate": 2.9388772115005457e-07, + "loss": 1.4685, + "step": 16960 + }, + { + "epoch": 0.9272993179612645, + "grad_norm": 1.6554936170578003, + "learning_rate": 2.934481242773635e-07, + "loss": 1.5337, + "step": 16961 + }, + { + "epoch": 0.9273539904049861, + "grad_norm": 1.8778356313705444, + "learning_rate": 2.930088515292173e-07, + "loss": 1.5854, + "step": 16962 + }, + { + "epoch": 0.9274086628487077, + "grad_norm": 1.742802619934082, + "learning_rate": 2.925699029202844e-07, + "loss": 1.3958, + "step": 16963 + }, + { + "epoch": 0.9274633352924292, + "grad_norm": 1.4084020853042603, + "learning_rate": 2.921312784652197e-07, + "loss": 1.4462, + "step": 16964 + }, + { + "epoch": 0.9275180077361508, + "grad_norm": 1.3522530794143677, + "learning_rate": 2.916929781786737e-07, + "loss": 1.5662, + "step": 16965 + }, + { + "epoch": 0.9275726801798724, + "grad_norm": 1.4003292322158813, + "learning_rate": 2.912550020752791e-07, + "loss": 1.189, + "step": 16966 + }, + { + "epoch": 0.9276273526235939, + "grad_norm": 1.4226568937301636, + "learning_rate": 2.9081735016966205e-07, + "loss": 1.2647, + "step": 16967 + }, + { + "epoch": 0.9276820250673155, + "grad_norm": 1.2979662418365479, + "learning_rate": 2.9038002247643857e-07, + "loss": 1.5403, + "step": 16968 + }, + { + "epoch": 0.927736697511037, + "grad_norm": 1.5638211965560913, + "learning_rate": 2.8994301901021035e-07, + "loss": 1.636, + "step": 16969 + }, + { + "epoch": 0.9277913699547585, + "grad_norm": 1.592597484588623, + "learning_rate": 2.8950633978556907e-07, + "loss": 1.4063, + "step": 16970 + }, + { + "epoch": 0.9278460423984801, + "grad_norm": 3.1115188598632812, + "learning_rate": 2.8906998481709857e-07, + "loss": 1.2324, + "step": 16971 + }, + { + "epoch": 0.9279007148422017, + "grad_norm": 1.605542540550232, + "learning_rate": 2.886339541193672e-07, + "loss": 1.3659, + "step": 16972 + }, + { + "epoch": 0.9279553872859232, + "grad_norm": 1.609081745147705, + "learning_rate": 2.881982477069378e-07, + "loss": 1.4746, + "step": 16973 + }, + { + "epoch": 0.9280100597296448, + "grad_norm": 1.7580101490020752, + "learning_rate": 2.877628655943576e-07, + "loss": 1.3535, + "step": 16974 + }, + { + "epoch": 0.9280647321733663, + "grad_norm": 1.7330811023712158, + "learning_rate": 2.873278077961661e-07, + "loss": 1.3702, + "step": 16975 + }, + { + "epoch": 0.9281194046170879, + "grad_norm": 1.950870156288147, + "learning_rate": 2.8689307432689053e-07, + "loss": 1.713, + "step": 16976 + }, + { + "epoch": 0.9281740770608095, + "grad_norm": 1.4570050239562988, + "learning_rate": 2.8645866520104815e-07, + "loss": 1.488, + "step": 16977 + }, + { + "epoch": 0.9282287495045309, + "grad_norm": 1.543444037437439, + "learning_rate": 2.8602458043314296e-07, + "loss": 1.7304, + "step": 16978 + }, + { + "epoch": 0.9282834219482525, + "grad_norm": 1.2286134958267212, + "learning_rate": 2.8559082003767334e-07, + "loss": 1.3684, + "step": 16979 + }, + { + "epoch": 0.9283380943919741, + "grad_norm": 1.287579894065857, + "learning_rate": 2.851573840291211e-07, + "loss": 1.5871, + "step": 16980 + }, + { + "epoch": 0.9283927668356956, + "grad_norm": 1.5410981178283691, + "learning_rate": 2.847242724219612e-07, + "loss": 1.4131, + "step": 16981 + }, + { + "epoch": 0.9284474392794172, + "grad_norm": 1.2896533012390137, + "learning_rate": 2.8429148523065443e-07, + "loss": 1.5281, + "step": 16982 + }, + { + "epoch": 0.9285021117231388, + "grad_norm": 1.6465078592300415, + "learning_rate": 2.8385902246965357e-07, + "loss": 1.3007, + "step": 16983 + }, + { + "epoch": 0.9285567841668603, + "grad_norm": 1.6569715738296509, + "learning_rate": 2.834268841534005e-07, + "loss": 1.4674, + "step": 16984 + }, + { + "epoch": 0.9286114566105819, + "grad_norm": 2.145514726638794, + "learning_rate": 2.8299507029632356e-07, + "loss": 1.4696, + "step": 16985 + }, + { + "epoch": 0.9286661290543035, + "grad_norm": 1.5046958923339844, + "learning_rate": 2.825635809128424e-07, + "loss": 1.5889, + "step": 16986 + }, + { + "epoch": 0.9287208014980249, + "grad_norm": 1.5787104368209839, + "learning_rate": 2.8213241601736775e-07, + "loss": 1.4643, + "step": 16987 + }, + { + "epoch": 0.9287754739417465, + "grad_norm": 1.6145427227020264, + "learning_rate": 2.8170157562429466e-07, + "loss": 1.5492, + "step": 16988 + }, + { + "epoch": 0.928830146385468, + "grad_norm": 1.6586437225341797, + "learning_rate": 2.812710597480095e-07, + "loss": 1.3883, + "step": 16989 + }, + { + "epoch": 0.9288848188291896, + "grad_norm": 1.6339685916900635, + "learning_rate": 2.8084086840289074e-07, + "loss": 1.395, + "step": 16990 + }, + { + "epoch": 0.9289394912729112, + "grad_norm": 1.5662500858306885, + "learning_rate": 2.8041100160330127e-07, + "loss": 1.3488, + "step": 16991 + }, + { + "epoch": 0.9289941637166327, + "grad_norm": 2.2467880249023438, + "learning_rate": 2.7998145936359635e-07, + "loss": 1.5294, + "step": 16992 + }, + { + "epoch": 0.9290488361603543, + "grad_norm": 1.600691556930542, + "learning_rate": 2.7955224169812e-07, + "loss": 1.3902, + "step": 16993 + }, + { + "epoch": 0.9291035086040759, + "grad_norm": 1.263131022453308, + "learning_rate": 2.7912334862120305e-07, + "loss": 1.6739, + "step": 16994 + }, + { + "epoch": 0.9291581810477973, + "grad_norm": 1.5259736776351929, + "learning_rate": 2.7869478014716953e-07, + "loss": 1.6527, + "step": 16995 + }, + { + "epoch": 0.9292128534915189, + "grad_norm": 1.2748721837997437, + "learning_rate": 2.7826653629032806e-07, + "loss": 1.6372, + "step": 16996 + }, + { + "epoch": 0.9292675259352405, + "grad_norm": 1.623033046722412, + "learning_rate": 2.778386170649794e-07, + "loss": 1.3602, + "step": 16997 + }, + { + "epoch": 0.929322198378962, + "grad_norm": 1.4598840475082397, + "learning_rate": 2.774110224854132e-07, + "loss": 1.7025, + "step": 16998 + }, + { + "epoch": 0.9293768708226836, + "grad_norm": 1.4428660869598389, + "learning_rate": 2.7698375256590916e-07, + "loss": 1.6496, + "step": 16999 + }, + { + "epoch": 0.9294315432664052, + "grad_norm": 1.2995458841323853, + "learning_rate": 2.765568073207314e-07, + "loss": 1.5913, + "step": 17000 + }, + { + "epoch": 0.9294862157101267, + "grad_norm": 1.4573317766189575, + "learning_rate": 2.761301867641397e-07, + "loss": 1.5448, + "step": 17001 + }, + { + "epoch": 0.9295408881538483, + "grad_norm": 1.1353485584259033, + "learning_rate": 2.757038909103793e-07, + "loss": 1.7465, + "step": 17002 + }, + { + "epoch": 0.9295955605975698, + "grad_norm": 1.3664898872375488, + "learning_rate": 2.752779197736832e-07, + "loss": 1.2004, + "step": 17003 + }, + { + "epoch": 0.9296502330412914, + "grad_norm": 1.9148486852645874, + "learning_rate": 2.748522733682779e-07, + "loss": 1.3259, + "step": 17004 + }, + { + "epoch": 0.929704905485013, + "grad_norm": 1.9031848907470703, + "learning_rate": 2.744269517083764e-07, + "loss": 1.5022, + "step": 17005 + }, + { + "epoch": 0.9297595779287344, + "grad_norm": 1.4424091577529907, + "learning_rate": 2.740019548081796e-07, + "loss": 1.412, + "step": 17006 + }, + { + "epoch": 0.929814250372456, + "grad_norm": 1.341076135635376, + "learning_rate": 2.7357728268188167e-07, + "loss": 1.5617, + "step": 17007 + }, + { + "epoch": 0.9298689228161776, + "grad_norm": 1.819504737854004, + "learning_rate": 2.731529353436624e-07, + "loss": 1.4155, + "step": 17008 + }, + { + "epoch": 0.9299235952598991, + "grad_norm": 1.5528326034545898, + "learning_rate": 2.7272891280769044e-07, + "loss": 1.234, + "step": 17009 + }, + { + "epoch": 0.9299782677036207, + "grad_norm": 2.726170301437378, + "learning_rate": 2.7230521508812556e-07, + "loss": 1.662, + "step": 17010 + }, + { + "epoch": 0.9300329401473423, + "grad_norm": 1.8166083097457886, + "learning_rate": 2.718818421991165e-07, + "loss": 1.6366, + "step": 17011 + }, + { + "epoch": 0.9300876125910638, + "grad_norm": 1.6685197353363037, + "learning_rate": 2.714587941548008e-07, + "loss": 1.1675, + "step": 17012 + }, + { + "epoch": 0.9301422850347854, + "grad_norm": 1.7847115993499756, + "learning_rate": 2.7103607096930497e-07, + "loss": 1.4274, + "step": 17013 + }, + { + "epoch": 0.930196957478507, + "grad_norm": 1.3868870735168457, + "learning_rate": 2.7061367265674323e-07, + "loss": 1.4071, + "step": 17014 + }, + { + "epoch": 0.9302516299222284, + "grad_norm": 1.2791087627410889, + "learning_rate": 2.701915992312221e-07, + "loss": 1.5397, + "step": 17015 + }, + { + "epoch": 0.93030630236595, + "grad_norm": 1.2494899034500122, + "learning_rate": 2.697698507068358e-07, + "loss": 1.5135, + "step": 17016 + }, + { + "epoch": 0.9303609748096715, + "grad_norm": 1.6433839797973633, + "learning_rate": 2.693484270976665e-07, + "loss": 1.5237, + "step": 17017 + }, + { + "epoch": 0.9304156472533931, + "grad_norm": 1.4110749959945679, + "learning_rate": 2.6892732841778736e-07, + "loss": 1.3462, + "step": 17018 + }, + { + "epoch": 0.9304703196971147, + "grad_norm": 2.0767178535461426, + "learning_rate": 2.685065546812593e-07, + "loss": 1.6579, + "step": 17019 + }, + { + "epoch": 0.9305249921408362, + "grad_norm": 1.6863553524017334, + "learning_rate": 2.6808610590213336e-07, + "loss": 1.3562, + "step": 17020 + }, + { + "epoch": 0.9305796645845578, + "grad_norm": 1.4997029304504395, + "learning_rate": 2.6766598209444825e-07, + "loss": 1.4305, + "step": 17021 + }, + { + "epoch": 0.9306343370282794, + "grad_norm": 1.4731308221817017, + "learning_rate": 2.6724618327223394e-07, + "loss": 1.8656, + "step": 17022 + }, + { + "epoch": 0.9306890094720008, + "grad_norm": 1.3908867835998535, + "learning_rate": 2.6682670944950804e-07, + "loss": 1.3447, + "step": 17023 + }, + { + "epoch": 0.9307436819157224, + "grad_norm": 1.7020504474639893, + "learning_rate": 2.664075606402783e-07, + "loss": 1.4854, + "step": 17024 + }, + { + "epoch": 0.930798354359444, + "grad_norm": 1.6558133363723755, + "learning_rate": 2.6598873685853897e-07, + "loss": 1.3757, + "step": 17025 + }, + { + "epoch": 0.9308530268031655, + "grad_norm": 2.0989439487457275, + "learning_rate": 2.6557023811827897e-07, + "loss": 1.4461, + "step": 17026 + }, + { + "epoch": 0.9309076992468871, + "grad_norm": 1.8553533554077148, + "learning_rate": 2.6515206443347153e-07, + "loss": 1.3072, + "step": 17027 + }, + { + "epoch": 0.9309623716906087, + "grad_norm": 1.2381095886230469, + "learning_rate": 2.6473421581807877e-07, + "loss": 1.6708, + "step": 17028 + }, + { + "epoch": 0.9310170441343302, + "grad_norm": 1.9236204624176025, + "learning_rate": 2.6431669228605625e-07, + "loss": 1.5869, + "step": 17029 + }, + { + "epoch": 0.9310717165780518, + "grad_norm": 1.6522408723831177, + "learning_rate": 2.638994938513451e-07, + "loss": 1.5107, + "step": 17030 + }, + { + "epoch": 0.9311263890217732, + "grad_norm": 1.6489458084106445, + "learning_rate": 2.634826205278751e-07, + "loss": 1.4981, + "step": 17031 + }, + { + "epoch": 0.9311810614654948, + "grad_norm": 2.064692974090576, + "learning_rate": 2.630660723295686e-07, + "loss": 1.3241, + "step": 17032 + }, + { + "epoch": 0.9312357339092164, + "grad_norm": 1.1593225002288818, + "learning_rate": 2.6264984927033445e-07, + "loss": 1.5333, + "step": 17033 + }, + { + "epoch": 0.9312904063529379, + "grad_norm": 1.596632957458496, + "learning_rate": 2.6223395136407146e-07, + "loss": 1.4653, + "step": 17034 + }, + { + "epoch": 0.9313450787966595, + "grad_norm": 1.421141505241394, + "learning_rate": 2.618183786246675e-07, + "loss": 1.4906, + "step": 17035 + }, + { + "epoch": 0.9313997512403811, + "grad_norm": 1.492068886756897, + "learning_rate": 2.6140313106599813e-07, + "loss": 1.5804, + "step": 17036 + }, + { + "epoch": 0.9314544236841026, + "grad_norm": 1.5093895196914673, + "learning_rate": 2.609882087019311e-07, + "loss": 1.5106, + "step": 17037 + }, + { + "epoch": 0.9315090961278242, + "grad_norm": 1.3373006582260132, + "learning_rate": 2.605736115463209e-07, + "loss": 1.5246, + "step": 17038 + }, + { + "epoch": 0.9315637685715458, + "grad_norm": 1.6536478996276855, + "learning_rate": 2.601593396130109e-07, + "loss": 1.715, + "step": 17039 + }, + { + "epoch": 0.9316184410152673, + "grad_norm": 1.5469200611114502, + "learning_rate": 2.597453929158378e-07, + "loss": 1.5565, + "step": 17040 + }, + { + "epoch": 0.9316731134589888, + "grad_norm": 1.5108650922775269, + "learning_rate": 2.5933177146862167e-07, + "loss": 1.5874, + "step": 17041 + }, + { + "epoch": 0.9317277859027104, + "grad_norm": 1.8300317525863647, + "learning_rate": 2.589184752851748e-07, + "loss": 1.4229, + "step": 17042 + }, + { + "epoch": 0.9317824583464319, + "grad_norm": 1.4323521852493286, + "learning_rate": 2.5850550437929834e-07, + "loss": 1.6569, + "step": 17043 + }, + { + "epoch": 0.9318371307901535, + "grad_norm": 1.483668565750122, + "learning_rate": 2.580928587647824e-07, + "loss": 1.2754, + "step": 17044 + }, + { + "epoch": 0.931891803233875, + "grad_norm": 1.4343796968460083, + "learning_rate": 2.5768053845540484e-07, + "loss": 1.4697, + "step": 17045 + }, + { + "epoch": 0.9319464756775966, + "grad_norm": 1.243040919303894, + "learning_rate": 2.572685434649358e-07, + "loss": 1.3262, + "step": 17046 + }, + { + "epoch": 0.9320011481213182, + "grad_norm": 1.415353775024414, + "learning_rate": 2.568568738071331e-07, + "loss": 1.3463, + "step": 17047 + }, + { + "epoch": 0.9320558205650397, + "grad_norm": 1.5773075819015503, + "learning_rate": 2.564455294957413e-07, + "loss": 1.5981, + "step": 17048 + }, + { + "epoch": 0.9321104930087613, + "grad_norm": 1.6103827953338623, + "learning_rate": 2.5603451054449835e-07, + "loss": 1.4769, + "step": 17049 + }, + { + "epoch": 0.9321651654524828, + "grad_norm": 1.5586017370224, + "learning_rate": 2.556238169671266e-07, + "loss": 1.2557, + "step": 17050 + }, + { + "epoch": 0.9322198378962043, + "grad_norm": 1.5041474103927612, + "learning_rate": 2.5521344877734165e-07, + "loss": 1.5035, + "step": 17051 + }, + { + "epoch": 0.9322745103399259, + "grad_norm": 1.4737995862960815, + "learning_rate": 2.548034059888471e-07, + "loss": 1.4546, + "step": 17052 + }, + { + "epoch": 0.9323291827836475, + "grad_norm": 1.2629843950271606, + "learning_rate": 2.543936886153342e-07, + "loss": 1.5895, + "step": 17053 + }, + { + "epoch": 0.932383855227369, + "grad_norm": 1.4489102363586426, + "learning_rate": 2.539842966704853e-07, + "loss": 1.5859, + "step": 17054 + }, + { + "epoch": 0.9324385276710906, + "grad_norm": 1.7569684982299805, + "learning_rate": 2.535752301679706e-07, + "loss": 1.4168, + "step": 17055 + }, + { + "epoch": 0.9324932001148122, + "grad_norm": 1.379438877105713, + "learning_rate": 2.531664891214491e-07, + "loss": 1.2749, + "step": 17056 + }, + { + "epoch": 0.9325478725585337, + "grad_norm": 1.4019684791564941, + "learning_rate": 2.527580735445701e-07, + "loss": 1.6637, + "step": 17057 + }, + { + "epoch": 0.9326025450022553, + "grad_norm": 1.2299848794937134, + "learning_rate": 2.523499834509724e-07, + "loss": 1.3241, + "step": 17058 + }, + { + "epoch": 0.9326572174459767, + "grad_norm": 1.071990728378296, + "learning_rate": 2.519422188542819e-07, + "loss": 1.6653, + "step": 17059 + }, + { + "epoch": 0.9327118898896983, + "grad_norm": 2.0314760208129883, + "learning_rate": 2.515347797681156e-07, + "loss": 1.3729, + "step": 17060 + }, + { + "epoch": 0.9327665623334199, + "grad_norm": 1.487389326095581, + "learning_rate": 2.511276662060791e-07, + "loss": 1.4896, + "step": 17061 + }, + { + "epoch": 0.9328212347771414, + "grad_norm": 1.5554890632629395, + "learning_rate": 2.507208781817638e-07, + "loss": 1.7169, + "step": 17062 + }, + { + "epoch": 0.932875907220863, + "grad_norm": 2.3528072834014893, + "learning_rate": 2.5031441570875783e-07, + "loss": 1.1904, + "step": 17063 + }, + { + "epoch": 0.9329305796645846, + "grad_norm": 1.3083924055099487, + "learning_rate": 2.499082788006313e-07, + "loss": 1.5314, + "step": 17064 + }, + { + "epoch": 0.9329852521083061, + "grad_norm": 1.5610097646713257, + "learning_rate": 2.495024674709468e-07, + "loss": 1.2479, + "step": 17065 + }, + { + "epoch": 0.9330399245520277, + "grad_norm": 1.513271689414978, + "learning_rate": 2.490969817332545e-07, + "loss": 1.6404, + "step": 17066 + }, + { + "epoch": 0.9330945969957493, + "grad_norm": 1.3902794122695923, + "learning_rate": 2.4869182160109696e-07, + "loss": 1.4003, + "step": 17067 + }, + { + "epoch": 0.9331492694394707, + "grad_norm": 1.7797307968139648, + "learning_rate": 2.482869870879989e-07, + "loss": 1.3948, + "step": 17068 + }, + { + "epoch": 0.9332039418831923, + "grad_norm": 1.6753692626953125, + "learning_rate": 2.47882478207484e-07, + "loss": 1.3169, + "step": 17069 + }, + { + "epoch": 0.9332586143269139, + "grad_norm": 1.9934386014938354, + "learning_rate": 2.4747829497305477e-07, + "loss": 1.3295, + "step": 17070 + }, + { + "epoch": 0.9333132867706354, + "grad_norm": 1.289839744567871, + "learning_rate": 2.470744373982126e-07, + "loss": 1.4853, + "step": 17071 + }, + { + "epoch": 0.933367959214357, + "grad_norm": 1.4634239673614502, + "learning_rate": 2.4667090549644e-07, + "loss": 1.5583, + "step": 17072 + }, + { + "epoch": 0.9334226316580785, + "grad_norm": 1.3201066255569458, + "learning_rate": 2.46267699281213e-07, + "loss": 1.3357, + "step": 17073 + }, + { + "epoch": 0.9334773041018001, + "grad_norm": 1.5542101860046387, + "learning_rate": 2.458648187659962e-07, + "loss": 1.4562, + "step": 17074 + }, + { + "epoch": 0.9335319765455217, + "grad_norm": 1.8746250867843628, + "learning_rate": 2.454622639642412e-07, + "loss": 1.4823, + "step": 17075 + }, + { + "epoch": 0.9335866489892432, + "grad_norm": 1.4833405017852783, + "learning_rate": 2.4506003488938943e-07, + "loss": 1.4139, + "step": 17076 + }, + { + "epoch": 0.9336413214329647, + "grad_norm": 1.3733534812927246, + "learning_rate": 2.4465813155487574e-07, + "loss": 1.569, + "step": 17077 + }, + { + "epoch": 0.9336959938766863, + "grad_norm": 1.51372230052948, + "learning_rate": 2.442565539741182e-07, + "loss": 1.375, + "step": 17078 + }, + { + "epoch": 0.9337506663204078, + "grad_norm": 1.5069769620895386, + "learning_rate": 2.43855302160525e-07, + "loss": 1.3702, + "step": 17079 + }, + { + "epoch": 0.9338053387641294, + "grad_norm": 1.5081207752227783, + "learning_rate": 2.434543761274988e-07, + "loss": 1.3422, + "step": 17080 + }, + { + "epoch": 0.933860011207851, + "grad_norm": 1.7892498970031738, + "learning_rate": 2.4305377588842547e-07, + "loss": 1.2497, + "step": 17081 + }, + { + "epoch": 0.9339146836515725, + "grad_norm": 1.2897918224334717, + "learning_rate": 2.426535014566811e-07, + "loss": 1.6146, + "step": 17082 + }, + { + "epoch": 0.9339693560952941, + "grad_norm": 1.687558650970459, + "learning_rate": 2.4225355284563265e-07, + "loss": 1.3746, + "step": 17083 + }, + { + "epoch": 0.9340240285390157, + "grad_norm": 1.7218308448791504, + "learning_rate": 2.418539300686351e-07, + "loss": 1.5982, + "step": 17084 + }, + { + "epoch": 0.9340787009827372, + "grad_norm": 1.4540523290634155, + "learning_rate": 2.414546331390344e-07, + "loss": 1.3697, + "step": 17085 + }, + { + "epoch": 0.9341333734264587, + "grad_norm": 1.7435617446899414, + "learning_rate": 2.4105566207016207e-07, + "loss": 1.3446, + "step": 17086 + }, + { + "epoch": 0.9341880458701802, + "grad_norm": 1.7224242687225342, + "learning_rate": 2.406570168753408e-07, + "loss": 1.5124, + "step": 17087 + }, + { + "epoch": 0.9342427183139018, + "grad_norm": 1.3554712533950806, + "learning_rate": 2.4025869756788333e-07, + "loss": 1.5659, + "step": 17088 + }, + { + "epoch": 0.9342973907576234, + "grad_norm": 1.5311675071716309, + "learning_rate": 2.398607041610901e-07, + "loss": 1.2706, + "step": 17089 + }, + { + "epoch": 0.9343520632013449, + "grad_norm": 1.2849291563034058, + "learning_rate": 2.3946303666824934e-07, + "loss": 1.4444, + "step": 17090 + }, + { + "epoch": 0.9344067356450665, + "grad_norm": 1.3506542444229126, + "learning_rate": 2.3906569510264375e-07, + "loss": 1.4108, + "step": 17091 + }, + { + "epoch": 0.9344614080887881, + "grad_norm": 1.7088429927825928, + "learning_rate": 2.3866867947753836e-07, + "loss": 1.6305, + "step": 17092 + }, + { + "epoch": 0.9345160805325096, + "grad_norm": 1.3309695720672607, + "learning_rate": 2.3827198980619025e-07, + "loss": 1.4958, + "step": 17093 + }, + { + "epoch": 0.9345707529762312, + "grad_norm": 1.2700281143188477, + "learning_rate": 2.3787562610184888e-07, + "loss": 1.5323, + "step": 17094 + }, + { + "epoch": 0.9346254254199527, + "grad_norm": 1.579209566116333, + "learning_rate": 2.37479588377747e-07, + "loss": 1.4726, + "step": 17095 + }, + { + "epoch": 0.9346800978636742, + "grad_norm": 1.2546488046646118, + "learning_rate": 2.3708387664710952e-07, + "loss": 1.3614, + "step": 17096 + }, + { + "epoch": 0.9347347703073958, + "grad_norm": 1.895527720451355, + "learning_rate": 2.366884909231515e-07, + "loss": 1.2026, + "step": 17097 + }, + { + "epoch": 0.9347894427511174, + "grad_norm": 1.5779471397399902, + "learning_rate": 2.3629343121907566e-07, + "loss": 1.3364, + "step": 17098 + }, + { + "epoch": 0.9348441151948389, + "grad_norm": 1.634653091430664, + "learning_rate": 2.358986975480726e-07, + "loss": 1.1344, + "step": 17099 + }, + { + "epoch": 0.9348987876385605, + "grad_norm": 1.4152617454528809, + "learning_rate": 2.3550428992332508e-07, + "loss": 1.2454, + "step": 17100 + }, + { + "epoch": 0.934953460082282, + "grad_norm": 1.4425467252731323, + "learning_rate": 2.3511020835800147e-07, + "loss": 1.4026, + "step": 17101 + }, + { + "epoch": 0.9350081325260036, + "grad_norm": 1.6051431894302368, + "learning_rate": 2.3471645286526233e-07, + "loss": 1.1566, + "step": 17102 + }, + { + "epoch": 0.9350628049697252, + "grad_norm": 1.6891497373580933, + "learning_rate": 2.3432302345825608e-07, + "loss": 1.6855, + "step": 17103 + }, + { + "epoch": 0.9351174774134466, + "grad_norm": 1.3066935539245605, + "learning_rate": 2.3392992015011883e-07, + "loss": 1.5813, + "step": 17104 + }, + { + "epoch": 0.9351721498571682, + "grad_norm": 1.537061333656311, + "learning_rate": 2.33537142953979e-07, + "loss": 1.5306, + "step": 17105 + }, + { + "epoch": 0.9352268223008898, + "grad_norm": 1.7304375171661377, + "learning_rate": 2.3314469188295273e-07, + "loss": 1.4396, + "step": 17106 + }, + { + "epoch": 0.9352814947446113, + "grad_norm": 1.4918432235717773, + "learning_rate": 2.327525669501418e-07, + "loss": 1.4926, + "step": 17107 + }, + { + "epoch": 0.9353361671883329, + "grad_norm": 1.699804425239563, + "learning_rate": 2.323607681686446e-07, + "loss": 1.2716, + "step": 17108 + }, + { + "epoch": 0.9353908396320545, + "grad_norm": 1.4735043048858643, + "learning_rate": 2.3196929555154068e-07, + "loss": 1.4694, + "step": 17109 + }, + { + "epoch": 0.935445512075776, + "grad_norm": 2.0045454502105713, + "learning_rate": 2.315781491119029e-07, + "loss": 1.6117, + "step": 17110 + }, + { + "epoch": 0.9355001845194976, + "grad_norm": 1.3409662246704102, + "learning_rate": 2.3118732886279304e-07, + "loss": 1.2909, + "step": 17111 + }, + { + "epoch": 0.9355548569632192, + "grad_norm": 1.5524482727050781, + "learning_rate": 2.307968348172629e-07, + "loss": 1.3325, + "step": 17112 + }, + { + "epoch": 0.9356095294069406, + "grad_norm": 2.076137065887451, + "learning_rate": 2.304066669883498e-07, + "loss": 1.2732, + "step": 17113 + }, + { + "epoch": 0.9356642018506622, + "grad_norm": 1.4536759853363037, + "learning_rate": 2.3001682538908333e-07, + "loss": 1.4567, + "step": 17114 + }, + { + "epoch": 0.9357188742943837, + "grad_norm": 1.177793025970459, + "learning_rate": 2.2962731003247972e-07, + "loss": 1.4522, + "step": 17115 + }, + { + "epoch": 0.9357735467381053, + "grad_norm": 1.9211933612823486, + "learning_rate": 2.2923812093154861e-07, + "loss": 1.4983, + "step": 17116 + }, + { + "epoch": 0.9358282191818269, + "grad_norm": 1.724084496498108, + "learning_rate": 2.2884925809928404e-07, + "loss": 1.1435, + "step": 17117 + }, + { + "epoch": 0.9358828916255484, + "grad_norm": 1.4775508642196655, + "learning_rate": 2.2846072154867117e-07, + "loss": 1.433, + "step": 17118 + }, + { + "epoch": 0.93593756406927, + "grad_norm": 1.2925801277160645, + "learning_rate": 2.2807251129268404e-07, + "loss": 1.6501, + "step": 17119 + }, + { + "epoch": 0.9359922365129916, + "grad_norm": 1.8270108699798584, + "learning_rate": 2.2768462734428786e-07, + "loss": 1.6034, + "step": 17120 + }, + { + "epoch": 0.936046908956713, + "grad_norm": 1.4932395219802856, + "learning_rate": 2.2729706971643117e-07, + "loss": 1.4994, + "step": 17121 + }, + { + "epoch": 0.9361015814004346, + "grad_norm": 1.3813345432281494, + "learning_rate": 2.2690983842205916e-07, + "loss": 1.5818, + "step": 17122 + }, + { + "epoch": 0.9361562538441562, + "grad_norm": 1.6914032697677612, + "learning_rate": 2.2652293347410148e-07, + "loss": 1.6964, + "step": 17123 + }, + { + "epoch": 0.9362109262878777, + "grad_norm": 1.273358941078186, + "learning_rate": 2.261363548854767e-07, + "loss": 1.4555, + "step": 17124 + }, + { + "epoch": 0.9362655987315993, + "grad_norm": 1.2728772163391113, + "learning_rate": 2.2575010266909448e-07, + "loss": 1.2608, + "step": 17125 + }, + { + "epoch": 0.9363202711753209, + "grad_norm": 1.3353474140167236, + "learning_rate": 2.2536417683785117e-07, + "loss": 1.6839, + "step": 17126 + }, + { + "epoch": 0.9363749436190424, + "grad_norm": 1.6424981355667114, + "learning_rate": 2.2497857740463536e-07, + "loss": 1.453, + "step": 17127 + }, + { + "epoch": 0.936429616062764, + "grad_norm": 1.395302414894104, + "learning_rate": 2.245933043823234e-07, + "loss": 1.4714, + "step": 17128 + }, + { + "epoch": 0.9364842885064856, + "grad_norm": 1.3817874193191528, + "learning_rate": 2.2420835778377837e-07, + "loss": 1.5195, + "step": 17129 + }, + { + "epoch": 0.9365389609502071, + "grad_norm": 1.5191707611083984, + "learning_rate": 2.238237376218566e-07, + "loss": 1.658, + "step": 17130 + }, + { + "epoch": 0.9365936333939286, + "grad_norm": 1.3508341312408447, + "learning_rate": 2.2343944390940119e-07, + "loss": 1.4764, + "step": 17131 + }, + { + "epoch": 0.9366483058376501, + "grad_norm": 1.3948503732681274, + "learning_rate": 2.2305547665924298e-07, + "loss": 1.4516, + "step": 17132 + }, + { + "epoch": 0.9367029782813717, + "grad_norm": 1.5539129972457886, + "learning_rate": 2.2267183588420616e-07, + "loss": 1.6555, + "step": 17133 + }, + { + "epoch": 0.9367576507250933, + "grad_norm": 1.5217440128326416, + "learning_rate": 2.2228852159709935e-07, + "loss": 1.2923, + "step": 17134 + }, + { + "epoch": 0.9368123231688148, + "grad_norm": 1.41934072971344, + "learning_rate": 2.2190553381072234e-07, + "loss": 1.2114, + "step": 17135 + }, + { + "epoch": 0.9368669956125364, + "grad_norm": 1.1723636388778687, + "learning_rate": 2.2152287253786598e-07, + "loss": 1.6559, + "step": 17136 + }, + { + "epoch": 0.936921668056258, + "grad_norm": 1.6767046451568604, + "learning_rate": 2.2114053779130561e-07, + "loss": 1.4229, + "step": 17137 + }, + { + "epoch": 0.9369763404999795, + "grad_norm": 2.43161678314209, + "learning_rate": 2.2075852958380995e-07, + "loss": 1.3328, + "step": 17138 + }, + { + "epoch": 0.9370310129437011, + "grad_norm": 1.5322331190109253, + "learning_rate": 2.2037684792813542e-07, + "loss": 1.2527, + "step": 17139 + }, + { + "epoch": 0.9370856853874227, + "grad_norm": 2.079376697540283, + "learning_rate": 2.1999549283702514e-07, + "loss": 1.3963, + "step": 17140 + }, + { + "epoch": 0.9371403578311441, + "grad_norm": 1.7635365724563599, + "learning_rate": 2.1961446432321564e-07, + "loss": 1.5498, + "step": 17141 + }, + { + "epoch": 0.9371950302748657, + "grad_norm": 1.6216191053390503, + "learning_rate": 2.1923376239942895e-07, + "loss": 1.4349, + "step": 17142 + }, + { + "epoch": 0.9372497027185873, + "grad_norm": 1.4270014762878418, + "learning_rate": 2.1885338707837822e-07, + "loss": 1.4556, + "step": 17143 + }, + { + "epoch": 0.9373043751623088, + "grad_norm": 1.5023432970046997, + "learning_rate": 2.1847333837276552e-07, + "loss": 1.4561, + "step": 17144 + }, + { + "epoch": 0.9373590476060304, + "grad_norm": 1.4054138660430908, + "learning_rate": 2.180936162952818e-07, + "loss": 1.5728, + "step": 17145 + }, + { + "epoch": 0.9374137200497519, + "grad_norm": 1.5238747596740723, + "learning_rate": 2.1771422085860473e-07, + "loss": 1.4736, + "step": 17146 + }, + { + "epoch": 0.9374683924934735, + "grad_norm": 1.400681495666504, + "learning_rate": 2.1733515207540634e-07, + "loss": 1.5813, + "step": 17147 + }, + { + "epoch": 0.9375230649371951, + "grad_norm": 1.3417518138885498, + "learning_rate": 2.169564099583421e-07, + "loss": 1.608, + "step": 17148 + }, + { + "epoch": 0.9375777373809165, + "grad_norm": 1.419631004333496, + "learning_rate": 2.1657799452005856e-07, + "loss": 1.439, + "step": 17149 + }, + { + "epoch": 0.9376324098246381, + "grad_norm": 1.4053255319595337, + "learning_rate": 2.1619990577319562e-07, + "loss": 1.4131, + "step": 17150 + }, + { + "epoch": 0.9376870822683597, + "grad_norm": 2.1211822032928467, + "learning_rate": 2.1582214373037536e-07, + "loss": 1.4054, + "step": 17151 + }, + { + "epoch": 0.9377417547120812, + "grad_norm": 1.6712185144424438, + "learning_rate": 2.154447084042133e-07, + "loss": 1.5353, + "step": 17152 + }, + { + "epoch": 0.9377964271558028, + "grad_norm": 1.3873423337936401, + "learning_rate": 2.150675998073126e-07, + "loss": 1.4282, + "step": 17153 + }, + { + "epoch": 0.9378510995995244, + "grad_norm": 1.1703600883483887, + "learning_rate": 2.1469081795226443e-07, + "loss": 1.4387, + "step": 17154 + }, + { + "epoch": 0.9379057720432459, + "grad_norm": 1.5035465955734253, + "learning_rate": 2.1431436285165307e-07, + "loss": 1.5777, + "step": 17155 + }, + { + "epoch": 0.9379604444869675, + "grad_norm": 1.1982988119125366, + "learning_rate": 2.139382345180474e-07, + "loss": 1.4265, + "step": 17156 + }, + { + "epoch": 0.9380151169306891, + "grad_norm": 2.0290162563323975, + "learning_rate": 2.1356243296400846e-07, + "loss": 1.6865, + "step": 17157 + }, + { + "epoch": 0.9380697893744105, + "grad_norm": 1.6372793912887573, + "learning_rate": 2.13186958202084e-07, + "loss": 1.3378, + "step": 17158 + }, + { + "epoch": 0.9381244618181321, + "grad_norm": 1.4603369235992432, + "learning_rate": 2.128118102448129e-07, + "loss": 1.4158, + "step": 17159 + }, + { + "epoch": 0.9381791342618536, + "grad_norm": 1.6301579475402832, + "learning_rate": 2.1243698910472067e-07, + "loss": 1.2466, + "step": 17160 + }, + { + "epoch": 0.9382338067055752, + "grad_norm": 1.4577058553695679, + "learning_rate": 2.1206249479432617e-07, + "loss": 1.4151, + "step": 17161 + }, + { + "epoch": 0.9382884791492968, + "grad_norm": 1.5495585203170776, + "learning_rate": 2.1168832732613164e-07, + "loss": 1.4921, + "step": 17162 + }, + { + "epoch": 0.9383431515930183, + "grad_norm": 1.5524946451187134, + "learning_rate": 2.1131448671263378e-07, + "loss": 1.6572, + "step": 17163 + }, + { + "epoch": 0.9383978240367399, + "grad_norm": 1.5158063173294067, + "learning_rate": 2.1094097296631587e-07, + "loss": 1.4293, + "step": 17164 + }, + { + "epoch": 0.9384524964804615, + "grad_norm": 1.5178533792495728, + "learning_rate": 2.105677860996491e-07, + "loss": 1.6164, + "step": 17165 + }, + { + "epoch": 0.938507168924183, + "grad_norm": 1.3110380172729492, + "learning_rate": 2.101949261250935e-07, + "loss": 1.5486, + "step": 17166 + }, + { + "epoch": 0.9385618413679045, + "grad_norm": 1.8309471607208252, + "learning_rate": 2.0982239305510355e-07, + "loss": 1.2759, + "step": 17167 + }, + { + "epoch": 0.9386165138116261, + "grad_norm": 1.6459338665008545, + "learning_rate": 2.0945018690211706e-07, + "loss": 1.4658, + "step": 17168 + }, + { + "epoch": 0.9386711862553476, + "grad_norm": 1.8084019422531128, + "learning_rate": 2.0907830767856295e-07, + "loss": 1.6203, + "step": 17169 + }, + { + "epoch": 0.9387258586990692, + "grad_norm": 1.3423445224761963, + "learning_rate": 2.0870675539686024e-07, + "loss": 1.3085, + "step": 17170 + }, + { + "epoch": 0.9387805311427908, + "grad_norm": 1.6816339492797852, + "learning_rate": 2.0833553006941343e-07, + "loss": 1.3487, + "step": 17171 + }, + { + "epoch": 0.9388352035865123, + "grad_norm": 2.0798444747924805, + "learning_rate": 2.0796463170862147e-07, + "loss": 1.4082, + "step": 17172 + }, + { + "epoch": 0.9388898760302339, + "grad_norm": 1.8346598148345947, + "learning_rate": 2.075940603268678e-07, + "loss": 1.4685, + "step": 17173 + }, + { + "epoch": 0.9389445484739554, + "grad_norm": 1.8851888179779053, + "learning_rate": 2.0722381593652586e-07, + "loss": 1.0778, + "step": 17174 + }, + { + "epoch": 0.938999220917677, + "grad_norm": 1.2690356969833374, + "learning_rate": 2.068538985499613e-07, + "loss": 1.5435, + "step": 17175 + }, + { + "epoch": 0.9390538933613986, + "grad_norm": 1.467358112335205, + "learning_rate": 2.0648430817952537e-07, + "loss": 1.5055, + "step": 17176 + }, + { + "epoch": 0.93910856580512, + "grad_norm": 1.2328407764434814, + "learning_rate": 2.0611504483756038e-07, + "loss": 1.5204, + "step": 17177 + }, + { + "epoch": 0.9391632382488416, + "grad_norm": 2.0217857360839844, + "learning_rate": 2.0574610853639544e-07, + "loss": 1.4781, + "step": 17178 + }, + { + "epoch": 0.9392179106925632, + "grad_norm": 1.4421048164367676, + "learning_rate": 2.053774992883506e-07, + "loss": 1.5554, + "step": 17179 + }, + { + "epoch": 0.9392725831362847, + "grad_norm": 1.7012401819229126, + "learning_rate": 2.0500921710573385e-07, + "loss": 1.3419, + "step": 17180 + }, + { + "epoch": 0.9393272555800063, + "grad_norm": 1.7715750932693481, + "learning_rate": 2.0464126200084532e-07, + "loss": 1.2413, + "step": 17181 + }, + { + "epoch": 0.9393819280237279, + "grad_norm": 1.6494932174682617, + "learning_rate": 2.0427363398596966e-07, + "loss": 1.4752, + "step": 17182 + }, + { + "epoch": 0.9394366004674494, + "grad_norm": 1.5201927423477173, + "learning_rate": 2.039063330733848e-07, + "loss": 1.3778, + "step": 17183 + }, + { + "epoch": 0.939491272911171, + "grad_norm": 1.6949517726898193, + "learning_rate": 2.0353935927535428e-07, + "loss": 1.4152, + "step": 17184 + }, + { + "epoch": 0.9395459453548926, + "grad_norm": 1.7572215795516968, + "learning_rate": 2.0317271260413273e-07, + "loss": 1.3564, + "step": 17185 + }, + { + "epoch": 0.939600617798614, + "grad_norm": 1.506880283355713, + "learning_rate": 2.028063930719637e-07, + "loss": 1.3307, + "step": 17186 + }, + { + "epoch": 0.9396552902423356, + "grad_norm": 1.8437604904174805, + "learning_rate": 2.024404006910785e-07, + "loss": 1.3159, + "step": 17187 + }, + { + "epoch": 0.9397099626860571, + "grad_norm": 1.5104390382766724, + "learning_rate": 2.020747354736985e-07, + "loss": 1.3854, + "step": 17188 + }, + { + "epoch": 0.9397646351297787, + "grad_norm": 1.5986171960830688, + "learning_rate": 2.0170939743203499e-07, + "loss": 1.4569, + "step": 17189 + }, + { + "epoch": 0.9398193075735003, + "grad_norm": 1.5688092708587646, + "learning_rate": 2.0134438657828824e-07, + "loss": 1.4656, + "step": 17190 + }, + { + "epoch": 0.9398739800172218, + "grad_norm": 1.7222377061843872, + "learning_rate": 2.009797029246452e-07, + "loss": 1.4173, + "step": 17191 + }, + { + "epoch": 0.9399286524609434, + "grad_norm": 1.6983673572540283, + "learning_rate": 2.0061534648328384e-07, + "loss": 1.4687, + "step": 17192 + }, + { + "epoch": 0.939983324904665, + "grad_norm": 1.5398184061050415, + "learning_rate": 2.0025131726637116e-07, + "loss": 1.8916, + "step": 17193 + }, + { + "epoch": 0.9400379973483864, + "grad_norm": 1.607460618019104, + "learning_rate": 1.9988761528606182e-07, + "loss": 1.3678, + "step": 17194 + }, + { + "epoch": 0.940092669792108, + "grad_norm": 1.7387137413024902, + "learning_rate": 1.9952424055450282e-07, + "loss": 1.4558, + "step": 17195 + }, + { + "epoch": 0.9401473422358296, + "grad_norm": 1.4758356809616089, + "learning_rate": 1.9916119308382553e-07, + "loss": 1.397, + "step": 17196 + }, + { + "epoch": 0.9402020146795511, + "grad_norm": 1.3953684568405151, + "learning_rate": 1.9879847288615583e-07, + "loss": 1.4384, + "step": 17197 + }, + { + "epoch": 0.9402566871232727, + "grad_norm": 1.7308424711227417, + "learning_rate": 1.9843607997360403e-07, + "loss": 1.2849, + "step": 17198 + }, + { + "epoch": 0.9403113595669943, + "grad_norm": 2.156946897506714, + "learning_rate": 1.9807401435827045e-07, + "loss": 1.4775, + "step": 17199 + }, + { + "epoch": 0.9403660320107158, + "grad_norm": 1.780652642250061, + "learning_rate": 1.9771227605224763e-07, + "loss": 1.2474, + "step": 17200 + }, + { + "epoch": 0.9404207044544374, + "grad_norm": 1.4688361883163452, + "learning_rate": 1.9735086506761368e-07, + "loss": 1.4355, + "step": 17201 + }, + { + "epoch": 0.9404753768981589, + "grad_norm": 1.4443130493164062, + "learning_rate": 1.9698978141643786e-07, + "loss": 1.3659, + "step": 17202 + }, + { + "epoch": 0.9405300493418804, + "grad_norm": 1.6847617626190186, + "learning_rate": 1.9662902511077607e-07, + "loss": 1.4957, + "step": 17203 + }, + { + "epoch": 0.940584721785602, + "grad_norm": 1.314048171043396, + "learning_rate": 1.9626859616267536e-07, + "loss": 1.2421, + "step": 17204 + }, + { + "epoch": 0.9406393942293235, + "grad_norm": 1.4824352264404297, + "learning_rate": 1.959084945841705e-07, + "loss": 1.3795, + "step": 17205 + }, + { + "epoch": 0.9406940666730451, + "grad_norm": 1.724710464477539, + "learning_rate": 1.9554872038728746e-07, + "loss": 1.5824, + "step": 17206 + }, + { + "epoch": 0.9407487391167667, + "grad_norm": 1.619201421737671, + "learning_rate": 1.9518927358403994e-07, + "loss": 1.3258, + "step": 17207 + }, + { + "epoch": 0.9408034115604882, + "grad_norm": 1.6936252117156982, + "learning_rate": 1.9483015418642947e-07, + "loss": 1.4524, + "step": 17208 + }, + { + "epoch": 0.9408580840042098, + "grad_norm": 2.0291876792907715, + "learning_rate": 1.9447136220644979e-07, + "loss": 1.2818, + "step": 17209 + }, + { + "epoch": 0.9409127564479314, + "grad_norm": 1.3110941648483276, + "learning_rate": 1.941128976560791e-07, + "loss": 1.6011, + "step": 17210 + }, + { + "epoch": 0.9409674288916529, + "grad_norm": 1.732804775238037, + "learning_rate": 1.9375476054729115e-07, + "loss": 1.5657, + "step": 17211 + }, + { + "epoch": 0.9410221013353745, + "grad_norm": 1.5824939012527466, + "learning_rate": 1.9339695089204192e-07, + "loss": 1.4704, + "step": 17212 + }, + { + "epoch": 0.941076773779096, + "grad_norm": 1.341825246810913, + "learning_rate": 1.9303946870227964e-07, + "loss": 1.478, + "step": 17213 + }, + { + "epoch": 0.9411314462228175, + "grad_norm": 1.664829969406128, + "learning_rate": 1.9268231398994363e-07, + "loss": 1.4271, + "step": 17214 + }, + { + "epoch": 0.9411861186665391, + "grad_norm": 1.4005507230758667, + "learning_rate": 1.9232548676695772e-07, + "loss": 1.2808, + "step": 17215 + }, + { + "epoch": 0.9412407911102606, + "grad_norm": 2.1443893909454346, + "learning_rate": 1.9196898704523902e-07, + "loss": 1.6359, + "step": 17216 + }, + { + "epoch": 0.9412954635539822, + "grad_norm": 1.5482348203659058, + "learning_rate": 1.9161281483669025e-07, + "loss": 1.3933, + "step": 17217 + }, + { + "epoch": 0.9413501359977038, + "grad_norm": 1.521087408065796, + "learning_rate": 1.9125697015320632e-07, + "loss": 1.4973, + "step": 17218 + }, + { + "epoch": 0.9414048084414253, + "grad_norm": 1.457648754119873, + "learning_rate": 1.9090145300666885e-07, + "loss": 1.5228, + "step": 17219 + }, + { + "epoch": 0.9414594808851469, + "grad_norm": 1.3266570568084717, + "learning_rate": 1.9054626340894943e-07, + "loss": 1.0756, + "step": 17220 + }, + { + "epoch": 0.9415141533288685, + "grad_norm": 1.7478281259536743, + "learning_rate": 1.9019140137190973e-07, + "loss": 1.2567, + "step": 17221 + }, + { + "epoch": 0.9415688257725899, + "grad_norm": 1.5225837230682373, + "learning_rate": 1.8983686690739688e-07, + "loss": 1.1928, + "step": 17222 + }, + { + "epoch": 0.9416234982163115, + "grad_norm": 1.7488900423049927, + "learning_rate": 1.8948266002725258e-07, + "loss": 1.4765, + "step": 17223 + }, + { + "epoch": 0.9416781706600331, + "grad_norm": 1.8177132606506348, + "learning_rate": 1.8912878074330288e-07, + "loss": 1.6788, + "step": 17224 + }, + { + "epoch": 0.9417328431037546, + "grad_norm": 1.6863456964492798, + "learning_rate": 1.8877522906736612e-07, + "loss": 1.4412, + "step": 17225 + }, + { + "epoch": 0.9417875155474762, + "grad_norm": 1.6519361734390259, + "learning_rate": 1.884220050112462e-07, + "loss": 1.5381, + "step": 17226 + }, + { + "epoch": 0.9418421879911978, + "grad_norm": 1.6103363037109375, + "learning_rate": 1.880691085867392e-07, + "loss": 1.5231, + "step": 17227 + }, + { + "epoch": 0.9418968604349193, + "grad_norm": 1.447341799736023, + "learning_rate": 1.8771653980562908e-07, + "loss": 1.4259, + "step": 17228 + }, + { + "epoch": 0.9419515328786409, + "grad_norm": 1.3117995262145996, + "learning_rate": 1.8736429867968976e-07, + "loss": 1.5785, + "step": 17229 + }, + { + "epoch": 0.9420062053223623, + "grad_norm": 1.2493202686309814, + "learning_rate": 1.8701238522068176e-07, + "loss": 1.4277, + "step": 17230 + }, + { + "epoch": 0.9420608777660839, + "grad_norm": 1.598107933998108, + "learning_rate": 1.8666079944035797e-07, + "loss": 1.4106, + "step": 17231 + }, + { + "epoch": 0.9421155502098055, + "grad_norm": 1.461864948272705, + "learning_rate": 1.8630954135045677e-07, + "loss": 1.2092, + "step": 17232 + }, + { + "epoch": 0.942170222653527, + "grad_norm": 1.3333486318588257, + "learning_rate": 1.8595861096270874e-07, + "loss": 1.3113, + "step": 17233 + }, + { + "epoch": 0.9422248950972486, + "grad_norm": 1.5321574211120605, + "learning_rate": 1.8560800828883229e-07, + "loss": 1.5372, + "step": 17234 + }, + { + "epoch": 0.9422795675409702, + "grad_norm": 1.7403162717819214, + "learning_rate": 1.8525773334053476e-07, + "loss": 1.5114, + "step": 17235 + }, + { + "epoch": 0.9423342399846917, + "grad_norm": 1.2220124006271362, + "learning_rate": 1.849077861295123e-07, + "loss": 1.5144, + "step": 17236 + }, + { + "epoch": 0.9423889124284133, + "grad_norm": 1.8551874160766602, + "learning_rate": 1.8455816666745119e-07, + "loss": 1.3967, + "step": 17237 + }, + { + "epoch": 0.9424435848721349, + "grad_norm": 1.3804351091384888, + "learning_rate": 1.8420887496602424e-07, + "loss": 1.5729, + "step": 17238 + }, + { + "epoch": 0.9424982573158563, + "grad_norm": 1.2366373538970947, + "learning_rate": 1.838599110368977e-07, + "loss": 1.7205, + "step": 17239 + }, + { + "epoch": 0.9425529297595779, + "grad_norm": 1.8861690759658813, + "learning_rate": 1.8351127489172227e-07, + "loss": 1.4572, + "step": 17240 + }, + { + "epoch": 0.9426076022032995, + "grad_norm": 1.3615306615829468, + "learning_rate": 1.8316296654214084e-07, + "loss": 1.4298, + "step": 17241 + }, + { + "epoch": 0.942662274647021, + "grad_norm": 1.617874264717102, + "learning_rate": 1.828149859997841e-07, + "loss": 1.5133, + "step": 17242 + }, + { + "epoch": 0.9427169470907426, + "grad_norm": 1.9851497411727905, + "learning_rate": 1.8246733327627275e-07, + "loss": 1.5297, + "step": 17243 + }, + { + "epoch": 0.9427716195344641, + "grad_norm": 1.4432028532028198, + "learning_rate": 1.8212000838321197e-07, + "loss": 1.3581, + "step": 17244 + }, + { + "epoch": 0.9428262919781857, + "grad_norm": 1.514786958694458, + "learning_rate": 1.8177301133220472e-07, + "loss": 1.6194, + "step": 17245 + }, + { + "epoch": 0.9428809644219073, + "grad_norm": 1.4659076929092407, + "learning_rate": 1.8142634213483502e-07, + "loss": 1.1687, + "step": 17246 + }, + { + "epoch": 0.9429356368656288, + "grad_norm": 1.4253721237182617, + "learning_rate": 1.8108000080267918e-07, + "loss": 1.3197, + "step": 17247 + }, + { + "epoch": 0.9429903093093503, + "grad_norm": 1.650304913520813, + "learning_rate": 1.807339873473035e-07, + "loss": 1.209, + "step": 17248 + }, + { + "epoch": 0.9430449817530719, + "grad_norm": 1.1676770448684692, + "learning_rate": 1.8038830178026213e-07, + "loss": 1.7489, + "step": 17249 + }, + { + "epoch": 0.9430996541967934, + "grad_norm": 1.4087941646575928, + "learning_rate": 1.8004294411309687e-07, + "loss": 1.4491, + "step": 17250 + }, + { + "epoch": 0.943154326640515, + "grad_norm": 1.5599443912506104, + "learning_rate": 1.7969791435734184e-07, + "loss": 1.6057, + "step": 17251 + }, + { + "epoch": 0.9432089990842366, + "grad_norm": 1.3638099431991577, + "learning_rate": 1.7935321252451677e-07, + "loss": 1.241, + "step": 17252 + }, + { + "epoch": 0.9432636715279581, + "grad_norm": 1.576883316040039, + "learning_rate": 1.7900883862613348e-07, + "loss": 1.2574, + "step": 17253 + }, + { + "epoch": 0.9433183439716797, + "grad_norm": 1.483855962753296, + "learning_rate": 1.7866479267369062e-07, + "loss": 1.5329, + "step": 17254 + }, + { + "epoch": 0.9433730164154013, + "grad_norm": 2.3126566410064697, + "learning_rate": 1.7832107467867676e-07, + "loss": 1.6162, + "step": 17255 + }, + { + "epoch": 0.9434276888591228, + "grad_norm": 1.7677199840545654, + "learning_rate": 1.7797768465256938e-07, + "loss": 1.3312, + "step": 17256 + }, + { + "epoch": 0.9434823613028444, + "grad_norm": 2.6045727729797363, + "learning_rate": 1.7763462260683483e-07, + "loss": 1.275, + "step": 17257 + }, + { + "epoch": 0.9435370337465658, + "grad_norm": 1.6635777950286865, + "learning_rate": 1.7729188855292957e-07, + "loss": 1.3488, + "step": 17258 + }, + { + "epoch": 0.9435917061902874, + "grad_norm": 1.2744944095611572, + "learning_rate": 1.7694948250229772e-07, + "loss": 1.6664, + "step": 17259 + }, + { + "epoch": 0.943646378634009, + "grad_norm": 1.9683505296707153, + "learning_rate": 1.7660740446637348e-07, + "loss": 1.6461, + "step": 17260 + }, + { + "epoch": 0.9437010510777305, + "grad_norm": 1.4492707252502441, + "learning_rate": 1.7626565445657883e-07, + "loss": 1.4368, + "step": 17261 + }, + { + "epoch": 0.9437557235214521, + "grad_norm": 1.6146043539047241, + "learning_rate": 1.7592423248432577e-07, + "loss": 1.2655, + "step": 17262 + }, + { + "epoch": 0.9438103959651737, + "grad_norm": 2.3136653900146484, + "learning_rate": 1.7558313856101627e-07, + "loss": 1.4302, + "step": 17263 + }, + { + "epoch": 0.9438650684088952, + "grad_norm": 2.1692843437194824, + "learning_rate": 1.75242372698039e-07, + "loss": 1.5753, + "step": 17264 + }, + { + "epoch": 0.9439197408526168, + "grad_norm": 1.9758341312408447, + "learning_rate": 1.7490193490677377e-07, + "loss": 1.4902, + "step": 17265 + }, + { + "epoch": 0.9439744132963384, + "grad_norm": 1.5448129177093506, + "learning_rate": 1.7456182519858812e-07, + "loss": 1.1426, + "step": 17266 + }, + { + "epoch": 0.9440290857400598, + "grad_norm": 1.2478821277618408, + "learning_rate": 1.7422204358483962e-07, + "loss": 1.5126, + "step": 17267 + }, + { + "epoch": 0.9440837581837814, + "grad_norm": 1.4841294288635254, + "learning_rate": 1.7388259007687368e-07, + "loss": 1.4079, + "step": 17268 + }, + { + "epoch": 0.944138430627503, + "grad_norm": 1.479604721069336, + "learning_rate": 1.7354346468602567e-07, + "loss": 1.5735, + "step": 17269 + }, + { + "epoch": 0.9441931030712245, + "grad_norm": 1.6703218221664429, + "learning_rate": 1.7320466742361984e-07, + "loss": 1.3508, + "step": 17270 + }, + { + "epoch": 0.9442477755149461, + "grad_norm": 1.242653250694275, + "learning_rate": 1.728661983009705e-07, + "loss": 1.1782, + "step": 17271 + }, + { + "epoch": 0.9443024479586676, + "grad_norm": 1.9568474292755127, + "learning_rate": 1.7252805732937749e-07, + "loss": 1.7283, + "step": 17272 + }, + { + "epoch": 0.9443571204023892, + "grad_norm": 1.3964797258377075, + "learning_rate": 1.72190244520134e-07, + "loss": 1.5411, + "step": 17273 + }, + { + "epoch": 0.9444117928461108, + "grad_norm": 1.3352829217910767, + "learning_rate": 1.7185275988451987e-07, + "loss": 1.473, + "step": 17274 + }, + { + "epoch": 0.9444664652898322, + "grad_norm": 1.7877581119537354, + "learning_rate": 1.715156034338039e-07, + "loss": 1.5007, + "step": 17275 + }, + { + "epoch": 0.9445211377335538, + "grad_norm": 1.375962257385254, + "learning_rate": 1.7117877517924597e-07, + "loss": 1.4228, + "step": 17276 + }, + { + "epoch": 0.9445758101772754, + "grad_norm": 1.775595784187317, + "learning_rate": 1.7084227513209374e-07, + "loss": 1.4147, + "step": 17277 + }, + { + "epoch": 0.9446304826209969, + "grad_norm": 1.6049317121505737, + "learning_rate": 1.7050610330358043e-07, + "loss": 1.395, + "step": 17278 + }, + { + "epoch": 0.9446851550647185, + "grad_norm": 2.1905314922332764, + "learning_rate": 1.7017025970493595e-07, + "loss": 1.5082, + "step": 17279 + }, + { + "epoch": 0.9447398275084401, + "grad_norm": 1.4021979570388794, + "learning_rate": 1.6983474434737246e-07, + "loss": 1.3896, + "step": 17280 + }, + { + "epoch": 0.9447944999521616, + "grad_norm": 1.4196823835372925, + "learning_rate": 1.6949955724209433e-07, + "loss": 1.5773, + "step": 17281 + }, + { + "epoch": 0.9448491723958832, + "grad_norm": 1.4722349643707275, + "learning_rate": 1.691646984002937e-07, + "loss": 1.3567, + "step": 17282 + }, + { + "epoch": 0.9449038448396048, + "grad_norm": 1.3799593448638916, + "learning_rate": 1.6883016783315165e-07, + "loss": 1.5953, + "step": 17283 + }, + { + "epoch": 0.9449585172833262, + "grad_norm": 1.4298679828643799, + "learning_rate": 1.6849596555184033e-07, + "loss": 1.3907, + "step": 17284 + }, + { + "epoch": 0.9450131897270478, + "grad_norm": 1.7771010398864746, + "learning_rate": 1.6816209156751973e-07, + "loss": 1.3042, + "step": 17285 + }, + { + "epoch": 0.9450678621707693, + "grad_norm": 1.3883085250854492, + "learning_rate": 1.6782854589133645e-07, + "loss": 1.5705, + "step": 17286 + }, + { + "epoch": 0.9451225346144909, + "grad_norm": 3.1470963954925537, + "learning_rate": 1.6749532853443163e-07, + "loss": 1.1418, + "step": 17287 + }, + { + "epoch": 0.9451772070582125, + "grad_norm": 1.7002711296081543, + "learning_rate": 1.6716243950793077e-07, + "loss": 1.5145, + "step": 17288 + }, + { + "epoch": 0.945231879501934, + "grad_norm": 1.606156349182129, + "learning_rate": 1.6682987882294722e-07, + "loss": 1.2983, + "step": 17289 + }, + { + "epoch": 0.9452865519456556, + "grad_norm": 3.521550416946411, + "learning_rate": 1.6649764649059097e-07, + "loss": 1.2712, + "step": 17290 + }, + { + "epoch": 0.9453412243893772, + "grad_norm": 1.8167935609817505, + "learning_rate": 1.6616574252195205e-07, + "loss": 1.4388, + "step": 17291 + }, + { + "epoch": 0.9453958968330987, + "grad_norm": 1.6412403583526611, + "learning_rate": 1.6583416692811382e-07, + "loss": 1.4202, + "step": 17292 + }, + { + "epoch": 0.9454505692768203, + "grad_norm": 1.9439080953598022, + "learning_rate": 1.6550291972015077e-07, + "loss": 1.3803, + "step": 17293 + }, + { + "epoch": 0.9455052417205418, + "grad_norm": 1.470674991607666, + "learning_rate": 1.651720009091229e-07, + "loss": 1.3535, + "step": 17294 + }, + { + "epoch": 0.9455599141642633, + "grad_norm": 1.4499876499176025, + "learning_rate": 1.6484141050607915e-07, + "loss": 1.6504, + "step": 17295 + }, + { + "epoch": 0.9456145866079849, + "grad_norm": 1.3494977951049805, + "learning_rate": 1.6451114852206073e-07, + "loss": 1.5463, + "step": 17296 + }, + { + "epoch": 0.9456692590517065, + "grad_norm": 1.670607089996338, + "learning_rate": 1.6418121496809324e-07, + "loss": 1.537, + "step": 17297 + }, + { + "epoch": 0.945723931495428, + "grad_norm": 1.4521617889404297, + "learning_rate": 1.6385160985519566e-07, + "loss": 1.5905, + "step": 17298 + }, + { + "epoch": 0.9457786039391496, + "grad_norm": 1.3017199039459229, + "learning_rate": 1.6352233319437473e-07, + "loss": 1.2817, + "step": 17299 + }, + { + "epoch": 0.9458332763828711, + "grad_norm": 1.250929355621338, + "learning_rate": 1.6319338499662496e-07, + "loss": 1.4493, + "step": 17300 + }, + { + "epoch": 0.9458879488265927, + "grad_norm": 1.4752612113952637, + "learning_rate": 1.6286476527293095e-07, + "loss": 1.4213, + "step": 17301 + }, + { + "epoch": 0.9459426212703143, + "grad_norm": 1.732102394104004, + "learning_rate": 1.625364740342661e-07, + "loss": 1.2742, + "step": 17302 + }, + { + "epoch": 0.9459972937140357, + "grad_norm": 1.935200810432434, + "learning_rate": 1.6220851129159164e-07, + "loss": 1.3205, + "step": 17303 + }, + { + "epoch": 0.9460519661577573, + "grad_norm": 1.7749836444854736, + "learning_rate": 1.6188087705586108e-07, + "loss": 1.2269, + "step": 17304 + }, + { + "epoch": 0.9461066386014789, + "grad_norm": 1.1770304441452026, + "learning_rate": 1.6155357133801342e-07, + "loss": 1.4075, + "step": 17305 + }, + { + "epoch": 0.9461613110452004, + "grad_norm": 1.4461636543273926, + "learning_rate": 1.6122659414897878e-07, + "loss": 1.5534, + "step": 17306 + }, + { + "epoch": 0.946215983488922, + "grad_norm": 1.6641018390655518, + "learning_rate": 1.6089994549967625e-07, + "loss": 1.4086, + "step": 17307 + }, + { + "epoch": 0.9462706559326436, + "grad_norm": 2.0297775268554688, + "learning_rate": 1.6057362540101262e-07, + "loss": 1.4126, + "step": 17308 + }, + { + "epoch": 0.9463253283763651, + "grad_norm": 1.5183125734329224, + "learning_rate": 1.6024763386388365e-07, + "loss": 1.5533, + "step": 17309 + }, + { + "epoch": 0.9463800008200867, + "grad_norm": 1.5593723058700562, + "learning_rate": 1.5992197089917727e-07, + "loss": 1.2895, + "step": 17310 + }, + { + "epoch": 0.9464346732638083, + "grad_norm": 1.6060813665390015, + "learning_rate": 1.595966365177648e-07, + "loss": 1.5461, + "step": 17311 + }, + { + "epoch": 0.9464893457075297, + "grad_norm": 1.4923474788665771, + "learning_rate": 1.5927163073051312e-07, + "loss": 1.5017, + "step": 17312 + }, + { + "epoch": 0.9465440181512513, + "grad_norm": 1.863303303718567, + "learning_rate": 1.589469535482735e-07, + "loss": 1.7098, + "step": 17313 + }, + { + "epoch": 0.9465986905949728, + "grad_norm": 1.4306199550628662, + "learning_rate": 1.586226049818873e-07, + "loss": 1.5576, + "step": 17314 + }, + { + "epoch": 0.9466533630386944, + "grad_norm": 1.8468888998031616, + "learning_rate": 1.58298585042187e-07, + "loss": 1.6379, + "step": 17315 + }, + { + "epoch": 0.946708035482416, + "grad_norm": 1.926947832107544, + "learning_rate": 1.5797489373999053e-07, + "loss": 1.543, + "step": 17316 + }, + { + "epoch": 0.9467627079261375, + "grad_norm": 2.0423221588134766, + "learning_rate": 1.576515310861071e-07, + "loss": 1.4491, + "step": 17317 + }, + { + "epoch": 0.9468173803698591, + "grad_norm": 1.3379491567611694, + "learning_rate": 1.573284970913358e-07, + "loss": 1.7099, + "step": 17318 + }, + { + "epoch": 0.9468720528135807, + "grad_norm": 1.728022575378418, + "learning_rate": 1.5700579176646246e-07, + "loss": 1.435, + "step": 17319 + }, + { + "epoch": 0.9469267252573021, + "grad_norm": 1.583817958831787, + "learning_rate": 1.5668341512226182e-07, + "loss": 1.4782, + "step": 17320 + }, + { + "epoch": 0.9469813977010237, + "grad_norm": 1.4653971195220947, + "learning_rate": 1.563613671695019e-07, + "loss": 1.2599, + "step": 17321 + }, + { + "epoch": 0.9470360701447453, + "grad_norm": 1.465592622756958, + "learning_rate": 1.56039647918933e-07, + "loss": 1.245, + "step": 17322 + }, + { + "epoch": 0.9470907425884668, + "grad_norm": 1.3320273160934448, + "learning_rate": 1.5571825738129987e-07, + "loss": 1.4861, + "step": 17323 + }, + { + "epoch": 0.9471454150321884, + "grad_norm": 1.5515403747558594, + "learning_rate": 1.55397195567335e-07, + "loss": 1.5848, + "step": 17324 + }, + { + "epoch": 0.94720008747591, + "grad_norm": 1.352394700050354, + "learning_rate": 1.5507646248775875e-07, + "loss": 1.4864, + "step": 17325 + }, + { + "epoch": 0.9472547599196315, + "grad_norm": 1.2871367931365967, + "learning_rate": 1.5475605815328142e-07, + "loss": 1.6421, + "step": 17326 + }, + { + "epoch": 0.9473094323633531, + "grad_norm": 1.5342494249343872, + "learning_rate": 1.5443598257460225e-07, + "loss": 1.3995, + "step": 17327 + }, + { + "epoch": 0.9473641048070747, + "grad_norm": 1.360372543334961, + "learning_rate": 1.541162357624082e-07, + "loss": 1.5354, + "step": 17328 + }, + { + "epoch": 0.9474187772507962, + "grad_norm": 1.6214933395385742, + "learning_rate": 1.5379681772737743e-07, + "loss": 1.5207, + "step": 17329 + }, + { + "epoch": 0.9474734496945177, + "grad_norm": 1.7695178985595703, + "learning_rate": 1.5347772848017584e-07, + "loss": 1.4882, + "step": 17330 + }, + { + "epoch": 0.9475281221382392, + "grad_norm": 2.148247241973877, + "learning_rate": 1.5315896803145824e-07, + "loss": 1.4723, + "step": 17331 + }, + { + "epoch": 0.9475827945819608, + "grad_norm": 1.4760686159133911, + "learning_rate": 1.5284053639186947e-07, + "loss": 1.4729, + "step": 17332 + }, + { + "epoch": 0.9476374670256824, + "grad_norm": 1.9926338195800781, + "learning_rate": 1.5252243357204212e-07, + "loss": 1.2335, + "step": 17333 + }, + { + "epoch": 0.9476921394694039, + "grad_norm": 1.6183440685272217, + "learning_rate": 1.5220465958259878e-07, + "loss": 1.4206, + "step": 17334 + }, + { + "epoch": 0.9477468119131255, + "grad_norm": 1.233021855354309, + "learning_rate": 1.5188721443414988e-07, + "loss": 1.3039, + "step": 17335 + }, + { + "epoch": 0.9478014843568471, + "grad_norm": 1.4746825695037842, + "learning_rate": 1.5157009813729585e-07, + "loss": 1.247, + "step": 17336 + }, + { + "epoch": 0.9478561568005686, + "grad_norm": 1.3922550678253174, + "learning_rate": 1.5125331070262706e-07, + "loss": 1.797, + "step": 17337 + }, + { + "epoch": 0.9479108292442902, + "grad_norm": 1.3283474445343018, + "learning_rate": 1.5093685214072173e-07, + "loss": 1.5173, + "step": 17338 + }, + { + "epoch": 0.9479655016880117, + "grad_norm": 1.8185245990753174, + "learning_rate": 1.5062072246214476e-07, + "loss": 1.3903, + "step": 17339 + }, + { + "epoch": 0.9480201741317332, + "grad_norm": 1.3887865543365479, + "learning_rate": 1.5030492167745547e-07, + "loss": 1.3228, + "step": 17340 + }, + { + "epoch": 0.9480748465754548, + "grad_norm": 1.8300206661224365, + "learning_rate": 1.4998944979719765e-07, + "loss": 1.2869, + "step": 17341 + }, + { + "epoch": 0.9481295190191764, + "grad_norm": 1.573165774345398, + "learning_rate": 1.496743068319051e-07, + "loss": 1.3104, + "step": 17342 + }, + { + "epoch": 0.9481841914628979, + "grad_norm": 2.115528106689453, + "learning_rate": 1.4935949279210272e-07, + "loss": 1.5081, + "step": 17343 + }, + { + "epoch": 0.9482388639066195, + "grad_norm": 1.600760817527771, + "learning_rate": 1.490450076883021e-07, + "loss": 1.2854, + "step": 17344 + }, + { + "epoch": 0.948293536350341, + "grad_norm": 1.650588035583496, + "learning_rate": 1.4873085153100485e-07, + "loss": 1.5409, + "step": 17345 + }, + { + "epoch": 0.9483482087940626, + "grad_norm": 1.6446876525878906, + "learning_rate": 1.4841702433070038e-07, + "loss": 1.5082, + "step": 17346 + }, + { + "epoch": 0.9484028812377842, + "grad_norm": 1.304387092590332, + "learning_rate": 1.4810352609787028e-07, + "loss": 1.4149, + "step": 17347 + }, + { + "epoch": 0.9484575536815056, + "grad_norm": 1.4212818145751953, + "learning_rate": 1.477903568429795e-07, + "loss": 1.4964, + "step": 17348 + }, + { + "epoch": 0.9485122261252272, + "grad_norm": 1.765142798423767, + "learning_rate": 1.4747751657648968e-07, + "loss": 1.563, + "step": 17349 + }, + { + "epoch": 0.9485668985689488, + "grad_norm": 1.5986645221710205, + "learning_rate": 1.471650053088436e-07, + "loss": 1.4152, + "step": 17350 + }, + { + "epoch": 0.9486215710126703, + "grad_norm": 1.407766342163086, + "learning_rate": 1.4685282305047956e-07, + "loss": 1.5179, + "step": 17351 + }, + { + "epoch": 0.9486762434563919, + "grad_norm": 1.3294072151184082, + "learning_rate": 1.4654096981182031e-07, + "loss": 1.6799, + "step": 17352 + }, + { + "epoch": 0.9487309159001135, + "grad_norm": 2.0246405601501465, + "learning_rate": 1.462294456032798e-07, + "loss": 1.5583, + "step": 17353 + }, + { + "epoch": 0.948785588343835, + "grad_norm": 1.339263677597046, + "learning_rate": 1.4591825043526075e-07, + "loss": 1.2705, + "step": 17354 + }, + { + "epoch": 0.9488402607875566, + "grad_norm": 1.4827570915222168, + "learning_rate": 1.45607384318156e-07, + "loss": 1.411, + "step": 17355 + }, + { + "epoch": 0.9488949332312782, + "grad_norm": 1.320453405380249, + "learning_rate": 1.4529684726234284e-07, + "loss": 1.4221, + "step": 17356 + }, + { + "epoch": 0.9489496056749996, + "grad_norm": 1.7152891159057617, + "learning_rate": 1.44986639278194e-07, + "loss": 1.4254, + "step": 17357 + }, + { + "epoch": 0.9490042781187212, + "grad_norm": 1.6809172630310059, + "learning_rate": 1.4467676037606682e-07, + "loss": 1.5669, + "step": 17358 + }, + { + "epoch": 0.9490589505624427, + "grad_norm": 1.429702877998352, + "learning_rate": 1.4436721056630853e-07, + "loss": 1.4933, + "step": 17359 + }, + { + "epoch": 0.9491136230061643, + "grad_norm": 1.595205307006836, + "learning_rate": 1.4405798985925533e-07, + "loss": 1.3318, + "step": 17360 + }, + { + "epoch": 0.9491682954498859, + "grad_norm": 1.9094442129135132, + "learning_rate": 1.4374909826523453e-07, + "loss": 1.5075, + "step": 17361 + }, + { + "epoch": 0.9492229678936074, + "grad_norm": 1.8955910205841064, + "learning_rate": 1.4344053579455897e-07, + "loss": 1.31, + "step": 17362 + }, + { + "epoch": 0.949277640337329, + "grad_norm": 1.403849720954895, + "learning_rate": 1.4313230245753374e-07, + "loss": 1.4394, + "step": 17363 + }, + { + "epoch": 0.9493323127810506, + "grad_norm": 1.4926226139068604, + "learning_rate": 1.428243982644506e-07, + "loss": 1.4535, + "step": 17364 + }, + { + "epoch": 0.949386985224772, + "grad_norm": 1.921408772468567, + "learning_rate": 1.4251682322559134e-07, + "loss": 1.4813, + "step": 17365 + }, + { + "epoch": 0.9494416576684936, + "grad_norm": 1.8575806617736816, + "learning_rate": 1.4220957735122663e-07, + "loss": 1.2696, + "step": 17366 + }, + { + "epoch": 0.9494963301122152, + "grad_norm": 1.7742178440093994, + "learning_rate": 1.4190266065161607e-07, + "loss": 1.4422, + "step": 17367 + }, + { + "epoch": 0.9495510025559367, + "grad_norm": 1.4072015285491943, + "learning_rate": 1.4159607313700808e-07, + "loss": 1.7544, + "step": 17368 + }, + { + "epoch": 0.9496056749996583, + "grad_norm": 1.2724138498306274, + "learning_rate": 1.4128981481764115e-07, + "loss": 1.4385, + "step": 17369 + }, + { + "epoch": 0.9496603474433799, + "grad_norm": 1.4009867906570435, + "learning_rate": 1.4098388570374154e-07, + "loss": 1.4281, + "step": 17370 + }, + { + "epoch": 0.9497150198871014, + "grad_norm": 1.6873509883880615, + "learning_rate": 1.4067828580552445e-07, + "loss": 1.4081, + "step": 17371 + }, + { + "epoch": 0.949769692330823, + "grad_norm": 1.7937361001968384, + "learning_rate": 1.4037301513319613e-07, + "loss": 1.2684, + "step": 17372 + }, + { + "epoch": 0.9498243647745445, + "grad_norm": 1.3990260362625122, + "learning_rate": 1.400680736969484e-07, + "loss": 1.2423, + "step": 17373 + }, + { + "epoch": 0.949879037218266, + "grad_norm": 1.2296199798583984, + "learning_rate": 1.397634615069654e-07, + "loss": 1.6329, + "step": 17374 + }, + { + "epoch": 0.9499337096619876, + "grad_norm": 1.548081874847412, + "learning_rate": 1.3945917857341673e-07, + "loss": 1.2204, + "step": 17375 + }, + { + "epoch": 0.9499883821057091, + "grad_norm": 1.4131592512130737, + "learning_rate": 1.3915522490646538e-07, + "loss": 1.5993, + "step": 17376 + }, + { + "epoch": 0.9500430545494307, + "grad_norm": 2.1172988414764404, + "learning_rate": 1.38851600516261e-07, + "loss": 1.594, + "step": 17377 + }, + { + "epoch": 0.9500977269931523, + "grad_norm": 3.0277180671691895, + "learning_rate": 1.3854830541294105e-07, + "loss": 1.3661, + "step": 17378 + }, + { + "epoch": 0.9501523994368738, + "grad_norm": 1.6446828842163086, + "learning_rate": 1.38245339606633e-07, + "loss": 1.4114, + "step": 17379 + }, + { + "epoch": 0.9502070718805954, + "grad_norm": 1.4613579511642456, + "learning_rate": 1.3794270310745538e-07, + "loss": 1.4658, + "step": 17380 + }, + { + "epoch": 0.950261744324317, + "grad_norm": 1.4731755256652832, + "learning_rate": 1.3764039592551125e-07, + "loss": 1.553, + "step": 17381 + }, + { + "epoch": 0.9503164167680385, + "grad_norm": 2.0510451793670654, + "learning_rate": 1.3733841807089921e-07, + "loss": 1.5975, + "step": 17382 + }, + { + "epoch": 0.9503710892117601, + "grad_norm": 1.6728007793426514, + "learning_rate": 1.3703676955370003e-07, + "loss": 1.4986, + "step": 17383 + }, + { + "epoch": 0.9504257616554816, + "grad_norm": 1.8023854494094849, + "learning_rate": 1.3673545038398683e-07, + "loss": 1.4513, + "step": 17384 + }, + { + "epoch": 0.9504804340992031, + "grad_norm": 1.2951269149780273, + "learning_rate": 1.3643446057182264e-07, + "loss": 1.6006, + "step": 17385 + }, + { + "epoch": 0.9505351065429247, + "grad_norm": 1.2563025951385498, + "learning_rate": 1.3613380012725718e-07, + "loss": 1.4374, + "step": 17386 + }, + { + "epoch": 0.9505897789866462, + "grad_norm": 1.3513870239257812, + "learning_rate": 1.3583346906033024e-07, + "loss": 1.6376, + "step": 17387 + }, + { + "epoch": 0.9506444514303678, + "grad_norm": 1.2597215175628662, + "learning_rate": 1.3553346738107044e-07, + "loss": 1.423, + "step": 17388 + }, + { + "epoch": 0.9506991238740894, + "grad_norm": 1.339146375656128, + "learning_rate": 1.352337950994964e-07, + "loss": 1.5414, + "step": 17389 + }, + { + "epoch": 0.9507537963178109, + "grad_norm": 1.659827470779419, + "learning_rate": 1.3493445222561353e-07, + "loss": 1.5432, + "step": 17390 + }, + { + "epoch": 0.9508084687615325, + "grad_norm": 1.1527153253555298, + "learning_rate": 1.346354387694193e-07, + "loss": 1.4715, + "step": 17391 + }, + { + "epoch": 0.9508631412052541, + "grad_norm": 1.367820382118225, + "learning_rate": 1.3433675474089803e-07, + "loss": 1.6856, + "step": 17392 + }, + { + "epoch": 0.9509178136489755, + "grad_norm": 1.2704108953475952, + "learning_rate": 1.3403840015002168e-07, + "loss": 1.7637, + "step": 17393 + }, + { + "epoch": 0.9509724860926971, + "grad_norm": 1.450965404510498, + "learning_rate": 1.3374037500675452e-07, + "loss": 1.5322, + "step": 17394 + }, + { + "epoch": 0.9510271585364187, + "grad_norm": 1.8761364221572876, + "learning_rate": 1.334426793210486e-07, + "loss": 1.3337, + "step": 17395 + }, + { + "epoch": 0.9510818309801402, + "grad_norm": 1.4047911167144775, + "learning_rate": 1.3314531310284485e-07, + "loss": 1.3984, + "step": 17396 + }, + { + "epoch": 0.9511365034238618, + "grad_norm": 1.2362747192382812, + "learning_rate": 1.3284827636207198e-07, + "loss": 1.4289, + "step": 17397 + }, + { + "epoch": 0.9511911758675834, + "grad_norm": 2.1167960166931152, + "learning_rate": 1.3255156910864874e-07, + "loss": 1.4615, + "step": 17398 + }, + { + "epoch": 0.9512458483113049, + "grad_norm": 1.1284332275390625, + "learning_rate": 1.322551913524839e-07, + "loss": 1.5641, + "step": 17399 + }, + { + "epoch": 0.9513005207550265, + "grad_norm": 2.258600950241089, + "learning_rate": 1.319591431034728e-07, + "loss": 1.5057, + "step": 17400 + }, + { + "epoch": 0.951355193198748, + "grad_norm": 1.5050313472747803, + "learning_rate": 1.3166342437150204e-07, + "loss": 1.7807, + "step": 17401 + }, + { + "epoch": 0.9514098656424695, + "grad_norm": 1.4550623893737793, + "learning_rate": 1.3136803516644704e-07, + "loss": 1.5107, + "step": 17402 + }, + { + "epoch": 0.9514645380861911, + "grad_norm": 1.594183325767517, + "learning_rate": 1.310729754981699e-07, + "loss": 1.3423, + "step": 17403 + }, + { + "epoch": 0.9515192105299126, + "grad_norm": 1.5121335983276367, + "learning_rate": 1.3077824537652494e-07, + "loss": 1.2503, + "step": 17404 + }, + { + "epoch": 0.9515738829736342, + "grad_norm": 1.6314623355865479, + "learning_rate": 1.3048384481135323e-07, + "loss": 1.5045, + "step": 17405 + }, + { + "epoch": 0.9516285554173558, + "grad_norm": 1.6012457609176636, + "learning_rate": 1.3018977381248576e-07, + "loss": 1.3505, + "step": 17406 + }, + { + "epoch": 0.9516832278610773, + "grad_norm": 2.021503210067749, + "learning_rate": 1.2989603238974024e-07, + "loss": 1.6292, + "step": 17407 + }, + { + "epoch": 0.9517379003047989, + "grad_norm": 2.3841521739959717, + "learning_rate": 1.2960262055292884e-07, + "loss": 1.5157, + "step": 17408 + }, + { + "epoch": 0.9517925727485205, + "grad_norm": 0.9999831318855286, + "learning_rate": 1.2930953831184701e-07, + "loss": 1.5427, + "step": 17409 + }, + { + "epoch": 0.951847245192242, + "grad_norm": 1.9786796569824219, + "learning_rate": 1.290167856762825e-07, + "loss": 1.3976, + "step": 17410 + }, + { + "epoch": 0.9519019176359635, + "grad_norm": 1.483170509338379, + "learning_rate": 1.2872436265600973e-07, + "loss": 1.1697, + "step": 17411 + }, + { + "epoch": 0.9519565900796851, + "grad_norm": 1.7284975051879883, + "learning_rate": 1.2843226926079532e-07, + "loss": 1.2619, + "step": 17412 + }, + { + "epoch": 0.9520112625234066, + "grad_norm": 1.6090118885040283, + "learning_rate": 1.2814050550039148e-07, + "loss": 1.4819, + "step": 17413 + }, + { + "epoch": 0.9520659349671282, + "grad_norm": 2.2919604778289795, + "learning_rate": 1.278490713845404e-07, + "loss": 1.4618, + "step": 17414 + }, + { + "epoch": 0.9521206074108497, + "grad_norm": 1.5119901895523071, + "learning_rate": 1.275579669229743e-07, + "loss": 1.2107, + "step": 17415 + }, + { + "epoch": 0.9521752798545713, + "grad_norm": 2.495443105697632, + "learning_rate": 1.2726719212541538e-07, + "loss": 1.1276, + "step": 17416 + }, + { + "epoch": 0.9522299522982929, + "grad_norm": 1.499248743057251, + "learning_rate": 1.2697674700157148e-07, + "loss": 1.4786, + "step": 17417 + }, + { + "epoch": 0.9522846247420144, + "grad_norm": 1.9850866794586182, + "learning_rate": 1.2668663156114036e-07, + "loss": 1.1608, + "step": 17418 + }, + { + "epoch": 0.952339297185736, + "grad_norm": 1.4536397457122803, + "learning_rate": 1.2639684581381317e-07, + "loss": 1.3206, + "step": 17419 + }, + { + "epoch": 0.9523939696294575, + "grad_norm": 1.4965218305587769, + "learning_rate": 1.261073897692633e-07, + "loss": 1.6651, + "step": 17420 + }, + { + "epoch": 0.952448642073179, + "grad_norm": 1.265441656112671, + "learning_rate": 1.258182634371574e-07, + "loss": 1.4624, + "step": 17421 + }, + { + "epoch": 0.9525033145169006, + "grad_norm": 1.641069769859314, + "learning_rate": 1.2552946682715116e-07, + "loss": 1.3711, + "step": 17422 + }, + { + "epoch": 0.9525579869606222, + "grad_norm": 1.7281595468521118, + "learning_rate": 1.2524099994888683e-07, + "loss": 1.3111, + "step": 17423 + }, + { + "epoch": 0.9526126594043437, + "grad_norm": 1.4824321269989014, + "learning_rate": 1.249528628119967e-07, + "loss": 1.3559, + "step": 17424 + }, + { + "epoch": 0.9526673318480653, + "grad_norm": 1.348772406578064, + "learning_rate": 1.2466505542610419e-07, + "loss": 1.5004, + "step": 17425 + }, + { + "epoch": 0.9527220042917869, + "grad_norm": 1.6975505352020264, + "learning_rate": 1.2437757780081717e-07, + "loss": 1.3324, + "step": 17426 + }, + { + "epoch": 0.9527766767355084, + "grad_norm": 1.7124557495117188, + "learning_rate": 1.2409042994573795e-07, + "loss": 1.8053, + "step": 17427 + }, + { + "epoch": 0.95283134917923, + "grad_norm": 1.6055623292922974, + "learning_rate": 1.238036118704533e-07, + "loss": 1.4225, + "step": 17428 + }, + { + "epoch": 0.9528860216229514, + "grad_norm": 1.353237509727478, + "learning_rate": 1.2351712358454115e-07, + "loss": 1.508, + "step": 17429 + }, + { + "epoch": 0.952940694066673, + "grad_norm": 1.5042057037353516, + "learning_rate": 1.2323096509756827e-07, + "loss": 1.2934, + "step": 17430 + }, + { + "epoch": 0.9529953665103946, + "grad_norm": 1.5574365854263306, + "learning_rate": 1.2294513641909034e-07, + "loss": 1.5466, + "step": 17431 + }, + { + "epoch": 0.9530500389541161, + "grad_norm": 1.4984123706817627, + "learning_rate": 1.2265963755865083e-07, + "loss": 1.3204, + "step": 17432 + }, + { + "epoch": 0.9531047113978377, + "grad_norm": 1.4976236820220947, + "learning_rate": 1.2237446852578438e-07, + "loss": 1.5243, + "step": 17433 + }, + { + "epoch": 0.9531593838415593, + "grad_norm": 1.1947102546691895, + "learning_rate": 1.2208962933001333e-07, + "loss": 1.4979, + "step": 17434 + }, + { + "epoch": 0.9532140562852808, + "grad_norm": 1.3157474994659424, + "learning_rate": 1.2180511998084788e-07, + "loss": 1.4369, + "step": 17435 + }, + { + "epoch": 0.9532687287290024, + "grad_norm": 1.6773864030838013, + "learning_rate": 1.215209404877904e-07, + "loss": 1.5192, + "step": 17436 + }, + { + "epoch": 0.953323401172724, + "grad_norm": 1.9420735836029053, + "learning_rate": 1.2123709086032887e-07, + "loss": 1.4121, + "step": 17437 + }, + { + "epoch": 0.9533780736164454, + "grad_norm": 1.258547306060791, + "learning_rate": 1.2095357110794238e-07, + "loss": 1.5154, + "step": 17438 + }, + { + "epoch": 0.953432746060167, + "grad_norm": 1.4150711297988892, + "learning_rate": 1.2067038124009778e-07, + "loss": 1.3, + "step": 17439 + }, + { + "epoch": 0.9534874185038886, + "grad_norm": 1.7101041078567505, + "learning_rate": 1.2038752126625087e-07, + "loss": 1.245, + "step": 17440 + }, + { + "epoch": 0.9535420909476101, + "grad_norm": 1.5782465934753418, + "learning_rate": 1.2010499119584963e-07, + "loss": 1.4511, + "step": 17441 + }, + { + "epoch": 0.9535967633913317, + "grad_norm": 1.278324842453003, + "learning_rate": 1.198227910383254e-07, + "loss": 1.5582, + "step": 17442 + }, + { + "epoch": 0.9536514358350532, + "grad_norm": 1.5620100498199463, + "learning_rate": 1.1954092080310288e-07, + "loss": 1.5443, + "step": 17443 + }, + { + "epoch": 0.9537061082787748, + "grad_norm": 1.5825889110565186, + "learning_rate": 1.192593804995945e-07, + "loss": 1.487, + "step": 17444 + }, + { + "epoch": 0.9537607807224964, + "grad_norm": 1.569441795349121, + "learning_rate": 1.189781701372017e-07, + "loss": 1.2704, + "step": 17445 + }, + { + "epoch": 0.9538154531662179, + "grad_norm": 1.477103590965271, + "learning_rate": 1.1869728972531247e-07, + "loss": 1.4145, + "step": 17446 + }, + { + "epoch": 0.9538701256099394, + "grad_norm": 1.3223718404769897, + "learning_rate": 1.1841673927331043e-07, + "loss": 1.3118, + "step": 17447 + }, + { + "epoch": 0.953924798053661, + "grad_norm": 1.8513416051864624, + "learning_rate": 1.1813651879056031e-07, + "loss": 1.3439, + "step": 17448 + }, + { + "epoch": 0.9539794704973825, + "grad_norm": 1.9842098951339722, + "learning_rate": 1.1785662828641908e-07, + "loss": 1.6339, + "step": 17449 + }, + { + "epoch": 0.9540341429411041, + "grad_norm": 1.6520302295684814, + "learning_rate": 1.1757706777023592e-07, + "loss": 1.3592, + "step": 17450 + }, + { + "epoch": 0.9540888153848257, + "grad_norm": 1.626137614250183, + "learning_rate": 1.1729783725134336e-07, + "loss": 1.3777, + "step": 17451 + }, + { + "epoch": 0.9541434878285472, + "grad_norm": 1.48982572555542, + "learning_rate": 1.1701893673906729e-07, + "loss": 1.4247, + "step": 17452 + }, + { + "epoch": 0.9541981602722688, + "grad_norm": 1.7009061574935913, + "learning_rate": 1.1674036624272023e-07, + "loss": 1.6315, + "step": 17453 + }, + { + "epoch": 0.9542528327159904, + "grad_norm": 1.637553334236145, + "learning_rate": 1.1646212577160254e-07, + "loss": 1.372, + "step": 17454 + }, + { + "epoch": 0.9543075051597119, + "grad_norm": 1.5593218803405762, + "learning_rate": 1.1618421533500901e-07, + "loss": 1.4417, + "step": 17455 + }, + { + "epoch": 0.9543621776034334, + "grad_norm": 1.6941654682159424, + "learning_rate": 1.1590663494221665e-07, + "loss": 1.3912, + "step": 17456 + }, + { + "epoch": 0.9544168500471549, + "grad_norm": 1.3892732858657837, + "learning_rate": 1.1562938460249473e-07, + "loss": 1.4942, + "step": 17457 + }, + { + "epoch": 0.9544715224908765, + "grad_norm": 1.4147331714630127, + "learning_rate": 1.153524643251025e-07, + "loss": 1.3383, + "step": 17458 + }, + { + "epoch": 0.9545261949345981, + "grad_norm": 1.6097711324691772, + "learning_rate": 1.15075874119287e-07, + "loss": 1.4913, + "step": 17459 + }, + { + "epoch": 0.9545808673783196, + "grad_norm": 1.585044026374817, + "learning_rate": 1.1479961399428308e-07, + "loss": 1.3307, + "step": 17460 + }, + { + "epoch": 0.9546355398220412, + "grad_norm": 1.5722805261611938, + "learning_rate": 1.1452368395931668e-07, + "loss": 1.4735, + "step": 17461 + }, + { + "epoch": 0.9546902122657628, + "grad_norm": 1.5457483530044556, + "learning_rate": 1.1424808402360266e-07, + "loss": 1.3773, + "step": 17462 + }, + { + "epoch": 0.9547448847094843, + "grad_norm": 1.4728537797927856, + "learning_rate": 1.1397281419634143e-07, + "loss": 1.3702, + "step": 17463 + }, + { + "epoch": 0.9547995571532059, + "grad_norm": 1.263299822807312, + "learning_rate": 1.1369787448672675e-07, + "loss": 1.5196, + "step": 17464 + }, + { + "epoch": 0.9548542295969275, + "grad_norm": 1.779106616973877, + "learning_rate": 1.1342326490393796e-07, + "loss": 1.535, + "step": 17465 + }, + { + "epoch": 0.9549089020406489, + "grad_norm": 2.076103925704956, + "learning_rate": 1.1314898545714769e-07, + "loss": 1.4473, + "step": 17466 + }, + { + "epoch": 0.9549635744843705, + "grad_norm": 1.47579026222229, + "learning_rate": 1.1287503615551199e-07, + "loss": 1.3578, + "step": 17467 + }, + { + "epoch": 0.9550182469280921, + "grad_norm": 1.7337775230407715, + "learning_rate": 1.1260141700817906e-07, + "loss": 1.3781, + "step": 17468 + }, + { + "epoch": 0.9550729193718136, + "grad_norm": 1.6150577068328857, + "learning_rate": 1.1232812802428716e-07, + "loss": 1.3221, + "step": 17469 + }, + { + "epoch": 0.9551275918155352, + "grad_norm": 1.6464455127716064, + "learning_rate": 1.1205516921296122e-07, + "loss": 1.5728, + "step": 17470 + }, + { + "epoch": 0.9551822642592567, + "grad_norm": 1.7635663747787476, + "learning_rate": 1.1178254058331616e-07, + "loss": 1.3236, + "step": 17471 + }, + { + "epoch": 0.9552369367029783, + "grad_norm": 1.3918970823287964, + "learning_rate": 1.1151024214445472e-07, + "loss": 1.2309, + "step": 17472 + }, + { + "epoch": 0.9552916091466999, + "grad_norm": 1.4513752460479736, + "learning_rate": 1.1123827390547071e-07, + "loss": 1.3652, + "step": 17473 + }, + { + "epoch": 0.9553462815904213, + "grad_norm": 1.536668062210083, + "learning_rate": 1.1096663587544576e-07, + "loss": 1.6198, + "step": 17474 + }, + { + "epoch": 0.9554009540341429, + "grad_norm": 1.2861177921295166, + "learning_rate": 1.106953280634504e-07, + "loss": 1.3881, + "step": 17475 + }, + { + "epoch": 0.9554556264778645, + "grad_norm": 1.2761136293411255, + "learning_rate": 1.1042435047854295e-07, + "loss": 1.4881, + "step": 17476 + }, + { + "epoch": 0.955510298921586, + "grad_norm": 1.373596429824829, + "learning_rate": 1.1015370312977392e-07, + "loss": 1.2879, + "step": 17477 + }, + { + "epoch": 0.9555649713653076, + "grad_norm": 1.4725492000579834, + "learning_rate": 1.0988338602618053e-07, + "loss": 1.0794, + "step": 17478 + }, + { + "epoch": 0.9556196438090292, + "grad_norm": 1.325976848602295, + "learning_rate": 1.096133991767867e-07, + "loss": 1.4537, + "step": 17479 + }, + { + "epoch": 0.9556743162527507, + "grad_norm": 1.4852286577224731, + "learning_rate": 1.0934374259061187e-07, + "loss": 1.3709, + "step": 17480 + }, + { + "epoch": 0.9557289886964723, + "grad_norm": 1.6261227130889893, + "learning_rate": 1.0907441627665883e-07, + "loss": 1.194, + "step": 17481 + }, + { + "epoch": 0.9557836611401939, + "grad_norm": 1.9719200134277344, + "learning_rate": 1.0880542024391927e-07, + "loss": 1.4264, + "step": 17482 + }, + { + "epoch": 0.9558383335839153, + "grad_norm": 1.4547010660171509, + "learning_rate": 1.0853675450137824e-07, + "loss": 1.5044, + "step": 17483 + }, + { + "epoch": 0.9558930060276369, + "grad_norm": 1.5976499319076538, + "learning_rate": 1.0826841905800522e-07, + "loss": 1.3272, + "step": 17484 + }, + { + "epoch": 0.9559476784713584, + "grad_norm": 1.5677334070205688, + "learning_rate": 1.0800041392276194e-07, + "loss": 1.447, + "step": 17485 + }, + { + "epoch": 0.95600235091508, + "grad_norm": 1.3226524591445923, + "learning_rate": 1.0773273910459681e-07, + "loss": 1.4982, + "step": 17486 + }, + { + "epoch": 0.9560570233588016, + "grad_norm": 1.2783547639846802, + "learning_rate": 1.0746539461244932e-07, + "loss": 1.3393, + "step": 17487 + }, + { + "epoch": 0.9561116958025231, + "grad_norm": 1.23164963722229, + "learning_rate": 1.0719838045524456e-07, + "loss": 1.4751, + "step": 17488 + }, + { + "epoch": 0.9561663682462447, + "grad_norm": 1.3771504163742065, + "learning_rate": 1.0693169664190095e-07, + "loss": 1.2782, + "step": 17489 + }, + { + "epoch": 0.9562210406899663, + "grad_norm": 1.6860102415084839, + "learning_rate": 1.0666534318132249e-07, + "loss": 1.336, + "step": 17490 + }, + { + "epoch": 0.9562757131336878, + "grad_norm": 1.5990203619003296, + "learning_rate": 1.0639932008240428e-07, + "loss": 1.3343, + "step": 17491 + }, + { + "epoch": 0.9563303855774093, + "grad_norm": 1.5797473192214966, + "learning_rate": 1.061336273540281e-07, + "loss": 1.3868, + "step": 17492 + }, + { + "epoch": 0.9563850580211309, + "grad_norm": 1.6512863636016846, + "learning_rate": 1.0586826500506686e-07, + "loss": 1.5473, + "step": 17493 + }, + { + "epoch": 0.9564397304648524, + "grad_norm": 1.60399329662323, + "learning_rate": 1.0560323304438125e-07, + "loss": 1.2578, + "step": 17494 + }, + { + "epoch": 0.956494402908574, + "grad_norm": 1.5876487493515015, + "learning_rate": 1.0533853148082197e-07, + "loss": 1.1514, + "step": 17495 + }, + { + "epoch": 0.9565490753522956, + "grad_norm": 1.5750049352645874, + "learning_rate": 1.0507416032322748e-07, + "loss": 1.5007, + "step": 17496 + }, + { + "epoch": 0.9566037477960171, + "grad_norm": 1.4788877964019775, + "learning_rate": 1.048101195804263e-07, + "loss": 1.3812, + "step": 17497 + }, + { + "epoch": 0.9566584202397387, + "grad_norm": 1.6052488088607788, + "learning_rate": 1.0454640926123583e-07, + "loss": 1.6141, + "step": 17498 + }, + { + "epoch": 0.9567130926834602, + "grad_norm": 1.4152722358703613, + "learning_rate": 1.0428302937445899e-07, + "loss": 1.5125, + "step": 17499 + }, + { + "epoch": 0.9567677651271818, + "grad_norm": 1.7259873151779175, + "learning_rate": 1.0401997992889434e-07, + "loss": 1.3245, + "step": 17500 + }, + { + "epoch": 0.9568224375709034, + "grad_norm": 1.4472798109054565, + "learning_rate": 1.0375726093332484e-07, + "loss": 1.3163, + "step": 17501 + }, + { + "epoch": 0.9568771100146248, + "grad_norm": 1.4656856060028076, + "learning_rate": 1.0349487239652123e-07, + "loss": 1.3023, + "step": 17502 + }, + { + "epoch": 0.9569317824583464, + "grad_norm": 1.7374745607376099, + "learning_rate": 1.0323281432724763e-07, + "loss": 1.5893, + "step": 17503 + }, + { + "epoch": 0.956986454902068, + "grad_norm": 1.270398497581482, + "learning_rate": 1.0297108673425371e-07, + "loss": 1.5533, + "step": 17504 + }, + { + "epoch": 0.9570411273457895, + "grad_norm": 1.5810049772262573, + "learning_rate": 1.0270968962627914e-07, + "loss": 1.6123, + "step": 17505 + }, + { + "epoch": 0.9570957997895111, + "grad_norm": 1.518706202507019, + "learning_rate": 1.024486230120525e-07, + "loss": 1.6239, + "step": 17506 + }, + { + "epoch": 0.9571504722332327, + "grad_norm": 1.7646400928497314, + "learning_rate": 1.0218788690029124e-07, + "loss": 1.3838, + "step": 17507 + }, + { + "epoch": 0.9572051446769542, + "grad_norm": 1.5504934787750244, + "learning_rate": 1.0192748129970287e-07, + "loss": 1.4584, + "step": 17508 + }, + { + "epoch": 0.9572598171206758, + "grad_norm": 1.317823886871338, + "learning_rate": 1.0166740621898374e-07, + "loss": 1.4214, + "step": 17509 + }, + { + "epoch": 0.9573144895643974, + "grad_norm": 1.5029209852218628, + "learning_rate": 1.014076616668147e-07, + "loss": 1.3642, + "step": 17510 + }, + { + "epoch": 0.9573691620081188, + "grad_norm": 1.8722236156463623, + "learning_rate": 1.0114824765187326e-07, + "loss": 1.2233, + "step": 17511 + }, + { + "epoch": 0.9574238344518404, + "grad_norm": 1.4272414445877075, + "learning_rate": 1.0088916418282024e-07, + "loss": 1.405, + "step": 17512 + }, + { + "epoch": 0.9574785068955619, + "grad_norm": 1.4394962787628174, + "learning_rate": 1.0063041126830542e-07, + "loss": 1.4249, + "step": 17513 + }, + { + "epoch": 0.9575331793392835, + "grad_norm": 2.2319905757904053, + "learning_rate": 1.0037198891697298e-07, + "loss": 1.53, + "step": 17514 + }, + { + "epoch": 0.9575878517830051, + "grad_norm": 1.7697017192840576, + "learning_rate": 1.0011389713744824e-07, + "loss": 1.4745, + "step": 17515 + }, + { + "epoch": 0.9576425242267266, + "grad_norm": 1.3787246942520142, + "learning_rate": 9.985613593835319e-08, + "loss": 1.5572, + "step": 17516 + }, + { + "epoch": 0.9576971966704482, + "grad_norm": 1.6134051084518433, + "learning_rate": 9.959870532829208e-08, + "loss": 1.5558, + "step": 17517 + }, + { + "epoch": 0.9577518691141698, + "grad_norm": 1.4087110757827759, + "learning_rate": 9.934160531586134e-08, + "loss": 1.216, + "step": 17518 + }, + { + "epoch": 0.9578065415578912, + "grad_norm": 1.5777207612991333, + "learning_rate": 9.908483590964746e-08, + "loss": 1.4874, + "step": 17519 + }, + { + "epoch": 0.9578612140016128, + "grad_norm": 1.5105606317520142, + "learning_rate": 9.882839711822468e-08, + "loss": 1.4784, + "step": 17520 + }, + { + "epoch": 0.9579158864453344, + "grad_norm": 1.587235450744629, + "learning_rate": 9.857228895015503e-08, + "loss": 1.735, + "step": 17521 + }, + { + "epoch": 0.9579705588890559, + "grad_norm": 1.5979933738708496, + "learning_rate": 9.831651141399167e-08, + "loss": 1.4588, + "step": 17522 + }, + { + "epoch": 0.9580252313327775, + "grad_norm": 1.2026559114456177, + "learning_rate": 9.806106451827557e-08, + "loss": 1.5106, + "step": 17523 + }, + { + "epoch": 0.9580799037764991, + "grad_norm": 1.431357502937317, + "learning_rate": 9.780594827153434e-08, + "loss": 1.2377, + "step": 17524 + }, + { + "epoch": 0.9581345762202206, + "grad_norm": 1.2817635536193848, + "learning_rate": 9.755116268229003e-08, + "loss": 1.5602, + "step": 17525 + }, + { + "epoch": 0.9581892486639422, + "grad_norm": 1.3530820608139038, + "learning_rate": 9.72967077590492e-08, + "loss": 1.5578, + "step": 17526 + }, + { + "epoch": 0.9582439211076638, + "grad_norm": 1.3683754205703735, + "learning_rate": 9.704258351030838e-08, + "loss": 1.5853, + "step": 17527 + }, + { + "epoch": 0.9582985935513852, + "grad_norm": 1.4115047454833984, + "learning_rate": 9.678878994455298e-08, + "loss": 1.48, + "step": 17528 + }, + { + "epoch": 0.9583532659951068, + "grad_norm": 1.3010233640670776, + "learning_rate": 9.653532707025959e-08, + "loss": 1.2823, + "step": 17529 + }, + { + "epoch": 0.9584079384388283, + "grad_norm": 1.6292365789413452, + "learning_rate": 9.628219489589141e-08, + "loss": 1.5643, + "step": 17530 + }, + { + "epoch": 0.9584626108825499, + "grad_norm": 1.472286581993103, + "learning_rate": 9.602939342989948e-08, + "loss": 1.5807, + "step": 17531 + }, + { + "epoch": 0.9585172833262715, + "grad_norm": 1.6350593566894531, + "learning_rate": 9.577692268072702e-08, + "loss": 1.274, + "step": 17532 + }, + { + "epoch": 0.958571955769993, + "grad_norm": 1.4535235166549683, + "learning_rate": 9.552478265680287e-08, + "loss": 1.3973, + "step": 17533 + }, + { + "epoch": 0.9586266282137146, + "grad_norm": 1.437073826789856, + "learning_rate": 9.527297336654917e-08, + "loss": 1.676, + "step": 17534 + }, + { + "epoch": 0.9586813006574362, + "grad_norm": 1.2420940399169922, + "learning_rate": 9.502149481837252e-08, + "loss": 1.4664, + "step": 17535 + }, + { + "epoch": 0.9587359731011577, + "grad_norm": 1.67424476146698, + "learning_rate": 9.477034702067067e-08, + "loss": 1.3638, + "step": 17536 + }, + { + "epoch": 0.9587906455448792, + "grad_norm": 1.3405845165252686, + "learning_rate": 9.451952998183022e-08, + "loss": 1.4858, + "step": 17537 + }, + { + "epoch": 0.9588453179886008, + "grad_norm": 1.5659236907958984, + "learning_rate": 9.426904371022672e-08, + "loss": 1.5721, + "step": 17538 + }, + { + "epoch": 0.9588999904323223, + "grad_norm": 1.4551078081130981, + "learning_rate": 9.401888821422566e-08, + "loss": 1.2679, + "step": 17539 + }, + { + "epoch": 0.9589546628760439, + "grad_norm": 2.1041064262390137, + "learning_rate": 9.376906350217819e-08, + "loss": 1.3026, + "step": 17540 + }, + { + "epoch": 0.9590093353197655, + "grad_norm": 1.2129281759262085, + "learning_rate": 9.351956958242648e-08, + "loss": 1.5585, + "step": 17541 + }, + { + "epoch": 0.959064007763487, + "grad_norm": 1.2686126232147217, + "learning_rate": 9.327040646330388e-08, + "loss": 1.4726, + "step": 17542 + }, + { + "epoch": 0.9591186802072086, + "grad_norm": 1.4263585805892944, + "learning_rate": 9.302157415312929e-08, + "loss": 1.3478, + "step": 17543 + }, + { + "epoch": 0.9591733526509301, + "grad_norm": 1.535187005996704, + "learning_rate": 9.277307266021052e-08, + "loss": 1.423, + "step": 17544 + }, + { + "epoch": 0.9592280250946517, + "grad_norm": 1.3171050548553467, + "learning_rate": 9.252490199284758e-08, + "loss": 1.4702, + "step": 17545 + }, + { + "epoch": 0.9592826975383733, + "grad_norm": 1.5836611986160278, + "learning_rate": 9.227706215932719e-08, + "loss": 1.419, + "step": 17546 + }, + { + "epoch": 0.9593373699820947, + "grad_norm": 1.641352653503418, + "learning_rate": 9.202955316792384e-08, + "loss": 1.1473, + "step": 17547 + }, + { + "epoch": 0.9593920424258163, + "grad_norm": 1.6358553171157837, + "learning_rate": 9.178237502690423e-08, + "loss": 1.4679, + "step": 17548 + }, + { + "epoch": 0.9594467148695379, + "grad_norm": 1.7531620264053345, + "learning_rate": 9.153552774452179e-08, + "loss": 1.4165, + "step": 17549 + }, + { + "epoch": 0.9595013873132594, + "grad_norm": 1.3508899211883545, + "learning_rate": 9.12890113290188e-08, + "loss": 1.4749, + "step": 17550 + }, + { + "epoch": 0.959556059756981, + "grad_norm": 1.4172643423080444, + "learning_rate": 9.104282578862644e-08, + "loss": 1.4582, + "step": 17551 + }, + { + "epoch": 0.9596107322007026, + "grad_norm": 1.4224127531051636, + "learning_rate": 9.079697113156705e-08, + "loss": 1.1868, + "step": 17552 + }, + { + "epoch": 0.9596654046444241, + "grad_norm": 1.5635125637054443, + "learning_rate": 9.05514473660496e-08, + "loss": 1.2019, + "step": 17553 + }, + { + "epoch": 0.9597200770881457, + "grad_norm": 1.4237011671066284, + "learning_rate": 9.030625450027198e-08, + "loss": 1.4655, + "step": 17554 + }, + { + "epoch": 0.9597747495318673, + "grad_norm": 1.4242677688598633, + "learning_rate": 9.006139254242319e-08, + "loss": 1.6246, + "step": 17555 + }, + { + "epoch": 0.9598294219755887, + "grad_norm": 2.0583603382110596, + "learning_rate": 8.981686150067781e-08, + "loss": 1.4749, + "step": 17556 + }, + { + "epoch": 0.9598840944193103, + "grad_norm": 1.2938255071640015, + "learning_rate": 8.957266138320375e-08, + "loss": 1.51, + "step": 17557 + }, + { + "epoch": 0.9599387668630318, + "grad_norm": 1.486434817314148, + "learning_rate": 8.932879219815227e-08, + "loss": 1.4868, + "step": 17558 + }, + { + "epoch": 0.9599934393067534, + "grad_norm": 1.5125914812088013, + "learning_rate": 8.908525395367018e-08, + "loss": 1.5058, + "step": 17559 + }, + { + "epoch": 0.960048111750475, + "grad_norm": 1.6498768329620361, + "learning_rate": 8.884204665788765e-08, + "loss": 1.3569, + "step": 17560 + }, + { + "epoch": 0.9601027841941965, + "grad_norm": 1.3121291399002075, + "learning_rate": 8.859917031892595e-08, + "loss": 1.3168, + "step": 17561 + }, + { + "epoch": 0.9601574566379181, + "grad_norm": 1.6974018812179565, + "learning_rate": 8.835662494489638e-08, + "loss": 1.4653, + "step": 17562 + }, + { + "epoch": 0.9602121290816397, + "grad_norm": 1.6848756074905396, + "learning_rate": 8.81144105438958e-08, + "loss": 1.4611, + "step": 17563 + }, + { + "epoch": 0.9602668015253611, + "grad_norm": 1.7203781604766846, + "learning_rate": 8.787252712401662e-08, + "loss": 1.4669, + "step": 17564 + }, + { + "epoch": 0.9603214739690827, + "grad_norm": 1.8379979133605957, + "learning_rate": 8.763097469333237e-08, + "loss": 1.3642, + "step": 17565 + }, + { + "epoch": 0.9603761464128043, + "grad_norm": 1.523127794265747, + "learning_rate": 8.738975325990884e-08, + "loss": 1.5674, + "step": 17566 + }, + { + "epoch": 0.9604308188565258, + "grad_norm": 1.4833359718322754, + "learning_rate": 8.714886283180291e-08, + "loss": 1.5465, + "step": 17567 + }, + { + "epoch": 0.9604854913002474, + "grad_norm": 1.4888908863067627, + "learning_rate": 8.690830341705814e-08, + "loss": 1.3241, + "step": 17568 + }, + { + "epoch": 0.960540163743969, + "grad_norm": 1.5138806104660034, + "learning_rate": 8.6668075023707e-08, + "loss": 1.5335, + "step": 17569 + }, + { + "epoch": 0.9605948361876905, + "grad_norm": 1.3438795804977417, + "learning_rate": 8.642817765977086e-08, + "loss": 1.5397, + "step": 17570 + }, + { + "epoch": 0.9606495086314121, + "grad_norm": 1.4791030883789062, + "learning_rate": 8.618861133326106e-08, + "loss": 1.2735, + "step": 17571 + }, + { + "epoch": 0.9607041810751336, + "grad_norm": 1.6751638650894165, + "learning_rate": 8.594937605217568e-08, + "loss": 1.3927, + "step": 17572 + }, + { + "epoch": 0.9607588535188551, + "grad_norm": 1.243960976600647, + "learning_rate": 8.571047182450609e-08, + "loss": 1.4707, + "step": 17573 + }, + { + "epoch": 0.9608135259625767, + "grad_norm": 2.316892623901367, + "learning_rate": 8.547189865822814e-08, + "loss": 1.2212, + "step": 17574 + }, + { + "epoch": 0.9608681984062982, + "grad_norm": 1.859010934829712, + "learning_rate": 8.523365656130767e-08, + "loss": 1.3996, + "step": 17575 + }, + { + "epoch": 0.9609228708500198, + "grad_norm": 1.244596004486084, + "learning_rate": 8.499574554170276e-08, + "loss": 1.5536, + "step": 17576 + }, + { + "epoch": 0.9609775432937414, + "grad_norm": 1.6233593225479126, + "learning_rate": 8.475816560735484e-08, + "loss": 1.7577, + "step": 17577 + }, + { + "epoch": 0.9610322157374629, + "grad_norm": 1.8501490354537964, + "learning_rate": 8.452091676619978e-08, + "loss": 1.3827, + "step": 17578 + }, + { + "epoch": 0.9610868881811845, + "grad_norm": 1.6670070886611938, + "learning_rate": 8.428399902615791e-08, + "loss": 1.113, + "step": 17579 + }, + { + "epoch": 0.9611415606249061, + "grad_norm": 2.1240956783294678, + "learning_rate": 8.404741239514181e-08, + "loss": 1.5191, + "step": 17580 + }, + { + "epoch": 0.9611962330686276, + "grad_norm": 1.2315906286239624, + "learning_rate": 8.381115688105068e-08, + "loss": 1.4678, + "step": 17581 + }, + { + "epoch": 0.9612509055123492, + "grad_norm": 1.4930880069732666, + "learning_rate": 8.35752324917749e-08, + "loss": 1.6258, + "step": 17582 + }, + { + "epoch": 0.9613055779560707, + "grad_norm": 1.9182617664337158, + "learning_rate": 8.333963923519039e-08, + "loss": 1.6569, + "step": 17583 + }, + { + "epoch": 0.9613602503997922, + "grad_norm": 1.2816792726516724, + "learning_rate": 8.310437711916641e-08, + "loss": 1.5355, + "step": 17584 + }, + { + "epoch": 0.9614149228435138, + "grad_norm": 1.8052812814712524, + "learning_rate": 8.286944615155778e-08, + "loss": 1.5422, + "step": 17585 + }, + { + "epoch": 0.9614695952872353, + "grad_norm": 1.5567559003829956, + "learning_rate": 8.263484634020935e-08, + "loss": 1.427, + "step": 17586 + }, + { + "epoch": 0.9615242677309569, + "grad_norm": 1.5297281742095947, + "learning_rate": 8.240057769295485e-08, + "loss": 1.4454, + "step": 17587 + }, + { + "epoch": 0.9615789401746785, + "grad_norm": 3.5448498725891113, + "learning_rate": 8.2166640217618e-08, + "loss": 1.6567, + "step": 17588 + }, + { + "epoch": 0.9616336126184, + "grad_norm": 1.7578352689743042, + "learning_rate": 8.193303392201036e-08, + "loss": 1.5054, + "step": 17589 + }, + { + "epoch": 0.9616882850621216, + "grad_norm": 1.5824843645095825, + "learning_rate": 8.169975881393122e-08, + "loss": 1.3924, + "step": 17590 + }, + { + "epoch": 0.9617429575058432, + "grad_norm": 2.0622386932373047, + "learning_rate": 8.146681490117214e-08, + "loss": 1.3521, + "step": 17591 + }, + { + "epoch": 0.9617976299495646, + "grad_norm": 1.7945516109466553, + "learning_rate": 8.123420219151023e-08, + "loss": 1.4298, + "step": 17592 + }, + { + "epoch": 0.9618523023932862, + "grad_norm": 1.5638928413391113, + "learning_rate": 8.100192069271374e-08, + "loss": 1.5044, + "step": 17593 + }, + { + "epoch": 0.9619069748370078, + "grad_norm": 1.8408551216125488, + "learning_rate": 8.076997041253865e-08, + "loss": 1.3989, + "step": 17594 + }, + { + "epoch": 0.9619616472807293, + "grad_norm": 1.4646620750427246, + "learning_rate": 8.05383513587299e-08, + "loss": 1.3427, + "step": 17595 + }, + { + "epoch": 0.9620163197244509, + "grad_norm": 1.5136007070541382, + "learning_rate": 8.030706353902351e-08, + "loss": 1.4615, + "step": 17596 + }, + { + "epoch": 0.9620709921681725, + "grad_norm": 1.6493808031082153, + "learning_rate": 8.007610696114e-08, + "loss": 1.3881, + "step": 17597 + }, + { + "epoch": 0.962125664611894, + "grad_norm": 1.6960382461547852, + "learning_rate": 7.984548163279426e-08, + "loss": 1.417, + "step": 17598 + }, + { + "epoch": 0.9621803370556156, + "grad_norm": 1.454721212387085, + "learning_rate": 7.961518756168574e-08, + "loss": 1.3025, + "step": 17599 + }, + { + "epoch": 0.962235009499337, + "grad_norm": 1.6136960983276367, + "learning_rate": 7.938522475550492e-08, + "loss": 1.4292, + "step": 17600 + }, + { + "epoch": 0.9622896819430586, + "grad_norm": 1.6036262512207031, + "learning_rate": 7.915559322193123e-08, + "loss": 1.3303, + "step": 17601 + }, + { + "epoch": 0.9623443543867802, + "grad_norm": 1.5101817846298218, + "learning_rate": 7.892629296863297e-08, + "loss": 1.4635, + "step": 17602 + }, + { + "epoch": 0.9623990268305017, + "grad_norm": 1.5497918128967285, + "learning_rate": 7.869732400326513e-08, + "loss": 1.5201, + "step": 17603 + }, + { + "epoch": 0.9624536992742233, + "grad_norm": 1.2806018590927124, + "learning_rate": 7.846868633347492e-08, + "loss": 1.2929, + "step": 17604 + }, + { + "epoch": 0.9625083717179449, + "grad_norm": 1.4823777675628662, + "learning_rate": 7.824037996689738e-08, + "loss": 1.2895, + "step": 17605 + }, + { + "epoch": 0.9625630441616664, + "grad_norm": 1.5683183670043945, + "learning_rate": 7.801240491115525e-08, + "loss": 1.4637, + "step": 17606 + }, + { + "epoch": 0.962617716605388, + "grad_norm": 1.3563112020492554, + "learning_rate": 7.778476117386247e-08, + "loss": 1.4477, + "step": 17607 + }, + { + "epoch": 0.9626723890491096, + "grad_norm": 1.5369242429733276, + "learning_rate": 7.75574487626185e-08, + "loss": 1.4604, + "step": 17608 + }, + { + "epoch": 0.962727061492831, + "grad_norm": 1.7215163707733154, + "learning_rate": 7.733046768501617e-08, + "loss": 1.5196, + "step": 17609 + }, + { + "epoch": 0.9627817339365526, + "grad_norm": 1.5638127326965332, + "learning_rate": 7.710381794863275e-08, + "loss": 1.4193, + "step": 17610 + }, + { + "epoch": 0.9628364063802742, + "grad_norm": 1.5120917558670044, + "learning_rate": 7.687749956103774e-08, + "loss": 1.4241, + "step": 17611 + }, + { + "epoch": 0.9628910788239957, + "grad_norm": 1.8343313932418823, + "learning_rate": 7.665151252978842e-08, + "loss": 1.3947, + "step": 17612 + }, + { + "epoch": 0.9629457512677173, + "grad_norm": 2.051286220550537, + "learning_rate": 7.642585686243209e-08, + "loss": 1.3793, + "step": 17613 + }, + { + "epoch": 0.9630004237114388, + "grad_norm": 1.4751324653625488, + "learning_rate": 7.620053256650162e-08, + "loss": 1.5289, + "step": 17614 + }, + { + "epoch": 0.9630550961551604, + "grad_norm": 1.5897939205169678, + "learning_rate": 7.59755396495221e-08, + "loss": 1.589, + "step": 17615 + }, + { + "epoch": 0.963109768598882, + "grad_norm": 1.244740605354309, + "learning_rate": 7.57508781190075e-08, + "loss": 1.5179, + "step": 17616 + }, + { + "epoch": 0.9631644410426035, + "grad_norm": 1.3699489831924438, + "learning_rate": 7.55265479824585e-08, + "loss": 1.5299, + "step": 17617 + }, + { + "epoch": 0.963219113486325, + "grad_norm": 1.6791282892227173, + "learning_rate": 7.530254924736691e-08, + "loss": 1.4348, + "step": 17618 + }, + { + "epoch": 0.9632737859300466, + "grad_norm": 1.532203197479248, + "learning_rate": 7.507888192121338e-08, + "loss": 1.566, + "step": 17619 + }, + { + "epoch": 0.9633284583737681, + "grad_norm": 1.2808862924575806, + "learning_rate": 7.485554601146417e-08, + "loss": 1.4551, + "step": 17620 + }, + { + "epoch": 0.9633831308174897, + "grad_norm": 1.414220929145813, + "learning_rate": 7.463254152557885e-08, + "loss": 1.3877, + "step": 17621 + }, + { + "epoch": 0.9634378032612113, + "grad_norm": 1.1162481307983398, + "learning_rate": 7.440986847100262e-08, + "loss": 1.6497, + "step": 17622 + }, + { + "epoch": 0.9634924757049328, + "grad_norm": 1.6142807006835938, + "learning_rate": 7.418752685517283e-08, + "loss": 1.479, + "step": 17623 + }, + { + "epoch": 0.9635471481486544, + "grad_norm": 2.4113194942474365, + "learning_rate": 7.396551668551355e-08, + "loss": 1.3892, + "step": 17624 + }, + { + "epoch": 0.963601820592376, + "grad_norm": 1.5590678453445435, + "learning_rate": 7.374383796943663e-08, + "loss": 1.3907, + "step": 17625 + }, + { + "epoch": 0.9636564930360975, + "grad_norm": 1.9031462669372559, + "learning_rate": 7.352249071434614e-08, + "loss": 1.1808, + "step": 17626 + }, + { + "epoch": 0.963711165479819, + "grad_norm": 1.1127192974090576, + "learning_rate": 7.330147492763396e-08, + "loss": 1.4005, + "step": 17627 + }, + { + "epoch": 0.9637658379235405, + "grad_norm": 1.340810775756836, + "learning_rate": 7.30807906166775e-08, + "loss": 1.6125, + "step": 17628 + }, + { + "epoch": 0.9638205103672621, + "grad_norm": 1.2389862537384033, + "learning_rate": 7.286043778884865e-08, + "loss": 1.5416, + "step": 17629 + }, + { + "epoch": 0.9638751828109837, + "grad_norm": 2.879774332046509, + "learning_rate": 7.264041645150488e-08, + "loss": 1.5456, + "step": 17630 + }, + { + "epoch": 0.9639298552547052, + "grad_norm": 1.2509807348251343, + "learning_rate": 7.242072661199251e-08, + "loss": 1.4372, + "step": 17631 + }, + { + "epoch": 0.9639845276984268, + "grad_norm": 1.201894760131836, + "learning_rate": 7.22013682776479e-08, + "loss": 1.4837, + "step": 17632 + }, + { + "epoch": 0.9640392001421484, + "grad_norm": 1.4570133686065674, + "learning_rate": 7.198234145579519e-08, + "loss": 1.2692, + "step": 17633 + }, + { + "epoch": 0.9640938725858699, + "grad_norm": 1.7395156621932983, + "learning_rate": 7.176364615374964e-08, + "loss": 1.4184, + "step": 17634 + }, + { + "epoch": 0.9641485450295915, + "grad_norm": 1.828413724899292, + "learning_rate": 7.154528237881431e-08, + "loss": 1.2283, + "step": 17635 + }, + { + "epoch": 0.9642032174733131, + "grad_norm": 1.3645367622375488, + "learning_rate": 7.132725013827779e-08, + "loss": 1.454, + "step": 17636 + }, + { + "epoch": 0.9642578899170345, + "grad_norm": 1.6148014068603516, + "learning_rate": 7.110954943942428e-08, + "loss": 1.3961, + "step": 17637 + }, + { + "epoch": 0.9643125623607561, + "grad_norm": 2.1012537479400635, + "learning_rate": 7.089218028952128e-08, + "loss": 1.0256, + "step": 17638 + }, + { + "epoch": 0.9643672348044777, + "grad_norm": 1.957023024559021, + "learning_rate": 7.067514269582743e-08, + "loss": 1.4345, + "step": 17639 + }, + { + "epoch": 0.9644219072481992, + "grad_norm": 1.6884434223175049, + "learning_rate": 7.045843666559027e-08, + "loss": 1.462, + "step": 17640 + }, + { + "epoch": 0.9644765796919208, + "grad_norm": 1.3106454610824585, + "learning_rate": 7.024206220604734e-08, + "loss": 1.4329, + "step": 17641 + }, + { + "epoch": 0.9645312521356423, + "grad_norm": 1.2069820165634155, + "learning_rate": 7.002601932442177e-08, + "loss": 1.3637, + "step": 17642 + }, + { + "epoch": 0.9645859245793639, + "grad_norm": 1.6199885606765747, + "learning_rate": 6.981030802792998e-08, + "loss": 1.4836, + "step": 17643 + }, + { + "epoch": 0.9646405970230855, + "grad_norm": 1.2671772241592407, + "learning_rate": 6.95949283237729e-08, + "loss": 1.459, + "step": 17644 + }, + { + "epoch": 0.964695269466807, + "grad_norm": 1.6320374011993408, + "learning_rate": 6.937988021914477e-08, + "loss": 1.5761, + "step": 17645 + }, + { + "epoch": 0.9647499419105285, + "grad_norm": 1.5721275806427002, + "learning_rate": 6.916516372122429e-08, + "loss": 1.2403, + "step": 17646 + }, + { + "epoch": 0.9648046143542501, + "grad_norm": 1.648544192314148, + "learning_rate": 6.895077883718237e-08, + "loss": 1.4643, + "step": 17647 + }, + { + "epoch": 0.9648592867979716, + "grad_norm": 1.5318125486373901, + "learning_rate": 6.873672557417777e-08, + "loss": 1.4763, + "step": 17648 + }, + { + "epoch": 0.9649139592416932, + "grad_norm": 1.4092837572097778, + "learning_rate": 6.852300393935918e-08, + "loss": 1.3016, + "step": 17649 + }, + { + "epoch": 0.9649686316854148, + "grad_norm": 1.8617608547210693, + "learning_rate": 6.830961393986202e-08, + "loss": 1.3023, + "step": 17650 + }, + { + "epoch": 0.9650233041291363, + "grad_norm": 1.2232414484024048, + "learning_rate": 6.80965555828128e-08, + "loss": 1.3388, + "step": 17651 + }, + { + "epoch": 0.9650779765728579, + "grad_norm": 1.3320248126983643, + "learning_rate": 6.788382887532475e-08, + "loss": 1.6065, + "step": 17652 + }, + { + "epoch": 0.9651326490165795, + "grad_norm": 1.3589038848876953, + "learning_rate": 6.767143382450214e-08, + "loss": 1.5286, + "step": 17653 + }, + { + "epoch": 0.965187321460301, + "grad_norm": 1.35422945022583, + "learning_rate": 6.745937043743712e-08, + "loss": 1.4187, + "step": 17654 + }, + { + "epoch": 0.9652419939040225, + "grad_norm": 1.4804564714431763, + "learning_rate": 6.724763872121177e-08, + "loss": 1.2656, + "step": 17655 + }, + { + "epoch": 0.965296666347744, + "grad_norm": 2.196712017059326, + "learning_rate": 6.7036238682896e-08, + "loss": 1.1674, + "step": 17656 + }, + { + "epoch": 0.9653513387914656, + "grad_norm": 1.6386281251907349, + "learning_rate": 6.68251703295475e-08, + "loss": 1.5021, + "step": 17657 + }, + { + "epoch": 0.9654060112351872, + "grad_norm": 1.4064090251922607, + "learning_rate": 6.661443366821618e-08, + "loss": 1.4421, + "step": 17658 + }, + { + "epoch": 0.9654606836789087, + "grad_norm": 1.991674542427063, + "learning_rate": 6.640402870593865e-08, + "loss": 1.0821, + "step": 17659 + }, + { + "epoch": 0.9655153561226303, + "grad_norm": 1.690466046333313, + "learning_rate": 6.619395544974039e-08, + "loss": 1.563, + "step": 17660 + }, + { + "epoch": 0.9655700285663519, + "grad_norm": 1.2466332912445068, + "learning_rate": 6.598421390663578e-08, + "loss": 1.4789, + "step": 17661 + }, + { + "epoch": 0.9656247010100734, + "grad_norm": 1.966116189956665, + "learning_rate": 6.577480408363035e-08, + "loss": 1.4876, + "step": 17662 + }, + { + "epoch": 0.965679373453795, + "grad_norm": 1.8434685468673706, + "learning_rate": 6.556572598771404e-08, + "loss": 1.2836, + "step": 17663 + }, + { + "epoch": 0.9657340458975165, + "grad_norm": 2.0147364139556885, + "learning_rate": 6.535697962587129e-08, + "loss": 1.4204, + "step": 17664 + }, + { + "epoch": 0.965788718341238, + "grad_norm": 1.6650056838989258, + "learning_rate": 6.514856500507094e-08, + "loss": 1.5122, + "step": 17665 + }, + { + "epoch": 0.9658433907849596, + "grad_norm": 1.3596370220184326, + "learning_rate": 6.4940482132273e-08, + "loss": 1.4588, + "step": 17666 + }, + { + "epoch": 0.9658980632286812, + "grad_norm": 3.349036693572998, + "learning_rate": 6.473273101442412e-08, + "loss": 1.3479, + "step": 17667 + }, + { + "epoch": 0.9659527356724027, + "grad_norm": 1.648055076599121, + "learning_rate": 6.452531165846543e-08, + "loss": 1.2531, + "step": 17668 + }, + { + "epoch": 0.9660074081161243, + "grad_norm": 1.8848583698272705, + "learning_rate": 6.431822407132027e-08, + "loss": 1.3923, + "step": 17669 + }, + { + "epoch": 0.9660620805598458, + "grad_norm": 1.536513328552246, + "learning_rate": 6.411146825990311e-08, + "loss": 1.2806, + "step": 17670 + }, + { + "epoch": 0.9661167530035674, + "grad_norm": 1.490831971168518, + "learning_rate": 6.390504423112065e-08, + "loss": 1.5292, + "step": 17671 + }, + { + "epoch": 0.966171425447289, + "grad_norm": 1.6208709478378296, + "learning_rate": 6.369895199186404e-08, + "loss": 1.3077, + "step": 17672 + }, + { + "epoch": 0.9662260978910104, + "grad_norm": 1.6386157274246216, + "learning_rate": 6.349319154901668e-08, + "loss": 1.5237, + "step": 17673 + }, + { + "epoch": 0.966280770334732, + "grad_norm": 1.8441683053970337, + "learning_rate": 6.32877629094475e-08, + "loss": 1.487, + "step": 17674 + }, + { + "epoch": 0.9663354427784536, + "grad_norm": 1.4753947257995605, + "learning_rate": 6.308266608001656e-08, + "loss": 1.5258, + "step": 17675 + }, + { + "epoch": 0.9663901152221751, + "grad_norm": 1.254140853881836, + "learning_rate": 6.287790106757396e-08, + "loss": 1.3914, + "step": 17676 + }, + { + "epoch": 0.9664447876658967, + "grad_norm": 1.5971312522888184, + "learning_rate": 6.267346787895645e-08, + "loss": 1.4233, + "step": 17677 + }, + { + "epoch": 0.9664994601096183, + "grad_norm": 1.4654406309127808, + "learning_rate": 6.246936652099078e-08, + "loss": 1.5713, + "step": 17678 + }, + { + "epoch": 0.9665541325533398, + "grad_norm": 1.5310947895050049, + "learning_rate": 6.226559700049151e-08, + "loss": 1.4133, + "step": 17679 + }, + { + "epoch": 0.9666088049970614, + "grad_norm": 1.8945649862289429, + "learning_rate": 6.206215932426319e-08, + "loss": 1.4084, + "step": 17680 + }, + { + "epoch": 0.966663477440783, + "grad_norm": 1.2394824028015137, + "learning_rate": 6.185905349910038e-08, + "loss": 1.5276, + "step": 17681 + }, + { + "epoch": 0.9667181498845044, + "grad_norm": 1.486871600151062, + "learning_rate": 6.165627953178433e-08, + "loss": 1.4531, + "step": 17682 + }, + { + "epoch": 0.966772822328226, + "grad_norm": 1.698360562324524, + "learning_rate": 6.145383742908517e-08, + "loss": 1.3902, + "step": 17683 + }, + { + "epoch": 0.9668274947719475, + "grad_norm": 1.316770076751709, + "learning_rate": 6.125172719776529e-08, + "loss": 1.3415, + "step": 17684 + }, + { + "epoch": 0.9668821672156691, + "grad_norm": 1.3707462549209595, + "learning_rate": 6.10499488445715e-08, + "loss": 1.3643, + "step": 17685 + }, + { + "epoch": 0.9669368396593907, + "grad_norm": 1.3585439920425415, + "learning_rate": 6.084850237624285e-08, + "loss": 1.7503, + "step": 17686 + }, + { + "epoch": 0.9669915121031122, + "grad_norm": 2.1138722896575928, + "learning_rate": 6.064738779950397e-08, + "loss": 1.6691, + "step": 17687 + }, + { + "epoch": 0.9670461845468338, + "grad_norm": 1.395354986190796, + "learning_rate": 6.044660512107392e-08, + "loss": 1.4919, + "step": 17688 + }, + { + "epoch": 0.9671008569905554, + "grad_norm": 1.7355537414550781, + "learning_rate": 6.024615434765513e-08, + "loss": 1.2491, + "step": 17689 + }, + { + "epoch": 0.9671555294342769, + "grad_norm": 1.3324192762374878, + "learning_rate": 6.004603548594112e-08, + "loss": 1.5313, + "step": 17690 + }, + { + "epoch": 0.9672102018779984, + "grad_norm": 1.2420623302459717, + "learning_rate": 5.984624854261545e-08, + "loss": 1.6529, + "step": 17691 + }, + { + "epoch": 0.96726487432172, + "grad_norm": 1.695685625076294, + "learning_rate": 5.964679352434833e-08, + "loss": 1.5522, + "step": 17692 + }, + { + "epoch": 0.9673195467654415, + "grad_norm": 1.6756736040115356, + "learning_rate": 5.944767043780109e-08, + "loss": 1.5592, + "step": 17693 + }, + { + "epoch": 0.9673742192091631, + "grad_norm": 1.6417810916900635, + "learning_rate": 5.924887928962286e-08, + "loss": 1.4224, + "step": 17694 + }, + { + "epoch": 0.9674288916528847, + "grad_norm": 1.6756278276443481, + "learning_rate": 5.905042008645057e-08, + "loss": 1.4205, + "step": 17695 + }, + { + "epoch": 0.9674835640966062, + "grad_norm": 1.4515349864959717, + "learning_rate": 5.885229283491223e-08, + "loss": 1.3203, + "step": 17696 + }, + { + "epoch": 0.9675382365403278, + "grad_norm": 1.4799304008483887, + "learning_rate": 5.865449754162256e-08, + "loss": 1.3046, + "step": 17697 + }, + { + "epoch": 0.9675929089840493, + "grad_norm": 1.4270917177200317, + "learning_rate": 5.845703421318849e-08, + "loss": 1.5366, + "step": 17698 + }, + { + "epoch": 0.9676475814277709, + "grad_norm": 1.2677979469299316, + "learning_rate": 5.825990285620253e-08, + "loss": 1.5267, + "step": 17699 + }, + { + "epoch": 0.9677022538714924, + "grad_norm": 1.813846230506897, + "learning_rate": 5.806310347724609e-08, + "loss": 1.285, + "step": 17700 + }, + { + "epoch": 0.9677569263152139, + "grad_norm": 1.266079068183899, + "learning_rate": 5.786663608289278e-08, + "loss": 1.5558, + "step": 17701 + }, + { + "epoch": 0.9678115987589355, + "grad_norm": 1.7232595682144165, + "learning_rate": 5.7670500679702925e-08, + "loss": 1.5166, + "step": 17702 + }, + { + "epoch": 0.9678662712026571, + "grad_norm": 1.4171648025512695, + "learning_rate": 5.747469727422572e-08, + "loss": 1.4693, + "step": 17703 + }, + { + "epoch": 0.9679209436463786, + "grad_norm": 1.687293291091919, + "learning_rate": 5.727922587299817e-08, + "loss": 1.2777, + "step": 17704 + }, + { + "epoch": 0.9679756160901002, + "grad_norm": 1.427318811416626, + "learning_rate": 5.7084086482549486e-08, + "loss": 1.5591, + "step": 17705 + }, + { + "epoch": 0.9680302885338218, + "grad_norm": 1.2460308074951172, + "learning_rate": 5.688927910939446e-08, + "loss": 1.4688, + "step": 17706 + }, + { + "epoch": 0.9680849609775433, + "grad_norm": 1.907014012336731, + "learning_rate": 5.6694803760039e-08, + "loss": 1.4751, + "step": 17707 + }, + { + "epoch": 0.9681396334212649, + "grad_norm": 1.3821802139282227, + "learning_rate": 5.6500660440975684e-08, + "loss": 1.5611, + "step": 17708 + }, + { + "epoch": 0.9681943058649864, + "grad_norm": 1.8345918655395508, + "learning_rate": 5.630684915868934e-08, + "loss": 1.4562, + "step": 17709 + }, + { + "epoch": 0.9682489783087079, + "grad_norm": 1.3936975002288818, + "learning_rate": 5.611336991965144e-08, + "loss": 1.5363, + "step": 17710 + }, + { + "epoch": 0.9683036507524295, + "grad_norm": 1.5350204706192017, + "learning_rate": 5.5920222730321275e-08, + "loss": 1.4885, + "step": 17711 + }, + { + "epoch": 0.968358323196151, + "grad_norm": 1.5055373907089233, + "learning_rate": 5.572740759715034e-08, + "loss": 1.2177, + "step": 17712 + }, + { + "epoch": 0.9684129956398726, + "grad_norm": 1.3397555351257324, + "learning_rate": 5.5534924526575716e-08, + "loss": 1.3209, + "step": 17713 + }, + { + "epoch": 0.9684676680835942, + "grad_norm": 1.3966370820999146, + "learning_rate": 5.5342773525024484e-08, + "loss": 1.3844, + "step": 17714 + }, + { + "epoch": 0.9685223405273157, + "grad_norm": 1.861647367477417, + "learning_rate": 5.515095459891484e-08, + "loss": 1.6529, + "step": 17715 + }, + { + "epoch": 0.9685770129710373, + "grad_norm": 1.5247137546539307, + "learning_rate": 5.4959467754651665e-08, + "loss": 1.4305, + "step": 17716 + }, + { + "epoch": 0.9686316854147589, + "grad_norm": 1.516504168510437, + "learning_rate": 5.4768312998627616e-08, + "loss": 1.6723, + "step": 17717 + }, + { + "epoch": 0.9686863578584803, + "grad_norm": 1.4652076959609985, + "learning_rate": 5.457749033722648e-08, + "loss": 1.5385, + "step": 17718 + }, + { + "epoch": 0.9687410303022019, + "grad_norm": 1.4496361017227173, + "learning_rate": 5.438699977682205e-08, + "loss": 1.5285, + "step": 17719 + }, + { + "epoch": 0.9687957027459235, + "grad_norm": 1.39829683303833, + "learning_rate": 5.4196841323772565e-08, + "loss": 1.4953, + "step": 17720 + }, + { + "epoch": 0.968850375189645, + "grad_norm": 1.4614518880844116, + "learning_rate": 5.400701498442962e-08, + "loss": 1.5873, + "step": 17721 + }, + { + "epoch": 0.9689050476333666, + "grad_norm": 1.776880145072937, + "learning_rate": 5.381752076513147e-08, + "loss": 1.7113, + "step": 17722 + }, + { + "epoch": 0.9689597200770882, + "grad_norm": 1.5180200338363647, + "learning_rate": 5.3628358672205285e-08, + "loss": 1.5217, + "step": 17723 + }, + { + "epoch": 0.9690143925208097, + "grad_norm": 1.3436269760131836, + "learning_rate": 5.343952871196934e-08, + "loss": 1.3402, + "step": 17724 + }, + { + "epoch": 0.9690690649645313, + "grad_norm": 1.4258317947387695, + "learning_rate": 5.3251030890727474e-08, + "loss": 1.4356, + "step": 17725 + }, + { + "epoch": 0.9691237374082528, + "grad_norm": 1.2228736877441406, + "learning_rate": 5.306286521477355e-08, + "loss": 1.5316, + "step": 17726 + }, + { + "epoch": 0.9691784098519743, + "grad_norm": 1.6822882890701294, + "learning_rate": 5.287503169039143e-08, + "loss": 1.3302, + "step": 17727 + }, + { + "epoch": 0.9692330822956959, + "grad_norm": 1.8479118347167969, + "learning_rate": 5.2687530323854985e-08, + "loss": 1.5973, + "step": 17728 + }, + { + "epoch": 0.9692877547394174, + "grad_norm": 1.9121134281158447, + "learning_rate": 5.250036112142365e-08, + "loss": 1.5013, + "step": 17729 + }, + { + "epoch": 0.969342427183139, + "grad_norm": 1.360732913017273, + "learning_rate": 5.231352408934687e-08, + "loss": 1.5174, + "step": 17730 + }, + { + "epoch": 0.9693970996268606, + "grad_norm": 1.975156307220459, + "learning_rate": 5.2127019233866316e-08, + "loss": 1.5674, + "step": 17731 + }, + { + "epoch": 0.9694517720705821, + "grad_norm": 1.2812269926071167, + "learning_rate": 5.1940846561205907e-08, + "loss": 1.641, + "step": 17732 + }, + { + "epoch": 0.9695064445143037, + "grad_norm": 1.7131558656692505, + "learning_rate": 5.175500607758621e-08, + "loss": 1.4034, + "step": 17733 + }, + { + "epoch": 0.9695611169580253, + "grad_norm": 1.4819235801696777, + "learning_rate": 5.156949778921006e-08, + "loss": 1.4198, + "step": 17734 + }, + { + "epoch": 0.9696157894017468, + "grad_norm": 1.4253935813903809, + "learning_rate": 5.1384321702273586e-08, + "loss": 1.2965, + "step": 17735 + }, + { + "epoch": 0.9696704618454683, + "grad_norm": 1.6036252975463867, + "learning_rate": 5.119947782295964e-08, + "loss": 1.261, + "step": 17736 + }, + { + "epoch": 0.9697251342891899, + "grad_norm": 2.187241315841675, + "learning_rate": 5.101496615744106e-08, + "loss": 1.4865, + "step": 17737 + }, + { + "epoch": 0.9697798067329114, + "grad_norm": 1.5904167890548706, + "learning_rate": 5.083078671187846e-08, + "loss": 1.3118, + "step": 17738 + }, + { + "epoch": 0.969834479176633, + "grad_norm": 1.3839205503463745, + "learning_rate": 5.06469394924225e-08, + "loss": 1.6092, + "step": 17739 + }, + { + "epoch": 0.9698891516203546, + "grad_norm": 1.507312536239624, + "learning_rate": 5.046342450521158e-08, + "loss": 1.5988, + "step": 17740 + }, + { + "epoch": 0.9699438240640761, + "grad_norm": 1.849858045578003, + "learning_rate": 5.028024175637525e-08, + "loss": 1.3098, + "step": 17741 + }, + { + "epoch": 0.9699984965077977, + "grad_norm": 1.3717442750930786, + "learning_rate": 5.0097391252028616e-08, + "loss": 1.6393, + "step": 17742 + }, + { + "epoch": 0.9700531689515192, + "grad_norm": 1.3588076829910278, + "learning_rate": 4.9914872998277906e-08, + "loss": 1.5906, + "step": 17743 + }, + { + "epoch": 0.9701078413952408, + "grad_norm": 1.8536903858184814, + "learning_rate": 4.973268700121936e-08, + "loss": 1.366, + "step": 17744 + }, + { + "epoch": 0.9701625138389623, + "grad_norm": 1.757987141609192, + "learning_rate": 4.955083326693477e-08, + "loss": 1.4916, + "step": 17745 + }, + { + "epoch": 0.9702171862826838, + "grad_norm": 1.5097839832305908, + "learning_rate": 4.9369311801497065e-08, + "loss": 1.3701, + "step": 17746 + }, + { + "epoch": 0.9702718587264054, + "grad_norm": 1.3417325019836426, + "learning_rate": 4.918812261096806e-08, + "loss": 1.5738, + "step": 17747 + }, + { + "epoch": 0.970326531170127, + "grad_norm": 1.6197857856750488, + "learning_rate": 4.9007265701397357e-08, + "loss": 1.3501, + "step": 17748 + }, + { + "epoch": 0.9703812036138485, + "grad_norm": 1.3448803424835205, + "learning_rate": 4.882674107882568e-08, + "loss": 1.5123, + "step": 17749 + }, + { + "epoch": 0.9704358760575701, + "grad_norm": 1.5921183824539185, + "learning_rate": 4.864654874928043e-08, + "loss": 1.5684, + "step": 17750 + }, + { + "epoch": 0.9704905485012917, + "grad_norm": 1.4495091438293457, + "learning_rate": 4.846668871877902e-08, + "loss": 1.3705, + "step": 17751 + }, + { + "epoch": 0.9705452209450132, + "grad_norm": 1.7954051494598389, + "learning_rate": 4.8287160993325535e-08, + "loss": 1.442, + "step": 17752 + }, + { + "epoch": 0.9705998933887348, + "grad_norm": 1.4367002248764038, + "learning_rate": 4.8107965578917395e-08, + "loss": 1.4302, + "step": 17753 + }, + { + "epoch": 0.9706545658324564, + "grad_norm": 1.413891077041626, + "learning_rate": 4.792910248153537e-08, + "loss": 1.5733, + "step": 17754 + }, + { + "epoch": 0.9707092382761778, + "grad_norm": 1.7415499687194824, + "learning_rate": 4.77505717071558e-08, + "loss": 1.2413, + "step": 17755 + }, + { + "epoch": 0.9707639107198994, + "grad_norm": 1.6649211645126343, + "learning_rate": 4.757237326173725e-08, + "loss": 1.4541, + "step": 17756 + }, + { + "epoch": 0.9708185831636209, + "grad_norm": 1.6911364793777466, + "learning_rate": 4.739450715123162e-08, + "loss": 1.4357, + "step": 17757 + }, + { + "epoch": 0.9708732556073425, + "grad_norm": 1.47695791721344, + "learning_rate": 4.721697338157749e-08, + "loss": 1.5534, + "step": 17758 + }, + { + "epoch": 0.9709279280510641, + "grad_norm": 1.5116087198257446, + "learning_rate": 4.703977195870346e-08, + "loss": 1.5486, + "step": 17759 + }, + { + "epoch": 0.9709826004947856, + "grad_norm": 1.764304518699646, + "learning_rate": 4.6862902888527016e-08, + "loss": 1.1892, + "step": 17760 + }, + { + "epoch": 0.9710372729385072, + "grad_norm": 1.0830883979797363, + "learning_rate": 4.668636617695454e-08, + "loss": 1.5803, + "step": 17761 + }, + { + "epoch": 0.9710919453822288, + "grad_norm": 1.5312750339508057, + "learning_rate": 4.651016182988022e-08, + "loss": 1.4627, + "step": 17762 + }, + { + "epoch": 0.9711466178259502, + "grad_norm": 1.3741123676300049, + "learning_rate": 4.6334289853188216e-08, + "loss": 1.318, + "step": 17763 + }, + { + "epoch": 0.9712012902696718, + "grad_norm": 1.2459263801574707, + "learning_rate": 4.615875025275163e-08, + "loss": 1.2982, + "step": 17764 + }, + { + "epoch": 0.9712559627133934, + "grad_norm": 1.6879702806472778, + "learning_rate": 4.59835430344302e-08, + "loss": 1.5008, + "step": 17765 + }, + { + "epoch": 0.9713106351571149, + "grad_norm": 1.3045681715011597, + "learning_rate": 4.5808668204078146e-08, + "loss": 1.4387, + "step": 17766 + }, + { + "epoch": 0.9713653076008365, + "grad_norm": 1.514054775238037, + "learning_rate": 4.56341257675319e-08, + "loss": 1.3958, + "step": 17767 + }, + { + "epoch": 0.9714199800445581, + "grad_norm": 1.6110848188400269, + "learning_rate": 4.545991573062014e-08, + "loss": 1.5361, + "step": 17768 + }, + { + "epoch": 0.9714746524882796, + "grad_norm": 1.804405927658081, + "learning_rate": 4.528603809916154e-08, + "loss": 1.5268, + "step": 17769 + }, + { + "epoch": 0.9715293249320012, + "grad_norm": 1.9426255226135254, + "learning_rate": 4.5112492878962574e-08, + "loss": 1.4577, + "step": 17770 + }, + { + "epoch": 0.9715839973757227, + "grad_norm": 1.2897424697875977, + "learning_rate": 4.493928007581527e-08, + "loss": 1.6422, + "step": 17771 + }, + { + "epoch": 0.9716386698194442, + "grad_norm": 1.6061861515045166, + "learning_rate": 4.476639969550722e-08, + "loss": 1.4434, + "step": 17772 + }, + { + "epoch": 0.9716933422631658, + "grad_norm": 1.646399736404419, + "learning_rate": 4.459385174380937e-08, + "loss": 1.2951, + "step": 17773 + }, + { + "epoch": 0.9717480147068873, + "grad_norm": 1.5159425735473633, + "learning_rate": 4.442163622648377e-08, + "loss": 1.3185, + "step": 17774 + }, + { + "epoch": 0.9718026871506089, + "grad_norm": 1.3429031372070312, + "learning_rate": 4.424975314928137e-08, + "loss": 1.407, + "step": 17775 + }, + { + "epoch": 0.9718573595943305, + "grad_norm": 1.277761459350586, + "learning_rate": 4.4078202517942037e-08, + "loss": 1.31, + "step": 17776 + }, + { + "epoch": 0.971912032038052, + "grad_norm": 1.7771899700164795, + "learning_rate": 4.39069843381934e-08, + "loss": 1.3418, + "step": 17777 + }, + { + "epoch": 0.9719667044817736, + "grad_norm": 1.4926708936691284, + "learning_rate": 4.373609861575423e-08, + "loss": 1.4893, + "step": 17778 + }, + { + "epoch": 0.9720213769254952, + "grad_norm": 1.4895236492156982, + "learning_rate": 4.3565545356327734e-08, + "loss": 1.2964, + "step": 17779 + }, + { + "epoch": 0.9720760493692167, + "grad_norm": 1.4240161180496216, + "learning_rate": 4.3395324565612686e-08, + "loss": 1.4322, + "step": 17780 + }, + { + "epoch": 0.9721307218129382, + "grad_norm": 1.4803011417388916, + "learning_rate": 4.322543624929121e-08, + "loss": 1.6586, + "step": 17781 + }, + { + "epoch": 0.9721853942566598, + "grad_norm": 1.7615528106689453, + "learning_rate": 4.3055880413036545e-08, + "loss": 1.3771, + "step": 17782 + }, + { + "epoch": 0.9722400667003813, + "grad_norm": 1.222733974456787, + "learning_rate": 4.288665706251194e-08, + "loss": 1.3756, + "step": 17783 + }, + { + "epoch": 0.9722947391441029, + "grad_norm": 1.6274077892303467, + "learning_rate": 4.271776620336621e-08, + "loss": 1.4353, + "step": 17784 + }, + { + "epoch": 0.9723494115878244, + "grad_norm": 1.7151212692260742, + "learning_rate": 4.2549207841239284e-08, + "loss": 1.4929, + "step": 17785 + }, + { + "epoch": 0.972404084031546, + "grad_norm": 1.4756253957748413, + "learning_rate": 4.2380981981759994e-08, + "loss": 1.3567, + "step": 17786 + }, + { + "epoch": 0.9724587564752676, + "grad_norm": 1.706784725189209, + "learning_rate": 4.2213088630547185e-08, + "loss": 1.5232, + "step": 17787 + }, + { + "epoch": 0.9725134289189891, + "grad_norm": 1.4917937517166138, + "learning_rate": 4.2045527793204145e-08, + "loss": 1.4464, + "step": 17788 + }, + { + "epoch": 0.9725681013627107, + "grad_norm": 1.6411858797073364, + "learning_rate": 4.187829947532973e-08, + "loss": 1.5199, + "step": 17789 + }, + { + "epoch": 0.9726227738064323, + "grad_norm": 1.2071281671524048, + "learning_rate": 4.171140368250615e-08, + "loss": 1.3101, + "step": 17790 + }, + { + "epoch": 0.9726774462501537, + "grad_norm": 1.459096908569336, + "learning_rate": 4.154484042030671e-08, + "loss": 1.4405, + "step": 17791 + }, + { + "epoch": 0.9727321186938753, + "grad_norm": 1.7019110918045044, + "learning_rate": 4.1378609694292526e-08, + "loss": 1.44, + "step": 17792 + }, + { + "epoch": 0.9727867911375969, + "grad_norm": 2.131887674331665, + "learning_rate": 4.1212711510015826e-08, + "loss": 1.7383, + "step": 17793 + }, + { + "epoch": 0.9728414635813184, + "grad_norm": 1.906151294708252, + "learning_rate": 4.1047145873015504e-08, + "loss": 1.1908, + "step": 17794 + }, + { + "epoch": 0.97289613602504, + "grad_norm": 1.6079297065734863, + "learning_rate": 4.0881912788820476e-08, + "loss": 1.2748, + "step": 17795 + }, + { + "epoch": 0.9729508084687616, + "grad_norm": 1.4376708269119263, + "learning_rate": 4.071701226294744e-08, + "loss": 1.4547, + "step": 17796 + }, + { + "epoch": 0.9730054809124831, + "grad_norm": 1.737701177597046, + "learning_rate": 4.0552444300904213e-08, + "loss": 1.4983, + "step": 17797 + }, + { + "epoch": 0.9730601533562047, + "grad_norm": 1.8509246110916138, + "learning_rate": 4.038820890818529e-08, + "loss": 1.4584, + "step": 17798 + }, + { + "epoch": 0.9731148257999261, + "grad_norm": 1.2498204708099365, + "learning_rate": 4.0224306090275165e-08, + "loss": 1.549, + "step": 17799 + }, + { + "epoch": 0.9731694982436477, + "grad_norm": 1.6354213953018188, + "learning_rate": 4.006073585264725e-08, + "loss": 1.4198, + "step": 17800 + }, + { + "epoch": 0.9732241706873693, + "grad_norm": 1.3072896003723145, + "learning_rate": 3.989749820076272e-08, + "loss": 1.4971, + "step": 17801 + }, + { + "epoch": 0.9732788431310908, + "grad_norm": 1.8360021114349365, + "learning_rate": 3.973459314007278e-08, + "loss": 1.5852, + "step": 17802 + }, + { + "epoch": 0.9733335155748124, + "grad_norm": 1.2925795316696167, + "learning_rate": 3.957202067601751e-08, + "loss": 1.2953, + "step": 17803 + }, + { + "epoch": 0.973388188018534, + "grad_norm": 1.3201897144317627, + "learning_rate": 3.94097808140248e-08, + "loss": 1.6299, + "step": 17804 + }, + { + "epoch": 0.9734428604622555, + "grad_norm": 1.113189935684204, + "learning_rate": 3.924787355951254e-08, + "loss": 1.5171, + "step": 17805 + }, + { + "epoch": 0.9734975329059771, + "grad_norm": 1.9056694507598877, + "learning_rate": 3.908629891788862e-08, + "loss": 1.763, + "step": 17806 + }, + { + "epoch": 0.9735522053496987, + "grad_norm": 1.668828010559082, + "learning_rate": 3.89250568945454e-08, + "loss": 1.1301, + "step": 17807 + }, + { + "epoch": 0.9736068777934201, + "grad_norm": 2.685407876968384, + "learning_rate": 3.8764147494870787e-08, + "loss": 1.4324, + "step": 17808 + }, + { + "epoch": 0.9736615502371417, + "grad_norm": 1.3443710803985596, + "learning_rate": 3.860357072423493e-08, + "loss": 1.4541, + "step": 17809 + }, + { + "epoch": 0.9737162226808633, + "grad_norm": 1.3029398918151855, + "learning_rate": 3.844332658800132e-08, + "loss": 1.4897, + "step": 17810 + }, + { + "epoch": 0.9737708951245848, + "grad_norm": 1.7434377670288086, + "learning_rate": 3.8283415091521224e-08, + "loss": 1.4736, + "step": 17811 + }, + { + "epoch": 0.9738255675683064, + "grad_norm": 1.3707066774368286, + "learning_rate": 3.812383624013261e-08, + "loss": 1.5233, + "step": 17812 + }, + { + "epoch": 0.9738802400120279, + "grad_norm": 1.523341417312622, + "learning_rate": 3.7964590039165637e-08, + "loss": 1.3762, + "step": 17813 + }, + { + "epoch": 0.9739349124557495, + "grad_norm": 1.2292274236679077, + "learning_rate": 3.7805676493938294e-08, + "loss": 1.5091, + "step": 17814 + }, + { + "epoch": 0.9739895848994711, + "grad_norm": 1.7340688705444336, + "learning_rate": 3.764709560975632e-08, + "loss": 1.587, + "step": 17815 + }, + { + "epoch": 0.9740442573431926, + "grad_norm": 1.860113263130188, + "learning_rate": 3.7488847391915494e-08, + "loss": 1.3599, + "step": 17816 + }, + { + "epoch": 0.9740989297869141, + "grad_norm": 1.9973064661026, + "learning_rate": 3.733093184569936e-08, + "loss": 1.289, + "step": 17817 + }, + { + "epoch": 0.9741536022306357, + "grad_norm": 1.0980257987976074, + "learning_rate": 3.717334897638147e-08, + "loss": 1.6367, + "step": 17818 + }, + { + "epoch": 0.9742082746743572, + "grad_norm": 1.7088969945907593, + "learning_rate": 3.701609878922319e-08, + "loss": 1.4028, + "step": 17819 + }, + { + "epoch": 0.9742629471180788, + "grad_norm": 1.5895551443099976, + "learning_rate": 3.6859181289478077e-08, + "loss": 1.3794, + "step": 17820 + }, + { + "epoch": 0.9743176195618004, + "grad_norm": 1.997982382774353, + "learning_rate": 3.6702596482381946e-08, + "loss": 1.379, + "step": 17821 + }, + { + "epoch": 0.9743722920055219, + "grad_norm": 1.1225756406784058, + "learning_rate": 3.6546344373167286e-08, + "loss": 1.6596, + "step": 17822 + }, + { + "epoch": 0.9744269644492435, + "grad_norm": 1.7132270336151123, + "learning_rate": 3.639042496704992e-08, + "loss": 1.3318, + "step": 17823 + }, + { + "epoch": 0.9744816368929651, + "grad_norm": 1.2861257791519165, + "learning_rate": 3.6234838269236795e-08, + "loss": 1.3346, + "step": 17824 + }, + { + "epoch": 0.9745363093366866, + "grad_norm": 1.4129940271377563, + "learning_rate": 3.607958428492264e-08, + "loss": 1.3561, + "step": 17825 + }, + { + "epoch": 0.9745909817804081, + "grad_norm": 1.6777994632720947, + "learning_rate": 3.59246630192922e-08, + "loss": 1.588, + "step": 17826 + }, + { + "epoch": 0.9746456542241296, + "grad_norm": 1.524746298789978, + "learning_rate": 3.5770074477519124e-08, + "loss": 1.6563, + "step": 17827 + }, + { + "epoch": 0.9747003266678512, + "grad_norm": 1.6595878601074219, + "learning_rate": 3.5615818664764825e-08, + "loss": 1.2271, + "step": 17828 + }, + { + "epoch": 0.9747549991115728, + "grad_norm": 1.429929256439209, + "learning_rate": 3.546189558618074e-08, + "loss": 1.7, + "step": 17829 + }, + { + "epoch": 0.9748096715552943, + "grad_norm": 1.6564165353775024, + "learning_rate": 3.5308305246906096e-08, + "loss": 1.3957, + "step": 17830 + }, + { + "epoch": 0.9748643439990159, + "grad_norm": 2.63620662689209, + "learning_rate": 3.515504765207012e-08, + "loss": 1.2645, + "step": 17831 + }, + { + "epoch": 0.9749190164427375, + "grad_norm": 1.548210620880127, + "learning_rate": 3.500212280678983e-08, + "loss": 1.3303, + "step": 17832 + }, + { + "epoch": 0.974973688886459, + "grad_norm": 1.408143162727356, + "learning_rate": 3.4849530716173365e-08, + "loss": 1.4518, + "step": 17833 + }, + { + "epoch": 0.9750283613301806, + "grad_norm": 1.7504209280014038, + "learning_rate": 3.469727138531442e-08, + "loss": 1.4585, + "step": 17834 + }, + { + "epoch": 0.9750830337739022, + "grad_norm": 1.8482133150100708, + "learning_rate": 3.454534481929783e-08, + "loss": 1.286, + "step": 17835 + }, + { + "epoch": 0.9751377062176236, + "grad_norm": 1.3887896537780762, + "learning_rate": 3.439375102319731e-08, + "loss": 1.5008, + "step": 17836 + }, + { + "epoch": 0.9751923786613452, + "grad_norm": 1.545050024986267, + "learning_rate": 3.424249000207436e-08, + "loss": 1.3829, + "step": 17837 + }, + { + "epoch": 0.9752470511050668, + "grad_norm": 1.5668162107467651, + "learning_rate": 3.4091561760979384e-08, + "loss": 1.3434, + "step": 17838 + }, + { + "epoch": 0.9753017235487883, + "grad_norm": 1.372833490371704, + "learning_rate": 3.394096630495281e-08, + "loss": 1.6087, + "step": 17839 + }, + { + "epoch": 0.9753563959925099, + "grad_norm": 1.5234712362289429, + "learning_rate": 3.379070363902504e-08, + "loss": 1.3241, + "step": 17840 + }, + { + "epoch": 0.9754110684362314, + "grad_norm": 1.290772557258606, + "learning_rate": 3.3640773768210953e-08, + "loss": 1.8097, + "step": 17841 + }, + { + "epoch": 0.975465740879953, + "grad_norm": 1.7729862928390503, + "learning_rate": 3.349117669751767e-08, + "loss": 1.3958, + "step": 17842 + }, + { + "epoch": 0.9755204133236746, + "grad_norm": 1.7235682010650635, + "learning_rate": 3.3341912431942294e-08, + "loss": 1.4563, + "step": 17843 + }, + { + "epoch": 0.975575085767396, + "grad_norm": 1.3192498683929443, + "learning_rate": 3.31929809764675e-08, + "loss": 1.3214, + "step": 17844 + }, + { + "epoch": 0.9756297582111176, + "grad_norm": 1.3601940870285034, + "learning_rate": 3.30443823360671e-08, + "loss": 1.5862, + "step": 17845 + }, + { + "epoch": 0.9756844306548392, + "grad_norm": 1.5297389030456543, + "learning_rate": 3.289611651570268e-08, + "loss": 1.467, + "step": 17846 + }, + { + "epoch": 0.9757391030985607, + "grad_norm": 1.582661747932434, + "learning_rate": 3.2748183520325827e-08, + "loss": 1.4844, + "step": 17847 + }, + { + "epoch": 0.9757937755422823, + "grad_norm": 1.5520482063293457, + "learning_rate": 3.2600583354874814e-08, + "loss": 1.3354, + "step": 17848 + }, + { + "epoch": 0.9758484479860039, + "grad_norm": 1.5864250659942627, + "learning_rate": 3.245331602428126e-08, + "loss": 1.3784, + "step": 17849 + }, + { + "epoch": 0.9759031204297254, + "grad_norm": 1.3228490352630615, + "learning_rate": 3.230638153346011e-08, + "loss": 1.3775, + "step": 17850 + }, + { + "epoch": 0.975957792873447, + "grad_norm": 1.533451795578003, + "learning_rate": 3.215977988731855e-08, + "loss": 1.3811, + "step": 17851 + }, + { + "epoch": 0.9760124653171686, + "grad_norm": 2.0804502964019775, + "learning_rate": 3.201351109075268e-08, + "loss": 1.3929, + "step": 17852 + }, + { + "epoch": 0.97606713776089, + "grad_norm": 1.9782264232635498, + "learning_rate": 3.186757514864636e-08, + "loss": 1.3933, + "step": 17853 + }, + { + "epoch": 0.9761218102046116, + "grad_norm": 1.4668004512786865, + "learning_rate": 3.172197206587235e-08, + "loss": 1.3702, + "step": 17854 + }, + { + "epoch": 0.9761764826483331, + "grad_norm": 1.3141177892684937, + "learning_rate": 3.157670184729344e-08, + "loss": 1.2631, + "step": 17855 + }, + { + "epoch": 0.9762311550920547, + "grad_norm": 1.6022700071334839, + "learning_rate": 3.14317644977602e-08, + "loss": 1.2913, + "step": 17856 + }, + { + "epoch": 0.9762858275357763, + "grad_norm": 1.7196217775344849, + "learning_rate": 3.128716002211207e-08, + "loss": 1.6757, + "step": 17857 + }, + { + "epoch": 0.9763404999794978, + "grad_norm": 1.236960768699646, + "learning_rate": 3.114288842517743e-08, + "loss": 1.4081, + "step": 17858 + }, + { + "epoch": 0.9763951724232194, + "grad_norm": 1.5479689836502075, + "learning_rate": 3.099894971177464e-08, + "loss": 1.3054, + "step": 17859 + }, + { + "epoch": 0.976449844866941, + "grad_norm": 1.640597939491272, + "learning_rate": 3.085534388670986e-08, + "loss": 1.5304, + "step": 17860 + }, + { + "epoch": 0.9765045173106625, + "grad_norm": 1.224570393562317, + "learning_rate": 3.071207095477924e-08, + "loss": 1.6512, + "step": 17861 + }, + { + "epoch": 0.976559189754384, + "grad_norm": 1.4515522718429565, + "learning_rate": 3.056913092076563e-08, + "loss": 1.459, + "step": 17862 + }, + { + "epoch": 0.9766138621981056, + "grad_norm": 1.6232919692993164, + "learning_rate": 3.0426523789442994e-08, + "loss": 1.3248, + "step": 17863 + }, + { + "epoch": 0.9766685346418271, + "grad_norm": 1.90277099609375, + "learning_rate": 3.0284249565573076e-08, + "loss": 1.4239, + "step": 17864 + }, + { + "epoch": 0.9767232070855487, + "grad_norm": 1.6614792346954346, + "learning_rate": 3.0142308253906515e-08, + "loss": 1.8017, + "step": 17865 + }, + { + "epoch": 0.9767778795292703, + "grad_norm": 1.2921303510665894, + "learning_rate": 3.000069985918397e-08, + "loss": 1.3858, + "step": 17866 + }, + { + "epoch": 0.9768325519729918, + "grad_norm": 1.2605533599853516, + "learning_rate": 2.985942438613276e-08, + "loss": 1.3518, + "step": 17867 + }, + { + "epoch": 0.9768872244167134, + "grad_norm": 1.4917641878128052, + "learning_rate": 2.9718481839470236e-08, + "loss": 1.4598, + "step": 17868 + }, + { + "epoch": 0.9769418968604349, + "grad_norm": 1.7226054668426514, + "learning_rate": 2.9577872223904846e-08, + "loss": 1.5919, + "step": 17869 + }, + { + "epoch": 0.9769965693041565, + "grad_norm": 1.386763095855713, + "learning_rate": 2.9437595544130615e-08, + "loss": 1.2492, + "step": 17870 + }, + { + "epoch": 0.977051241747878, + "grad_norm": 1.4606815576553345, + "learning_rate": 2.9297651804830464e-08, + "loss": 1.3864, + "step": 17871 + }, + { + "epoch": 0.9771059141915995, + "grad_norm": 1.7691246271133423, + "learning_rate": 2.915804101067954e-08, + "loss": 1.4644, + "step": 17872 + }, + { + "epoch": 0.9771605866353211, + "grad_norm": 1.4107367992401123, + "learning_rate": 2.901876316633967e-08, + "loss": 1.398, + "step": 17873 + }, + { + "epoch": 0.9772152590790427, + "grad_norm": 1.3183534145355225, + "learning_rate": 2.887981827645936e-08, + "loss": 1.5917, + "step": 17874 + }, + { + "epoch": 0.9772699315227642, + "grad_norm": 1.6721670627593994, + "learning_rate": 2.874120634568156e-08, + "loss": 1.3391, + "step": 17875 + }, + { + "epoch": 0.9773246039664858, + "grad_norm": 1.2133901119232178, + "learning_rate": 2.860292737863146e-08, + "loss": 1.5437, + "step": 17876 + }, + { + "epoch": 0.9773792764102074, + "grad_norm": 1.4747192859649658, + "learning_rate": 2.8464981379929814e-08, + "loss": 1.4994, + "step": 17877 + }, + { + "epoch": 0.9774339488539289, + "grad_norm": 1.453108310699463, + "learning_rate": 2.8327368354180707e-08, + "loss": 1.4882, + "step": 17878 + }, + { + "epoch": 0.9774886212976505, + "grad_norm": 1.316787600517273, + "learning_rate": 2.8190088305979357e-08, + "loss": 1.5665, + "step": 17879 + }, + { + "epoch": 0.977543293741372, + "grad_norm": 1.367666482925415, + "learning_rate": 2.8053141239912095e-08, + "loss": 1.4589, + "step": 17880 + }, + { + "epoch": 0.9775979661850935, + "grad_norm": 1.6273900270462036, + "learning_rate": 2.791652716054971e-08, + "loss": 1.3184, + "step": 17881 + }, + { + "epoch": 0.9776526386288151, + "grad_norm": 1.7790653705596924, + "learning_rate": 2.778024607245411e-08, + "loss": 1.4631, + "step": 17882 + }, + { + "epoch": 0.9777073110725366, + "grad_norm": 1.773590326309204, + "learning_rate": 2.7644297980177204e-08, + "loss": 1.3759, + "step": 17883 + }, + { + "epoch": 0.9777619835162582, + "grad_norm": 2.0136990547180176, + "learning_rate": 2.7508682888257587e-08, + "loss": 1.3215, + "step": 17884 + }, + { + "epoch": 0.9778166559599798, + "grad_norm": 1.8650541305541992, + "learning_rate": 2.737340080122497e-08, + "loss": 1.6659, + "step": 17885 + }, + { + "epoch": 0.9778713284037013, + "grad_norm": 1.5791518688201904, + "learning_rate": 2.7238451723594628e-08, + "loss": 1.3209, + "step": 17886 + }, + { + "epoch": 0.9779260008474229, + "grad_norm": 1.5766865015029907, + "learning_rate": 2.7103835659875177e-08, + "loss": 1.474, + "step": 17887 + }, + { + "epoch": 0.9779806732911445, + "grad_norm": 2.0432088375091553, + "learning_rate": 2.69695526145608e-08, + "loss": 1.3041, + "step": 17888 + }, + { + "epoch": 0.978035345734866, + "grad_norm": 1.3252812623977661, + "learning_rate": 2.683560259213569e-08, + "loss": 1.8098, + "step": 17889 + }, + { + "epoch": 0.9780900181785875, + "grad_norm": 1.6488494873046875, + "learning_rate": 2.6701985597071822e-08, + "loss": 1.4946, + "step": 17890 + }, + { + "epoch": 0.9781446906223091, + "grad_norm": 2.5018205642700195, + "learning_rate": 2.6568701633832295e-08, + "loss": 1.2856, + "step": 17891 + }, + { + "epoch": 0.9781993630660306, + "grad_norm": 1.5136622190475464, + "learning_rate": 2.643575070686688e-08, + "loss": 1.3028, + "step": 17892 + }, + { + "epoch": 0.9782540355097522, + "grad_norm": 1.703057050704956, + "learning_rate": 2.6303132820616474e-08, + "loss": 1.5352, + "step": 17893 + }, + { + "epoch": 0.9783087079534738, + "grad_norm": 1.8717290163040161, + "learning_rate": 2.617084797950753e-08, + "loss": 1.3759, + "step": 17894 + }, + { + "epoch": 0.9783633803971953, + "grad_norm": 1.639100432395935, + "learning_rate": 2.603889618795874e-08, + "loss": 1.491, + "step": 17895 + }, + { + "epoch": 0.9784180528409169, + "grad_norm": 1.5967373847961426, + "learning_rate": 2.590727745037547e-08, + "loss": 1.4815, + "step": 17896 + }, + { + "epoch": 0.9784727252846384, + "grad_norm": 1.3627288341522217, + "learning_rate": 2.5775991771153085e-08, + "loss": 1.7538, + "step": 17897 + }, + { + "epoch": 0.97852739772836, + "grad_norm": 1.4720652103424072, + "learning_rate": 2.5645039154675867e-08, + "loss": 1.5676, + "step": 17898 + }, + { + "epoch": 0.9785820701720815, + "grad_norm": 1.4767831563949585, + "learning_rate": 2.5514419605315867e-08, + "loss": 1.4443, + "step": 17899 + }, + { + "epoch": 0.978636742615803, + "grad_norm": 1.8764714002609253, + "learning_rate": 2.538413312743515e-08, + "loss": 1.3896, + "step": 17900 + }, + { + "epoch": 0.9786914150595246, + "grad_norm": 2.9162049293518066, + "learning_rate": 2.5254179725384686e-08, + "loss": 1.1868, + "step": 17901 + }, + { + "epoch": 0.9787460875032462, + "grad_norm": 1.430561900138855, + "learning_rate": 2.5124559403504334e-08, + "loss": 1.5693, + "step": 17902 + }, + { + "epoch": 0.9788007599469677, + "grad_norm": 1.2461543083190918, + "learning_rate": 2.499527216612063e-08, + "loss": 1.9801, + "step": 17903 + }, + { + "epoch": 0.9788554323906893, + "grad_norm": 1.9691259860992432, + "learning_rate": 2.486631801755235e-08, + "loss": 1.3239, + "step": 17904 + }, + { + "epoch": 0.9789101048344109, + "grad_norm": 1.987707495689392, + "learning_rate": 2.4737696962106038e-08, + "loss": 1.4665, + "step": 17905 + }, + { + "epoch": 0.9789647772781324, + "grad_norm": 1.249052882194519, + "learning_rate": 2.4609409004074937e-08, + "loss": 1.289, + "step": 17906 + }, + { + "epoch": 0.979019449721854, + "grad_norm": 1.1923905611038208, + "learning_rate": 2.448145414774339e-08, + "loss": 1.7789, + "step": 17907 + }, + { + "epoch": 0.9790741221655755, + "grad_norm": 1.8396843671798706, + "learning_rate": 2.4353832397384647e-08, + "loss": 1.2563, + "step": 17908 + }, + { + "epoch": 0.979128794609297, + "grad_norm": 1.3212076425552368, + "learning_rate": 2.4226543757259746e-08, + "loss": 1.2887, + "step": 17909 + }, + { + "epoch": 0.9791834670530186, + "grad_norm": 1.3279849290847778, + "learning_rate": 2.4099588231619732e-08, + "loss": 1.2774, + "step": 17910 + }, + { + "epoch": 0.9792381394967401, + "grad_norm": 1.7950818538665771, + "learning_rate": 2.3972965824703433e-08, + "loss": 1.6053, + "step": 17911 + }, + { + "epoch": 0.9792928119404617, + "grad_norm": 1.6148021221160889, + "learning_rate": 2.3846676540739687e-08, + "loss": 1.5212, + "step": 17912 + }, + { + "epoch": 0.9793474843841833, + "grad_norm": 1.58357834815979, + "learning_rate": 2.372072038394402e-08, + "loss": 1.6163, + "step": 17913 + }, + { + "epoch": 0.9794021568279048, + "grad_norm": 1.616226315498352, + "learning_rate": 2.3595097358525275e-08, + "loss": 1.3562, + "step": 17914 + }, + { + "epoch": 0.9794568292716264, + "grad_norm": 1.4938286542892456, + "learning_rate": 2.3469807468675664e-08, + "loss": 1.178, + "step": 17915 + }, + { + "epoch": 0.979511501715348, + "grad_norm": 1.3709536790847778, + "learning_rate": 2.3344850718579613e-08, + "loss": 1.2642, + "step": 17916 + }, + { + "epoch": 0.9795661741590694, + "grad_norm": 1.477663516998291, + "learning_rate": 2.3220227112410455e-08, + "loss": 1.4299, + "step": 17917 + }, + { + "epoch": 0.979620846602791, + "grad_norm": 1.390068531036377, + "learning_rate": 2.3095936654328187e-08, + "loss": 1.6005, + "step": 17918 + }, + { + "epoch": 0.9796755190465126, + "grad_norm": 1.548303484916687, + "learning_rate": 2.2971979348485053e-08, + "loss": 1.516, + "step": 17919 + }, + { + "epoch": 0.9797301914902341, + "grad_norm": 1.850728154182434, + "learning_rate": 2.2848355199019954e-08, + "loss": 1.5841, + "step": 17920 + }, + { + "epoch": 0.9797848639339557, + "grad_norm": 1.2575500011444092, + "learning_rate": 2.27250642100596e-08, + "loss": 1.6007, + "step": 17921 + }, + { + "epoch": 0.9798395363776773, + "grad_norm": 1.713570475578308, + "learning_rate": 2.26021063857218e-08, + "loss": 1.331, + "step": 17922 + }, + { + "epoch": 0.9798942088213988, + "grad_norm": 1.7767891883850098, + "learning_rate": 2.2479481730112162e-08, + "loss": 1.4971, + "step": 17923 + }, + { + "epoch": 0.9799488812651204, + "grad_norm": 1.6262617111206055, + "learning_rate": 2.23571902473263e-08, + "loss": 1.3366, + "step": 17924 + }, + { + "epoch": 0.9800035537088418, + "grad_norm": 1.4280132055282593, + "learning_rate": 2.22352319414465e-08, + "loss": 1.1224, + "step": 17925 + }, + { + "epoch": 0.9800582261525634, + "grad_norm": 1.7851969003677368, + "learning_rate": 2.2113606816546172e-08, + "loss": 1.481, + "step": 17926 + }, + { + "epoch": 0.980112898596285, + "grad_norm": 1.329948902130127, + "learning_rate": 2.1992314876686515e-08, + "loss": 1.3521, + "step": 17927 + }, + { + "epoch": 0.9801675710400065, + "grad_norm": 1.6541680097579956, + "learning_rate": 2.187135612591651e-08, + "loss": 1.2518, + "step": 17928 + }, + { + "epoch": 0.9802222434837281, + "grad_norm": 1.354457974433899, + "learning_rate": 2.1750730568277366e-08, + "loss": 1.3902, + "step": 17929 + }, + { + "epoch": 0.9802769159274497, + "grad_norm": 1.6876780986785889, + "learning_rate": 2.1630438207795866e-08, + "loss": 1.4009, + "step": 17930 + }, + { + "epoch": 0.9803315883711712, + "grad_norm": 1.512227177619934, + "learning_rate": 2.1510479048488797e-08, + "loss": 1.5454, + "step": 17931 + }, + { + "epoch": 0.9803862608148928, + "grad_norm": 1.6301887035369873, + "learning_rate": 2.139085309436184e-08, + "loss": 1.3911, + "step": 17932 + }, + { + "epoch": 0.9804409332586144, + "grad_norm": 2.149733066558838, + "learning_rate": 2.127156034941069e-08, + "loss": 1.5024, + "step": 17933 + }, + { + "epoch": 0.9804956057023358, + "grad_norm": 1.6432298421859741, + "learning_rate": 2.115260081761772e-08, + "loss": 1.2645, + "step": 17934 + }, + { + "epoch": 0.9805502781460574, + "grad_norm": 1.798677921295166, + "learning_rate": 2.10339745029553e-08, + "loss": 1.3091, + "step": 17935 + }, + { + "epoch": 0.980604950589779, + "grad_norm": 1.5478899478912354, + "learning_rate": 2.0915681409384713e-08, + "loss": 1.5148, + "step": 17936 + }, + { + "epoch": 0.9806596230335005, + "grad_norm": 1.3170820474624634, + "learning_rate": 2.0797721540856132e-08, + "loss": 1.4739, + "step": 17937 + }, + { + "epoch": 0.9807142954772221, + "grad_norm": 1.6750907897949219, + "learning_rate": 2.0680094901308622e-08, + "loss": 1.4498, + "step": 17938 + }, + { + "epoch": 0.9807689679209437, + "grad_norm": 1.543102502822876, + "learning_rate": 2.056280149467016e-08, + "loss": 1.477, + "step": 17939 + }, + { + "epoch": 0.9808236403646652, + "grad_norm": 1.2508125305175781, + "learning_rate": 2.0445841324856497e-08, + "loss": 1.4577, + "step": 17940 + }, + { + "epoch": 0.9808783128083868, + "grad_norm": 1.5266450643539429, + "learning_rate": 2.032921439577562e-08, + "loss": 1.5032, + "step": 17941 + }, + { + "epoch": 0.9809329852521083, + "grad_norm": 1.4480253458023071, + "learning_rate": 2.021292071131886e-08, + "loss": 1.2827, + "step": 17942 + }, + { + "epoch": 0.9809876576958299, + "grad_norm": 1.7015191316604614, + "learning_rate": 2.0096960275370893e-08, + "loss": 1.2826, + "step": 17943 + }, + { + "epoch": 0.9810423301395514, + "grad_norm": 1.6113941669464111, + "learning_rate": 1.9981333091805278e-08, + "loss": 1.4412, + "step": 17944 + }, + { + "epoch": 0.9810970025832729, + "grad_norm": 1.359933853149414, + "learning_rate": 1.9866039164481156e-08, + "loss": 1.3428, + "step": 17945 + }, + { + "epoch": 0.9811516750269945, + "grad_norm": 1.5707628726959229, + "learning_rate": 1.9751078497248777e-08, + "loss": 1.7035, + "step": 17946 + }, + { + "epoch": 0.9812063474707161, + "grad_norm": 1.367645025253296, + "learning_rate": 1.9636451093947296e-08, + "loss": 1.2817, + "step": 17947 + }, + { + "epoch": 0.9812610199144376, + "grad_norm": 1.3623080253601074, + "learning_rate": 1.9522156958404758e-08, + "loss": 1.5388, + "step": 17948 + }, + { + "epoch": 0.9813156923581592, + "grad_norm": 1.7430005073547363, + "learning_rate": 1.9408196094437004e-08, + "loss": 1.6055, + "step": 17949 + }, + { + "epoch": 0.9813703648018808, + "grad_norm": 1.2906516790390015, + "learning_rate": 1.9294568505849876e-08, + "loss": 1.6859, + "step": 17950 + }, + { + "epoch": 0.9814250372456023, + "grad_norm": 1.4089373350143433, + "learning_rate": 1.9181274196437005e-08, + "loss": 1.3193, + "step": 17951 + }, + { + "epoch": 0.9814797096893239, + "grad_norm": 1.5625331401824951, + "learning_rate": 1.9068313169983144e-08, + "loss": 1.5091, + "step": 17952 + }, + { + "epoch": 0.9815343821330454, + "grad_norm": 1.5987474918365479, + "learning_rate": 1.8955685430258608e-08, + "loss": 1.3453, + "step": 17953 + }, + { + "epoch": 0.9815890545767669, + "grad_norm": 1.8118940591812134, + "learning_rate": 1.8843390981024835e-08, + "loss": 1.2316, + "step": 17954 + }, + { + "epoch": 0.9816437270204885, + "grad_norm": 2.01167631149292, + "learning_rate": 1.8731429826032154e-08, + "loss": 1.345, + "step": 17955 + }, + { + "epoch": 0.98169839946421, + "grad_norm": 1.5382815599441528, + "learning_rate": 1.8619801969018692e-08, + "loss": 1.2608, + "step": 17956 + }, + { + "epoch": 0.9817530719079316, + "grad_norm": 1.3027026653289795, + "learning_rate": 1.8508507413712572e-08, + "loss": 1.3576, + "step": 17957 + }, + { + "epoch": 0.9818077443516532, + "grad_norm": 1.5246679782867432, + "learning_rate": 1.8397546163829716e-08, + "loss": 1.6797, + "step": 17958 + }, + { + "epoch": 0.9818624167953747, + "grad_norm": 1.3801751136779785, + "learning_rate": 1.8286918223074935e-08, + "loss": 1.2815, + "step": 17959 + }, + { + "epoch": 0.9819170892390963, + "grad_norm": 1.35205078125, + "learning_rate": 1.817662359514194e-08, + "loss": 1.4497, + "step": 17960 + }, + { + "epoch": 0.9819717616828179, + "grad_norm": 1.744530200958252, + "learning_rate": 1.8066662283715562e-08, + "loss": 1.5065, + "step": 17961 + }, + { + "epoch": 0.9820264341265393, + "grad_norm": 1.8839616775512695, + "learning_rate": 1.7957034292466204e-08, + "loss": 1.42, + "step": 17962 + }, + { + "epoch": 0.9820811065702609, + "grad_norm": 1.434599757194519, + "learning_rate": 1.7847739625055372e-08, + "loss": 1.5609, + "step": 17963 + }, + { + "epoch": 0.9821357790139825, + "grad_norm": 1.2340019941329956, + "learning_rate": 1.7738778285132373e-08, + "loss": 1.4328, + "step": 17964 + }, + { + "epoch": 0.982190451457704, + "grad_norm": 1.6210458278656006, + "learning_rate": 1.7630150276336523e-08, + "loss": 1.3853, + "step": 17965 + }, + { + "epoch": 0.9822451239014256, + "grad_norm": 1.427489995956421, + "learning_rate": 1.7521855602292693e-08, + "loss": 1.5756, + "step": 17966 + }, + { + "epoch": 0.9822997963451472, + "grad_norm": 1.910926342010498, + "learning_rate": 1.7413894266619104e-08, + "loss": 1.4182, + "step": 17967 + }, + { + "epoch": 0.9823544687888687, + "grad_norm": 1.311643362045288, + "learning_rate": 1.7306266272921756e-08, + "loss": 1.4392, + "step": 17968 + }, + { + "epoch": 0.9824091412325903, + "grad_norm": 1.8483372926712036, + "learning_rate": 1.7198971624792226e-08, + "loss": 1.6136, + "step": 17969 + }, + { + "epoch": 0.9824638136763117, + "grad_norm": 1.4528543949127197, + "learning_rate": 1.7092010325814312e-08, + "loss": 1.5943, + "step": 17970 + }, + { + "epoch": 0.9825184861200333, + "grad_norm": 2.128981828689575, + "learning_rate": 1.6985382379559602e-08, + "loss": 1.4091, + "step": 17971 + }, + { + "epoch": 0.9825731585637549, + "grad_norm": 1.6027917861938477, + "learning_rate": 1.6879087789589687e-08, + "loss": 1.4672, + "step": 17972 + }, + { + "epoch": 0.9826278310074764, + "grad_norm": 1.380798578262329, + "learning_rate": 1.6773126559452845e-08, + "loss": 1.4369, + "step": 17973 + }, + { + "epoch": 0.982682503451198, + "grad_norm": 2.014335870742798, + "learning_rate": 1.6667498692687358e-08, + "loss": 1.4633, + "step": 17974 + }, + { + "epoch": 0.9827371758949196, + "grad_norm": 1.8359943628311157, + "learning_rate": 1.6562204192821507e-08, + "loss": 1.3454, + "step": 17975 + }, + { + "epoch": 0.9827918483386411, + "grad_norm": 2.1775176525115967, + "learning_rate": 1.6457243063370265e-08, + "loss": 1.3638, + "step": 17976 + }, + { + "epoch": 0.9828465207823627, + "grad_norm": 1.6118463277816772, + "learning_rate": 1.6352615307838604e-08, + "loss": 1.7159, + "step": 17977 + }, + { + "epoch": 0.9829011932260843, + "grad_norm": 1.3925389051437378, + "learning_rate": 1.6248320929719285e-08, + "loss": 1.6219, + "step": 17978 + }, + { + "epoch": 0.9829558656698058, + "grad_norm": 1.7474768161773682, + "learning_rate": 1.61443599324973e-08, + "loss": 1.2782, + "step": 17979 + }, + { + "epoch": 0.9830105381135273, + "grad_norm": 1.4746543169021606, + "learning_rate": 1.6040732319643204e-08, + "loss": 1.466, + "step": 17980 + }, + { + "epoch": 0.9830652105572489, + "grad_norm": 1.4174506664276123, + "learning_rate": 1.5937438094617564e-08, + "loss": 1.5267, + "step": 17981 + }, + { + "epoch": 0.9831198830009704, + "grad_norm": 1.5212726593017578, + "learning_rate": 1.583447726086762e-08, + "loss": 1.4342, + "step": 17982 + }, + { + "epoch": 0.983174555444692, + "grad_norm": 1.3297951221466064, + "learning_rate": 1.5731849821833955e-08, + "loss": 1.4476, + "step": 17983 + }, + { + "epoch": 0.9832292278884135, + "grad_norm": 1.5770055055618286, + "learning_rate": 1.5629555780942717e-08, + "loss": 1.4686, + "step": 17984 + }, + { + "epoch": 0.9832839003321351, + "grad_norm": 1.5902786254882812, + "learning_rate": 1.5527595141610064e-08, + "loss": 1.4624, + "step": 17985 + }, + { + "epoch": 0.9833385727758567, + "grad_norm": 1.7052278518676758, + "learning_rate": 1.5425967907239935e-08, + "loss": 1.4678, + "step": 17986 + }, + { + "epoch": 0.9833932452195782, + "grad_norm": 1.9112871885299683, + "learning_rate": 1.5324674081226286e-08, + "loss": 1.2265, + "step": 17987 + }, + { + "epoch": 0.9834479176632998, + "grad_norm": 1.786947250366211, + "learning_rate": 1.5223713666950857e-08, + "loss": 1.5061, + "step": 17988 + }, + { + "epoch": 0.9835025901070213, + "grad_norm": 1.244701862335205, + "learning_rate": 1.5123086667786502e-08, + "loss": 1.3321, + "step": 17989 + }, + { + "epoch": 0.9835572625507428, + "grad_norm": 1.4789767265319824, + "learning_rate": 1.5022793087092757e-08, + "loss": 1.3532, + "step": 17990 + }, + { + "epoch": 0.9836119349944644, + "grad_norm": 1.8350359201431274, + "learning_rate": 1.4922832928218058e-08, + "loss": 1.44, + "step": 17991 + }, + { + "epoch": 0.983666607438186, + "grad_norm": 1.5841344594955444, + "learning_rate": 1.4823206194499728e-08, + "loss": 1.4885, + "step": 17992 + }, + { + "epoch": 0.9837212798819075, + "grad_norm": 1.6465811729431152, + "learning_rate": 1.4723912889266224e-08, + "loss": 1.4741, + "step": 17993 + }, + { + "epoch": 0.9837759523256291, + "grad_norm": 1.8360543251037598, + "learning_rate": 1.4624953015832666e-08, + "loss": 1.3912, + "step": 17994 + }, + { + "epoch": 0.9838306247693507, + "grad_norm": 1.4436904191970825, + "learning_rate": 1.452632657750308e-08, + "loss": 1.249, + "step": 17995 + }, + { + "epoch": 0.9838852972130722, + "grad_norm": 1.5068265199661255, + "learning_rate": 1.4428033577571498e-08, + "loss": 1.2725, + "step": 17996 + }, + { + "epoch": 0.9839399696567938, + "grad_norm": 2.4360368251800537, + "learning_rate": 1.433007401931974e-08, + "loss": 1.1046, + "step": 17997 + }, + { + "epoch": 0.9839946421005152, + "grad_norm": 1.9103964567184448, + "learning_rate": 1.423244790601852e-08, + "loss": 1.3906, + "step": 17998 + }, + { + "epoch": 0.9840493145442368, + "grad_norm": 1.4174911975860596, + "learning_rate": 1.4135155240928566e-08, + "loss": 1.5784, + "step": 17999 + }, + { + "epoch": 0.9841039869879584, + "grad_norm": 1.3439942598342896, + "learning_rate": 1.4038196027298389e-08, + "loss": 1.5903, + "step": 18000 + }, + { + "epoch": 0.9841586594316799, + "grad_norm": 1.220409870147705, + "learning_rate": 1.3941570268365401e-08, + "loss": 1.3893, + "step": 18001 + }, + { + "epoch": 0.9842133318754015, + "grad_norm": 1.6014363765716553, + "learning_rate": 1.3845277967355908e-08, + "loss": 1.3527, + "step": 18002 + }, + { + "epoch": 0.9842680043191231, + "grad_norm": 1.7899733781814575, + "learning_rate": 1.3749319127486228e-08, + "loss": 1.4584, + "step": 18003 + }, + { + "epoch": 0.9843226767628446, + "grad_norm": 1.628475546836853, + "learning_rate": 1.3653693751960461e-08, + "loss": 1.6784, + "step": 18004 + }, + { + "epoch": 0.9843773492065662, + "grad_norm": 1.6807347536087036, + "learning_rate": 1.3558401843971613e-08, + "loss": 1.3667, + "step": 18005 + }, + { + "epoch": 0.9844320216502878, + "grad_norm": 1.4954643249511719, + "learning_rate": 1.3463443406701581e-08, + "loss": 1.534, + "step": 18006 + }, + { + "epoch": 0.9844866940940092, + "grad_norm": 1.2739770412445068, + "learning_rate": 1.3368818443321163e-08, + "loss": 1.5237, + "step": 18007 + }, + { + "epoch": 0.9845413665377308, + "grad_norm": 1.763657569885254, + "learning_rate": 1.3274526956990052e-08, + "loss": 1.4441, + "step": 18008 + }, + { + "epoch": 0.9845960389814524, + "grad_norm": 1.3959156274795532, + "learning_rate": 1.3180568950856843e-08, + "loss": 1.5376, + "step": 18009 + }, + { + "epoch": 0.9846507114251739, + "grad_norm": 1.6612459421157837, + "learning_rate": 1.3086944428060132e-08, + "loss": 1.3772, + "step": 18010 + }, + { + "epoch": 0.9847053838688955, + "grad_norm": 1.8009002208709717, + "learning_rate": 1.2993653391725204e-08, + "loss": 1.6116, + "step": 18011 + }, + { + "epoch": 0.984760056312617, + "grad_norm": 1.2931902408599854, + "learning_rate": 1.2900695844967336e-08, + "loss": 1.4967, + "step": 18012 + }, + { + "epoch": 0.9848147287563386, + "grad_norm": 1.6816084384918213, + "learning_rate": 1.2808071790889609e-08, + "loss": 1.5216, + "step": 18013 + }, + { + "epoch": 0.9848694012000602, + "grad_norm": 1.5357741117477417, + "learning_rate": 1.271578123258732e-08, + "loss": 1.2972, + "step": 18014 + }, + { + "epoch": 0.9849240736437817, + "grad_norm": 1.6421679258346558, + "learning_rate": 1.2623824173140231e-08, + "loss": 1.3819, + "step": 18015 + }, + { + "epoch": 0.9849787460875032, + "grad_norm": 1.5384091138839722, + "learning_rate": 1.2532200615620327e-08, + "loss": 1.4889, + "step": 18016 + }, + { + "epoch": 0.9850334185312248, + "grad_norm": 1.2580510377883911, + "learning_rate": 1.2440910563086273e-08, + "loss": 1.2281, + "step": 18017 + }, + { + "epoch": 0.9850880909749463, + "grad_norm": 3.6593010425567627, + "learning_rate": 1.234995401858785e-08, + "loss": 1.3275, + "step": 18018 + }, + { + "epoch": 0.9851427634186679, + "grad_norm": 1.6834295988082886, + "learning_rate": 1.2259330985159302e-08, + "loss": 1.5829, + "step": 18019 + }, + { + "epoch": 0.9851974358623895, + "grad_norm": 1.6838335990905762, + "learning_rate": 1.2169041465830423e-08, + "loss": 1.4268, + "step": 18020 + }, + { + "epoch": 0.985252108306111, + "grad_norm": 1.5181046724319458, + "learning_rate": 1.2079085463613249e-08, + "loss": 1.4696, + "step": 18021 + }, + { + "epoch": 0.9853067807498326, + "grad_norm": 1.3638153076171875, + "learning_rate": 1.1989462981513156e-08, + "loss": 1.3711, + "step": 18022 + }, + { + "epoch": 0.9853614531935542, + "grad_norm": 1.447048306465149, + "learning_rate": 1.1900174022522192e-08, + "loss": 1.7229, + "step": 18023 + }, + { + "epoch": 0.9854161256372757, + "grad_norm": 1.7211382389068604, + "learning_rate": 1.181121858962353e-08, + "loss": 1.4391, + "step": 18024 + }, + { + "epoch": 0.9854707980809972, + "grad_norm": 1.7483692169189453, + "learning_rate": 1.1722596685784793e-08, + "loss": 1.5649, + "step": 18025 + }, + { + "epoch": 0.9855254705247187, + "grad_norm": 1.7472002506256104, + "learning_rate": 1.1634308313966947e-08, + "loss": 1.3183, + "step": 18026 + }, + { + "epoch": 0.9855801429684403, + "grad_norm": 1.5646157264709473, + "learning_rate": 1.1546353477118743e-08, + "loss": 1.277, + "step": 18027 + }, + { + "epoch": 0.9856348154121619, + "grad_norm": 1.6757290363311768, + "learning_rate": 1.1458732178175613e-08, + "loss": 1.6167, + "step": 18028 + }, + { + "epoch": 0.9856894878558834, + "grad_norm": 1.583946943283081, + "learning_rate": 1.1371444420065214e-08, + "loss": 1.4119, + "step": 18029 + }, + { + "epoch": 0.985744160299605, + "grad_norm": 1.3283075094223022, + "learning_rate": 1.1284490205700771e-08, + "loss": 1.403, + "step": 18030 + }, + { + "epoch": 0.9857988327433266, + "grad_norm": 2.109675645828247, + "learning_rate": 1.1197869537986627e-08, + "loss": 1.327, + "step": 18031 + }, + { + "epoch": 0.9858535051870481, + "grad_norm": 1.736030101776123, + "learning_rate": 1.1111582419814914e-08, + "loss": 1.4448, + "step": 18032 + }, + { + "epoch": 0.9859081776307697, + "grad_norm": 1.320744514465332, + "learning_rate": 1.102562885406666e-08, + "loss": 1.3128, + "step": 18033 + }, + { + "epoch": 0.9859628500744912, + "grad_norm": 1.5801371335983276, + "learning_rate": 1.0940008843612904e-08, + "loss": 1.3615, + "step": 18034 + }, + { + "epoch": 0.9860175225182127, + "grad_norm": 1.2509557008743286, + "learning_rate": 1.0854722391312467e-08, + "loss": 1.5167, + "step": 18035 + }, + { + "epoch": 0.9860721949619343, + "grad_norm": 1.2605632543563843, + "learning_rate": 1.0769769500013072e-08, + "loss": 1.4252, + "step": 18036 + }, + { + "epoch": 0.9861268674056559, + "grad_norm": 1.467165231704712, + "learning_rate": 1.068515017255245e-08, + "loss": 1.3194, + "step": 18037 + }, + { + "epoch": 0.9861815398493774, + "grad_norm": 1.9028055667877197, + "learning_rate": 1.0600864411753897e-08, + "loss": 1.3752, + "step": 18038 + }, + { + "epoch": 0.986236212293099, + "grad_norm": 1.702620267868042, + "learning_rate": 1.051691222043405e-08, + "loss": 1.4599, + "step": 18039 + }, + { + "epoch": 0.9862908847368205, + "grad_norm": 1.5771998167037964, + "learning_rate": 1.0433293601395112e-08, + "loss": 1.456, + "step": 18040 + }, + { + "epoch": 0.9863455571805421, + "grad_norm": 1.5104976892471313, + "learning_rate": 1.0350008557430402e-08, + "loss": 1.5258, + "step": 18041 + }, + { + "epoch": 0.9864002296242637, + "grad_norm": 1.4022786617279053, + "learning_rate": 1.0267057091319921e-08, + "loss": 1.5683, + "step": 18042 + }, + { + "epoch": 0.9864549020679851, + "grad_norm": 1.344634771347046, + "learning_rate": 1.0184439205833675e-08, + "loss": 1.5823, + "step": 18043 + }, + { + "epoch": 0.9865095745117067, + "grad_norm": 1.8762515783309937, + "learning_rate": 1.0102154903731676e-08, + "loss": 1.5051, + "step": 18044 + }, + { + "epoch": 0.9865642469554283, + "grad_norm": 1.510294795036316, + "learning_rate": 1.0020204187759507e-08, + "loss": 1.4453, + "step": 18045 + }, + { + "epoch": 0.9866189193991498, + "grad_norm": 1.3982045650482178, + "learning_rate": 9.93858706065609e-09, + "loss": 1.4042, + "step": 18046 + }, + { + "epoch": 0.9866735918428714, + "grad_norm": 1.943657398223877, + "learning_rate": 9.857303525145911e-09, + "loss": 1.1991, + "step": 18047 + }, + { + "epoch": 0.986728264286593, + "grad_norm": 1.8489307165145874, + "learning_rate": 9.776353583942356e-09, + "loss": 1.3079, + "step": 18048 + }, + { + "epoch": 0.9867829367303145, + "grad_norm": 1.574211597442627, + "learning_rate": 9.695737239748815e-09, + "loss": 1.4507, + "step": 18049 + }, + { + "epoch": 0.9868376091740361, + "grad_norm": 1.4079437255859375, + "learning_rate": 9.615454495257581e-09, + "loss": 1.4735, + "step": 18050 + }, + { + "epoch": 0.9868922816177577, + "grad_norm": 1.6813846826553345, + "learning_rate": 9.535505353149843e-09, + "loss": 1.2386, + "step": 18051 + }, + { + "epoch": 0.9869469540614791, + "grad_norm": 1.2993543148040771, + "learning_rate": 9.455889816095687e-09, + "loss": 1.2252, + "step": 18052 + }, + { + "epoch": 0.9870016265052007, + "grad_norm": 1.4434391260147095, + "learning_rate": 9.376607886751876e-09, + "loss": 1.8243, + "step": 18053 + }, + { + "epoch": 0.9870562989489222, + "grad_norm": 1.7387789487838745, + "learning_rate": 9.297659567767403e-09, + "loss": 1.3367, + "step": 18054 + }, + { + "epoch": 0.9871109713926438, + "grad_norm": 1.3260489702224731, + "learning_rate": 9.219044861777937e-09, + "loss": 1.6432, + "step": 18055 + }, + { + "epoch": 0.9871656438363654, + "grad_norm": 1.5994552373886108, + "learning_rate": 9.140763771408045e-09, + "loss": 1.5114, + "step": 18056 + }, + { + "epoch": 0.9872203162800869, + "grad_norm": 1.5331673622131348, + "learning_rate": 9.062816299272304e-09, + "loss": 1.3057, + "step": 18057 + }, + { + "epoch": 0.9872749887238085, + "grad_norm": 1.5809857845306396, + "learning_rate": 8.985202447974183e-09, + "loss": 1.4338, + "step": 18058 + }, + { + "epoch": 0.9873296611675301, + "grad_norm": 1.9266173839569092, + "learning_rate": 8.907922220104947e-09, + "loss": 1.33, + "step": 18059 + }, + { + "epoch": 0.9873843336112516, + "grad_norm": 1.4101786613464355, + "learning_rate": 8.830975618244752e-09, + "loss": 1.5039, + "step": 18060 + }, + { + "epoch": 0.9874390060549731, + "grad_norm": 1.7372101545333862, + "learning_rate": 8.754362644963765e-09, + "loss": 1.4327, + "step": 18061 + }, + { + "epoch": 0.9874936784986947, + "grad_norm": 1.3555986881256104, + "learning_rate": 8.67808330281883e-09, + "loss": 1.4126, + "step": 18062 + }, + { + "epoch": 0.9875483509424162, + "grad_norm": 1.4791145324707031, + "learning_rate": 8.602137594359016e-09, + "loss": 1.3749, + "step": 18063 + }, + { + "epoch": 0.9876030233861378, + "grad_norm": 1.4618884325027466, + "learning_rate": 8.526525522118967e-09, + "loss": 1.3833, + "step": 18064 + }, + { + "epoch": 0.9876576958298594, + "grad_norm": 1.8712913990020752, + "learning_rate": 8.451247088623327e-09, + "loss": 1.4168, + "step": 18065 + }, + { + "epoch": 0.9877123682735809, + "grad_norm": 1.3830574750900269, + "learning_rate": 8.376302296387862e-09, + "loss": 1.4869, + "step": 18066 + }, + { + "epoch": 0.9877670407173025, + "grad_norm": 1.40156090259552, + "learning_rate": 8.301691147912794e-09, + "loss": 1.4502, + "step": 18067 + }, + { + "epoch": 0.987821713161024, + "grad_norm": 1.7078394889831543, + "learning_rate": 8.227413645690574e-09, + "loss": 1.4069, + "step": 18068 + }, + { + "epoch": 0.9878763856047456, + "grad_norm": 1.3595595359802246, + "learning_rate": 8.15346979220144e-09, + "loss": 1.3765, + "step": 18069 + }, + { + "epoch": 0.9879310580484671, + "grad_norm": 1.38761305809021, + "learning_rate": 8.07985958991453e-09, + "loss": 1.4311, + "step": 18070 + }, + { + "epoch": 0.9879857304921886, + "grad_norm": 1.4871448278427124, + "learning_rate": 8.006583041287874e-09, + "loss": 1.4521, + "step": 18071 + }, + { + "epoch": 0.9880404029359102, + "grad_norm": 1.4016886949539185, + "learning_rate": 7.933640148768406e-09, + "loss": 1.4135, + "step": 18072 + }, + { + "epoch": 0.9880950753796318, + "grad_norm": 1.9977465867996216, + "learning_rate": 7.861030914791956e-09, + "loss": 1.4549, + "step": 18073 + }, + { + "epoch": 0.9881497478233533, + "grad_norm": 2.0203897953033447, + "learning_rate": 7.78875534178325e-09, + "loss": 1.5004, + "step": 18074 + }, + { + "epoch": 0.9882044202670749, + "grad_norm": 1.3410289287567139, + "learning_rate": 7.716813432154802e-09, + "loss": 1.5119, + "step": 18075 + }, + { + "epoch": 0.9882590927107965, + "grad_norm": 1.6324907541275024, + "learning_rate": 7.645205188310245e-09, + "loss": 1.447, + "step": 18076 + }, + { + "epoch": 0.988313765154518, + "grad_norm": 1.272011637687683, + "learning_rate": 7.57393061263989e-09, + "loss": 1.5512, + "step": 18077 + }, + { + "epoch": 0.9883684375982396, + "grad_norm": 1.5660293102264404, + "learning_rate": 7.502989707524056e-09, + "loss": 1.3471, + "step": 18078 + }, + { + "epoch": 0.9884231100419612, + "grad_norm": 1.3986601829528809, + "learning_rate": 7.432382475330846e-09, + "loss": 1.2885, + "step": 18079 + }, + { + "epoch": 0.9884777824856826, + "grad_norm": 1.6558375358581543, + "learning_rate": 7.362108918418376e-09, + "loss": 1.3663, + "step": 18080 + }, + { + "epoch": 0.9885324549294042, + "grad_norm": 1.4934898614883423, + "learning_rate": 7.292169039134767e-09, + "loss": 1.295, + "step": 18081 + }, + { + "epoch": 0.9885871273731257, + "grad_norm": 1.7314434051513672, + "learning_rate": 7.222562839813707e-09, + "loss": 1.6088, + "step": 18082 + }, + { + "epoch": 0.9886417998168473, + "grad_norm": 1.33162260055542, + "learning_rate": 7.153290322780004e-09, + "loss": 1.6405, + "step": 18083 + }, + { + "epoch": 0.9886964722605689, + "grad_norm": 1.4692209959030151, + "learning_rate": 7.084351490347363e-09, + "loss": 1.5159, + "step": 18084 + }, + { + "epoch": 0.9887511447042904, + "grad_norm": 1.6542112827301025, + "learning_rate": 7.015746344816166e-09, + "loss": 1.5825, + "step": 18085 + }, + { + "epoch": 0.988805817148012, + "grad_norm": 1.6062002182006836, + "learning_rate": 6.947474888480132e-09, + "loss": 1.4687, + "step": 18086 + }, + { + "epoch": 0.9888604895917336, + "grad_norm": 1.2868729829788208, + "learning_rate": 6.8795371236163315e-09, + "loss": 1.6949, + "step": 18087 + }, + { + "epoch": 0.988915162035455, + "grad_norm": 1.9696440696716309, + "learning_rate": 6.811933052494057e-09, + "loss": 1.7239, + "step": 18088 + }, + { + "epoch": 0.9889698344791766, + "grad_norm": 1.4287980794906616, + "learning_rate": 6.744662677371505e-09, + "loss": 1.532, + "step": 18089 + }, + { + "epoch": 0.9890245069228982, + "grad_norm": 1.7733631134033203, + "learning_rate": 6.6777260004946555e-09, + "loss": 1.527, + "step": 18090 + }, + { + "epoch": 0.9890791793666197, + "grad_norm": 1.5024880170822144, + "learning_rate": 6.611123024098387e-09, + "loss": 1.1954, + "step": 18091 + }, + { + "epoch": 0.9891338518103413, + "grad_norm": 2.4933736324310303, + "learning_rate": 6.544853750407587e-09, + "loss": 1.1483, + "step": 18092 + }, + { + "epoch": 0.9891885242540629, + "grad_norm": 1.53069269657135, + "learning_rate": 6.478918181633819e-09, + "loss": 1.6161, + "step": 18093 + }, + { + "epoch": 0.9892431966977844, + "grad_norm": 1.6654022932052612, + "learning_rate": 6.413316319979768e-09, + "loss": 1.6625, + "step": 18094 + }, + { + "epoch": 0.989297869141506, + "grad_norm": 1.2721056938171387, + "learning_rate": 6.3480481676359006e-09, + "loss": 1.4755, + "step": 18095 + }, + { + "epoch": 0.9893525415852275, + "grad_norm": 1.6805875301361084, + "learning_rate": 6.283113726781587e-09, + "loss": 1.2203, + "step": 18096 + }, + { + "epoch": 0.989407214028949, + "grad_norm": 1.409666657447815, + "learning_rate": 6.218512999583981e-09, + "loss": 1.453, + "step": 18097 + }, + { + "epoch": 0.9894618864726706, + "grad_norm": 1.792763352394104, + "learning_rate": 6.154245988202467e-09, + "loss": 1.3406, + "step": 18098 + }, + { + "epoch": 0.9895165589163921, + "grad_norm": 1.4194880723953247, + "learning_rate": 6.0903126947819965e-09, + "loss": 1.465, + "step": 18099 + }, + { + "epoch": 0.9895712313601137, + "grad_norm": 1.5270622968673706, + "learning_rate": 6.026713121457528e-09, + "loss": 1.4719, + "step": 18100 + }, + { + "epoch": 0.9896259038038353, + "grad_norm": 1.6845932006835938, + "learning_rate": 5.9634472703518075e-09, + "loss": 1.2849, + "step": 18101 + }, + { + "epoch": 0.9896805762475568, + "grad_norm": 1.6898106336593628, + "learning_rate": 5.90051514357981e-09, + "loss": 1.5341, + "step": 18102 + }, + { + "epoch": 0.9897352486912784, + "grad_norm": 1.75912606716156, + "learning_rate": 5.837916743239857e-09, + "loss": 1.4359, + "step": 18103 + }, + { + "epoch": 0.989789921135, + "grad_norm": 1.308415412902832, + "learning_rate": 5.77565207142472e-09, + "loss": 1.4927, + "step": 18104 + }, + { + "epoch": 0.9898445935787215, + "grad_norm": 1.4973841905593872, + "learning_rate": 5.713721130212735e-09, + "loss": 1.4379, + "step": 18105 + }, + { + "epoch": 0.989899266022443, + "grad_norm": 1.8346056938171387, + "learning_rate": 5.6521239216722476e-09, + "loss": 1.4315, + "step": 18106 + }, + { + "epoch": 0.9899539384661646, + "grad_norm": 1.2856240272521973, + "learning_rate": 5.590860447858282e-09, + "loss": 1.4802, + "step": 18107 + }, + { + "epoch": 0.9900086109098861, + "grad_norm": 1.3344931602478027, + "learning_rate": 5.529930710820308e-09, + "loss": 1.4436, + "step": 18108 + }, + { + "epoch": 0.9900632833536077, + "grad_norm": 1.6574490070343018, + "learning_rate": 5.4693347125889255e-09, + "loss": 1.3253, + "step": 18109 + }, + { + "epoch": 0.9901179557973292, + "grad_norm": 1.9636117219924927, + "learning_rate": 5.409072455190289e-09, + "loss": 1.4982, + "step": 18110 + }, + { + "epoch": 0.9901726282410508, + "grad_norm": 1.4000616073608398, + "learning_rate": 5.349143940635015e-09, + "loss": 1.2258, + "step": 18111 + }, + { + "epoch": 0.9902273006847724, + "grad_norm": 1.6871531009674072, + "learning_rate": 5.289549170925945e-09, + "loss": 1.4693, + "step": 18112 + }, + { + "epoch": 0.9902819731284939, + "grad_norm": 1.3857262134552002, + "learning_rate": 5.230288148051488e-09, + "loss": 1.2735, + "step": 18113 + }, + { + "epoch": 0.9903366455722155, + "grad_norm": 1.3755815029144287, + "learning_rate": 5.1713608739911714e-09, + "loss": 1.5457, + "step": 18114 + }, + { + "epoch": 0.990391318015937, + "grad_norm": 1.5220776796340942, + "learning_rate": 5.112767350713421e-09, + "loss": 1.5205, + "step": 18115 + }, + { + "epoch": 0.9904459904596585, + "grad_norm": 1.5411854982376099, + "learning_rate": 5.05450758017334e-09, + "loss": 1.4843, + "step": 18116 + }, + { + "epoch": 0.9905006629033801, + "grad_norm": 1.4860519170761108, + "learning_rate": 4.996581564318259e-09, + "loss": 1.6626, + "step": 18117 + }, + { + "epoch": 0.9905553353471017, + "grad_norm": 1.3712096214294434, + "learning_rate": 4.938989305079966e-09, + "loss": 1.4941, + "step": 18118 + }, + { + "epoch": 0.9906100077908232, + "grad_norm": 1.562035322189331, + "learning_rate": 4.881730804383589e-09, + "loss": 1.4467, + "step": 18119 + }, + { + "epoch": 0.9906646802345448, + "grad_norm": 1.3343740701675415, + "learning_rate": 4.82480606413982e-09, + "loss": 1.5446, + "step": 18120 + }, + { + "epoch": 0.9907193526782664, + "grad_norm": 1.5709269046783447, + "learning_rate": 4.7682150862515816e-09, + "loss": 1.4923, + "step": 18121 + }, + { + "epoch": 0.9907740251219879, + "grad_norm": 1.359639286994934, + "learning_rate": 4.711957872606254e-09, + "loss": 1.4582, + "step": 18122 + }, + { + "epoch": 0.9908286975657095, + "grad_norm": 1.7693957090377808, + "learning_rate": 4.656034425083445e-09, + "loss": 1.3252, + "step": 18123 + }, + { + "epoch": 0.9908833700094309, + "grad_norm": 1.9618241786956787, + "learning_rate": 4.600444745550548e-09, + "loss": 1.3798, + "step": 18124 + }, + { + "epoch": 0.9909380424531525, + "grad_norm": 1.6518275737762451, + "learning_rate": 4.5451888358627465e-09, + "loss": 1.4929, + "step": 18125 + }, + { + "epoch": 0.9909927148968741, + "grad_norm": 1.531855821609497, + "learning_rate": 4.490266697867451e-09, + "loss": 1.8378, + "step": 18126 + }, + { + "epoch": 0.9910473873405956, + "grad_norm": 1.8109384775161743, + "learning_rate": 4.435678333397641e-09, + "loss": 1.4546, + "step": 18127 + }, + { + "epoch": 0.9911020597843172, + "grad_norm": 1.5945663452148438, + "learning_rate": 4.381423744275193e-09, + "loss": 1.2964, + "step": 18128 + }, + { + "epoch": 0.9911567322280388, + "grad_norm": 1.568173885345459, + "learning_rate": 4.327502932311989e-09, + "loss": 1.3813, + "step": 18129 + }, + { + "epoch": 0.9912114046717603, + "grad_norm": 1.5256366729736328, + "learning_rate": 4.273915899309922e-09, + "loss": 1.3729, + "step": 18130 + }, + { + "epoch": 0.9912660771154819, + "grad_norm": 1.4037652015686035, + "learning_rate": 4.220662647056451e-09, + "loss": 1.4472, + "step": 18131 + }, + { + "epoch": 0.9913207495592035, + "grad_norm": 1.3995903730392456, + "learning_rate": 4.167743177331262e-09, + "loss": 1.3317, + "step": 18132 + }, + { + "epoch": 0.9913754220029249, + "grad_norm": 1.6972646713256836, + "learning_rate": 4.115157491901834e-09, + "loss": 1.6156, + "step": 18133 + }, + { + "epoch": 0.9914300944466465, + "grad_norm": 1.775880217552185, + "learning_rate": 4.062905592522315e-09, + "loss": 1.4436, + "step": 18134 + }, + { + "epoch": 0.9914847668903681, + "grad_norm": 1.9230079650878906, + "learning_rate": 4.010987480939088e-09, + "loss": 1.4149, + "step": 18135 + }, + { + "epoch": 0.9915394393340896, + "grad_norm": 1.3490166664123535, + "learning_rate": 3.959403158885211e-09, + "loss": 1.3122, + "step": 18136 + }, + { + "epoch": 0.9915941117778112, + "grad_norm": 1.5833067893981934, + "learning_rate": 3.9081526280837504e-09, + "loss": 1.2819, + "step": 18137 + }, + { + "epoch": 0.9916487842215328, + "grad_norm": 1.3019100427627563, + "learning_rate": 3.857235890245559e-09, + "loss": 1.4137, + "step": 18138 + }, + { + "epoch": 0.9917034566652543, + "grad_norm": 1.59822416305542, + "learning_rate": 3.8066529470703885e-09, + "loss": 1.4574, + "step": 18139 + }, + { + "epoch": 0.9917581291089759, + "grad_norm": 1.7409924268722534, + "learning_rate": 3.756403800249109e-09, + "loss": 1.1476, + "step": 18140 + }, + { + "epoch": 0.9918128015526974, + "grad_norm": 1.5328781604766846, + "learning_rate": 3.7064884514570464e-09, + "loss": 1.3094, + "step": 18141 + }, + { + "epoch": 0.991867473996419, + "grad_norm": 1.2073066234588623, + "learning_rate": 3.656906902362867e-09, + "loss": 1.5166, + "step": 18142 + }, + { + "epoch": 0.9919221464401405, + "grad_norm": 1.4702640771865845, + "learning_rate": 3.607659154621912e-09, + "loss": 1.4239, + "step": 18143 + }, + { + "epoch": 0.991976818883862, + "grad_norm": 2.7659478187561035, + "learning_rate": 3.5587452098784225e-09, + "loss": 1.3493, + "step": 18144 + }, + { + "epoch": 0.9920314913275836, + "grad_norm": 1.4450438022613525, + "learning_rate": 3.5101650697655363e-09, + "loss": 1.3228, + "step": 18145 + }, + { + "epoch": 0.9920861637713052, + "grad_norm": 1.845949411392212, + "learning_rate": 3.46191873590529e-09, + "loss": 1.3851, + "step": 18146 + }, + { + "epoch": 0.9921408362150267, + "grad_norm": 1.768850326538086, + "learning_rate": 3.414006209909726e-09, + "loss": 1.3469, + "step": 18147 + }, + { + "epoch": 0.9921955086587483, + "grad_norm": 1.8280521631240845, + "learning_rate": 3.366427493378677e-09, + "loss": 1.5315, + "step": 18148 + }, + { + "epoch": 0.9922501811024699, + "grad_norm": 2.0908477306365967, + "learning_rate": 3.3191825878997606e-09, + "loss": 1.3222, + "step": 18149 + }, + { + "epoch": 0.9923048535461914, + "grad_norm": 1.412689208984375, + "learning_rate": 3.2722714950517154e-09, + "loss": 1.5463, + "step": 18150 + }, + { + "epoch": 0.992359525989913, + "grad_norm": 1.4608479738235474, + "learning_rate": 3.225694216401065e-09, + "loss": 1.4071, + "step": 18151 + }, + { + "epoch": 0.9924141984336345, + "grad_norm": 1.5599433183670044, + "learning_rate": 3.1794507535010122e-09, + "loss": 1.5105, + "step": 18152 + }, + { + "epoch": 0.992468870877356, + "grad_norm": 1.5552046298980713, + "learning_rate": 3.1335411078992074e-09, + "loss": 1.3718, + "step": 18153 + }, + { + "epoch": 0.9925235433210776, + "grad_norm": 1.3513702154159546, + "learning_rate": 3.0879652811255377e-09, + "loss": 1.5285, + "step": 18154 + }, + { + "epoch": 0.9925782157647991, + "grad_norm": 1.1984413862228394, + "learning_rate": 3.0427232747043402e-09, + "loss": 1.3849, + "step": 18155 + }, + { + "epoch": 0.9926328882085207, + "grad_norm": 1.5069552659988403, + "learning_rate": 2.997815090144407e-09, + "loss": 1.3019, + "step": 18156 + }, + { + "epoch": 0.9926875606522423, + "grad_norm": 1.5721361637115479, + "learning_rate": 2.95324072894565e-09, + "loss": 1.357, + "step": 18157 + }, + { + "epoch": 0.9927422330959638, + "grad_norm": 1.3543354272842407, + "learning_rate": 2.909000192597988e-09, + "loss": 1.1814, + "step": 18158 + }, + { + "epoch": 0.9927969055396854, + "grad_norm": 1.4684069156646729, + "learning_rate": 2.865093482576908e-09, + "loss": 1.2853, + "step": 18159 + }, + { + "epoch": 0.992851577983407, + "grad_norm": 1.5606368780136108, + "learning_rate": 2.821520600350125e-09, + "loss": 1.3561, + "step": 18160 + }, + { + "epoch": 0.9929062504271284, + "grad_norm": 1.6288968324661255, + "learning_rate": 2.7782815473720304e-09, + "loss": 1.3986, + "step": 18161 + }, + { + "epoch": 0.99296092287085, + "grad_norm": 1.3857160806655884, + "learning_rate": 2.7353763250848044e-09, + "loss": 1.6334, + "step": 18162 + }, + { + "epoch": 0.9930155953145716, + "grad_norm": 1.8132866621017456, + "learning_rate": 2.692804934923965e-09, + "loss": 1.5293, + "step": 18163 + }, + { + "epoch": 0.9930702677582931, + "grad_norm": 1.4016869068145752, + "learning_rate": 2.6505673783094875e-09, + "loss": 1.6416, + "step": 18164 + }, + { + "epoch": 0.9931249402020147, + "grad_norm": 1.654461145401001, + "learning_rate": 2.608663656652466e-09, + "loss": 1.3496, + "step": 18165 + }, + { + "epoch": 0.9931796126457363, + "grad_norm": 1.4821662902832031, + "learning_rate": 2.5670937713517807e-09, + "loss": 1.5081, + "step": 18166 + }, + { + "epoch": 0.9932342850894578, + "grad_norm": 1.6589365005493164, + "learning_rate": 2.525857723795211e-09, + "loss": 1.333, + "step": 18167 + }, + { + "epoch": 0.9932889575331794, + "grad_norm": 1.3845676183700562, + "learning_rate": 2.4849555153594328e-09, + "loss": 1.3802, + "step": 18168 + }, + { + "epoch": 0.9933436299769008, + "grad_norm": 1.5589817762374878, + "learning_rate": 2.4443871474122415e-09, + "loss": 1.5669, + "step": 18169 + }, + { + "epoch": 0.9933983024206224, + "grad_norm": 1.5275520086288452, + "learning_rate": 2.4041526213058885e-09, + "loss": 1.4218, + "step": 18170 + }, + { + "epoch": 0.993452974864344, + "grad_norm": 1.7438571453094482, + "learning_rate": 2.364251938384854e-09, + "loss": 1.3816, + "step": 18171 + }, + { + "epoch": 0.9935076473080655, + "grad_norm": 1.385884404182434, + "learning_rate": 2.3246850999825153e-09, + "loss": 1.197, + "step": 18172 + }, + { + "epoch": 0.9935623197517871, + "grad_norm": 1.6302213668823242, + "learning_rate": 2.2854521074189284e-09, + "loss": 1.391, + "step": 18173 + }, + { + "epoch": 0.9936169921955087, + "grad_norm": 1.4120644330978394, + "learning_rate": 2.246552962004156e-09, + "loss": 1.4918, + "step": 18174 + }, + { + "epoch": 0.9936716646392302, + "grad_norm": 1.2968008518218994, + "learning_rate": 2.207987665037159e-09, + "loss": 1.2972, + "step": 18175 + }, + { + "epoch": 0.9937263370829518, + "grad_norm": 1.1966336965560913, + "learning_rate": 2.1697562178069067e-09, + "loss": 1.5133, + "step": 18176 + }, + { + "epoch": 0.9937810095266734, + "grad_norm": 1.6704787015914917, + "learning_rate": 2.1318586215890447e-09, + "loss": 1.4241, + "step": 18177 + }, + { + "epoch": 0.9938356819703948, + "grad_norm": 1.6772220134735107, + "learning_rate": 2.0942948776481175e-09, + "loss": 1.6229, + "step": 18178 + }, + { + "epoch": 0.9938903544141164, + "grad_norm": 1.4969881772994995, + "learning_rate": 2.0570649872408977e-09, + "loss": 1.3404, + "step": 18179 + }, + { + "epoch": 0.993945026857838, + "grad_norm": 2.386305809020996, + "learning_rate": 2.020168951608614e-09, + "loss": 1.511, + "step": 18180 + }, + { + "epoch": 0.9939996993015595, + "grad_norm": 2.0027575492858887, + "learning_rate": 1.983606771983615e-09, + "loss": 1.4144, + "step": 18181 + }, + { + "epoch": 0.9940543717452811, + "grad_norm": 2.0481302738189697, + "learning_rate": 1.947378449587145e-09, + "loss": 1.4017, + "step": 18182 + }, + { + "epoch": 0.9941090441890026, + "grad_norm": 1.6305480003356934, + "learning_rate": 1.9114839856293475e-09, + "loss": 1.4405, + "step": 18183 + }, + { + "epoch": 0.9941637166327242, + "grad_norm": 2.643477439880371, + "learning_rate": 1.875923381307043e-09, + "loss": 1.3659, + "step": 18184 + }, + { + "epoch": 0.9942183890764458, + "grad_norm": 2.512073278427124, + "learning_rate": 1.8406966378103909e-09, + "loss": 1.2681, + "step": 18185 + }, + { + "epoch": 0.9942730615201673, + "grad_norm": 1.4824168682098389, + "learning_rate": 1.8058037563140064e-09, + "loss": 1.3148, + "step": 18186 + }, + { + "epoch": 0.9943277339638888, + "grad_norm": 1.5990246534347534, + "learning_rate": 1.7712447379825137e-09, + "loss": 1.4668, + "step": 18187 + }, + { + "epoch": 0.9943824064076104, + "grad_norm": 1.301457405090332, + "learning_rate": 1.7370195839716552e-09, + "loss": 1.3987, + "step": 18188 + }, + { + "epoch": 0.9944370788513319, + "grad_norm": 1.4367517232894897, + "learning_rate": 1.7031282954227402e-09, + "loss": 1.5281, + "step": 18189 + }, + { + "epoch": 0.9944917512950535, + "grad_norm": 1.7174469232559204, + "learning_rate": 1.6695708734693061e-09, + "loss": 1.5295, + "step": 18190 + }, + { + "epoch": 0.9945464237387751, + "grad_norm": 1.2403937578201294, + "learning_rate": 1.6363473192293478e-09, + "loss": 1.5632, + "step": 18191 + }, + { + "epoch": 0.9946010961824966, + "grad_norm": 1.3715760707855225, + "learning_rate": 1.6034576338141982e-09, + "loss": 1.4935, + "step": 18192 + }, + { + "epoch": 0.9946557686262182, + "grad_norm": 1.2608555555343628, + "learning_rate": 1.5709018183218684e-09, + "loss": 1.6415, + "step": 18193 + }, + { + "epoch": 0.9947104410699398, + "grad_norm": 1.6321651935577393, + "learning_rate": 1.5386798738381558e-09, + "loss": 1.3967, + "step": 18194 + }, + { + "epoch": 0.9947651135136613, + "grad_norm": 1.441564917564392, + "learning_rate": 1.5067918014410877e-09, + "loss": 1.2694, + "step": 18195 + }, + { + "epoch": 0.9948197859573829, + "grad_norm": 1.2880924940109253, + "learning_rate": 1.475237602194257e-09, + "loss": 1.2045, + "step": 18196 + }, + { + "epoch": 0.9948744584011043, + "grad_norm": 1.5093311071395874, + "learning_rate": 1.444017277151266e-09, + "loss": 1.1936, + "step": 18197 + }, + { + "epoch": 0.9949291308448259, + "grad_norm": 1.582761526107788, + "learning_rate": 1.413130827354614e-09, + "loss": 1.5438, + "step": 18198 + }, + { + "epoch": 0.9949838032885475, + "grad_norm": 1.6330283880233765, + "learning_rate": 1.3825782538368083e-09, + "loss": 1.5565, + "step": 18199 + }, + { + "epoch": 0.995038475732269, + "grad_norm": 1.6820744276046753, + "learning_rate": 1.3523595576159232e-09, + "loss": 1.4618, + "step": 18200 + }, + { + "epoch": 0.9950931481759906, + "grad_norm": 1.6241941452026367, + "learning_rate": 1.3224747397033721e-09, + "loss": 1.4363, + "step": 18201 + }, + { + "epoch": 0.9951478206197122, + "grad_norm": 1.362838625907898, + "learning_rate": 1.2929238010961354e-09, + "loss": 1.3245, + "step": 18202 + }, + { + "epoch": 0.9952024930634337, + "grad_norm": 1.7110694646835327, + "learning_rate": 1.2637067427800909e-09, + "loss": 1.6136, + "step": 18203 + }, + { + "epoch": 0.9952571655071553, + "grad_norm": 1.8109112977981567, + "learning_rate": 1.234823565732235e-09, + "loss": 1.4762, + "step": 18204 + }, + { + "epoch": 0.9953118379508769, + "grad_norm": 1.7464628219604492, + "learning_rate": 1.206274270916241e-09, + "loss": 1.5214, + "step": 18205 + }, + { + "epoch": 0.9953665103945983, + "grad_norm": 1.7978214025497437, + "learning_rate": 1.178058859285791e-09, + "loss": 1.6549, + "step": 18206 + }, + { + "epoch": 0.9954211828383199, + "grad_norm": 1.4347832202911377, + "learning_rate": 1.150177331782354e-09, + "loss": 1.4818, + "step": 18207 + }, + { + "epoch": 0.9954758552820415, + "grad_norm": 1.944506049156189, + "learning_rate": 1.1226296893374068e-09, + "loss": 1.4446, + "step": 18208 + }, + { + "epoch": 0.995530527725763, + "grad_norm": 1.4207944869995117, + "learning_rate": 1.0954159328724345e-09, + "loss": 1.4355, + "step": 18209 + }, + { + "epoch": 0.9955852001694846, + "grad_norm": 1.5170767307281494, + "learning_rate": 1.0685360632933794e-09, + "loss": 1.4195, + "step": 18210 + }, + { + "epoch": 0.9956398726132061, + "grad_norm": 1.4898464679718018, + "learning_rate": 1.041990081499522e-09, + "loss": 1.4241, + "step": 18211 + }, + { + "epoch": 0.9956945450569277, + "grad_norm": 1.3096078634262085, + "learning_rate": 1.0157779883768203e-09, + "loss": 1.5411, + "step": 18212 + }, + { + "epoch": 0.9957492175006493, + "grad_norm": 1.5484110116958618, + "learning_rate": 9.898997848001302e-10, + "loss": 1.2279, + "step": 18213 + }, + { + "epoch": 0.9958038899443707, + "grad_norm": 1.9354029893875122, + "learning_rate": 9.643554716354253e-10, + "loss": 1.3201, + "step": 18214 + }, + { + "epoch": 0.9958585623880923, + "grad_norm": 1.350189208984375, + "learning_rate": 9.391450497331367e-10, + "loss": 1.4646, + "step": 18215 + }, + { + "epoch": 0.9959132348318139, + "grad_norm": 1.3742344379425049, + "learning_rate": 9.142685199370338e-10, + "loss": 1.5272, + "step": 18216 + }, + { + "epoch": 0.9959679072755354, + "grad_norm": 1.3478453159332275, + "learning_rate": 8.897258830764533e-10, + "loss": 1.5173, + "step": 18217 + }, + { + "epoch": 0.996022579719257, + "grad_norm": 1.6255507469177246, + "learning_rate": 8.655171399718498e-10, + "loss": 1.3218, + "step": 18218 + }, + { + "epoch": 0.9960772521629786, + "grad_norm": 1.189104676246643, + "learning_rate": 8.416422914325762e-10, + "loss": 1.5595, + "step": 18219 + }, + { + "epoch": 0.9961319246067001, + "grad_norm": 2.0489988327026367, + "learning_rate": 8.181013382524416e-10, + "loss": 1.3615, + "step": 18220 + }, + { + "epoch": 0.9961865970504217, + "grad_norm": 1.5128419399261475, + "learning_rate": 7.94894281220815e-10, + "loss": 1.4336, + "step": 18221 + }, + { + "epoch": 0.9962412694941433, + "grad_norm": 1.7769193649291992, + "learning_rate": 7.720211211115214e-10, + "loss": 1.5418, + "step": 18222 + }, + { + "epoch": 0.9962959419378647, + "grad_norm": 1.6640161275863647, + "learning_rate": 7.494818586883945e-10, + "loss": 1.3962, + "step": 18223 + }, + { + "epoch": 0.9963506143815863, + "grad_norm": 1.3471883535385132, + "learning_rate": 7.272764947041655e-10, + "loss": 1.2163, + "step": 18224 + }, + { + "epoch": 0.9964052868253078, + "grad_norm": 1.3232028484344482, + "learning_rate": 7.054050299004633e-10, + "loss": 1.4537, + "step": 18225 + }, + { + "epoch": 0.9964599592690294, + "grad_norm": 1.2313891649246216, + "learning_rate": 6.838674650067045e-10, + "loss": 1.4365, + "step": 18226 + }, + { + "epoch": 0.996514631712751, + "grad_norm": 1.7177023887634277, + "learning_rate": 6.626638007434239e-10, + "loss": 1.4079, + "step": 18227 + }, + { + "epoch": 0.9965693041564725, + "grad_norm": 1.4291187524795532, + "learning_rate": 6.417940378167231e-10, + "loss": 1.2649, + "step": 18228 + }, + { + "epoch": 0.9966239766001941, + "grad_norm": 1.7196983098983765, + "learning_rate": 6.212581769260428e-10, + "loss": 1.5565, + "step": 18229 + }, + { + "epoch": 0.9966786490439157, + "grad_norm": 1.691702127456665, + "learning_rate": 6.010562187552804e-10, + "loss": 1.4271, + "step": 18230 + }, + { + "epoch": 0.9967333214876372, + "grad_norm": 1.6855432987213135, + "learning_rate": 5.811881639794515e-10, + "loss": 1.3244, + "step": 18231 + }, + { + "epoch": 0.9967879939313588, + "grad_norm": 1.4637281894683838, + "learning_rate": 5.616540132624693e-10, + "loss": 1.6214, + "step": 18232 + }, + { + "epoch": 0.9968426663750803, + "grad_norm": 1.1691533327102661, + "learning_rate": 5.424537672560349e-10, + "loss": 1.6065, + "step": 18233 + }, + { + "epoch": 0.9968973388188018, + "grad_norm": 2.0974972248077393, + "learning_rate": 5.23587426601857e-10, + "loss": 1.4996, + "step": 18234 + }, + { + "epoch": 0.9969520112625234, + "grad_norm": 1.6901142597198486, + "learning_rate": 5.050549919294323e-10, + "loss": 1.4086, + "step": 18235 + }, + { + "epoch": 0.997006683706245, + "grad_norm": 1.7277789115905762, + "learning_rate": 4.868564638571549e-10, + "loss": 1.5426, + "step": 18236 + }, + { + "epoch": 0.9970613561499665, + "grad_norm": 1.355199933052063, + "learning_rate": 4.689918429945373e-10, + "loss": 1.4067, + "step": 18237 + }, + { + "epoch": 0.9971160285936881, + "grad_norm": 1.6103770732879639, + "learning_rate": 4.514611299355487e-10, + "loss": 1.482, + "step": 18238 + }, + { + "epoch": 0.9971707010374096, + "grad_norm": 1.6051750183105469, + "learning_rate": 4.342643252686074e-10, + "loss": 1.4958, + "step": 18239 + }, + { + "epoch": 0.9972253734811312, + "grad_norm": 1.804622769355774, + "learning_rate": 4.17401429564368e-10, + "loss": 1.4223, + "step": 18240 + }, + { + "epoch": 0.9972800459248528, + "grad_norm": 1.4528993368148804, + "learning_rate": 4.008724433890443e-10, + "loss": 1.3907, + "step": 18241 + }, + { + "epoch": 0.9973347183685742, + "grad_norm": 1.589676022529602, + "learning_rate": 3.8467736729330687e-10, + "loss": 1.4962, + "step": 18242 + }, + { + "epoch": 0.9973893908122958, + "grad_norm": 1.3283032178878784, + "learning_rate": 3.688162018178343e-10, + "loss": 1.3438, + "step": 18243 + }, + { + "epoch": 0.9974440632560174, + "grad_norm": 1.6428873538970947, + "learning_rate": 3.53288947492203e-10, + "loss": 1.2717, + "step": 18244 + }, + { + "epoch": 0.9974987356997389, + "grad_norm": 1.7123379707336426, + "learning_rate": 3.3809560483599735e-10, + "loss": 1.3143, + "step": 18245 + }, + { + "epoch": 0.9975534081434605, + "grad_norm": 1.4591028690338135, + "learning_rate": 3.2323617435547905e-10, + "loss": 1.5218, + "step": 18246 + }, + { + "epoch": 0.9976080805871821, + "grad_norm": 1.3568129539489746, + "learning_rate": 3.087106565469178e-10, + "loss": 1.5703, + "step": 18247 + }, + { + "epoch": 0.9976627530309036, + "grad_norm": 1.200326681137085, + "learning_rate": 2.94519051895481e-10, + "loss": 1.5504, + "step": 18248 + }, + { + "epoch": 0.9977174254746252, + "grad_norm": 1.5451427698135376, + "learning_rate": 2.806613608741238e-10, + "loss": 1.2973, + "step": 18249 + }, + { + "epoch": 0.9977720979183468, + "grad_norm": 1.4961940050125122, + "learning_rate": 2.6713758394802945e-10, + "loss": 1.4898, + "step": 18250 + }, + { + "epoch": 0.9978267703620682, + "grad_norm": 1.9534196853637695, + "learning_rate": 2.5394772156683845e-10, + "loss": 1.6341, + "step": 18251 + }, + { + "epoch": 0.9978814428057898, + "grad_norm": 1.4210249185562134, + "learning_rate": 2.4109177417130925e-10, + "loss": 1.4522, + "step": 18252 + }, + { + "epoch": 0.9979361152495113, + "grad_norm": 1.6767425537109375, + "learning_rate": 2.2856974219109817e-10, + "loss": 1.5007, + "step": 18253 + }, + { + "epoch": 0.9979907876932329, + "grad_norm": 1.7501417398452759, + "learning_rate": 2.163816260436491e-10, + "loss": 1.5865, + "step": 18254 + }, + { + "epoch": 0.9980454601369545, + "grad_norm": 1.5958389043807983, + "learning_rate": 2.0452742613641386e-10, + "loss": 1.4483, + "step": 18255 + }, + { + "epoch": 0.998100132580676, + "grad_norm": 1.9427204132080078, + "learning_rate": 1.9300714286574207e-10, + "loss": 1.2998, + "step": 18256 + }, + { + "epoch": 0.9981548050243976, + "grad_norm": 1.1803059577941895, + "learning_rate": 1.818207766157709e-10, + "loss": 1.3745, + "step": 18257 + }, + { + "epoch": 0.9982094774681192, + "grad_norm": 1.53663969039917, + "learning_rate": 1.7096832776064554e-10, + "loss": 1.6623, + "step": 18258 + }, + { + "epoch": 0.9982641499118406, + "grad_norm": 2.0249502658843994, + "learning_rate": 1.6044979666118843e-10, + "loss": 1.5616, + "step": 18259 + }, + { + "epoch": 0.9983188223555622, + "grad_norm": 1.5732359886169434, + "learning_rate": 1.5026518367045052e-10, + "loss": 1.6676, + "step": 18260 + }, + { + "epoch": 0.9983734947992838, + "grad_norm": 1.3603520393371582, + "learning_rate": 1.404144891270498e-10, + "loss": 1.513, + "step": 18261 + }, + { + "epoch": 0.9984281672430053, + "grad_norm": 1.6541051864624023, + "learning_rate": 1.3089771336072256e-10, + "loss": 1.4956, + "step": 18262 + }, + { + "epoch": 0.9984828396867269, + "grad_norm": 1.47328782081604, + "learning_rate": 1.2171485669010275e-10, + "loss": 1.4986, + "step": 18263 + }, + { + "epoch": 0.9985375121304485, + "grad_norm": 1.5864646434783936, + "learning_rate": 1.1286591941939151e-10, + "loss": 1.37, + "step": 18264 + }, + { + "epoch": 0.99859218457417, + "grad_norm": 1.9041023254394531, + "learning_rate": 1.0435090184723884e-10, + "loss": 1.4002, + "step": 18265 + }, + { + "epoch": 0.9986468570178916, + "grad_norm": 1.3708417415618896, + "learning_rate": 9.616980425453116e-11, + "loss": 1.6215, + "step": 18266 + }, + { + "epoch": 0.9987015294616131, + "grad_norm": 1.4087474346160889, + "learning_rate": 8.832262691771398e-11, + "loss": 1.271, + "step": 18267 + }, + { + "epoch": 0.9987562019053347, + "grad_norm": 1.5101284980773926, + "learning_rate": 8.080937009657952e-11, + "loss": 1.4993, + "step": 18268 + }, + { + "epoch": 0.9988108743490562, + "grad_norm": 1.4236091375350952, + "learning_rate": 7.363003404314839e-11, + "loss": 1.3598, + "step": 18269 + }, + { + "epoch": 0.9988655467927777, + "grad_norm": 1.498419165611267, + "learning_rate": 6.678461899611854e-11, + "loss": 1.4176, + "step": 18270 + }, + { + "epoch": 0.9989202192364993, + "grad_norm": 1.5021982192993164, + "learning_rate": 6.027312518530614e-11, + "loss": 1.4329, + "step": 18271 + }, + { + "epoch": 0.9989748916802209, + "grad_norm": 1.3810960054397583, + "learning_rate": 5.409555282720469e-11, + "loss": 1.4501, + "step": 18272 + }, + { + "epoch": 0.9990295641239424, + "grad_norm": 1.4997296333312988, + "learning_rate": 4.825190212831565e-11, + "loss": 1.5205, + "step": 18273 + }, + { + "epoch": 0.999084236567664, + "grad_norm": 1.7043561935424805, + "learning_rate": 4.274217328514851e-11, + "loss": 1.2339, + "step": 18274 + }, + { + "epoch": 0.9991389090113856, + "grad_norm": 1.6529847383499146, + "learning_rate": 3.756636647866962e-11, + "loss": 1.3399, + "step": 18275 + }, + { + "epoch": 0.9991935814551071, + "grad_norm": 1.3509653806686401, + "learning_rate": 3.272448188429422e-11, + "loss": 1.5382, + "step": 18276 + }, + { + "epoch": 0.9992482538988287, + "grad_norm": 1.4026296138763428, + "learning_rate": 2.821651966300465e-11, + "loss": 1.3788, + "step": 18277 + }, + { + "epoch": 0.9993029263425502, + "grad_norm": 1.584303617477417, + "learning_rate": 2.4042479965791234e-11, + "loss": 1.4736, + "step": 18278 + }, + { + "epoch": 0.9993575987862717, + "grad_norm": 1.6415742635726929, + "learning_rate": 2.020236293143185e-11, + "loss": 1.3039, + "step": 18279 + }, + { + "epoch": 0.9994122712299933, + "grad_norm": 1.605071783065796, + "learning_rate": 1.6696168687602155e-11, + "loss": 1.5276, + "step": 18280 + }, + { + "epoch": 0.9994669436737148, + "grad_norm": 1.5009551048278809, + "learning_rate": 1.3523897351985783e-11, + "loss": 1.3437, + "step": 18281 + }, + { + "epoch": 0.9995216161174364, + "grad_norm": 1.4933103322982788, + "learning_rate": 1.0685549030053921e-11, + "loss": 1.5339, + "step": 18282 + }, + { + "epoch": 0.999576288561158, + "grad_norm": 1.2666126489639282, + "learning_rate": 8.181123817285752e-12, + "loss": 1.5596, + "step": 18283 + }, + { + "epoch": 0.9996309610048795, + "grad_norm": 1.6396511793136597, + "learning_rate": 6.010621798058225e-12, + "loss": 1.2219, + "step": 18284 + }, + { + "epoch": 0.9996856334486011, + "grad_norm": 1.617615818977356, + "learning_rate": 4.174043043425613e-12, + "loss": 1.4368, + "step": 18285 + }, + { + "epoch": 0.9997403058923227, + "grad_norm": 1.4422838687896729, + "learning_rate": 2.6713876144501827e-12, + "loss": 1.2235, + "step": 18286 + }, + { + "epoch": 0.9997949783360441, + "grad_norm": 1.3461558818817139, + "learning_rate": 1.5026555622021932e-12, + "loss": 1.4567, + "step": 18287 + }, + { + "epoch": 0.9998496507797657, + "grad_norm": 1.3379034996032715, + "learning_rate": 6.678469255394504e-13, + "loss": 1.3313, + "step": 18288 + }, + { + "epoch": 0.9999043232234873, + "grad_norm": 1.7478768825531006, + "learning_rate": 1.6696173332775289e-13, + "loss": 1.5241, + "step": 18289 + }, + { + "epoch": 0.9999589956672088, + "grad_norm": 1.326999545097351, + "learning_rate": 0.0, + "loss": 1.4102, + "step": 18290 + }, + { + "epoch": 0.9999589956672088, + "step": 18290, + "total_flos": 1.6170385225104415e+18, + "train_loss": 1.50054688282258, + "train_runtime": 29741.9046, + "train_samples_per_second": 19.679, + "train_steps_per_second": 0.615 + } + ], + "logging_steps": 1.0, + "max_steps": 18290, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 1.6170385225104415e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}