{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9928400954653938, "eval_steps": 500, "global_step": 418, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00477326968973747, "grad_norm": 6.016661782287111, "learning_rate": 9.523809523809523e-08, "loss": 1.0606, "step": 1 }, { "epoch": 0.00954653937947494, "grad_norm": 6.0953583965166676, "learning_rate": 1.9047619047619045e-07, "loss": 1.0205, "step": 2 }, { "epoch": 0.014319809069212411, "grad_norm": 5.545005089565625, "learning_rate": 2.857142857142857e-07, "loss": 1.0092, "step": 3 }, { "epoch": 0.01909307875894988, "grad_norm": 6.013893802513984, "learning_rate": 3.809523809523809e-07, "loss": 0.9935, "step": 4 }, { "epoch": 0.02386634844868735, "grad_norm": 5.376025361134291, "learning_rate": 4.761904761904761e-07, "loss": 1.0184, "step": 5 }, { "epoch": 0.028639618138424822, "grad_norm": 5.360630219441705, "learning_rate": 5.714285714285714e-07, "loss": 1.0072, "step": 6 }, { "epoch": 0.03341288782816229, "grad_norm": 5.135874756495844, "learning_rate": 6.666666666666666e-07, "loss": 0.9987, "step": 7 }, { "epoch": 0.03818615751789976, "grad_norm": 4.183937042855387, "learning_rate": 7.619047619047618e-07, "loss": 0.9639, "step": 8 }, { "epoch": 0.04295942720763723, "grad_norm": 4.099248159117762, "learning_rate": 8.57142857142857e-07, "loss": 0.9497, "step": 9 }, { "epoch": 0.0477326968973747, "grad_norm": 4.049680347020253, "learning_rate": 9.523809523809522e-07, "loss": 0.9488, "step": 10 }, { "epoch": 0.05250596658711217, "grad_norm": 3.3413136627880506, "learning_rate": 1.0476190476190476e-06, "loss": 0.925, "step": 11 }, { "epoch": 0.057279236276849645, "grad_norm": 3.0774268853711955, "learning_rate": 1.1428571428571428e-06, "loss": 0.9231, "step": 12 }, { "epoch": 0.06205250596658711, "grad_norm": 2.911361629869161, "learning_rate": 1.238095238095238e-06, "loss": 0.9177, "step": 13 }, { "epoch": 0.06682577565632458, "grad_norm": 2.428528787361087, "learning_rate": 1.3333333333333332e-06, "loss": 0.8811, "step": 14 }, { "epoch": 0.07159904534606205, "grad_norm": 1.8195746682720535, "learning_rate": 1.4285714285714286e-06, "loss": 0.8533, "step": 15 }, { "epoch": 0.07637231503579953, "grad_norm": 1.947836891057091, "learning_rate": 1.5238095238095236e-06, "loss": 0.8706, "step": 16 }, { "epoch": 0.081145584725537, "grad_norm": 1.8462478481431221, "learning_rate": 1.619047619047619e-06, "loss": 0.8208, "step": 17 }, { "epoch": 0.08591885441527446, "grad_norm": 2.137106522697402, "learning_rate": 1.714285714285714e-06, "loss": 0.8181, "step": 18 }, { "epoch": 0.09069212410501193, "grad_norm": 1.9201958176581, "learning_rate": 1.8095238095238095e-06, "loss": 0.8245, "step": 19 }, { "epoch": 0.0954653937947494, "grad_norm": 1.523269971944646, "learning_rate": 1.9047619047619045e-06, "loss": 0.7545, "step": 20 }, { "epoch": 0.10023866348448687, "grad_norm": 1.8071688523978784, "learning_rate": 2e-06, "loss": 0.776, "step": 21 }, { "epoch": 0.10501193317422435, "grad_norm": 1.5489776099138406, "learning_rate": 1.9999686897547167e-06, "loss": 0.7445, "step": 22 }, { "epoch": 0.10978520286396182, "grad_norm": 1.483755076083143, "learning_rate": 1.9998747609795305e-06, "loss": 0.7351, "step": 23 }, { "epoch": 0.11455847255369929, "grad_norm": 1.4689369885238521, "learning_rate": 1.999718219556307e-06, "loss": 0.7332, "step": 24 }, { "epoch": 0.11933174224343675, "grad_norm": 1.4297615546988631, "learning_rate": 1.999499075287747e-06, "loss": 0.738, "step": 25 }, { "epoch": 0.12410501193317422, "grad_norm": 1.1356908706873299, "learning_rate": 1.999217341896772e-06, "loss": 0.7089, "step": 26 }, { "epoch": 0.1288782816229117, "grad_norm": 1.2895783555833555, "learning_rate": 1.998873037025665e-06, "loss": 0.6936, "step": 27 }, { "epoch": 0.13365155131264916, "grad_norm": 1.1235754706376115, "learning_rate": 1.9984661822349665e-06, "loss": 0.6785, "step": 28 }, { "epoch": 0.13842482100238662, "grad_norm": 1.0872843150821465, "learning_rate": 1.997996803002123e-06, "loss": 0.6978, "step": 29 }, { "epoch": 0.1431980906921241, "grad_norm": 1.0898740583426263, "learning_rate": 1.9974649287198914e-06, "loss": 0.669, "step": 30 }, { "epoch": 0.14797136038186157, "grad_norm": 1.050118078989169, "learning_rate": 1.9968705926945013e-06, "loss": 0.6674, "step": 31 }, { "epoch": 0.15274463007159905, "grad_norm": 0.8999107812930152, "learning_rate": 1.9962138321435656e-06, "loss": 0.6765, "step": 32 }, { "epoch": 0.1575178997613365, "grad_norm": 0.9612552915492341, "learning_rate": 1.9954946881937524e-06, "loss": 0.6745, "step": 33 }, { "epoch": 0.162291169451074, "grad_norm": 0.8921943607730816, "learning_rate": 1.994713205878208e-06, "loss": 0.6546, "step": 34 }, { "epoch": 0.16706443914081145, "grad_norm": 1.1285406074360596, "learning_rate": 1.9938694341337393e-06, "loss": 0.6612, "step": 35 }, { "epoch": 0.1718377088305489, "grad_norm": 0.9414713249176138, "learning_rate": 1.9929634257977467e-06, "loss": 0.6525, "step": 36 }, { "epoch": 0.1766109785202864, "grad_norm": 0.8006257830637218, "learning_rate": 1.991995237604916e-06, "loss": 0.6465, "step": 37 }, { "epoch": 0.18138424821002386, "grad_norm": 0.9526156911755369, "learning_rate": 1.9909649301836674e-06, "loss": 0.6581, "step": 38 }, { "epoch": 0.18615751789976134, "grad_norm": 0.9928551887252647, "learning_rate": 1.9898725680523566e-06, "loss": 0.6623, "step": 39 }, { "epoch": 0.1909307875894988, "grad_norm": 0.9455779960744521, "learning_rate": 1.9887182196152367e-06, "loss": 0.6527, "step": 40 }, { "epoch": 0.1957040572792363, "grad_norm": 0.8607850079912805, "learning_rate": 1.9875019571581726e-06, "loss": 0.6148, "step": 41 }, { "epoch": 0.20047732696897375, "grad_norm": 1.050584563468554, "learning_rate": 1.9862238568441165e-06, "loss": 0.6288, "step": 42 }, { "epoch": 0.2052505966587112, "grad_norm": 0.9453773840363461, "learning_rate": 1.9848839987083364e-06, "loss": 0.6373, "step": 43 }, { "epoch": 0.2100238663484487, "grad_norm": 0.8585148243018186, "learning_rate": 1.983482466653407e-06, "loss": 0.6401, "step": 44 }, { "epoch": 0.21479713603818615, "grad_norm": 0.9814711262628284, "learning_rate": 1.982019348443952e-06, "loss": 0.6274, "step": 45 }, { "epoch": 0.21957040572792363, "grad_norm": 0.9528618373675226, "learning_rate": 1.9804947357011523e-06, "loss": 0.6694, "step": 46 }, { "epoch": 0.2243436754176611, "grad_norm": 0.7974828002980384, "learning_rate": 1.978908723897005e-06, "loss": 0.6236, "step": 47 }, { "epoch": 0.22911694510739858, "grad_norm": 0.8409814486609728, "learning_rate": 1.9772614123483485e-06, "loss": 0.6408, "step": 48 }, { "epoch": 0.23389021479713604, "grad_norm": 0.9308103385624037, "learning_rate": 1.9755529042106393e-06, "loss": 0.5919, "step": 49 }, { "epoch": 0.2386634844868735, "grad_norm": 0.8733866970343211, "learning_rate": 1.973783306471495e-06, "loss": 0.5969, "step": 50 }, { "epoch": 0.24343675417661098, "grad_norm": 0.785222837947662, "learning_rate": 1.971952729943994e-06, "loss": 0.5973, "step": 51 }, { "epoch": 0.24821002386634844, "grad_norm": 0.7734659209134986, "learning_rate": 1.9700612892597372e-06, "loss": 0.6098, "step": 52 }, { "epoch": 0.2529832935560859, "grad_norm": 0.7186573988976016, "learning_rate": 1.9681091028616676e-06, "loss": 0.5991, "step": 53 }, { "epoch": 0.2577565632458234, "grad_norm": 0.7687713083483249, "learning_rate": 1.966096292996655e-06, "loss": 0.612, "step": 54 }, { "epoch": 0.26252983293556087, "grad_norm": 0.8621825025712473, "learning_rate": 1.9640229857078413e-06, "loss": 0.5949, "step": 55 }, { "epoch": 0.26730310262529833, "grad_norm": 0.8257565967386995, "learning_rate": 1.9618893108267454e-06, "loss": 0.6103, "step": 56 }, { "epoch": 0.2720763723150358, "grad_norm": 0.8446529899139308, "learning_rate": 1.9596954019651354e-06, "loss": 0.5788, "step": 57 }, { "epoch": 0.27684964200477324, "grad_norm": 0.7679525180581322, "learning_rate": 1.95744139650666e-06, "loss": 0.6069, "step": 58 }, { "epoch": 0.28162291169451076, "grad_norm": 2.820852049381465, "learning_rate": 1.955127435598247e-06, "loss": 0.5987, "step": 59 }, { "epoch": 0.2863961813842482, "grad_norm": 0.814858484667224, "learning_rate": 1.9527536641412637e-06, "loss": 0.6112, "step": 60 }, { "epoch": 0.2911694510739857, "grad_norm": 0.8763795811271151, "learning_rate": 1.950320230782443e-06, "loss": 0.5827, "step": 61 }, { "epoch": 0.29594272076372313, "grad_norm": 0.7856677046080051, "learning_rate": 1.9478272879045763e-06, "loss": 0.596, "step": 62 }, { "epoch": 0.30071599045346065, "grad_norm": 0.922334054326887, "learning_rate": 1.9452749916169685e-06, "loss": 0.6131, "step": 63 }, { "epoch": 0.3054892601431981, "grad_norm": 0.9217839433257945, "learning_rate": 1.942663501745666e-06, "loss": 0.6014, "step": 64 }, { "epoch": 0.31026252983293556, "grad_norm": 0.7832581576619595, "learning_rate": 1.939992981823445e-06, "loss": 0.599, "step": 65 }, { "epoch": 0.315035799522673, "grad_norm": 0.8859458814902181, "learning_rate": 1.9372635990795744e-06, "loss": 0.5606, "step": 66 }, { "epoch": 0.3198090692124105, "grad_norm": 0.8263334442045513, "learning_rate": 1.934475524429339e-06, "loss": 0.5845, "step": 67 }, { "epoch": 0.324582338902148, "grad_norm": 0.7750993438508201, "learning_rate": 1.9316289324633416e-06, "loss": 0.5938, "step": 68 }, { "epoch": 0.32935560859188545, "grad_norm": 0.8557785609879223, "learning_rate": 1.928724001436568e-06, "loss": 0.5971, "step": 69 }, { "epoch": 0.3341288782816229, "grad_norm": 0.8226562290018226, "learning_rate": 1.925760913257224e-06, "loss": 0.5896, "step": 70 }, { "epoch": 0.33890214797136037, "grad_norm": 0.7875386984949746, "learning_rate": 1.922739853475345e-06, "loss": 0.5957, "step": 71 }, { "epoch": 0.3436754176610978, "grad_norm": 0.7466977444466465, "learning_rate": 1.919661011271176e-06, "loss": 0.5782, "step": 72 }, { "epoch": 0.34844868735083534, "grad_norm": 0.8961994451430955, "learning_rate": 1.916524579443327e-06, "loss": 0.5912, "step": 73 }, { "epoch": 0.3532219570405728, "grad_norm": 0.80271020509165, "learning_rate": 1.9133307543966972e-06, "loss": 0.568, "step": 74 }, { "epoch": 0.35799522673031026, "grad_norm": 9.459451148135054, "learning_rate": 1.910079736130178e-06, "loss": 0.5831, "step": 75 }, { "epoch": 0.3627684964200477, "grad_norm": 0.8049759872673024, "learning_rate": 1.9067717282241275e-06, "loss": 0.5752, "step": 76 }, { "epoch": 0.36754176610978523, "grad_norm": 0.9365222380955207, "learning_rate": 1.9034069378276248e-06, "loss": 0.6037, "step": 77 }, { "epoch": 0.3723150357995227, "grad_norm": 0.7813871400804118, "learning_rate": 1.8999855756454943e-06, "loss": 0.5814, "step": 78 }, { "epoch": 0.37708830548926014, "grad_norm": 0.8403752789759832, "learning_rate": 1.8965078559251141e-06, "loss": 0.5864, "step": 79 }, { "epoch": 0.3818615751789976, "grad_norm": 0.8051967980548511, "learning_rate": 1.892973996443e-06, "loss": 0.5872, "step": 80 }, { "epoch": 0.38663484486873506, "grad_norm": 0.8042594188373205, "learning_rate": 1.8893842184911652e-06, "loss": 0.5763, "step": 81 }, { "epoch": 0.3914081145584726, "grad_norm": 0.9020174309993688, "learning_rate": 1.8857387468632673e-06, "loss": 0.5663, "step": 82 }, { "epoch": 0.39618138424821003, "grad_norm": 0.7886287092080712, "learning_rate": 1.8820378098405269e-06, "loss": 0.5749, "step": 83 }, { "epoch": 0.4009546539379475, "grad_norm": 0.7891386094058271, "learning_rate": 1.878281639177437e-06, "loss": 0.5791, "step": 84 }, { "epoch": 0.40572792362768495, "grad_norm": 0.8638559742903111, "learning_rate": 1.874470470087246e-06, "loss": 0.594, "step": 85 }, { "epoch": 0.4105011933174224, "grad_norm": 0.8722054176885525, "learning_rate": 1.8706045412272329e-06, "loss": 0.5958, "step": 86 }, { "epoch": 0.4152744630071599, "grad_norm": 0.8861516356836725, "learning_rate": 1.8666840946837588e-06, "loss": 0.5831, "step": 87 }, { "epoch": 0.4200477326968974, "grad_norm": 1.1646833402992178, "learning_rate": 1.8627093759571097e-06, "loss": 0.5773, "step": 88 }, { "epoch": 0.42482100238663484, "grad_norm": 1.015546055180046, "learning_rate": 1.8586806339461223e-06, "loss": 0.567, "step": 89 }, { "epoch": 0.4295942720763723, "grad_norm": 0.9466071623549958, "learning_rate": 1.8545981209325974e-06, "loss": 0.5859, "step": 90 }, { "epoch": 0.4343675417661098, "grad_norm": 0.7761872762176855, "learning_rate": 1.850462092565503e-06, "loss": 0.5786, "step": 91 }, { "epoch": 0.43914081145584727, "grad_norm": 0.7212954328261074, "learning_rate": 1.846272807844964e-06, "loss": 0.5643, "step": 92 }, { "epoch": 0.4439140811455847, "grad_norm": 0.9652990021129971, "learning_rate": 1.8420305291060453e-06, "loss": 0.5772, "step": 93 }, { "epoch": 0.4486873508353222, "grad_norm": 3.476043998914064, "learning_rate": 1.837735522002322e-06, "loss": 0.5973, "step": 94 }, { "epoch": 0.45346062052505964, "grad_norm": 0.9648957060855661, "learning_rate": 1.8333880554892465e-06, "loss": 0.5683, "step": 95 }, { "epoch": 0.45823389021479716, "grad_norm": 0.8226895202723103, "learning_rate": 1.828988401807304e-06, "loss": 0.5631, "step": 96 }, { "epoch": 0.4630071599045346, "grad_norm": 0.8353418687299229, "learning_rate": 1.8245368364649672e-06, "loss": 0.5478, "step": 97 }, { "epoch": 0.4677804295942721, "grad_norm": 0.7861845701165756, "learning_rate": 1.8200336382214404e-06, "loss": 0.5814, "step": 98 }, { "epoch": 0.47255369928400953, "grad_norm": 0.7869818557092823, "learning_rate": 1.815479089069208e-06, "loss": 0.5831, "step": 99 }, { "epoch": 0.477326968973747, "grad_norm": 1.0793699054838668, "learning_rate": 1.8108734742163714e-06, "loss": 0.5711, "step": 100 }, { "epoch": 0.4821002386634845, "grad_norm": 0.9191351283369057, "learning_rate": 1.8062170820687923e-06, "loss": 0.5829, "step": 101 }, { "epoch": 0.48687350835322196, "grad_norm": 0.8555793060148964, "learning_rate": 1.8015102042120314e-06, "loss": 0.5651, "step": 102 }, { "epoch": 0.4916467780429594, "grad_norm": 0.8381062392654873, "learning_rate": 1.796753135393089e-06, "loss": 0.578, "step": 103 }, { "epoch": 0.4964200477326969, "grad_norm": 0.9192300787533598, "learning_rate": 1.791946173501948e-06, "loss": 0.549, "step": 104 }, { "epoch": 0.5011933174224343, "grad_norm": 0.8307533286502056, "learning_rate": 1.7870896195529204e-06, "loss": 0.5427, "step": 105 }, { "epoch": 0.5059665871121718, "grad_norm": 0.7905696548307439, "learning_rate": 1.7821837776657967e-06, "loss": 0.5765, "step": 106 }, { "epoch": 0.5107398568019093, "grad_norm": 0.8311340345264336, "learning_rate": 1.777228955046803e-06, "loss": 0.5627, "step": 107 }, { "epoch": 0.5155131264916468, "grad_norm": 1.1408460136923761, "learning_rate": 1.7722254619693617e-06, "loss": 0.5615, "step": 108 }, { "epoch": 0.5202863961813843, "grad_norm": 0.9215940982960842, "learning_rate": 1.7671736117546643e-06, "loss": 0.559, "step": 109 }, { "epoch": 0.5250596658711217, "grad_norm": 0.9073194364535173, "learning_rate": 1.7620737207520498e-06, "loss": 0.5675, "step": 110 }, { "epoch": 0.5298329355608592, "grad_norm": 0.9064733521778133, "learning_rate": 1.756926108319194e-06, "loss": 0.564, "step": 111 }, { "epoch": 0.5346062052505967, "grad_norm": 0.8006367733355821, "learning_rate": 1.751731096802113e-06, "loss": 0.5697, "step": 112 }, { "epoch": 0.5393794749403341, "grad_norm": 0.7703477827683232, "learning_rate": 1.7464890115149759e-06, "loss": 0.5556, "step": 113 }, { "epoch": 0.5441527446300716, "grad_norm": 0.7808625090724881, "learning_rate": 1.7412001807197361e-06, "loss": 0.5699, "step": 114 }, { "epoch": 0.548926014319809, "grad_norm": 0.7891354086520267, "learning_rate": 1.735864935605572e-06, "loss": 0.5535, "step": 115 }, { "epoch": 0.5536992840095465, "grad_norm": 0.8559410057738829, "learning_rate": 1.7304836102681493e-06, "loss": 0.5456, "step": 116 }, { "epoch": 0.5584725536992841, "grad_norm": 1.0113045114994854, "learning_rate": 1.7250565416887015e-06, "loss": 0.5724, "step": 117 }, { "epoch": 0.5632458233890215, "grad_norm": 0.8876991951748312, "learning_rate": 1.719584069712925e-06, "loss": 0.568, "step": 118 }, { "epoch": 0.568019093078759, "grad_norm": 0.8642199309829095, "learning_rate": 1.7140665370296992e-06, "loss": 0.5501, "step": 119 }, { "epoch": 0.5727923627684964, "grad_norm": 0.7976943947559357, "learning_rate": 1.708504289149628e-06, "loss": 0.586, "step": 120 }, { "epoch": 0.5775656324582339, "grad_norm": 0.8256312101115841, "learning_rate": 1.702897674383402e-06, "loss": 0.5533, "step": 121 }, { "epoch": 0.5823389021479713, "grad_norm": 1.0090990785205396, "learning_rate": 1.697247043819988e-06, "loss": 0.5662, "step": 122 }, { "epoch": 0.5871121718377088, "grad_norm": 0.9155456337094188, "learning_rate": 1.6915527513046443e-06, "loss": 0.5683, "step": 123 }, { "epoch": 0.5918854415274463, "grad_norm": 0.8131468025811117, "learning_rate": 1.6858151534167616e-06, "loss": 0.5621, "step": 124 }, { "epoch": 0.5966587112171837, "grad_norm": 0.8064567687343521, "learning_rate": 1.6800346094475346e-06, "loss": 0.5596, "step": 125 }, { "epoch": 0.6014319809069213, "grad_norm": 0.7492395201342102, "learning_rate": 1.6742114813774618e-06, "loss": 0.5531, "step": 126 }, { "epoch": 0.6062052505966588, "grad_norm": 0.7647965464540142, "learning_rate": 1.6683461338536798e-06, "loss": 0.5832, "step": 127 }, { "epoch": 0.6109785202863962, "grad_norm": 0.7808066517921948, "learning_rate": 1.6624389341671278e-06, "loss": 0.5541, "step": 128 }, { "epoch": 0.6157517899761337, "grad_norm": 0.8430152851631113, "learning_rate": 1.656490252229548e-06, "loss": 0.5528, "step": 129 }, { "epoch": 0.6205250596658711, "grad_norm": 0.799740321239669, "learning_rate": 1.6505004605503223e-06, "loss": 0.5754, "step": 130 }, { "epoch": 0.6252983293556086, "grad_norm": 0.8524369396059758, "learning_rate": 1.6444699342131428e-06, "loss": 0.5659, "step": 131 }, { "epoch": 0.630071599045346, "grad_norm": 0.8594592125322017, "learning_rate": 1.638399050852528e-06, "loss": 0.5468, "step": 132 }, { "epoch": 0.6348448687350835, "grad_norm": 0.8710890648276657, "learning_rate": 1.632288190630172e-06, "loss": 0.5547, "step": 133 }, { "epoch": 0.639618138424821, "grad_norm": 1.3695399621239903, "learning_rate": 1.6261377362111396e-06, "loss": 0.5475, "step": 134 }, { "epoch": 0.6443914081145584, "grad_norm": 0.9119912953537386, "learning_rate": 1.6199480727399032e-06, "loss": 0.5622, "step": 135 }, { "epoch": 0.649164677804296, "grad_norm": 0.8174877663301265, "learning_rate": 1.6137195878162267e-06, "loss": 0.5646, "step": 136 }, { "epoch": 0.6539379474940334, "grad_norm": 0.9968710402813645, "learning_rate": 1.607452671470891e-06, "loss": 0.5524, "step": 137 }, { "epoch": 0.6587112171837709, "grad_norm": 0.7838173267581942, "learning_rate": 1.601147716141272e-06, "loss": 0.5517, "step": 138 }, { "epoch": 0.6634844868735084, "grad_norm": 0.8600041378892647, "learning_rate": 1.5948051166467657e-06, "loss": 0.5664, "step": 139 }, { "epoch": 0.6682577565632458, "grad_norm": 0.7393813982622772, "learning_rate": 1.5884252701640634e-06, "loss": 0.5611, "step": 140 }, { "epoch": 0.6730310262529833, "grad_norm": 0.8312116599801993, "learning_rate": 1.5820085762022823e-06, "loss": 0.5609, "step": 141 }, { "epoch": 0.6778042959427207, "grad_norm": 0.782610924284724, "learning_rate": 1.5755554365779455e-06, "loss": 0.5586, "step": 142 }, { "epoch": 0.6825775656324582, "grad_norm": 0.7869375949652244, "learning_rate": 1.5690662553898222e-06, "loss": 0.5557, "step": 143 }, { "epoch": 0.6873508353221957, "grad_norm": 0.7871275055021261, "learning_rate": 1.5625414389936218e-06, "loss": 0.5379, "step": 144 }, { "epoch": 0.6921241050119332, "grad_norm": 0.7978567113817064, "learning_rate": 1.555981395976548e-06, "loss": 0.5459, "step": 145 }, { "epoch": 0.6968973747016707, "grad_norm": 0.8678454065910531, "learning_rate": 1.5493865371317123e-06, "loss": 0.5538, "step": 146 }, { "epoch": 0.7016706443914081, "grad_norm": 0.8640558568867235, "learning_rate": 1.542757275432411e-06, "loss": 0.5511, "step": 147 }, { "epoch": 0.7064439140811456, "grad_norm": 0.8257539417151866, "learning_rate": 1.5360940260062635e-06, "loss": 0.5395, "step": 148 }, { "epoch": 0.711217183770883, "grad_norm": 0.7735477084244853, "learning_rate": 1.5293972061092185e-06, "loss": 0.5487, "step": 149 }, { "epoch": 0.7159904534606205, "grad_norm": 2.21607832896325, "learning_rate": 1.522667235099422e-06, "loss": 0.5313, "step": 150 }, { "epoch": 0.720763723150358, "grad_norm": 0.8260305997634725, "learning_rate": 1.515904534410961e-06, "loss": 0.548, "step": 151 }, { "epoch": 0.7255369928400954, "grad_norm": 0.9282281415854876, "learning_rate": 1.5091095275274699e-06, "loss": 0.5366, "step": 152 }, { "epoch": 0.7303102625298329, "grad_norm": 0.835392664470487, "learning_rate": 1.5022826399556133e-06, "loss": 0.5365, "step": 153 }, { "epoch": 0.7350835322195705, "grad_norm": 1.0014547232970634, "learning_rate": 1.4954242991984396e-06, "loss": 0.5601, "step": 154 }, { "epoch": 0.7398568019093079, "grad_norm": 0.7999358357306402, "learning_rate": 1.4885349347286115e-06, "loss": 0.549, "step": 155 }, { "epoch": 0.7446300715990454, "grad_norm": 0.7456244196208853, "learning_rate": 1.4816149779615126e-06, "loss": 0.5516, "step": 156 }, { "epoch": 0.7494033412887828, "grad_norm": 0.7568817924270603, "learning_rate": 1.474664862228229e-06, "loss": 0.5572, "step": 157 }, { "epoch": 0.7541766109785203, "grad_norm": 0.9329993871672655, "learning_rate": 1.467685022748419e-06, "loss": 0.5617, "step": 158 }, { "epoch": 0.7589498806682577, "grad_norm": 0.7402702977169047, "learning_rate": 1.4606758966030534e-06, "loss": 0.5426, "step": 159 }, { "epoch": 0.7637231503579952, "grad_norm": 0.7912657849322988, "learning_rate": 1.4536379227070509e-06, "loss": 0.544, "step": 160 }, { "epoch": 0.7684964200477327, "grad_norm": 0.8280839624728757, "learning_rate": 1.4465715417817888e-06, "loss": 0.5435, "step": 161 }, { "epoch": 0.7732696897374701, "grad_norm": 0.7376680395132865, "learning_rate": 1.4394771963275076e-06, "loss": 0.5199, "step": 162 }, { "epoch": 0.7780429594272077, "grad_norm": 0.7984252215551224, "learning_rate": 1.4323553305955997e-06, "loss": 0.5479, "step": 163 }, { "epoch": 0.7828162291169452, "grad_norm": 0.788726316639838, "learning_rate": 1.4252063905607909e-06, "loss": 0.5219, "step": 164 }, { "epoch": 0.7875894988066826, "grad_norm": 0.7350598897520126, "learning_rate": 1.4180308238932135e-06, "loss": 0.531, "step": 165 }, { "epoch": 0.7923627684964201, "grad_norm": 0.7786806805958749, "learning_rate": 1.410829079930372e-06, "loss": 0.5481, "step": 166 }, { "epoch": 0.7971360381861575, "grad_norm": 0.9607237271282482, "learning_rate": 1.4036016096490064e-06, "loss": 0.5478, "step": 167 }, { "epoch": 0.801909307875895, "grad_norm": 0.7782148550862285, "learning_rate": 1.3963488656368517e-06, "loss": 0.535, "step": 168 }, { "epoch": 0.8066825775656324, "grad_norm": 0.8100946646751193, "learning_rate": 1.389071302064295e-06, "loss": 0.5277, "step": 169 }, { "epoch": 0.8114558472553699, "grad_norm": 0.7502947220609039, "learning_rate": 1.381769374655938e-06, "loss": 0.5553, "step": 170 }, { "epoch": 0.8162291169451074, "grad_norm": 0.9124000354997026, "learning_rate": 1.374443540662057e-06, "loss": 0.5518, "step": 171 }, { "epoch": 0.8210023866348448, "grad_norm": 0.8409623949497625, "learning_rate": 1.3670942588299705e-06, "loss": 0.5294, "step": 172 }, { "epoch": 0.8257756563245824, "grad_norm": 0.8018568702519514, "learning_rate": 1.3597219893753117e-06, "loss": 0.5121, "step": 173 }, { "epoch": 0.8305489260143198, "grad_norm": 0.9262097539109866, "learning_rate": 1.352327193953211e-06, "loss": 0.5259, "step": 174 }, { "epoch": 0.8353221957040573, "grad_norm": 0.7289872898963717, "learning_rate": 1.3449103356293852e-06, "loss": 0.5601, "step": 175 }, { "epoch": 0.8400954653937948, "grad_norm": 0.7836398407929648, "learning_rate": 1.337471878851141e-06, "loss": 0.5359, "step": 176 }, { "epoch": 0.8448687350835322, "grad_norm": 0.8058359597234802, "learning_rate": 1.3300122894182909e-06, "loss": 0.5485, "step": 177 }, { "epoch": 0.8496420047732697, "grad_norm": 0.9118002301436436, "learning_rate": 1.3225320344539842e-06, "loss": 0.5562, "step": 178 }, { "epoch": 0.8544152744630071, "grad_norm": 0.7609979767002807, "learning_rate": 1.315031582375457e-06, "loss": 0.5485, "step": 179 }, { "epoch": 0.8591885441527446, "grad_norm": 0.7105869344115592, "learning_rate": 1.3075114028646974e-06, "loss": 0.5444, "step": 180 }, { "epoch": 0.863961813842482, "grad_norm": 0.8004311294692876, "learning_rate": 1.299971966839036e-06, "loss": 0.5481, "step": 181 }, { "epoch": 0.8687350835322196, "grad_norm": 0.7667234252631754, "learning_rate": 1.292413746421655e-06, "loss": 0.5345, "step": 182 }, { "epoch": 0.8735083532219571, "grad_norm": 0.7709523318159157, "learning_rate": 1.2848372149120246e-06, "loss": 0.512, "step": 183 }, { "epoch": 0.8782816229116945, "grad_norm": 0.8742048693859581, "learning_rate": 1.2772428467562651e-06, "loss": 0.55, "step": 184 }, { "epoch": 0.883054892601432, "grad_norm": 0.8768649061250284, "learning_rate": 1.2696311175174357e-06, "loss": 0.5365, "step": 185 }, { "epoch": 0.8878281622911695, "grad_norm": 0.8468420712736167, "learning_rate": 1.2620025038457554e-06, "loss": 0.5421, "step": 186 }, { "epoch": 0.8926014319809069, "grad_norm": 0.725877140171063, "learning_rate": 1.254357483448755e-06, "loss": 0.519, "step": 187 }, { "epoch": 0.8973747016706444, "grad_norm": 0.7168188099187686, "learning_rate": 1.2466965350613615e-06, "loss": 0.5651, "step": 188 }, { "epoch": 0.9021479713603818, "grad_norm": 0.8993966404570418, "learning_rate": 1.2390201384159219e-06, "loss": 0.5603, "step": 189 }, { "epoch": 0.9069212410501193, "grad_norm": 0.741646072361816, "learning_rate": 1.231328774212159e-06, "loss": 0.5157, "step": 190 }, { "epoch": 0.9116945107398569, "grad_norm": 0.7741706595084717, "learning_rate": 1.223622924087073e-06, "loss": 0.5367, "step": 191 }, { "epoch": 0.9164677804295943, "grad_norm": 0.760645151447744, "learning_rate": 1.215903070584779e-06, "loss": 0.5401, "step": 192 }, { "epoch": 0.9212410501193318, "grad_norm": 0.7462809840684769, "learning_rate": 1.2081696971262903e-06, "loss": 0.5458, "step": 193 }, { "epoch": 0.9260143198090692, "grad_norm": 0.867349599337623, "learning_rate": 1.2004232879792464e-06, "loss": 0.5398, "step": 194 }, { "epoch": 0.9307875894988067, "grad_norm": 0.7728255267176583, "learning_rate": 1.1926643282275882e-06, "loss": 0.5343, "step": 195 }, { "epoch": 0.9355608591885441, "grad_norm": 0.7946709962404823, "learning_rate": 1.1848933037411825e-06, "loss": 0.5181, "step": 196 }, { "epoch": 0.9403341288782816, "grad_norm": 0.7159173523126642, "learning_rate": 1.1771107011453933e-06, "loss": 0.5442, "step": 197 }, { "epoch": 0.9451073985680191, "grad_norm": 0.8493976289870552, "learning_rate": 1.1693170077906143e-06, "loss": 0.5467, "step": 198 }, { "epoch": 0.9498806682577565, "grad_norm": 0.7390118080756048, "learning_rate": 1.1615127117217463e-06, "loss": 0.5251, "step": 199 }, { "epoch": 0.954653937947494, "grad_norm": 0.7595495597083671, "learning_rate": 1.1536983016476373e-06, "loss": 0.5368, "step": 200 }, { "epoch": 0.9594272076372315, "grad_norm": 0.7399505119485492, "learning_rate": 1.1458742669104803e-06, "loss": 0.514, "step": 201 }, { "epoch": 0.964200477326969, "grad_norm": 0.7693531287817772, "learning_rate": 1.1380410974551682e-06, "loss": 0.5327, "step": 202 }, { "epoch": 0.9689737470167065, "grad_norm": 0.7361655101073081, "learning_rate": 1.130199283798615e-06, "loss": 0.5152, "step": 203 }, { "epoch": 0.9737470167064439, "grad_norm": 0.8174253218643999, "learning_rate": 1.1223493169990391e-06, "loss": 0.5376, "step": 204 }, { "epoch": 0.9785202863961814, "grad_norm": 0.7646163527785592, "learning_rate": 1.1144916886252124e-06, "loss": 0.5198, "step": 205 }, { "epoch": 0.9832935560859188, "grad_norm": 0.7600726494815581, "learning_rate": 1.1066268907256782e-06, "loss": 0.5358, "step": 206 }, { "epoch": 0.9880668257756563, "grad_norm": 0.8292480992474258, "learning_rate": 1.098755415797939e-06, "loss": 0.5319, "step": 207 }, { "epoch": 0.9928400954653938, "grad_norm": 0.7584975382780693, "learning_rate": 1.0908777567576168e-06, "loss": 0.5453, "step": 208 }, { "epoch": 0.9976133651551312, "grad_norm": 0.7360353406613074, "learning_rate": 1.0829944069075847e-06, "loss": 0.5398, "step": 209 }, { "epoch": 1.0, "grad_norm": 0.7360353406613074, "learning_rate": 1.0751058599070781e-06, "loss": 0.2683, "step": 210 }, { "epoch": 1.0047732696897376, "grad_norm": 0.7735348980384088, "learning_rate": 1.0672126097407795e-06, "loss": 0.4862, "step": 211 }, { "epoch": 1.009546539379475, "grad_norm": 0.6892850244639656, "learning_rate": 1.0593151506878865e-06, "loss": 0.4886, "step": 212 }, { "epoch": 1.0143198090692125, "grad_norm": 0.7416432308937427, "learning_rate": 1.0514139772911597e-06, "loss": 0.4755, "step": 213 }, { "epoch": 1.0190930787589498, "grad_norm": 0.6788376232914372, "learning_rate": 1.043509584325953e-06, "loss": 0.4643, "step": 214 }, { "epoch": 1.0238663484486874, "grad_norm": 0.7328906073842687, "learning_rate": 1.0356024667692314e-06, "loss": 0.4934, "step": 215 }, { "epoch": 1.0286396181384247, "grad_norm": 0.7697429459150121, "learning_rate": 1.0276931197685753e-06, "loss": 0.4976, "step": 216 }, { "epoch": 1.0334128878281623, "grad_norm": 0.7939705310040335, "learning_rate": 1.0197820386111737e-06, "loss": 0.4897, "step": 217 }, { "epoch": 1.0381861575178997, "grad_norm": 0.9752936792347606, "learning_rate": 1.0118697186928105e-06, "loss": 0.4632, "step": 218 }, { "epoch": 1.0429594272076372, "grad_norm": 0.810300278966379, "learning_rate": 1.0039566554868392e-06, "loss": 0.4667, "step": 219 }, { "epoch": 1.0477326968973748, "grad_norm": 0.7651633767231123, "learning_rate": 9.960433445131607e-07, "loss": 0.4913, "step": 220 }, { "epoch": 1.0525059665871122, "grad_norm": 0.7783544485209318, "learning_rate": 9.881302813071896e-07, "loss": 0.485, "step": 221 }, { "epoch": 1.0572792362768497, "grad_norm": 0.7728747490030172, "learning_rate": 9.802179613888262e-07, "loss": 0.4663, "step": 222 }, { "epoch": 1.062052505966587, "grad_norm": 0.7199803548701269, "learning_rate": 9.723068802314246e-07, "loss": 0.4724, "step": 223 }, { "epoch": 1.0668257756563246, "grad_norm": 0.8173682429078198, "learning_rate": 9.643975332307687e-07, "loss": 0.4777, "step": 224 }, { "epoch": 1.071599045346062, "grad_norm": 0.9029276240129886, "learning_rate": 9.564904156740471e-07, "loss": 0.4664, "step": 225 }, { "epoch": 1.0763723150357996, "grad_norm": 0.7595074592495551, "learning_rate": 9.485860227088405e-07, "loss": 0.4808, "step": 226 }, { "epoch": 1.081145584725537, "grad_norm": 0.8019805756491788, "learning_rate": 9.406848493121134e-07, "loss": 0.4764, "step": 227 }, { "epoch": 1.0859188544152745, "grad_norm": 0.7750922258239085, "learning_rate": 9.327873902592205e-07, "loss": 0.4711, "step": 228 }, { "epoch": 1.0906921241050118, "grad_norm": 0.7272348247085987, "learning_rate": 9.248941400929222e-07, "loss": 0.4753, "step": 229 }, { "epoch": 1.0954653937947494, "grad_norm": 0.8135968715591004, "learning_rate": 9.17005593092415e-07, "loss": 0.49, "step": 230 }, { "epoch": 1.100238663484487, "grad_norm": 0.784517413630989, "learning_rate": 9.09122243242383e-07, "loss": 0.4636, "step": 231 }, { "epoch": 1.1050119331742243, "grad_norm": 0.7967633635464352, "learning_rate": 9.01244584202061e-07, "loss": 0.4638, "step": 232 }, { "epoch": 1.1097852028639619, "grad_norm": 0.7347125585892648, "learning_rate": 8.933731092743219e-07, "loss": 0.4951, "step": 233 }, { "epoch": 1.1145584725536992, "grad_norm": 0.762506543894173, "learning_rate": 8.855083113747875e-07, "loss": 0.4715, "step": 234 }, { "epoch": 1.1193317422434368, "grad_norm": 0.7657159811972606, "learning_rate": 8.776506830009607e-07, "loss": 0.4792, "step": 235 }, { "epoch": 1.1241050119331741, "grad_norm": 0.7746280343348994, "learning_rate": 8.698007162013849e-07, "loss": 0.4734, "step": 236 }, { "epoch": 1.1288782816229117, "grad_norm": 0.7287940319917965, "learning_rate": 8.619589025448318e-07, "loss": 0.4899, "step": 237 }, { "epoch": 1.1336515513126493, "grad_norm": 0.7283506274833321, "learning_rate": 8.541257330895197e-07, "loss": 0.461, "step": 238 }, { "epoch": 1.1384248210023866, "grad_norm": 1.109020964160513, "learning_rate": 8.463016983523627e-07, "loss": 0.4789, "step": 239 }, { "epoch": 1.1431980906921242, "grad_norm": 0.8916069268430648, "learning_rate": 8.384872882782541e-07, "loss": 0.4951, "step": 240 }, { "epoch": 1.1479713603818615, "grad_norm": 0.7832561259348029, "learning_rate": 8.306829922093857e-07, "loss": 0.4666, "step": 241 }, { "epoch": 1.152744630071599, "grad_norm": 0.7246823419762234, "learning_rate": 8.228892988546067e-07, "loss": 0.475, "step": 242 }, { "epoch": 1.1575178997613365, "grad_norm": 0.705366097498364, "learning_rate": 8.15106696258818e-07, "loss": 0.4727, "step": 243 }, { "epoch": 1.162291169451074, "grad_norm": 0.7563603316000965, "learning_rate": 8.073356717724115e-07, "loss": 0.4779, "step": 244 }, { "epoch": 1.1670644391408114, "grad_norm": 0.7463996376621957, "learning_rate": 7.995767120207536e-07, "loss": 0.4647, "step": 245 }, { "epoch": 1.171837708830549, "grad_norm": 0.7117618711530662, "learning_rate": 7.918303028737096e-07, "loss": 0.4712, "step": 246 }, { "epoch": 1.1766109785202863, "grad_norm": 0.7445420769436453, "learning_rate": 7.840969294152211e-07, "loss": 0.4747, "step": 247 }, { "epoch": 1.1813842482100239, "grad_norm": 0.7339272409779617, "learning_rate": 7.763770759129269e-07, "loss": 0.4732, "step": 248 }, { "epoch": 1.1861575178997614, "grad_norm": 0.7680499628702099, "learning_rate": 7.68671225787841e-07, "loss": 0.4677, "step": 249 }, { "epoch": 1.1909307875894988, "grad_norm": 0.7289596879207738, "learning_rate": 7.609798615840785e-07, "loss": 0.4788, "step": 250 }, { "epoch": 1.1957040572792363, "grad_norm": 0.7375098113291024, "learning_rate": 7.533034649386384e-07, "loss": 0.456, "step": 251 }, { "epoch": 1.2004773269689737, "grad_norm": 0.7788484912408599, "learning_rate": 7.456425165512452e-07, "loss": 0.4768, "step": 252 }, { "epoch": 1.2052505966587113, "grad_norm": 0.7545300469644135, "learning_rate": 7.379974961542447e-07, "loss": 0.4864, "step": 253 }, { "epoch": 1.2100238663484486, "grad_norm": 0.8818787967594464, "learning_rate": 7.303688824825646e-07, "loss": 0.4768, "step": 254 }, { "epoch": 1.2147971360381862, "grad_norm": 0.7762788166887581, "learning_rate": 7.227571532437349e-07, "loss": 0.4676, "step": 255 }, { "epoch": 1.2195704057279237, "grad_norm": 0.674374793234199, "learning_rate": 7.151627850879755e-07, "loss": 0.4688, "step": 256 }, { "epoch": 1.224343675417661, "grad_norm": 0.7391271163895584, "learning_rate": 7.075862535783453e-07, "loss": 0.4545, "step": 257 }, { "epoch": 1.2291169451073987, "grad_norm": 0.7377869581736503, "learning_rate": 7.00028033160964e-07, "loss": 0.4842, "step": 258 }, { "epoch": 1.233890214797136, "grad_norm": 0.7182033053068443, "learning_rate": 6.924885971353026e-07, "loss": 0.4841, "step": 259 }, { "epoch": 1.2386634844868736, "grad_norm": 0.7165206421556828, "learning_rate": 6.849684176245431e-07, "loss": 0.4485, "step": 260 }, { "epoch": 1.243436754176611, "grad_norm": 0.8274126483370449, "learning_rate": 6.774679655460158e-07, "loss": 0.4632, "step": 261 }, { "epoch": 1.2482100238663485, "grad_norm": 0.7849668814937834, "learning_rate": 6.699877105817092e-07, "loss": 0.4701, "step": 262 }, { "epoch": 1.2529832935560858, "grad_norm": 0.7246643685451561, "learning_rate": 6.625281211488591e-07, "loss": 0.4884, "step": 263 }, { "epoch": 1.2577565632458234, "grad_norm": 0.7413214893244733, "learning_rate": 6.55089664370615e-07, "loss": 0.4821, "step": 264 }, { "epoch": 1.2625298329355608, "grad_norm": 0.7307541408287506, "learning_rate": 6.476728060467888e-07, "loss": 0.4585, "step": 265 }, { "epoch": 1.2673031026252983, "grad_norm": 0.7439228818529052, "learning_rate": 6.402780106246884e-07, "loss": 0.4688, "step": 266 }, { "epoch": 1.272076372315036, "grad_norm": 0.7075632105234686, "learning_rate": 6.329057411700298e-07, "loss": 0.4813, "step": 267 }, { "epoch": 1.2768496420047732, "grad_norm": 0.757650326028371, "learning_rate": 6.255564593379429e-07, "loss": 0.4878, "step": 268 }, { "epoch": 1.2816229116945108, "grad_norm": 0.729712295017678, "learning_rate": 6.182306253440619e-07, "loss": 0.4629, "step": 269 }, { "epoch": 1.2863961813842482, "grad_norm": 0.8230987908171445, "learning_rate": 6.109286979357051e-07, "loss": 0.4842, "step": 270 }, { "epoch": 1.2911694510739857, "grad_norm": 0.7878144207218812, "learning_rate": 6.036511343631488e-07, "loss": 0.4588, "step": 271 }, { "epoch": 1.295942720763723, "grad_norm": 0.7162555025211284, "learning_rate": 5.963983903509935e-07, "loss": 0.4817, "step": 272 }, { "epoch": 1.3007159904534606, "grad_norm": 0.7352227500252277, "learning_rate": 5.89170920069628e-07, "loss": 0.4781, "step": 273 }, { "epoch": 1.3054892601431982, "grad_norm": 0.7097358431174013, "learning_rate": 5.819691761067865e-07, "loss": 0.46, "step": 274 }, { "epoch": 1.3102625298329356, "grad_norm": 1.146161188184777, "learning_rate": 5.747936094392089e-07, "loss": 0.4647, "step": 275 }, { "epoch": 1.315035799522673, "grad_norm": 0.7072592435264768, "learning_rate": 5.676446694044002e-07, "loss": 0.4639, "step": 276 }, { "epoch": 1.3198090692124105, "grad_norm": 0.7215149618117556, "learning_rate": 5.605228036724927e-07, "loss": 0.4652, "step": 277 }, { "epoch": 1.324582338902148, "grad_norm": 0.670785774408122, "learning_rate": 5.534284582182114e-07, "loss": 0.4717, "step": 278 }, { "epoch": 1.3293556085918854, "grad_norm": 0.747767864677791, "learning_rate": 5.463620772929494e-07, "loss": 0.4536, "step": 279 }, { "epoch": 1.334128878281623, "grad_norm": 0.8516514509018951, "learning_rate": 5.393241033969466e-07, "loss": 0.4649, "step": 280 }, { "epoch": 1.3389021479713603, "grad_norm": 0.8138001829719436, "learning_rate": 5.323149772515812e-07, "loss": 0.4668, "step": 281 }, { "epoch": 1.3436754176610979, "grad_norm": 0.7576171145048753, "learning_rate": 5.253351377717706e-07, "loss": 0.4761, "step": 282 }, { "epoch": 1.3484486873508352, "grad_norm": 0.8613520066962265, "learning_rate": 5.183850220384873e-07, "loss": 0.469, "step": 283 }, { "epoch": 1.3532219570405728, "grad_norm": 0.766228885306893, "learning_rate": 5.114650652713884e-07, "loss": 0.4802, "step": 284 }, { "epoch": 1.3579952267303104, "grad_norm": 0.7068637893292556, "learning_rate": 5.045757008015606e-07, "loss": 0.4773, "step": 285 }, { "epoch": 1.3627684964200477, "grad_norm": 0.8429657657602729, "learning_rate": 4.977173600443868e-07, "loss": 0.4605, "step": 286 }, { "epoch": 1.3675417661097853, "grad_norm": 0.7007932505507933, "learning_rate": 4.908904724725299e-07, "loss": 0.4767, "step": 287 }, { "epoch": 1.3723150357995226, "grad_norm": 0.7671222670718428, "learning_rate": 4.840954655890391e-07, "loss": 0.4682, "step": 288 }, { "epoch": 1.3770883054892602, "grad_norm": 0.694265618019185, "learning_rate": 4.773327649005777e-07, "loss": 0.4855, "step": 289 }, { "epoch": 1.3818615751789975, "grad_norm": 0.7519150028535938, "learning_rate": 4.7060279389078184e-07, "loss": 0.4761, "step": 290 }, { "epoch": 1.3866348448687351, "grad_norm": 0.7486630511459641, "learning_rate": 4.6390597399373644e-07, "loss": 0.4565, "step": 291 }, { "epoch": 1.3914081145584727, "grad_norm": 0.7422555751664944, "learning_rate": 4.5724272456758907e-07, "loss": 0.4826, "step": 292 }, { "epoch": 1.39618138424821, "grad_norm": 0.77856112043872, "learning_rate": 4.506134628682877e-07, "loss": 0.4763, "step": 293 }, { "epoch": 1.4009546539379474, "grad_norm": 0.7684572854516972, "learning_rate": 4.440186040234524e-07, "loss": 0.4672, "step": 294 }, { "epoch": 1.405727923627685, "grad_norm": 0.7665847058665568, "learning_rate": 4.3745856100637834e-07, "loss": 0.4656, "step": 295 }, { "epoch": 1.4105011933174225, "grad_norm": 0.733469970387663, "learning_rate": 4.3093374461017785e-07, "loss": 0.4676, "step": 296 }, { "epoch": 1.4152744630071599, "grad_norm": 0.8421640257156171, "learning_rate": 4.244445634220545e-07, "loss": 0.4843, "step": 297 }, { "epoch": 1.4200477326968974, "grad_norm": 0.8009564109297522, "learning_rate": 4.1799142379771766e-07, "loss": 0.4809, "step": 298 }, { "epoch": 1.4248210023866348, "grad_norm": 0.7033349702559853, "learning_rate": 4.115747298359363e-07, "loss": 0.464, "step": 299 }, { "epoch": 1.4295942720763724, "grad_norm": 0.7437100788001662, "learning_rate": 4.0519488335323415e-07, "loss": 0.4851, "step": 300 }, { "epoch": 1.4343675417661097, "grad_norm": 0.7732697984175376, "learning_rate": 3.9885228385872806e-07, "loss": 0.4594, "step": 301 }, { "epoch": 1.4391408114558473, "grad_norm": 0.7940793070581448, "learning_rate": 3.925473285291091e-07, "loss": 0.4661, "step": 302 }, { "epoch": 1.4439140811455848, "grad_norm": 0.7351909971969558, "learning_rate": 3.862804121837733e-07, "loss": 0.4757, "step": 303 }, { "epoch": 1.4486873508353222, "grad_norm": 0.781207875542895, "learning_rate": 3.8005192726009663e-07, "loss": 0.4787, "step": 304 }, { "epoch": 1.4534606205250595, "grad_norm": 0.7991516861553173, "learning_rate": 3.738622637888608e-07, "loss": 0.4668, "step": 305 }, { "epoch": 1.458233890214797, "grad_norm": 0.8987252432386614, "learning_rate": 3.677118093698278e-07, "loss": 0.4606, "step": 306 }, { "epoch": 1.4630071599045347, "grad_norm": 0.698103668533834, "learning_rate": 3.61600949147472e-07, "loss": 0.4683, "step": 307 }, { "epoch": 1.467780429594272, "grad_norm": 0.7560261667555234, "learning_rate": 3.5553006578685706e-07, "loss": 0.4519, "step": 308 }, { "epoch": 1.4725536992840096, "grad_norm": 0.7382407678980342, "learning_rate": 3.494995394496778e-07, "loss": 0.469, "step": 309 }, { "epoch": 1.477326968973747, "grad_norm": 0.720898348204588, "learning_rate": 3.435097477704517e-07, "loss": 0.449, "step": 310 }, { "epoch": 1.4821002386634845, "grad_norm": 0.7319822241837816, "learning_rate": 3.3756106583287205e-07, "loss": 0.4745, "step": 311 }, { "epoch": 1.4868735083532219, "grad_norm": 0.7518826329514531, "learning_rate": 3.316538661463204e-07, "loss": 0.4918, "step": 312 }, { "epoch": 1.4916467780429594, "grad_norm": 0.8013086574909619, "learning_rate": 3.2578851862253796e-07, "loss": 0.4846, "step": 313 }, { "epoch": 1.496420047732697, "grad_norm": 0.7101861238945232, "learning_rate": 3.199653905524654e-07, "loss": 0.4604, "step": 314 }, { "epoch": 1.5011933174224343, "grad_norm": 0.7204781171906866, "learning_rate": 3.1418484658323806e-07, "loss": 0.4772, "step": 315 }, { "epoch": 1.5059665871121717, "grad_norm": 0.73033687450555, "learning_rate": 3.0844724869535577e-07, "loss": 0.468, "step": 316 }, { "epoch": 1.5107398568019093, "grad_norm": 0.7700114197888783, "learning_rate": 3.027529561800117e-07, "loss": 0.4808, "step": 317 }, { "epoch": 1.5155131264916468, "grad_norm": 0.8599415830432524, "learning_rate": 2.971023256165983e-07, "loss": 0.469, "step": 318 }, { "epoch": 1.5202863961813842, "grad_norm": 0.7490557961852297, "learning_rate": 2.9149571085037215e-07, "loss": 0.4758, "step": 319 }, { "epoch": 1.5250596658711217, "grad_norm": 0.6911043116400506, "learning_rate": 2.8593346297030073e-07, "loss": 0.4662, "step": 320 }, { "epoch": 1.5298329355608593, "grad_norm": 0.7444306144257443, "learning_rate": 2.804159302870751e-07, "loss": 0.4638, "step": 321 }, { "epoch": 1.5346062052505967, "grad_norm": 0.6930295325600317, "learning_rate": 2.7494345831129837e-07, "loss": 0.4584, "step": 322 }, { "epoch": 1.539379474940334, "grad_norm": 0.7461580524158721, "learning_rate": 2.6951638973185073e-07, "loss": 0.4757, "step": 323 }, { "epoch": 1.5441527446300716, "grad_norm": 0.7678530858976563, "learning_rate": 2.64135064394428e-07, "loss": 0.4807, "step": 324 }, { "epoch": 1.5489260143198091, "grad_norm": 1.8144860255245707, "learning_rate": 2.587998192802638e-07, "loss": 0.4605, "step": 325 }, { "epoch": 1.5536992840095465, "grad_norm": 0.7200319691236525, "learning_rate": 2.5351098848502386e-07, "loss": 0.474, "step": 326 }, { "epoch": 1.558472553699284, "grad_norm": 0.7134577877268367, "learning_rate": 2.482689031978872e-07, "loss": 0.4715, "step": 327 }, { "epoch": 1.5632458233890216, "grad_norm": 0.9468756981275396, "learning_rate": 2.4307389168080606e-07, "loss": 0.4656, "step": 328 }, { "epoch": 1.568019093078759, "grad_norm": 0.6688722309384391, "learning_rate": 2.3792627924795038e-07, "loss": 0.4922, "step": 329 }, { "epoch": 1.5727923627684963, "grad_norm": 0.7125789762828182, "learning_rate": 2.3282638824533529e-07, "loss": 0.4692, "step": 330 }, { "epoch": 1.577565632458234, "grad_norm": 0.8844333458882234, "learning_rate": 2.277745380306383e-07, "loss": 0.4876, "step": 331 }, { "epoch": 1.5823389021479715, "grad_norm": 0.7950308834961601, "learning_rate": 2.227710449531971e-07, "loss": 0.4918, "step": 332 }, { "epoch": 1.5871121718377088, "grad_norm": 0.796382860942759, "learning_rate": 2.178162223342035e-07, "loss": 0.4641, "step": 333 }, { "epoch": 1.5918854415274462, "grad_norm": 0.7285520770077796, "learning_rate": 2.1291038044707965e-07, "loss": 0.4661, "step": 334 }, { "epoch": 1.5966587112171837, "grad_norm": 0.6921820001369808, "learning_rate": 2.0805382649805225e-07, "loss": 0.4681, "step": 335 }, { "epoch": 1.6014319809069213, "grad_norm": 0.7552481890637776, "learning_rate": 2.032468646069112e-07, "loss": 0.4672, "step": 336 }, { "epoch": 1.6062052505966586, "grad_norm": 0.7155745101307224, "learning_rate": 1.9848979578796865e-07, "loss": 0.4767, "step": 337 }, { "epoch": 1.6109785202863962, "grad_norm": 0.6993076336434562, "learning_rate": 1.937829179312076e-07, "loss": 0.4822, "step": 338 }, { "epoch": 1.6157517899761338, "grad_norm": 0.7530303728674003, "learning_rate": 1.8912652578362853e-07, "loss": 0.4709, "step": 339 }, { "epoch": 1.6205250596658711, "grad_norm": 0.7510327363849882, "learning_rate": 1.8452091093079215e-07, "loss": 0.4604, "step": 340 }, { "epoch": 1.6252983293556085, "grad_norm": 0.7282910633876013, "learning_rate": 1.7996636177855928e-07, "loss": 0.4984, "step": 341 }, { "epoch": 1.630071599045346, "grad_norm": 0.7524297400825809, "learning_rate": 1.75463163535033e-07, "loss": 0.4823, "step": 342 }, { "epoch": 1.6348448687350836, "grad_norm": 0.7049222733481684, "learning_rate": 1.7101159819269583e-07, "loss": 0.4635, "step": 343 }, { "epoch": 1.639618138424821, "grad_norm": 1.1034453594616451, "learning_rate": 1.6661194451075345e-07, "loss": 0.4765, "step": 344 }, { "epoch": 1.6443914081145583, "grad_norm": 0.83013391018154, "learning_rate": 1.6226447799767772e-07, "loss": 0.4533, "step": 345 }, { "epoch": 1.649164677804296, "grad_norm": 2.858030289791699, "learning_rate": 1.5796947089395475e-07, "loss": 0.4691, "step": 346 }, { "epoch": 1.6539379474940334, "grad_norm": 0.7332905568570133, "learning_rate": 1.5372719215503582e-07, "loss": 0.4544, "step": 347 }, { "epoch": 1.6587112171837708, "grad_norm": 0.7481224605220782, "learning_rate": 1.4953790743449702e-07, "loss": 0.4806, "step": 348 }, { "epoch": 1.6634844868735084, "grad_norm": 0.9099408876904721, "learning_rate": 1.4540187906740241e-07, "loss": 0.4569, "step": 349 }, { "epoch": 1.668257756563246, "grad_norm": 0.6921320546034447, "learning_rate": 1.4131936605387762e-07, "loss": 0.4897, "step": 350 }, { "epoch": 1.6730310262529833, "grad_norm": 0.7172188028374827, "learning_rate": 1.3729062404289017e-07, "loss": 0.4799, "step": 351 }, { "epoch": 1.6778042959427206, "grad_norm": 0.7348308299387173, "learning_rate": 1.3331590531624115e-07, "loss": 0.4714, "step": 352 }, { "epoch": 1.6825775656324582, "grad_norm": 0.7524117454719962, "learning_rate": 1.2939545877276726e-07, "loss": 0.4679, "step": 353 }, { "epoch": 1.6873508353221958, "grad_norm": 0.7609980327732692, "learning_rate": 1.25529529912754e-07, "loss": 0.4678, "step": 354 }, { "epoch": 1.692124105011933, "grad_norm": 0.7906234591099575, "learning_rate": 1.2171836082256316e-07, "loss": 0.4754, "step": 355 }, { "epoch": 1.6968973747016707, "grad_norm": 0.7519337557814546, "learning_rate": 1.1796219015947285e-07, "loss": 0.4803, "step": 356 }, { "epoch": 1.7016706443914082, "grad_norm": 0.6859134821445197, "learning_rate": 1.1426125313673285e-07, "loss": 0.4939, "step": 357 }, { "epoch": 1.7064439140811456, "grad_norm": 0.8229493204752176, "learning_rate": 1.1061578150883444e-07, "loss": 0.4372, "step": 358 }, { "epoch": 1.711217183770883, "grad_norm": 0.692317996696451, "learning_rate": 1.070260035570002e-07, "loss": 0.4792, "step": 359 }, { "epoch": 1.7159904534606205, "grad_norm": 0.7390705342617898, "learning_rate": 1.0349214407488571e-07, "loss": 0.4719, "step": 360 }, { "epoch": 1.720763723150358, "grad_norm": 0.7057263439063961, "learning_rate": 1.000144243545058e-07, "loss": 0.4724, "step": 361 }, { "epoch": 1.7255369928400954, "grad_norm": 0.707795857913463, "learning_rate": 9.659306217237517e-08, "loss": 0.4717, "step": 362 }, { "epoch": 1.7303102625298328, "grad_norm": 0.7912536951606031, "learning_rate": 9.322827177587212e-08, "loss": 0.4623, "step": 363 }, { "epoch": 1.7350835322195706, "grad_norm": 0.746736598206851, "learning_rate": 8.992026386982221e-08, "loss": 0.4735, "step": 364 }, { "epoch": 1.739856801909308, "grad_norm": 0.6948885657819285, "learning_rate": 8.66692456033029e-08, "loss": 0.4825, "step": 365 }, { "epoch": 1.7446300715990453, "grad_norm": 0.7262491961744311, "learning_rate": 8.347542055667311e-08, "loss": 0.4699, "step": 366 }, { "epoch": 1.7494033412887828, "grad_norm": 0.7863038143235231, "learning_rate": 8.033898872882394e-08, "loss": 0.4679, "step": 367 }, { "epoch": 1.7541766109785204, "grad_norm": 0.6727626949269937, "learning_rate": 7.726014652465507e-08, "loss": 0.4421, "step": 368 }, { "epoch": 1.7589498806682577, "grad_norm": 0.6867145980818331, "learning_rate": 7.423908674277579e-08, "loss": 0.4778, "step": 369 }, { "epoch": 1.763723150357995, "grad_norm": 1.4213029472300538, "learning_rate": 7.127599856343192e-08, "loss": 0.4727, "step": 370 }, { "epoch": 1.7684964200477327, "grad_norm": 0.692012937763345, "learning_rate": 6.837106753665823e-08, "loss": 0.4741, "step": 371 }, { "epoch": 1.7732696897374702, "grad_norm": 0.7092148893859065, "learning_rate": 6.552447557066109e-08, "loss": 0.4697, "step": 372 }, { "epoch": 1.7780429594272076, "grad_norm": 0.6973356829898804, "learning_rate": 6.273640092042575e-08, "loss": 0.4544, "step": 373 }, { "epoch": 1.7828162291169452, "grad_norm": 1.5448551643686548, "learning_rate": 6.000701817655474e-08, "loss": 0.4523, "step": 374 }, { "epoch": 1.7875894988066827, "grad_norm": 1.4827724692081619, "learning_rate": 5.733649825433384e-08, "loss": 0.4551, "step": 375 }, { "epoch": 1.79236276849642, "grad_norm": 0.7790516793749164, "learning_rate": 5.47250083830314e-08, "loss": 0.494, "step": 376 }, { "epoch": 1.7971360381861574, "grad_norm": 0.7365514384441436, "learning_rate": 5.217271209542384e-08, "loss": 0.4735, "step": 377 }, { "epoch": 1.801909307875895, "grad_norm": 0.7707502808832377, "learning_rate": 4.967976921755679e-08, "loss": 0.4501, "step": 378 }, { "epoch": 1.8066825775656326, "grad_norm": 0.7176835200739754, "learning_rate": 4.724633585873627e-08, "loss": 0.4686, "step": 379 }, { "epoch": 1.81145584725537, "grad_norm": 0.6889468337016494, "learning_rate": 4.487256440175291e-08, "loss": 0.4771, "step": 380 }, { "epoch": 1.8162291169451072, "grad_norm": 1.0649529564643607, "learning_rate": 4.255860349334006e-08, "loss": 0.4661, "step": 381 }, { "epoch": 1.8210023866348448, "grad_norm": 1.1333041301606328, "learning_rate": 4.030459803486464e-08, "loss": 0.4606, "step": 382 }, { "epoch": 1.8257756563245824, "grad_norm": 0.765268616008849, "learning_rate": 3.811068917325444e-08, "loss": 0.4442, "step": 383 }, { "epoch": 1.8305489260143197, "grad_norm": 0.701547689578903, "learning_rate": 3.5977014292158495e-08, "loss": 0.4739, "step": 384 }, { "epoch": 1.8353221957040573, "grad_norm": 0.7141975076446941, "learning_rate": 3.3903707003344774e-08, "loss": 0.4719, "step": 385 }, { "epoch": 1.8400954653937949, "grad_norm": 0.6918753885495199, "learning_rate": 3.189089713833226e-08, "loss": 0.4772, "step": 386 }, { "epoch": 1.8448687350835322, "grad_norm": 0.714964202433507, "learning_rate": 2.9938710740262884e-08, "loss": 0.4561, "step": 387 }, { "epoch": 1.8496420047732696, "grad_norm": 0.7838822438811583, "learning_rate": 2.8047270056005934e-08, "loss": 0.4565, "step": 388 }, { "epoch": 1.8544152744630071, "grad_norm": 0.7061577623995287, "learning_rate": 2.6216693528505195e-08, "loss": 0.4648, "step": 389 }, { "epoch": 1.8591885441527447, "grad_norm": 0.9071757882196184, "learning_rate": 2.4447095789360884e-08, "loss": 0.4711, "step": 390 }, { "epoch": 1.863961813842482, "grad_norm": 0.765845128347514, "learning_rate": 2.2738587651651487e-08, "loss": 0.4577, "step": 391 }, { "epoch": 1.8687350835322196, "grad_norm": 0.7650600946027074, "learning_rate": 2.109127610299466e-08, "loss": 0.4679, "step": 392 }, { "epoch": 1.8735083532219572, "grad_norm": 0.6957819402359949, "learning_rate": 1.950526429884769e-08, "loss": 0.4559, "step": 393 }, { "epoch": 1.8782816229116945, "grad_norm": 0.7430172728751436, "learning_rate": 1.7980651556048e-08, "loss": 0.4732, "step": 394 }, { "epoch": 1.8830548926014319, "grad_norm": 0.6767278663023139, "learning_rate": 1.6517533346593226e-08, "loss": 0.4758, "step": 395 }, { "epoch": 1.8878281622911695, "grad_norm": 0.7619777582419104, "learning_rate": 1.5116001291663462e-08, "loss": 0.4932, "step": 396 }, { "epoch": 1.892601431980907, "grad_norm": 0.790159743362526, "learning_rate": 1.3776143155883491e-08, "loss": 0.4558, "step": 397 }, { "epoch": 1.8973747016706444, "grad_norm": 0.7261843559497824, "learning_rate": 1.2498042841827317e-08, "loss": 0.4595, "step": 398 }, { "epoch": 1.9021479713603817, "grad_norm": 0.7017669980294373, "learning_rate": 1.128178038476324e-08, "loss": 0.4625, "step": 399 }, { "epoch": 1.9069212410501193, "grad_norm": 0.6784318458229694, "learning_rate": 1.0127431947643316e-08, "loss": 0.4671, "step": 400 }, { "epoch": 1.9116945107398569, "grad_norm": 0.675130035717692, "learning_rate": 9.035069816332619e-09, "loss": 0.464, "step": 401 }, { "epoch": 1.9164677804295942, "grad_norm": 0.8221120490850481, "learning_rate": 8.004762395083963e-09, "loss": 0.4537, "step": 402 }, { "epoch": 1.9212410501193318, "grad_norm": 0.7612136405972405, "learning_rate": 7.036574202253343e-09, "loss": 0.4914, "step": 403 }, { "epoch": 1.9260143198090693, "grad_norm": 0.8291594902189451, "learning_rate": 6.130565866260484e-09, "loss": 0.4727, "step": 404 }, { "epoch": 1.9307875894988067, "grad_norm": 1.0329364100399496, "learning_rate": 5.286794121791782e-09, "loss": 0.4767, "step": 405 }, { "epoch": 1.935560859188544, "grad_norm": 0.8758229910700595, "learning_rate": 4.5053118062478025e-09, "loss": 0.4501, "step": 406 }, { "epoch": 1.9403341288782816, "grad_norm": 0.7067697193260255, "learning_rate": 3.786167856434375e-09, "loss": 0.4747, "step": 407 }, { "epoch": 1.9451073985680192, "grad_norm": 0.7459961970155857, "learning_rate": 3.1294073054987102e-09, "loss": 0.4605, "step": 408 }, { "epoch": 1.9498806682577565, "grad_norm": 0.7585534385150827, "learning_rate": 2.5350712801084363e-09, "loss": 0.4528, "step": 409 }, { "epoch": 1.9546539379474939, "grad_norm": 0.6767868247269999, "learning_rate": 2.003196997877099e-09, "loss": 0.4585, "step": 410 }, { "epoch": 1.9594272076372317, "grad_norm": 0.7126370902337825, "learning_rate": 1.5338177650332517e-09, "loss": 0.4591, "step": 411 }, { "epoch": 1.964200477326969, "grad_norm": 0.7172728813954358, "learning_rate": 1.1269629743346777e-09, "loss": 0.4589, "step": 412 }, { "epoch": 1.9689737470167064, "grad_norm": 0.8158860106123756, "learning_rate": 7.826581032279734e-10, "loss": 0.4601, "step": 413 }, { "epoch": 1.973747016706444, "grad_norm": 0.8261699459606863, "learning_rate": 5.00924712252937e-10, "loss": 0.4731, "step": 414 }, { "epoch": 1.9785202863961815, "grad_norm": 0.7168072767819187, "learning_rate": 2.8178044369286945e-10, "loss": 0.4657, "step": 415 }, { "epoch": 1.9832935560859188, "grad_norm": 0.6783006404134123, "learning_rate": 1.2523902046934763e-10, "loss": 0.452, "step": 416 }, { "epoch": 1.9880668257756562, "grad_norm": 0.7080089156985594, "learning_rate": 3.131024528302273e-11, "loss": 0.4737, "step": 417 }, { "epoch": 1.9928400954653938, "grad_norm": 0.7031897431837284, "learning_rate": 0.0, "loss": 0.4817, "step": 418 }, { "epoch": 1.9928400954653938, "step": 418, "total_flos": 3166299160051712.0, "train_loss": 0.5405693022828353, "train_runtime": 17864.3337, "train_samples_per_second": 2.995, "train_steps_per_second": 0.023 } ], "logging_steps": 1, "max_steps": 418, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3166299160051712.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }