{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7724991614826253, "eval_steps": 500, "global_step": 30517, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007847256940708912, "grad_norm": 26.94572639465332, "learning_rate": 1.0157273918741808e-06, "loss": 8.5879, "step": 31 }, { "epoch": 0.0015694513881417823, "grad_norm": 14.633014678955078, "learning_rate": 2.0314547837483616e-06, "loss": 7.5048, "step": 62 }, { "epoch": 0.002354177082212673, "grad_norm": 15.984803199768066, "learning_rate": 3.0471821756225426e-06, "loss": 6.1391, "step": 93 }, { "epoch": 0.0031389027762835646, "grad_norm": 11.297175407409668, "learning_rate": 4.062909567496723e-06, "loss": 4.9299, "step": 124 }, { "epoch": 0.003923628470354455, "grad_norm": 14.864474296569824, "learning_rate": 5.078636959370905e-06, "loss": 4.3205, "step": 155 }, { "epoch": 0.004708354164425346, "grad_norm": 11.237608909606934, "learning_rate": 6.094364351245085e-06, "loss": 4.0, "step": 186 }, { "epoch": 0.005493079858496238, "grad_norm": 23.79303550720215, "learning_rate": 7.110091743119267e-06, "loss": 3.7952, "step": 217 }, { "epoch": 0.006277805552567129, "grad_norm": 15.1551513671875, "learning_rate": 8.125819134993446e-06, "loss": 3.689, "step": 248 }, { "epoch": 0.00706253124663802, "grad_norm": 14.605571746826172, "learning_rate": 9.141546526867629e-06, "loss": 3.5147, "step": 279 }, { "epoch": 0.00784725694070891, "grad_norm": 16.463390350341797, "learning_rate": 1.015727391874181e-05, "loss": 3.3901, "step": 310 }, { "epoch": 0.008631982634779801, "grad_norm": 13.09945011138916, "learning_rate": 1.117300131061599e-05, "loss": 3.317, "step": 341 }, { "epoch": 0.009416708328850693, "grad_norm": 11.993067741394043, "learning_rate": 1.218872870249017e-05, "loss": 3.2508, "step": 372 }, { "epoch": 0.010201434022921584, "grad_norm": 10.388030052185059, "learning_rate": 1.3204456094364351e-05, "loss": 3.1239, "step": 403 }, { "epoch": 0.010986159716992476, "grad_norm": 11.977804183959961, "learning_rate": 1.4220183486238533e-05, "loss": 3.0739, "step": 434 }, { "epoch": 0.011770885411063367, "grad_norm": 8.925983428955078, "learning_rate": 1.5235910878112714e-05, "loss": 3.0169, "step": 465 }, { "epoch": 0.012555611105134258, "grad_norm": 9.57411003112793, "learning_rate": 1.6251638269986893e-05, "loss": 2.959, "step": 496 }, { "epoch": 0.01334033679920515, "grad_norm": 7.380288124084473, "learning_rate": 1.7267365661861077e-05, "loss": 2.8921, "step": 527 }, { "epoch": 0.01412506249327604, "grad_norm": 8.812368392944336, "learning_rate": 1.8283093053735257e-05, "loss": 2.843, "step": 558 }, { "epoch": 0.014909788187346932, "grad_norm": 8.870095252990723, "learning_rate": 1.9298820445609438e-05, "loss": 2.7895, "step": 589 }, { "epoch": 0.01569451388141782, "grad_norm": 9.503872871398926, "learning_rate": 2.031454783748362e-05, "loss": 2.7757, "step": 620 }, { "epoch": 0.016479239575488712, "grad_norm": 6.582827568054199, "learning_rate": 2.13302752293578e-05, "loss": 2.7099, "step": 651 }, { "epoch": 0.017263965269559603, "grad_norm": 6.266632556915283, "learning_rate": 2.234600262123198e-05, "loss": 2.6729, "step": 682 }, { "epoch": 0.018048690963630494, "grad_norm": 6.645415306091309, "learning_rate": 2.336173001310616e-05, "loss": 2.6616, "step": 713 }, { "epoch": 0.018833416657701385, "grad_norm": 7.8323073387146, "learning_rate": 2.437745740498034e-05, "loss": 2.6291, "step": 744 }, { "epoch": 0.019618142351772276, "grad_norm": 5.577521324157715, "learning_rate": 2.5393184796854525e-05, "loss": 2.6072, "step": 775 }, { "epoch": 0.020402868045843167, "grad_norm": 5.603636264801025, "learning_rate": 2.6408912188728702e-05, "loss": 2.5787, "step": 806 }, { "epoch": 0.021187593739914058, "grad_norm": 6.945438385009766, "learning_rate": 2.7424639580602886e-05, "loss": 2.5198, "step": 837 }, { "epoch": 0.021972319433984953, "grad_norm": 5.6279826164245605, "learning_rate": 2.8440366972477066e-05, "loss": 2.5417, "step": 868 }, { "epoch": 0.022757045128055844, "grad_norm": 5.517001628875732, "learning_rate": 2.9456094364351244e-05, "loss": 2.4849, "step": 899 }, { "epoch": 0.023541770822126735, "grad_norm": 5.865486145019531, "learning_rate": 3.0471821756225428e-05, "loss": 2.5103, "step": 930 }, { "epoch": 0.024326496516197626, "grad_norm": 4.949043273925781, "learning_rate": 3.148754914809961e-05, "loss": 2.4581, "step": 961 }, { "epoch": 0.025111222210268517, "grad_norm": 4.701717853546143, "learning_rate": 3.2503276539973785e-05, "loss": 2.4315, "step": 992 }, { "epoch": 0.025895947904339408, "grad_norm": 4.533145904541016, "learning_rate": 3.351900393184797e-05, "loss": 2.4056, "step": 1023 }, { "epoch": 0.0266806735984103, "grad_norm": 4.724672794342041, "learning_rate": 3.453473132372215e-05, "loss": 2.3994, "step": 1054 }, { "epoch": 0.02746539929248119, "grad_norm": 4.745669364929199, "learning_rate": 3.555045871559633e-05, "loss": 2.3546, "step": 1085 }, { "epoch": 0.02825012498655208, "grad_norm": 4.4554948806762695, "learning_rate": 3.6566186107470514e-05, "loss": 2.3642, "step": 1116 }, { "epoch": 0.029034850680622972, "grad_norm": 4.4792304039001465, "learning_rate": 3.7581913499344695e-05, "loss": 2.3296, "step": 1147 }, { "epoch": 0.029819576374693863, "grad_norm": 3.9329679012298584, "learning_rate": 3.8597640891218876e-05, "loss": 2.3105, "step": 1178 }, { "epoch": 0.030604302068764754, "grad_norm": 4.338287830352783, "learning_rate": 3.9613368283093056e-05, "loss": 2.2811, "step": 1209 }, { "epoch": 0.03138902776283564, "grad_norm": 4.130499839782715, "learning_rate": 4.062909567496724e-05, "loss": 2.2898, "step": 1240 }, { "epoch": 0.03217375345690653, "grad_norm": 3.5664470195770264, "learning_rate": 4.164482306684142e-05, "loss": 2.2786, "step": 1271 }, { "epoch": 0.032958479150977424, "grad_norm": 3.642627716064453, "learning_rate": 4.26605504587156e-05, "loss": 2.2439, "step": 1302 }, { "epoch": 0.033743204845048315, "grad_norm": 3.7562780380249023, "learning_rate": 4.367627785058978e-05, "loss": 2.2441, "step": 1333 }, { "epoch": 0.034527930539119206, "grad_norm": 3.3117406368255615, "learning_rate": 4.469200524246396e-05, "loss": 2.2604, "step": 1364 }, { "epoch": 0.0353126562331901, "grad_norm": 3.4313724040985107, "learning_rate": 4.570773263433814e-05, "loss": 2.2069, "step": 1395 }, { "epoch": 0.03609738192726099, "grad_norm": 3.4720091819763184, "learning_rate": 4.672346002621232e-05, "loss": 2.2087, "step": 1426 }, { "epoch": 0.03688210762133188, "grad_norm": 3.491856575012207, "learning_rate": 4.77391874180865e-05, "loss": 2.1808, "step": 1457 }, { "epoch": 0.03766683331540277, "grad_norm": 3.3730666637420654, "learning_rate": 4.875491480996068e-05, "loss": 2.1907, "step": 1488 }, { "epoch": 0.03845155900947366, "grad_norm": 2.894322395324707, "learning_rate": 4.977064220183487e-05, "loss": 2.1689, "step": 1519 }, { "epoch": 0.03923628470354455, "grad_norm": 3.195884943008423, "learning_rate": 4.9999915451558777e-05, "loss": 2.194, "step": 1550 }, { "epoch": 0.04002101039761544, "grad_norm": 3.154061794281006, "learning_rate": 4.999955597496219e-05, "loss": 2.1409, "step": 1581 }, { "epoch": 0.040805736091686334, "grad_norm": 2.8204188346862793, "learning_rate": 4.9998914381774255e-05, "loss": 2.145, "step": 1612 }, { "epoch": 0.041590461785757225, "grad_norm": 2.98260760307312, "learning_rate": 4.999799067923527e-05, "loss": 2.1523, "step": 1643 }, { "epoch": 0.042375187479828116, "grad_norm": 2.917949914932251, "learning_rate": 4.999678487776908e-05, "loss": 2.1221, "step": 1674 }, { "epoch": 0.04315991317389901, "grad_norm": 2.811469554901123, "learning_rate": 4.9995296990983006e-05, "loss": 2.1242, "step": 1705 }, { "epoch": 0.043944638867969905, "grad_norm": 3.067636728286743, "learning_rate": 4.999352703566763e-05, "loss": 2.1092, "step": 1736 }, { "epoch": 0.044729364562040796, "grad_norm": 2.6231868267059326, "learning_rate": 4.999147503179668e-05, "loss": 2.1018, "step": 1767 }, { "epoch": 0.04551409025611169, "grad_norm": 2.8247616291046143, "learning_rate": 4.998914100252672e-05, "loss": 2.074, "step": 1798 }, { "epoch": 0.04629881595018258, "grad_norm": 2.5960075855255127, "learning_rate": 4.998652497419696e-05, "loss": 2.0824, "step": 1829 }, { "epoch": 0.04708354164425347, "grad_norm": 2.7796943187713623, "learning_rate": 4.9983626976328927e-05, "loss": 2.0998, "step": 1860 }, { "epoch": 0.04786826733832436, "grad_norm": 2.49242901802063, "learning_rate": 4.998044704162613e-05, "loss": 2.0893, "step": 1891 }, { "epoch": 0.04865299303239525, "grad_norm": 2.4294378757476807, "learning_rate": 4.9976985205973705e-05, "loss": 2.0617, "step": 1922 }, { "epoch": 0.04943771872646614, "grad_norm": 2.553217649459839, "learning_rate": 4.997324150843799e-05, "loss": 2.0632, "step": 1953 }, { "epoch": 0.050222444420537034, "grad_norm": 2.6711318492889404, "learning_rate": 4.99692159912661e-05, "loss": 2.0445, "step": 1984 }, { "epoch": 0.051007170114607925, "grad_norm": 2.714432716369629, "learning_rate": 4.996490869988546e-05, "loss": 2.0185, "step": 2015 }, { "epoch": 0.051791895808678816, "grad_norm": 2.6516053676605225, "learning_rate": 4.996031968290326e-05, "loss": 2.057, "step": 2046 }, { "epoch": 0.05257662150274971, "grad_norm": 2.4798831939697266, "learning_rate": 4.995544899210594e-05, "loss": 2.0199, "step": 2077 }, { "epoch": 0.0533613471968206, "grad_norm": 2.5150041580200195, "learning_rate": 4.9950296682458583e-05, "loss": 2.0264, "step": 2108 }, { "epoch": 0.05414607289089149, "grad_norm": 2.637777805328369, "learning_rate": 4.994486281210429e-05, "loss": 2.0233, "step": 2139 }, { "epoch": 0.05493079858496238, "grad_norm": 2.330376148223877, "learning_rate": 4.9939147442363566e-05, "loss": 2.0201, "step": 2170 }, { "epoch": 0.05571552427903327, "grad_norm": 2.3436174392700195, "learning_rate": 4.9933150637733574e-05, "loss": 1.9865, "step": 2201 }, { "epoch": 0.05650024997310416, "grad_norm": 2.7756845951080322, "learning_rate": 4.992687246588743e-05, "loss": 1.9983, "step": 2232 }, { "epoch": 0.05728497566717505, "grad_norm": 2.1725504398345947, "learning_rate": 4.992031299767347e-05, "loss": 1.9689, "step": 2263 }, { "epoch": 0.058069701361245944, "grad_norm": 2.2163312435150146, "learning_rate": 4.9913472307114386e-05, "loss": 1.9829, "step": 2294 }, { "epoch": 0.058854427055316835, "grad_norm": 2.2829232215881348, "learning_rate": 4.9906350471406446e-05, "loss": 2.0142, "step": 2325 }, { "epoch": 0.059639152749387726, "grad_norm": 2.239596366882324, "learning_rate": 4.989894757091861e-05, "loss": 1.9697, "step": 2356 }, { "epoch": 0.06042387844345862, "grad_norm": 2.2926037311553955, "learning_rate": 4.989126368919158e-05, "loss": 1.9688, "step": 2387 }, { "epoch": 0.06120860413752951, "grad_norm": 10.08767032623291, "learning_rate": 4.988329891293693e-05, "loss": 1.9845, "step": 2418 }, { "epoch": 0.0619933298316004, "grad_norm": 2.2427194118499756, "learning_rate": 4.987505333203608e-05, "loss": 1.9744, "step": 2449 }, { "epoch": 0.06277805552567128, "grad_norm": 2.5111870765686035, "learning_rate": 4.9866527039539276e-05, "loss": 1.9526, "step": 2480 }, { "epoch": 0.06356278121974218, "grad_norm": 2.2100026607513428, "learning_rate": 4.9857720131664594e-05, "loss": 1.9826, "step": 2511 }, { "epoch": 0.06434750691381307, "grad_norm": 2.2112088203430176, "learning_rate": 4.9848632707796773e-05, "loss": 1.9698, "step": 2542 }, { "epoch": 0.06513223260788396, "grad_norm": 2.404014825820923, "learning_rate": 4.9839264870486155e-05, "loss": 1.9628, "step": 2573 }, { "epoch": 0.06591695830195485, "grad_norm": 2.526423692703247, "learning_rate": 4.9829616725447526e-05, "loss": 1.9481, "step": 2604 }, { "epoch": 0.06670168399602575, "grad_norm": 2.2506027221679688, "learning_rate": 4.981968838155888e-05, "loss": 1.9418, "step": 2635 }, { "epoch": 0.06748640969009663, "grad_norm": 2.4334371089935303, "learning_rate": 4.980947995086024e-05, "loss": 1.9423, "step": 2666 }, { "epoch": 0.06827113538416753, "grad_norm": 2.3028314113616943, "learning_rate": 4.979899154855234e-05, "loss": 1.9391, "step": 2697 }, { "epoch": 0.06905586107823841, "grad_norm": 2.122143030166626, "learning_rate": 4.9788223292995386e-05, "loss": 1.933, "step": 2728 }, { "epoch": 0.06984058677230931, "grad_norm": 2.1335129737854004, "learning_rate": 4.977717530570768e-05, "loss": 1.9212, "step": 2759 }, { "epoch": 0.0706253124663802, "grad_norm": 2.198650598526001, "learning_rate": 4.976584771136425e-05, "loss": 1.9217, "step": 2790 }, { "epoch": 0.07141003816045109, "grad_norm": 2.4985201358795166, "learning_rate": 4.975424063779547e-05, "loss": 1.9277, "step": 2821 }, { "epoch": 0.07219476385452198, "grad_norm": 1.9877598285675049, "learning_rate": 4.974235421598557e-05, "loss": 1.9278, "step": 2852 }, { "epoch": 0.07297948954859287, "grad_norm": 3.0082573890686035, "learning_rate": 4.973018858007122e-05, "loss": 1.9261, "step": 2883 }, { "epoch": 0.07376421524266376, "grad_norm": 2.139742851257324, "learning_rate": 4.9717743867339963e-05, "loss": 1.9168, "step": 2914 }, { "epoch": 0.07454894093673466, "grad_norm": 2.1748037338256836, "learning_rate": 4.9705020218228695e-05, "loss": 1.9132, "step": 2945 }, { "epoch": 0.07533366663080554, "grad_norm": 2.0570950508117676, "learning_rate": 4.969201777632205e-05, "loss": 1.9177, "step": 2976 }, { "epoch": 0.07611839232487644, "grad_norm": 1.9970216751098633, "learning_rate": 4.9678736688350846e-05, "loss": 1.9105, "step": 3007 }, { "epoch": 0.07690311801894732, "grad_norm": 1.9640527963638306, "learning_rate": 4.966517710419033e-05, "loss": 1.9084, "step": 3038 }, { "epoch": 0.07768784371301822, "grad_norm": 2.172874927520752, "learning_rate": 4.965133917685858e-05, "loss": 1.8995, "step": 3069 }, { "epoch": 0.0784725694070891, "grad_norm": 2.1881916522979736, "learning_rate": 4.9637223062514714e-05, "loss": 1.9019, "step": 3100 }, { "epoch": 0.07925729510116, "grad_norm": 1.975496530532837, "learning_rate": 4.962282892045718e-05, "loss": 1.8967, "step": 3131 }, { "epoch": 0.08004202079523089, "grad_norm": 2.0970685482025146, "learning_rate": 4.9608156913121904e-05, "loss": 1.8867, "step": 3162 }, { "epoch": 0.08082674648930178, "grad_norm": 2.096353769302368, "learning_rate": 4.959320720608049e-05, "loss": 1.8967, "step": 3193 }, { "epoch": 0.08161147218337267, "grad_norm": 1.998336911201477, "learning_rate": 4.9577979968038354e-05, "loss": 1.8876, "step": 3224 }, { "epoch": 0.08239619787744357, "grad_norm": 2.098055362701416, "learning_rate": 4.956247537083282e-05, "loss": 1.9, "step": 3255 }, { "epoch": 0.08318092357151445, "grad_norm": 2.0739505290985107, "learning_rate": 4.9546693589431145e-05, "loss": 1.8902, "step": 3286 }, { "epoch": 0.08396564926558535, "grad_norm": 1.9556243419647217, "learning_rate": 4.9530634801928595e-05, "loss": 1.888, "step": 3317 }, { "epoch": 0.08475037495965623, "grad_norm": 2.096874952316284, "learning_rate": 4.9514299189546395e-05, "loss": 1.8785, "step": 3348 }, { "epoch": 0.08553510065372713, "grad_norm": 1.9407072067260742, "learning_rate": 4.949768693662973e-05, "loss": 1.8646, "step": 3379 }, { "epoch": 0.08631982634779801, "grad_norm": 1.9928467273712158, "learning_rate": 4.948079823064559e-05, "loss": 1.8751, "step": 3410 }, { "epoch": 0.08710455204186891, "grad_norm": 1.9670037031173706, "learning_rate": 4.946363326218074e-05, "loss": 1.8831, "step": 3441 }, { "epoch": 0.08788927773593981, "grad_norm": 1.999193787574768, "learning_rate": 4.9446192224939525e-05, "loss": 1.8605, "step": 3472 }, { "epoch": 0.0886740034300107, "grad_norm": 1.9073724746704102, "learning_rate": 4.942847531574167e-05, "loss": 1.8576, "step": 3503 }, { "epoch": 0.08945872912408159, "grad_norm": 2.179824113845825, "learning_rate": 4.941048273452008e-05, "loss": 1.8682, "step": 3534 }, { "epoch": 0.09024345481815248, "grad_norm": 1.954990029335022, "learning_rate": 4.9392214684318605e-05, "loss": 1.8807, "step": 3565 }, { "epoch": 0.09102818051222338, "grad_norm": 1.7695640325546265, "learning_rate": 4.93736713712897e-05, "loss": 1.879, "step": 3596 }, { "epoch": 0.09181290620629426, "grad_norm": 1.7708550691604614, "learning_rate": 4.9354853004692124e-05, "loss": 1.8677, "step": 3627 }, { "epoch": 0.09259763190036516, "grad_norm": 1.9683934450149536, "learning_rate": 4.93357597968886e-05, "loss": 1.8595, "step": 3658 }, { "epoch": 0.09338235759443604, "grad_norm": 2.00441312789917, "learning_rate": 4.931639196334338e-05, "loss": 1.8462, "step": 3689 }, { "epoch": 0.09416708328850694, "grad_norm": 1.875543475151062, "learning_rate": 4.9296749722619826e-05, "loss": 1.8502, "step": 3720 }, { "epoch": 0.09495180898257782, "grad_norm": 1.932658314704895, "learning_rate": 4.9276833296377966e-05, "loss": 1.8457, "step": 3751 }, { "epoch": 0.09573653467664872, "grad_norm": 1.9957045316696167, "learning_rate": 4.925664290937196e-05, "loss": 1.843, "step": 3782 }, { "epoch": 0.0965212603707196, "grad_norm": 1.8579176664352417, "learning_rate": 4.9236178789447576e-05, "loss": 1.8504, "step": 3813 }, { "epoch": 0.0973059860647905, "grad_norm": 1.9646131992340088, "learning_rate": 4.921544116753962e-05, "loss": 1.8512, "step": 3844 }, { "epoch": 0.09809071175886139, "grad_norm": 1.8213136196136475, "learning_rate": 4.919443027766935e-05, "loss": 1.8618, "step": 3875 }, { "epoch": 0.09887543745293229, "grad_norm": 2.017280101776123, "learning_rate": 4.91731463569418e-05, "loss": 1.863, "step": 3906 }, { "epoch": 0.09966016314700317, "grad_norm": 1.9125665426254272, "learning_rate": 4.915158964554312e-05, "loss": 1.8259, "step": 3937 }, { "epoch": 0.10044488884107407, "grad_norm": 2.0414695739746094, "learning_rate": 4.912976038673786e-05, "loss": 1.8347, "step": 3968 }, { "epoch": 0.10122961453514495, "grad_norm": 1.7705485820770264, "learning_rate": 4.9107658826866254e-05, "loss": 1.8502, "step": 3999 }, { "epoch": 0.10201434022921585, "grad_norm": 1.8961102962493896, "learning_rate": 4.908528521534139e-05, "loss": 1.84, "step": 4030 }, { "epoch": 0.10279906592328673, "grad_norm": 1.784387230873108, "learning_rate": 4.906263980464644e-05, "loss": 1.842, "step": 4061 }, { "epoch": 0.10358379161735763, "grad_norm": 11.229472160339355, "learning_rate": 4.903972285033178e-05, "loss": 1.8476, "step": 4092 }, { "epoch": 0.10436851731142852, "grad_norm": 1.9657154083251953, "learning_rate": 4.901653461101213e-05, "loss": 1.8465, "step": 4123 }, { "epoch": 0.10515324300549941, "grad_norm": 1.7702244520187378, "learning_rate": 4.8993075348363626e-05, "loss": 1.8249, "step": 4154 }, { "epoch": 0.1059379686995703, "grad_norm": 1.8672112226486206, "learning_rate": 4.896934532712084e-05, "loss": 1.8232, "step": 4185 }, { "epoch": 0.1067226943936412, "grad_norm": 1.7806147336959839, "learning_rate": 4.8945344815073846e-05, "loss": 1.8256, "step": 4216 }, { "epoch": 0.10750742008771208, "grad_norm": 1.7830456495285034, "learning_rate": 4.892107408306516e-05, "loss": 1.8271, "step": 4247 }, { "epoch": 0.10829214578178298, "grad_norm": 1.96640944480896, "learning_rate": 4.889653340498669e-05, "loss": 1.82, "step": 4278 }, { "epoch": 0.10907687147585386, "grad_norm": 1.8224470615386963, "learning_rate": 4.8871723057776664e-05, "loss": 1.8216, "step": 4309 }, { "epoch": 0.10986159716992476, "grad_norm": 2.5164501667022705, "learning_rate": 4.8846643321416476e-05, "loss": 1.8252, "step": 4340 }, { "epoch": 0.11064632286399564, "grad_norm": 1.7248613834381104, "learning_rate": 4.882129447892753e-05, "loss": 1.8133, "step": 4371 }, { "epoch": 0.11143104855806654, "grad_norm": 2.060304880142212, "learning_rate": 4.8795676816368076e-05, "loss": 1.8282, "step": 4402 }, { "epoch": 0.11221577425213743, "grad_norm": 1.8709039688110352, "learning_rate": 4.876979062282995e-05, "loss": 1.8154, "step": 4433 }, { "epoch": 0.11300049994620832, "grad_norm": 1.7444674968719482, "learning_rate": 4.8743636190435325e-05, "loss": 1.8173, "step": 4464 }, { "epoch": 0.11378522564027921, "grad_norm": 1.7357319593429565, "learning_rate": 4.871721381433344e-05, "loss": 1.8351, "step": 4495 }, { "epoch": 0.1145699513343501, "grad_norm": 1.728070855140686, "learning_rate": 4.869052379269719e-05, "loss": 1.8119, "step": 4526 }, { "epoch": 0.11535467702842099, "grad_norm": 1.742035984992981, "learning_rate": 4.866356642671985e-05, "loss": 1.7967, "step": 4557 }, { "epoch": 0.11613940272249189, "grad_norm": 1.7010915279388428, "learning_rate": 4.8636342020611634e-05, "loss": 1.8004, "step": 4588 }, { "epoch": 0.11692412841656277, "grad_norm": 1.6775914430618286, "learning_rate": 4.860885088159626e-05, "loss": 1.8173, "step": 4619 }, { "epoch": 0.11770885411063367, "grad_norm": 1.9107964038848877, "learning_rate": 4.858109331990751e-05, "loss": 1.7984, "step": 4650 }, { "epoch": 0.11849357980470455, "grad_norm": 1.713429570198059, "learning_rate": 4.855306964878567e-05, "loss": 1.7967, "step": 4681 }, { "epoch": 0.11927830549877545, "grad_norm": 1.9373931884765625, "learning_rate": 4.8524780184474084e-05, "loss": 1.8072, "step": 4712 }, { "epoch": 0.12006303119284634, "grad_norm": 1.8975365161895752, "learning_rate": 4.8496225246215496e-05, "loss": 1.8121, "step": 4743 }, { "epoch": 0.12084775688691723, "grad_norm": 5.285326957702637, "learning_rate": 4.8467405156248505e-05, "loss": 1.8189, "step": 4774 }, { "epoch": 0.12163248258098812, "grad_norm": 1.7155263423919678, "learning_rate": 4.843832023980392e-05, "loss": 1.8093, "step": 4805 }, { "epoch": 0.12241720827505902, "grad_norm": 1.726831316947937, "learning_rate": 4.840897082510106e-05, "loss": 1.7952, "step": 4836 }, { "epoch": 0.1232019339691299, "grad_norm": 1.739639401435852, "learning_rate": 4.8379357243344084e-05, "loss": 1.8103, "step": 4867 }, { "epoch": 0.1239866596632008, "grad_norm": 1.6978296041488647, "learning_rate": 4.8349479828718236e-05, "loss": 1.8006, "step": 4898 }, { "epoch": 0.12477138535727168, "grad_norm": 1.7154194116592407, "learning_rate": 4.8319338918386075e-05, "loss": 1.7876, "step": 4929 }, { "epoch": 0.12555611105134257, "grad_norm": 1.6323316097259521, "learning_rate": 4.828893485248369e-05, "loss": 1.8159, "step": 4960 }, { "epoch": 0.12634083674541347, "grad_norm": 1.641784429550171, "learning_rate": 4.825826797411682e-05, "loss": 1.7959, "step": 4991 }, { "epoch": 0.12712556243948436, "grad_norm": 1.6947154998779297, "learning_rate": 4.822733862935702e-05, "loss": 1.7895, "step": 5022 }, { "epoch": 0.12791028813355526, "grad_norm": 1.6331220865249634, "learning_rate": 4.819614716723775e-05, "loss": 1.7707, "step": 5053 }, { "epoch": 0.12869501382762613, "grad_norm": 1.8207937479019165, "learning_rate": 4.8164693939750425e-05, "loss": 1.8123, "step": 5084 }, { "epoch": 0.12947973952169703, "grad_norm": 1.6664263010025024, "learning_rate": 4.813297930184042e-05, "loss": 1.8089, "step": 5115 }, { "epoch": 0.13026446521576793, "grad_norm": 1.9931398630142212, "learning_rate": 4.810100361140314e-05, "loss": 1.7757, "step": 5146 }, { "epoch": 0.13104919090983883, "grad_norm": 1.839200735092163, "learning_rate": 4.8068767229279885e-05, "loss": 1.7969, "step": 5177 }, { "epoch": 0.1318339166039097, "grad_norm": 1.781187653541565, "learning_rate": 4.8036270519253854e-05, "loss": 1.7937, "step": 5208 }, { "epoch": 0.1326186422979806, "grad_norm": 1.7144343852996826, "learning_rate": 4.8003513848046e-05, "loss": 1.7816, "step": 5239 }, { "epoch": 0.1334033679920515, "grad_norm": 1.6819554567337036, "learning_rate": 4.79704975853109e-05, "loss": 1.7851, "step": 5270 }, { "epoch": 0.1341880936861224, "grad_norm": 1.6748546361923218, "learning_rate": 4.793722210363262e-05, "loss": 1.7941, "step": 5301 }, { "epoch": 0.13497281938019326, "grad_norm": 1.615569829940796, "learning_rate": 4.7903687778520414e-05, "loss": 1.7799, "step": 5332 }, { "epoch": 0.13575754507426416, "grad_norm": 1.7959198951721191, "learning_rate": 4.7869894988404593e-05, "loss": 1.7802, "step": 5363 }, { "epoch": 0.13654227076833506, "grad_norm": 1.598946452140808, "learning_rate": 4.783584411463221e-05, "loss": 1.7929, "step": 5394 }, { "epoch": 0.13732699646240595, "grad_norm": 1.793511986732483, "learning_rate": 4.780153554146274e-05, "loss": 1.7591, "step": 5425 }, { "epoch": 0.13811172215647682, "grad_norm": 1.718671202659607, "learning_rate": 4.7766969656063766e-05, "loss": 1.7807, "step": 5456 }, { "epoch": 0.13889644785054772, "grad_norm": 1.6548669338226318, "learning_rate": 4.773214684850662e-05, "loss": 1.775, "step": 5487 }, { "epoch": 0.13968117354461862, "grad_norm": 1.6727256774902344, "learning_rate": 4.769706751176193e-05, "loss": 1.7756, "step": 5518 }, { "epoch": 0.14046589923868952, "grad_norm": 1.7169344425201416, "learning_rate": 4.7661732041695264e-05, "loss": 1.7887, "step": 5549 }, { "epoch": 0.1412506249327604, "grad_norm": 1.6376421451568604, "learning_rate": 4.762614083706258e-05, "loss": 1.7939, "step": 5580 }, { "epoch": 0.14203535062683129, "grad_norm": 1.7083207368850708, "learning_rate": 4.759029429950581e-05, "loss": 1.7705, "step": 5611 }, { "epoch": 0.14282007632090218, "grad_norm": 1.6359349489212036, "learning_rate": 4.7554192833548235e-05, "loss": 1.7732, "step": 5642 }, { "epoch": 0.14360480201497308, "grad_norm": 1.684005618095398, "learning_rate": 4.751783684659e-05, "loss": 1.7766, "step": 5673 }, { "epoch": 0.14438952770904395, "grad_norm": 1.7531359195709229, "learning_rate": 4.748122674890348e-05, "loss": 1.7815, "step": 5704 }, { "epoch": 0.14517425340311485, "grad_norm": 1.5898247957229614, "learning_rate": 4.7444362953628654e-05, "loss": 1.7837, "step": 5735 }, { "epoch": 0.14595897909718575, "grad_norm": 1.6781623363494873, "learning_rate": 4.7407245876768424e-05, "loss": 1.7381, "step": 5766 }, { "epoch": 0.14674370479125665, "grad_norm": 1.6126357316970825, "learning_rate": 4.736987593718397e-05, "loss": 1.7714, "step": 5797 }, { "epoch": 0.14752843048532752, "grad_norm": 1.6623587608337402, "learning_rate": 4.733225355658999e-05, "loss": 1.7625, "step": 5828 }, { "epoch": 0.14831315617939841, "grad_norm": 1.6715524196624756, "learning_rate": 4.7294379159549926e-05, "loss": 1.7631, "step": 5859 }, { "epoch": 0.1490978818734693, "grad_norm": 1.6739026308059692, "learning_rate": 4.725625317347119e-05, "loss": 1.775, "step": 5890 }, { "epoch": 0.1498826075675402, "grad_norm": 1.8141075372695923, "learning_rate": 4.7217876028600374e-05, "loss": 1.7881, "step": 5921 }, { "epoch": 0.15066733326161108, "grad_norm": 1.6842069625854492, "learning_rate": 4.717924815801832e-05, "loss": 1.7707, "step": 5952 }, { "epoch": 0.15145205895568198, "grad_norm": 1.7032698392868042, "learning_rate": 4.714036999763532e-05, "loss": 1.7631, "step": 5983 }, { "epoch": 0.15223678464975288, "grad_norm": 1.7856013774871826, "learning_rate": 4.7101241986186116e-05, "loss": 1.7545, "step": 6014 }, { "epoch": 0.15302151034382377, "grad_norm": 1.679623007774353, "learning_rate": 4.7061864565225e-05, "loss": 1.7676, "step": 6045 }, { "epoch": 0.15380623603789464, "grad_norm": 1.626792073249817, "learning_rate": 4.702223817912081e-05, "loss": 1.7434, "step": 6076 }, { "epoch": 0.15459096173196554, "grad_norm": 1.850042700767517, "learning_rate": 4.698236327505195e-05, "loss": 1.7805, "step": 6107 }, { "epoch": 0.15537568742603644, "grad_norm": 1.6403062343597412, "learning_rate": 4.694224030300127e-05, "loss": 1.7495, "step": 6138 }, { "epoch": 0.15616041312010734, "grad_norm": 1.5897477865219116, "learning_rate": 4.690186971575107e-05, "loss": 1.779, "step": 6169 }, { "epoch": 0.1569451388141782, "grad_norm": 1.8173433542251587, "learning_rate": 4.6861251968877916e-05, "loss": 1.7705, "step": 6200 }, { "epoch": 0.1577298645082491, "grad_norm": 1.788022756576538, "learning_rate": 4.68203875207476e-05, "loss": 1.7457, "step": 6231 }, { "epoch": 0.15851459020232, "grad_norm": 1.6219838857650757, "learning_rate": 4.677927683250983e-05, "loss": 1.7758, "step": 6262 }, { "epoch": 0.1592993158963909, "grad_norm": 1.678890347480774, "learning_rate": 4.6737920368093156e-05, "loss": 1.7394, "step": 6293 }, { "epoch": 0.16008404159046177, "grad_norm": 1.5719743967056274, "learning_rate": 4.669631859419965e-05, "loss": 1.7549, "step": 6324 }, { "epoch": 0.16086876728453267, "grad_norm": 1.6332769393920898, "learning_rate": 4.6654471980299676e-05, "loss": 1.7462, "step": 6355 }, { "epoch": 0.16165349297860357, "grad_norm": 1.6942561864852905, "learning_rate": 4.661238099862658e-05, "loss": 1.7506, "step": 6386 }, { "epoch": 0.16243821867267447, "grad_norm": 1.8173885345458984, "learning_rate": 4.657004612417138e-05, "loss": 1.7455, "step": 6417 }, { "epoch": 0.16322294436674534, "grad_norm": 1.6209042072296143, "learning_rate": 4.6527467834677374e-05, "loss": 1.7413, "step": 6448 }, { "epoch": 0.16400767006081624, "grad_norm": 1.5801094770431519, "learning_rate": 4.648464661063478e-05, "loss": 1.7491, "step": 6479 }, { "epoch": 0.16479239575488713, "grad_norm": 1.5499264001846313, "learning_rate": 4.6441582935275264e-05, "loss": 1.7276, "step": 6510 }, { "epoch": 0.16557712144895803, "grad_norm": 1.6154171228408813, "learning_rate": 4.6398277294566586e-05, "loss": 1.7816, "step": 6541 }, { "epoch": 0.1663618471430289, "grad_norm": 1.5633410215377808, "learning_rate": 4.6354730177207e-05, "loss": 1.7447, "step": 6572 }, { "epoch": 0.1671465728370998, "grad_norm": 1.7070655822753906, "learning_rate": 4.6310942074619787e-05, "loss": 1.7477, "step": 6603 }, { "epoch": 0.1679312985311707, "grad_norm": 1.7502373456954956, "learning_rate": 4.626691348094777e-05, "loss": 1.74, "step": 6634 }, { "epoch": 0.1687160242252416, "grad_norm": 1.9541263580322266, "learning_rate": 4.622264489304762e-05, "loss": 1.7389, "step": 6665 }, { "epoch": 0.16950074991931247, "grad_norm": 1.64599609375, "learning_rate": 4.617813681048434e-05, "loss": 1.7445, "step": 6696 }, { "epoch": 0.17028547561338336, "grad_norm": 1.9360859394073486, "learning_rate": 4.61333897355256e-05, "loss": 1.73, "step": 6727 }, { "epoch": 0.17107020130745426, "grad_norm": 1.693892240524292, "learning_rate": 4.608840417313604e-05, "loss": 1.7229, "step": 6758 }, { "epoch": 0.17185492700152516, "grad_norm": 1.6243150234222412, "learning_rate": 4.6043180630971646e-05, "loss": 1.7421, "step": 6789 }, { "epoch": 0.17263965269559603, "grad_norm": 1.5926107168197632, "learning_rate": 4.599771961937391e-05, "loss": 1.7447, "step": 6820 }, { "epoch": 0.17342437838966693, "grad_norm": 1.695167064666748, "learning_rate": 4.5952021651364204e-05, "loss": 1.7463, "step": 6851 }, { "epoch": 0.17420910408373783, "grad_norm": 1.5915182828903198, "learning_rate": 4.590608724263786e-05, "loss": 1.7198, "step": 6882 }, { "epoch": 0.17499382977780872, "grad_norm": 1.6135920286178589, "learning_rate": 4.585991691155845e-05, "loss": 1.7233, "step": 6913 }, { "epoch": 0.17577855547187962, "grad_norm": 1.5855350494384766, "learning_rate": 4.581351117915188e-05, "loss": 1.7519, "step": 6944 }, { "epoch": 0.1765632811659505, "grad_norm": 1.5782060623168945, "learning_rate": 4.5766870569100534e-05, "loss": 1.729, "step": 6975 }, { "epoch": 0.1773480068600214, "grad_norm": 1.4931174516677856, "learning_rate": 4.571999560773736e-05, "loss": 1.7197, "step": 7006 }, { "epoch": 0.1781327325540923, "grad_norm": 1.809645414352417, "learning_rate": 4.5672886824039915e-05, "loss": 1.7409, "step": 7037 }, { "epoch": 0.17891745824816319, "grad_norm": 1.544233798980713, "learning_rate": 4.5625544749624435e-05, "loss": 1.7331, "step": 7068 }, { "epoch": 0.17970218394223406, "grad_norm": 1.5316941738128662, "learning_rate": 4.5577969918739794e-05, "loss": 1.7245, "step": 7099 }, { "epoch": 0.18048690963630495, "grad_norm": 1.4646427631378174, "learning_rate": 4.5530162868261486e-05, "loss": 1.7341, "step": 7130 }, { "epoch": 0.18127163533037585, "grad_norm": 1.6266372203826904, "learning_rate": 4.548212413768558e-05, "loss": 1.7311, "step": 7161 }, { "epoch": 0.18205636102444675, "grad_norm": 1.6372709274291992, "learning_rate": 4.543385426912261e-05, "loss": 1.7344, "step": 7192 }, { "epoch": 0.18284108671851762, "grad_norm": 1.642005443572998, "learning_rate": 4.53853538072915e-05, "loss": 1.7472, "step": 7223 }, { "epoch": 0.18362581241258852, "grad_norm": 1.7344322204589844, "learning_rate": 4.533662329951336e-05, "loss": 1.7379, "step": 7254 }, { "epoch": 0.18441053810665942, "grad_norm": 1.6593672037124634, "learning_rate": 4.528766329570536e-05, "loss": 1.7363, "step": 7285 }, { "epoch": 0.18519526380073031, "grad_norm": 1.590846300125122, "learning_rate": 4.523847434837447e-05, "loss": 1.7432, "step": 7316 }, { "epoch": 0.18597998949480118, "grad_norm": 1.6701788902282715, "learning_rate": 4.518905701261128e-05, "loss": 1.7287, "step": 7347 }, { "epoch": 0.18676471518887208, "grad_norm": 1.6129958629608154, "learning_rate": 4.5139411846083715e-05, "loss": 1.7252, "step": 7378 }, { "epoch": 0.18754944088294298, "grad_norm": 1.5602383613586426, "learning_rate": 4.508953940903073e-05, "loss": 1.7365, "step": 7409 }, { "epoch": 0.18833416657701388, "grad_norm": 1.60308039188385, "learning_rate": 4.5039440264255994e-05, "loss": 1.7361, "step": 7440 }, { "epoch": 0.18911889227108475, "grad_norm": 1.588299036026001, "learning_rate": 4.498911497712155e-05, "loss": 1.7574, "step": 7471 }, { "epoch": 0.18990361796515565, "grad_norm": 1.5599571466445923, "learning_rate": 4.493856411554142e-05, "loss": 1.738, "step": 7502 }, { "epoch": 0.19068834365922654, "grad_norm": 1.5749436616897583, "learning_rate": 4.4887788249975206e-05, "loss": 1.7272, "step": 7533 }, { "epoch": 0.19147306935329744, "grad_norm": 1.5536047220230103, "learning_rate": 4.4836787953421656e-05, "loss": 1.7249, "step": 7564 }, { "epoch": 0.1922577950473683, "grad_norm": 1.5227411985397339, "learning_rate": 4.478556380141218e-05, "loss": 1.7137, "step": 7595 }, { "epoch": 0.1930425207414392, "grad_norm": 1.5771219730377197, "learning_rate": 4.4734116372004375e-05, "loss": 1.7094, "step": 7626 }, { "epoch": 0.1938272464355101, "grad_norm": 1.4533522129058838, "learning_rate": 4.4682446245775477e-05, "loss": 1.7493, "step": 7657 }, { "epoch": 0.194611972129581, "grad_norm": 1.5640264749526978, "learning_rate": 4.463055400581586e-05, "loss": 1.7228, "step": 7688 }, { "epoch": 0.19539669782365188, "grad_norm": 1.4606215953826904, "learning_rate": 4.4578440237722374e-05, "loss": 1.7414, "step": 7719 }, { "epoch": 0.19618142351772277, "grad_norm": 1.5216374397277832, "learning_rate": 4.452610552959183e-05, "loss": 1.7155, "step": 7750 }, { "epoch": 0.19696614921179367, "grad_norm": 1.683119535446167, "learning_rate": 4.447355047201428e-05, "loss": 1.7346, "step": 7781 }, { "epoch": 0.19775087490586457, "grad_norm": 1.6055350303649902, "learning_rate": 4.4420775658066414e-05, "loss": 1.7112, "step": 7812 }, { "epoch": 0.19853560059993544, "grad_norm": 1.514739751815796, "learning_rate": 4.436778168330484e-05, "loss": 1.7274, "step": 7843 }, { "epoch": 0.19932032629400634, "grad_norm": 2.131218433380127, "learning_rate": 4.4314569145759353e-05, "loss": 1.7127, "step": 7874 }, { "epoch": 0.20010505198807724, "grad_norm": 1.4867665767669678, "learning_rate": 4.42611386459262e-05, "loss": 1.7245, "step": 7905 }, { "epoch": 0.20088977768214814, "grad_norm": 1.6395418643951416, "learning_rate": 4.420749078676133e-05, "loss": 1.7146, "step": 7936 }, { "epoch": 0.201674503376219, "grad_norm": 1.629939079284668, "learning_rate": 4.4153626173673516e-05, "loss": 1.7153, "step": 7967 }, { "epoch": 0.2024592290702899, "grad_norm": 1.5973584651947021, "learning_rate": 4.409954541451762e-05, "loss": 1.7102, "step": 7998 }, { "epoch": 0.2032439547643608, "grad_norm": 1.4822708368301392, "learning_rate": 4.404524911958764e-05, "loss": 1.7046, "step": 8029 }, { "epoch": 0.2040286804584317, "grad_norm": 1.4706634283065796, "learning_rate": 4.399073790160989e-05, "loss": 1.7022, "step": 8060 }, { "epoch": 0.20481340615250257, "grad_norm": 1.5917459726333618, "learning_rate": 4.393601237573607e-05, "loss": 1.6983, "step": 8091 }, { "epoch": 0.20559813184657347, "grad_norm": 1.7328417301177979, "learning_rate": 4.388107315953628e-05, "loss": 1.7164, "step": 8122 }, { "epoch": 0.20638285754064437, "grad_norm": 1.6152797937393188, "learning_rate": 4.382592087299212e-05, "loss": 1.7302, "step": 8153 }, { "epoch": 0.20716758323471526, "grad_norm": 1.7153429985046387, "learning_rate": 4.377055613848964e-05, "loss": 1.7278, "step": 8184 }, { "epoch": 0.20795230892878613, "grad_norm": 1.7167855501174927, "learning_rate": 4.3714979580812355e-05, "loss": 1.7021, "step": 8215 }, { "epoch": 0.20873703462285703, "grad_norm": 1.458811640739441, "learning_rate": 4.365919182713416e-05, "loss": 1.7099, "step": 8246 }, { "epoch": 0.20952176031692793, "grad_norm": 5.516291618347168, "learning_rate": 4.360319350701226e-05, "loss": 1.7069, "step": 8277 }, { "epoch": 0.21030648601099883, "grad_norm": 1.5669766664505005, "learning_rate": 4.3546985252380115e-05, "loss": 1.6983, "step": 8308 }, { "epoch": 0.2110912117050697, "grad_norm": 1.4598067998886108, "learning_rate": 4.349056769754021e-05, "loss": 1.7265, "step": 8339 }, { "epoch": 0.2118759373991406, "grad_norm": 1.5436547994613647, "learning_rate": 4.3433941479156994e-05, "loss": 1.7128, "step": 8370 }, { "epoch": 0.2126606630932115, "grad_norm": 1.6275660991668701, "learning_rate": 4.3377107236249647e-05, "loss": 1.7229, "step": 8401 }, { "epoch": 0.2134453887872824, "grad_norm": 1.6207513809204102, "learning_rate": 4.332006561018488e-05, "loss": 1.702, "step": 8432 }, { "epoch": 0.21423011448135326, "grad_norm": 1.6795597076416016, "learning_rate": 4.3262817244669683e-05, "loss": 1.6808, "step": 8463 }, { "epoch": 0.21501484017542416, "grad_norm": 1.660192608833313, "learning_rate": 4.3205362785744083e-05, "loss": 1.7071, "step": 8494 }, { "epoch": 0.21579956586949506, "grad_norm": 1.6086353063583374, "learning_rate": 4.314770288177384e-05, "loss": 1.7083, "step": 8525 }, { "epoch": 0.21658429156356596, "grad_norm": 1.475216269493103, "learning_rate": 4.308983818344313e-05, "loss": 1.7234, "step": 8556 }, { "epoch": 0.21736901725763683, "grad_norm": 1.7111340761184692, "learning_rate": 4.3031769343747206e-05, "loss": 1.6872, "step": 8587 }, { "epoch": 0.21815374295170772, "grad_norm": 1.4544799327850342, "learning_rate": 4.297349701798505e-05, "loss": 1.692, "step": 8618 }, { "epoch": 0.21893846864577862, "grad_norm": 1.6593588590621948, "learning_rate": 4.2915021863751916e-05, "loss": 1.6886, "step": 8649 }, { "epoch": 0.21972319433984952, "grad_norm": 1.641408085823059, "learning_rate": 4.285634454093198e-05, "loss": 1.6872, "step": 8680 }, { "epoch": 0.2205079200339204, "grad_norm": 1.6036972999572754, "learning_rate": 4.279746571169086e-05, "loss": 1.7055, "step": 8711 }, { "epoch": 0.2212926457279913, "grad_norm": 1.4984327554702759, "learning_rate": 4.2738386040468136e-05, "loss": 1.6997, "step": 8742 }, { "epoch": 0.2220773714220622, "grad_norm": 1.471111536026001, "learning_rate": 4.2679106193969866e-05, "loss": 1.6926, "step": 8773 }, { "epoch": 0.22286209711613308, "grad_norm": 1.521364688873291, "learning_rate": 4.261962684116106e-05, "loss": 1.6851, "step": 8804 }, { "epoch": 0.22364682281020395, "grad_norm": 1.6068321466445923, "learning_rate": 4.2559948653258145e-05, "loss": 1.7113, "step": 8835 }, { "epoch": 0.22443154850427485, "grad_norm": 1.453379511833191, "learning_rate": 4.250007230372134e-05, "loss": 1.7025, "step": 8866 }, { "epoch": 0.22521627419834575, "grad_norm": 1.5845959186553955, "learning_rate": 4.2439998468247126e-05, "loss": 1.6978, "step": 8897 }, { "epoch": 0.22600099989241665, "grad_norm": 1.5308622121810913, "learning_rate": 4.2379727824760566e-05, "loss": 1.6956, "step": 8928 }, { "epoch": 0.22678572558648752, "grad_norm": 1.6339962482452393, "learning_rate": 4.231926105340768e-05, "loss": 1.6831, "step": 8959 }, { "epoch": 0.22757045128055842, "grad_norm": 1.4533487558364868, "learning_rate": 4.225859883654776e-05, "loss": 1.7025, "step": 8990 }, { "epoch": 0.22835517697462931, "grad_norm": 3.971897840499878, "learning_rate": 4.219774185874569e-05, "loss": 1.689, "step": 9021 }, { "epoch": 0.2291399026687002, "grad_norm": 1.4394114017486572, "learning_rate": 4.213669080676418e-05, "loss": 1.6841, "step": 9052 }, { "epoch": 0.22992462836277108, "grad_norm": 1.821142315864563, "learning_rate": 4.2075446369556056e-05, "loss": 1.6883, "step": 9083 }, { "epoch": 0.23070935405684198, "grad_norm": 1.6653649806976318, "learning_rate": 4.201400923825648e-05, "loss": 1.7011, "step": 9114 }, { "epoch": 0.23149407975091288, "grad_norm": 1.5895901918411255, "learning_rate": 4.195238010617511e-05, "loss": 1.7004, "step": 9145 }, { "epoch": 0.23227880544498378, "grad_norm": 1.4648844003677368, "learning_rate": 4.1890559668788344e-05, "loss": 1.6872, "step": 9176 }, { "epoch": 0.23306353113905465, "grad_norm": 1.5886753797531128, "learning_rate": 4.1828548623731405e-05, "loss": 1.6851, "step": 9207 }, { "epoch": 0.23384825683312555, "grad_norm": 1.4713412523269653, "learning_rate": 4.1766347670790506e-05, "loss": 1.6818, "step": 9238 }, { "epoch": 0.23463298252719644, "grad_norm": 1.5660710334777832, "learning_rate": 4.170395751189495e-05, "loss": 1.6844, "step": 9269 }, { "epoch": 0.23541770822126734, "grad_norm": 1.7024312019348145, "learning_rate": 4.164137885110921e-05, "loss": 1.6839, "step": 9300 }, { "epoch": 0.2362024339153382, "grad_norm": 1.5936214923858643, "learning_rate": 4.157861239462495e-05, "loss": 1.6953, "step": 9331 }, { "epoch": 0.2369871596094091, "grad_norm": 1.4709779024124146, "learning_rate": 4.1515658850753114e-05, "loss": 1.6806, "step": 9362 }, { "epoch": 0.23777188530348, "grad_norm": 1.4303510189056396, "learning_rate": 4.145251892991588e-05, "loss": 1.6792, "step": 9393 }, { "epoch": 0.2385566109975509, "grad_norm": 1.5452120304107666, "learning_rate": 4.138919334463868e-05, "loss": 1.6712, "step": 9424 }, { "epoch": 0.23934133669162178, "grad_norm": 1.4944697618484497, "learning_rate": 4.1325682809542124e-05, "loss": 1.6777, "step": 9455 }, { "epoch": 0.24012606238569267, "grad_norm": 1.6359312534332275, "learning_rate": 4.126198804133398e-05, "loss": 1.6782, "step": 9486 }, { "epoch": 0.24091078807976357, "grad_norm": 1.3874454498291016, "learning_rate": 4.1198109758801055e-05, "loss": 1.6805, "step": 9517 }, { "epoch": 0.24169551377383447, "grad_norm": 1.4747340679168701, "learning_rate": 4.113404868280107e-05, "loss": 1.6704, "step": 9548 }, { "epoch": 0.24248023946790534, "grad_norm": 1.95576012134552, "learning_rate": 4.106980553625457e-05, "loss": 1.7008, "step": 9579 }, { "epoch": 0.24326496516197624, "grad_norm": 1.454005479812622, "learning_rate": 4.100538104413674e-05, "loss": 1.6771, "step": 9610 }, { "epoch": 0.24404969085604714, "grad_norm": 1.5640463829040527, "learning_rate": 4.09407759334692e-05, "loss": 1.6763, "step": 9641 }, { "epoch": 0.24483441655011803, "grad_norm": 1.5076780319213867, "learning_rate": 4.087599093331186e-05, "loss": 1.6977, "step": 9672 }, { "epoch": 0.2456191422441889, "grad_norm": 1.5072520971298218, "learning_rate": 4.081102677475462e-05, "loss": 1.6749, "step": 9703 }, { "epoch": 0.2464038679382598, "grad_norm": 1.6311815977096558, "learning_rate": 4.0745884190909194e-05, "loss": 1.684, "step": 9734 }, { "epoch": 0.2471885936323307, "grad_norm": 1.5691202878952026, "learning_rate": 4.0680563916900796e-05, "loss": 1.6804, "step": 9765 }, { "epoch": 0.2479733193264016, "grad_norm": 1.4325530529022217, "learning_rate": 4.0615066689859815e-05, "loss": 1.719, "step": 9796 }, { "epoch": 0.24875804502047247, "grad_norm": 1.439177393913269, "learning_rate": 4.0549393248913584e-05, "loss": 1.6873, "step": 9827 }, { "epoch": 0.24954277071454337, "grad_norm": 1.4155471324920654, "learning_rate": 4.048354433517794e-05, "loss": 1.692, "step": 9858 }, { "epoch": 0.25032749640861424, "grad_norm": 1.5917115211486816, "learning_rate": 4.0417520691748916e-05, "loss": 1.6752, "step": 9889 }, { "epoch": 0.25111222210268513, "grad_norm": 1.649154543876648, "learning_rate": 4.035132306369438e-05, "loss": 1.6603, "step": 9920 }, { "epoch": 0.25189694779675603, "grad_norm": 1.5114792585372925, "learning_rate": 4.028495219804555e-05, "loss": 1.7005, "step": 9951 }, { "epoch": 0.25268167349082693, "grad_norm": 16.910812377929688, "learning_rate": 4.021840884378864e-05, "loss": 1.6846, "step": 9982 }, { "epoch": 0.25346639918489783, "grad_norm": 1.4342628717422485, "learning_rate": 4.015169375185633e-05, "loss": 1.6678, "step": 10013 }, { "epoch": 0.2542511248789687, "grad_norm": 1.4815376996994019, "learning_rate": 4.0084807675119396e-05, "loss": 1.671, "step": 10044 }, { "epoch": 0.2550358505730396, "grad_norm": 1.4633368253707886, "learning_rate": 4.0017751368378106e-05, "loss": 1.6824, "step": 10075 }, { "epoch": 0.2558205762671105, "grad_norm": 1.3904149532318115, "learning_rate": 3.995052558835377e-05, "loss": 1.6775, "step": 10106 }, { "epoch": 0.25660530196118136, "grad_norm": 1.5234646797180176, "learning_rate": 3.988313109368017e-05, "loss": 1.6854, "step": 10137 }, { "epoch": 0.25739002765525226, "grad_norm": 1.4530494213104248, "learning_rate": 3.981556864489504e-05, "loss": 1.6727, "step": 10168 }, { "epoch": 0.25817475334932316, "grad_norm": 1.5600273609161377, "learning_rate": 3.974783900443142e-05, "loss": 1.6645, "step": 10199 }, { "epoch": 0.25895947904339406, "grad_norm": 1.4213160276412964, "learning_rate": 3.9679942936609095e-05, "loss": 1.6898, "step": 10230 }, { "epoch": 0.25974420473746496, "grad_norm": 1.5741041898727417, "learning_rate": 3.961188120762596e-05, "loss": 1.693, "step": 10261 }, { "epoch": 0.26052893043153585, "grad_norm": 1.564493179321289, "learning_rate": 3.954365458554938e-05, "loss": 1.6836, "step": 10292 }, { "epoch": 0.26131365612560675, "grad_norm": 1.5584787130355835, "learning_rate": 3.947526384030751e-05, "loss": 1.6852, "step": 10323 }, { "epoch": 0.26209838181967765, "grad_norm": 1.4936350584030151, "learning_rate": 3.9406709743680624e-05, "loss": 1.6777, "step": 10354 }, { "epoch": 0.26288310751374855, "grad_norm": 1.504725694656372, "learning_rate": 3.9337993069292366e-05, "loss": 1.6765, "step": 10385 }, { "epoch": 0.2636678332078194, "grad_norm": 1.4809914827346802, "learning_rate": 3.926911459260109e-05, "loss": 1.6578, "step": 10416 }, { "epoch": 0.2644525589018903, "grad_norm": 1.529976725578308, "learning_rate": 3.920007509089102e-05, "loss": 1.6709, "step": 10447 }, { "epoch": 0.2652372845959612, "grad_norm": 1.483694076538086, "learning_rate": 3.913087534326357e-05, "loss": 1.6713, "step": 10478 }, { "epoch": 0.2660220102900321, "grad_norm": 1.4282972812652588, "learning_rate": 3.9061516130628475e-05, "loss": 1.6784, "step": 10509 }, { "epoch": 0.266806735984103, "grad_norm": 1.5122032165527344, "learning_rate": 3.8991998235695025e-05, "loss": 1.6603, "step": 10540 }, { "epoch": 0.2675914616781739, "grad_norm": 1.5154742002487183, "learning_rate": 3.8922322442963224e-05, "loss": 1.6831, "step": 10571 }, { "epoch": 0.2683761873722448, "grad_norm": 1.4630860090255737, "learning_rate": 3.885248953871491e-05, "loss": 1.6715, "step": 10602 }, { "epoch": 0.2691609130663157, "grad_norm": 1.4164702892303467, "learning_rate": 3.8782500311004915e-05, "loss": 1.6654, "step": 10633 }, { "epoch": 0.2699456387603865, "grad_norm": 1.5865578651428223, "learning_rate": 3.871235554965218e-05, "loss": 1.6829, "step": 10664 }, { "epoch": 0.2707303644544574, "grad_norm": 1.4984766244888306, "learning_rate": 3.864205604623078e-05, "loss": 1.673, "step": 10695 }, { "epoch": 0.2715150901485283, "grad_norm": 1.5477566719055176, "learning_rate": 3.857160259406107e-05, "loss": 1.6711, "step": 10726 }, { "epoch": 0.2722998158425992, "grad_norm": 1.5356842279434204, "learning_rate": 3.8500995988200674e-05, "loss": 1.6556, "step": 10757 }, { "epoch": 0.2730845415366701, "grad_norm": 1.413104772567749, "learning_rate": 3.843023702543556e-05, "loss": 1.658, "step": 10788 }, { "epoch": 0.273869267230741, "grad_norm": 1.5174081325531006, "learning_rate": 3.8359326504270984e-05, "loss": 1.6672, "step": 10819 }, { "epoch": 0.2746539929248119, "grad_norm": 1.4649910926818848, "learning_rate": 3.828826522492255e-05, "loss": 1.6625, "step": 10850 }, { "epoch": 0.2754387186188828, "grad_norm": 1.5240408182144165, "learning_rate": 3.821705398930713e-05, "loss": 1.6619, "step": 10881 }, { "epoch": 0.27622344431295365, "grad_norm": 1.4349104166030884, "learning_rate": 3.814569360103385e-05, "loss": 1.6595, "step": 10912 }, { "epoch": 0.27700817000702455, "grad_norm": 1.4311225414276123, "learning_rate": 3.807418486539499e-05, "loss": 1.6557, "step": 10943 }, { "epoch": 0.27779289570109544, "grad_norm": 1.5817755460739136, "learning_rate": 3.80025285893569e-05, "loss": 1.6882, "step": 10974 }, { "epoch": 0.27857762139516634, "grad_norm": 1.5182181596755981, "learning_rate": 3.793072558155093e-05, "loss": 1.6697, "step": 11005 }, { "epoch": 0.27936234708923724, "grad_norm": 1.4836517572402954, "learning_rate": 3.785877665226426e-05, "loss": 1.6576, "step": 11036 }, { "epoch": 0.28014707278330814, "grad_norm": 1.460788607597351, "learning_rate": 3.778668261343079e-05, "loss": 1.6607, "step": 11067 }, { "epoch": 0.28093179847737904, "grad_norm": 1.4307125806808472, "learning_rate": 3.771444427862192e-05, "loss": 1.662, "step": 11098 }, { "epoch": 0.28171652417144993, "grad_norm": 1.4999738931655884, "learning_rate": 3.7642062463037465e-05, "loss": 1.6406, "step": 11129 }, { "epoch": 0.2825012498655208, "grad_norm": 1.4646129608154297, "learning_rate": 3.7569537983496373e-05, "loss": 1.6653, "step": 11160 }, { "epoch": 0.2832859755595917, "grad_norm": 1.4709292650222778, "learning_rate": 3.749687165842753e-05, "loss": 1.6704, "step": 11191 }, { "epoch": 0.28407070125366257, "grad_norm": 1.494458556175232, "learning_rate": 3.7424064307860536e-05, "loss": 1.6534, "step": 11222 }, { "epoch": 0.28485542694773347, "grad_norm": 1.4409736394882202, "learning_rate": 3.735111675341645e-05, "loss": 1.6645, "step": 11253 }, { "epoch": 0.28564015264180437, "grad_norm": 1.4628338813781738, "learning_rate": 3.7278029818298524e-05, "loss": 1.6611, "step": 11284 }, { "epoch": 0.28642487833587527, "grad_norm": 1.3659113645553589, "learning_rate": 3.720480432728287e-05, "loss": 1.6435, "step": 11315 }, { "epoch": 0.28720960402994616, "grad_norm": 1.3704752922058105, "learning_rate": 3.71314411067092e-05, "loss": 1.6507, "step": 11346 }, { "epoch": 0.28799432972401706, "grad_norm": 1.579837441444397, "learning_rate": 3.70579409844715e-05, "loss": 1.6716, "step": 11377 }, { "epoch": 0.2887790554180879, "grad_norm": 1.5566996335983276, "learning_rate": 3.698430479000865e-05, "loss": 1.6439, "step": 11408 }, { "epoch": 0.2895637811121588, "grad_norm": 1.4722687005996704, "learning_rate": 3.691053335429509e-05, "loss": 1.683, "step": 11439 }, { "epoch": 0.2903485068062297, "grad_norm": 1.491283893585205, "learning_rate": 3.683662750983147e-05, "loss": 1.6606, "step": 11470 }, { "epoch": 0.2911332325003006, "grad_norm": 1.402040719985962, "learning_rate": 3.676258809063518e-05, "loss": 1.6582, "step": 11501 }, { "epoch": 0.2919179581943715, "grad_norm": 1.4377038478851318, "learning_rate": 3.6688415932231004e-05, "loss": 1.6398, "step": 11532 }, { "epoch": 0.2927026838884424, "grad_norm": 1.4151259660720825, "learning_rate": 3.661411187164166e-05, "loss": 1.6645, "step": 11563 }, { "epoch": 0.2934874095825133, "grad_norm": 1.5219615697860718, "learning_rate": 3.65396767473784e-05, "loss": 1.6705, "step": 11594 }, { "epoch": 0.2942721352765842, "grad_norm": 1.533252239227295, "learning_rate": 3.6465111399431465e-05, "loss": 1.6714, "step": 11625 }, { "epoch": 0.29505686097065503, "grad_norm": 1.410959243774414, "learning_rate": 3.6390416669260674e-05, "loss": 1.6533, "step": 11656 }, { "epoch": 0.29584158666472593, "grad_norm": 1.5377541780471802, "learning_rate": 3.63155933997859e-05, "loss": 1.6505, "step": 11687 }, { "epoch": 0.29662631235879683, "grad_norm": 1.4504135847091675, "learning_rate": 3.624064243537758e-05, "loss": 1.6287, "step": 11718 }, { "epoch": 0.2974110380528677, "grad_norm": 1.4606986045837402, "learning_rate": 3.616556462184716e-05, "loss": 1.6592, "step": 11749 }, { "epoch": 0.2981957637469386, "grad_norm": 1.4440289735794067, "learning_rate": 3.609036080643755e-05, "loss": 1.6598, "step": 11780 }, { "epoch": 0.2989804894410095, "grad_norm": 1.5399249792099, "learning_rate": 3.60150318378136e-05, "loss": 1.6852, "step": 11811 }, { "epoch": 0.2997652151350804, "grad_norm": 1.4778543710708618, "learning_rate": 3.5939578566052465e-05, "loss": 1.6462, "step": 11842 }, { "epoch": 0.3005499408291513, "grad_norm": 1.4979726076126099, "learning_rate": 3.586400184263408e-05, "loss": 1.6576, "step": 11873 }, { "epoch": 0.30133466652322216, "grad_norm": 1.4904232025146484, "learning_rate": 3.578830252043148e-05, "loss": 1.6476, "step": 11904 }, { "epoch": 0.30211939221729306, "grad_norm": 1.5472886562347412, "learning_rate": 3.571248145370125e-05, "loss": 1.6721, "step": 11935 }, { "epoch": 0.30290411791136396, "grad_norm": 1.4954209327697754, "learning_rate": 3.5636539498073794e-05, "loss": 1.6483, "step": 11966 }, { "epoch": 0.30368884360543486, "grad_norm": 1.4504363536834717, "learning_rate": 3.556047751054378e-05, "loss": 1.657, "step": 11997 }, { "epoch": 0.30447356929950575, "grad_norm": 1.3581033945083618, "learning_rate": 3.548429634946039e-05, "loss": 1.6579, "step": 12028 }, { "epoch": 0.30525829499357665, "grad_norm": 1.4421014785766602, "learning_rate": 3.540799687451768e-05, "loss": 1.6496, "step": 12059 }, { "epoch": 0.30604302068764755, "grad_norm": 1.523169994354248, "learning_rate": 3.533157994674485e-05, "loss": 1.6714, "step": 12090 }, { "epoch": 0.30682774638171845, "grad_norm": 1.455269455909729, "learning_rate": 3.5255046428496546e-05, "loss": 1.6695, "step": 12121 }, { "epoch": 0.3076124720757893, "grad_norm": 1.4330891370773315, "learning_rate": 3.517839718344311e-05, "loss": 1.6519, "step": 12152 }, { "epoch": 0.3083971977698602, "grad_norm": 1.3913158178329468, "learning_rate": 3.510163307656086e-05, "loss": 1.6329, "step": 12183 }, { "epoch": 0.3091819234639311, "grad_norm": 1.355193018913269, "learning_rate": 3.5024754974122324e-05, "loss": 1.624, "step": 12214 }, { "epoch": 0.309966649158002, "grad_norm": 1.4055231809616089, "learning_rate": 3.494776374368643e-05, "loss": 1.6491, "step": 12245 }, { "epoch": 0.3107513748520729, "grad_norm": 1.4227032661437988, "learning_rate": 3.4870660254088724e-05, "loss": 1.6274, "step": 12276 }, { "epoch": 0.3115361005461438, "grad_norm": 1.4558427333831787, "learning_rate": 3.479344537543164e-05, "loss": 1.6419, "step": 12307 }, { "epoch": 0.3123208262402147, "grad_norm": 1.5154629945755005, "learning_rate": 3.4716119979074565e-05, "loss": 1.6443, "step": 12338 }, { "epoch": 0.3131055519342856, "grad_norm": 1.4458774328231812, "learning_rate": 3.463868493762412e-05, "loss": 1.6615, "step": 12369 }, { "epoch": 0.3138902776283564, "grad_norm": 1.4116544723510742, "learning_rate": 3.456114112492418e-05, "loss": 1.6481, "step": 12400 }, { "epoch": 0.3146750033224273, "grad_norm": 1.8497071266174316, "learning_rate": 3.4483489416046164e-05, "loss": 1.6262, "step": 12431 }, { "epoch": 0.3154597290164982, "grad_norm": 1.3854331970214844, "learning_rate": 3.440573068727905e-05, "loss": 1.6387, "step": 12462 }, { "epoch": 0.3162444547105691, "grad_norm": 1.509178876876831, "learning_rate": 3.4327865816119495e-05, "loss": 1.6566, "step": 12493 }, { "epoch": 0.31702918040464, "grad_norm": 1.3977612257003784, "learning_rate": 3.4249895681262025e-05, "loss": 1.6676, "step": 12524 }, { "epoch": 0.3178139060987109, "grad_norm": 1.3736423254013062, "learning_rate": 3.417182116258899e-05, "loss": 1.6238, "step": 12555 }, { "epoch": 0.3185986317927818, "grad_norm": 1.4226630926132202, "learning_rate": 3.409364314116074e-05, "loss": 1.6513, "step": 12586 }, { "epoch": 0.3193833574868527, "grad_norm": 1.4804571866989136, "learning_rate": 3.401536249920559e-05, "loss": 1.6383, "step": 12617 }, { "epoch": 0.32016808318092355, "grad_norm": 1.456168532371521, "learning_rate": 3.393698012010998e-05, "loss": 1.6621, "step": 12648 }, { "epoch": 0.32095280887499444, "grad_norm": 1.3990952968597412, "learning_rate": 3.385849688840839e-05, "loss": 1.6376, "step": 12679 }, { "epoch": 0.32173753456906534, "grad_norm": 1.3588812351226807, "learning_rate": 3.3779913689773414e-05, "loss": 1.656, "step": 12710 }, { "epoch": 0.32252226026313624, "grad_norm": 1.4718931913375854, "learning_rate": 3.370123141100578e-05, "loss": 1.6255, "step": 12741 }, { "epoch": 0.32330698595720714, "grad_norm": 1.3603503704071045, "learning_rate": 3.3622450940024305e-05, "loss": 1.6517, "step": 12772 }, { "epoch": 0.32409171165127804, "grad_norm": 1.4493441581726074, "learning_rate": 3.35435731658559e-05, "loss": 1.643, "step": 12803 }, { "epoch": 0.32487643734534893, "grad_norm": 1.3813337087631226, "learning_rate": 3.346459897862552e-05, "loss": 1.6449, "step": 12834 }, { "epoch": 0.32566116303941983, "grad_norm": 1.5027899742126465, "learning_rate": 3.338552926954613e-05, "loss": 1.6497, "step": 12865 }, { "epoch": 0.3264458887334907, "grad_norm": 1.3805309534072876, "learning_rate": 3.330636493090868e-05, "loss": 1.6449, "step": 12896 }, { "epoch": 0.3272306144275616, "grad_norm": 1.642248511314392, "learning_rate": 3.322710685607193e-05, "loss": 1.6261, "step": 12927 }, { "epoch": 0.32801534012163247, "grad_norm": 1.4579522609710693, "learning_rate": 3.314775593945251e-05, "loss": 1.6648, "step": 12958 }, { "epoch": 0.32880006581570337, "grad_norm": 1.3579092025756836, "learning_rate": 3.3068313076514714e-05, "loss": 1.6468, "step": 12989 }, { "epoch": 0.32958479150977427, "grad_norm": 1.406051754951477, "learning_rate": 3.298877916376047e-05, "loss": 1.6249, "step": 13020 }, { "epoch": 0.33036951720384516, "grad_norm": 1.457335114479065, "learning_rate": 3.290915509871915e-05, "loss": 1.6353, "step": 13051 }, { "epoch": 0.33115424289791606, "grad_norm": 1.4548041820526123, "learning_rate": 3.282944177993753e-05, "loss": 1.6272, "step": 13082 }, { "epoch": 0.33193896859198696, "grad_norm": 1.4140032529830933, "learning_rate": 3.274964010696957e-05, "loss": 1.6479, "step": 13113 }, { "epoch": 0.3327236942860578, "grad_norm": 1.3436623811721802, "learning_rate": 3.266975098036629e-05, "loss": 1.6452, "step": 13144 }, { "epoch": 0.3335084199801287, "grad_norm": 1.4224274158477783, "learning_rate": 3.258977530166562e-05, "loss": 1.6242, "step": 13175 }, { "epoch": 0.3342931456741996, "grad_norm": 1.5661940574645996, "learning_rate": 3.250971397338227e-05, "loss": 1.6404, "step": 13206 }, { "epoch": 0.3350778713682705, "grad_norm": 1.4696576595306396, "learning_rate": 3.2429567898997404e-05, "loss": 1.6436, "step": 13237 }, { "epoch": 0.3358625970623414, "grad_norm": 1.4438591003417969, "learning_rate": 3.234933798294859e-05, "loss": 1.6404, "step": 13268 }, { "epoch": 0.3366473227564123, "grad_norm": 1.4548406600952148, "learning_rate": 3.2269025130619535e-05, "loss": 1.6461, "step": 13299 }, { "epoch": 0.3374320484504832, "grad_norm": 1.4180691242218018, "learning_rate": 3.218863024832985e-05, "loss": 1.6377, "step": 13330 }, { "epoch": 0.3382167741445541, "grad_norm": 1.4060105085372925, "learning_rate": 3.2108154243324864e-05, "loss": 1.6045, "step": 13361 }, { "epoch": 0.33900149983862493, "grad_norm": 1.4134920835494995, "learning_rate": 3.2027598023765345e-05, "loss": 1.6264, "step": 13392 }, { "epoch": 0.33978622553269583, "grad_norm": 1.4582122564315796, "learning_rate": 3.194696249871729e-05, "loss": 1.623, "step": 13423 }, { "epoch": 0.3405709512267667, "grad_norm": 1.4027389287948608, "learning_rate": 3.186624857814164e-05, "loss": 1.6337, "step": 13454 }, { "epoch": 0.3413556769208376, "grad_norm": 1.3397070169448853, "learning_rate": 3.178545717288401e-05, "loss": 1.6334, "step": 13485 }, { "epoch": 0.3421404026149085, "grad_norm": 1.5358332395553589, "learning_rate": 3.170458919466444e-05, "loss": 1.6393, "step": 13516 }, { "epoch": 0.3429251283089794, "grad_norm": 1.5479260683059692, "learning_rate": 3.1623645556067063e-05, "loss": 1.6357, "step": 13547 }, { "epoch": 0.3437098540030503, "grad_norm": 1.3949965238571167, "learning_rate": 3.154262717052985e-05, "loss": 1.6325, "step": 13578 }, { "epoch": 0.3444945796971212, "grad_norm": 1.392903208732605, "learning_rate": 3.146153495233426e-05, "loss": 1.6071, "step": 13609 }, { "epoch": 0.34527930539119206, "grad_norm": 1.4290788173675537, "learning_rate": 3.1380369816594944e-05, "loss": 1.6266, "step": 13640 }, { "epoch": 0.34606403108526296, "grad_norm": 1.4005228281021118, "learning_rate": 3.129913267924946e-05, "loss": 1.6391, "step": 13671 }, { "epoch": 0.34684875677933386, "grad_norm": 1.378369927406311, "learning_rate": 3.121782445704782e-05, "loss": 1.6495, "step": 13702 }, { "epoch": 0.34763348247340475, "grad_norm": 1.4202784299850464, "learning_rate": 3.11364460675423e-05, "loss": 1.637, "step": 13733 }, { "epoch": 0.34841820816747565, "grad_norm": 1.3670291900634766, "learning_rate": 3.1054998429076934e-05, "loss": 1.5941, "step": 13764 }, { "epoch": 0.34920293386154655, "grad_norm": 1.3714202642440796, "learning_rate": 3.097348246077728e-05, "loss": 1.6096, "step": 13795 }, { "epoch": 0.34998765955561745, "grad_norm": 1.4889552593231201, "learning_rate": 3.0891899082539924e-05, "loss": 1.6245, "step": 13826 }, { "epoch": 0.35077238524968835, "grad_norm": 1.4640086889266968, "learning_rate": 3.0810249215022233e-05, "loss": 1.6197, "step": 13857 }, { "epoch": 0.35155711094375924, "grad_norm": 1.385380506515503, "learning_rate": 3.0728533779631865e-05, "loss": 1.61, "step": 13888 }, { "epoch": 0.3523418366378301, "grad_norm": 1.3958945274353027, "learning_rate": 3.064675369851637e-05, "loss": 1.6139, "step": 13919 }, { "epoch": 0.353126562331901, "grad_norm": 1.3746731281280518, "learning_rate": 3.056490989455289e-05, "loss": 1.6307, "step": 13950 }, { "epoch": 0.3539112880259719, "grad_norm": 1.4196429252624512, "learning_rate": 3.0483003291337596e-05, "loss": 1.6192, "step": 13981 }, { "epoch": 0.3546960137200428, "grad_norm": 1.3648637533187866, "learning_rate": 3.040103481317539e-05, "loss": 1.6124, "step": 14012 }, { "epoch": 0.3554807394141137, "grad_norm": 1.422004222869873, "learning_rate": 3.03190053850694e-05, "loss": 1.6288, "step": 14043 }, { "epoch": 0.3562654651081846, "grad_norm": 1.4687801599502563, "learning_rate": 3.0236915932710573e-05, "loss": 1.6118, "step": 14074 }, { "epoch": 0.3570501908022555, "grad_norm": 1.30635404586792, "learning_rate": 3.0154767382467232e-05, "loss": 1.6341, "step": 14105 }, { "epoch": 0.35783491649632637, "grad_norm": 1.4216945171356201, "learning_rate": 3.0072560661374582e-05, "loss": 1.6385, "step": 14136 }, { "epoch": 0.3586196421903972, "grad_norm": 1.4296518564224243, "learning_rate": 2.999029669712431e-05, "loss": 1.6262, "step": 14167 }, { "epoch": 0.3594043678844681, "grad_norm": 1.4529691934585571, "learning_rate": 2.990797641805408e-05, "loss": 1.6136, "step": 14198 }, { "epoch": 0.360189093578539, "grad_norm": 1.389478325843811, "learning_rate": 2.982560075313704e-05, "loss": 1.6263, "step": 14229 }, { "epoch": 0.3609738192726099, "grad_norm": 1.3917667865753174, "learning_rate": 2.9743170631971368e-05, "loss": 1.6456, "step": 14260 }, { "epoch": 0.3617585449666808, "grad_norm": 1.3452563285827637, "learning_rate": 2.9660686984769792e-05, "loss": 1.6284, "step": 14291 }, { "epoch": 0.3625432706607517, "grad_norm": 1.421159029006958, "learning_rate": 2.9578150742349047e-05, "loss": 1.6232, "step": 14322 }, { "epoch": 0.3633279963548226, "grad_norm": 1.4312077760696411, "learning_rate": 2.949556283611942e-05, "loss": 1.6006, "step": 14353 }, { "epoch": 0.3641127220488935, "grad_norm": 1.4271692037582397, "learning_rate": 2.9412924198074206e-05, "loss": 1.6177, "step": 14384 }, { "epoch": 0.36489744774296434, "grad_norm": 1.3584555387496948, "learning_rate": 2.9330235760779208e-05, "loss": 1.6148, "step": 14415 }, { "epoch": 0.36568217343703524, "grad_norm": 1.3882123231887817, "learning_rate": 2.9247498457362188e-05, "loss": 1.6327, "step": 14446 }, { "epoch": 0.36646689913110614, "grad_norm": 1.540114402770996, "learning_rate": 2.9164713221502373e-05, "loss": 1.6052, "step": 14477 }, { "epoch": 0.36725162482517704, "grad_norm": 1.3554641008377075, "learning_rate": 2.9081880987419912e-05, "loss": 1.6091, "step": 14508 }, { "epoch": 0.36803635051924793, "grad_norm": 1.3693712949752808, "learning_rate": 2.8999002689865296e-05, "loss": 1.5936, "step": 14539 }, { "epoch": 0.36882107621331883, "grad_norm": 1.354278564453125, "learning_rate": 2.8916079264108852e-05, "loss": 1.612, "step": 14570 }, { "epoch": 0.36960580190738973, "grad_norm": 1.3731021881103516, "learning_rate": 2.883311164593017e-05, "loss": 1.6064, "step": 14601 }, { "epoch": 0.37039052760146063, "grad_norm": 1.3914356231689453, "learning_rate": 2.875010077160754e-05, "loss": 1.6036, "step": 14632 }, { "epoch": 0.37117525329553147, "grad_norm": 1.4811164140701294, "learning_rate": 2.866704757790741e-05, "loss": 1.6195, "step": 14663 }, { "epoch": 0.37195997898960237, "grad_norm": 1.4619332551956177, "learning_rate": 2.858395300207376e-05, "loss": 1.6315, "step": 14694 }, { "epoch": 0.37274470468367327, "grad_norm": 1.456950306892395, "learning_rate": 2.8500817981817607e-05, "loss": 1.6276, "step": 14725 }, { "epoch": 0.37352943037774416, "grad_norm": 5.129410266876221, "learning_rate": 2.8417643455306336e-05, "loss": 1.6234, "step": 14756 }, { "epoch": 0.37431415607181506, "grad_norm": 1.3831191062927246, "learning_rate": 2.8334430361153185e-05, "loss": 1.6163, "step": 14787 }, { "epoch": 0.37509888176588596, "grad_norm": 1.3817623853683472, "learning_rate": 2.8251179638406612e-05, "loss": 1.6206, "step": 14818 }, { "epoch": 0.37588360745995686, "grad_norm": 1.5285260677337646, "learning_rate": 2.8167892226539704e-05, "loss": 1.6117, "step": 14849 }, { "epoch": 0.37666833315402776, "grad_norm": 1.403324007987976, "learning_rate": 2.8084569065439588e-05, "loss": 1.5962, "step": 14880 }, { "epoch": 0.3774530588480986, "grad_norm": 1.3314014673233032, "learning_rate": 2.8001211095396807e-05, "loss": 1.6116, "step": 14911 }, { "epoch": 0.3782377845421695, "grad_norm": 1.4300462007522583, "learning_rate": 2.791781925709473e-05, "loss": 1.6234, "step": 14942 }, { "epoch": 0.3790225102362404, "grad_norm": 1.424811601638794, "learning_rate": 2.7834394491598908e-05, "loss": 1.5986, "step": 14973 }, { "epoch": 0.3798072359303113, "grad_norm": 1.3818182945251465, "learning_rate": 2.7750937740346485e-05, "loss": 1.6012, "step": 15004 }, { "epoch": 0.3805919616243822, "grad_norm": 1.4053683280944824, "learning_rate": 2.7667449945135564e-05, "loss": 1.6018, "step": 15035 }, { "epoch": 0.3813766873184531, "grad_norm": 1.5093421936035156, "learning_rate": 2.7583932048114557e-05, "loss": 1.61, "step": 15066 }, { "epoch": 0.382161413012524, "grad_norm": 1.412494421005249, "learning_rate": 2.7500384991771587e-05, "loss": 1.613, "step": 15097 }, { "epoch": 0.3829461387065949, "grad_norm": 1.335167646408081, "learning_rate": 2.7416809718923825e-05, "loss": 1.6197, "step": 15128 }, { "epoch": 0.3837308644006657, "grad_norm": 1.334786295890808, "learning_rate": 2.7333207172706864e-05, "loss": 1.6284, "step": 15159 }, { "epoch": 0.3845155900947366, "grad_norm": 1.4039522409439087, "learning_rate": 2.7249578296564088e-05, "loss": 1.5889, "step": 15190 }, { "epoch": 0.3853003157888075, "grad_norm": 1.4196487665176392, "learning_rate": 2.7165924034235973e-05, "loss": 1.6132, "step": 15221 }, { "epoch": 0.3860850414828784, "grad_norm": 1.4701744318008423, "learning_rate": 2.708224532974953e-05, "loss": 1.6009, "step": 15252 }, { "epoch": 0.3868697671769493, "grad_norm": 1.319935917854309, "learning_rate": 2.6998543127407538e-05, "loss": 1.6333, "step": 15283 }, { "epoch": 0.3876544928710202, "grad_norm": 1.3962234258651733, "learning_rate": 2.6914818371777988e-05, "loss": 1.6175, "step": 15314 }, { "epoch": 0.3884392185650911, "grad_norm": 1.4284230470657349, "learning_rate": 2.6831072007683373e-05, "loss": 1.6007, "step": 15345 }, { "epoch": 0.389223944259162, "grad_norm": 1.298251748085022, "learning_rate": 2.6747304980190018e-05, "loss": 1.605, "step": 15376 }, { "epoch": 0.39000866995323286, "grad_norm": 1.294994831085205, "learning_rate": 2.6663518234597453e-05, "loss": 1.6025, "step": 15407 }, { "epoch": 0.39079339564730375, "grad_norm": 1.440958023071289, "learning_rate": 2.6579712716427696e-05, "loss": 1.6002, "step": 15438 }, { "epoch": 0.39157812134137465, "grad_norm": 1.439590573310852, "learning_rate": 2.6495889371414652e-05, "loss": 1.6025, "step": 15469 }, { "epoch": 0.39236284703544555, "grad_norm": 1.4235502481460571, "learning_rate": 2.6412049145493367e-05, "loss": 1.5993, "step": 15500 }, { "epoch": 0.39314757272951645, "grad_norm": 1.4449518918991089, "learning_rate": 2.632819298478939e-05, "loss": 1.63, "step": 15531 }, { "epoch": 0.39393229842358735, "grad_norm": 1.4422321319580078, "learning_rate": 2.6244321835608105e-05, "loss": 1.6193, "step": 15562 }, { "epoch": 0.39471702411765824, "grad_norm": 1.4232275485992432, "learning_rate": 2.6160436644424024e-05, "loss": 1.6193, "step": 15593 }, { "epoch": 0.39550174981172914, "grad_norm": 1.5187265872955322, "learning_rate": 2.6076538357870133e-05, "loss": 1.618, "step": 15624 }, { "epoch": 0.3962864755058, "grad_norm": 1.4493205547332764, "learning_rate": 2.5992627922727196e-05, "loss": 1.6082, "step": 15655 }, { "epoch": 0.3970712011998709, "grad_norm": 1.5100423097610474, "learning_rate": 2.5908706285913066e-05, "loss": 1.6081, "step": 15686 }, { "epoch": 0.3978559268939418, "grad_norm": 1.465114712715149, "learning_rate": 2.5824774394472008e-05, "loss": 1.6125, "step": 15717 }, { "epoch": 0.3986406525880127, "grad_norm": 1.4160761833190918, "learning_rate": 2.5740833195563996e-05, "loss": 1.5951, "step": 15748 }, { "epoch": 0.3994253782820836, "grad_norm": 1.381658673286438, "learning_rate": 2.5656883636454067e-05, "loss": 1.6051, "step": 15779 }, { "epoch": 0.4002101039761545, "grad_norm": 1.3883142471313477, "learning_rate": 2.557292666450159e-05, "loss": 1.6039, "step": 15810 }, { "epoch": 0.4009948296702254, "grad_norm": 1.506911039352417, "learning_rate": 2.5488963227149566e-05, "loss": 1.5761, "step": 15841 }, { "epoch": 0.40177955536429627, "grad_norm": 1.4450113773345947, "learning_rate": 2.5404994271913983e-05, "loss": 1.5734, "step": 15872 }, { "epoch": 0.4025642810583671, "grad_norm": 1.3970619440078735, "learning_rate": 2.5321020746373085e-05, "loss": 1.6094, "step": 15903 }, { "epoch": 0.403349006752438, "grad_norm": 1.4761073589324951, "learning_rate": 2.52370435981567e-05, "loss": 1.6075, "step": 15934 }, { "epoch": 0.4041337324465089, "grad_norm": 1.3969392776489258, "learning_rate": 2.5153063774935533e-05, "loss": 1.5788, "step": 15965 }, { "epoch": 0.4049184581405798, "grad_norm": 1.3772737979888916, "learning_rate": 2.506908222441045e-05, "loss": 1.61, "step": 15996 }, { "epoch": 0.4057031838346507, "grad_norm": 1.3969396352767944, "learning_rate": 2.498509989430187e-05, "loss": 1.5943, "step": 16027 }, { "epoch": 0.4064879095287216, "grad_norm": 1.3052096366882324, "learning_rate": 2.4901117732338958e-05, "loss": 1.61, "step": 16058 }, { "epoch": 0.4072726352227925, "grad_norm": 1.394612193107605, "learning_rate": 2.481713668624899e-05, "loss": 1.6018, "step": 16089 }, { "epoch": 0.4080573609168634, "grad_norm": 1.3575886487960815, "learning_rate": 2.4733157703746663e-05, "loss": 1.5883, "step": 16120 }, { "epoch": 0.40884208661093424, "grad_norm": 1.3952176570892334, "learning_rate": 2.4649181732523392e-05, "loss": 1.6152, "step": 16151 }, { "epoch": 0.40962681230500514, "grad_norm": 1.5711455345153809, "learning_rate": 2.4565209720236582e-05, "loss": 1.61, "step": 16182 }, { "epoch": 0.41041153799907604, "grad_norm": 1.5258722305297852, "learning_rate": 2.4481242614498975e-05, "loss": 1.628, "step": 16213 }, { "epoch": 0.41119626369314694, "grad_norm": 1.425764799118042, "learning_rate": 2.439728136286796e-05, "loss": 1.5872, "step": 16244 }, { "epoch": 0.41198098938721783, "grad_norm": 1.3165446519851685, "learning_rate": 2.4313326912834852e-05, "loss": 1.6008, "step": 16275 }, { "epoch": 0.41276571508128873, "grad_norm": 1.386579155921936, "learning_rate": 2.4229380211814206e-05, "loss": 1.5783, "step": 16306 }, { "epoch": 0.41355044077535963, "grad_norm": 1.464693307876587, "learning_rate": 2.4145442207133124e-05, "loss": 1.5947, "step": 16337 }, { "epoch": 0.4143351664694305, "grad_norm": 1.334782600402832, "learning_rate": 2.406151384602059e-05, "loss": 1.5886, "step": 16368 }, { "epoch": 0.41511989216350137, "grad_norm": 1.4115489721298218, "learning_rate": 2.3977596075596747e-05, "loss": 1.5821, "step": 16399 }, { "epoch": 0.41590461785757227, "grad_norm": 1.391065001487732, "learning_rate": 2.3893689842862223e-05, "loss": 1.6141, "step": 16430 }, { "epoch": 0.41668934355164317, "grad_norm": 1.4244657754898071, "learning_rate": 2.3809796094687475e-05, "loss": 1.6008, "step": 16461 }, { "epoch": 0.41747406924571406, "grad_norm": 1.3113791942596436, "learning_rate": 2.372591577780202e-05, "loss": 1.608, "step": 16492 }, { "epoch": 0.41825879493978496, "grad_norm": 1.4262186288833618, "learning_rate": 2.3642049838783838e-05, "loss": 1.5801, "step": 16523 }, { "epoch": 0.41904352063385586, "grad_norm": 1.4219175577163696, "learning_rate": 2.3558199224048666e-05, "loss": 1.592, "step": 16554 }, { "epoch": 0.41982824632792676, "grad_norm": 1.4542045593261719, "learning_rate": 2.347436487983929e-05, "loss": 1.6062, "step": 16585 }, { "epoch": 0.42061297202199766, "grad_norm": 1.4484211206436157, "learning_rate": 2.3390547752214888e-05, "loss": 1.6042, "step": 16616 }, { "epoch": 0.4213976977160685, "grad_norm": 1.4561681747436523, "learning_rate": 2.330674878704035e-05, "loss": 1.617, "step": 16647 }, { "epoch": 0.4221824234101394, "grad_norm": 1.4250808954238892, "learning_rate": 2.322296892997561e-05, "loss": 1.5947, "step": 16678 }, { "epoch": 0.4229671491042103, "grad_norm": 1.3762766122817993, "learning_rate": 2.313920912646497e-05, "loss": 1.5962, "step": 16709 }, { "epoch": 0.4237518747982812, "grad_norm": 1.3508645296096802, "learning_rate": 2.305547032172643e-05, "loss": 1.5969, "step": 16740 }, { "epoch": 0.4245366004923521, "grad_norm": 1.4839844703674316, "learning_rate": 2.2971753460741014e-05, "loss": 1.5697, "step": 16771 }, { "epoch": 0.425321326186423, "grad_norm": 1.4027475118637085, "learning_rate": 2.288805948824212e-05, "loss": 1.5758, "step": 16802 }, { "epoch": 0.4261060518804939, "grad_norm": 1.3288599252700806, "learning_rate": 2.2804389348704858e-05, "loss": 1.5817, "step": 16833 }, { "epoch": 0.4268907775745648, "grad_norm": 1.411028265953064, "learning_rate": 2.2720743986335374e-05, "loss": 1.6059, "step": 16864 }, { "epoch": 0.4276755032686356, "grad_norm": 1.4803740978240967, "learning_rate": 2.2637124345060233e-05, "loss": 1.6061, "step": 16895 }, { "epoch": 0.4284602289627065, "grad_norm": 1.6195276975631714, "learning_rate": 2.2553531368515695e-05, "loss": 1.5948, "step": 16926 }, { "epoch": 0.4292449546567774, "grad_norm": 1.368160605430603, "learning_rate": 2.2469966000037144e-05, "loss": 1.5884, "step": 16957 }, { "epoch": 0.4300296803508483, "grad_norm": 2.9462714195251465, "learning_rate": 2.2386429182648417e-05, "loss": 1.5834, "step": 16988 }, { "epoch": 0.4308144060449192, "grad_norm": 1.319602370262146, "learning_rate": 2.230292185905114e-05, "loss": 1.571, "step": 17019 }, { "epoch": 0.4315991317389901, "grad_norm": 1.412001371383667, "learning_rate": 2.2219444971614116e-05, "loss": 1.6091, "step": 17050 }, { "epoch": 0.432383857433061, "grad_norm": 1.4459586143493652, "learning_rate": 2.2135999462362655e-05, "loss": 1.5803, "step": 17081 }, { "epoch": 0.4331685831271319, "grad_norm": 1.3342795372009277, "learning_rate": 2.2052586272968003e-05, "loss": 1.5809, "step": 17112 }, { "epoch": 0.43395330882120275, "grad_norm": 1.3263877630233765, "learning_rate": 2.196920634473666e-05, "loss": 1.5742, "step": 17143 }, { "epoch": 0.43473803451527365, "grad_norm": 1.3818809986114502, "learning_rate": 2.1885860618599787e-05, "loss": 1.5701, "step": 17174 }, { "epoch": 0.43552276020934455, "grad_norm": 1.4324009418487549, "learning_rate": 2.1802550035102577e-05, "loss": 1.5622, "step": 17205 }, { "epoch": 0.43630748590341545, "grad_norm": 1.3489223718643188, "learning_rate": 2.171927553439363e-05, "loss": 1.5737, "step": 17236 }, { "epoch": 0.43709221159748635, "grad_norm": 1.6844401359558105, "learning_rate": 2.1636038056214376e-05, "loss": 1.5916, "step": 17267 }, { "epoch": 0.43787693729155724, "grad_norm": 1.3632712364196777, "learning_rate": 2.155283853988844e-05, "loss": 1.6055, "step": 17298 }, { "epoch": 0.43866166298562814, "grad_norm": 1.4866870641708374, "learning_rate": 2.146967792431106e-05, "loss": 1.5858, "step": 17329 }, { "epoch": 0.43944638867969904, "grad_norm": 1.5456846952438354, "learning_rate": 2.138655714793849e-05, "loss": 1.6098, "step": 17360 }, { "epoch": 0.44023111437376994, "grad_norm": 1.4177597761154175, "learning_rate": 2.1303477148777367e-05, "loss": 1.5833, "step": 17391 }, { "epoch": 0.4410158400678408, "grad_norm": 1.4126933813095093, "learning_rate": 2.122043886437421e-05, "loss": 1.599, "step": 17422 }, { "epoch": 0.4418005657619117, "grad_norm": 1.4183374643325806, "learning_rate": 2.1137443231804765e-05, "loss": 1.5941, "step": 17453 }, { "epoch": 0.4425852914559826, "grad_norm": 1.4230761528015137, "learning_rate": 2.105449118766347e-05, "loss": 1.5743, "step": 17484 }, { "epoch": 0.4433700171500535, "grad_norm": 1.6844847202301025, "learning_rate": 2.097158366805287e-05, "loss": 1.5672, "step": 17515 }, { "epoch": 0.4441547428441244, "grad_norm": 1.410435438156128, "learning_rate": 2.0888721608573047e-05, "loss": 1.5896, "step": 17546 }, { "epoch": 0.44493946853819527, "grad_norm": 1.3948931694030762, "learning_rate": 2.0805905944311087e-05, "loss": 1.5899, "step": 17577 }, { "epoch": 0.44572419423226617, "grad_norm": 1.3747113943099976, "learning_rate": 2.0723137609830497e-05, "loss": 1.5576, "step": 17608 }, { "epoch": 0.44650891992633707, "grad_norm": 1.477161169052124, "learning_rate": 2.0640417539160686e-05, "loss": 1.5576, "step": 17639 }, { "epoch": 0.4472936456204079, "grad_norm": 1.372091293334961, "learning_rate": 2.0557746665786427e-05, "loss": 1.5958, "step": 17670 }, { "epoch": 0.4480783713144788, "grad_norm": 1.361820936203003, "learning_rate": 2.0475125922637256e-05, "loss": 1.5917, "step": 17701 }, { "epoch": 0.4488630970085497, "grad_norm": 1.367297887802124, "learning_rate": 2.0392556242077047e-05, "loss": 1.5965, "step": 17732 }, { "epoch": 0.4496478227026206, "grad_norm": 1.538565754890442, "learning_rate": 2.031003855589343e-05, "loss": 1.5814, "step": 17763 }, { "epoch": 0.4504325483966915, "grad_norm": 1.4618374109268188, "learning_rate": 2.022757379528727e-05, "loss": 1.5852, "step": 17794 }, { "epoch": 0.4512172740907624, "grad_norm": 1.3954309225082397, "learning_rate": 2.0145162890862184e-05, "loss": 1.5576, "step": 17825 }, { "epoch": 0.4520019997848333, "grad_norm": 1.33854079246521, "learning_rate": 2.0062806772614022e-05, "loss": 1.5793, "step": 17856 }, { "epoch": 0.4527867254789042, "grad_norm": 1.4751428365707397, "learning_rate": 1.9980506369920392e-05, "loss": 1.5831, "step": 17887 }, { "epoch": 0.45357145117297504, "grad_norm": 1.3836451768875122, "learning_rate": 1.989826261153015e-05, "loss": 1.5967, "step": 17918 }, { "epoch": 0.45435617686704594, "grad_norm": 1.4987123012542725, "learning_rate": 1.9816076425552923e-05, "loss": 1.5953, "step": 17949 }, { "epoch": 0.45514090256111683, "grad_norm": 1.3838002681732178, "learning_rate": 1.9733948739448676e-05, "loss": 1.5614, "step": 17980 }, { "epoch": 0.45592562825518773, "grad_norm": 1.358023762702942, "learning_rate": 1.9651880480017155e-05, "loss": 1.5737, "step": 18011 }, { "epoch": 0.45671035394925863, "grad_norm": 1.3181227445602417, "learning_rate": 1.9569872573387516e-05, "loss": 1.5806, "step": 18042 }, { "epoch": 0.4574950796433295, "grad_norm": 1.3574905395507812, "learning_rate": 1.9487925945007854e-05, "loss": 1.5779, "step": 18073 }, { "epoch": 0.4582798053374004, "grad_norm": 1.3550188541412354, "learning_rate": 1.9406041519634726e-05, "loss": 1.5723, "step": 18104 }, { "epoch": 0.4590645310314713, "grad_norm": 1.3672763109207153, "learning_rate": 1.932422022132275e-05, "loss": 1.5869, "step": 18135 }, { "epoch": 0.45984925672554217, "grad_norm": 1.428689956665039, "learning_rate": 1.924246297341414e-05, "loss": 1.5743, "step": 18166 }, { "epoch": 0.46063398241961306, "grad_norm": 1.3313350677490234, "learning_rate": 1.9160770698528338e-05, "loss": 1.5836, "step": 18197 }, { "epoch": 0.46141870811368396, "grad_norm": 1.3049378395080566, "learning_rate": 1.907914431855156e-05, "loss": 1.5753, "step": 18228 }, { "epoch": 0.46220343380775486, "grad_norm": 1.3737244606018066, "learning_rate": 1.8997584754626412e-05, "loss": 1.589, "step": 18259 }, { "epoch": 0.46298815950182576, "grad_norm": 1.4522390365600586, "learning_rate": 1.8916092927141486e-05, "loss": 1.5898, "step": 18290 }, { "epoch": 0.46377288519589666, "grad_norm": 1.3189274072647095, "learning_rate": 1.883466975572098e-05, "loss": 1.5721, "step": 18321 }, { "epoch": 0.46455761088996755, "grad_norm": 1.3040895462036133, "learning_rate": 1.8753316159214312e-05, "loss": 1.58, "step": 18352 }, { "epoch": 0.46534233658403845, "grad_norm": 1.3528228998184204, "learning_rate": 1.8672033055685766e-05, "loss": 1.5812, "step": 18383 }, { "epoch": 0.4661270622781093, "grad_norm": 1.3759435415267944, "learning_rate": 1.8590821362404116e-05, "loss": 1.5905, "step": 18414 }, { "epoch": 0.4669117879721802, "grad_norm": 1.374550223350525, "learning_rate": 1.8509681995832294e-05, "loss": 1.5737, "step": 18445 }, { "epoch": 0.4676965136662511, "grad_norm": 1.4290833473205566, "learning_rate": 1.8428615871617004e-05, "loss": 1.577, "step": 18476 }, { "epoch": 0.468481239360322, "grad_norm": 1.287758231163025, "learning_rate": 1.8347623904578448e-05, "loss": 1.5652, "step": 18507 }, { "epoch": 0.4692659650543929, "grad_norm": 1.3034193515777588, "learning_rate": 1.8266707008699975e-05, "loss": 1.5708, "step": 18538 }, { "epoch": 0.4700506907484638, "grad_norm": 1.3413418531417847, "learning_rate": 1.818586609711774e-05, "loss": 1.5629, "step": 18569 }, { "epoch": 0.4708354164425347, "grad_norm": 1.3434704542160034, "learning_rate": 1.8105102082110462e-05, "loss": 1.5726, "step": 18600 }, { "epoch": 0.4716201421366056, "grad_norm": 1.3321512937545776, "learning_rate": 1.8024415875089058e-05, "loss": 1.5767, "step": 18631 }, { "epoch": 0.4724048678306764, "grad_norm": 1.3440663814544678, "learning_rate": 1.7943808386586407e-05, "loss": 1.5971, "step": 18662 }, { "epoch": 0.4731895935247473, "grad_norm": 1.356490135192871, "learning_rate": 1.7863280526247073e-05, "loss": 1.5511, "step": 18693 }, { "epoch": 0.4739743192188182, "grad_norm": 1.5594719648361206, "learning_rate": 1.7782833202817003e-05, "loss": 1.5807, "step": 18724 }, { "epoch": 0.4747590449128891, "grad_norm": 1.3007055521011353, "learning_rate": 1.7702467324133327e-05, "loss": 1.5864, "step": 18755 }, { "epoch": 0.47554377060696, "grad_norm": 1.3085851669311523, "learning_rate": 1.7622183797114042e-05, "loss": 1.5624, "step": 18786 }, { "epoch": 0.4763284963010309, "grad_norm": 1.4323654174804688, "learning_rate": 1.7541983527747838e-05, "loss": 1.5759, "step": 18817 }, { "epoch": 0.4771132219951018, "grad_norm": 1.6249394416809082, "learning_rate": 1.746186742108387e-05, "loss": 1.5853, "step": 18848 }, { "epoch": 0.4778979476891727, "grad_norm": 1.4717755317687988, "learning_rate": 1.73818363812215e-05, "loss": 1.5627, "step": 18879 }, { "epoch": 0.47868267338324355, "grad_norm": 1.4533812999725342, "learning_rate": 1.7301891311300153e-05, "loss": 1.5582, "step": 18910 }, { "epoch": 0.47946739907731445, "grad_norm": 1.4233548641204834, "learning_rate": 1.7222033113489055e-05, "loss": 1.5829, "step": 18941 }, { "epoch": 0.48025212477138535, "grad_norm": 1.4943761825561523, "learning_rate": 1.7142262688977127e-05, "loss": 1.563, "step": 18972 }, { "epoch": 0.48103685046545624, "grad_norm": 1.4122124910354614, "learning_rate": 1.7062580937962764e-05, "loss": 1.5723, "step": 19003 }, { "epoch": 0.48182157615952714, "grad_norm": 1.3874859809875488, "learning_rate": 1.698298875964369e-05, "loss": 1.5606, "step": 19034 }, { "epoch": 0.48260630185359804, "grad_norm": 1.3442684412002563, "learning_rate": 1.690348705220684e-05, "loss": 1.5794, "step": 19065 }, { "epoch": 0.48339102754766894, "grad_norm": 1.5870423316955566, "learning_rate": 1.6824076712818156e-05, "loss": 1.5782, "step": 19096 }, { "epoch": 0.48417575324173984, "grad_norm": 1.3558776378631592, "learning_rate": 1.6744758637612533e-05, "loss": 1.5642, "step": 19127 }, { "epoch": 0.4849604789358107, "grad_norm": 1.4363101720809937, "learning_rate": 1.6665533721683664e-05, "loss": 1.5698, "step": 19158 }, { "epoch": 0.4857452046298816, "grad_norm": 1.423425555229187, "learning_rate": 1.6586402859073974e-05, "loss": 1.5712, "step": 19189 }, { "epoch": 0.4865299303239525, "grad_norm": 1.3792959451675415, "learning_rate": 1.6507366942764463e-05, "loss": 1.567, "step": 19220 }, { "epoch": 0.4873146560180234, "grad_norm": 1.4269790649414062, "learning_rate": 1.6428426864664732e-05, "loss": 1.5616, "step": 19251 }, { "epoch": 0.48809938171209427, "grad_norm": 1.4407951831817627, "learning_rate": 1.6349583515602816e-05, "loss": 1.5786, "step": 19282 }, { "epoch": 0.48888410740616517, "grad_norm": 1.4874082803726196, "learning_rate": 1.6270837785315208e-05, "loss": 1.5907, "step": 19313 }, { "epoch": 0.48966883310023607, "grad_norm": 1.382135272026062, "learning_rate": 1.619219056243676e-05, "loss": 1.5673, "step": 19344 }, { "epoch": 0.49045355879430697, "grad_norm": 1.3598939180374146, "learning_rate": 1.6113642734490698e-05, "loss": 1.5548, "step": 19375 }, { "epoch": 0.4912382844883778, "grad_norm": 1.4186638593673706, "learning_rate": 1.6035195187878577e-05, "loss": 1.5834, "step": 19406 }, { "epoch": 0.4920230101824487, "grad_norm": 1.3320554494857788, "learning_rate": 1.5956848807870305e-05, "loss": 1.5435, "step": 19437 }, { "epoch": 0.4928077358765196, "grad_norm": 1.3170437812805176, "learning_rate": 1.587860447859413e-05, "loss": 1.5538, "step": 19468 }, { "epoch": 0.4935924615705905, "grad_norm": 1.463334321975708, "learning_rate": 1.5800463083026686e-05, "loss": 1.5603, "step": 19499 }, { "epoch": 0.4943771872646614, "grad_norm": 1.4043060541152954, "learning_rate": 1.572242550298298e-05, "loss": 1.5778, "step": 19530 }, { "epoch": 0.4951619129587323, "grad_norm": 1.3377630710601807, "learning_rate": 1.56444926191065e-05, "loss": 1.5836, "step": 19561 }, { "epoch": 0.4959466386528032, "grad_norm": 1.4007608890533447, "learning_rate": 1.5566665310859257e-05, "loss": 1.5691, "step": 19592 }, { "epoch": 0.4967313643468741, "grad_norm": 1.3231667280197144, "learning_rate": 1.5488944456511846e-05, "loss": 1.5517, "step": 19623 }, { "epoch": 0.49751609004094494, "grad_norm": 1.4343535900115967, "learning_rate": 1.5411330933133546e-05, "loss": 1.5753, "step": 19654 }, { "epoch": 0.49830081573501583, "grad_norm": 1.2943058013916016, "learning_rate": 1.533382561658241e-05, "loss": 1.5571, "step": 19685 }, { "epoch": 0.49908554142908673, "grad_norm": 1.2815899848937988, "learning_rate": 1.525642938149541e-05, "loss": 1.5796, "step": 19716 }, { "epoch": 0.49987026712315763, "grad_norm": 1.4025834798812866, "learning_rate": 1.5179143101278536e-05, "loss": 1.5672, "step": 19747 }, { "epoch": 0.5006549928172285, "grad_norm": 1.4670218229293823, "learning_rate": 1.5101967648096955e-05, "loss": 1.5702, "step": 19778 }, { "epoch": 0.5014397185112994, "grad_norm": 1.4222999811172485, "learning_rate": 1.5024903892865172e-05, "loss": 1.5842, "step": 19809 }, { "epoch": 0.5022244442053703, "grad_norm": 1.4714964628219604, "learning_rate": 1.4947952705237184e-05, "loss": 1.5552, "step": 19840 }, { "epoch": 0.5030091698994412, "grad_norm": 1.3124053478240967, "learning_rate": 1.4871114953596682e-05, "loss": 1.567, "step": 19871 }, { "epoch": 0.5037938955935121, "grad_norm": 1.343239188194275, "learning_rate": 1.4794391505047256e-05, "loss": 1.5829, "step": 19902 }, { "epoch": 0.504578621287583, "grad_norm": 1.4160040616989136, "learning_rate": 1.4717783225402596e-05, "loss": 1.5479, "step": 19933 }, { "epoch": 0.5053633469816539, "grad_norm": 1.3658647537231445, "learning_rate": 1.4641290979176735e-05, "loss": 1.558, "step": 19964 }, { "epoch": 0.5061480726757248, "grad_norm": 1.2913247346878052, "learning_rate": 1.4564915629574246e-05, "loss": 1.5795, "step": 19995 }, { "epoch": 0.5069327983697957, "grad_norm": 1.3975298404693604, "learning_rate": 1.4488658038480601e-05, "loss": 1.5557, "step": 20026 }, { "epoch": 0.5077175240638665, "grad_norm": 1.342119812965393, "learning_rate": 1.4412519066452323e-05, "loss": 1.5727, "step": 20057 }, { "epoch": 0.5085022497579375, "grad_norm": 1.3325005769729614, "learning_rate": 1.4336499572707373e-05, "loss": 1.5573, "step": 20088 }, { "epoch": 0.5092869754520083, "grad_norm": 1.3986520767211914, "learning_rate": 1.4260600415115433e-05, "loss": 1.5537, "step": 20119 }, { "epoch": 0.5100717011460792, "grad_norm": 1.3560576438903809, "learning_rate": 1.4184822450188137e-05, "loss": 1.5529, "step": 20150 }, { "epoch": 0.5108564268401501, "grad_norm": 1.4381458759307861, "learning_rate": 1.410916653306954e-05, "loss": 1.5845, "step": 20181 }, { "epoch": 0.511641152534221, "grad_norm": 1.6817706823349, "learning_rate": 1.403363351752639e-05, "loss": 1.569, "step": 20212 }, { "epoch": 0.5124258782282919, "grad_norm": 1.3956488370895386, "learning_rate": 1.3958224255938485e-05, "loss": 1.5561, "step": 20243 }, { "epoch": 0.5132106039223627, "grad_norm": 1.3474819660186768, "learning_rate": 1.388293959928911e-05, "loss": 1.5608, "step": 20274 }, { "epoch": 0.5139953296164337, "grad_norm": 1.286340594291687, "learning_rate": 1.3807780397155379e-05, "loss": 1.5661, "step": 20305 }, { "epoch": 0.5147800553105045, "grad_norm": 1.3667712211608887, "learning_rate": 1.3732747497698655e-05, "loss": 1.5778, "step": 20336 }, { "epoch": 0.5155647810045755, "grad_norm": 1.4048058986663818, "learning_rate": 1.3657841747655038e-05, "loss": 1.5444, "step": 20367 }, { "epoch": 0.5163495066986463, "grad_norm": 1.5085017681121826, "learning_rate": 1.3583063992325706e-05, "loss": 1.5657, "step": 20398 }, { "epoch": 0.5171342323927173, "grad_norm": 1.3968846797943115, "learning_rate": 1.3508415075567496e-05, "loss": 1.5641, "step": 20429 }, { "epoch": 0.5179189580867881, "grad_norm": 1.403813123703003, "learning_rate": 1.343389583978327e-05, "loss": 1.5768, "step": 20460 }, { "epoch": 0.5187036837808591, "grad_norm": 1.3661153316497803, "learning_rate": 1.3359507125912468e-05, "loss": 1.5511, "step": 20491 }, { "epoch": 0.5194884094749299, "grad_norm": 1.4918231964111328, "learning_rate": 1.3285249773421627e-05, "loss": 1.5552, "step": 20522 }, { "epoch": 0.5202731351690008, "grad_norm": 1.366255521774292, "learning_rate": 1.3211124620294884e-05, "loss": 1.5573, "step": 20553 }, { "epoch": 0.5210578608630717, "grad_norm": 1.360115885734558, "learning_rate": 1.313713250302451e-05, "loss": 1.5743, "step": 20584 }, { "epoch": 0.5218425865571426, "grad_norm": 1.396219253540039, "learning_rate": 1.3063274256601479e-05, "loss": 1.5313, "step": 20615 }, { "epoch": 0.5226273122512135, "grad_norm": 1.3751533031463623, "learning_rate": 1.2989550714506086e-05, "loss": 1.554, "step": 20646 }, { "epoch": 0.5234120379452843, "grad_norm": 1.3931307792663574, "learning_rate": 1.291596270869846e-05, "loss": 1.572, "step": 20677 }, { "epoch": 0.5241967636393553, "grad_norm": 1.3172565698623657, "learning_rate": 1.284251106960927e-05, "loss": 1.556, "step": 20708 }, { "epoch": 0.5249814893334261, "grad_norm": 1.4660224914550781, "learning_rate": 1.2769196626130263e-05, "loss": 1.563, "step": 20739 }, { "epoch": 0.5257662150274971, "grad_norm": 1.3981261253356934, "learning_rate": 1.2696020205604969e-05, "loss": 1.536, "step": 20770 }, { "epoch": 0.5265509407215679, "grad_norm": 1.3775140047073364, "learning_rate": 1.2622982633819359e-05, "loss": 1.5538, "step": 20801 }, { "epoch": 0.5273356664156388, "grad_norm": 1.3806031942367554, "learning_rate": 1.2550084734992484e-05, "loss": 1.5717, "step": 20832 }, { "epoch": 0.5281203921097097, "grad_norm": 1.663273572921753, "learning_rate": 1.247732733176724e-05, "loss": 1.5474, "step": 20863 }, { "epoch": 0.5289051178037806, "grad_norm": 1.4349000453948975, "learning_rate": 1.2404711245201044e-05, "loss": 1.563, "step": 20894 }, { "epoch": 0.5296898434978515, "grad_norm": 1.4207381010055542, "learning_rate": 1.2332237294756535e-05, "loss": 1.5769, "step": 20925 }, { "epoch": 0.5304745691919224, "grad_norm": 1.3234254121780396, "learning_rate": 1.225990629829241e-05, "loss": 1.5419, "step": 20956 }, { "epoch": 0.5312592948859933, "grad_norm": 1.3426439762115479, "learning_rate": 1.2187719072054136e-05, "loss": 1.5479, "step": 20987 }, { "epoch": 0.5320440205800642, "grad_norm": 1.3690837621688843, "learning_rate": 1.2115676430664735e-05, "loss": 1.5668, "step": 21018 }, { "epoch": 0.532828746274135, "grad_norm": 1.4441026449203491, "learning_rate": 1.2043779187115647e-05, "loss": 1.5663, "step": 21049 }, { "epoch": 0.533613471968206, "grad_norm": 1.379137396812439, "learning_rate": 1.1972028152757476e-05, "loss": 1.5704, "step": 21080 }, { "epoch": 0.5343981976622768, "grad_norm": 1.3750004768371582, "learning_rate": 1.1900424137290889e-05, "loss": 1.5518, "step": 21111 }, { "epoch": 0.5351829233563478, "grad_norm": 1.465265154838562, "learning_rate": 1.1828967948757482e-05, "loss": 1.5539, "step": 21142 }, { "epoch": 0.5359676490504186, "grad_norm": 1.3172025680541992, "learning_rate": 1.175766039353062e-05, "loss": 1.5544, "step": 21173 }, { "epoch": 0.5367523747444896, "grad_norm": 1.4065696001052856, "learning_rate": 1.1686502276306382e-05, "loss": 1.5586, "step": 21204 }, { "epoch": 0.5375371004385604, "grad_norm": 1.45732581615448, "learning_rate": 1.1615494400094445e-05, "loss": 1.5728, "step": 21235 }, { "epoch": 0.5383218261326314, "grad_norm": 1.3364806175231934, "learning_rate": 1.1544637566209029e-05, "loss": 1.5569, "step": 21266 }, { "epoch": 0.5391065518267022, "grad_norm": 1.3799667358398438, "learning_rate": 1.1473932574259886e-05, "loss": 1.5344, "step": 21297 }, { "epoch": 0.539891277520773, "grad_norm": 1.4128960371017456, "learning_rate": 1.1403380222143247e-05, "loss": 1.5546, "step": 21328 }, { "epoch": 0.540676003214844, "grad_norm": 1.5169612169265747, "learning_rate": 1.1332981306032808e-05, "loss": 1.5471, "step": 21359 }, { "epoch": 0.5414607289089148, "grad_norm": 1.4209131002426147, "learning_rate": 1.1262736620370762e-05, "loss": 1.5654, "step": 21390 }, { "epoch": 0.5422454546029858, "grad_norm": 1.3103234767913818, "learning_rate": 1.1192646957858854e-05, "loss": 1.5492, "step": 21421 }, { "epoch": 0.5430301802970566, "grad_norm": 1.7383350133895874, "learning_rate": 1.1122713109449381e-05, "loss": 1.5502, "step": 21452 }, { "epoch": 0.5438149059911276, "grad_norm": 1.3104016780853271, "learning_rate": 1.105293586433634e-05, "loss": 1.5564, "step": 21483 }, { "epoch": 0.5445996316851984, "grad_norm": 1.3233284950256348, "learning_rate": 1.0983316009946446e-05, "loss": 1.5274, "step": 21514 }, { "epoch": 0.5453843573792693, "grad_norm": 1.4942415952682495, "learning_rate": 1.0913854331930282e-05, "loss": 1.5643, "step": 21545 }, { "epoch": 0.5461690830733402, "grad_norm": 1.3964463472366333, "learning_rate": 1.0844551614153456e-05, "loss": 1.5575, "step": 21576 }, { "epoch": 0.5469538087674111, "grad_norm": 1.4472683668136597, "learning_rate": 1.0775408638687725e-05, "loss": 1.5459, "step": 21607 }, { "epoch": 0.547738534461482, "grad_norm": 1.3240516185760498, "learning_rate": 1.0706426185802165e-05, "loss": 1.5703, "step": 21638 }, { "epoch": 0.5485232601555529, "grad_norm": 1.3561683893203735, "learning_rate": 1.0637605033954371e-05, "loss": 1.5429, "step": 21669 }, { "epoch": 0.5493079858496238, "grad_norm": 1.3770638704299927, "learning_rate": 1.05689459597817e-05, "loss": 1.5575, "step": 21700 }, { "epoch": 0.5500927115436947, "grad_norm": 1.4219211339950562, "learning_rate": 1.050044973809246e-05, "loss": 1.5392, "step": 21731 }, { "epoch": 0.5508774372377656, "grad_norm": 1.3968154191970825, "learning_rate": 1.043211714185722e-05, "loss": 1.559, "step": 21762 }, { "epoch": 0.5516621629318365, "grad_norm": 1.3730138540267944, "learning_rate": 1.036394894220003e-05, "loss": 1.5452, "step": 21793 }, { "epoch": 0.5524468886259073, "grad_norm": 1.407535433769226, "learning_rate": 1.0295945908389751e-05, "loss": 1.5477, "step": 21824 }, { "epoch": 0.5532316143199782, "grad_norm": 1.440319299697876, "learning_rate": 1.0228108807831393e-05, "loss": 1.5483, "step": 21855 }, { "epoch": 0.5540163400140491, "grad_norm": 1.38417649269104, "learning_rate": 1.01604384060574e-05, "loss": 1.569, "step": 21886 }, { "epoch": 0.55480106570812, "grad_norm": 1.51227867603302, "learning_rate": 1.009293546671907e-05, "loss": 1.5441, "step": 21917 }, { "epoch": 0.5555857914021909, "grad_norm": 1.3792462348937988, "learning_rate": 1.002560075157791e-05, "loss": 1.5537, "step": 21948 }, { "epoch": 0.5563705170962618, "grad_norm": 1.3728954792022705, "learning_rate": 9.958435020496995e-06, "loss": 1.5463, "step": 21979 }, { "epoch": 0.5571552427903327, "grad_norm": 1.4337445497512817, "learning_rate": 9.89143903143249e-06, "loss": 1.5409, "step": 22010 }, { "epoch": 0.5579399684844035, "grad_norm": 1.317431092262268, "learning_rate": 9.824613540425038e-06, "loss": 1.5541, "step": 22041 }, { "epoch": 0.5587246941784745, "grad_norm": 1.3596452474594116, "learning_rate": 9.757959301591197e-06, "loss": 1.5465, "step": 22072 }, { "epoch": 0.5595094198725453, "grad_norm": 1.4173970222473145, "learning_rate": 9.691477067115017e-06, "loss": 1.5534, "step": 22103 }, { "epoch": 0.5602941455666163, "grad_norm": 2.4860451221466064, "learning_rate": 9.625167587239467e-06, "loss": 1.5458, "step": 22134 }, { "epoch": 0.5610788712606871, "grad_norm": 1.440307378768921, "learning_rate": 9.559031610258007e-06, "loss": 1.5581, "step": 22165 }, { "epoch": 0.5618635969547581, "grad_norm": 1.5789539813995361, "learning_rate": 9.493069882506164e-06, "loss": 1.5589, "step": 22196 }, { "epoch": 0.5626483226488289, "grad_norm": 1.3445873260498047, "learning_rate": 9.427283148353056e-06, "loss": 1.5533, "step": 22227 }, { "epoch": 0.5634330483428999, "grad_norm": 1.3744895458221436, "learning_rate": 9.361672150193052e-06, "loss": 1.5497, "step": 22258 }, { "epoch": 0.5642177740369707, "grad_norm": 1.4480764865875244, "learning_rate": 9.29623762843734e-06, "loss": 1.5521, "step": 22289 }, { "epoch": 0.5650024997310416, "grad_norm": 1.3482125997543335, "learning_rate": 9.230980321505594e-06, "loss": 1.5514, "step": 22320 }, { "epoch": 0.5657872254251125, "grad_norm": 1.4724624156951904, "learning_rate": 9.165900965817668e-06, "loss": 1.558, "step": 22351 }, { "epoch": 0.5665719511191833, "grad_norm": 1.4756817817687988, "learning_rate": 9.101000295785245e-06, "loss": 1.5519, "step": 22382 }, { "epoch": 0.5673566768132543, "grad_norm": 1.4908230304718018, "learning_rate": 9.036279043803565e-06, "loss": 1.5649, "step": 22413 }, { "epoch": 0.5681414025073251, "grad_norm": 1.2823692560195923, "learning_rate": 8.971737940243147e-06, "loss": 1.5561, "step": 22444 }, { "epoch": 0.5689261282013961, "grad_norm": 1.3445894718170166, "learning_rate": 8.907377713441592e-06, "loss": 1.5296, "step": 22475 }, { "epoch": 0.5697108538954669, "grad_norm": 1.3359887599945068, "learning_rate": 8.843199089695293e-06, "loss": 1.5299, "step": 22506 }, { "epoch": 0.5704955795895378, "grad_norm": 1.4024282693862915, "learning_rate": 8.779202793251311e-06, "loss": 1.555, "step": 22537 }, { "epoch": 0.5712803052836087, "grad_norm": 1.402908444404602, "learning_rate": 8.715389546299149e-06, "loss": 1.5442, "step": 22568 }, { "epoch": 0.5720650309776796, "grad_norm": 1.3054429292678833, "learning_rate": 8.651760068962617e-06, "loss": 1.5491, "step": 22599 }, { "epoch": 0.5728497566717505, "grad_norm": 1.314642071723938, "learning_rate": 8.588315079291733e-06, "loss": 1.531, "step": 22630 }, { "epoch": 0.5736344823658214, "grad_norm": 1.2906594276428223, "learning_rate": 8.52505529325457e-06, "loss": 1.525, "step": 22661 }, { "epoch": 0.5744192080598923, "grad_norm": 1.391607403755188, "learning_rate": 8.461981424729216e-06, "loss": 1.5578, "step": 22692 }, { "epoch": 0.5752039337539632, "grad_norm": 1.5275055170059204, "learning_rate": 8.399094185495725e-06, "loss": 1.5468, "step": 22723 }, { "epoch": 0.5759886594480341, "grad_norm": 1.4094804525375366, "learning_rate": 8.336394285228017e-06, "loss": 1.5336, "step": 22754 }, { "epoch": 0.576773385142105, "grad_norm": 1.4096417427062988, "learning_rate": 8.273882431485952e-06, "loss": 1.5386, "step": 22785 }, { "epoch": 0.5775581108361758, "grad_norm": 1.4015659093856812, "learning_rate": 8.211559329707316e-06, "loss": 1.5514, "step": 22816 }, { "epoch": 0.5783428365302468, "grad_norm": 1.4353171586990356, "learning_rate": 8.149425683199823e-06, "loss": 1.5432, "step": 22847 }, { "epoch": 0.5791275622243176, "grad_norm": 1.3493109941482544, "learning_rate": 8.08748219313325e-06, "loss": 1.5387, "step": 22878 }, { "epoch": 0.5799122879183886, "grad_norm": 1.376868486404419, "learning_rate": 8.025729558531453e-06, "loss": 1.5397, "step": 22909 }, { "epoch": 0.5806970136124594, "grad_norm": 1.4415427446365356, "learning_rate": 7.964168476264508e-06, "loss": 1.5556, "step": 22940 }, { "epoch": 0.5814817393065304, "grad_norm": 1.4281046390533447, "learning_rate": 7.902799641040884e-06, "loss": 1.5312, "step": 22971 }, { "epoch": 0.5822664650006012, "grad_norm": 1.372336983680725, "learning_rate": 7.841623745399523e-06, "loss": 1.5437, "step": 23002 }, { "epoch": 0.583051190694672, "grad_norm": 1.3720817565917969, "learning_rate": 7.780641479702114e-06, "loss": 1.5599, "step": 23033 }, { "epoch": 0.583835916388743, "grad_norm": 1.3714765310287476, "learning_rate": 7.719853532125227e-06, "loss": 1.5256, "step": 23064 }, { "epoch": 0.5846206420828138, "grad_norm": 1.3198277950286865, "learning_rate": 7.65926058865258e-06, "loss": 1.5609, "step": 23095 }, { "epoch": 0.5854053677768848, "grad_norm": 1.3970394134521484, "learning_rate": 7.598863333067313e-06, "loss": 1.552, "step": 23126 }, { "epoch": 0.5861900934709556, "grad_norm": 1.3451225757598877, "learning_rate": 7.538662446944253e-06, "loss": 1.5407, "step": 23157 }, { "epoch": 0.5869748191650266, "grad_norm": 1.3626407384872437, "learning_rate": 7.478658609642211e-06, "loss": 1.528, "step": 23188 }, { "epoch": 0.5877595448590974, "grad_norm": 1.295155644416809, "learning_rate": 7.418852498296327e-06, "loss": 1.5396, "step": 23219 }, { "epoch": 0.5885442705531684, "grad_norm": 1.4162577390670776, "learning_rate": 7.359244787810457e-06, "loss": 1.5442, "step": 23250 }, { "epoch": 0.5893289962472392, "grad_norm": 1.4795522689819336, "learning_rate": 7.299836150849493e-06, "loss": 1.5724, "step": 23281 }, { "epoch": 0.5901137219413101, "grad_norm": 1.4080073833465576, "learning_rate": 7.240627257831847e-06, "loss": 1.5673, "step": 23312 }, { "epoch": 0.590898447635381, "grad_norm": 1.2865021228790283, "learning_rate": 7.1816187769218195e-06, "loss": 1.5529, "step": 23343 }, { "epoch": 0.5916831733294519, "grad_norm": 2.568460464477539, "learning_rate": 7.1228113740220895e-06, "loss": 1.5379, "step": 23374 }, { "epoch": 0.5924678990235228, "grad_norm": 1.4487184286117554, "learning_rate": 7.064205712766226e-06, "loss": 1.5417, "step": 23405 }, { "epoch": 0.5932526247175937, "grad_norm": 1.3384840488433838, "learning_rate": 7.005802454511129e-06, "loss": 1.5481, "step": 23436 }, { "epoch": 0.5940373504116646, "grad_norm": 1.3432554006576538, "learning_rate": 6.947602258329639e-06, "loss": 1.521, "step": 23467 }, { "epoch": 0.5948220761057355, "grad_norm": 1.3277153968811035, "learning_rate": 6.889605781003078e-06, "loss": 1.5348, "step": 23498 }, { "epoch": 0.5956068017998063, "grad_norm": 1.4018425941467285, "learning_rate": 6.831813677013776e-06, "loss": 1.5319, "step": 23529 }, { "epoch": 0.5963915274938772, "grad_norm": 1.44899582862854, "learning_rate": 6.774226598537792e-06, "loss": 1.5624, "step": 23560 }, { "epoch": 0.5971762531879481, "grad_norm": 1.4060876369476318, "learning_rate": 6.716845195437482e-06, "loss": 1.5487, "step": 23591 }, { "epoch": 0.597960978882019, "grad_norm": 1.4121522903442383, "learning_rate": 6.659670115254168e-06, "loss": 1.5332, "step": 23622 }, { "epoch": 0.5987457045760899, "grad_norm": 1.3269188404083252, "learning_rate": 6.602702003200872e-06, "loss": 1.5276, "step": 23653 }, { "epoch": 0.5995304302701608, "grad_norm": 1.3662550449371338, "learning_rate": 6.545941502154992e-06, "loss": 1.5629, "step": 23684 }, { "epoch": 0.6003151559642317, "grad_norm": 1.4438221454620361, "learning_rate": 6.489389252651057e-06, "loss": 1.5496, "step": 23715 }, { "epoch": 0.6010998816583026, "grad_norm": 1.422269344329834, "learning_rate": 6.4330458928735325e-06, "loss": 1.533, "step": 23746 }, { "epoch": 0.6018846073523735, "grad_norm": 1.3922473192214966, "learning_rate": 6.376912058649559e-06, "loss": 1.5198, "step": 23777 }, { "epoch": 0.6026693330464443, "grad_norm": 1.4476711750030518, "learning_rate": 6.320988383441845e-06, "loss": 1.55, "step": 23808 }, { "epoch": 0.6034540587405153, "grad_norm": 1.3881078958511353, "learning_rate": 6.265275498341452e-06, "loss": 1.524, "step": 23839 }, { "epoch": 0.6042387844345861, "grad_norm": 1.4356231689453125, "learning_rate": 6.209774032060714e-06, "loss": 1.5334, "step": 23870 }, { "epoch": 0.6050235101286571, "grad_norm": 1.34247624874115, "learning_rate": 6.1544846109261365e-06, "loss": 1.5309, "step": 23901 }, { "epoch": 0.6058082358227279, "grad_norm": 1.3616281747817993, "learning_rate": 6.099407858871342e-06, "loss": 1.5202, "step": 23932 }, { "epoch": 0.6065929615167989, "grad_norm": 1.4779770374298096, "learning_rate": 6.044544397429958e-06, "loss": 1.5266, "step": 23963 }, { "epoch": 0.6073776872108697, "grad_norm": 1.3740448951721191, "learning_rate": 5.989894845728708e-06, "loss": 1.5251, "step": 23994 }, { "epoch": 0.6081624129049406, "grad_norm": 1.3835887908935547, "learning_rate": 5.9354598204803605e-06, "loss": 1.5349, "step": 24025 }, { "epoch": 0.6089471385990115, "grad_norm": 1.419488549232483, "learning_rate": 5.881239935976762e-06, "loss": 1.5236, "step": 24056 }, { "epoch": 0.6097318642930823, "grad_norm": 1.3918389081954956, "learning_rate": 5.827235804081954e-06, "loss": 1.5534, "step": 24087 }, { "epoch": 0.6105165899871533, "grad_norm": 1.4750800132751465, "learning_rate": 5.773448034225221e-06, "loss": 1.5322, "step": 24118 }, { "epoch": 0.6113013156812241, "grad_norm": 1.4278340339660645, "learning_rate": 5.719877233394228e-06, "loss": 1.5626, "step": 24149 }, { "epoch": 0.6120860413752951, "grad_norm": 1.43100106716156, "learning_rate": 5.666524006128191e-06, "loss": 1.5411, "step": 24180 }, { "epoch": 0.6128707670693659, "grad_norm": 1.397022008895874, "learning_rate": 5.613388954511015e-06, "loss": 1.5233, "step": 24211 }, { "epoch": 0.6136554927634369, "grad_norm": 1.2984530925750732, "learning_rate": 5.560472678164552e-06, "loss": 1.5487, "step": 24242 }, { "epoch": 0.6144402184575077, "grad_norm": 1.318934679031372, "learning_rate": 5.507775774241775e-06, "loss": 1.5627, "step": 24273 }, { "epoch": 0.6152249441515786, "grad_norm": 1.4760456085205078, "learning_rate": 5.4552988374200945e-06, "loss": 1.5222, "step": 24304 }, { "epoch": 0.6160096698456495, "grad_norm": 1.350392460823059, "learning_rate": 5.403042459894597e-06, "loss": 1.535, "step": 24335 }, { "epoch": 0.6167943955397204, "grad_norm": 1.3857702016830444, "learning_rate": 5.3510072313714135e-06, "loss": 1.5483, "step": 24366 }, { "epoch": 0.6175791212337913, "grad_norm": 1.4854798316955566, "learning_rate": 5.2991937390610205e-06, "loss": 1.5381, "step": 24397 }, { "epoch": 0.6183638469278622, "grad_norm": 1.3600910902023315, "learning_rate": 5.247602567671625e-06, "loss": 1.5277, "step": 24428 }, { "epoch": 0.6191485726219331, "grad_norm": 1.3631632328033447, "learning_rate": 5.196234299402603e-06, "loss": 1.5583, "step": 24459 }, { "epoch": 0.619933298316004, "grad_norm": 1.4225085973739624, "learning_rate": 5.145089513937865e-06, "loss": 1.5346, "step": 24490 }, { "epoch": 0.6207180240100749, "grad_norm": 1.3548002243041992, "learning_rate": 5.094168788439369e-06, "loss": 1.546, "step": 24521 }, { "epoch": 0.6215027497041458, "grad_norm": 1.4630082845687866, "learning_rate": 5.043472697540594e-06, "loss": 1.549, "step": 24552 }, { "epoch": 0.6222874753982166, "grad_norm": 1.4638261795043945, "learning_rate": 4.993001813340012e-06, "loss": 1.5224, "step": 24583 }, { "epoch": 0.6230722010922876, "grad_norm": 1.3274465799331665, "learning_rate": 4.942756705394702e-06, "loss": 1.538, "step": 24614 }, { "epoch": 0.6238569267863584, "grad_norm": 1.4302935600280762, "learning_rate": 4.892737940713884e-06, "loss": 1.545, "step": 24645 }, { "epoch": 0.6246416524804294, "grad_norm": 1.4292621612548828, "learning_rate": 4.842946083752511e-06, "loss": 1.5275, "step": 24676 }, { "epoch": 0.6254263781745002, "grad_norm": 1.3631361722946167, "learning_rate": 4.79338169640493e-06, "loss": 1.5552, "step": 24707 }, { "epoch": 0.6262111038685712, "grad_norm": 1.4284039735794067, "learning_rate": 4.74404533799851e-06, "loss": 1.5298, "step": 24738 }, { "epoch": 0.626995829562642, "grad_norm": 1.4611119031906128, "learning_rate": 4.694937565287344e-06, "loss": 1.5414, "step": 24769 }, { "epoch": 0.6277805552567128, "grad_norm": 1.37677800655365, "learning_rate": 4.646058932445985e-06, "loss": 1.5392, "step": 24800 }, { "epoch": 0.6285652809507838, "grad_norm": 1.4582575559616089, "learning_rate": 4.597409991063148e-06, "loss": 1.5317, "step": 24831 }, { "epoch": 0.6293500066448546, "grad_norm": 1.3665950298309326, "learning_rate": 4.5489912901355375e-06, "loss": 1.5514, "step": 24862 }, { "epoch": 0.6301347323389256, "grad_norm": 1.3817001581192017, "learning_rate": 4.500803376061608e-06, "loss": 1.5343, "step": 24893 }, { "epoch": 0.6309194580329964, "grad_norm": 1.4217463731765747, "learning_rate": 4.45284679263541e-06, "loss": 1.5247, "step": 24924 }, { "epoch": 0.6317041837270674, "grad_norm": 1.3985430002212524, "learning_rate": 4.4051220810404775e-06, "loss": 1.5348, "step": 24955 }, { "epoch": 0.6324889094211382, "grad_norm": 1.4616161584854126, "learning_rate": 4.3576297798437025e-06, "loss": 1.5563, "step": 24986 }, { "epoch": 0.6332736351152092, "grad_norm": 1.3955610990524292, "learning_rate": 4.3103704249892436e-06, "loss": 1.5204, "step": 25017 }, { "epoch": 0.63405836080928, "grad_norm": 1.3720837831497192, "learning_rate": 4.263344549792487e-06, "loss": 1.5379, "step": 25048 }, { "epoch": 0.6348430865033509, "grad_norm": 1.347891092300415, "learning_rate": 4.216552684934056e-06, "loss": 1.5285, "step": 25079 }, { "epoch": 0.6356278121974218, "grad_norm": 1.5957375764846802, "learning_rate": 4.169995358453777e-06, "loss": 1.5163, "step": 25110 }, { "epoch": 0.6364125378914927, "grad_norm": 1.3431944847106934, "learning_rate": 4.123673095744757e-06, "loss": 1.5378, "step": 25141 }, { "epoch": 0.6371972635855636, "grad_norm": 1.4405794143676758, "learning_rate": 4.077586419547435e-06, "loss": 1.5563, "step": 25172 }, { "epoch": 0.6379819892796345, "grad_norm": 1.3969746828079224, "learning_rate": 4.03173584994368e-06, "loss": 1.5441, "step": 25203 }, { "epoch": 0.6387667149737054, "grad_norm": 1.542013168334961, "learning_rate": 3.986121904350948e-06, "loss": 1.5249, "step": 25234 }, { "epoch": 0.6395514406677762, "grad_norm": 1.4267256259918213, "learning_rate": 3.940745097516407e-06, "loss": 1.5184, "step": 25265 }, { "epoch": 0.6403361663618471, "grad_norm": 1.331272840499878, "learning_rate": 3.89560594151116e-06, "loss": 1.5437, "step": 25296 }, { "epoch": 0.641120892055918, "grad_norm": 1.368691086769104, "learning_rate": 3.850704945724456e-06, "loss": 1.5265, "step": 25327 }, { "epoch": 0.6419056177499889, "grad_norm": 1.3770484924316406, "learning_rate": 3.8060426168579077e-06, "loss": 1.5291, "step": 25358 }, { "epoch": 0.6426903434440598, "grad_norm": 1.4727221727371216, "learning_rate": 3.7616194589198407e-06, "loss": 1.5326, "step": 25389 }, { "epoch": 0.6434750691381307, "grad_norm": 1.3571360111236572, "learning_rate": 3.7174359732195574e-06, "loss": 1.5278, "step": 25420 }, { "epoch": 0.6442597948322016, "grad_norm": 1.4054335355758667, "learning_rate": 3.673492658361677e-06, "loss": 1.5405, "step": 25451 }, { "epoch": 0.6450445205262725, "grad_norm": 1.4510763883590698, "learning_rate": 3.6297900102405467e-06, "loss": 1.5409, "step": 25482 }, { "epoch": 0.6458292462203434, "grad_norm": 1.5653456449508667, "learning_rate": 3.586328522034607e-06, "loss": 1.5224, "step": 25513 }, { "epoch": 0.6466139719144143, "grad_norm": 1.4818406105041504, "learning_rate": 3.543108684200838e-06, "loss": 1.5251, "step": 25544 }, { "epoch": 0.6473986976084851, "grad_norm": 1.4254684448242188, "learning_rate": 3.5001309844692464e-06, "loss": 1.5219, "step": 25575 }, { "epoch": 0.6481834233025561, "grad_norm": 1.348809838294983, "learning_rate": 3.4573959078373215e-06, "loss": 1.5285, "step": 25606 }, { "epoch": 0.6489681489966269, "grad_norm": 1.4553576707839966, "learning_rate": 3.4149039365646063e-06, "loss": 1.5419, "step": 25637 }, { "epoch": 0.6497528746906979, "grad_norm": 1.412490963935852, "learning_rate": 3.3726555501672143e-06, "loss": 1.5186, "step": 25668 }, { "epoch": 0.6505376003847687, "grad_norm": 1.4104843139648438, "learning_rate": 3.33065122541244e-06, "loss": 1.5254, "step": 25699 }, { "epoch": 0.6513223260788397, "grad_norm": 1.3806548118591309, "learning_rate": 3.288891436313385e-06, "loss": 1.5272, "step": 25730 }, { "epoch": 0.6521070517729105, "grad_norm": 1.4207285642623901, "learning_rate": 3.2473766541235963e-06, "loss": 1.536, "step": 25761 }, { "epoch": 0.6528917774669813, "grad_norm": 1.3559178113937378, "learning_rate": 3.2061073473317466e-06, "loss": 1.5394, "step": 25792 }, { "epoch": 0.6536765031610523, "grad_norm": 1.3517690896987915, "learning_rate": 3.1650839816563444e-06, "loss": 1.5488, "step": 25823 }, { "epoch": 0.6544612288551231, "grad_norm": 1.3978461027145386, "learning_rate": 3.1243070200405093e-06, "loss": 1.5261, "step": 25854 }, { "epoch": 0.6552459545491941, "grad_norm": 1.3550540208816528, "learning_rate": 3.0837769226467e-06, "loss": 1.5254, "step": 25885 }, { "epoch": 0.6560306802432649, "grad_norm": 1.3790268898010254, "learning_rate": 3.0434941468515666e-06, "loss": 1.5224, "step": 25916 }, { "epoch": 0.6568154059373359, "grad_norm": 1.3558413982391357, "learning_rate": 3.003459147240753e-06, "loss": 1.5179, "step": 25947 }, { "epoch": 0.6576001316314067, "grad_norm": 1.3683024644851685, "learning_rate": 2.9636723756037875e-06, "loss": 1.5191, "step": 25978 }, { "epoch": 0.6583848573254777, "grad_norm": 1.4349849224090576, "learning_rate": 2.9241342809289833e-06, "loss": 1.5417, "step": 26009 }, { "epoch": 0.6591695830195485, "grad_norm": 1.3950988054275513, "learning_rate": 2.8848453093983594e-06, "loss": 1.5267, "step": 26040 }, { "epoch": 0.6599543087136194, "grad_norm": 1.3628458976745605, "learning_rate": 2.8458059043826257e-06, "loss": 1.5294, "step": 26071 }, { "epoch": 0.6607390344076903, "grad_norm": 1.3483256101608276, "learning_rate": 2.807016506436172e-06, "loss": 1.5498, "step": 26102 }, { "epoch": 0.6615237601017612, "grad_norm": 1.3618528842926025, "learning_rate": 2.7684775532920566e-06, "loss": 1.5271, "step": 26133 }, { "epoch": 0.6623084857958321, "grad_norm": 1.49851393699646, "learning_rate": 2.7301894798571425e-06, "loss": 1.526, "step": 26164 }, { "epoch": 0.663093211489903, "grad_norm": 1.5132079124450684, "learning_rate": 2.6921527182071386e-06, "loss": 1.5418, "step": 26195 }, { "epoch": 0.6638779371839739, "grad_norm": 1.4265996217727661, "learning_rate": 2.654367697581725e-06, "loss": 1.5455, "step": 26226 }, { "epoch": 0.6646626628780448, "grad_norm": 1.506589412689209, "learning_rate": 2.6168348443797175e-06, "loss": 1.5209, "step": 26257 }, { "epoch": 0.6654473885721156, "grad_norm": 1.3662431240081787, "learning_rate": 2.5795545821542757e-06, "loss": 1.5169, "step": 26288 }, { "epoch": 0.6662321142661866, "grad_norm": 1.4398752450942993, "learning_rate": 2.54252733160808e-06, "loss": 1.5491, "step": 26319 }, { "epoch": 0.6670168399602574, "grad_norm": 1.4776362180709839, "learning_rate": 2.5057535105886294e-06, "loss": 1.5192, "step": 26350 }, { "epoch": 0.6678015656543284, "grad_norm": 1.3796826601028442, "learning_rate": 2.4692335340834953e-06, "loss": 1.5245, "step": 26381 }, { "epoch": 0.6685862913483992, "grad_norm": 1.3923054933547974, "learning_rate": 2.432967814215639e-06, "loss": 1.5252, "step": 26412 }, { "epoch": 0.6693710170424702, "grad_norm": 1.3372383117675781, "learning_rate": 2.396956760238794e-06, "loss": 1.5227, "step": 26443 }, { "epoch": 0.670155742736541, "grad_norm": 1.3287001848220825, "learning_rate": 2.361200778532796e-06, "loss": 1.5335, "step": 26474 }, { "epoch": 0.670940468430612, "grad_norm": 1.3403995037078857, "learning_rate": 2.325700272599049e-06, "loss": 1.5304, "step": 26505 }, { "epoch": 0.6717251941246828, "grad_norm": 1.3469324111938477, "learning_rate": 2.2904556430559415e-06, "loss": 1.5329, "step": 26536 }, { "epoch": 0.6725099198187536, "grad_norm": 1.4993536472320557, "learning_rate": 2.2554672876343106e-06, "loss": 1.5228, "step": 26567 }, { "epoch": 0.6732946455128246, "grad_norm": 1.3785438537597656, "learning_rate": 2.220735601173002e-06, "loss": 1.516, "step": 26598 }, { "epoch": 0.6740793712068954, "grad_norm": 1.3642317056655884, "learning_rate": 2.186260975614382e-06, "loss": 1.5467, "step": 26629 }, { "epoch": 0.6748640969009664, "grad_norm": 1.3815925121307373, "learning_rate": 2.1520437999999034e-06, "loss": 1.5449, "step": 26660 }, { "epoch": 0.6756488225950372, "grad_norm": 1.3854280710220337, "learning_rate": 2.1180844604657526e-06, "loss": 1.5177, "step": 26691 }, { "epoch": 0.6764335482891082, "grad_norm": 1.4565620422363281, "learning_rate": 2.084383340238455e-06, "loss": 1.5119, "step": 26722 }, { "epoch": 0.677218273983179, "grad_norm": 1.35818612575531, "learning_rate": 2.0509408196305704e-06, "loss": 1.5084, "step": 26753 }, { "epoch": 0.6780029996772499, "grad_norm": 1.4125559329986572, "learning_rate": 2.017757276036403e-06, "loss": 1.5101, "step": 26784 }, { "epoch": 0.6787877253713208, "grad_norm": 1.43025803565979, "learning_rate": 1.984833083927726e-06, "loss": 1.5318, "step": 26815 }, { "epoch": 0.6795724510653917, "grad_norm": 1.3963549137115479, "learning_rate": 1.952168614849581e-06, "loss": 1.5248, "step": 26846 }, { "epoch": 0.6803571767594626, "grad_norm": 1.4896256923675537, "learning_rate": 1.919764237416058e-06, "loss": 1.5409, "step": 26877 }, { "epoch": 0.6811419024535335, "grad_norm": 1.3385494947433472, "learning_rate": 1.8876203173061463e-06, "loss": 1.5371, "step": 26908 }, { "epoch": 0.6819266281476044, "grad_norm": 1.3572068214416504, "learning_rate": 1.8557372172596206e-06, "loss": 1.5394, "step": 26939 }, { "epoch": 0.6827113538416753, "grad_norm": 1.455278992652893, "learning_rate": 1.8241152970729341e-06, "loss": 1.5345, "step": 26970 }, { "epoch": 0.6834960795357462, "grad_norm": 1.4417409896850586, "learning_rate": 1.7927549135951572e-06, "loss": 1.5252, "step": 27001 }, { "epoch": 0.684280805229817, "grad_norm": 1.4233667850494385, "learning_rate": 1.7616564207239477e-06, "loss": 1.5221, "step": 27032 }, { "epoch": 0.6850655309238879, "grad_norm": 1.4328643083572388, "learning_rate": 1.730820169401584e-06, "loss": 1.508, "step": 27063 }, { "epoch": 0.6858502566179588, "grad_norm": 1.3445032835006714, "learning_rate": 1.7002465076109558e-06, "loss": 1.5209, "step": 27094 }, { "epoch": 0.6866349823120297, "grad_norm": 1.4214242696762085, "learning_rate": 1.6699357803716898e-06, "loss": 1.5297, "step": 27125 }, { "epoch": 0.6874197080061006, "grad_norm": 1.3590694665908813, "learning_rate": 1.6398883297362305e-06, "loss": 1.5351, "step": 27156 }, { "epoch": 0.6882044337001715, "grad_norm": 1.4039976596832275, "learning_rate": 1.6101044947859606e-06, "loss": 1.5529, "step": 27187 }, { "epoch": 0.6889891593942424, "grad_norm": 1.3939241170883179, "learning_rate": 1.5805846116274114e-06, "loss": 1.509, "step": 27218 }, { "epoch": 0.6897738850883133, "grad_norm": 1.4963489770889282, "learning_rate": 1.5513290133884611e-06, "loss": 1.5526, "step": 27249 }, { "epoch": 0.6905586107823841, "grad_norm": 1.413089632987976, "learning_rate": 1.5223380302145512e-06, "loss": 1.5271, "step": 27280 }, { "epoch": 0.6913433364764551, "grad_norm": 1.4136161804199219, "learning_rate": 1.4936119892649925e-06, "loss": 1.5365, "step": 27311 }, { "epoch": 0.6921280621705259, "grad_norm": 1.4144634008407593, "learning_rate": 1.4651512147092482e-06, "loss": 1.5255, "step": 27342 }, { "epoch": 0.6929127878645969, "grad_norm": 1.3424650430679321, "learning_rate": 1.4369560277232908e-06, "loss": 1.5275, "step": 27373 }, { "epoch": 0.6936975135586677, "grad_norm": 1.4057984352111816, "learning_rate": 1.409026746485978e-06, "loss": 1.5273, "step": 27404 }, { "epoch": 0.6944822392527387, "grad_norm": 1.4132764339447021, "learning_rate": 1.3813636861754464e-06, "loss": 1.5219, "step": 27435 }, { "epoch": 0.6952669649468095, "grad_norm": 1.541971206665039, "learning_rate": 1.3539671589655773e-06, "loss": 1.5413, "step": 27466 }, { "epoch": 0.6960516906408805, "grad_norm": 1.4268949031829834, "learning_rate": 1.3268374740224548e-06, "loss": 1.5298, "step": 27497 }, { "epoch": 0.6968364163349513, "grad_norm": 1.427729606628418, "learning_rate": 1.2999749375008807e-06, "loss": 1.5239, "step": 27528 }, { "epoch": 0.6976211420290221, "grad_norm": 1.4411410093307495, "learning_rate": 1.2733798525409346e-06, "loss": 1.5215, "step": 27559 }, { "epoch": 0.6984058677230931, "grad_norm": 1.4318063259124756, "learning_rate": 1.2470525192645383e-06, "loss": 1.5238, "step": 27590 }, { "epoch": 0.6991905934171639, "grad_norm": 1.4129235744476318, "learning_rate": 1.2209932347720666e-06, "loss": 1.5239, "step": 27621 }, { "epoch": 0.6999753191112349, "grad_norm": 1.355039358139038, "learning_rate": 1.1952022931389972e-06, "loss": 1.5205, "step": 27652 }, { "epoch": 0.7007600448053057, "grad_norm": 1.6766573190689087, "learning_rate": 1.1696799854126083e-06, "loss": 1.5369, "step": 27683 }, { "epoch": 0.7015447704993767, "grad_norm": 1.380895733833313, "learning_rate": 1.1444265996086694e-06, "loss": 1.5116, "step": 27714 }, { "epoch": 0.7023294961934475, "grad_norm": 1.3622218370437622, "learning_rate": 1.119442420708211e-06, "loss": 1.5265, "step": 27745 }, { "epoch": 0.7031142218875185, "grad_norm": 1.4054621458053589, "learning_rate": 1.0947277306542964e-06, "loss": 1.5249, "step": 27776 }, { "epoch": 0.7038989475815893, "grad_norm": 1.27810537815094, "learning_rate": 1.0702828083488353e-06, "loss": 1.5321, "step": 27807 }, { "epoch": 0.7046836732756602, "grad_norm": 1.380940318107605, "learning_rate": 1.0461079296494647e-06, "loss": 1.5381, "step": 27838 }, { "epoch": 0.7054683989697311, "grad_norm": 1.3913681507110596, "learning_rate": 1.0222033673663978e-06, "loss": 1.5334, "step": 27869 }, { "epoch": 0.706253124663802, "grad_norm": 1.3444581031799316, "learning_rate": 9.985693912593713e-07, "loss": 1.5329, "step": 27900 }, { "epoch": 0.7070378503578729, "grad_norm": 1.3935922384262085, "learning_rate": 9.752062680346035e-07, "loss": 1.5152, "step": 27931 }, { "epoch": 0.7078225760519438, "grad_norm": 1.354137897491455, "learning_rate": 9.521142613417494e-07, "loss": 1.5419, "step": 27962 }, { "epoch": 0.7086073017460147, "grad_norm": 1.417913556098938, "learning_rate": 9.292936317709722e-07, "loss": 1.5379, "step": 27993 }, { "epoch": 0.7093920274400856, "grad_norm": 1.3628367185592651, "learning_rate": 9.067446368499793e-07, "loss": 1.551, "step": 28024 }, { "epoch": 0.7101767531341564, "grad_norm": 1.3860423564910889, "learning_rate": 8.844675310411055e-07, "loss": 1.5221, "step": 28055 }, { "epoch": 0.7109614788282274, "grad_norm": 1.374284267425537, "learning_rate": 8.6246256573847e-07, "loss": 1.4989, "step": 28086 }, { "epoch": 0.7117462045222982, "grad_norm": 1.3612192869186401, "learning_rate": 8.407299892651127e-07, "loss": 1.5131, "step": 28117 }, { "epoch": 0.7125309302163692, "grad_norm": 1.4345417022705078, "learning_rate": 8.19270046870202e-07, "loss": 1.5248, "step": 28148 }, { "epoch": 0.71331565591044, "grad_norm": 1.388961911201477, "learning_rate": 7.980829807262752e-07, "loss": 1.5283, "step": 28179 }, { "epoch": 0.714100381604511, "grad_norm": 1.5089038610458374, "learning_rate": 7.771690299264889e-07, "loss": 1.528, "step": 28210 }, { "epoch": 0.7148851072985818, "grad_norm": 1.334241509437561, "learning_rate": 7.565284304819426e-07, "loss": 1.5018, "step": 28241 }, { "epoch": 0.7156698329926527, "grad_norm": 1.3419960737228394, "learning_rate": 7.361614153189922e-07, "loss": 1.5168, "step": 28272 }, { "epoch": 0.7164545586867236, "grad_norm": 1.3039295673370361, "learning_rate": 7.160682142766328e-07, "loss": 1.547, "step": 28303 }, { "epoch": 0.7172392843807944, "grad_norm": 1.5037273168563843, "learning_rate": 6.962490541039091e-07, "loss": 1.5523, "step": 28334 }, { "epoch": 0.7180240100748654, "grad_norm": 1.3340791463851929, "learning_rate": 6.767041584573531e-07, "loss": 1.5431, "step": 28365 }, { "epoch": 0.7188087357689362, "grad_norm": 1.8582775592803955, "learning_rate": 6.574337478984532e-07, "loss": 1.5477, "step": 28396 }, { "epoch": 0.7195934614630072, "grad_norm": 1.4675524234771729, "learning_rate": 6.384380398911732e-07, "loss": 1.5331, "step": 28427 }, { "epoch": 0.720378187157078, "grad_norm": 1.338765025138855, "learning_rate": 6.197172487994951e-07, "loss": 1.5164, "step": 28458 }, { "epoch": 0.721162912851149, "grad_norm": 1.3680214881896973, "learning_rate": 6.012715858850021e-07, "loss": 1.5406, "step": 28489 }, { "epoch": 0.7219476385452198, "grad_norm": 1.3477078676223755, "learning_rate": 5.831012593044971e-07, "loss": 1.5144, "step": 28520 }, { "epoch": 0.7227323642392907, "grad_norm": 1.4228640794754028, "learning_rate": 5.652064741076435e-07, "loss": 1.5467, "step": 28551 }, { "epoch": 0.7235170899333616, "grad_norm": 1.4127750396728516, "learning_rate": 5.475874322346558e-07, "loss": 1.5395, "step": 28582 }, { "epoch": 0.7243018156274325, "grad_norm": 1.3759944438934326, "learning_rate": 5.30244332514035e-07, "loss": 1.5379, "step": 28613 }, { "epoch": 0.7250865413215034, "grad_norm": 1.3767083883285522, "learning_rate": 5.131773706602977e-07, "loss": 1.5401, "step": 28644 }, { "epoch": 0.7258712670155743, "grad_norm": 1.3337562084197998, "learning_rate": 4.963867392717897e-07, "loss": 1.5305, "step": 28675 }, { "epoch": 0.7266559927096452, "grad_norm": 1.407812476158142, "learning_rate": 4.798726278285093e-07, "loss": 1.5161, "step": 28706 }, { "epoch": 0.727440718403716, "grad_norm": 1.4337633848190308, "learning_rate": 4.6363522268995097e-07, "loss": 1.5357, "step": 28737 }, { "epoch": 0.728225444097787, "grad_norm": 1.4671465158462524, "learning_rate": 4.4767470709302927e-07, "loss": 1.5153, "step": 28768 }, { "epoch": 0.7290101697918578, "grad_norm": 1.3277357816696167, "learning_rate": 4.319912611499971e-07, "loss": 1.519, "step": 28799 }, { "epoch": 0.7297948954859287, "grad_norm": 1.4174885749816895, "learning_rate": 4.1658506184640564e-07, "loss": 1.5265, "step": 28830 }, { "epoch": 0.7305796211799996, "grad_norm": 1.4684560298919678, "learning_rate": 4.0145628303911996e-07, "loss": 1.5182, "step": 28861 }, { "epoch": 0.7313643468740705, "grad_norm": 1.3946303129196167, "learning_rate": 3.866050954543565e-07, "loss": 1.5254, "step": 28892 }, { "epoch": 0.7321490725681414, "grad_norm": 1.441939353942871, "learning_rate": 3.720316666857432e-07, "loss": 1.5315, "step": 28923 }, { "epoch": 0.7329337982622123, "grad_norm": 1.3914129734039307, "learning_rate": 3.5773616119244845e-07, "loss": 1.5241, "step": 28954 }, { "epoch": 0.7337185239562832, "grad_norm": 1.4593554735183716, "learning_rate": 3.437187402973052e-07, "loss": 1.5354, "step": 28985 }, { "epoch": 0.7345032496503541, "grad_norm": 1.524565577507019, "learning_rate": 3.2997956218500104e-07, "loss": 1.5286, "step": 29016 }, { "epoch": 0.7352879753444249, "grad_norm": 1.4271135330200195, "learning_rate": 3.165187819003018e-07, "loss": 1.5184, "step": 29047 }, { "epoch": 0.7360727010384959, "grad_norm": 1.4928288459777832, "learning_rate": 3.033365513462755e-07, "loss": 1.5114, "step": 29078 }, { "epoch": 0.7368574267325667, "grad_norm": 1.414491057395935, "learning_rate": 2.9043301928260437e-07, "loss": 1.5574, "step": 29109 }, { "epoch": 0.7376421524266377, "grad_norm": 1.392284631729126, "learning_rate": 2.7780833132389773e-07, "loss": 1.5316, "step": 29140 }, { "epoch": 0.7384268781207085, "grad_norm": 1.416320562362671, "learning_rate": 2.6546262993803473e-07, "loss": 1.539, "step": 29171 }, { "epoch": 0.7392116038147795, "grad_norm": 1.418097734451294, "learning_rate": 2.533960544445879e-07, "loss": 1.5296, "step": 29202 }, { "epoch": 0.7399963295088503, "grad_norm": 1.3826491832733154, "learning_rate": 2.416087410132134e-07, "loss": 1.5418, "step": 29233 }, { "epoch": 0.7407810552029213, "grad_norm": 1.431630253791809, "learning_rate": 2.301008226621465e-07, "loss": 1.537, "step": 29264 }, { "epoch": 0.7415657808969921, "grad_norm": 1.351166009902954, "learning_rate": 2.1887242925668073e-07, "loss": 1.5006, "step": 29295 }, { "epoch": 0.7423505065910629, "grad_norm": 1.377264380455017, "learning_rate": 2.0792368750770785e-07, "loss": 1.5, "step": 29326 }, { "epoch": 0.7431352322851339, "grad_norm": 1.4020991325378418, "learning_rate": 1.9725472097028851e-07, "loss": 1.5495, "step": 29357 }, { "epoch": 0.7439199579792047, "grad_norm": 1.395375370979309, "learning_rate": 1.8686565004226718e-07, "loss": 1.535, "step": 29388 }, { "epoch": 0.7447046836732757, "grad_norm": 1.424237608909607, "learning_rate": 1.7675659196288995e-07, "loss": 1.5265, "step": 29419 }, { "epoch": 0.7454894093673465, "grad_norm": 1.432045817375183, "learning_rate": 1.6692766081150556e-07, "loss": 1.5005, "step": 29450 }, { "epoch": 0.7462741350614175, "grad_norm": 1.3229856491088867, "learning_rate": 1.5737896750626647e-07, "loss": 1.5464, "step": 29481 }, { "epoch": 0.7470588607554883, "grad_norm": 1.4807835817337036, "learning_rate": 1.4811061980287976e-07, "loss": 1.5113, "step": 29512 }, { "epoch": 0.7478435864495592, "grad_norm": 1.3511358499526978, "learning_rate": 1.3912272229338886e-07, "loss": 1.546, "step": 29543 }, { "epoch": 0.7486283121436301, "grad_norm": 1.330914855003357, "learning_rate": 1.3041537640499645e-07, "loss": 1.5271, "step": 29574 }, { "epoch": 0.749413037837701, "grad_norm": 1.6085385084152222, "learning_rate": 1.2198868039891564e-07, "loss": 1.5346, "step": 29605 }, { "epoch": 0.7501977635317719, "grad_norm": 1.3291810750961304, "learning_rate": 1.138427293692651e-07, "loss": 1.5132, "step": 29636 }, { "epoch": 0.7509824892258428, "grad_norm": 1.367587685585022, "learning_rate": 1.0597761524199778e-07, "loss": 1.5226, "step": 29667 }, { "epoch": 0.7517672149199137, "grad_norm": 1.4591524600982666, "learning_rate": 9.839342677385455e-08, "loss": 1.5447, "step": 29698 }, { "epoch": 0.7525519406139846, "grad_norm": 1.3880685567855835, "learning_rate": 9.109024955137325e-08, "loss": 1.5304, "step": 29729 }, { "epoch": 0.7533366663080555, "grad_norm": 1.3544681072235107, "learning_rate": 8.406816598991729e-08, "loss": 1.5362, "step": 29760 }, { "epoch": 0.7541213920021264, "grad_norm": 1.398155927658081, "learning_rate": 7.73272553327431e-08, "loss": 1.5075, "step": 29791 }, { "epoch": 0.7549061176961972, "grad_norm": 1.4514081478118896, "learning_rate": 7.086759365011186e-08, "loss": 1.518, "step": 29822 }, { "epoch": 0.7556908433902682, "grad_norm": 1.4363720417022705, "learning_rate": 6.468925383842639e-08, "loss": 1.5194, "step": 29853 }, { "epoch": 0.756475569084339, "grad_norm": 1.4076640605926514, "learning_rate": 5.8792305619415067e-08, "loss": 1.5053, "step": 29884 }, { "epoch": 0.75726029477841, "grad_norm": 1.3342225551605225, "learning_rate": 5.317681553933529e-08, "loss": 1.52, "step": 29915 }, { "epoch": 0.7580450204724808, "grad_norm": 1.3737679719924927, "learning_rate": 4.78428469682296e-08, "loss": 1.5417, "step": 29946 }, { "epoch": 0.7588297461665517, "grad_norm": 1.4676426649093628, "learning_rate": 4.2790460099206844e-08, "loss": 1.538, "step": 29977 }, { "epoch": 0.7596144718606226, "grad_norm": 1.693217396736145, "learning_rate": 3.801971194777043e-08, "loss": 1.5206, "step": 30008 }, { "epoch": 0.7603991975546934, "grad_norm": 1.4130475521087646, "learning_rate": 3.353065635115782e-08, "loss": 1.5305, "step": 30039 }, { "epoch": 0.7611839232487644, "grad_norm": 1.4824076890945435, "learning_rate": 2.93233439677576e-08, "loss": 1.5118, "step": 30070 }, { "epoch": 0.7619686489428352, "grad_norm": 1.3690931797027588, "learning_rate": 2.539782227651555e-08, "loss": 1.5165, "step": 30101 }, { "epoch": 0.7627533746369062, "grad_norm": 1.366620421409607, "learning_rate": 2.175413557641004e-08, "loss": 1.5418, "step": 30132 }, { "epoch": 0.763538100330977, "grad_norm": 1.4603701829910278, "learning_rate": 1.839232498594967e-08, "loss": 1.5311, "step": 30163 }, { "epoch": 0.764322826025048, "grad_norm": 1.339460015296936, "learning_rate": 1.5312428442712522e-08, "loss": 1.5446, "step": 30194 }, { "epoch": 0.7651075517191188, "grad_norm": 1.4318183660507202, "learning_rate": 1.2514480702913168e-08, "loss": 1.5058, "step": 30225 }, { "epoch": 0.7658922774131898, "grad_norm": 1.366489291191101, "learning_rate": 9.998513341005766e-09, "loss": 1.5299, "step": 30256 }, { "epoch": 0.7666770031072606, "grad_norm": 1.4269777536392212, "learning_rate": 7.764554749345454e-09, "loss": 1.5383, "step": 30287 }, { "epoch": 0.7674617288013315, "grad_norm": 1.3449435234069824, "learning_rate": 5.812630137849717e-09, "loss": 1.5281, "step": 30318 }, { "epoch": 0.7682464544954024, "grad_norm": 1.5927067995071411, "learning_rate": 4.142761533723616e-09, "loss": 1.5119, "step": 30349 }, { "epoch": 0.7690311801894733, "grad_norm": 1.3833186626434326, "learning_rate": 2.7549677812044317e-09, "loss": 1.5108, "step": 30380 }, { "epoch": 0.7698159058835442, "grad_norm": 1.47097909450531, "learning_rate": 1.6492645413590525e-09, "loss": 1.5346, "step": 30411 }, { "epoch": 0.770600631577615, "grad_norm": 1.407623052597046, "learning_rate": 8.256642918980096e-10, "loss": 1.5191, "step": 30442 }, { "epoch": 0.771385357271686, "grad_norm": 1.4193038940429688, "learning_rate": 2.841763270367004e-10, "loss": 1.5294, "step": 30473 }, { "epoch": 0.7721700829657568, "grad_norm": 1.5234286785125732, "learning_rate": 2.480675739269245e-11, "loss": 1.5123, "step": 30504 } ], "logging_steps": 31, "max_steps": 30517, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3052, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.263722516828232e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }