{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.38628755133941284, "eval_steps": 500, "global_step": 15260, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007847256940708912, "grad_norm": 26.94572639465332, "learning_rate": 1.0157273918741808e-06, "loss": 8.5879, "step": 31 }, { "epoch": 0.0015694513881417823, "grad_norm": 14.633014678955078, "learning_rate": 2.0314547837483616e-06, "loss": 7.5048, "step": 62 }, { "epoch": 0.002354177082212673, "grad_norm": 15.984803199768066, "learning_rate": 3.0471821756225426e-06, "loss": 6.1391, "step": 93 }, { "epoch": 0.0031389027762835646, "grad_norm": 11.297175407409668, "learning_rate": 4.062909567496723e-06, "loss": 4.9299, "step": 124 }, { "epoch": 0.003923628470354455, "grad_norm": 14.864474296569824, "learning_rate": 5.078636959370905e-06, "loss": 4.3205, "step": 155 }, { "epoch": 0.004708354164425346, "grad_norm": 11.237608909606934, "learning_rate": 6.094364351245085e-06, "loss": 4.0, "step": 186 }, { "epoch": 0.005493079858496238, "grad_norm": 23.79303550720215, "learning_rate": 7.110091743119267e-06, "loss": 3.7952, "step": 217 }, { "epoch": 0.006277805552567129, "grad_norm": 15.1551513671875, "learning_rate": 8.125819134993446e-06, "loss": 3.689, "step": 248 }, { "epoch": 0.00706253124663802, "grad_norm": 14.605571746826172, "learning_rate": 9.141546526867629e-06, "loss": 3.5147, "step": 279 }, { "epoch": 0.00784725694070891, "grad_norm": 16.463390350341797, "learning_rate": 1.015727391874181e-05, "loss": 3.3901, "step": 310 }, { "epoch": 0.008631982634779801, "grad_norm": 13.09945011138916, "learning_rate": 1.117300131061599e-05, "loss": 3.317, "step": 341 }, { "epoch": 0.009416708328850693, "grad_norm": 11.993067741394043, "learning_rate": 1.218872870249017e-05, "loss": 3.2508, "step": 372 }, { "epoch": 0.010201434022921584, "grad_norm": 10.388030052185059, "learning_rate": 1.3204456094364351e-05, "loss": 3.1239, "step": 403 }, { "epoch": 0.010986159716992476, "grad_norm": 11.977804183959961, "learning_rate": 1.4220183486238533e-05, "loss": 3.0739, "step": 434 }, { "epoch": 0.011770885411063367, "grad_norm": 8.925983428955078, "learning_rate": 1.5235910878112714e-05, "loss": 3.0169, "step": 465 }, { "epoch": 0.012555611105134258, "grad_norm": 9.57411003112793, "learning_rate": 1.6251638269986893e-05, "loss": 2.959, "step": 496 }, { "epoch": 0.01334033679920515, "grad_norm": 7.380288124084473, "learning_rate": 1.7267365661861077e-05, "loss": 2.8921, "step": 527 }, { "epoch": 0.01412506249327604, "grad_norm": 8.812368392944336, "learning_rate": 1.8283093053735257e-05, "loss": 2.843, "step": 558 }, { "epoch": 0.014909788187346932, "grad_norm": 8.870095252990723, "learning_rate": 1.9298820445609438e-05, "loss": 2.7895, "step": 589 }, { "epoch": 0.01569451388141782, "grad_norm": 9.503872871398926, "learning_rate": 2.031454783748362e-05, "loss": 2.7757, "step": 620 }, { "epoch": 0.016479239575488712, "grad_norm": 6.582827568054199, "learning_rate": 2.13302752293578e-05, "loss": 2.7099, "step": 651 }, { "epoch": 0.017263965269559603, "grad_norm": 6.266632556915283, "learning_rate": 2.234600262123198e-05, "loss": 2.6729, "step": 682 }, { "epoch": 0.018048690963630494, "grad_norm": 6.645415306091309, "learning_rate": 2.336173001310616e-05, "loss": 2.6616, "step": 713 }, { "epoch": 0.018833416657701385, "grad_norm": 7.8323073387146, "learning_rate": 2.437745740498034e-05, "loss": 2.6291, "step": 744 }, { "epoch": 0.019618142351772276, "grad_norm": 5.577521324157715, "learning_rate": 2.5393184796854525e-05, "loss": 2.6072, "step": 775 }, { "epoch": 0.020402868045843167, "grad_norm": 5.603636264801025, "learning_rate": 2.6408912188728702e-05, "loss": 2.5787, "step": 806 }, { "epoch": 0.021187593739914058, "grad_norm": 6.945438385009766, "learning_rate": 2.7424639580602886e-05, "loss": 2.5198, "step": 837 }, { "epoch": 0.021972319433984953, "grad_norm": 5.6279826164245605, "learning_rate": 2.8440366972477066e-05, "loss": 2.5417, "step": 868 }, { "epoch": 0.022757045128055844, "grad_norm": 5.517001628875732, "learning_rate": 2.9456094364351244e-05, "loss": 2.4849, "step": 899 }, { "epoch": 0.023541770822126735, "grad_norm": 5.865486145019531, "learning_rate": 3.0471821756225428e-05, "loss": 2.5103, "step": 930 }, { "epoch": 0.024326496516197626, "grad_norm": 4.949043273925781, "learning_rate": 3.148754914809961e-05, "loss": 2.4581, "step": 961 }, { "epoch": 0.025111222210268517, "grad_norm": 4.701717853546143, "learning_rate": 3.2503276539973785e-05, "loss": 2.4315, "step": 992 }, { "epoch": 0.025895947904339408, "grad_norm": 4.533145904541016, "learning_rate": 3.351900393184797e-05, "loss": 2.4056, "step": 1023 }, { "epoch": 0.0266806735984103, "grad_norm": 4.724672794342041, "learning_rate": 3.453473132372215e-05, "loss": 2.3994, "step": 1054 }, { "epoch": 0.02746539929248119, "grad_norm": 4.745669364929199, "learning_rate": 3.555045871559633e-05, "loss": 2.3546, "step": 1085 }, { "epoch": 0.02825012498655208, "grad_norm": 4.4554948806762695, "learning_rate": 3.6566186107470514e-05, "loss": 2.3642, "step": 1116 }, { "epoch": 0.029034850680622972, "grad_norm": 4.4792304039001465, "learning_rate": 3.7581913499344695e-05, "loss": 2.3296, "step": 1147 }, { "epoch": 0.029819576374693863, "grad_norm": 3.9329679012298584, "learning_rate": 3.8597640891218876e-05, "loss": 2.3105, "step": 1178 }, { "epoch": 0.030604302068764754, "grad_norm": 4.338287830352783, "learning_rate": 3.9613368283093056e-05, "loss": 2.2811, "step": 1209 }, { "epoch": 0.03138902776283564, "grad_norm": 4.130499839782715, "learning_rate": 4.062909567496724e-05, "loss": 2.2898, "step": 1240 }, { "epoch": 0.03217375345690653, "grad_norm": 3.5664470195770264, "learning_rate": 4.164482306684142e-05, "loss": 2.2786, "step": 1271 }, { "epoch": 0.032958479150977424, "grad_norm": 3.642627716064453, "learning_rate": 4.26605504587156e-05, "loss": 2.2439, "step": 1302 }, { "epoch": 0.033743204845048315, "grad_norm": 3.7562780380249023, "learning_rate": 4.367627785058978e-05, "loss": 2.2441, "step": 1333 }, { "epoch": 0.034527930539119206, "grad_norm": 3.3117406368255615, "learning_rate": 4.469200524246396e-05, "loss": 2.2604, "step": 1364 }, { "epoch": 0.0353126562331901, "grad_norm": 3.4313724040985107, "learning_rate": 4.570773263433814e-05, "loss": 2.2069, "step": 1395 }, { "epoch": 0.03609738192726099, "grad_norm": 3.4720091819763184, "learning_rate": 4.672346002621232e-05, "loss": 2.2087, "step": 1426 }, { "epoch": 0.03688210762133188, "grad_norm": 3.491856575012207, "learning_rate": 4.77391874180865e-05, "loss": 2.1808, "step": 1457 }, { "epoch": 0.03766683331540277, "grad_norm": 3.3730666637420654, "learning_rate": 4.875491480996068e-05, "loss": 2.1907, "step": 1488 }, { "epoch": 0.03845155900947366, "grad_norm": 2.894322395324707, "learning_rate": 4.977064220183487e-05, "loss": 2.1689, "step": 1519 }, { "epoch": 0.03923628470354455, "grad_norm": 3.195884943008423, "learning_rate": 4.9999915451558777e-05, "loss": 2.194, "step": 1550 }, { "epoch": 0.04002101039761544, "grad_norm": 3.154061794281006, "learning_rate": 4.999955597496219e-05, "loss": 2.1409, "step": 1581 }, { "epoch": 0.040805736091686334, "grad_norm": 2.8204188346862793, "learning_rate": 4.9998914381774255e-05, "loss": 2.145, "step": 1612 }, { "epoch": 0.041590461785757225, "grad_norm": 2.98260760307312, "learning_rate": 4.999799067923527e-05, "loss": 2.1523, "step": 1643 }, { "epoch": 0.042375187479828116, "grad_norm": 2.917949914932251, "learning_rate": 4.999678487776908e-05, "loss": 2.1221, "step": 1674 }, { "epoch": 0.04315991317389901, "grad_norm": 2.811469554901123, "learning_rate": 4.9995296990983006e-05, "loss": 2.1242, "step": 1705 }, { "epoch": 0.043944638867969905, "grad_norm": 3.067636728286743, "learning_rate": 4.999352703566763e-05, "loss": 2.1092, "step": 1736 }, { "epoch": 0.044729364562040796, "grad_norm": 2.6231868267059326, "learning_rate": 4.999147503179668e-05, "loss": 2.1018, "step": 1767 }, { "epoch": 0.04551409025611169, "grad_norm": 2.8247616291046143, "learning_rate": 4.998914100252672e-05, "loss": 2.074, "step": 1798 }, { "epoch": 0.04629881595018258, "grad_norm": 2.5960075855255127, "learning_rate": 4.998652497419696e-05, "loss": 2.0824, "step": 1829 }, { "epoch": 0.04708354164425347, "grad_norm": 2.7796943187713623, "learning_rate": 4.9983626976328927e-05, "loss": 2.0998, "step": 1860 }, { "epoch": 0.04786826733832436, "grad_norm": 2.49242901802063, "learning_rate": 4.998044704162613e-05, "loss": 2.0893, "step": 1891 }, { "epoch": 0.04865299303239525, "grad_norm": 2.4294378757476807, "learning_rate": 4.9976985205973705e-05, "loss": 2.0617, "step": 1922 }, { "epoch": 0.04943771872646614, "grad_norm": 2.553217649459839, "learning_rate": 4.997324150843799e-05, "loss": 2.0632, "step": 1953 }, { "epoch": 0.050222444420537034, "grad_norm": 2.6711318492889404, "learning_rate": 4.99692159912661e-05, "loss": 2.0445, "step": 1984 }, { "epoch": 0.051007170114607925, "grad_norm": 2.714432716369629, "learning_rate": 4.996490869988546e-05, "loss": 2.0185, "step": 2015 }, { "epoch": 0.051791895808678816, "grad_norm": 2.6516053676605225, "learning_rate": 4.996031968290326e-05, "loss": 2.057, "step": 2046 }, { "epoch": 0.05257662150274971, "grad_norm": 2.4798831939697266, "learning_rate": 4.995544899210594e-05, "loss": 2.0199, "step": 2077 }, { "epoch": 0.0533613471968206, "grad_norm": 2.5150041580200195, "learning_rate": 4.9950296682458583e-05, "loss": 2.0264, "step": 2108 }, { "epoch": 0.05414607289089149, "grad_norm": 2.637777805328369, "learning_rate": 4.994486281210429e-05, "loss": 2.0233, "step": 2139 }, { "epoch": 0.05493079858496238, "grad_norm": 2.330376148223877, "learning_rate": 4.9939147442363566e-05, "loss": 2.0201, "step": 2170 }, { "epoch": 0.05571552427903327, "grad_norm": 2.3436174392700195, "learning_rate": 4.9933150637733574e-05, "loss": 1.9865, "step": 2201 }, { "epoch": 0.05650024997310416, "grad_norm": 2.7756845951080322, "learning_rate": 4.992687246588743e-05, "loss": 1.9983, "step": 2232 }, { "epoch": 0.05728497566717505, "grad_norm": 2.1725504398345947, "learning_rate": 4.992031299767347e-05, "loss": 1.9689, "step": 2263 }, { "epoch": 0.058069701361245944, "grad_norm": 2.2163312435150146, "learning_rate": 4.9913472307114386e-05, "loss": 1.9829, "step": 2294 }, { "epoch": 0.058854427055316835, "grad_norm": 2.2829232215881348, "learning_rate": 4.9906350471406446e-05, "loss": 2.0142, "step": 2325 }, { "epoch": 0.059639152749387726, "grad_norm": 2.239596366882324, "learning_rate": 4.989894757091861e-05, "loss": 1.9697, "step": 2356 }, { "epoch": 0.06042387844345862, "grad_norm": 2.2926037311553955, "learning_rate": 4.989126368919158e-05, "loss": 1.9688, "step": 2387 }, { "epoch": 0.06120860413752951, "grad_norm": 10.08767032623291, "learning_rate": 4.988329891293693e-05, "loss": 1.9845, "step": 2418 }, { "epoch": 0.0619933298316004, "grad_norm": 2.2427194118499756, "learning_rate": 4.987505333203608e-05, "loss": 1.9744, "step": 2449 }, { "epoch": 0.06277805552567128, "grad_norm": 2.5111870765686035, "learning_rate": 4.9866527039539276e-05, "loss": 1.9526, "step": 2480 }, { "epoch": 0.06356278121974218, "grad_norm": 2.2100026607513428, "learning_rate": 4.9857720131664594e-05, "loss": 1.9826, "step": 2511 }, { "epoch": 0.06434750691381307, "grad_norm": 2.2112088203430176, "learning_rate": 4.9848632707796773e-05, "loss": 1.9698, "step": 2542 }, { "epoch": 0.06513223260788396, "grad_norm": 2.404014825820923, "learning_rate": 4.9839264870486155e-05, "loss": 1.9628, "step": 2573 }, { "epoch": 0.06591695830195485, "grad_norm": 2.526423692703247, "learning_rate": 4.9829616725447526e-05, "loss": 1.9481, "step": 2604 }, { "epoch": 0.06670168399602575, "grad_norm": 2.2506027221679688, "learning_rate": 4.981968838155888e-05, "loss": 1.9418, "step": 2635 }, { "epoch": 0.06748640969009663, "grad_norm": 2.4334371089935303, "learning_rate": 4.980947995086024e-05, "loss": 1.9423, "step": 2666 }, { "epoch": 0.06827113538416753, "grad_norm": 2.3028314113616943, "learning_rate": 4.979899154855234e-05, "loss": 1.9391, "step": 2697 }, { "epoch": 0.06905586107823841, "grad_norm": 2.122143030166626, "learning_rate": 4.9788223292995386e-05, "loss": 1.933, "step": 2728 }, { "epoch": 0.06984058677230931, "grad_norm": 2.1335129737854004, "learning_rate": 4.977717530570768e-05, "loss": 1.9212, "step": 2759 }, { "epoch": 0.0706253124663802, "grad_norm": 2.198650598526001, "learning_rate": 4.976584771136425e-05, "loss": 1.9217, "step": 2790 }, { "epoch": 0.07141003816045109, "grad_norm": 2.4985201358795166, "learning_rate": 4.975424063779547e-05, "loss": 1.9277, "step": 2821 }, { "epoch": 0.07219476385452198, "grad_norm": 1.9877598285675049, "learning_rate": 4.974235421598557e-05, "loss": 1.9278, "step": 2852 }, { "epoch": 0.07297948954859287, "grad_norm": 3.0082573890686035, "learning_rate": 4.973018858007122e-05, "loss": 1.9261, "step": 2883 }, { "epoch": 0.07376421524266376, "grad_norm": 2.139742851257324, "learning_rate": 4.9717743867339963e-05, "loss": 1.9168, "step": 2914 }, { "epoch": 0.07454894093673466, "grad_norm": 2.1748037338256836, "learning_rate": 4.9705020218228695e-05, "loss": 1.9132, "step": 2945 }, { "epoch": 0.07533366663080554, "grad_norm": 2.0570950508117676, "learning_rate": 4.969201777632205e-05, "loss": 1.9177, "step": 2976 }, { "epoch": 0.07611839232487644, "grad_norm": 1.9970216751098633, "learning_rate": 4.9678736688350846e-05, "loss": 1.9105, "step": 3007 }, { "epoch": 0.07690311801894732, "grad_norm": 1.9640527963638306, "learning_rate": 4.966517710419033e-05, "loss": 1.9084, "step": 3038 }, { "epoch": 0.07768784371301822, "grad_norm": 2.172874927520752, "learning_rate": 4.965133917685858e-05, "loss": 1.8995, "step": 3069 }, { "epoch": 0.0784725694070891, "grad_norm": 2.1881916522979736, "learning_rate": 4.9637223062514714e-05, "loss": 1.9019, "step": 3100 }, { "epoch": 0.07925729510116, "grad_norm": 1.975496530532837, "learning_rate": 4.962282892045718e-05, "loss": 1.8967, "step": 3131 }, { "epoch": 0.08004202079523089, "grad_norm": 2.0970685482025146, "learning_rate": 4.9608156913121904e-05, "loss": 1.8867, "step": 3162 }, { "epoch": 0.08082674648930178, "grad_norm": 2.096353769302368, "learning_rate": 4.959320720608049e-05, "loss": 1.8967, "step": 3193 }, { "epoch": 0.08161147218337267, "grad_norm": 1.998336911201477, "learning_rate": 4.9577979968038354e-05, "loss": 1.8876, "step": 3224 }, { "epoch": 0.08239619787744357, "grad_norm": 2.098055362701416, "learning_rate": 4.956247537083282e-05, "loss": 1.9, "step": 3255 }, { "epoch": 0.08318092357151445, "grad_norm": 2.0739505290985107, "learning_rate": 4.9546693589431145e-05, "loss": 1.8902, "step": 3286 }, { "epoch": 0.08396564926558535, "grad_norm": 1.9556243419647217, "learning_rate": 4.9530634801928595e-05, "loss": 1.888, "step": 3317 }, { "epoch": 0.08475037495965623, "grad_norm": 2.096874952316284, "learning_rate": 4.9514299189546395e-05, "loss": 1.8785, "step": 3348 }, { "epoch": 0.08553510065372713, "grad_norm": 1.9407072067260742, "learning_rate": 4.949768693662973e-05, "loss": 1.8646, "step": 3379 }, { "epoch": 0.08631982634779801, "grad_norm": 1.9928467273712158, "learning_rate": 4.948079823064559e-05, "loss": 1.8751, "step": 3410 }, { "epoch": 0.08710455204186891, "grad_norm": 1.9670037031173706, "learning_rate": 4.946363326218074e-05, "loss": 1.8831, "step": 3441 }, { "epoch": 0.08788927773593981, "grad_norm": 1.999193787574768, "learning_rate": 4.9446192224939525e-05, "loss": 1.8605, "step": 3472 }, { "epoch": 0.0886740034300107, "grad_norm": 1.9073724746704102, "learning_rate": 4.942847531574167e-05, "loss": 1.8576, "step": 3503 }, { "epoch": 0.08945872912408159, "grad_norm": 2.179824113845825, "learning_rate": 4.941048273452008e-05, "loss": 1.8682, "step": 3534 }, { "epoch": 0.09024345481815248, "grad_norm": 1.954990029335022, "learning_rate": 4.9392214684318605e-05, "loss": 1.8807, "step": 3565 }, { "epoch": 0.09102818051222338, "grad_norm": 1.7695640325546265, "learning_rate": 4.93736713712897e-05, "loss": 1.879, "step": 3596 }, { "epoch": 0.09181290620629426, "grad_norm": 1.7708550691604614, "learning_rate": 4.9354853004692124e-05, "loss": 1.8677, "step": 3627 }, { "epoch": 0.09259763190036516, "grad_norm": 1.9683934450149536, "learning_rate": 4.93357597968886e-05, "loss": 1.8595, "step": 3658 }, { "epoch": 0.09338235759443604, "grad_norm": 2.00441312789917, "learning_rate": 4.931639196334338e-05, "loss": 1.8462, "step": 3689 }, { "epoch": 0.09416708328850694, "grad_norm": 1.875543475151062, "learning_rate": 4.9296749722619826e-05, "loss": 1.8502, "step": 3720 }, { "epoch": 0.09495180898257782, "grad_norm": 1.932658314704895, "learning_rate": 4.9276833296377966e-05, "loss": 1.8457, "step": 3751 }, { "epoch": 0.09573653467664872, "grad_norm": 1.9957045316696167, "learning_rate": 4.925664290937196e-05, "loss": 1.843, "step": 3782 }, { "epoch": 0.0965212603707196, "grad_norm": 1.8579176664352417, "learning_rate": 4.9236178789447576e-05, "loss": 1.8504, "step": 3813 }, { "epoch": 0.0973059860647905, "grad_norm": 1.9646131992340088, "learning_rate": 4.921544116753962e-05, "loss": 1.8512, "step": 3844 }, { "epoch": 0.09809071175886139, "grad_norm": 1.8213136196136475, "learning_rate": 4.919443027766935e-05, "loss": 1.8618, "step": 3875 }, { "epoch": 0.09887543745293229, "grad_norm": 2.017280101776123, "learning_rate": 4.91731463569418e-05, "loss": 1.863, "step": 3906 }, { "epoch": 0.09966016314700317, "grad_norm": 1.9125665426254272, "learning_rate": 4.915158964554312e-05, "loss": 1.8259, "step": 3937 }, { "epoch": 0.10044488884107407, "grad_norm": 2.0414695739746094, "learning_rate": 4.912976038673786e-05, "loss": 1.8347, "step": 3968 }, { "epoch": 0.10122961453514495, "grad_norm": 1.7705485820770264, "learning_rate": 4.9107658826866254e-05, "loss": 1.8502, "step": 3999 }, { "epoch": 0.10201434022921585, "grad_norm": 1.8961102962493896, "learning_rate": 4.908528521534139e-05, "loss": 1.84, "step": 4030 }, { "epoch": 0.10279906592328673, "grad_norm": 1.784387230873108, "learning_rate": 4.906263980464644e-05, "loss": 1.842, "step": 4061 }, { "epoch": 0.10358379161735763, "grad_norm": 11.229472160339355, "learning_rate": 4.903972285033178e-05, "loss": 1.8476, "step": 4092 }, { "epoch": 0.10436851731142852, "grad_norm": 1.9657154083251953, "learning_rate": 4.901653461101213e-05, "loss": 1.8465, "step": 4123 }, { "epoch": 0.10515324300549941, "grad_norm": 1.7702244520187378, "learning_rate": 4.8993075348363626e-05, "loss": 1.8249, "step": 4154 }, { "epoch": 0.1059379686995703, "grad_norm": 1.8672112226486206, "learning_rate": 4.896934532712084e-05, "loss": 1.8232, "step": 4185 }, { "epoch": 0.1067226943936412, "grad_norm": 1.7806147336959839, "learning_rate": 4.8945344815073846e-05, "loss": 1.8256, "step": 4216 }, { "epoch": 0.10750742008771208, "grad_norm": 1.7830456495285034, "learning_rate": 4.892107408306516e-05, "loss": 1.8271, "step": 4247 }, { "epoch": 0.10829214578178298, "grad_norm": 1.96640944480896, "learning_rate": 4.889653340498669e-05, "loss": 1.82, "step": 4278 }, { "epoch": 0.10907687147585386, "grad_norm": 1.8224470615386963, "learning_rate": 4.8871723057776664e-05, "loss": 1.8216, "step": 4309 }, { "epoch": 0.10986159716992476, "grad_norm": 2.5164501667022705, "learning_rate": 4.8846643321416476e-05, "loss": 1.8252, "step": 4340 }, { "epoch": 0.11064632286399564, "grad_norm": 1.7248613834381104, "learning_rate": 4.882129447892753e-05, "loss": 1.8133, "step": 4371 }, { "epoch": 0.11143104855806654, "grad_norm": 2.060304880142212, "learning_rate": 4.8795676816368076e-05, "loss": 1.8282, "step": 4402 }, { "epoch": 0.11221577425213743, "grad_norm": 1.8709039688110352, "learning_rate": 4.876979062282995e-05, "loss": 1.8154, "step": 4433 }, { "epoch": 0.11300049994620832, "grad_norm": 1.7444674968719482, "learning_rate": 4.8743636190435325e-05, "loss": 1.8173, "step": 4464 }, { "epoch": 0.11378522564027921, "grad_norm": 1.7357319593429565, "learning_rate": 4.871721381433344e-05, "loss": 1.8351, "step": 4495 }, { "epoch": 0.1145699513343501, "grad_norm": 1.728070855140686, "learning_rate": 4.869052379269719e-05, "loss": 1.8119, "step": 4526 }, { "epoch": 0.11535467702842099, "grad_norm": 1.742035984992981, "learning_rate": 4.866356642671985e-05, "loss": 1.7967, "step": 4557 }, { "epoch": 0.11613940272249189, "grad_norm": 1.7010915279388428, "learning_rate": 4.8636342020611634e-05, "loss": 1.8004, "step": 4588 }, { "epoch": 0.11692412841656277, "grad_norm": 1.6775914430618286, "learning_rate": 4.860885088159626e-05, "loss": 1.8173, "step": 4619 }, { "epoch": 0.11770885411063367, "grad_norm": 1.9107964038848877, "learning_rate": 4.858109331990751e-05, "loss": 1.7984, "step": 4650 }, { "epoch": 0.11849357980470455, "grad_norm": 1.713429570198059, "learning_rate": 4.855306964878567e-05, "loss": 1.7967, "step": 4681 }, { "epoch": 0.11927830549877545, "grad_norm": 1.9373931884765625, "learning_rate": 4.8524780184474084e-05, "loss": 1.8072, "step": 4712 }, { "epoch": 0.12006303119284634, "grad_norm": 1.8975365161895752, "learning_rate": 4.8496225246215496e-05, "loss": 1.8121, "step": 4743 }, { "epoch": 0.12084775688691723, "grad_norm": 5.285326957702637, "learning_rate": 4.8467405156248505e-05, "loss": 1.8189, "step": 4774 }, { "epoch": 0.12163248258098812, "grad_norm": 1.7155263423919678, "learning_rate": 4.843832023980392e-05, "loss": 1.8093, "step": 4805 }, { "epoch": 0.12241720827505902, "grad_norm": 1.726831316947937, "learning_rate": 4.840897082510106e-05, "loss": 1.7952, "step": 4836 }, { "epoch": 0.1232019339691299, "grad_norm": 1.739639401435852, "learning_rate": 4.8379357243344084e-05, "loss": 1.8103, "step": 4867 }, { "epoch": 0.1239866596632008, "grad_norm": 1.6978296041488647, "learning_rate": 4.8349479828718236e-05, "loss": 1.8006, "step": 4898 }, { "epoch": 0.12477138535727168, "grad_norm": 1.7154194116592407, "learning_rate": 4.8319338918386075e-05, "loss": 1.7876, "step": 4929 }, { "epoch": 0.12555611105134257, "grad_norm": 1.6323316097259521, "learning_rate": 4.828893485248369e-05, "loss": 1.8159, "step": 4960 }, { "epoch": 0.12634083674541347, "grad_norm": 1.641784429550171, "learning_rate": 4.825826797411682e-05, "loss": 1.7959, "step": 4991 }, { "epoch": 0.12712556243948436, "grad_norm": 1.6947154998779297, "learning_rate": 4.822733862935702e-05, "loss": 1.7895, "step": 5022 }, { "epoch": 0.12791028813355526, "grad_norm": 1.6331220865249634, "learning_rate": 4.819614716723775e-05, "loss": 1.7707, "step": 5053 }, { "epoch": 0.12869501382762613, "grad_norm": 1.8207937479019165, "learning_rate": 4.8164693939750425e-05, "loss": 1.8123, "step": 5084 }, { "epoch": 0.12947973952169703, "grad_norm": 1.6664263010025024, "learning_rate": 4.813297930184042e-05, "loss": 1.8089, "step": 5115 }, { "epoch": 0.13026446521576793, "grad_norm": 1.9931398630142212, "learning_rate": 4.810100361140314e-05, "loss": 1.7757, "step": 5146 }, { "epoch": 0.13104919090983883, "grad_norm": 1.839200735092163, "learning_rate": 4.8068767229279885e-05, "loss": 1.7969, "step": 5177 }, { "epoch": 0.1318339166039097, "grad_norm": 1.781187653541565, "learning_rate": 4.8036270519253854e-05, "loss": 1.7937, "step": 5208 }, { "epoch": 0.1326186422979806, "grad_norm": 1.7144343852996826, "learning_rate": 4.8003513848046e-05, "loss": 1.7816, "step": 5239 }, { "epoch": 0.1334033679920515, "grad_norm": 1.6819554567337036, "learning_rate": 4.79704975853109e-05, "loss": 1.7851, "step": 5270 }, { "epoch": 0.1341880936861224, "grad_norm": 1.6748546361923218, "learning_rate": 4.793722210363262e-05, "loss": 1.7941, "step": 5301 }, { "epoch": 0.13497281938019326, "grad_norm": 1.615569829940796, "learning_rate": 4.7903687778520414e-05, "loss": 1.7799, "step": 5332 }, { "epoch": 0.13575754507426416, "grad_norm": 1.7959198951721191, "learning_rate": 4.7869894988404593e-05, "loss": 1.7802, "step": 5363 }, { "epoch": 0.13654227076833506, "grad_norm": 1.598946452140808, "learning_rate": 4.783584411463221e-05, "loss": 1.7929, "step": 5394 }, { "epoch": 0.13732699646240595, "grad_norm": 1.793511986732483, "learning_rate": 4.780153554146274e-05, "loss": 1.7591, "step": 5425 }, { "epoch": 0.13811172215647682, "grad_norm": 1.718671202659607, "learning_rate": 4.7766969656063766e-05, "loss": 1.7807, "step": 5456 }, { "epoch": 0.13889644785054772, "grad_norm": 1.6548669338226318, "learning_rate": 4.773214684850662e-05, "loss": 1.775, "step": 5487 }, { "epoch": 0.13968117354461862, "grad_norm": 1.6727256774902344, "learning_rate": 4.769706751176193e-05, "loss": 1.7756, "step": 5518 }, { "epoch": 0.14046589923868952, "grad_norm": 1.7169344425201416, "learning_rate": 4.7661732041695264e-05, "loss": 1.7887, "step": 5549 }, { "epoch": 0.1412506249327604, "grad_norm": 1.6376421451568604, "learning_rate": 4.762614083706258e-05, "loss": 1.7939, "step": 5580 }, { "epoch": 0.14203535062683129, "grad_norm": 1.7083207368850708, "learning_rate": 4.759029429950581e-05, "loss": 1.7705, "step": 5611 }, { "epoch": 0.14282007632090218, "grad_norm": 1.6359349489212036, "learning_rate": 4.7554192833548235e-05, "loss": 1.7732, "step": 5642 }, { "epoch": 0.14360480201497308, "grad_norm": 1.684005618095398, "learning_rate": 4.751783684659e-05, "loss": 1.7766, "step": 5673 }, { "epoch": 0.14438952770904395, "grad_norm": 1.7531359195709229, "learning_rate": 4.748122674890348e-05, "loss": 1.7815, "step": 5704 }, { "epoch": 0.14517425340311485, "grad_norm": 1.5898247957229614, "learning_rate": 4.7444362953628654e-05, "loss": 1.7837, "step": 5735 }, { "epoch": 0.14595897909718575, "grad_norm": 1.6781623363494873, "learning_rate": 4.7407245876768424e-05, "loss": 1.7381, "step": 5766 }, { "epoch": 0.14674370479125665, "grad_norm": 1.6126357316970825, "learning_rate": 4.736987593718397e-05, "loss": 1.7714, "step": 5797 }, { "epoch": 0.14752843048532752, "grad_norm": 1.6623587608337402, "learning_rate": 4.733225355658999e-05, "loss": 1.7625, "step": 5828 }, { "epoch": 0.14831315617939841, "grad_norm": 1.6715524196624756, "learning_rate": 4.7294379159549926e-05, "loss": 1.7631, "step": 5859 }, { "epoch": 0.1490978818734693, "grad_norm": 1.6739026308059692, "learning_rate": 4.725625317347119e-05, "loss": 1.775, "step": 5890 }, { "epoch": 0.1498826075675402, "grad_norm": 1.8141075372695923, "learning_rate": 4.7217876028600374e-05, "loss": 1.7881, "step": 5921 }, { "epoch": 0.15066733326161108, "grad_norm": 1.6842069625854492, "learning_rate": 4.717924815801832e-05, "loss": 1.7707, "step": 5952 }, { "epoch": 0.15145205895568198, "grad_norm": 1.7032698392868042, "learning_rate": 4.714036999763532e-05, "loss": 1.7631, "step": 5983 }, { "epoch": 0.15223678464975288, "grad_norm": 1.7856013774871826, "learning_rate": 4.7101241986186116e-05, "loss": 1.7545, "step": 6014 }, { "epoch": 0.15302151034382377, "grad_norm": 1.679623007774353, "learning_rate": 4.7061864565225e-05, "loss": 1.7676, "step": 6045 }, { "epoch": 0.15380623603789464, "grad_norm": 1.626792073249817, "learning_rate": 4.702223817912081e-05, "loss": 1.7434, "step": 6076 }, { "epoch": 0.15459096173196554, "grad_norm": 1.850042700767517, "learning_rate": 4.698236327505195e-05, "loss": 1.7805, "step": 6107 }, { "epoch": 0.15537568742603644, "grad_norm": 1.6403062343597412, "learning_rate": 4.694224030300127e-05, "loss": 1.7495, "step": 6138 }, { "epoch": 0.15616041312010734, "grad_norm": 1.5897477865219116, "learning_rate": 4.690186971575107e-05, "loss": 1.779, "step": 6169 }, { "epoch": 0.1569451388141782, "grad_norm": 1.8173433542251587, "learning_rate": 4.6861251968877916e-05, "loss": 1.7705, "step": 6200 }, { "epoch": 0.1577298645082491, "grad_norm": 1.788022756576538, "learning_rate": 4.68203875207476e-05, "loss": 1.7457, "step": 6231 }, { "epoch": 0.15851459020232, "grad_norm": 1.6219838857650757, "learning_rate": 4.677927683250983e-05, "loss": 1.7758, "step": 6262 }, { "epoch": 0.1592993158963909, "grad_norm": 1.678890347480774, "learning_rate": 4.6737920368093156e-05, "loss": 1.7394, "step": 6293 }, { "epoch": 0.16008404159046177, "grad_norm": 1.5719743967056274, "learning_rate": 4.669631859419965e-05, "loss": 1.7549, "step": 6324 }, { "epoch": 0.16086876728453267, "grad_norm": 1.6332769393920898, "learning_rate": 4.6654471980299676e-05, "loss": 1.7462, "step": 6355 }, { "epoch": 0.16165349297860357, "grad_norm": 1.6942561864852905, "learning_rate": 4.661238099862658e-05, "loss": 1.7506, "step": 6386 }, { "epoch": 0.16243821867267447, "grad_norm": 1.8173885345458984, "learning_rate": 4.657004612417138e-05, "loss": 1.7455, "step": 6417 }, { "epoch": 0.16322294436674534, "grad_norm": 1.6209042072296143, "learning_rate": 4.6527467834677374e-05, "loss": 1.7413, "step": 6448 }, { "epoch": 0.16400767006081624, "grad_norm": 1.5801094770431519, "learning_rate": 4.648464661063478e-05, "loss": 1.7491, "step": 6479 }, { "epoch": 0.16479239575488713, "grad_norm": 1.5499264001846313, "learning_rate": 4.6441582935275264e-05, "loss": 1.7276, "step": 6510 }, { "epoch": 0.16557712144895803, "grad_norm": 1.6154171228408813, "learning_rate": 4.6398277294566586e-05, "loss": 1.7816, "step": 6541 }, { "epoch": 0.1663618471430289, "grad_norm": 1.5633410215377808, "learning_rate": 4.6354730177207e-05, "loss": 1.7447, "step": 6572 }, { "epoch": 0.1671465728370998, "grad_norm": 1.7070655822753906, "learning_rate": 4.6310942074619787e-05, "loss": 1.7477, "step": 6603 }, { "epoch": 0.1679312985311707, "grad_norm": 1.7502373456954956, "learning_rate": 4.626691348094777e-05, "loss": 1.74, "step": 6634 }, { "epoch": 0.1687160242252416, "grad_norm": 1.9541263580322266, "learning_rate": 4.622264489304762e-05, "loss": 1.7389, "step": 6665 }, { "epoch": 0.16950074991931247, "grad_norm": 1.64599609375, "learning_rate": 4.617813681048434e-05, "loss": 1.7445, "step": 6696 }, { "epoch": 0.17028547561338336, "grad_norm": 1.9360859394073486, "learning_rate": 4.61333897355256e-05, "loss": 1.73, "step": 6727 }, { "epoch": 0.17107020130745426, "grad_norm": 1.693892240524292, "learning_rate": 4.608840417313604e-05, "loss": 1.7229, "step": 6758 }, { "epoch": 0.17185492700152516, "grad_norm": 1.6243150234222412, "learning_rate": 4.6043180630971646e-05, "loss": 1.7421, "step": 6789 }, { "epoch": 0.17263965269559603, "grad_norm": 1.5926107168197632, "learning_rate": 4.599771961937391e-05, "loss": 1.7447, "step": 6820 }, { "epoch": 0.17342437838966693, "grad_norm": 1.695167064666748, "learning_rate": 4.5952021651364204e-05, "loss": 1.7463, "step": 6851 }, { "epoch": 0.17420910408373783, "grad_norm": 1.5915182828903198, "learning_rate": 4.590608724263786e-05, "loss": 1.7198, "step": 6882 }, { "epoch": 0.17499382977780872, "grad_norm": 1.6135920286178589, "learning_rate": 4.585991691155845e-05, "loss": 1.7233, "step": 6913 }, { "epoch": 0.17577855547187962, "grad_norm": 1.5855350494384766, "learning_rate": 4.581351117915188e-05, "loss": 1.7519, "step": 6944 }, { "epoch": 0.1765632811659505, "grad_norm": 1.5782060623168945, "learning_rate": 4.5766870569100534e-05, "loss": 1.729, "step": 6975 }, { "epoch": 0.1773480068600214, "grad_norm": 1.4931174516677856, "learning_rate": 4.571999560773736e-05, "loss": 1.7197, "step": 7006 }, { "epoch": 0.1781327325540923, "grad_norm": 1.809645414352417, "learning_rate": 4.5672886824039915e-05, "loss": 1.7409, "step": 7037 }, { "epoch": 0.17891745824816319, "grad_norm": 1.544233798980713, "learning_rate": 4.5625544749624435e-05, "loss": 1.7331, "step": 7068 }, { "epoch": 0.17970218394223406, "grad_norm": 1.5316941738128662, "learning_rate": 4.5577969918739794e-05, "loss": 1.7245, "step": 7099 }, { "epoch": 0.18048690963630495, "grad_norm": 1.4646427631378174, "learning_rate": 4.5530162868261486e-05, "loss": 1.7341, "step": 7130 }, { "epoch": 0.18127163533037585, "grad_norm": 1.6266372203826904, "learning_rate": 4.548212413768558e-05, "loss": 1.7311, "step": 7161 }, { "epoch": 0.18205636102444675, "grad_norm": 1.6372709274291992, "learning_rate": 4.543385426912261e-05, "loss": 1.7344, "step": 7192 }, { "epoch": 0.18284108671851762, "grad_norm": 1.642005443572998, "learning_rate": 4.53853538072915e-05, "loss": 1.7472, "step": 7223 }, { "epoch": 0.18362581241258852, "grad_norm": 1.7344322204589844, "learning_rate": 4.533662329951336e-05, "loss": 1.7379, "step": 7254 }, { "epoch": 0.18441053810665942, "grad_norm": 1.6593672037124634, "learning_rate": 4.528766329570536e-05, "loss": 1.7363, "step": 7285 }, { "epoch": 0.18519526380073031, "grad_norm": 1.590846300125122, "learning_rate": 4.523847434837447e-05, "loss": 1.7432, "step": 7316 }, { "epoch": 0.18597998949480118, "grad_norm": 1.6701788902282715, "learning_rate": 4.518905701261128e-05, "loss": 1.7287, "step": 7347 }, { "epoch": 0.18676471518887208, "grad_norm": 1.6129958629608154, "learning_rate": 4.5139411846083715e-05, "loss": 1.7252, "step": 7378 }, { "epoch": 0.18754944088294298, "grad_norm": 1.5602383613586426, "learning_rate": 4.508953940903073e-05, "loss": 1.7365, "step": 7409 }, { "epoch": 0.18833416657701388, "grad_norm": 1.60308039188385, "learning_rate": 4.5039440264255994e-05, "loss": 1.7361, "step": 7440 }, { "epoch": 0.18911889227108475, "grad_norm": 1.588299036026001, "learning_rate": 4.498911497712155e-05, "loss": 1.7574, "step": 7471 }, { "epoch": 0.18990361796515565, "grad_norm": 1.5599571466445923, "learning_rate": 4.493856411554142e-05, "loss": 1.738, "step": 7502 }, { "epoch": 0.19068834365922654, "grad_norm": 1.5749436616897583, "learning_rate": 4.4887788249975206e-05, "loss": 1.7272, "step": 7533 }, { "epoch": 0.19147306935329744, "grad_norm": 1.5536047220230103, "learning_rate": 4.4836787953421656e-05, "loss": 1.7249, "step": 7564 }, { "epoch": 0.1922577950473683, "grad_norm": 1.5227411985397339, "learning_rate": 4.478556380141218e-05, "loss": 1.7137, "step": 7595 }, { "epoch": 0.1930425207414392, "grad_norm": 1.5771219730377197, "learning_rate": 4.4734116372004375e-05, "loss": 1.7094, "step": 7626 }, { "epoch": 0.1938272464355101, "grad_norm": 1.4533522129058838, "learning_rate": 4.4682446245775477e-05, "loss": 1.7493, "step": 7657 }, { "epoch": 0.194611972129581, "grad_norm": 1.5640264749526978, "learning_rate": 4.463055400581586e-05, "loss": 1.7228, "step": 7688 }, { "epoch": 0.19539669782365188, "grad_norm": 1.4606215953826904, "learning_rate": 4.4578440237722374e-05, "loss": 1.7414, "step": 7719 }, { "epoch": 0.19618142351772277, "grad_norm": 1.5216374397277832, "learning_rate": 4.452610552959183e-05, "loss": 1.7155, "step": 7750 }, { "epoch": 0.19696614921179367, "grad_norm": 1.683119535446167, "learning_rate": 4.447355047201428e-05, "loss": 1.7346, "step": 7781 }, { "epoch": 0.19775087490586457, "grad_norm": 1.6055350303649902, "learning_rate": 4.4420775658066414e-05, "loss": 1.7112, "step": 7812 }, { "epoch": 0.19853560059993544, "grad_norm": 1.514739751815796, "learning_rate": 4.436778168330484e-05, "loss": 1.7274, "step": 7843 }, { "epoch": 0.19932032629400634, "grad_norm": 2.131218433380127, "learning_rate": 4.4314569145759353e-05, "loss": 1.7127, "step": 7874 }, { "epoch": 0.20010505198807724, "grad_norm": 1.4867665767669678, "learning_rate": 4.42611386459262e-05, "loss": 1.7245, "step": 7905 }, { "epoch": 0.20088977768214814, "grad_norm": 1.6395418643951416, "learning_rate": 4.420749078676133e-05, "loss": 1.7146, "step": 7936 }, { "epoch": 0.201674503376219, "grad_norm": 1.629939079284668, "learning_rate": 4.4153626173673516e-05, "loss": 1.7153, "step": 7967 }, { "epoch": 0.2024592290702899, "grad_norm": 1.5973584651947021, "learning_rate": 4.409954541451762e-05, "loss": 1.7102, "step": 7998 }, { "epoch": 0.2032439547643608, "grad_norm": 1.4822708368301392, "learning_rate": 4.404524911958764e-05, "loss": 1.7046, "step": 8029 }, { "epoch": 0.2040286804584317, "grad_norm": 1.4706634283065796, "learning_rate": 4.399073790160989e-05, "loss": 1.7022, "step": 8060 }, { "epoch": 0.20481340615250257, "grad_norm": 1.5917459726333618, "learning_rate": 4.393601237573607e-05, "loss": 1.6983, "step": 8091 }, { "epoch": 0.20559813184657347, "grad_norm": 1.7328417301177979, "learning_rate": 4.388107315953628e-05, "loss": 1.7164, "step": 8122 }, { "epoch": 0.20638285754064437, "grad_norm": 1.6152797937393188, "learning_rate": 4.382592087299212e-05, "loss": 1.7302, "step": 8153 }, { "epoch": 0.20716758323471526, "grad_norm": 1.7153429985046387, "learning_rate": 4.377055613848964e-05, "loss": 1.7278, "step": 8184 }, { "epoch": 0.20795230892878613, "grad_norm": 1.7167855501174927, "learning_rate": 4.3714979580812355e-05, "loss": 1.7021, "step": 8215 }, { "epoch": 0.20873703462285703, "grad_norm": 1.458811640739441, "learning_rate": 4.365919182713416e-05, "loss": 1.7099, "step": 8246 }, { "epoch": 0.20952176031692793, "grad_norm": 5.516291618347168, "learning_rate": 4.360319350701226e-05, "loss": 1.7069, "step": 8277 }, { "epoch": 0.21030648601099883, "grad_norm": 1.5669766664505005, "learning_rate": 4.3546985252380115e-05, "loss": 1.6983, "step": 8308 }, { "epoch": 0.2110912117050697, "grad_norm": 1.4598067998886108, "learning_rate": 4.349056769754021e-05, "loss": 1.7265, "step": 8339 }, { "epoch": 0.2118759373991406, "grad_norm": 1.5436547994613647, "learning_rate": 4.3433941479156994e-05, "loss": 1.7128, "step": 8370 }, { "epoch": 0.2126606630932115, "grad_norm": 1.6275660991668701, "learning_rate": 4.3377107236249647e-05, "loss": 1.7229, "step": 8401 }, { "epoch": 0.2134453887872824, "grad_norm": 1.6207513809204102, "learning_rate": 4.332006561018488e-05, "loss": 1.702, "step": 8432 }, { "epoch": 0.21423011448135326, "grad_norm": 1.6795597076416016, "learning_rate": 4.3262817244669683e-05, "loss": 1.6808, "step": 8463 }, { "epoch": 0.21501484017542416, "grad_norm": 1.660192608833313, "learning_rate": 4.3205362785744083e-05, "loss": 1.7071, "step": 8494 }, { "epoch": 0.21579956586949506, "grad_norm": 1.6086353063583374, "learning_rate": 4.314770288177384e-05, "loss": 1.7083, "step": 8525 }, { "epoch": 0.21658429156356596, "grad_norm": 1.475216269493103, "learning_rate": 4.308983818344313e-05, "loss": 1.7234, "step": 8556 }, { "epoch": 0.21736901725763683, "grad_norm": 1.7111340761184692, "learning_rate": 4.3031769343747206e-05, "loss": 1.6872, "step": 8587 }, { "epoch": 0.21815374295170772, "grad_norm": 1.4544799327850342, "learning_rate": 4.297349701798505e-05, "loss": 1.692, "step": 8618 }, { "epoch": 0.21893846864577862, "grad_norm": 1.6593588590621948, "learning_rate": 4.2915021863751916e-05, "loss": 1.6886, "step": 8649 }, { "epoch": 0.21972319433984952, "grad_norm": 1.641408085823059, "learning_rate": 4.285634454093198e-05, "loss": 1.6872, "step": 8680 }, { "epoch": 0.2205079200339204, "grad_norm": 1.6036972999572754, "learning_rate": 4.279746571169086e-05, "loss": 1.7055, "step": 8711 }, { "epoch": 0.2212926457279913, "grad_norm": 1.4984327554702759, "learning_rate": 4.2738386040468136e-05, "loss": 1.6997, "step": 8742 }, { "epoch": 0.2220773714220622, "grad_norm": 1.471111536026001, "learning_rate": 4.2679106193969866e-05, "loss": 1.6926, "step": 8773 }, { "epoch": 0.22286209711613308, "grad_norm": 1.521364688873291, "learning_rate": 4.261962684116106e-05, "loss": 1.6851, "step": 8804 }, { "epoch": 0.22364682281020395, "grad_norm": 1.6068321466445923, "learning_rate": 4.2559948653258145e-05, "loss": 1.7113, "step": 8835 }, { "epoch": 0.22443154850427485, "grad_norm": 1.453379511833191, "learning_rate": 4.250007230372134e-05, "loss": 1.7025, "step": 8866 }, { "epoch": 0.22521627419834575, "grad_norm": 1.5845959186553955, "learning_rate": 4.2439998468247126e-05, "loss": 1.6978, "step": 8897 }, { "epoch": 0.22600099989241665, "grad_norm": 1.5308622121810913, "learning_rate": 4.2379727824760566e-05, "loss": 1.6956, "step": 8928 }, { "epoch": 0.22678572558648752, "grad_norm": 1.6339962482452393, "learning_rate": 4.231926105340768e-05, "loss": 1.6831, "step": 8959 }, { "epoch": 0.22757045128055842, "grad_norm": 1.4533487558364868, "learning_rate": 4.225859883654776e-05, "loss": 1.7025, "step": 8990 }, { "epoch": 0.22835517697462931, "grad_norm": 3.971897840499878, "learning_rate": 4.219774185874569e-05, "loss": 1.689, "step": 9021 }, { "epoch": 0.2291399026687002, "grad_norm": 1.4394114017486572, "learning_rate": 4.213669080676418e-05, "loss": 1.6841, "step": 9052 }, { "epoch": 0.22992462836277108, "grad_norm": 1.821142315864563, "learning_rate": 4.2075446369556056e-05, "loss": 1.6883, "step": 9083 }, { "epoch": 0.23070935405684198, "grad_norm": 1.6653649806976318, "learning_rate": 4.201400923825648e-05, "loss": 1.7011, "step": 9114 }, { "epoch": 0.23149407975091288, "grad_norm": 1.5895901918411255, "learning_rate": 4.195238010617511e-05, "loss": 1.7004, "step": 9145 }, { "epoch": 0.23227880544498378, "grad_norm": 1.4648844003677368, "learning_rate": 4.1890559668788344e-05, "loss": 1.6872, "step": 9176 }, { "epoch": 0.23306353113905465, "grad_norm": 1.5886753797531128, "learning_rate": 4.1828548623731405e-05, "loss": 1.6851, "step": 9207 }, { "epoch": 0.23384825683312555, "grad_norm": 1.4713412523269653, "learning_rate": 4.1766347670790506e-05, "loss": 1.6818, "step": 9238 }, { "epoch": 0.23463298252719644, "grad_norm": 1.5660710334777832, "learning_rate": 4.170395751189495e-05, "loss": 1.6844, "step": 9269 }, { "epoch": 0.23541770822126734, "grad_norm": 1.7024312019348145, "learning_rate": 4.164137885110921e-05, "loss": 1.6839, "step": 9300 }, { "epoch": 0.2362024339153382, "grad_norm": 1.5936214923858643, "learning_rate": 4.157861239462495e-05, "loss": 1.6953, "step": 9331 }, { "epoch": 0.2369871596094091, "grad_norm": 1.4709779024124146, "learning_rate": 4.1515658850753114e-05, "loss": 1.6806, "step": 9362 }, { "epoch": 0.23777188530348, "grad_norm": 1.4303510189056396, "learning_rate": 4.145251892991588e-05, "loss": 1.6792, "step": 9393 }, { "epoch": 0.2385566109975509, "grad_norm": 1.5452120304107666, "learning_rate": 4.138919334463868e-05, "loss": 1.6712, "step": 9424 }, { "epoch": 0.23934133669162178, "grad_norm": 1.4944697618484497, "learning_rate": 4.1325682809542124e-05, "loss": 1.6777, "step": 9455 }, { "epoch": 0.24012606238569267, "grad_norm": 1.6359312534332275, "learning_rate": 4.126198804133398e-05, "loss": 1.6782, "step": 9486 }, { "epoch": 0.24091078807976357, "grad_norm": 1.3874454498291016, "learning_rate": 4.1198109758801055e-05, "loss": 1.6805, "step": 9517 }, { "epoch": 0.24169551377383447, "grad_norm": 1.4747340679168701, "learning_rate": 4.113404868280107e-05, "loss": 1.6704, "step": 9548 }, { "epoch": 0.24248023946790534, "grad_norm": 1.95576012134552, "learning_rate": 4.106980553625457e-05, "loss": 1.7008, "step": 9579 }, { "epoch": 0.24326496516197624, "grad_norm": 1.454005479812622, "learning_rate": 4.100538104413674e-05, "loss": 1.6771, "step": 9610 }, { "epoch": 0.24404969085604714, "grad_norm": 1.5640463829040527, "learning_rate": 4.09407759334692e-05, "loss": 1.6763, "step": 9641 }, { "epoch": 0.24483441655011803, "grad_norm": 1.5076780319213867, "learning_rate": 4.087599093331186e-05, "loss": 1.6977, "step": 9672 }, { "epoch": 0.2456191422441889, "grad_norm": 1.5072520971298218, "learning_rate": 4.081102677475462e-05, "loss": 1.6749, "step": 9703 }, { "epoch": 0.2464038679382598, "grad_norm": 1.6311815977096558, "learning_rate": 4.0745884190909194e-05, "loss": 1.684, "step": 9734 }, { "epoch": 0.2471885936323307, "grad_norm": 1.5691202878952026, "learning_rate": 4.0680563916900796e-05, "loss": 1.6804, "step": 9765 }, { "epoch": 0.2479733193264016, "grad_norm": 1.4325530529022217, "learning_rate": 4.0615066689859815e-05, "loss": 1.719, "step": 9796 }, { "epoch": 0.24875804502047247, "grad_norm": 1.439177393913269, "learning_rate": 4.0549393248913584e-05, "loss": 1.6873, "step": 9827 }, { "epoch": 0.24954277071454337, "grad_norm": 1.4155471324920654, "learning_rate": 4.048354433517794e-05, "loss": 1.692, "step": 9858 }, { "epoch": 0.25032749640861424, "grad_norm": 1.5917115211486816, "learning_rate": 4.0417520691748916e-05, "loss": 1.6752, "step": 9889 }, { "epoch": 0.25111222210268513, "grad_norm": 1.649154543876648, "learning_rate": 4.035132306369438e-05, "loss": 1.6603, "step": 9920 }, { "epoch": 0.25189694779675603, "grad_norm": 1.5114792585372925, "learning_rate": 4.028495219804555e-05, "loss": 1.7005, "step": 9951 }, { "epoch": 0.25268167349082693, "grad_norm": 16.910812377929688, "learning_rate": 4.021840884378864e-05, "loss": 1.6846, "step": 9982 }, { "epoch": 0.25346639918489783, "grad_norm": 1.4342628717422485, "learning_rate": 4.015169375185633e-05, "loss": 1.6678, "step": 10013 }, { "epoch": 0.2542511248789687, "grad_norm": 1.4815376996994019, "learning_rate": 4.0084807675119396e-05, "loss": 1.671, "step": 10044 }, { "epoch": 0.2550358505730396, "grad_norm": 1.4633368253707886, "learning_rate": 4.0017751368378106e-05, "loss": 1.6824, "step": 10075 }, { "epoch": 0.2558205762671105, "grad_norm": 1.3904149532318115, "learning_rate": 3.995052558835377e-05, "loss": 1.6775, "step": 10106 }, { "epoch": 0.25660530196118136, "grad_norm": 1.5234646797180176, "learning_rate": 3.988313109368017e-05, "loss": 1.6854, "step": 10137 }, { "epoch": 0.25739002765525226, "grad_norm": 1.4530494213104248, "learning_rate": 3.981556864489504e-05, "loss": 1.6727, "step": 10168 }, { "epoch": 0.25817475334932316, "grad_norm": 1.5600273609161377, "learning_rate": 3.974783900443142e-05, "loss": 1.6645, "step": 10199 }, { "epoch": 0.25895947904339406, "grad_norm": 1.4213160276412964, "learning_rate": 3.9679942936609095e-05, "loss": 1.6898, "step": 10230 }, { "epoch": 0.25974420473746496, "grad_norm": 1.5741041898727417, "learning_rate": 3.961188120762596e-05, "loss": 1.693, "step": 10261 }, { "epoch": 0.26052893043153585, "grad_norm": 1.564493179321289, "learning_rate": 3.954365458554938e-05, "loss": 1.6836, "step": 10292 }, { "epoch": 0.26131365612560675, "grad_norm": 1.5584787130355835, "learning_rate": 3.947526384030751e-05, "loss": 1.6852, "step": 10323 }, { "epoch": 0.26209838181967765, "grad_norm": 1.4936350584030151, "learning_rate": 3.9406709743680624e-05, "loss": 1.6777, "step": 10354 }, { "epoch": 0.26288310751374855, "grad_norm": 1.504725694656372, "learning_rate": 3.9337993069292366e-05, "loss": 1.6765, "step": 10385 }, { "epoch": 0.2636678332078194, "grad_norm": 1.4809914827346802, "learning_rate": 3.926911459260109e-05, "loss": 1.6578, "step": 10416 }, { "epoch": 0.2644525589018903, "grad_norm": 1.529976725578308, "learning_rate": 3.920007509089102e-05, "loss": 1.6709, "step": 10447 }, { "epoch": 0.2652372845959612, "grad_norm": 1.483694076538086, "learning_rate": 3.913087534326357e-05, "loss": 1.6713, "step": 10478 }, { "epoch": 0.2660220102900321, "grad_norm": 1.4282972812652588, "learning_rate": 3.9061516130628475e-05, "loss": 1.6784, "step": 10509 }, { "epoch": 0.266806735984103, "grad_norm": 1.5122032165527344, "learning_rate": 3.8991998235695025e-05, "loss": 1.6603, "step": 10540 }, { "epoch": 0.2675914616781739, "grad_norm": 1.5154742002487183, "learning_rate": 3.8922322442963224e-05, "loss": 1.6831, "step": 10571 }, { "epoch": 0.2683761873722448, "grad_norm": 1.4630860090255737, "learning_rate": 3.885248953871491e-05, "loss": 1.6715, "step": 10602 }, { "epoch": 0.2691609130663157, "grad_norm": 1.4164702892303467, "learning_rate": 3.8782500311004915e-05, "loss": 1.6654, "step": 10633 }, { "epoch": 0.2699456387603865, "grad_norm": 1.5865578651428223, "learning_rate": 3.871235554965218e-05, "loss": 1.6829, "step": 10664 }, { "epoch": 0.2707303644544574, "grad_norm": 1.4984766244888306, "learning_rate": 3.864205604623078e-05, "loss": 1.673, "step": 10695 }, { "epoch": 0.2715150901485283, "grad_norm": 1.5477566719055176, "learning_rate": 3.857160259406107e-05, "loss": 1.6711, "step": 10726 }, { "epoch": 0.2722998158425992, "grad_norm": 1.5356842279434204, "learning_rate": 3.8500995988200674e-05, "loss": 1.6556, "step": 10757 }, { "epoch": 0.2730845415366701, "grad_norm": 1.413104772567749, "learning_rate": 3.843023702543556e-05, "loss": 1.658, "step": 10788 }, { "epoch": 0.273869267230741, "grad_norm": 1.5174081325531006, "learning_rate": 3.8359326504270984e-05, "loss": 1.6672, "step": 10819 }, { "epoch": 0.2746539929248119, "grad_norm": 1.4649910926818848, "learning_rate": 3.828826522492255e-05, "loss": 1.6625, "step": 10850 }, { "epoch": 0.2754387186188828, "grad_norm": 1.5240408182144165, "learning_rate": 3.821705398930713e-05, "loss": 1.6619, "step": 10881 }, { "epoch": 0.27622344431295365, "grad_norm": 1.4349104166030884, "learning_rate": 3.814569360103385e-05, "loss": 1.6595, "step": 10912 }, { "epoch": 0.27700817000702455, "grad_norm": 1.4311225414276123, "learning_rate": 3.807418486539499e-05, "loss": 1.6557, "step": 10943 }, { "epoch": 0.27779289570109544, "grad_norm": 1.5817755460739136, "learning_rate": 3.80025285893569e-05, "loss": 1.6882, "step": 10974 }, { "epoch": 0.27857762139516634, "grad_norm": 1.5182181596755981, "learning_rate": 3.793072558155093e-05, "loss": 1.6697, "step": 11005 }, { "epoch": 0.27936234708923724, "grad_norm": 1.4836517572402954, "learning_rate": 3.785877665226426e-05, "loss": 1.6576, "step": 11036 }, { "epoch": 0.28014707278330814, "grad_norm": 1.460788607597351, "learning_rate": 3.778668261343079e-05, "loss": 1.6607, "step": 11067 }, { "epoch": 0.28093179847737904, "grad_norm": 1.4307125806808472, "learning_rate": 3.771444427862192e-05, "loss": 1.662, "step": 11098 }, { "epoch": 0.28171652417144993, "grad_norm": 1.4999738931655884, "learning_rate": 3.7642062463037465e-05, "loss": 1.6406, "step": 11129 }, { "epoch": 0.2825012498655208, "grad_norm": 1.4646129608154297, "learning_rate": 3.7569537983496373e-05, "loss": 1.6653, "step": 11160 }, { "epoch": 0.2832859755595917, "grad_norm": 1.4709292650222778, "learning_rate": 3.749687165842753e-05, "loss": 1.6704, "step": 11191 }, { "epoch": 0.28407070125366257, "grad_norm": 1.494458556175232, "learning_rate": 3.7424064307860536e-05, "loss": 1.6534, "step": 11222 }, { "epoch": 0.28485542694773347, "grad_norm": 1.4409736394882202, "learning_rate": 3.735111675341645e-05, "loss": 1.6645, "step": 11253 }, { "epoch": 0.28564015264180437, "grad_norm": 1.4628338813781738, "learning_rate": 3.7278029818298524e-05, "loss": 1.6611, "step": 11284 }, { "epoch": 0.28642487833587527, "grad_norm": 1.3659113645553589, "learning_rate": 3.720480432728287e-05, "loss": 1.6435, "step": 11315 }, { "epoch": 0.28720960402994616, "grad_norm": 1.3704752922058105, "learning_rate": 3.71314411067092e-05, "loss": 1.6507, "step": 11346 }, { "epoch": 0.28799432972401706, "grad_norm": 1.579837441444397, "learning_rate": 3.70579409844715e-05, "loss": 1.6716, "step": 11377 }, { "epoch": 0.2887790554180879, "grad_norm": 1.5566996335983276, "learning_rate": 3.698430479000865e-05, "loss": 1.6439, "step": 11408 }, { "epoch": 0.2895637811121588, "grad_norm": 1.4722687005996704, "learning_rate": 3.691053335429509e-05, "loss": 1.683, "step": 11439 }, { "epoch": 0.2903485068062297, "grad_norm": 1.491283893585205, "learning_rate": 3.683662750983147e-05, "loss": 1.6606, "step": 11470 }, { "epoch": 0.2911332325003006, "grad_norm": 1.402040719985962, "learning_rate": 3.676258809063518e-05, "loss": 1.6582, "step": 11501 }, { "epoch": 0.2919179581943715, "grad_norm": 1.4377038478851318, "learning_rate": 3.6688415932231004e-05, "loss": 1.6398, "step": 11532 }, { "epoch": 0.2927026838884424, "grad_norm": 1.4151259660720825, "learning_rate": 3.661411187164166e-05, "loss": 1.6645, "step": 11563 }, { "epoch": 0.2934874095825133, "grad_norm": 1.5219615697860718, "learning_rate": 3.65396767473784e-05, "loss": 1.6705, "step": 11594 }, { "epoch": 0.2942721352765842, "grad_norm": 1.533252239227295, "learning_rate": 3.6465111399431465e-05, "loss": 1.6714, "step": 11625 }, { "epoch": 0.29505686097065503, "grad_norm": 1.410959243774414, "learning_rate": 3.6390416669260674e-05, "loss": 1.6533, "step": 11656 }, { "epoch": 0.29584158666472593, "grad_norm": 1.5377541780471802, "learning_rate": 3.63155933997859e-05, "loss": 1.6505, "step": 11687 }, { "epoch": 0.29662631235879683, "grad_norm": 1.4504135847091675, "learning_rate": 3.624064243537758e-05, "loss": 1.6287, "step": 11718 }, { "epoch": 0.2974110380528677, "grad_norm": 1.4606986045837402, "learning_rate": 3.616556462184716e-05, "loss": 1.6592, "step": 11749 }, { "epoch": 0.2981957637469386, "grad_norm": 1.4440289735794067, "learning_rate": 3.609036080643755e-05, "loss": 1.6598, "step": 11780 }, { "epoch": 0.2989804894410095, "grad_norm": 1.5399249792099, "learning_rate": 3.60150318378136e-05, "loss": 1.6852, "step": 11811 }, { "epoch": 0.2997652151350804, "grad_norm": 1.4778543710708618, "learning_rate": 3.5939578566052465e-05, "loss": 1.6462, "step": 11842 }, { "epoch": 0.3005499408291513, "grad_norm": 1.4979726076126099, "learning_rate": 3.586400184263408e-05, "loss": 1.6576, "step": 11873 }, { "epoch": 0.30133466652322216, "grad_norm": 1.4904232025146484, "learning_rate": 3.578830252043148e-05, "loss": 1.6476, "step": 11904 }, { "epoch": 0.30211939221729306, "grad_norm": 1.5472886562347412, "learning_rate": 3.571248145370125e-05, "loss": 1.6721, "step": 11935 }, { "epoch": 0.30290411791136396, "grad_norm": 1.4954209327697754, "learning_rate": 3.5636539498073794e-05, "loss": 1.6483, "step": 11966 }, { "epoch": 0.30368884360543486, "grad_norm": 1.4504363536834717, "learning_rate": 3.556047751054378e-05, "loss": 1.657, "step": 11997 }, { "epoch": 0.30447356929950575, "grad_norm": 1.3581033945083618, "learning_rate": 3.548429634946039e-05, "loss": 1.6579, "step": 12028 }, { "epoch": 0.30525829499357665, "grad_norm": 1.4421014785766602, "learning_rate": 3.540799687451768e-05, "loss": 1.6496, "step": 12059 }, { "epoch": 0.30604302068764755, "grad_norm": 1.523169994354248, "learning_rate": 3.533157994674485e-05, "loss": 1.6714, "step": 12090 }, { "epoch": 0.30682774638171845, "grad_norm": 1.455269455909729, "learning_rate": 3.5255046428496546e-05, "loss": 1.6695, "step": 12121 }, { "epoch": 0.3076124720757893, "grad_norm": 1.4330891370773315, "learning_rate": 3.517839718344311e-05, "loss": 1.6519, "step": 12152 }, { "epoch": 0.3083971977698602, "grad_norm": 1.3913158178329468, "learning_rate": 3.510163307656086e-05, "loss": 1.6329, "step": 12183 }, { "epoch": 0.3091819234639311, "grad_norm": 1.355193018913269, "learning_rate": 3.5024754974122324e-05, "loss": 1.624, "step": 12214 }, { "epoch": 0.309966649158002, "grad_norm": 1.4055231809616089, "learning_rate": 3.494776374368643e-05, "loss": 1.6491, "step": 12245 }, { "epoch": 0.3107513748520729, "grad_norm": 1.4227032661437988, "learning_rate": 3.4870660254088724e-05, "loss": 1.6274, "step": 12276 }, { "epoch": 0.3115361005461438, "grad_norm": 1.4558427333831787, "learning_rate": 3.479344537543164e-05, "loss": 1.6419, "step": 12307 }, { "epoch": 0.3123208262402147, "grad_norm": 1.5154629945755005, "learning_rate": 3.4716119979074565e-05, "loss": 1.6443, "step": 12338 }, { "epoch": 0.3131055519342856, "grad_norm": 1.4458774328231812, "learning_rate": 3.463868493762412e-05, "loss": 1.6615, "step": 12369 }, { "epoch": 0.3138902776283564, "grad_norm": 1.4116544723510742, "learning_rate": 3.456114112492418e-05, "loss": 1.6481, "step": 12400 }, { "epoch": 0.3146750033224273, "grad_norm": 1.8497071266174316, "learning_rate": 3.4483489416046164e-05, "loss": 1.6262, "step": 12431 }, { "epoch": 0.3154597290164982, "grad_norm": 1.3854331970214844, "learning_rate": 3.440573068727905e-05, "loss": 1.6387, "step": 12462 }, { "epoch": 0.3162444547105691, "grad_norm": 1.509178876876831, "learning_rate": 3.4327865816119495e-05, "loss": 1.6566, "step": 12493 }, { "epoch": 0.31702918040464, "grad_norm": 1.3977612257003784, "learning_rate": 3.4249895681262025e-05, "loss": 1.6676, "step": 12524 }, { "epoch": 0.3178139060987109, "grad_norm": 1.3736423254013062, "learning_rate": 3.417182116258899e-05, "loss": 1.6238, "step": 12555 }, { "epoch": 0.3185986317927818, "grad_norm": 1.4226630926132202, "learning_rate": 3.409364314116074e-05, "loss": 1.6513, "step": 12586 }, { "epoch": 0.3193833574868527, "grad_norm": 1.4804571866989136, "learning_rate": 3.401536249920559e-05, "loss": 1.6383, "step": 12617 }, { "epoch": 0.32016808318092355, "grad_norm": 1.456168532371521, "learning_rate": 3.393698012010998e-05, "loss": 1.6621, "step": 12648 }, { "epoch": 0.32095280887499444, "grad_norm": 1.3990952968597412, "learning_rate": 3.385849688840839e-05, "loss": 1.6376, "step": 12679 }, { "epoch": 0.32173753456906534, "grad_norm": 1.3588812351226807, "learning_rate": 3.3779913689773414e-05, "loss": 1.656, "step": 12710 }, { "epoch": 0.32252226026313624, "grad_norm": 1.4718931913375854, "learning_rate": 3.370123141100578e-05, "loss": 1.6255, "step": 12741 }, { "epoch": 0.32330698595720714, "grad_norm": 1.3603503704071045, "learning_rate": 3.3622450940024305e-05, "loss": 1.6517, "step": 12772 }, { "epoch": 0.32409171165127804, "grad_norm": 1.4493441581726074, "learning_rate": 3.35435731658559e-05, "loss": 1.643, "step": 12803 }, { "epoch": 0.32487643734534893, "grad_norm": 1.3813337087631226, "learning_rate": 3.346459897862552e-05, "loss": 1.6449, "step": 12834 }, { "epoch": 0.32566116303941983, "grad_norm": 1.5027899742126465, "learning_rate": 3.338552926954613e-05, "loss": 1.6497, "step": 12865 }, { "epoch": 0.3264458887334907, "grad_norm": 1.3805309534072876, "learning_rate": 3.330636493090868e-05, "loss": 1.6449, "step": 12896 }, { "epoch": 0.3272306144275616, "grad_norm": 1.642248511314392, "learning_rate": 3.322710685607193e-05, "loss": 1.6261, "step": 12927 }, { "epoch": 0.32801534012163247, "grad_norm": 1.4579522609710693, "learning_rate": 3.314775593945251e-05, "loss": 1.6648, "step": 12958 }, { "epoch": 0.32880006581570337, "grad_norm": 1.3579092025756836, "learning_rate": 3.3068313076514714e-05, "loss": 1.6468, "step": 12989 }, { "epoch": 0.32958479150977427, "grad_norm": 1.406051754951477, "learning_rate": 3.298877916376047e-05, "loss": 1.6249, "step": 13020 }, { "epoch": 0.33036951720384516, "grad_norm": 1.457335114479065, "learning_rate": 3.290915509871915e-05, "loss": 1.6353, "step": 13051 }, { "epoch": 0.33115424289791606, "grad_norm": 1.4548041820526123, "learning_rate": 3.282944177993753e-05, "loss": 1.6272, "step": 13082 }, { "epoch": 0.33193896859198696, "grad_norm": 1.4140032529830933, "learning_rate": 3.274964010696957e-05, "loss": 1.6479, "step": 13113 }, { "epoch": 0.3327236942860578, "grad_norm": 1.3436623811721802, "learning_rate": 3.266975098036629e-05, "loss": 1.6452, "step": 13144 }, { "epoch": 0.3335084199801287, "grad_norm": 1.4224274158477783, "learning_rate": 3.258977530166562e-05, "loss": 1.6242, "step": 13175 }, { "epoch": 0.3342931456741996, "grad_norm": 1.5661940574645996, "learning_rate": 3.250971397338227e-05, "loss": 1.6404, "step": 13206 }, { "epoch": 0.3350778713682705, "grad_norm": 1.4696576595306396, "learning_rate": 3.2429567898997404e-05, "loss": 1.6436, "step": 13237 }, { "epoch": 0.3358625970623414, "grad_norm": 1.4438591003417969, "learning_rate": 3.234933798294859e-05, "loss": 1.6404, "step": 13268 }, { "epoch": 0.3366473227564123, "grad_norm": 1.4548406600952148, "learning_rate": 3.2269025130619535e-05, "loss": 1.6461, "step": 13299 }, { "epoch": 0.3374320484504832, "grad_norm": 1.4180691242218018, "learning_rate": 3.218863024832985e-05, "loss": 1.6377, "step": 13330 }, { "epoch": 0.3382167741445541, "grad_norm": 1.4060105085372925, "learning_rate": 3.2108154243324864e-05, "loss": 1.6045, "step": 13361 }, { "epoch": 0.33900149983862493, "grad_norm": 1.4134920835494995, "learning_rate": 3.2027598023765345e-05, "loss": 1.6264, "step": 13392 }, { "epoch": 0.33978622553269583, "grad_norm": 1.4582122564315796, "learning_rate": 3.194696249871729e-05, "loss": 1.623, "step": 13423 }, { "epoch": 0.3405709512267667, "grad_norm": 1.4027389287948608, "learning_rate": 3.186624857814164e-05, "loss": 1.6337, "step": 13454 }, { "epoch": 0.3413556769208376, "grad_norm": 1.3397070169448853, "learning_rate": 3.178545717288401e-05, "loss": 1.6334, "step": 13485 }, { "epoch": 0.3421404026149085, "grad_norm": 1.5358332395553589, "learning_rate": 3.170458919466444e-05, "loss": 1.6393, "step": 13516 }, { "epoch": 0.3429251283089794, "grad_norm": 1.5479260683059692, "learning_rate": 3.1623645556067063e-05, "loss": 1.6357, "step": 13547 }, { "epoch": 0.3437098540030503, "grad_norm": 1.3949965238571167, "learning_rate": 3.154262717052985e-05, "loss": 1.6325, "step": 13578 }, { "epoch": 0.3444945796971212, "grad_norm": 1.392903208732605, "learning_rate": 3.146153495233426e-05, "loss": 1.6071, "step": 13609 }, { "epoch": 0.34527930539119206, "grad_norm": 1.4290788173675537, "learning_rate": 3.1380369816594944e-05, "loss": 1.6266, "step": 13640 }, { "epoch": 0.34606403108526296, "grad_norm": 1.4005228281021118, "learning_rate": 3.129913267924946e-05, "loss": 1.6391, "step": 13671 }, { "epoch": 0.34684875677933386, "grad_norm": 1.378369927406311, "learning_rate": 3.121782445704782e-05, "loss": 1.6495, "step": 13702 }, { "epoch": 0.34763348247340475, "grad_norm": 1.4202784299850464, "learning_rate": 3.11364460675423e-05, "loss": 1.637, "step": 13733 }, { "epoch": 0.34841820816747565, "grad_norm": 1.3670291900634766, "learning_rate": 3.1054998429076934e-05, "loss": 1.5941, "step": 13764 }, { "epoch": 0.34920293386154655, "grad_norm": 1.3714202642440796, "learning_rate": 3.097348246077728e-05, "loss": 1.6096, "step": 13795 }, { "epoch": 0.34998765955561745, "grad_norm": 1.4889552593231201, "learning_rate": 3.0891899082539924e-05, "loss": 1.6245, "step": 13826 }, { "epoch": 0.35077238524968835, "grad_norm": 1.4640086889266968, "learning_rate": 3.0810249215022233e-05, "loss": 1.6197, "step": 13857 }, { "epoch": 0.35155711094375924, "grad_norm": 1.385380506515503, "learning_rate": 3.0728533779631865e-05, "loss": 1.61, "step": 13888 }, { "epoch": 0.3523418366378301, "grad_norm": 1.3958945274353027, "learning_rate": 3.064675369851637e-05, "loss": 1.6139, "step": 13919 }, { "epoch": 0.353126562331901, "grad_norm": 1.3746731281280518, "learning_rate": 3.056490989455289e-05, "loss": 1.6307, "step": 13950 }, { "epoch": 0.3539112880259719, "grad_norm": 1.4196429252624512, "learning_rate": 3.0483003291337596e-05, "loss": 1.6192, "step": 13981 }, { "epoch": 0.3546960137200428, "grad_norm": 1.3648637533187866, "learning_rate": 3.040103481317539e-05, "loss": 1.6124, "step": 14012 }, { "epoch": 0.3554807394141137, "grad_norm": 1.422004222869873, "learning_rate": 3.03190053850694e-05, "loss": 1.6288, "step": 14043 }, { "epoch": 0.3562654651081846, "grad_norm": 1.4687801599502563, "learning_rate": 3.0236915932710573e-05, "loss": 1.6118, "step": 14074 }, { "epoch": 0.3570501908022555, "grad_norm": 1.30635404586792, "learning_rate": 3.0154767382467232e-05, "loss": 1.6341, "step": 14105 }, { "epoch": 0.35783491649632637, "grad_norm": 1.4216945171356201, "learning_rate": 3.0072560661374582e-05, "loss": 1.6385, "step": 14136 }, { "epoch": 0.3586196421903972, "grad_norm": 1.4296518564224243, "learning_rate": 2.999029669712431e-05, "loss": 1.6262, "step": 14167 }, { "epoch": 0.3594043678844681, "grad_norm": 1.4529691934585571, "learning_rate": 2.990797641805408e-05, "loss": 1.6136, "step": 14198 }, { "epoch": 0.360189093578539, "grad_norm": 1.389478325843811, "learning_rate": 2.982560075313704e-05, "loss": 1.6263, "step": 14229 }, { "epoch": 0.3609738192726099, "grad_norm": 1.3917667865753174, "learning_rate": 2.9743170631971368e-05, "loss": 1.6456, "step": 14260 }, { "epoch": 0.3617585449666808, "grad_norm": 1.3452563285827637, "learning_rate": 2.9660686984769792e-05, "loss": 1.6284, "step": 14291 }, { "epoch": 0.3625432706607517, "grad_norm": 1.421159029006958, "learning_rate": 2.9578150742349047e-05, "loss": 1.6232, "step": 14322 }, { "epoch": 0.3633279963548226, "grad_norm": 1.4312077760696411, "learning_rate": 2.949556283611942e-05, "loss": 1.6006, "step": 14353 }, { "epoch": 0.3641127220488935, "grad_norm": 1.4271692037582397, "learning_rate": 2.9412924198074206e-05, "loss": 1.6177, "step": 14384 }, { "epoch": 0.36489744774296434, "grad_norm": 1.3584555387496948, "learning_rate": 2.9330235760779208e-05, "loss": 1.6148, "step": 14415 }, { "epoch": 0.36568217343703524, "grad_norm": 1.3882123231887817, "learning_rate": 2.9247498457362188e-05, "loss": 1.6327, "step": 14446 }, { "epoch": 0.36646689913110614, "grad_norm": 1.540114402770996, "learning_rate": 2.9164713221502373e-05, "loss": 1.6052, "step": 14477 }, { "epoch": 0.36725162482517704, "grad_norm": 1.3554641008377075, "learning_rate": 2.9081880987419912e-05, "loss": 1.6091, "step": 14508 }, { "epoch": 0.36803635051924793, "grad_norm": 1.3693712949752808, "learning_rate": 2.8999002689865296e-05, "loss": 1.5936, "step": 14539 }, { "epoch": 0.36882107621331883, "grad_norm": 1.354278564453125, "learning_rate": 2.8916079264108852e-05, "loss": 1.612, "step": 14570 }, { "epoch": 0.36960580190738973, "grad_norm": 1.3731021881103516, "learning_rate": 2.883311164593017e-05, "loss": 1.6064, "step": 14601 }, { "epoch": 0.37039052760146063, "grad_norm": 1.3914356231689453, "learning_rate": 2.875010077160754e-05, "loss": 1.6036, "step": 14632 }, { "epoch": 0.37117525329553147, "grad_norm": 1.4811164140701294, "learning_rate": 2.866704757790741e-05, "loss": 1.6195, "step": 14663 }, { "epoch": 0.37195997898960237, "grad_norm": 1.4619332551956177, "learning_rate": 2.858395300207376e-05, "loss": 1.6315, "step": 14694 }, { "epoch": 0.37274470468367327, "grad_norm": 1.456950306892395, "learning_rate": 2.8500817981817607e-05, "loss": 1.6276, "step": 14725 }, { "epoch": 0.37352943037774416, "grad_norm": 5.129410266876221, "learning_rate": 2.8417643455306336e-05, "loss": 1.6234, "step": 14756 }, { "epoch": 0.37431415607181506, "grad_norm": 1.3831191062927246, "learning_rate": 2.8334430361153185e-05, "loss": 1.6163, "step": 14787 }, { "epoch": 0.37509888176588596, "grad_norm": 1.3817623853683472, "learning_rate": 2.8251179638406612e-05, "loss": 1.6206, "step": 14818 }, { "epoch": 0.37588360745995686, "grad_norm": 1.5285260677337646, "learning_rate": 2.8167892226539704e-05, "loss": 1.6117, "step": 14849 }, { "epoch": 0.37666833315402776, "grad_norm": 1.403324007987976, "learning_rate": 2.8084569065439588e-05, "loss": 1.5962, "step": 14880 }, { "epoch": 0.3774530588480986, "grad_norm": 1.3314014673233032, "learning_rate": 2.8001211095396807e-05, "loss": 1.6116, "step": 14911 }, { "epoch": 0.3782377845421695, "grad_norm": 1.4300462007522583, "learning_rate": 2.791781925709473e-05, "loss": 1.6234, "step": 14942 }, { "epoch": 0.3790225102362404, "grad_norm": 1.424811601638794, "learning_rate": 2.7834394491598908e-05, "loss": 1.5986, "step": 14973 }, { "epoch": 0.3798072359303113, "grad_norm": 1.3818182945251465, "learning_rate": 2.7750937740346485e-05, "loss": 1.6012, "step": 15004 }, { "epoch": 0.3805919616243822, "grad_norm": 1.4053683280944824, "learning_rate": 2.7667449945135564e-05, "loss": 1.6018, "step": 15035 }, { "epoch": 0.3813766873184531, "grad_norm": 1.5093421936035156, "learning_rate": 2.7583932048114557e-05, "loss": 1.61, "step": 15066 }, { "epoch": 0.382161413012524, "grad_norm": 1.412494421005249, "learning_rate": 2.7500384991771587e-05, "loss": 1.613, "step": 15097 }, { "epoch": 0.3829461387065949, "grad_norm": 1.335167646408081, "learning_rate": 2.7416809718923825e-05, "loss": 1.6197, "step": 15128 }, { "epoch": 0.3837308644006657, "grad_norm": 1.334786295890808, "learning_rate": 2.7333207172706864e-05, "loss": 1.6284, "step": 15159 }, { "epoch": 0.3845155900947366, "grad_norm": 1.4039522409439087, "learning_rate": 2.7249578296564088e-05, "loss": 1.5889, "step": 15190 }, { "epoch": 0.3853003157888075, "grad_norm": 1.4196487665176392, "learning_rate": 2.7165924034235973e-05, "loss": 1.6132, "step": 15221 }, { "epoch": 0.3860850414828784, "grad_norm": 1.4701744318008423, "learning_rate": 2.708224532974953e-05, "loss": 1.6009, "step": 15252 } ], "logging_steps": 31, "max_steps": 30517, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3052, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1319725270111355e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }