{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1576, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006345177664974619, "grad_norm": 1.6225066184997559, "learning_rate": 9.999889621822132e-06, "loss": 0.9998, "step": 10 }, { "epoch": 0.012690355329949238, "grad_norm": 1.652418613433838, "learning_rate": 9.999558492161865e-06, "loss": 0.9514, "step": 20 }, { "epoch": 0.01903553299492386, "grad_norm": 1.589950680732727, "learning_rate": 9.999006625638994e-06, "loss": 0.9091, "step": 30 }, { "epoch": 0.025380710659898477, "grad_norm": 1.3906126022338867, "learning_rate": 9.998234046619128e-06, "loss": 0.9221, "step": 40 }, { "epoch": 0.031725888324873094, "grad_norm": 1.4328875541687012, "learning_rate": 9.997240789212612e-06, "loss": 0.9083, "step": 50 }, { "epoch": 0.03807106598984772, "grad_norm": 1.682231068611145, "learning_rate": 9.996026897273024e-06, "loss": 0.884, "step": 60 }, { "epoch": 0.044416243654822336, "grad_norm": 1.554661750793457, "learning_rate": 9.994592424395235e-06, "loss": 0.8974, "step": 70 }, { "epoch": 0.050761421319796954, "grad_norm": 1.4692213535308838, "learning_rate": 9.992937433913048e-06, "loss": 0.9087, "step": 80 }, { "epoch": 0.05710659898477157, "grad_norm": 1.7308210134506226, "learning_rate": 9.991061998896395e-06, "loss": 0.908, "step": 90 }, { "epoch": 0.06345177664974619, "grad_norm": 1.3942962884902954, "learning_rate": 9.988966202148115e-06, "loss": 0.8783, "step": 100 }, { "epoch": 0.06979695431472081, "grad_norm": 1.4834920167922974, "learning_rate": 9.9866501362003e-06, "loss": 0.8663, "step": 110 }, { "epoch": 0.07614213197969544, "grad_norm": 1.572066068649292, "learning_rate": 9.984113903310206e-06, "loss": 0.8876, "step": 120 }, { "epoch": 0.08248730964467005, "grad_norm": 1.412290096282959, "learning_rate": 9.981357615455738e-06, "loss": 0.8744, "step": 130 }, { "epoch": 0.08883248730964467, "grad_norm": 1.4933444261550903, "learning_rate": 9.978381394330509e-06, "loss": 0.8877, "step": 140 }, { "epoch": 0.09517766497461928, "grad_norm": 1.44158935546875, "learning_rate": 9.975185371338464e-06, "loss": 0.8683, "step": 150 }, { "epoch": 0.10152284263959391, "grad_norm": 1.4896454811096191, "learning_rate": 9.971769687588082e-06, "loss": 0.8716, "step": 160 }, { "epoch": 0.10786802030456853, "grad_norm": 1.7083044052124023, "learning_rate": 9.968134493886143e-06, "loss": 0.8689, "step": 170 }, { "epoch": 0.11421319796954314, "grad_norm": 1.6805241107940674, "learning_rate": 9.964279950731066e-06, "loss": 0.8631, "step": 180 }, { "epoch": 0.12055837563451777, "grad_norm": 1.469270944595337, "learning_rate": 9.960206228305835e-06, "loss": 0.8605, "step": 190 }, { "epoch": 0.12690355329949238, "grad_norm": 1.587968349456787, "learning_rate": 9.955913506470472e-06, "loss": 0.8744, "step": 200 }, { "epoch": 0.13324873096446702, "grad_norm": 1.3910565376281738, "learning_rate": 9.951401974754103e-06, "loss": 0.864, "step": 210 }, { "epoch": 0.13959390862944163, "grad_norm": 1.3854566812515259, "learning_rate": 9.946671832346588e-06, "loss": 0.8605, "step": 220 }, { "epoch": 0.14593908629441624, "grad_norm": 1.3671081066131592, "learning_rate": 9.941723288089727e-06, "loss": 0.8492, "step": 230 }, { "epoch": 0.15228426395939088, "grad_norm": 1.5190922021865845, "learning_rate": 9.936556560468037e-06, "loss": 0.8479, "step": 240 }, { "epoch": 0.15862944162436549, "grad_norm": 1.4076085090637207, "learning_rate": 9.931171877599113e-06, "loss": 0.8539, "step": 250 }, { "epoch": 0.1649746192893401, "grad_norm": 1.3882677555084229, "learning_rate": 9.925569477223549e-06, "loss": 0.842, "step": 260 }, { "epoch": 0.1713197969543147, "grad_norm": 1.4815824031829834, "learning_rate": 9.91974960669444e-06, "loss": 0.8368, "step": 270 }, { "epoch": 0.17766497461928935, "grad_norm": 1.468481183052063, "learning_rate": 9.91371252296647e-06, "loss": 0.8573, "step": 280 }, { "epoch": 0.18401015228426396, "grad_norm": 1.315491795539856, "learning_rate": 9.907458492584561e-06, "loss": 0.8546, "step": 290 }, { "epoch": 0.19035532994923857, "grad_norm": 1.5127493143081665, "learning_rate": 9.9009877916721e-06, "loss": 0.865, "step": 300 }, { "epoch": 0.1967005076142132, "grad_norm": 1.3773964643478394, "learning_rate": 9.89430070591876e-06, "loss": 0.8394, "step": 310 }, { "epoch": 0.20304568527918782, "grad_norm": 1.3822346925735474, "learning_rate": 9.888097563523227e-06, "loss": 0.8332, "step": 320 }, { "epoch": 0.20939086294416243, "grad_norm": 1.5273101329803467, "learning_rate": 9.881000167853215e-06, "loss": 0.863, "step": 330 }, { "epoch": 0.21573604060913706, "grad_norm": 1.39267897605896, "learning_rate": 9.873687269821322e-06, "loss": 0.8677, "step": 340 }, { "epoch": 0.22208121827411167, "grad_norm": 1.564704418182373, "learning_rate": 9.866159192301294e-06, "loss": 0.8336, "step": 350 }, { "epoch": 0.22842639593908629, "grad_norm": 1.5593595504760742, "learning_rate": 9.858416267667318e-06, "loss": 0.8476, "step": 360 }, { "epoch": 0.23477157360406092, "grad_norm": 1.5241975784301758, "learning_rate": 9.850458837779366e-06, "loss": 0.8521, "step": 370 }, { "epoch": 0.24111675126903553, "grad_norm": 1.4592645168304443, "learning_rate": 9.842287253968077e-06, "loss": 0.8531, "step": 380 }, { "epoch": 0.24746192893401014, "grad_norm": 1.327803373336792, "learning_rate": 9.833901877019266e-06, "loss": 0.8526, "step": 390 }, { "epoch": 0.25380710659898476, "grad_norm": 1.351017713546753, "learning_rate": 9.825303077157983e-06, "loss": 0.8503, "step": 400 }, { "epoch": 0.26015228426395937, "grad_norm": 1.6651396751403809, "learning_rate": 9.816491234032175e-06, "loss": 0.8758, "step": 410 }, { "epoch": 0.26649746192893403, "grad_norm": 1.4911518096923828, "learning_rate": 9.807466736695912e-06, "loss": 0.8494, "step": 420 }, { "epoch": 0.27284263959390864, "grad_norm": 1.34939706325531, "learning_rate": 9.798229983592229e-06, "loss": 0.8419, "step": 430 }, { "epoch": 0.27918781725888325, "grad_norm": 1.4626591205596924, "learning_rate": 9.788781382535512e-06, "loss": 0.8479, "step": 440 }, { "epoch": 0.28553299492385786, "grad_norm": 1.5620290040969849, "learning_rate": 9.77912135069351e-06, "loss": 0.8442, "step": 450 }, { "epoch": 0.2918781725888325, "grad_norm": 1.3260949850082397, "learning_rate": 9.769250314568907e-06, "loss": 0.818, "step": 460 }, { "epoch": 0.2982233502538071, "grad_norm": 1.3637892007827759, "learning_rate": 9.7591687099805e-06, "loss": 0.8404, "step": 470 }, { "epoch": 0.30456852791878175, "grad_norm": 1.5293395519256592, "learning_rate": 9.74887698204394e-06, "loss": 0.8334, "step": 480 }, { "epoch": 0.31091370558375636, "grad_norm": 1.3983063697814941, "learning_rate": 9.738375585152103e-06, "loss": 0.8225, "step": 490 }, { "epoch": 0.31725888324873097, "grad_norm": 1.3531032800674438, "learning_rate": 9.727664982955008e-06, "loss": 0.8427, "step": 500 }, { "epoch": 0.3236040609137056, "grad_norm": 1.5252591371536255, "learning_rate": 9.716745648339356e-06, "loss": 0.8625, "step": 510 }, { "epoch": 0.3299492385786802, "grad_norm": 1.653649926185608, "learning_rate": 9.705618063407653e-06, "loss": 0.8305, "step": 520 }, { "epoch": 0.3362944162436548, "grad_norm": 1.4992367029190063, "learning_rate": 9.694282719456916e-06, "loss": 0.8149, "step": 530 }, { "epoch": 0.3426395939086294, "grad_norm": 1.431707739830017, "learning_rate": 9.682740116956992e-06, "loss": 0.8355, "step": 540 }, { "epoch": 0.3489847715736041, "grad_norm": 1.31185781955719, "learning_rate": 9.670990765528453e-06, "loss": 0.8408, "step": 550 }, { "epoch": 0.3553299492385787, "grad_norm": 1.550418734550476, "learning_rate": 9.659035183920098e-06, "loss": 0.8227, "step": 560 }, { "epoch": 0.3616751269035533, "grad_norm": 1.4619323015213013, "learning_rate": 9.646873899986054e-06, "loss": 0.866, "step": 570 }, { "epoch": 0.3680203045685279, "grad_norm": 1.4029253721237183, "learning_rate": 9.634507450662463e-06, "loss": 0.8411, "step": 580 }, { "epoch": 0.3743654822335025, "grad_norm": 1.401058554649353, "learning_rate": 9.621936381943787e-06, "loss": 0.848, "step": 590 }, { "epoch": 0.38071065989847713, "grad_norm": 1.3282866477966309, "learning_rate": 9.609161248858684e-06, "loss": 0.8391, "step": 600 }, { "epoch": 0.3870558375634518, "grad_norm": 1.4488921165466309, "learning_rate": 9.596182615445522e-06, "loss": 0.8236, "step": 610 }, { "epoch": 0.3934010152284264, "grad_norm": 1.3285187482833862, "learning_rate": 9.583001054727463e-06, "loss": 0.8439, "step": 620 }, { "epoch": 0.399746192893401, "grad_norm": 1.5738445520401, "learning_rate": 9.56961714868717e-06, "loss": 0.8377, "step": 630 }, { "epoch": 0.40609137055837563, "grad_norm": 1.4648795127868652, "learning_rate": 9.556031488241107e-06, "loss": 0.8337, "step": 640 }, { "epoch": 0.41243654822335024, "grad_norm": 1.284525990486145, "learning_rate": 9.54224467321345e-06, "loss": 0.838, "step": 650 }, { "epoch": 0.41878172588832485, "grad_norm": 1.3344087600708008, "learning_rate": 9.528257312309608e-06, "loss": 0.8242, "step": 660 }, { "epoch": 0.4251269035532995, "grad_norm": 1.4964150190353394, "learning_rate": 9.514070023089348e-06, "loss": 0.8373, "step": 670 }, { "epoch": 0.43147208121827413, "grad_norm": 1.324698805809021, "learning_rate": 9.49968343193952e-06, "loss": 0.8422, "step": 680 }, { "epoch": 0.43781725888324874, "grad_norm": 1.2812756299972534, "learning_rate": 9.485098174046412e-06, "loss": 0.8341, "step": 690 }, { "epoch": 0.44416243654822335, "grad_norm": 1.4059038162231445, "learning_rate": 9.4703148933677e-06, "loss": 0.8187, "step": 700 }, { "epoch": 0.45050761421319796, "grad_norm": 1.5261121988296509, "learning_rate": 9.455334242604018e-06, "loss": 0.8298, "step": 710 }, { "epoch": 0.45685279187817257, "grad_norm": 1.420009732246399, "learning_rate": 9.440156883170137e-06, "loss": 0.8347, "step": 720 }, { "epoch": 0.4631979695431472, "grad_norm": 1.2983534336090088, "learning_rate": 9.424783485165775e-06, "loss": 0.8346, "step": 730 }, { "epoch": 0.46954314720812185, "grad_norm": 1.2840627431869507, "learning_rate": 9.409214727345987e-06, "loss": 0.8151, "step": 740 }, { "epoch": 0.47588832487309646, "grad_norm": 1.3696503639221191, "learning_rate": 9.39345129709123e-06, "loss": 0.8214, "step": 750 }, { "epoch": 0.48223350253807107, "grad_norm": 1.370424509048462, "learning_rate": 9.37749389037698e-06, "loss": 0.8293, "step": 760 }, { "epoch": 0.4885786802030457, "grad_norm": 1.4171712398529053, "learning_rate": 9.361343211743033e-06, "loss": 0.8235, "step": 770 }, { "epoch": 0.4949238578680203, "grad_norm": 1.2537575960159302, "learning_rate": 9.344999974262377e-06, "loss": 0.8188, "step": 780 }, { "epoch": 0.501269035532995, "grad_norm": 1.3169902563095093, "learning_rate": 9.328464899509722e-06, "loss": 0.8169, "step": 790 }, { "epoch": 0.5076142131979695, "grad_norm": 1.2017827033996582, "learning_rate": 9.31173871752964e-06, "loss": 0.8175, "step": 800 }, { "epoch": 0.5139593908629442, "grad_norm": 1.5057929754257202, "learning_rate": 9.294822166804323e-06, "loss": 0.8227, "step": 810 }, { "epoch": 0.5203045685279187, "grad_norm": 1.4769108295440674, "learning_rate": 9.277715994220989e-06, "loss": 0.8137, "step": 820 }, { "epoch": 0.5266497461928934, "grad_norm": 1.3442022800445557, "learning_rate": 9.260420955038904e-06, "loss": 0.8247, "step": 830 }, { "epoch": 0.5329949238578681, "grad_norm": 1.2690612077713013, "learning_rate": 9.242937812856034e-06, "loss": 0.8087, "step": 840 }, { "epoch": 0.5393401015228426, "grad_norm": 1.277985692024231, "learning_rate": 9.225267339575325e-06, "loss": 0.8089, "step": 850 }, { "epoch": 0.5456852791878173, "grad_norm": 1.3761783838272095, "learning_rate": 9.207410315370639e-06, "loss": 0.8248, "step": 860 }, { "epoch": 0.5520304568527918, "grad_norm": 1.3367259502410889, "learning_rate": 9.18936752865229e-06, "loss": 0.8253, "step": 870 }, { "epoch": 0.5583756345177665, "grad_norm": 1.293145775794983, "learning_rate": 9.17113977603225e-06, "loss": 0.8077, "step": 880 }, { "epoch": 0.5647208121827412, "grad_norm": 1.290528655052185, "learning_rate": 9.152727862288963e-06, "loss": 0.8216, "step": 890 }, { "epoch": 0.5710659898477157, "grad_norm": 1.298192024230957, "learning_rate": 9.134132600331829e-06, "loss": 0.824, "step": 900 }, { "epoch": 0.5774111675126904, "grad_norm": 1.313166856765747, "learning_rate": 9.115354811165298e-06, "loss": 0.8197, "step": 910 }, { "epoch": 0.583756345177665, "grad_norm": 1.2986682653427124, "learning_rate": 9.096395323852635e-06, "loss": 0.8172, "step": 920 }, { "epoch": 0.5901015228426396, "grad_norm": 1.3514500856399536, "learning_rate": 9.0772549754793e-06, "loss": 0.835, "step": 930 }, { "epoch": 0.5964467005076142, "grad_norm": 1.1752053499221802, "learning_rate": 9.057934611116008e-06, "loss": 0.8113, "step": 940 }, { "epoch": 0.6027918781725888, "grad_norm": 1.230542540550232, "learning_rate": 9.038435083781401e-06, "loss": 0.8115, "step": 950 }, { "epoch": 0.6091370558375635, "grad_norm": 1.247090458869934, "learning_rate": 9.0187572544044e-06, "loss": 0.8202, "step": 960 }, { "epoch": 0.6154822335025381, "grad_norm": 1.3615097999572754, "learning_rate": 8.998901991786186e-06, "loss": 0.8369, "step": 970 }, { "epoch": 0.6218274111675127, "grad_norm": 1.3490197658538818, "learning_rate": 8.978870172561842e-06, "loss": 0.8084, "step": 980 }, { "epoch": 0.6281725888324873, "grad_norm": 1.2877025604248047, "learning_rate": 8.95866268116165e-06, "loss": 0.7929, "step": 990 }, { "epoch": 0.6345177664974619, "grad_norm": 1.5650062561035156, "learning_rate": 8.938280409772038e-06, "loss": 0.8263, "step": 1000 }, { "epoch": 0.6408629441624365, "grad_norm": 1.2181209325790405, "learning_rate": 8.917724258296204e-06, "loss": 0.8137, "step": 1010 }, { "epoch": 0.6472081218274112, "grad_norm": 1.4872324466705322, "learning_rate": 8.896995134314361e-06, "loss": 0.8097, "step": 1020 }, { "epoch": 0.6535532994923858, "grad_norm": 1.3156676292419434, "learning_rate": 8.876093953043683e-06, "loss": 0.8149, "step": 1030 }, { "epoch": 0.6598984771573604, "grad_norm": 1.358504295349121, "learning_rate": 8.855021637297893e-06, "loss": 0.8128, "step": 1040 }, { "epoch": 0.666243654822335, "grad_norm": 1.4342836141586304, "learning_rate": 8.833779117446515e-06, "loss": 0.8101, "step": 1050 }, { "epoch": 0.6725888324873096, "grad_norm": 1.2380530834197998, "learning_rate": 8.812367331373806e-06, "loss": 0.8046, "step": 1060 }, { "epoch": 0.6789340101522843, "grad_norm": 1.253306269645691, "learning_rate": 8.790787224437334e-06, "loss": 0.8255, "step": 1070 }, { "epoch": 0.6852791878172588, "grad_norm": 1.514151692390442, "learning_rate": 8.769039749426256e-06, "loss": 0.7946, "step": 1080 }, { "epoch": 0.6916243654822335, "grad_norm": 1.4850667715072632, "learning_rate": 8.747125866519236e-06, "loss": 0.8006, "step": 1090 }, { "epoch": 0.6979695431472082, "grad_norm": 1.4190680980682373, "learning_rate": 8.725046543242061e-06, "loss": 0.8243, "step": 1100 }, { "epoch": 0.7043147208121827, "grad_norm": 1.4006381034851074, "learning_rate": 8.70280275442492e-06, "loss": 0.829, "step": 1110 }, { "epoch": 0.7106598984771574, "grad_norm": 1.2715387344360352, "learning_rate": 8.680395482159364e-06, "loss": 0.806, "step": 1120 }, { "epoch": 0.7170050761421319, "grad_norm": 1.4159283638000488, "learning_rate": 8.657825715754947e-06, "loss": 0.8173, "step": 1130 }, { "epoch": 0.7233502538071066, "grad_norm": 1.368446946144104, "learning_rate": 8.63509445169554e-06, "loss": 0.8042, "step": 1140 }, { "epoch": 0.7296954314720813, "grad_norm": 1.3700900077819824, "learning_rate": 8.612202693595356e-06, "loss": 0.8016, "step": 1150 }, { "epoch": 0.7360406091370558, "grad_norm": 1.4008903503417969, "learning_rate": 8.589151452154605e-06, "loss": 0.8295, "step": 1160 }, { "epoch": 0.7423857868020305, "grad_norm": 1.2189836502075195, "learning_rate": 8.565941745114901e-06, "loss": 0.8234, "step": 1170 }, { "epoch": 0.748730964467005, "grad_norm": 1.2055974006652832, "learning_rate": 8.542574597214314e-06, "loss": 0.8034, "step": 1180 }, { "epoch": 0.7550761421319797, "grad_norm": 1.2510757446289062, "learning_rate": 8.519051040142128e-06, "loss": 0.815, "step": 1190 }, { "epoch": 0.7614213197969543, "grad_norm": 1.3542871475219727, "learning_rate": 8.495372112493285e-06, "loss": 0.8099, "step": 1200 }, { "epoch": 0.7677664974619289, "grad_norm": 1.3265260457992554, "learning_rate": 8.471538859722545e-06, "loss": 0.7971, "step": 1210 }, { "epoch": 0.7741116751269036, "grad_norm": 1.2787420749664307, "learning_rate": 8.447552334098311e-06, "loss": 0.8031, "step": 1220 }, { "epoch": 0.7804568527918782, "grad_norm": 1.2956528663635254, "learning_rate": 8.42341359465618e-06, "loss": 0.8252, "step": 1230 }, { "epoch": 0.7868020304568528, "grad_norm": 1.6075105667114258, "learning_rate": 8.399123707152182e-06, "loss": 0.7794, "step": 1240 }, { "epoch": 0.7931472081218274, "grad_norm": 1.4134018421173096, "learning_rate": 8.374683744015728e-06, "loss": 0.8111, "step": 1250 }, { "epoch": 0.799492385786802, "grad_norm": 1.3463298082351685, "learning_rate": 8.350094784302253e-06, "loss": 0.818, "step": 1260 }, { "epoch": 0.8058375634517766, "grad_norm": 1.330447793006897, "learning_rate": 8.325357913645589e-06, "loss": 0.7955, "step": 1270 }, { "epoch": 0.8121827411167513, "grad_norm": 1.2311888933181763, "learning_rate": 8.300474224210018e-06, "loss": 0.8042, "step": 1280 }, { "epoch": 0.8185279187817259, "grad_norm": 1.2119029760360718, "learning_rate": 8.27544481464206e-06, "loss": 0.7754, "step": 1290 }, { "epoch": 0.8248730964467005, "grad_norm": 1.3066750764846802, "learning_rate": 8.25027079002196e-06, "loss": 0.7988, "step": 1300 }, { "epoch": 0.8312182741116751, "grad_norm": 1.4108072519302368, "learning_rate": 8.224953261814912e-06, "loss": 0.7966, "step": 1310 }, { "epoch": 0.8375634517766497, "grad_norm": 1.206554651260376, "learning_rate": 8.199493347821963e-06, "loss": 0.7973, "step": 1320 }, { "epoch": 0.8439086294416244, "grad_norm": 1.3971256017684937, "learning_rate": 8.173892172130683e-06, "loss": 0.7923, "step": 1330 }, { "epoch": 0.850253807106599, "grad_norm": 1.3277279138565063, "learning_rate": 8.148150865065514e-06, "loss": 0.7894, "step": 1340 }, { "epoch": 0.8565989847715736, "grad_norm": 1.0922918319702148, "learning_rate": 8.122270563137893e-06, "loss": 0.8043, "step": 1350 }, { "epoch": 0.8629441624365483, "grad_norm": 1.3102174997329712, "learning_rate": 8.096252408996043e-06, "loss": 0.7988, "step": 1360 }, { "epoch": 0.8692893401015228, "grad_norm": 1.36167311668396, "learning_rate": 8.070097551374543e-06, "loss": 0.8137, "step": 1370 }, { "epoch": 0.8756345177664975, "grad_norm": 1.3062223196029663, "learning_rate": 8.043807145043604e-06, "loss": 0.8001, "step": 1380 }, { "epoch": 0.881979695431472, "grad_norm": 1.3194143772125244, "learning_rate": 8.017382350758085e-06, "loss": 0.8137, "step": 1390 }, { "epoch": 0.8883248730964467, "grad_norm": 1.1782550811767578, "learning_rate": 7.990824335206242e-06, "loss": 0.7823, "step": 1400 }, { "epoch": 0.8946700507614214, "grad_norm": 1.4341142177581787, "learning_rate": 7.964134270958222e-06, "loss": 0.8075, "step": 1410 }, { "epoch": 0.9010152284263959, "grad_norm": 1.3810592889785767, "learning_rate": 7.937313336414287e-06, "loss": 0.8346, "step": 1420 }, { "epoch": 0.9073604060913706, "grad_norm": 1.2569160461425781, "learning_rate": 7.910362715752792e-06, "loss": 0.7885, "step": 1430 }, { "epoch": 0.9137055837563451, "grad_norm": 1.2901798486709595, "learning_rate": 7.8832835988779e-06, "loss": 0.7846, "step": 1440 }, { "epoch": 0.9200507614213198, "grad_norm": 1.369370460510254, "learning_rate": 7.856077181367036e-06, "loss": 0.796, "step": 1450 }, { "epoch": 0.9263959390862944, "grad_norm": 1.3031636476516724, "learning_rate": 7.828744664418121e-06, "loss": 0.82, "step": 1460 }, { "epoch": 0.932741116751269, "grad_norm": 1.371850609779358, "learning_rate": 7.801287254796522e-06, "loss": 0.7746, "step": 1470 }, { "epoch": 0.9390862944162437, "grad_norm": 1.2887086868286133, "learning_rate": 7.773706164781776e-06, "loss": 0.7718, "step": 1480 }, { "epoch": 0.9454314720812182, "grad_norm": 1.215701699256897, "learning_rate": 7.746002612114064e-06, "loss": 0.7974, "step": 1490 }, { "epoch": 0.9517766497461929, "grad_norm": 1.3907893896102905, "learning_rate": 7.718177819940455e-06, "loss": 0.7891, "step": 1500 }, { "epoch": 0.9581218274111675, "grad_norm": 1.279998779296875, "learning_rate": 7.690233016760891e-06, "loss": 0.8199, "step": 1510 }, { "epoch": 0.9644670050761421, "grad_norm": 1.2313257455825806, "learning_rate": 7.662169436373954e-06, "loss": 0.7825, "step": 1520 }, { "epoch": 0.9708121827411168, "grad_norm": 1.3036500215530396, "learning_rate": 7.633988317822394e-06, "loss": 0.7927, "step": 1530 }, { "epoch": 0.9771573604060914, "grad_norm": 1.4679865837097168, "learning_rate": 7.605690905338413e-06, "loss": 0.8148, "step": 1540 }, { "epoch": 0.983502538071066, "grad_norm": 1.1727280616760254, "learning_rate": 7.577278448288745e-06, "loss": 0.8088, "step": 1550 }, { "epoch": 0.9898477157360406, "grad_norm": 1.4299554824829102, "learning_rate": 7.548752201119484e-06, "loss": 0.8154, "step": 1560 }, { "epoch": 0.9961928934010152, "grad_norm": 1.2478923797607422, "learning_rate": 7.520113423300706e-06, "loss": 0.7988, "step": 1570 } ], "logging_steps": 10, "max_steps": 4728, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.066955944001798e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }