{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5213764337851929, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026068821689259644, "grad_norm": 49.331476986681444, "learning_rate": 3.2668258512966296e-05, "loss": 12.2547, "step": 10 }, { "epoch": 0.005213764337851929, "grad_norm": 8.763915233449037, "learning_rate": 4.2502384231474356e-05, "loss": 4.8812, "step": 20 }, { "epoch": 0.007820646506777894, "grad_norm": 3.334694167190291, "learning_rate": 4.825497900417907e-05, "loss": 1.6309, "step": 30 }, { "epoch": 0.010427528675703858, "grad_norm": 2.624343181311664, "learning_rate": 5.2336509949982417e-05, "loss": 1.3866, "step": 40 }, { "epoch": 0.013034410844629822, "grad_norm": 2.6057816533089366, "learning_rate": 5.550239130742453e-05, "loss": 1.2347, "step": 50 }, { "epoch": 0.01564129301355579, "grad_norm": 3.3478789899750456, "learning_rate": 5.8089104722687125e-05, "loss": 1.1339, "step": 60 }, { "epoch": 0.01824817518248175, "grad_norm": 2.233945689028257, "learning_rate": 6.027613975295318e-05, "loss": 1.1551, "step": 70 }, { "epoch": 0.020855057351407715, "grad_norm": 2.5906299607793, "learning_rate": 6.217063566849047e-05, "loss": 1.0871, "step": 80 }, { "epoch": 0.02346193952033368, "grad_norm": 2.1892355695478547, "learning_rate": 6.384169949539185e-05, "loss": 1.0, "step": 90 }, { "epoch": 0.026068821689259645, "grad_norm": 2.0124093217815813, "learning_rate": 6.533651702593259e-05, "loss": 1.0452, "step": 100 }, { "epoch": 0.02867570385818561, "grad_norm": 2.079141219384628, "learning_rate": 6.668874396522732e-05, "loss": 1.0112, "step": 110 }, { "epoch": 0.03128258602711158, "grad_norm": 1.9554916426797533, "learning_rate": 6.79232304411952e-05, "loss": 1.0344, "step": 120 }, { "epoch": 0.03388946819603754, "grad_norm": 1.9875868563850505, "learning_rate": 6.905884791492633e-05, "loss": 1.0269, "step": 130 }, { "epoch": 0.0364963503649635, "grad_norm": 2.1941623350784516, "learning_rate": 7.011026547146124e-05, "loss": 1.039, "step": 140 }, { "epoch": 0.03910323253388947, "grad_norm": 1.8155718741045461, "learning_rate": 7.10891117986373e-05, "loss": 1.0616, "step": 150 }, { "epoch": 0.04171011470281543, "grad_norm": 1.7586342549475686, "learning_rate": 7.200476138699854e-05, "loss": 1.0547, "step": 160 }, { "epoch": 0.0443169968717414, "grad_norm": 2.100292158595079, "learning_rate": 7.28648819635523e-05, "loss": 1.0031, "step": 170 }, { "epoch": 0.04692387904066736, "grad_norm": 2.0513256959729227, "learning_rate": 7.36758252138999e-05, "loss": 1.0158, "step": 180 }, { "epoch": 0.04953076120959333, "grad_norm": 2.1426958006504897, "learning_rate": 7.444291172327986e-05, "loss": 1.0174, "step": 190 }, { "epoch": 0.05213764337851929, "grad_norm": 1.8318521527764522, "learning_rate": 7.517064274444065e-05, "loss": 1.0546, "step": 200 }, { "epoch": 0.05474452554744526, "grad_norm": 1.886155228760021, "learning_rate": 7.586286024416594e-05, "loss": 1.0019, "step": 210 }, { "epoch": 0.05735140771637122, "grad_norm": 2.739142325850314, "learning_rate": 7.652286968373537e-05, "loss": 0.9374, "step": 220 }, { "epoch": 0.05995828988529719, "grad_norm": 1.7629924772556742, "learning_rate": 7.715353548429121e-05, "loss": 1.0266, "step": 230 }, { "epoch": 0.06256517205422316, "grad_norm": 2.165683236469649, "learning_rate": 7.775735615970326e-05, "loss": 1.0178, "step": 240 }, { "epoch": 0.06517205422314912, "grad_norm": 1.733337560836944, "learning_rate": 7.833652410188277e-05, "loss": 0.9352, "step": 250 }, { "epoch": 0.06777893639207508, "grad_norm": 1.4644627780270372, "learning_rate": 7.88929736334344e-05, "loss": 0.9917, "step": 260 }, { "epoch": 0.07038581856100104, "grad_norm": 1.8822940709742961, "learning_rate": 7.942841998660462e-05, "loss": 1.01, "step": 270 }, { "epoch": 0.072992700729927, "grad_norm": 1.796235628150547, "learning_rate": 7.99443911899693e-05, "loss": 0.9574, "step": 280 }, { "epoch": 0.07559958289885298, "grad_norm": 1.7794624205157685, "learning_rate": 8.044225435717374e-05, "loss": 0.9455, "step": 290 }, { "epoch": 0.07820646506777894, "grad_norm": 1.845721579155461, "learning_rate": 8.092323751714537e-05, "loss": 0.9842, "step": 300 }, { "epoch": 0.0808133472367049, "grad_norm": 1.5340232922646442, "learning_rate": 8.13884478634796e-05, "loss": 1.0016, "step": 310 }, { "epoch": 0.08342022940563086, "grad_norm": 1.7408053834467931, "learning_rate": 8.18388871055066e-05, "loss": 0.9408, "step": 320 }, { "epoch": 0.08602711157455684, "grad_norm": 2.002334563523922, "learning_rate": 8.227546445644009e-05, "loss": 1.011, "step": 330 }, { "epoch": 0.0886339937434828, "grad_norm": 1.5325036507169985, "learning_rate": 8.269900768206035e-05, "loss": 0.9431, "step": 340 }, { "epoch": 0.09124087591240876, "grad_norm": 1.656179096323659, "learning_rate": 8.31102725474114e-05, "loss": 0.948, "step": 350 }, { "epoch": 0.09384775808133472, "grad_norm": 1.579268561671285, "learning_rate": 8.350995093240796e-05, "loss": 0.9828, "step": 360 }, { "epoch": 0.09645464025026068, "grad_norm": 1.430642284312776, "learning_rate": 8.389867783526633e-05, "loss": 0.9933, "step": 370 }, { "epoch": 0.09906152241918666, "grad_norm": 1.834119895542427, "learning_rate": 8.427703744178792e-05, "loss": 0.9299, "step": 380 }, { "epoch": 0.10166840458811262, "grad_norm": 1.615728296022092, "learning_rate": 8.46455684061391e-05, "loss": 0.933, "step": 390 }, { "epoch": 0.10427528675703858, "grad_norm": 1.5075395597879127, "learning_rate": 8.500476846294871e-05, "loss": 0.9402, "step": 400 }, { "epoch": 0.10688216892596454, "grad_norm": 2.318206330951205, "learning_rate": 8.535509846982542e-05, "loss": 0.9586, "step": 410 }, { "epoch": 0.10948905109489052, "grad_norm": 1.7156459393153987, "learning_rate": 8.5696985962674e-05, "loss": 0.9669, "step": 420 }, { "epoch": 0.11209593326381648, "grad_norm": 1.661507066068275, "learning_rate": 8.603082829261603e-05, "loss": 0.9069, "step": 430 }, { "epoch": 0.11470281543274244, "grad_norm": 1.4599548493342727, "learning_rate": 8.635699540224343e-05, "loss": 0.9652, "step": 440 }, { "epoch": 0.1173096976016684, "grad_norm": 1.7927399177658685, "learning_rate": 8.667583228985008e-05, "loss": 0.9607, "step": 450 }, { "epoch": 0.11991657977059438, "grad_norm": 1.4930927495338642, "learning_rate": 8.698766120279926e-05, "loss": 0.9299, "step": 460 }, { "epoch": 0.12252346193952034, "grad_norm": 1.6044841258937315, "learning_rate": 8.729278359498751e-05, "loss": 0.9433, "step": 470 }, { "epoch": 0.1251303441084463, "grad_norm": 1.9302309005632683, "learning_rate": 8.75914818782113e-05, "loss": 0.9274, "step": 480 }, { "epoch": 0.12773722627737227, "grad_norm": 1.728242732565303, "learning_rate": 8.788402099294005e-05, "loss": 0.9181, "step": 490 }, { "epoch": 0.13034410844629823, "grad_norm": 1.4910517402601842, "learning_rate": 8.817064982039083e-05, "loss": 0.9234, "step": 500 }, { "epoch": 0.1329509906152242, "grad_norm": 1.5741224965672913, "learning_rate": 8.845160245476505e-05, "loss": 0.8945, "step": 510 }, { "epoch": 0.13555787278415016, "grad_norm": 1.3822257084676812, "learning_rate": 8.872709935194245e-05, "loss": 0.982, "step": 520 }, { "epoch": 0.13816475495307612, "grad_norm": 1.4357135301136048, "learning_rate": 8.899734836875464e-05, "loss": 0.9525, "step": 530 }, { "epoch": 0.14077163712200208, "grad_norm": 1.4310342833260414, "learning_rate": 8.926254570511269e-05, "loss": 0.9697, "step": 540 }, { "epoch": 0.14337851929092804, "grad_norm": 1.3185832595803366, "learning_rate": 8.952287675968555e-05, "loss": 0.9374, "step": 550 }, { "epoch": 0.145985401459854, "grad_norm": 1.194013911012122, "learning_rate": 8.977851690847735e-05, "loss": 0.9299, "step": 560 }, { "epoch": 0.14859228362878, "grad_norm": 1.6923709597781003, "learning_rate": 9.002963221449265e-05, "loss": 0.9219, "step": 570 }, { "epoch": 0.15119916579770595, "grad_norm": 1.526862442225114, "learning_rate": 9.02763800756818e-05, "loss": 0.9628, "step": 580 }, { "epoch": 0.15380604796663191, "grad_norm": 1.5075661508716671, "learning_rate": 9.051890981749827e-05, "loss": 0.9135, "step": 590 }, { "epoch": 0.15641293013555788, "grad_norm": 1.4428334009023258, "learning_rate": 9.075736323565343e-05, "loss": 0.909, "step": 600 }, { "epoch": 0.15901981230448384, "grad_norm": 1.741611639128364, "learning_rate": 9.09918750940095e-05, "loss": 0.897, "step": 610 }, { "epoch": 0.1616266944734098, "grad_norm": 1.187339694623765, "learning_rate": 9.122257358198768e-05, "loss": 0.9161, "step": 620 }, { "epoch": 0.16423357664233576, "grad_norm": 1.7239405643085177, "learning_rate": 9.144958073537873e-05, "loss": 0.9275, "step": 630 }, { "epoch": 0.16684045881126172, "grad_norm": 1.4934424469320755, "learning_rate": 9.167301282401467e-05, "loss": 0.9181, "step": 640 }, { "epoch": 0.16944734098018768, "grad_norm": 1.5887533903293958, "learning_rate": 9.189298070938457e-05, "loss": 0.8614, "step": 650 }, { "epoch": 0.17205422314911367, "grad_norm": 1.3073452168799204, "learning_rate": 9.210959017494815e-05, "loss": 0.9295, "step": 660 }, { "epoch": 0.17466110531803963, "grad_norm": 1.6241911957261628, "learning_rate": 9.232294223161082e-05, "loss": 0.9024, "step": 670 }, { "epoch": 0.1772679874869656, "grad_norm": 1.353634519365756, "learning_rate": 9.25331334005684e-05, "loss": 1.002, "step": 680 }, { "epoch": 0.17987486965589156, "grad_norm": 1.3805701443193243, "learning_rate": 9.274025597550396e-05, "loss": 0.9504, "step": 690 }, { "epoch": 0.18248175182481752, "grad_norm": 1.349628444790834, "learning_rate": 9.294439826591947e-05, "loss": 0.8845, "step": 700 }, { "epoch": 0.18508863399374348, "grad_norm": 1.6959737374830173, "learning_rate": 9.314564482320817e-05, "loss": 0.8913, "step": 710 }, { "epoch": 0.18769551616266944, "grad_norm": 1.214625729911582, "learning_rate": 9.334407665091604e-05, "loss": 0.9063, "step": 720 }, { "epoch": 0.1903023983315954, "grad_norm": 1.8898319277908309, "learning_rate": 9.353977140050108e-05, "loss": 0.9611, "step": 730 }, { "epoch": 0.19290928050052136, "grad_norm": 1.3636119152062076, "learning_rate": 9.373280355377439e-05, "loss": 0.9486, "step": 740 }, { "epoch": 0.19551616266944735, "grad_norm": 1.5038687436039977, "learning_rate": 9.392324459309554e-05, "loss": 0.8918, "step": 750 }, { "epoch": 0.1981230448383733, "grad_norm": 1.437881387105274, "learning_rate": 9.411116316029599e-05, "loss": 0.9538, "step": 760 }, { "epoch": 0.20072992700729927, "grad_norm": 1.4684627474926828, "learning_rate": 9.429662520521419e-05, "loss": 0.9875, "step": 770 }, { "epoch": 0.20333680917622524, "grad_norm": 1.2498034963556186, "learning_rate": 9.447969412464717e-05, "loss": 0.8688, "step": 780 }, { "epoch": 0.2059436913451512, "grad_norm": 1.6662177047291735, "learning_rate": 9.466043089245074e-05, "loss": 0.9125, "step": 790 }, { "epoch": 0.20855057351407716, "grad_norm": 1.866989275221486, "learning_rate": 9.483889418145677e-05, "loss": 0.9201, "step": 800 }, { "epoch": 0.21115745568300312, "grad_norm": 1.3968953751850115, "learning_rate": 9.501514047781739e-05, "loss": 0.8627, "step": 810 }, { "epoch": 0.21376433785192908, "grad_norm": 1.471933908900232, "learning_rate": 9.518922418833347e-05, "loss": 0.8386, "step": 820 }, { "epoch": 0.21637122002085507, "grad_norm": 1.5563986530773524, "learning_rate": 9.536119774127809e-05, "loss": 0.9067, "step": 830 }, { "epoch": 0.21897810218978103, "grad_norm": 1.3658584392037696, "learning_rate": 9.553111168118207e-05, "loss": 0.8931, "step": 840 }, { "epoch": 0.221584984358707, "grad_norm": 1.3728098970120093, "learning_rate": 9.569901475801053e-05, "loss": 0.8876, "step": 850 }, { "epoch": 0.22419186652763295, "grad_norm": 1.3264474456198803, "learning_rate": 9.58649540111241e-05, "loss": 0.908, "step": 860 }, { "epoch": 0.22679874869655892, "grad_norm": 1.4306688515313286, "learning_rate": 9.602897484838651e-05, "loss": 0.8527, "step": 870 }, { "epoch": 0.22940563086548488, "grad_norm": 1.359364809712028, "learning_rate": 9.619112112075149e-05, "loss": 0.8834, "step": 880 }, { "epoch": 0.23201251303441084, "grad_norm": 1.6419437685113607, "learning_rate": 9.63514351926354e-05, "loss": 0.9578, "step": 890 }, { "epoch": 0.2346193952033368, "grad_norm": 1.4939985595932985, "learning_rate": 9.650995800835814e-05, "loss": 0.9139, "step": 900 }, { "epoch": 0.23722627737226276, "grad_norm": 1.7581353118072822, "learning_rate": 9.66667291549132e-05, "loss": 0.8936, "step": 910 }, { "epoch": 0.23983315954118875, "grad_norm": 1.4500851564730863, "learning_rate": 9.682178692130732e-05, "loss": 0.8767, "step": 920 }, { "epoch": 0.2424400417101147, "grad_norm": 1.7403153499682062, "learning_rate": 9.697516835469238e-05, "loss": 0.8915, "step": 930 }, { "epoch": 0.24504692387904067, "grad_norm": 1.7325737919906712, "learning_rate": 9.712690931349557e-05, "loss": 0.8679, "step": 940 }, { "epoch": 0.24765380604796663, "grad_norm": 1.2607566606772898, "learning_rate": 9.72770445177381e-05, "loss": 0.9302, "step": 950 }, { "epoch": 0.2502606882168926, "grad_norm": 1.3948857503153305, "learning_rate": 9.742560759671938e-05, "loss": 0.9622, "step": 960 }, { "epoch": 0.2528675703858186, "grad_norm": 1.4990971836866378, "learning_rate": 9.757263113423036e-05, "loss": 0.8862, "step": 970 }, { "epoch": 0.25547445255474455, "grad_norm": 1.26002568302247, "learning_rate": 9.771814671144811e-05, "loss": 0.902, "step": 980 }, { "epoch": 0.2580813347236705, "grad_norm": 1.6328643018778433, "learning_rate": 9.786218494765286e-05, "loss": 0.9177, "step": 990 }, { "epoch": 0.26068821689259647, "grad_norm": 1.5208345584967045, "learning_rate": 9.800477553889888e-05, "loss": 0.9209, "step": 1000 }, { "epoch": 0.26329509906152243, "grad_norm": 1.7155524494692724, "learning_rate": 9.814594729476141e-05, "loss": 0.9237, "step": 1010 }, { "epoch": 0.2659019812304484, "grad_norm": 1.2913660127117599, "learning_rate": 9.828572817327313e-05, "loss": 0.9243, "step": 1020 }, { "epoch": 0.26850886339937435, "grad_norm": 1.7535505034210201, "learning_rate": 9.84241453141565e-05, "loss": 0.8931, "step": 1030 }, { "epoch": 0.2711157455683003, "grad_norm": 1.2898088323947252, "learning_rate": 9.856122507045051e-05, "loss": 0.8526, "step": 1040 }, { "epoch": 0.2737226277372263, "grad_norm": 1.463455335110262, "learning_rate": 9.869699303862418e-05, "loss": 0.8793, "step": 1050 }, { "epoch": 0.27632950990615224, "grad_norm": 1.3899833101623003, "learning_rate": 9.88314740872627e-05, "loss": 0.9302, "step": 1060 }, { "epoch": 0.2789363920750782, "grad_norm": 1.5658975524017984, "learning_rate": 9.896469238440684e-05, "loss": 0.9169, "step": 1070 }, { "epoch": 0.28154327424400416, "grad_norm": 1.3907147804328621, "learning_rate": 9.909667142362075e-05, "loss": 0.9001, "step": 1080 }, { "epoch": 0.2841501564129301, "grad_norm": 1.6141986396288779, "learning_rate": 9.922743404885818e-05, "loss": 0.8768, "step": 1090 }, { "epoch": 0.2867570385818561, "grad_norm": 1.3427468296126188, "learning_rate": 9.935700247819361e-05, "loss": 0.8774, "step": 1100 }, { "epoch": 0.28936392075078204, "grad_norm": 1.466877011233847, "learning_rate": 9.94853983264791e-05, "loss": 0.8975, "step": 1110 }, { "epoch": 0.291970802919708, "grad_norm": 1.5146408692829303, "learning_rate": 9.961264262698542e-05, "loss": 0.9306, "step": 1120 }, { "epoch": 0.29457768508863397, "grad_norm": 1.267673391297586, "learning_rate": 9.97387558520811e-05, "loss": 0.9058, "step": 1130 }, { "epoch": 0.29718456725756, "grad_norm": 1.4730762854067057, "learning_rate": 9.98637579330007e-05, "loss": 0.9042, "step": 1140 }, { "epoch": 0.29979144942648595, "grad_norm": 1.3202056072894564, "learning_rate": 9.998766827874944e-05, "loss": 0.9109, "step": 1150 }, { "epoch": 0.3023983315954119, "grad_norm": 1.7165734727653075, "learning_rate": 9.992275755527663e-05, "loss": 0.9557, "step": 1160 }, { "epoch": 0.30500521376433787, "grad_norm": 1.4663092625643357, "learning_rate": 9.982620449937241e-05, "loss": 0.9291, "step": 1170 }, { "epoch": 0.30761209593326383, "grad_norm": 1.4879041927617032, "learning_rate": 9.972965144346819e-05, "loss": 0.9282, "step": 1180 }, { "epoch": 0.3102189781021898, "grad_norm": 1.246157859500466, "learning_rate": 9.963309838756398e-05, "loss": 0.8993, "step": 1190 }, { "epoch": 0.31282586027111575, "grad_norm": 1.5069087106646166, "learning_rate": 9.953654533165974e-05, "loss": 0.8619, "step": 1200 }, { "epoch": 0.3154327424400417, "grad_norm": 1.6313981760455594, "learning_rate": 9.943999227575553e-05, "loss": 0.9107, "step": 1210 }, { "epoch": 0.3180396246089677, "grad_norm": 1.7942805182458694, "learning_rate": 9.934343921985131e-05, "loss": 0.8675, "step": 1220 }, { "epoch": 0.32064650677789364, "grad_norm": 1.8369591575858806, "learning_rate": 9.924688616394709e-05, "loss": 0.9358, "step": 1230 }, { "epoch": 0.3232533889468196, "grad_norm": 1.7610189048584648, "learning_rate": 9.915033310804287e-05, "loss": 0.8879, "step": 1240 }, { "epoch": 0.32586027111574556, "grad_norm": 1.1839900127231098, "learning_rate": 9.905378005213866e-05, "loss": 0.8603, "step": 1250 }, { "epoch": 0.3284671532846715, "grad_norm": 1.3882705515733773, "learning_rate": 9.895722699623443e-05, "loss": 0.892, "step": 1260 }, { "epoch": 0.3310740354535975, "grad_norm": 1.327576256659297, "learning_rate": 9.886067394033022e-05, "loss": 0.9003, "step": 1270 }, { "epoch": 0.33368091762252344, "grad_norm": 1.8987128563646904, "learning_rate": 9.8764120884426e-05, "loss": 0.8824, "step": 1280 }, { "epoch": 0.3362877997914494, "grad_norm": 1.3623197435419394, "learning_rate": 9.866756782852177e-05, "loss": 0.8985, "step": 1290 }, { "epoch": 0.33889468196037537, "grad_norm": 1.5687193453586346, "learning_rate": 9.857101477261755e-05, "loss": 0.9256, "step": 1300 }, { "epoch": 0.3415015641293014, "grad_norm": 1.7026203127602761, "learning_rate": 9.847446171671334e-05, "loss": 0.892, "step": 1310 }, { "epoch": 0.34410844629822734, "grad_norm": 1.452492802061833, "learning_rate": 9.837790866080911e-05, "loss": 0.9324, "step": 1320 }, { "epoch": 0.3467153284671533, "grad_norm": 1.3446352406962594, "learning_rate": 9.82813556049049e-05, "loss": 0.9037, "step": 1330 }, { "epoch": 0.34932221063607927, "grad_norm": 1.5312444025461112, "learning_rate": 9.818480254900068e-05, "loss": 0.8823, "step": 1340 }, { "epoch": 0.35192909280500523, "grad_norm": 1.2547101152696145, "learning_rate": 9.808824949309646e-05, "loss": 0.8986, "step": 1350 }, { "epoch": 0.3545359749739312, "grad_norm": 1.7576893497240795, "learning_rate": 9.799169643719224e-05, "loss": 0.8729, "step": 1360 }, { "epoch": 0.35714285714285715, "grad_norm": 1.2979110987807732, "learning_rate": 9.789514338128803e-05, "loss": 0.8368, "step": 1370 }, { "epoch": 0.3597497393117831, "grad_norm": 1.3445976041661503, "learning_rate": 9.77985903253838e-05, "loss": 0.9255, "step": 1380 }, { "epoch": 0.3623566214807091, "grad_norm": 1.5914212414708893, "learning_rate": 9.770203726947958e-05, "loss": 0.868, "step": 1390 }, { "epoch": 0.36496350364963503, "grad_norm": 1.6920255872918506, "learning_rate": 9.760548421357536e-05, "loss": 0.9099, "step": 1400 }, { "epoch": 0.367570385818561, "grad_norm": 1.5488187265112325, "learning_rate": 9.750893115767114e-05, "loss": 0.8716, "step": 1410 }, { "epoch": 0.37017726798748696, "grad_norm": 1.451972780765123, "learning_rate": 9.741237810176692e-05, "loss": 0.8561, "step": 1420 }, { "epoch": 0.3727841501564129, "grad_norm": 1.295850263704956, "learning_rate": 9.731582504586271e-05, "loss": 0.9015, "step": 1430 }, { "epoch": 0.3753910323253389, "grad_norm": 1.5842277075718598, "learning_rate": 9.721927198995849e-05, "loss": 0.9298, "step": 1440 }, { "epoch": 0.37799791449426484, "grad_norm": 1.3596224626112259, "learning_rate": 9.712271893405427e-05, "loss": 0.8806, "step": 1450 }, { "epoch": 0.3806047966631908, "grad_norm": 1.615036427471816, "learning_rate": 9.702616587815004e-05, "loss": 0.8381, "step": 1460 }, { "epoch": 0.38321167883211676, "grad_norm": 1.3877699271760322, "learning_rate": 9.692961282224584e-05, "loss": 0.8688, "step": 1470 }, { "epoch": 0.3858185610010427, "grad_norm": 1.4359747663434173, "learning_rate": 9.68330597663416e-05, "loss": 0.8943, "step": 1480 }, { "epoch": 0.38842544316996874, "grad_norm": 1.4881531927353704, "learning_rate": 9.673650671043739e-05, "loss": 0.906, "step": 1490 }, { "epoch": 0.3910323253388947, "grad_norm": 1.4488364078976648, "learning_rate": 9.663995365453317e-05, "loss": 0.8626, "step": 1500 }, { "epoch": 0.39363920750782067, "grad_norm": 1.4207762959482324, "learning_rate": 9.654340059862895e-05, "loss": 0.8339, "step": 1510 }, { "epoch": 0.3962460896767466, "grad_norm": 1.2660341079423447, "learning_rate": 9.644684754272473e-05, "loss": 0.9113, "step": 1520 }, { "epoch": 0.3988529718456726, "grad_norm": 1.277561759581565, "learning_rate": 9.635029448682052e-05, "loss": 0.9101, "step": 1530 }, { "epoch": 0.40145985401459855, "grad_norm": 1.3307684337403334, "learning_rate": 9.625374143091628e-05, "loss": 0.9213, "step": 1540 }, { "epoch": 0.4040667361835245, "grad_norm": 1.415345659530655, "learning_rate": 9.615718837501208e-05, "loss": 0.858, "step": 1550 }, { "epoch": 0.40667361835245047, "grad_norm": 1.7317695266528055, "learning_rate": 9.606063531910785e-05, "loss": 0.8247, "step": 1560 }, { "epoch": 0.40928050052137643, "grad_norm": 1.3841397564289928, "learning_rate": 9.596408226320363e-05, "loss": 0.9004, "step": 1570 }, { "epoch": 0.4118873826903024, "grad_norm": 1.6542817351437331, "learning_rate": 9.586752920729941e-05, "loss": 0.8688, "step": 1580 }, { "epoch": 0.41449426485922836, "grad_norm": 1.1741335364390655, "learning_rate": 9.57709761513952e-05, "loss": 0.8711, "step": 1590 }, { "epoch": 0.4171011470281543, "grad_norm": 1.6436036502277707, "learning_rate": 9.567442309549097e-05, "loss": 0.843, "step": 1600 }, { "epoch": 0.4197080291970803, "grad_norm": 1.5352723465358284, "learning_rate": 9.557787003958676e-05, "loss": 0.908, "step": 1610 }, { "epoch": 0.42231491136600624, "grad_norm": 1.5272500363549157, "learning_rate": 9.548131698368254e-05, "loss": 0.8234, "step": 1620 }, { "epoch": 0.4249217935349322, "grad_norm": 1.2675860878774323, "learning_rate": 9.538476392777832e-05, "loss": 0.8992, "step": 1630 }, { "epoch": 0.42752867570385816, "grad_norm": 1.2425270593669835, "learning_rate": 9.52882108718741e-05, "loss": 0.8693, "step": 1640 }, { "epoch": 0.4301355578727841, "grad_norm": 1.570064901663505, "learning_rate": 9.519165781596989e-05, "loss": 0.8652, "step": 1650 }, { "epoch": 0.43274244004171014, "grad_norm": 1.3621769054786714, "learning_rate": 9.509510476006565e-05, "loss": 0.8596, "step": 1660 }, { "epoch": 0.4353493222106361, "grad_norm": 1.4560708789685335, "learning_rate": 9.499855170416144e-05, "loss": 0.8062, "step": 1670 }, { "epoch": 0.43795620437956206, "grad_norm": 1.315052892176818, "learning_rate": 9.490199864825722e-05, "loss": 0.8548, "step": 1680 }, { "epoch": 0.440563086548488, "grad_norm": 1.3293319921282323, "learning_rate": 9.4805445592353e-05, "loss": 0.9029, "step": 1690 }, { "epoch": 0.443169968717414, "grad_norm": 1.6758016698805198, "learning_rate": 9.470889253644878e-05, "loss": 0.8288, "step": 1700 }, { "epoch": 0.44577685088633995, "grad_norm": 1.4994197671525644, "learning_rate": 9.461233948054457e-05, "loss": 0.8813, "step": 1710 }, { "epoch": 0.4483837330552659, "grad_norm": 1.4043994377616684, "learning_rate": 9.451578642464035e-05, "loss": 0.8745, "step": 1720 }, { "epoch": 0.45099061522419187, "grad_norm": 1.3750450980048579, "learning_rate": 9.441923336873612e-05, "loss": 0.9459, "step": 1730 }, { "epoch": 0.45359749739311783, "grad_norm": 1.7045501519636879, "learning_rate": 9.43226803128319e-05, "loss": 0.8674, "step": 1740 }, { "epoch": 0.4562043795620438, "grad_norm": 1.6497345479066228, "learning_rate": 9.422612725692768e-05, "loss": 0.8646, "step": 1750 }, { "epoch": 0.45881126173096975, "grad_norm": 1.5931163752198003, "learning_rate": 9.412957420102346e-05, "loss": 0.8455, "step": 1760 }, { "epoch": 0.4614181438998957, "grad_norm": 1.7230724221709457, "learning_rate": 9.403302114511925e-05, "loss": 0.9131, "step": 1770 }, { "epoch": 0.4640250260688217, "grad_norm": 1.46180495132515, "learning_rate": 9.393646808921503e-05, "loss": 0.8871, "step": 1780 }, { "epoch": 0.46663190823774764, "grad_norm": 1.5935162065821926, "learning_rate": 9.383991503331081e-05, "loss": 0.8675, "step": 1790 }, { "epoch": 0.4692387904066736, "grad_norm": 1.1992013716756416, "learning_rate": 9.374336197740659e-05, "loss": 0.8699, "step": 1800 }, { "epoch": 0.47184567257559956, "grad_norm": 1.2641218432215904, "learning_rate": 9.364680892150238e-05, "loss": 0.8158, "step": 1810 }, { "epoch": 0.4744525547445255, "grad_norm": 1.2473362593090205, "learning_rate": 9.355025586559814e-05, "loss": 0.888, "step": 1820 }, { "epoch": 0.4770594369134515, "grad_norm": 1.464832534464096, "learning_rate": 9.345370280969393e-05, "loss": 0.86, "step": 1830 }, { "epoch": 0.4796663190823775, "grad_norm": 1.4843223334337357, "learning_rate": 9.335714975378971e-05, "loss": 0.8826, "step": 1840 }, { "epoch": 0.48227320125130346, "grad_norm": 1.4314799678240455, "learning_rate": 9.326059669788549e-05, "loss": 0.9123, "step": 1850 }, { "epoch": 0.4848800834202294, "grad_norm": 1.5671717582839904, "learning_rate": 9.316404364198127e-05, "loss": 0.8619, "step": 1860 }, { "epoch": 0.4874869655891554, "grad_norm": 1.5285351609841458, "learning_rate": 9.306749058607706e-05, "loss": 0.8608, "step": 1870 }, { "epoch": 0.49009384775808135, "grad_norm": 1.4572720749431034, "learning_rate": 9.297093753017283e-05, "loss": 0.8562, "step": 1880 }, { "epoch": 0.4927007299270073, "grad_norm": 1.3144129556287165, "learning_rate": 9.287438447426862e-05, "loss": 0.8674, "step": 1890 }, { "epoch": 0.49530761209593327, "grad_norm": 1.765323561126351, "learning_rate": 9.27778314183644e-05, "loss": 0.8338, "step": 1900 }, { "epoch": 0.49791449426485923, "grad_norm": 1.440502764478305, "learning_rate": 9.268127836246017e-05, "loss": 0.8791, "step": 1910 }, { "epoch": 0.5005213764337852, "grad_norm": 1.5684279942469967, "learning_rate": 9.258472530655595e-05, "loss": 0.8177, "step": 1920 }, { "epoch": 0.5031282586027112, "grad_norm": 1.310102072350701, "learning_rate": 9.248817225065174e-05, "loss": 0.8493, "step": 1930 }, { "epoch": 0.5057351407716372, "grad_norm": 1.5214634407076848, "learning_rate": 9.239161919474751e-05, "loss": 0.8537, "step": 1940 }, { "epoch": 0.5083420229405631, "grad_norm": 1.5587847023414687, "learning_rate": 9.22950661388433e-05, "loss": 0.9156, "step": 1950 }, { "epoch": 0.5109489051094891, "grad_norm": 1.388133145041633, "learning_rate": 9.219851308293908e-05, "loss": 0.8231, "step": 1960 }, { "epoch": 0.513555787278415, "grad_norm": 1.5926213586780094, "learning_rate": 9.210196002703486e-05, "loss": 0.8427, "step": 1970 }, { "epoch": 0.516162669447341, "grad_norm": 1.3201354229543043, "learning_rate": 9.200540697113063e-05, "loss": 0.8208, "step": 1980 }, { "epoch": 0.5187695516162669, "grad_norm": 1.393366393240565, "learning_rate": 9.190885391522643e-05, "loss": 0.8929, "step": 1990 }, { "epoch": 0.5213764337851929, "grad_norm": 1.362764482222623, "learning_rate": 9.18123008593222e-05, "loss": 0.8929, "step": 2000 } ], "logging_steps": 10, "max_steps": 11508, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }