{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 951, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010515247108307045, "grad_norm": 9.591360424317704, "learning_rate": 2.0833333333333333e-07, "loss": 1.5585, "step": 1 }, { "epoch": 0.005257623554153523, "grad_norm": 9.792129596772575, "learning_rate": 1.0416666666666667e-06, "loss": 1.5717, "step": 5 }, { "epoch": 0.010515247108307046, "grad_norm": 3.5907937606092246, "learning_rate": 2.0833333333333334e-06, "loss": 1.54, "step": 10 }, { "epoch": 0.015772870662460567, "grad_norm": 2.305935396467309, "learning_rate": 3.125e-06, "loss": 1.5164, "step": 15 }, { "epoch": 0.02103049421661409, "grad_norm": 1.3430988630406788, "learning_rate": 4.166666666666667e-06, "loss": 1.4398, "step": 20 }, { "epoch": 0.026288117770767613, "grad_norm": 1.3395602302159622, "learning_rate": 5.208333333333334e-06, "loss": 1.4374, "step": 25 }, { "epoch": 0.031545741324921134, "grad_norm": 0.927990494487892, "learning_rate": 6.25e-06, "loss": 1.3952, "step": 30 }, { "epoch": 0.03680336487907466, "grad_norm": 0.817869963107523, "learning_rate": 7.291666666666667e-06, "loss": 1.3688, "step": 35 }, { "epoch": 0.04206098843322818, "grad_norm": 0.8099795395904227, "learning_rate": 8.333333333333334e-06, "loss": 1.3543, "step": 40 }, { "epoch": 0.0473186119873817, "grad_norm": 0.8196989977897423, "learning_rate": 9.375000000000001e-06, "loss": 1.4065, "step": 45 }, { "epoch": 0.052576235541535225, "grad_norm": 0.7891645835706297, "learning_rate": 1.0416666666666668e-05, "loss": 1.3719, "step": 50 }, { "epoch": 0.05783385909568875, "grad_norm": 0.7730662990445142, "learning_rate": 1.1458333333333333e-05, "loss": 1.3534, "step": 55 }, { "epoch": 0.06309148264984227, "grad_norm": 0.8436428208666454, "learning_rate": 1.25e-05, "loss": 1.3484, "step": 60 }, { "epoch": 0.0683491062039958, "grad_norm": 0.930633139226441, "learning_rate": 1.3541666666666668e-05, "loss": 1.3339, "step": 65 }, { "epoch": 0.07360672975814932, "grad_norm": 1.2035603215763095, "learning_rate": 1.4583333333333333e-05, "loss": 1.3577, "step": 70 }, { "epoch": 0.07886435331230283, "grad_norm": 0.9963124148010385, "learning_rate": 1.5625e-05, "loss": 1.3504, "step": 75 }, { "epoch": 0.08412197686645637, "grad_norm": 0.7705000485956546, "learning_rate": 1.6666666666666667e-05, "loss": 1.3316, "step": 80 }, { "epoch": 0.08937960042060988, "grad_norm": 0.892949375851743, "learning_rate": 1.7708333333333335e-05, "loss": 1.3289, "step": 85 }, { "epoch": 0.0946372239747634, "grad_norm": 0.909849658521525, "learning_rate": 1.8750000000000002e-05, "loss": 1.3311, "step": 90 }, { "epoch": 0.09989484752891693, "grad_norm": 0.8255721604143701, "learning_rate": 1.979166666666667e-05, "loss": 1.3224, "step": 95 }, { "epoch": 0.10515247108307045, "grad_norm": 0.8253137907707107, "learning_rate": 1.9998919935516768e-05, "loss": 1.3291, "step": 100 }, { "epoch": 0.11041009463722397, "grad_norm": 0.8322397994562853, "learning_rate": 1.999453257340926e-05, "loss": 1.321, "step": 105 }, { "epoch": 0.1156677181913775, "grad_norm": 0.8169700972120059, "learning_rate": 1.9986771889316172e-05, "loss": 1.3145, "step": 110 }, { "epoch": 0.12092534174553102, "grad_norm": 0.8054378165910604, "learning_rate": 1.9975640502598243e-05, "loss": 1.3537, "step": 115 }, { "epoch": 0.12618296529968454, "grad_norm": 0.8851185792466449, "learning_rate": 1.9961142170284762e-05, "loss": 1.3081, "step": 120 }, 
{ "epoch": 0.13144058885383805, "grad_norm": 0.7983608745240434, "learning_rate": 1.9943281785805483e-05, "loss": 1.3235, "step": 125 }, { "epoch": 0.1366982124079916, "grad_norm": 0.8101924460502825, "learning_rate": 1.9922065377339037e-05, "loss": 1.3234, "step": 130 }, { "epoch": 0.14195583596214512, "grad_norm": 0.8181918475547033, "learning_rate": 1.98975001057783e-05, "loss": 1.3275, "step": 135 }, { "epoch": 0.14721345951629863, "grad_norm": 0.8204094295073951, "learning_rate": 1.986959426231349e-05, "loss": 1.3191, "step": 140 }, { "epoch": 0.15247108307045215, "grad_norm": 0.8193214439419285, "learning_rate": 1.983835726563373e-05, "loss": 1.3151, "step": 145 }, { "epoch": 0.15772870662460567, "grad_norm": 0.8008496299830612, "learning_rate": 1.9803799658748096e-05, "loss": 1.3173, "step": 150 }, { "epoch": 0.16298633017875921, "grad_norm": 0.8976369540265376, "learning_rate": 1.976593310542718e-05, "loss": 1.3193, "step": 155 }, { "epoch": 0.16824395373291273, "grad_norm": 0.8219320759522418, "learning_rate": 1.9724770386266363e-05, "loss": 1.3074, "step": 160 }, { "epoch": 0.17350157728706625, "grad_norm": 0.7917666416050304, "learning_rate": 1.968032539437215e-05, "loss": 1.3229, "step": 165 }, { "epoch": 0.17875920084121977, "grad_norm": 0.8468899498707804, "learning_rate": 1.963261313067302e-05, "loss": 1.3053, "step": 170 }, { "epoch": 0.18401682439537329, "grad_norm": 0.7826255604907222, "learning_rate": 1.958164969885636e-05, "loss": 1.2994, "step": 175 }, { "epoch": 0.1892744479495268, "grad_norm": 0.7854651229191463, "learning_rate": 1.9527452299933192e-05, "loss": 1.2933, "step": 180 }, { "epoch": 0.19453207150368035, "grad_norm": 0.8335518596689603, "learning_rate": 1.9470039226432562e-05, "loss": 1.3053, "step": 185 }, { "epoch": 0.19978969505783387, "grad_norm": 0.7925970741738505, "learning_rate": 1.9409429856227487e-05, "loss": 1.3118, "step": 190 }, { "epoch": 0.20504731861198738, "grad_norm": 0.9022667633817576, "learning_rate": 1.934564464599461e-05, "loss": 1.3006, "step": 195 }, { "epoch": 0.2103049421661409, "grad_norm": 0.7846381821366918, "learning_rate": 1.9278705124309724e-05, "loss": 1.3019, "step": 200 }, { "epoch": 0.21556256572029442, "grad_norm": 0.8523088106258838, "learning_rate": 1.9208633884381528e-05, "loss": 1.3096, "step": 205 }, { "epoch": 0.22082018927444794, "grad_norm": 0.7773410227959319, "learning_rate": 1.913545457642601e-05, "loss": 1.3292, "step": 210 }, { "epoch": 0.22607781282860148, "grad_norm": 0.7700372006553495, "learning_rate": 1.9059191899684154e-05, "loss": 1.3039, "step": 215 }, { "epoch": 0.231335436382755, "grad_norm": 0.888501634972992, "learning_rate": 1.8979871594085482e-05, "loss": 1.2877, "step": 220 }, { "epoch": 0.23659305993690852, "grad_norm": 1.1002462859379447, "learning_rate": 1.8897520431560435e-05, "loss": 1.3015, "step": 225 }, { "epoch": 0.24185068349106204, "grad_norm": 0.8737204056440837, "learning_rate": 1.881216620700437e-05, "loss": 1.3115, "step": 230 }, { "epoch": 0.24710830704521555, "grad_norm": 0.789475981373434, "learning_rate": 1.872383772889634e-05, "loss": 1.3046, "step": 235 }, { "epoch": 0.25236593059936907, "grad_norm": 0.7857837110627846, "learning_rate": 1.863256480957574e-05, "loss": 1.314, "step": 240 }, { "epoch": 0.2576235541535226, "grad_norm": 0.8735939515101088, "learning_rate": 1.853837825518014e-05, "loss": 1.2965, "step": 245 }, { "epoch": 0.2628811777076761, "grad_norm": 0.8274285162573444, "learning_rate": 1.844130985524771e-05, "loss": 1.2847, "step": 250 }, { "epoch": 
0.26813880126182965, "grad_norm": 0.9655358070442068, "learning_rate": 1.83413923719877e-05, "loss": 1.3033, "step": 255 }, { "epoch": 0.2733964248159832, "grad_norm": 0.784933865330852, "learning_rate": 1.8238659529222672e-05, "loss": 1.2964, "step": 260 }, { "epoch": 0.2786540483701367, "grad_norm": 0.8072089928871935, "learning_rate": 1.813314600100612e-05, "loss": 1.3, "step": 265 }, { "epoch": 0.28391167192429023, "grad_norm": 0.7600500534210821, "learning_rate": 1.802488739991941e-05, "loss": 1.2897, "step": 270 }, { "epoch": 0.2891692954784437, "grad_norm": 0.7716489846603405, "learning_rate": 1.7913920265051947e-05, "loss": 1.2994, "step": 275 }, { "epoch": 0.29442691903259727, "grad_norm": 0.8439416659266935, "learning_rate": 1.7800282049668593e-05, "loss": 1.3146, "step": 280 }, { "epoch": 0.2996845425867508, "grad_norm": 0.7413366975663128, "learning_rate": 1.7684011108568593e-05, "loss": 1.3157, "step": 285 }, { "epoch": 0.3049421661409043, "grad_norm": 0.9324094146103826, "learning_rate": 1.7565146685140168e-05, "loss": 1.2944, "step": 290 }, { "epoch": 0.31019978969505785, "grad_norm": 0.7497525070350954, "learning_rate": 1.7443728898115228e-05, "loss": 1.3041, "step": 295 }, { "epoch": 0.31545741324921134, "grad_norm": 0.7274267632257158, "learning_rate": 1.7319798728028617e-05, "loss": 1.2855, "step": 300 }, { "epoch": 0.3207150368033649, "grad_norm": 0.7306525895599777, "learning_rate": 1.7193398003386514e-05, "loss": 1.2967, "step": 305 }, { "epoch": 0.32597266035751843, "grad_norm": 0.7718504682998596, "learning_rate": 1.7064569386548586e-05, "loss": 1.3116, "step": 310 }, { "epoch": 0.3312302839116719, "grad_norm": 0.7002164572114049, "learning_rate": 1.6933356359328756e-05, "loss": 1.2812, "step": 315 }, { "epoch": 0.33648790746582546, "grad_norm": 0.757112323690967, "learning_rate": 1.679980320831934e-05, "loss": 1.2654, "step": 320 }, { "epoch": 0.34174553101997895, "grad_norm": 0.7509591301953, "learning_rate": 1.6663955009943603e-05, "loss": 1.2755, "step": 325 }, { "epoch": 0.3470031545741325, "grad_norm": 0.7896423206563724, "learning_rate": 1.6525857615241686e-05, "loss": 1.291, "step": 330 }, { "epoch": 0.352260778128286, "grad_norm": 0.8538357892614057, "learning_rate": 1.6385557634395138e-05, "loss": 1.3, "step": 335 }, { "epoch": 0.35751840168243953, "grad_norm": 0.8257603179264333, "learning_rate": 1.624310242099518e-05, "loss": 1.2825, "step": 340 }, { "epoch": 0.3627760252365931, "grad_norm": 0.7506565139625698, "learning_rate": 1.609854005606009e-05, "loss": 1.2903, "step": 345 }, { "epoch": 0.36803364879074657, "grad_norm": 0.7962631813094616, "learning_rate": 1.5951919331807052e-05, "loss": 1.32, "step": 350 }, { "epoch": 0.3732912723449001, "grad_norm": 0.7846004527594455, "learning_rate": 1.5803289735183952e-05, "loss": 1.3094, "step": 355 }, { "epoch": 0.3785488958990536, "grad_norm": 0.7529193912570731, "learning_rate": 1.565270143116672e-05, "loss": 1.3097, "step": 360 }, { "epoch": 0.38380651945320715, "grad_norm": 0.8450472817355228, "learning_rate": 1.5500205245827814e-05, "loss": 1.2954, "step": 365 }, { "epoch": 0.3890641430073607, "grad_norm": 0.7884988874909659, "learning_rate": 1.5345852649181555e-05, "loss": 1.2774, "step": 370 }, { "epoch": 0.3943217665615142, "grad_norm": 0.7181038933510216, "learning_rate": 1.5189695737812153e-05, "loss": 1.2788, "step": 375 }, { "epoch": 0.39957939011566773, "grad_norm": 0.7398759340722635, "learning_rate": 1.503178721729022e-05, "loss": 1.2825, "step": 380 }, { "epoch": 0.4048370136698212, 
"grad_norm": 0.6946338532175111, "learning_rate": 1.4872180384383772e-05, "loss": 1.2945, "step": 385 }, { "epoch": 0.41009463722397477, "grad_norm": 0.7312230396618198, "learning_rate": 1.4710929109069674e-05, "loss": 1.2774, "step": 390 }, { "epoch": 0.4153522607781283, "grad_norm": 0.685269793573933, "learning_rate": 1.4548087816351616e-05, "loss": 1.2691, "step": 395 }, { "epoch": 0.4206098843322818, "grad_norm": 0.7367025121344211, "learning_rate": 1.4383711467890776e-05, "loss": 1.3029, "step": 400 }, { "epoch": 0.42586750788643535, "grad_norm": 0.954670307639884, "learning_rate": 1.4217855543455323e-05, "loss": 1.2846, "step": 405 }, { "epoch": 0.43112513144058884, "grad_norm": 0.7884436143443709, "learning_rate": 1.4050576022195084e-05, "loss": 1.2686, "step": 410 }, { "epoch": 0.4363827549947424, "grad_norm": 0.6923648258189327, "learning_rate": 1.3881929363747628e-05, "loss": 1.2717, "step": 415 }, { "epoch": 0.4416403785488959, "grad_norm": 0.7131098366130528, "learning_rate": 1.3711972489182208e-05, "loss": 1.2968, "step": 420 }, { "epoch": 0.4468980021030494, "grad_norm": 0.7320310034272568, "learning_rate": 1.3540762761787938e-05, "loss": 1.2829, "step": 425 }, { "epoch": 0.45215562565720296, "grad_norm": 0.7327893476336677, "learning_rate": 1.3368357967712726e-05, "loss": 1.2877, "step": 430 }, { "epoch": 0.45741324921135645, "grad_norm": 0.6970014096675695, "learning_rate": 1.3194816296459483e-05, "loss": 1.2871, "step": 435 }, { "epoch": 0.46267087276551, "grad_norm": 0.7014388522833548, "learning_rate": 1.302019632124619e-05, "loss": 1.2897, "step": 440 }, { "epoch": 0.4679284963196635, "grad_norm": 0.6970153763179125, "learning_rate": 1.2844556979236463e-05, "loss": 1.2714, "step": 445 }, { "epoch": 0.47318611987381703, "grad_norm": 0.7162117080997548, "learning_rate": 1.2667957551647263e-05, "loss": 1.2705, "step": 450 }, { "epoch": 0.4784437434279706, "grad_norm": 0.7402201632812351, "learning_rate": 1.24904576437405e-05, "loss": 1.2654, "step": 455 }, { "epoch": 0.48370136698212407, "grad_norm": 1.0567871681807486, "learning_rate": 1.2312117164705267e-05, "loss": 1.2784, "step": 460 }, { "epoch": 0.4889589905362776, "grad_norm": 0.7383322032141622, "learning_rate": 1.213299630743747e-05, "loss": 1.2574, "step": 465 }, { "epoch": 0.4942166140904311, "grad_norm": 0.7402615331947396, "learning_rate": 1.1953155528223728e-05, "loss": 1.2861, "step": 470 }, { "epoch": 0.49947423764458465, "grad_norm": 0.7073342447220505, "learning_rate": 1.1772655526336367e-05, "loss": 1.2899, "step": 475 }, { "epoch": 0.5047318611987381, "grad_norm": 0.6966270957094003, "learning_rate": 1.1591557223546394e-05, "loss": 1.2607, "step": 480 }, { "epoch": 0.5099894847528917, "grad_norm": 0.6995026613750213, "learning_rate": 1.1409921743561383e-05, "loss": 1.285, "step": 485 }, { "epoch": 0.5152471083070452, "grad_norm": 0.677915660036085, "learning_rate": 1.1227810391395199e-05, "loss": 1.2787, "step": 490 }, { "epoch": 0.5205047318611987, "grad_norm": 0.694058172582696, "learning_rate": 1.1045284632676535e-05, "loss": 1.2817, "step": 495 }, { "epoch": 0.5257623554153522, "grad_norm": 0.6977571213673606, "learning_rate": 1.0862406072903224e-05, "loss": 1.269, "step": 500 }, { "epoch": 0.5310199789695058, "grad_norm": 0.709669977534319, "learning_rate": 1.067923643664936e-05, "loss": 1.2569, "step": 505 }, { "epoch": 0.5362776025236593, "grad_norm": 0.700607260206098, "learning_rate": 1.0495837546732224e-05, "loss": 1.2722, "step": 510 }, { "epoch": 0.5415352260778128, "grad_norm": 
0.6965062537462676, "learning_rate": 1.031227130334604e-05, "loss": 1.2689, "step": 515 }, { "epoch": 0.5467928496319664, "grad_norm": 0.6975551930599063, "learning_rate": 1.0128599663169629e-05, "loss": 1.3171, "step": 520 }, { "epoch": 0.5520504731861199, "grad_norm": 0.6835553883087109, "learning_rate": 9.944884618454996e-06, "loss": 1.2616, "step": 525 }, { "epoch": 0.5573080967402734, "grad_norm": 0.6932270302779978, "learning_rate": 9.761188176103902e-06, "loss": 1.2842, "step": 530 }, { "epoch": 0.562565720294427, "grad_norm": 0.7523295705383632, "learning_rate": 9.577572336739491e-06, "loss": 1.276, "step": 535 }, { "epoch": 0.5678233438485805, "grad_norm": 0.7410406370549468, "learning_rate": 9.394099073780066e-06, "loss": 1.2451, "step": 540 }, { "epoch": 0.573080967402734, "grad_norm": 0.6626519082062838, "learning_rate": 9.210830312521991e-06, "loss": 1.2505, "step": 545 }, { "epoch": 0.5783385909568874, "grad_norm": 0.678178932637677, "learning_rate": 9.027827909238902e-06, "loss": 1.2884, "step": 550 }, { "epoch": 0.583596214511041, "grad_norm": 0.6964235915302415, "learning_rate": 8.84515363030414e-06, "loss": 1.2674, "step": 555 }, { "epoch": 0.5888538380651945, "grad_norm": 0.7295327929832226, "learning_rate": 8.662869131343607e-06, "loss": 1.2606, "step": 560 }, { "epoch": 0.594111461619348, "grad_norm": 0.6671644394075068, "learning_rate": 8.481035936425928e-06, "loss": 1.2631, "step": 565 }, { "epoch": 0.5993690851735016, "grad_norm": 0.6918129382316475, "learning_rate": 8.299715417297072e-06, "loss": 1.2733, "step": 570 }, { "epoch": 0.6046267087276551, "grad_norm": 0.7648575778497364, "learning_rate": 8.118968772666338e-06, "loss": 1.2768, "step": 575 }, { "epoch": 0.6098843322818086, "grad_norm": 0.6660976751002976, "learning_rate": 7.938857007550797e-06, "loss": 1.2712, "step": 580 }, { "epoch": 0.6151419558359621, "grad_norm": 0.8206761231157318, "learning_rate": 7.759440912685043e-06, "loss": 1.2629, "step": 585 }, { "epoch": 0.6203995793901157, "grad_norm": 0.6781893460495839, "learning_rate": 7.580781044003324e-06, "loss": 1.2928, "step": 590 }, { "epoch": 0.6256572029442692, "grad_norm": 0.7043738811990014, "learning_rate": 7.402937702200905e-06, "loss": 1.2565, "step": 595 }, { "epoch": 0.6309148264984227, "grad_norm": 0.6676378108729303, "learning_rate": 7.225970912381557e-06, "loss": 1.2441, "step": 600 }, { "epoch": 0.6361724500525763, "grad_norm": 0.7086319165657909, "learning_rate": 7.04994040379809e-06, "loss": 1.2526, "step": 605 }, { "epoch": 0.6414300736067298, "grad_norm": 0.6584513029407596, "learning_rate": 6.874905589692734e-06, "loss": 1.2689, "step": 610 }, { "epoch": 0.6466876971608833, "grad_norm": 0.6556498134167102, "learning_rate": 6.700925547244173e-06, "loss": 1.254, "step": 615 }, { "epoch": 0.6519453207150369, "grad_norm": 0.7317497625816267, "learning_rate": 6.528058997627995e-06, "loss": 1.2773, "step": 620 }, { "epoch": 0.6572029442691903, "grad_norm": 0.6943495622984553, "learning_rate": 6.356364286197341e-06, "loss": 1.2774, "step": 625 }, { "epoch": 0.6624605678233438, "grad_norm": 0.6557703327216338, "learning_rate": 6.18589936279034e-06, "loss": 1.2561, "step": 630 }, { "epoch": 0.6677181913774973, "grad_norm": 0.7479106302240468, "learning_rate": 6.016721762171098e-06, "loss": 1.2592, "step": 635 }, { "epoch": 0.6729758149316509, "grad_norm": 0.6501670860792287, "learning_rate": 5.848888584610727e-06, "loss": 1.2647, "step": 640 }, { "epoch": 0.6782334384858044, "grad_norm": 0.6750656043866252, "learning_rate": 
5.6824564766150724e-06, "loss": 1.2687, "step": 645 }, { "epoch": 0.6834910620399579, "grad_norm": 0.707162362118668, "learning_rate": 5.51748161180554e-06, "loss": 1.2658, "step": 650 }, { "epoch": 0.6887486855941115, "grad_norm": 0.638182422688773, "learning_rate": 5.354019671959601e-06, "loss": 1.2618, "step": 655 }, { "epoch": 0.694006309148265, "grad_norm": 0.6799258857804185, "learning_rate": 5.192125828217203e-06, "loss": 1.265, "step": 660 }, { "epoch": 0.6992639327024185, "grad_norm": 0.6614260729055945, "learning_rate": 5.0318547224596525e-06, "loss": 1.2726, "step": 665 }, { "epoch": 0.704521556256572, "grad_norm": 0.6571261971890797, "learning_rate": 4.873260448867004e-06, "loss": 1.2635, "step": 670 }, { "epoch": 0.7097791798107256, "grad_norm": 0.6434154288185444, "learning_rate": 4.716396535660412e-06, "loss": 1.2571, "step": 675 }, { "epoch": 0.7150368033648791, "grad_norm": 0.6507843588939136, "learning_rate": 4.5613159270354455e-06, "loss": 1.2768, "step": 680 }, { "epoch": 0.7202944269190326, "grad_norm": 0.6888038425645421, "learning_rate": 4.408070965292534e-06, "loss": 1.2729, "step": 685 }, { "epoch": 0.7255520504731862, "grad_norm": 0.6548677910438372, "learning_rate": 4.256713373170565e-06, "loss": 1.2754, "step": 690 }, { "epoch": 0.7308096740273397, "grad_norm": 0.6924489060189015, "learning_rate": 4.107294236389603e-06, "loss": 1.2428, "step": 695 }, { "epoch": 0.7360672975814931, "grad_norm": 0.675766971656018, "learning_rate": 3.959863986408593e-06, "loss": 1.2422, "step": 700 }, { "epoch": 0.7413249211356467, "grad_norm": 0.6759780214552317, "learning_rate": 3.8144723834039076e-06, "loss": 1.2596, "step": 705 }, { "epoch": 0.7465825446898002, "grad_norm": 0.6577848370025432, "learning_rate": 3.671168499474449e-06, "loss": 1.2653, "step": 710 }, { "epoch": 0.7518401682439537, "grad_norm": 0.6816662338694881, "learning_rate": 3.5300007020789997e-06, "loss": 1.2612, "step": 715 }, { "epoch": 0.7570977917981072, "grad_norm": 0.6412644189429695, "learning_rate": 3.3910166377113894e-06, "loss": 1.2606, "step": 720 }, { "epoch": 0.7623554153522608, "grad_norm": 0.6489342949048617, "learning_rate": 3.2542632158190135e-06, "loss": 1.2499, "step": 725 }, { "epoch": 0.7676130389064143, "grad_norm": 0.6305858451193405, "learning_rate": 3.119786592970102e-06, "loss": 1.2368, "step": 730 }, { "epoch": 0.7728706624605678, "grad_norm": 0.6601632070842128, "learning_rate": 2.9876321572751143e-06, "loss": 1.2826, "step": 735 }, { "epoch": 0.7781282860147214, "grad_norm": 0.6512679140886882, "learning_rate": 2.8578445130674835e-06, "loss": 1.2509, "step": 740 }, { "epoch": 0.7833859095688749, "grad_norm": 0.662446849135675, "learning_rate": 2.7304674658489104e-06, "loss": 1.2593, "step": 745 }, { "epoch": 0.7886435331230284, "grad_norm": 0.6503725043711072, "learning_rate": 2.6055440075042793e-06, "loss": 1.2696, "step": 750 }, { "epoch": 0.7939011566771819, "grad_norm": 0.6349030168998883, "learning_rate": 2.4831163017911687e-06, "loss": 1.2458, "step": 755 }, { "epoch": 0.7991587802313355, "grad_norm": 0.6296499888770365, "learning_rate": 2.3632256701088817e-06, "loss": 1.2581, "step": 760 }, { "epoch": 0.804416403785489, "grad_norm": 0.6477912073667947, "learning_rate": 2.2459125775517854e-06, "loss": 1.2614, "step": 765 }, { "epoch": 0.8096740273396424, "grad_norm": 0.6340147585135972, "learning_rate": 2.1312166192516593e-06, "loss": 1.2707, "step": 770 }, { "epoch": 0.814931650893796, "grad_norm": 0.6535117628009195, "learning_rate": 2.019176507013677e-06, "loss": 1.2586, 
"step": 775 }, { "epoch": 0.8201892744479495, "grad_norm": 0.6467399510691909, "learning_rate": 1.9098300562505266e-06, "loss": 1.2548, "step": 780 }, { "epoch": 0.825446898002103, "grad_norm": 0.6237075846859721, "learning_rate": 1.8032141732190722e-06, "loss": 1.26, "step": 785 }, { "epoch": 0.8307045215562566, "grad_norm": 0.6374095389974708, "learning_rate": 1.6993648425638797e-06, "loss": 1.2605, "step": 790 }, { "epoch": 0.8359621451104101, "grad_norm": 0.6326077447290254, "learning_rate": 1.5983171151717924e-06, "loss": 1.2701, "step": 795 }, { "epoch": 0.8412197686645636, "grad_norm": 0.6299244980807773, "learning_rate": 1.5001050963416718e-06, "loss": 1.2427, "step": 800 }, { "epoch": 0.8464773922187171, "grad_norm": 0.6281934013360307, "learning_rate": 1.404761934273291e-06, "loss": 1.2635, "step": 805 }, { "epoch": 0.8517350157728707, "grad_norm": 0.6374294536071755, "learning_rate": 1.3123198088792577e-06, "loss": 1.2717, "step": 810 }, { "epoch": 0.8569926393270242, "grad_norm": 0.6298699771941692, "learning_rate": 1.222809920923761e-06, "loss": 1.2499, "step": 815 }, { "epoch": 0.8622502628811777, "grad_norm": 0.6359419744147026, "learning_rate": 1.1362624814917843e-06, "loss": 1.2552, "step": 820 }, { "epoch": 0.8675078864353313, "grad_norm": 0.6285813274381151, "learning_rate": 1.0527067017923654e-06, "loss": 1.2737, "step": 825 }, { "epoch": 0.8727655099894848, "grad_norm": 0.6210713562517961, "learning_rate": 9.721707832993232e-07, "loss": 1.2663, "step": 830 }, { "epoch": 0.8780231335436383, "grad_norm": 0.6645416418213763, "learning_rate": 8.946819082327829e-07, "loss": 1.2581, "step": 835 }, { "epoch": 0.8832807570977917, "grad_norm": 0.6251751866944497, "learning_rate": 8.202662303847298e-07, "loss": 1.2519, "step": 840 }, { "epoch": 0.8885383806519453, "grad_norm": 0.6431790207111582, "learning_rate": 7.48948866291661e-07, "loss": 1.2614, "step": 845 }, { "epoch": 0.8937960042060988, "grad_norm": 0.6384407054635708, "learning_rate": 6.80753886757336e-07, "loss": 1.2722, "step": 850 }, { "epoch": 0.8990536277602523, "grad_norm": 0.631430226459749, "learning_rate": 6.157043087284797e-07, "loss": 1.2587, "step": 855 }, { "epoch": 0.9043112513144059, "grad_norm": 0.631843611779411, "learning_rate": 5.538220875261736e-07, "loss": 1.2481, "step": 860 }, { "epoch": 0.9095688748685594, "grad_norm": 0.6250494334708813, "learning_rate": 4.951281094355708e-07, "loss": 1.2552, "step": 865 }, { "epoch": 0.9148264984227129, "grad_norm": 0.6300532606909163, "learning_rate": 4.396421846564236e-07, "loss": 1.2536, "step": 870 }, { "epoch": 0.9200841219768665, "grad_norm": 0.6200012582073006, "learning_rate": 3.8738304061681107e-07, "loss": 1.2694, "step": 875 }, { "epoch": 0.92534174553102, "grad_norm": 0.6324403821172889, "learning_rate": 3.3836831565231877e-07, "loss": 1.2545, "step": 880 }, { "epoch": 0.9305993690851735, "grad_norm": 0.6325686488791483, "learning_rate": 2.926145530528002e-07, "loss": 1.2531, "step": 885 }, { "epoch": 0.935856992639327, "grad_norm": 0.6373731914514896, "learning_rate": 2.501371954787479e-07, "loss": 1.2648, "step": 890 }, { "epoch": 0.9411146161934806, "grad_norm": 0.6255593953506046, "learning_rate": 2.109505797491318e-07, "loss": 1.2392, "step": 895 }, { "epoch": 0.9463722397476341, "grad_norm": 0.6353907042408413, "learning_rate": 1.7506793200248507e-07, "loss": 1.2633, "step": 900 }, { "epoch": 0.9516298633017876, "grad_norm": 0.6307207707434458, "learning_rate": 1.4250136323285868e-07, "loss": 1.2603, "step": 905 }, { "epoch": 
0.9568874868559412, "grad_norm": 0.6277705175750871, "learning_rate": 1.1326186520215888e-07, "loss": 1.2453, "step": 910 }, { "epoch": 0.9621451104100947, "grad_norm": 0.6031775390358434, "learning_rate": 8.735930673024806e-08, "loss": 1.2544, "step": 915 }, { "epoch": 0.9674027339642481, "grad_norm": 0.6189714576376613, "learning_rate": 6.480243036404598e-08, "loss": 1.2661, "step": 920 }, { "epoch": 0.9726603575184016, "grad_norm": 0.6219922382476483, "learning_rate": 4.5598849426777833e-08, "loss": 1.2418, "step": 925 }, { "epoch": 0.9779179810725552, "grad_norm": 0.6188912441406357, "learning_rate": 2.9755045448351948e-08, "loss": 1.2471, "step": 930 }, { "epoch": 0.9831756046267087, "grad_norm": 0.6241442557644146, "learning_rate": 1.7276365977730858e-08, "loss": 1.2591, "step": 935 }, { "epoch": 0.9884332281808622, "grad_norm": 0.6148942917091785, "learning_rate": 8.167022778045042e-09, "loss": 1.2783, "step": 940 }, { "epoch": 0.9936908517350158, "grad_norm": 0.6142845224019251, "learning_rate": 2.430090405054486e-09, "loss": 1.2311, "step": 945 }, { "epoch": 0.9989484752891693, "grad_norm": 0.6318778333420784, "learning_rate": 6.750516943321295e-11, "loss": 1.2488, "step": 950 }, { "epoch": 1.0, "eval_loss": 1.266036868095398, "eval_runtime": 32.6481, "eval_samples_per_second": 412.213, "eval_steps_per_second": 6.463, "step": 951 }, { "epoch": 1.0, "step": 951, "total_flos": 101091718987776.0, "train_loss": 1.289618753484371, "train_runtime": 1098.6048, "train_samples_per_second": 110.718, "train_steps_per_second": 0.866 } ], "logging_steps": 5, "max_steps": 951, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 101091718987776.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }
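
The state above is a standard Trainer checkpoint log: `log_history` holds one record every `logging_steps` (5) training steps with `loss`, `learning_rate`, and `grad_norm`, plus a final evaluation record (`eval_loss`) and a run summary (`train_loss`, `total_flos`). A minimal sketch of how one might read it back and plot the loss and learning-rate curves is shown below; the file path `trainer_state.json` and output name `training_curves.png` are assumptions, and the only structural assumption is the field layout visible above (training records carry both `loss` and `learning_rate`, so filtering on those keys drops the eval and summary records).

```python
import json
import matplotlib.pyplot as plt

# Path is an assumption; point it at the saved trainer state.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step training records; the eval record uses "eval_loss"
# and the final summary uses "train_loss", so neither matches this filter.
train_log = [
    entry for entry in state["log_history"]
    if "loss" in entry and "learning_rate" in entry
]

steps = [entry["step"] for entry in train_log]
loss = [entry["loss"] for entry in train_log]
lr = [entry["learning_rate"] for entry in train_log]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, loss)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lr)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("step")
fig.tight_layout()
fig.savefig("training_curves.png")
```

With this state the plot should show the warmup-then-cosine-decay learning-rate schedule peaking near 2e-5 around step 100 and the loss falling from roughly 1.56 to about 1.25 over the single epoch (951 steps).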