{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.851523834331857, "eval_steps": 500, "global_step": 74500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03256056264652253, "grad_norm": 5.030904293060303, "learning_rate": 1.986975774941391e-05, "loss": 5.9746, "step": 500 }, { "epoch": 0.06512112529304506, "grad_norm": 3.0790352821350098, "learning_rate": 1.973951549882782e-05, "loss": 4.2176, "step": 1000 }, { "epoch": 0.0976816879395676, "grad_norm": 2.3053739070892334, "learning_rate": 1.9609273248241733e-05, "loss": 3.3847, "step": 1500 }, { "epoch": 0.13024225058609012, "grad_norm": 2.5033621788024902, "learning_rate": 1.9479030997655642e-05, "loss": 2.9223, "step": 2000 }, { "epoch": 0.16280281323261267, "grad_norm": 2.464855909347534, "learning_rate": 1.934878874706955e-05, "loss": 2.582, "step": 2500 }, { "epoch": 0.1953633758791352, "grad_norm": 2.3733980655670166, "learning_rate": 1.921854649648346e-05, "loss": 2.381, "step": 3000 }, { "epoch": 0.22792393852565773, "grad_norm": 2.560279130935669, "learning_rate": 1.908830424589737e-05, "loss": 2.2095, "step": 3500 }, { "epoch": 0.26048450117218025, "grad_norm": 2.146317958831787, "learning_rate": 1.895806199531128e-05, "loss": 2.0995, "step": 4000 }, { "epoch": 0.29304506381870277, "grad_norm": 2.359065294265747, "learning_rate": 1.8827819744725192e-05, "loss": 1.9948, "step": 4500 }, { "epoch": 0.32560562646522534, "grad_norm": 2.245957851409912, "learning_rate": 1.86975774941391e-05, "loss": 1.9036, "step": 5000 }, { "epoch": 0.35816618911174786, "grad_norm": 2.824934482574463, "learning_rate": 1.856733524355301e-05, "loss": 1.8212, "step": 5500 }, { "epoch": 0.3907267517582704, "grad_norm": 2.4427430629730225, "learning_rate": 1.843709299296692e-05, "loss": 1.7307, "step": 6000 }, { "epoch": 0.4232873144047929, "grad_norm": 2.3356220722198486, "learning_rate": 1.830685074238083e-05, "loss": 1.658, "step": 6500 }, { "epoch": 0.45584787705131546, "grad_norm": 2.7466249465942383, "learning_rate": 1.817660849179474e-05, "loss": 1.5993, "step": 7000 }, { "epoch": 0.488408439697838, "grad_norm": 2.31550669670105, "learning_rate": 1.8046366241208652e-05, "loss": 1.5493, "step": 7500 }, { "epoch": 0.5209690023443605, "grad_norm": 2.412864923477173, "learning_rate": 1.791612399062256e-05, "loss": 1.4979, "step": 8000 }, { "epoch": 0.553529564990883, "grad_norm": 2.5272300243377686, "learning_rate": 1.778588174003647e-05, "loss": 1.4487, "step": 8500 }, { "epoch": 0.5860901276374055, "grad_norm": 2.343013286590576, "learning_rate": 1.765563948945038e-05, "loss": 1.4119, "step": 9000 }, { "epoch": 0.618650690283928, "grad_norm": 2.6124706268310547, "learning_rate": 1.752539723886429e-05, "loss": 1.3896, "step": 9500 }, { "epoch": 0.6512112529304507, "grad_norm": 2.8961498737335205, "learning_rate": 1.73951549882782e-05, "loss": 1.3333, "step": 10000 }, { "epoch": 0.6837718155769732, "grad_norm": 2.8462820053100586, "learning_rate": 1.7264912737692108e-05, "loss": 1.3036, "step": 10500 }, { "epoch": 0.7163323782234957, "grad_norm": 2.2509639263153076, "learning_rate": 1.7134670487106017e-05, "loss": 1.2872, "step": 11000 }, { "epoch": 0.7488929408700182, "grad_norm": 2.3151662349700928, "learning_rate": 1.7004428236519926e-05, "loss": 1.2498, "step": 11500 }, { "epoch": 0.7814535035165407, "grad_norm": 2.587400197982788, "learning_rate": 1.687418598593384e-05, "loss": 1.2433, "step": 12000 }, { "epoch": 0.8140140661630633, "grad_norm": 2.7084901332855225, "learning_rate": 1.674394373534775e-05, "loss": 1.2189, "step": 12500 }, { "epoch": 0.8465746288095858, "grad_norm": 2.3007726669311523, "learning_rate": 1.6613701484761658e-05, "loss": 1.1927, "step": 13000 }, { "epoch": 0.8791351914561084, "grad_norm": 2.200362205505371, "learning_rate": 1.6483459234175567e-05, "loss": 1.1849, "step": 13500 }, { "epoch": 0.9116957541026309, "grad_norm": 2.2914557456970215, "learning_rate": 1.6353216983589476e-05, "loss": 1.1706, "step": 14000 }, { "epoch": 0.9442563167491534, "grad_norm": 2.357699155807495, "learning_rate": 1.6222974733003386e-05, "loss": 1.161, "step": 14500 }, { "epoch": 0.976816879395676, "grad_norm": 2.5686471462249756, "learning_rate": 1.60927324824173e-05, "loss": 1.1459, "step": 15000 }, { "epoch": 1.0093774420421986, "grad_norm": 2.511021375656128, "learning_rate": 1.5962490231831208e-05, "loss": 1.114, "step": 15500 }, { "epoch": 1.041938004688721, "grad_norm": 2.976020097732544, "learning_rate": 1.5832247981245117e-05, "loss": 1.0509, "step": 16000 }, { "epoch": 1.0744985673352436, "grad_norm": 2.2788777351379395, "learning_rate": 1.5702005730659026e-05, "loss": 1.0342, "step": 16500 }, { "epoch": 1.107059129981766, "grad_norm": 2.359161853790283, "learning_rate": 1.5571763480072936e-05, "loss": 1.0347, "step": 17000 }, { "epoch": 1.1396196926282887, "grad_norm": 2.8540244102478027, "learning_rate": 1.5441521229486845e-05, "loss": 1.0288, "step": 17500 }, { "epoch": 1.172180255274811, "grad_norm": 2.635509729385376, "learning_rate": 1.5311278978900758e-05, "loss": 1.0166, "step": 18000 }, { "epoch": 1.2047408179213337, "grad_norm": 2.5582518577575684, "learning_rate": 1.5181036728314667e-05, "loss": 1.0124, "step": 18500 }, { "epoch": 1.2373013805678563, "grad_norm": 2.1439788341522217, "learning_rate": 1.5050794477728576e-05, "loss": 1.0141, "step": 19000 }, { "epoch": 1.2698619432143787, "grad_norm": 2.3901960849761963, "learning_rate": 1.4920552227142486e-05, "loss": 1.0014, "step": 19500 }, { "epoch": 1.3024225058609014, "grad_norm": 2.6219823360443115, "learning_rate": 1.4790309976556397e-05, "loss": 1.0073, "step": 20000 }, { "epoch": 1.3349830685074238, "grad_norm": 2.7062482833862305, "learning_rate": 1.4660067725970306e-05, "loss": 0.9964, "step": 20500 }, { "epoch": 1.3675436311539464, "grad_norm": 2.4956464767456055, "learning_rate": 1.4529825475384215e-05, "loss": 0.9936, "step": 21000 }, { "epoch": 1.4001041938004688, "grad_norm": 2.357893228530884, "learning_rate": 1.4399583224798126e-05, "loss": 0.9904, "step": 21500 }, { "epoch": 1.4326647564469914, "grad_norm": 2.3728160858154297, "learning_rate": 1.4269340974212036e-05, "loss": 0.9798, "step": 22000 }, { "epoch": 1.465225319093514, "grad_norm": 2.1804134845733643, "learning_rate": 1.4139098723625945e-05, "loss": 0.9786, "step": 22500 }, { "epoch": 1.4977858817400365, "grad_norm": 2.3426220417022705, "learning_rate": 1.4008856473039856e-05, "loss": 0.9717, "step": 23000 }, { "epoch": 1.5303464443865589, "grad_norm": 2.6158998012542725, "learning_rate": 1.3878614222453765e-05, "loss": 0.969, "step": 23500 }, { "epoch": 1.5629070070330815, "grad_norm": 2.3006558418273926, "learning_rate": 1.3748371971867675e-05, "loss": 0.9655, "step": 24000 }, { "epoch": 1.5954675696796041, "grad_norm": 2.3054986000061035, "learning_rate": 1.3618129721281586e-05, "loss": 0.9576, "step": 24500 }, { "epoch": 1.6280281323261265, "grad_norm": 2.3399717807769775, "learning_rate": 1.3487887470695495e-05, "loss": 0.9522, "step": 25000 }, { "epoch": 1.6605886949726492, "grad_norm": 2.381333589553833, "learning_rate": 1.3357645220109406e-05, "loss": 0.963, "step": 25500 }, { "epoch": 1.6931492576191718, "grad_norm": 2.5838122367858887, "learning_rate": 1.3227402969523315e-05, "loss": 0.952, "step": 26000 }, { "epoch": 1.7257098202656942, "grad_norm": 2.398665428161621, "learning_rate": 1.3097160718937225e-05, "loss": 0.9482, "step": 26500 }, { "epoch": 1.7582703829122166, "grad_norm": 2.4087893962860107, "learning_rate": 1.2966918468351136e-05, "loss": 0.9436, "step": 27000 }, { "epoch": 1.7908309455587392, "grad_norm": 2.380199432373047, "learning_rate": 1.2836676217765045e-05, "loss": 0.9491, "step": 27500 }, { "epoch": 1.8233915082052619, "grad_norm": 2.5550014972686768, "learning_rate": 1.2706433967178954e-05, "loss": 0.9365, "step": 28000 }, { "epoch": 1.8559520708517843, "grad_norm": 2.352365493774414, "learning_rate": 1.2576191716592865e-05, "loss": 0.9314, "step": 28500 }, { "epoch": 1.888512633498307, "grad_norm": 2.1357262134552, "learning_rate": 1.2445949466006773e-05, "loss": 0.9287, "step": 29000 }, { "epoch": 1.9210731961448295, "grad_norm": 2.809288501739502, "learning_rate": 1.2315707215420682e-05, "loss": 0.9231, "step": 29500 }, { "epoch": 1.953633758791352, "grad_norm": 2.195413589477539, "learning_rate": 1.2185464964834592e-05, "loss": 0.9165, "step": 30000 }, { "epoch": 1.9861943214378743, "grad_norm": 2.4369585514068604, "learning_rate": 1.2055222714248503e-05, "loss": 0.9261, "step": 30500 }, { "epoch": 2.018754884084397, "grad_norm": 2.0791983604431152, "learning_rate": 1.1924980463662412e-05, "loss": 0.8401, "step": 31000 }, { "epoch": 2.0513154467309196, "grad_norm": 2.3653042316436768, "learning_rate": 1.1794738213076321e-05, "loss": 0.7797, "step": 31500 }, { "epoch": 2.083876009377442, "grad_norm": 2.7878382205963135, "learning_rate": 1.1664495962490232e-05, "loss": 0.7782, "step": 32000 }, { "epoch": 2.1164365720239644, "grad_norm": 2.4624345302581787, "learning_rate": 1.1534253711904142e-05, "loss": 0.7783, "step": 32500 }, { "epoch": 2.1489971346704873, "grad_norm": 2.4672300815582275, "learning_rate": 1.1404011461318051e-05, "loss": 0.7778, "step": 33000 }, { "epoch": 2.1815576973170097, "grad_norm": 2.6120986938476562, "learning_rate": 1.1273769210731962e-05, "loss": 0.7774, "step": 33500 }, { "epoch": 2.214118259963532, "grad_norm": 2.7739064693450928, "learning_rate": 1.1143526960145871e-05, "loss": 0.7817, "step": 34000 }, { "epoch": 2.246678822610055, "grad_norm": 2.5610642433166504, "learning_rate": 1.1013284709559782e-05, "loss": 0.7733, "step": 34500 }, { "epoch": 2.2792393852565773, "grad_norm": 2.655161142349243, "learning_rate": 1.0883042458973692e-05, "loss": 0.78, "step": 35000 }, { "epoch": 2.3117999479030997, "grad_norm": 2.468252182006836, "learning_rate": 1.0752800208387601e-05, "loss": 0.7799, "step": 35500 }, { "epoch": 2.344360510549622, "grad_norm": 2.766505718231201, "learning_rate": 1.0622557957801512e-05, "loss": 0.7743, "step": 36000 }, { "epoch": 2.376921073196145, "grad_norm": 3.1091792583465576, "learning_rate": 1.0492315707215421e-05, "loss": 0.7831, "step": 36500 }, { "epoch": 2.4094816358426674, "grad_norm": 2.9491870403289795, "learning_rate": 1.036207345662933e-05, "loss": 0.7766, "step": 37000 }, { "epoch": 2.44204219848919, "grad_norm": 2.8023264408111572, "learning_rate": 1.0231831206043242e-05, "loss": 0.7759, "step": 37500 }, { "epoch": 2.4746027611357126, "grad_norm": 2.604647636413574, "learning_rate": 1.0101588955457151e-05, "loss": 0.7778, "step": 38000 }, { "epoch": 2.507163323782235, "grad_norm": 2.879962205886841, "learning_rate": 9.97134670487106e-06, "loss": 0.7685, "step": 38500 }, { "epoch": 2.5397238864287575, "grad_norm": 3.1485841274261475, "learning_rate": 9.841104454284971e-06, "loss": 0.7758, "step": 39000 }, { "epoch": 2.57228444907528, "grad_norm": 2.426480293273926, "learning_rate": 9.71086220369888e-06, "loss": 0.7696, "step": 39500 }, { "epoch": 2.6048450117218027, "grad_norm": 2.696232318878174, "learning_rate": 9.58061995311279e-06, "loss": 0.7738, "step": 40000 }, { "epoch": 2.637405574368325, "grad_norm": 3.0641300678253174, "learning_rate": 9.450377702526701e-06, "loss": 0.7718, "step": 40500 }, { "epoch": 2.6699661370148475, "grad_norm": 2.822618246078491, "learning_rate": 9.32013545194061e-06, "loss": 0.7657, "step": 41000 }, { "epoch": 2.7025266996613704, "grad_norm": 3.1593356132507324, "learning_rate": 9.18989320135452e-06, "loss": 0.7718, "step": 41500 }, { "epoch": 2.735087262307893, "grad_norm": 2.6383330821990967, "learning_rate": 9.05965095076843e-06, "loss": 0.7693, "step": 42000 }, { "epoch": 2.767647824954415, "grad_norm": 2.7163684368133545, "learning_rate": 8.92940870018234e-06, "loss": 0.7648, "step": 42500 }, { "epoch": 2.8002083876009376, "grad_norm": 3.0254065990448, "learning_rate": 8.79916644959625e-06, "loss": 0.7609, "step": 43000 }, { "epoch": 2.83276895024746, "grad_norm": 3.440492630004883, "learning_rate": 8.668924199010159e-06, "loss": 0.7641, "step": 43500 }, { "epoch": 2.865329512893983, "grad_norm": 2.6121511459350586, "learning_rate": 8.53868194842407e-06, "loss": 0.7645, "step": 44000 }, { "epoch": 2.8978900755405053, "grad_norm": 2.865845203399658, "learning_rate": 8.40843969783798e-06, "loss": 0.7652, "step": 44500 }, { "epoch": 2.930450638187028, "grad_norm": 2.8584651947021484, "learning_rate": 8.278197447251888e-06, "loss": 0.7603, "step": 45000 }, { "epoch": 2.9630112008335505, "grad_norm": 2.286515235900879, "learning_rate": 8.1479551966658e-06, "loss": 0.7655, "step": 45500 }, { "epoch": 2.995571763480073, "grad_norm": 3.0863349437713623, "learning_rate": 8.017712946079709e-06, "loss": 0.7598, "step": 46000 }, { "epoch": 3.0281323261265953, "grad_norm": 2.7062647342681885, "learning_rate": 7.887470695493618e-06, "loss": 0.6164, "step": 46500 }, { "epoch": 3.060692888773118, "grad_norm": 3.3541259765625, "learning_rate": 7.75722844490753e-06, "loss": 0.5882, "step": 47000 }, { "epoch": 3.0932534514196406, "grad_norm": 3.511744260787964, "learning_rate": 7.6269861943214385e-06, "loss": 0.5884, "step": 47500 }, { "epoch": 3.125814014066163, "grad_norm": 3.1489553451538086, "learning_rate": 7.496743943735349e-06, "loss": 0.5837, "step": 48000 }, { "epoch": 3.1583745767126854, "grad_norm": 3.2325332164764404, "learning_rate": 7.366501693149258e-06, "loss": 0.5841, "step": 48500 }, { "epoch": 3.1909351393592083, "grad_norm": 3.4985926151275635, "learning_rate": 7.236259442563168e-06, "loss": 0.5847, "step": 49000 }, { "epoch": 3.2234957020057307, "grad_norm": 3.218742609024048, "learning_rate": 7.106017191977078e-06, "loss": 0.5868, "step": 49500 }, { "epoch": 3.256056264652253, "grad_norm": 3.2203478813171387, "learning_rate": 6.975774941390988e-06, "loss": 0.5883, "step": 50000 }, { "epoch": 3.288616827298776, "grad_norm": 3.2793335914611816, "learning_rate": 6.845532690804898e-06, "loss": 0.5876, "step": 50500 }, { "epoch": 3.3211773899452983, "grad_norm": 3.3763086795806885, "learning_rate": 6.715290440218808e-06, "loss": 0.5843, "step": 51000 }, { "epoch": 3.3537379525918207, "grad_norm": 3.314659833908081, "learning_rate": 6.585048189632718e-06, "loss": 0.5834, "step": 51500 }, { "epoch": 3.386298515238343, "grad_norm": 4.0635457038879395, "learning_rate": 6.4548059390466275e-06, "loss": 0.5839, "step": 52000 }, { "epoch": 3.418859077884866, "grad_norm": 3.561662197113037, "learning_rate": 6.324563688460537e-06, "loss": 0.586, "step": 52500 }, { "epoch": 3.4514196405313884, "grad_norm": 3.3345561027526855, "learning_rate": 6.194321437874446e-06, "loss": 0.5819, "step": 53000 }, { "epoch": 3.483980203177911, "grad_norm": 3.2945241928100586, "learning_rate": 6.064079187288356e-06, "loss": 0.5846, "step": 53500 }, { "epoch": 3.516540765824433, "grad_norm": 3.8004238605499268, "learning_rate": 5.9338369367022665e-06, "loss": 0.5847, "step": 54000 }, { "epoch": 3.549101328470956, "grad_norm": 3.7713723182678223, "learning_rate": 5.803594686116176e-06, "loss": 0.5846, "step": 54500 }, { "epoch": 3.5816618911174785, "grad_norm": 3.562333822250366, "learning_rate": 5.673352435530086e-06, "loss": 0.5849, "step": 55000 }, { "epoch": 3.6142224537640013, "grad_norm": 4.006633758544922, "learning_rate": 5.543110184943996e-06, "loss": 0.5847, "step": 55500 }, { "epoch": 3.6467830164105237, "grad_norm": 3.453509569168091, "learning_rate": 5.412867934357906e-06, "loss": 0.5825, "step": 56000 }, { "epoch": 3.679343579057046, "grad_norm": 3.36258864402771, "learning_rate": 5.282625683771816e-06, "loss": 0.5819, "step": 56500 }, { "epoch": 3.7119041417035685, "grad_norm": 3.6564488410949707, "learning_rate": 5.152383433185726e-06, "loss": 0.5809, "step": 57000 }, { "epoch": 3.744464704350091, "grad_norm": 3.977710485458374, "learning_rate": 5.022141182599636e-06, "loss": 0.5803, "step": 57500 }, { "epoch": 3.777025266996614, "grad_norm": 3.4889750480651855, "learning_rate": 4.891898932013545e-06, "loss": 0.5808, "step": 58000 }, { "epoch": 3.809585829643136, "grad_norm": 3.451753616333008, "learning_rate": 4.7616566814274556e-06, "loss": 0.5783, "step": 58500 }, { "epoch": 3.842146392289659, "grad_norm": 3.9667842388153076, "learning_rate": 4.631414430841366e-06, "loss": 0.578, "step": 59000 }, { "epoch": 3.8747069549361814, "grad_norm": 3.6356189250946045, "learning_rate": 4.501172180255275e-06, "loss": 0.5776, "step": 59500 }, { "epoch": 3.907267517582704, "grad_norm": 4.25313663482666, "learning_rate": 4.370929929669185e-06, "loss": 0.5775, "step": 60000 }, { "epoch": 3.9398280802292263, "grad_norm": 3.822178602218628, "learning_rate": 4.2406876790830946e-06, "loss": 0.5774, "step": 60500 }, { "epoch": 3.9723886428757487, "grad_norm": 3.882927179336548, "learning_rate": 4.110445428497005e-06, "loss": 0.5733, "step": 61000 }, { "epoch": 4.004949205522271, "grad_norm": 2.9648609161376953, "learning_rate": 3.980203177910915e-06, "loss": 0.553, "step": 61500 }, { "epoch": 4.037509768168794, "grad_norm": 3.1388580799102783, "learning_rate": 3.849960927324824e-06, "loss": 0.4113, "step": 62000 }, { "epoch": 4.070070330815317, "grad_norm": 3.7440290451049805, "learning_rate": 3.7197186767387344e-06, "loss": 0.4071, "step": 62500 }, { "epoch": 4.102630893461839, "grad_norm": 3.4993302822113037, "learning_rate": 3.589476426152644e-06, "loss": 0.4078, "step": 63000 }, { "epoch": 4.135191456108362, "grad_norm": 3.8999550342559814, "learning_rate": 3.4592341755665543e-06, "loss": 0.404, "step": 63500 }, { "epoch": 4.167752018754884, "grad_norm": 3.9213688373565674, "learning_rate": 3.328991924980464e-06, "loss": 0.4057, "step": 64000 }, { "epoch": 4.200312581401406, "grad_norm": 4.091826438903809, "learning_rate": 3.1987496743943734e-06, "loss": 0.4037, "step": 64500 }, { "epoch": 4.232873144047929, "grad_norm": 3.9140231609344482, "learning_rate": 3.0685074238082836e-06, "loss": 0.4053, "step": 65000 }, { "epoch": 4.265433706694452, "grad_norm": 4.0627760887146, "learning_rate": 2.9382651732221933e-06, "loss": 0.4029, "step": 65500 }, { "epoch": 4.2979942693409745, "grad_norm": 3.8601019382476807, "learning_rate": 2.8080229226361035e-06, "loss": 0.4005, "step": 66000 }, { "epoch": 4.330554831987497, "grad_norm": 3.769637107849121, "learning_rate": 2.6777806720500133e-06, "loss": 0.4001, "step": 66500 }, { "epoch": 4.363115394634019, "grad_norm": 4.234343528747559, "learning_rate": 2.547538421463923e-06, "loss": 0.4002, "step": 67000 }, { "epoch": 4.395675957280542, "grad_norm": 3.9124088287353516, "learning_rate": 2.417296170877833e-06, "loss": 0.4005, "step": 67500 }, { "epoch": 4.428236519927064, "grad_norm": 3.8314108848571777, "learning_rate": 2.2870539202917425e-06, "loss": 0.3993, "step": 68000 }, { "epoch": 4.4607970825735865, "grad_norm": 4.098474979400635, "learning_rate": 2.1568116697056527e-06, "loss": 0.3988, "step": 68500 }, { "epoch": 4.49335764522011, "grad_norm": 3.8353285789489746, "learning_rate": 2.0265694191195624e-06, "loss": 0.3987, "step": 69000 }, { "epoch": 4.525918207866632, "grad_norm": 3.7794976234436035, "learning_rate": 1.8963271685334724e-06, "loss": 0.3972, "step": 69500 }, { "epoch": 4.558478770513155, "grad_norm": 4.056552410125732, "learning_rate": 1.7660849179473824e-06, "loss": 0.3958, "step": 70000 }, { "epoch": 4.591039333159677, "grad_norm": 3.7579519748687744, "learning_rate": 1.6358426673612921e-06, "loss": 0.3955, "step": 70500 }, { "epoch": 4.6235998958061995, "grad_norm": 4.280270576477051, "learning_rate": 1.5056004167752019e-06, "loss": 0.3951, "step": 71000 }, { "epoch": 4.656160458452722, "grad_norm": 4.043455123901367, "learning_rate": 1.3753581661891118e-06, "loss": 0.3944, "step": 71500 }, { "epoch": 4.688721021099244, "grad_norm": 3.790985584259033, "learning_rate": 1.2451159156030216e-06, "loss": 0.395, "step": 72000 }, { "epoch": 4.721281583745768, "grad_norm": 3.877270460128784, "learning_rate": 1.1148736650169315e-06, "loss": 0.3916, "step": 72500 }, { "epoch": 4.75384214639229, "grad_norm": 4.055418491363525, "learning_rate": 9.846314144308415e-07, "loss": 0.3928, "step": 73000 }, { "epoch": 4.786402709038812, "grad_norm": 4.357405662536621, "learning_rate": 8.543891638447512e-07, "loss": 0.3911, "step": 73500 }, { "epoch": 4.818963271685335, "grad_norm": 3.596019983291626, "learning_rate": 7.241469132586612e-07, "loss": 0.3897, "step": 74000 }, { "epoch": 4.851523834331857, "grad_norm": 4.408013820648193, "learning_rate": 5.939046626725711e-07, "loss": 0.3887, "step": 74500 } ], "logging_steps": 500, "max_steps": 76780, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.199090369536721e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }