{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.851523834331857,
  "eval_steps": 500,
  "global_step": 74500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.03256056264652253, "grad_norm": 5.030904293060303, "learning_rate": 1.986975774941391e-05, "loss": 5.9746, "step": 500 },
    { "epoch": 0.06512112529304506, "grad_norm": 3.0790352821350098, "learning_rate": 1.973951549882782e-05, "loss": 4.2176, "step": 1000 },
    { "epoch": 0.0976816879395676, "grad_norm": 2.3053739070892334, "learning_rate": 1.9609273248241733e-05, "loss": 3.3847, "step": 1500 },
    { "epoch": 0.13024225058609012, "grad_norm": 2.5033621788024902, "learning_rate": 1.9479030997655642e-05, "loss": 2.9223, "step": 2000 },
    { "epoch": 0.16280281323261267, "grad_norm": 2.464855909347534, "learning_rate": 1.934878874706955e-05, "loss": 2.582, "step": 2500 },
    { "epoch": 0.1953633758791352, "grad_norm": 2.3733980655670166, "learning_rate": 1.921854649648346e-05, "loss": 2.381, "step": 3000 },
    { "epoch": 0.22792393852565773, "grad_norm": 2.560279130935669, "learning_rate": 1.908830424589737e-05, "loss": 2.2095, "step": 3500 },
    { "epoch": 0.26048450117218025, "grad_norm": 2.146317958831787, "learning_rate": 1.895806199531128e-05, "loss": 2.0995, "step": 4000 },
    { "epoch": 0.29304506381870277, "grad_norm": 2.359065294265747, "learning_rate": 1.8827819744725192e-05, "loss": 1.9948, "step": 4500 },
    { "epoch": 0.32560562646522534, "grad_norm": 2.245957851409912, "learning_rate": 1.86975774941391e-05, "loss": 1.9036, "step": 5000 },
    { "epoch": 0.35816618911174786, "grad_norm": 2.824934482574463, "learning_rate": 1.856733524355301e-05, "loss": 1.8212, "step": 5500 },
    { "epoch": 0.3907267517582704, "grad_norm": 2.4427430629730225, "learning_rate": 1.843709299296692e-05, "loss": 1.7307, "step": 6000 },
    { "epoch": 0.4232873144047929, "grad_norm": 2.3356220722198486, "learning_rate": 1.830685074238083e-05, "loss": 1.658, "step": 6500 },
    { "epoch": 0.45584787705131546, "grad_norm": 2.7466249465942383, "learning_rate": 1.817660849179474e-05, "loss": 1.5993, "step": 7000 },
    { "epoch": 0.488408439697838, "grad_norm": 2.31550669670105, "learning_rate": 1.8046366241208652e-05, "loss": 1.5493, "step": 7500 },
    { "epoch": 0.5209690023443605, "grad_norm": 2.412864923477173, "learning_rate": 1.791612399062256e-05, "loss": 1.4979, "step": 8000 },
    { "epoch": 0.553529564990883, "grad_norm": 2.5272300243377686, "learning_rate": 1.778588174003647e-05, "loss": 1.4487, "step": 8500 },
    { "epoch": 0.5860901276374055, "grad_norm": 2.343013286590576, "learning_rate": 1.765563948945038e-05, "loss": 1.4119, "step": 9000 },
    { "epoch": 0.618650690283928, "grad_norm": 2.6124706268310547, "learning_rate": 1.752539723886429e-05, "loss": 1.3896, "step": 9500 },
    { "epoch": 0.6512112529304507, "grad_norm": 2.8961498737335205, "learning_rate": 1.73951549882782e-05, "loss": 1.3333, "step": 10000 },
    { "epoch": 0.6837718155769732, "grad_norm": 2.8462820053100586, "learning_rate": 1.7264912737692108e-05, "loss": 1.3036, "step": 10500 },
    { "epoch": 0.7163323782234957, "grad_norm": 2.2509639263153076, "learning_rate": 1.7134670487106017e-05, "loss": 1.2872, "step": 11000 },
    { "epoch": 0.7488929408700182, "grad_norm": 2.3151662349700928, "learning_rate": 1.7004428236519926e-05, "loss": 1.2498, "step": 11500 },
    { "epoch": 0.7814535035165407, "grad_norm": 2.587400197982788, "learning_rate": 1.687418598593384e-05, "loss": 1.2433, "step": 12000 },
    { "epoch": 0.8140140661630633, "grad_norm": 2.7084901332855225, "learning_rate": 1.674394373534775e-05, "loss": 1.2189, "step": 12500 },
    { "epoch": 0.8465746288095858, "grad_norm": 2.3007726669311523, "learning_rate": 1.6613701484761658e-05, "loss": 1.1927, "step": 13000 },
    { "epoch": 0.8791351914561084, "grad_norm": 2.200362205505371, "learning_rate": 1.6483459234175567e-05, "loss": 1.1849, "step": 13500 },
    { "epoch": 0.9116957541026309, "grad_norm": 2.2914557456970215, "learning_rate": 1.6353216983589476e-05, "loss": 1.1706, "step": 14000 },
    { "epoch": 0.9442563167491534, "grad_norm": 2.357699155807495, "learning_rate": 1.6222974733003386e-05, "loss": 1.161, "step": 14500 },
    { "epoch": 0.976816879395676, "grad_norm": 2.5686471462249756, "learning_rate": 1.60927324824173e-05, "loss": 1.1459, "step": 15000 },
    { "epoch": 1.0093774420421986, "grad_norm": 2.511021375656128, "learning_rate": 1.5962490231831208e-05, "loss": 1.114, "step": 15500 },
    { "epoch": 1.041938004688721, "grad_norm": 2.976020097732544, "learning_rate": 1.5832247981245117e-05, "loss": 1.0509, "step": 16000 },
    { "epoch": 1.0744985673352436, "grad_norm": 2.2788777351379395, "learning_rate": 1.5702005730659026e-05, "loss": 1.0342, "step": 16500 },
    { "epoch": 1.107059129981766, "grad_norm": 2.359161853790283, "learning_rate": 1.5571763480072936e-05, "loss": 1.0347, "step": 17000 },
    { "epoch": 1.1396196926282887, "grad_norm": 2.8540244102478027, "learning_rate": 1.5441521229486845e-05, "loss": 1.0288, "step": 17500 },
    { "epoch": 1.172180255274811, "grad_norm": 2.635509729385376, "learning_rate": 1.5311278978900758e-05, "loss": 1.0166, "step": 18000 },
    { "epoch": 1.2047408179213337, "grad_norm": 2.5582518577575684, "learning_rate": 1.5181036728314667e-05, "loss": 1.0124, "step": 18500 },
    { "epoch": 1.2373013805678563, "grad_norm": 2.1439788341522217, "learning_rate": 1.5050794477728576e-05, "loss": 1.0141, "step": 19000 },
    { "epoch": 1.2698619432143787, "grad_norm": 2.3901960849761963, "learning_rate": 1.4920552227142486e-05, "loss": 1.0014, "step": 19500 },
    { "epoch": 1.3024225058609014, "grad_norm": 2.6219823360443115, "learning_rate": 1.4790309976556397e-05, "loss": 1.0073, "step": 20000 },
    { "epoch": 1.3349830685074238, "grad_norm": 2.7062482833862305, "learning_rate": 1.4660067725970306e-05, "loss": 0.9964, "step": 20500 },
    { "epoch": 1.3675436311539464, "grad_norm": 2.4956464767456055, "learning_rate": 1.4529825475384215e-05, "loss": 0.9936, "step": 21000 },
    { "epoch": 1.4001041938004688, "grad_norm": 2.357893228530884, "learning_rate": 1.4399583224798126e-05, "loss": 0.9904, "step": 21500 },
    { "epoch": 1.4326647564469914, "grad_norm": 2.3728160858154297, "learning_rate": 1.4269340974212036e-05, "loss": 0.9798, "step": 22000 },
    { "epoch": 1.465225319093514, "grad_norm": 2.1804134845733643, "learning_rate": 1.4139098723625945e-05, "loss": 0.9786, "step": 22500 },
    { "epoch": 1.4977858817400365, "grad_norm": 2.3426220417022705, "learning_rate": 1.4008856473039856e-05, "loss": 0.9717, "step": 23000 },
    { "epoch": 1.5303464443865589, "grad_norm": 2.6158998012542725, "learning_rate": 1.3878614222453765e-05, "loss": 0.969, "step": 23500 },
    { "epoch": 1.5629070070330815, "grad_norm": 2.3006558418273926, "learning_rate": 1.3748371971867675e-05, "loss": 0.9655, "step": 24000 },
    { "epoch": 1.5954675696796041, "grad_norm": 2.3054986000061035, "learning_rate": 1.3618129721281586e-05, "loss": 0.9576, "step": 24500 },
    { "epoch": 1.6280281323261265, "grad_norm": 2.3399717807769775, "learning_rate": 1.3487887470695495e-05, "loss": 0.9522, "step": 25000 },
    { "epoch": 1.6605886949726492, "grad_norm": 2.381333589553833, "learning_rate": 1.3357645220109406e-05, "loss": 0.963, "step": 25500 },
    { "epoch": 1.6931492576191718, "grad_norm": 2.5838122367858887, "learning_rate": 1.3227402969523315e-05, "loss": 0.952, "step": 26000 },
    { "epoch": 1.7257098202656942, "grad_norm": 2.398665428161621, "learning_rate": 1.3097160718937225e-05, "loss": 0.9482, "step": 26500 },
    { "epoch": 1.7582703829122166, "grad_norm": 2.4087893962860107, "learning_rate": 1.2966918468351136e-05, "loss": 0.9436, "step": 27000 },
    { "epoch": 1.7908309455587392, "grad_norm": 2.380199432373047, "learning_rate": 1.2836676217765045e-05, "loss": 0.9491, "step": 27500 },
    { "epoch": 1.8233915082052619, "grad_norm": 2.5550014972686768, "learning_rate": 1.2706433967178954e-05, "loss": 0.9365, "step": 28000 },
    { "epoch": 1.8559520708517843, "grad_norm": 2.352365493774414, "learning_rate": 1.2576191716592865e-05, "loss": 0.9314, "step": 28500 },
    { "epoch": 1.888512633498307, "grad_norm": 2.1357262134552, "learning_rate": 1.2445949466006773e-05, "loss": 0.9287, "step": 29000 },
    { "epoch": 1.9210731961448295, "grad_norm": 2.809288501739502, "learning_rate": 1.2315707215420682e-05, "loss": 0.9231, "step": 29500 },
    { "epoch": 1.953633758791352, "grad_norm": 2.195413589477539, "learning_rate": 1.2185464964834592e-05, "loss": 0.9165, "step": 30000 },
    { "epoch": 1.9861943214378743, "grad_norm": 2.4369585514068604, "learning_rate": 1.2055222714248503e-05, "loss": 0.9261, "step": 30500 },
    { "epoch": 2.018754884084397, "grad_norm": 2.0791983604431152, "learning_rate": 1.1924980463662412e-05, "loss": 0.8401, "step": 31000 },
    { "epoch": 2.0513154467309196, "grad_norm": 2.3653042316436768, "learning_rate": 1.1794738213076321e-05, "loss": 0.7797, "step": 31500 },
    { "epoch": 2.083876009377442, "grad_norm": 2.7878382205963135, "learning_rate": 1.1664495962490232e-05, "loss": 0.7782, "step": 32000 },
    { "epoch": 2.1164365720239644, "grad_norm": 2.4624345302581787, "learning_rate": 1.1534253711904142e-05, "loss": 0.7783, "step": 32500 },
    { "epoch": 2.1489971346704873, "grad_norm": 2.4672300815582275, "learning_rate": 1.1404011461318051e-05, "loss": 0.7778, "step": 33000 },
    { "epoch": 2.1815576973170097, "grad_norm": 2.6120986938476562, "learning_rate": 1.1273769210731962e-05, "loss": 0.7774, "step": 33500 },
    { "epoch": 2.214118259963532, "grad_norm": 2.7739064693450928, "learning_rate": 1.1143526960145871e-05, "loss": 0.7817, "step": 34000 },
    { "epoch": 2.246678822610055, "grad_norm": 2.5610642433166504, "learning_rate": 1.1013284709559782e-05, "loss": 0.7733, "step": 34500 },
    { "epoch": 2.2792393852565773, "grad_norm": 2.655161142349243, "learning_rate": 1.0883042458973692e-05, "loss": 0.78, "step": 35000 },
    { "epoch": 2.3117999479030997, "grad_norm": 2.468252182006836, "learning_rate": 1.0752800208387601e-05, "loss": 0.7799, "step": 35500 },
    { "epoch": 2.344360510549622, "grad_norm": 2.766505718231201, "learning_rate": 1.0622557957801512e-05, "loss": 0.7743, "step": 36000 },
    { "epoch": 2.376921073196145, "grad_norm": 3.1091792583465576, "learning_rate": 1.0492315707215421e-05, "loss": 0.7831, "step": 36500 },
    { "epoch": 2.4094816358426674, "grad_norm": 2.9491870403289795, "learning_rate": 1.036207345662933e-05, "loss": 0.7766, "step": 37000 },
    { "epoch": 2.44204219848919, "grad_norm": 2.8023264408111572, "learning_rate": 1.0231831206043242e-05, "loss": 0.7759, "step": 37500 },
    { "epoch": 2.4746027611357126, "grad_norm": 2.604647636413574, "learning_rate": 1.0101588955457151e-05, "loss": 0.7778, "step": 38000 },
    { "epoch": 2.507163323782235, "grad_norm": 2.879962205886841, "learning_rate": 9.97134670487106e-06, "loss": 0.7685, "step": 38500 },
    { "epoch": 2.5397238864287575, "grad_norm": 3.1485841274261475, "learning_rate": 9.841104454284971e-06, "loss": 0.7758, "step": 39000 },
    { "epoch": 2.57228444907528, "grad_norm": 2.426480293273926, "learning_rate": 9.71086220369888e-06, "loss": 0.7696, "step": 39500 },
    { "epoch": 2.6048450117218027, "grad_norm": 2.696232318878174, "learning_rate": 9.58061995311279e-06, "loss": 0.7738, "step": 40000 },
    { "epoch": 2.637405574368325, "grad_norm": 3.0641300678253174, "learning_rate": 9.450377702526701e-06, "loss": 0.7718, "step": 40500 },
    { "epoch": 2.6699661370148475, "grad_norm": 2.822618246078491, "learning_rate": 9.32013545194061e-06, "loss": 0.7657, "step": 41000 },
    { "epoch": 2.7025266996613704, "grad_norm": 3.1593356132507324, "learning_rate": 9.18989320135452e-06, "loss": 0.7718, "step": 41500 },
    { "epoch": 2.735087262307893, "grad_norm": 2.6383330821990967, "learning_rate": 9.05965095076843e-06, "loss": 0.7693, "step": 42000 },
    { "epoch": 2.767647824954415, "grad_norm": 2.7163684368133545, "learning_rate": 8.92940870018234e-06, "loss": 0.7648, "step": 42500 },
    { "epoch": 2.8002083876009376, "grad_norm": 3.0254065990448, "learning_rate": 8.79916644959625e-06, "loss": 0.7609, "step": 43000 },
    { "epoch": 2.83276895024746, "grad_norm": 3.440492630004883, "learning_rate": 8.668924199010159e-06, "loss": 0.7641, "step": 43500 },
    { "epoch": 2.865329512893983, "grad_norm": 2.6121511459350586, "learning_rate": 8.53868194842407e-06, "loss": 0.7645, "step": 44000 },
    { "epoch": 2.8978900755405053, "grad_norm": 2.865845203399658, "learning_rate": 8.40843969783798e-06, "loss": 0.7652, "step": 44500 },
    { "epoch": 2.930450638187028, "grad_norm": 2.8584651947021484, "learning_rate": 8.278197447251888e-06, "loss": 0.7603, "step": 45000 },
    { "epoch": 2.9630112008335505, "grad_norm": 2.286515235900879, "learning_rate": 8.1479551966658e-06, "loss": 0.7655, "step": 45500 },
    { "epoch": 2.995571763480073, "grad_norm": 3.0863349437713623, "learning_rate": 8.017712946079709e-06, "loss": 0.7598, "step": 46000 },
    { "epoch": 3.0281323261265953, "grad_norm": 2.7062647342681885, "learning_rate": 7.887470695493618e-06, "loss": 0.6164, "step": 46500 },
    { "epoch": 3.060692888773118, "grad_norm": 3.3541259765625, "learning_rate": 7.75722844490753e-06, "loss": 0.5882, "step": 47000 },
    { "epoch": 3.0932534514196406, "grad_norm": 3.511744260787964, "learning_rate": 7.6269861943214385e-06, "loss": 0.5884, "step": 47500 },
    { "epoch": 3.125814014066163, "grad_norm": 3.1489553451538086, "learning_rate": 7.496743943735349e-06, "loss": 0.5837, "step": 48000 },
    { "epoch": 3.1583745767126854, "grad_norm": 3.2325332164764404, "learning_rate": 7.366501693149258e-06, "loss": 0.5841, "step": 48500 },
    { "epoch": 3.1909351393592083, "grad_norm": 3.4985926151275635, "learning_rate": 7.236259442563168e-06, "loss": 0.5847, "step": 49000 },
    { "epoch": 3.2234957020057307, "grad_norm": 3.218742609024048, "learning_rate": 7.106017191977078e-06, "loss": 0.5868, "step": 49500 },
    { "epoch": 3.256056264652253, "grad_norm": 3.2203478813171387, "learning_rate": 6.975774941390988e-06, "loss": 0.5883, "step": 50000 },
    { "epoch": 3.288616827298776, "grad_norm": 3.2793335914611816, "learning_rate": 6.845532690804898e-06, "loss": 0.5876, "step": 50500 },
    { "epoch": 3.3211773899452983, "grad_norm": 3.3763086795806885, "learning_rate": 6.715290440218808e-06, "loss": 0.5843, "step": 51000 },
    { "epoch": 3.3537379525918207, "grad_norm": 3.314659833908081, "learning_rate": 6.585048189632718e-06, "loss": 0.5834, "step": 51500 },
    { "epoch": 3.386298515238343, "grad_norm": 4.0635457038879395, "learning_rate": 6.4548059390466275e-06, "loss": 0.5839, "step": 52000 },
    { "epoch": 3.418859077884866, "grad_norm": 3.561662197113037, "learning_rate": 6.324563688460537e-06, "loss": 0.586, "step": 52500 },
    { "epoch": 3.4514196405313884, "grad_norm": 3.3345561027526855, "learning_rate": 6.194321437874446e-06, "loss": 0.5819, "step": 53000 },
    { "epoch": 3.483980203177911, "grad_norm": 3.2945241928100586, "learning_rate": 6.064079187288356e-06, "loss": 0.5846, "step": 53500 },
    { "epoch": 3.516540765824433, "grad_norm": 3.8004238605499268, "learning_rate": 5.9338369367022665e-06, "loss": 0.5847, "step": 54000 },
    { "epoch": 3.549101328470956, "grad_norm": 3.7713723182678223, "learning_rate": 5.803594686116176e-06, "loss": 0.5846, "step": 54500 },
    { "epoch": 3.5816618911174785, "grad_norm": 3.562333822250366, "learning_rate": 5.673352435530086e-06, "loss": 0.5849, "step": 55000 },
    { "epoch": 3.6142224537640013, "grad_norm": 4.006633758544922, "learning_rate": 5.543110184943996e-06, "loss": 0.5847, "step": 55500 },
    { "epoch": 3.6467830164105237, "grad_norm": 3.453509569168091, "learning_rate": 5.412867934357906e-06, "loss": 0.5825, "step": 56000 },
    { "epoch": 3.679343579057046, "grad_norm": 3.36258864402771, "learning_rate": 5.282625683771816e-06, "loss": 0.5819, "step": 56500 },
    { "epoch": 3.7119041417035685, "grad_norm": 3.6564488410949707, "learning_rate": 5.152383433185726e-06, "loss": 0.5809, "step": 57000 },
    { "epoch": 3.744464704350091, "grad_norm": 3.977710485458374, "learning_rate": 5.022141182599636e-06, "loss": 0.5803, "step": 57500 },
    { "epoch": 3.777025266996614, "grad_norm": 3.4889750480651855, "learning_rate": 4.891898932013545e-06, "loss": 0.5808, "step": 58000 },
    { "epoch": 3.809585829643136, "grad_norm": 3.451753616333008, "learning_rate": 4.7616566814274556e-06, "loss": 0.5783, "step": 58500 },
    { "epoch": 3.842146392289659, "grad_norm": 3.9667842388153076, "learning_rate": 4.631414430841366e-06, "loss": 0.578, "step": 59000 },
    { "epoch": 3.8747069549361814, "grad_norm": 3.6356189250946045, "learning_rate": 4.501172180255275e-06, "loss": 0.5776, "step": 59500 },
    { "epoch": 3.907267517582704, "grad_norm": 4.25313663482666, "learning_rate": 4.370929929669185e-06, "loss": 0.5775, "step": 60000 },
    { "epoch": 3.9398280802292263, "grad_norm": 3.822178602218628, "learning_rate": 4.2406876790830946e-06, "loss": 0.5774, "step": 60500 },
    { "epoch": 3.9723886428757487, "grad_norm": 3.882927179336548, "learning_rate": 4.110445428497005e-06, "loss": 0.5733, "step": 61000 },
    { "epoch": 4.004949205522271, "grad_norm": 2.9648609161376953, "learning_rate": 3.980203177910915e-06, "loss": 0.553, "step": 61500 },
    { "epoch": 4.037509768168794, "grad_norm": 3.1388580799102783, "learning_rate": 3.849960927324824e-06, "loss": 0.4113, "step": 62000 },
    { "epoch": 4.070070330815317, "grad_norm": 3.7440290451049805, "learning_rate": 3.7197186767387344e-06, "loss": 0.4071, "step": 62500 },
    { "epoch": 4.102630893461839, "grad_norm": 3.4993302822113037, "learning_rate": 3.589476426152644e-06, "loss": 0.4078, "step": 63000 },
    { "epoch": 4.135191456108362, "grad_norm": 3.8999550342559814, "learning_rate": 3.4592341755665543e-06, "loss": 0.404, "step": 63500 },
    { "epoch": 4.167752018754884, "grad_norm": 3.9213688373565674, "learning_rate": 3.328991924980464e-06, "loss": 0.4057, "step": 64000 },
    { "epoch": 4.200312581401406, "grad_norm": 4.091826438903809, "learning_rate": 3.1987496743943734e-06, "loss": 0.4037, "step": 64500 },
    { "epoch": 4.232873144047929, "grad_norm": 3.9140231609344482, "learning_rate": 3.0685074238082836e-06, "loss": 0.4053, "step": 65000 },
    { "epoch": 4.265433706694452, "grad_norm": 4.0627760887146, "learning_rate": 2.9382651732221933e-06, "loss": 0.4029, "step": 65500 },
    { "epoch": 4.2979942693409745, "grad_norm": 3.8601019382476807, "learning_rate": 2.8080229226361035e-06, "loss": 0.4005, "step": 66000 },
    { "epoch": 4.330554831987497, "grad_norm": 3.769637107849121, "learning_rate": 2.6777806720500133e-06, "loss": 0.4001, "step": 66500 },
    { "epoch": 4.363115394634019, "grad_norm": 4.234343528747559, "learning_rate": 2.547538421463923e-06, "loss": 0.4002, "step": 67000 },
    { "epoch": 4.395675957280542, "grad_norm": 3.9124088287353516, "learning_rate": 2.417296170877833e-06, "loss": 0.4005, "step": 67500 },
    { "epoch": 4.428236519927064, "grad_norm": 3.8314108848571777, "learning_rate": 2.2870539202917425e-06, "loss": 0.3993, "step": 68000 },
    { "epoch": 4.4607970825735865, "grad_norm": 4.098474979400635, "learning_rate": 2.1568116697056527e-06, "loss": 0.3988, "step": 68500 },
    { "epoch": 4.49335764522011, "grad_norm": 3.8353285789489746, "learning_rate": 2.0265694191195624e-06, "loss": 0.3987, "step": 69000 },
    { "epoch": 4.525918207866632, "grad_norm": 3.7794976234436035, "learning_rate": 1.8963271685334724e-06, "loss": 0.3972, "step": 69500 },
    { "epoch": 4.558478770513155, "grad_norm": 4.056552410125732, "learning_rate": 1.7660849179473824e-06, "loss": 0.3958, "step": 70000 },
    { "epoch": 4.591039333159677, "grad_norm": 3.7579519748687744, "learning_rate": 1.6358426673612921e-06, "loss": 0.3955, "step": 70500 },
    { "epoch": 4.6235998958061995, "grad_norm": 4.280270576477051, "learning_rate": 1.5056004167752019e-06, "loss": 0.3951, "step": 71000 },
    { "epoch": 4.656160458452722, "grad_norm": 4.043455123901367, "learning_rate": 1.3753581661891118e-06, "loss": 0.3944, "step": 71500 },
    { "epoch": 4.688721021099244, "grad_norm": 3.790985584259033, "learning_rate": 1.2451159156030216e-06, "loss": 0.395, "step": 72000 },
    { "epoch": 4.721281583745768, "grad_norm": 3.877270460128784, "learning_rate": 1.1148736650169315e-06, "loss": 0.3916, "step": 72500 },
    { "epoch": 4.75384214639229, "grad_norm": 4.055418491363525, "learning_rate": 9.846314144308415e-07, "loss": 0.3928, "step": 73000 },
    { "epoch": 4.786402709038812, "grad_norm": 4.357405662536621, "learning_rate": 8.543891638447512e-07, "loss": 0.3911, "step": 73500 },
    { "epoch": 4.818963271685335, "grad_norm": 3.596019983291626, "learning_rate": 7.241469132586612e-07, "loss": 0.3897, "step": 74000 },
    { "epoch": 4.851523834331857, "grad_norm": 4.408013820648193, "learning_rate": 5.939046626725711e-07, "loss": 0.3887, "step": 74500 }
  ],
  "logging_steps": 500,
  "max_steps": 76780,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.199090369536721e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}