|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0007067137809187, |
|
"eval_steps": 332, |
|
"global_step": 1327, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0007538280329799764, |
|
"grad_norm": 16.65915870666504, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9643, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0015076560659599528, |
|
"grad_norm": 18.07321548461914, |
|
"learning_rate": 4e-05, |
|
"loss": 0.99, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0022614840989399294, |
|
"grad_norm": 20.50757598876953, |
|
"learning_rate": 6e-05, |
|
"loss": 0.9625, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0030153121319199056, |
|
"grad_norm": 16.542922973632812, |
|
"learning_rate": 8e-05, |
|
"loss": 0.9352, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.003769140164899882, |
|
"grad_norm": 16.435678482055664, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9104, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.004522968197879859, |
|
"grad_norm": 17.327836990356445, |
|
"learning_rate": 0.00012, |
|
"loss": 0.8723, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.005276796230859835, |
|
"grad_norm": 15.497602462768555, |
|
"learning_rate": 0.00014, |
|
"loss": 0.7953, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.006030624263839811, |
|
"grad_norm": 16.634872436523438, |
|
"learning_rate": 0.00016, |
|
"loss": 0.7444, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.006784452296819788, |
|
"grad_norm": 18.223051071166992, |
|
"learning_rate": 0.00018, |
|
"loss": 0.7296, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.007538280329799764, |
|
"grad_norm": 12.145986557006836, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6559, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008292108362779741, |
|
"grad_norm": 12.613237380981445, |
|
"learning_rate": 0.00019999971548969982, |
|
"loss": 0.6663, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.009045936395759718, |
|
"grad_norm": 12.572402954101562, |
|
"learning_rate": 0.0001999988619604182, |
|
"loss": 0.6575, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.009799764428739694, |
|
"grad_norm": 10.410853385925293, |
|
"learning_rate": 0.00019999743941701188, |
|
"loss": 0.6172, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.01055359246171967, |
|
"grad_norm": 9.993011474609375, |
|
"learning_rate": 0.00019999544786757545, |
|
"loss": 0.6205, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.011307420494699646, |
|
"grad_norm": 10.803098678588867, |
|
"learning_rate": 0.00019999288732344122, |
|
"loss": 0.5932, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.012061248527679622, |
|
"grad_norm": 10.612732887268066, |
|
"learning_rate": 0.0001999897577991792, |
|
"loss": 0.5989, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0128150765606596, |
|
"grad_norm": 10.924768447875977, |
|
"learning_rate": 0.0001999860593125971, |
|
"loss": 0.5946, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.013568904593639576, |
|
"grad_norm": 9.804558753967285, |
|
"learning_rate": 0.00019998179188473997, |
|
"loss": 0.5321, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.014322732626619553, |
|
"grad_norm": 10.648846626281738, |
|
"learning_rate": 0.00019997695553989042, |
|
"loss": 0.5584, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.015076560659599529, |
|
"grad_norm": 10.692992210388184, |
|
"learning_rate": 0.00019997155030556822, |
|
"loss": 0.5603, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.015830388692579505, |
|
"grad_norm": 10.715287208557129, |
|
"learning_rate": 0.00019996557621253027, |
|
"loss": 0.584, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.016584216725559483, |
|
"grad_norm": 9.866957664489746, |
|
"learning_rate": 0.0001999590332947704, |
|
"loss": 0.5571, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.017338044758539457, |
|
"grad_norm": 9.68693733215332, |
|
"learning_rate": 0.00019995192158951919, |
|
"loss": 0.5415, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.018091872791519435, |
|
"grad_norm": 10.831818580627441, |
|
"learning_rate": 0.00019994424113724363, |
|
"loss": 0.5458, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.01884570082449941, |
|
"grad_norm": 9.90411376953125, |
|
"learning_rate": 0.00019993599198164715, |
|
"loss": 0.5368, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.019599528857479388, |
|
"grad_norm": 8.305344581604004, |
|
"learning_rate": 0.0001999271741696691, |
|
"loss": 0.5271, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.020353356890459365, |
|
"grad_norm": 9.10693645477295, |
|
"learning_rate": 0.00019991778775148465, |
|
"loss": 0.5038, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.02110718492343934, |
|
"grad_norm": 8.622981071472168, |
|
"learning_rate": 0.00019990783278050448, |
|
"loss": 0.5039, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.021861012956419318, |
|
"grad_norm": 10.21834945678711, |
|
"learning_rate": 0.0001998973093133744, |
|
"loss": 0.5499, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.022614840989399292, |
|
"grad_norm": 10.313283920288086, |
|
"learning_rate": 0.00019988621740997512, |
|
"loss": 0.5042, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02336866902237927, |
|
"grad_norm": 8.889609336853027, |
|
"learning_rate": 0.00019987455713342187, |
|
"loss": 0.467, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.024122497055359245, |
|
"grad_norm": 8.749794006347656, |
|
"learning_rate": 0.000199862328550064, |
|
"loss": 0.5357, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.024876325088339223, |
|
"grad_norm": 8.97386360168457, |
|
"learning_rate": 0.00019984953172948465, |
|
"loss": 0.495, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0256301531213192, |
|
"grad_norm": 9.057605743408203, |
|
"learning_rate": 0.0001998361667445004, |
|
"loss": 0.487, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.026383981154299175, |
|
"grad_norm": 9.346535682678223, |
|
"learning_rate": 0.00019982223367116076, |
|
"loss": 0.5348, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.027137809187279153, |
|
"grad_norm": 10.248679161071777, |
|
"learning_rate": 0.00019980773258874778, |
|
"loss": 0.5234, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.027891637220259127, |
|
"grad_norm": 9.637868881225586, |
|
"learning_rate": 0.00019979266357977564, |
|
"loss": 0.5, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.028645465253239105, |
|
"grad_norm": 10.087867736816406, |
|
"learning_rate": 0.00019977702672999007, |
|
"loss": 0.4772, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.02939929328621908, |
|
"grad_norm": 11.312880516052246, |
|
"learning_rate": 0.00019976082212836793, |
|
"loss": 0.4689, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.030153121319199058, |
|
"grad_norm": 10.71940803527832, |
|
"learning_rate": 0.0001997440498671168, |
|
"loss": 0.4961, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.030906949352179035, |
|
"grad_norm": 10.881596565246582, |
|
"learning_rate": 0.00019972671004167433, |
|
"loss": 0.5628, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.03166077738515901, |
|
"grad_norm": 11.740187644958496, |
|
"learning_rate": 0.00019970880275070762, |
|
"loss": 0.5096, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.03241460541813899, |
|
"grad_norm": 12.402807235717773, |
|
"learning_rate": 0.00019969032809611287, |
|
"loss": 0.6114, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.033168433451118966, |
|
"grad_norm": 13.945646286010742, |
|
"learning_rate": 0.0001996712861830147, |
|
"loss": 0.6009, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.03392226148409894, |
|
"grad_norm": 14.720582962036133, |
|
"learning_rate": 0.00019965167711976552, |
|
"loss": 0.5767, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.034676089517078915, |
|
"grad_norm": 15.46834945678711, |
|
"learning_rate": 0.0001996315010179449, |
|
"loss": 0.5566, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.03542991755005889, |
|
"grad_norm": 16.198299407958984, |
|
"learning_rate": 0.00019961075799235903, |
|
"loss": 0.6248, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.03618374558303887, |
|
"grad_norm": 36.07707214355469, |
|
"learning_rate": 0.00019958944816104, |
|
"loss": 0.5949, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.03693757361601885, |
|
"grad_norm": 25.19093894958496, |
|
"learning_rate": 0.00019956757164524516, |
|
"loss": 0.5619, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.03769140164899882, |
|
"grad_norm": 20.562816619873047, |
|
"learning_rate": 0.00019954512856945632, |
|
"loss": 0.662, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0384452296819788, |
|
"grad_norm": 40.9443359375, |
|
"learning_rate": 0.00019952211906137932, |
|
"loss": 0.8705, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.039199057714958775, |
|
"grad_norm": 29.929576873779297, |
|
"learning_rate": 0.00019949854325194294, |
|
"loss": 0.7504, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.03995288574793875, |
|
"grad_norm": 18.468313217163086, |
|
"learning_rate": 0.00019947440127529836, |
|
"loss": 0.7158, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.04070671378091873, |
|
"grad_norm": 9.704629898071289, |
|
"learning_rate": 0.00019944969326881845, |
|
"loss": 0.5938, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0414605418138987, |
|
"grad_norm": 9.891565322875977, |
|
"learning_rate": 0.00019942441937309684, |
|
"loss": 0.5693, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04221436984687868, |
|
"grad_norm": 11.08341121673584, |
|
"learning_rate": 0.00019939857973194717, |
|
"loss": 0.5726, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.04296819787985866, |
|
"grad_norm": 10.182625770568848, |
|
"learning_rate": 0.0001993721744924024, |
|
"loss": 0.5854, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.043722025912838636, |
|
"grad_norm": 10.336113929748535, |
|
"learning_rate": 0.00019934520380471372, |
|
"loss": 0.5341, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.04447585394581861, |
|
"grad_norm": 7.881448745727539, |
|
"learning_rate": 0.0001993176678223499, |
|
"loss": 0.5013, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.045229681978798585, |
|
"grad_norm": 10.487141609191895, |
|
"learning_rate": 0.0001992895667019964, |
|
"loss": 0.5395, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04598351001177856, |
|
"grad_norm": 10.38466739654541, |
|
"learning_rate": 0.0001992609006035543, |
|
"loss": 0.5214, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.04673733804475854, |
|
"grad_norm": 9.553030014038086, |
|
"learning_rate": 0.0001992316696901397, |
|
"loss": 0.4825, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.04749116607773852, |
|
"grad_norm": 8.298136711120605, |
|
"learning_rate": 0.00019920187412808248, |
|
"loss": 0.4797, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.04824499411071849, |
|
"grad_norm": 7.880730628967285, |
|
"learning_rate": 0.0001991715140869255, |
|
"loss": 0.4722, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.04899882214369847, |
|
"grad_norm": 8.825959205627441, |
|
"learning_rate": 0.00019914058973942368, |
|
"loss": 0.5022, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.049752650176678445, |
|
"grad_norm": 8.143362998962402, |
|
"learning_rate": 0.00019910910126154293, |
|
"loss": 0.4975, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.05050647820965842, |
|
"grad_norm": 7.704590797424316, |
|
"learning_rate": 0.00019907704883245916, |
|
"loss": 0.4973, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0512603062426384, |
|
"grad_norm": 7.914122104644775, |
|
"learning_rate": 0.00019904443263455728, |
|
"loss": 0.5046, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.05201413427561837, |
|
"grad_norm": 8.946449279785156, |
|
"learning_rate": 0.00019901125285343022, |
|
"loss": 0.5124, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.05276796230859835, |
|
"grad_norm": 7.793578147888184, |
|
"learning_rate": 0.0001989775096778777, |
|
"loss": 0.5141, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05352179034157833, |
|
"grad_norm": 7.8742756843566895, |
|
"learning_rate": 0.0001989432032999054, |
|
"loss": 0.5071, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.054275618374558306, |
|
"grad_norm": 8.052972793579102, |
|
"learning_rate": 0.0001989083339147237, |
|
"loss": 0.4938, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.055029446407538284, |
|
"grad_norm": 7.386316776275635, |
|
"learning_rate": 0.0001988729017207465, |
|
"loss": 0.4767, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.055783274440518255, |
|
"grad_norm": 7.526272773742676, |
|
"learning_rate": 0.00019883690691959035, |
|
"loss": 0.4642, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.05653710247349823, |
|
"grad_norm": 8.339061737060547, |
|
"learning_rate": 0.00019880034971607308, |
|
"loss": 0.4888, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.05729093050647821, |
|
"grad_norm": 8.045515060424805, |
|
"learning_rate": 0.00019876323031821266, |
|
"loss": 0.4478, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.05804475853945819, |
|
"grad_norm": 8.333029747009277, |
|
"learning_rate": 0.00019872554893722618, |
|
"loss": 0.4695, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.05879858657243816, |
|
"grad_norm": 8.050617218017578, |
|
"learning_rate": 0.0001986873057875284, |
|
"loss": 0.4532, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.05955241460541814, |
|
"grad_norm": 8.27062702178955, |
|
"learning_rate": 0.00019864850108673073, |
|
"loss": 0.4654, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.060306242638398115, |
|
"grad_norm": 8.429513931274414, |
|
"learning_rate": 0.0001986091350556399, |
|
"loss": 0.4829, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06106007067137809, |
|
"grad_norm": 8.401616096496582, |
|
"learning_rate": 0.00019856920791825683, |
|
"loss": 0.5086, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.06181389870435807, |
|
"grad_norm": 8.308648109436035, |
|
"learning_rate": 0.00019852871990177503, |
|
"loss": 0.4758, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.06256772673733804, |
|
"grad_norm": 8.516093254089355, |
|
"learning_rate": 0.00019848767123657976, |
|
"loss": 0.4423, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.06332155477031802, |
|
"grad_norm": 8.437211990356445, |
|
"learning_rate": 0.0001984460621562463, |
|
"loss": 0.4429, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.064075382803298, |
|
"grad_norm": 8.637296676635742, |
|
"learning_rate": 0.00019840389289753896, |
|
"loss": 0.4956, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.06482921083627798, |
|
"grad_norm": 8.39278507232666, |
|
"learning_rate": 0.00019836116370040944, |
|
"loss": 0.4483, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.06558303886925795, |
|
"grad_norm": 9.617965698242188, |
|
"learning_rate": 0.00019831787480799568, |
|
"loss": 0.4714, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.06633686690223793, |
|
"grad_norm": 8.52342700958252, |
|
"learning_rate": 0.00019827402646662047, |
|
"loss": 0.4375, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.06709069493521791, |
|
"grad_norm": 9.882357597351074, |
|
"learning_rate": 0.0001982296189257898, |
|
"loss": 0.4796, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.06784452296819787, |
|
"grad_norm": 9.361654281616211, |
|
"learning_rate": 0.00019818465243819184, |
|
"loss": 0.4871, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06859835100117785, |
|
"grad_norm": 9.959556579589844, |
|
"learning_rate": 0.00019813912725969509, |
|
"loss": 0.472, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.06935217903415783, |
|
"grad_norm": 9.579131126403809, |
|
"learning_rate": 0.0001980930436493472, |
|
"loss": 0.4906, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.07010600706713781, |
|
"grad_norm": 10.082910537719727, |
|
"learning_rate": 0.00019804640186937343, |
|
"loss": 0.537, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.07085983510011779, |
|
"grad_norm": 10.720930099487305, |
|
"learning_rate": 0.0001979992021851751, |
|
"loss": 0.5277, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.07161366313309776, |
|
"grad_norm": 10.86539363861084, |
|
"learning_rate": 0.00019795144486532814, |
|
"loss": 0.5511, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.07236749116607774, |
|
"grad_norm": 13.410208702087402, |
|
"learning_rate": 0.00019790313018158156, |
|
"loss": 0.5658, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.07312131919905772, |
|
"grad_norm": 14.898797988891602, |
|
"learning_rate": 0.0001978542584088558, |
|
"loss": 0.5529, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.0738751472320377, |
|
"grad_norm": 14.036768913269043, |
|
"learning_rate": 0.00019780482982524142, |
|
"loss": 0.5396, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.07462897526501767, |
|
"grad_norm": 15.9882173538208, |
|
"learning_rate": 0.00019775484471199715, |
|
"loss": 0.514, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.07538280329799764, |
|
"grad_norm": 17.01093864440918, |
|
"learning_rate": 0.0001977043033535486, |
|
"loss": 0.5262, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07613663133097762, |
|
"grad_norm": 41.135196685791016, |
|
"learning_rate": 0.00019765320603748655, |
|
"loss": 0.7909, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.0768904593639576, |
|
"grad_norm": 25.291397094726562, |
|
"learning_rate": 0.0001976015530545652, |
|
"loss": 0.714, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.07764428739693757, |
|
"grad_norm": 12.169105529785156, |
|
"learning_rate": 0.0001975493446987007, |
|
"loss": 0.5999, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.07839811542991755, |
|
"grad_norm": 8.400662422180176, |
|
"learning_rate": 0.00019749658126696934, |
|
"loss": 0.5707, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.07915194346289753, |
|
"grad_norm": 10.622336387634277, |
|
"learning_rate": 0.00019744326305960595, |
|
"loss": 0.5798, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0799057714958775, |
|
"grad_norm": 10.29685115814209, |
|
"learning_rate": 0.00019738939038000205, |
|
"loss": 0.5752, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.08065959952885748, |
|
"grad_norm": 7.853797435760498, |
|
"learning_rate": 0.00019733496353470433, |
|
"loss": 0.543, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.08141342756183746, |
|
"grad_norm": 7.910231113433838, |
|
"learning_rate": 0.00019727998283341274, |
|
"loss": 0.5155, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.08216725559481743, |
|
"grad_norm": 8.53306770324707, |
|
"learning_rate": 0.00019722444858897878, |
|
"loss": 0.5029, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.0829210836277974, |
|
"grad_norm": 8.579912185668945, |
|
"learning_rate": 0.00019716836111740378, |
|
"loss": 0.487, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08367491166077738, |
|
"grad_norm": 8.553475379943848, |
|
"learning_rate": 0.00019711172073783696, |
|
"loss": 0.4853, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.08442873969375736, |
|
"grad_norm": 9.386043548583984, |
|
"learning_rate": 0.00019705452777257377, |
|
"loss": 0.4941, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.08518256772673734, |
|
"grad_norm": 8.391158103942871, |
|
"learning_rate": 0.000196996782547054, |
|
"loss": 0.4657, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.08593639575971732, |
|
"grad_norm": 8.52602481842041, |
|
"learning_rate": 0.00019693848538985983, |
|
"loss": 0.4744, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.0866902237926973, |
|
"grad_norm": 7.8026885986328125, |
|
"learning_rate": 0.00019687963663271409, |
|
"loss": 0.4742, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.08744405182567727, |
|
"grad_norm": 8.957297325134277, |
|
"learning_rate": 0.00019682023661047836, |
|
"loss": 0.4846, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.08819787985865725, |
|
"grad_norm": 8.33506965637207, |
|
"learning_rate": 0.00019676028566115102, |
|
"loss": 0.47, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.08895170789163721, |
|
"grad_norm": 7.720737934112549, |
|
"learning_rate": 0.00019669978412586528, |
|
"loss": 0.4512, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.08970553592461719, |
|
"grad_norm": 7.069596290588379, |
|
"learning_rate": 0.00019663873234888733, |
|
"loss": 0.4685, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.09045936395759717, |
|
"grad_norm": 7.589311599731445, |
|
"learning_rate": 0.0001965771306776144, |
|
"loss": 0.4702, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09121319199057715, |
|
"grad_norm": 7.950814723968506, |
|
"learning_rate": 0.00019651497946257266, |
|
"loss": 0.4797, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.09196702002355713, |
|
"grad_norm": 7.834803581237793, |
|
"learning_rate": 0.00019645227905741534, |
|
"loss": 0.4512, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.0927208480565371, |
|
"grad_norm": 7.925727844238281, |
|
"learning_rate": 0.00019638902981892068, |
|
"loss": 0.4702, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.09347467608951708, |
|
"grad_norm": 7.2047038078308105, |
|
"learning_rate": 0.00019632523210698987, |
|
"loss": 0.4586, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.09422850412249706, |
|
"grad_norm": 8.701865196228027, |
|
"learning_rate": 0.00019626088628464498, |
|
"loss": 0.4629, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.09498233215547704, |
|
"grad_norm": 7.792990684509277, |
|
"learning_rate": 0.00019619599271802706, |
|
"loss": 0.4578, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.09573616018845701, |
|
"grad_norm": 7.0652642250061035, |
|
"learning_rate": 0.00019613055177639384, |
|
"loss": 0.4439, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.09648998822143698, |
|
"grad_norm": 7.519805431365967, |
|
"learning_rate": 0.00019606456383211777, |
|
"loss": 0.4371, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.09724381625441696, |
|
"grad_norm": 7.7905659675598145, |
|
"learning_rate": 0.00019599802926068384, |
|
"loss": 0.4631, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.09799764428739693, |
|
"grad_norm": 7.713627338409424, |
|
"learning_rate": 0.00019593094844068748, |
|
"loss": 0.4415, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09875147232037691, |
|
"grad_norm": 7.864312171936035, |
|
"learning_rate": 0.00019586332175383238, |
|
"loss": 0.493, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.09950530035335689, |
|
"grad_norm": 7.424186706542969, |
|
"learning_rate": 0.00019579514958492826, |
|
"loss": 0.4105, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.10025912838633687, |
|
"grad_norm": 7.774516582489014, |
|
"learning_rate": 0.0001957264323218889, |
|
"loss": 0.4406, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.10101295641931685, |
|
"grad_norm": 8.56273365020752, |
|
"learning_rate": 0.0001956571703557296, |
|
"loss": 0.4743, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.10176678445229682, |
|
"grad_norm": 7.981069087982178, |
|
"learning_rate": 0.00019558736408056525, |
|
"loss": 0.4167, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1025206124852768, |
|
"grad_norm": 7.851569175720215, |
|
"learning_rate": 0.00019551701389360795, |
|
"loss": 0.4582, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.10327444051825677, |
|
"grad_norm": 7.7381510734558105, |
|
"learning_rate": 0.00019544612019516472, |
|
"loss": 0.4336, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.10402826855123674, |
|
"grad_norm": 8.127756118774414, |
|
"learning_rate": 0.00019537468338863537, |
|
"loss": 0.4588, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.10478209658421672, |
|
"grad_norm": 7.989606857299805, |
|
"learning_rate": 0.00019530270388050998, |
|
"loss": 0.4269, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.1055359246171967, |
|
"grad_norm": 8.431105613708496, |
|
"learning_rate": 0.00019523018208036677, |
|
"loss": 0.4645, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.10628975265017668, |
|
"grad_norm": 8.575553894042969, |
|
"learning_rate": 0.0001951571184008698, |
|
"loss": 0.4587, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.10704358068315666, |
|
"grad_norm": 9.703766822814941, |
|
"learning_rate": 0.00019508351325776642, |
|
"loss": 0.4826, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.10779740871613663, |
|
"grad_norm": 10.319994926452637, |
|
"learning_rate": 0.00019500936706988502, |
|
"loss": 0.5255, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.10855123674911661, |
|
"grad_norm": 11.801458358764648, |
|
"learning_rate": 0.00019493468025913276, |
|
"loss": 0.5143, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.10930506478209659, |
|
"grad_norm": 11.02754020690918, |
|
"learning_rate": 0.00019485945325049288, |
|
"loss": 0.4947, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.11005889281507657, |
|
"grad_norm": 11.526784896850586, |
|
"learning_rate": 0.00019478368647202264, |
|
"loss": 0.5627, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.11081272084805653, |
|
"grad_norm": 11.704715728759766, |
|
"learning_rate": 0.00019470738035485058, |
|
"loss": 0.5015, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.11156654888103651, |
|
"grad_norm": 14.198360443115234, |
|
"learning_rate": 0.00019463053533317425, |
|
"loss": 0.5488, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.11232037691401649, |
|
"grad_norm": 14.75071907043457, |
|
"learning_rate": 0.0001945531518442576, |
|
"loss": 0.5327, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.11307420494699646, |
|
"grad_norm": 17.345752716064453, |
|
"learning_rate": 0.0001944752303284287, |
|
"loss": 0.4909, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11382803297997644, |
|
"grad_norm": 25.253982543945312, |
|
"learning_rate": 0.00019439677122907697, |
|
"loss": 0.7106, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.11458186101295642, |
|
"grad_norm": 22.05714988708496, |
|
"learning_rate": 0.00019431777499265087, |
|
"loss": 0.6719, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.1153356890459364, |
|
"grad_norm": 14.386154174804688, |
|
"learning_rate": 0.00019423824206865527, |
|
"loss": 0.663, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.11608951707891638, |
|
"grad_norm": 8.701356887817383, |
|
"learning_rate": 0.00019415817290964883, |
|
"loss": 0.5581, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.11684334511189635, |
|
"grad_norm": 8.447550773620605, |
|
"learning_rate": 0.00019407756797124164, |
|
"loss": 0.5545, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.11759717314487632, |
|
"grad_norm": 9.116722106933594, |
|
"learning_rate": 0.00019399642771209238, |
|
"loss": 0.5284, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.1183510011778563, |
|
"grad_norm": 9.142845153808594, |
|
"learning_rate": 0.00019391475259390584, |
|
"loss": 0.5052, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.11910482921083627, |
|
"grad_norm": 9.175527572631836, |
|
"learning_rate": 0.0001938325430814302, |
|
"loss": 0.524, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.11985865724381625, |
|
"grad_norm": 8.684857368469238, |
|
"learning_rate": 0.00019374979964245463, |
|
"loss": 0.5387, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.12061248527679623, |
|
"grad_norm": 9.40937328338623, |
|
"learning_rate": 0.00019366652274780628, |
|
"loss": 0.5081, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.12136631330977621, |
|
"grad_norm": 9.983878135681152, |
|
"learning_rate": 0.00019358271287134784, |
|
"loss": 0.5234, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.12212014134275619, |
|
"grad_norm": 8.468266487121582, |
|
"learning_rate": 0.00019349837048997478, |
|
"loss": 0.5008, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.12287396937573616, |
|
"grad_norm": 7.315543174743652, |
|
"learning_rate": 0.00019341349608361267, |
|
"loss": 0.4778, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.12362779740871614, |
|
"grad_norm": 8.254434585571289, |
|
"learning_rate": 0.00019332809013521428, |
|
"loss": 0.4949, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.12438162544169612, |
|
"grad_norm": 9.409392356872559, |
|
"learning_rate": 0.00019324215313075706, |
|
"loss": 0.4842, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.12513545347467608, |
|
"grad_norm": 7.584166526794434, |
|
"learning_rate": 0.00019315568555924035, |
|
"loss": 0.4859, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.12588928150765608, |
|
"grad_norm": 7.280964374542236, |
|
"learning_rate": 0.0001930686879126824, |
|
"loss": 0.4436, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.12664310954063604, |
|
"grad_norm": 7.54876708984375, |
|
"learning_rate": 0.0001929811606861177, |
|
"loss": 0.4636, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.12739693757361603, |
|
"grad_norm": 8.36787223815918, |
|
"learning_rate": 0.00019289310437759427, |
|
"loss": 0.4862, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.128150765606596, |
|
"grad_norm": 8.098321914672852, |
|
"learning_rate": 0.00019280451948817059, |
|
"loss": 0.4558, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12890459363957596, |
|
"grad_norm": 8.111252784729004, |
|
"learning_rate": 0.00019271540652191296, |
|
"loss": 0.461, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.12965842167255595, |
|
"grad_norm": 7.394045829772949, |
|
"learning_rate": 0.0001926257659858925, |
|
"loss": 0.4397, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.13041224970553592, |
|
"grad_norm": 7.361767768859863, |
|
"learning_rate": 0.00019253559839018235, |
|
"loss": 0.4811, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.1311660777385159, |
|
"grad_norm": 7.598999500274658, |
|
"learning_rate": 0.00019244490424785468, |
|
"loss": 0.4353, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.13191990577149587, |
|
"grad_norm": 7.871952056884766, |
|
"learning_rate": 0.00019235368407497788, |
|
"loss": 0.4847, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.13267373380447586, |
|
"grad_norm": 7.250602722167969, |
|
"learning_rate": 0.00019226193839061347, |
|
"loss": 0.4482, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.13342756183745583, |
|
"grad_norm": 7.890292644500732, |
|
"learning_rate": 0.0001921696677168133, |
|
"loss": 0.4475, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.13418138987043582, |
|
"grad_norm": 7.192571640014648, |
|
"learning_rate": 0.00019207687257861655, |
|
"loss": 0.4093, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.13493521790341578, |
|
"grad_norm": 8.001566886901855, |
|
"learning_rate": 0.00019198355350404667, |
|
"loss": 0.4729, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.13568904593639575, |
|
"grad_norm": 7.559464454650879, |
|
"learning_rate": 0.00019188971102410837, |
|
"loss": 0.4455, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13644287396937574, |
|
"grad_norm": 7.921515941619873, |
|
"learning_rate": 0.00019179534567278475, |
|
"loss": 0.4421, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.1371967020023557, |
|
"grad_norm": 7.778410911560059, |
|
"learning_rate": 0.00019170045798703406, |
|
"loss": 0.4485, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.1379505300353357, |
|
"grad_norm": 7.606152534484863, |
|
"learning_rate": 0.0001916050485067868, |
|
"loss": 0.4235, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.13870435806831566, |
|
"grad_norm": 7.29620361328125, |
|
"learning_rate": 0.00019150911777494258, |
|
"loss": 0.4433, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.13945818610129565, |
|
"grad_norm": 7.7016072273254395, |
|
"learning_rate": 0.00019141266633736697, |
|
"loss": 0.444, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.14021201413427561, |
|
"grad_norm": 7.524323463439941, |
|
"learning_rate": 0.0001913156947428886, |
|
"loss": 0.4481, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.1409658421672556, |
|
"grad_norm": 7.7455525398254395, |
|
"learning_rate": 0.00019121820354329577, |
|
"loss": 0.4152, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.14171967020023557, |
|
"grad_norm": 8.12897777557373, |
|
"learning_rate": 0.00019112019329333346, |
|
"loss": 0.4443, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.14247349823321553, |
|
"grad_norm": 7.774250507354736, |
|
"learning_rate": 0.00019102166455070024, |
|
"loss": 0.4442, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.14322732626619553, |
|
"grad_norm": 8.02647876739502, |
|
"learning_rate": 0.00019092261787604492, |
|
"loss": 0.4489, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1439811542991755, |
|
"grad_norm": 7.7611799240112305, |
|
"learning_rate": 0.00019082305383296352, |
|
"loss": 0.4122, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.14473498233215548, |
|
"grad_norm": 9.484501838684082, |
|
"learning_rate": 0.00019072297298799589, |
|
"loss": 0.4725, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.14548881036513545, |
|
"grad_norm": 9.696186065673828, |
|
"learning_rate": 0.00019062237591062272, |
|
"loss": 0.4913, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.14624263839811544, |
|
"grad_norm": 11.048422813415527, |
|
"learning_rate": 0.00019052126317326207, |
|
"loss": 0.5425, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.1469964664310954, |
|
"grad_norm": 10.327349662780762, |
|
"learning_rate": 0.00019041963535126625, |
|
"loss": 0.5171, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1477502944640754, |
|
"grad_norm": 11.808932304382324, |
|
"learning_rate": 0.0001903174930229185, |
|
"loss": 0.504, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.14850412249705536, |
|
"grad_norm": 11.13871955871582, |
|
"learning_rate": 0.00019021483676942973, |
|
"loss": 0.5261, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.14925795053003535, |
|
"grad_norm": 11.771498680114746, |
|
"learning_rate": 0.00019011166717493517, |
|
"loss": 0.5062, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.1500117785630153, |
|
"grad_norm": 13.0664644241333, |
|
"learning_rate": 0.000190007984826491, |
|
"loss": 0.5488, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.15076560659599528, |
|
"grad_norm": 15.87386417388916, |
|
"learning_rate": 0.00018990379031407124, |
|
"loss": 0.547, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15151943462897527, |
|
"grad_norm": 20.688980102539062, |
|
"learning_rate": 0.00018979908423056408, |
|
"loss": 0.7222, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.15227326266195523, |
|
"grad_norm": 16.90519905090332, |
|
"learning_rate": 0.0001896938671717687, |
|
"loss": 0.6582, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.15302709069493522, |
|
"grad_norm": 11.236451148986816, |
|
"learning_rate": 0.00018958813973639184, |
|
"loss": 0.6151, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.1537809187279152, |
|
"grad_norm": 8.368070602416992, |
|
"learning_rate": 0.0001894819025260444, |
|
"loss": 0.5729, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.15453474676089518, |
|
"grad_norm": 7.891096115112305, |
|
"learning_rate": 0.00018937515614523797, |
|
"loss": 0.5132, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.15528857479387514, |
|
"grad_norm": 8.290247917175293, |
|
"learning_rate": 0.0001892679012013815, |
|
"loss": 0.5311, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.15604240282685514, |
|
"grad_norm": 8.068524360656738, |
|
"learning_rate": 0.00018916013830477766, |
|
"loss": 0.5038, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.1567962308598351, |
|
"grad_norm": 7.199114799499512, |
|
"learning_rate": 0.00018905186806861957, |
|
"loss": 0.4933, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.15755005889281506, |
|
"grad_norm": 6.769901275634766, |
|
"learning_rate": 0.00018894309110898712, |
|
"loss": 0.4743, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.15830388692579506, |
|
"grad_norm": 7.485007286071777, |
|
"learning_rate": 0.00018883380804484367, |
|
"loss": 0.4832, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.15905771495877502, |
|
"grad_norm": 7.059638500213623, |
|
"learning_rate": 0.00018872401949803237, |
|
"loss": 0.4544, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.159811542991755, |
|
"grad_norm": 7.6718549728393555, |
|
"learning_rate": 0.00018861372609327263, |
|
"loss": 0.4727, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.16056537102473498, |
|
"grad_norm": 7.764082431793213, |
|
"learning_rate": 0.00018850292845815672, |
|
"loss": 0.4645, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.16131919905771497, |
|
"grad_norm": 8.037138938903809, |
|
"learning_rate": 0.0001883916272231459, |
|
"loss": 0.4712, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.16207302709069493, |
|
"grad_norm": 7.26751184463501, |
|
"learning_rate": 0.0001882798230215672, |
|
"loss": 0.4477, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.16282685512367492, |
|
"grad_norm": 7.747137069702148, |
|
"learning_rate": 0.00018816751648960956, |
|
"loss": 0.4544, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.1635806831566549, |
|
"grad_norm": 7.478286266326904, |
|
"learning_rate": 0.00018805470826632024, |
|
"loss": 0.4539, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.16433451118963485, |
|
"grad_norm": 7.051617622375488, |
|
"learning_rate": 0.0001879413989936013, |
|
"loss": 0.4688, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.16508833922261484, |
|
"grad_norm": 7.303111553192139, |
|
"learning_rate": 0.00018782758931620584, |
|
"loss": 0.4551, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.1658421672555948, |
|
"grad_norm": 7.094053745269775, |
|
"learning_rate": 0.00018771327988173435, |
|
"loss": 0.4398, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1665959952885748, |
|
"grad_norm": 7.781626224517822, |
|
"learning_rate": 0.00018759847134063108, |
|
"loss": 0.4719, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.16734982332155476, |
|
"grad_norm": 7.860665321350098, |
|
"learning_rate": 0.0001874831643461803, |
|
"loss": 0.4573, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.16810365135453476, |
|
"grad_norm": 7.380893707275391, |
|
"learning_rate": 0.00018736735955450251, |
|
"loss": 0.4341, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.16885747938751472, |
|
"grad_norm": 7.672417163848877, |
|
"learning_rate": 0.0001872510576245509, |
|
"loss": 0.4511, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.1696113074204947, |
|
"grad_norm": 7.173273086547852, |
|
"learning_rate": 0.00018713425921810733, |
|
"loss": 0.4374, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.17036513545347468, |
|
"grad_norm": 7.41825532913208, |
|
"learning_rate": 0.00018701696499977884, |
|
"loss": 0.4464, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.17111896348645464, |
|
"grad_norm": 8.151430130004883, |
|
"learning_rate": 0.0001868991756369937, |
|
"loss": 0.4535, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.17187279151943463, |
|
"grad_norm": 7.760961532592773, |
|
"learning_rate": 0.00018678089179999762, |
|
"loss": 0.4731, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.1726266195524146, |
|
"grad_norm": 8.02840518951416, |
|
"learning_rate": 0.00018666211416184999, |
|
"loss": 0.4745, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.1733804475853946, |
|
"grad_norm": 7.38688850402832, |
|
"learning_rate": 0.00018654284339842013, |
|
"loss": 0.4341, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.17413427561837455, |
|
"grad_norm": 7.492348670959473, |
|
"learning_rate": 0.00018642308018838316, |
|
"loss": 0.4147, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.17488810365135454, |
|
"grad_norm": 7.687479019165039, |
|
"learning_rate": 0.00018630282521321645, |
|
"loss": 0.4404, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.1756419316843345, |
|
"grad_norm": 7.790548324584961, |
|
"learning_rate": 0.0001861820791571956, |
|
"loss": 0.4389, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.1763957597173145, |
|
"grad_norm": 7.557417392730713, |
|
"learning_rate": 0.00018606084270739049, |
|
"loss": 0.4467, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.17714958775029446, |
|
"grad_norm": 7.971850872039795, |
|
"learning_rate": 0.0001859391165536615, |
|
"loss": 0.415, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.17790341578327443, |
|
"grad_norm": 8.08571720123291, |
|
"learning_rate": 0.0001858169013886556, |
|
"loss": 0.4488, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.17865724381625442, |
|
"grad_norm": 7.706898212432861, |
|
"learning_rate": 0.00018569419790780218, |
|
"loss": 0.4296, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.17941107184923438, |
|
"grad_norm": 7.6118245124816895, |
|
"learning_rate": 0.00018557100680930937, |
|
"loss": 0.4223, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.18016489988221437, |
|
"grad_norm": 8.255146980285645, |
|
"learning_rate": 0.00018544732879415986, |
|
"loss": 0.4802, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.18091872791519434, |
|
"grad_norm": 9.077119827270508, |
|
"learning_rate": 0.00018532316456610704, |
|
"loss": 0.4376, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.18167255594817433, |
|
"grad_norm": 8.465483665466309, |
|
"learning_rate": 0.00018519851483167097, |
|
"loss": 0.4339, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.1824263839811543, |
|
"grad_norm": 9.302364349365234, |
|
"learning_rate": 0.00018507338030013427, |
|
"loss": 0.4429, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.18318021201413429, |
|
"grad_norm": 10.150344848632812, |
|
"learning_rate": 0.00018494776168353827, |
|
"loss": 0.4768, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.18393404004711425, |
|
"grad_norm": 10.960404396057129, |
|
"learning_rate": 0.00018482165969667874, |
|
"loss": 0.5072, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.18468786808009424, |
|
"grad_norm": 10.028700828552246, |
|
"learning_rate": 0.00018469507505710194, |
|
"loss": 0.5194, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.1854416961130742, |
|
"grad_norm": 10.371344566345215, |
|
"learning_rate": 0.00018456800848510056, |
|
"loss": 0.4974, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.18619552414605417, |
|
"grad_norm": 11.256722450256348, |
|
"learning_rate": 0.00018444046070370963, |
|
"loss": 0.4655, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.18694935217903416, |
|
"grad_norm": 11.339438438415527, |
|
"learning_rate": 0.00018431243243870223, |
|
"loss": 0.5004, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.18770318021201413, |
|
"grad_norm": 12.51115894317627, |
|
"learning_rate": 0.00018418392441858555, |
|
"loss": 0.5498, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.18845700824499412, |
|
"grad_norm": 12.920282363891602, |
|
"learning_rate": 0.0001840549373745968, |
|
"loss": 0.4545, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18921083627797408, |
|
"grad_norm": 17.809480667114258, |
|
"learning_rate": 0.0001839254720406987, |
|
"loss": 0.6779, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.18996466431095407, |
|
"grad_norm": 14.654753684997559, |
|
"learning_rate": 0.00018379552915357575, |
|
"loss": 0.639, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.19071849234393404, |
|
"grad_norm": 10.703547477722168, |
|
"learning_rate": 0.00018366510945262972, |
|
"loss": 0.6024, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.19147232037691403, |
|
"grad_norm": 9.329981803894043, |
|
"learning_rate": 0.00018353421367997563, |
|
"loss": 0.5221, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.192226148409894, |
|
"grad_norm": 7.0998663902282715, |
|
"learning_rate": 0.00018340284258043732, |
|
"loss": 0.5203, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.19297997644287396, |
|
"grad_norm": 8.919529914855957, |
|
"learning_rate": 0.00018327099690154344, |
|
"loss": 0.5286, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.19373380447585395, |
|
"grad_norm": 8.378999710083008, |
|
"learning_rate": 0.00018313867739352304, |
|
"loss": 0.4929, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.1944876325088339, |
|
"grad_norm": 7.437035083770752, |
|
"learning_rate": 0.00018300588480930143, |
|
"loss": 0.4622, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.1952414605418139, |
|
"grad_norm": 7.368019104003906, |
|
"learning_rate": 0.0001828726199044957, |
|
"loss": 0.4824, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.19599528857479387, |
|
"grad_norm": 7.174773693084717, |
|
"learning_rate": 0.0001827388834374107, |
|
"loss": 0.4601, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.19674911660777386, |
|
"grad_norm": 7.612614154815674, |
|
"learning_rate": 0.0001826046761690344, |
|
"loss": 0.474, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.19750294464075382, |
|
"grad_norm": 8.047442436218262, |
|
"learning_rate": 0.00018246999886303383, |
|
"loss": 0.4594, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.19825677267373382, |
|
"grad_norm": 7.06972599029541, |
|
"learning_rate": 0.00018233485228575063, |
|
"loss": 0.4944, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.19901060070671378, |
|
"grad_norm": 7.2451324462890625, |
|
"learning_rate": 0.00018219923720619663, |
|
"loss": 0.4748, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.19976442873969374, |
|
"grad_norm": 8.119038581848145, |
|
"learning_rate": 0.0001820631543960496, |
|
"loss": 0.4286, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.20051825677267374, |
|
"grad_norm": 8.046279907226562, |
|
"learning_rate": 0.0001819266046296487, |
|
"loss": 0.4566, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.2012720848056537, |
|
"grad_norm": 6.79647970199585, |
|
"learning_rate": 0.00018178958868399033, |
|
"loss": 0.4214, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.2020259128386337, |
|
"grad_norm": 6.761276721954346, |
|
"learning_rate": 0.00018165210733872336, |
|
"loss": 0.4272, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.20277974087161366, |
|
"grad_norm": 7.771080493927002, |
|
"learning_rate": 0.000181514161376145, |
|
"loss": 0.4602, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.20353356890459365, |
|
"grad_norm": 7.610669136047363, |
|
"learning_rate": 0.0001813757515811962, |
|
"loss": 0.4413, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2042873969375736, |
|
"grad_norm": 7.277632236480713, |
|
"learning_rate": 0.00018123687874145721, |
|
"loss": 0.417, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.2050412249705536, |
|
"grad_norm": 7.344987869262695, |
|
"learning_rate": 0.00018109754364714305, |
|
"loss": 0.4326, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.20579505300353357, |
|
"grad_norm": 7.373658180236816, |
|
"learning_rate": 0.0001809577470910992, |
|
"loss": 0.4107, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.20654888103651353, |
|
"grad_norm": 8.498446464538574, |
|
"learning_rate": 0.00018081748986879679, |
|
"loss": 0.4463, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.20730270906949352, |
|
"grad_norm": 7.138429164886475, |
|
"learning_rate": 0.00018067677277832834, |
|
"loss": 0.4354, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.2080565371024735, |
|
"grad_norm": 7.916346073150635, |
|
"learning_rate": 0.00018053559662040302, |
|
"loss": 0.448, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.20881036513545348, |
|
"grad_norm": 6.8389201164245605, |
|
"learning_rate": 0.00018039396219834237, |
|
"loss": 0.4095, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.20956419316843344, |
|
"grad_norm": 7.184628009796143, |
|
"learning_rate": 0.00018025187031807532, |
|
"loss": 0.421, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.21031802120141344, |
|
"grad_norm": 6.9601569175720215, |
|
"learning_rate": 0.00018010932178813397, |
|
"loss": 0.4329, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.2110718492343934, |
|
"grad_norm": 7.579134464263916, |
|
"learning_rate": 0.00017996631741964888, |
|
"loss": 0.439, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2118256772673734, |
|
"grad_norm": 7.37368106842041, |
|
"learning_rate": 0.00017982285802634426, |
|
"loss": 0.4225, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.21257950530035336, |
|
"grad_norm": 7.1782145500183105, |
|
"learning_rate": 0.0001796789444245337, |
|
"loss": 0.4094, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 7.470993995666504, |
|
"learning_rate": 0.00017953457743311523, |
|
"loss": 0.4267, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.2140871613663133, |
|
"grad_norm": 7.285700798034668, |
|
"learning_rate": 0.00017938975787356673, |
|
"loss": 0.4113, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.21484098939929328, |
|
"grad_norm": 7.5254130363464355, |
|
"learning_rate": 0.00017924448656994133, |
|
"loss": 0.4362, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.21559481743227327, |
|
"grad_norm": 7.6265411376953125, |
|
"learning_rate": 0.00017909876434886273, |
|
"loss": 0.443, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.21634864546525323, |
|
"grad_norm": 7.822786808013916, |
|
"learning_rate": 0.00017895259203952032, |
|
"loss": 0.4385, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.21710247349823322, |
|
"grad_norm": 7.836915969848633, |
|
"learning_rate": 0.0001788059704736647, |
|
"loss": 0.4509, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.2178563015312132, |
|
"grad_norm": 8.352907180786133, |
|
"learning_rate": 0.00017865890048560277, |
|
"loss": 0.4747, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.21861012956419318, |
|
"grad_norm": 8.010136604309082, |
|
"learning_rate": 0.00017851138291219301, |
|
"loss": 0.4662, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.21936395759717314, |
|
"grad_norm": 8.264348983764648, |
|
"learning_rate": 0.00017836341859284093, |
|
"loss": 0.4473, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.22011778563015313, |
|
"grad_norm": 8.917752265930176, |
|
"learning_rate": 0.00017821500836949386, |
|
"loss": 0.4909, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.2208716136631331, |
|
"grad_norm": 9.103057861328125, |
|
"learning_rate": 0.0001780661530866366, |
|
"loss": 0.4885, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.22162544169611306, |
|
"grad_norm": 10.667252540588379, |
|
"learning_rate": 0.00017791685359128633, |
|
"loss": 0.5175, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.22237926972909305, |
|
"grad_norm": 9.840495109558105, |
|
"learning_rate": 0.000177767110732988, |
|
"loss": 0.5175, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.22313309776207302, |
|
"grad_norm": 10.290101051330566, |
|
"learning_rate": 0.00017761692536380928, |
|
"loss": 0.4749, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.223886925795053, |
|
"grad_norm": 10.657001495361328, |
|
"learning_rate": 0.00017746629833833585, |
|
"loss": 0.534, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.22464075382803297, |
|
"grad_norm": 10.042377471923828, |
|
"learning_rate": 0.00017731523051366658, |
|
"loss": 0.454, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.22539458186101297, |
|
"grad_norm": 12.303505897521973, |
|
"learning_rate": 0.00017716372274940843, |
|
"loss": 0.5157, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.22614840989399293, |
|
"grad_norm": 16.197650909423828, |
|
"learning_rate": 0.00017701177590767183, |
|
"loss": 0.5521, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.22690223792697292, |
|
"grad_norm": 15.125090599060059, |
|
"learning_rate": 0.00017685939085306562, |
|
"loss": 0.6868, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.22765606595995289, |
|
"grad_norm": 13.107701301574707, |
|
"learning_rate": 0.00017670656845269214, |
|
"loss": 0.6326, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.22840989399293285, |
|
"grad_norm": 9.953380584716797, |
|
"learning_rate": 0.00017655330957614234, |
|
"loss": 0.596, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.22916372202591284, |
|
"grad_norm": 7.864305019378662, |
|
"learning_rate": 0.00017639961509549078, |
|
"loss": 0.5477, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.2299175500588928, |
|
"grad_norm": 6.731385707855225, |
|
"learning_rate": 0.00017624548588529072, |
|
"loss": 0.4891, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.2306713780918728, |
|
"grad_norm": 6.979381084442139, |
|
"learning_rate": 0.00017609092282256912, |
|
"loss": 0.4611, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.23142520612485276, |
|
"grad_norm": 8.147210121154785, |
|
"learning_rate": 0.00017593592678682166, |
|
"loss": 0.5077, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.23217903415783275, |
|
"grad_norm": 7.303165435791016, |
|
"learning_rate": 0.0001757804986600077, |
|
"loss": 0.4771, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.23293286219081272, |
|
"grad_norm": 7.042153358459473, |
|
"learning_rate": 0.0001756246393265453, |
|
"loss": 0.4718, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.2336866902237927, |
|
"grad_norm": 7.572822570800781, |
|
"learning_rate": 0.00017546834967330617, |
|
"loss": 0.4719, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.23444051825677267, |
|
"grad_norm": 7.078078269958496, |
|
"learning_rate": 0.00017531163058961066, |
|
"loss": 0.4345, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.23519434628975264, |
|
"grad_norm": 7.183956623077393, |
|
"learning_rate": 0.00017515448296722262, |
|
"loss": 0.4631, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.23594817432273263, |
|
"grad_norm": 7.140283584594727, |
|
"learning_rate": 0.00017499690770034443, |
|
"loss": 0.4554, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.2367020023557126, |
|
"grad_norm": 7.176611423492432, |
|
"learning_rate": 0.00017483890568561173, |
|
"loss": 0.4603, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.23745583038869258, |
|
"grad_norm": 6.916821002960205, |
|
"learning_rate": 0.00017468047782208865, |
|
"loss": 0.4406, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.23820965842167255, |
|
"grad_norm": 7.564478874206543, |
|
"learning_rate": 0.00017452162501126227, |
|
"loss": 0.4608, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.23896348645465254, |
|
"grad_norm": 7.078012466430664, |
|
"learning_rate": 0.00017436234815703788, |
|
"loss": 0.4254, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.2397173144876325, |
|
"grad_norm": 7.39133358001709, |
|
"learning_rate": 0.0001742026481657335, |
|
"loss": 0.4412, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.2404711425206125, |
|
"grad_norm": 7.540102005004883, |
|
"learning_rate": 0.0001740425259460751, |
|
"loss": 0.4444, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.24122497055359246, |
|
"grad_norm": 7.027541160583496, |
|
"learning_rate": 0.00017388198240919102, |
|
"loss": 0.439, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.24197879858657242, |
|
"grad_norm": 7.218184947967529, |
|
"learning_rate": 0.00017372101846860707, |
|
"loss": 0.4239, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.24273262661955242, |
|
"grad_norm": 7.92561674118042, |
|
"learning_rate": 0.00017355963504024123, |
|
"loss": 0.4278, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.24348645465253238, |
|
"grad_norm": 7.72558069229126, |
|
"learning_rate": 0.00017339783304239843, |
|
"loss": 0.4498, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.24424028268551237, |
|
"grad_norm": 7.2504096031188965, |
|
"learning_rate": 0.00017323561339576543, |
|
"loss": 0.4355, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.24499411071849234, |
|
"grad_norm": 7.207572937011719, |
|
"learning_rate": 0.0001730729770234054, |
|
"loss": 0.4192, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.24574793875147233, |
|
"grad_norm": 7.010448455810547, |
|
"learning_rate": 0.00017290992485075282, |
|
"loss": 0.3983, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.2465017667844523, |
|
"grad_norm": 7.16871452331543, |
|
"learning_rate": 0.0001727464578056081, |
|
"loss": 0.4454, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.24725559481743228, |
|
"grad_norm": 7.185717582702637, |
|
"learning_rate": 0.00017258257681813244, |
|
"loss": 0.426, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.24800942285041225, |
|
"grad_norm": 7.441746234893799, |
|
"learning_rate": 0.0001724182828208424, |
|
"loss": 0.4394, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.24876325088339224, |
|
"grad_norm": 7.429843902587891, |
|
"learning_rate": 0.0001722535767486047, |
|
"loss": 0.4377, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2495170789163722, |
|
"grad_norm": 7.528452396392822, |
|
"learning_rate": 0.00017208845953863076, |
|
"loss": 0.4256, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.25027090694935217, |
|
"grad_norm": 6.993783473968506, |
|
"learning_rate": 0.0001719229321304716, |
|
"loss": 0.4337, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.25027090694935217, |
|
"eval_loss": 0.47317659854888916, |
|
"eval_runtime": 126.4401, |
|
"eval_samples_per_second": 17.676, |
|
"eval_steps_per_second": 8.842, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.25102473498233213, |
|
"grad_norm": 7.080078601837158, |
|
"learning_rate": 0.00017175699546601223, |
|
"loss": 0.443, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.25177856301531215, |
|
"grad_norm": 7.021576404571533, |
|
"learning_rate": 0.00017159065048946644, |
|
"loss": 0.4211, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.2525323910482921, |
|
"grad_norm": 7.684916019439697, |
|
"learning_rate": 0.00017142389814737142, |
|
"loss": 0.4115, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.2532862190812721, |
|
"grad_norm": 7.011744976043701, |
|
"learning_rate": 0.00017125673938858237, |
|
"loss": 0.4057, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.25404004711425204, |
|
"grad_norm": 7.142672538757324, |
|
"learning_rate": 0.00017108917516426704, |
|
"loss": 0.4485, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.25479387514723206, |
|
"grad_norm": 7.860468864440918, |
|
"learning_rate": 0.00017092120642790042, |
|
"loss": 0.4134, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.255547703180212, |
|
"grad_norm": 8.12804889678955, |
|
"learning_rate": 0.00017075283413525916, |
|
"loss": 0.4449, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.256301531213192, |
|
"grad_norm": 7.87144136428833, |
|
"learning_rate": 0.00017058405924441636, |
|
"loss": 0.3987, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.25705535924617195, |
|
"grad_norm": 7.7459588050842285, |
|
"learning_rate": 0.00017041488271573587, |
|
"loss": 0.4271, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.2578091872791519, |
|
"grad_norm": 8.934653282165527, |
|
"learning_rate": 0.00017024530551186702, |
|
"loss": 0.4722, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.25856301531213194, |
|
"grad_norm": 8.811241149902344, |
|
"learning_rate": 0.000170075328597739, |
|
"loss": 0.4719, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.2593168433451119, |
|
"grad_norm": 9.294290542602539, |
|
"learning_rate": 0.00016990495294055548, |
|
"loss": 0.4963, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.26007067137809187, |
|
"grad_norm": 11.440875053405762, |
|
"learning_rate": 0.00016973417950978906, |
|
"loss": 0.5236, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.26082449941107183, |
|
"grad_norm": 10.008340835571289, |
|
"learning_rate": 0.00016956300927717575, |
|
"loss": 0.5081, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.26157832744405185, |
|
"grad_norm": 10.798213958740234, |
|
"learning_rate": 0.0001693914432167094, |
|
"loss": 0.5252, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.2623321554770318, |
|
"grad_norm": 12.772528648376465, |
|
"learning_rate": 0.00016921948230463625, |
|
"loss": 0.5073, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.2630859835100118, |
|
"grad_norm": 12.81511402130127, |
|
"learning_rate": 0.00016904712751944931, |
|
"loss": 0.4699, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.26383981154299174, |
|
"grad_norm": 13.554988861083984, |
|
"learning_rate": 0.00016887437984188286, |
|
"loss": 0.4963, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2645936395759717, |
|
"grad_norm": 17.339111328125, |
|
"learning_rate": 0.00016870124025490673, |
|
"loss": 0.6331, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.2653474676089517, |
|
"grad_norm": 14.55565357208252, |
|
"learning_rate": 0.0001685277097437208, |
|
"loss": 0.6053, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.2661012956419317, |
|
"grad_norm": 11.207347869873047, |
|
"learning_rate": 0.0001683537892957495, |
|
"loss": 0.5787, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.26685512367491165, |
|
"grad_norm": 8.820387840270996, |
|
"learning_rate": 0.00016817947990063598, |
|
"loss": 0.5605, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.2676089517078916, |
|
"grad_norm": 7.382798194885254, |
|
"learning_rate": 0.0001680047825502366, |
|
"loss": 0.4917, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.26836277974087164, |
|
"grad_norm": 7.330126762390137, |
|
"learning_rate": 0.00016782969823861526, |
|
"loss": 0.4976, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.2691166077738516, |
|
"grad_norm": 8.046545028686523, |
|
"learning_rate": 0.0001676542279620378, |
|
"loss": 0.4864, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.26987043580683157, |
|
"grad_norm": 7.838155746459961, |
|
"learning_rate": 0.00016747837271896622, |
|
"loss": 0.4797, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.27062426383981153, |
|
"grad_norm": 7.075133323669434, |
|
"learning_rate": 0.00016730213351005303, |
|
"loss": 0.4655, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.2713780918727915, |
|
"grad_norm": 6.840551853179932, |
|
"learning_rate": 0.00016712551133813572, |
|
"loss": 0.4453, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2721319199057715, |
|
"grad_norm": 7.175273418426514, |
|
"learning_rate": 0.0001669485072082308, |
|
"loss": 0.447, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.2728857479387515, |
|
"grad_norm": 8.195796012878418, |
|
"learning_rate": 0.00016677112212752824, |
|
"loss": 0.4869, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.27363957597173144, |
|
"grad_norm": 7.310915946960449, |
|
"learning_rate": 0.00016659335710538564, |
|
"loss": 0.4447, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.2743934040047114, |
|
"grad_norm": 7.676048755645752, |
|
"learning_rate": 0.00016641521315332265, |
|
"loss": 0.4507, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.2751472320376914, |
|
"grad_norm": 7.88531494140625, |
|
"learning_rate": 0.00016623669128501504, |
|
"loss": 0.4411, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.2759010600706714, |
|
"grad_norm": 7.499680995941162, |
|
"learning_rate": 0.00016605779251628903, |
|
"loss": 0.4629, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.27665488810365135, |
|
"grad_norm": 6.773830890655518, |
|
"learning_rate": 0.00016587851786511543, |
|
"loss": 0.4571, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.2774087161366313, |
|
"grad_norm": 7.170431613922119, |
|
"learning_rate": 0.00016569886835160399, |
|
"loss": 0.4313, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.2781625441696113, |
|
"grad_norm": 6.66681432723999, |
|
"learning_rate": 0.0001655188449979974, |
|
"loss": 0.425, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.2789163722025913, |
|
"grad_norm": 6.042294025421143, |
|
"learning_rate": 0.00016533844882866568, |
|
"loss": 0.4236, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.27967020023557126, |
|
"grad_norm": 6.5642924308776855, |
|
"learning_rate": 0.00016515768087010013, |
|
"loss": 0.4404, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.28042402826855123, |
|
"grad_norm": 7.063207626342773, |
|
"learning_rate": 0.00016497654215090772, |
|
"loss": 0.428, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.2811778563015312, |
|
"grad_norm": 6.705799579620361, |
|
"learning_rate": 0.00016479503370180507, |
|
"loss": 0.431, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.2819316843345112, |
|
"grad_norm": 6.578817367553711, |
|
"learning_rate": 0.00016461315655561263, |
|
"loss": 0.4126, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.2826855123674912, |
|
"grad_norm": 6.545943260192871, |
|
"learning_rate": 0.00016443091174724885, |
|
"loss": 0.4198, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.28343934040047114, |
|
"grad_norm": 6.834047794342041, |
|
"learning_rate": 0.00016424830031372425, |
|
"loss": 0.4378, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.2841931684334511, |
|
"grad_norm": 7.931153774261475, |
|
"learning_rate": 0.00016406532329413546, |
|
"loss": 0.4529, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.28494699646643107, |
|
"grad_norm": 7.077485084533691, |
|
"learning_rate": 0.00016388198172965942, |
|
"loss": 0.4281, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.2857008244994111, |
|
"grad_norm": 7.532230854034424, |
|
"learning_rate": 0.00016369827666354745, |
|
"loss": 0.4064, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.28645465253239105, |
|
"grad_norm": 7.111504554748535, |
|
"learning_rate": 0.00016351420914111916, |
|
"loss": 0.4392, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.287208480565371, |
|
"grad_norm": 7.107287883758545, |
|
"learning_rate": 0.0001633297802097567, |
|
"loss": 0.3896, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.287962308598351, |
|
"grad_norm": 6.906205654144287, |
|
"learning_rate": 0.0001631449909188987, |
|
"loss": 0.4263, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.288716136631331, |
|
"grad_norm": 7.226500034332275, |
|
"learning_rate": 0.00016295984232003426, |
|
"loss": 0.4482, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.28946996466431096, |
|
"grad_norm": 6.622352123260498, |
|
"learning_rate": 0.00016277433546669703, |
|
"loss": 0.4044, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.2902237926972909, |
|
"grad_norm": 7.164252281188965, |
|
"learning_rate": 0.00016258847141445928, |
|
"loss": 0.4253, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.2909776207302709, |
|
"grad_norm": 7.356839656829834, |
|
"learning_rate": 0.00016240225122092573, |
|
"loss": 0.427, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.29173144876325086, |
|
"grad_norm": 8.345090866088867, |
|
"learning_rate": 0.00016221567594572762, |
|
"loss": 0.4204, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.2924852767962309, |
|
"grad_norm": 7.662243366241455, |
|
"learning_rate": 0.00016202874665051674, |
|
"loss": 0.393, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.29323910482921084, |
|
"grad_norm": 7.708904266357422, |
|
"learning_rate": 0.00016184146439895928, |
|
"loss": 0.411, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.2939929328621908, |
|
"grad_norm": 7.000946044921875, |
|
"learning_rate": 0.00016165383025672981, |
|
"loss": 0.3893, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.29474676089517077, |
|
"grad_norm": 7.401767253875732, |
|
"learning_rate": 0.00016146584529150526, |
|
"loss": 0.3869, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.2955005889281508, |
|
"grad_norm": 7.715709209442139, |
|
"learning_rate": 0.0001612775105729588, |
|
"loss": 0.402, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.29625441696113075, |
|
"grad_norm": 8.78487491607666, |
|
"learning_rate": 0.00016108882717275384, |
|
"loss": 0.4899, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.2970082449941107, |
|
"grad_norm": 9.631272315979004, |
|
"learning_rate": 0.0001608997961645377, |
|
"loss": 0.4919, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.2977620730270907, |
|
"grad_norm": 9.458671569824219, |
|
"learning_rate": 0.00016071041862393578, |
|
"loss": 0.4955, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.2985159010600707, |
|
"grad_norm": 10.232501029968262, |
|
"learning_rate": 0.0001605206956285454, |
|
"loss": 0.4977, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.29926972909305066, |
|
"grad_norm": 9.963619232177734, |
|
"learning_rate": 0.00016033062825792935, |
|
"loss": 0.4679, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.3000235571260306, |
|
"grad_norm": 12.23200798034668, |
|
"learning_rate": 0.0001601402175936102, |
|
"loss": 0.5541, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.3007773851590106, |
|
"grad_norm": 11.938904762268066, |
|
"learning_rate": 0.00015994946471906382, |
|
"loss": 0.4678, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.30153121319199055, |
|
"grad_norm": 14.236066818237305, |
|
"learning_rate": 0.0001597583707197134, |
|
"loss": 0.534, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3022850412249706, |
|
"grad_norm": 12.790224075317383, |
|
"learning_rate": 0.00015956693668292313, |
|
"loss": 0.6361, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.30303886925795054, |
|
"grad_norm": 14.324430465698242, |
|
"learning_rate": 0.00015937516369799216, |
|
"loss": 0.6471, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.3037926972909305, |
|
"grad_norm": 10.209970474243164, |
|
"learning_rate": 0.00015918305285614822, |
|
"loss": 0.5906, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.30454652532391047, |
|
"grad_norm": 7.869755744934082, |
|
"learning_rate": 0.00015899060525054157, |
|
"loss": 0.5408, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.3053003533568905, |
|
"grad_norm": 6.786082744598389, |
|
"learning_rate": 0.0001587978219762388, |
|
"loss": 0.5095, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.30605418138987045, |
|
"grad_norm": 8.50927448272705, |
|
"learning_rate": 0.00015860470413021642, |
|
"loss": 0.5117, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.3068080094228504, |
|
"grad_norm": 7.6895833015441895, |
|
"learning_rate": 0.00015841125281135473, |
|
"loss": 0.4919, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.3075618374558304, |
|
"grad_norm": 7.566605567932129, |
|
"learning_rate": 0.00015821746912043165, |
|
"loss": 0.4561, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.30831566548881034, |
|
"grad_norm": 7.5333452224731445, |
|
"learning_rate": 0.00015802335416011625, |
|
"loss": 0.4735, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.30906949352179036, |
|
"grad_norm": 7.508667469024658, |
|
"learning_rate": 0.00015782890903496264, |
|
"loss": 0.4461, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3098233215547703, |
|
"grad_norm": 6.778057098388672, |
|
"learning_rate": 0.00015763413485140365, |
|
"loss": 0.4589, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.3105771495877503, |
|
"grad_norm": 6.7967915534973145, |
|
"learning_rate": 0.00015743903271774455, |
|
"loss": 0.4438, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.31133097762073025, |
|
"grad_norm": 7.60194730758667, |
|
"learning_rate": 0.0001572436037441566, |
|
"loss": 0.4371, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.3120848056537103, |
|
"grad_norm": 7.298644065856934, |
|
"learning_rate": 0.00015704784904267097, |
|
"loss": 0.4678, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.31283863368669024, |
|
"grad_norm": 6.711719036102295, |
|
"learning_rate": 0.00015685176972717223, |
|
"loss": 0.4511, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3135924617196702, |
|
"grad_norm": 8.647915840148926, |
|
"learning_rate": 0.00015665536691339207, |
|
"loss": 0.4697, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.31434628975265017, |
|
"grad_norm": 7.388605117797852, |
|
"learning_rate": 0.00015645864171890295, |
|
"loss": 0.4322, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.31510011778563013, |
|
"grad_norm": 7.3222198486328125, |
|
"learning_rate": 0.00015626159526311174, |
|
"loss": 0.4366, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.31585394581861015, |
|
"grad_norm": 6.875087738037109, |
|
"learning_rate": 0.00015606422866725343, |
|
"loss": 0.4464, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.3166077738515901, |
|
"grad_norm": 6.434317111968994, |
|
"learning_rate": 0.00015586654305438456, |
|
"loss": 0.4161, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3173616018845701, |
|
"grad_norm": 7.1308488845825195, |
|
"learning_rate": 0.00015566853954937694, |
|
"loss": 0.4558, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.31811542991755004, |
|
"grad_norm": 7.582878112792969, |
|
"learning_rate": 0.00015547021927891144, |
|
"loss": 0.4789, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.31886925795053006, |
|
"grad_norm": 6.73392391204834, |
|
"learning_rate": 0.00015527158337147112, |
|
"loss": 0.45, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.31962308598351, |
|
"grad_norm": 7.364933967590332, |
|
"learning_rate": 0.00015507263295733528, |
|
"loss": 0.4156, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.32037691401649, |
|
"grad_norm": 6.4493842124938965, |
|
"learning_rate": 0.00015487336916857278, |
|
"loss": 0.4147, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.32113074204946995, |
|
"grad_norm": 6.886701583862305, |
|
"learning_rate": 0.00015467379313903557, |
|
"loss": 0.4271, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.3218845700824499, |
|
"grad_norm": 6.938616752624512, |
|
"learning_rate": 0.00015447390600435238, |
|
"loss": 0.4356, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.32263839811542994, |
|
"grad_norm": 7.1376214027404785, |
|
"learning_rate": 0.00015427370890192224, |
|
"loss": 0.411, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.3233922261484099, |
|
"grad_norm": 7.260872840881348, |
|
"learning_rate": 0.00015407320297090786, |
|
"loss": 0.4505, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.32414605418138986, |
|
"grad_norm": 7.035525321960449, |
|
"learning_rate": 0.00015387238935222927, |
|
"loss": 0.4032, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.32489988221436983, |
|
"grad_norm": 6.7771782875061035, |
|
"learning_rate": 0.00015367126918855738, |
|
"loss": 0.4135, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.32565371024734985, |
|
"grad_norm": 7.255315780639648, |
|
"learning_rate": 0.0001534698436243073, |
|
"loss": 0.4376, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.3264075382803298, |
|
"grad_norm": 6.563286781311035, |
|
"learning_rate": 0.00015326811380563204, |
|
"loss": 0.3936, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.3271613663133098, |
|
"grad_norm": 8.582233428955078, |
|
"learning_rate": 0.0001530660808804158, |
|
"loss": 0.3979, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.32791519434628974, |
|
"grad_norm": 6.628231048583984, |
|
"learning_rate": 0.00015286374599826754, |
|
"loss": 0.4143, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.3286690223792697, |
|
"grad_norm": 6.581121921539307, |
|
"learning_rate": 0.00015266111031051442, |
|
"loss": 0.4313, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.3294228504122497, |
|
"grad_norm": 6.923291206359863, |
|
"learning_rate": 0.00015245817497019524, |
|
"loss": 0.3921, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.3301766784452297, |
|
"grad_norm": 7.172369480133057, |
|
"learning_rate": 0.00015225494113205393, |
|
"loss": 0.4249, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.33093050647820965, |
|
"grad_norm": 7.134575843811035, |
|
"learning_rate": 0.00015205140995253283, |
|
"loss": 0.4148, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.3316843345111896, |
|
"grad_norm": 8.403553009033203, |
|
"learning_rate": 0.00015184758258976637, |
|
"loss": 0.447, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.33243816254416964, |
|
"grad_norm": 7.707136154174805, |
|
"learning_rate": 0.00015164346020357417, |
|
"loss": 0.4165, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.3331919905771496, |
|
"grad_norm": 8.08395004272461, |
|
"learning_rate": 0.00015143904395545466, |
|
"loss": 0.461, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.33394581861012956, |
|
"grad_norm": 9.609329223632812, |
|
"learning_rate": 0.0001512343350085784, |
|
"loss": 0.5137, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.3346996466431095, |
|
"grad_norm": 9.876978874206543, |
|
"learning_rate": 0.0001510293345277815, |
|
"loss": 0.5053, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.3354534746760895, |
|
"grad_norm": 9.40042495727539, |
|
"learning_rate": 0.0001508240436795589, |
|
"loss": 0.5114, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.3362073027090695, |
|
"grad_norm": 10.623950958251953, |
|
"learning_rate": 0.00015061846363205784, |
|
"loss": 0.497, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.3369611307420495, |
|
"grad_norm": 10.993450164794922, |
|
"learning_rate": 0.00015041259555507108, |
|
"loss": 0.49, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.33771495877502944, |
|
"grad_norm": 11.963092803955078, |
|
"learning_rate": 0.00015020644062003046, |
|
"loss": 0.5261, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.3384687868080094, |
|
"grad_norm": 11.985857963562012, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.5063, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.3392226148409894, |
|
"grad_norm": 13.582792282104492, |
|
"learning_rate": 0.00014979327486966938, |
|
"loss": 0.4568, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3399764428739694, |
|
"grad_norm": 10.956193923950195, |
|
"learning_rate": 0.0001495862664053471, |
|
"loss": 0.6271, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.34073027090694935, |
|
"grad_norm": 10.826944351196289, |
|
"learning_rate": 0.0001493789757849541, |
|
"loss": 0.5646, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.3414840989399293, |
|
"grad_norm": 9.086105346679688, |
|
"learning_rate": 0.00014917140418801655, |
|
"loss": 0.5347, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.3422379269729093, |
|
"grad_norm": 7.542895317077637, |
|
"learning_rate": 0.00014896355279565976, |
|
"loss": 0.547, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.3429917550058893, |
|
"grad_norm": 6.925205707550049, |
|
"learning_rate": 0.00014875542279060085, |
|
"loss": 0.5174, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.34374558303886926, |
|
"grad_norm": 6.2740159034729, |
|
"learning_rate": 0.00014854701535714244, |
|
"loss": 0.4569, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.3444994110718492, |
|
"grad_norm": 6.751154899597168, |
|
"learning_rate": 0.00014833833168116582, |
|
"loss": 0.4848, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.3452532391048292, |
|
"grad_norm": 6.805966854095459, |
|
"learning_rate": 0.00014812937295012406, |
|
"loss": 0.454, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.3460070671378092, |
|
"grad_norm": 6.805473327636719, |
|
"learning_rate": 0.00014792014035303535, |
|
"loss": 0.4459, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.3467608951707892, |
|
"grad_norm": 6.896597385406494, |
|
"learning_rate": 0.00014771063508047636, |
|
"loss": 0.4492, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.34751472320376914, |
|
"grad_norm": 6.992384433746338, |
|
"learning_rate": 0.00014750085832457519, |
|
"loss": 0.4737, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.3482685512367491, |
|
"grad_norm": 7.02846622467041, |
|
"learning_rate": 0.00014729081127900476, |
|
"loss": 0.4786, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.34902237926972907, |
|
"grad_norm": 7.123291015625, |
|
"learning_rate": 0.0001470804951389761, |
|
"loss": 0.4397, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.3497762073027091, |
|
"grad_norm": 6.681251049041748, |
|
"learning_rate": 0.00014686991110123135, |
|
"loss": 0.4398, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.35053003533568905, |
|
"grad_norm": 7.414073944091797, |
|
"learning_rate": 0.00014665906036403706, |
|
"loss": 0.4626, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.351283863368669, |
|
"grad_norm": 6.917845726013184, |
|
"learning_rate": 0.00014644794412717736, |
|
"loss": 0.4312, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.352037691401649, |
|
"grad_norm": 6.451867580413818, |
|
"learning_rate": 0.00014623656359194712, |
|
"loss": 0.4101, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.352791519434629, |
|
"grad_norm": 7.152139663696289, |
|
"learning_rate": 0.00014602491996114516, |
|
"loss": 0.4518, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.35354534746760896, |
|
"grad_norm": 7.701825141906738, |
|
"learning_rate": 0.0001458130144390673, |
|
"loss": 0.4568, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.3542991755005889, |
|
"grad_norm": 7.278562545776367, |
|
"learning_rate": 0.00014560084823149965, |
|
"loss": 0.4222, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3550530035335689, |
|
"grad_norm": 6.47285270690918, |
|
"learning_rate": 0.0001453884225457116, |
|
"loss": 0.465, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.35580683156654885, |
|
"grad_norm": 6.140552520751953, |
|
"learning_rate": 0.00014517573859044907, |
|
"loss": 0.4219, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.3565606595995289, |
|
"grad_norm": 6.481984615325928, |
|
"learning_rate": 0.00014496279757592766, |
|
"loss": 0.4446, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.35731448763250884, |
|
"grad_norm": 6.575818061828613, |
|
"learning_rate": 0.0001447496007138255, |
|
"loss": 0.4297, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.3580683156654888, |
|
"grad_norm": 6.637454509735107, |
|
"learning_rate": 0.00014453614921727668, |
|
"loss": 0.4311, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.35882214369846877, |
|
"grad_norm": 6.832921981811523, |
|
"learning_rate": 0.00014432244430086423, |
|
"loss": 0.4469, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.3595759717314488, |
|
"grad_norm": 7.260216236114502, |
|
"learning_rate": 0.00014410848718061312, |
|
"loss": 0.4206, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.36032979976442875, |
|
"grad_norm": 6.812548637390137, |
|
"learning_rate": 0.00014389427907398342, |
|
"loss": 0.4146, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.3610836277974087, |
|
"grad_norm": 6.668044090270996, |
|
"learning_rate": 0.00014367982119986342, |
|
"loss": 0.4333, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.3618374558303887, |
|
"grad_norm": 7.100220680236816, |
|
"learning_rate": 0.00014346511477856259, |
|
"loss": 0.4174, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3625912838633687, |
|
"grad_norm": 7.15718936920166, |
|
"learning_rate": 0.0001432501610318047, |
|
"loss": 0.4258, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.36334511189634866, |
|
"grad_norm": 7.051331520080566, |
|
"learning_rate": 0.00014303496118272084, |
|
"loss": 0.4048, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.3640989399293286, |
|
"grad_norm": 7.344452381134033, |
|
"learning_rate": 0.0001428195164558425, |
|
"loss": 0.4137, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.3648527679623086, |
|
"grad_norm": 7.5303850173950195, |
|
"learning_rate": 0.00014260382807709457, |
|
"loss": 0.421, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.36560659599528855, |
|
"grad_norm": 6.944647789001465, |
|
"learning_rate": 0.0001423878972737883, |
|
"loss": 0.4059, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.36636042402826857, |
|
"grad_norm": 7.10966682434082, |
|
"learning_rate": 0.0001421717252746145, |
|
"loss": 0.4038, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.36711425206124854, |
|
"grad_norm": 6.702695369720459, |
|
"learning_rate": 0.00014195531330963635, |
|
"loss": 0.3999, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.3678680800942285, |
|
"grad_norm": 8.255915641784668, |
|
"learning_rate": 0.0001417386626102825, |
|
"loss": 0.3961, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.36862190812720846, |
|
"grad_norm": 8.199605941772461, |
|
"learning_rate": 0.00014152177440934012, |
|
"loss": 0.4079, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.3693757361601885, |
|
"grad_norm": 7.717386245727539, |
|
"learning_rate": 0.0001413046499409477, |
|
"loss": 0.3932, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.37012956419316845, |
|
"grad_norm": 7.842260837554932, |
|
"learning_rate": 0.0001410872904405882, |
|
"loss": 0.4383, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.3708833922261484, |
|
"grad_norm": 8.819681167602539, |
|
"learning_rate": 0.00014086969714508196, |
|
"loss": 0.4763, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.3716372202591284, |
|
"grad_norm": 8.904485702514648, |
|
"learning_rate": 0.00014065187129257964, |
|
"loss": 0.4711, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.37239104829210834, |
|
"grad_norm": 9.481599807739258, |
|
"learning_rate": 0.00014043381412255526, |
|
"loss": 0.5002, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.37314487632508836, |
|
"grad_norm": 9.55698013305664, |
|
"learning_rate": 0.00014021552687579902, |
|
"loss": 0.454, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.3738987043580683, |
|
"grad_norm": 9.685362815856934, |
|
"learning_rate": 0.00013999701079441028, |
|
"loss": 0.4687, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.3746525323910483, |
|
"grad_norm": 10.087312698364258, |
|
"learning_rate": 0.00013977826712179058, |
|
"loss": 0.4855, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.37540636042402825, |
|
"grad_norm": 10.978914260864258, |
|
"learning_rate": 0.00013955929710263653, |
|
"loss": 0.485, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.37616018845700827, |
|
"grad_norm": 11.427350044250488, |
|
"learning_rate": 0.00013934010198293257, |
|
"loss": 0.4536, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.37691401648998824, |
|
"grad_norm": 12.61874771118164, |
|
"learning_rate": 0.00013912068300994413, |
|
"loss": 0.4844, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3776678445229682, |
|
"grad_norm": 11.156290054321289, |
|
"learning_rate": 0.0001389010414322104, |
|
"loss": 0.6025, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.37842167255594816, |
|
"grad_norm": 10.892552375793457, |
|
"learning_rate": 0.0001386811784995371, |
|
"loss": 0.6063, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.3791755005889281, |
|
"grad_norm": 9.48608112335205, |
|
"learning_rate": 0.00013846109546298971, |
|
"loss": 0.5153, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.37992932862190815, |
|
"grad_norm": 7.735827922821045, |
|
"learning_rate": 0.00013824079357488598, |
|
"loss": 0.5102, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.3806831566548881, |
|
"grad_norm": 6.837904453277588, |
|
"learning_rate": 0.0001380202740887891, |
|
"loss": 0.4952, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.3814369846878681, |
|
"grad_norm": 6.260585308074951, |
|
"learning_rate": 0.00013779953825950034, |
|
"loss": 0.4751, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.38219081272084804, |
|
"grad_norm": 6.398446083068848, |
|
"learning_rate": 0.00013757858734305203, |
|
"loss": 0.4449, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.38294464075382806, |
|
"grad_norm": 7.3623881340026855, |
|
"learning_rate": 0.0001373574225967004, |
|
"loss": 0.4859, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.383698468786808, |
|
"grad_norm": 7.673310279846191, |
|
"learning_rate": 0.00013713604527891844, |
|
"loss": 0.4804, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.384452296819788, |
|
"grad_norm": 6.531475067138672, |
|
"learning_rate": 0.00013691445664938866, |
|
"loss": 0.4491, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.38520612485276795, |
|
"grad_norm": 6.5302300453186035, |
|
"learning_rate": 0.00013669265796899607, |
|
"loss": 0.4277, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.3859599528857479, |
|
"grad_norm": 6.498359680175781, |
|
"learning_rate": 0.00013647065049982078, |
|
"loss": 0.4473, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.38671378091872793, |
|
"grad_norm": 7.777768135070801, |
|
"learning_rate": 0.0001362484355051311, |
|
"loss": 0.4485, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.3874676089517079, |
|
"grad_norm": 6.4952192306518555, |
|
"learning_rate": 0.00013602601424937604, |
|
"loss": 0.4144, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.38822143698468786, |
|
"grad_norm": 7.111438274383545, |
|
"learning_rate": 0.00013580338799817844, |
|
"loss": 0.4314, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.3889752650176678, |
|
"grad_norm": 6.711978435516357, |
|
"learning_rate": 0.00013558055801832748, |
|
"loss": 0.4476, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.38972909305064785, |
|
"grad_norm": 6.2299370765686035, |
|
"learning_rate": 0.0001353575255777717, |
|
"loss": 0.4211, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.3904829210836278, |
|
"grad_norm": 6.2404046058654785, |
|
"learning_rate": 0.0001351342919456116, |
|
"loss": 0.4195, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.3912367491166078, |
|
"grad_norm": 7.3141679763793945, |
|
"learning_rate": 0.0001349108583920925, |
|
"loss": 0.4473, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.39199057714958774, |
|
"grad_norm": 7.678971767425537, |
|
"learning_rate": 0.00013468722618859743, |
|
"loss": 0.4102, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3927444051825677, |
|
"grad_norm": 6.773143291473389, |
|
"learning_rate": 0.0001344633966076396, |
|
"loss": 0.4518, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.3934982332155477, |
|
"grad_norm": 6.161088943481445, |
|
"learning_rate": 0.00013423937092285555, |
|
"loss": 0.4, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.3942520612485277, |
|
"grad_norm": 6.478328227996826, |
|
"learning_rate": 0.00013401515040899746, |
|
"loss": 0.4607, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.39500588928150765, |
|
"grad_norm": 6.1380157470703125, |
|
"learning_rate": 0.00013379073634192632, |
|
"loss": 0.4108, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.3957597173144876, |
|
"grad_norm": 6.8945441246032715, |
|
"learning_rate": 0.00013356612999860436, |
|
"loss": 0.4032, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.39651354534746763, |
|
"grad_norm": 6.745527267456055, |
|
"learning_rate": 0.000133341332657088, |
|
"loss": 0.402, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.3972673733804476, |
|
"grad_norm": 6.959543704986572, |
|
"learning_rate": 0.00013311634559652036, |
|
"loss": 0.4258, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.39802120141342756, |
|
"grad_norm": 6.5237298011779785, |
|
"learning_rate": 0.00013289117009712418, |
|
"loss": 0.4042, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.3987750294464075, |
|
"grad_norm": 6.997231483459473, |
|
"learning_rate": 0.00013266580744019445, |
|
"loss": 0.424, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.3995288574793875, |
|
"grad_norm": 7.053787708282471, |
|
"learning_rate": 0.00013244025890809112, |
|
"loss": 0.4436, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.4002826855123675, |
|
"grad_norm": 6.5921831130981445, |
|
"learning_rate": 0.00013221452578423176, |
|
"loss": 0.4262, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.4010365135453475, |
|
"grad_norm": 7.524543285369873, |
|
"learning_rate": 0.00013198860935308444, |
|
"loss": 0.4205, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.40179034157832744, |
|
"grad_norm": 6.691077709197998, |
|
"learning_rate": 0.00013176251090016007, |
|
"loss": 0.4303, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.4025441696113074, |
|
"grad_norm": 6.8649749755859375, |
|
"learning_rate": 0.0001315362317120055, |
|
"loss": 0.4293, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.4032979976442874, |
|
"grad_norm": 7.226325035095215, |
|
"learning_rate": 0.00013130977307619594, |
|
"loss": 0.4118, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.4040518256772674, |
|
"grad_norm": 6.9132843017578125, |
|
"learning_rate": 0.0001310831362813276, |
|
"loss": 0.4086, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.40480565371024735, |
|
"grad_norm": 6.638665199279785, |
|
"learning_rate": 0.00013085632261701063, |
|
"loss": 0.404, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.4055594817432273, |
|
"grad_norm": 6.809209823608398, |
|
"learning_rate": 0.00013062933337386142, |
|
"loss": 0.378, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.4063133097762073, |
|
"grad_norm": 6.697812557220459, |
|
"learning_rate": 0.00013040216984349555, |
|
"loss": 0.4068, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.4070671378091873, |
|
"grad_norm": 7.231639862060547, |
|
"learning_rate": 0.00013017483331852035, |
|
"loss": 0.4167, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.40782096584216726, |
|
"grad_norm": 7.607770919799805, |
|
"learning_rate": 0.00012994732509252744, |
|
"loss": 0.4298, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.4085747938751472, |
|
"grad_norm": 7.685420989990234, |
|
"learning_rate": 0.00012971964646008542, |
|
"loss": 0.4435, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.4093286219081272, |
|
"grad_norm": 9.00213623046875, |
|
"learning_rate": 0.00012949179871673278, |
|
"loss": 0.5072, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.4100824499411072, |
|
"grad_norm": 9.699268341064453, |
|
"learning_rate": 0.00012926378315896998, |
|
"loss": 0.5158, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.41083627797408717, |
|
"grad_norm": 10.096549987792969, |
|
"learning_rate": 0.00012903560108425258, |
|
"loss": 0.479, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.41159010600706714, |
|
"grad_norm": 9.205822944641113, |
|
"learning_rate": 0.00012880725379098352, |
|
"loss": 0.4844, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.4123439340400471, |
|
"grad_norm": 10.534090995788574, |
|
"learning_rate": 0.00012857874257850605, |
|
"loss": 0.4998, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.41309776207302706, |
|
"grad_norm": 11.49348258972168, |
|
"learning_rate": 0.00012835006874709594, |
|
"loss": 0.4969, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.4138515901060071, |
|
"grad_norm": 11.891164779663086, |
|
"learning_rate": 0.00012812123359795446, |
|
"loss": 0.5109, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.41460541813898705, |
|
"grad_norm": 12.372316360473633, |
|
"learning_rate": 0.00012789223843320073, |
|
"loss": 0.4808, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.415359246171967, |
|
"grad_norm": 9.265199661254883, |
|
"learning_rate": 0.0001276630845558644, |
|
"loss": 0.6065, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.416113074204947, |
|
"grad_norm": 10.428581237792969, |
|
"learning_rate": 0.00012743377326987826, |
|
"loss": 0.5885, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.416866902237927, |
|
"grad_norm": 8.8326997756958, |
|
"learning_rate": 0.00012720430588007077, |
|
"loss": 0.5599, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.41762073027090696, |
|
"grad_norm": 6.87199592590332, |
|
"learning_rate": 0.00012697468369215863, |
|
"loss": 0.5212, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.4183745583038869, |
|
"grad_norm": 6.59550142288208, |
|
"learning_rate": 0.00012674490801273938, |
|
"loss": 0.5265, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.4191283863368669, |
|
"grad_norm": 5.809760093688965, |
|
"learning_rate": 0.00012651498014928402, |
|
"loss": 0.4861, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.41988221436984685, |
|
"grad_norm": 5.872656345367432, |
|
"learning_rate": 0.00012628490141012937, |
|
"loss": 0.4476, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.42063604240282687, |
|
"grad_norm": 6.835720062255859, |
|
"learning_rate": 0.000126054673104471, |
|
"loss": 0.4838, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.42138987043580683, |
|
"grad_norm": 6.669496059417725, |
|
"learning_rate": 0.00012582429654235523, |
|
"loss": 0.4167, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.4221436984687868, |
|
"grad_norm": 6.77216100692749, |
|
"learning_rate": 0.00012559377303467226, |
|
"loss": 0.4469, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.42289752650176676, |
|
"grad_norm": 6.118035793304443, |
|
"learning_rate": 0.00012536310389314832, |
|
"loss": 0.439, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.4236513545347468, |
|
"grad_norm": 6.0063886642456055, |
|
"learning_rate": 0.0001251322904303383, |
|
"loss": 0.4246, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.42440518256772675, |
|
"grad_norm": 6.384454727172852, |
|
"learning_rate": 0.00012490133395961844, |
|
"loss": 0.4427, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.4251590106007067, |
|
"grad_norm": 6.875798225402832, |
|
"learning_rate": 0.00012467023579517856, |
|
"loss": 0.4746, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.4259128386336867, |
|
"grad_norm": 6.876395225524902, |
|
"learning_rate": 0.00012443899725201482, |
|
"loss": 0.4639, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 7.060841083526611, |
|
"learning_rate": 0.00012420761964592223, |
|
"loss": 0.4449, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.42742049469964666, |
|
"grad_norm": 6.859095573425293, |
|
"learning_rate": 0.000123976104293487, |
|
"loss": 0.4127, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.4281743227326266, |
|
"grad_norm": 6.3295135498046875, |
|
"learning_rate": 0.00012374445251207914, |
|
"loss": 0.4436, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.4289281507656066, |
|
"grad_norm": 6.203479766845703, |
|
"learning_rate": 0.00012351266561984507, |
|
"loss": 0.4493, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.42968197879858655, |
|
"grad_norm": 6.393275737762451, |
|
"learning_rate": 0.00012328074493569993, |
|
"loss": 0.451, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.43043580683156657, |
|
"grad_norm": 6.78492546081543, |
|
"learning_rate": 0.0001230486917793202, |
|
"loss": 0.4278, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.43118963486454653, |
|
"grad_norm": 6.327200889587402, |
|
"learning_rate": 0.00012281650747113612, |
|
"loss": 0.4422, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.4319434628975265, |
|
"grad_norm": 6.7098822593688965, |
|
"learning_rate": 0.0001225841933323242, |
|
"loss": 0.4556, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.43269729093050646, |
|
"grad_norm": 6.249898910522461, |
|
"learning_rate": 0.00012235175068479984, |
|
"loss": 0.4184, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.4334511189634865, |
|
"grad_norm": 6.380219459533691, |
|
"learning_rate": 0.00012211918085120954, |
|
"loss": 0.437, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.43420494699646645, |
|
"grad_norm": 6.367920875549316, |
|
"learning_rate": 0.00012188648515492355, |
|
"loss": 0.4269, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.4349587750294464, |
|
"grad_norm": 6.438598155975342, |
|
"learning_rate": 0.00012165366492002832, |
|
"loss": 0.4298, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.4357126030624264, |
|
"grad_norm": 6.798791408538818, |
|
"learning_rate": 0.00012142072147131898, |
|
"loss": 0.4204, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.43646643109540634, |
|
"grad_norm": 6.528103828430176, |
|
"learning_rate": 0.00012118765613429173, |
|
"loss": 0.4448, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.43722025912838636, |
|
"grad_norm": 6.5673909187316895, |
|
"learning_rate": 0.0001209544702351363, |
|
"loss": 0.432, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.4379740871613663, |
|
"grad_norm": 7.303831577301025, |
|
"learning_rate": 0.00012072116510072858, |
|
"loss": 0.4125, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.4387279151943463, |
|
"grad_norm": 6.5421576499938965, |
|
"learning_rate": 0.00012048774205862279, |
|
"loss": 0.4171, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.43948174322732625, |
|
"grad_norm": 6.537741661071777, |
|
"learning_rate": 0.0001202542024370441, |
|
"loss": 0.385, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.44023557126030627, |
|
"grad_norm": 6.6051530838012695, |
|
"learning_rate": 0.00012002054756488115, |
|
"loss": 0.3888, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.44098939929328623, |
|
"grad_norm": 6.796999454498291, |
|
"learning_rate": 0.00011978677877167822, |
|
"loss": 0.4049, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.4417432273262662, |
|
"grad_norm": 7.154036521911621, |
|
"learning_rate": 0.00011955289738762796, |
|
"loss": 0.4168, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.44249705535924616, |
|
"grad_norm": 6.852260112762451, |
|
"learning_rate": 0.00011931890474356358, |
|
"loss": 0.381, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.4432508833922261, |
|
"grad_norm": 6.91892671585083, |
|
"learning_rate": 0.00011908480217095141, |
|
"loss": 0.3895, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.44400471142520614, |
|
"grad_norm": 7.690057277679443, |
|
"learning_rate": 0.00011885059100188341, |
|
"loss": 0.4504, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.4447585394581861, |
|
"grad_norm": 7.000772476196289, |
|
"learning_rate": 0.00011861627256906929, |
|
"loss": 0.3868, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.4455123674911661, |
|
"grad_norm": 7.221988201141357, |
|
"learning_rate": 0.00011838184820582923, |
|
"loss": 0.4119, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.44626619552414604, |
|
"grad_norm": 8.583606719970703, |
|
"learning_rate": 0.00011814731924608616, |
|
"loss": 0.4087, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.44702002355712606, |
|
"grad_norm": 8.559534072875977, |
|
"learning_rate": 0.00011791268702435816, |
|
"loss": 0.4469, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.447773851590106, |
|
"grad_norm": 8.477254867553711, |
|
"learning_rate": 0.0001176779528757509, |
|
"loss": 0.476, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.448527679623086, |
|
"grad_norm": 9.82533073425293, |
|
"learning_rate": 0.00011744311813595006, |
|
"loss": 0.5395, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.44928150765606595, |
|
"grad_norm": 9.407917022705078, |
|
"learning_rate": 0.00011720818414121368, |
|
"loss": 0.4716, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.4500353356890459, |
|
"grad_norm": 11.39129638671875, |
|
"learning_rate": 0.00011697315222836458, |
|
"loss": 0.4827, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.45078916372202593, |
|
"grad_norm": 11.540337562561035, |
|
"learning_rate": 0.0001167380237347828, |
|
"loss": 0.4713, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.4515429917550059, |
|
"grad_norm": 10.345648765563965, |
|
"learning_rate": 0.00011650279999839787, |
|
"loss": 0.4148, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.45229681978798586, |
|
"grad_norm": 12.826940536499023, |
|
"learning_rate": 0.00011626748235768128, |
|
"loss": 0.487, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4530506478209658, |
|
"grad_norm": 9.553250312805176, |
|
"learning_rate": 0.00011603207215163894, |
|
"loss": 0.5809, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.45380447585394584, |
|
"grad_norm": 9.77419662475586, |
|
"learning_rate": 0.0001157965707198034, |
|
"loss": 0.5538, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.4545583038869258, |
|
"grad_norm": 8.743382453918457, |
|
"learning_rate": 0.00011556097940222628, |
|
"loss": 0.5516, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.45531213191990577, |
|
"grad_norm": 7.538958549499512, |
|
"learning_rate": 0.00011532529953947075, |
|
"loss": 0.5119, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.45606595995288574, |
|
"grad_norm": 6.539525032043457, |
|
"learning_rate": 0.00011508953247260379, |
|
"loss": 0.499, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.4568197879858657, |
|
"grad_norm": 6.682277679443359, |
|
"learning_rate": 0.00011485367954318856, |
|
"loss": 0.4594, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.4575736160188457, |
|
"grad_norm": 5.594506740570068, |
|
"learning_rate": 0.0001146177420932768, |
|
"loss": 0.4609, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.4583274440518257, |
|
"grad_norm": 6.195127964019775, |
|
"learning_rate": 0.00011438172146540123, |
|
"loss": 0.4413, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.45908127208480565, |
|
"grad_norm": 6.665927410125732, |
|
"learning_rate": 0.00011414561900256784, |
|
"loss": 0.4492, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.4598351001177856, |
|
"grad_norm": 7.045360088348389, |
|
"learning_rate": 0.00011390943604824826, |
|
"loss": 0.4508, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.46058892815076563, |
|
"grad_norm": 7.470615386962891, |
|
"learning_rate": 0.00011367317394637218, |
|
"loss": 0.46, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.4613427561837456, |
|
"grad_norm": 6.948364734649658, |
|
"learning_rate": 0.00011343683404131964, |
|
"loss": 0.477, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.46209658421672556, |
|
"grad_norm": 6.797374248504639, |
|
"learning_rate": 0.00011320041767791336, |
|
"loss": 0.4726, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.4628504122497055, |
|
"grad_norm": 6.488336563110352, |
|
"learning_rate": 0.00011296392620141114, |
|
"loss": 0.4403, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.4636042402826855, |
|
"grad_norm": 7.050676345825195, |
|
"learning_rate": 0.00011272736095749823, |
|
"loss": 0.475, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.4643580683156655, |
|
"grad_norm": 6.4435038566589355, |
|
"learning_rate": 0.00011249072329227959, |
|
"loss": 0.4188, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.46511189634864547, |
|
"grad_norm": 6.662125110626221, |
|
"learning_rate": 0.0001122540145522723, |
|
"loss": 0.4365, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.46586572438162543, |
|
"grad_norm": 6.387564659118652, |
|
"learning_rate": 0.00011201723608439778, |
|
"loss": 0.4237, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.4666195524146054, |
|
"grad_norm": 6.151999473571777, |
|
"learning_rate": 0.0001117803892359744, |
|
"loss": 0.3967, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.4673733804475854, |
|
"grad_norm": 6.0764055252075195, |
|
"learning_rate": 0.00011154347535470947, |
|
"loss": 0.4032, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.4681272084805654, |
|
"grad_norm": 6.069274425506592, |
|
"learning_rate": 0.00011130649578869173, |
|
"loss": 0.4234, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.46888103651354535, |
|
"grad_norm": 6.283833980560303, |
|
"learning_rate": 0.00011106945188638378, |
|
"loss": 0.4115, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.4696348645465253, |
|
"grad_norm": 6.327964782714844, |
|
"learning_rate": 0.00011083234499661426, |
|
"loss": 0.4293, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.4703886925795053, |
|
"grad_norm": 6.516750812530518, |
|
"learning_rate": 0.00011059517646857023, |
|
"loss": 0.3893, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.4711425206124853, |
|
"grad_norm": 7.370739936828613, |
|
"learning_rate": 0.00011035794765178941, |
|
"loss": 0.4385, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.47189634864546526, |
|
"grad_norm": 7.1700568199157715, |
|
"learning_rate": 0.0001101206598961527, |
|
"loss": 0.4221, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.4726501766784452, |
|
"grad_norm": 6.261050701141357, |
|
"learning_rate": 0.00010988331455187628, |
|
"loss": 0.4389, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.4734040047114252, |
|
"grad_norm": 6.810924530029297, |
|
"learning_rate": 0.00010964591296950406, |
|
"loss": 0.4653, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.4741578327444052, |
|
"grad_norm": 6.419404983520508, |
|
"learning_rate": 0.00010940845649989994, |
|
"loss": 0.4074, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.47491166077738517, |
|
"grad_norm": 6.0266008377075195, |
|
"learning_rate": 0.00010917094649424018, |
|
"loss": 0.3729, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.47566548881036513, |
|
"grad_norm": 6.674122333526611, |
|
"learning_rate": 0.00010893338430400562, |
|
"loss": 0.4016, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.4764193168433451, |
|
"grad_norm": 6.93697452545166, |
|
"learning_rate": 0.00010869577128097404, |
|
"loss": 0.3884, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.47717314487632506, |
|
"grad_norm": 6.370805263519287, |
|
"learning_rate": 0.00010845810877721252, |
|
"loss": 0.3835, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.4779269729093051, |
|
"grad_norm": 6.402405738830566, |
|
"learning_rate": 0.00010822039814506964, |
|
"loss": 0.396, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.47868080094228505, |
|
"grad_norm": 6.631165027618408, |
|
"learning_rate": 0.00010798264073716791, |
|
"loss": 0.4034, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.479434628975265, |
|
"grad_norm": 7.069218635559082, |
|
"learning_rate": 0.00010774483790639591, |
|
"loss": 0.4071, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.480188457008245, |
|
"grad_norm": 6.614718914031982, |
|
"learning_rate": 0.00010750699100590076, |
|
"loss": 0.3959, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.480942285041225, |
|
"grad_norm": 6.693352699279785, |
|
"learning_rate": 0.00010726910138908032, |
|
"loss": 0.3853, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.48169611307420496, |
|
"grad_norm": 6.8856940269470215, |
|
"learning_rate": 0.00010703117040957553, |
|
"loss": 0.3904, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.4824499411071849, |
|
"grad_norm": 7.3366522789001465, |
|
"learning_rate": 0.00010679319942126264, |
|
"loss": 0.4061, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.4832037691401649, |
|
"grad_norm": 7.205180644989014, |
|
"learning_rate": 0.00010655518977824566, |
|
"loss": 0.4066, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.48395759717314485, |
|
"grad_norm": 9.314166069030762, |
|
"learning_rate": 0.00010631714283484842, |
|
"loss": 0.4507, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.48471142520612487, |
|
"grad_norm": 8.445844650268555, |
|
"learning_rate": 0.0001060790599456071, |
|
"loss": 0.4467, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.48546525323910483, |
|
"grad_norm": 8.920785903930664, |
|
"learning_rate": 0.00010584094246526237, |
|
"loss": 0.4593, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.4862190812720848, |
|
"grad_norm": 9.759257316589355, |
|
"learning_rate": 0.00010560279174875179, |
|
"loss": 0.5054, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.48697290930506476, |
|
"grad_norm": 9.649422645568848, |
|
"learning_rate": 0.0001053646091512019, |
|
"loss": 0.4891, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.4877267373380448, |
|
"grad_norm": 9.831908226013184, |
|
"learning_rate": 0.00010512639602792088, |
|
"loss": 0.4805, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.48848056537102474, |
|
"grad_norm": 11.026556968688965, |
|
"learning_rate": 0.00010488815373439036, |
|
"loss": 0.4875, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.4892343934040047, |
|
"grad_norm": 10.98789119720459, |
|
"learning_rate": 0.00010464988362625812, |
|
"loss": 0.4852, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.48998822143698467, |
|
"grad_norm": 12.804154396057129, |
|
"learning_rate": 0.00010441158705933016, |
|
"loss": 0.5069, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.4907420494699647, |
|
"grad_norm": 7.31414270401001, |
|
"learning_rate": 0.00010417326538956305, |
|
"loss": 0.5666, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.49149587750294466, |
|
"grad_norm": 7.537758827209473, |
|
"learning_rate": 0.00010393491997305613, |
|
"loss": 0.5592, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.4922497055359246, |
|
"grad_norm": 7.580841064453125, |
|
"learning_rate": 0.00010369655216604397, |
|
"loss": 0.4984, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.4930035335689046, |
|
"grad_norm": 7.048511028289795, |
|
"learning_rate": 0.0001034581633248885, |
|
"loss": 0.5271, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.49375736160188455, |
|
"grad_norm": 6.32865047454834, |
|
"learning_rate": 0.00010321975480607129, |
|
"loss": 0.4999, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.49451118963486457, |
|
"grad_norm": 5.981396675109863, |
|
"learning_rate": 0.00010298132796618596, |
|
"loss": 0.4717, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.49526501766784453, |
|
"grad_norm": 5.971866130828857, |
|
"learning_rate": 0.00010274288416193034, |
|
"loss": 0.4357, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.4960188457008245, |
|
"grad_norm": 5.870616912841797, |
|
"learning_rate": 0.0001025044247500988, |
|
"loss": 0.4475, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.49677267373380446, |
|
"grad_norm": 6.04547119140625, |
|
"learning_rate": 0.00010226595108757451, |
|
"loss": 0.4641, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.4975265017667845, |
|
"grad_norm": 6.311388969421387, |
|
"learning_rate": 0.00010202746453132172, |
|
"loss": 0.4697, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.49828032979976444, |
|
"grad_norm": 5.957773208618164, |
|
"learning_rate": 0.00010178896643837809, |
|
"loss": 0.4381, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.4990341578327444, |
|
"grad_norm": 6.014715671539307, |
|
"learning_rate": 0.00010155045816584691, |
|
"loss": 0.4429, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.49978798586572437, |
|
"grad_norm": 5.99500846862793, |
|
"learning_rate": 0.00010131194107088935, |
|
"loss": 0.4544, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.5005418138987043, |
|
"grad_norm": 6.102397918701172, |
|
"learning_rate": 0.00010107341651071684, |
|
"loss": 0.4437, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.5005418138987043, |
|
"eval_loss": 0.44807884097099304, |
|
"eval_runtime": 126.4853, |
|
"eval_samples_per_second": 17.67, |
|
"eval_steps_per_second": 8.839, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.5012956419316843, |
|
"grad_norm": 5.838627338409424, |
|
"learning_rate": 0.00010083488584258326, |
|
"loss": 0.3961, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.5020494699646643, |
|
"grad_norm": 6.225624084472656, |
|
"learning_rate": 0.00010059635042377725, |
|
"loss": 0.4199, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.5028032979976443, |
|
"grad_norm": 5.906275749206543, |
|
"learning_rate": 0.00010035781161161446, |
|
"loss": 0.4164, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.5035571260306243, |
|
"grad_norm": 5.818455696105957, |
|
"learning_rate": 0.0001001192707634299, |
|
"loss": 0.3753, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.5043109540636043, |
|
"grad_norm": 6.505937099456787, |
|
"learning_rate": 9.988072923657012e-05, |
|
"loss": 0.4058, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.5050647820965842, |
|
"grad_norm": 6.205794811248779, |
|
"learning_rate": 9.964218838838554e-05, |
|
"loss": 0.4176, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5058186101295642, |
|
"grad_norm": 6.019129753112793, |
|
"learning_rate": 9.940364957622276e-05, |
|
"loss": 0.4253, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.5065724381625442, |
|
"grad_norm": 5.988311290740967, |
|
"learning_rate": 9.916511415741676e-05, |
|
"loss": 0.399, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.5073262661955241, |
|
"grad_norm": 6.607666492462158, |
|
"learning_rate": 9.892658348928316e-05, |
|
"loss": 0.4154, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.5080800942285041, |
|
"grad_norm": 5.99027156829834, |
|
"learning_rate": 9.868805892911067e-05, |
|
"loss": 0.387, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.508833922261484, |
|
"grad_norm": 6.09193229675293, |
|
"learning_rate": 9.84495418341531e-05, |
|
"loss": 0.3817, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5095877502944641, |
|
"grad_norm": 6.635573863983154, |
|
"learning_rate": 9.821103356162189e-05, |
|
"loss": 0.4021, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.5103415783274441, |
|
"grad_norm": 6.2010884284973145, |
|
"learning_rate": 9.797253546867831e-05, |
|
"loss": 0.3915, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.511095406360424, |
|
"grad_norm": 6.824472427368164, |
|
"learning_rate": 9.773404891242551e-05, |
|
"loss": 0.3946, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.511849234393404, |
|
"grad_norm": 7.179849147796631, |
|
"learning_rate": 9.749557524990121e-05, |
|
"loss": 0.4281, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.512603062426384, |
|
"grad_norm": 6.765272617340088, |
|
"learning_rate": 9.72571158380697e-05, |
|
"loss": 0.4113, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.513356890459364, |
|
"grad_norm": 6.409517765045166, |
|
"learning_rate": 9.701867203381405e-05, |
|
"loss": 0.387, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.5141107184923439, |
|
"grad_norm": 6.494263172149658, |
|
"learning_rate": 9.678024519392871e-05, |
|
"loss": 0.3783, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.5148645465253239, |
|
"grad_norm": 6.259777545928955, |
|
"learning_rate": 9.654183667511154e-05, |
|
"loss": 0.3996, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.5156183745583038, |
|
"grad_norm": 6.5478363037109375, |
|
"learning_rate": 9.630344783395604e-05, |
|
"loss": 0.3838, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.5163722025912839, |
|
"grad_norm": 7.6854071617126465, |
|
"learning_rate": 9.606508002694386e-05, |
|
"loss": 0.4235, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.5171260306242639, |
|
"grad_norm": 7.029118537902832, |
|
"learning_rate": 9.5826734610437e-05, |
|
"loss": 0.418, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.5178798586572438, |
|
"grad_norm": 7.062952518463135, |
|
"learning_rate": 9.558841294066985e-05, |
|
"loss": 0.4281, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.5186336866902238, |
|
"grad_norm": 6.547257900238037, |
|
"learning_rate": 9.535011637374189e-05, |
|
"loss": 0.4008, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.5193875147232038, |
|
"grad_norm": 7.128522872924805, |
|
"learning_rate": 9.511184626560968e-05, |
|
"loss": 0.4072, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.5201413427561837, |
|
"grad_norm": 6.604221343994141, |
|
"learning_rate": 9.487360397207916e-05, |
|
"loss": 0.3906, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5208951707891637, |
|
"grad_norm": 7.471280574798584, |
|
"learning_rate": 9.463539084879809e-05, |
|
"loss": 0.4373, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.5216489988221437, |
|
"grad_norm": 7.444307804107666, |
|
"learning_rate": 9.439720825124827e-05, |
|
"loss": 0.4245, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.5224028268551236, |
|
"grad_norm": 7.748506546020508, |
|
"learning_rate": 9.415905753473765e-05, |
|
"loss": 0.4267, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.5231566548881037, |
|
"grad_norm": 8.47761344909668, |
|
"learning_rate": 9.392094005439291e-05, |
|
"loss": 0.4861, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.5239104829210837, |
|
"grad_norm": 9.239935874938965, |
|
"learning_rate": 9.368285716515162e-05, |
|
"loss": 0.45, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.5246643109540636, |
|
"grad_norm": 9.59188461303711, |
|
"learning_rate": 9.344481022175436e-05, |
|
"loss": 0.4876, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.5254181389870436, |
|
"grad_norm": 10.498910903930664, |
|
"learning_rate": 9.320680057873735e-05, |
|
"loss": 0.5021, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.5261719670200236, |
|
"grad_norm": 11.162120819091797, |
|
"learning_rate": 9.29688295904245e-05, |
|
"loss": 0.5001, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.5269257950530035, |
|
"grad_norm": 11.781893730163574, |
|
"learning_rate": 9.273089861091969e-05, |
|
"loss": 0.456, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.5276796230859835, |
|
"grad_norm": 15.090996742248535, |
|
"learning_rate": 9.249300899409924e-05, |
|
"loss": 0.5593, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5284334511189634, |
|
"grad_norm": 9.527992248535156, |
|
"learning_rate": 9.225516209360413e-05, |
|
"loss": 0.5803, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.5291872791519434, |
|
"grad_norm": 8.856983184814453, |
|
"learning_rate": 9.201735926283213e-05, |
|
"loss": 0.5268, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.5299411071849235, |
|
"grad_norm": 7.78725528717041, |
|
"learning_rate": 9.177960185493036e-05, |
|
"loss": 0.5227, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.5306949352179035, |
|
"grad_norm": 7.152993679046631, |
|
"learning_rate": 9.154189122278754e-05, |
|
"loss": 0.5067, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.5314487632508834, |
|
"grad_norm": 6.18569278717041, |
|
"learning_rate": 9.1304228719026e-05, |
|
"loss": 0.476, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.5322025912838634, |
|
"grad_norm": 6.376234531402588, |
|
"learning_rate": 9.106661569599442e-05, |
|
"loss": 0.4734, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.5329564193168433, |
|
"grad_norm": 6.275115489959717, |
|
"learning_rate": 9.082905350575986e-05, |
|
"loss": 0.4468, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.5337102473498233, |
|
"grad_norm": 5.899405479431152, |
|
"learning_rate": 9.059154350010008e-05, |
|
"loss": 0.4738, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.5344640753828033, |
|
"grad_norm": 6.213337421417236, |
|
"learning_rate": 9.035408703049596e-05, |
|
"loss": 0.4732, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.5352179034157832, |
|
"grad_norm": 6.043967247009277, |
|
"learning_rate": 9.011668544812377e-05, |
|
"loss": 0.4514, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.5359717314487632, |
|
"grad_norm": 6.495950698852539, |
|
"learning_rate": 8.987934010384733e-05, |
|
"loss": 0.4468, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.5367255594817433, |
|
"grad_norm": 6.062058448791504, |
|
"learning_rate": 8.96420523482106e-05, |
|
"loss": 0.4311, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.5374793875147232, |
|
"grad_norm": 6.561244964599609, |
|
"learning_rate": 8.940482353142983e-05, |
|
"loss": 0.4621, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.5382332155477032, |
|
"grad_norm": 5.8635029792785645, |
|
"learning_rate": 8.916765500338575e-05, |
|
"loss": 0.4189, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.5389870435806832, |
|
"grad_norm": 6.959576606750488, |
|
"learning_rate": 8.893054811361624e-05, |
|
"loss": 0.4382, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.5397408716136631, |
|
"grad_norm": 5.93906307220459, |
|
"learning_rate": 8.869350421130831e-05, |
|
"loss": 0.4202, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.5404946996466431, |
|
"grad_norm": 5.888154029846191, |
|
"learning_rate": 8.845652464529057e-05, |
|
"loss": 0.4098, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.5412485276796231, |
|
"grad_norm": 6.113773345947266, |
|
"learning_rate": 8.821961076402563e-05, |
|
"loss": 0.412, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.542002355712603, |
|
"grad_norm": 6.2954607009887695, |
|
"learning_rate": 8.79827639156022e-05, |
|
"loss": 0.4472, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.542756183745583, |
|
"grad_norm": 6.085266590118408, |
|
"learning_rate": 8.774598544772774e-05, |
|
"loss": 0.4134, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5435100117785631, |
|
"grad_norm": 5.995761871337891, |
|
"learning_rate": 8.750927670772044e-05, |
|
"loss": 0.4236, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.544263839811543, |
|
"grad_norm": 6.094368934631348, |
|
"learning_rate": 8.727263904250178e-05, |
|
"loss": 0.4344, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.545017667844523, |
|
"grad_norm": 6.14577579498291, |
|
"learning_rate": 8.703607379858889e-05, |
|
"loss": 0.396, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.545771495877503, |
|
"grad_norm": 5.814198970794678, |
|
"learning_rate": 8.679958232208668e-05, |
|
"loss": 0.3987, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.5465253239104829, |
|
"grad_norm": 6.348716735839844, |
|
"learning_rate": 8.656316595868037e-05, |
|
"loss": 0.4263, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.5472791519434629, |
|
"grad_norm": 6.51011323928833, |
|
"learning_rate": 8.632682605362784e-05, |
|
"loss": 0.4361, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.5480329799764428, |
|
"grad_norm": 6.134734630584717, |
|
"learning_rate": 8.609056395175175e-05, |
|
"loss": 0.3946, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.5487868080094228, |
|
"grad_norm": 6.129810333251953, |
|
"learning_rate": 8.585438099743217e-05, |
|
"loss": 0.3948, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.5495406360424028, |
|
"grad_norm": 6.51365852355957, |
|
"learning_rate": 8.56182785345988e-05, |
|
"loss": 0.4182, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.5502944640753828, |
|
"grad_norm": 6.257938861846924, |
|
"learning_rate": 8.538225790672322e-05, |
|
"loss": 0.4041, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5510482921083628, |
|
"grad_norm": 6.626195430755615, |
|
"learning_rate": 8.514632045681145e-05, |
|
"loss": 0.4291, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.5518021201413428, |
|
"grad_norm": 6.350541591644287, |
|
"learning_rate": 8.491046752739624e-05, |
|
"loss": 0.4113, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.5525559481743227, |
|
"grad_norm": 6.342377185821533, |
|
"learning_rate": 8.467470046052927e-05, |
|
"loss": 0.3725, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.5533097762073027, |
|
"grad_norm": 6.338717460632324, |
|
"learning_rate": 8.443902059777373e-05, |
|
"loss": 0.4044, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.5540636042402827, |
|
"grad_norm": 6.489543914794922, |
|
"learning_rate": 8.420342928019666e-05, |
|
"loss": 0.3806, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.5548174322732626, |
|
"grad_norm": 6.675236701965332, |
|
"learning_rate": 8.396792784836108e-05, |
|
"loss": 0.3937, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.5555712603062426, |
|
"grad_norm": 7.242746829986572, |
|
"learning_rate": 8.373251764231872e-05, |
|
"loss": 0.3968, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.5563250883392226, |
|
"grad_norm": 6.987369537353516, |
|
"learning_rate": 8.349720000160218e-05, |
|
"loss": 0.3878, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.5570789163722026, |
|
"grad_norm": 7.393560886383057, |
|
"learning_rate": 8.326197626521723e-05, |
|
"loss": 0.3883, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.5578327444051826, |
|
"grad_norm": 7.474055290222168, |
|
"learning_rate": 8.30268477716354e-05, |
|
"loss": 0.4183, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5585865724381626, |
|
"grad_norm": 7.556806564331055, |
|
"learning_rate": 8.279181585878635e-05, |
|
"loss": 0.4282, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.5593404004711425, |
|
"grad_norm": 8.794517517089844, |
|
"learning_rate": 8.255688186404996e-05, |
|
"loss": 0.4694, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.5600942285041225, |
|
"grad_norm": 9.162858963012695, |
|
"learning_rate": 8.232204712424911e-05, |
|
"loss": 0.4888, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.5608480565371025, |
|
"grad_norm": 9.154852867126465, |
|
"learning_rate": 8.208731297564189e-05, |
|
"loss": 0.4735, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.5616018845700824, |
|
"grad_norm": 9.025120735168457, |
|
"learning_rate": 8.185268075391388e-05, |
|
"loss": 0.4743, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.5623557126030624, |
|
"grad_norm": 9.328535079956055, |
|
"learning_rate": 8.161815179417078e-05, |
|
"loss": 0.4575, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.5631095406360423, |
|
"grad_norm": 9.941339492797852, |
|
"learning_rate": 8.138372743093076e-05, |
|
"loss": 0.4969, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.5638633686690224, |
|
"grad_norm": 9.928484916687012, |
|
"learning_rate": 8.114940899811662e-05, |
|
"loss": 0.4634, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.5646171967020024, |
|
"grad_norm": 10.29101848602295, |
|
"learning_rate": 8.091519782904857e-05, |
|
"loss": 0.4114, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.5653710247349824, |
|
"grad_norm": 15.212136268615723, |
|
"learning_rate": 8.068109525643647e-05, |
|
"loss": 0.516, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5661248527679623, |
|
"grad_norm": 8.223611831665039, |
|
"learning_rate": 8.044710261237207e-05, |
|
"loss": 0.541, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.5668786808009423, |
|
"grad_norm": 8.392924308776855, |
|
"learning_rate": 8.021322122832178e-05, |
|
"loss": 0.5317, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.5676325088339222, |
|
"grad_norm": 8.130448341369629, |
|
"learning_rate": 7.99794524351189e-05, |
|
"loss": 0.4935, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.5683863368669022, |
|
"grad_norm": 6.9753899574279785, |
|
"learning_rate": 7.974579756295591e-05, |
|
"loss": 0.4941, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.5691401648998822, |
|
"grad_norm": 6.365013122558594, |
|
"learning_rate": 7.951225794137724e-05, |
|
"loss": 0.4539, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.5698939929328621, |
|
"grad_norm": 5.7341628074646, |
|
"learning_rate": 7.927883489927147e-05, |
|
"loss": 0.4197, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.5706478209658422, |
|
"grad_norm": 6.036746025085449, |
|
"learning_rate": 7.904552976486372e-05, |
|
"loss": 0.4361, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.5714016489988222, |
|
"grad_norm": 5.587414264678955, |
|
"learning_rate": 7.88123438657083e-05, |
|
"loss": 0.4294, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.5721554770318021, |
|
"grad_norm": 5.824455738067627, |
|
"learning_rate": 7.857927852868107e-05, |
|
"loss": 0.426, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.5729093050647821, |
|
"grad_norm": 5.811740398406982, |
|
"learning_rate": 7.83463350799717e-05, |
|
"loss": 0.4336, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5736631330977621, |
|
"grad_norm": 5.9260945320129395, |
|
"learning_rate": 7.811351484507647e-05, |
|
"loss": 0.4609, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.574416961130742, |
|
"grad_norm": 6.589666843414307, |
|
"learning_rate": 7.788081914879051e-05, |
|
"loss": 0.4384, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.575170789163722, |
|
"grad_norm": 5.957409858703613, |
|
"learning_rate": 7.764824931520018e-05, |
|
"loss": 0.4261, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.575924617196702, |
|
"grad_norm": 6.138071060180664, |
|
"learning_rate": 7.741580666767583e-05, |
|
"loss": 0.4189, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.5766784452296819, |
|
"grad_norm": 5.744472503662109, |
|
"learning_rate": 7.718349252886395e-05, |
|
"loss": 0.4086, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.577432273262662, |
|
"grad_norm": 6.045204162597656, |
|
"learning_rate": 7.695130822067984e-05, |
|
"loss": 0.4306, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.578186101295642, |
|
"grad_norm": 5.609772682189941, |
|
"learning_rate": 7.67192550643001e-05, |
|
"loss": 0.3998, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.5789399293286219, |
|
"grad_norm": 5.921622276306152, |
|
"learning_rate": 7.648733438015493e-05, |
|
"loss": 0.4225, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.5796937573616019, |
|
"grad_norm": 6.352652072906494, |
|
"learning_rate": 7.625554748792085e-05, |
|
"loss": 0.4193, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.5804475853945819, |
|
"grad_norm": 6.210894584655762, |
|
"learning_rate": 7.602389570651303e-05, |
|
"loss": 0.4119, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5812014134275618, |
|
"grad_norm": 6.061959743499756, |
|
"learning_rate": 7.579238035407776e-05, |
|
"loss": 0.4097, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.5819552414605418, |
|
"grad_norm": 6.42627477645874, |
|
"learning_rate": 7.556100274798519e-05, |
|
"loss": 0.4226, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.5827090694935217, |
|
"grad_norm": 6.124332904815674, |
|
"learning_rate": 7.532976420482146e-05, |
|
"loss": 0.396, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.5834628975265017, |
|
"grad_norm": 5.928023815155029, |
|
"learning_rate": 7.509866604038157e-05, |
|
"loss": 0.3897, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.5842167255594818, |
|
"grad_norm": 6.037590503692627, |
|
"learning_rate": 7.486770956966171e-05, |
|
"loss": 0.3958, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5849705535924618, |
|
"grad_norm": 6.051185131072998, |
|
"learning_rate": 7.463689610685171e-05, |
|
"loss": 0.4072, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.5857243816254417, |
|
"grad_norm": 6.234012126922607, |
|
"learning_rate": 7.440622696532775e-05, |
|
"loss": 0.4151, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.5864782096584217, |
|
"grad_norm": 6.273362636566162, |
|
"learning_rate": 7.417570345764481e-05, |
|
"loss": 0.418, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.5872320376914016, |
|
"grad_norm": 6.810718059539795, |
|
"learning_rate": 7.394532689552905e-05, |
|
"loss": 0.4082, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.5879858657243816, |
|
"grad_norm": 7.068334102630615, |
|
"learning_rate": 7.371509858987061e-05, |
|
"loss": 0.4031, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5887396937573616, |
|
"grad_norm": 6.441345691680908, |
|
"learning_rate": 7.348501985071603e-05, |
|
"loss": 0.3973, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.5894935217903415, |
|
"grad_norm": 6.285884380340576, |
|
"learning_rate": 7.325509198726064e-05, |
|
"loss": 0.3888, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.5902473498233216, |
|
"grad_norm": 5.942330360412598, |
|
"learning_rate": 7.302531630784137e-05, |
|
"loss": 0.3656, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.5910011778563016, |
|
"grad_norm": 6.333634376525879, |
|
"learning_rate": 7.279569411992926e-05, |
|
"loss": 0.4081, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.5917550058892815, |
|
"grad_norm": 6.436288833618164, |
|
"learning_rate": 7.256622673012175e-05, |
|
"loss": 0.4118, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.5925088339222615, |
|
"grad_norm": 6.464933395385742, |
|
"learning_rate": 7.233691544413558e-05, |
|
"loss": 0.4269, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.5932626619552415, |
|
"grad_norm": 6.593018054962158, |
|
"learning_rate": 7.210776156679931e-05, |
|
"loss": 0.4124, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.5940164899882214, |
|
"grad_norm": 6.8628363609313965, |
|
"learning_rate": 7.187876640204556e-05, |
|
"loss": 0.4109, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.5947703180212014, |
|
"grad_norm": 7.0224151611328125, |
|
"learning_rate": 7.164993125290407e-05, |
|
"loss": 0.4141, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.5955241460541814, |
|
"grad_norm": 6.763969421386719, |
|
"learning_rate": 7.1421257421494e-05, |
|
"loss": 0.4093, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5962779740871613, |
|
"grad_norm": 7.6155781745910645, |
|
"learning_rate": 7.119274620901649e-05, |
|
"loss": 0.413, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.5970318021201414, |
|
"grad_norm": 7.919892311096191, |
|
"learning_rate": 7.096439891574745e-05, |
|
"loss": 0.422, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.5977856301531214, |
|
"grad_norm": 9.18865966796875, |
|
"learning_rate": 7.073621684103007e-05, |
|
"loss": 0.4679, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.5985394581861013, |
|
"grad_norm": 8.299490928649902, |
|
"learning_rate": 7.050820128326724e-05, |
|
"loss": 0.4638, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.5992932862190813, |
|
"grad_norm": 9.120932579040527, |
|
"learning_rate": 7.028035353991456e-05, |
|
"loss": 0.451, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.6000471142520613, |
|
"grad_norm": 9.830779075622559, |
|
"learning_rate": 7.005267490747263e-05, |
|
"loss": 0.4778, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.6008009422850412, |
|
"grad_norm": 10.880460739135742, |
|
"learning_rate": 6.982516668147967e-05, |
|
"loss": 0.4404, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.6015547703180212, |
|
"grad_norm": 10.648106575012207, |
|
"learning_rate": 6.959783015650446e-05, |
|
"loss": 0.5199, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.6023085983510011, |
|
"grad_norm": 11.122642517089844, |
|
"learning_rate": 6.937066662613863e-05, |
|
"loss": 0.4476, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.6030624263839811, |
|
"grad_norm": 12.062220573425293, |
|
"learning_rate": 6.914367738298941e-05, |
|
"loss": 0.4763, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6038162544169612, |
|
"grad_norm": 6.382950782775879, |
|
"learning_rate": 6.891686371867239e-05, |
|
"loss": 0.5237, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.6045700824499411, |
|
"grad_norm": 7.342101097106934, |
|
"learning_rate": 6.869022692380411e-05, |
|
"loss": 0.51, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.6053239104829211, |
|
"grad_norm": 7.170543670654297, |
|
"learning_rate": 6.846376828799451e-05, |
|
"loss": 0.4846, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.6060777385159011, |
|
"grad_norm": 6.772843360900879, |
|
"learning_rate": 6.823748909983994e-05, |
|
"loss": 0.4899, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.606831566548881, |
|
"grad_norm": 6.159712314605713, |
|
"learning_rate": 6.801139064691562e-05, |
|
"loss": 0.4651, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.607585394581861, |
|
"grad_norm": 6.47841739654541, |
|
"learning_rate": 6.778547421576825e-05, |
|
"loss": 0.4699, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.608339222614841, |
|
"grad_norm": 5.620822906494141, |
|
"learning_rate": 6.75597410919089e-05, |
|
"loss": 0.4317, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.6090930506478209, |
|
"grad_norm": 5.6669392585754395, |
|
"learning_rate": 6.733419255980559e-05, |
|
"loss": 0.4504, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.6098468786808009, |
|
"grad_norm": 5.989339828491211, |
|
"learning_rate": 6.710882990287585e-05, |
|
"loss": 0.4576, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.610600706713781, |
|
"grad_norm": 5.7165751457214355, |
|
"learning_rate": 6.688365440347965e-05, |
|
"loss": 0.4179, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.6113545347467609, |
|
"grad_norm": 6.0307087898254395, |
|
"learning_rate": 6.665866734291205e-05, |
|
"loss": 0.4815, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.6121083627797409, |
|
"grad_norm": 6.319530010223389, |
|
"learning_rate": 6.643387000139565e-05, |
|
"loss": 0.4407, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.6128621908127209, |
|
"grad_norm": 5.93934440612793, |
|
"learning_rate": 6.620926365807372e-05, |
|
"loss": 0.4081, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.6136160188457008, |
|
"grad_norm": 5.771956443786621, |
|
"learning_rate": 6.598484959100257e-05, |
|
"loss": 0.3936, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.6143698468786808, |
|
"grad_norm": 6.20790433883667, |
|
"learning_rate": 6.576062907714448e-05, |
|
"loss": 0.4513, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.6151236749116608, |
|
"grad_norm": 5.739172458648682, |
|
"learning_rate": 6.553660339236041e-05, |
|
"loss": 0.399, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.6158775029446407, |
|
"grad_norm": 6.355349540710449, |
|
"learning_rate": 6.53127738114026e-05, |
|
"loss": 0.4259, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.6166313309776207, |
|
"grad_norm": 5.847348213195801, |
|
"learning_rate": 6.508914160790752e-05, |
|
"loss": 0.4091, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.6173851590106008, |
|
"grad_norm": 5.917300224304199, |
|
"learning_rate": 6.486570805438843e-05, |
|
"loss": 0.4258, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.6181389870435807, |
|
"grad_norm": 6.199348449707031, |
|
"learning_rate": 6.46424744222283e-05, |
|
"loss": 0.4054, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.6188928150765607, |
|
"grad_norm": 6.075807571411133, |
|
"learning_rate": 6.441944198167253e-05, |
|
"loss": 0.4334, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.6196466431095407, |
|
"grad_norm": 5.835407257080078, |
|
"learning_rate": 6.419661200182158e-05, |
|
"loss": 0.4124, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.6204004711425206, |
|
"grad_norm": 6.856280326843262, |
|
"learning_rate": 6.397398575062396e-05, |
|
"loss": 0.4316, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.6211542991755006, |
|
"grad_norm": 6.388029098510742, |
|
"learning_rate": 6.375156449486895e-05, |
|
"loss": 0.4096, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.6219081272084805, |
|
"grad_norm": 6.334976673126221, |
|
"learning_rate": 6.352934950017921e-05, |
|
"loss": 0.4267, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.6226619552414605, |
|
"grad_norm": 6.394600868225098, |
|
"learning_rate": 6.330734203100394e-05, |
|
"loss": 0.4151, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.6234157832744405, |
|
"grad_norm": 6.139026165008545, |
|
"learning_rate": 6.308554335061135e-05, |
|
"loss": 0.4307, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.6241696113074205, |
|
"grad_norm": 6.6982102394104, |
|
"learning_rate": 6.286395472108158e-05, |
|
"loss": 0.4285, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.6249234393404005, |
|
"grad_norm": 5.852738857269287, |
|
"learning_rate": 6.26425774032996e-05, |
|
"loss": 0.3874, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.6256772673733805, |
|
"grad_norm": 6.24067497253418, |
|
"learning_rate": 6.2421412656948e-05, |
|
"loss": 0.3924, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.6264310954063604, |
|
"grad_norm": 6.479643821716309, |
|
"learning_rate": 6.220046174049968e-05, |
|
"loss": 0.4109, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.6271849234393404, |
|
"grad_norm": 6.55532169342041, |
|
"learning_rate": 6.19797259112109e-05, |
|
"loss": 0.4151, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.6279387514723204, |
|
"grad_norm": 5.995844841003418, |
|
"learning_rate": 6.175920642511404e-05, |
|
"loss": 0.3872, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.6286925795053003, |
|
"grad_norm": 6.913110256195068, |
|
"learning_rate": 6.153890453701031e-05, |
|
"loss": 0.4105, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.6294464075382803, |
|
"grad_norm": 6.36851692199707, |
|
"learning_rate": 6.131882150046291e-05, |
|
"loss": 0.4048, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.6302002355712603, |
|
"grad_norm": 5.844064712524414, |
|
"learning_rate": 6.109895856778967e-05, |
|
"loss": 0.3689, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.6309540636042403, |
|
"grad_norm": 7.132351398468018, |
|
"learning_rate": 6.087931699005588e-05, |
|
"loss": 0.4218, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.6317078916372203, |
|
"grad_norm": 6.560583114624023, |
|
"learning_rate": 6.065989801706744e-05, |
|
"loss": 0.4053, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.6324617196702003, |
|
"grad_norm": 6.6530351638793945, |
|
"learning_rate": 6.044070289736352e-05, |
|
"loss": 0.4061, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.6332155477031802, |
|
"grad_norm": 6.5088677406311035, |
|
"learning_rate": 6.0221732878209425e-05, |
|
"loss": 0.376, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6339693757361602, |
|
"grad_norm": 6.723409175872803, |
|
"learning_rate": 6.0002989205589734e-05, |
|
"loss": 0.3978, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.6347232037691402, |
|
"grad_norm": 9.00965404510498, |
|
"learning_rate": 5.978447312420103e-05, |
|
"loss": 0.4661, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.6354770318021201, |
|
"grad_norm": 8.346488952636719, |
|
"learning_rate": 5.9566185877444755e-05, |
|
"loss": 0.4812, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.6362308598351001, |
|
"grad_norm": 9.07754135131836, |
|
"learning_rate": 5.934812870742036e-05, |
|
"loss": 0.5042, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.63698468786808, |
|
"grad_norm": 9.425755500793457, |
|
"learning_rate": 5.913030285491808e-05, |
|
"loss": 0.5273, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.6377385159010601, |
|
"grad_norm": 8.991804122924805, |
|
"learning_rate": 5.891270955941184e-05, |
|
"loss": 0.4724, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.6384923439340401, |
|
"grad_norm": 9.069438934326172, |
|
"learning_rate": 5.869535005905232e-05, |
|
"loss": 0.4694, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.63924617196702, |
|
"grad_norm": 9.837794303894043, |
|
"learning_rate": 5.847822559065992e-05, |
|
"loss": 0.4601, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 10.19363021850586, |
|
"learning_rate": 5.8261337389717506e-05, |
|
"loss": 0.4776, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.64075382803298, |
|
"grad_norm": 11.673394203186035, |
|
"learning_rate": 5.804468669036369e-05, |
|
"loss": 0.4425, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.6415076560659599, |
|
"grad_norm": 6.468347072601318, |
|
"learning_rate": 5.7828274725385544e-05, |
|
"loss": 0.5469, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.6422614840989399, |
|
"grad_norm": 7.060529708862305, |
|
"learning_rate": 5.761210272621175e-05, |
|
"loss": 0.5067, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.6430153121319199, |
|
"grad_norm": 7.569014072418213, |
|
"learning_rate": 5.739617192290545e-05, |
|
"loss": 0.5057, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.6437691401648998, |
|
"grad_norm": 7.41010046005249, |
|
"learning_rate": 5.7180483544157546e-05, |
|
"loss": 0.4897, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.6445229681978799, |
|
"grad_norm": 6.627238750457764, |
|
"learning_rate": 5.696503881727917e-05, |
|
"loss": 0.5036, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.6452767962308599, |
|
"grad_norm": 6.318825721740723, |
|
"learning_rate": 5.6749838968195326e-05, |
|
"loss": 0.4619, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.6460306242638398, |
|
"grad_norm": 5.585279941558838, |
|
"learning_rate": 5.653488522143744e-05, |
|
"loss": 0.4331, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.6467844522968198, |
|
"grad_norm": 5.902019500732422, |
|
"learning_rate": 5.6320178800136626e-05, |
|
"loss": 0.4596, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.6475382803297998, |
|
"grad_norm": 5.5325164794921875, |
|
"learning_rate": 5.610572092601659e-05, |
|
"loss": 0.4362, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.6482921083627797, |
|
"grad_norm": 5.381384372711182, |
|
"learning_rate": 5.589151281938695e-05, |
|
"loss": 0.4294, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.6490459363957597, |
|
"grad_norm": 6.080218315124512, |
|
"learning_rate": 5.56775556991358e-05, |
|
"loss": 0.4304, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.6497997644287397, |
|
"grad_norm": 5.510005950927734, |
|
"learning_rate": 5.5463850782723346e-05, |
|
"loss": 0.4157, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.6505535924617196, |
|
"grad_norm": 5.572638511657715, |
|
"learning_rate": 5.5250399286174546e-05, |
|
"loss": 0.4238, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.6513074204946997, |
|
"grad_norm": 5.32048225402832, |
|
"learning_rate": 5.50372024240724e-05, |
|
"loss": 0.3929, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.6520612485276797, |
|
"grad_norm": 5.80560827255249, |
|
"learning_rate": 5.48242614095509e-05, |
|
"loss": 0.4251, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.6528150765606596, |
|
"grad_norm": 5.714180946350098, |
|
"learning_rate": 5.461157745428841e-05, |
|
"loss": 0.4318, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.6535689045936396, |
|
"grad_norm": 5.553015232086182, |
|
"learning_rate": 5.439915176850037e-05, |
|
"loss": 0.3996, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.6543227326266196, |
|
"grad_norm": 5.774811744689941, |
|
"learning_rate": 5.418698556093271e-05, |
|
"loss": 0.4298, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.6550765606595995, |
|
"grad_norm": 5.804990291595459, |
|
"learning_rate": 5.397508003885483e-05, |
|
"loss": 0.4119, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.6558303886925795, |
|
"grad_norm": 5.6263556480407715, |
|
"learning_rate": 5.3763436408052904e-05, |
|
"loss": 0.394, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.6565842167255594, |
|
"grad_norm": 5.699732303619385, |
|
"learning_rate": 5.3552055872822636e-05, |
|
"loss": 0.4152, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.6573380447585394, |
|
"grad_norm": 5.353825569152832, |
|
"learning_rate": 5.334093963596294e-05, |
|
"loss": 0.3798, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.6580918727915195, |
|
"grad_norm": 5.929776668548584, |
|
"learning_rate": 5.313008889876865e-05, |
|
"loss": 0.4142, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.6588457008244994, |
|
"grad_norm": 6.101897716522217, |
|
"learning_rate": 5.2919504861023903e-05, |
|
"loss": 0.4396, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.6595995288574794, |
|
"grad_norm": 6.041595458984375, |
|
"learning_rate": 5.270918872099522e-05, |
|
"loss": 0.4455, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.6603533568904594, |
|
"grad_norm": 5.795607566833496, |
|
"learning_rate": 5.249914167542486e-05, |
|
"loss": 0.3927, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.6611071849234393, |
|
"grad_norm": 6.169924259185791, |
|
"learning_rate": 5.228936491952363e-05, |
|
"loss": 0.4022, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.6618610129564193, |
|
"grad_norm": 5.870789527893066, |
|
"learning_rate": 5.207985964696462e-05, |
|
"loss": 0.4012, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.6626148409893993, |
|
"grad_norm": 6.345909595489502, |
|
"learning_rate": 5.1870627049875954e-05, |
|
"loss": 0.3814, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.6633686690223792, |
|
"grad_norm": 6.1364569664001465, |
|
"learning_rate": 5.16616683188342e-05, |
|
"loss": 0.4032, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6641224970553592, |
|
"grad_norm": 5.976447582244873, |
|
"learning_rate": 5.145298464285757e-05, |
|
"loss": 0.3814, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.6648763250883393, |
|
"grad_norm": 7.229459285736084, |
|
"learning_rate": 5.12445772093992e-05, |
|
"loss": 0.4171, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.6656301531213192, |
|
"grad_norm": 5.863222599029541, |
|
"learning_rate": 5.103644720434027e-05, |
|
"loss": 0.3782, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.6663839811542992, |
|
"grad_norm": 6.049070835113525, |
|
"learning_rate": 5.082859581198344e-05, |
|
"loss": 0.3789, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.6671378091872792, |
|
"grad_norm": 6.35960578918457, |
|
"learning_rate": 5.062102421504593e-05, |
|
"loss": 0.4086, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.6678916372202591, |
|
"grad_norm": 6.470669746398926, |
|
"learning_rate": 5.041373359465289e-05, |
|
"loss": 0.4076, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.6686454652532391, |
|
"grad_norm": 6.241630554199219, |
|
"learning_rate": 5.020672513033066e-05, |
|
"loss": 0.4007, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.669399293286219, |
|
"grad_norm": 6.308516502380371, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.3754, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.670153121319199, |
|
"grad_norm": 6.356692314147949, |
|
"learning_rate": 4.9793559379969566e-05, |
|
"loss": 0.3973, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.670906949352179, |
|
"grad_norm": 7.087871074676514, |
|
"learning_rate": 4.958740444492892e-05, |
|
"loss": 0.4128, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6716607773851591, |
|
"grad_norm": 7.447615623474121, |
|
"learning_rate": 4.9381536367942195e-05, |
|
"loss": 0.4111, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.672414605418139, |
|
"grad_norm": 7.260590076446533, |
|
"learning_rate": 4.917595632044113e-05, |
|
"loss": 0.3799, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.673168433451119, |
|
"grad_norm": 7.701971530914307, |
|
"learning_rate": 4.8970665472218537e-05, |
|
"loss": 0.4017, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.673922261484099, |
|
"grad_norm": 8.021989822387695, |
|
"learning_rate": 4.8765664991421634e-05, |
|
"loss": 0.4536, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.6746760895170789, |
|
"grad_norm": 8.987250328063965, |
|
"learning_rate": 4.856095604454539e-05, |
|
"loss": 0.4939, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.6754299175500589, |
|
"grad_norm": 10.436625480651855, |
|
"learning_rate": 4.835653979642585e-05, |
|
"loss": 0.5239, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.6761837455830388, |
|
"grad_norm": 9.789538383483887, |
|
"learning_rate": 4.815241741023367e-05, |
|
"loss": 0.4798, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.6769375736160188, |
|
"grad_norm": 9.678764343261719, |
|
"learning_rate": 4.7948590047467153e-05, |
|
"loss": 0.4441, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.6776914016489988, |
|
"grad_norm": 10.444610595703125, |
|
"learning_rate": 4.774505886794609e-05, |
|
"loss": 0.4201, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.6784452296819788, |
|
"grad_norm": 12.58081340789795, |
|
"learning_rate": 4.754182502980477e-05, |
|
"loss": 0.4634, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6791990577149588, |
|
"grad_norm": 5.85378885269165, |
|
"learning_rate": 4.7338889689485624e-05, |
|
"loss": 0.5182, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.6799528857479388, |
|
"grad_norm": 6.6499857902526855, |
|
"learning_rate": 4.713625400173247e-05, |
|
"loss": 0.5216, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.6807067137809187, |
|
"grad_norm": 6.543797016143799, |
|
"learning_rate": 4.693391911958426e-05, |
|
"loss": 0.4798, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.6814605418138987, |
|
"grad_norm": 6.197330951690674, |
|
"learning_rate": 4.673188619436798e-05, |
|
"loss": 0.4892, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.6822143698468787, |
|
"grad_norm": 6.185276031494141, |
|
"learning_rate": 4.6530156375692726e-05, |
|
"loss": 0.474, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.6829681978798586, |
|
"grad_norm": 5.581246376037598, |
|
"learning_rate": 4.632873081144267e-05, |
|
"loss": 0.4498, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.6837220259128386, |
|
"grad_norm": 5.916640281677246, |
|
"learning_rate": 4.6127610647770767e-05, |
|
"loss": 0.4619, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.6844758539458186, |
|
"grad_norm": 5.591888904571533, |
|
"learning_rate": 4.592679702909216e-05, |
|
"loss": 0.4275, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.6852296819787986, |
|
"grad_norm": 5.287500858306885, |
|
"learning_rate": 4.572629109807782e-05, |
|
"loss": 0.4073, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.6859835100117786, |
|
"grad_norm": 5.325054168701172, |
|
"learning_rate": 4.552609399564762e-05, |
|
"loss": 0.3894, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6867373380447586, |
|
"grad_norm": 5.576198101043701, |
|
"learning_rate": 4.532620686096446e-05, |
|
"loss": 0.4185, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.6874911660777385, |
|
"grad_norm": 5.555250644683838, |
|
"learning_rate": 4.5126630831427264e-05, |
|
"loss": 0.3818, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.6882449941107185, |
|
"grad_norm": 5.309383869171143, |
|
"learning_rate": 4.492736704266475e-05, |
|
"loss": 0.3835, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.6889988221436985, |
|
"grad_norm": 5.426351547241211, |
|
"learning_rate": 4.472841662852888e-05, |
|
"loss": 0.4087, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.6897526501766784, |
|
"grad_norm": 5.882096767425537, |
|
"learning_rate": 4.452978072108859e-05, |
|
"loss": 0.4398, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.6905064782096584, |
|
"grad_norm": 5.80626916885376, |
|
"learning_rate": 4.4331460450623064e-05, |
|
"loss": 0.4234, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.6912603062426383, |
|
"grad_norm": 5.8705291748046875, |
|
"learning_rate": 4.413345694561549e-05, |
|
"loss": 0.4223, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.6920141342756184, |
|
"grad_norm": 5.822587966918945, |
|
"learning_rate": 4.393577133274658e-05, |
|
"loss": 0.4314, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.6927679623085984, |
|
"grad_norm": 6.2686872482299805, |
|
"learning_rate": 4.373840473688829e-05, |
|
"loss": 0.459, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.6935217903415783, |
|
"grad_norm": 5.543201923370361, |
|
"learning_rate": 4.354135828109707e-05, |
|
"loss": 0.3963, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6942756183745583, |
|
"grad_norm": 5.7019267082214355, |
|
"learning_rate": 4.3344633086607955e-05, |
|
"loss": 0.3964, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.6950294464075383, |
|
"grad_norm": 5.6861958503723145, |
|
"learning_rate": 4.3148230272827784e-05, |
|
"loss": 0.4175, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.6957832744405182, |
|
"grad_norm": 5.791751384735107, |
|
"learning_rate": 4.295215095732904e-05, |
|
"loss": 0.4196, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.6965371024734982, |
|
"grad_norm": 6.20761251449585, |
|
"learning_rate": 4.275639625584338e-05, |
|
"loss": 0.4159, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.6972909305064782, |
|
"grad_norm": 6.440983772277832, |
|
"learning_rate": 4.256096728225548e-05, |
|
"loss": 0.418, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.6980447585394581, |
|
"grad_norm": 5.713172435760498, |
|
"learning_rate": 4.236586514859633e-05, |
|
"loss": 0.4084, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.6987985865724382, |
|
"grad_norm": 5.674785137176514, |
|
"learning_rate": 4.217109096503736e-05, |
|
"loss": 0.3978, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.6995524146054182, |
|
"grad_norm": 6.123269081115723, |
|
"learning_rate": 4.197664583988376e-05, |
|
"loss": 0.421, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.7003062426383981, |
|
"grad_norm": 5.961802959442139, |
|
"learning_rate": 4.1782530879568374e-05, |
|
"loss": 0.4027, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.7010600706713781, |
|
"grad_norm": 6.020455360412598, |
|
"learning_rate": 4.1588747188645275e-05, |
|
"loss": 0.3978, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.7018138987043581, |
|
"grad_norm": 5.788726329803467, |
|
"learning_rate": 4.1395295869783615e-05, |
|
"loss": 0.3744, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.702567726737338, |
|
"grad_norm": 6.581162929534912, |
|
"learning_rate": 4.1202178023761195e-05, |
|
"loss": 0.4003, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.703321554770318, |
|
"grad_norm": 5.601202011108398, |
|
"learning_rate": 4.100939474945843e-05, |
|
"loss": 0.37, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.704075382803298, |
|
"grad_norm": 6.49223518371582, |
|
"learning_rate": 4.0816947143851816e-05, |
|
"loss": 0.4088, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.7048292108362779, |
|
"grad_norm": 6.10722541809082, |
|
"learning_rate": 4.0624836302007886e-05, |
|
"loss": 0.3835, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.705583038869258, |
|
"grad_norm": 6.136714935302734, |
|
"learning_rate": 4.0433063317076893e-05, |
|
"loss": 0.4056, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.706336866902238, |
|
"grad_norm": 6.344220161437988, |
|
"learning_rate": 4.024162928028663e-05, |
|
"loss": 0.386, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.7070906949352179, |
|
"grad_norm": 7.188864231109619, |
|
"learning_rate": 4.0050535280936205e-05, |
|
"loss": 0.3849, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.7078445229681979, |
|
"grad_norm": 6.800889492034912, |
|
"learning_rate": 3.985978240638981e-05, |
|
"loss": 0.3989, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.7085983510011779, |
|
"grad_norm": 7.130059242248535, |
|
"learning_rate": 3.966937174207066e-05, |
|
"loss": 0.3821, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.7093521790341578, |
|
"grad_norm": 6.849576473236084, |
|
"learning_rate": 3.947930437145464e-05, |
|
"loss": 0.3843, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.7101060070671378, |
|
"grad_norm": 7.004662036895752, |
|
"learning_rate": 3.928958137606421e-05, |
|
"loss": 0.3686, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.7108598351001177, |
|
"grad_norm": 8.136757850646973, |
|
"learning_rate": 3.910020383546233e-05, |
|
"loss": 0.4558, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.7116136631330977, |
|
"grad_norm": 8.616293907165527, |
|
"learning_rate": 3.8911172827246215e-05, |
|
"loss": 0.4368, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.7123674911660778, |
|
"grad_norm": 8.701359748840332, |
|
"learning_rate": 3.8722489427041185e-05, |
|
"loss": 0.4512, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.7131213191990577, |
|
"grad_norm": 9.437173843383789, |
|
"learning_rate": 3.853415470849479e-05, |
|
"loss": 0.481, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.7138751472320377, |
|
"grad_norm": 10.383941650390625, |
|
"learning_rate": 3.834616974327021e-05, |
|
"loss": 0.5005, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.7146289752650177, |
|
"grad_norm": 9.366165161132812, |
|
"learning_rate": 3.815853560104075e-05, |
|
"loss": 0.4548, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.7153828032979976, |
|
"grad_norm": 9.855792999267578, |
|
"learning_rate": 3.7971253349483285e-05, |
|
"loss": 0.4908, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.7161366313309776, |
|
"grad_norm": 11.261048316955566, |
|
"learning_rate": 3.7784324054272405e-05, |
|
"loss": 0.4601, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.7168904593639576, |
|
"grad_norm": 5.492030143737793, |
|
"learning_rate": 3.759774877907428e-05, |
|
"loss": 0.5291, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.7176442873969375, |
|
"grad_norm": 6.00732421875, |
|
"learning_rate": 3.741152858554077e-05, |
|
"loss": 0.5058, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.7183981154299176, |
|
"grad_norm": 5.992036819458008, |
|
"learning_rate": 3.722566453330298e-05, |
|
"loss": 0.5028, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.7191519434628976, |
|
"grad_norm": 5.949222564697266, |
|
"learning_rate": 3.7040157679965796e-05, |
|
"loss": 0.4631, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.7199057714958775, |
|
"grad_norm": 5.833024978637695, |
|
"learning_rate": 3.6855009081101355e-05, |
|
"loss": 0.449, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.7206595995288575, |
|
"grad_norm": 5.746013641357422, |
|
"learning_rate": 3.6670219790243344e-05, |
|
"loss": 0.4442, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.7214134275618375, |
|
"grad_norm": 5.595402240753174, |
|
"learning_rate": 3.648579085888085e-05, |
|
"loss": 0.4353, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.7221672555948174, |
|
"grad_norm": 5.437952995300293, |
|
"learning_rate": 3.630172333645261e-05, |
|
"loss": 0.434, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.7229210836277974, |
|
"grad_norm": 5.620044231414795, |
|
"learning_rate": 3.611801827034059e-05, |
|
"loss": 0.4137, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.7236749116607774, |
|
"grad_norm": 5.448288440704346, |
|
"learning_rate": 3.593467670586457e-05, |
|
"loss": 0.4197, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.7244287396937573, |
|
"grad_norm": 5.672021389007568, |
|
"learning_rate": 3.5751699686275786e-05, |
|
"loss": 0.4495, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.7251825677267374, |
|
"grad_norm": 5.292520046234131, |
|
"learning_rate": 3.556908825275117e-05, |
|
"loss": 0.4203, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.7259363957597174, |
|
"grad_norm": 5.522578239440918, |
|
"learning_rate": 3.538684344438736e-05, |
|
"loss": 0.4043, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.7266902237926973, |
|
"grad_norm": 5.811888694763184, |
|
"learning_rate": 3.520496629819494e-05, |
|
"loss": 0.4239, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.7274440518256773, |
|
"grad_norm": 5.410277366638184, |
|
"learning_rate": 3.502345784909229e-05, |
|
"loss": 0.4163, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.7281978798586572, |
|
"grad_norm": 5.810190677642822, |
|
"learning_rate": 3.484231912989989e-05, |
|
"loss": 0.4323, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.7289517078916372, |
|
"grad_norm": 5.343920707702637, |
|
"learning_rate": 3.466155117133433e-05, |
|
"loss": 0.4153, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.7297055359246172, |
|
"grad_norm": 5.489987373352051, |
|
"learning_rate": 3.448115500200263e-05, |
|
"loss": 0.3828, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.7304593639575971, |
|
"grad_norm": 5.753129005432129, |
|
"learning_rate": 3.430113164839601e-05, |
|
"loss": 0.4047, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.7312131919905771, |
|
"grad_norm": 5.8478569984436035, |
|
"learning_rate": 3.4121482134884575e-05, |
|
"loss": 0.4231, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.7319670200235572, |
|
"grad_norm": 6.3078413009643555, |
|
"learning_rate": 3.3942207483710986e-05, |
|
"loss": 0.3913, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.7327208480565371, |
|
"grad_norm": 5.719088077545166, |
|
"learning_rate": 3.3763308714984974e-05, |
|
"loss": 0.4149, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.7334746760895171, |
|
"grad_norm": 5.784895420074463, |
|
"learning_rate": 3.358478684667734e-05, |
|
"loss": 0.3997, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.7342285041224971, |
|
"grad_norm": 5.888166427612305, |
|
"learning_rate": 3.3406642894614394e-05, |
|
"loss": 0.4064, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.734982332155477, |
|
"grad_norm": 6.573143482208252, |
|
"learning_rate": 3.3228877872471786e-05, |
|
"loss": 0.4188, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.735736160188457, |
|
"grad_norm": 5.861452102661133, |
|
"learning_rate": 3.305149279176921e-05, |
|
"loss": 0.3993, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.736489988221437, |
|
"grad_norm": 5.746969223022461, |
|
"learning_rate": 3.287448866186428e-05, |
|
"loss": 0.4014, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.7372438162544169, |
|
"grad_norm": 5.95499849319458, |
|
"learning_rate": 3.269786648994697e-05, |
|
"loss": 0.4129, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.7379976442873969, |
|
"grad_norm": 5.868785858154297, |
|
"learning_rate": 3.252162728103382e-05, |
|
"loss": 0.4006, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.738751472320377, |
|
"grad_norm": 6.216129779815674, |
|
"learning_rate": 3.234577203796223e-05, |
|
"loss": 0.4097, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7395053003533569, |
|
"grad_norm": 5.94473934173584, |
|
"learning_rate": 3.217030176138474e-05, |
|
"loss": 0.3947, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.7402591283863369, |
|
"grad_norm": 5.822911262512207, |
|
"learning_rate": 3.199521744976342e-05, |
|
"loss": 0.3838, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.7410129564193169, |
|
"grad_norm": 5.968900203704834, |
|
"learning_rate": 3.182052009936404e-05, |
|
"loss": 0.3945, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.7417667844522968, |
|
"grad_norm": 6.497354984283447, |
|
"learning_rate": 3.164621070425051e-05, |
|
"loss": 0.4138, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.7425206124852768, |
|
"grad_norm": 6.382023334503174, |
|
"learning_rate": 3.147229025627922e-05, |
|
"loss": 0.37, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.7432744405182568, |
|
"grad_norm": 6.162110328674316, |
|
"learning_rate": 3.129875974509332e-05, |
|
"loss": 0.3743, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.7440282685512367, |
|
"grad_norm": 6.0412116050720215, |
|
"learning_rate": 3.1125620158117186e-05, |
|
"loss": 0.3714, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.7447820965842167, |
|
"grad_norm": 6.072629451751709, |
|
"learning_rate": 3.095287248055069e-05, |
|
"loss": 0.369, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.7455359246171968, |
|
"grad_norm": 6.4712958335876465, |
|
"learning_rate": 3.078051769536378e-05, |
|
"loss": 0.3956, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.7462897526501767, |
|
"grad_norm": 6.292232036590576, |
|
"learning_rate": 3.060855678329063e-05, |
|
"loss": 0.3755, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.7470435806831567, |
|
"grad_norm": 6.797161102294922, |
|
"learning_rate": 3.043699072282429e-05, |
|
"loss": 0.3941, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.7477974087161366, |
|
"grad_norm": 7.063961029052734, |
|
"learning_rate": 3.0265820490210973e-05, |
|
"loss": 0.4085, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.7485512367491166, |
|
"grad_norm": 8.036771774291992, |
|
"learning_rate": 3.0095047059444546e-05, |
|
"loss": 0.4553, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.7493050647820966, |
|
"grad_norm": 8.343942642211914, |
|
"learning_rate": 2.9924671402261018e-05, |
|
"loss": 0.4532, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.7500588928150765, |
|
"grad_norm": 8.597431182861328, |
|
"learning_rate": 2.9754694488133038e-05, |
|
"loss": 0.4544, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.7508127208480565, |
|
"grad_norm": 8.797038078308105, |
|
"learning_rate": 2.958511728426414e-05, |
|
"loss": 0.4565, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.7508127208480565, |
|
"eval_loss": 0.42347389459609985, |
|
"eval_runtime": 127.0592, |
|
"eval_samples_per_second": 17.59, |
|
"eval_steps_per_second": 8.799, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.7515665488810365, |
|
"grad_norm": 9.90727710723877, |
|
"learning_rate": 2.941594075558366e-05, |
|
"loss": 0.4791, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.7523203769140165, |
|
"grad_norm": 9.148994445800781, |
|
"learning_rate": 2.9247165864740856e-05, |
|
"loss": 0.4488, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.7530742049469965, |
|
"grad_norm": 10.751917839050293, |
|
"learning_rate": 2.9078793572099616e-05, |
|
"loss": 0.4695, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.7538280329799765, |
|
"grad_norm": 12.66123104095459, |
|
"learning_rate": 2.8910824835732952e-05, |
|
"loss": 0.4773, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7545818610129564, |
|
"grad_norm": 5.507136821746826, |
|
"learning_rate": 2.8743260611417665e-05, |
|
"loss": 0.5073, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.7553356890459364, |
|
"grad_norm": 5.805990695953369, |
|
"learning_rate": 2.857610185262859e-05, |
|
"loss": 0.4735, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.7560895170789164, |
|
"grad_norm": 5.612555980682373, |
|
"learning_rate": 2.8409349510533578e-05, |
|
"loss": 0.4536, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 0.7568433451118963, |
|
"grad_norm": 5.850246906280518, |
|
"learning_rate": 2.8243004533987793e-05, |
|
"loss": 0.4578, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.7575971731448763, |
|
"grad_norm": 5.569720268249512, |
|
"learning_rate": 2.8077067869528417e-05, |
|
"loss": 0.4135, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.7583510011778563, |
|
"grad_norm": 5.9112114906311035, |
|
"learning_rate": 2.7911540461369222e-05, |
|
"loss": 0.4445, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.7591048292108363, |
|
"grad_norm": 5.9236249923706055, |
|
"learning_rate": 2.774642325139535e-05, |
|
"loss": 0.4402, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 0.7598586572438163, |
|
"grad_norm": 6.210232257843018, |
|
"learning_rate": 2.7581717179157606e-05, |
|
"loss": 0.4605, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.7606124852767963, |
|
"grad_norm": 5.880030155181885, |
|
"learning_rate": 2.7417423181867585e-05, |
|
"loss": 0.4227, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 0.7613663133097762, |
|
"grad_norm": 5.549881458282471, |
|
"learning_rate": 2.72535421943919e-05, |
|
"loss": 0.4168, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.7621201413427562, |
|
"grad_norm": 5.586158275604248, |
|
"learning_rate": 2.7090075149247217e-05, |
|
"loss": 0.4334, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 0.7628739693757361, |
|
"grad_norm": 5.5952348709106445, |
|
"learning_rate": 2.6927022976594607e-05, |
|
"loss": 0.4232, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.7636277974087161, |
|
"grad_norm": 5.478029727935791, |
|
"learning_rate": 2.676438660423457e-05, |
|
"loss": 0.4053, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 0.7643816254416961, |
|
"grad_norm": 5.441522121429443, |
|
"learning_rate": 2.660216695760157e-05, |
|
"loss": 0.3847, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.765135453474676, |
|
"grad_norm": 5.584785461425781, |
|
"learning_rate": 2.6440364959758813e-05, |
|
"loss": 0.4098, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.7658892815076561, |
|
"grad_norm": 5.545854091644287, |
|
"learning_rate": 2.6278981531392945e-05, |
|
"loss": 0.4002, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.7666431095406361, |
|
"grad_norm": 5.697778701782227, |
|
"learning_rate": 2.6118017590809017e-05, |
|
"loss": 0.4013, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 0.767396937573616, |
|
"grad_norm": 6.265735626220703, |
|
"learning_rate": 2.595747405392491e-05, |
|
"loss": 0.4102, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 0.768150765606596, |
|
"grad_norm": 5.284882545471191, |
|
"learning_rate": 2.579735183426649e-05, |
|
"loss": 0.3747, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 0.768904593639576, |
|
"grad_norm": 5.939345359802246, |
|
"learning_rate": 2.5637651842962164e-05, |
|
"loss": 0.4019, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.7696584216725559, |
|
"grad_norm": 5.655182838439941, |
|
"learning_rate": 2.5478374988737753e-05, |
|
"loss": 0.4038, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 0.7704122497055359, |
|
"grad_norm": 5.510229587554932, |
|
"learning_rate": 2.531952217791136e-05, |
|
"loss": 0.3912, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.7711660777385159, |
|
"grad_norm": 5.720643997192383, |
|
"learning_rate": 2.5161094314388278e-05, |
|
"loss": 0.3995, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 0.7719199057714958, |
|
"grad_norm": 5.860435962677002, |
|
"learning_rate": 2.5003092299655584e-05, |
|
"loss": 0.3995, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.7726737338044759, |
|
"grad_norm": 6.223293304443359, |
|
"learning_rate": 2.4845517032777364e-05, |
|
"loss": 0.4424, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.7734275618374559, |
|
"grad_norm": 6.027644157409668, |
|
"learning_rate": 2.4688369410389334e-05, |
|
"loss": 0.4299, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.7741813898704358, |
|
"grad_norm": 5.946674346923828, |
|
"learning_rate": 2.4531650326693822e-05, |
|
"loss": 0.3849, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 0.7749352179034158, |
|
"grad_norm": 6.277134895324707, |
|
"learning_rate": 2.4375360673454718e-05, |
|
"loss": 0.4147, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.7756890459363958, |
|
"grad_norm": 6.024038314819336, |
|
"learning_rate": 2.4219501339992334e-05, |
|
"loss": 0.3774, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.7764428739693757, |
|
"grad_norm": 5.8574910163879395, |
|
"learning_rate": 2.406407321317835e-05, |
|
"loss": 0.3865, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.7771967020023557, |
|
"grad_norm": 6.022578239440918, |
|
"learning_rate": 2.3909077177430893e-05, |
|
"loss": 0.3957, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 0.7779505300353357, |
|
"grad_norm": 5.923416614532471, |
|
"learning_rate": 2.3754514114709304e-05, |
|
"loss": 0.3836, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.7787043580683156, |
|
"grad_norm": 6.270270824432373, |
|
"learning_rate": 2.3600384904509254e-05, |
|
"loss": 0.3979, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 0.7794581861012957, |
|
"grad_norm": 6.285928726196289, |
|
"learning_rate": 2.3446690423857685e-05, |
|
"loss": 0.4098, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 0.7802120141342757, |
|
"grad_norm": 6.104770660400391, |
|
"learning_rate": 2.3293431547307887e-05, |
|
"loss": 0.3746, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.7809658421672556, |
|
"grad_norm": 6.284374237060547, |
|
"learning_rate": 2.31406091469344e-05, |
|
"loss": 0.3933, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.7817196702002356, |
|
"grad_norm": 6.502585411071777, |
|
"learning_rate": 2.298822409232817e-05, |
|
"loss": 0.3964, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 0.7824734982332155, |
|
"grad_norm": 6.121708869934082, |
|
"learning_rate": 2.2836277250591574e-05, |
|
"loss": 0.3822, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.7832273262661955, |
|
"grad_norm": 7.069113731384277, |
|
"learning_rate": 2.2684769486333445e-05, |
|
"loss": 0.3919, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 0.7839811542991755, |
|
"grad_norm": 6.825623035430908, |
|
"learning_rate": 2.2533701661664154e-05, |
|
"loss": 0.4296, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7847349823321554, |
|
"grad_norm": 7.632999897003174, |
|
"learning_rate": 2.2383074636190748e-05, |
|
"loss": 0.4266, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 0.7854888103651354, |
|
"grad_norm": 7.41874885559082, |
|
"learning_rate": 2.2232889267012038e-05, |
|
"loss": 0.4263, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 0.7862426383981155, |
|
"grad_norm": 7.6582417488098145, |
|
"learning_rate": 2.2083146408713673e-05, |
|
"loss": 0.4351, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 0.7869964664310954, |
|
"grad_norm": 9.17532730102539, |
|
"learning_rate": 2.1933846913363466e-05, |
|
"loss": 0.5107, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.7877502944640754, |
|
"grad_norm": 9.609545707702637, |
|
"learning_rate": 2.178499163050617e-05, |
|
"loss": 0.4606, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.7885041224970554, |
|
"grad_norm": 9.567949295043945, |
|
"learning_rate": 2.1636581407159105e-05, |
|
"loss": 0.4663, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 0.7892579505300353, |
|
"grad_norm": 9.527708053588867, |
|
"learning_rate": 2.1488617087806982e-05, |
|
"loss": 0.4712, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 0.7900117785630153, |
|
"grad_norm": 9.680562973022461, |
|
"learning_rate": 2.1341099514397266e-05, |
|
"loss": 0.4975, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.7907656065959953, |
|
"grad_norm": 10.399216651916504, |
|
"learning_rate": 2.1194029526335303e-05, |
|
"loss": 0.4586, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 0.7915194346289752, |
|
"grad_norm": 10.869539260864258, |
|
"learning_rate": 2.1047407960479702e-05, |
|
"loss": 0.4429, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7922732626619552, |
|
"grad_norm": 5.385607719421387, |
|
"learning_rate": 2.0901235651137284e-05, |
|
"loss": 0.5019, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 0.7930270906949353, |
|
"grad_norm": 5.6260223388671875, |
|
"learning_rate": 2.0755513430058672e-05, |
|
"loss": 0.4988, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 0.7937809187279152, |
|
"grad_norm": 5.487570762634277, |
|
"learning_rate": 2.0610242126433297e-05, |
|
"loss": 0.4594, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 0.7945347467608952, |
|
"grad_norm": 5.6461591720581055, |
|
"learning_rate": 2.0465422566884805e-05, |
|
"loss": 0.4642, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 0.7952885747938752, |
|
"grad_norm": 5.7345123291015625, |
|
"learning_rate": 2.0321055575466284e-05, |
|
"loss": 0.4442, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.7960424028268551, |
|
"grad_norm": 5.918202877044678, |
|
"learning_rate": 2.0177141973655766e-05, |
|
"loss": 0.4708, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.7967962308598351, |
|
"grad_norm": 5.593347549438477, |
|
"learning_rate": 2.0033682580351144e-05, |
|
"loss": 0.4277, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 0.797550058892815, |
|
"grad_norm": 5.557769775390625, |
|
"learning_rate": 1.9890678211866033e-05, |
|
"loss": 0.4267, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 0.798303886925795, |
|
"grad_norm": 5.38918924331665, |
|
"learning_rate": 1.9748129681924675e-05, |
|
"loss": 0.4112, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 0.799057714958775, |
|
"grad_norm": 5.82417631149292, |
|
"learning_rate": 1.9606037801657673e-05, |
|
"loss": 0.4104, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.799811542991755, |
|
"grad_norm": 5.548363208770752, |
|
"learning_rate": 1.9464403379596963e-05, |
|
"loss": 0.4127, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 0.800565371024735, |
|
"grad_norm": 5.548163890838623, |
|
"learning_rate": 1.932322722167168e-05, |
|
"loss": 0.4198, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 0.801319199057715, |
|
"grad_norm": 5.443014621734619, |
|
"learning_rate": 1.9182510131203224e-05, |
|
"loss": 0.4012, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 0.802073027090695, |
|
"grad_norm": 5.750105381011963, |
|
"learning_rate": 1.9042252908900814e-05, |
|
"loss": 0.4075, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.8028268551236749, |
|
"grad_norm": 5.6281418800354, |
|
"learning_rate": 1.8902456352856925e-05, |
|
"loss": 0.3896, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.8035806831566549, |
|
"grad_norm": 5.443961143493652, |
|
"learning_rate": 1.8763121258542815e-05, |
|
"loss": 0.4057, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 0.8043345111896348, |
|
"grad_norm": 5.808502674102783, |
|
"learning_rate": 1.86242484188038e-05, |
|
"loss": 0.4137, |
|
"step": 1067 |
|
}, |
|
{ |
|
"epoch": 0.8050883392226148, |
|
"grad_norm": 5.866790294647217, |
|
"learning_rate": 1.848583862385501e-05, |
|
"loss": 0.4129, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 0.8058421672555948, |
|
"grad_norm": 5.517582893371582, |
|
"learning_rate": 1.8347892661276656e-05, |
|
"loss": 0.3901, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 0.8065959952885748, |
|
"grad_norm": 6.088197231292725, |
|
"learning_rate": 1.82104113160097e-05, |
|
"loss": 0.4125, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.8073498233215548, |
|
"grad_norm": 5.613511562347412, |
|
"learning_rate": 1.8073395370351287e-05, |
|
"loss": 0.3968, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 0.8081036513545348, |
|
"grad_norm": 5.712565898895264, |
|
"learning_rate": 1.7936845603950447e-05, |
|
"loss": 0.3925, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 0.8088574793875147, |
|
"grad_norm": 5.371545314788818, |
|
"learning_rate": 1.780076279380337e-05, |
|
"loss": 0.3589, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 0.8096113074204947, |
|
"grad_norm": 5.599592208862305, |
|
"learning_rate": 1.7665147714249376e-05, |
|
"loss": 0.3838, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 0.8103651354534747, |
|
"grad_norm": 6.015298843383789, |
|
"learning_rate": 1.753000113696617e-05, |
|
"loss": 0.386, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.8111189634864546, |
|
"grad_norm": 5.434444427490234, |
|
"learning_rate": 1.7395323830965605e-05, |
|
"loss": 0.3771, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 0.8118727915194346, |
|
"grad_norm": 6.145053863525391, |
|
"learning_rate": 1.726111656258932e-05, |
|
"loss": 0.4039, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 0.8126266195524146, |
|
"grad_norm": 5.7801384925842285, |
|
"learning_rate": 1.7127380095504296e-05, |
|
"loss": 0.3955, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 0.8133804475853946, |
|
"grad_norm": 5.640938758850098, |
|
"learning_rate": 1.699411519069858e-05, |
|
"loss": 0.3788, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 0.8141342756183746, |
|
"grad_norm": 5.714921951293945, |
|
"learning_rate": 1.686132260647696e-05, |
|
"loss": 0.3637, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.8148881036513546, |
|
"grad_norm": 6.3913750648498535, |
|
"learning_rate": 1.6729003098456576e-05, |
|
"loss": 0.3815, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 0.8156419316843345, |
|
"grad_norm": 5.981407642364502, |
|
"learning_rate": 1.6597157419562703e-05, |
|
"loss": 0.3756, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 0.8163957597173145, |
|
"grad_norm": 6.408857822418213, |
|
"learning_rate": 1.646578632002439e-05, |
|
"loss": 0.4219, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 0.8171495877502944, |
|
"grad_norm": 6.3557329177856445, |
|
"learning_rate": 1.6334890547370286e-05, |
|
"loss": 0.387, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 0.8179034157832744, |
|
"grad_norm": 6.406612873077393, |
|
"learning_rate": 1.6204470846424268e-05, |
|
"loss": 0.3736, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.8186572438162544, |
|
"grad_norm": 6.225420951843262, |
|
"learning_rate": 1.607452795930131e-05, |
|
"loss": 0.3886, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 0.8194110718492343, |
|
"grad_norm": 6.3113789558410645, |
|
"learning_rate": 1.594506262540324e-05, |
|
"loss": 0.402, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 0.8201648998822144, |
|
"grad_norm": 6.504429817199707, |
|
"learning_rate": 1.5816075581414458e-05, |
|
"loss": 0.3911, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 0.8209187279151944, |
|
"grad_norm": 7.651139736175537, |
|
"learning_rate": 1.56875675612978e-05, |
|
"loss": 0.4127, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 0.8216725559481743, |
|
"grad_norm": 6.864494800567627, |
|
"learning_rate": 1.5559539296290403e-05, |
|
"loss": 0.3841, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.8224263839811543, |
|
"grad_norm": 7.120053291320801, |
|
"learning_rate": 1.5431991514899446e-05, |
|
"loss": 0.4185, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 0.8231802120141343, |
|
"grad_norm": 7.861664295196533, |
|
"learning_rate": 1.5304924942898068e-05, |
|
"loss": 0.4293, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.8239340400471142, |
|
"grad_norm": 8.355661392211914, |
|
"learning_rate": 1.5178340303321314e-05, |
|
"loss": 0.4559, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 0.8246878680800942, |
|
"grad_norm": 8.859525680541992, |
|
"learning_rate": 1.5052238316461753e-05, |
|
"loss": 0.4503, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 0.8254416961130742, |
|
"grad_norm": 9.211348533630371, |
|
"learning_rate": 1.492661969986574e-05, |
|
"loss": 0.4435, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.8261955241460541, |
|
"grad_norm": 8.610541343688965, |
|
"learning_rate": 1.4801485168329066e-05, |
|
"loss": 0.4625, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 0.8269493521790342, |
|
"grad_norm": 10.033802032470703, |
|
"learning_rate": 1.4676835433892989e-05, |
|
"loss": 0.437, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 0.8277031802120142, |
|
"grad_norm": 10.607207298278809, |
|
"learning_rate": 1.4552671205840163e-05, |
|
"loss": 0.4369, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 0.8284570082449941, |
|
"grad_norm": 10.07897663116455, |
|
"learning_rate": 1.4428993190690677e-05, |
|
"loss": 0.4563, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 0.8292108362779741, |
|
"grad_norm": 12.518508911132812, |
|
"learning_rate": 1.4305802092197829e-05, |
|
"loss": 0.4645, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.8299646643109541, |
|
"grad_norm": 5.578033924102783, |
|
"learning_rate": 1.4183098611344415e-05, |
|
"loss": 0.51, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 0.830718492343934, |
|
"grad_norm": 5.301563739776611, |
|
"learning_rate": 1.4060883446338502e-05, |
|
"loss": 0.4486, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 0.831472320376914, |
|
"grad_norm": 5.3994293212890625, |
|
"learning_rate": 1.393915729260955e-05, |
|
"loss": 0.4536, |
|
"step": 1103 |
|
}, |
|
{ |
|
"epoch": 0.832226148409894, |
|
"grad_norm": 5.560753345489502, |
|
"learning_rate": 1.3817920842804433e-05, |
|
"loss": 0.455, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.8329799764428739, |
|
"grad_norm": 5.810977935791016, |
|
"learning_rate": 1.3697174786783584e-05, |
|
"loss": 0.4373, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.833733804475854, |
|
"grad_norm": 5.4894256591796875, |
|
"learning_rate": 1.3576919811616862e-05, |
|
"loss": 0.4106, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 0.834487632508834, |
|
"grad_norm": 5.865782737731934, |
|
"learning_rate": 1.345715660157989e-05, |
|
"loss": 0.4151, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 0.8352414605418139, |
|
"grad_norm": 5.4949469566345215, |
|
"learning_rate": 1.3337885838149988e-05, |
|
"loss": 0.4422, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 0.8359952885747939, |
|
"grad_norm": 5.45637845993042, |
|
"learning_rate": 1.3219108200002418e-05, |
|
"loss": 0.4237, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 0.8367491166077738, |
|
"grad_norm": 5.681154251098633, |
|
"learning_rate": 1.3100824363006326e-05, |
|
"loss": 0.443, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.8375029446407538, |
|
"grad_norm": 5.729828357696533, |
|
"learning_rate": 1.2983035000221177e-05, |
|
"loss": 0.4053, |
|
"step": 1111 |
|
}, |
|
{ |
|
"epoch": 0.8382567726737338, |
|
"grad_norm": 6.101329326629639, |
|
"learning_rate": 1.2865740781892699e-05, |
|
"loss": 0.4384, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 0.8390106007067137, |
|
"grad_norm": 5.694645881652832, |
|
"learning_rate": 1.2748942375449135e-05, |
|
"loss": 0.4013, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 0.8397644287396937, |
|
"grad_norm": 5.564671516418457, |
|
"learning_rate": 1.263264044549748e-05, |
|
"loss": 0.4148, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 0.8405182567726738, |
|
"grad_norm": 5.393068313598633, |
|
"learning_rate": 1.2516835653819725e-05, |
|
"loss": 0.3981, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.8412720848056537, |
|
"grad_norm": 5.637123107910156, |
|
"learning_rate": 1.2401528659368911e-05, |
|
"loss": 0.406, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 0.8420259128386337, |
|
"grad_norm": 5.908216953277588, |
|
"learning_rate": 1.2286720118265659e-05, |
|
"loss": 0.3637, |
|
"step": 1117 |
|
}, |
|
{ |
|
"epoch": 0.8427797408716137, |
|
"grad_norm": 5.7352070808410645, |
|
"learning_rate": 1.2172410683794177e-05, |
|
"loss": 0.4082, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 0.8435335689045936, |
|
"grad_norm": 5.5727858543396, |
|
"learning_rate": 1.2058601006398718e-05, |
|
"loss": 0.3828, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 0.8442873969375736, |
|
"grad_norm": 6.22990608215332, |
|
"learning_rate": 1.1945291733679764e-05, |
|
"loss": 0.4306, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.8450412249705536, |
|
"grad_norm": 5.981517314910889, |
|
"learning_rate": 1.1832483510390469e-05, |
|
"loss": 0.4177, |
|
"step": 1121 |
|
}, |
|
{ |
|
"epoch": 0.8457950530035335, |
|
"grad_norm": 5.5717973709106445, |
|
"learning_rate": 1.1720176978432795e-05, |
|
"loss": 0.375, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 0.8465488810365136, |
|
"grad_norm": 5.83533239364624, |
|
"learning_rate": 1.1608372776854103e-05, |
|
"loss": 0.4141, |
|
"step": 1123 |
|
}, |
|
{ |
|
"epoch": 0.8473027090694936, |
|
"grad_norm": 5.770301342010498, |
|
"learning_rate": 1.1497071541843306e-05, |
|
"loss": 0.3698, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 0.8480565371024735, |
|
"grad_norm": 5.999599933624268, |
|
"learning_rate": 1.1386273906727363e-05, |
|
"loss": 0.4177, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.8488103651354535, |
|
"grad_norm": 5.716385841369629, |
|
"learning_rate": 1.1275980501967642e-05, |
|
"loss": 0.3931, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 0.8495641931684335, |
|
"grad_norm": 6.15166711807251, |
|
"learning_rate": 1.1166191955156346e-05, |
|
"loss": 0.4025, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 0.8503180212014134, |
|
"grad_norm": 6.117612361907959, |
|
"learning_rate": 1.1056908891012884e-05, |
|
"loss": 0.4186, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 0.8510718492343934, |
|
"grad_norm": 6.109333038330078, |
|
"learning_rate": 1.0948131931380457e-05, |
|
"loss": 0.3863, |
|
"step": 1129 |
|
}, |
|
{ |
|
"epoch": 0.8518256772673733, |
|
"grad_norm": 5.863979816436768, |
|
"learning_rate": 1.0839861695222354e-05, |
|
"loss": 0.3737, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.8525795053003533, |
|
"grad_norm": 5.980686664581299, |
|
"learning_rate": 1.0732098798618517e-05, |
|
"loss": 0.3739, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 6.321891784667969, |
|
"learning_rate": 1.0624843854762034e-05, |
|
"loss": 0.416, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 0.8540871613663134, |
|
"grad_norm": 6.081487655639648, |
|
"learning_rate": 1.0518097473955624e-05, |
|
"loss": 0.3922, |
|
"step": 1133 |
|
}, |
|
{ |
|
"epoch": 0.8548409893992933, |
|
"grad_norm": 6.287003040313721, |
|
"learning_rate": 1.0411860263608186e-05, |
|
"loss": 0.3747, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 0.8555948174322733, |
|
"grad_norm": 6.175232887268066, |
|
"learning_rate": 1.0306132828231318e-05, |
|
"loss": 0.3708, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.8563486454652532, |
|
"grad_norm": 6.49648904800415, |
|
"learning_rate": 1.0200915769435937e-05, |
|
"loss": 0.373, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 0.8571024734982332, |
|
"grad_norm": 6.249892234802246, |
|
"learning_rate": 1.009620968592876e-05, |
|
"loss": 0.3807, |
|
"step": 1137 |
|
}, |
|
{ |
|
"epoch": 0.8578563015312132, |
|
"grad_norm": 6.616731643676758, |
|
"learning_rate": 9.992015173508995e-06, |
|
"loss": 0.3981, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 0.8586101295641931, |
|
"grad_norm": 6.801102638244629, |
|
"learning_rate": 9.88833282506486e-06, |
|
"loss": 0.3968, |
|
"step": 1139 |
|
}, |
|
{ |
|
"epoch": 0.8593639575971731, |
|
"grad_norm": 6.820323467254639, |
|
"learning_rate": 9.785163230570282e-06, |
|
"loss": 0.3939, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.8601177856301532, |
|
"grad_norm": 8.20490837097168, |
|
"learning_rate": 9.682506977081496e-06, |
|
"loss": 0.4353, |
|
"step": 1141 |
|
}, |
|
{ |
|
"epoch": 0.8608716136631331, |
|
"grad_norm": 7.587864398956299, |
|
"learning_rate": 9.580364648733775e-06, |
|
"loss": 0.4369, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 0.8616254416961131, |
|
"grad_norm": 7.294688701629639, |
|
"learning_rate": 9.478736826737944e-06, |
|
"loss": 0.411, |
|
"step": 1143 |
|
}, |
|
{ |
|
"epoch": 0.8623792697290931, |
|
"grad_norm": 7.802835464477539, |
|
"learning_rate": 9.37762408937729e-06, |
|
"loss": 0.424, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 0.863133097762073, |
|
"grad_norm": 8.21778678894043, |
|
"learning_rate": 9.277027012004125e-06, |
|
"loss": 0.4752, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.863886925795053, |
|
"grad_norm": 8.805744171142578, |
|
"learning_rate": 9.176946167036516e-06, |
|
"loss": 0.4736, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 0.864640753828033, |
|
"grad_norm": 10.24565601348877, |
|
"learning_rate": 9.07738212395508e-06, |
|
"loss": 0.4635, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 0.8653945818610129, |
|
"grad_norm": 9.218001365661621, |
|
"learning_rate": 8.978335449299791e-06, |
|
"loss": 0.4313, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 0.8661484098939929, |
|
"grad_norm": 10.276748657226562, |
|
"learning_rate": 8.87980670666655e-06, |
|
"loss": 0.421, |
|
"step": 1149 |
|
}, |
|
{ |
|
"epoch": 0.866902237926973, |
|
"grad_norm": 11.982145309448242, |
|
"learning_rate": 8.781796456704262e-06, |
|
"loss": 0.4486, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.8676560659599529, |
|
"grad_norm": 5.365624904632568, |
|
"learning_rate": 8.684305257111425e-06, |
|
"loss": 0.5014, |
|
"step": 1151 |
|
}, |
|
{ |
|
"epoch": 0.8684098939929329, |
|
"grad_norm": 5.599196910858154, |
|
"learning_rate": 8.587333662633035e-06, |
|
"loss": 0.4984, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.8691637220259129, |
|
"grad_norm": 5.679477214813232, |
|
"learning_rate": 8.490882225057428e-06, |
|
"loss": 0.5011, |
|
"step": 1153 |
|
}, |
|
{ |
|
"epoch": 0.8699175500588928, |
|
"grad_norm": 5.679898738861084, |
|
"learning_rate": 8.39495149321322e-06, |
|
"loss": 0.443, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 0.8706713780918728, |
|
"grad_norm": 5.414709091186523, |
|
"learning_rate": 8.299542012965944e-06, |
|
"loss": 0.4269, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.8714252061248527, |
|
"grad_norm": 5.3179426193237305, |
|
"learning_rate": 8.204654327215267e-06, |
|
"loss": 0.4395, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 0.8721790341578327, |
|
"grad_norm": 5.2444963455200195, |
|
"learning_rate": 8.110288975891634e-06, |
|
"loss": 0.4217, |
|
"step": 1157 |
|
}, |
|
{ |
|
"epoch": 0.8729328621908127, |
|
"grad_norm": 5.733283996582031, |
|
"learning_rate": 8.016446495953367e-06, |
|
"loss": 0.4395, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 0.8736866902237928, |
|
"grad_norm": 5.545217037200928, |
|
"learning_rate": 7.923127421383458e-06, |
|
"loss": 0.436, |
|
"step": 1159 |
|
}, |
|
{ |
|
"epoch": 0.8744405182567727, |
|
"grad_norm": 5.759894371032715, |
|
"learning_rate": 7.830332283186714e-06, |
|
"loss": 0.4376, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.8751943462897527, |
|
"grad_norm": 5.31406831741333, |
|
"learning_rate": 7.73806160938656e-06, |
|
"loss": 0.4097, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 0.8759481743227326, |
|
"grad_norm": 5.372743129730225, |
|
"learning_rate": 7.646315925022152e-06, |
|
"loss": 0.4264, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 0.8767020023557126, |
|
"grad_norm": 5.223913192749023, |
|
"learning_rate": 7.555095752145313e-06, |
|
"loss": 0.3879, |
|
"step": 1163 |
|
}, |
|
{ |
|
"epoch": 0.8774558303886926, |
|
"grad_norm": 5.493069171905518, |
|
"learning_rate": 7.4644016098176615e-06, |
|
"loss": 0.4099, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 0.8782096584216725, |
|
"grad_norm": 5.413908004760742, |
|
"learning_rate": 7.374234014107484e-06, |
|
"loss": 0.4041, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.8789634864546525, |
|
"grad_norm": 5.9703288078308105, |
|
"learning_rate": 7.284593478087043e-06, |
|
"loss": 0.4391, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 0.8797173144876325, |
|
"grad_norm": 6.033265590667725, |
|
"learning_rate": 7.195480511829411e-06, |
|
"loss": 0.4356, |
|
"step": 1167 |
|
}, |
|
{ |
|
"epoch": 0.8804711425206125, |
|
"grad_norm": 5.589619159698486, |
|
"learning_rate": 7.106895622405752e-06, |
|
"loss": 0.4029, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 0.8812249705535925, |
|
"grad_norm": 5.580582141876221, |
|
"learning_rate": 7.018839313882286e-06, |
|
"loss": 0.4039, |
|
"step": 1169 |
|
}, |
|
{ |
|
"epoch": 0.8819787985865725, |
|
"grad_norm": 5.605942726135254, |
|
"learning_rate": 6.931312087317632e-06, |
|
"loss": 0.3915, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.8827326266195524, |
|
"grad_norm": 5.954355239868164, |
|
"learning_rate": 6.844314440759647e-06, |
|
"loss": 0.4119, |
|
"step": 1171 |
|
}, |
|
{ |
|
"epoch": 0.8834864546525324, |
|
"grad_norm": 5.943442344665527, |
|
"learning_rate": 6.7578468692429345e-06, |
|
"loss": 0.4227, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 0.8842402826855124, |
|
"grad_norm": 6.070568561553955, |
|
"learning_rate": 6.6719098647857525e-06, |
|
"loss": 0.3824, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 0.8849941107184923, |
|
"grad_norm": 5.827738285064697, |
|
"learning_rate": 6.586503916387366e-06, |
|
"loss": 0.4358, |
|
"step": 1174 |
|
}, |
|
{ |
|
"epoch": 0.8857479387514723, |
|
"grad_norm": 5.9503655433654785, |
|
"learning_rate": 6.501629510025231e-06, |
|
"loss": 0.3862, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.8865017667844522, |
|
"grad_norm": 5.86431360244751, |
|
"learning_rate": 6.417287128652172e-06, |
|
"loss": 0.3849, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 0.8872555948174323, |
|
"grad_norm": 5.833621978759766, |
|
"learning_rate": 6.333477252193731e-06, |
|
"loss": 0.3935, |
|
"step": 1177 |
|
}, |
|
{ |
|
"epoch": 0.8880094228504123, |
|
"grad_norm": 6.094554901123047, |
|
"learning_rate": 6.250200357545377e-06, |
|
"loss": 0.3911, |
|
"step": 1178 |
|
}, |
|
{ |
|
"epoch": 0.8887632508833923, |
|
"grad_norm": 5.814612865447998, |
|
"learning_rate": 6.167456918569792e-06, |
|
"loss": 0.3738, |
|
"step": 1179 |
|
}, |
|
{ |
|
"epoch": 0.8895170789163722, |
|
"grad_norm": 6.395360946655273, |
|
"learning_rate": 6.085247406094197e-06, |
|
"loss": 0.3692, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.8902709069493522, |
|
"grad_norm": 5.914385795593262, |
|
"learning_rate": 6.003572287907633e-06, |
|
"loss": 0.4008, |
|
"step": 1181 |
|
}, |
|
{ |
|
"epoch": 0.8910247349823321, |
|
"grad_norm": 6.416135787963867, |
|
"learning_rate": 5.922432028758362e-06, |
|
"loss": 0.3997, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 0.8917785630153121, |
|
"grad_norm": 5.680757522583008, |
|
"learning_rate": 5.841827090351171e-06, |
|
"loss": 0.347, |
|
"step": 1183 |
|
}, |
|
{ |
|
"epoch": 0.8925323910482921, |
|
"grad_norm": 5.837109088897705, |
|
"learning_rate": 5.761757931344758e-06, |
|
"loss": 0.3623, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 0.893286219081272, |
|
"grad_norm": 5.914787769317627, |
|
"learning_rate": 5.68222500734914e-06, |
|
"loss": 0.3632, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.8940400471142521, |
|
"grad_norm": 6.179137229919434, |
|
"learning_rate": 5.603228770923041e-06, |
|
"loss": 0.3864, |
|
"step": 1186 |
|
}, |
|
{ |
|
"epoch": 0.8947938751472321, |
|
"grad_norm": 5.854869365692139, |
|
"learning_rate": 5.524769671571317e-06, |
|
"loss": 0.3318, |
|
"step": 1187 |
|
}, |
|
{ |
|
"epoch": 0.895547703180212, |
|
"grad_norm": 6.880571365356445, |
|
"learning_rate": 5.446848155742401e-06, |
|
"loss": 0.4063, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 0.896301531213192, |
|
"grad_norm": 6.602806568145752, |
|
"learning_rate": 5.3694646668257855e-06, |
|
"loss": 0.3698, |
|
"step": 1189 |
|
}, |
|
{ |
|
"epoch": 0.897055359246172, |
|
"grad_norm": 7.17775821685791, |
|
"learning_rate": 5.292619645149433e-06, |
|
"loss": 0.4266, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.8978091872791519, |
|
"grad_norm": 7.022253036499023, |
|
"learning_rate": 5.2163135279773904e-06, |
|
"loss": 0.3885, |
|
"step": 1191 |
|
}, |
|
{ |
|
"epoch": 0.8985630153121319, |
|
"grad_norm": 7.834957599639893, |
|
"learning_rate": 5.140546749507136e-06, |
|
"loss": 0.4484, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 0.8993168433451119, |
|
"grad_norm": 8.505350112915039, |
|
"learning_rate": 5.06531974086728e-06, |
|
"loss": 0.4535, |
|
"step": 1193 |
|
}, |
|
{ |
|
"epoch": 0.9000706713780918, |
|
"grad_norm": 8.074254035949707, |
|
"learning_rate": 4.9906329301149914e-06, |
|
"loss": 0.4528, |
|
"step": 1194 |
|
}, |
|
{ |
|
"epoch": 0.9008244994110719, |
|
"grad_norm": 8.195548057556152, |
|
"learning_rate": 4.916486742233606e-06, |
|
"loss": 0.447, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.9015783274440519, |
|
"grad_norm": 9.020340919494629, |
|
"learning_rate": 4.8428815991302005e-06, |
|
"loss": 0.4507, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 0.9023321554770318, |
|
"grad_norm": 9.480902671813965, |
|
"learning_rate": 4.769817919633235e-06, |
|
"loss": 0.4905, |
|
"step": 1197 |
|
}, |
|
{ |
|
"epoch": 0.9030859835100118, |
|
"grad_norm": 9.953953742980957, |
|
"learning_rate": 4.697296119490047e-06, |
|
"loss": 0.4291, |
|
"step": 1198 |
|
}, |
|
{ |
|
"epoch": 0.9038398115429918, |
|
"grad_norm": 9.974310874938965, |
|
"learning_rate": 4.625316611364661e-06, |
|
"loss": 0.4283, |
|
"step": 1199 |
|
}, |
|
{ |
|
"epoch": 0.9045936395759717, |
|
"grad_norm": 12.497854232788086, |
|
"learning_rate": 4.553879804835282e-06, |
|
"loss": 0.4614, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.9053474676089517, |
|
"grad_norm": 4.8798136711120605, |
|
"learning_rate": 4.482986106392073e-06, |
|
"loss": 0.4771, |
|
"step": 1201 |
|
}, |
|
{ |
|
"epoch": 0.9061012956419316, |
|
"grad_norm": 4.956184387207031, |
|
"learning_rate": 4.412635919434749e-06, |
|
"loss": 0.4444, |
|
"step": 1202 |
|
}, |
|
{ |
|
"epoch": 0.9068551236749116, |
|
"grad_norm": 5.346173286437988, |
|
"learning_rate": 4.342829644270429e-06, |
|
"loss": 0.4442, |
|
"step": 1203 |
|
}, |
|
{ |
|
"epoch": 0.9076089517078917, |
|
"grad_norm": 5.293701648712158, |
|
"learning_rate": 4.273567678111123e-06, |
|
"loss": 0.4614, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 0.9083627797408717, |
|
"grad_norm": 5.237243175506592, |
|
"learning_rate": 4.204850415071748e-06, |
|
"loss": 0.4512, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.9091166077738516, |
|
"grad_norm": 5.3798604011535645, |
|
"learning_rate": 4.136678246167636e-06, |
|
"loss": 0.4286, |
|
"step": 1206 |
|
}, |
|
{ |
|
"epoch": 0.9098704358068316, |
|
"grad_norm": 5.367835998535156, |
|
"learning_rate": 4.069051559312531e-06, |
|
"loss": 0.4139, |
|
"step": 1207 |
|
}, |
|
{ |
|
"epoch": 0.9106242638398115, |
|
"grad_norm": 5.50463342666626, |
|
"learning_rate": 4.001970739316163e-06, |
|
"loss": 0.4407, |
|
"step": 1208 |
|
}, |
|
{ |
|
"epoch": 0.9113780918727915, |
|
"grad_norm": 5.295793056488037, |
|
"learning_rate": 3.935436167882234e-06, |
|
"loss": 0.418, |
|
"step": 1209 |
|
}, |
|
{ |
|
"epoch": 0.9121319199057715, |
|
"grad_norm": 5.284564018249512, |
|
"learning_rate": 3.869448223606165e-06, |
|
"loss": 0.4096, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.9128857479387514, |
|
"grad_norm": 5.553956031799316, |
|
"learning_rate": 3.8040072819729545e-06, |
|
"loss": 0.4141, |
|
"step": 1211 |
|
}, |
|
{ |
|
"epoch": 0.9136395759717314, |
|
"grad_norm": 5.626007080078125, |
|
"learning_rate": 3.7391137153550137e-06, |
|
"loss": 0.4138, |
|
"step": 1212 |
|
}, |
|
{ |
|
"epoch": 0.9143934040047115, |
|
"grad_norm": 5.603013038635254, |
|
"learning_rate": 3.6747678930101558e-06, |
|
"loss": 0.4148, |
|
"step": 1213 |
|
}, |
|
{ |
|
"epoch": 0.9151472320376914, |
|
"grad_norm": 5.539734363555908, |
|
"learning_rate": 3.6109701810793208e-06, |
|
"loss": 0.4181, |
|
"step": 1214 |
|
}, |
|
{ |
|
"epoch": 0.9159010600706714, |
|
"grad_norm": 5.379584789276123, |
|
"learning_rate": 3.5477209425846538e-06, |
|
"loss": 0.4015, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.9166548881036514, |
|
"grad_norm": 5.433023929595947, |
|
"learning_rate": 3.4850205374273416e-06, |
|
"loss": 0.398, |
|
"step": 1216 |
|
}, |
|
{ |
|
"epoch": 0.9174087161366313, |
|
"grad_norm": 5.5849199295043945, |
|
"learning_rate": 3.4228693223856136e-06, |
|
"loss": 0.4165, |
|
"step": 1217 |
|
}, |
|
{ |
|
"epoch": 0.9181625441696113, |
|
"grad_norm": 5.703511714935303, |
|
"learning_rate": 3.361267651112676e-06, |
|
"loss": 0.422, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 0.9189163722025913, |
|
"grad_norm": 5.733764171600342, |
|
"learning_rate": 3.30021587413476e-06, |
|
"loss": 0.4017, |
|
"step": 1219 |
|
}, |
|
{ |
|
"epoch": 0.9196702002355712, |
|
"grad_norm": 5.802048206329346, |
|
"learning_rate": 3.2397143388489983e-06, |
|
"loss": 0.3935, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.9204240282685512, |
|
"grad_norm": 5.458968639373779, |
|
"learning_rate": 3.1797633895216394e-06, |
|
"loss": 0.3783, |
|
"step": 1221 |
|
}, |
|
{ |
|
"epoch": 0.9211778563015313, |
|
"grad_norm": 5.353023052215576, |
|
"learning_rate": 3.120363367285917e-06, |
|
"loss": 0.3788, |
|
"step": 1222 |
|
}, |
|
{ |
|
"epoch": 0.9219316843345112, |
|
"grad_norm": 5.518474578857422, |
|
"learning_rate": 3.0615146101401925e-06, |
|
"loss": 0.3944, |
|
"step": 1223 |
|
}, |
|
{ |
|
"epoch": 0.9226855123674912, |
|
"grad_norm": 5.713134765625, |
|
"learning_rate": 3.0032174529460165e-06, |
|
"loss": 0.3953, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 0.9234393404004712, |
|
"grad_norm": 6.142655372619629, |
|
"learning_rate": 2.945472227426227e-06, |
|
"loss": 0.4168, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.9241931684334511, |
|
"grad_norm": 5.580604553222656, |
|
"learning_rate": 2.8882792621630406e-06, |
|
"loss": 0.3642, |
|
"step": 1226 |
|
}, |
|
{ |
|
"epoch": 0.9249469964664311, |
|
"grad_norm": 5.7619757652282715, |
|
"learning_rate": 2.8316388825962324e-06, |
|
"loss": 0.3708, |
|
"step": 1227 |
|
}, |
|
{ |
|
"epoch": 0.925700824499411, |
|
"grad_norm": 6.232563018798828, |
|
"learning_rate": 2.7755514110212264e-06, |
|
"loss": 0.4063, |
|
"step": 1228 |
|
}, |
|
{ |
|
"epoch": 0.926454652532391, |
|
"grad_norm": 5.895346164703369, |
|
"learning_rate": 2.7200171665872742e-06, |
|
"loss": 0.399, |
|
"step": 1229 |
|
}, |
|
{ |
|
"epoch": 0.927208480565371, |
|
"grad_norm": 5.760490894317627, |
|
"learning_rate": 2.6650364652956894e-06, |
|
"loss": 0.3785, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.927962308598351, |
|
"grad_norm": 5.620173454284668, |
|
"learning_rate": 2.6106096199979614e-06, |
|
"loss": 0.3564, |
|
"step": 1231 |
|
}, |
|
{ |
|
"epoch": 0.928716136631331, |
|
"grad_norm": 5.84246826171875, |
|
"learning_rate": 2.5567369403940776e-06, |
|
"loss": 0.3575, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 0.929469964664311, |
|
"grad_norm": 5.908325672149658, |
|
"learning_rate": 2.50341873303066e-06, |
|
"loss": 0.384, |
|
"step": 1233 |
|
}, |
|
{ |
|
"epoch": 0.9302237926972909, |
|
"grad_norm": 5.850981712341309, |
|
"learning_rate": 2.4506553012993093e-06, |
|
"loss": 0.3704, |
|
"step": 1234 |
|
}, |
|
{ |
|
"epoch": 0.9309776207302709, |
|
"grad_norm": 6.301943778991699, |
|
"learning_rate": 2.398446945434818e-06, |
|
"loss": 0.385, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.9317314487632509, |
|
"grad_norm": 6.557477951049805, |
|
"learning_rate": 2.346793962513483e-06, |
|
"loss": 0.3607, |
|
"step": 1236 |
|
}, |
|
{ |
|
"epoch": 0.9324852767962308, |
|
"grad_norm": 6.442347049713135, |
|
"learning_rate": 2.2956966464514175e-06, |
|
"loss": 0.3829, |
|
"step": 1237 |
|
}, |
|
{ |
|
"epoch": 0.9332391048292108, |
|
"grad_norm": 7.224841594696045, |
|
"learning_rate": 2.245155288002876e-06, |
|
"loss": 0.3964, |
|
"step": 1238 |
|
}, |
|
{ |
|
"epoch": 0.9339929328621908, |
|
"grad_norm": 7.129518032073975, |
|
"learning_rate": 2.1951701747585982e-06, |
|
"loss": 0.3682, |
|
"step": 1239 |
|
}, |
|
{ |
|
"epoch": 0.9347467608951708, |
|
"grad_norm": 6.685035228729248, |
|
"learning_rate": 2.1457415911442013e-06, |
|
"loss": 0.4049, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.9355005889281508, |
|
"grad_norm": 7.421708583831787, |
|
"learning_rate": 2.0968698184184565e-06, |
|
"loss": 0.4029, |
|
"step": 1241 |
|
}, |
|
{ |
|
"epoch": 0.9362544169611308, |
|
"grad_norm": 7.260560989379883, |
|
"learning_rate": 2.04855513467187e-06, |
|
"loss": 0.4232, |
|
"step": 1242 |
|
}, |
|
{ |
|
"epoch": 0.9370082449941107, |
|
"grad_norm": 8.069437980651855, |
|
"learning_rate": 2.000797814824906e-06, |
|
"loss": 0.4409, |
|
"step": 1243 |
|
}, |
|
{ |
|
"epoch": 0.9377620730270907, |
|
"grad_norm": 7.945827960968018, |
|
"learning_rate": 1.9535981306265884e-06, |
|
"loss": 0.4244, |
|
"step": 1244 |
|
}, |
|
{ |
|
"epoch": 0.9385159010600707, |
|
"grad_norm": 8.818882942199707, |
|
"learning_rate": 1.9069563506527998e-06, |
|
"loss": 0.4722, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.9392697290930506, |
|
"grad_norm": 8.6805419921875, |
|
"learning_rate": 1.8608727403049309e-06, |
|
"loss": 0.4574, |
|
"step": 1246 |
|
}, |
|
{ |
|
"epoch": 0.9400235571260306, |
|
"grad_norm": 8.550375938415527, |
|
"learning_rate": 1.8153475618081673e-06, |
|
"loss": 0.4289, |
|
"step": 1247 |
|
}, |
|
{ |
|
"epoch": 0.9407773851590105, |
|
"grad_norm": 9.816337585449219, |
|
"learning_rate": 1.7703810742101813e-06, |
|
"loss": 0.4884, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 0.9415312131919906, |
|
"grad_norm": 9.228532791137695, |
|
"learning_rate": 1.7259735333795545e-06, |
|
"loss": 0.4282, |
|
"step": 1249 |
|
}, |
|
{ |
|
"epoch": 0.9422850412249706, |
|
"grad_norm": 12.300414085388184, |
|
"learning_rate": 1.6821251920043246e-06, |
|
"loss": 0.4527, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.9430388692579506, |
|
"grad_norm": 5.250865459442139, |
|
"learning_rate": 1.6388362995905848e-06, |
|
"loss": 0.509, |
|
"step": 1251 |
|
}, |
|
{ |
|
"epoch": 0.9437926972909305, |
|
"grad_norm": 5.213113307952881, |
|
"learning_rate": 1.5961071024610752e-06, |
|
"loss": 0.4615, |
|
"step": 1252 |
|
}, |
|
{ |
|
"epoch": 0.9445465253239105, |
|
"grad_norm": 5.200348377227783, |
|
"learning_rate": 1.5539378437536944e-06, |
|
"loss": 0.4463, |
|
"step": 1253 |
|
}, |
|
{ |
|
"epoch": 0.9453003533568904, |
|
"grad_norm": 5.2860941886901855, |
|
"learning_rate": 1.5123287634202454e-06, |
|
"loss": 0.4441, |
|
"step": 1254 |
|
}, |
|
{ |
|
"epoch": 0.9460541813898704, |
|
"grad_norm": 5.183274269104004, |
|
"learning_rate": 1.4712800982249474e-06, |
|
"loss": 0.4292, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.9468080094228504, |
|
"grad_norm": 5.593634605407715, |
|
"learning_rate": 1.430792081743182e-06, |
|
"loss": 0.4589, |
|
"step": 1256 |
|
}, |
|
{ |
|
"epoch": 0.9475618374558303, |
|
"grad_norm": 5.3267388343811035, |
|
"learning_rate": 1.3908649443600707e-06, |
|
"loss": 0.4336, |
|
"step": 1257 |
|
}, |
|
{ |
|
"epoch": 0.9483156654888104, |
|
"grad_norm": 5.741166114807129, |
|
"learning_rate": 1.351498913269289e-06, |
|
"loss": 0.4008, |
|
"step": 1258 |
|
}, |
|
{ |
|
"epoch": 0.9490694935217904, |
|
"grad_norm": 5.336604118347168, |
|
"learning_rate": 1.3126942124716213e-06, |
|
"loss": 0.4218, |
|
"step": 1259 |
|
}, |
|
{ |
|
"epoch": 0.9498233215547703, |
|
"grad_norm": 5.611804962158203, |
|
"learning_rate": 1.2744510627738516e-06, |
|
"loss": 0.4434, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9505771495877503, |
|
"grad_norm": 5.724870204925537, |
|
"learning_rate": 1.2367696817873419e-06, |
|
"loss": 0.4227, |
|
"step": 1261 |
|
}, |
|
{ |
|
"epoch": 0.9513309776207303, |
|
"grad_norm": 5.307777404785156, |
|
"learning_rate": 1.1996502839269453e-06, |
|
"loss": 0.4002, |
|
"step": 1262 |
|
}, |
|
{ |
|
"epoch": 0.9520848056537102, |
|
"grad_norm": 5.79971170425415, |
|
"learning_rate": 1.1630930804096495e-06, |
|
"loss": 0.405, |
|
"step": 1263 |
|
}, |
|
{ |
|
"epoch": 0.9528386336866902, |
|
"grad_norm": 5.324243068695068, |
|
"learning_rate": 1.127098279253491e-06, |
|
"loss": 0.4043, |
|
"step": 1264 |
|
}, |
|
{ |
|
"epoch": 0.9535924617196702, |
|
"grad_norm": 5.532378673553467, |
|
"learning_rate": 1.0916660852763216e-06, |
|
"loss": 0.4068, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.9543462897526501, |
|
"grad_norm": 5.695662021636963, |
|
"learning_rate": 1.0567967000945866e-06, |
|
"loss": 0.4286, |
|
"step": 1266 |
|
}, |
|
{ |
|
"epoch": 0.9551001177856302, |
|
"grad_norm": 5.8561482429504395, |
|
"learning_rate": 1.0224903221222938e-06, |
|
"loss": 0.4249, |
|
"step": 1267 |
|
}, |
|
{ |
|
"epoch": 0.9558539458186102, |
|
"grad_norm": 5.72511625289917, |
|
"learning_rate": 9.88747146569813e-07, |
|
"loss": 0.4021, |
|
"step": 1268 |
|
}, |
|
{ |
|
"epoch": 0.9566077738515901, |
|
"grad_norm": 5.385478973388672, |
|
"learning_rate": 9.555673654427332e-07, |
|
"loss": 0.3788, |
|
"step": 1269 |
|
}, |
|
{ |
|
"epoch": 0.9573616018845701, |
|
"grad_norm": 5.669264316558838, |
|
"learning_rate": 9.229511675408642e-07, |
|
"loss": 0.4148, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.95811542991755, |
|
"grad_norm": 5.313277244567871, |
|
"learning_rate": 8.90898738457091e-07, |
|
"loss": 0.3641, |
|
"step": 1271 |
|
}, |
|
{ |
|
"epoch": 0.95886925795053, |
|
"grad_norm": 5.480482578277588, |
|
"learning_rate": 8.59410260576321e-07, |
|
"loss": 0.3971, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 0.95962308598351, |
|
"grad_norm": 5.8209757804870605, |
|
"learning_rate": 8.28485913074506e-07, |
|
"loss": 0.3919, |
|
"step": 1273 |
|
}, |
|
{ |
|
"epoch": 0.96037691401649, |
|
"grad_norm": 5.919877052307129, |
|
"learning_rate": 7.981258719175322e-07, |
|
"loss": 0.3863, |
|
"step": 1274 |
|
}, |
|
{ |
|
"epoch": 0.9611307420494699, |
|
"grad_norm": 5.9404144287109375, |
|
"learning_rate": 7.683303098602989e-07, |
|
"loss": 0.4059, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.96188457008245, |
|
"grad_norm": 5.609850883483887, |
|
"learning_rate": 7.39099396445686e-07, |
|
"loss": 0.3697, |
|
"step": 1276 |
|
}, |
|
{ |
|
"epoch": 0.96263839811543, |
|
"grad_norm": 5.695891857147217, |
|
"learning_rate": 7.104332980036211e-07, |
|
"loss": 0.3917, |
|
"step": 1277 |
|
}, |
|
{ |
|
"epoch": 0.9633922261484099, |
|
"grad_norm": 5.932850360870361, |
|
"learning_rate": 6.823321776501024e-07, |
|
"loss": 0.415, |
|
"step": 1278 |
|
}, |
|
{ |
|
"epoch": 0.9641460541813899, |
|
"grad_norm": 6.023778438568115, |
|
"learning_rate": 6.547961952863002e-07, |
|
"loss": 0.3817, |
|
"step": 1279 |
|
}, |
|
{ |
|
"epoch": 0.9648998822143698, |
|
"grad_norm": 5.926705360412598, |
|
"learning_rate": 6.278255075976125e-07, |
|
"loss": 0.3884, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.9656537102473498, |
|
"grad_norm": 5.837738513946533, |
|
"learning_rate": 6.014202680528324e-07, |
|
"loss": 0.3598, |
|
"step": 1281 |
|
}, |
|
{ |
|
"epoch": 0.9664075382803298, |
|
"grad_norm": 6.178413391113281, |
|
"learning_rate": 5.755806269031827e-07, |
|
"loss": 0.3917, |
|
"step": 1282 |
|
}, |
|
{ |
|
"epoch": 0.9671613663133097, |
|
"grad_norm": 6.282332897186279, |
|
"learning_rate": 5.503067311815713e-07, |
|
"loss": 0.4286, |
|
"step": 1283 |
|
}, |
|
{ |
|
"epoch": 0.9679151943462897, |
|
"grad_norm": 6.746578216552734, |
|
"learning_rate": 5.255987247016591e-07, |
|
"loss": 0.4118, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 0.9686690223792698, |
|
"grad_norm": 6.075422763824463, |
|
"learning_rate": 5.014567480570831e-07, |
|
"loss": 0.3829, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.9694228504122497, |
|
"grad_norm": 6.149974346160889, |
|
"learning_rate": 4.778809386206895e-07, |
|
"loss": 0.3847, |
|
"step": 1286 |
|
}, |
|
{ |
|
"epoch": 0.9701766784452297, |
|
"grad_norm": 6.333911418914795, |
|
"learning_rate": 4.548714305436685e-07, |
|
"loss": 0.3638, |
|
"step": 1287 |
|
}, |
|
{ |
|
"epoch": 0.9709305064782097, |
|
"grad_norm": 6.391441345214844, |
|
"learning_rate": 4.324283547548658e-07, |
|
"loss": 0.3893, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 0.9716843345111896, |
|
"grad_norm": 6.624934196472168, |
|
"learning_rate": 4.1055183896001606e-07, |
|
"loss": 0.378, |
|
"step": 1289 |
|
}, |
|
{ |
|
"epoch": 0.9724381625441696, |
|
"grad_norm": 6.473977565765381, |
|
"learning_rate": 3.892420076409886e-07, |
|
"loss": 0.366, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.9731919905771496, |
|
"grad_norm": 6.985432147979736, |
|
"learning_rate": 3.68498982055121e-07, |
|
"loss": 0.4335, |
|
"step": 1291 |
|
}, |
|
{ |
|
"epoch": 0.9739458186101295, |
|
"grad_norm": 7.089210510253906, |
|
"learning_rate": 3.483228802344973e-07, |
|
"loss": 0.4066, |
|
"step": 1292 |
|
}, |
|
{ |
|
"epoch": 0.9746996466431095, |
|
"grad_norm": 7.46934175491333, |
|
"learning_rate": 3.2871381698529324e-07, |
|
"loss": 0.4253, |
|
"step": 1293 |
|
}, |
|
{ |
|
"epoch": 0.9754534746760896, |
|
"grad_norm": 8.461312294006348, |
|
"learning_rate": 3.0967190388712097e-07, |
|
"loss": 0.4596, |
|
"step": 1294 |
|
}, |
|
{ |
|
"epoch": 0.9762073027090695, |
|
"grad_norm": 8.289325714111328, |
|
"learning_rate": 2.9119724929239645e-07, |
|
"loss": 0.4382, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.9769611307420495, |
|
"grad_norm": 8.890064239501953, |
|
"learning_rate": 2.7328995832568426e-07, |
|
"loss": 0.4469, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 0.9777149587750295, |
|
"grad_norm": 8.737083435058594, |
|
"learning_rate": 2.5595013288318703e-07, |
|
"loss": 0.4262, |
|
"step": 1297 |
|
}, |
|
{ |
|
"epoch": 0.9784687868080094, |
|
"grad_norm": 9.281461715698242, |
|
"learning_rate": 2.391778716320792e-07, |
|
"loss": 0.4036, |
|
"step": 1298 |
|
}, |
|
{ |
|
"epoch": 0.9792226148409894, |
|
"grad_norm": 9.91952896118164, |
|
"learning_rate": 2.2297327000996293e-07, |
|
"loss": 0.4469, |
|
"step": 1299 |
|
}, |
|
{ |
|
"epoch": 0.9799764428739693, |
|
"grad_norm": 11.952555656433105, |
|
"learning_rate": 2.0733642022437994e-07, |
|
"loss": 0.4597, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.9807302709069493, |
|
"grad_norm": 5.1298322677612305, |
|
"learning_rate": 1.922674112522227e-07, |
|
"loss": 0.478, |
|
"step": 1301 |
|
}, |
|
{ |
|
"epoch": 0.9814840989399294, |
|
"grad_norm": 5.572525501251221, |
|
"learning_rate": 1.7776632883924615e-07, |
|
"loss": 0.4829, |
|
"step": 1302 |
|
}, |
|
{ |
|
"epoch": 0.9822379269729093, |
|
"grad_norm": 5.343718528747559, |
|
"learning_rate": 1.638332554996125e-07, |
|
"loss": 0.4319, |
|
"step": 1303 |
|
}, |
|
{ |
|
"epoch": 0.9829917550058893, |
|
"grad_norm": 5.716027736663818, |
|
"learning_rate": 1.5046827051536928e-07, |
|
"loss": 0.4378, |
|
"step": 1304 |
|
}, |
|
{ |
|
"epoch": 0.9837455830388693, |
|
"grad_norm": 5.513693809509277, |
|
"learning_rate": 1.3767144993602766e-07, |
|
"loss": 0.4235, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.9844994110718492, |
|
"grad_norm": 5.508944988250732, |
|
"learning_rate": 1.254428665781515e-07, |
|
"loss": 0.4007, |
|
"step": 1306 |
|
}, |
|
{ |
|
"epoch": 0.9852532391048292, |
|
"grad_norm": 5.180131435394287, |
|
"learning_rate": 1.1378259002488013e-07, |
|
"loss": 0.3939, |
|
"step": 1307 |
|
}, |
|
{ |
|
"epoch": 0.9860070671378092, |
|
"grad_norm": 5.590184688568115, |
|
"learning_rate": 1.0269068662560611e-07, |
|
"loss": 0.4166, |
|
"step": 1308 |
|
}, |
|
{ |
|
"epoch": 0.9867608951707891, |
|
"grad_norm": 5.44436502456665, |
|
"learning_rate": 9.216721949553142e-08, |
|
"loss": 0.4047, |
|
"step": 1309 |
|
}, |
|
{ |
|
"epoch": 0.9875147232037691, |
|
"grad_norm": 5.489165782928467, |
|
"learning_rate": 8.221224851535647e-08, |
|
"loss": 0.3999, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.9882685512367492, |
|
"grad_norm": 5.663797855377197, |
|
"learning_rate": 7.282583033091372e-08, |
|
"loss": 0.3842, |
|
"step": 1311 |
|
}, |
|
{ |
|
"epoch": 0.9890223792697291, |
|
"grad_norm": 5.638896942138672, |
|
"learning_rate": 6.400801835286796e-08, |
|
"loss": 0.3977, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 0.9897762073027091, |
|
"grad_norm": 5.8632307052612305, |
|
"learning_rate": 5.57588627563721e-08, |
|
"loss": 0.3579, |
|
"step": 1313 |
|
}, |
|
{ |
|
"epoch": 0.9905300353356891, |
|
"grad_norm": 5.826532363891602, |
|
"learning_rate": 4.807841048082296e-08, |
|
"loss": 0.4088, |
|
"step": 1314 |
|
}, |
|
{ |
|
"epoch": 0.991283863368669, |
|
"grad_norm": 5.712516784667969, |
|
"learning_rate": 4.096670522959478e-08, |
|
"loss": 0.3853, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.992037691401649, |
|
"grad_norm": 6.0777459144592285, |
|
"learning_rate": 3.442378746972841e-08, |
|
"loss": 0.4111, |
|
"step": 1316 |
|
}, |
|
{ |
|
"epoch": 0.992791519434629, |
|
"grad_norm": 5.916062831878662, |
|
"learning_rate": 2.844969443178691e-08, |
|
"loss": 0.3821, |
|
"step": 1317 |
|
}, |
|
{ |
|
"epoch": 0.9935453474676089, |
|
"grad_norm": 5.911341190338135, |
|
"learning_rate": 2.304446010958916e-08, |
|
"loss": 0.38, |
|
"step": 1318 |
|
}, |
|
{ |
|
"epoch": 0.9942991755005889, |
|
"grad_norm": 6.334498405456543, |
|
"learning_rate": 1.8208115260032187e-08, |
|
"loss": 0.3812, |
|
"step": 1319 |
|
}, |
|
{ |
|
"epoch": 0.995053003533569, |
|
"grad_norm": 6.576707363128662, |
|
"learning_rate": 1.3940687402924646e-08, |
|
"loss": 0.3858, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.9958068315665489, |
|
"grad_norm": 6.39242696762085, |
|
"learning_rate": 1.0242200820786974e-08, |
|
"loss": 0.3661, |
|
"step": 1321 |
|
}, |
|
{ |
|
"epoch": 0.9965606595995289, |
|
"grad_norm": 7.869157791137695, |
|
"learning_rate": 7.112676558784781e-09, |
|
"loss": 0.3966, |
|
"step": 1322 |
|
}, |
|
{ |
|
"epoch": 0.9973144876325089, |
|
"grad_norm": 7.689291954040527, |
|
"learning_rate": 4.552132424562317e-09, |
|
"loss": 0.4297, |
|
"step": 1323 |
|
}, |
|
{ |
|
"epoch": 0.9980683156654888, |
|
"grad_norm": 8.572519302368164, |
|
"learning_rate": 2.5605829881203414e-09, |
|
"loss": 0.451, |
|
"step": 1324 |
|
}, |
|
{ |
|
"epoch": 0.9988221436984688, |
|
"grad_norm": 9.072525024414062, |
|
"learning_rate": 1.1380395818050282e-09, |
|
"loss": 0.4373, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.9995759717314487, |
|
"grad_norm": 9.224164962768555, |
|
"learning_rate": 2.8451030018583623e-10, |
|
"loss": 0.4368, |
|
"step": 1326 |
|
}, |
|
{ |
|
"epoch": 1.0007067137809187, |
|
"grad_norm": 5.4062819480896, |
|
"learning_rate": 0.0, |
|
"loss": 0.4839, |
|
"step": 1327 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1327, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 332, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.8606342447625667e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|